classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101):
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,280 @@
1
+ """Broken links detector for URL reachability and empty responses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from collections.abc import Iterable
8
+ from dataclasses import dataclass
9
+
10
+ import requests
11
+
12
+ from ...models.generated_detectors import BrokenLinksDetectorConfig, DetectorConfig, Severity
13
+ from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType
14
+ from ...utils.hashing import normalize_http_url
15
+ from ..base import BaseDetector
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class LinkScanResult:
    # Outcome of probing a single link; produced by the detector's scan
    # helpers and later converted into a DetectionResult.

    # Normalized URL that was probed.
    url: str
    # 1-based line number of the URL inside the scanned content.
    line: int
    # Character offsets (start inclusive, end exclusive) of the URL text.
    start: int
    end: int
    # Finding label, e.g. "unreachable" or "empty_content".
    finding_type: str
    # Confidence score attached to the finding.
    confidence: float
    # Extra details such as the HTTP status code and the reason string.
    metadata: dict[str, object]
29
+
30
+
31
class BrokenLinksDetector(BaseDetector):
    """
    Detector for broken links and empty link targets.

    Input content is expected to be newline-delimited URLs
    (one URL per line), passed with `application/x.asset-links`.

    Every unique URL is probed with a HEAD request first; a streaming GET is
    used as a fallback when HEAD is rejected (405/501) or when the HEAD
    response carries no usable Content-Length header.
    """

    detector_type = "broken_links"
    detector_name = "broken_links"

    _REQUEST_TIMEOUT_SECONDS = 8
    _MAX_CONCURRENCY = 12
    _USER_AGENT = "classifyre-broken-links-detector/1.0"

    def __init__(self, config: DetectorConfig | None = None) -> None:
        """Store the typed config (or a default one) and prepare the HTTP session."""
        super().__init__(config)
        self._cfg: BrokenLinksDetectorConfig = (
            config if isinstance(config, BrokenLinksDetectorConfig) else BrokenLinksDetectorConfig()
        )
        # NOTE(review): this single session is used from several worker threads
        # (see asyncio.to_thread in detect) -- confirm that is acceptable.
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": self._USER_AGENT})

    async def detect(
        self, content: str | bytes, content_type: str = "application/x.asset-links"
    ) -> list[DetectionResult]:
        """Probe every unique URL in *content* and return one finding per bad link.

        Bytes input and unsupported content types yield an empty list. Links
        are checked concurrently in worker threads, at most
        ``_MAX_CONCURRENCY`` at a time. A check that fails unexpectedly is
        logged at debug level and skipped. The result is truncated to
        ``max_findings`` when that config value is set.
        """
        if isinstance(content, bytes):
            return []
        if content_type not in self.get_supported_content_types():
            return []

        links = self._extract_links(content)
        if not links:
            return []

        semaphore = asyncio.Semaphore(self._MAX_CONCURRENCY)

        async def check_link(url: str, line: int, start: int, end: int) -> LinkScanResult | None:
            async with semaphore:
                return await asyncio.to_thread(
                    self._scan_link,
                    url,
                    line,
                    start,
                    end,
                )

        tasks = [
            check_link(url=url, line=line, start=start, end=end) for url, line, start, end in links
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        findings: list[DetectionResult] = []
        for result in results:
            # gather(return_exceptions=True) can hand back any BaseException
            # (e.g. CancelledError), not only Exception subclasses; testing for
            # Exception alone would let those through to the attribute access
            # below and crash.
            if isinstance(result, BaseException):
                logger.debug("Broken links detector task failed: %s", result)
                continue
            if result is None:
                continue

            findings.append(
                DetectionResult(
                    detector_type=DetectorType.BROKEN_LINKS,
                    finding_type=result.finding_type,
                    category="link_integrity",
                    severity=Severity.low,
                    confidence=result.confidence,
                    matched_content=result.url,
                    location=None,
                    metadata=result.metadata,
                )
            )

        if self._cfg.max_findings and len(findings) > self._cfg.max_findings:
            findings = findings[: self._cfg.max_findings]

        return findings

    def get_supported_content_types(self) -> list[str]:
        """Return the only content type this detector accepts."""
        return ["application/x.asset-links"]

    def _extract_links(self, content: str) -> list[tuple[str, int, int, int]]:
        """Return ``(normalized_url, line_number, start, end)`` for each unique URL.

        Blank lines, lines that ``normalize_http_url`` rejects, and URLs whose
        normalized form was already seen are skipped. ``start``/``end`` are
        character offsets of the stripped line text within *content*.
        """
        links: list[tuple[str, int, int, int]] = []
        seen: set[str] = set()
        offset = 0
        # keepends=True keeps the real line terminator in raw_line, so offsets
        # stay correct for "\r\n" and any other separator splitlines() honours.
        for line_number, raw_line in enumerate(content.splitlines(keepends=True), start=1):
            line_start = offset
            offset += len(raw_line)

            line = raw_line.strip()
            if not line:
                continue

            normalized = normalize_http_url(line)
            if not normalized or normalized in seen:
                continue

            seen.add(normalized)
            leading_whitespace = len(raw_line) - len(raw_line.lstrip())
            start = line_start + leading_whitespace
            links.append((normalized, line_number, start, start + len(line)))

        return links

    def _scan_link(
        self,
        url: str,
        line: int,
        start: int,
        end: int,
    ) -> LinkScanResult | None:
        """Probe *url* with HEAD and return a finding, or ``None`` if it looks fine.

        Falls back to ``_scan_with_get`` when the server answers 405/501 or
        omits Content-Length. Any ``requests.RequestException`` is reported as
        an ``unreachable`` finding.
        """
        head_response: requests.Response | None = None
        try:
            head_response = self._session.head(
                url,
                allow_redirects=True,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            status_code = head_response.status_code

            if status_code in {405, 501}:
                return self._scan_with_get(url, line, start, end, "head_not_supported")

            if status_code >= 400:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="unreachable",
                    confidence=0.95,
                    metadata={"status_code": status_code, "reason": "http_error"},
                )

            content_length = self._parse_content_length(head_response.headers)
            if content_length == 0:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": "empty_head_content_length"},
                )

            # Some servers omit Content-Length, so perform a lightweight GET check.
            if content_length is None:
                return self._scan_with_get(url, line, start, end, "missing_content_length")

            return None
        except requests.RequestException as exc:
            return LinkScanResult(
                url=url,
                line=line,
                start=start,
                end=end,
                finding_type="unreachable",
                confidence=0.95,
                metadata={"reason": "request_exception", "error": str(exc)},
            )
        finally:
            if head_response is not None:
                head_response.close()

    def _scan_with_get(
        self,
        url: str,
        line: int,
        start: int,
        end: int,
        reason: str,
    ) -> LinkScanResult | None:
        """Probe *url* with a streaming GET and return a finding, or ``None``.

        *reason* records why the GET fallback was needed and is copied into
        the finding metadata. The body is streamed and only inspected for the
        presence of at least one non-empty chunk.
        """
        get_response: requests.Response | None = None
        try:
            get_response = self._session.get(
                url,
                allow_redirects=True,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
                stream=True,
            )
            status_code = get_response.status_code
            if status_code >= 400:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="unreachable",
                    confidence=0.95,
                    metadata={"status_code": status_code, "reason": reason},
                )

            content_length = self._parse_content_length(get_response.headers)
            if content_length == 0:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": reason},
                )

            has_payload = self._response_has_payload(get_response.iter_content(chunk_size=1))
            if not has_payload:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": "empty_body"},
                )
            return None
        except requests.RequestException as exc:
            return LinkScanResult(
                url=url,
                line=line,
                start=start,
                end=end,
                finding_type="unreachable",
                confidence=0.95,
                metadata={"reason": reason, "error": str(exc)},
            )
        finally:
            if get_response is not None:
                get_response.close()

    def _parse_content_length(self, headers: dict[str, object]) -> int | None:
        """Return the Content-Length header as an int, or ``None`` if absent/invalid."""
        raw_length = headers.get("Content-Length")
        if raw_length is None:
            return None
        try:
            return int(str(raw_length))
        except (TypeError, ValueError):
            return None

    def _response_has_payload(self, chunks: Iterable[bytes]) -> bool:
        """Return ``True`` as soon as *chunks* yields a non-empty chunk."""
        for chunk in chunks:
            if chunk:
                return True
        return False
@@ -0,0 +1,59 @@
1
+ """Shared detector config resolution and type mapping."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from ..models.generated_detectors import (
8
+ BrokenLinksDetectorConfig,
9
+ CustomDetectorConfig,
10
+ DetectorConfig,
11
+ PIIDetectorConfig,
12
+ SecretsDetectorConfig,
13
+ ThreatDetectorConfig,
14
+ )
15
+
16
+ type DetectorTypedConfig = (
17
+ DetectorConfig
18
+ | CustomDetectorConfig
19
+ | SecretsDetectorConfig
20
+ | PIIDetectorConfig
21
+ | ThreatDetectorConfig
22
+ | BrokenLinksDetectorConfig
23
+ )
24
+
25
+ _DETECTOR_NAME_BY_TYPE: dict[str, str] = {
26
+ "SECRETS": "secrets",
27
+ "PII": "pii",
28
+ "YARA": "yara",
29
+ "BROKEN_LINKS": "broken_links",
30
+ "CODE_SECURITY": "code_security",
31
+ "CUSTOM": "custom",
32
+ }
33
+
34
+ _DETECTOR_CONFIG_BY_TYPE: dict[str, type[DetectorConfig]] = {
35
+ "SECRETS": SecretsDetectorConfig,
36
+ "PII": PIIDetectorConfig,
37
+ "YARA": ThreatDetectorConfig,
38
+ "BROKEN_LINKS": BrokenLinksDetectorConfig,
39
+ "CUSTOM": CustomDetectorConfig,
40
+ }
41
+
42
+
43
def normalize_detector_type(detector_type: str) -> str:
    """Return *detector_type* with surrounding whitespace removed, upper-cased."""
    trimmed = detector_type.strip()
    return trimmed.upper()
45
+
46
+
47
def get_detector_name(detector_type: str) -> str:
    """Map a detector type to its detector name.

    Types missing from the name table fall back to the lower-cased
    normalized type.
    """
    type_key = normalize_detector_type(detector_type)
    try:
        return _DETECTOR_NAME_BY_TYPE[type_key]
    except KeyError:
        return type_key.lower()
50
+
51
+
52
def parse_detector_config(detector_type: str, raw_config: Any) -> tuple[str, DetectorTypedConfig]:
    """Resolve the detector name and validate *raw_config* into its typed model.

    The config class is chosen by normalized detector type, defaulting to
    ``DetectorConfig``. A *raw_config* that is not a dict is treated as an
    empty dict before validation.
    """
    type_key = normalize_detector_type(detector_type)
    payload = raw_config if isinstance(raw_config, dict) else {}
    model_cls = _DETECTOR_CONFIG_BY_TYPE.get(type_key, DetectorConfig)
    return get_detector_name(type_key), model_cls.model_validate(payload)
File without changes
@@ -0,0 +1,13 @@
1
+ """Custom detector implementations."""
2
+
3
+ from .detector import CustomDetector
4
+ from .runners import BaseRunner, GLiNER2Runner, LLMRunner, RegexRunner, create_runner
5
+
6
+ __all__ = [
7
+ "BaseRunner",
8
+ "CustomDetector",
9
+ "GLiNER2Runner",
10
+ "LLMRunner",
11
+ "RegexRunner",
12
+ "create_runner",
13
+ ]
@@ -0,0 +1,45 @@
1
+ """Custom detector — delegates to the appropriate runner via the runner factory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ...models.generated_detectors import (
8
+ CustomDetectorConfig,
9
+ DetectorConfig,
10
+ )
11
+ from ...models.generated_single_asset_scan_results import DetectionResult
12
+ from ..base import BaseDetector
13
+ from .runners import BaseRunner, create_runner
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class CustomDetector(BaseDetector):
    """Schema-driven detector backed by a pluggable runner (GLINER2 | REGEX | LLM | transformer)."""

    detector_type = "custom"
    detector_name = "custom"

    def __init__(self, config: DetectorConfig | None = None):
        """Validate the config type and build the runner for its pipeline schema.

        Raises:
            ValueError: if the resolved config is not a CustomDetectorConfig.
        """
        super().__init__(config)
        if not isinstance(self.config, CustomDetectorConfig):
            raise ValueError("CustomDetector requires CustomDetectorConfig with pipeline_schema")
        self.custom_config: CustomDetectorConfig = self.config
        cfg = self.custom_config
        self._runner: BaseRunner = create_runner(
            cfg.pipeline_schema,
            detector_key=cfg.custom_detector_key,
            detector_name=cfg.name,
        )

    async def detect(
        self, content: str | bytes, content_type: str = "text/plain"
    ) -> list[DetectionResult]:
        """Delegate detection to the runner and cap the result at ``max_findings``.

        The cap is applied only when ``max_findings`` is a positive int.
        """
        findings = self._runner.detect(content, content_type)
        limit = self.custom_config.max_findings
        if not isinstance(limit, int) or limit <= 0:
            return findings
        return findings[:limit]

    def get_supported_content_types(self) -> list[str]:
        """Return the content types supported by the underlying runner."""
        return self._runner.get_supported_content_types()
@@ -0,0 +1,56 @@
1
+ """Custom detector runner package.
2
+
3
+ Public surface re-exported here so that existing imports of the form
4
+ from .runners import BaseRunner, create_runner, GLiNER2Runner, ...
5
+ continue to work unchanged.
6
+ """
7
+
8
+ from ._base import (
9
+ _DEFAULT_GLINER2_MODEL,
10
+ _DEFAULT_IMAGE_CLASSIFICATION_MODEL,
11
+ _IMAGE_CONTENT_TYPES,
12
+ _TEXT_CONTENT_TYPES,
13
+ BaseRunner,
14
+ _resolve_pipeline_severity,
15
+ )
16
+ from ._factory import create_runner
17
+ from ._feature_extraction import FeatureExtractionRunner, _chunk_text_with_offsets, _pool_hidden
18
+ from ._gliner2 import (
19
+ GLiNER2Runner,
20
+ _apply_classification_validation,
21
+ _apply_entity_validation,
22
+ _normalise_classification_output,
23
+ _normalise_entity_output,
24
+ _normalise_span,
25
+ )
26
+ from ._image_classification import ImageClassificationRunner
27
+ from ._llm import LLMRunner
28
+ from ._object_detection import ObjectDetectionRunner
29
+ from ._regex import RegexRunner, _load_regex_engine
30
+ from ._text_classification import TextClassificationRunner, _chunk_text
31
+
32
+ __all__ = [
33
+ "_DEFAULT_GLINER2_MODEL",
34
+ "_DEFAULT_IMAGE_CLASSIFICATION_MODEL",
35
+ "_IMAGE_CONTENT_TYPES",
36
+ "_TEXT_CONTENT_TYPES",
37
+ "BaseRunner",
38
+ "FeatureExtractionRunner",
39
+ "GLiNER2Runner",
40
+ "ImageClassificationRunner",
41
+ "LLMRunner",
42
+ "ObjectDetectionRunner",
43
+ "RegexRunner",
44
+ "TextClassificationRunner",
45
+ "_apply_classification_validation",
46
+ "_apply_entity_validation",
47
+ "_chunk_text",
48
+ "_chunk_text_with_offsets",
49
+ "_load_regex_engine",
50
+ "_normalise_classification_output",
51
+ "_normalise_entity_output",
52
+ "_normalise_span",
53
+ "_pool_hidden",
54
+ "_resolve_pipeline_severity",
55
+ "create_runner",
56
+ ]
@@ -0,0 +1,177 @@
1
+ """Base runner interface and shared utilities for all pipeline execution strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from abc import ABC, abstractmethod
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from ....models.generated_detectors import (
11
+ PipelineResult,
12
+ PipelineSeverityRule,
13
+ Severity,
14
+ )
15
+ from ....models.generated_single_asset_scan_results import (
16
+ DetectionResult,
17
+ DetectorType,
18
+ Location,
19
+ )
20
+
21
+ _DEFAULT_GLINER2_MODEL = "fastino/gliner2-base-v1"
22
+ _DEFAULT_IMAGE_CLASSIFICATION_MODEL = "google/vit-base-patch16-224"
23
+
24
+ _TEXT_CONTENT_TYPES = [
25
+ "text/plain",
26
+ "text/html",
27
+ "text/markdown",
28
+ "application/json",
29
+ "application/xml",
30
+ "text/xml",
31
+ ]
32
+ _IMAGE_CONTENT_TYPES = [
33
+ "image/jpeg",
34
+ "image/jpg",
35
+ "image/png",
36
+ "image/gif",
37
+ "image/webp",
38
+ "image/bmp",
39
+ "image/tiff",
40
+ ]
41
+
42
+
43
def _resolve_pipeline_severity(
    label: str,
    severity_map: list[PipelineSeverityRule] | None,
    default: Severity = Severity.info,
) -> Severity:
    """Return the severity of the first rule whose pattern matches *label*.

    Matching is a case-insensitive regex search against the lower-cased
    label. A pattern that is not a valid regex is compared as a plain
    lower-cased substring instead. *default* is returned when the map is
    empty or no rule matches.
    """
    if not severity_map:
        return default

    lowered = label.lower()
    for rule in severity_map:
        try:
            hit = re.search(rule.pattern, lowered, re.IGNORECASE) is not None
        except re.error:
            # Invalid regex: fall back to a literal substring test.
            hit = rule.pattern.lower() in lowered
        if hit:
            return rule.severity
    return default
60
+
61
+
62
class BaseRunner(ABC):
    """Common interface for all pipeline execution strategies."""

    # Copied onto every DetectionResult built by _make_result.
    _detector_key: str = ""
    _detector_name: str = ""

    @abstractmethod
    def run(self, text: str) -> PipelineResult:
        """Execute the pipeline on *text* and return a normalised PipelineResult."""
        ...

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Run detection and return findings. Default: text-only path via run().

        Bytes input and text that is empty after stripping yield no findings.
        *content_type* is not consulted by this default implementation.
        """
        if isinstance(content, bytes):
            return []
        text = content.strip()
        if not text:
            return []
        result = self.run(text)
        return self._result_to_findings(text, result)

    def get_supported_content_types(self) -> list[str]:
        """Return a fresh copy of the text content types."""
        return list(_TEXT_CONTENT_TYPES)

    def _make_result(
        self,
        *,
        finding_type: str,
        category: str,
        severity: Severity,
        confidence: float,
        matched_content: str,
        location: Location | None,
        metadata: dict[str, Any],
    ) -> DetectionResult:
        """Build a CUSTOM DetectionResult stamped with this runner's detector key/name."""
        return DetectionResult(
            detector_type=DetectorType.CUSTOM,
            finding_type=finding_type,
            category=category,
            severity=severity,
            confidence=confidence,
            matched_content=matched_content,
            location=location,
            custom_detector_key=self._detector_key,
            custom_detector_name=self._detector_name,
            detected_at=datetime.now(UTC),
            metadata=metadata,
        )

    def _result_to_findings(self, text: str, result: PipelineResult) -> list[DetectionResult]:
        """Convert a PipelineResult into findings.

        One finding is emitted per entity span and one per classification
        outcome with a non-empty label. Confidence is capped at 0.99. When a
        span carries no recognised severity, severity is ``high`` for
        confidence >= 0.9 and ``medium`` otherwise.
        """
        findings: list[DetectionResult] = []
        runner_type = result.metadata.get("runner", "GLINER2")

        for label, spans in result.entities.items():
            for span in spans:
                confidence = float(span.get("confidence", 0.0))
                value = str(span.get("value", ""))
                start = span.get("start")
                end = span.get("end")

                loc: Location | None = None
                if isinstance(start, int) and isinstance(end, int):
                    loc = Location(start=start, end=end, path=f"{runner_type.lower()}-entity")

                finding_type = f"regex:{label}" if runner_type == "REGEX" else f"entity:{label}"

                span_severity = span.get("severity")
                if isinstance(span_severity, str) and span_severity in Severity.__members__:
                    # __members__ is keyed by member *name*, so resolve by name
                    # too; Severity(value) would raise if names and values differ.
                    sev = Severity[span_severity]
                else:
                    sev = Severity.medium if confidence < 0.9 else Severity.high

                meta: dict[str, Any] = {
                    "runner": runner_type,
                    "entity_label": label,
                    "pipeline_result": result.model_dump(),
                }
                if "groups" in span:
                    meta["capture_groups"] = span["groups"]

                findings.append(
                    self._make_result(
                        finding_type=finding_type,
                        category="CLASSIFICATION",
                        severity=sev,
                        confidence=min(0.99, confidence),
                        matched_content=value,
                        location=loc,
                        metadata=meta,
                    )
                )

        for task, outcome in result.classification.items():
            label = str(outcome.get("label", ""))
            confidence = float(outcome.get("confidence", 0.0))
            if not label:
                continue

            findings.append(
                self._make_result(
                    finding_type=f"classification:{task}:{label}",
                    category="CLASSIFICATION",
                    severity=Severity.medium if confidence < 0.9 else Severity.high,
                    confidence=min(0.99, confidence),
                    # Classification applies to the whole text; keep only a
                    # 320-character excerpt as the matched content.
                    matched_content=text[:320],
                    location=None,
                    metadata={
                        "runner": runner_type,
                        "task": task,
                        "label": label,
                        "pipeline_result": result.model_dump(),
                    },
                )
            )

        return findings
@@ -0,0 +1,51 @@
1
+ """Factory function for creating runner instances from pipeline schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ....models.generated_detectors import (
6
+ FeatureExtractionPipelineSchema,
7
+ GLiNER2PipelineSchema,
8
+ ImageClassificationPipelineSchema,
9
+ LLMPipelineSchema,
10
+ ObjectDetectionPipelineSchema,
11
+ RegexPipelineSchema,
12
+ TextClassificationPipelineSchema,
13
+ )
14
+ from ._base import BaseRunner
15
+ from ._feature_extraction import FeatureExtractionRunner
16
+ from ._gliner2 import GLiNER2Runner
17
+ from ._image_classification import ImageClassificationRunner
18
+ from ._llm import LLMRunner
19
+ from ._object_detection import ObjectDetectionRunner
20
+ from ._regex import RegexRunner
21
+ from ._text_classification import TextClassificationRunner
22
+
23
+
24
def create_runner(
    schema: (
        GLiNER2PipelineSchema
        | RegexPipelineSchema
        | LLMPipelineSchema
        | TextClassificationPipelineSchema
        | ImageClassificationPipelineSchema
        | FeatureExtractionPipelineSchema
        | ObjectDetectionPipelineSchema
    ),
    detector_key: str = "",
    detector_name: str = "",
) -> BaseRunner:
    """Return the appropriate runner for *schema* based on its type discriminator."""
    # Checked in order; the first schema class that matches wins.
    dispatch = (
        (TextClassificationPipelineSchema, TextClassificationRunner),
        (ImageClassificationPipelineSchema, ImageClassificationRunner),
        (FeatureExtractionPipelineSchema, FeatureExtractionRunner),
        (ObjectDetectionPipelineSchema, ObjectDetectionRunner),
        (RegexPipelineSchema, RegexRunner),
        (LLMPipelineSchema, LLMRunner),
    )
    for schema_cls, runner_cls in dispatch:
        if isinstance(schema, schema_cls):
            return runner_cls(schema, detector_key, detector_name)
    # GLiNER2PipelineSchema is the default / backward-compat path
    return GLiNER2Runner(schema, detector_key, detector_name)