classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""Broken links detector for URL reachability and empty responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
|
|
12
|
+
from ...models.generated_detectors import BrokenLinksDetectorConfig, DetectorConfig, Severity
|
|
13
|
+
from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType
|
|
14
|
+
from ...utils.hashing import normalize_http_url
|
|
15
|
+
from ..base import BaseDetector
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class LinkScanResult:
    """Outcome of probing a single URL that was judged broken or empty.

    Instances are produced by the link-scanning helpers of
    ``BrokenLinksDetector`` and later converted into ``DetectionResult``
    objects by its ``detect`` method.
    """

    # Normalized URL that was probed.
    url: str
    # 1-based line number of the URL inside the scanned content.
    line: int
    # Character offsets of the URL inside the scanned content.
    start: int
    end: int
    # Kind of problem found; the scanner emits "unreachable" or "empty_content".
    finding_type: str
    # Confidence score attached to the finding (the scanner uses 0.9 / 0.95).
    confidence: float
    # Diagnostic details such as "status_code", "reason" and "error".
    metadata: dict[str, object]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BrokenLinksDetector(BaseDetector):
    """
    Detector for broken links and empty link targets.

    Input content is expected to be newline-delimited URLs
    (one URL per line), passed with `application/x.asset-links`.

    Each distinct URL is probed with a HEAD request, falling back to a
    streamed GET when HEAD is unsupported or carries no Content-Length.
    A URL yields a finding when it answers with an HTTP status >= 400,
    when the request raises, or when the response body is empty.
    """

    detector_type = "broken_links"
    detector_name = "broken_links"

    # Per-request timeout handed to ``requests``.
    _REQUEST_TIMEOUT_SECONDS = 8
    # Upper bound on the number of URLs probed at the same time.
    _MAX_CONCURRENCY = 12
    _USER_AGENT = "classifyre-broken-links-detector/1.0"

    def __init__(self, config: DetectorConfig | None = None) -> None:
        """Store the typed config and prepare a shared HTTP session.

        Args:
            config: Detector configuration. Anything that is not a
                ``BrokenLinksDetectorConfig`` is replaced by a default one.
        """
        super().__init__(config)
        self._cfg: BrokenLinksDetectorConfig = (
            config if isinstance(config, BrokenLinksDetectorConfig) else BrokenLinksDetectorConfig()
        )
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": self._USER_AGENT})

    async def detect(
        self, content: str | bytes, content_type: str = "application/x.asset-links"
    ) -> list[DetectionResult]:
        """Probe every URL listed in *content* and report the broken ones.

        Args:
            content: Newline-delimited URLs. ``bytes`` input is ignored.
            content_type: Must be one of ``get_supported_content_types()``;
                any other value yields no findings.

        Returns:
            One ``DetectionResult`` per problematic URL, truncated to the
            configured ``max_findings`` when that is a positive integer.
        """
        if isinstance(content, bytes):
            return []
        if content_type not in self.get_supported_content_types():
            return []

        links = self._extract_links(content)
        if not links:
            return []

        semaphore = asyncio.Semaphore(self._MAX_CONCURRENCY)

        async def check_link(url: str, line: int, start: int, end: int) -> LinkScanResult | None:
            # The HTTP client is blocking, so each probe runs in a worker thread;
            # the semaphore caps how many probes are in flight at once.
            async with semaphore:
                return await asyncio.to_thread(
                    self._scan_link,
                    url,
                    line,
                    start,
                    end,
                )

        tasks = [
            check_link(url=url, line=line, start=start, end=end) for url, line, start, end in links
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        findings: list[DetectionResult] = []
        for result in results:
            # With return_exceptions=True a cancelled child is reported as
            # CancelledError, which derives from BaseException rather than
            # Exception, so the broader check is required to skip it safely.
            if isinstance(result, BaseException):
                logger.debug("Broken links detector task failed: %s", result)
                continue
            if result is None:
                continue

            findings.append(
                DetectionResult(
                    detector_type=DetectorType.BROKEN_LINKS,
                    finding_type=result.finding_type,
                    category="link_integrity",
                    severity=Severity.low,
                    confidence=result.confidence,
                    matched_content=result.url,
                    location=None,
                    metadata=result.metadata,
                )
            )

        # Only a positive integer limit truncates; a negative value would
        # otherwise slice from the end and silently drop findings.
        max_findings = self._cfg.max_findings
        if isinstance(max_findings, int) and max_findings > 0:
            findings = findings[:max_findings]

        return findings

    def get_supported_content_types(self) -> list[str]:
        """Return the content types this detector accepts."""
        return ["application/x.asset-links"]

    def _extract_links(self, content: str) -> list[tuple[str, int, int, int]]:
        """Collect the distinct, normalizable URLs found in *content*.

        Returns:
            ``(normalized_url, line_number, start, end)`` tuples where
            ``line_number`` is 1-based and ``start``/``end`` are character
            offsets of the stripped line text within *content*. Blank lines,
            lines that ``normalize_http_url`` rejects, and repeated URLs are
            skipped.
        """
        links: list[tuple[str, int, int, int]] = []
        seen: set[str] = set()
        offset = 0
        # keepends=True keeps the real terminator length, so offsets stay
        # correct for "\r\n" input instead of assuming one character per break.
        for line_number, raw_line in enumerate(content.splitlines(keepends=True), start=1):
            line_start = offset
            offset += len(raw_line)

            line = raw_line.strip()
            if not line:
                continue

            normalized = normalize_http_url(line)
            if not normalized or normalized in seen:
                continue

            seen.add(normalized)
            # ``line`` is a non-empty stripped slice of ``raw_line``, so find()
            # returns the width of the leading whitespace.
            start = line_start + raw_line.find(line)
            links.append((normalized, line_number, start, start + len(line)))

        return links

    def _scan_link(
        self,
        url: str,
        line: int,
        start: int,
        end: int,
    ) -> LinkScanResult | None:
        """Probe *url* with HEAD, deferring to GET when HEAD is inconclusive.

        Returns:
            A ``LinkScanResult`` describing the problem, or ``None`` when the
            link responded successfully with a non-zero Content-Length.
        """
        head_response: requests.Response | None = None
        try:
            head_response = self._session.head(
                url,
                allow_redirects=True,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            status_code = head_response.status_code

            # 405 / 501: the server does not accept HEAD, so retry with GET.
            if status_code in {405, 501}:
                return self._scan_with_get(url, line, start, end, "head_not_supported")

            if status_code >= 400:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="unreachable",
                    confidence=0.95,
                    metadata={"status_code": status_code, "reason": "http_error"},
                )

            content_length = self._parse_content_length(head_response.headers)
            if content_length == 0:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": "empty_head_content_length"},
                )

            # Some servers omit Content-Length, so perform a lightweight GET check.
            if content_length is None:
                return self._scan_with_get(url, line, start, end, "missing_content_length")

            return None
        except requests.RequestException as exc:
            return LinkScanResult(
                url=url,
                line=line,
                start=start,
                end=end,
                finding_type="unreachable",
                confidence=0.95,
                metadata={"reason": "request_exception", "error": str(exc)},
            )
        finally:
            if head_response is not None:
                head_response.close()

    def _scan_with_get(
        self,
        url: str,
        line: int,
        start: int,
        end: int,
        reason: str,
    ) -> LinkScanResult | None:
        """Probe *url* with a streamed GET and check that it has a body.

        Args:
            reason: Why the GET fallback was needed; recorded in the metadata
                of most findings produced here.

        Returns:
            A ``LinkScanResult`` for an HTTP error, a request failure or an
            empty body; ``None`` when at least one body byte was received.
        """
        get_response: requests.Response | None = None
        try:
            # stream=True avoids downloading the whole body up front.
            get_response = self._session.get(
                url,
                allow_redirects=True,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
                stream=True,
            )
            status_code = get_response.status_code
            if status_code >= 400:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="unreachable",
                    confidence=0.95,
                    metadata={"status_code": status_code, "reason": reason},
                )

            content_length = self._parse_content_length(get_response.headers)
            if content_length == 0:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": reason},
                )

            # Reading a single non-empty chunk is enough to prove a payload exists.
            has_payload = self._response_has_payload(get_response.iter_content(chunk_size=1))
            if not has_payload:
                return LinkScanResult(
                    url=url,
                    line=line,
                    start=start,
                    end=end,
                    finding_type="empty_content",
                    confidence=0.9,
                    metadata={"status_code": status_code, "reason": "empty_body"},
                )
            return None
        except requests.RequestException as exc:
            return LinkScanResult(
                url=url,
                line=line,
                start=start,
                end=end,
                finding_type="unreachable",
                confidence=0.95,
                metadata={"reason": reason, "error": str(exc)},
            )
        finally:
            if get_response is not None:
                get_response.close()

    def _parse_content_length(self, headers: dict[str, object]) -> int | None:
        """Return the Content-Length header as an int, or ``None`` if absent or unparsable."""
        raw_length = headers.get("Content-Length")
        if raw_length is None:
            return None
        try:
            return int(str(raw_length))
        except (TypeError, ValueError):
            return None

    def _response_has_payload(self, chunks: Iterable[bytes]) -> bool:
        """Return ``True`` as soon as *chunks* yields a non-empty chunk."""
        return any(chunks)
|
src/detectors/config.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Shared detector config resolution and type mapping."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ..models.generated_detectors import (
|
|
8
|
+
BrokenLinksDetectorConfig,
|
|
9
|
+
CustomDetectorConfig,
|
|
10
|
+
DetectorConfig,
|
|
11
|
+
PIIDetectorConfig,
|
|
12
|
+
SecretsDetectorConfig,
|
|
13
|
+
ThreatDetectorConfig,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
type DetectorTypedConfig = (
|
|
17
|
+
DetectorConfig
|
|
18
|
+
| CustomDetectorConfig
|
|
19
|
+
| SecretsDetectorConfig
|
|
20
|
+
| PIIDetectorConfig
|
|
21
|
+
| ThreatDetectorConfig
|
|
22
|
+
| BrokenLinksDetectorConfig
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
_DETECTOR_NAME_BY_TYPE: dict[str, str] = {
|
|
26
|
+
"SECRETS": "secrets",
|
|
27
|
+
"PII": "pii",
|
|
28
|
+
"YARA": "yara",
|
|
29
|
+
"BROKEN_LINKS": "broken_links",
|
|
30
|
+
"CODE_SECURITY": "code_security",
|
|
31
|
+
"CUSTOM": "custom",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
_DETECTOR_CONFIG_BY_TYPE: dict[str, type[DetectorConfig]] = {
|
|
35
|
+
"SECRETS": SecretsDetectorConfig,
|
|
36
|
+
"PII": PIIDetectorConfig,
|
|
37
|
+
"YARA": ThreatDetectorConfig,
|
|
38
|
+
"BROKEN_LINKS": BrokenLinksDetectorConfig,
|
|
39
|
+
"CUSTOM": CustomDetectorConfig,
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def normalize_detector_type(detector_type: str) -> str:
    """Return *detector_type* without surrounding whitespace and in upper case."""
    trimmed = detector_type.strip()
    return trimmed.upper()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_detector_name(detector_type: str) -> str:
    """Return the detector name registered for *detector_type*.

    The type is normalized first; a type with no registered name maps to its
    normalized form in lower case.
    """
    type_key = normalize_detector_type(detector_type)
    if type_key in _DETECTOR_NAME_BY_TYPE:
        return _DETECTOR_NAME_BY_TYPE[type_key]
    return type_key.lower()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def parse_detector_config(detector_type: str, raw_config: Any) -> tuple[str, DetectorTypedConfig]:
    """Resolve the detector name and validated config for *detector_type*.

    Args:
        detector_type: Detector type in any letter case, optionally padded
            with whitespace.
        raw_config: Raw settings. Anything that is not a ``dict`` is replaced
            by an empty dict before validation.

    Returns:
        ``(detector_name, typed_config)`` where ``typed_config`` is produced by
        ``model_validate`` on the config class registered for the type, or on
        ``DetectorConfig`` when the type has no dedicated class.
    """
    type_key = normalize_detector_type(detector_type)
    name = get_detector_name(type_key)
    model_cls = _DETECTOR_CONFIG_BY_TYPE.get(type_key, DetectorConfig)
    payload = raw_config if isinstance(raw_config, dict) else {}
    return name, model_cls.model_validate(payload)
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Custom detector implementations."""
|
|
2
|
+
|
|
3
|
+
from .detector import CustomDetector
|
|
4
|
+
from .runners import BaseRunner, GLiNER2Runner, LLMRunner, RegexRunner, create_runner
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"BaseRunner",
|
|
8
|
+
"CustomDetector",
|
|
9
|
+
"GLiNER2Runner",
|
|
10
|
+
"LLMRunner",
|
|
11
|
+
"RegexRunner",
|
|
12
|
+
"create_runner",
|
|
13
|
+
]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Custom detector — delegates to the appropriate runner via the runner factory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ...models.generated_detectors import (
|
|
8
|
+
CustomDetectorConfig,
|
|
9
|
+
DetectorConfig,
|
|
10
|
+
)
|
|
11
|
+
from ...models.generated_single_asset_scan_results import DetectionResult
|
|
12
|
+
from ..base import BaseDetector
|
|
13
|
+
from .runners import BaseRunner, create_runner
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CustomDetector(BaseDetector):
    """Schema-driven detector that hands all work to a runner built from its pipeline schema."""

    detector_type = "custom"
    detector_name = "custom"

    def __init__(self, config: DetectorConfig | None = None):
        """Validate the config type and build the matching runner.

        Raises:
            ValueError: If the stored config is not a ``CustomDetectorConfig``.
        """
        super().__init__(config)
        if not isinstance(self.config, CustomDetectorConfig):
            raise ValueError("CustomDetector requires CustomDetectorConfig with pipeline_schema")
        self.custom_config: CustomDetectorConfig = self.config
        cfg = self.custom_config
        self._runner: BaseRunner = create_runner(
            cfg.pipeline_schema,
            detector_key=cfg.custom_detector_key,
            detector_name=cfg.name,
        )

    async def detect(
        self, content: str | bytes, content_type: str = "text/plain"
    ) -> list[DetectionResult]:
        """Run the underlying runner and cap the findings at ``max_findings``.

        The cap applies only when ``max_findings`` is a positive integer.
        """
        findings = self._runner.detect(content, content_type)
        limit = self.custom_config.max_findings
        if not isinstance(limit, int) or limit <= 0:
            return findings
        return findings[:limit]

    def get_supported_content_types(self) -> list[str]:
        """Return the content types supported by the underlying runner."""
        return self._runner.get_supported_content_types()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Custom detector runner package.
|
|
2
|
+
|
|
3
|
+
Public surface re-exported here so that existing imports of the form
|
|
4
|
+
from .runners import BaseRunner, create_runner, GLiNER2Runner, ...
|
|
5
|
+
continue to work unchanged.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ._base import (
|
|
9
|
+
_DEFAULT_GLINER2_MODEL,
|
|
10
|
+
_DEFAULT_IMAGE_CLASSIFICATION_MODEL,
|
|
11
|
+
_IMAGE_CONTENT_TYPES,
|
|
12
|
+
_TEXT_CONTENT_TYPES,
|
|
13
|
+
BaseRunner,
|
|
14
|
+
_resolve_pipeline_severity,
|
|
15
|
+
)
|
|
16
|
+
from ._factory import create_runner
|
|
17
|
+
from ._feature_extraction import FeatureExtractionRunner, _chunk_text_with_offsets, _pool_hidden
|
|
18
|
+
from ._gliner2 import (
|
|
19
|
+
GLiNER2Runner,
|
|
20
|
+
_apply_classification_validation,
|
|
21
|
+
_apply_entity_validation,
|
|
22
|
+
_normalise_classification_output,
|
|
23
|
+
_normalise_entity_output,
|
|
24
|
+
_normalise_span,
|
|
25
|
+
)
|
|
26
|
+
from ._image_classification import ImageClassificationRunner
|
|
27
|
+
from ._llm import LLMRunner
|
|
28
|
+
from ._object_detection import ObjectDetectionRunner
|
|
29
|
+
from ._regex import RegexRunner, _load_regex_engine
|
|
30
|
+
from ._text_classification import TextClassificationRunner, _chunk_text
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"_DEFAULT_GLINER2_MODEL",
|
|
34
|
+
"_DEFAULT_IMAGE_CLASSIFICATION_MODEL",
|
|
35
|
+
"_IMAGE_CONTENT_TYPES",
|
|
36
|
+
"_TEXT_CONTENT_TYPES",
|
|
37
|
+
"BaseRunner",
|
|
38
|
+
"FeatureExtractionRunner",
|
|
39
|
+
"GLiNER2Runner",
|
|
40
|
+
"ImageClassificationRunner",
|
|
41
|
+
"LLMRunner",
|
|
42
|
+
"ObjectDetectionRunner",
|
|
43
|
+
"RegexRunner",
|
|
44
|
+
"TextClassificationRunner",
|
|
45
|
+
"_apply_classification_validation",
|
|
46
|
+
"_apply_entity_validation",
|
|
47
|
+
"_chunk_text",
|
|
48
|
+
"_chunk_text_with_offsets",
|
|
49
|
+
"_load_regex_engine",
|
|
50
|
+
"_normalise_classification_output",
|
|
51
|
+
"_normalise_entity_output",
|
|
52
|
+
"_normalise_span",
|
|
53
|
+
"_pool_hidden",
|
|
54
|
+
"_resolve_pipeline_severity",
|
|
55
|
+
"create_runner",
|
|
56
|
+
]
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Base runner interface and shared utilities for all pipeline execution strategies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ....models.generated_detectors import (
|
|
11
|
+
PipelineResult,
|
|
12
|
+
PipelineSeverityRule,
|
|
13
|
+
Severity,
|
|
14
|
+
)
|
|
15
|
+
from ....models.generated_single_asset_scan_results import (
|
|
16
|
+
DetectionResult,
|
|
17
|
+
DetectorType,
|
|
18
|
+
Location,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Default model identifiers for the GLiNER2 and image-classification runners.
_DEFAULT_GLINER2_MODEL = "fastino/gliner2-base-v1"
_DEFAULT_IMAGE_CLASSIFICATION_MODEL = "google/vit-base-patch16-224"

# Content types reported by BaseRunner.get_supported_content_types().
_TEXT_CONTENT_TYPES = [
    "text/plain",
    "text/html",
    "text/markdown",
    "application/json",
    "application/xml",
    "text/xml",
]

# Image content types shared by the runner modules.
_IMAGE_CONTENT_TYPES = [
    "image/jpeg",
    "image/jpg",
    "image/png",
    "image/gif",
    "image/webp",
    "image/bmp",
    "image/tiff",
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _resolve_pipeline_severity(
    label: str,
    severity_map: list[PipelineSeverityRule] | None,
    default: Severity = Severity.info,
) -> Severity:
    """Pick the severity of the first rule whose pattern matches *label*.

    Args:
        label: Label to classify; compared case-insensitively.
        severity_map: Ordered rules, each with a ``pattern`` and a
            ``severity``. ``None`` or an empty list yields *default*.
        default: Severity returned when no rule matches.

    Returns:
        The severity of the first matching rule, otherwise *default*.
    """
    if not severity_map:
        return default

    needle = label.lower()
    for rule in severity_map:
        try:
            matched = re.search(rule.pattern, needle, re.IGNORECASE) is not None
        except re.error:
            # The pattern is not a valid regex: fall back to a plain,
            # case-insensitive substring test.
            matched = rule.pattern.lower() in needle
        if matched:
            return rule.severity
    return default
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class BaseRunner(ABC):
|
|
63
|
+
"""Common interface for all pipeline execution strategies."""
|
|
64
|
+
|
|
65
|
+
_detector_key: str = ""
|
|
66
|
+
_detector_name: str = ""
|
|
67
|
+
|
|
68
|
+
@abstractmethod
|
|
69
|
+
def run(self, text: str) -> PipelineResult:
|
|
70
|
+
"""Execute the pipeline on *text* and return a normalised PipelineResult."""
|
|
71
|
+
...
|
|
72
|
+
|
|
73
|
+
def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
|
|
74
|
+
"""Run detection and return findings. Default: text-only path via run()."""
|
|
75
|
+
if isinstance(content, bytes):
|
|
76
|
+
return []
|
|
77
|
+
text = content.strip()
|
|
78
|
+
if not text:
|
|
79
|
+
return []
|
|
80
|
+
result = self.run(text)
|
|
81
|
+
return self._result_to_findings(text, result)
|
|
82
|
+
|
|
83
|
+
def get_supported_content_types(self) -> list[str]:
|
|
84
|
+
return list(_TEXT_CONTENT_TYPES)
|
|
85
|
+
|
|
86
|
+
def _make_result(
|
|
87
|
+
self,
|
|
88
|
+
*,
|
|
89
|
+
finding_type: str,
|
|
90
|
+
category: str,
|
|
91
|
+
severity: Severity,
|
|
92
|
+
confidence: float,
|
|
93
|
+
matched_content: str,
|
|
94
|
+
location: Location | None,
|
|
95
|
+
metadata: dict[str, Any],
|
|
96
|
+
) -> DetectionResult:
|
|
97
|
+
return DetectionResult(
|
|
98
|
+
detector_type=DetectorType.CUSTOM,
|
|
99
|
+
finding_type=finding_type,
|
|
100
|
+
category=category,
|
|
101
|
+
severity=severity,
|
|
102
|
+
confidence=confidence,
|
|
103
|
+
matched_content=matched_content,
|
|
104
|
+
location=location,
|
|
105
|
+
custom_detector_key=self._detector_key,
|
|
106
|
+
custom_detector_name=self._detector_name,
|
|
107
|
+
detected_at=datetime.now(UTC),
|
|
108
|
+
metadata=metadata,
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
def _result_to_findings(self, text: str, result: PipelineResult) -> list[DetectionResult]:
|
|
112
|
+
findings: list[DetectionResult] = []
|
|
113
|
+
runner_type = result.metadata.get("runner", "GLINER2")
|
|
114
|
+
|
|
115
|
+
for label, spans in result.entities.items():
|
|
116
|
+
for span in spans:
|
|
117
|
+
confidence = float(span.get("confidence", 0.0))
|
|
118
|
+
value = str(span.get("value", ""))
|
|
119
|
+
start = span.get("start")
|
|
120
|
+
end = span.get("end")
|
|
121
|
+
|
|
122
|
+
loc: Location | None = None
|
|
123
|
+
if isinstance(start, int) and isinstance(end, int):
|
|
124
|
+
loc = Location(start=start, end=end, path=f"{runner_type.lower()}-entity")
|
|
125
|
+
|
|
126
|
+
finding_type = f"regex:{label}" if runner_type == "REGEX" else f"entity:{label}"
|
|
127
|
+
|
|
128
|
+
span_severity = span.get("severity")
|
|
129
|
+
if isinstance(span_severity, str) and span_severity in Severity.__members__:
|
|
130
|
+
sev = Severity(span_severity)
|
|
131
|
+
else:
|
|
132
|
+
sev = Severity.medium if confidence < 0.9 else Severity.high
|
|
133
|
+
|
|
134
|
+
meta: dict[str, Any] = {
|
|
135
|
+
"runner": runner_type,
|
|
136
|
+
"entity_label": label,
|
|
137
|
+
"pipeline_result": result.model_dump(),
|
|
138
|
+
}
|
|
139
|
+
if "groups" in span:
|
|
140
|
+
meta["capture_groups"] = span["groups"]
|
|
141
|
+
|
|
142
|
+
findings.append(
|
|
143
|
+
self._make_result(
|
|
144
|
+
finding_type=finding_type,
|
|
145
|
+
category="CLASSIFICATION",
|
|
146
|
+
severity=sev,
|
|
147
|
+
confidence=min(0.99, confidence),
|
|
148
|
+
matched_content=value,
|
|
149
|
+
location=loc,
|
|
150
|
+
metadata=meta,
|
|
151
|
+
)
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
for task, outcome in result.classification.items():
|
|
155
|
+
label = str(outcome.get("label", ""))
|
|
156
|
+
confidence = float(outcome.get("confidence", 0.0))
|
|
157
|
+
if not label:
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
findings.append(
|
|
161
|
+
self._make_result(
|
|
162
|
+
finding_type=f"classification:{task}:{label}",
|
|
163
|
+
category="CLASSIFICATION",
|
|
164
|
+
severity=Severity.medium if confidence < 0.9 else Severity.high,
|
|
165
|
+
confidence=min(0.99, confidence),
|
|
166
|
+
matched_content=text[:320],
|
|
167
|
+
location=None,
|
|
168
|
+
metadata={
|
|
169
|
+
"runner": runner_type,
|
|
170
|
+
"task": task,
|
|
171
|
+
"label": label,
|
|
172
|
+
"pipeline_result": result.model_dump(),
|
|
173
|
+
},
|
|
174
|
+
)
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
return findings
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Factory function for creating runner instances from pipeline schemas."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ....models.generated_detectors import (
|
|
6
|
+
FeatureExtractionPipelineSchema,
|
|
7
|
+
GLiNER2PipelineSchema,
|
|
8
|
+
ImageClassificationPipelineSchema,
|
|
9
|
+
LLMPipelineSchema,
|
|
10
|
+
ObjectDetectionPipelineSchema,
|
|
11
|
+
RegexPipelineSchema,
|
|
12
|
+
TextClassificationPipelineSchema,
|
|
13
|
+
)
|
|
14
|
+
from ._base import BaseRunner
|
|
15
|
+
from ._feature_extraction import FeatureExtractionRunner
|
|
16
|
+
from ._gliner2 import GLiNER2Runner
|
|
17
|
+
from ._image_classification import ImageClassificationRunner
|
|
18
|
+
from ._llm import LLMRunner
|
|
19
|
+
from ._object_detection import ObjectDetectionRunner
|
|
20
|
+
from ._regex import RegexRunner
|
|
21
|
+
from ._text_classification import TextClassificationRunner
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create_runner(
    schema: (
        GLiNER2PipelineSchema
        | RegexPipelineSchema
        | LLMPipelineSchema
        | TextClassificationPipelineSchema
        | ImageClassificationPipelineSchema
        | FeatureExtractionPipelineSchema
        | ObjectDetectionPipelineSchema
    ),
    detector_key: str = "",
    detector_name: str = "",
) -> BaseRunner:
    """Instantiate the runner that corresponds to the concrete type of *schema*.

    Args:
        schema: Pipeline schema whose class selects the runner.
        detector_key: Passed through to the runner constructor.
        detector_name: Passed through to the runner constructor.

    Returns:
        A runner built as ``Runner(schema, detector_key, detector_name)``.
        Schemas matching none of the specialised classes get a ``GLiNER2Runner``.
    """
    # Evaluated top to bottom; the first schema class that matches wins.
    dispatch = (
        (TextClassificationPipelineSchema, TextClassificationRunner),
        (ImageClassificationPipelineSchema, ImageClassificationRunner),
        (FeatureExtractionPipelineSchema, FeatureExtractionRunner),
        (ObjectDetectionPipelineSchema, ObjectDetectionRunner),
        (RegexPipelineSchema, RegexRunner),
        (LLMPipelineSchema, LLMRunner),
    )
    for schema_cls, runner_cls in dispatch:
        if isinstance(schema, schema_cls):
            return runner_cls(schema, detector_key, detector_name)

    # GLiNER2PipelineSchema is the default / backward-compat path
    return GLiNER2Runner(schema, detector_key, detector_name)
|