classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,742 @@
|
|
|
1
|
+
"""Pipeline for running detectors on extracted assets."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator, Awaitable, Callable
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ..detectors.base import BaseDetector
|
|
10
|
+
from ..models.generated_single_asset_scan_results import (
|
|
11
|
+
AssetType as OutputAssetType,
|
|
12
|
+
)
|
|
13
|
+
from ..models.generated_single_asset_scan_results import (
|
|
14
|
+
DetectionResult,
|
|
15
|
+
DetectorType,
|
|
16
|
+
ScanStats,
|
|
17
|
+
SingleAssetScanResults,
|
|
18
|
+
)
|
|
19
|
+
from ..sources.base import BaseSource
|
|
20
|
+
from ..utils.file_parser import resolve_mime_type
|
|
21
|
+
from .content_provider import ContentProvider
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DetectorPipeline:
    """
    Pipeline for running detectors on extracted assets.

    Adds detector findings to assets (CoreOutput schema).

    For every asset the configured detectors are grouped by the payload they
    can consume (extracted text pages, raw bytes, or the asset's link list),
    each group is run, and the combined findings plus a ``ScanStats`` summary
    are written back onto the asset.
    """
|
|
32
|
+
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
detectors: list[BaseDetector],
|
|
36
|
+
source: BaseSource,
|
|
37
|
+
runner_id: str,
|
|
38
|
+
content_size_limit: int = 1_048_576, # 1MB default
|
|
39
|
+
max_concurrent_assets: int = 10,
|
|
40
|
+
content_provider: ContentProvider | None = None,
|
|
41
|
+
):
|
|
42
|
+
"""
|
|
43
|
+
Initialize detector pipeline.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
detectors: List of detector instances to run
|
|
47
|
+
source: Source instance for fetching content
|
|
48
|
+
runner_id: ID of the runner executing this pipeline
|
|
49
|
+
content_size_limit: Maximum content size in bytes
|
|
50
|
+
max_concurrent_assets: Max assets to process in parallel within a batch
|
|
51
|
+
content_provider: Optional provider — if None, source is used directly
|
|
52
|
+
"""
|
|
53
|
+
self.detectors = detectors
|
|
54
|
+
self.source = source
|
|
55
|
+
self.runner_id = runner_id
|
|
56
|
+
self.content_size_limit = content_size_limit
|
|
57
|
+
self.max_concurrent_assets = max_concurrent_assets
|
|
58
|
+
self._detector_semaphore = asyncio.Semaphore(max_concurrent_assets)
|
|
59
|
+
if content_provider is not None:
|
|
60
|
+
self.content_provider: ContentProvider = content_provider
|
|
61
|
+
else:
|
|
62
|
+
from .parsed_content_provider import ParsedContentProvider
|
|
63
|
+
|
|
64
|
+
self.content_provider = ParsedContentProvider(source)
|
|
65
|
+
self.init_warnings: list[str] = []
|
|
66
|
+
|
|
67
|
+
async def process(self, assets: list[SingleAssetScanResults]) -> list[SingleAssetScanResults]:
|
|
68
|
+
"""Process assets through detector pipeline, returning all results at once."""
|
|
69
|
+
results: list[SingleAssetScanResults] = []
|
|
70
|
+
async for asset in self.process_stream(assets):
|
|
71
|
+
results.append(asset)
|
|
72
|
+
return results
|
|
73
|
+
|
|
74
|
+
async def process_stream(
|
|
75
|
+
self, assets: list[SingleAssetScanResults]
|
|
76
|
+
) -> AsyncGenerator[SingleAssetScanResults, None]:
|
|
77
|
+
"""Process assets concurrently, yielding in completion order.
|
|
78
|
+
|
|
79
|
+
Total concurrent detector invocations across all assets and pages
|
|
80
|
+
are bounded by ``self._detector_semaphore``.
|
|
81
|
+
"""
|
|
82
|
+
tasks = {asyncio.create_task(self.process_single_asset(a)) for a in assets}
|
|
83
|
+
for coro in asyncio.as_completed(tasks):
|
|
84
|
+
yield await coro
|
|
85
|
+
|
|
86
|
+
async def process_single_asset(
|
|
87
|
+
self,
|
|
88
|
+
asset: SingleAssetScanResults,
|
|
89
|
+
*,
|
|
90
|
+
on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]] | None = None,
|
|
91
|
+
findings_flush_size: int = 50,
|
|
92
|
+
) -> SingleAssetScanResults:
|
|
93
|
+
"""Process a single asset through detectors.
|
|
94
|
+
|
|
95
|
+
When *on_findings_flushed* is provided the text-detector phase switches to
|
|
96
|
+
sequential page processing and calls the callback every *findings_flush_size*
|
|
97
|
+
new findings so callers can push partial results without waiting for the full
|
|
98
|
+
asset (important for ALL-strategy tabular sources with thousands of pages).
|
|
99
|
+
"""
|
|
100
|
+
# 1. If no detectors, return asset as-is with empty findings
|
|
101
|
+
if not self.detectors:
|
|
102
|
+
asset.findings = []
|
|
103
|
+
return asset
|
|
104
|
+
|
|
105
|
+
# Record scan start time
|
|
106
|
+
scan_started = datetime.now(UTC)
|
|
107
|
+
ocr_enabled = self.source.ocr_enabled()
|
|
108
|
+
text_content_type = self._text_content_type_for_asset(asset.asset_type, ocr_enabled)
|
|
109
|
+
link_content = self._build_links_payload(asset.links)
|
|
110
|
+
|
|
111
|
+
text_detectors = []
|
|
112
|
+
if text_content_type:
|
|
113
|
+
text_detectors = [
|
|
114
|
+
detector
|
|
115
|
+
for detector in self.detectors
|
|
116
|
+
if self._supports_content_type(
|
|
117
|
+
detector.get_supported_content_types(),
|
|
118
|
+
text_content_type,
|
|
119
|
+
)
|
|
120
|
+
]
|
|
121
|
+
asset_has_binary_primary = self._asset_has_binary_primary_payload(asset.asset_type)
|
|
122
|
+
binary_detectors = [
|
|
123
|
+
detector
|
|
124
|
+
for detector in self.detectors
|
|
125
|
+
if self._is_binary_detector(detector)
|
|
126
|
+
and (
|
|
127
|
+
asset_has_binary_primary
|
|
128
|
+
or not text_content_type
|
|
129
|
+
or not self._supports_content_type(
|
|
130
|
+
detector.get_supported_content_types(),
|
|
131
|
+
text_content_type,
|
|
132
|
+
)
|
|
133
|
+
)
|
|
134
|
+
]
|
|
135
|
+
link_detectors = [
|
|
136
|
+
detector
|
|
137
|
+
for detector in self.detectors
|
|
138
|
+
if link_content
|
|
139
|
+
and self._supports_content_type(
|
|
140
|
+
detector.get_supported_content_types(),
|
|
141
|
+
"application/x.asset-links",
|
|
142
|
+
)
|
|
143
|
+
]
|
|
144
|
+
should_warn_on_empty_text = asset.asset_type in {
|
|
145
|
+
OutputAssetType.TXT,
|
|
146
|
+
OutputAssetType.TABLE,
|
|
147
|
+
OutputAssetType.URL,
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
all_active = text_detectors + binary_detectors + link_detectors
|
|
151
|
+
detector_names = [self._detector_log_label(d) for d in all_active]
|
|
152
|
+
logger.info("Scanning %s [%s]", asset.name, ", ".join(detector_names))
|
|
153
|
+
|
|
154
|
+
findings: list[DetectionResult] = []
|
|
155
|
+
detector_types_run: list[DetectorType] = []
|
|
156
|
+
scan_warnings: list[str] = list(self.init_warnings)
|
|
157
|
+
scan_errors: list[str] = []
|
|
158
|
+
|
|
159
|
+
if text_detectors:
|
|
160
|
+
(
|
|
161
|
+
text_findings,
|
|
162
|
+
text_detector_types_run,
|
|
163
|
+
content_size,
|
|
164
|
+
text_warnings,
|
|
165
|
+
text_errors,
|
|
166
|
+
) = await self._run_text_detectors_for_asset(
|
|
167
|
+
asset=asset,
|
|
168
|
+
text_content_type=text_content_type,
|
|
169
|
+
detectors=text_detectors,
|
|
170
|
+
warn_on_empty_content=should_warn_on_empty_text,
|
|
171
|
+
on_findings_flushed=on_findings_flushed,
|
|
172
|
+
findings_flush_size=findings_flush_size,
|
|
173
|
+
)
|
|
174
|
+
findings.extend(text_findings)
|
|
175
|
+
scan_warnings.extend(text_warnings)
|
|
176
|
+
scan_errors.extend(text_errors)
|
|
177
|
+
detector_types_run = self._merge_detector_types(
|
|
178
|
+
detector_types_run,
|
|
179
|
+
text_detector_types_run,
|
|
180
|
+
)
|
|
181
|
+
else:
|
|
182
|
+
content_size = 0
|
|
183
|
+
|
|
184
|
+
if binary_detectors:
|
|
185
|
+
(
|
|
186
|
+
binary_findings,
|
|
187
|
+
binary_detector_types_run,
|
|
188
|
+
bin_warnings,
|
|
189
|
+
bin_errors,
|
|
190
|
+
) = await self._run_binary_detectors_for_asset(
|
|
191
|
+
asset=asset,
|
|
192
|
+
detectors=binary_detectors,
|
|
193
|
+
)
|
|
194
|
+
findings.extend(binary_findings)
|
|
195
|
+
scan_warnings.extend(bin_warnings)
|
|
196
|
+
scan_errors.extend(bin_errors)
|
|
197
|
+
detector_types_run = self._merge_detector_types(
|
|
198
|
+
detector_types_run,
|
|
199
|
+
binary_detector_types_run,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
if link_detectors:
|
|
203
|
+
link_findings, link_detector_types_run, link_errors = await self._run_detectors(
|
|
204
|
+
detectors=link_detectors,
|
|
205
|
+
content=link_content,
|
|
206
|
+
content_type="application/x.asset-links",
|
|
207
|
+
asset_name=asset.name,
|
|
208
|
+
)
|
|
209
|
+
findings.extend(link_findings)
|
|
210
|
+
scan_errors.extend(link_errors)
|
|
211
|
+
detector_types_run = self._merge_detector_types(
|
|
212
|
+
detector_types_run,
|
|
213
|
+
link_detector_types_run,
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
for finding in link_findings:
|
|
217
|
+
self.content_provider.enrich_finding_location(finding, asset, "")
|
|
218
|
+
|
|
219
|
+
scan_duration = int((datetime.now(UTC) - scan_started).total_seconds() * 1000)
|
|
220
|
+
|
|
221
|
+
asset.findings = findings
|
|
222
|
+
asset.scan_stats = ScanStats(
|
|
223
|
+
scanned_at=scan_started,
|
|
224
|
+
duration_ms=scan_duration,
|
|
225
|
+
detectors_run=detector_types_run,
|
|
226
|
+
content_size_bytes=content_size,
|
|
227
|
+
findings_count=len(findings),
|
|
228
|
+
warnings=scan_warnings or None,
|
|
229
|
+
errors=scan_errors or None,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
if findings:
|
|
233
|
+
logger.info(
|
|
234
|
+
"Scanned %s: %d finding(s) in %dms",
|
|
235
|
+
asset.name,
|
|
236
|
+
len(findings),
|
|
237
|
+
scan_duration,
|
|
238
|
+
)
|
|
239
|
+
else:
|
|
240
|
+
logger.info("Scanned %s: no findings (%dms)", asset.name, scan_duration)
|
|
241
|
+
|
|
242
|
+
return asset
|
|
243
|
+
|
|
244
|
+
async def _run_text_detectors_for_asset(
    self,
    *,
    asset: SingleAssetScanResults,
    text_content_type: str,
    detectors: list[BaseDetector],
    warn_on_empty_content: bool = True,
    on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]] | None = None,
    findings_flush_size: int = 50,
) -> tuple[list[DetectionResult], list[DetectorType], int, list[str], list[str]]:
    """Run text detectors over every text page of an asset.

    With *on_findings_flushed* set, the work is delegated to the sequential
    ``_run_text_detectors_streaming`` variant. Otherwise one task is spawned
    per page; each task runs the detectors while holding
    ``self._detector_semaphore``.

    Args:
        asset: Asset whose text pages are scanned.
        text_content_type: MIME type passed to the detectors for each page.
        detectors: Detectors to run on every page.
        warn_on_empty_content: Add a warning when no content was seen at all.
        on_findings_flushed: Optional callback; selects the streaming variant.
        findings_flush_size: Flush threshold forwarded to the streaming variant.

    Returns:
        ``(findings, detector_types_run, content_size, warnings, errors)``
        where ``content_size`` is the summed ``len()`` of all pages before
        truncation.
    """
    if on_findings_flushed is not None:
        return await self._run_text_detectors_streaming(
            asset=asset,
            text_content_type=text_content_type,
            detectors=detectors,
            warn_on_empty_content=warn_on_empty_content,
            on_findings_flushed=on_findings_flushed,
            findings_flush_size=findings_flush_size,
        )
    findings: list[DetectionResult] = []
    detector_types_run: list[DetectorType] = []
    warnings: list[str] = []
    errors: list[str] = []
    content_size = 0

    # Page tasks that have been started but whose results are not merged yet.
    pending_tasks: set[
        asyncio.Task[tuple[list[DetectionResult], list[DetectorType], list[str], str]]
    ] = set()

    async def _detect_page(
        page_content: str,
    ) -> tuple[list[DetectionResult], list[DetectorType], list[str], str]:
        # The semaphore is held only while detectors run on this page; the
        # page text is returned too so findings can be enriched later.
        async with self._detector_semaphore:
            page_findings, page_types, page_errors = await self._run_detectors(
                detectors=detectors,
                content=page_content,
                content_type=text_content_type,
                asset_name=asset.name,
            )
        return page_findings, page_types, page_errors, page_content

    def _collect_done() -> None:
        # Merge results of every finished task into the outer accumulators.
        # ``done`` is a set, so findings are merged in no particular page order.
        done = {t for t in pending_tasks if t.done()}
        for task in done:
            pending_tasks.discard(task)
            # task.result() re-raises if the page task itself failed.
            page_findings, page_types, page_errors, page_content = task.result()
            findings.extend(page_findings)
            errors.extend(page_errors)
            # Rebinding (not mutating) the outer list requires ``nonlocal``.
            nonlocal detector_types_run
            detector_types_run = self._merge_detector_types(
                detector_types_run,
                page_types,
            )
            for finding in page_findings:
                self.content_provider.enrich_finding_location(
                    finding,
                    asset,
                    page_content,
                )

    async for text_content in self._iter_text_content_pages(asset):
        content_size += len(text_content)

        detector_content = text_content
        if len(detector_content) > self.content_size_limit:
            # NOTE(review): the message says "bytes" but the length is taken
            # with len() on the page; if pages are str this counts characters.
            msg = (
                f"Content truncated from {len(detector_content)} to "
                f"{self.content_size_limit} bytes for {asset.name}"
            )
            logger.warning(msg)
            warnings.append(msg)
            detector_content = detector_content[: self.content_size_limit]

        # Can only be empty here if truncation cut the page down to nothing.
        if not detector_content:
            continue

        # NOTE(review): a task is created for every page with no cap on
        # len(pending_tasks); only detector execution is gated by the semaphore.
        task = asyncio.create_task(_detect_page(detector_content))
        pending_tasks.add(task)
        # Opportunistically merge whatever has finished so far.
        _collect_done()

    if pending_tasks:
        # Wait for the stragglers, then merge their results.
        await asyncio.gather(*pending_tasks)
        _collect_done()

    if content_size == 0 and warn_on_empty_content:
        msg = f"No content available for asset {asset.name}"
        logger.warning(msg)
        warnings.append(msg)

    return findings, detector_types_run, content_size, warnings, errors
|
|
334
|
+
|
|
335
|
+
async def _run_text_detectors_streaming(
|
|
336
|
+
self,
|
|
337
|
+
*,
|
|
338
|
+
asset: SingleAssetScanResults,
|
|
339
|
+
text_content_type: str,
|
|
340
|
+
detectors: list[BaseDetector],
|
|
341
|
+
warn_on_empty_content: bool = True,
|
|
342
|
+
on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]],
|
|
343
|
+
findings_flush_size: int = 50,
|
|
344
|
+
) -> tuple[list[DetectionResult], list[DetectorType], int, list[str], list[str]]:
|
|
345
|
+
"""Sequential variant: processes one page at a time and calls back every N findings."""
|
|
346
|
+
findings: list[DetectionResult] = []
|
|
347
|
+
detector_types_run: list[DetectorType] = []
|
|
348
|
+
warnings: list[str] = []
|
|
349
|
+
errors: list[str] = []
|
|
350
|
+
content_size = 0
|
|
351
|
+
unflushed_count = 0
|
|
352
|
+
|
|
353
|
+
async for text_content in self._iter_text_content_pages(asset):
|
|
354
|
+
content_size += len(text_content)
|
|
355
|
+
|
|
356
|
+
detector_content = text_content
|
|
357
|
+
if len(detector_content) > self.content_size_limit:
|
|
358
|
+
msg = (
|
|
359
|
+
f"Content truncated from {len(detector_content)} to "
|
|
360
|
+
f"{self.content_size_limit} bytes for {asset.name}"
|
|
361
|
+
)
|
|
362
|
+
logger.warning(msg)
|
|
363
|
+
warnings.append(msg)
|
|
364
|
+
detector_content = detector_content[: self.content_size_limit]
|
|
365
|
+
|
|
366
|
+
if not detector_content:
|
|
367
|
+
continue
|
|
368
|
+
|
|
369
|
+
async with self._detector_semaphore:
|
|
370
|
+
page_findings, page_types, page_errors = await self._run_detectors(
|
|
371
|
+
detectors=detectors,
|
|
372
|
+
content=detector_content,
|
|
373
|
+
content_type=text_content_type,
|
|
374
|
+
asset_name=asset.name,
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
for finding in page_findings:
|
|
378
|
+
self.content_provider.enrich_finding_location(finding, asset, detector_content)
|
|
379
|
+
|
|
380
|
+
findings.extend(page_findings)
|
|
381
|
+
errors.extend(page_errors)
|
|
382
|
+
detector_types_run = self._merge_detector_types(detector_types_run, page_types)
|
|
383
|
+
unflushed_count += len(page_findings)
|
|
384
|
+
|
|
385
|
+
if unflushed_count >= findings_flush_size and page_findings:
|
|
386
|
+
await on_findings_flushed(list(findings))
|
|
387
|
+
unflushed_count = 0
|
|
388
|
+
|
|
389
|
+
if content_size == 0 and warn_on_empty_content:
|
|
390
|
+
msg = f"No content available for asset {asset.name}"
|
|
391
|
+
logger.warning(msg)
|
|
392
|
+
warnings.append(msg)
|
|
393
|
+
|
|
394
|
+
return findings, detector_types_run, content_size, warnings, errors
|
|
395
|
+
|
|
396
|
+
async def _iter_text_content_pages(self, asset: SingleAssetScanResults):
|
|
397
|
+
candidate_ids: list[str] = []
|
|
398
|
+
|
|
399
|
+
for candidate in (asset.external_url, asset.hash):
|
|
400
|
+
value = str(candidate or "").strip()
|
|
401
|
+
if not value or value in candidate_ids:
|
|
402
|
+
continue
|
|
403
|
+
candidate_ids.append(value)
|
|
404
|
+
|
|
405
|
+
for candidate_id in candidate_ids:
|
|
406
|
+
saw_candidate_content = False
|
|
407
|
+
async for text_content in self.content_provider.fetch_text_pages(candidate_id):
|
|
408
|
+
if not text_content:
|
|
409
|
+
continue
|
|
410
|
+
saw_candidate_content = True
|
|
411
|
+
yield text_content
|
|
412
|
+
|
|
413
|
+
if saw_candidate_content:
|
|
414
|
+
return
|
|
415
|
+
|
|
416
|
+
async def _run_binary_detectors_for_asset(
|
|
417
|
+
self,
|
|
418
|
+
*,
|
|
419
|
+
asset: SingleAssetScanResults,
|
|
420
|
+
detectors: list[BaseDetector],
|
|
421
|
+
) -> tuple[list[DetectionResult], list[DetectorType], list[str], list[str]]:
|
|
422
|
+
"""Fetch raw bytes for an asset and run binary/image detectors."""
|
|
423
|
+
warnings: list[str] = []
|
|
424
|
+
candidate_ids: list[str] = []
|
|
425
|
+
for candidate in (asset.external_url, asset.hash):
|
|
426
|
+
value = str(candidate or "").strip()
|
|
427
|
+
if not value or value in candidate_ids:
|
|
428
|
+
continue
|
|
429
|
+
candidate_ids.append(value)
|
|
430
|
+
|
|
431
|
+
for candidate_id in candidate_ids:
|
|
432
|
+
result = await self.content_provider.fetch_bytes(candidate_id)
|
|
433
|
+
if result is None:
|
|
434
|
+
continue
|
|
435
|
+
|
|
436
|
+
raw_bytes, mime_type = result
|
|
437
|
+
if len(raw_bytes) > self.content_size_limit:
|
|
438
|
+
msg = (
|
|
439
|
+
f"Binary content truncated from {len(raw_bytes)} to "
|
|
440
|
+
f"{self.content_size_limit} bytes for {asset.name}"
|
|
441
|
+
)
|
|
442
|
+
logger.warning(msg)
|
|
443
|
+
warnings.append(msg)
|
|
444
|
+
raw_bytes = raw_bytes[: self.content_size_limit]
|
|
445
|
+
|
|
446
|
+
if not raw_bytes:
|
|
447
|
+
continue
|
|
448
|
+
|
|
449
|
+
effective_mime_type = self._resolve_binary_mime_type(
|
|
450
|
+
raw_bytes=raw_bytes,
|
|
451
|
+
declared_mime_type=mime_type,
|
|
452
|
+
asset=asset,
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
compatible = [
|
|
456
|
+
d
|
|
457
|
+
for d in detectors
|
|
458
|
+
if self._supports_content_type(d.get_supported_content_types(), effective_mime_type)
|
|
459
|
+
]
|
|
460
|
+
if not compatible:
|
|
461
|
+
continue
|
|
462
|
+
|
|
463
|
+
findings, detector_types_run, errors = await self._run_detectors(
|
|
464
|
+
detectors=compatible,
|
|
465
|
+
content=raw_bytes,
|
|
466
|
+
content_type=effective_mime_type,
|
|
467
|
+
asset_name=asset.name,
|
|
468
|
+
)
|
|
469
|
+
for finding in findings:
|
|
470
|
+
self.content_provider.enrich_finding_location(finding, asset, "")
|
|
471
|
+
return findings, detector_types_run, warnings, errors
|
|
472
|
+
|
|
473
|
+
return [], [], [], []
|
|
474
|
+
|
|
475
|
+
@staticmethod
def _resolve_binary_mime_type(
    *,
    raw_bytes: bytes,
    declared_mime_type: str,
    asset: SingleAssetScanResults,
) -> str:
    """Resolve the MIME type of a binary payload via ``resolve_mime_type``.

    The asset's name is passed as the file-name hint; when the name is blank
    the asset's external URL is used instead.
    """
    name_hint = str(asset.name or "").strip()
    if not name_hint:
        name_hint = str(asset.external_url or "").strip()

    return resolve_mime_type(
        raw_bytes,
        declared_mime_type=declared_mime_type,
        file_name=name_hint,
    )
|
|
488
|
+
|
|
489
|
+
@staticmethod
|
|
490
|
+
def _is_binary_detector(detector: BaseDetector) -> bool:
|
|
491
|
+
"""Return True if the detector handles binary content types (images, etc.)."""
|
|
492
|
+
for ct in detector.get_supported_content_types():
|
|
493
|
+
if ct.startswith(("image/", "audio/", "video/")) or ct == "application/octet-stream":
|
|
494
|
+
return True
|
|
495
|
+
return False
|
|
496
|
+
|
|
497
|
+
@staticmethod
|
|
498
|
+
def _detector_log_label(detector: BaseDetector) -> str:
|
|
499
|
+
"""Return a human-readable detector label for logs."""
|
|
500
|
+
config_name = getattr(getattr(detector, "config", None), "name", None)
|
|
501
|
+
if isinstance(config_name, str) and config_name.strip():
|
|
502
|
+
return config_name.strip()
|
|
503
|
+
|
|
504
|
+
detector_name = getattr(detector, "detector_name", "")
|
|
505
|
+
if isinstance(detector_name, str) and detector_name.strip() and detector_name != "base":
|
|
506
|
+
return detector_name.strip()
|
|
507
|
+
|
|
508
|
+
return detector.__class__.__name__
|
|
509
|
+
|
|
510
|
+
@staticmethod
|
|
511
|
+
def _merge_detector_types(
|
|
512
|
+
existing: list[DetectorType],
|
|
513
|
+
incoming: list[DetectorType],
|
|
514
|
+
) -> list[DetectorType]:
|
|
515
|
+
merged = list(existing)
|
|
516
|
+
seen = set(existing)
|
|
517
|
+
for detector_type in incoming:
|
|
518
|
+
if detector_type in seen:
|
|
519
|
+
continue
|
|
520
|
+
seen.add(detector_type)
|
|
521
|
+
merged.append(detector_type)
|
|
522
|
+
return merged
|
|
523
|
+
|
|
524
|
+
async def _fetch_content(self, asset: SingleAssetScanResults) -> tuple[str, str]:
|
|
525
|
+
"""Fetch content for an asset."""
|
|
526
|
+
content_type = self._asset_type_to_content_type(asset.asset_type)
|
|
527
|
+
|
|
528
|
+
async for text_content in self._iter_text_content_pages(asset):
|
|
529
|
+
return text_content, content_type
|
|
530
|
+
|
|
531
|
+
return "", content_type
|
|
532
|
+
|
|
533
|
+
async def _run_detectors(
|
|
534
|
+
self,
|
|
535
|
+
*,
|
|
536
|
+
detectors: list[BaseDetector],
|
|
537
|
+
content: str | bytes,
|
|
538
|
+
content_type: str,
|
|
539
|
+
asset_name: str = "",
|
|
540
|
+
) -> tuple[list[DetectionResult], list[DetectorType], list[str]]:
|
|
541
|
+
"""Run all compatible detectors in parallel for a single payload."""
|
|
542
|
+
if not content:
|
|
543
|
+
return [], [], []
|
|
544
|
+
|
|
545
|
+
tasks = []
|
|
546
|
+
runnable_detectors: list[BaseDetector] = []
|
|
547
|
+
|
|
548
|
+
for detector in detectors:
|
|
549
|
+
supported = detector.get_supported_content_types()
|
|
550
|
+
if self._supports_content_type(supported, content_type):
|
|
551
|
+
tasks.append(self._run_single_detector(detector, content, content_type))
|
|
552
|
+
runnable_detectors.append(detector)
|
|
553
|
+
|
|
554
|
+
if not tasks:
|
|
555
|
+
return [], [], []
|
|
556
|
+
|
|
557
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
558
|
+
|
|
559
|
+
detector_types_run: list[DetectorType] = []
|
|
560
|
+
seen_detector_types: set[DetectorType] = set()
|
|
561
|
+
for detector in runnable_detectors:
|
|
562
|
+
detector_type = getattr(detector, "detector_type", "")
|
|
563
|
+
if not detector_type:
|
|
564
|
+
continue
|
|
565
|
+
try:
|
|
566
|
+
detector_type_enum = DetectorType(detector_type.upper())
|
|
567
|
+
except ValueError:
|
|
568
|
+
logger.warning(f"Unknown detector type during scan stats: {detector_type}")
|
|
569
|
+
continue
|
|
570
|
+
if detector_type_enum in seen_detector_types:
|
|
571
|
+
continue
|
|
572
|
+
seen_detector_types.add(detector_type_enum)
|
|
573
|
+
detector_types_run.append(detector_type_enum)
|
|
574
|
+
|
|
575
|
+
all_findings: list[DetectionResult] = []
|
|
576
|
+
errors: list[str] = []
|
|
577
|
+
detected_at = datetime.now(UTC)
|
|
578
|
+
|
|
579
|
+
for detector, result in zip(runnable_detectors, results, strict=False):
|
|
580
|
+
detector_name = detector.__class__.__name__
|
|
581
|
+
if isinstance(result, Exception):
|
|
582
|
+
logger.error("Detector %s failed for %s: %s", detector_name, asset_name, result)
|
|
583
|
+
errors.append(f"{detector_name}: {result}")
|
|
584
|
+
continue
|
|
585
|
+
|
|
586
|
+
detector_findings: list[DetectionResult] = []
|
|
587
|
+
if isinstance(result, list):
|
|
588
|
+
for finding in result:
|
|
589
|
+
if isinstance(finding, DetectionResult):
|
|
590
|
+
finding_with_meta = finding.model_copy(
|
|
591
|
+
update={
|
|
592
|
+
"runner_id": self.runner_id,
|
|
593
|
+
"detected_at": detected_at,
|
|
594
|
+
}
|
|
595
|
+
)
|
|
596
|
+
detector_findings.append(finding_with_meta)
|
|
597
|
+
|
|
598
|
+
if detector_findings:
|
|
599
|
+
logger.info(
|
|
600
|
+
" %s on %s: %d finding(s)",
|
|
601
|
+
detector_name,
|
|
602
|
+
asset_name,
|
|
603
|
+
len(detector_findings),
|
|
604
|
+
)
|
|
605
|
+
else:
|
|
606
|
+
logger.info(" %s on %s: no findings", detector_name, asset_name)
|
|
607
|
+
|
|
608
|
+
all_findings.extend(detector_findings)
|
|
609
|
+
|
|
610
|
+
return all_findings, detector_types_run, errors
|
|
611
|
+
|
|
612
|
+
def _build_links_payload(self, links: list[str] | None) -> str:
|
|
613
|
+
if not links:
|
|
614
|
+
return ""
|
|
615
|
+
|
|
616
|
+
unique_links: list[str] = []
|
|
617
|
+
seen_links: set[str] = set()
|
|
618
|
+
for link in links:
|
|
619
|
+
value = str(link).strip()
|
|
620
|
+
if not value:
|
|
621
|
+
continue
|
|
622
|
+
|
|
623
|
+
resolved = self.content_provider.resolve_link_for_detection(value)
|
|
624
|
+
if not resolved or resolved in seen_links:
|
|
625
|
+
continue
|
|
626
|
+
|
|
627
|
+
seen_links.add(resolved)
|
|
628
|
+
unique_links.append(resolved)
|
|
629
|
+
|
|
630
|
+
return "\n".join(unique_links)
|
|
631
|
+
|
|
632
|
+
async def _run_single_detector(
|
|
633
|
+
self, detector: BaseDetector, content: str | bytes, content_type: str
|
|
634
|
+
) -> list[DetectionResult]:
|
|
635
|
+
"""Run a single detector."""
|
|
636
|
+
return await detector.detect(content, content_type)
|
|
637
|
+
|
|
638
|
+
def _text_content_type_for_asset(
    self,
    asset_type: OutputAssetType,
    ocr_enabled: bool,
) -> str | None:
    """Return the text MIME type to use for an asset, or None if it has no text form.

    TXT and TABLE assets are scanned as ``text/plain`` and URL assets as
    ``text/html``. IMAGE and BINARY assets are treated as ``text/plain``
    only when OCR is enabled. Every other asset type yields None.
    """
    text_mime_by_type = {
        OutputAssetType.TXT: "text/plain",
        OutputAssetType.TABLE: "text/plain",
        # URL assets usually resolve to HTML pages and are scanned as extracted text.
        OutputAssetType.URL: "text/html",
    }
    direct_match = text_mime_by_type.get(asset_type)
    if direct_match is not None:
        return direct_match

    ocr_capable = {OutputAssetType.IMAGE, OutputAssetType.BINARY}
    if ocr_enabled and asset_type in ocr_capable:
        return "text/plain"

    return None
|
|
655
|
+
|
|
656
|
+
@staticmethod
def _asset_has_binary_primary_payload(asset_type: OutputAssetType) -> bool:
    """Tell whether the asset type is one of IMAGE, VIDEO, AUDIO, BINARY or OTHER."""
    binary_primary_types = {
        OutputAssetType.IMAGE,
        OutputAssetType.VIDEO,
        OutputAssetType.AUDIO,
        OutputAssetType.BINARY,
        OutputAssetType.OTHER,
    }
    return asset_type in binary_primary_types
|
|
665
|
+
|
|
666
|
+
def _supports_content_type(self, supported: list[str], content_type: str) -> bool:
|
|
667
|
+
"""
|
|
668
|
+
Check MIME compatibility, including wildcard and text fallback behavior.
|
|
669
|
+
"""
|
|
670
|
+
if content_type in supported:
|
|
671
|
+
return True
|
|
672
|
+
|
|
673
|
+
for supported_type in supported:
|
|
674
|
+
if supported_type.endswith("/*"):
|
|
675
|
+
prefix = supported_type[:-1]
|
|
676
|
+
if content_type.startswith(prefix):
|
|
677
|
+
return True
|
|
678
|
+
|
|
679
|
+
# Compatibility fallback: text detectors that declare text/plain
|
|
680
|
+
# should still process extracted HTML text content.
|
|
681
|
+
if content_type == "text/html" and "text/plain" in supported:
|
|
682
|
+
return True
|
|
683
|
+
|
|
684
|
+
return False
|
|
685
|
+
|
|
686
|
+
@classmethod
def from_recipe(
    cls,
    recipe: dict[str, Any],
    source: BaseSource,
    runner_id: str,
    max_concurrent_assets: int = 10,
) -> "DetectorPipeline":
    """Create pipeline from recipe configuration.

    ``recipe["detectors"]`` is expected to be a list of
    ``{type, enabled, config}`` entries. Disabled entries are skipped. An
    entry that cannot be parsed or instantiated does not abort construction:
    the failure is logged and stored in ``pipeline.init_warnings``.

    Args:
        recipe: Recipe dictionary.
        source: Source the pipeline will read content from.
        runner_id: ID of the runner executing the pipeline.
        max_concurrent_assets: Forwarded to the constructor on every path.

    Returns:
        A configured ``DetectorPipeline``; it has no detectors when the
        recipe lists none.
    """
    from ..detectors import get_detector
    from ..detectors.config import parse_detector_config

    # New schema: detectors is an array of {type, enabled, config}
    detector_configs = recipe.get("detectors", [])

    if not detector_configs:
        # Return empty pipeline (no detectors), still honouring the
        # caller's concurrency setting.
        return cls(
            detectors=[],
            source=source,
            runner_id=runner_id,
            max_concurrent_assets=max_concurrent_assets,
        )

    detectors = []
    init_warnings: list[str] = []

    for detector_item in detector_configs:
        if not detector_item.get("enabled", True):
            continue

        # "type" may be missing or explicitly null; treat both as "" so the
        # failure is reported through init_warnings instead of crashing here.
        detector_type = (detector_item.get("type") or "").upper()
        raw_config = detector_item.get("config", {})

        try:
            detector_name, typed_config = parse_detector_config(
                detector_type=detector_type,
                raw_config=raw_config,
            )

            detector = get_detector(detector_name, typed_config)
            detectors.append(detector)
            logger.info("Initialized detector: %s", detector_name)
        except Exception as e:
            # Deliberately broad: one bad detector must not prevent the
            # others from running.
            msg = f"Failed to initialize detector {detector_type}: {e}"
            logger.error(msg)
            init_warnings.append(msg)

    from .parsed_content_provider import ParsedContentProvider

    content_size_limit = 1_048_576  # 1MB

    pipeline = cls(
        detectors=detectors,
        source=source,
        runner_id=runner_id,
        content_size_limit=content_size_limit,
        max_concurrent_assets=max_concurrent_assets,
        content_provider=ParsedContentProvider(source),
    )
    pipeline.init_warnings = init_warnings
    return pipeline
|