classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,742 @@
1
+ """Pipeline for running detectors on extracted assets."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator, Awaitable, Callable
6
+ from datetime import UTC, datetime
7
+ from typing import Any
8
+
9
+ from ..detectors.base import BaseDetector
10
+ from ..models.generated_single_asset_scan_results import (
11
+ AssetType as OutputAssetType,
12
+ )
13
+ from ..models.generated_single_asset_scan_results import (
14
+ DetectionResult,
15
+ DetectorType,
16
+ ScanStats,
17
+ SingleAssetScanResults,
18
+ )
19
+ from ..sources.base import BaseSource
20
+ from ..utils.file_parser import resolve_mime_type
21
+ from .content_provider import ContentProvider
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class DetectorPipeline:
27
+ """
28
+ Pipeline for running detectors on extracted assets.
29
+
30
+ Adds detector findings to assets (CoreOutput schema).
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ detectors: list[BaseDetector],
36
+ source: BaseSource,
37
+ runner_id: str,
38
+ content_size_limit: int = 1_048_576, # 1MB default
39
+ max_concurrent_assets: int = 10,
40
+ content_provider: ContentProvider | None = None,
41
+ ):
42
+ """
43
+ Initialize detector pipeline.
44
+
45
+ Args:
46
+ detectors: List of detector instances to run
47
+ source: Source instance for fetching content
48
+ runner_id: ID of the runner executing this pipeline
49
+ content_size_limit: Maximum content size in bytes
50
+ max_concurrent_assets: Max assets to process in parallel within a batch
51
+ content_provider: Optional provider — if None, source is used directly
52
+ """
53
+ self.detectors = detectors
54
+ self.source = source
55
+ self.runner_id = runner_id
56
+ self.content_size_limit = content_size_limit
57
+ self.max_concurrent_assets = max_concurrent_assets
58
+ self._detector_semaphore = asyncio.Semaphore(max_concurrent_assets)
59
+ if content_provider is not None:
60
+ self.content_provider: ContentProvider = content_provider
61
+ else:
62
+ from .parsed_content_provider import ParsedContentProvider
63
+
64
+ self.content_provider = ParsedContentProvider(source)
65
+ self.init_warnings: list[str] = []
66
+
67
+ async def process(self, assets: list[SingleAssetScanResults]) -> list[SingleAssetScanResults]:
68
+ """Process assets through detector pipeline, returning all results at once."""
69
+ results: list[SingleAssetScanResults] = []
70
+ async for asset in self.process_stream(assets):
71
+ results.append(asset)
72
+ return results
73
+
74
+ async def process_stream(
75
+ self, assets: list[SingleAssetScanResults]
76
+ ) -> AsyncGenerator[SingleAssetScanResults, None]:
77
+ """Process assets concurrently, yielding in completion order.
78
+
79
+ Total concurrent detector invocations across all assets and pages
80
+ are bounded by ``self._detector_semaphore``.
81
+ """
82
+ tasks = {asyncio.create_task(self.process_single_asset(a)) for a in assets}
83
+ for coro in asyncio.as_completed(tasks):
84
+ yield await coro
85
+
86
+ async def process_single_asset(
87
+ self,
88
+ asset: SingleAssetScanResults,
89
+ *,
90
+ on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]] | None = None,
91
+ findings_flush_size: int = 50,
92
+ ) -> SingleAssetScanResults:
93
+ """Process a single asset through detectors.
94
+
95
+ When *on_findings_flushed* is provided the text-detector phase switches to
96
+ sequential page processing and calls the callback every *findings_flush_size*
97
+ new findings so callers can push partial results without waiting for the full
98
+ asset (important for ALL-strategy tabular sources with thousands of pages).
99
+ """
100
+ # 1. If no detectors, return asset as-is with empty findings
101
+ if not self.detectors:
102
+ asset.findings = []
103
+ return asset
104
+
105
+ # Record scan start time
106
+ scan_started = datetime.now(UTC)
107
+ ocr_enabled = self.source.ocr_enabled()
108
+ text_content_type = self._text_content_type_for_asset(asset.asset_type, ocr_enabled)
109
+ link_content = self._build_links_payload(asset.links)
110
+
111
+ text_detectors = []
112
+ if text_content_type:
113
+ text_detectors = [
114
+ detector
115
+ for detector in self.detectors
116
+ if self._supports_content_type(
117
+ detector.get_supported_content_types(),
118
+ text_content_type,
119
+ )
120
+ ]
121
+ asset_has_binary_primary = self._asset_has_binary_primary_payload(asset.asset_type)
122
+ binary_detectors = [
123
+ detector
124
+ for detector in self.detectors
125
+ if self._is_binary_detector(detector)
126
+ and (
127
+ asset_has_binary_primary
128
+ or not text_content_type
129
+ or not self._supports_content_type(
130
+ detector.get_supported_content_types(),
131
+ text_content_type,
132
+ )
133
+ )
134
+ ]
135
+ link_detectors = [
136
+ detector
137
+ for detector in self.detectors
138
+ if link_content
139
+ and self._supports_content_type(
140
+ detector.get_supported_content_types(),
141
+ "application/x.asset-links",
142
+ )
143
+ ]
144
+ should_warn_on_empty_text = asset.asset_type in {
145
+ OutputAssetType.TXT,
146
+ OutputAssetType.TABLE,
147
+ OutputAssetType.URL,
148
+ }
149
+
150
+ all_active = text_detectors + binary_detectors + link_detectors
151
+ detector_names = [self._detector_log_label(d) for d in all_active]
152
+ logger.info("Scanning %s [%s]", asset.name, ", ".join(detector_names))
153
+
154
+ findings: list[DetectionResult] = []
155
+ detector_types_run: list[DetectorType] = []
156
+ scan_warnings: list[str] = list(self.init_warnings)
157
+ scan_errors: list[str] = []
158
+
159
+ if text_detectors:
160
+ (
161
+ text_findings,
162
+ text_detector_types_run,
163
+ content_size,
164
+ text_warnings,
165
+ text_errors,
166
+ ) = await self._run_text_detectors_for_asset(
167
+ asset=asset,
168
+ text_content_type=text_content_type,
169
+ detectors=text_detectors,
170
+ warn_on_empty_content=should_warn_on_empty_text,
171
+ on_findings_flushed=on_findings_flushed,
172
+ findings_flush_size=findings_flush_size,
173
+ )
174
+ findings.extend(text_findings)
175
+ scan_warnings.extend(text_warnings)
176
+ scan_errors.extend(text_errors)
177
+ detector_types_run = self._merge_detector_types(
178
+ detector_types_run,
179
+ text_detector_types_run,
180
+ )
181
+ else:
182
+ content_size = 0
183
+
184
+ if binary_detectors:
185
+ (
186
+ binary_findings,
187
+ binary_detector_types_run,
188
+ bin_warnings,
189
+ bin_errors,
190
+ ) = await self._run_binary_detectors_for_asset(
191
+ asset=asset,
192
+ detectors=binary_detectors,
193
+ )
194
+ findings.extend(binary_findings)
195
+ scan_warnings.extend(bin_warnings)
196
+ scan_errors.extend(bin_errors)
197
+ detector_types_run = self._merge_detector_types(
198
+ detector_types_run,
199
+ binary_detector_types_run,
200
+ )
201
+
202
+ if link_detectors:
203
+ link_findings, link_detector_types_run, link_errors = await self._run_detectors(
204
+ detectors=link_detectors,
205
+ content=link_content,
206
+ content_type="application/x.asset-links",
207
+ asset_name=asset.name,
208
+ )
209
+ findings.extend(link_findings)
210
+ scan_errors.extend(link_errors)
211
+ detector_types_run = self._merge_detector_types(
212
+ detector_types_run,
213
+ link_detector_types_run,
214
+ )
215
+
216
+ for finding in link_findings:
217
+ self.content_provider.enrich_finding_location(finding, asset, "")
218
+
219
+ scan_duration = int((datetime.now(UTC) - scan_started).total_seconds() * 1000)
220
+
221
+ asset.findings = findings
222
+ asset.scan_stats = ScanStats(
223
+ scanned_at=scan_started,
224
+ duration_ms=scan_duration,
225
+ detectors_run=detector_types_run,
226
+ content_size_bytes=content_size,
227
+ findings_count=len(findings),
228
+ warnings=scan_warnings or None,
229
+ errors=scan_errors or None,
230
+ )
231
+
232
+ if findings:
233
+ logger.info(
234
+ "Scanned %s: %d finding(s) in %dms",
235
+ asset.name,
236
+ len(findings),
237
+ scan_duration,
238
+ )
239
+ else:
240
+ logger.info("Scanned %s: no findings (%dms)", asset.name, scan_duration)
241
+
242
+ return asset
243
+
244
+ async def _run_text_detectors_for_asset(
245
+ self,
246
+ *,
247
+ asset: SingleAssetScanResults,
248
+ text_content_type: str,
249
+ detectors: list[BaseDetector],
250
+ warn_on_empty_content: bool = True,
251
+ on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]] | None = None,
252
+ findings_flush_size: int = 50,
253
+ ) -> tuple[list[DetectionResult], list[DetectorType], int, list[str], list[str]]:
254
+ if on_findings_flushed is not None:
255
+ return await self._run_text_detectors_streaming(
256
+ asset=asset,
257
+ text_content_type=text_content_type,
258
+ detectors=detectors,
259
+ warn_on_empty_content=warn_on_empty_content,
260
+ on_findings_flushed=on_findings_flushed,
261
+ findings_flush_size=findings_flush_size,
262
+ )
263
+ findings: list[DetectionResult] = []
264
+ detector_types_run: list[DetectorType] = []
265
+ warnings: list[str] = []
266
+ errors: list[str] = []
267
+ content_size = 0
268
+
269
+ pending_tasks: set[
270
+ asyncio.Task[tuple[list[DetectionResult], list[DetectorType], list[str], str]]
271
+ ] = set()
272
+
273
+ async def _detect_page(
274
+ page_content: str,
275
+ ) -> tuple[list[DetectionResult], list[DetectorType], list[str], str]:
276
+ async with self._detector_semaphore:
277
+ page_findings, page_types, page_errors = await self._run_detectors(
278
+ detectors=detectors,
279
+ content=page_content,
280
+ content_type=text_content_type,
281
+ asset_name=asset.name,
282
+ )
283
+ return page_findings, page_types, page_errors, page_content
284
+
285
+ def _collect_done() -> None:
286
+ done = {t for t in pending_tasks if t.done()}
287
+ for task in done:
288
+ pending_tasks.discard(task)
289
+ page_findings, page_types, page_errors, page_content = task.result()
290
+ findings.extend(page_findings)
291
+ errors.extend(page_errors)
292
+ nonlocal detector_types_run
293
+ detector_types_run = self._merge_detector_types(
294
+ detector_types_run,
295
+ page_types,
296
+ )
297
+ for finding in page_findings:
298
+ self.content_provider.enrich_finding_location(
299
+ finding,
300
+ asset,
301
+ page_content,
302
+ )
303
+
304
+ async for text_content in self._iter_text_content_pages(asset):
305
+ content_size += len(text_content)
306
+
307
+ detector_content = text_content
308
+ if len(detector_content) > self.content_size_limit:
309
+ msg = (
310
+ f"Content truncated from {len(detector_content)} to "
311
+ f"{self.content_size_limit} bytes for {asset.name}"
312
+ )
313
+ logger.warning(msg)
314
+ warnings.append(msg)
315
+ detector_content = detector_content[: self.content_size_limit]
316
+
317
+ if not detector_content:
318
+ continue
319
+
320
+ task = asyncio.create_task(_detect_page(detector_content))
321
+ pending_tasks.add(task)
322
+ _collect_done()
323
+
324
+ if pending_tasks:
325
+ await asyncio.gather(*pending_tasks)
326
+ _collect_done()
327
+
328
+ if content_size == 0 and warn_on_empty_content:
329
+ msg = f"No content available for asset {asset.name}"
330
+ logger.warning(msg)
331
+ warnings.append(msg)
332
+
333
+ return findings, detector_types_run, content_size, warnings, errors
334
+
335
+ async def _run_text_detectors_streaming(
336
+ self,
337
+ *,
338
+ asset: SingleAssetScanResults,
339
+ text_content_type: str,
340
+ detectors: list[BaseDetector],
341
+ warn_on_empty_content: bool = True,
342
+ on_findings_flushed: Callable[[list[DetectionResult]], Awaitable[None]],
343
+ findings_flush_size: int = 50,
344
+ ) -> tuple[list[DetectionResult], list[DetectorType], int, list[str], list[str]]:
345
+ """Sequential variant: processes one page at a time and calls back every N findings."""
346
+ findings: list[DetectionResult] = []
347
+ detector_types_run: list[DetectorType] = []
348
+ warnings: list[str] = []
349
+ errors: list[str] = []
350
+ content_size = 0
351
+ unflushed_count = 0
352
+
353
+ async for text_content in self._iter_text_content_pages(asset):
354
+ content_size += len(text_content)
355
+
356
+ detector_content = text_content
357
+ if len(detector_content) > self.content_size_limit:
358
+ msg = (
359
+ f"Content truncated from {len(detector_content)} to "
360
+ f"{self.content_size_limit} bytes for {asset.name}"
361
+ )
362
+ logger.warning(msg)
363
+ warnings.append(msg)
364
+ detector_content = detector_content[: self.content_size_limit]
365
+
366
+ if not detector_content:
367
+ continue
368
+
369
+ async with self._detector_semaphore:
370
+ page_findings, page_types, page_errors = await self._run_detectors(
371
+ detectors=detectors,
372
+ content=detector_content,
373
+ content_type=text_content_type,
374
+ asset_name=asset.name,
375
+ )
376
+
377
+ for finding in page_findings:
378
+ self.content_provider.enrich_finding_location(finding, asset, detector_content)
379
+
380
+ findings.extend(page_findings)
381
+ errors.extend(page_errors)
382
+ detector_types_run = self._merge_detector_types(detector_types_run, page_types)
383
+ unflushed_count += len(page_findings)
384
+
385
+ if unflushed_count >= findings_flush_size and page_findings:
386
+ await on_findings_flushed(list(findings))
387
+ unflushed_count = 0
388
+
389
+ if content_size == 0 and warn_on_empty_content:
390
+ msg = f"No content available for asset {asset.name}"
391
+ logger.warning(msg)
392
+ warnings.append(msg)
393
+
394
+ return findings, detector_types_run, content_size, warnings, errors
395
+
396
+ async def _iter_text_content_pages(self, asset: SingleAssetScanResults):
397
+ candidate_ids: list[str] = []
398
+
399
+ for candidate in (asset.external_url, asset.hash):
400
+ value = str(candidate or "").strip()
401
+ if not value or value in candidate_ids:
402
+ continue
403
+ candidate_ids.append(value)
404
+
405
+ for candidate_id in candidate_ids:
406
+ saw_candidate_content = False
407
+ async for text_content in self.content_provider.fetch_text_pages(candidate_id):
408
+ if not text_content:
409
+ continue
410
+ saw_candidate_content = True
411
+ yield text_content
412
+
413
+ if saw_candidate_content:
414
+ return
415
+
416
+ async def _run_binary_detectors_for_asset(
417
+ self,
418
+ *,
419
+ asset: SingleAssetScanResults,
420
+ detectors: list[BaseDetector],
421
+ ) -> tuple[list[DetectionResult], list[DetectorType], list[str], list[str]]:
422
+ """Fetch raw bytes for an asset and run binary/image detectors."""
423
+ warnings: list[str] = []
424
+ candidate_ids: list[str] = []
425
+ for candidate in (asset.external_url, asset.hash):
426
+ value = str(candidate or "").strip()
427
+ if not value or value in candidate_ids:
428
+ continue
429
+ candidate_ids.append(value)
430
+
431
+ for candidate_id in candidate_ids:
432
+ result = await self.content_provider.fetch_bytes(candidate_id)
433
+ if result is None:
434
+ continue
435
+
436
+ raw_bytes, mime_type = result
437
+ if len(raw_bytes) > self.content_size_limit:
438
+ msg = (
439
+ f"Binary content truncated from {len(raw_bytes)} to "
440
+ f"{self.content_size_limit} bytes for {asset.name}"
441
+ )
442
+ logger.warning(msg)
443
+ warnings.append(msg)
444
+ raw_bytes = raw_bytes[: self.content_size_limit]
445
+
446
+ if not raw_bytes:
447
+ continue
448
+
449
+ effective_mime_type = self._resolve_binary_mime_type(
450
+ raw_bytes=raw_bytes,
451
+ declared_mime_type=mime_type,
452
+ asset=asset,
453
+ )
454
+
455
+ compatible = [
456
+ d
457
+ for d in detectors
458
+ if self._supports_content_type(d.get_supported_content_types(), effective_mime_type)
459
+ ]
460
+ if not compatible:
461
+ continue
462
+
463
+ findings, detector_types_run, errors = await self._run_detectors(
464
+ detectors=compatible,
465
+ content=raw_bytes,
466
+ content_type=effective_mime_type,
467
+ asset_name=asset.name,
468
+ )
469
+ for finding in findings:
470
+ self.content_provider.enrich_finding_location(finding, asset, "")
471
+ return findings, detector_types_run, warnings, errors
472
+
473
+ return [], [], [], []
474
+
475
@staticmethod
def _resolve_binary_mime_type(
    *,
    raw_bytes: bytes,
    declared_mime_type: str,
    asset: SingleAssetScanResults,
) -> str:
    """Resolve the MIME type of a binary payload via ``resolve_mime_type``.

    The file-name hint is the asset's stripped name, or its stripped external
    URL when the name is blank.
    """
    name_hint = str(asset.name or "").strip()
    if not name_hint:
        name_hint = str(asset.external_url or "").strip()
    return resolve_mime_type(
        raw_bytes,
        declared_mime_type=declared_mime_type,
        file_name=name_hint,
    )
488
+
489
+ @staticmethod
490
+ def _is_binary_detector(detector: BaseDetector) -> bool:
491
+ """Return True if the detector handles binary content types (images, etc.)."""
492
+ for ct in detector.get_supported_content_types():
493
+ if ct.startswith(("image/", "audio/", "video/")) or ct == "application/octet-stream":
494
+ return True
495
+ return False
496
+
497
+ @staticmethod
498
+ def _detector_log_label(detector: BaseDetector) -> str:
499
+ """Return a human-readable detector label for logs."""
500
+ config_name = getattr(getattr(detector, "config", None), "name", None)
501
+ if isinstance(config_name, str) and config_name.strip():
502
+ return config_name.strip()
503
+
504
+ detector_name = getattr(detector, "detector_name", "")
505
+ if isinstance(detector_name, str) and detector_name.strip() and detector_name != "base":
506
+ return detector_name.strip()
507
+
508
+ return detector.__class__.__name__
509
+
510
+ @staticmethod
511
+ def _merge_detector_types(
512
+ existing: list[DetectorType],
513
+ incoming: list[DetectorType],
514
+ ) -> list[DetectorType]:
515
+ merged = list(existing)
516
+ seen = set(existing)
517
+ for detector_type in incoming:
518
+ if detector_type in seen:
519
+ continue
520
+ seen.add(detector_type)
521
+ merged.append(detector_type)
522
+ return merged
523
+
524
+ async def _fetch_content(self, asset: SingleAssetScanResults) -> tuple[str, str]:
525
+ """Fetch content for an asset."""
526
+ content_type = self._asset_type_to_content_type(asset.asset_type)
527
+
528
+ async for text_content in self._iter_text_content_pages(asset):
529
+ return text_content, content_type
530
+
531
+ return "", content_type
532
+
533
+ async def _run_detectors(
534
+ self,
535
+ *,
536
+ detectors: list[BaseDetector],
537
+ content: str | bytes,
538
+ content_type: str,
539
+ asset_name: str = "",
540
+ ) -> tuple[list[DetectionResult], list[DetectorType], list[str]]:
541
+ """Run all compatible detectors in parallel for a single payload."""
542
+ if not content:
543
+ return [], [], []
544
+
545
+ tasks = []
546
+ runnable_detectors: list[BaseDetector] = []
547
+
548
+ for detector in detectors:
549
+ supported = detector.get_supported_content_types()
550
+ if self._supports_content_type(supported, content_type):
551
+ tasks.append(self._run_single_detector(detector, content, content_type))
552
+ runnable_detectors.append(detector)
553
+
554
+ if not tasks:
555
+ return [], [], []
556
+
557
+ results = await asyncio.gather(*tasks, return_exceptions=True)
558
+
559
+ detector_types_run: list[DetectorType] = []
560
+ seen_detector_types: set[DetectorType] = set()
561
+ for detector in runnable_detectors:
562
+ detector_type = getattr(detector, "detector_type", "")
563
+ if not detector_type:
564
+ continue
565
+ try:
566
+ detector_type_enum = DetectorType(detector_type.upper())
567
+ except ValueError:
568
+ logger.warning(f"Unknown detector type during scan stats: {detector_type}")
569
+ continue
570
+ if detector_type_enum in seen_detector_types:
571
+ continue
572
+ seen_detector_types.add(detector_type_enum)
573
+ detector_types_run.append(detector_type_enum)
574
+
575
+ all_findings: list[DetectionResult] = []
576
+ errors: list[str] = []
577
+ detected_at = datetime.now(UTC)
578
+
579
+ for detector, result in zip(runnable_detectors, results, strict=False):
580
+ detector_name = detector.__class__.__name__
581
+ if isinstance(result, Exception):
582
+ logger.error("Detector %s failed for %s: %s", detector_name, asset_name, result)
583
+ errors.append(f"{detector_name}: {result}")
584
+ continue
585
+
586
+ detector_findings: list[DetectionResult] = []
587
+ if isinstance(result, list):
588
+ for finding in result:
589
+ if isinstance(finding, DetectionResult):
590
+ finding_with_meta = finding.model_copy(
591
+ update={
592
+ "runner_id": self.runner_id,
593
+ "detected_at": detected_at,
594
+ }
595
+ )
596
+ detector_findings.append(finding_with_meta)
597
+
598
+ if detector_findings:
599
+ logger.info(
600
+ " %s on %s: %d finding(s)",
601
+ detector_name,
602
+ asset_name,
603
+ len(detector_findings),
604
+ )
605
+ else:
606
+ logger.info(" %s on %s: no findings", detector_name, asset_name)
607
+
608
+ all_findings.extend(detector_findings)
609
+
610
+ return all_findings, detector_types_run, errors
611
+
612
+ def _build_links_payload(self, links: list[str] | None) -> str:
613
+ if not links:
614
+ return ""
615
+
616
+ unique_links: list[str] = []
617
+ seen_links: set[str] = set()
618
+ for link in links:
619
+ value = str(link).strip()
620
+ if not value:
621
+ continue
622
+
623
+ resolved = self.content_provider.resolve_link_for_detection(value)
624
+ if not resolved or resolved in seen_links:
625
+ continue
626
+
627
+ seen_links.add(resolved)
628
+ unique_links.append(resolved)
629
+
630
+ return "\n".join(unique_links)
631
+
632
+ async def _run_single_detector(
633
+ self, detector: BaseDetector, content: str | bytes, content_type: str
634
+ ) -> list[DetectionResult]:
635
+ """Run a single detector."""
636
+ return await detector.detect(content, content_type)
637
+
638
def _text_content_type_for_asset(
    self,
    asset_type: OutputAssetType,
    ocr_enabled: bool,
) -> str | None:
    """Return the text MIME type detectors should see for this asset type.

    TXT and TABLE map to ``text/plain`` and URL to ``text/html``. IMAGE and
    BINARY map to ``text/plain`` only when OCR is enabled. Every other case
    returns None.
    """
    direct = {
        OutputAssetType.TXT: "text/plain",
        OutputAssetType.TABLE: "text/plain",
        # URL assets usually resolve to HTML pages and are scanned as extracted text.
        OutputAssetType.URL: "text/html",
    }.get(asset_type)
    if direct is not None:
        return direct

    ocr_capable = {OutputAssetType.IMAGE, OutputAssetType.BINARY}
    return "text/plain" if ocr_enabled and asset_type in ocr_capable else None
655
+
656
@staticmethod
def _asset_has_binary_primary_payload(asset_type: OutputAssetType) -> bool:
    """Tell whether the asset type is IMAGE, VIDEO, AUDIO, BINARY or OTHER."""
    binary_first = (
        OutputAssetType.IMAGE,
        OutputAssetType.VIDEO,
        OutputAssetType.AUDIO,
        OutputAssetType.BINARY,
        OutputAssetType.OTHER,
    )
    return asset_type in set(binary_first)
665
+
666
+ def _supports_content_type(self, supported: list[str], content_type: str) -> bool:
667
+ """
668
+ Check MIME compatibility, including wildcard and text fallback behavior.
669
+ """
670
+ if content_type in supported:
671
+ return True
672
+
673
+ for supported_type in supported:
674
+ if supported_type.endswith("/*"):
675
+ prefix = supported_type[:-1]
676
+ if content_type.startswith(prefix):
677
+ return True
678
+
679
+ # Compatibility fallback: text detectors that declare text/plain
680
+ # should still process extracted HTML text content.
681
+ if content_type == "text/html" and "text/plain" in supported:
682
+ return True
683
+
684
+ return False
685
+
686
@classmethod
def from_recipe(
    cls,
    recipe: dict[str, Any],
    source: BaseSource,
    runner_id: str,
    max_concurrent_assets: int = 10,
) -> "DetectorPipeline":
    """Create a pipeline from a recipe configuration.

    Args:
        recipe: Recipe dict; its ``"detectors"`` entry is a list of
            ``{type, enabled, config}`` items.
        source: Source the pipeline reads content from.
        runner_id: ID of the runner executing the pipeline.
        max_concurrent_assets: Concurrency setting forwarded to the pipeline,
            including when the recipe declares no detectors.

    Returns:
        A pipeline holding every detector that could be initialised.
        Detectors that failed to initialise are skipped and the failure is
        recorded in ``init_warnings``.
    """
    from ..detectors import get_detector
    from ..detectors.config import parse_detector_config

    # New schema: detectors is an array of {type, enabled, config}
    detector_configs = recipe.get("detectors", [])

    if not detector_configs:
        # Empty pipeline (no detectors); still honour the caller's
        # concurrency setting, which this path previously dropped.
        return cls(
            detectors=[],
            source=source,
            runner_id=runner_id,
            max_concurrent_assets=max_concurrent_assets,
        )

    detectors = []
    init_warnings: list[str] = []

    for detector_item in detector_configs:
        if not detector_item.get("enabled", True):
            continue

        # Tolerate a missing, null or non-string "type": it then fails inside
        # the try below and is recorded as an init warning, not a crash here.
        detector_type = str(detector_item.get("type") or "").upper()
        raw_config = detector_item.get("config", {})

        try:
            detector_name, typed_config = parse_detector_config(
                detector_type=detector_type,
                raw_config=raw_config,
            )
            detector = get_detector(detector_name, typed_config)
        except Exception as e:
            # Deliberately broad: one misconfigured detector must not stop
            # the remaining detectors from being built.
            msg = f"Failed to initialize detector {detector_type}: {e}"
            logger.error(msg)
            init_warnings.append(msg)
        else:
            detectors.append(detector)
            logger.info("Initialized detector: %s", detector_name)

    from .parsed_content_provider import ParsedContentProvider

    content_size_limit = 1_048_576  # 1MB

    pipeline = cls(
        detectors=detectors,
        source=source,
        runner_id=runner_id,
        content_size_limit=content_size_limit,
        max_concurrent_assets=max_concurrent_assets,
        content_provider=ParsedContentProvider(source),
    )
    pipeline.init_warnings = init_warnings
    return pipeline