classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,679 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import itertools
5
+ import logging
6
+ import random
7
+ from abc import ABC, abstractmethod
8
+ from collections.abc import AsyncGenerator, Iterator
9
+ from dataclasses import dataclass
10
+ from datetime import UTC, datetime
11
+ from pathlib import PurePosixPath
12
+ from typing import Any
13
+
14
+ from ...models.generated_input import SamplingStrategy
15
+ from ...models.generated_single_asset_scan_results import (
16
+ AssetType as OutputAssetType,
17
+ )
18
+ from ...models.generated_single_asset_scan_results import (
19
+ DetectionResult,
20
+ Location,
21
+ SingleAssetScanResults,
22
+ )
23
+ from ...utils.file_parser import infer_mime_type_from_file_name, resolve_mime_type
24
+ from ...utils.hashing import hash_id, unhash_id
25
+ from ..base import BaseSource
26
+ from ..dependencies import require_module
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ _TEXT_MIME_TYPES = {
31
+ "application/json",
32
+ "application/xml",
33
+ "text/xml",
34
+ "application/x-ndjson",
35
+ "application/ld+json",
36
+ }
37
+
38
+ _TABULAR_MIME_TYPES = {
39
+ "text/csv",
40
+ "text/tab-separated-values",
41
+ "application/vnd.ms-excel",
42
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
43
+ "application/parquet",
44
+ "application/vnd.apache.parquet",
45
+ }
46
+
47
+
48
# Lower-cased file extension (with leading dot) -> asset type, consulted when the
# MIME type alone does not settle the classification. The groups below are kept
# in the original declaration order so the resulting dict has the same key order.
_FILE_EXTENSION_HINTS: dict[str, OutputAssetType] = {
    extension: asset_type
    for extensions, asset_type in (
        (
            (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".ico"),
            OutputAssetType.IMAGE,
        ),
        ((".mp4", ".webm", ".mov", ".mkv", ".avi"), OutputAssetType.VIDEO),
        ((".mp3", ".wav", ".aac", ".ogg"), OutputAssetType.AUDIO),
        ((".pdf", ".doc", ".docx"), OutputAssetType.BINARY),
        ((".xls", ".xlsx"), OutputAssetType.TABLE),
        (
            (".ppt", ".pptx", ".zip", ".rar", ".7z", ".tar", ".gz"),
            OutputAssetType.BINARY,
        ),
        ((".parquet",), OutputAssetType.TABLE),
        ((".json", ".xml", ".txt"), OutputAssetType.TXT),
        ((".csv", ".tsv"), OutputAssetType.TABLE),
        ((".md", ".html", ".htm"), OutputAssetType.TXT),
    )
    for extension in extensions
}
88
+
89
+
90
@dataclass(frozen=True)
class ObjectRef:
    """Immutable reference to a single object returned by a storage listing."""

    # Full object key; the segment after the last "/" becomes the asset name.
    key: str
    # Object size, reported as "size_bytes" in the asset metadata.
    size: int
    # Modification time; used for created_at/updated_at and newest-first sampling.
    last_modified: datetime
    # Optional entity tag, copied into the asset metadata when present.
    etag: str | None = None
    # Optional content type, used as a MIME hint.
    content_type_hint: str | None = None
97
+
98
+
99
@dataclass(frozen=True)
class ContentSnapshot:
    """Immutable result of inspecting (and optionally downloading) one object."""

    # MIME type reported for the object.
    mime_type: str
    # Raw and extracted text forms of the content.
    raw_content: str
    text_content: str
    # Error text when the content could not be obtained, otherwise None.
    parse_error: str | None
    # Number of bytes actually downloaded (0 when nothing was fetched).
    downloaded_bytes: int
    # Whether the download was reported as truncated.
    truncated: bool
    # Raw bytes kept for files whose text is extracted later, so that
    # fetch_content_pages() can emit configurable-sized pages instead of one
    # monolithic text blob.
    raw_bytes: bytes | None = None
110
+
111
+
112
class ObjectStorageSourceBase(BaseSource, ABC):
    """Abstract base for sources that scan objects in a storage bucket/container.

    Subclasses must set ``input_model`` and implement ``_list_objects``,
    ``_download_object`` and ``_external_url``.
    """

    # Label used in log messages, asset metadata and connection-test messages.
    provider_label = "OBJECT_STORAGE"
    # Model class whose ``model_validate(recipe)`` produces ``self.config``;
    # __init__ raises ValueError while this is still None.
    input_model: Any = None
115
+
116
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and set up the per-run caches.

    Args:
        recipe: Raw source configuration, validated through ``input_model``.
        source_id: Optional identifier forwarded to the base source.
        runner_id: Optional run identifier; falls back to ``"local-run"``.

    Raises:
        ValueError: If the subclass did not set ``input_model``.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)

    model = self.input_model
    if model is None:
        raise ValueError("input_model must be set in source subclass")
    self.config = model.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    self._cached_client: Any | None = None
    self._file_processing_deps_checked = False

    # Bookkeeping for the assets produced during the current extraction.
    self._seen_hashes: set[str] = set()
    self._hash_to_uri: dict[str, str] = {}
    self._object_ref_by_hash: dict[str, ObjectRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}

    # Keyed by both asset_hash and external_url for O(1) lookup from either.
    self._bytes_cache: dict[str, bytes] = {}
    self._mime_cache: dict[str, str] = {}
137
+
138
+ def _asset_type_value(self) -> str:
139
+ type_value = self.config.type
140
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
141
+
142
+ def _connection_option(self, key: str, default: Any = None) -> Any:
143
+ optional = self.config.optional
144
+ if optional and optional.connection:
145
+ value = getattr(optional.connection, key, None)
146
+ if value is not None:
147
+ return value
148
+ return default
149
+
150
+ def _scope_option(self, key: str, default: Any = None) -> Any:
151
+ optional = self.config.optional
152
+ if optional and optional.scope:
153
+ value = getattr(optional.scope, key, None)
154
+ if value is not None:
155
+ return value
156
+ return default
157
+
158
+ def _masked_value(self, key: str) -> str | None:
159
+ masked = getattr(self.config, "masked", None)
160
+ if masked is None:
161
+ return None
162
+ value = getattr(masked, key, None)
163
+ if isinstance(value, str) and value.strip():
164
+ return value.strip()
165
+ return None
166
+
167
+ def _string_or_none(self, value: Any) -> str | None:
168
+ if value is None:
169
+ return None
170
+ text = str(value).strip()
171
+ return text if text else None
172
+
173
+ def _prefix(self) -> str:
174
+ value = self._scope_option("prefix", "")
175
+ return str(value).strip() if value else ""
176
+
177
+ def _max_keys_per_page(self) -> int:
178
+ value = self._connection_option("max_keys_per_page", 200)
179
+ try:
180
+ parsed = int(value)
181
+ except (TypeError, ValueError):
182
+ return 200
183
+ return min(max(parsed, 1), 1000)
184
+
185
+ def _request_timeout_seconds(self) -> float:
186
+ value = self._connection_option("request_timeout_seconds", 30)
187
+ try:
188
+ parsed = float(value)
189
+ except (TypeError, ValueError):
190
+ return 30.0
191
+ return max(parsed, 1.0)
192
+
193
+ def _verify_ssl(self) -> bool:
194
+ value = self._connection_option("verify_ssl", True)
195
+ return bool(value) if isinstance(value, bool) else True
196
+
197
+ def _max_object_bytes(self) -> int:
198
+ value = self._connection_option("max_object_bytes", 5_242_880)
199
+ try:
200
+ parsed = int(value)
201
+ except (TypeError, ValueError):
202
+ return 5_242_880
203
+ return min(max(parsed, 1_024), 52_428_800)
204
+
205
+ def _include_empty_objects(self) -> bool:
206
+ return bool(self._scope_option("include_empty_objects", False))
207
+
208
+ def _include_object_metadata(self) -> bool:
209
+ return bool(self._scope_option("include_object_metadata", True))
210
+
211
+ def _include_content_preview(self) -> bool:
212
+ return bool(self._scope_option("include_content_preview", True))
213
+
214
+ def _normalized_extension_filters(self, key: str) -> list[str]:
215
+ values = self._scope_option(key, [])
216
+ if not isinstance(values, list):
217
+ return []
218
+ normalized: list[str] = []
219
+ for value in values:
220
+ if not isinstance(value, str):
221
+ continue
222
+ cleaned = value.strip().lower()
223
+ if not cleaned:
224
+ continue
225
+ if not cleaned.startswith("."):
226
+ cleaned = f".{cleaned}"
227
+ normalized.append(cleaned)
228
+ return normalized
229
+
230
+ def _include_extensions(self) -> list[str]:
231
+ return self._normalized_extension_filters("include_extensions")
232
+
233
+ def _exclude_extensions(self) -> list[str]:
234
+ return self._normalized_extension_filters("exclude_extensions")
235
+
236
+ def _object_matches_extension_filters(self, key: str) -> bool:
237
+ key_lower = key.lower()
238
+ include_extensions = self._include_extensions()
239
+ exclude_extensions = self._exclude_extensions()
240
+
241
+ if include_extensions and not any(key_lower.endswith(ext) for ext in include_extensions):
242
+ return False
243
+ if exclude_extensions and any(key_lower.endswith(ext) for ext in exclude_extensions):
244
+ return False
245
+ return True
246
+
247
+ def _parse_datetime(self, value: Any) -> datetime:
248
+ if isinstance(value, datetime):
249
+ if value.tzinfo is None:
250
+ return value.replace(tzinfo=UTC)
251
+ return value.astimezone(UTC)
252
+
253
+ if isinstance(value, str) and value.strip():
254
+ normalized = value.strip().replace("Z", "+00:00")
255
+ try:
256
+ parsed = datetime.fromisoformat(normalized)
257
+ if parsed.tzinfo is None:
258
+ return parsed.replace(tzinfo=UTC)
259
+ return parsed.astimezone(UTC)
260
+ except ValueError:
261
+ pass
262
+
263
+ return datetime.now(UTC)
264
+
265
def _apply_sampling(self, refs: Iterator[ObjectRef]) -> list[ObjectRef]:
    """Materialize ``refs`` and reduce them according to the sampling config.

    * ``ALL``: every reference, in listing order.
    * ``RANDOM``: ``rows_per_page`` references picked with a fixed seed (0),
      returned in listing order; everything if the limit covers the listing.
    * any other strategy: the ``rows_per_page`` most recently modified
      references, newest first.

    ``rows_per_page`` defaults to 100 when unset or zero.
    """
    sampling = self.config.sampling
    strategy = sampling.strategy
    limit = int(sampling.rows_per_page or 100)

    candidates = list(refs)
    if strategy == SamplingStrategy.ALL:
        return candidates

    if strategy == SamplingStrategy.RANDOM:
        total = len(candidates)
        if limit >= total:
            return candidates
        # Fixed seed keeps the selection reproducible between runs.
        rng = random.Random(0)
        chosen_positions = sorted(rng.sample(range(total), k=limit))
        return [candidates[position] for position in chosen_positions]

    newest_first = sorted(candidates, key=lambda ref: ref.last_modified, reverse=True)
    return newest_first[:limit]
283
+
284
+ def _file_extension(self, key: str) -> str:
285
+ return PurePosixPath(key).suffix.lower()
286
+
287
def _asset_type_from_mime_or_key(self, mime_type: str | None, key: str) -> OutputAssetType:
    """Classify an object from its MIME type, falling back to its extension.

    Precedence: tabular MIME types, then image/video/audio prefixes, then
    text MIME types, then the extension hint table. A remaining MIME type
    other than ``application/octet-stream`` maps to BINARY; everything else
    is OTHER. MIME parameters (after ``;``) are ignored.
    """
    base_mime = (mime_type or "").split(";", maxsplit=1)[0].strip().lower()
    extension = self._file_extension(key)

    # Tabular types are checked first so "text/csv" is not swallowed by "text/".
    if base_mime in _TABULAR_MIME_TYPES:
        return OutputAssetType.TABLE

    media_prefixes = (
        ("image/", OutputAssetType.IMAGE),
        ("video/", OutputAssetType.VIDEO),
        ("audio/", OutputAssetType.AUDIO),
    )
    for prefix, media_type in media_prefixes:
        if base_mime.startswith(prefix):
            return media_type

    if base_mime.startswith("text/") or base_mime in _TEXT_MIME_TYPES:
        return OutputAssetType.TXT

    if extension in _FILE_EXTENSION_HINTS:
        return _FILE_EXTENSION_HINTS[extension]

    if not base_mime or base_mime == "application/octet-stream":
        return OutputAssetType.OTHER
    return OutputAssetType.BINARY
309
+
310
+ def _ensure_file_processing_dependencies(self) -> None:
311
+ if self._file_processing_deps_checked:
312
+ return
313
+ self._file_processing_deps_checked = True
314
+
315
+ # Object storage sources rely on file-processing extras for MIME detection
316
+ # and document text extraction (PDF/DOCX/XLSX).
317
+ for module_name in ("filetype", "pdfplumber", "docx", "openpyxl"):
318
+ try:
319
+ require_module(
320
+ module_name=module_name,
321
+ source_name=f"{self.provider_label} source",
322
+ uv_groups=["file-processing"],
323
+ detail=(
324
+ "Object storage text extraction requires file-processing dependencies."
325
+ ),
326
+ )
327
+ except Exception as exc:
328
+ logger.debug(
329
+ "Optional file-processing module %s unavailable for %s: %s",
330
+ module_name,
331
+ self.provider_label,
332
+ exc,
333
+ )
334
+
335
def _build_snapshot(self, ref: ObjectRef) -> ContentSnapshot:
    """Inspect ``ref`` and return a snapshot of its MIME type and raw bytes.

    In discovery-only mode, or when content preview is disabled, nothing is
    downloaded and the MIME type comes from the content-type hint or the file
    name. Otherwise the object is downloaded; a download failure is logged
    and reported through ``parse_error``. Text is never extracted here:
    ``raw_content`` and ``text_content`` are always empty and extraction is
    left to fetch_content_pages(), which works from ``raw_bytes``.
    """
    # Metadata-only path: no download at all.
    if self._discovery_only or not self._include_content_preview():
        hinted_mime = (ref.content_type_hint or "").split(";", maxsplit=1)[0].strip().lower()
        if not hinted_mime:
            hinted_mime = infer_mime_type_from_file_name(ref.key)
        return ContentSnapshot(
            mime_type=hinted_mime or "application/octet-stream",
            raw_content="",
            text_content="",
            parse_error=None,
            downloaded_bytes=0,
            truncated=False,
        )

    try:
        payload, response_hint, was_truncated = self._download_object(ref)
    except Exception as exc:
        logger.warning("Failed to download object %s: %s", ref.key, exc)
        return ContentSnapshot(
            mime_type=(ref.content_type_hint or "application/octet-stream"),
            raw_content="",
            text_content="",
            parse_error=str(exc),
            downloaded_bytes=0,
            truncated=False,
        )

    self._ensure_file_processing_dependencies()
    detected_mime = resolve_mime_type(
        payload,
        declared_mime_type=response_hint or ref.content_type_hint or "",
        file_name=ref.key,
    )
    base_mime = detected_mime.split(";", 1)[0].strip().lower()

    # Non-extractable types (images, audio, video, opaque binary) carry no text,
    # so their bytes are not retained. Everything else keeps its bytes so that
    # detectors receive content in configurable-sized pages later on.
    opaque_types = ("application/octet-stream", "application/zip")
    carries_no_text = (
        base_mime.startswith(("image/", "audio/", "video/")) or base_mime in opaque_types
    )
    retained_bytes = None if carries_no_text else payload

    return ContentSnapshot(
        mime_type=detected_mime,
        raw_content="",
        text_content="",
        parse_error=None,
        downloaded_bytes=len(payload),
        truncated=was_truncated,
        raw_bytes=retained_bytes,
    )
389
+
390
def _to_asset(self, ref: ObjectRef) -> SingleAssetScanResults:
    """Turn an object reference into a scan-result asset and fill the caches.

    Builds the content snapshot, derives the asset type, caches any text,
    raw bytes and MIME type, and records the hash -> URL and hash -> ref
    mappings once the asset has been constructed.
    """
    external_url = self._external_url(ref.key)
    asset_hash = self.generate_hash_id(external_url)

    snapshot = self._build_snapshot(ref)
    asset_type = self._asset_type_from_mime_or_key(snapshot.mime_type, ref.key)

    if snapshot.text_content:
        self._content_cache[asset_hash] = (snapshot.raw_content, snapshot.text_content)

    retained = snapshot.raw_bytes
    if retained is not None:
        # Store under both keys (asset_hash and external_url) so fetch_content_pages()
        # resolves with O(1) regardless of which candidate_id the pipeline supplies.
        for cache_key in (asset_hash, external_url):
            self._bytes_cache[cache_key] = retained
        for cache_key in (asset_hash, external_url):
            self._mime_cache[cache_key] = snapshot.mime_type

    metadata: dict[str, Any] = {
        "provider": self.provider_label,
        "object_key": ref.key,
        "asset_type": asset_type.value,
    }
    if self._include_object_metadata():
        metadata |= {
            "size_bytes": ref.size,
            "etag": ref.etag,
            "last_modified": ref.last_modified.isoformat(),
            "mime_type": snapshot.mime_type,
            "downloaded_bytes": snapshot.downloaded_bytes,
            "truncated_download": snapshot.truncated,
            "parse_error": snapshot.parse_error,
        }

    # Display name: last path segment, or the whole key when it ends with "/".
    display_name = ref.key.rsplit("/", 1)[-1] or ref.key
    asset = SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=display_name,
        external_url=external_url,
        links=[],
        asset_type=asset_type,
        source_id=self.source_id,
        created_at=ref.last_modified,
        updated_at=ref.last_modified,
        runner_id=self.runner_id,
    )

    # Recorded only after the asset was built successfully.
    self._hash_to_uri[asset_hash] = external_url
    self._object_ref_by_hash[asset_hash] = ref
    return asset
440
+
441
def test_connection(self) -> dict[str, Any]:
    """Probe the storage scope by listing up to 100 objects.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and a human-readable ``message``. Listing errors
        are reported in the result rather than raised.
    """
    probe_limit = 100
    report = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }
    try:
        found = len(list(itertools.islice(self._list_objects(), probe_limit)))
    except Exception as exc:
        report["status"] = "FAILURE"
        report["message"] = f"Failed to connect to {self.provider_label}: {exc}"
    else:
        # The listing is capped, so hitting the cap only proves "at least 100".
        shown = "100+" if found >= probe_limit else found
        report["status"] = "SUCCESS"
        report["message"] = (
            f"Connected to {self.provider_label}. "
            f"Found {shown} object(s) in current scope."
        )
    return report
457
+
458
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield the sampled objects as assets, in batches of ``BATCH_SIZE``.

    All per-run caches are reset first. Objects whose transformation fails
    are logged and skipped, assets with an already-seen hash are dropped,
    and iteration stops early once the source is aborted. Nothing is yielded
    when the source was aborted before the call.
    """
    if self._aborted:
        return

    # Start every extraction from a clean slate.
    self._seen_hashes = set()
    self._content_cache = {}
    self._hash_to_uri = {}
    self._object_ref_by_hash = {}
    self._bytes_cache = {}
    self._mime_cache = {}

    selected = self._apply_sampling(self._list_objects())

    pending: list[SingleAssetScanResults] = []
    for ref in selected:
        if self._aborted:
            break

        try:
            asset = self._to_asset(ref)
        except Exception as exc:
            logger.warning("Skipping object %s due to transformation error: %s", ref.key, exc)
            continue

        if asset.hash not in self._seen_hashes:
            self._seen_hashes.add(asset.hash)
            pending.append(asset)

            if len(pending) >= self.BATCH_SIZE:
                yield pending
                pending = []

    # Flush the final, partially filled batch.
    if pending:
        yield pending
495
+
496
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Return ``(raw_bytes, mime_type)`` for an asset, downloading if needed.

    ``asset_id`` may be an asset hash, a ``"<prefix>_#_<url>"`` composite, or
    a value that decodes to such a composite. Cached bytes are returned
    directly; otherwise the object is re-downloaded. Returns ``None`` when the
    asset is unknown or the download fails. Only the MIME type is cached after
    a fresh download.
    """
    cached_bytes = self._bytes_cache.get(asset_id)
    cached_mime = self._mime_cache.get(asset_id, "")
    if cached_bytes is not None and cached_mime:
        return cached_bytes, cached_mime

    resolved_hash = asset_id
    resolved_url = self._hash_to_uri.get(asset_id)
    if resolved_url is None:
        # Not a known hash: try to recover the URL from a composite identifier.
        composite = asset_id
        if "_#_" not in composite:
            try:
                composite = unhash_id(asset_id)
            except Exception:
                composite = asset_id
        if "_#_" in composite:
            resolved_url = composite.split("_#_", maxsplit=1)[1]
        else:
            resolved_url = asset_id
        resolved_hash = self.generate_hash_id(resolved_url)

    ref = self._object_ref_by_hash.get(resolved_hash)
    if ref is None:
        return None

    try:
        payload, response_hint, _truncated = self._download_object(ref)
    except Exception as exc:
        logger.warning("Failed to download object %s for binary fetch: %s", ref.key, exc)
        return None

    detected_mime = resolve_mime_type(
        payload,
        declared_mime_type=response_hint or ref.content_type_hint or "",
        file_name=ref.key,
    )
    self._mime_cache[resolved_hash] = detected_mime
    if resolved_url:
        self._mime_cache[resolved_url] = detected_mime
    return payload, detected_mime
538
+
539
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield ``(raw, text)`` pages of content for ``asset_id``.

    When raw bytes are cached for the id, they are split into pages by
    ``iter_asset_pages`` (page size ``rows_per_page``, default 100) and each
    page is yielded with an empty raw part. Otherwise the single result of
    ``fetch_content`` is yielded, if there is one.
    """
    cached_bytes = self._bytes_cache.get(asset_id)
    cached_mime = self._mime_cache.get(asset_id, "")

    if cached_bytes is None:
        whole = await self.fetch_content(asset_id)
        if whole:
            yield whole
        return

    sampling = self.config.sampling
    page_size = int(sampling.rows_per_page or 100)
    with_headers = sampling.include_column_names
    if with_headers is None:
        with_headers = True
    with_headers = bool(with_headers)

    page_source = self.iter_asset_pages(
        cached_bytes,
        cached_mime,
        page_size,
        with_headers,
        file_name=self._file_name_for_asset_id(asset_id),
    )
    # Run the (potentially blocking) file parsing in a thread so pyarrow /
    # pdfplumber can't freeze the event loop during large file iteration.
    pages: list[str] = await asyncio.to_thread(list, page_source)
    for page_text in pages:
        yield "", page_text
568
+
569
+ def _file_name_for_asset_id(self, asset_id: str) -> str:
570
+ external_url = self._hash_to_uri.get(asset_id)
571
+ if external_url is None:
572
+ decoded = asset_id
573
+ if "_#_" not in decoded:
574
+ try:
575
+ decoded = unhash_id(asset_id)
576
+ except Exception:
577
+ decoded = asset_id
578
+ if "_#_" in decoded:
579
+ _, candidate = decoded.split("_#_", maxsplit=1)
580
+ external_url = candidate
581
+ else:
582
+ external_url = asset_id
583
+
584
+ ref_hash = self.generate_hash_id(external_url)
585
+ ref = self._object_ref_by_hash.get(ref_hash)
586
+ if ref is not None:
587
+ return ref.key
588
+ return external_url
589
+
590
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the cached ``(raw_content, text_content)`` pair for an asset.

    ``asset_id`` may be an asset hash, a ``"<prefix>_#_<url>"`` composite, or
    a value that decodes to such a composite. On a cache miss the snapshot is
    rebuilt from the stored object reference; ``None`` is returned when the
    asset is unknown or the snapshot carries no text.

    NOTE(review): ``_build_snapshot`` as defined in this class always leaves
    ``text_content`` empty, so unless a subclass overrides it the rebuild path
    returns ``None`` - confirm this is intended.
    """
    try:
        return self._content_cache[asset_id]
    except KeyError:
        pass

    resolved_hash = asset_id
    resolved_url = self._hash_to_uri.get(asset_id)
    if resolved_url is None:
        # Not a known hash: try to recover the URL from a composite identifier.
        composite = asset_id
        if "_#_" not in composite:
            try:
                composite = unhash_id(asset_id)
            except Exception:
                composite = asset_id
        if "_#_" in composite:
            resolved_url = composite.split("_#_", maxsplit=1)[1]
        else:
            resolved_url = asset_id
        resolved_hash = self.generate_hash_id(resolved_url)

    hit = self._content_cache.get(resolved_hash)
    if hit is not None:
        return hit

    ref = self._object_ref_by_hash.get(resolved_hash)
    if ref is None:
        return None

    snapshot = self._build_snapshot(ref)
    if not snapshot.text_content:
        return None

    content = (snapshot.raw_content, snapshot.text_content)
    self._content_cache[resolved_hash] = content
    if resolved_url:
        self._hash_to_uri[resolved_hash] = resolved_url
    return content
628
+
629
def generate_hash_id(self, asset_id: str) -> str:
    """Hash ``asset_id`` together with the source type and remember the mapping.

    Side effect: records ``hash -> asset_id`` in ``_hash_to_uri`` so the hash
    can later be resolved back to the original identifier.
    """
    source_type = self._asset_type_value()
    digest = hash_id(source_type, asset_id)
    self._hash_to_uri[digest] = asset_id
    return digest
633
+
634
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Point the finding's location at the asset's external URL.

    ``text_content`` is accepted as part of the signature but not used here.
    """
    del text_content  # intentionally unused
    finding.location = Location(path=asset.external_url)
642
+
643
def evict_asset_cache(self, asset_hash: str) -> None:
    """Drop every cached entry for ``asset_hash``.

    Content, bytes and MIME entries are removed under both the hash and the
    external URL it maps to; the object reference is removed under the hash.
    The hash -> URL mapping itself is left in place.
    """
    linked_url = self._hash_to_uri.get(asset_hash)
    dual_keyed = (self._content_cache, self._bytes_cache, self._mime_cache)

    for cache in dual_keyed:
        cache.pop(asset_hash, None)
    self._object_ref_by_hash.pop(asset_hash, None)

    if linked_url:
        for cache in dual_keyed:
            cache.pop(linked_url, None)
653
+
654
def abort(self) -> None:
    """Log the abort request, then delegate to the base class ``abort()``."""
    logger.info("Aborting object storage extraction...")
    super().abort()
657
+
658
def cleanup(self) -> None:
    """Close the cached storage client, if there is one and it can be closed.

    Closing is best-effort: a failure is logged at debug level, now with the
    exception details attached, and never propagates to the caller.
    """
    client = self._cached_client
    if client is None:
        return

    close_fn = getattr(client, "close", None)
    if not callable(close_fn):
        return

    try:
        close_fn()
    except Exception:
        # exc_info keeps the cause visible instead of discarding it silently.
        logger.debug("Failed to close object storage client cleanly", exc_info=True)
668
+
669
+ @abstractmethod
670
+ def _list_objects(self) -> Iterator[ObjectRef]:
671
+ raise NotImplementedError
672
+
673
+ @abstractmethod
674
+ def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
675
+ raise NotImplementedError
676
+
677
+ @abstractmethod
678
+ def _external_url(self, key: str) -> str:
679
+ raise NotImplementedError
@@ -0,0 +1,3 @@
1
# Re-export the source class so it can be imported from the package itself.
from .source import OracleSource

# Public API of this package.
__all__ = ["OracleSource"]