classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,679 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import itertools
|
|
5
|
+
import logging
|
|
6
|
+
import random
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from collections.abc import AsyncGenerator, Iterator
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import UTC, datetime
|
|
11
|
+
from pathlib import PurePosixPath
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from ...models.generated_input import SamplingStrategy
|
|
15
|
+
from ...models.generated_single_asset_scan_results import (
|
|
16
|
+
AssetType as OutputAssetType,
|
|
17
|
+
)
|
|
18
|
+
from ...models.generated_single_asset_scan_results import (
|
|
19
|
+
DetectionResult,
|
|
20
|
+
Location,
|
|
21
|
+
SingleAssetScanResults,
|
|
22
|
+
)
|
|
23
|
+
from ...utils.file_parser import infer_mime_type_from_file_name, resolve_mime_type
|
|
24
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
25
|
+
from ..base import BaseSource
|
|
26
|
+
from ..dependencies import require_module
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# MIME types classified as plain text by _asset_type_from_mime_or_key(), in
# addition to anything whose type starts with "text/".
_TEXT_MIME_TYPES = {
    "application/json",
    "application/xml",
    "text/xml",
    "application/x-ndjson",
    "application/ld+json",
}
|
|
37
|
+
|
|
38
|
+
# MIME types classified as tabular content by _asset_type_from_mime_or_key().
# This set is checked before the generic "text/" rule, so "text/csv" maps to
# TABLE rather than TXT.
_TABULAR_MIME_TYPES = {
    "text/csv",
    "text/tab-separated-values",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/parquet",
    "application/vnd.apache.parquet",
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Fallback mapping from lower-cased file extension (including the leading dot)
# to an asset type. _asset_type_from_mime_or_key() consults it only after the
# MIME-based rules found no match.
_FILE_EXTENSION_HINTS: dict[str, OutputAssetType] = {
    # Images
    ".png": OutputAssetType.IMAGE,
    ".jpg": OutputAssetType.IMAGE,
    ".jpeg": OutputAssetType.IMAGE,
    ".gif": OutputAssetType.IMAGE,
    ".webp": OutputAssetType.IMAGE,
    ".svg": OutputAssetType.IMAGE,
    ".bmp": OutputAssetType.IMAGE,
    ".ico": OutputAssetType.IMAGE,
    # Video
    ".mp4": OutputAssetType.VIDEO,
    ".webm": OutputAssetType.VIDEO,
    ".mov": OutputAssetType.VIDEO,
    ".mkv": OutputAssetType.VIDEO,
    ".avi": OutputAssetType.VIDEO,
    # Audio
    ".mp3": OutputAssetType.AUDIO,
    ".wav": OutputAssetType.AUDIO,
    ".aac": OutputAssetType.AUDIO,
    ".ogg": OutputAssetType.AUDIO,
    # Documents, spreadsheets and presentations
    ".pdf": OutputAssetType.BINARY,
    ".doc": OutputAssetType.BINARY,
    ".docx": OutputAssetType.BINARY,
    ".xls": OutputAssetType.TABLE,
    ".xlsx": OutputAssetType.TABLE,
    ".ppt": OutputAssetType.BINARY,
    ".pptx": OutputAssetType.BINARY,
    # Archives
    ".zip": OutputAssetType.BINARY,
    ".rar": OutputAssetType.BINARY,
    ".7z": OutputAssetType.BINARY,
    ".tar": OutputAssetType.BINARY,
    ".gz": OutputAssetType.BINARY,
    # Structured data, text and markup
    ".parquet": OutputAssetType.TABLE,
    ".json": OutputAssetType.TXT,
    ".xml": OutputAssetType.TXT,
    ".txt": OutputAssetType.TXT,
    ".csv": OutputAssetType.TABLE,
    ".tsv": OutputAssetType.TABLE,
    ".md": OutputAssetType.TXT,
    ".html": OutputAssetType.TXT,
    ".htm": OutputAssetType.TXT,
}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass(frozen=True)
class ObjectRef:
    """Immutable listing entry for one stored object.

    Instances come from ``_list_objects()`` and are passed to
    ``_download_object()`` and ``_to_asset()``.
    """

    # Object key; also used as the file name for MIME and extension inference.
    key: str
    # Reported as "size_bytes" in the asset metadata.
    size: int
    # Used for the asset's created_at/updated_at and for latest-first sampling.
    last_modified: datetime
    # Reported as "etag" in the asset metadata when available.
    etag: str | None = None
    # MIME type declared by the listing, if any; used as a hint when resolving
    # the effective MIME type.
    content_type_hint: str | None = None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass(frozen=True)
class ContentSnapshot:
    """Immutable result of inspecting (and optionally downloading) one object."""

    # Resolved MIME type, or the declared/inferred one when nothing was downloaded.
    mime_type: str
    raw_content: str
    text_content: str
    # str() of the download exception, or None when no error occurred.
    parse_error: str | None
    # Number of bytes actually downloaded (0 when the download was skipped or failed).
    downloaded_bytes: int
    # Truncation flag as reported by _download_object().
    truncated: bool
    # Raw bytes retained for extractable (non image/audio/video/opaque-binary)
    # files so fetch_content_pages() can iterate content in configurable-sized
    # pages instead of one monolithic text blob.
    raw_bytes: bytes | None = None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ObjectStorageSourceBase(BaseSource, ABC):
    """Shared base for sources that scan objects held in an object store.

    Subclasses set ``input_model`` and implement ``_list_objects()``,
    ``_download_object()`` and ``_external_url()``. This class applies
    sampling, converts each ObjectRef into a SingleAssetScanResults and serves
    object content from in-memory caches.
    """

    # Provider name used in log/status messages and in the asset metadata.
    provider_label = "OBJECT_STORAGE"
    # Model class used to validate the recipe; __init__ raises ValueError if a
    # subclass leaves this as None.
    input_model: Any = None
|
|
115
|
+
|
|
116
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and set up empty per-run caches.

    Args:
        recipe: Raw source recipe; validated with ``input_model.model_validate``.
        source_id: Forwarded to the base source.
        runner_id: Forwarded to the base source; falls back to "local-run"
            when empty or missing.

    Raises:
        ValueError: If the subclass did not set ``input_model``.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)

    model = self.input_model
    if model is None:
        raise ValueError("input_model must be set in source subclass")
    self.config = model.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"
    self._cached_client: Any | None = None

    # Per-run lookup state; extract_raw() resets all of these dictionaries/sets.
    self._seen_hashes: set[str] = set()
    self._content_cache: dict[str, tuple[str, str]] = {}
    self._hash_to_uri: dict[str, str] = {}
    self._object_ref_by_hash: dict[str, ObjectRef] = {}
    self._file_processing_deps_checked = False
    # Keyed by both asset_hash and external_url for O(1) lookup from either.
    self._bytes_cache: dict[str, bytes] = {}
    self._mime_cache: dict[str, str] = {}
|
|
137
|
+
|
|
138
|
+
def _asset_type_value(self) -> str:
|
|
139
|
+
type_value = self.config.type
|
|
140
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
141
|
+
|
|
142
|
+
def _connection_option(self, key: str, default: Any = None) -> Any:
|
|
143
|
+
optional = self.config.optional
|
|
144
|
+
if optional and optional.connection:
|
|
145
|
+
value = getattr(optional.connection, key, None)
|
|
146
|
+
if value is not None:
|
|
147
|
+
return value
|
|
148
|
+
return default
|
|
149
|
+
|
|
150
|
+
def _scope_option(self, key: str, default: Any = None) -> Any:
|
|
151
|
+
optional = self.config.optional
|
|
152
|
+
if optional and optional.scope:
|
|
153
|
+
value = getattr(optional.scope, key, None)
|
|
154
|
+
if value is not None:
|
|
155
|
+
return value
|
|
156
|
+
return default
|
|
157
|
+
|
|
158
|
+
def _masked_value(self, key: str) -> str | None:
|
|
159
|
+
masked = getattr(self.config, "masked", None)
|
|
160
|
+
if masked is None:
|
|
161
|
+
return None
|
|
162
|
+
value = getattr(masked, key, None)
|
|
163
|
+
if isinstance(value, str) and value.strip():
|
|
164
|
+
return value.strip()
|
|
165
|
+
return None
|
|
166
|
+
|
|
167
|
+
def _string_or_none(self, value: Any) -> str | None:
|
|
168
|
+
if value is None:
|
|
169
|
+
return None
|
|
170
|
+
text = str(value).strip()
|
|
171
|
+
return text if text else None
|
|
172
|
+
|
|
173
|
+
def _prefix(self) -> str:
|
|
174
|
+
value = self._scope_option("prefix", "")
|
|
175
|
+
return str(value).strip() if value else ""
|
|
176
|
+
|
|
177
|
+
def _max_keys_per_page(self) -> int:
|
|
178
|
+
value = self._connection_option("max_keys_per_page", 200)
|
|
179
|
+
try:
|
|
180
|
+
parsed = int(value)
|
|
181
|
+
except (TypeError, ValueError):
|
|
182
|
+
return 200
|
|
183
|
+
return min(max(parsed, 1), 1000)
|
|
184
|
+
|
|
185
|
+
def _request_timeout_seconds(self) -> float:
|
|
186
|
+
value = self._connection_option("request_timeout_seconds", 30)
|
|
187
|
+
try:
|
|
188
|
+
parsed = float(value)
|
|
189
|
+
except (TypeError, ValueError):
|
|
190
|
+
return 30.0
|
|
191
|
+
return max(parsed, 1.0)
|
|
192
|
+
|
|
193
|
+
def _verify_ssl(self) -> bool:
|
|
194
|
+
value = self._connection_option("verify_ssl", True)
|
|
195
|
+
return bool(value) if isinstance(value, bool) else True
|
|
196
|
+
|
|
197
|
+
def _max_object_bytes(self) -> int:
|
|
198
|
+
value = self._connection_option("max_object_bytes", 5_242_880)
|
|
199
|
+
try:
|
|
200
|
+
parsed = int(value)
|
|
201
|
+
except (TypeError, ValueError):
|
|
202
|
+
return 5_242_880
|
|
203
|
+
return min(max(parsed, 1_024), 52_428_800)
|
|
204
|
+
|
|
205
|
+
def _include_empty_objects(self) -> bool:
|
|
206
|
+
return bool(self._scope_option("include_empty_objects", False))
|
|
207
|
+
|
|
208
|
+
def _include_object_metadata(self) -> bool:
|
|
209
|
+
return bool(self._scope_option("include_object_metadata", True))
|
|
210
|
+
|
|
211
|
+
def _include_content_preview(self) -> bool:
|
|
212
|
+
return bool(self._scope_option("include_content_preview", True))
|
|
213
|
+
|
|
214
|
+
def _normalized_extension_filters(self, key: str) -> list[str]:
|
|
215
|
+
values = self._scope_option(key, [])
|
|
216
|
+
if not isinstance(values, list):
|
|
217
|
+
return []
|
|
218
|
+
normalized: list[str] = []
|
|
219
|
+
for value in values:
|
|
220
|
+
if not isinstance(value, str):
|
|
221
|
+
continue
|
|
222
|
+
cleaned = value.strip().lower()
|
|
223
|
+
if not cleaned:
|
|
224
|
+
continue
|
|
225
|
+
if not cleaned.startswith("."):
|
|
226
|
+
cleaned = f".{cleaned}"
|
|
227
|
+
normalized.append(cleaned)
|
|
228
|
+
return normalized
|
|
229
|
+
|
|
230
|
+
def _include_extensions(self) -> list[str]:
|
|
231
|
+
return self._normalized_extension_filters("include_extensions")
|
|
232
|
+
|
|
233
|
+
def _exclude_extensions(self) -> list[str]:
|
|
234
|
+
return self._normalized_extension_filters("exclude_extensions")
|
|
235
|
+
|
|
236
|
+
def _object_matches_extension_filters(self, key: str) -> bool:
|
|
237
|
+
key_lower = key.lower()
|
|
238
|
+
include_extensions = self._include_extensions()
|
|
239
|
+
exclude_extensions = self._exclude_extensions()
|
|
240
|
+
|
|
241
|
+
if include_extensions and not any(key_lower.endswith(ext) for ext in include_extensions):
|
|
242
|
+
return False
|
|
243
|
+
if exclude_extensions and any(key_lower.endswith(ext) for ext in exclude_extensions):
|
|
244
|
+
return False
|
|
245
|
+
return True
|
|
246
|
+
|
|
247
|
+
def _parse_datetime(self, value: Any) -> datetime:
|
|
248
|
+
if isinstance(value, datetime):
|
|
249
|
+
if value.tzinfo is None:
|
|
250
|
+
return value.replace(tzinfo=UTC)
|
|
251
|
+
return value.astimezone(UTC)
|
|
252
|
+
|
|
253
|
+
if isinstance(value, str) and value.strip():
|
|
254
|
+
normalized = value.strip().replace("Z", "+00:00")
|
|
255
|
+
try:
|
|
256
|
+
parsed = datetime.fromisoformat(normalized)
|
|
257
|
+
if parsed.tzinfo is None:
|
|
258
|
+
return parsed.replace(tzinfo=UTC)
|
|
259
|
+
return parsed.astimezone(UTC)
|
|
260
|
+
except ValueError:
|
|
261
|
+
pass
|
|
262
|
+
|
|
263
|
+
return datetime.now(UTC)
|
|
264
|
+
|
|
265
|
+
def _apply_sampling(self, refs: Iterator[ObjectRef]) -> list[ObjectRef]:
    """Materialize ``refs`` and reduce them per the configured sampling strategy.

    * ``ALL``: every ref, in listing order.
    * ``RANDOM``: ``limit`` refs chosen with a fixed seed (0), so the choice
      is reproducible; returned in listing order. All refs are returned when
      there are no more than ``limit``.
    * any other strategy: the ``limit`` most recently modified refs, newest
      first.

    ``limit`` is ``sampling.rows_per_page`` (100 when unset/zero).
    NOTE(review): the same option is used as the page size in
    fetch_content_pages() — confirm that sharing it is intended.
    """
    sampling = self.config.sampling
    strategy = sampling.strategy
    limit = int(sampling.rows_per_page or 100)

    candidates = list(refs)
    if strategy == SamplingStrategy.ALL:
        return candidates

    if strategy == SamplingStrategy.RANDOM:
        total = len(candidates)
        if limit >= total:
            return candidates
        rng = random.Random(0)
        chosen = sorted(rng.sample(range(total), k=limit))
        return [candidates[position] for position in chosen]

    newest_first = sorted(candidates, key=lambda ref: ref.last_modified, reverse=True)
    return newest_first[:limit]
|
|
283
|
+
|
|
284
|
+
def _file_extension(self, key: str) -> str:
|
|
285
|
+
return PurePosixPath(key).suffix.lower()
|
|
286
|
+
|
|
287
|
+
def _asset_type_from_mime_or_key(self, mime_type: str | None, key: str) -> OutputAssetType:
    """Classify an object from its MIME type, falling back to its extension.

    Precedence: tabular MIME types, then image/video/audio prefixes, then
    text (``text/*`` or a known text MIME type), then the extension hint
    table, then BINARY for any other specific MIME type. OTHER is returned
    when the MIME type is empty or ``application/octet-stream`` and the
    extension is unknown. MIME parameters (after ";") are ignored.
    """
    mime = (mime_type or "").split(";", maxsplit=1)[0].strip().lower()
    extension = self._file_extension(key)

    if mime in _TABULAR_MIME_TYPES:
        return OutputAssetType.TABLE

    media_families = (
        ("image/", OutputAssetType.IMAGE),
        ("video/", OutputAssetType.VIDEO),
        ("audio/", OutputAssetType.AUDIO),
    )
    for family_prefix, family_type in media_families:
        if mime.startswith(family_prefix):
            return family_type

    if mime.startswith("text/") or mime in _TEXT_MIME_TYPES:
        return OutputAssetType.TXT

    hinted = _FILE_EXTENSION_HINTS.get(extension)
    if hinted is not None:
        return hinted

    is_specific_mime = bool(mime) and mime != "application/octet-stream"
    return OutputAssetType.BINARY if is_specific_mime else OutputAssetType.OTHER
|
|
309
|
+
|
|
310
|
+
def _ensure_file_processing_dependencies(self) -> None:
|
|
311
|
+
if self._file_processing_deps_checked:
|
|
312
|
+
return
|
|
313
|
+
self._file_processing_deps_checked = True
|
|
314
|
+
|
|
315
|
+
# Object storage sources rely on file-processing extras for MIME detection
|
|
316
|
+
# and document text extraction (PDF/DOCX/XLSX).
|
|
317
|
+
for module_name in ("filetype", "pdfplumber", "docx", "openpyxl"):
|
|
318
|
+
try:
|
|
319
|
+
require_module(
|
|
320
|
+
module_name=module_name,
|
|
321
|
+
source_name=f"{self.provider_label} source",
|
|
322
|
+
uv_groups=["file-processing"],
|
|
323
|
+
detail=(
|
|
324
|
+
"Object storage text extraction requires file-processing dependencies."
|
|
325
|
+
),
|
|
326
|
+
)
|
|
327
|
+
except Exception as exc:
|
|
328
|
+
logger.debug(
|
|
329
|
+
"Optional file-processing module %s unavailable for %s: %s",
|
|
330
|
+
module_name,
|
|
331
|
+
self.provider_label,
|
|
332
|
+
exc,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
def _build_snapshot(self, ref: ObjectRef) -> ContentSnapshot:
    """Build a ContentSnapshot for ``ref``, downloading the object if allowed.

    Three outcomes:

    * Discovery-only mode or content preview disabled: nothing is
      downloaded. The MIME type comes from the listing hint, else from the
      file name, else ``application/octet-stream``.
    * Download failure: the error is logged and stored in ``parse_error``,
      and the MIME type is the raw listing hint (or
      ``application/octet-stream``).
    * Download success: the MIME type is resolved from the bytes, and the
      bytes are retained in ``raw_bytes`` unless the type is non-extractable.

    ``raw_content`` and ``text_content`` are empty in every case; text
    extraction is deferred to fetch_content_pages().
    """
    if self._discovery_only or not self._include_content_preview():
        # Drop MIME parameters such as "; charset=utf-8" before using the hint.
        mime = (ref.content_type_hint or "").split(";", maxsplit=1)[0].strip().lower()
        if not mime:
            mime = infer_mime_type_from_file_name(ref.key)
        return ContentSnapshot(
            mime_type=mime or "application/octet-stream",
            raw_content="",
            text_content="",
            parse_error=None,
            downloaded_bytes=0,
            truncated=False,
        )

    try:
        file_bytes, content_type_hint, truncated = self._download_object(ref)
    except Exception as exc:
        # Best effort: a failed download still produces an asset, with the
        # error recorded instead of content.
        logger.warning("Failed to download object %s: %s", ref.key, exc)
        return ContentSnapshot(
            mime_type=(ref.content_type_hint or "application/octet-stream"),
            raw_content="",
            text_content="",
            parse_error=str(exc),
            downloaded_bytes=0,
            truncated=False,
        )

    self._ensure_file_processing_dependencies()
    # The hint returned by the download takes precedence over the listing hint.
    mime_type = resolve_mime_type(
        file_bytes,
        declared_mime_type=content_type_hint or ref.content_type_hint or "",
        file_name=ref.key,
    )
    normalized_mime = mime_type.split(";", 1)[0].strip().lower()

    # Non-extractable types (images, audio, video, opaque binary) carry no text.
    # Everything else defers extraction to fetch_content_pages() so detectors
    # receive content in configurable-sized pages instead of one monolithic blob.
    is_non_extractable = normalized_mime.startswith(
        ("image/", "audio/", "video/")
    ) or normalized_mime in (
        "application/octet-stream",
        "application/zip",
    )

    return ContentSnapshot(
        mime_type=mime_type,
        raw_content="",
        text_content="",
        parse_error=None,
        downloaded_bytes=len(file_bytes),
        truncated=truncated,
        raw_bytes=None if is_non_extractable else file_bytes,
    )
|
|
389
|
+
|
|
390
|
+
def _to_asset(self, ref: ObjectRef) -> SingleAssetScanResults:
    """Convert one ObjectRef into a SingleAssetScanResults and prime the caches.

    Side effects: stores the snapshot's text (if any) in the content cache,
    stores retained raw bytes and their MIME type under both the asset hash
    and the external URL, and records the hash -> URL and hash -> ref
    mappings used by the fetch_* methods.
    """
    url = self._external_url(ref.key)
    digest = self.generate_hash_id(url)

    snapshot = self._build_snapshot(ref)
    kind = self._asset_type_from_mime_or_key(snapshot.mime_type, ref.key)

    if snapshot.text_content:
        self._content_cache[digest] = (snapshot.raw_content, snapshot.text_content)

    payload = snapshot.raw_bytes
    if payload is not None:
        # Store under both keys (asset hash and external URL) so fetch_content_pages()
        # resolves with O(1) regardless of which candidate_id the pipeline supplies.
        self._bytes_cache[digest] = payload
        self._bytes_cache[url] = payload
        self._mime_cache[digest] = snapshot.mime_type
        self._mime_cache[url] = snapshot.mime_type

    metadata: dict[str, Any] = {
        "provider": self.provider_label,
        "object_key": ref.key,
        "asset_type": kind.value,
    }
    if self._include_object_metadata():
        metadata["size_bytes"] = ref.size
        metadata["etag"] = ref.etag
        metadata["last_modified"] = ref.last_modified.isoformat()
        metadata["mime_type"] = snapshot.mime_type
        metadata["downloaded_bytes"] = snapshot.downloaded_bytes
        metadata["truncated_download"] = snapshot.truncated
        metadata["parse_error"] = snapshot.parse_error

    # Last path segment of the key; the full key when that segment is empty
    # (e.g. a key ending in "/").
    display_name = ref.key.rsplit("/", 1)[-1] or ref.key

    asset = SingleAssetScanResults(
        hash=digest,
        checksum=self.calculate_checksum(metadata),
        name=display_name,
        external_url=url,
        links=[],
        asset_type=kind,
        source_id=self.source_id,
        created_at=ref.last_modified,
        updated_at=ref.last_modified,
        runner_id=self.runner_id,
    )
    self._hash_to_uri[digest] = url
    self._object_ref_by_hash[digest] = ref
    return asset
|
|
440
|
+
|
|
441
|
+
def test_connection(self) -> dict[str, Any]:
|
|
442
|
+
result = {
|
|
443
|
+
"timestamp": datetime.now(UTC).isoformat(),
|
|
444
|
+
"source_type": self.recipe.get("type"),
|
|
445
|
+
}
|
|
446
|
+
try:
|
|
447
|
+
count = sum(1 for _ in itertools.islice(self._list_objects(), 100))
|
|
448
|
+
result["status"] = "SUCCESS"
|
|
449
|
+
result["message"] = (
|
|
450
|
+
f"Connected to {self.provider_label}. "
|
|
451
|
+
f"Found {'100+' if count >= 100 else count} object(s) in current scope."
|
|
452
|
+
)
|
|
453
|
+
except Exception as exc:
|
|
454
|
+
result["status"] = "FAILURE"
|
|
455
|
+
result["message"] = f"Failed to connect to {self.provider_label}: {exc}"
|
|
456
|
+
return result
|
|
457
|
+
|
|
458
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield assets for the sampled objects in batches of up to ``BATCH_SIZE``.

    All per-run caches are reset first. Objects whose conversion raises are
    logged and skipped, and assets with an already-seen hash are dropped.
    Iteration stops early once the source is aborted; assets already
    collected are still yielded.
    """
    if self._aborted:
        return

    # Start from a clean slate so state from a previous run cannot leak in.
    self._seen_hashes = set()
    self._content_cache = {}
    self._hash_to_uri = {}
    self._object_ref_by_hash = {}
    self._bytes_cache = {}
    self._mime_cache = {}

    pending: list[SingleAssetScanResults] = []
    for ref in self._apply_sampling(self._list_objects()):
        if self._aborted:
            break

        try:
            asset = self._to_asset(ref)
        except Exception as exc:
            logger.warning("Skipping object %s due to transformation error: %s", ref.key, exc)
            continue

        if asset.hash not in self._seen_hashes:
            self._seen_hashes.add(asset.hash)
            pending.append(asset)

            if len(pending) >= self.BATCH_SIZE:
                yield pending
                pending = []

    if pending:
        yield pending
|
|
495
|
+
|
|
496
|
+
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Return ``(bytes, mime_type)`` for an asset, downloading on a cache miss.

    ``asset_id`` may be an asset hash, a "<type>_#_<url>" identifier (or
    something ``unhash_id`` decodes to one), or an external URL. Returns
    ``None`` when the asset is unknown or the download fails. A fresh
    download caches the MIME type but not the bytes.
    """
    cached_bytes = self._bytes_cache.get(asset_id)
    cached_mime = self._mime_cache.get(asset_id, "")
    if cached_bytes is not None and cached_mime:
        return cached_bytes, cached_mime

    external_url = self._hash_to_uri.get(asset_id)
    asset_hash = asset_id
    if external_url is None:
        # Not a known hash: decode it if necessary, then use the part after
        # the "_#_" separator as the URL (or the id itself if there is none).
        decoded = asset_id
        if "_#_" not in asset_id:
            try:
                decoded = unhash_id(asset_id)
            except Exception:
                decoded = asset_id
        external_url = decoded.split("_#_", maxsplit=1)[1] if "_#_" in decoded else asset_id
        asset_hash = self.generate_hash_id(external_url)

    ref = self._object_ref_by_hash.get(asset_hash)
    if ref is None:
        return None

    try:
        downloaded = self._download_object(ref)
    except Exception as exc:
        logger.warning("Failed to download object %s for binary fetch: %s", ref.key, exc)
        return None
    file_bytes, content_type_hint, _truncated = downloaded

    mime_type = resolve_mime_type(
        file_bytes,
        declared_mime_type=content_type_hint or ref.content_type_hint or "",
        file_name=ref.key,
    )
    self._mime_cache[asset_hash] = mime_type
    if external_url:
        self._mime_cache[external_url] = mime_type
    return file_bytes, mime_type
|
|
538
|
+
|
|
539
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield ``(raw, text)`` content pages for an asset.

    When raw bytes are cached for ``asset_id``, they are split into pages by
    ``iter_asset_pages`` and each page is yielded as ``("", page_text)``.
    The page size is ``sampling.rows_per_page`` (100 when unset), and column
    names are included unless ``sampling.include_column_names`` is falsy.
    Otherwise the single result of ``fetch_content``, if any, is yielded.
    """
    cached_bytes = self._bytes_cache.get(asset_id)
    cached_mime = self._mime_cache.get(asset_id, "")

    if cached_bytes is None:
        single = await self.fetch_content(asset_id)
        if single:
            yield single
        return

    sampling = self.config.sampling
    page_rows = int(sampling.rows_per_page or 100)
    with_column_names = bool(
        sampling.include_column_names if sampling.include_column_names is not None else True
    )
    page_source = self.iter_asset_pages(
        cached_bytes,
        cached_mime,
        page_rows,
        with_column_names,
        file_name=self._file_name_for_asset_id(asset_id),
    )
    # Run the (potentially blocking) file parsing in a thread so pyarrow /
    # pdfplumber can't freeze the event loop during large file iteration.
    pages: list[str] = await asyncio.to_thread(list, page_source)
    for page_text in pages:
        yield "", page_text
|
|
568
|
+
|
|
569
|
+
def _file_name_for_asset_id(self, asset_id: str) -> str:
|
|
570
|
+
external_url = self._hash_to_uri.get(asset_id)
|
|
571
|
+
if external_url is None:
|
|
572
|
+
decoded = asset_id
|
|
573
|
+
if "_#_" not in decoded:
|
|
574
|
+
try:
|
|
575
|
+
decoded = unhash_id(asset_id)
|
|
576
|
+
except Exception:
|
|
577
|
+
decoded = asset_id
|
|
578
|
+
if "_#_" in decoded:
|
|
579
|
+
_, candidate = decoded.split("_#_", maxsplit=1)
|
|
580
|
+
external_url = candidate
|
|
581
|
+
else:
|
|
582
|
+
external_url = asset_id
|
|
583
|
+
|
|
584
|
+
ref_hash = self.generate_hash_id(external_url)
|
|
585
|
+
ref = self._object_ref_by_hash.get(ref_hash)
|
|
586
|
+
if ref is not None:
|
|
587
|
+
return ref.key
|
|
588
|
+
return external_url
|
|
589
|
+
|
|
590
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return cached ``(raw_content, text_content)`` for an asset, if any.

    ``asset_id`` may be an asset hash, a "<type>_#_<url>" identifier (or
    something ``unhash_id`` decodes to one), or an external URL. On a cache
    miss the snapshot is rebuilt, and its content is cached and returned
    only when it carries text.

    NOTE(review): ``_build_snapshot`` as defined in this class always returns
    an empty ``text_content``, so unless a subclass overrides it the rebuild
    path ends in ``None`` — confirm whether the rebuild is still needed.
    """
    direct_hit = self._content_cache.get(asset_id) if asset_id in self._content_cache else None
    if asset_id in self._content_cache:
        return direct_hit

    external_url = self._hash_to_uri.get(asset_id)
    asset_hash = asset_id
    if external_url is None:
        decoded = asset_id
        if "_#_" not in asset_id:
            try:
                decoded = unhash_id(asset_id)
            except Exception:
                decoded = asset_id
        external_url = decoded.split("_#_", maxsplit=1)[1] if "_#_" in decoded else asset_id
        asset_hash = self.generate_hash_id(external_url)

    cached = self._content_cache.get(asset_hash)
    if cached is not None:
        return cached

    ref = self._object_ref_by_hash.get(asset_hash)
    if ref is None:
        return None

    snapshot = self._build_snapshot(ref)
    if not snapshot.text_content:
        return None

    content = (snapshot.raw_content, snapshot.text_content)
    self._content_cache[asset_hash] = content
    if external_url:
        self._hash_to_uri[asset_hash] = external_url
    return content
|
|
628
|
+
|
|
629
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hash for ``asset_id`` under this source's type.

    Side effect: records the hash -> ``asset_id`` mapping in
    ``_hash_to_uri`` so the hash can later be resolved back.
    """
    digest = hash_id(self._asset_type_value(), asset_id)
    self._hash_to_uri[digest] = asset_id
    return digest
|
|
633
|
+
|
|
634
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set the finding's location to the asset's external URL.

    ``text_content`` is accepted for interface compatibility and is not used.
    """
    del text_content  # intentionally unused
    finding.location = Location(path=asset.external_url)
|
|
642
|
+
|
|
643
|
+
def evict_asset_cache(self, asset_hash: str) -> None:
    """Drop everything cached for one asset.

    Removes the content, bytes and MIME entries stored under the hash and,
    when the hash maps to a URL, under that URL as well; also forgets the
    ObjectRef for the hash. The hash -> URL mapping itself is kept.
    """
    external_url = self._hash_to_uri.get(asset_hash)
    keyed_caches = (self._content_cache, self._bytes_cache, self._mime_cache)

    for cache in keyed_caches:
        cache.pop(asset_hash, None)
    self._object_ref_by_hash.pop(asset_hash, None)

    if external_url:
        for cache in keyed_caches:
            cache.pop(external_url, None)
|
|
653
|
+
|
|
654
|
+
def abort(self) -> None:
    """Log the abort request, then delegate to the base source's ``abort()``."""
    logger.info("Aborting object storage extraction...")
    super().abort()
|
|
657
|
+
|
|
658
|
+
def cleanup(self) -> None:
    """Close the cached storage client, if there is one.

    The cached reference is cleared before closing, so a repeated
    ``cleanup()`` is a no-op and a closed client is never handed out again.
    Closing is best effort: a failing ``close()`` is logged at debug level
    (with traceback) and not raised. Clients without a callable ``close``
    attribute are simply dropped.
    """
    client = self._cached_client
    if client is None:
        return
    self._cached_client = None

    close_fn = getattr(client, "close", None)
    if not callable(close_fn):
        return
    try:
        close_fn()
    except Exception:
        logger.debug("Failed to close object storage client cleanly", exc_info=True)
|
|
668
|
+
|
|
669
|
+
@abstractmethod
def _list_objects(self) -> Iterator[ObjectRef]:
    """Provider hook: iterate the ObjectRef entries this source should scan.

    Consumed by test_connection() (first 100 entries only) and by
    extract_raw(), which passes the iterator through _apply_sampling().
    """
    raise NotImplementedError
|
|
672
|
+
|
|
673
|
+
@abstractmethod
def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
    """Provider hook: download the object described by ``ref``.

    Returns:
        A ``(file_bytes, content_type_hint, truncated)`` tuple, as unpacked
        by _build_snapshot() and fetch_content_bytes().

    Callers catch any exception raised here, log it and degrade gracefully.
    """
    raise NotImplementedError
|
|
676
|
+
|
|
677
|
+
@abstractmethod
def _external_url(self, key: str) -> str:
    """Provider hook: return the external URL string for object ``key``.

    _to_asset() uses the result as the asset's ``external_url`` and as the
    input to generate_hash_id().
    """
    raise NotImplementedError
|