classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...models.generated_input import (
|
|
11
|
+
MongoDBInput,
|
|
12
|
+
MongoDBMaskedNone,
|
|
13
|
+
MongoDBMaskedUsernamePassword,
|
|
14
|
+
MongoDBOptionalConnection,
|
|
15
|
+
MongoDBOptionalScope,
|
|
16
|
+
MongoDBRequiredAtlas,
|
|
17
|
+
SamplingConfig,
|
|
18
|
+
SamplingStrategy,
|
|
19
|
+
)
|
|
20
|
+
from ...models.generated_single_asset_scan_results import (
|
|
21
|
+
AssetType as OutputAssetType,
|
|
22
|
+
)
|
|
23
|
+
from ...models.generated_single_asset_scan_results import (
|
|
24
|
+
DetectionResult,
|
|
25
|
+
Location,
|
|
26
|
+
SingleAssetScanResults,
|
|
27
|
+
)
|
|
28
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
29
|
+
from ..base import BaseSource
|
|
30
|
+
from ..dependencies import require_module
|
|
31
|
+
|
|
32
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Database names skipped during discovery when the recipe configures no
# exclusions of its own (see MongoDBSource._excluded_databases).
_DEFAULT_EXCLUDED_DATABASES = {"admin", "config", "local"}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class CollectionRef:
    """Immutable, hashable (database, collection) pair identifying one collection."""

    # Name of the database that holds the collection.
    database: str
    # Name of the collection inside that database.
    collection: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class MongoDBSource(BaseSource):
|
|
44
|
+
# Source-type identifier for this connector.
# NOTE(review): presumably used by the source registry/BaseSource to select this class — confirm.
source_type = "mongodb"
|
|
45
|
+
|
|
46
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and set up lazily-populated connection state.

    Args:
        recipe: Raw source recipe; validated into a ``MongoDBInput``.
        source_id: Passed through to the parent initialiser.
        runner_id: Passed through to the parent initialiser; this instance
            stores ``"local-run"`` instead when the value is falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = MongoDBInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # pymongo is obtained via require_module rather than a top-level import.
    pymongo_module = require_module(
        module_name="pymongo",
        source_name="MongoDB",
        uv_groups=["mongodb"],
        detail="The MongoDB connector is optional.",
    )
    self._pymongo = pymongo_module

    # asset hash -> collection, filled while assets are extracted.
    self._collection_lookup: dict[str, CollectionRef] = {}
    # asset id -> (raw_content, text_content) produced by the fetch methods.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # Client is created on first use by _client().
    self._mongo_client: Any | None = None
|
|
64
|
+
|
|
65
|
+
def _asset_type_value(self) -> str:
|
|
66
|
+
type_value = self.config.type
|
|
67
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
68
|
+
|
|
69
|
+
def _sampling(self) -> SamplingConfig:
|
|
70
|
+
return self.config.sampling
|
|
71
|
+
|
|
72
|
+
def _connection_options(self) -> MongoDBOptionalConnection:
|
|
73
|
+
if self.config.optional and self.config.optional.connection:
|
|
74
|
+
return self.config.optional.connection
|
|
75
|
+
return MongoDBOptionalConnection()
|
|
76
|
+
|
|
77
|
+
def _scope_options(self) -> MongoDBOptionalScope:
|
|
78
|
+
if self.config.optional and self.config.optional.scope:
|
|
79
|
+
return self.config.optional.scope
|
|
80
|
+
return MongoDBOptionalScope()
|
|
81
|
+
|
|
82
|
+
def _username_password(self) -> tuple[str | None, str | None]:
    """Return ``(username, password)`` from the masked config section.

    Only a ``MongoDBMaskedUsernamePassword`` section carries credentials;
    every other variant (including ``MongoDBMaskedNone``) yields
    ``(None, None)``.
    """
    masked = self.config.masked
    if not isinstance(masked, MongoDBMaskedUsernamePassword):
        return None, None
    return masked.username, masked.password
|
|
89
|
+
|
|
90
|
+
def _atlas_cluster_host(self) -> str:
    """Return the bare Atlas cluster host, or ``""`` for non-Atlas configs.

    The configured ``cluster_host`` is trimmed, then reduced to the host
    part: a leading ``mongodb+srv://`` / ``mongodb://`` scheme, anything up
    to the first ``@``, and anything from the first ``/`` are dropped.
    """
    required = self.config.required
    if not isinstance(required, MongoDBRequiredAtlas):
        return ""

    host = required.cluster_host.strip()
    # Same order as the schemes are tried: srv first, then plain.
    host = host.removeprefix("mongodb+srv://").removeprefix("mongodb://")

    # Drop a "user:pass@" style prefix (everything up to the first "@").
    _, at_sign, after_credentials = host.partition("@")
    if at_sign:
        host = after_credentials

    # Drop any "/database?options" suffix.
    return host.partition("/")[0]
|
|
105
|
+
|
|
106
|
+
def _is_atlas(self) -> bool:
    """Return True when the required config section is the Atlas variant."""
    required = self.config.required
    return isinstance(required, MongoDBRequiredAtlas)
|
|
108
|
+
|
|
109
|
+
def _build_connection_uri(self) -> str:
    """Build the connection URI (without credentials or options).

    Atlas configs produce ``mongodb+srv://<cluster host>``; all others
    produce ``mongodb://<host>:<port>`` from the required section.
    """
    required = self.config.required
    if not isinstance(required, MongoDBRequiredAtlas):
        host = required.host
        port = int(required.port)
        return f"mongodb://{host}:{port}"

    cluster_host = self._atlas_cluster_host()
    return f"mongodb+srv://{cluster_host}"
|
|
114
|
+
|
|
115
|
+
def _build_client_kwargs(self) -> dict[str, Any]:
|
|
116
|
+
options = self._connection_options()
|
|
117
|
+
username, password = self._username_password()
|
|
118
|
+
|
|
119
|
+
kwargs: dict[str, Any] = {
|
|
120
|
+
"connectTimeoutMS": int(options.connect_timeout_ms or 30000),
|
|
121
|
+
}
|
|
122
|
+
if username:
|
|
123
|
+
kwargs["username"] = username
|
|
124
|
+
if password:
|
|
125
|
+
kwargs["password"] = password
|
|
126
|
+
|
|
127
|
+
if options.auth_mechanism and str(options.auth_mechanism) != "DEFAULT":
|
|
128
|
+
mechanism = (
|
|
129
|
+
options.auth_mechanism.value
|
|
130
|
+
if hasattr(options.auth_mechanism, "value")
|
|
131
|
+
else str(options.auth_mechanism)
|
|
132
|
+
)
|
|
133
|
+
kwargs["authMechanism"] = mechanism
|
|
134
|
+
if options.auth_source:
|
|
135
|
+
kwargs["authSource"] = options.auth_source
|
|
136
|
+
if options.app_name:
|
|
137
|
+
kwargs["appname"] = options.app_name
|
|
138
|
+
if options.tls is not None:
|
|
139
|
+
kwargs["tls"] = bool(options.tls)
|
|
140
|
+
if options.replica_set:
|
|
141
|
+
kwargs["replicaSet"] = options.replica_set
|
|
142
|
+
if options.direct_connection is not None:
|
|
143
|
+
kwargs["directConnection"] = bool(options.direct_connection)
|
|
144
|
+
|
|
145
|
+
additional = options.options or {}
|
|
146
|
+
if isinstance(additional, dict):
|
|
147
|
+
kwargs.update(additional)
|
|
148
|
+
return kwargs
|
|
149
|
+
|
|
150
|
+
def _client(self) -> Any:
|
|
151
|
+
if self._mongo_client is not None:
|
|
152
|
+
return self._mongo_client
|
|
153
|
+
|
|
154
|
+
client = self._pymongo.MongoClient(
|
|
155
|
+
self._build_connection_uri(),
|
|
156
|
+
**self._build_client_kwargs(),
|
|
157
|
+
)
|
|
158
|
+
self._mongo_client = client
|
|
159
|
+
return client
|
|
160
|
+
|
|
161
|
+
def _excluded_databases(self) -> set[str]:
|
|
162
|
+
configured = self._scope_options().exclude_databases or []
|
|
163
|
+
excluded = {name.strip() for name in configured if name and name.strip()}
|
|
164
|
+
if not excluded:
|
|
165
|
+
excluded = set(_DEFAULT_EXCLUDED_DATABASES)
|
|
166
|
+
return excluded
|
|
167
|
+
|
|
168
|
+
def _collection_allowlist(self) -> set[str]:
|
|
169
|
+
configured = self._scope_options().include_collections or []
|
|
170
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
171
|
+
|
|
172
|
+
def _collection_denylist(self) -> set[str]:
|
|
173
|
+
configured = self._scope_options().exclude_collections or []
|
|
174
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
175
|
+
|
|
176
|
+
def _include_system_collections(self) -> bool:
|
|
177
|
+
return bool(self._scope_options().include_system_collections)
|
|
178
|
+
|
|
179
|
+
def _resolve_databases(self) -> list[str]:
|
|
180
|
+
scope_options = self._scope_options()
|
|
181
|
+
include_all = scope_options.include_all_databases is not False
|
|
182
|
+
configured_database = scope_options.database
|
|
183
|
+
|
|
184
|
+
if not include_all:
|
|
185
|
+
if configured_database:
|
|
186
|
+
return [configured_database]
|
|
187
|
+
raise ValueError(
|
|
188
|
+
"MongoDB source requires optional.scope.database when include_all_databases is false. "
|
|
189
|
+
"Set optional.scope.database (e.g. 'app_db') or enable include_all_databases."
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
excluded = self._excluded_databases()
|
|
193
|
+
discovered = [
|
|
194
|
+
database
|
|
195
|
+
for database in self._client().list_database_names()
|
|
196
|
+
if isinstance(database, str) and database and database not in excluded
|
|
197
|
+
]
|
|
198
|
+
discovered.sort()
|
|
199
|
+
|
|
200
|
+
if configured_database and configured_database not in discovered:
|
|
201
|
+
discovered.insert(0, configured_database)
|
|
202
|
+
return discovered
|
|
203
|
+
|
|
204
|
+
def _collection_allowed(self, database: str, collection: str) -> bool:
|
|
205
|
+
if not self._include_system_collections() and collection.startswith("system."):
|
|
206
|
+
return False
|
|
207
|
+
|
|
208
|
+
normalized_collection = collection.lower()
|
|
209
|
+
normalized_scoped = f"{database}.{collection}".lower()
|
|
210
|
+
|
|
211
|
+
allowlist = self._collection_allowlist()
|
|
212
|
+
if (
|
|
213
|
+
allowlist
|
|
214
|
+
and normalized_collection not in allowlist
|
|
215
|
+
and normalized_scoped not in allowlist
|
|
216
|
+
):
|
|
217
|
+
return False
|
|
218
|
+
|
|
219
|
+
denylist = self._collection_denylist()
|
|
220
|
+
if normalized_collection in denylist or normalized_scoped in denylist:
|
|
221
|
+
return False
|
|
222
|
+
|
|
223
|
+
return True
|
|
224
|
+
|
|
225
|
+
def _list_collections_for_database(self, database: str) -> list[CollectionRef]:
|
|
226
|
+
collection_limit = self._scope_options().collection_limit
|
|
227
|
+
limit = int(collection_limit) if collection_limit else None
|
|
228
|
+
|
|
229
|
+
collections: list[CollectionRef] = []
|
|
230
|
+
for collection in self._client()[database].list_collection_names():
|
|
231
|
+
if not isinstance(collection, str) or not collection:
|
|
232
|
+
continue
|
|
233
|
+
if not self._collection_allowed(database, collection):
|
|
234
|
+
continue
|
|
235
|
+
|
|
236
|
+
collections.append(CollectionRef(database=database, collection=collection))
|
|
237
|
+
if limit is not None and len(collections) >= limit:
|
|
238
|
+
break
|
|
239
|
+
|
|
240
|
+
return collections
|
|
241
|
+
|
|
242
|
+
def _iter_collections(self) -> list[CollectionRef]:
|
|
243
|
+
collections: list[CollectionRef] = []
|
|
244
|
+
for database in self._resolve_databases():
|
|
245
|
+
if self._aborted:
|
|
246
|
+
break
|
|
247
|
+
try:
|
|
248
|
+
collections.extend(self._list_collections_for_database(database))
|
|
249
|
+
except Exception as exc:
|
|
250
|
+
logger.warning("Skipping database %s due to listing error: %s", database, exc)
|
|
251
|
+
return collections
|
|
252
|
+
|
|
253
|
+
def test_connection(self) -> dict[str, Any]:
    """Ping the server and report the outcome as a result dict.

    Returns:
        A dict with ``timestamp`` (UTC, ISO-8601), ``source_type`` (from the
        recipe), ``status`` (``"SUCCESS"`` or ``"FAILURE"``) and a
        human-readable ``message``. Failures are reported, never raised.
    """
    logger.info("Testing connection to MongoDB...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        self._client().admin.command("ping")
        reachable = len(self._resolve_databases())
        deployment = "Atlas" if self._is_atlas() else "On-prem"
        result.update(
            status="SUCCESS",
            message=(
                f"Successfully connected to MongoDB ({deployment}). "
                f"Reachable databases: {reachable}."
            ),
        )
    except Exception as exc:
        result.update(
            status="FAILURE",
            message=f"Failed to connect to MongoDB: {exc}",
        )

    return result
|
|
274
|
+
|
|
275
|
+
def _collection_raw_id(self, collection_ref: CollectionRef) -> str:
|
|
276
|
+
return f"{collection_ref.database}_#_{collection_ref.collection}"
|
|
277
|
+
|
|
278
|
+
def _collection_to_asset(self, collection_ref: CollectionRef) -> SingleAssetScanResults:
    """Describe one collection as a ``SingleAssetScanResults`` asset.

    The asset is named ``<database>.<collection>``, hashed from the raw
    collection id, and carries a checksum computed over its metadata
    (database, collection, deployment kind and sampling strategy).
    ``created_at`` and ``updated_at`` are both set to the current UTC time.
    """
    database = collection_ref.database
    collection = collection_ref.collection
    asset_name = f"{database}.{collection}"

    asset_hash = self.generate_hash_id(self._collection_raw_id(collection_ref))
    external_url = self.ensure_location(
        self._collection_external_url(collection_ref),
        fallback=f"mongodb://{asset_name}",
    )

    deployment = "ATLAS" if self._is_atlas() else "ON_PREM"
    metadata = {
        "database": database,
        "collection": collection,
        "deployment": deployment,
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }
    timestamp = datetime.now(UTC)

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=asset_name,
        external_url=external_url,
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
309
|
+
|
|
310
|
+
def _collection_external_url(self, collection_ref: CollectionRef) -> str:
|
|
311
|
+
if self._is_atlas():
|
|
312
|
+
return (
|
|
313
|
+
f"mongodb+srv://{self._atlas_cluster_host()}/"
|
|
314
|
+
f"{collection_ref.database}/{collection_ref.collection}"
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
required = self.config.required
|
|
318
|
+
return (
|
|
319
|
+
f"mongodb://{required.host}:{int(required.port)}/"
|
|
320
|
+
f"{collection_ref.database}/{collection_ref.collection}"
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
# NOTE(review): class-level flag; presumably read by the base source/pipeline to
# stream detections page by page (see fetch_content_pages) — confirm against BaseSource.
STREAM_DETECTIONS = True
|
|
324
|
+
|
|
325
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield in-scope collections as assets, in batches of ``BATCH_SIZE``.

    Each asset's hash is recorded in ``_collection_lookup`` so later calls
    can map it back to its collection. If the source is aborted the
    generator ends immediately and any partially filled batch is dropped.
    """
    if self._aborted:
        return

    pending: list[SingleAssetScanResults] = []
    for collection_ref in self._iter_collections():
        if self._aborted:
            return

        asset = self._collection_to_asset(collection_ref)
        self._collection_lookup[asset.hash] = collection_ref
        pending.append(asset)

        if len(pending) < self.BATCH_SIZE:
            continue
        yield pending
        # Start a new list so the batch already handed out is not mutated.
        pending = []

    if pending:
        yield pending
|
|
344
|
+
|
|
345
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return ``hash_id`` of this source's asset type combined with *asset_id*."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
|
|
347
|
+
|
|
348
|
+
def _parse_collection_ref(self, asset_id: str) -> CollectionRef | None:
|
|
349
|
+
if asset_id in self._collection_lookup:
|
|
350
|
+
return self._collection_lookup[asset_id]
|
|
351
|
+
|
|
352
|
+
decoded = asset_id
|
|
353
|
+
if "_#_" not in decoded:
|
|
354
|
+
try:
|
|
355
|
+
decoded = unhash_id(asset_id)
|
|
356
|
+
except Exception:
|
|
357
|
+
decoded = asset_id
|
|
358
|
+
|
|
359
|
+
parts = decoded.split("_#_")
|
|
360
|
+
if len(parts) >= 3 and parts[0].upper() == "MONGODB":
|
|
361
|
+
return CollectionRef(database=parts[-2], collection=parts[-1])
|
|
362
|
+
if len(parts) >= 2:
|
|
363
|
+
return CollectionRef(database=parts[-2], collection=parts[-1])
|
|
364
|
+
return None
|
|
365
|
+
|
|
366
|
+
def _latest_order_field(self) -> str:
|
|
367
|
+
sampling = self._sampling()
|
|
368
|
+
if sampling.order_by_column:
|
|
369
|
+
return sampling.order_by_column
|
|
370
|
+
return "_id"
|
|
371
|
+
|
|
372
|
+
def _sample_random_documents(self, collection: Any, limit: int) -> list[dict[str, Any]]:
|
|
373
|
+
pipeline = [{"$sample": {"size": limit}}]
|
|
374
|
+
return list(collection.aggregate(pipeline, allowDiskUse=True))
|
|
375
|
+
|
|
376
|
+
def _count_collection_documents(self, collection_ref: CollectionRef) -> int | None:
|
|
377
|
+
try:
|
|
378
|
+
collection = self._client()[collection_ref.database][collection_ref.collection]
|
|
379
|
+
return int(collection.count_documents({}))
|
|
380
|
+
except Exception:
|
|
381
|
+
return None
|
|
382
|
+
|
|
383
|
+
def _sample_collection_documents(self, collection_ref: CollectionRef) -> list[dict[str, Any]]:
    """Fetch one page of documents according to the sampling strategy.

    * ``ALL``: the first ``rows_per_page`` documents of a plain ``find``.
    * ``RANDOM``: a ``$sample`` of ``rows_per_page`` documents.
    * otherwise: the ``rows_per_page`` documents that sort highest on the
      order field (``_id`` by default). If a custom order field exists on
      no document and ``fallback_to_random`` is not ``False``, a random
      sample is returned instead.

    ``rows_per_page`` defaults to 100 when unset.
    """
    collection = self._client()[collection_ref.database][collection_ref.collection]
    sampling = self._sampling()
    strategy = sampling.strategy
    page_size = int(sampling.rows_per_page or 100)

    if strategy == SamplingStrategy.ALL:
        return list(collection.find({}).limit(page_size))
    if strategy == SamplingStrategy.RANDOM:
        return self._sample_random_documents(collection, page_size)

    order_field = self._latest_order_field()
    if order_field != "_id":
        try:
            probe = collection.count_documents({order_field: {"$exists": True}}, limit=1)
            field_present = probe > 0
        except Exception:
            # If the probe itself fails, assume the field exists and sort on it.
            field_present = True
        may_fall_back = sampling.fallback_to_random is not False
        if may_fall_back and not field_present:
            return self._sample_random_documents(collection, page_size)

    newest_first = collection.find({}).sort(order_field, self._pymongo.DESCENDING)
    return list(newest_first.limit(page_size))
|
|
409
|
+
|
|
410
|
+
def _serialize_document(self, document: dict[str, Any]) -> str:
|
|
411
|
+
return json.dumps(document, ensure_ascii=False, default=str, sort_keys=True)
|
|
412
|
+
|
|
413
|
+
def _format_collection_content(
|
|
414
|
+
self,
|
|
415
|
+
collection_ref: CollectionRef,
|
|
416
|
+
documents: list[dict[str, Any]],
|
|
417
|
+
doc_offset: int = 0,
|
|
418
|
+
) -> tuple[str, str]:
|
|
419
|
+
sampling = self._sampling()
|
|
420
|
+
|
|
421
|
+
strategy = sampling.strategy
|
|
422
|
+
lines = [
|
|
423
|
+
f"collection={collection_ref.database}.{collection_ref.collection}",
|
|
424
|
+
f"sampling_strategy={strategy}",
|
|
425
|
+
f"sampled_documents={len(documents)}",
|
|
426
|
+
"",
|
|
427
|
+
]
|
|
428
|
+
|
|
429
|
+
serialized_documents: list[str] = []
|
|
430
|
+
for index, document in enumerate(documents, start=1 + doc_offset):
|
|
431
|
+
serialized = self._serialize_document(document)
|
|
432
|
+
serialized_documents.append(serialized)
|
|
433
|
+
lines.append(f"doc_{index}: {serialized}")
|
|
434
|
+
|
|
435
|
+
text_content = "\n".join(lines)
|
|
436
|
+
raw_content = json.dumps(
|
|
437
|
+
{
|
|
438
|
+
"database": collection_ref.database,
|
|
439
|
+
"collection": collection_ref.collection,
|
|
440
|
+
"strategy": str(strategy),
|
|
441
|
+
"documents": serialized_documents,
|
|
442
|
+
"doc_offset": doc_offset,
|
|
443
|
+
},
|
|
444
|
+
ensure_ascii=False,
|
|
445
|
+
)
|
|
446
|
+
return raw_content, text_content
|
|
447
|
+
|
|
448
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for one asset.

    A cached entry for *asset_id* is returned as-is. Otherwise the
    collection is sampled, formatted, cached under *asset_id* and returned.
    Returns ``None`` when the id cannot be mapped to a collection.
    """
    if cached := self._content_cache.get(asset_id):
        return cached

    if not (collection_ref := self._parse_collection_ref(asset_id)):
        return None

    sampled = self._sample_collection_documents(collection_ref)
    content = self._format_collection_content(collection_ref, sampled)
    self._content_cache[asset_id] = content
    return content
|
|
462
|
+
|
|
463
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield ``(raw_content, text_content)`` for an asset, one document per item.

    For every strategy except ``ALL`` a single sample is taken and its
    documents are yielded individually. For ``ALL`` the whole collection is
    walked in pages of ``rows_per_page`` (default 100) until it is exhausted
    or the source is aborted.

    Nothing is yielded when *asset_id* cannot be mapped to a collection.
    """
    sampling = self._sampling()
    collection_ref = self._parse_collection_ref(asset_id)
    if not collection_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        documents = self._sample_collection_documents(collection_ref)
        for i, document in enumerate(documents):
            content = self._format_collection_content(collection_ref, [document], doc_offset=i)
            yield content
        return

    rows_per_page = int(sampling.rows_per_page or 100)
    collection_label = f"{collection_ref.database}.{collection_ref.collection}"

    # The count is only used for progress logging; None/0 disables it.
    total_docs = self._count_collection_documents(collection_ref)
    total_batches = ((total_docs + rows_per_page - 1) // rows_per_page) if total_docs else None
    if total_docs is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d documents, %d batches of %d",
            collection_label,
            total_docs,
            total_batches,
            rows_per_page,
        )

    collection = self._client()[collection_ref.database][collection_ref.collection]
    offset = 0
    page_num = 1

    while not self._aborted:
        if total_batches is not None:
            logger.info("%s batch %d/%d", collection_label, page_num, total_batches)

        # Each page is a separate query. Without an explicit sort the server
        # may return documents in a different order per query, so skip/limit
        # windows could overlap or miss documents. Sorting on the unique _id
        # gives every page the same ordering.
        documents = list(
            collection.find({})
            .sort("_id", self._pymongo.ASCENDING)
            .skip(offset)
            .limit(rows_per_page)
        )
        if not documents:
            break

        for i, document in enumerate(documents):
            content = self._format_collection_content(
                collection_ref, [document], doc_offset=offset + i
            )
            # NOTE(review): this overwrites the cache entry per document, so a
            # later fetch_content(asset_id) returns only the last document
            # yielded here — confirm that is intended.
            self._content_cache[asset_id] = content
            yield content

        offset += len(documents)
        page_num += 1
        # A short page means the collection is exhausted; skip the extra query.
        if len(documents) < rows_per_page:
            break
|
|
513
|
+
|
|
514
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to the collection (and document) it came from.

    The location path is ``<database>.<collection>``, extended with
    ``", document <n>"`` when the matched content is found on a
    ``doc_<n>: ...`` line of *text_content*. The finding is left untouched
    when the asset's hash is not in ``_collection_lookup``.

    Args:
        finding: Detection to annotate in place.
        asset: Asset the finding belongs to; only its ``hash`` is read.
        text_content: Text in the format built by
            ``_format_collection_content``.
    """
    import re as _re

    collection_ref = self._collection_lookup.get(asset.hash)
    if not collection_ref:
        return

    doc_index: int | None = None
    needle = finding.matched_content
    # An empty needle is a substring of every line and would pin the finding
    # to the first document; a None needle would raise. Search only when there
    # is something to look for.
    if needle:
        doc_line = _re.compile(r"^doc_(\d+):")
        for line in text_content.splitlines():
            match = doc_line.match(line)
            if match and needle in line:
                doc_index = int(match.group(1))
                break

    path = f"{collection_ref.database}.{collection_ref.collection}"
    if doc_index is not None:
        path += f", document {doc_index}"

    finding.location = Location(path=path)
|
|
538
|
+
|
|
539
|
+
def abort(self) -> None:
    """Log the abort request, then delegate to the parent class's ``abort``."""
    logger.info("Aborting MongoDB extraction...")
    super().abort()
|
|
542
|
+
|
|
543
|
+
def cleanup(self) -> None:
    """Close the cached MongoDB client, if any, and forget it.

    Errors raised while closing are logged at debug level and suppressed;
    the cached client is cleared either way.
    """
    client = self._mongo_client
    if client is None:
        return

    try:
        client.close()
    except Exception:
        logger.debug("Failed to close MongoDB client cleanly", exc_info=True)
    finally:
        self._mongo_client = None
|