classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,550 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from ...models.generated_input import (
11
+ MongoDBInput,
12
+ MongoDBMaskedNone,
13
+ MongoDBMaskedUsernamePassword,
14
+ MongoDBOptionalConnection,
15
+ MongoDBOptionalScope,
16
+ MongoDBRequiredAtlas,
17
+ SamplingConfig,
18
+ SamplingStrategy,
19
+ )
20
+ from ...models.generated_single_asset_scan_results import (
21
+ AssetType as OutputAssetType,
22
+ )
23
+ from ...models.generated_single_asset_scan_results import (
24
+ DetectionResult,
25
+ Location,
26
+ SingleAssetScanResults,
27
+ )
28
+ from ...utils.hashing import hash_id, unhash_id
29
+ from ..base import BaseSource
30
+ from ..dependencies import require_module
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ _DEFAULT_EXCLUDED_DATABASES = {"admin", "config", "local"}
35
+
36
+
37
@dataclass(frozen=True)
class CollectionRef:
    """Immutable, hashable (database, collection) pair naming one MongoDB collection."""

    # Name of the database that owns the collection.
    database: str
    # Name of the collection inside ``database``.
    collection: str
41
+
42
+
43
+ class MongoDBSource(BaseSource):
44
+ source_type = "mongodb"
45
+
46
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, resolve the ``pymongo`` module and reset per-run state.

    Args:
        recipe: Raw source recipe; validated into ``MongoDBInput``.
        source_id: Forwarded unchanged to the base class.
        runner_id: Forwarded to the base class; stored locally as
            ``"local-run"`` when falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = MongoDBInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # The driver is looked up through ``require_module`` instead of a
    # top-level import; what happens when it is missing is decided there.
    driver_request = {
        "module_name": "pymongo",
        "source_name": "MongoDB",
        "uv_groups": ["mongodb"],
        "detail": "The MongoDB connector is optional.",
    }
    self._pymongo = require_module(**driver_request)

    # asset hash -> collection, filled while assets are extracted.
    self._collection_lookup: dict[str, CollectionRef] = {}
    # asset id -> (raw_content, text_content) produced by the fetch methods.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # Created lazily by ``_client()`` and released by ``cleanup()``.
    self._mongo_client: Any | None = None
64
+
65
def _asset_type_value(self) -> str:
    """Return the configured source type as a plain string.

    Objects exposing ``.value`` (enum members) yield that value; anything
    else is passed through ``str()``.
    """
    configured = self.config.type
    if hasattr(configured, "value"):
        return configured.value
    return str(configured)
68
+
69
def _sampling(self) -> SamplingConfig:
    """Return the ``sampling`` section of the validated recipe."""
    config = self.config
    return config.sampling
71
+
72
def _connection_options(self) -> MongoDBOptionalConnection:
    """Return ``optional.connection`` from the recipe, or a default-constructed one.

    A fresh ``MongoDBOptionalConnection()`` is returned when either the
    ``optional`` section or its ``connection`` entry is missing/falsy.
    """
    optional = self.config.optional
    if not optional or not optional.connection:
        return MongoDBOptionalConnection()
    return optional.connection
76
+
77
def _scope_options(self) -> MongoDBOptionalScope:
    """Return ``optional.scope`` from the recipe, or a default-constructed one.

    A fresh ``MongoDBOptionalScope()`` is returned when either the
    ``optional`` section or its ``scope`` entry is missing/falsy.
    """
    optional = self.config.optional
    if not optional or not optional.scope:
        return MongoDBOptionalScope()
    return optional.scope
81
+
82
+ def _username_password(self) -> tuple[str | None, str | None]:
83
+ masked = self.config.masked
84
+ if isinstance(masked, MongoDBMaskedUsernamePassword):
85
+ return masked.username, masked.password
86
+ if isinstance(masked, MongoDBMaskedNone):
87
+ return None, None
88
+ return None, None
89
+
90
def _atlas_cluster_host(self) -> str:
    """Return the bare Atlas cluster host extracted from ``required.cluster_host``.

    The configured value may be a full connection string. This strips, in order:
    surrounding whitespace, a ``mongodb+srv://`` / ``mongodb://`` scheme, any
    ``user:password@`` userinfo, and everything from the first ``/`` or ``?``
    onwards (database path and URI options).

    Returns:
        The host portion, or ``""`` when the recipe is not the Atlas variant.
    """
    required = self.config.required
    if not isinstance(required, MongoDBRequiredAtlas):
        return ""

    host = required.cluster_host.strip()
    for scheme in ("mongodb+srv://", "mongodb://"):
        host = host.removeprefix(scheme)

    # Cut the path/options first so an "@" inside them cannot be mistaken for
    # the userinfo delimiter.
    host = host.partition("/")[0]
    host = host.partition("?")[0]

    # The userinfo delimiter is the LAST "@": a password containing an
    # unescaped "@" must not leave part of itself in the host.
    host = host.rpartition("@")[2]
    return host.strip()
105
+
106
def _is_atlas(self) -> bool:
    """Return True when the recipe's ``required`` section is the Atlas variant."""
    required = self.config.required
    return isinstance(required, MongoDBRequiredAtlas)
108
+
109
def _build_connection_uri(self) -> str:
    """Build the credential-free connection URI for the configured deployment.

    Atlas recipes produce ``mongodb+srv://<cluster host>``; every other recipe
    produces ``mongodb://<host>:<port>`` with the port coerced to ``int``.
    """
    required = self.config.required
    if not isinstance(required, MongoDBRequiredAtlas):
        port = int(required.port)
        return f"mongodb://{required.host}:{port}"
    cluster_host = self._atlas_cluster_host()
    return f"mongodb+srv://{cluster_host}"
114
+
115
def _build_client_kwargs(self) -> dict[str, Any]:
    """Translate recipe connection options into ``MongoClient`` keyword arguments.

    Always sets ``connectTimeoutMS`` (30000 when not configured). Credentials,
    ``authSource``, ``appname`` and ``replicaSet`` are added only when truthy;
    ``tls`` and ``directConnection`` are added whenever they are not ``None``,
    so an explicit ``False`` is forwarded. ``authMechanism`` is added unless it
    is unset or resolves to ``"DEFAULT"``.

    Entries of the free-form ``options`` mapping are applied last and therefore
    override any of the keys above.

    Returns:
        A new dict of keyword arguments.
    """
    options = self._connection_options()
    username, password = self._username_password()

    kwargs: dict[str, Any] = {
        "connectTimeoutMS": int(options.connect_timeout_ms or 30000),
    }
    if username:
        kwargs["username"] = username
    if password:
        kwargs["password"] = password

    raw_mechanism = options.auth_mechanism
    if raw_mechanism:
        # Resolve enum members to their value BEFORE testing for "DEFAULT":
        # str() of a plain Enum member is "ClassName.DEFAULT", which never
        # equals "DEFAULT" and would let the placeholder through.
        mechanism = raw_mechanism.value if hasattr(raw_mechanism, "value") else str(raw_mechanism)
        if mechanism != "DEFAULT":
            kwargs["authMechanism"] = mechanism

    if options.auth_source:
        kwargs["authSource"] = options.auth_source
    if options.app_name:
        kwargs["appname"] = options.app_name
    if options.tls is not None:
        kwargs["tls"] = bool(options.tls)
    if options.replica_set:
        kwargs["replicaSet"] = options.replica_set
    if options.direct_connection is not None:
        kwargs["directConnection"] = bool(options.direct_connection)

    additional = options.options or {}
    if isinstance(additional, dict):
        kwargs.update(additional)
    return kwargs
149
+
150
def _client(self) -> Any:
    """Return the shared ``MongoClient``, constructing it on first use.

    The client is built from ``_build_connection_uri()`` and
    ``_build_client_kwargs()`` and kept on ``self._mongo_client``.
    """
    if self._mongo_client is None:
        uri = self._build_connection_uri()
        client_kwargs = self._build_client_kwargs()
        self._mongo_client = self._pymongo.MongoClient(uri, **client_kwargs)
    return self._mongo_client
160
+
161
def _excluded_databases(self) -> set[str]:
    """Return the database names to skip during discovery.

    Configured ``exclude_databases`` entries are stripped and blank/None
    entries dropped. When nothing usable remains, a copy of
    ``_DEFAULT_EXCLUDED_DATABASES`` is returned instead; a non-empty
    configured list replaces the defaults rather than extending them.
    """
    configured = self._scope_options().exclude_databases or []
    cleaned: set[str] = set()
    for name in configured:
        if not name:
            continue
        stripped = name.strip()
        if stripped:
            cleaned.add(stripped)
    return cleaned if cleaned else set(_DEFAULT_EXCLUDED_DATABASES)
167
+
168
def _collection_allowlist(self) -> set[str]:
    """Return the configured ``include_collections`` entries, stripped and lower-cased.

    Blank and None entries are dropped; an unset option yields an empty set.
    """
    entries = self._scope_options().include_collections or []
    normalized = (entry.strip().lower() for entry in entries if entry)
    return {entry for entry in normalized if entry}
171
+
172
def _collection_denylist(self) -> set[str]:
    """Return the configured ``exclude_collections`` entries, stripped and lower-cased.

    Blank and None entries are dropped; an unset option yields an empty set.
    """
    denied: set[str] = set()
    for entry in self._scope_options().exclude_collections or []:
        if entry and entry.strip():
            denied.add(entry.strip().lower())
    return denied
175
+
176
def _include_system_collections(self) -> bool:
    """Return the ``include_system_collections`` scope option coerced to ``bool``."""
    flag = self._scope_options().include_system_collections
    return True if flag else False
178
+
179
def _resolve_databases(self) -> list[str]:
    """Return the database names to scan.

    When ``include_all_databases`` is explicitly ``False`` only the configured
    ``database`` is returned. Otherwise the server's databases are listed,
    excluded names removed and the rest sorted; a configured ``database``
    missing from that list is placed first.

    Raises:
        ValueError: ``include_all_databases`` is ``False`` and no ``database``
            is configured.
    """
    scope = self._scope_options()
    pinned = scope.database

    if scope.include_all_databases is False:
        if not pinned:
            raise ValueError(
                "MongoDB source requires optional.scope.database when include_all_databases is false. "
                "Set optional.scope.database (e.g. 'app_db') or enable include_all_databases."
            )
        return [pinned]

    excluded = self._excluded_databases()
    names = sorted(
        name
        for name in self._client().list_database_names()
        if isinstance(name, str) and name and name not in excluded
    )

    # The pinned database is scanned even if the server did not list it
    # (or it was excluded); it goes first rather than in sorted position.
    if pinned and pinned not in names:
        names.insert(0, pinned)
    return names
203
+
204
def _collection_allowed(self, database: str, collection: str) -> bool:
    """Decide whether a collection passes the configured scope filters.

    ``system.*`` collections are rejected unless system collections are
    enabled. Allow/deny lists are matched case-insensitively against both the
    bare collection name and ``<database>.<collection>``; a non-empty
    allowlist must match, and a denylist match always rejects.
    """
    if not self._include_system_collections() and collection.startswith("system."):
        return False

    candidates = {collection.lower(), f"{database}.{collection}".lower()}

    allowlist = self._collection_allowlist()
    if allowlist and candidates.isdisjoint(allowlist):
        return False

    return candidates.isdisjoint(self._collection_denylist())
224
+
225
def _list_collections_for_database(self, database: str) -> list[CollectionRef]:
    """List the in-scope collections of ``database`` as ``CollectionRef`` objects.

    Names that are not non-empty strings or that fail ``_collection_allowed``
    are skipped. Listing stops once ``collection_limit`` accepted collections
    have been gathered; a falsy limit means no cap.
    """
    raw_limit = self._scope_options().collection_limit
    limit = int(raw_limit) if raw_limit else None

    refs: list[CollectionRef] = []
    for name in self._client()[database].list_collection_names():
        usable = isinstance(name, str) and bool(name)
        if usable and self._collection_allowed(database, name):
            refs.append(CollectionRef(database=database, collection=name))
            if limit is not None and len(refs) >= limit:
                break
    return refs
241
+
242
def _iter_collections(self) -> list[CollectionRef]:
    """Collect the in-scope collections of every resolved database.

    Despite the name this returns a fully built list. A database whose
    listing raises is logged at warning level and skipped; iteration stops
    early once the source has been aborted.
    """
    found: list[CollectionRef] = []
    for database in self._resolve_databases():
        if self._aborted:
            break
        try:
            found += self._list_collections_for_database(database)
        except Exception as exc:
            logger.warning("Skipping database %s due to listing error: %s", database, exc)
    return found
252
+
253
def test_connection(self) -> dict[str, Any]:
    """Ping the server and report the outcome as a status dictionary.

    Returns:
        A dict with ``timestamp`` (UTC, ISO format), ``source_type`` (from the
        raw recipe), ``status`` (``"SUCCESS"`` or ``"FAILURE"``) and a
        human-readable ``message``. Exceptions are caught and reported in the
        message, not propagated.
    """
    logger.info("Testing connection to MongoDB...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        self._client().admin.command("ping")
        # Resolving databases also exercises listing permissions.
        reachable = len(self._resolve_databases())
        deployment = "Atlas" if self._is_atlas() else "On-prem"
    except Exception as exc:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect to MongoDB: {exc}"
    else:
        result["status"] = "SUCCESS"
        result["message"] = (
            f"Successfully connected to MongoDB ({deployment}). "
            f"Reachable databases: {reachable}."
        )

    return result
274
+
275
def _collection_raw_id(self, collection_ref: CollectionRef) -> str:
    """Return the unhashed asset id ``<database>_#_<collection>``."""
    database = collection_ref.database
    collection = collection_ref.collection
    return f"{database}_#_{collection}"
277
+
278
def _collection_to_asset(self, collection_ref: CollectionRef) -> SingleAssetScanResults:
    """Build the scan-result asset record describing one collection.

    The asset is named ``<database>.<collection>``, hashed from the raw
    collection id and typed as ``TXT``. The checksum is computed from the
    database, collection, deployment kind and sampling strategy. ``created_at``
    and ``updated_at`` are both set to the current UTC time.
    """
    database = collection_ref.database
    collection = collection_ref.collection
    asset_name = f"{database}.{collection}"

    asset_hash = self.generate_hash_id(self._collection_raw_id(collection_ref))
    external_url = self.ensure_location(
        self._collection_external_url(collection_ref),
        fallback=f"mongodb://{asset_name}",
    )

    # Only used as checksum input; it is not attached to the asset itself.
    deployment = "ATLAS" if self._is_atlas() else "ON_PREM"
    metadata = {
        "database": database,
        "collection": collection,
        "deployment": deployment,
        "sampling": {"strategy": str(self._sampling().strategy)},
    }
    timestamp = datetime.now(UTC)

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=asset_name,
        external_url=external_url,
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
309
+
310
def _collection_external_url(self, collection_ref: CollectionRef) -> str:
    """Return a ``<scheme>://<host>/<database>/<collection>`` locator for a collection.

    Atlas deployments use ``mongodb+srv://`` with the cluster host; all others
    use ``mongodb://<host>:<port>``.
    """
    suffix = f"{collection_ref.database}/{collection_ref.collection}"
    if self._is_atlas():
        base = f"mongodb+srv://{self._atlas_cluster_host()}"
    else:
        required = self.config.required
        base = f"mongodb://{required.host}:{int(required.port)}"
    return f"{base}/{suffix}"
322
+
323
+ STREAM_DETECTIONS = True
324
+
325
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield collection assets in batches of at most ``BATCH_SIZE``.

    Every produced asset is also registered in ``_collection_lookup`` under
    its hash. If the source is aborted mid-run the generator ends immediately
    and the partially filled batch is not yielded.
    """
    if self._aborted:
        return

    pending: list[SingleAssetScanResults] = []
    for ref in self._iter_collections():
        if self._aborted:
            return

        asset = self._collection_to_asset(ref)
        self._collection_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) < self.BATCH_SIZE:
            continue
        yield pending
        pending = []

    # Flush the final, possibly short, batch.
    if pending:
        yield pending
344
+
345
def generate_hash_id(self, asset_id: str) -> str:
    """Return ``hash_id`` of the source's asset type value and ``asset_id``."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
347
+
348
def _parse_collection_ref(self, asset_id: str) -> CollectionRef | None:
    """Resolve an asset id (hashed or raw) to the collection it names.

    Ids registered in ``_collection_lookup`` are returned directly. Otherwise
    an id without the ``_#_`` separator is passed to ``unhash_id`` (falling
    back to the id itself if that raises), and the last two ``_#_``-separated
    segments are taken as database and collection. Leading segments, such as
    a source-type prefix, are ignored.

    Returns:
        The resolved ``CollectionRef``, or ``None`` when fewer than two
        segments are available.
    """
    try:
        return self._collection_lookup[asset_id]
    except KeyError:
        pass

    raw = asset_id
    if "_#_" not in raw:
        try:
            raw = unhash_id(asset_id)
        except Exception:
            raw = asset_id

    segments = raw.split("_#_")
    if len(segments) < 2:
        return None
    database, collection = segments[-2:]
    return CollectionRef(database=database, collection=collection)
365
+
366
def _latest_order_field(self) -> str:
    """Return the field used to order "latest" samples: ``order_by_column`` or ``"_id"``."""
    return self._sampling().order_by_column or "_id"
371
+
372
def _sample_random_documents(self, collection: Any, limit: int) -> list[dict[str, Any]]:
    """Return up to ``limit`` documents chosen by a ``$sample`` aggregation stage."""
    sample_stage = {"$sample": {"size": limit}}
    cursor = collection.aggregate([sample_stage], allowDiskUse=True)
    return list(cursor)
375
+
376
def _count_collection_documents(self, collection_ref: CollectionRef) -> int | None:
    """Return the collection's document count, or ``None`` if counting fails.

    Any exception raised while obtaining the client or counting is swallowed;
    callers treat ``None`` as "unknown".
    """
    try:
        database = self._client()[collection_ref.database]
        total = database[collection_ref.collection].count_documents({})
        return int(total)
    except Exception:
        return None
382
+
383
def _sample_collection_documents(self, collection_ref: CollectionRef) -> list[dict[str, Any]]:
    """Fetch one page (``rows_per_page``, default 100) of documents per the sampling strategy.

    * ``ALL``: the first page in the server's default order.
    * ``RANDOM``: a ``$sample`` of the page size.
    * otherwise: the page with the highest values of the ordering field. When
      a custom ordering field exists in no document and ``fallback_to_random``
      is not explicitly ``False``, a random sample is returned instead.
    """
    collection = self._client()[collection_ref.database][collection_ref.collection]
    sampling = self._sampling()
    page_size = int(sampling.rows_per_page or 100)
    strategy = sampling.strategy

    if strategy == SamplingStrategy.ALL:
        return list(collection.find({}).limit(page_size))
    if strategy == SamplingStrategy.RANDOM:
        return self._sample_random_documents(collection, page_size)

    order_field = self._latest_order_field()
    if order_field != "_id":
        try:
            matches = collection.count_documents({order_field: {"$exists": True}}, limit=1)
            field_present = matches > 0
        except Exception:
            # If the probe fails, assume the field exists and sort as requested.
            field_present = True
        may_fall_back = sampling.fallback_to_random is not False
        if not field_present and may_fall_back:
            return self._sample_random_documents(collection, page_size)

    newest_first = collection.find({}).sort(order_field, self._pymongo.DESCENDING)
    return list(newest_first.limit(page_size))
409
+
410
def _serialize_document(self, document: dict[str, Any]) -> str:
    """Serialize a document to JSON with sorted keys and non-ASCII text kept as-is.

    Values the JSON encoder cannot handle are converted with ``str()``.
    """
    dump_options = {"ensure_ascii": False, "default": str, "sort_keys": True}
    return json.dumps(document, **dump_options)
412
+
413
def _format_collection_content(
    self,
    collection_ref: CollectionRef,
    documents: list[dict[str, Any]],
    doc_offset: int = 0,
) -> tuple[str, str]:
    """Render documents as a ``(raw_content, text_content)`` pair.

    ``text_content`` is a three-line header, a blank line, then one
    ``doc_<n>: <json>`` line per document, numbered from ``1 + doc_offset``.
    ``raw_content`` is a JSON object holding the database, collection,
    strategy, the serialized documents and the offset.
    """
    strategy = self._sampling().strategy
    database = collection_ref.database
    collection = collection_ref.collection

    serialized_documents = [self._serialize_document(document) for document in documents]

    header = [
        f"collection={database}.{collection}",
        f"sampling_strategy={strategy}",
        f"sampled_documents={len(documents)}",
        "",
    ]
    numbered = [
        f"doc_{number}: {text}"
        for number, text in enumerate(serialized_documents, start=1 + doc_offset)
    ]
    text_content = "\n".join(header + numbered)

    payload = {
        "database": database,
        "collection": collection,
        "strategy": str(strategy),
        "documents": serialized_documents,
        "doc_offset": doc_offset,
    }
    raw_content = json.dumps(payload, ensure_ascii=False)
    return raw_content, text_content
447
+
448
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, using the content cache.

    A truthy cache entry is returned as-is. Otherwise the collection is
    sampled, formatted, cached under ``asset_id`` and returned.

    Returns:
        The content pair, or ``None`` when ``asset_id`` cannot be resolved
        to a collection.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    collection_ref = self._parse_collection_ref(asset_id)
    if not collection_ref:
        return None

    sampled = self._sample_collection_documents(collection_ref)
    content = self._format_collection_content(collection_ref, sampled)
    self._content_cache[asset_id] = content
    return content
462
+
463
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield ``(raw_content, text_content)`` for an asset one document at a time.

    For every strategy other than ``SamplingStrategy.ALL`` the documents come
    from a single ``_sample_collection_documents`` call. For ``ALL`` the whole
    collection is read in pages of ``rows_per_page`` (default 100), ordered by
    ``_id``, until an empty or short page is returned or the source is aborted.

    Nothing is yielded when ``asset_id`` cannot be resolved to a collection.
    During a full scan each yielded pair is also written to
    ``_content_cache[asset_id]``, so the cache holds the most recent document.
    """
    sampling = self._sampling()
    collection_ref = self._parse_collection_ref(asset_id)
    if not collection_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        documents = self._sample_collection_documents(collection_ref)
        for i, document in enumerate(documents):
            content = self._format_collection_content(collection_ref, [document], doc_offset=i)
            yield content
        return

    rows_per_page = int(sampling.rows_per_page or 100)
    collection_label = f"{collection_ref.database}.{collection_ref.collection}"

    # The count only drives progress logging; an unknown (None) or zero count
    # disables the "batch x/y" messages but not the scan itself.
    total_docs = self._count_collection_documents(collection_ref)
    total_batches = ((total_docs + rows_per_page - 1) // rows_per_page) if total_docs else None
    if total_docs is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d documents, %d batches of %d",
            collection_label,
            total_docs,
            total_batches,
            rows_per_page,
        )

    collection = self._client()[collection_ref.database][collection_ref.collection]
    offset = 0
    page_num = 1

    while not self._aborted:
        if total_batches is not None:
            logger.info("%s batch %d/%d", collection_label, page_num, total_batches)

        # skip/limit paging is only well-defined over a stable order: without
        # an explicit sort the server may return documents in a different
        # order for each page query, repeating some and missing others.
        page = (
            collection.find({})
            .sort("_id", self._pymongo.ASCENDING)
            .skip(offset)
            .limit(rows_per_page)
        )
        documents = list(page)
        if not documents:
            break

        for i, document in enumerate(documents):
            content = self._format_collection_content(
                collection_ref, [document], doc_offset=offset + i
            )
            self._content_cache[asset_id] = content
            yield content

        offset += len(documents)
        page_num += 1
        # A short page means the end of the collection was reached.
        if len(documents) < rows_per_page:
            break
513
+
514
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a ``Location`` naming the collection (and document, if found) to a finding.

    The document number is taken from the first ``doc_<n>:`` line of
    ``text_content`` that contains ``finding.matched_content``. When the
    finding has no matched content, or no line contains it, the path is just
    ``<database>.<collection>``.

    Does nothing when ``asset.hash`` is not registered in ``_collection_lookup``.
    """
    import re as _re

    collection_ref = self._collection_lookup.get(asset.hash)
    if not collection_ref:
        return

    doc_index: int | None = None
    needle = finding.matched_content
    # Skip the search for missing/empty content: ``None in line`` raises
    # TypeError, and "" is contained in every line, which would always blame
    # the first document.
    if needle:
        doc_line = _re.compile(r"^doc_(\d+):")
        for line in text_content.splitlines():
            match = doc_line.match(line)
            if match and needle in line:
                doc_index = int(match.group(1))
                break

    path = f"{collection_ref.database}.{collection_ref.collection}"
    if doc_index is not None:
        path += f", document {doc_index}"

    finding.location = Location(path=path)
538
+
539
def abort(self) -> None:
    """Log the abort request, then delegate to the base class's ``abort()``."""
    logger.info("Aborting MongoDB extraction...")
    super().abort()
542
+
543
def cleanup(self) -> None:
    """Close the cached MongoDB client, if any, and forget it.

    A failure while closing is logged at debug level and not propagated; the
    cached reference is cleared either way.
    """
    client = self._mongo_client
    if client is None:
        return
    try:
        client.close()
    except Exception:
        logger.debug("Failed to close MongoDB client cleanly", exc_info=True)
    finally:
        self._mongo_client = None
@@ -0,0 +1,3 @@
1
+ from .source import MSSQLSource
2
+
3
+ __all__ = ["MSSQLSource"]