classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
File without changes
@@ -0,0 +1,523 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from collections.abc import AsyncGenerator, Iterator
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from ...models.generated_input import (
11
+ Neo4jInput,
12
+ Neo4jMaskedNone,
13
+ Neo4jMaskedUsernamePassword,
14
+ Neo4jOptionalConnection,
15
+ Neo4jOptionalScope,
16
+ SamplingConfig,
17
+ SamplingStrategy,
18
+ )
19
+ from ...models.generated_single_asset_scan_results import (
20
+ AssetType as OutputAssetType,
21
+ )
22
+ from ...models.generated_single_asset_scan_results import (
23
+ DetectionResult,
24
+ Location,
25
+ SingleAssetScanResults,
26
+ )
27
+ from ...utils.hashing import hash_id, unhash_id
28
+ from ..base import BaseSource
29
+ from ..dependencies import require_module
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ _DEFAULT_EXCLUDED_LABELS = {
34
+ "_Bloom_Perspective_",
35
+ "_Bloom_Scene_",
36
+ "__Neo4jMigration",
37
+ }
38
+
39
+ # Maximum relationship targets to query per label (avoids unbounded DISTINCT scans)
40
+ _RELATIONSHIP_SCAN_LIMIT = 1000
41
+
42
+
43
@dataclass(frozen=True)
class LabelRef:
    """Immutable (label, database) pair identifying one node label."""

    # Node label name.
    label: str
    # Name of the database the label was discovered in.
    database: str
47
+
48
+
49
+ def _escape_label(label: str) -> str:
50
+ """Backtick-escape a Neo4j label identifier."""
51
+ return f"`{label.replace('`', '``')}`"
52
+
53
+
54
+ class Neo4jSource(BaseSource):
55
+ source_type = "neo4j"
56
+ STREAM_DETECTIONS = True
57
+ CONTENT_BATCH_SIZE = 500
58
+
59
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, resolve the ``neo4j`` module and reset per-run state.

    Args:
        recipe: Raw source recipe; validated into a ``Neo4jInput`` model.
        source_id: Forwarded unchanged to the base-class initializer.
        runner_id: Forwarded to the base class; stored on this instance as
            ``"local-run"`` when it is falsy.
    """
    super().__init__(recipe, source_id, runner_id)

    self.config = Neo4jInput.model_validate(recipe)
    # A falsy runner id (None or "") is replaced by a fixed marker.
    self.runner_id = runner_id if runner_id else "local-run"

    # The driver package is resolved through require_module, not a top-level import.
    self._neo4j = require_module(
        module_name="neo4j",
        source_name="Neo4j",
        uv_groups=["neo4j"],
        detail="The Neo4j connector is optional.",
    )

    # asset hash -> label reference, filled while assets are emitted.
    self._label_lookup: dict[str, LabelRef] = {}
    # asset id -> (raw_content, text_content) produced by the content fetchers.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # Driver is created on first use by _driver().
    self._driver_instance: Any | None = None
79
+
80
+ def _asset_type_value(self) -> str:
81
+ type_value = self.config.type
82
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
83
+
84
+ def _sampling(self) -> SamplingConfig:
85
+ return self.config.sampling
86
+
87
+ def _connection_options(self) -> Neo4jOptionalConnection:
88
+ if self.config.optional and self.config.optional.connection:
89
+ return self.config.optional.connection
90
+ return Neo4jOptionalConnection()
91
+
92
+ def _scope_options(self) -> Neo4jOptionalScope:
93
+ if self.config.optional and self.config.optional.scope:
94
+ return self.config.optional.scope
95
+ return Neo4jOptionalScope()
96
+
97
+ def _uri(self) -> str:
98
+ return str(self.config.required.uri).strip()
99
+
100
+ def _database(self) -> str:
101
+ db = self.config.required.database
102
+ return str(db).strip() if db else "neo4j"
103
+
104
def _auth(self) -> Any:
    """Return a basic-auth token for username/password credentials, otherwise ``None``."""
    masked = self.config.masked
    # Every credential shape other than username/password results in no auth.
    if not isinstance(masked, Neo4jMaskedUsernamePassword):
        return None
    return self._neo4j.basic_auth(masked.username, masked.password)
111
+
112
+ def _driver(self) -> Any:
113
+ if self._driver_instance is not None:
114
+ return self._driver_instance
115
+
116
+ options = self._connection_options()
117
+ kwargs: dict[str, Any] = {
118
+ "connection_timeout": int(options.connection_timeout_ms or 30000) / 1000.0,
119
+ "max_connection_pool_size": int(options.max_connection_pool_size or 10),
120
+ }
121
+
122
+ if options.encrypted is not None:
123
+ kwargs["encrypted"] = bool(options.encrypted)
124
+
125
+ if options.trust_strategy is not None:
126
+ strategy = str(options.trust_strategy)
127
+ if strategy == "TRUST_ALL_CERTIFICATES":
128
+ kwargs["trust"] = self._neo4j.TRUST_ALL_CERTIFICATES
129
+ else:
130
+ kwargs["trust"] = self._neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES
131
+
132
+ auth = self._auth()
133
+ self._driver_instance = self._neo4j.GraphDatabase.driver(self._uri(), auth=auth, **kwargs)
134
+ return self._driver_instance
135
+
136
+ def _session(self, **kwargs: Any) -> Any:
137
+ db = self._database()
138
+ return self._driver().session(database=db, **kwargs)
139
+
140
+ def _label_allowlist(self) -> set[str]:
141
+ configured = self._scope_options().include_labels or []
142
+ return {entry.strip() for entry in configured if entry and entry.strip()}
143
+
144
+ def _label_denylist(self) -> set[str]:
145
+ configured = self._scope_options().exclude_labels or []
146
+ denylist = {entry.strip() for entry in configured if entry and entry.strip()}
147
+ if not denylist:
148
+ denylist = set(_DEFAULT_EXCLUDED_LABELS)
149
+ return denylist
150
+
151
+ def _label_allowed(self, label: str) -> bool:
152
+ denylist = self._label_denylist()
153
+ if label in denylist:
154
+ return False
155
+
156
+ allowlist = self._label_allowlist()
157
+ if allowlist and label not in allowlist:
158
+ return False
159
+
160
+ return True
161
+
162
def _discover_labels(self) -> list[LabelRef]:
    """List the node labels of the configured database that pass the allow/deny filters.

    Returns:
        Label references in the order returned by ``db.labels()`` (sorted by
        label), truncated once the configured cap is reached.
    """
    # NOTE(review): the option is named node_limit_per_label but is applied here
    # as a cap on the number of labels — confirm this is intended.
    configured_cap = self._scope_options().node_limit_per_label
    label_cap = int(configured_cap) if configured_cap else None
    database_name = self._database()

    discovered: list[LabelRef] = []
    with self._session() as session:
        for record in session.run("CALL db.labels() YIELD label RETURN label ORDER BY label"):
            name = record["label"]
            # Skip non-string or empty label values.
            if not (isinstance(name, str) and name):
                continue
            if not self._label_allowed(name):
                continue
            discovered.append(LabelRef(label=name, database=database_name))
            if label_cap is not None and len(discovered) >= label_cap:
                break

    logger.info("Discovered %d node label(s) in database '%s'", len(discovered), database_name)
    return discovered
181
+
182
def _resolve_relationship_links(
    self,
    ref: LabelRef,
    label_hash_map: dict[str, str],
) -> list[str]:
    """Return hashes of labels reachable from nodes of ``ref.label`` via outgoing relationships.

    Args:
        ref: Label whose outgoing relationships are inspected.
        label_hash_map: Mapping of label name to asset hash; target labels
            missing from it are ignored.

    Returns:
        Sorted, de-duplicated list of asset hashes. Empty when the query fails,
        in which case a warning is logged instead of raising.
    """
    # The leading ':' makes the escaped name a label predicate. Without it Cypher
    # reads the backticked identifier as a variable name and matches every node.
    cypher = (
        f"MATCH (:{_escape_label(ref.label)})-[]->(b) "
        "WITH DISTINCT labels(b) AS bl UNWIND bl AS target_label "
        f"RETURN DISTINCT target_label LIMIT {_RELATIONSHIP_SCAN_LIMIT}"
    )

    linked_hashes: set[str] = set()
    try:
        with self._session() as session:
            for record in session.run(cypher):
                target = record["target_label"]
                if isinstance(target, str) and target in label_hash_map:
                    linked_hashes.add(label_hash_map[target])
    except Exception as exc:
        # Best effort: link resolution must not abort the extraction.
        logger.warning("Could not resolve relationships for label '%s': %s", ref.label, exc)

    return sorted(linked_hashes)
205
+
206
+ def _label_raw_id(self, ref: LabelRef) -> str:
207
+ return f"{ref.database}_#_{ref.label}"
208
+
209
def _label_to_asset(self, ref: LabelRef, links: list[str]) -> SingleAssetScanResults:
    """Build the scan-result asset that represents one node label.

    Args:
        ref: Label/database pair the asset stands for.
        links: Hashes stored on the asset as ``links``.

    Returns:
        A ``SingleAssetScanResults`` whose hash derives from the label's raw id
        and whose checksum covers label, database, URI and sampling strategy.
    """
    asset_hash = self.generate_hash_id(self._label_raw_id(ref))
    location = self.ensure_location(
        f"{self._uri()}/{ref.database}/{ref.label}",
        fallback=f"neo4j://{ref.database}/{ref.label}",
    )

    # Only feeds the checksum below; it is not attached to the asset.
    checksum_input = {
        "label": ref.label,
        "database": ref.database,
        "uri": self._uri(),
        "sampling": {"strategy": str(self._sampling().strategy)},
    }
    # A single timestamp is used for both created_at and updated_at.
    timestamp = datetime.now(UTC)

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(checksum_input),
        name=f"{ref.database}:{ref.label}",
        external_url=location,
        links=links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
237
+
238
def test_connection(self) -> dict[str, Any]:
    """Verify connectivity and label discovery, reporting the outcome as a dict.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and a human-readable ``message``. Exceptions are
        captured in the message and not raised.
    """
    logger.info("Testing connection to Neo4j at %s...", self._uri())
    report: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        self._driver().verify_connectivity()
        label_count = len(self._discover_labels())
        summary = (
            f"Successfully connected to Neo4j at {self._uri()} "
            f"(database='{self._database()}'). "
            f"Reachable node labels: {label_count}."
        )
        report.update(status="SUCCESS", message=summary)
    except Exception as exc:
        report.update(status="FAILURE", message=f"Failed to connect to Neo4j: {exc}")

    return report
259
+
260
+ STREAM_DETECTIONS = True
261
+
262
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield one asset per discovered node label, in batches of at most ``BATCH_SIZE``.

    Stops without yielding the pending batch as soon as the abort flag is set.
    Each emitted asset is also recorded in ``_label_lookup`` under its hash.
    """
    if self._aborted:
        return

    logger.info("Starting Neo4j extraction: discovering node labels...")
    labels = self._discover_labels()
    total = len(labels)

    # Hash of every discovered label, used to turn relationship targets into links.
    label_hash_map: dict[str, str] = {}
    for ref in labels:
        label_hash_map[ref.label] = self.generate_hash_id(self._label_raw_id(ref))

    # Relationships are resolved unless the option is explicitly False (None counts as enabled).
    resolve_links = self._scope_options().include_relationships is not False

    pending: list[SingleAssetScanResults] = []
    for position, ref in enumerate(labels, 1):
        if self._aborted:
            return

        logger.info("Processing label %d/%d: %s", position, total, ref.label)

        links: list[str] = (
            self._resolve_relationship_links(ref, label_hash_map) if resolve_links else []
        )
        if links:
            logger.debug("Label '%s' has %d relationship link(s)", ref.label, len(links))

        asset = self._label_to_asset(ref, links)
        self._label_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            logger.info(
                "Emitting batch of %d label asset(s) (total so far: %d)", len(pending), position
            )
            yield pending
            pending = []

    if pending:
        logger.info("Emitting final batch of %d asset(s)", len(pending))
        yield pending

    logger.info("Extraction complete: %d node label(s) emitted", total)
305
+
306
def generate_hash_id(self, asset_id: str) -> str:
    """Return ``hash_id`` of the configured source type and *asset_id*."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
308
+
309
+ def _parse_label_ref(self, asset_id: str) -> LabelRef | None:
310
+ if asset_id in self._label_lookup:
311
+ return self._label_lookup[asset_id]
312
+
313
+ decoded = asset_id
314
+ if "_#_" not in decoded:
315
+ try:
316
+ decoded = unhash_id(asset_id)
317
+ except Exception:
318
+ decoded = asset_id
319
+
320
+ parts = decoded.split("_#_")
321
+ if len(parts) >= 2:
322
+ return LabelRef(database=parts[-2], label=parts[-1])
323
+ return None
324
+
325
def _fetch_nodes_page(self, ref: LabelRef, skip: int, limit: int) -> list[dict[str, Any]]:
    """Fetch one page of nodes for ``ref.label`` and return their properties as dicts.

    Args:
        ref: Label to read nodes from.
        skip: Value placed in the query's ``SKIP`` clause.
        limit: Value placed in the query's ``LIMIT`` clause.

    Returns:
        One dict per record; a ``None`` node yields an empty dict.
    """
    # NOTE(review): the query has no ORDER BY, so page boundaries rely on the
    # server returning nodes in a stable order across calls — confirm.
    cypher = f"MATCH (n:{_escape_label(ref.label)}) RETURN n SKIP {skip} LIMIT {limit}"
    with self._session() as session:
        return [
            {} if (node := record["n"]) is None else dict(node)
            for record in session.run(cypher)
        ]
335
+
336
def _fetch_all_nodes_batched(self, ref: LabelRef) -> Iterator[list[dict[str, Any]]]:
    """Yield successive pages of node-property dicts for ``ref.label``.

    The page size is ``rows_per_page`` from the sampling config, falling back
    to ``CONTENT_BATCH_SIZE``. Iteration ends on an empty page, on a short
    page, or when the abort flag is set.
    """
    page_size = int(self._sampling().rows_per_page or self.CONTENT_BATCH_SIZE)
    label_name = f"{ref.database}:{ref.label}"

    skip = 0
    pages_requested = 0

    while True:
        if self._aborted:
            break

        pages_requested += 1
        page = self._fetch_nodes_page(ref, skip=skip, limit=page_size)
        logger.debug(
            "Content batch %d: fetched %d nodes from %s (offset=%d)",
            pages_requested,
            len(page),
            label_name,
            skip,
        )
        if not page:
            break

        yield page
        skip += len(page)

        # A short page means the label has no further nodes.
        if len(page) < page_size:
            break

    logger.info("Fetched nodes from %s in %d content batch(es)", label_name, pages_requested)
362
+
363
def _fetch_sample_nodes(self, ref: LabelRef) -> list[dict[str, Any]]:
    """Fetch a sample of nodes for ``ref.label`` according to the sampling strategy.

    * ``RANDOM``: ``rows_per_page`` (default 100) nodes in random order.
    * ``LATEST``: newest nodes by ``order_by_column`` when one is configured,
      otherwise by descending ``id(n)``.
    * anything else: the first page, as returned by ``_fetch_nodes_page``.

    Returns:
        One property dict per node; a ``None`` node yields an empty dict.
    """
    sampling = self._sampling()
    strategy = sampling.strategy
    rows = int(sampling.rows_per_page or 100)
    label = _escape_label(ref.label)

    if strategy == SamplingStrategy.RANDOM:
        cypher = f"MATCH (n:{label}) WITH n, rand() AS r ORDER BY r LIMIT {rows} RETURN n"
    elif strategy == SamplingStrategy.LATEST:
        raw_column = sampling.order_by_column
        # A blank column name is treated as unset; it would otherwise yield invalid Cypher.
        column = str(raw_column).strip() if raw_column else ""
        if column:
            # The property name comes from configuration, so it is backtick-escaped
            # like labels are; unescaped it could break or alter the query.
            prop = _escape_label(column)
            cypher = (
                f"MATCH (n:{label}) "
                f"WHERE n.{prop} IS NOT NULL "
                f"RETURN n ORDER BY n.{prop} DESC LIMIT {rows}"
            )
        else:
            # Fallback: ID-ordered (stable and often insertion-ordered)
            cypher = f"MATCH (n:{label}) RETURN n ORDER BY id(n) DESC LIMIT {rows}"
    else:
        # ALL — first page only for fetch_content; full pagination via fetch_content_pages
        batch_size = int(sampling.rows_per_page or self.CONTENT_BATCH_SIZE)
        return self._fetch_nodes_page(ref, skip=0, limit=batch_size)

    with self._session() as session:
        return [
            {} if (node := record["n"]) is None else dict(node)
            for record in session.run(cypher)
        ]
400
+
401
+ def _serialize_node(self, props: dict[str, Any]) -> str:
402
+ return json.dumps(props, ensure_ascii=False, default=str, sort_keys=True)
403
+
404
+ def _format_label_content(
405
+ self,
406
+ ref: LabelRef,
407
+ nodes: list[dict[str, Any]],
408
+ node_offset: int = 0,
409
+ ) -> tuple[str, str]:
410
+ sampling = self._sampling()
411
+ strategy = sampling.strategy
412
+ lines = [
413
+ f"label={ref.database}:{ref.label}",
414
+ f"sampling_strategy={strategy}",
415
+ f"sampled_nodes={len(nodes)}",
416
+ "",
417
+ ]
418
+
419
+ serialized_nodes: list[str] = []
420
+ for index, props in enumerate(nodes, start=1 + node_offset):
421
+ serialized = self._serialize_node(props)
422
+ serialized_nodes.append(serialized)
423
+ lines.append(f"node_{index}: {serialized}")
424
+
425
+ text_content = "\n".join(lines)
426
+ raw_content = json.dumps(
427
+ {
428
+ "database": ref.database,
429
+ "label": ref.label,
430
+ "strategy": str(strategy),
431
+ "nodes": serialized_nodes,
432
+ "node_offset": node_offset,
433
+ },
434
+ ensure_ascii=False,
435
+ )
436
+ return raw_content, text_content
437
+
438
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, using the cache when possible.

    Returns:
        The cached or freshly built content pair, or ``None`` when *asset_id*
        cannot be resolved to a label reference.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    ref = self._parse_label_ref(asset_id)
    if ref:
        sampled = self._fetch_sample_nodes(ref)
        content = self._format_label_content(ref, sampled)
        self._content_cache[asset_id] = content
        return content

    return None
451
+
452
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield one ``(raw_content, text_content)`` pair per node of the asset's label.

    For strategies other than ``ALL`` the sampled nodes are yielded one by one.
    For ``ALL`` every node is yielded, page by page. Nothing is yielded when
    *asset_id* cannot be resolved to a label reference.
    """
    sampling = self._sampling()
    ref = self._parse_label_ref(asset_id)
    if not ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        for position, props in enumerate(self._fetch_sample_nodes(ref)):
            yield self._format_label_content(ref, [props], node_offset=position)
        return

    label_name = f"{ref.database}:{ref.label}"
    page_size = int(sampling.rows_per_page or self.CONTENT_BATCH_SIZE)
    nodes_seen = 0
    page_no = 0

    for page in self._fetch_all_nodes_batched(ref):
        page_no += 1
        logger.info(
            "%s batch %d: %d node(s) (offset=%d)",
            label_name,
            page_no,
            len(page),
            nodes_seen,
        )
        for node_offset, props in enumerate(page, start=nodes_seen):
            content = self._format_label_content(ref, [props], node_offset=node_offset)
            # NOTE(review): every single-node page overwrites the cache entry, so a
            # later fetch_content() cache hit returns only the last node — confirm.
            self._content_cache[asset_id] = content
            yield content
        nodes_seen += len(page)
        if len(page) < page_size:
            break
486
+
487
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a ``Location`` to *finding*, naming the label and, when found, the node.

    Args:
        finding: Detection whose ``location`` is set in place.
        asset: Asset the finding belongs to; its hash selects the label reference.
        text_content: Text with ``node_<n>: ...`` lines that is searched for the
            finding's matched content.

    Does nothing when the asset hash is not in ``_label_lookup``.
    """
    import re as _re

    ref = self._label_lookup.get(asset.hash)
    if not ref:
        return

    node_index: int | None = None
    needle = finding.matched_content
    # Search only for a non-empty string: None would raise in the ``in`` test
    # and "" would match the first node line regardless of its content.
    if isinstance(needle, str) and needle:
        node_line = _re.compile(r"^node_(\d+):")
        for line in text_content.splitlines():
            match = node_line.match(line)
            if match and needle in line:
                node_index = int(match.group(1))
                break

    path = f"{ref.database}:{ref.label}"
    if node_index is not None:
        path += f", node {node_index}"

    finding.location = Location(path=path)
511
+
512
def abort(self) -> None:
    """Log the abort request, then delegate to the base-class ``abort``."""
    logger.info("Aborting Neo4j extraction...")
    super().abort()
515
+
516
def cleanup(self) -> None:
    """Close the cached driver, if any, and always clear the cached reference.

    A failure while closing is logged at debug level and not raised.
    """
    driver = self._driver_instance
    if driver is None:
        return

    try:
        driver.close()
    except Exception:
        logger.debug("Failed to close Neo4j driver cleanly", exc_info=True)
    finally:
        self._driver_instance = None