classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
File without changes
|
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator, Iterator
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...models.generated_input import (
|
|
11
|
+
Neo4jInput,
|
|
12
|
+
Neo4jMaskedNone,
|
|
13
|
+
Neo4jMaskedUsernamePassword,
|
|
14
|
+
Neo4jOptionalConnection,
|
|
15
|
+
Neo4jOptionalScope,
|
|
16
|
+
SamplingConfig,
|
|
17
|
+
SamplingStrategy,
|
|
18
|
+
)
|
|
19
|
+
from ...models.generated_single_asset_scan_results import (
|
|
20
|
+
AssetType as OutputAssetType,
|
|
21
|
+
)
|
|
22
|
+
from ...models.generated_single_asset_scan_results import (
|
|
23
|
+
DetectionResult,
|
|
24
|
+
Location,
|
|
25
|
+
SingleAssetScanResults,
|
|
26
|
+
)
|
|
27
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
28
|
+
from ..base import BaseSource
|
|
29
|
+
from ..dependencies import require_module
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
_DEFAULT_EXCLUDED_LABELS = {
|
|
34
|
+
"_Bloom_Perspective_",
|
|
35
|
+
"_Bloom_Scene_",
|
|
36
|
+
"__Neo4jMigration",
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
# Maximum relationship targets to query per label (avoids unbounded DISTINCT scans)
|
|
40
|
+
_RELATIONSHIP_SCAN_LIMIT = 1000
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class LabelRef:
    """Immutable (label, database) pair identifying one Neo4j node label."""

    # Node label name, unescaped.
    label: str
    # Name of the database the label belongs to.
    database: str
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _escape_label(label: str) -> str:
|
|
50
|
+
"""Backtick-escape a Neo4j label identifier."""
|
|
51
|
+
return f"`{label.replace('`', '``')}`"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Neo4jSource(BaseSource):
|
|
55
|
+
source_type = "neo4j"
|
|
56
|
+
STREAM_DETECTIONS = True
|
|
57
|
+
CONTENT_BATCH_SIZE = 500
|
|
58
|
+
|
|
59
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Set up a Neo4j source from a raw recipe.

    Args:
        recipe: Raw recipe mapping; validated into ``Neo4jInput``.
        source_id: Passed through to ``BaseSource.__init__``.
        runner_id: Passed through to ``BaseSource.__init__``; the instance
            attribute falls back to ``"local-run"`` when this is falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = Neo4jInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # Object returned by require_module; the rest of the class uses it as the
    # neo4j driver module (basic_auth, GraphDatabase, TRUST_* constants).
    neo4j_module = require_module(
        module_name="neo4j",
        source_name="Neo4j",
        uv_groups=["neo4j"],
        detail="The Neo4j connector is optional.",
    )
    self._neo4j = neo4j_module

    # asset hash -> LabelRef, filled in by extract_raw()
    self._label_lookup: dict[str, LabelRef] = {}
    # asset id -> (raw_content, text_content)
    self._content_cache: dict[str, tuple[str, str]] = {}
    # created on first use by _driver(), reset by cleanup()
    self._driver_instance: Any | None = None
|
|
79
|
+
|
|
80
|
+
def _asset_type_value(self) -> str:
|
|
81
|
+
type_value = self.config.type
|
|
82
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
83
|
+
|
|
84
|
+
def _sampling(self) -> SamplingConfig:
|
|
85
|
+
return self.config.sampling
|
|
86
|
+
|
|
87
|
+
def _connection_options(self) -> Neo4jOptionalConnection:
|
|
88
|
+
if self.config.optional and self.config.optional.connection:
|
|
89
|
+
return self.config.optional.connection
|
|
90
|
+
return Neo4jOptionalConnection()
|
|
91
|
+
|
|
92
|
+
def _scope_options(self) -> Neo4jOptionalScope:
|
|
93
|
+
if self.config.optional and self.config.optional.scope:
|
|
94
|
+
return self.config.optional.scope
|
|
95
|
+
return Neo4jOptionalScope()
|
|
96
|
+
|
|
97
|
+
def _uri(self) -> str:
|
|
98
|
+
return str(self.config.required.uri).strip()
|
|
99
|
+
|
|
100
|
+
def _database(self) -> str:
|
|
101
|
+
db = self.config.required.database
|
|
102
|
+
return str(db).strip() if db else "neo4j"
|
|
103
|
+
|
|
104
|
+
def _auth(self) -> Any:
    """Build the driver auth value from the masked credentials.

    Returns:
        ``basic_auth(username, password)`` for username/password
        credentials; ``None`` for ``Neo4jMaskedNone`` and for any other
        credential shape.
    """
    credentials = self.config.masked
    if not isinstance(credentials, Neo4jMaskedUsernamePassword):
        # Covers Neo4jMaskedNone as well as any unrecognised variant.
        return None
    return self._neo4j.basic_auth(credentials.username, credentials.password)
|
|
111
|
+
|
|
112
|
+
def _driver(self) -> Any:
|
|
113
|
+
if self._driver_instance is not None:
|
|
114
|
+
return self._driver_instance
|
|
115
|
+
|
|
116
|
+
options = self._connection_options()
|
|
117
|
+
kwargs: dict[str, Any] = {
|
|
118
|
+
"connection_timeout": int(options.connection_timeout_ms or 30000) / 1000.0,
|
|
119
|
+
"max_connection_pool_size": int(options.max_connection_pool_size or 10),
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if options.encrypted is not None:
|
|
123
|
+
kwargs["encrypted"] = bool(options.encrypted)
|
|
124
|
+
|
|
125
|
+
if options.trust_strategy is not None:
|
|
126
|
+
strategy = str(options.trust_strategy)
|
|
127
|
+
if strategy == "TRUST_ALL_CERTIFICATES":
|
|
128
|
+
kwargs["trust"] = self._neo4j.TRUST_ALL_CERTIFICATES
|
|
129
|
+
else:
|
|
130
|
+
kwargs["trust"] = self._neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES
|
|
131
|
+
|
|
132
|
+
auth = self._auth()
|
|
133
|
+
self._driver_instance = self._neo4j.GraphDatabase.driver(self._uri(), auth=auth, **kwargs)
|
|
134
|
+
return self._driver_instance
|
|
135
|
+
|
|
136
|
+
def _session(self, **kwargs: Any) -> Any:
|
|
137
|
+
db = self._database()
|
|
138
|
+
return self._driver().session(database=db, **kwargs)
|
|
139
|
+
|
|
140
|
+
def _label_allowlist(self) -> set[str]:
|
|
141
|
+
configured = self._scope_options().include_labels or []
|
|
142
|
+
return {entry.strip() for entry in configured if entry and entry.strip()}
|
|
143
|
+
|
|
144
|
+
def _label_denylist(self) -> set[str]:
|
|
145
|
+
configured = self._scope_options().exclude_labels or []
|
|
146
|
+
denylist = {entry.strip() for entry in configured if entry and entry.strip()}
|
|
147
|
+
if not denylist:
|
|
148
|
+
denylist = set(_DEFAULT_EXCLUDED_LABELS)
|
|
149
|
+
return denylist
|
|
150
|
+
|
|
151
|
+
def _label_allowed(self, label: str) -> bool:
|
|
152
|
+
denylist = self._label_denylist()
|
|
153
|
+
if label in denylist:
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
allowlist = self._label_allowlist()
|
|
157
|
+
if allowlist and label not in allowlist:
|
|
158
|
+
return False
|
|
159
|
+
|
|
160
|
+
return True
|
|
161
|
+
|
|
162
|
+
def _discover_labels(self) -> list[LabelRef]:
    """List the node labels of the configured database that pass the filters.

    Labels come from ``CALL db.labels()`` ordered by name; non-string and
    empty values are skipped, as are labels rejected by ``_label_allowed``.

    Returns:
        One ``LabelRef`` per accepted label, in query order.
    """
    # NOTE(review): ``node_limit_per_label`` is applied here as a cap on the
    # number of labels returned, not on nodes per label; confirm this is the
    # intended meaning of the option.
    cap = self._scope_options().node_limit_per_label
    max_labels = int(cap) if cap else None

    found: list[LabelRef] = []
    with self._session() as session:
        records = session.run("CALL db.labels() YIELD label RETURN label ORDER BY label")
        for record in records:
            name = record["label"]
            if isinstance(name, str) and name and self._label_allowed(name):
                found.append(LabelRef(label=name, database=self._database()))
                if max_labels is not None and len(found) >= max_labels:
                    break

    logger.info("Discovered %d node label(s) in database '%s'", len(found), self._database())
    return found
|
|
181
|
+
|
|
182
|
+
def _resolve_relationship_links(
    self,
    ref: LabelRef,
    label_hash_map: dict[str, str],
) -> list[str]:
    """Return hashes of labels reachable from ``ref`` via one outgoing relationship.

    Args:
        ref: Label whose nodes form the start of the relationship pattern.
        label_hash_map: Label name -> asset hash; target labels that are
            not keys of this map are ignored.

    Returns:
        Sorted, de-duplicated asset hashes. If the query fails, the error is
        logged as a warning and whatever was collected so far (possibly
        nothing) is returned.
    """
    # The leading ``:`` makes the backticked name a label predicate. Without
    # it Cypher parses the name as a *variable*, so the pattern would match
    # start nodes of every label.
    cypher = (
        f"MATCH (:{_escape_label(ref.label)})-[r]->(b) "
        f"WITH DISTINCT labels(b) AS bl UNWIND bl AS target_label "
        f"RETURN DISTINCT target_label LIMIT {_RELATIONSHIP_SCAN_LIMIT}"
    )
    linked: set[str] = set()
    try:
        with self._session() as session:
            for record in session.run(cypher):
                target = record["target_label"]
                if isinstance(target, str) and target in label_hash_map:
                    linked.add(label_hash_map[target])
    except Exception as exc:
        # Link resolution is best-effort: a failure must not stop extraction.
        logger.warning("Could not resolve relationships for label '%s': %s", ref.label, exc)

    return sorted(linked)
|
|
205
|
+
|
|
206
|
+
def _label_raw_id(self, ref: LabelRef) -> str:
|
|
207
|
+
return f"{ref.database}_#_{ref.label}"
|
|
208
|
+
|
|
209
|
+
def _label_to_asset(self, ref: LabelRef, links: list[str]) -> SingleAssetScanResults:
    """Build the scan-result asset that represents one node label.

    Args:
        ref: Label (and database) the asset describes.
        links: Asset hashes of related labels, stored as-is on the asset.

    Returns:
        A ``SingleAssetScanResults`` whose checksum is computed over the
        label, database, URI and sampling strategy, and whose created and
        updated timestamps are both the current UTC time.
    """
    asset_hash = self.generate_hash_id(self._label_raw_id(ref))

    qualified = f"{ref.database}/{ref.label}"
    external_url = self.ensure_location(
        f"{self._uri()}/{qualified}",
        fallback=f"neo4j://{qualified}",
    )

    # Key order is kept stable because the checksum is derived from this dict.
    metadata = {
        "label": ref.label,
        "database": ref.database,
        "uri": self._uri(),
        "sampling": {"strategy": str(self._sampling().strategy)},
    }
    timestamp = datetime.now(UTC)

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=f"{ref.database}:{ref.label}",
        external_url=external_url,
        links=links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
237
|
+
|
|
238
|
+
def test_connection(self) -> dict[str, Any]:
    """Check connectivity and report the outcome as a dict.

    Returns:
        A dict with ``timestamp`` (UTC ISO string), ``source_type`` (the
        recipe's ``type``), ``status`` (``"SUCCESS"`` or ``"FAILURE"``) and
        a human-readable ``message``. Exceptions raised while connecting or
        listing labels are reported in the dict rather than propagated.
    """
    logger.info("Testing connection to Neo4j at %s...", self._uri())
    outcome: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        self._driver().verify_connectivity()
        reachable = self._discover_labels()
        outcome.update(
            status="SUCCESS",
            message=(
                f"Successfully connected to Neo4j at {self._uri()} "
                f"(database='{self._database()}'). "
                f"Reachable node labels: {len(reachable)}."
            ),
        )
    except Exception as exc:
        outcome.update(
            status="FAILURE",
            message=f"Failed to connect to Neo4j: {exc}",
        )

    return outcome
|
|
259
|
+
|
|
260
|
+
STREAM_DETECTIONS = True
|
|
261
|
+
|
|
262
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield label assets in batches of up to ``BATCH_SIZE``.

    Discovers the labels, optionally resolves relationship links for each
    one, and records every emitted asset in ``_label_lookup``. Stops
    without emitting the pending batch if the source is aborted mid-run.
    """
    if self._aborted:
        return

    logger.info("Starting Neo4j extraction: discovering node labels...")
    labels = self._discover_labels()

    # Build hash map for relationship link resolution
    label_hash_map: dict[str, str] = {}
    for discovered in labels:
        label_hash_map[discovered.label] = self.generate_hash_id(self._label_raw_id(discovered))

    # Links are resolved unless the option is explicitly False (None counts as on).
    include_rels = self._scope_options().include_relationships is not False

    pending: list[SingleAssetScanResults] = []
    total = len(labels)

    for position, ref in enumerate(labels, 1):
        if self._aborted:
            return

        logger.info("Processing label %d/%d: %s", position, total, ref.label)

        links: list[str] = (
            self._resolve_relationship_links(ref, label_hash_map) if include_rels else []
        )
        if links:
            logger.debug("Label '%s' has %d relationship link(s)", ref.label, len(links))

        asset = self._label_to_asset(ref, links)
        self._label_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) < self.BATCH_SIZE:
            continue
        logger.info("Emitting batch of %d label asset(s) (total so far: %d)", len(pending), position)
        yield pending
        pending = []

    if pending:
        logger.info("Emitting final batch of %d asset(s)", len(pending))
        yield pending

    logger.info("Extraction complete: %d node label(s) emitted", total)
|
|
305
|
+
|
|
306
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return ``hash_id`` of ``asset_id`` under this source's asset type."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
|
|
308
|
+
|
|
309
|
+
def _parse_label_ref(self, asset_id: str) -> LabelRef | None:
|
|
310
|
+
if asset_id in self._label_lookup:
|
|
311
|
+
return self._label_lookup[asset_id]
|
|
312
|
+
|
|
313
|
+
decoded = asset_id
|
|
314
|
+
if "_#_" not in decoded:
|
|
315
|
+
try:
|
|
316
|
+
decoded = unhash_id(asset_id)
|
|
317
|
+
except Exception:
|
|
318
|
+
decoded = asset_id
|
|
319
|
+
|
|
320
|
+
parts = decoded.split("_#_")
|
|
321
|
+
if len(parts) >= 2:
|
|
322
|
+
return LabelRef(database=parts[-2], label=parts[-1])
|
|
323
|
+
return None
|
|
324
|
+
|
|
325
|
+
def _fetch_nodes_page(self, ref: LabelRef, skip: int, limit: int) -> list[dict[str, Any]]:
    """Fetch one page of nodes carrying ``ref.label``.

    Args:
        ref: Label to read nodes from.
        skip: Number of nodes to skip before the page starts.
        limit: Maximum number of nodes in the page.

    Returns:
        One property dict per node (an empty dict for a ``None`` node).
    """
    # Every page is a separate query, so the order must be explicit: without
    # ORDER BY, SKIP/LIMIT gives no guarantee that consecutive pages neither
    # overlap nor miss nodes. ``id(n)`` matches the ordering key already used
    # by the LATEST sampling fallback in this class.
    cypher = (
        f"MATCH (n:{_escape_label(ref.label)}) "
        f"RETURN n ORDER BY id(n) SKIP {int(skip)} LIMIT {int(limit)}"
    )
    with self._session() as session:
        return [
            dict(node) if node is not None else {}
            for node in (record["n"] for record in session.run(cypher))
        ]
|
|
335
|
+
|
|
336
|
+
def _fetch_all_nodes_batched(self, ref: LabelRef) -> Iterator[list[dict[str, Any]]]:
    """Yield successive pages of node property dicts for ``ref``.

    The page size is ``sampling.rows_per_page`` or ``CONTENT_BATCH_SIZE``.
    Iteration ends when the source is aborted, a page comes back empty, or
    a page is shorter than the page size.
    """
    page_size = int(self._sampling().rows_per_page or self.CONTENT_BATCH_SIZE)
    label_name = f"{ref.database}:{ref.label}"

    fetched = 0
    batch_num = 0

    while True:
        if self._aborted:
            break
        batch_num += 1
        page = self._fetch_nodes_page(ref, skip=fetched, limit=page_size)
        logger.debug(
            "Content batch %d: fetched %d nodes from %s (offset=%d)",
            batch_num,
            len(page),
            label_name,
            fetched,
        )
        if not page:
            break
        yield page
        fetched += len(page)
        if len(page) < page_size:
            # Short page: nothing left to read.
            break

    logger.info("Fetched nodes from %s in %d content batch(es)", label_name, batch_num)
|
|
362
|
+
|
|
363
|
+
def _fetch_sample_nodes(self, ref: LabelRef) -> list[dict[str, Any]]:
    """Fetch one sample of nodes for ``ref`` according to the sampling config.

    * ``RANDOM``: up to ``rows_per_page`` nodes (default 100), random order.
    * ``LATEST``: up to ``rows_per_page`` nodes, descending by
      ``order_by_column`` when configured (nodes lacking the property are
      excluded), otherwise descending by ``id(n)``.
    * any other strategy (``ALL``): the first page only, through
      ``_fetch_nodes_page``; full pagination is done by
      ``fetch_content_pages``.

    Returns:
        One property dict per node (an empty dict for a ``None`` node).
    """
    sampling = self._sampling()
    strategy = sampling.strategy
    rows = int(sampling.rows_per_page or 100)
    label = _escape_label(ref.label)

    if strategy == SamplingStrategy.RANDOM:
        cypher = f"MATCH (n:{label}) WITH n, rand() AS r ORDER BY r LIMIT {rows} RETURN n"
    elif strategy == SamplingStrategy.LATEST:
        order_col = sampling.order_by_column
        if order_col:
            # The property key comes from the recipe, so it is backtick-quoted
            # like the label: names with spaces or punctuation stay valid and
            # the value cannot inject extra Cypher.
            prop = _escape_label(str(order_col))
            cypher = (
                f"MATCH (n:{label}) "
                f"WHERE n.{prop} IS NOT NULL "
                f"RETURN n ORDER BY n.{prop} DESC LIMIT {rows}"
            )
        else:
            # Fallback: ID-ordered (stable and often insertion-ordered)
            cypher = f"MATCH (n:{label}) RETURN n ORDER BY id(n) DESC LIMIT {rows}"
    else:
        # ALL — first page only for fetch_content; full pagination via fetch_content_pages
        batch_size = int(sampling.rows_per_page or self.CONTENT_BATCH_SIZE)
        return self._fetch_nodes_page(ref, skip=0, limit=batch_size)

    nodes: list[dict[str, Any]] = []
    with self._session() as session:
        for record in session.run(cypher):
            node = record["n"]
            nodes.append(dict(node) if node is not None else {})
    return nodes
|
|
400
|
+
|
|
401
|
+
def _serialize_node(self, props: dict[str, Any]) -> str:
|
|
402
|
+
return json.dumps(props, ensure_ascii=False, default=str, sort_keys=True)
|
|
403
|
+
|
|
404
|
+
def _format_label_content(
|
|
405
|
+
self,
|
|
406
|
+
ref: LabelRef,
|
|
407
|
+
nodes: list[dict[str, Any]],
|
|
408
|
+
node_offset: int = 0,
|
|
409
|
+
) -> tuple[str, str]:
|
|
410
|
+
sampling = self._sampling()
|
|
411
|
+
strategy = sampling.strategy
|
|
412
|
+
lines = [
|
|
413
|
+
f"label={ref.database}:{ref.label}",
|
|
414
|
+
f"sampling_strategy={strategy}",
|
|
415
|
+
f"sampled_nodes={len(nodes)}",
|
|
416
|
+
"",
|
|
417
|
+
]
|
|
418
|
+
|
|
419
|
+
serialized_nodes: list[str] = []
|
|
420
|
+
for index, props in enumerate(nodes, start=1 + node_offset):
|
|
421
|
+
serialized = self._serialize_node(props)
|
|
422
|
+
serialized_nodes.append(serialized)
|
|
423
|
+
lines.append(f"node_{index}: {serialized}")
|
|
424
|
+
|
|
425
|
+
text_content = "\n".join(lines)
|
|
426
|
+
raw_content = json.dumps(
|
|
427
|
+
{
|
|
428
|
+
"database": ref.database,
|
|
429
|
+
"label": ref.label,
|
|
430
|
+
"strategy": str(strategy),
|
|
431
|
+
"nodes": serialized_nodes,
|
|
432
|
+
"node_offset": node_offset,
|
|
433
|
+
},
|
|
434
|
+
ensure_ascii=False,
|
|
435
|
+
)
|
|
436
|
+
return raw_content, text_content
|
|
437
|
+
|
|
438
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, using the cache.

    Returns:
        The cached content when present; otherwise freshly fetched sample
        content (which is then cached), or ``None`` when ``asset_id``
        cannot be resolved to a label.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    ref = self._parse_label_ref(asset_id)
    if not ref:
        return None

    rendered = self._format_label_content(ref, self._fetch_sample_nodes(ref))
    self._content_cache[asset_id] = rendered
    return rendered
|
|
451
|
+
|
|
452
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield content for an asset one node at a time.

    For any strategy other than ``ALL`` the nodes come from a single
    sample; for ``ALL`` every node of the label is paged through. Each
    yielded item is ``(raw_content, text_content)`` for exactly one node,
    with ``node_offset`` set to that node's 0-based position. Yields
    nothing when ``asset_id`` cannot be resolved to a label.
    """
    sampling = self._sampling()
    ref = self._parse_label_ref(asset_id)
    if not ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        for position, props in enumerate(self._fetch_sample_nodes(ref)):
            yield self._format_label_content(ref, [props], node_offset=position)
        return

    label_name = f"{ref.database}:{ref.label}"
    batch_size = int(sampling.rows_per_page or self.CONTENT_BATCH_SIZE)
    seen = 0

    for batch_num, node_batch in enumerate(self._fetch_all_nodes_batched(ref), start=1):
        logger.info(
            "%s batch %d: %d node(s) (offset=%d)",
            label_name,
            batch_num,
            len(node_batch),
            seen,
        )
        for position, props in enumerate(node_batch, start=seen):
            content = self._format_label_content(ref, [props], node_offset=position)
            # NOTE(review): only this branch writes the cache, and each node
            # overwrites the previous entry, so a later fetch_content() call
            # gets the last single-node page; confirm this is intended.
            self._content_cache[asset_id] = content
            yield content
        seen += len(node_batch)
        if len(node_batch) < batch_size:
            break
|
|
486
|
+
|
|
487
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to the label path, plus node number if found.

    The path is ``<database>:<label>``; when a ``node_<n>:`` line of
    ``text_content`` contains the finding's matched content, ``, node <n>``
    is appended (first such line wins). Does nothing when the asset's hash
    is not in ``_label_lookup``.
    """
    import re as _re

    ref = self._label_lookup.get(asset.hash)
    if not ref:
        return

    path = f"{ref.database}:{ref.label}"
    # NOTE(review): an empty matched_content matches the first node line and
    # a None value would raise TypeError here; confirm the field is always
    # a non-empty string.
    for line in text_content.splitlines():
        header = _re.match(r"^node_(\d+):", line)
        if header is None or finding.matched_content not in line:
            continue
        path += f", node {int(header.group(1))}"
        break

    finding.location = Location(path=path)
|
|
511
|
+
|
|
512
|
+
def abort(self) -> None:
    """Log the abort request, then delegate to ``BaseSource.abort``."""
    logger.info("Aborting Neo4j extraction...")
    super().abort()
|
|
515
|
+
|
|
516
|
+
def cleanup(self) -> None:
    """Close the cached driver, if any, and forget it.

    A failure while closing is logged at debug level and not raised; the
    cached reference is cleared either way.
    """
    driver = self._driver_instance
    if driver is None:
        return
    try:
        driver.close()
    except Exception:
        logger.debug("Failed to close Neo4j driver cleanly", exc_info=True)
    finally:
        self._driver_instance = None
|