voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/builder.py ADDED
@@ -0,0 +1,894 @@
1
+ """
2
+ graph/builder.py — Builds and updates the NetworkX relationship graph from DB entities.
3
+
4
+ The graph is a NetworkX MultiDiGraph (directed, allows multiple edges between the
5
+ same node pair). All public functions accept and return nx.MultiDiGraph so callers
6
+ can chain operations.
7
+
8
+ Public interface
9
+ ----------------
10
+ build_graph_from_db(investigation_id, since) → nx.MultiDiGraph
11
+ add_entity_to_graph(graph, entity) → nx.MultiDiGraph
12
+ add_relationship(graph, source_id, target_id, ...) → nx.MultiDiGraph
13
+ infer_relationships(graph) → nx.MultiDiGraph
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import itertools
19
+ import logging
20
+ import os
21
+ from collections import defaultdict
22
+ from datetime import datetime, timezone
23
+ from typing import Optional
24
+ import uuid
25
+ from urllib.parse import urlparse
26
+
27
+ import networkx as nx
28
+ import sqlalchemy as sa
29
+
30
+ from extractor.normalizer import NormalizedEntity
31
+ from graph.model import EDGE_TYPES, NODE_TYPES
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Mapping: extractor entity_type → graph node_type
37
+ # ---------------------------------------------------------------------------
38
+
39
# Lookup table from extractor entity_type strings to graph node_type values.
# Callers in this module read it with .get(); an entity type that is absent
# here yields None and the entity is skipped (no node is created for it).
_ENTITY_TYPE_TO_NODE_TYPE: dict[str, str] = {
    "THREAT_ACTOR_HANDLE": NODE_TYPES.THREAT_ACTOR,
    # All three wallet address kinds share a single node type.
    "BITCOIN_ADDRESS": NODE_TYPES.CRYPTO_WALLET,
    "ETHEREUM_ADDRESS": NODE_TYPES.CRYPTO_WALLET,
    "MONERO_ADDRESS": NODE_TYPES.CRYPTO_WALLET,
    "ONION_URL": NODE_TYPES.ONION_URL,
    "EMAIL_ADDRESS": NODE_TYPES.EMAIL_ADDRESS,
    "PGP_KEY_BLOCK": NODE_TYPES.PGP_KEY,
    "CVE_NUMBER": NODE_TYPES.CVE,
    # NOTE(review): "CVE" maps to a raw string while "CVE_NUMBER" maps to
    # NODE_TYPES.CVE — confirm the two are meant to be different node types.
    "CVE": "vulnerability",
    "PASTE_URL": NODE_TYPES.PASTE,
    "MALWARE_FAMILY": NODE_TYPES.MALWARE_FAMILY,
    "RANSOMWARE_GROUP": NODE_TYPES.RANSOMWARE_GROUP,
    "IP_ADDRESS": NODE_TYPES.IP_ADDRESS,
    "PHONE_NUMBER": NODE_TYPES.PHONE_NUMBER,
    "ORGANIZATION_NAME": NODE_TYPES.ORGANIZATION,
    "DATE": NODE_TYPES.DATE,
    # NOTE(review): the entries below use string literals instead of
    # NODE_TYPES constants — verify graph.model accepts these values.
    "FILE_HASH_MD5": "file_hash",
    "FILE_HASH_SHA1": "file_hash",
    "FILE_HASH_SHA256": "file_hash",
    "MITRE_TECHNIQUE": "technique",
}
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Internal helpers
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
def _extract_domain(url: str) -> str:
    """Return the ``netloc`` component of *url*, or ``""`` when there is none.

    The result is exactly what ``urlparse`` reports as the network location,
    so a port (and any userinfo) present in the URL is kept. A URL without a
    scheme separator has no netloc and yields ``""``. Any exception raised
    while parsing also yields ``""``.
    """
    try:
        netloc = urlparse(url).netloc
    except Exception:
        # Best effort: a malformed URL simply has no domain.
        return ""
    return netloc if netloc else ""
75
+
76
+
77
def _make_node_id(entity_type: str, value: str, source_url: str) -> str:
    """
    Build the graph node identifier for an entity.

    Threat-actor handles are scoped to the domain of the page they were seen
    on (``"<handle>@<domain>"``), so the same handle on two forums becomes two
    separate nodes. Every other entity type, and a handle whose source URL is
    empty or has no extractable domain, is identified by *value* alone.
    """
    scoped_handle = entity_type == "THREAT_ACTOR_HANDLE" and bool(source_url)
    if not scoped_handle:
        return value

    forum = _extract_domain(source_url)
    return f"{value}@{forum}" if forum else value
91
+
92
+
93
def _now_utc() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Public: add_entity_to_graph
99
+ # ---------------------------------------------------------------------------
100
+
101
+
102
def add_entity_to_graph(
    graph: nx.MultiDiGraph,
    entity: NormalizedEntity,
) -> nx.MultiDiGraph:
    """
    Upsert a single NormalizedEntity as a node in *graph*.

    - Entity types missing from ``_ENTITY_TYPE_TO_NODE_TYPE`` are ignored and
      the graph is returned untouched.
    - If the node does not exist: create it with ``first_seen`` and
      ``last_seen`` set to the current UTC time.
    - If the node already exists:
        * set ``last_seen`` to the current UTC time (the entity was just
          re-observed; no entity timestamp is consulted)
        * append ``entity.source_url`` if it is not already recorded
        * if the node is an untyped stub (empty ``node_type``, as created by
          ``add_relationship``), fill in its node type and metadata

    Returns the same graph object, modified in place.
    """
    node_type = _ENTITY_TYPE_TO_NODE_TYPE.get(entity.entity_type)
    if node_type is None:
        return graph  # entity type has no graph representation

    node_id = _make_node_id(entity.entity_type, entity.value, entity.source_url)
    now = _now_utc()

    # Threat actors carry their handle and originating forum so that the
    # handle-similarity inference pass can compare them later.
    metadata: dict = {}
    if node_type == NODE_TYPES.THREAT_ACTOR:
        metadata["handle"] = entity.value
        domain = _extract_domain(entity.source_url)
        if domain:
            metadata["forum"] = domain

    if not graph.has_node(node_id):
        graph.add_node(
            node_id,
            node_type=node_type,
            first_seen=now,
            last_seen=now,
            source_urls=[entity.source_url] if entity.source_url else [],
            metadata=metadata,
        )
        return graph

    data = graph.nodes[node_id]
    data["last_seen"] = now

    # A stub created as an edge endpoint has no type yet; promote it now that
    # the real entity has been observed. Existing metadata is never replaced.
    if not data.get("node_type"):
        data["node_type"] = node_type
        if not data.get("metadata"):
            data["metadata"] = metadata

    # Nodes that were created without a source_urls attribute are tolerated
    # instead of raising KeyError.
    known_urls = data.get("source_urls") or []
    if entity.source_url and entity.source_url not in known_urls:
        # Build a new list rather than mutating one that may be shared.
        data["source_urls"] = known_urls + [entity.source_url]

    return graph
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Public: add_relationship
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
def add_relationship(
    graph: nx.MultiDiGraph,
    source_id: str,
    target_id: str,
    edge_type: str,
    confidence: float,
    source_url: str,
    metadata: Optional[dict] = None,
) -> nx.MultiDiGraph:
    """
    Insert a directed edge ``source_id -> target_id`` into *graph*.

    Any endpoint that is not yet a node is first created as a stub: empty
    node type, no source URLs, empty metadata, confidence 0.0, and both
    timestamps set to the current UTC time. The edge is stamped with that
    same time. A falsy *metadata* is stored as an empty dict.

    Returns the same graph object, modified in place.
    """
    stamp = _now_utc()

    for endpoint in (source_id, target_id):
        if graph.has_node(endpoint):
            continue
        # Fresh containers per stub so nodes never share mutable state.
        stub_attrs = {
            "node_type": "",
            "first_seen": stamp,
            "last_seen": stamp,
            "source_urls": [],
            "metadata": {},
            "confidence": 0.0,
        }
        graph.add_node(endpoint, **stub_attrs)

    edge_attrs = {
        "edge_type": edge_type,
        "confidence": confidence,
        "source_url": source_url,
        "timestamp": stamp,
        "metadata": metadata if metadata else {},
    }
    graph.add_edge(source_id, target_id, **edge_attrs)
    return graph
192
+
193
+
194
+ # ---------------------------------------------------------------------------
195
+ # Public: infer_relationships
196
+ # ---------------------------------------------------------------------------
197
+
198
+
199
def _pair_linked_by(graph: nx.MultiDiGraph, node_a, node_b, edge_types) -> bool:
    """Return True if an edge whose type is in *edge_types* joins the two nodes (either direction)."""
    for src, dst in ((node_a, node_b), (node_b, node_a)):
        # For a multigraph this is {edge_key: attr_dict}, or None if no edge.
        parallel_edges = graph.get_edge_data(src, dst) or {}
        if any(attrs.get("edge_type") in edge_types for attrs in parallel_edges.values()):
            return True
    return False


def infer_relationships(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Run inference passes over the existing graph to add derived edges.

    Pass 1 — PGP key reuse:
        If a PGPKey node is directly connected (any edge direction) to 2+
        ThreatActor nodes, every pair of those actors gets a
        CONFIRMED_SAME_ACTOR edge (confidence=0.95) unless such an edge
        already joins that pair in either direction.

    Pass 2 — Handle similarity:
        If two ThreatActor nodes have the same metadata["handle"] value
        (case-insensitive, surrounding whitespace ignored) but different
        metadata["forum"] values, they get a LIKELY_SAME_ACTOR edge
        (confidence=0.6) unless a LIKELY_SAME_ACTOR or CONFIRMED_SAME_ACTOR
        edge already joins that pair in either direction.

    The duplicate check is per pair: an actor's edges to unrelated nodes do
    not suppress a new inference. Candidate actors are ordered by their
    string form so the direction of inferred edges is the same on every run.

    Returns the same graph object (inferred edges appended in place).
    """
    now = _now_utc()

    # --- Pass 1: PGP key reuse ---
    # Snapshot the node view; the loop body adds edges to the graph.
    for pgp_id, data in list(graph.nodes(data=True)):
        if data.get("node_type") != NODE_TYPES.PGP_KEY:
            continue

        # Collect all ThreatActor nodes directly adjacent (either direction)
        adjacent = set(graph.successors(pgp_id)) | set(graph.predecessors(pgp_id))
        actors = sorted(
            (
                n for n in adjacent
                if graph.nodes[n].get("node_type") == NODE_TYPES.THREAT_ACTOR
            ),
            key=str,
        )

        for actor_a, actor_b in itertools.combinations(actors, 2):
            if _pair_linked_by(
                graph, actor_a, actor_b, (EDGE_TYPES.CONFIRMED_SAME_ACTOR,)
            ):
                continue
            graph.add_edge(
                actor_a,
                actor_b,
                edge_type=EDGE_TYPES.CONFIRMED_SAME_ACTOR,
                confidence=0.95,
                source_url="",
                timestamp=now,
                metadata={"inferred_from_pgp": pgp_id},
            )

    # --- Pass 2: Handle similarity across forums ---
    # Group ThreatActor nodes by their normalised handle value
    handle_groups: dict[str, list[str]] = defaultdict(list)
    for nid, data in graph.nodes(data=True):
        if data.get("node_type") != NODE_TYPES.THREAT_ACTOR:
            continue
        handle = data.get("metadata", {}).get("handle", "")
        if handle:
            handle_groups[handle.lower().strip()].append(nid)

    same_actor_types = (
        EDGE_TYPES.LIKELY_SAME_ACTOR,
        EDGE_TYPES.CONFIRMED_SAME_ACTOR,
    )
    for _handle, node_ids in handle_groups.items():
        if len(node_ids) < 2:
            continue
        for nid_a, nid_b in itertools.combinations(node_ids, 2):
            forum_a = graph.nodes[nid_a].get("metadata", {}).get("forum", "")
            forum_b = graph.nodes[nid_b].get("metadata", {}).get("forum", "")
            # Only infer when forums differ (same forum + same handle = same node
            # by construction, but guard anyway)
            if forum_a == forum_b:
                continue
            if _pair_linked_by(graph, nid_a, nid_b, same_actor_types):
                continue
            graph.add_edge(
                nid_a,
                nid_b,
                edge_type=EDGE_TYPES.LIKELY_SAME_ACTOR,
                confidence=0.6,
                source_url="",
                timestamp=now,
                metadata={"inferred_from_handle": _handle},
            )

    return graph
294
+
295
+
296
def _link_cross_page_entities(
    G: nx.MultiDiGraph,
    entities: list,
    investigation_id: Optional[uuid.UUID]
) -> int:
    """
    Second pass: link entities that appear on different pages.

    What the pass does:
      1. Index every entity by the page it was found on (entities without a
         page get a per-entity ``unknown_<id>`` placeholder URL).
      2. For each node seen on 2+ pages, raise its ``size`` attribute (capped
         at 40) and record ``page_count``.
      3. For each pair of pages that share a node, take up to three nodes
         unique to each page and connect them with a CO_INVESTIGATION edge
         (confidence 0.3, ``via`` = one of the shared nodes) when both nodes
         exist in *G* and no edge already runs in that direction.
      4. Connect every pair of ThreatActor nodes that each appear on 2+ pages
         with a CO_INVESTIGATION edge (confidence 0.4). An actor qualifies
         when its confidence is >= 0.85; a missing or ``None`` root
         confidence is treated as 0.85.

    *investigation_id* is accepted for signature compatibility and is not
    used by this function.

    Returns: count of new edges added
    """
    if not entities:
        return 0

    edges_added = 0

    # Step 1: Build page -> node ids and node id -> pages indexes
    page_entity_map: dict[str, list] = defaultdict(list)
    entity_page_map: dict[str, set] = defaultdict(set)

    for ent in entities:
        page_url = ent.page.url if ent.page else f"unknown_{ent.id}"
        node_id = _make_node_id(ent.entity_type, ent.value, page_url)
        page_entity_map[page_url].append(node_id)
        entity_page_map[node_id].add(page_url)

    # Step 2: Boost multi-page entities (appear on 2+ pages = high significance)
    for node_id, pages in entity_page_map.items():
        if len(pages) > 1 and G.has_node(node_id):
            current_size = G.nodes[node_id].get("size", 10)
            # Each page appearance adds 5 to node size (up to 40 max)
            boost = min(len(pages) * 5, 40)
            G.nodes[node_id]["size"] = min(current_size + boost, 40)
            G.nodes[node_id]["page_count"] = len(pages)
            logger.debug("Boosted node %s: appears on %s pages", node_id, len(pages))

    # Step 3: Link entities from different pages via shared co-occurrence.
    # Only page pairs that share at least one node are visited, by walking
    # the pages of each multi-page node.
    page_node_sets = {url: set(ids) for url, ids in page_entity_map.items()}

    for page_set in entity_page_map.values():
        if len(page_set) < 2:
            continue
        for page_a, page_b in itertools.combinations(list(page_set), 2):
            nodes_a = page_node_sets[page_a]
            nodes_b = page_node_sets[page_b]
            shared_entities = nodes_a & nodes_b
            if not shared_entities:
                continue

            # One bridge node is enough: once an edge exists between a pair,
            # further shared nodes could not add another.
            bridge_node = next(iter(shared_entities))
            # Cap fan-out at 3 x 3 candidate pairs per page pair.
            candidates_a = list(nodes_a - nodes_b)[:3]
            candidates_b = list(nodes_b - nodes_a)[:3]

            for entity_a in candidates_a:
                if not G.has_node(entity_a):
                    continue
                for entity_b in candidates_b:
                    if not G.has_node(entity_b) or G.has_edge(entity_a, entity_b):
                        continue
                    G.add_edge(
                        entity_a, entity_b,
                        edge_type=EDGE_TYPES.CO_INVESTIGATION,
                        confidence=0.3,
                        via=bridge_node,
                        label="co-investigation",
                        timestamp=_now_utc()
                    )
                    edges_added += 1

    # Step 4: Direct cross-page linking for high-confidence threat actors.
    actor_nodes = []
    for node, data in G.nodes(data=True):
        if data.get("node_type") != NODE_TYPES.THREAT_ACTOR:
            continue
        # Confidence might be in metadata or root; check both. None is
        # handled explicitly so a null confidence cannot raise TypeError.
        meta_conf = (data.get("metadata") or {}).get("confidence")
        root_conf = data.get("confidence")
        if root_conf is None:
            root_conf = 0.85  # default 0.85 for TA
        if (meta_conf is not None and meta_conf >= 0.85) or root_conf >= 0.85:
            actor_nodes.append(node)

    # Link threat actors that appear across 2+ pages (they're ecosystem-level nodes)
    multi_page_actors = [
        node for node in actor_nodes
        if len(entity_page_map.get(node, ())) >= 2
    ]

    for actor_a, actor_b in itertools.combinations(multi_page_actors, 2):
        if not G.has_edge(actor_a, actor_b):
            G.add_edge(
                actor_a, actor_b,
                edge_type=EDGE_TYPES.CO_INVESTIGATION,
                confidence=0.4,
                label="co-investigation",
                timestamp=_now_utc()
            )
            edges_added += 1

    return edges_added
416
+
417
+
418
+
419
+ # ---------------------------------------------------------------------------
420
+ # Public: build_graph_from_db
421
+ # ---------------------------------------------------------------------------
422
+
423
+
424
def build_graph_from_db(
    investigation_id: Optional[uuid.UUID] = None,
    since: Optional[datetime] = None,
) -> nx.MultiDiGraph:
    """
    Build a fresh graph by loading entity records from the database.

    Filters:
        investigation_id — if given, only load entities linked to that
                           investigation through InvestigationEntityLink.
        since            — if given, only load entities where first_seen >= since.

    Passes, in order:
        1. One node per mapped entity; entity types missing from
           ``_ENTITY_TYPE_TO_NODE_TYPE`` are counted and skipped.
        2. CO_APPEARED_ON edges between all pairs of entities that share a
           page (pairs that resolve to the same node are skipped so no
           self-loops are created).
        3. Cross-page linking via ``_link_cross_page_entities``.
        4. Persisted EntityRelationship rows are added as edges keyed
           ``persisted_<id>``; entities they reference that were not loaded
           in pass 1 are fetched and added as nodes.

    If DATABASE_URL is not set, returns an empty graph without raising.
    Never raises on DB errors (logs a warning and returns the partial graph).
    """
    graph: nx.MultiDiGraph = nx.MultiDiGraph()

    if not os.getenv("DATABASE_URL"):
        return graph

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, EntityRelationship  # noqa: PLC0415
        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        with get_session() as session:
            from db.models import InvestigationEntityLink  # noqa: PLC0415

            if investigation_id is not None:
                query = (
                    session.query(Entity)
                    .join(
                        InvestigationEntityLink,
                        InvestigationEntityLink.entity_id == Entity.id,
                    )
                    .filter(InvestigationEntityLink.investigation_id == investigation_id)
                    .options(joinedload(Entity.page))
                )
            else:
                query = session.query(Entity).options(joinedload(Entity.page))

            if since is not None:
                query = query.filter(Entity.first_seen >= since)

            # Diagnostic counts only; they do not influence graph contents.
            matching_entities_count = query.count()

            total_investigation = (
                session.query(Entity)
                .filter(Entity.investigation_id == investigation_id)
                .count()
                if investigation_id is not None
                else session.query(Entity).count()
            )
            null_inv_count = (
                session.query(Entity)
                .filter(Entity.investigation_id.is_(None))
                .count()
            )

            logger.warning(
                "build_graph_from_db: investigation_id=%s entity_rows_loaded=%s "
                "count_matching_investigation_filter=%s global_entities_with_null_investigation_id=%s",
                investigation_id,
                matching_entities_count,
                total_investigation,
                null_inv_count,
            )

            # Entity rows must be processed while the session is open: after close,
            # lazy loads on ent.page raise (joinedload data is expired).
            skipped_unmapped = 0
            page_entity_map: dict[str, list[Entity]] = defaultdict(list)
            all_entities: list[Entity] = []

            # --- Pass 1: nodes ---
            for ent in query.yield_per(2000):
                all_entities.append(ent)
                page_url = ent.page.url if ent.page else ""
                node_type = _ENTITY_TYPE_TO_NODE_TYPE.get(ent.entity_type)
                if node_type is None:
                    skipped_unmapped += 1
                    continue  # skip unmapped types

                node_id = _make_node_id(ent.entity_type, ent.value, page_url)

                if graph.has_node(node_id):
                    data = graph.nodes[node_id]
                    if ent.last_seen and (
                        not data.get("last_seen") or ent.last_seen > data["last_seen"]
                    ):
                        data["last_seen"] = ent.last_seen
                    if page_url and page_url not in data["source_urls"]:
                        data["source_urls"] = data["source_urls"] + [page_url]
                else:
                    meta: dict = {}
                    if node_type == NODE_TYPES.THREAT_ACTOR:
                        meta["handle"] = ent.value
                        domain = _extract_domain(page_url)
                        if domain:
                            meta["forum"] = domain
                    graph.add_node(
                        node_id,
                        node_type=node_type,
                        first_seen=ent.first_seen or _now_utc(),
                        last_seen=ent.last_seen or _now_utc(),
                        source_urls=[page_url] if page_url else [],
                        metadata=meta,
                        confidence=ent.confidence,
                    )

                if ent.page_id:
                    page_entity_map[str(ent.page_id)].append(ent)

            # --- Pass 2: CO_APPEARED_ON edges within each page ---
            intra_page_edges = 0
            for page_entities in page_entity_map.values():
                if len(page_entities) < 2:
                    continue
                for ent_a, ent_b in itertools.combinations(page_entities, 2):
                    # Both entities share a page, so one URL serves for both ids.
                    page_url_a = ent_a.page.url if ent_a.page else ""
                    node_id_a = _make_node_id(ent_a.entity_type, ent_a.value, page_url_a)
                    node_id_b = _make_node_id(ent_b.entity_type, ent_b.value, page_url_a)

                    # Two rows can resolve to one node (same value under two
                    # entity types); a co-appearance self-loop carries no information.
                    if node_id_a == node_id_b:
                        continue

                    if not graph.has_node(node_id_a) or not graph.has_node(node_id_b):
                        continue

                    graph.add_edge(
                        node_id_a,
                        node_id_b,
                        edge_type=EDGE_TYPES.CO_APPEARED_ON,
                        confidence=1.0,
                        source_url=page_url_a,
                        timestamp=_now_utc(),
                        metadata={},
                    )
                    intra_page_edges += 1

            # --- Pass 3: cross-page entity linking ---
            cross_page_edges = _link_cross_page_entities(graph, all_entities, investigation_id)

            # --- Pass 4: Load persistent relationships from DB ---
            # Failures here are logged and leave the graph built so far intact.
            persisted_edges = 0
            try:
                # Get all entities in the graph so we can filter relationships
                graph_entity_ids = [ent.id for ent in all_entities]
                if graph_entity_ids:
                    if investigation_id is not None:
                        relationships = (
                            session.query(EntityRelationship)
                            .filter(
                                EntityRelationship.investigation_id == investigation_id
                            )
                            .yield_per(500)
                            .all()
                        )
                    else:
                        relationships = session.query(EntityRelationship).filter(
                            (EntityRelationship.entity_a_id.in_(graph_entity_ids)) |
                            (EntityRelationship.entity_b_id.in_(graph_entity_ids))
                        ).all()

                    # Map entity_id -> node_id; the first mapping seen wins.
                    # NOTE(review): all_entities also holds entities with
                    # unmapped types, which have no node; an edge touching one
                    # makes networkx create an attribute-less node — confirm
                    # that is intended.
                    entity_to_node = {}
                    for ent in all_entities:
                        page_url = ent.page.url if ent.page else ""
                        node_id = _make_node_id(ent.entity_type, ent.value, page_url)
                        if str(ent.id) not in entity_to_node:
                            entity_to_node[str(ent.id)] = node_id

                    # Relationship endpoints that were not loaded in pass 1.
                    all_missing_ids = set()
                    for rel in relationships:
                        src = str(rel.entity_a_id)
                        tgt = str(rel.entity_b_id)
                        if src not in entity_to_node:
                            all_missing_ids.add(rel.entity_a_id)
                        if tgt not in entity_to_node:
                            all_missing_ids.add(rel.entity_b_id)

                    if all_missing_ids:
                        # Eager-load pages in the same query: me.page is read
                        # for every row below.
                        missing_entities = (
                            session.query(Entity)
                            .options(joinedload(Entity.page))
                            .filter(Entity.id.in_(all_missing_ids))
                            .all()
                        )
                        for me in missing_entities:
                            me_page_url = me.page.url if me.page else ""
                            me_node_id = _make_node_id(me.entity_type, me.value, me_page_url)
                            entity_to_node[str(me.id)] = me_node_id
                            if not graph.has_node(me_node_id):
                                graph.add_node(
                                    me_node_id,
                                    node_type=_ENTITY_TYPE_TO_NODE_TYPE.get(me.entity_type, ""),
                                    first_seen=me.first_seen or _now_utc(),
                                    last_seen=me.last_seen or _now_utc(),
                                    source_urls=[me_page_url] if me_page_url else [],
                                    metadata={}
                                )

                    for rel in relationships:
                        source_node = entity_to_node.get(str(rel.entity_a_id))
                        target_node = entity_to_node.get(str(rel.entity_b_id))

                        if source_node and target_node:
                            # The explicit key keeps one edge per persisted row.
                            edge_key = f"persisted_{rel.id}"
                            if not graph.has_edge(source_node, target_node, key=edge_key):
                                graph.add_edge(
                                    source_node,
                                    target_node,
                                    key=edge_key,
                                    edge_type=rel.relationship_type,
                                    confidence=rel.confidence,
                                    source_url="",
                                    timestamp=rel.first_seen or _now_utc(),
                                    metadata={}
                                )
                                persisted_edges += 1
            except Exception as e:
                logger.warning("Failed to load persistent relationships: %s", e)

            logger.warning(
                "build_graph_from_db: investigation_id=%s "
                "nodes=%s "
                "intra_page_edges=%s "
                "cross_page_edges=%s "
                "persisted_edges=%s "
                "total_edges=%s "
                "skipped_unmapped_entity_types=%s",
                investigation_id,
                len(graph.nodes()),
                intra_page_edges,
                cross_page_edges,
                persisted_edges,
                len(graph.edges()),
                skipped_unmapped,
            )

    except Exception as exc:
        logger.warning("build_graph_from_db failed: %s", exc)

    return graph
669
+
670
+
671
+ def persist_graph_edges(
672
+ G: nx.MultiDiGraph,
673
+ investigation_id: uuid.UUID,
674
+ session,
675
+ ) -> dict:
676
+ """
677
+ Write all edges from the NetworkX graph to entity_relationships table.
678
+
679
+ Called once after build_graph_from_db() completes.
680
+ Uses upsert logic — safe to call multiple times.
681
+
682
+ Edge cap rules:
683
+ - If edge count > 50,000: skip all edges, return {"status": "skipped_overflow", "edges_written": 0}
684
+ - If edge count between 10,000 and 50,000: prune edges where BOTH entities have confidence < 0.85
685
+ - Otherwise: write all edges
686
+
687
+ Returns: dict with keys:
688
+ - status: "written" | "skipped_overflow" | "pruned"
689
+ - edges_written: int
690
+ - original_count: int (for pruned status)
691
+ """
692
+ from db.models import Entity, EntityRelationship
693
+
694
+ from sqlalchemy.orm import joinedload # noqa: PLC0415
695
+
696
+ entity_to_node = {}
697
+ node_to_entity = {}
698
+ entity_confidence: dict[uuid.UUID, float] = {}
699
+
700
+ from db.models import InvestigationEntityLink # noqa: PLC0415
701
+ linked_ids_subq = (
702
+ session.query(InvestigationEntityLink.entity_id)
703
+ .filter(InvestigationEntityLink.investigation_id == investigation_id)
704
+ .subquery()
705
+ )
706
+ entities = (
707
+ session.query(Entity)
708
+ .options(joinedload(Entity.page))
709
+ .filter(
710
+ (Entity.investigation_id == investigation_id)
711
+ | Entity.id.in_(linked_ids_subq)
712
+ )
713
+ .yield_per(2000)
714
+ )
715
+ for ent in entities:
716
+ page_url = ent.page.url if ent.page else ""
717
+ node_id = _make_node_id(ent.entity_type, ent.value, page_url)
718
+ entity_to_node[str(ent.id)] = node_id
719
+ node_to_entity[node_id] = ent.id
720
+ entity_confidence[ent.id] = ent.confidence
721
+
722
+ edges_to_insert: list[dict] = []
723
+ edges_to_update: list[tuple[uuid.UUID, uuid.UUID, str, float]] = []
724
+ edge_keys: set[tuple] = set()
725
+
726
+ potential_edges: list[tuple] = []
727
+ for source_node, target_node, edge_data in G.edges(data=True):
728
+ source_entity_id = node_to_entity.get(source_node)
729
+ target_entity_id = node_to_entity.get(target_node)
730
+
731
+ if not source_entity_id or not target_entity_id:
732
+ continue
733
+
734
+ if source_entity_id == target_entity_id:
735
+ continue
736
+
737
+ relationship_type = edge_data.get("edge_type", "CO_APPEARED_ON")
738
+ confidence = float(edge_data.get("confidence", 0.5))
739
+ key = (source_entity_id, target_entity_id, relationship_type)
740
+ if key not in edge_keys:
741
+ edge_keys.add(key)
742
+ potential_edges.append((source_entity_id, target_entity_id, relationship_type, confidence))
743
+
744
+ if not potential_edges:
745
+ return {"status": "written", "edges_written": 0, "original_count": 0}
746
+
747
+ edge_count = len(potential_edges)
748
+
749
+ # Edge explosion check: > 50,000 edges
750
+ if edge_count > 50000:
751
+ logger.error(
752
+ f"Edge explosion detected: {edge_count} edges for investigation {investigation_id}. "
753
+ f"Graph construction skipped. Reduce entity count first."
754
+ )
755
+ return {"status": "skipped_overflow", "edges_written": 0, "original_count": edge_count}
756
+
757
+ # Edge pruning: between 10,000 and 50,000 - keep only edges where BOTH entities have confidence >= 0.85
758
+ pruned_count = 0
759
+ if edge_count > 10000:
760
+ pruned_edges = []
761
+ for source_eid, target_eid, rel_type, conf in potential_edges:
762
+ src_conf = entity_confidence.get(source_eid, 0)
763
+ tgt_conf = entity_confidence.get(target_eid, 0)
764
+ if src_conf >= 0.85 and tgt_conf >= 0.85:
765
+ pruned_edges.append((source_eid, target_eid, rel_type, conf))
766
+ else:
767
+ pruned_count += 1
768
+ potential_edges = pruned_edges
769
+ if pruned_count:
770
+ logger.warning(
771
+ f"Edge pruning applied: {pruned_count}/{edge_count} edges removed "
772
+ f"(both entity confidences must be >= 0.85). "
773
+ f"Remaining: {len(potential_edges)} edges."
774
+ )
775
+
776
+ if not potential_edges:
777
+ return {"status": "pruned", "edges_written": 0, "original_count": edge_count}
778
+
779
+ source_ids = list({e[0] for e in potential_edges})
780
+ target_ids = list({e[1] for e in potential_edges})
781
+ all_entity_ids = list(set(source_ids + target_ids))
782
+ rel_types = list({e[2] for e in potential_edges})
783
+
784
+ existing_rels = (
785
+ session.query(EntityRelationship)
786
+ .filter(
787
+ sa.or_(
788
+ EntityRelationship.entity_a_id.in_(all_entity_ids),
789
+ EntityRelationship.entity_b_id.in_(all_entity_ids),
790
+ ),
791
+ EntityRelationship.relationship_type.in_(rel_types),
792
+ )
793
+ .all()
794
+ )
795
+
796
+ existing_edge_set: set[tuple] = set()
797
+ existing_confidence_map: dict[tuple, float] = {}
798
+ for rel in existing_rels:
799
+ key = (rel.entity_a_id, rel.entity_b_id, rel.relationship_type)
800
+ existing_edge_set.add(key)
801
+ existing_confidence_map[key] = rel.confidence
802
+
803
+ edges_written = 0
804
+ for source_entity_id, target_entity_id, relationship_type, confidence in potential_edges:
805
+ key = (source_entity_id, target_entity_id, relationship_type)
806
+ if key in existing_edge_set:
807
+ existing_conf = existing_confidence_map.get(key, 0)
808
+ if confidence > existing_conf:
809
+ edges_to_update.append((source_entity_id, target_entity_id, relationship_type, confidence))
810
+ continue
811
+
812
+ rel = EntityRelationship(
813
+ entity_a_id=source_entity_id,
814
+ entity_b_id=target_entity_id,
815
+ relationship_type=relationship_type,
816
+ confidence=confidence,
817
+ source_page_id=None,
818
+ investigation_id=investigation_id,
819
+ )
820
+ session.add(rel)
821
+ edges_written += 1
822
+
823
+ session.commit()
824
+
825
+ status = "pruned" if pruned_count > 0 else "written"
826
+ logger.warning(
827
+ f"persist_graph_edges: investigation={investigation_id} "
828
+ f"status={status} edges_written={edges_written}, edges_skipped={len(potential_edges) - edges_written}"
829
+ )
830
+ return {"status": status, "edges_written": edges_written, "original_count": edge_count}
831
+
832
+
833
def build_graph_from_db_cached(investigation_id: uuid.UUID) -> nx.MultiDiGraph:
    """
    Build a NetworkX graph from persisted entity_relationships rows.

    Faster than a full recompute — reads pre-computed edges from the DB
    instead of deriving them again.

    Nodes are created for every entity that belongs to the investigation,
    either directly (``Entity.investigation_id``) or through an
    ``InvestigationEntityLink`` row. This is the same selection the
    edge-persisting code in this module uses, so an edge written for a
    directly-owned entity is not dropped on reload for lack of a node.

    Args:
        investigation_id: Investigation whose entities and persisted
            relationships should be loaded.

    Returns:
        A ``MultiDiGraph`` with one node per distinct node id and one edge
        per persisted relationship whose two endpoints were both loaded as
        nodes. Edge ``source_url`` is set to ``""`` and ``metadata`` to ``{}``.
    """
    from db.models import (  # noqa: PLC0415
        Entity,
        EntityRelationship,
        InvestigationEntityLink,
    )
    from db.session import get_session  # noqa: PLC0415
    from sqlalchemy.orm import joinedload  # noqa: PLC0415

    G: nx.MultiDiGraph = nx.MultiDiGraph()

    with get_session() as session:
        # Entities attached to this investigation through the link table.
        linked_ids_subq = (
            session.query(InvestigationEntityLink.entity_id)
            .filter(InvestigationEntityLink.investigation_id == investigation_id)
            .subquery()
        )
        # Directly-owned OR linked entities; filtering with IN (rather than
        # joining the link table) also avoids one row per link for an entity.
        entities = (
            session.query(Entity)
            .options(joinedload(Entity.page))
            .filter(
                (Entity.investigation_id == investigation_id)
                | Entity.id.in_(linked_ids_subq)
            )
            .yield_per(500)
        )

        # Maps str(entity id) -> node id so relationship rows can be resolved.
        entity_to_node: dict[str, str] = {}
        for ent in entities:
            page_url = ent.page.url if ent.page else ""
            node_id = _make_node_id(ent.entity_type, ent.value, page_url)
            # Several entities may resolve to the same node id; add_node then
            # just refreshes that node's attributes.
            G.add_node(
                node_id,
                node_type=_ENTITY_TYPE_TO_NODE_TYPE.get(ent.entity_type, ""),
                first_seen=ent.first_seen or _now_utc(),
                last_seen=ent.last_seen or _now_utc(),
                source_urls=[page_url] if page_url else [],
                metadata={},
                confidence=ent.confidence,
            )
            entity_to_node[str(ent.id)] = node_id

        relationships = (
            session.query(EntityRelationship)
            .filter(EntityRelationship.investigation_id == investigation_id)
            .yield_per(2000)
        )

        for rel in relationships:
            source_node = entity_to_node.get(str(rel.entity_a_id))
            target_node = entity_to_node.get(str(rel.entity_b_id))

            # Skip edges with an endpoint that was not loaded above. Every
            # value in entity_to_node was added to G, so no has_node check
            # is needed.
            if not source_node or not target_node:
                continue

            G.add_edge(
                source_node,
                target_node,
                edge_type=rel.relationship_type,
                confidence=rel.confidence,
                source_url="",
                timestamp=rel.first_seen or _now_utc(),
                metadata={},
            )

    return G
894
+