voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/builder.py
ADDED
|
@@ -0,0 +1,894 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph/builder.py — Builds and updates the NetworkX relationship graph from DB entities.
|
|
3
|
+
|
|
4
|
+
The graph is a NetworkX MultiDiGraph (directed, allows multiple edges between the
|
|
5
|
+
same node pair). All public functions accept and return nx.MultiDiGraph so callers
|
|
6
|
+
can chain operations.
|
|
7
|
+
|
|
8
|
+
Public interface
|
|
9
|
+
----------------
|
|
10
|
+
build_graph_from_db(investigation_id, since) → nx.MultiDiGraph
|
|
11
|
+
add_entity_to_graph(graph, entity) → nx.MultiDiGraph
|
|
12
|
+
add_relationship(graph, source_id, target_id, ...) → nx.MultiDiGraph
|
|
13
|
+
infer_relationships(graph) → nx.MultiDiGraph
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import itertools
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from typing import Optional
|
|
24
|
+
import uuid
|
|
25
|
+
from urllib.parse import urlparse
|
|
26
|
+
|
|
27
|
+
import networkx as nx
|
|
28
|
+
import sqlalchemy as sa
|
|
29
|
+
|
|
30
|
+
from extractor.normalizer import NormalizedEntity
|
|
31
|
+
from graph.model import EDGE_TYPES, NODE_TYPES
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Mapping: extractor entity_type → graph node_type
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
# Lookup table: extractor entity_type label -> graph node_type value.
# Entity types that are absent from this table are skipped by the graph
# builders below (they have no graph representation).
_ENTITY_TYPE_TO_NODE_TYPE: dict[str, str] = dict(
    # Actors and their identifiers
    THREAT_ACTOR_HANDLE=NODE_TYPES.THREAT_ACTOR,
    # All supported cryptocurrencies collapse into one wallet node type
    BITCOIN_ADDRESS=NODE_TYPES.CRYPTO_WALLET,
    ETHEREUM_ADDRESS=NODE_TYPES.CRYPTO_WALLET,
    MONERO_ADDRESS=NODE_TYPES.CRYPTO_WALLET,
    ONION_URL=NODE_TYPES.ONION_URL,
    EMAIL_ADDRESS=NODE_TYPES.EMAIL_ADDRESS,
    PGP_KEY_BLOCK=NODE_TYPES.PGP_KEY,
    # NOTE(review): "CVE_NUMBER" and "CVE" map to different node types
    # (NODE_TYPES.CVE vs the literal "vulnerability") — confirm this is intended.
    CVE_NUMBER=NODE_TYPES.CVE,
    CVE="vulnerability",
    PASTE_URL=NODE_TYPES.PASTE,
    MALWARE_FAMILY=NODE_TYPES.MALWARE_FAMILY,
    RANSOMWARE_GROUP=NODE_TYPES.RANSOMWARE_GROUP,
    IP_ADDRESS=NODE_TYPES.IP_ADDRESS,
    PHONE_NUMBER=NODE_TYPES.PHONE_NUMBER,
    ORGANIZATION_NAME=NODE_TYPES.ORGANIZATION,
    DATE=NODE_TYPES.DATE,
    # Every hash flavour shares the same literal node type
    FILE_HASH_MD5="file_hash",
    FILE_HASH_SHA1="file_hash",
    FILE_HASH_SHA256="file_hash",
    MITRE_TECHNIQUE="technique",
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Internal helpers
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _extract_domain(url: str) -> str:
|
|
69
|
+
"""Extract the netloc (hostname) from a URL, or empty string on failure."""
|
|
70
|
+
try:
|
|
71
|
+
parsed = urlparse(url)
|
|
72
|
+
return parsed.netloc or ""
|
|
73
|
+
except Exception:
|
|
74
|
+
return ""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _make_node_id(entity_type: str, value: str, source_url: str) -> str:
|
|
78
|
+
"""
|
|
79
|
+
Derive a stable node_id for an entity.
|
|
80
|
+
|
|
81
|
+
ThreatActor handles are disambiguated by forum domain so that the same
|
|
82
|
+
handle on two different forums produces two distinct nodes (enabling the
|
|
83
|
+
LIKELY_SAME_ACTOR inference pass). All other entity types are globally
|
|
84
|
+
unique by canonical value.
|
|
85
|
+
"""
|
|
86
|
+
if entity_type == "THREAT_ACTOR_HANDLE" and source_url:
|
|
87
|
+
domain = _extract_domain(source_url)
|
|
88
|
+
if domain:
|
|
89
|
+
return f"{value}@{domain}"
|
|
90
|
+
return value
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _now_utc() -> datetime:
|
|
94
|
+
return datetime.now(timezone.utc)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Public: add_entity_to_graph
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def add_entity_to_graph(
    graph: nx.MultiDiGraph,
    entity: NormalizedEntity,
) -> nx.MultiDiGraph:
    """
    Upsert a single NormalizedEntity as a node in *graph*.

    - Entity types without an entry in _ENTITY_TYPE_TO_NODE_TYPE are ignored
      and the graph is returned untouched.
    - If the node does not exist: create it with node_type, first_seen,
      last_seen, source_urls and metadata derived from *entity*.
    - If the node already exists:
        * last_seen is set to the current UTC time (the entity was just
          re-observed)
        * entity.source_url is appended to source_urls if not already present
        * a stub node (empty or missing node_type, e.g. one auto-created by
          add_relationship) is completed with the real node_type and, when it
          has no metadata yet, the entity's metadata

    Returns the modified graph (the same object, mutated in place).
    """
    node_type = _ENTITY_TYPE_TO_NODE_TYPE.get(entity.entity_type)
    if node_type is None:
        return graph  # entity type has no graph representation

    node_id = _make_node_id(entity.entity_type, entity.value, entity.source_url)
    now = _now_utc()

    def _build_metadata() -> dict:
        # Threat actors carry their handle and forum domain so that
        # infer_relationships can match handles across forums.
        metadata: dict = {}
        if node_type == NODE_TYPES.THREAT_ACTOR:
            metadata["handle"] = entity.value
            domain = _extract_domain(entity.source_url)
            if domain:
                metadata["forum"] = domain
        return metadata

    if not graph.has_node(node_id):
        graph.add_node(
            node_id,
            node_type=node_type,
            first_seen=now,
            last_seen=now,
            source_urls=[entity.source_url] if entity.source_url else [],
            metadata=_build_metadata(),
        )
        return graph

    data = graph.nodes[node_id]
    data["last_seen"] = now

    # Complete stubs: a node created only as an edge endpoint has no type yet.
    if not data.get("node_type"):
        data["node_type"] = node_type
        if not data.get("metadata"):
            data["metadata"] = _build_metadata()

    # Nodes auto-created by networkx (via add_edge) may lack "source_urls",
    # so read it defensively instead of indexing.
    if entity.source_url:
        known_urls = data.get("source_urls") or []
        if entity.source_url not in known_urls:
            data["source_urls"] = known_urls + [entity.source_url]

    return graph
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
# Public: add_relationship
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def add_relationship(
    graph: nx.MultiDiGraph,
    source_id: str,
    target_id: str,
    edge_type: str,
    confidence: float,
    source_url: str,
    metadata: Optional[dict] = None,
) -> nx.MultiDiGraph:
    """
    Insert a directed edge *source_id* -> *target_id* into *graph*.

    Any endpoint that is not yet a node is first added as a placeholder
    (empty node_type, no source URLs, empty metadata, confidence 0.0) so the
    graph never holds an edge whose endpoints lack the standard attributes.
    The edge records edge_type, confidence, source_url, the current UTC
    timestamp and *metadata* (an empty dict when none is supplied).

    Returns the modified graph.
    """
    stamp = _now_utc()

    for endpoint in (source_id, target_id):
        if graph.has_node(endpoint):
            continue
        # Fresh containers per placeholder so stubs never share mutable state.
        placeholder = {
            "node_type": "",
            "first_seen": stamp,
            "last_seen": stamp,
            "source_urls": [],
            "metadata": {},
            "confidence": 0.0,
        }
        graph.add_node(endpoint, **placeholder)

    edge_attrs = {
        "edge_type": edge_type,
        "confidence": confidence,
        "source_url": source_url,
        "timestamp": stamp,
        "metadata": metadata if metadata else {},
    }
    graph.add_edge(source_id, target_id, **edge_attrs)
    return graph
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ---------------------------------------------------------------------------
|
|
195
|
+
# Public: infer_relationships
|
|
196
|
+
# ---------------------------------------------------------------------------
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _edge_types_between(graph: nx.MultiDiGraph, node_a, node_b) -> set:
    """Return the edge_type of every edge joining the two nodes, in either direction."""
    found: set = set()
    for src, dst in ((node_a, node_b), (node_b, node_a)):
        # get_edge_data returns {edge_key: attr_dict} or None when no edge exists.
        parallel_edges = graph.get_edge_data(src, dst) or {}
        found.update(attrs.get("edge_type") for attrs in parallel_edges.values())
    return found


def infer_relationships(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Run inference passes over the existing graph to add derived edges.

    Pass 1 — PGP key reuse:
        If a PGPKey node is directly connected (any edge direction) to 2+
        ThreatActor nodes → those actors likely share an identity → add a
        CONFIRMED_SAME_ACTOR edge (confidence=0.95) between every pair that
        is not already joined by one.

    Pass 2 — Handle similarity:
        If two ThreatActor nodes have the same metadata["handle"] value
        (case-insensitive, surrounding whitespace ignored) but originate from
        different forums (metadata["forum"] differs) → add LIKELY_SAME_ACTOR
        (confidence=0.6), unless the pair is already joined by a
        LIKELY_SAME_ACTOR or CONFIRMED_SAME_ACTOR edge.

    The "already joined" checks look only at edges between the two actors
    concerned; an actor's edges to third parties never suppress a new link.
    Actors are paired in sorted order so edge direction is reproducible.

    Returns the modified graph (inferred edges appended in-place).
    """
    now = _now_utc()

    # --- Pass 1: PGP key reuse ---
    # Snapshot the node view: edges are added while we iterate.
    for pgp_id, data in list(graph.nodes(data=True)):
        if data.get("node_type") != NODE_TYPES.PGP_KEY:
            continue

        # Collect all ThreatActor nodes directly adjacent (either direction)
        adjacent = set(graph.successors(pgp_id)) | set(graph.predecessors(pgp_id))
        actors = sorted(
            (
                n for n in adjacent
                if graph.nodes[n].get("node_type") == NODE_TYPES.THREAT_ACTOR
            ),
            key=str,
        )

        for actor_a, actor_b in itertools.combinations(actors, 2):
            if EDGE_TYPES.CONFIRMED_SAME_ACTOR in _edge_types_between(
                graph, actor_a, actor_b
            ):
                continue
            graph.add_edge(
                actor_a,
                actor_b,
                edge_type=EDGE_TYPES.CONFIRMED_SAME_ACTOR,
                confidence=0.95,
                source_url="",
                timestamp=now,
                metadata={"inferred_from_pgp": pgp_id},
            )

    # --- Pass 2: Handle similarity across forums ---
    # Group ThreatActor nodes by their normalised handle value
    handle_groups: dict[str, list[str]] = defaultdict(list)
    for nid, data in graph.nodes(data=True):
        if data.get("node_type") != NODE_TYPES.THREAT_ACTOR:
            continue
        handle = (data.get("metadata") or {}).get("handle", "")
        if handle:
            handle_groups[handle.lower().strip()].append(nid)

    for _handle, node_ids in handle_groups.items():
        if len(node_ids) < 2:
            continue
        for nid_a, nid_b in itertools.combinations(node_ids, 2):
            forum_a = (graph.nodes[nid_a].get("metadata") or {}).get("forum", "")
            forum_b = (graph.nodes[nid_b].get("metadata") or {}).get("forum", "")
            # Only infer when forums differ (same forum + same handle = same node
            # by construction, but guard anyway)
            if forum_a == forum_b:
                continue
            # Skip if this pair is already connected by a same-actor edge
            pair_types = _edge_types_between(graph, nid_a, nid_b)
            if (
                EDGE_TYPES.LIKELY_SAME_ACTOR in pair_types
                or EDGE_TYPES.CONFIRMED_SAME_ACTOR in pair_types
            ):
                continue
            graph.add_edge(
                nid_a,
                nid_b,
                edge_type=EDGE_TYPES.LIKELY_SAME_ACTOR,
                confidence=0.6,
                source_url="",
                timestamp=now,
                metadata={"inferred_from_handle": _handle},
            )

    return graph
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _link_cross_page_entities(
    G: nx.MultiDiGraph,
    entities: list,
    investigation_id: Optional[uuid.UUID]
) -> int:
    """
    Second pass: link entities from different pages that share investigation context.

    Strategy:
    1. Index which node ids appear on which page. Entities without a page
       are placed on a synthetic "unknown_<entity id>" page of their own.
    2. Boost nodes seen on 2+ pages: "size" grows by 5 per page (capped at
       40) and "page_count" records the number of pages.
    3. For every pair of pages that share at least one node, connect up to
       3 nodes found only on the first page with up to 3 nodes found only on
       the second via CO_INVESTIGATION edges (confidence=0.3). The edge's
       "via" attribute names one shared (bridge) node.
    4. Connect every pair of high-confidence ThreatActor nodes that each
       appear on 2+ pages with a CO_INVESTIGATION edge (confidence=0.4).

    Pages, candidate nodes and the bridge node are visited in sorted order
    and each page pair is processed once, so the result is reproducible
    from run to run. *investigation_id* is accepted for interface
    compatibility and is not used.

    Returns: count of new edges added
    """
    if not entities:
        return 0

    edges_added = 0

    # Step 1: Build page → node ids and node id → pages indexes
    page_nodes: dict[str, set] = defaultdict(set)
    node_pages: dict[str, set] = defaultdict(set)

    for ent in entities:
        page_url = ent.page.url if ent.page else f"unknown_{ent.id}"
        node_id = _make_node_id(ent.entity_type, ent.value, page_url)
        page_nodes[page_url].add(node_id)
        node_pages[node_id].add(page_url)

    # Step 2: Boost multi-page entities (appear on 2+ pages = high significance)
    for node_id, pages in node_pages.items():
        if len(pages) > 1 and G.has_node(node_id):
            current_size = G.nodes[node_id].get("size", 10)
            # Each page appearance adds 5 to node size (up to 40 max)
            boost = min(len(pages) * 5, 40)
            G.nodes[node_id]["size"] = min(current_size + boost, 40)
            G.nodes[node_id]["page_count"] = len(pages)
            logger.debug("Boosted node %s: appears on %s pages", node_id, len(pages))

    # Step 3: Link entities from different pages via shared co-occurrence.
    # Only page pairs that share a node are visited (found through the
    # node → pages index); each unordered pair is handled exactly once.
    seen_page_pairs: set = set()
    for pages in node_pages.values():
        if len(pages) < 2:
            continue
        for page_a, page_b in itertools.combinations(sorted(pages, key=str), 2):
            if (page_a, page_b) in seen_page_pairs:
                continue
            seen_page_pairs.add((page_a, page_b))

            nodes_a = page_nodes[page_a]
            nodes_b = page_nodes[page_b]
            shared_entities = nodes_a & nodes_b
            if not shared_entities:
                continue

            # Once an edge exists between a candidate pair no further bridge
            # can add another, so a single deterministic bridge suffices.
            bridge_node = min(shared_entities, key=str)
            unique_list_a = sorted(nodes_a - nodes_b, key=str)[:3]
            unique_list_b = sorted(nodes_b - nodes_a, key=str)[:3]

            for entity_a in unique_list_a:
                for entity_b in unique_list_b:
                    if not (G.has_node(entity_a) and G.has_node(entity_b)):
                        continue
                    if G.has_edge(entity_a, entity_b):
                        continue
                    G.add_edge(
                        entity_a, entity_b,
                        edge_type=EDGE_TYPES.CO_INVESTIGATION,
                        confidence=0.3,
                        via=bridge_node,
                        label="co-investigation",
                        timestamp=_now_utc()
                    )
                    edges_added += 1

    # Step 4: Direct cross-page linking for same-type high-confidence entities
    # If two THREAT_ACTOR entities appear in the same investigation across different
    # pages, they're likely part of the same ecosystem → link them
    def _is_high_confidence(data: dict) -> bool:
        # Confidence might be in metadata or root; check both. A missing or
        # None root confidence falls back to 0.85 (default for threat actors)
        # instead of raising on the comparison.
        meta_conf = (data.get("metadata") or {}).get("confidence")
        root_conf = data.get("confidence")
        if root_conf is None:
            root_conf = 0.85
        return (meta_conf or 0) >= 0.85 or root_conf >= 0.85

    actor_nodes = [
        node for node, data in G.nodes(data=True)
        if data.get("node_type") == NODE_TYPES.THREAT_ACTOR
        and _is_high_confidence(data)
    ]

    # Link threat actors that appear across 2+ pages (they're ecosystem-level nodes)
    multi_page_actors = [
        node for node in actor_nodes
        if len(node_pages.get(node, ())) >= 2
    ]

    for actor_a, actor_b in itertools.combinations(multi_page_actors, 2):
        if not G.has_edge(actor_a, actor_b):
            G.add_edge(
                actor_a, actor_b,
                edge_type=EDGE_TYPES.CO_INVESTIGATION,
                confidence=0.4,
                label="co-investigation",
                timestamp=_now_utc()
            )
            edges_added += 1

    return edges_added
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
# Public: build_graph_from_db
|
|
421
|
+
# ---------------------------------------------------------------------------
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def build_graph_from_db(
    investigation_id: Optional[uuid.UUID] = None,
    since: Optional[datetime] = None,
) -> nx.MultiDiGraph:
    """
    Build a fresh graph by loading entity records from the database.

    Filters:
        investigation_id — if given, only load entities linked to that
                           investigation (via InvestigationEntityLink).
        since            — if given, only load entities where first_seen >= since.

    Passes:
        1. One node per mapped entity; entity types missing from
           _ENTITY_TYPE_TO_NODE_TYPE are counted and skipped.
        2. For every page that has 2+ entities, CO_APPEARED_ON edges are
           created between all pairs of distinct nodes on that page.
        3. Cross-page CO_INVESTIGATION edges (_link_cross_page_entities).
        4. Persisted EntityRelationship rows are added as edges keyed
           "persisted_<relationship id>". Endpoints that are not yet nodes
           (entities outside the loaded set, or skipped as unmapped) are
           added as stub nodes first so no attribute-less node is created.
           A failure in this pass is logged and the pass is abandoned.

    If DATABASE_URL is not set, returns an empty graph without raising.
    Never raises on DB errors (logs a warning and returns the partial graph).
    """
    graph: nx.MultiDiGraph = nx.MultiDiGraph()

    if not os.getenv("DATABASE_URL"):
        return graph

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import (  # noqa: PLC0415
            Entity,
            EntityRelationship,
            InvestigationEntityLink,
        )
        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        with get_session() as session:
            if investigation_id is not None:
                query = (
                    session.query(Entity)
                    .join(
                        InvestigationEntityLink,
                        InvestigationEntityLink.entity_id == Entity.id,
                    )
                    .filter(InvestigationEntityLink.investigation_id == investigation_id)
                    .options(joinedload(Entity.page))
                )
            else:
                query = session.query(Entity).options(joinedload(Entity.page))

            if since is not None:
                query = query.filter(Entity.first_seen >= since)

            # Diagnostic counts only; they do not influence graph construction.
            matching_entities_count = query.count()

            total_investigation = (
                session.query(Entity)
                .filter(Entity.investigation_id == investigation_id)
                .count()
                if investigation_id is not None
                else session.query(Entity).count()
            )
            null_inv_count = (
                session.query(Entity)
                .filter(Entity.investigation_id.is_(None))
                .count()
            )

            logger.warning(
                "build_graph_from_db: investigation_id=%s entity_rows_loaded=%s "
                "count_matching_investigation_filter=%s global_entities_with_null_investigation_id=%s",
                investigation_id,
                matching_entities_count,
                total_investigation,
                null_inv_count,
            )

            # Entity rows must be processed while the session is open: after close,
            # lazy loads on ent.page raise (joinedload data is expired).
            skipped_unmapped = 0
            page_entity_map: dict[str, list[Entity]] = defaultdict(list)
            all_entities: list[Entity] = []

            # --- First pass: one node per mapped entity ---
            for ent in query.yield_per(2000):
                all_entities.append(ent)
                page_url = ent.page.url if ent.page else ""
                node_type = _ENTITY_TYPE_TO_NODE_TYPE.get(ent.entity_type)
                if node_type is None:
                    skipped_unmapped += 1
                    continue  # skip unmapped types

                node_id = _make_node_id(ent.entity_type, ent.value, page_url)

                if graph.has_node(node_id):
                    data = graph.nodes[node_id]
                    if ent.last_seen and (
                        not data.get("last_seen") or ent.last_seen > data["last_seen"]
                    ):
                        data["last_seen"] = ent.last_seen
                    if page_url and page_url not in data["source_urls"]:
                        data["source_urls"] = data["source_urls"] + [page_url]
                else:
                    meta: dict = {}
                    if node_type == NODE_TYPES.THREAT_ACTOR:
                        meta["handle"] = ent.value
                        domain = _extract_domain(page_url)
                        if domain:
                            meta["forum"] = domain
                    graph.add_node(
                        node_id,
                        node_type=node_type,
                        first_seen=ent.first_seen or _now_utc(),
                        last_seen=ent.last_seen or _now_utc(),
                        source_urls=[page_url] if page_url else [],
                        metadata=meta,
                        confidence=ent.confidence,
                    )

                if ent.page_id:
                    page_entity_map[str(ent.page_id)].append(ent)

            # --- Intra-page pass: CO_APPEARED_ON between entities on one page ---
            intra_page_edges = 0
            for _page_id, page_entities in page_entity_map.items():
                if len(page_entities) < 2:
                    continue
                for ent_a, ent_b in itertools.combinations(page_entities, 2):
                    # Both entities share a page_id, so one URL serves both ids.
                    page_url_a = ent_a.page.url if ent_a.page else ""
                    node_id_a = _make_node_id(ent_a.entity_type, ent_a.value, page_url_a)
                    node_id_b = _make_node_id(ent_b.entity_type, ent_b.value, page_url_a)

                    # Two rows can resolve to the same node; a node does not
                    # "co-appear" with itself, so never add a self-loop.
                    if node_id_a == node_id_b:
                        continue
                    if not graph.has_node(node_id_a) or not graph.has_node(node_id_b):
                        continue

                    graph.add_edge(
                        node_id_a,
                        node_id_b,
                        edge_type=EDGE_TYPES.CO_APPEARED_ON,
                        confidence=1.0,
                        source_url=page_url_a,
                        timestamp=_now_utc(),
                        metadata={},
                    )
                    intra_page_edges += 1

            # Second pass: cross-page entity linking
            cross_page_edges = _link_cross_page_entities(graph, all_entities, investigation_id)

            # Third pass: Load persistent relationships from DB
            try:
                # Get all entities in the graph so we can filter relationships
                graph_entity_ids = [ent.id for ent in all_entities]
                if graph_entity_ids:
                    if investigation_id is not None:
                        relationships = (
                            session.query(EntityRelationship)
                            .filter(
                                EntityRelationship.investigation_id == investigation_id
                            )
                            .all()
                        )
                    else:
                        relationships = session.query(EntityRelationship).filter(
                            (EntityRelationship.entity_a_id.in_(graph_entity_ids)) |
                            (EntityRelationship.entity_b_id.in_(graph_entity_ids))
                        ).all()

                    # Create a map of entity_id -> node_id for easy lookup
                    # Since one entity can appear on multiple pages, we use the first one found
                    entity_to_node = {}
                    rows_by_id = {}
                    for ent in all_entities:
                        page_url = ent.page.url if ent.page else ""
                        node_id = _make_node_id(ent.entity_type, ent.value, page_url)
                        if str(ent.id) not in entity_to_node:
                            entity_to_node[str(ent.id)] = node_id
                            rows_by_id[str(ent.id)] = ent

                    def _ensure_stub(row, node_id: str) -> None:
                        # Add a stub node for an entity that has no node yet
                        # (outside the loaded set, or of an unmapped type).
                        if graph.has_node(node_id):
                            return
                        row_url = row.page.url if row.page else ""
                        graph.add_node(
                            node_id,
                            node_type=_ENTITY_TYPE_TO_NODE_TYPE.get(row.entity_type, ""),
                            first_seen=row.first_seen or _now_utc(),
                            last_seen=row.last_seen or _now_utc(),
                            source_urls=[row_url] if row_url else [],
                            metadata={}
                        )

                    # Relationship endpoints that were not part of the entity query
                    all_missing_ids = set()
                    for rel in relationships:
                        if str(rel.entity_a_id) not in entity_to_node:
                            all_missing_ids.add(rel.entity_a_id)
                        if str(rel.entity_b_id) not in entity_to_node:
                            all_missing_ids.add(rel.entity_b_id)

                    if all_missing_ids:
                        # Eager-load pages in the query that is actually executed
                        # so me.page below does not trigger one lazy load per row.
                        missing_entities = (
                            session.query(Entity)
                            .options(joinedload(Entity.page))
                            .filter(Entity.id.in_(all_missing_ids))
                            .all()
                        )
                        for me in missing_entities:
                            me_page_url = me.page.url if me.page else ""
                            me_node_id = _make_node_id(me.entity_type, me.value, me_page_url)
                            entity_to_node[str(me.id)] = me_node_id
                            rows_by_id[str(me.id)] = me
                            _ensure_stub(me, me_node_id)

                    for rel in relationships:
                        source_key = str(rel.entity_a_id)
                        target_key = str(rel.entity_b_id)
                        source_node = entity_to_node.get(source_key)
                        target_node = entity_to_node.get(target_key)

                        if source_node and target_node:
                            # Add the persisted relationship edge
                            edge_key = f"persisted_{rel.id}"
                            if not graph.has_edge(source_node, target_node, key=edge_key):
                                # add_edge would otherwise auto-create bare
                                # nodes for endpoints skipped as unmapped.
                                _ensure_stub(rows_by_id[source_key], source_node)
                                _ensure_stub(rows_by_id[target_key], target_node)
                                graph.add_edge(
                                    source_node,
                                    target_node,
                                    key=edge_key,
                                    edge_type=rel.relationship_type,
                                    confidence=rel.confidence,
                                    source_url="",
                                    timestamp=rel.first_seen or _now_utc(),
                                    metadata={}
                                )
            except Exception as e:
                logger.warning(f"Failed to load persistent relationships: {e}")

            logger.warning(
                "build_graph_from_db: investigation_id=%s "
                "nodes=%s "
                "intra_page_edges=%s "
                "cross_page_edges=%s "
                "total_edges=%s "
                "skipped_unmapped_entity_types=%s",
                investigation_id,
                len(graph.nodes()),
                intra_page_edges,
                cross_page_edges,
                len(graph.edges()),
                skipped_unmapped,
            )

    except Exception as exc:
        logger.warning("build_graph_from_db failed: %s", exc)

    return graph
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def persist_graph_edges(
    G: nx.MultiDiGraph,
    investigation_id: uuid.UUID,
    session,
) -> dict:
    """
    Write all edges from the NetworkX graph to the entity_relationships table.

    Called once after build_graph_from_db() completes.
    Uses upsert logic — safe to call multiple times: an edge whose
    (entity_a_id, entity_b_id, relationship_type) already exists is not
    re-inserted; instead the stored confidence is raised when the graph
    carries a higher one.

    Edge cap rules (applied to the de-duplicated edge count):
    - If edge count > 50,000: skip all edges and return
      {"status": "skipped_overflow", "edges_written": 0, ...}
    - If edge count > 10,000 (and <= 50,000): keep only edges where BOTH
      endpoint entities have confidence >= 0.85; all other edges are pruned
    - Otherwise: write all edges

    Args:
        G: graph whose node ids are the values produced by _make_node_id().
        investigation_id: investigation whose entities (directly owned or
            linked via InvestigationEntityLink) are mapped to graph nodes.
        session: database session used for all reads and writes; it is
            committed on success and rolled back if the commit fails.

    Returns: dict with keys:
    - status: "written" | "skipped_overflow" | "pruned"
      ("pruned" is reported only when at least one edge was removed)
    - edges_written: int (newly inserted rows; confidence updates of
      existing rows are not counted)
    - original_count: int (de-duplicated edge count before pruning)

    Raises:
        Exception: whatever session.commit() raises, re-raised after the
            session has been rolled back.
    """
    from db.models import Entity, EntityRelationship, InvestigationEntityLink  # noqa: PLC0415
    from sqlalchemy.orm import joinedload  # noqa: PLC0415

    node_to_entity = {}
    entity_confidence: dict[uuid.UUID, float] = {}

    # Entities belong to the investigation either directly or through a link row.
    linked_ids_subq = (
        session.query(InvestigationEntityLink.entity_id)
        .filter(InvestigationEntityLink.investigation_id == investigation_id)
        .subquery()
    )
    entities = (
        session.query(Entity)
        .options(joinedload(Entity.page))
        .filter(
            (Entity.investigation_id == investigation_id)
            | Entity.id.in_(linked_ids_subq)
        )
        .yield_per(2000)
    )
    for ent in entities:
        page_url = ent.page.url if ent.page else ""
        node_id = _make_node_id(ent.entity_type, ent.value, page_url)
        node_to_entity[node_id] = ent.id
        # A missing confidence is treated as 0 so the pruning comparison
        # below cannot fail on None.
        entity_confidence[ent.id] = ent.confidence if ent.confidence is not None else 0.0

    # Collect one candidate per (source, target, type); the first edge seen wins.
    edge_keys: set[tuple] = set()
    potential_edges: list[tuple] = []
    for source_node, target_node, edge_data in G.edges(data=True):
        source_entity_id = node_to_entity.get(source_node)
        target_entity_id = node_to_entity.get(target_node)

        # Skip edges whose endpoints are not known entities, and self-loops.
        if not source_entity_id or not target_entity_id:
            continue
        if source_entity_id == target_entity_id:
            continue

        relationship_type = edge_data.get("edge_type", "CO_APPEARED_ON")
        raw_confidence = edge_data.get("confidence")
        # Absent or None confidence both fall back to the 0.5 default.
        confidence = 0.5 if raw_confidence is None else float(raw_confidence)

        key = (source_entity_id, target_entity_id, relationship_type)
        if key not in edge_keys:
            edge_keys.add(key)
            potential_edges.append((source_entity_id, target_entity_id, relationship_type, confidence))

    if not potential_edges:
        return {"status": "written", "edges_written": 0, "original_count": 0}

    edge_count = len(potential_edges)

    # Edge explosion check: > 50,000 edges
    if edge_count > 50000:
        logger.error(
            f"Edge explosion detected: {edge_count} edges for investigation {investigation_id}. "
            f"Graph construction skipped. Reduce entity count first."
        )
        return {"status": "skipped_overflow", "edges_written": 0, "original_count": edge_count}

    # Edge pruning: between 10,000 and 50,000 - keep only edges where BOTH
    # entities have confidence >= 0.85
    pruned_count = 0
    if edge_count > 10000:
        kept_edges = [
            edge
            for edge in potential_edges
            if entity_confidence.get(edge[0], 0) >= 0.85
            and entity_confidence.get(edge[1], 0) >= 0.85
        ]
        pruned_count = edge_count - len(kept_edges)
        potential_edges = kept_edges
        if pruned_count:
            logger.warning(
                f"Edge pruning applied: {pruned_count}/{edge_count} edges removed "
                f"(both entity confidences must be >= 0.85). "
                f"Remaining: {len(potential_edges)} edges."
            )

    if not potential_edges:
        return {"status": "pruned", "edges_written": 0, "original_count": edge_count}

    all_entity_ids = list({e[0] for e in potential_edges} | {e[1] for e in potential_edges})
    rel_types = list({e[2] for e in potential_edges})

    # Existing rows are looked up without an investigation filter, so a
    # relationship already stored for the same triple is never duplicated.
    existing_rels = (
        session.query(EntityRelationship)
        .filter(
            sa.or_(
                EntityRelationship.entity_a_id.in_(all_entity_ids),
                EntityRelationship.entity_b_id.in_(all_entity_ids),
            ),
            EntityRelationship.relationship_type.in_(rel_types),
        )
        .all()
    )
    existing_by_key = {
        (rel.entity_a_id, rel.entity_b_id, rel.relationship_type): rel
        for rel in existing_rels
    }

    edges_written = 0
    for source_entity_id, target_entity_id, relationship_type, confidence in potential_edges:
        key = (source_entity_id, target_entity_id, relationship_type)
        existing = existing_by_key.get(key)
        if existing is not None:
            # Update half of the upsert: only ever raise the stored confidence.
            if confidence > (existing.confidence or 0):
                existing.confidence = confidence
            continue

        session.add(
            EntityRelationship(
                entity_a_id=source_entity_id,
                entity_b_id=target_entity_id,
                relationship_type=relationship_type,
                confidence=confidence,
                source_page_id=None,
                investigation_id=investigation_id,
            )
        )
        edges_written += 1

    try:
        session.commit()
    except Exception:
        # Leave the session usable for the caller instead of stuck in a
        # failed transaction, then surface the original error.
        session.rollback()
        raise

    status = "pruned" if pruned_count > 0 else "written"
    logger.warning(
        f"persist_graph_edges: investigation={investigation_id} "
        f"status={status} edges_written={edges_written}, edges_skipped={len(potential_edges) - edges_written}"
    )
    return {"status": status, "edges_written": edges_written, "original_count": edge_count}
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
def build_graph_from_db_cached(investigation_id: uuid.UUID) -> nx.MultiDiGraph:
    """
    Build NetworkX graph from persisted entity_relationships rows.

    Faster than full recompute — reads pre-computed edges from DB.

    Nodes are created for every entity that belongs to the investigation,
    either directly (Entity.investigation_id) or through an
    InvestigationEntityLink row. This is the same entity set that
    persist_graph_edges() uses when it writes edges, so every persisted
    edge can be mapped back onto nodes.

    Args:
        investigation_id: investigation whose entities and relationships
            are loaded.

    Returns:
        A MultiDiGraph with one node per distinct _make_node_id() value and
        one edge per EntityRelationship row of the investigation whose two
        endpoints both resolved to a node.
    """
    from db.models import Entity, EntityRelationship, InvestigationEntityLink  # noqa: PLC0415
    from db.session import get_session  # noqa: PLC0415
    from sqlalchemy.orm import joinedload  # noqa: PLC0415

    G: nx.MultiDiGraph = nx.MultiDiGraph()

    with get_session() as session:
        linked_ids_subq = (
            session.query(InvestigationEntityLink.entity_id)
            .filter(InvestigationEntityLink.investigation_id == investigation_id)
            .subquery()
        )
        entities = (
            session.query(Entity)
            .options(joinedload(Entity.page))
            .filter(
                (Entity.investigation_id == investigation_id)
                | Entity.id.in_(linked_ids_subq)
            )
            .yield_per(500)
        )

        # Maps the stringified entity id to the node id it was added under.
        entity_to_node = {}
        for ent in entities:
            page_url = ent.page.url if ent.page else ""
            node_id = _make_node_id(ent.entity_type, ent.value, page_url)
            G.add_node(
                node_id,
                node_type=_ENTITY_TYPE_TO_NODE_TYPE.get(ent.entity_type, ""),
                first_seen=ent.first_seen or _now_utc(),
                last_seen=ent.last_seen or _now_utc(),
                source_urls=[page_url] if page_url else [],
                metadata={},
                confidence=ent.confidence,
            )
            entity_to_node[str(ent.id)] = node_id

        relationships = (
            session.query(EntityRelationship)
            .filter(EntityRelationship.investigation_id == investigation_id)
            .yield_per(2000)
        )

        for rel in relationships:
            source_node = entity_to_node.get(str(rel.entity_a_id))
            target_node = entity_to_node.get(str(rel.entity_b_id))

            # Every id stored in entity_to_node was added to G above, so a
            # successful lookup already guarantees the node exists.
            if source_node and target_node:
                G.add_edge(
                    source_node,
                    target_node,
                    edge_type=rel.relationship_type,
                    confidence=rel.confidence,
                    source_url="",
                    timestamp=rel.first_seen or _now_utc(),
                    metadata={},
                )

    return G
|
|
894
|
+
|