voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/export.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph/export.py — Export the relationship graph to external formats.
|
|
3
|
+
|
|
4
|
+
All functions accept an nx.MultiDiGraph as their first argument.
|
|
5
|
+
to_graphml and to_gephi_csv write files; to_json and summary_stats return data.
|
|
6
|
+
|
|
7
|
+
Public interface
|
|
8
|
+
----------------
|
|
9
|
+
to_graphml(graph, filepath) → None
|
|
10
|
+
to_json(graph) → dict
|
|
11
|
+
to_gephi_csv(graph, nodes_path, edges_path) → None
|
|
12
|
+
summary_stats(graph) → dict
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import csv
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
from datetime import datetime
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import networkx as nx
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Internal helpers
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _serialize_value(v: Any) -> Any:
    """Convert a value to a JSON-serializable form.

    ``datetime`` instances become ISO-8601 strings.  Lists and dicts are
    rebuilt with every contained value converted the same way, so a
    ``datetime`` nested inside a container is converted as well.  Any other
    value is returned unchanged.
    """
    if isinstance(v, datetime):
        return v.isoformat()
    if isinstance(v, list):
        return [_serialize_value(item) for item in v]
    if isinstance(v, dict):
        return {key: _serialize_value(item) for key, item in v.items()}
    return v


def _node_to_dict(node_id: str, data: dict) -> dict:
    """Build the JSON export record for a single node.

    Parameters:
        node_id: identifier of the node in the graph.
        data: the node's attribute dict.

    Returns:
        A dict with the keys ``id``, ``label``, ``type``, ``confidence``,
        ``first_seen``, ``last_seen``, ``source_urls`` and ``metadata``.
        The label is the node's ``label`` attribute when it is truthy,
        otherwise the part of *node_id* before the first ``@``; either way
        it is cut to at most 50 characters.
    """
    raw_label = data.get("label") or node_id.split("@")[0]
    return {
        "id": node_id,
        # Slicing is a no-op for labels that are already short enough.
        "label": raw_label[:50],
        "type": data.get("node_type", ""),
        "confidence": data.get("confidence", 0.0),
        "first_seen": _serialize_value(data.get("first_seen")),
        "last_seen": _serialize_value(data.get("last_seen")),
        # Containers go through the serializer too, so datetimes stored
        # inside them do not leak into the exported record.
        "source_urls": _serialize_value(data.get("source_urls", [])),
        "metadata": _serialize_value(data.get("metadata", {})),
    }


def _edge_to_dict(source: str, target: str, data: dict) -> dict:
    """Build the JSON export record for a single edge.

    Parameters:
        source: node id the edge starts at.
        target: node id the edge points to.
        data: the edge's attribute dict.

    Returns:
        A dict with the keys ``source``, ``target``, ``type``,
        ``confidence``, ``source_url``, ``timestamp`` and ``metadata``.
    """
    return {
        "source": source,
        "target": target,
        "type": data.get("edge_type", ""),
        "confidence": data.get("confidence", 0.0),
        "source_url": data.get("source_url", ""),
        "timestamp": _serialize_value(data.get("timestamp")),
        # Metadata may carry datetimes; convert them like the timestamp.
        "metadata": _serialize_value(data.get("metadata", {})),
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Helpers for GraphML (NetworkX only supports basic scalar types)
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _flatten_for_graphml(data: dict) -> dict:
    """
    Convert node/edge attributes to GraphML-compatible scalars.

    NetworkX's GraphML writer supports str, int, float, bool — not datetime,
    list, or dict.  The conversion rules are:

    * ``None``            → empty string
    * ``datetime``        → ISO-8601 string
    * ``list`` / ``dict`` → JSON text; values inside the container that JSON
      cannot encode are converted too (``datetime`` → ISO-8601, anything
      else → ``str(value)``) instead of raising ``TypeError``
    * str/int/float/bool  → kept as-is
    * anything else       → ``str(value)``

    Returns a new dict; *data* is not modified.
    """

    def _json_default(obj: Any) -> str:
        # json.dumps calls this only for objects it cannot encode natively.
        if isinstance(obj, datetime):
            return obj.isoformat()
        return str(obj)

    flat: dict = {}
    for key, value in data.items():
        if value is None:
            flat[key] = ""
        elif isinstance(value, datetime):
            flat[key] = value.isoformat()
        elif isinstance(value, (list, dict)):
            flat[key] = json.dumps(value, default=_json_default)
        elif isinstance(value, (str, int, float, bool)):
            flat[key] = value
        else:
            flat[key] = str(value)
    return flat
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Public functions
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def to_graphml(graph: nx.MultiDiGraph, filepath: str) -> None:
    """
    Write *graph* to *filepath* as GraphML (opens in Gephi, yEd, Cytoscape).

    The input graph is left untouched: a scratch MultiDiGraph is filled with
    the same nodes and keyed edges, each attribute dict first passed through
    ``_flatten_for_graphml``, and that scratch graph is handed to
    ``nx.write_graphml``.
    """
    export_copy = nx.MultiDiGraph()

    # Nodes, with attributes reduced to GraphML-compatible scalars.
    for name, attrs in graph.nodes(data=True):
        scalar_attrs = _flatten_for_graphml(attrs)
        export_copy.add_node(name, **scalar_attrs)

    # Edges keep their original key so parallel edges stay distinct.
    for head, tail, edge_key, attrs in graph.edges(keys=True, data=True):
        scalar_attrs = _flatten_for_graphml(attrs)
        export_copy.add_edge(head, tail, key=edge_key, **scalar_attrs)

    nx.write_graphml(export_copy, filepath)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def to_json(graph: nx.MultiDiGraph) -> dict:
    """
    Build a plain-dict representation of *graph*.

    Result layout:
    {
        "nodes": [one record per node, produced by _node_to_dict],
        "edges": [one record per edge, produced by _edge_to_dict],
    }

    Parallel edges each get their own record.  The graph is only read.
    """
    node_records = []
    for name, attrs in graph.nodes(data=True):
        node_records.append(_node_to_dict(name, attrs))

    edge_records = []
    for head, tail, attrs in graph.edges(data=True):
        edge_records.append(_edge_to_dict(head, tail, attrs))

    return {"nodes": node_records, "edges": edge_records}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def to_gephi_csv(
    graph: nx.MultiDiGraph,
    nodes_path: str,
    edges_path: str,
) -> None:
    """
    Write the graph as a pair of CSV tables for Gephi.

    *nodes_path* receives the columns Id, Label, Type, FirstSeen, LastSeen
    (Label repeats the node id).  *edges_path* receives the columns
    Source, Target, Type, Confidence, SourceUrl.  Both files are written as
    UTF-8 and start with a header row; the node table is completed before
    the edge table is opened.
    """

    def _as_text(moment: Any) -> str:
        # datetimes are written in ISO-8601; any other falsy value becomes
        # an empty cell, everything else is stringified.
        if isinstance(moment, datetime):
            return moment.isoformat()
        return str(moment or "")

    with open(nodes_path, "w", newline="", encoding="utf-8") as handle:
        table = csv.writer(handle)
        table.writerow(["Id", "Label", "Type", "FirstSeen", "LastSeen"])
        for name, attrs in graph.nodes(data=True):
            table.writerow([
                name,
                name,
                attrs.get("node_type", ""),
                _as_text(attrs.get("first_seen")),
                _as_text(attrs.get("last_seen")),
            ])

    with open(edges_path, "w", newline="", encoding="utf-8") as handle:
        table = csv.writer(handle)
        table.writerow(["Source", "Target", "Type", "Confidence", "SourceUrl"])
        for head, tail, attrs in graph.edges(data=True):
            table.writerow([
                head,
                tail,
                attrs.get("edge_type", ""),
                attrs.get("confidence", ""),
                attrs.get("source_url", ""),
            ])
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def summary_stats(graph: nx.MultiDiGraph) -> dict:
    """
    Compute headline counts for *graph*.

    Returned keys:
        total_nodes    — graph.number_of_nodes()
        total_edges    — graph.number_of_edges()
        nodes_by_type  — {node_type: count}; nodes with an empty or missing
                         node_type are not counted
        edges_by_type  — {edge_type: count}; edges with an empty or missing
                         edge_type are not counted
        most_connected — up to five {"node_id": ..., "degree": N} records,
                         highest degree first
    """
    node_tally: dict[str, int] = {}
    for _, attrs in graph.nodes(data=True):
        kind = attrs.get("node_type", "")
        if not kind:
            continue
        node_tally[kind] = node_tally.get(kind, 0) + 1

    edge_tally: dict[str, int] = {}
    for _, _, attrs in graph.edges(data=True):
        kind = attrs.get("edge_type", "")
        if not kind:
            continue
        edge_tally[kind] = edge_tally.get(kind, 0) + 1

    # The sort is stable, so nodes with equal degree keep graph order.
    ranking = [(name, graph.degree(name)) for name in graph.nodes()]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    top_five = [
        {"node_id": name, "degree": degree}
        for name, degree in ranking[:5]
    ]

    stats = {
        "total_nodes": graph.number_of_nodes(),
        "total_edges": graph.number_of_edges(),
        "nodes_by_type": node_tally,
        "edges_by_type": edge_tally,
        "most_connected": top_five,
    }
    return stats
|
graph/model.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph/model.py — Pure data definitions for the VoidAccess graph layer.
|
|
3
|
+
|
|
4
|
+
No graph logic here — only node/edge type constants and dataclasses.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
# Node type constants
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class NODE_TYPES:
    """Namespace of string constants for the ``node_type`` of a graph node."""

    # Actors and the places they are seen
    THREAT_ACTOR = "ThreatActor"
    CRYPTO_WALLET = "CryptoWallet"
    ONION_URL = "OnionURL"
    FORUM = "Forum"

    # Malware and ransomware
    MALWARE_FAMILY = "MalwareFamily"
    RANSOMWARE_GROUP = "RansomwareGroup"

    # Identifiers and other extracted entities
    PGP_KEY = "PGPKey"
    EMAIL_ADDRESS = "EmailAddress"
    CVE = "CVE"
    PASTE = "Paste"
    IP_ADDRESS = "IPAddress"
    PHONE_NUMBER = "PhoneNumber"
    ORGANIZATION = "Organization"
    DATE = "Date"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Edge type constants
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class EDGE_TYPES:
    """Namespace of string constants for the ``edge_type`` of a graph edge.

    Each constant's value is spelled exactly like its own name.
    """

    # Observed relationships
    CO_APPEARED_ON = "CO_APPEARED_ON"  # both entities were seen on the same page
    POSTED_BY = "POSTED_BY"  # content is attributed to a handle
    LINKED_TO = "LINKED_TO"  # one URL links to another URL
    MEMBER_OF = "MEMBER_OF"  # handle belongs to a group/forum
    USED = "USED"  # an actor used a malware family
    CLAIMED = "CLAIMED"  # a group claimed an attack

    # Identity-resolution relationships
    LIKELY_SAME_ACTOR = "LIKELY_SAME_ACTOR"  # inferred link, medium confidence
    CONFIRMED_SAME_ACTOR = "CONFIRMED_SAME_ACTOR"  # PGP key match, high confidence

    # Entities found in the same investigation across multiple pages
    CO_INVESTIGATION = "CO_INVESTIGATION"

    # Financial transactions
    PAID_TO = "PAID_TO"
    FUNDED_BY = "FUNDED_BY"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Dataclasses
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class GraphNode:
    """One entity node of the relationship graph.

    ``source_urls`` and ``metadata`` default to a fresh empty list / dict
    for every instance.
    """

    # Canonical value of the entity (wallet address, handle, etc.).
    node_id: str
    # One of the NODE_TYPES constants.
    node_type: str
    first_seen: datetime
    last_seen: datetime
    source_urls: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class GraphEdge:
    """One directed relationship between two nodes of the graph.

    ``metadata`` defaults to a fresh empty dict for every instance.
    """

    # node_id of the node the edge starts at.
    source_id: str
    # node_id of the node the edge points to.
    target_id: str
    # One of the EDGE_TYPES constants.
    edge_type: str
    # Confidence in the range 0.0–1.0.
    confidence: float
    # Page on which the relationship was observed.
    source_url: str
    timestamp: datetime
    metadata: dict[str, Any] = field(default_factory=dict)
|
graph/queries.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph/queries.py — Named query functions that operate on a NetworkX graph.
|
|
3
|
+
|
|
4
|
+
All functions are pure (no side effects, no DB calls).
|
|
5
|
+
All accept a graph as their first argument and return data.
|
|
6
|
+
None of these functions modify the graph.
|
|
7
|
+
|
|
8
|
+
Public interface
|
|
9
|
+
----------------
|
|
10
|
+
get_neighbors(graph, node_id, hops, edge_types) → dict[str, list[GraphNode]]
|
|
11
|
+
find_nodes_by_type(graph, node_type) → list[GraphNode]
|
|
12
|
+
find_co_occurring_entities(graph, node_id) → list[tuple[GraphNode, int]]
|
|
13
|
+
get_new_nodes_since(graph, since) → list[GraphNode]
|
|
14
|
+
find_high_degree_nodes(graph, top_n, node_type) → list[tuple[GraphNode, int]]
|
|
15
|
+
get_shortest_path(graph, source_id, target_id) → list[GraphNode] | None
|
|
16
|
+
get_actor_profile(graph, actor_node_id) → dict
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from collections import defaultdict, deque
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
import networkx as nx
|
|
26
|
+
|
|
27
|
+
from graph.model import EDGE_TYPES, NODE_TYPES, GraphNode
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Internal helpers
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _make_graphnode(node_id: str, data: dict) -> GraphNode:
    """
    Build a GraphNode from a node id and its raw NetworkX attribute dict.

    Missing attributes fall back to: ``""`` for node_type, the current
    ``datetime.utcnow()`` for first_seen / last_seen, and empty containers
    for source_urls / metadata.  source_urls and metadata are shallow-copied
    so the returned node does not share those containers with the graph.
    """
    kind = data.get("node_type", "")
    first = data.get("first_seen", datetime.utcnow())
    last = data.get("last_seen", datetime.utcnow())
    urls = list(data.get("source_urls", []))
    extra = dict(data.get("metadata", {}))
    return GraphNode(
        node_id=node_id,
        node_type=kind,
        first_seen=first,
        last_seen=last,
        source_urls=urls,
        metadata=extra,
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _has_node(graph: nx.MultiDiGraph, node_id: str) -> bool:
    """Tell whether *node_id* is a node of *graph* (delegates to ``graph.has_node``)."""
    return graph.has_node(node_id)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# Public functions
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_neighbors(
    graph: nx.MultiDiGraph,
    node_id: str,
    hops: int = 2,
    edge_types: Optional[list[str]] = None,
) -> dict[str, list[GraphNode]]:
    """
    Collect the nodes that lie within *hops* steps of *node_id*.

    The search is breadth-first and ignores edge direction: from every node
    both its outgoing and its incoming edges are followed.  When
    *edge_types* is given, an edge is followed only if its ``edge_type``
    attribute is one of the listed values.

    The result maps the hop distance, as a string, to the nodes first
    reached at that distance:
        {"1": [GraphNode, ...], "2": [GraphNode, ...], ...}
    Distances at which nothing new was reached have no key.  The start node
    itself is never included, and an unknown *node_id* yields ``{}``.
    """
    if not _has_node(graph, node_id):
        return {}

    def _traversable(attrs: dict) -> bool:
        # No filter means every edge may be followed.
        return edge_types is None or attrs.get("edge_type") in edge_types

    seen: set[str] = {node_id}
    by_hop: dict[str, list[GraphNode]] = {}
    frontier: list[str] = [node_id]  # nodes reached at the current distance
    distance = 0

    # Expand one whole distance level per pass, in discovery order.
    while frontier and distance < hops:
        distance += 1
        next_frontier: list[str] = []

        for current in frontier:
            # Successors are added before predecessors.
            adjacent: set[str] = set()
            adjacent.update(
                dst
                for _, dst, attrs in graph.out_edges(current, data=True)
                if _traversable(attrs)
            )
            adjacent.update(
                src
                for src, _, attrs in graph.in_edges(current, data=True)
                if _traversable(attrs)
            )

            for candidate in adjacent:
                if candidate in seen:
                    continue
                seen.add(candidate)
                by_hop.setdefault(str(distance), []).append(
                    _make_graphnode(candidate, graph.nodes[candidate])
                )
                next_frontier.append(candidate)

        frontier = next_frontier

    return by_hop
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def find_nodes_by_type(
    graph: nx.MultiDiGraph,
    node_type: str,
) -> list[GraphNode]:
    """Collect, in graph order, every node whose ``node_type`` attribute equals *node_type*."""
    matches: list[GraphNode] = []
    for name, attrs in graph.nodes(data=True):
        if attrs.get("node_type") != node_type:
            continue
        matches.append(_make_graphnode(name, attrs))
    return matches
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def find_co_occurring_entities(
    graph: nx.MultiDiGraph,
    node_id: str,
) -> list[tuple[GraphNode, int]]:
    """
    List the nodes joined to *node_id* by CO_APPEARED_ON edges.

    Each entry is ``(GraphNode, count)`` where *count* is the number of
    CO_APPEARED_ON edges between the two nodes, counting both directions.
    Entries are ordered by count, largest first.  An unknown *node_id*
    yields an empty list.
    """
    if not _has_node(graph, node_id):
        return []

    tally: dict[str, int] = {}

    # Edges leaving the node ...
    for _, dst, attrs in graph.out_edges(node_id, data=True):
        if attrs.get("edge_type") == EDGE_TYPES.CO_APPEARED_ON:
            tally[dst] = tally.get(dst, 0) + 1

    # ... and edges arriving at it.
    for src, _, attrs in graph.in_edges(node_id, data=True):
        if attrs.get("edge_type") == EDGE_TYPES.CO_APPEARED_ON:
            tally[src] = tally.get(src, 0) + 1

    pairs = [
        (_make_graphnode(other, graph.nodes[other]), hits)
        for other, hits in tally.items()
    ]
    # Stable sort: equal counts keep their discovery order.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_new_nodes_since(
    graph: nx.MultiDiGraph,
    since: datetime,
) -> list[GraphNode]:
    """
    Collect the nodes whose ``first_seen`` is at or after *since*.

    Nodes without a ``first_seen`` attribute (or with it set to ``None``)
    are skipped.
    """
    stamped = (
        (name, attrs, attrs.get("first_seen"))
        for name, attrs in graph.nodes(data=True)
    )
    return [
        _make_graphnode(name, attrs)
        for name, attrs, seen_at in stamped
        if seen_at is not None and seen_at >= since
    ]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def find_high_degree_nodes(
    graph: nx.MultiDiGraph,
    top_n: int = 10,
    node_type: Optional[str] = None,
) -> list[tuple[GraphNode, int]]:
    """
    Return the *top_n* most-connected nodes by total degree (in + out).

    Parameters:
        graph: graph to inspect; it is only read.
        top_n: upper bound on the number of results; applied as a slice
            bound on the ranked list.
        node_type: when given, only nodes whose ``node_type`` attribute
            equals it are considered.

    Returns:
        ``(GraphNode, degree)`` tuples sorted by degree descending; nodes
        with equal degree keep graph order.
    """
    # Rank plain (id, degree) pairs first so that GraphNode objects, which
    # copy each node's source_urls and metadata, are only built for the
    # nodes that are actually returned.
    ranked = sorted(
        (
            (nid, graph.degree(nid))
            for nid, data in graph.nodes(data=True)
            if node_type is None or data.get("node_type") == node_type
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        (_make_graphnode(nid, graph.nodes[nid]), degree)
        for nid, degree in ranked[:top_n]
    ]
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def get_shortest_path(
    graph: nx.MultiDiGraph,
    source_id: str,
    target_id: str,
) -> Optional[list[GraphNode]]:
    """
    Return the shortest path between *source_id* and *target_id*.

    The search runs on an undirected copy of the graph, so a path is found
    regardless of edge direction.

    Returns:
        The path as a list of GraphNode, from source to target inclusive,
        or ``None`` when either node is absent, no path exists, or the
        lookup fails unexpectedly (the failure is then logged).
    """
    if not _has_node(graph, source_id) or not _has_node(graph, target_id):
        return None

    try:
        undirected = graph.to_undirected()
        path_ids: list[str] = nx.shortest_path(undirected, source_id, target_id)
        return [_make_graphnode(nid, graph.nodes[nid]) for nid in path_ids]
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Expected outcomes: the two nodes are simply not connected.
        return None
    except Exception:
        # Stay best-effort for callers, but leave a trace so a genuine
        # failure is not mistaken for "no path".
        import logging

        logging.getLogger(__name__).exception(
            "Unexpected error computing shortest path from %r to %r",
            source_id,
            target_id,
        )
        return None
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def get_actor_profile(
    graph: nx.MultiDiGraph,
    actor_node_id: str,
) -> dict:
    """
    Assemble a profile dict for the node *actor_node_id*.

    Returned keys:
        node                 — GraphNode for the actor itself
        connected_wallets    — adjacent CryptoWallet nodes
        connected_malware    — adjacent MalwareFamily / RansomwareGroup nodes
        connected_forums     — adjacent Forum / OnionURL nodes
        co_actors            — nodes joined to the actor by a
                               LIKELY_SAME_ACTOR or CONFIRMED_SAME_ACTOR
                               edge, each listed once
        total_pages_appeared — length of the actor's source_urls list
        first_seen           — the actor's first_seen attribute
        last_seen            — the actor's last_seen attribute

    Adjacency ignores edge direction.  An unknown *actor_node_id* yields
    ``{}``.
    """
    if not _has_node(graph, actor_node_id):
        return {}

    actor_attrs = graph.nodes[actor_node_id]
    profile_node = _make_graphnode(actor_node_id, actor_attrs)

    wallets: list[GraphNode] = []
    malware: list[GraphNode] = []
    forums: list[GraphNode] = []

    # Which node types land in which bucket.
    buckets = (
        ((NODE_TYPES.CRYPTO_WALLET,), wallets),
        ((NODE_TYPES.MALWARE_FAMILY, NODE_TYPES.RANSOMWARE_GROUP), malware),
        ((NODE_TYPES.FORUM, NODE_TYPES.ONION_URL), forums),
    )

    # Every node that shares an edge with the actor, in either direction.
    adjacent: set[str] = set()
    adjacent.update(dst for _, dst in graph.out_edges(actor_node_id))
    adjacent.update(src for src, _ in graph.in_edges(actor_node_id))

    for other_id in adjacent:
        if other_id == actor_node_id:
            continue
        other_attrs = graph.nodes.get(other_id, {})
        other_kind = other_attrs.get("node_type", "")
        other_node = _make_graphnode(other_id, other_attrs)
        for kinds, bucket in buckets:
            if other_kind in kinds:
                bucket.append(other_node)
                break

    # Co-actors are picked by edge type rather than node type; keying by
    # node id keeps the first occurrence only.
    same_actor_edges = {EDGE_TYPES.LIKELY_SAME_ACTOR, EDGE_TYPES.CONFIRMED_SAME_ACTOR}
    co_actor_by_id: dict[str, GraphNode] = {}

    for _, dst, attrs in graph.out_edges(actor_node_id, data=True):
        if attrs.get("edge_type") in same_actor_edges and dst not in co_actor_by_id:
            co_actor_by_id[dst] = _make_graphnode(dst, graph.nodes[dst])

    for src, _, attrs in graph.in_edges(actor_node_id, data=True):
        if attrs.get("edge_type") in same_actor_edges and src not in co_actor_by_id:
            co_actor_by_id[src] = _make_graphnode(src, graph.nodes[src])

    return {
        "node": profile_node,
        "connected_wallets": wallets,
        "connected_malware": malware,
        "connected_forums": forums,
        "co_actors": list(co_actor_by_id.values()),
        "total_pages_appeared": len(actor_attrs.get("source_urls", [])),
        "first_seen": actor_attrs.get("first_seen"),
        "last_seen": actor_attrs.get("last_seen"),
    }
|