voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/export.py ADDED
@@ -0,0 +1,225 @@
1
+ """
2
+ graph/export.py — Export the relationship graph to external formats.
3
+
4
+ All functions accept an nx.MultiDiGraph as their first argument.
5
+ to_graphml and to_gephi_csv write files; to_json and summary_stats return data.
6
+
7
+ Public interface
8
+ ----------------
9
+ to_graphml(graph, filepath) → None
10
+ to_json(graph) → dict
11
+ to_gephi_csv(graph, nodes_path, edges_path) → None
12
+ summary_stats(graph) → dict
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import csv
18
+ import json
19
+ import logging
20
+ from collections import defaultdict
21
+ from datetime import datetime
22
+ from typing import Any
23
+
24
+ import networkx as nx
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Internal helpers
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
+ def _serialize_value(v: Any) -> Any:
35
+ """Convert a value to a JSON-serializable form."""
36
+ if isinstance(v, datetime):
37
+ return v.isoformat()
38
+ if isinstance(v, (list, dict)):
39
+ return v # already JSON-native (contents may recurse in to_json)
40
+ return v
41
+
42
+
43
def _node_to_dict(node_id: str, data: dict) -> dict:
    """Build the JSON record for one node from its attribute dict.

    The label is the node's ``label`` attribute when it is truthy, otherwise
    the part of *node_id* before the first ``@``.  Labels longer than 50
    characters are cut to the first 50.
    """
    label = data.get("label") or node_id.split("@")[0]
    if len(label) > 50:
        label = label[:50]

    record: dict = {"id": node_id, "label": label}
    record["type"] = data.get("node_type", "")
    record["confidence"] = data.get("confidence", 0.0)
    # Timestamps go through the shared serializer.
    record["first_seen"] = _serialize_value(data.get("first_seen"))
    record["last_seen"] = _serialize_value(data.get("last_seen"))
    record["source_urls"] = data.get("source_urls", [])
    record["metadata"] = data.get("metadata", {})
    return record
56
+
57
+
58
def _edge_to_dict(source: str, target: str, data: dict) -> dict:
    """Build the JSON record for one edge from its endpoints and attributes.

    Missing attributes fall back to ``""`` (type, source_url), ``0.0``
    (confidence), ``None`` (timestamp) and ``{}`` (metadata).
    """
    record: dict = {"source": source, "target": target}
    record["type"] = data.get("edge_type", "")
    record["confidence"] = data.get("confidence", 0.0)
    record["source_url"] = data.get("source_url", "")
    # The timestamp goes through the shared serializer.
    record["timestamp"] = _serialize_value(data.get("timestamp"))
    record["metadata"] = data.get("metadata", {})
    return record
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Helpers for GraphML (NetworkX only supports basic scalar types)
72
+ # ---------------------------------------------------------------------------
73
+
74
+
75
+ def _flatten_for_graphml(data: dict) -> dict:
76
+ """
77
+ Convert node/edge attributes to GraphML-compatible scalars.
78
+
79
+ NetworkX's GraphML writer supports str, int, float, bool — not datetime,
80
+ list, or dict. This function converts everything else to its JSON or
81
+ ISO-8601 string representation.
82
+ """
83
+ flat: dict = {}
84
+ for key, value in data.items():
85
+ if isinstance(value, datetime):
86
+ flat[key] = value.isoformat()
87
+ elif isinstance(value, (list, dict)):
88
+ flat[key] = json.dumps(value)
89
+ elif isinstance(value, (str, int, float, bool)) or value is None:
90
+ flat[key] = value if value is not None else ""
91
+ else:
92
+ flat[key] = str(value)
93
+ return flat
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Public functions
98
+ # ---------------------------------------------------------------------------
99
+
100
+
101
def to_graphml(graph: nx.MultiDiGraph, filepath: str) -> None:
    """
    Write *graph* to *filepath* as GraphML (readable by Gephi, yEd, Cytoscape).

    The input graph is not modified.  A fresh MultiDiGraph is built whose
    node and edge attributes have been passed through
    ``_flatten_for_graphml``, and that copy is what gets written.
    """
    flattened = nx.MultiDiGraph()

    for name, attrs in graph.nodes(data=True):
        flattened.add_node(name, **_flatten_for_graphml(attrs))

    # keys=True so parallel edges keep their original multigraph keys.
    for source, target, edge_key, attrs in graph.edges(data=True, keys=True):
        scalar_attrs = _flatten_for_graphml(attrs)
        flattened.add_edge(source, target, key=edge_key, **scalar_attrs)

    nx.write_graphml(flattened, filepath)
117
+
118
+
119
def to_json(graph: nx.MultiDiGraph) -> dict:
    """
    Return the graph as a dict with two lists, ``nodes`` and ``edges``.

    Schema:
        {
          "nodes": [{"id": ..., "type": ..., "first_seen": <ISO-8601>, ...}],
          "edges": [{"source": ..., "target": ..., "type": ..., ...}],
        }

    Node records come from ``_node_to_dict`` and edge records from
    ``_edge_to_dict``; those helpers serialise the timestamp fields.
    """
    payload: dict = {"nodes": [], "edges": []}

    for name, attrs in graph.nodes(data=True):
        payload["nodes"].append(_node_to_dict(name, attrs))

    for source, target, attrs in graph.edges(data=True):
        payload["edges"].append(_edge_to_dict(source, target, attrs))

    return payload
140
+
141
+
142
def to_gephi_csv(
    graph: nx.MultiDiGraph,
    nodes_path: str,
    edges_path: str,
) -> None:
    """
    Write the graph as two CSV tables for Gephi's node/edge importers.

    nodes.csv columns: Id, Label, Type, FirstSeen, LastSeen
    edges.csv columns: Source, Target, Type, Confidence, SourceUrl

    The node file is written first, then the edge file.  ``Label`` repeats
    the node id.  Timestamps are written as ISO-8601 when they are datetime
    objects, otherwise as ``str(value)`` (empty string for falsy values).
    """

    def _stamp(value: Any) -> str:
        if isinstance(value, datetime):
            return value.isoformat()
        return str(value or "")

    with open(nodes_path, "w", newline="", encoding="utf-8") as node_file:
        node_table = csv.writer(node_file)
        node_table.writerow(["Id", "Label", "Type", "FirstSeen", "LastSeen"])
        for name, attrs in graph.nodes(data=True):
            first_seen = attrs.get("first_seen")
            last_seen = attrs.get("last_seen")
            node_table.writerow([
                name,
                name,
                attrs.get("node_type", ""),
                _stamp(first_seen),
                _stamp(last_seen),
            ])

    with open(edges_path, "w", newline="", encoding="utf-8") as edge_file:
        edge_table = csv.writer(edge_file)
        edge_table.writerow(["Source", "Target", "Type", "Confidence", "SourceUrl"])
        for source, target, attrs in graph.edges(data=True):
            edge_table.writerow([
                source,
                target,
                attrs.get("edge_type", ""),
                attrs.get("confidence", ""),
                attrs.get("source_url", ""),
            ])
182
+
183
+
184
def summary_stats(graph: nx.MultiDiGraph) -> dict:
    """
    Compute headline counts for *graph*.

    Schema:
        {
          "total_nodes": int,
          "total_edges": int,
          "nodes_by_type": {"ThreatActor": N, ...},
          "edges_by_type": {"CO_APPEARED_ON": N, ...},
          "most_connected": [{"node_id": ..., "degree": N}],  # top 5
        }

    Nodes / edges whose type attribute is missing or falsy are left out of
    the per-type tallies (they still count toward the totals).
    """

    def _tally(attr_dicts, field_name: str) -> dict:
        counts: dict = {}
        for attrs in attr_dicts:
            kind = attrs.get(field_name, "")
            if not kind:
                continue
            counts[kind] = counts.get(kind, 0) + 1
        return counts

    node_counts = _tally((attrs for _, attrs in graph.nodes(data=True)), "node_type")
    edge_counts = _tally((attrs for _, _, attrs in graph.edges(data=True)), "edge_type")

    # Stable sort: nodes with equal degree keep graph iteration order.
    ranking = [(name, graph.degree(name)) for name in graph.nodes()]
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    top_five = []
    for name, degree in ranking[:5]:
        top_five.append({"node_id": name, "degree": degree})

    stats: dict = {}
    stats["total_nodes"] = graph.number_of_nodes()
    stats["total_edges"] = graph.number_of_edges()
    stats["nodes_by_type"] = node_counts
    stats["edges_by_type"] = edge_counts
    stats["most_connected"] = top_five
    return stats
graph/model.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ graph/model.py — Pure data definitions for the VoidAccess graph layer.
3
+
4
+ No graph logic here — only node/edge type constants and dataclasses.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ from typing import Any
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Node type constants
16
+ # ---------------------------------------------------------------------------
17
+
18
+
19
class NODE_TYPES:
    """String constants naming the kinds of node stored in the graph.

    Each value is the string written to a node's ``node_type`` attribute.
    The class is used purely as a namespace and is never instantiated in
    this module.
    """

    # Actors and groups
    THREAT_ACTOR = "ThreatActor"
    CRYPTO_WALLET = "CryptoWallet"
    ONION_URL = "OnionURL"
    FORUM = "Forum"
    MALWARE_FAMILY = "MalwareFamily"
    RANSOMWARE_GROUP = "RansomwareGroup"

    # Identifiers and artefacts
    PGP_KEY = "PGPKey"
    EMAIL_ADDRESS = "EmailAddress"
    CVE = "CVE"
    PASTE = "Paste"
    IP_ADDRESS = "IPAddress"
    PHONE_NUMBER = "PhoneNumber"
    ORGANIZATION = "Organization"
    DATE = "Date"
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Edge type constants
38
+ # ---------------------------------------------------------------------------
39
+
40
+
41
class EDGE_TYPES:
    """String constants naming the kinds of relationship edge in the graph.

    Each value is the string written to an edge's ``edge_type`` attribute.
    """

    # Observed relationships
    CO_APPEARED_ON = "CO_APPEARED_ON"  # two entities on the same page
    POSTED_BY = "POSTED_BY"  # content attributed to a handle
    LINKED_TO = "LINKED_TO"  # URL links to URL
    MEMBER_OF = "MEMBER_OF"  # handle to group/forum
    USED = "USED"  # actor used a malware family
    CLAIMED = "CLAIMED"  # group claimed an attack

    # Identity correlation
    LIKELY_SAME_ACTOR = "LIKELY_SAME_ACTOR"  # inferred, medium confidence
    CONFIRMED_SAME_ACTOR = "CONFIRMED_SAME_ACTOR"  # PGP key match, high confidence

    # Entities found in the same investigation across multiple pages
    CO_INVESTIGATION = "CO_INVESTIGATION"

    # Financial transactions
    PAID_TO = "PAID_TO"
    FUNDED_BY = "FUNDED_BY"
53
+
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Dataclasses
58
+ # ---------------------------------------------------------------------------
59
+
60
+
61
@dataclass
class GraphNode:
    """One entity in the relationship graph.

    Attributes:
        node_id: Canonical value of the entity (wallet address, handle, etc.).
        node_type: One of the ``NODE_TYPES`` constants.
        first_seen: Timestamp associated with the first sighting.
        last_seen: Timestamp associated with the latest sighting.
        source_urls: URLs recorded for the entity; defaults to a new list.
        metadata: Free-form extra attributes; defaults to a new dict.
    """

    node_id: str
    node_type: str
    first_seen: datetime
    last_seen: datetime
    # default_factory gives every instance its own container.
    source_urls: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
71
+
72
+
73
@dataclass
class GraphEdge:
    """One directed relationship between two nodes in the graph.

    Attributes:
        source_id: ``node_id`` of the node the edge starts from.
        target_id: ``node_id`` of the node the edge points to.
        edge_type: One of the ``EDGE_TYPES`` constants.
        confidence: Score in the range 0.0–1.0.
        source_url: Page where the relationship was observed.
        timestamp: Time attached to the observation.
        metadata: Free-form extra attributes; defaults to a new dict.
    """

    source_id: str
    target_id: str
    edge_type: str
    confidence: float
    source_url: str
    timestamp: datetime
    # default_factory gives every instance its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
graph/queries.py ADDED
@@ -0,0 +1,297 @@
1
+ """
2
+ graph/queries.py — Named query functions that operate on a NetworkX graph.
3
+
4
+ All functions are pure (no side effects, no DB calls).
5
+ All accept a graph as their first argument and return data.
6
+ None of these functions modify the graph.
7
+
8
+ Public interface
9
+ ----------------
10
+ get_neighbors(graph, node_id, hops, edge_types) → dict[str, list[GraphNode]]
11
+ find_nodes_by_type(graph, node_type) → list[GraphNode]
12
+ find_co_occurring_entities(graph, node_id) → list[tuple[GraphNode, int]]
13
+ get_new_nodes_since(graph, since) → list[GraphNode]
14
+ find_high_degree_nodes(graph, top_n, node_type) → list[tuple[GraphNode, int]]
15
+ get_shortest_path(graph, source_id, target_id) → list[GraphNode] | None
16
+ get_actor_profile(graph, actor_node_id) → dict
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from collections import defaultdict, deque
22
+ from datetime import datetime
23
+ from typing import Optional
24
+
25
+ import networkx as nx
26
+
27
+ from graph.model import EDGE_TYPES, NODE_TYPES, GraphNode
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Internal helpers
32
+ # ---------------------------------------------------------------------------
33
+
34
+
35
def _make_graphnode(node_id: str, data: dict) -> GraphNode:
    """Reconstruct a GraphNode from a raw NetworkX node attribute dict.

    Missing attributes get defaults: ``""`` for the type, the current time
    for both timestamps, and empty containers for ``source_urls`` and
    ``metadata``.  An attribute explicitly stored as ``None`` for
    ``source_urls`` / ``metadata`` is treated like a missing one rather than
    raising ``TypeError``.  The containers are copied, so changing the
    returned node does not touch the graph's own attribute dict.
    """
    # One timestamp shared by both defaults: a node without recorded times
    # must not end up with first_seen microseconds earlier than last_seen.
    # NOTE(review): kept as naive UTC (utcnow) on the assumption that stored
    # timestamps are naive too — confirm before moving to aware datetimes.
    now = datetime.utcnow()
    return GraphNode(
        node_id=node_id,
        node_type=data.get("node_type", ""),
        first_seen=data.get("first_seen", now),
        last_seen=data.get("last_seen", now),
        source_urls=list(data.get("source_urls") or []),
        metadata=dict(data.get("metadata") or {}),
    )
45
+
46
+
47
def _has_node(graph: nx.MultiDiGraph, node_id: str) -> bool:
    """Return True when *node_id* is a node of *graph*."""
    return node_id in graph
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Public functions
53
+ # ---------------------------------------------------------------------------
54
+
55
+
56
def get_neighbors(
    graph: nx.MultiDiGraph,
    node_id: str,
    hops: int = 2,
    edge_types: Optional[list[str]] = None,
) -> dict[str, list[GraphNode]]:
    """
    Breadth-first search outward from *node_id*, up to *hops* steps away.

    Edges are followed in both directions, so relationships with
    undirected meaning (e.g. CO_APPEARED_ON) are explored whichever way
    they were stored.

    When *edge_types* is given, only edges whose ``edge_type`` attribute is
    in that list are followed.

    The result groups the discovered nodes by distance, with the distance
    as a string key:
        {"1": [GraphNode, ...], "2": [GraphNode, ...], ...}
    The start node itself is never included.  An unknown *node_id* yields
    an empty dict.
    """
    if not _has_node(graph, node_id):
        return {}

    def _allowed(attrs: dict) -> bool:
        return edge_types is None or attrs.get("edge_type") in edge_types

    distance_of: dict[str, int] = {node_id: 0}  # node → hop at which it was found
    pending: deque[tuple[str, int]] = deque([(node_id, 0)])
    by_hop: dict[str, list[GraphNode]] = {}

    while pending:
        here, dist = pending.popleft()
        if dist >= hops:
            # Already at the hop limit; do not expand further.
            continue

        # Gather adjacent nodes: successors first, then predecessors.
        adjacent: set[str] = set()
        for _, succ, attrs in graph.out_edges(here, data=True):
            if _allowed(attrs):
                adjacent.add(succ)
        for pred, _, attrs in graph.in_edges(here, data=True):
            if _allowed(attrs):
                adjacent.add(pred)

        next_dist = dist + 1
        for other in adjacent:
            if other == node_id or other in distance_of:
                continue
            distance_of[other] = next_dist
            bucket = by_hop.setdefault(str(next_dist), [])
            bucket.append(_make_graphnode(other, graph.nodes[other]))
            pending.append((other, next_dist))

    return by_hop
110
+
111
+
112
def find_nodes_by_type(
    graph: nx.MultiDiGraph,
    node_type: str,
) -> list[GraphNode]:
    """Collect every node whose ``node_type`` attribute equals *node_type*.

    Nodes are returned in graph iteration order.
    """
    matches: list[GraphNode] = []
    for name, attrs in graph.nodes(data=True):
        if attrs.get("node_type") == node_type:
            matches.append(_make_graphnode(name, attrs))
    return matches
122
+
123
+
124
def find_co_occurring_entities(
    graph: nx.MultiDiGraph,
    node_id: str,
) -> list[tuple[GraphNode, int]]:
    """
    Return a list of (GraphNode, co_occurrence_count) for every *other* node
    that co-occurs with *node_id* via CO_APPEARED_ON edges.

    Co-occurrence count = number of CO_APPEARED_ON edges connecting the pair
    (in either direction).  Results are sorted by count descending; ties
    keep the order in which the nodes were first encountered.

    A CO_APPEARED_ON self-loop on *node_id* is ignored: it is reported by
    both ``out_edges`` and ``in_edges``, so counting it would list the node
    as co-occurring with itself with a doubled count.

    Returns an empty list when *node_id* is not in the graph.
    """
    if not _has_node(graph, node_id):
        return []

    counts: dict[str, int] = defaultdict(int)

    for _, nbr, data in graph.out_edges(node_id, data=True):
        if nbr != node_id and data.get("edge_type") == EDGE_TYPES.CO_APPEARED_ON:
            counts[nbr] += 1

    for pred, _, data in graph.in_edges(node_id, data=True):
        if pred != node_id and data.get("edge_type") == EDGE_TYPES.CO_APPEARED_ON:
            counts[pred] += 1

    result = [
        (_make_graphnode(nid, graph.nodes[nid]), count)
        for nid, count in counts.items()
    ]
    result.sort(key=lambda pair: pair[1], reverse=True)
    return result
154
+
155
+
156
def get_new_nodes_since(
    graph: nx.MultiDiGraph,
    since: datetime,
) -> list[GraphNode]:
    """Collect the nodes whose ``first_seen`` is at or after *since*.

    Nodes with no ``first_seen`` attribute (or with it set to ``None``) are
    skipped.
    """

    def _is_recent(attrs: dict) -> bool:
        stamp = attrs.get("first_seen")
        if stamp is None:
            return False
        return stamp >= since

    return [
        _make_graphnode(name, attrs)
        for name, attrs in graph.nodes(data=True)
        if _is_recent(attrs)
    ]
167
+
168
+
169
def find_high_degree_nodes(
    graph: nx.MultiDiGraph,
    top_n: int = 10,
    node_type: Optional[str] = None,
) -> list[tuple[GraphNode, int]]:
    """
    Rank nodes by total degree (in + out) and return the first *top_n*.

    When *node_type* is given, only nodes of that type are ranked.
    Pairs are ``(GraphNode, degree)``, highest degree first; nodes with the
    same degree keep graph iteration order.
    """
    eligible = []
    for name, attrs in graph.nodes(data=True):
        if node_type is None or attrs.get("node_type") == node_type:
            eligible.append((name, attrs))

    ranked = []
    for name, attrs in eligible:
        ranked.append((_make_graphnode(name, attrs), graph.degree(name)))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
192
+
193
+
194
def get_shortest_path(
    graph: nx.MultiDiGraph,
    source_id: str,
    target_id: str,
) -> Optional[list[GraphNode]]:
    """
    Return a shortest path between *source_id* and *target_id*.

    Edge direction is ignored, so a path is found whichever way the edges
    were stored.  The path is returned as GraphNodes from source to target,
    both endpoints included.

    Returns None if no path exists, if either node is absent, or if the
    search fails for any other reason.
    """
    if not _has_node(graph, source_id) or not _has_node(graph, target_id):
        return None

    try:
        # as_view=True gives a read-only undirected view rather than a deep
        # copy of every node, edge and attribute dict on each call.
        undirected = graph.to_undirected(as_view=True)
        path_ids: list[str] = nx.shortest_path(undirected, source_id, target_id)
        return [_make_graphnode(nid, graph.nodes[nid]) for nid in path_ids]
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Expected outcomes: the two nodes are not connected.
        return None
    except Exception:
        # NOTE(review): blanket best-effort fallback kept so callers keep
        # getting None instead of an exception; it also hides real bugs and
        # is a candidate for logging or removal.
        return None
218
+
219
+
220
def get_actor_profile(
    graph: nx.MultiDiGraph,
    actor_node_id: str,
) -> dict:
    """
    Return a structured profile dict for a ThreatActor node.

    Keys:
        node                 — GraphNode for the actor
        connected_wallets    — list[GraphNode] of CryptoWallet neighbours
        connected_malware    — list[GraphNode] of MalwareFamily/RansomwareGroup
        connected_forums     — list[GraphNode] of Forum/OnionURL neighbours
        co_actors            — list[GraphNode] connected via LIKELY/CONFIRMED_SAME_ACTOR
        total_pages_appeared — number of unique source_urls on the node
        first_seen           — datetime
        last_seen            — datetime

    Neighbours are taken over both outgoing and incoming edges, each listed
    once, in the order they are first met (outgoing edges before incoming).
    The actor itself is never listed, even when it carries a self-loop.

    Returns an empty dict when *actor_node_id* is not in the graph.
    """
    if not _has_node(graph, actor_node_id):
        return {}

    node_data = graph.nodes[actor_node_id]
    actor_node = _make_graphnode(actor_node_id, node_data)

    same_actor_types = {EDGE_TYPES.LIKELY_SAME_ACTOR, EDGE_TYPES.CONFIRMED_SAME_ACTOR}

    # Dicts used as insertion-ordered sets: they de-duplicate like a set but
    # iterate in a stable order that does not depend on string hashing.
    adjacent: dict[str, None] = {}
    co_actor_ids: dict[str, None] = {}

    def _record(other: str, edge_data: dict) -> None:
        if other == actor_node_id:
            return  # self-loop: the actor is not its own neighbour
        adjacent[other] = None
        if edge_data.get("edge_type") in same_actor_types:
            co_actor_ids[other] = None

    for _, nbr, edge_data in graph.out_edges(actor_node_id, data=True):
        _record(nbr, edge_data)
    for pred, _, edge_data in graph.in_edges(actor_node_id, data=True):
        _record(pred, edge_data)

    connected_wallets: list[GraphNode] = []
    connected_malware: list[GraphNode] = []
    connected_forums: list[GraphNode] = []

    # Sort neighbours into buckets by their node type; other types are ignored.
    for nbr_id in adjacent:
        nbr_data = graph.nodes.get(nbr_id, {})
        nbr_type = nbr_data.get("node_type", "")

        if nbr_type == NODE_TYPES.CRYPTO_WALLET:
            bucket = connected_wallets
        elif nbr_type in (NODE_TYPES.MALWARE_FAMILY, NODE_TYPES.RANSOMWARE_GROUP):
            bucket = connected_malware
        elif nbr_type in (NODE_TYPES.FORUM, NODE_TYPES.ONION_URL):
            bucket = connected_forums
        else:
            continue
        bucket.append(_make_graphnode(nbr_id, nbr_data))

    # Co-actors are selected by edge type, regardless of the neighbour's type.
    co_actors = [
        _make_graphnode(nid, graph.nodes[nid])
        for nid in co_actor_ids
    ]

    # De-duplicate so the figure matches the documented "unique" count; an
    # explicit None is treated as "no URLs".
    unique_urls = set(node_data.get("source_urls") or [])

    return {
        "node": actor_node,
        "connected_wallets": connected_wallets,
        "connected_malware": connected_malware,
        "connected_forums": connected_forums,
        "co_actors": co_actors,
        "total_pages_appeared": len(unique_urls),
        "first_seen": node_data.get("first_seen"),
        "last_seen": node_data.get("last_seen"),
    }