voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/enrichment.py ADDED
@@ -0,0 +1,1244 @@
1
+ """
2
+ Threat intelligence enrichment — OTX (AlienVault) and abuse.ch (MalwareBazaar,
3
+ ThreatFox, URLhaus).
4
+
5
+ Returns page-shaped dicts compatible with ``extract_entities_from_pages`` (``url``,
6
+ ``text`` / ``content``, plus ``link``, ``status``, ``source`` for traceability).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import logging
13
+ import os
14
+ import re
15
+ from typing import Any, Optional
16
+ from urllib.parse import urlparse
17
+
18
+ import aiohttp
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ OTX_BASE_URL = "https://otx.alienvault.com/api/v1"
23
+
24
+
25
def is_onion_url(url: str) -> bool:
    """Return True if *url* points to a Tor hidden service (``.onion`` host).

    NOTE: this module defines ``is_onion_url`` twice; the later definition is
    the one that is bound at import time. This copy is kept behaviourally
    aligned with it so the result does not depend on which one is in effect.

    Args:
        url: Absolute URL to inspect. Surrounding whitespace is ignored.

    Returns:
        True when the parsed hostname ends with ``.onion``. Empty or
        non-string input yields False. If the URL cannot be parsed at all,
        falls back to a plain substring check for ``.onion``.
    """
    if not url or not isinstance(url, str):
        return False
    try:
        # Uses the module-level ``urlparse`` import; no local re-import needed.
        host = (urlparse(url.strip()).hostname or "").lower()
        return host.endswith(".onion")
    except Exception:
        # urlparse raises ValueError on malformed netlocs (e.g. bad IPv6
        # brackets); be permissive rather than drop a likely onion address.
        return ".onion" in url.lower()
36
+ MALWAREBAZAAR_URL = "https://mb-api.abuse.ch/api/v1/"
37
+ URLHAUS_URL = "https://urlhaus-api.abuse.ch/v1/"
38
+ THREATFOX_URL = "https://threatfox-api.abuse.ch/api/v1/"
39
+
40
+ # All HTTP calls use at most 30s client timeout (enforced per request).
41
+
42
+
43
def _abusech_headers() -> dict[str, str]:
    """Build the abuse.ch authentication header from the environment.

    Reads ``ABUSECH_API_KEY``; surrounding whitespace is stripped.

    Returns:
        ``{"Auth-Key": <key>}`` when a non-blank key is configured, otherwise
        an empty dict.
    """
    raw_key = os.environ.get("ABUSECH_API_KEY")
    if not raw_key:
        return {}
    api_key = raw_key.strip()
    if not api_key:
        return {}
    return {"Auth-Key": api_key}
46
+
47
+
48
def is_onion_url(url: str) -> bool:
    """Tell whether *url* addresses a Tor hidden service.

    The decision is based on the parsed hostname ending in ``.onion``.
    Empty or non-string input is rejected. When parsing itself fails, the
    function falls back to looking for ``.onion`` anywhere in the string.
    """
    if not isinstance(url, str) or not url:
        return False
    try:
        hostname = urlparse(url.strip()).hostname
        normalized = (hostname or "").lower()
        return normalized.endswith(".onion")
    except Exception:
        return ".onion" in url.lower()
60
+
61
+
62
async def fetch_otx_pulses(query: str, api_key: str, limit: int = 20) -> list[dict]:
    """Search OTX for threat pulses related to *query*.

    Args:
        query: Free-text search term sent as the ``q`` parameter.
        api_key: OTX API key. Blank/None disables the lookup.
        limit: Page size requested from the OTX search endpoint.

    Returns:
        A list of dicts with pulse metadata. The first five pulses that have
        an id additionally get their ``indicators`` list populated; all other
        pulses carry an empty ``indicators`` list. Any HTTP/network error is
        logged and whatever was collected so far is returned (never raises).
    """
    key = (api_key or "").strip()
    if not key:
        logger.debug("OTX skipped — no API key configured")
        return []

    headers = {"X-OTX-API-KEY": key}
    results: list[dict] = []

    try:
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            url = f"{OTX_BASE_URL}/search/pulses"
            params = {"q": query, "limit": limit, "page": 1}

            async with session.get(url, params=params) as resp:
                if resp.status != 200:
                    logger.warning("OTX pulse search returned HTTP %s", resp.status)
                    return []
                data = await resp.json()

            # ``results`` may be present but null; treat that as "no pulses".
            pulses = data.get("results") or []
            logger.info("OTX: %d results", len(pulses))

            for pulse in pulses:
                if not isinstance(pulse, dict):
                    # Skip malformed entries instead of aborting the batch.
                    continue
                results.append(
                    {
                        "source": "otx_pulse",
                        "pulse_id": pulse.get("id"),
                        "title": pulse.get("name", ""),
                        "description": pulse.get("description", ""),
                        "tags": pulse.get("tags", []),
                        "created": pulse.get("created"),
                        "modified": pulse.get("modified"),
                        "tlp": pulse.get("tlp", "white"),
                        "indicator_count": pulse.get("indicator_count", 0),
                        # Passed through as-is (list of str or list of dict);
                        # otx_pulse_to_page handles both shapes.
                        "malware_families": pulse.get("malware_families") or [],
                        "attack_ids": [
                            a.get("display_name")
                            for a in (pulse.get("attack_ids") or [])
                            if isinstance(a, dict)
                        ],
                        "indicators": [],
                    }
                )

            # Only the first five pulses are enriched with IOCs. Pulses without
            # an id are skipped so we never request ``/pulses/None/indicators``.
            to_enrich = [r for r in results[:5] if r["pulse_id"]]
            # The helper swallows its own errors and returns [], so gather()
            # cannot fail because of a single bad pulse.
            fetched = await asyncio.gather(
                *(
                    fetch_otx_pulse_indicators(str(r["pulse_id"]), key, session)
                    for r in to_enrich
                )
            )
            for pulse_result, indicators in zip(to_enrich, fetched):
                pulse_result["indicators"] = indicators

    except asyncio.TimeoutError:
        logger.warning("OTX: Request timed out")
    except aiohttp.ClientError as e:
        logger.warning("OTX: Client error: %s", e)
    except Exception as e:
        logger.warning("OTX: Error fetching pulses: %s", e)

    return results
131
+
132
+
133
async def fetch_otx_pulse_indicators(
    pulse_id: str, api_key: str, session: aiohttp.ClientSession
) -> list[dict]:
    """Retrieve the indicators (IOCs) attached to one OTX pulse.

    Args:
        pulse_id: Identifier of the pulse to look up.
        api_key: OTX API key, sent as the ``X-OTX-API-KEY`` request header.
        session: Open client session used to issue the request.

    Returns:
        One dict per indicator with ``type``, ``value``, ``description`` and
        ``created``; entries without an indicator value are dropped. Returns
        an empty list on a non-200 response or on any error (logged at debug).
    """
    try:
        request_headers = {"X-OTX-API-KEY": api_key}
        endpoint = f"{OTX_BASE_URL}/pulses/{pulse_id}/indicators"

        async with session.get(endpoint, headers=request_headers) as response:
            if response.status != 200:
                return []

            payload = await response.json()
            iocs: list[dict] = []
            for entry in payload.get("results", []):
                if not entry.get("indicator"):
                    continue
                iocs.append(
                    {
                        "type": entry.get("type"),
                        "value": entry.get("indicator"),
                        "description": entry.get("description", ""),
                        "created": entry.get("created"),
                    }
                )
            return iocs

    except Exception as e:
        logger.debug("OTX: Error fetching indicators for pulse %s: %s", pulse_id, e)
        return []
162
+
163
+
164
def otx_pulse_to_page(pulse: dict) -> dict:
    """Convert an OTX pulse to a page-shaped dict for the entity extractor.

    Args:
        pulse: Pulse dict as produced by ``fetch_otx_pulses`` (keys ``title``,
            ``description``, ``tags``, ``malware_families``, ``attack_ids``,
            ``indicators``, ``pulse_id``). All keys are optional.

    Returns:
        Dict with ``link``/``url`` (OTX pulse URL), ``content``/``text`` (the
        rendered report), ``status`` 200, ``source``, ``title`` and ``via``.
    """
    lines: list[str] = []

    title = pulse.get("title") or ""
    if title:
        lines.append(f"Threat Report: {title}")

    description = pulse.get("description")
    if description:
        lines.append(f"\nDescription: {description}")

    # Coerce to str and drop empty values: the API payload is not guaranteed
    # to contain only strings, and str.join raises on anything else.
    tags = [str(t) for t in (pulse.get("tags") or []) if t]
    if tags:
        lines.append(f"\nTags: {', '.join(tags)}")

    families: list[str] = []
    for m in pulse.get("malware_families") or []:
        # OTX returns either plain names or dicts with display_name/name.
        if isinstance(m, dict):
            name = m.get("display_name") or m.get("name") or ""
        elif isinstance(m, str):
            name = m
        else:
            name = ""
        if name:
            families.append(str(name))
    if families:
        lines.append(f"\nMalware Families: {', '.join(families)}")

    # fetch_otx_pulses builds this list with ``a.get("display_name")``, which
    # can yield None; filter those out before joining.
    attack_ids = [str(a) for a in (pulse.get("attack_ids") or []) if a]
    if attack_ids:
        lines.append(f"\nMITRE ATT&CK: {', '.join(attack_ids)}")

    indicators = pulse.get("indicators") or []
    if indicators:
        lines.append("\nIndicators of Compromise:")
        for ind in indicators:
            if not isinstance(ind, dict):
                continue
            ind_type = ind.get("type", "")
            ind_value = ind.get("value", "")
            ind_desc = ind.get("description", "")
            if ind_value:
                extra = f" ({ind_desc})" if ind_desc else ""
                lines.append(f" {ind_type}: {ind_value}{extra}")

    content = "\n".join(lines)
    pid = pulse.get("pulse_id") or ""
    link = f"https://otx.alienvault.com/pulse/{pid}"

    return {
        "link": link,
        "url": link,
        "content": content,
        "text": content,
        "status": 200,
        "source": "alienvault_otx",
        # ``or`` (not a .get default) so an empty title also gets the fallback.
        "title": title or "OTX Threat Report",
        "via": "otx_api",
    }
217
+
218
+
219
async def fetch_malwarebazaar(query: str, limit: int = 20) -> list[dict]:
    """Query MalwareBazaar for samples.

    With a non-empty *query* the API is searched by tag first and, if that
    yields nothing, by signature. With an empty query the most recent samples
    are fetched instead (``get_recent``); *limit* is not applied in that mode.

    Args:
        query: Tag or signature name; blank means "recent samples".
        limit: Maximum number of samples requested for tag/signature lookups.

    Returns:
        A list of sample dicts (``source``, ``sha256``, ``md5``, ``file_name``,
        ``file_type``, ``signature``, ``tags``, ``malware_family``,
        ``first_seen``, ``last_seen``, ``reporter``, ``comment``). Errors are
        logged and result in an empty or partial list; nothing is raised.
    """

    def _map_sample(sample: dict) -> dict:
        # Single mapping shared by every query mode so all results have the
        # same shape.
        return {
            "source": "malwarebazaar",
            "sha256": sample.get("sha256_hash"),
            "md5": sample.get("md5_hash"),
            "file_name": sample.get("file_name"),
            "file_type": sample.get("file_type"),
            "signature": sample.get("signature"),
            "tags": sample.get("tags", []),
            "malware_family": sample.get("signature", ""),
            "first_seen": sample.get("first_seen"),
            "last_seen": sample.get("last_seen"),
            "reporter": sample.get("reporter", ""),
            "comment": sample.get("comment", ""),
        }

    results: list[dict] = []
    q = (query or "").strip()

    if not q:
        # No search term: fetch the most recent samples.
        try:
            headers = _abusech_headers()
            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
                payload = {"query": "get_recent", "selector": "time"}
                async with session.post(MALWAREBAZAAR_URL, data=payload) as resp:
                    if resp.status != 200:
                        logger.warning("MalwareBazaar: HTTP %s (recent)", resp.status)
                        return []
                    data = await resp.json()

            status = data.get("query_status")
            if status == "no_api_key":
                logger.warning(
                    "MalwareBazaar: no_api_key — set ABUSECH_API_KEY for abuse.ch APIs"
                )
                return []
            if status != "ok":
                return []
            return [_map_sample(sample) for sample in (data.get("data") or [])]
        except Exception as e:
            logger.warning("MalwareBazaar recent fetch failed: %s", e)
            return []

    headers = _abusech_headers()
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            tag_payload = {"query": "get_taginfo", "tag": q, "limit": limit}
            async with session.post(MALWAREBAZAAR_URL, data=tag_payload) as resp:
                if resp.status != 200:
                    # Not fatal: fall through to the signature lookup.
                    logger.warning("MalwareBazaar: HTTP %s (tag)", resp.status)
                else:
                    data = await resp.json()
                    if data.get("query_status") == "no_api_key":
                        logger.warning(
                            "MalwareBazaar: no_api_key — set ABUSECH_API_KEY for abuse.ch APIs"
                        )
                        return []
                    if data.get("query_status") == "ok":
                        samples = data.get("data") or []
                        logger.info("MalwareBazaar: %d samples (tag)", len(samples))
                        for sample in samples:
                            results.append(_map_sample(sample))
            if results:
                return results

            # Tag lookup produced nothing; retry the term as a signature name.
            sig_payload = {"query": "get_siginfo", "signature": q, "limit": limit}
            async with session.post(MALWAREBAZAAR_URL, data=sig_payload) as resp:
                if resp.status != 200:
                    logger.warning("MalwareBazaar: HTTP %s (signature)", resp.status)
                    return []
                data = await resp.json()
                if data.get("query_status") != "ok":
                    return []
                samples = data.get("data") or []
                logger.info("MalwareBazaar: %d samples (signature)", len(samples))
                for sample in samples:
                    results.append(_map_sample(sample))

    except asyncio.TimeoutError:
        logger.warning("MalwareBazaar: Request timed out")
    except aiohttp.ClientError as e:
        logger.warning("MalwareBazaar: Client error: %s", e)
    except Exception as e:
        logger.warning("MalwareBazaar: Error: %s", e)

    return results
311
+
312
+
313
async def fetch_threatfox(query: str, limit: int = 50) -> list[dict]:
    """Search ThreatFox IOCs by search term, or list recent IOCs.

    With a non-empty *query* the ``search_ioc`` endpoint is used. With an
    empty query the IOCs of the last day are fetched (``get_iocs``).

    Args:
        query: Search term; blank means "IOCs from the last 24 hours".
        limit: Maximum number of IOCs returned.

    Returns:
        A list of IOC dicts (``source``, ``ioc_type``, ``ioc_value``,
        ``malware``, ``malware_printable``, ``confidence`` in the 0..1 range,
        ``first_seen``, ``last_seen``, ``tags``, ``comment``, ``reporter``).
        Errors are logged and yield an empty or partial list.
    """

    def _confidence(raw: object) -> float:
        # ThreatFox reports confidence as 0..100; normalise to 0..1. A single
        # malformed value must not abort the whole batch.
        if raw is None:
            return 0.0
        try:
            return float(raw) / 100.0
        except (TypeError, ValueError):
            return 0.0

    def _map_ioc(ioc: dict) -> dict:
        return {
            "source": "threatfox",
            "ioc_type": ioc.get("ioc_type"),
            "ioc_value": ioc.get("ioc"),
            "malware": ioc.get("malware"),
            "malware_printable": ioc.get("malware_printable"),
            "confidence": _confidence(ioc.get("confidence_level")),
            "first_seen": ioc.get("first_seen"),
            "last_seen": ioc.get("last_seen"),
            "tags": ioc.get("tags", []),
            "comment": ioc.get("comment", ""),
            "reporter": ioc.get("reporter", ""),
        }

    results: list[dict] = []
    q = (query or "").strip()

    if not q:
        # No search term: fetch the most recent IOCs (last 24 hours).
        payload = {"query": "get_iocs", "days": 1}
        try:
            headers = _abusech_headers()
            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
                async with session.post(THREATFOX_URL, json=payload) as resp:
                    if resp.status != 200:
                        logger.warning("ThreatFox: HTTP %s (recent)", resp.status)
                        return []
                    data = await resp.json()

            status = data.get("query_status")
            if status == "no_api_key":
                logger.warning(
                    "ThreatFox: no_api_key — set ABUSECH_API_KEY for abuse.ch APIs"
                )
                return []
            if status != "ok":
                return []
            for ioc in (data.get("data") or [])[:limit]:
                mapped = _map_ioc(ioc)
                # Historically this mode exposed the printable name under
                # ``malware``; keep that for existing consumers while also
                # providing ``malware_printable`` (read by abusech_to_pages).
                mapped["malware"] = ioc.get("malware_printable")
                results.append(mapped)
            return results
        except Exception as e:
            logger.warning("ThreatFox recent fetch failed: %s", e)
            return []

    headers = _abusech_headers()
    timeout = aiohttp.ClientTimeout(total=30)
    payload = {"query": "search_ioc", "search_term": q}

    try:
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            async with session.post(THREATFOX_URL, json=payload) as resp:
                if resp.status != 200:
                    logger.warning("ThreatFox: HTTP %s", resp.status)
                    return []

                data = await resp.json()
                if data.get("query_status") == "no_api_key":
                    logger.warning(
                        "ThreatFox: no_api_key — set ABUSECH_API_KEY for abuse.ch APIs"
                    )
                    return []
                if data.get("query_status") != "ok":
                    return []

                iocs = data.get("data") or []
                logger.info("ThreatFox: %d results", len(iocs))

                for ioc in iocs[:limit]:
                    results.append(_map_ioc(ioc))

    except asyncio.TimeoutError:
        logger.warning("ThreatFox: Request timed out")
    except aiohttp.ClientError as e:
        logger.warning("ThreatFox: Client error: %s", e)
    except Exception as e:
        logger.warning("ThreatFox: Error: %s", e)

    return results
396
+
397
+
398
async def fetch_urlhaus(query: str, limit: int = 20) -> list[dict]:
    """Look up malicious URLs on URLhaus that carry the tag *query*.

    Args:
        query: Tag to search for; a blank value short-circuits to ``[]``.
        limit: Maximum number of URL entries taken from the response.

    Returns:
        A list of dicts with ``source``, ``url``, ``url_status``, ``tags``,
        ``threat``, ``date_added`` and ``reporter``. Failures are logged and
        produce an empty (or partially filled) list instead of raising.
    """
    tag = (query or "").strip()
    if not tag:
        return []

    collected: list[dict] = []
    auth_headers = _abusech_headers()
    client_timeout = aiohttp.ClientTimeout(total=30)
    form = {"tag": tag}

    try:
        async with aiohttp.ClientSession(
            headers=auth_headers, timeout=client_timeout
        ) as session:
            async with session.post(f"{URLHAUS_URL}tag/", data=form) as response:
                if response.status != 200:
                    logger.warning("URLhaus: HTTP %s", response.status)
                    return []

                body = await response.json()
                if body.get("query_status") == "no_api_key":
                    logger.warning(
                        "URLhaus: no_api_key — set ABUSECH_API_KEY for abuse.ch APIs"
                    )
                    return []
                if body.get("query_status") != "ok":
                    return []

                entries = (body.get("urls") or [])[:limit]
                logger.info("URLhaus: %d results", len(entries))

                for entry in entries:
                    record = {
                        "source": "urlhaus",
                        "url": entry.get("url"),
                        "url_status": entry.get("url_status"),
                        "tags": entry.get("tags", []),
                        "threat": entry.get("threat"),
                        "date_added": entry.get("date_added"),
                        "reporter": entry.get("reporter", ""),
                    }
                    collected.append(record)

    except asyncio.TimeoutError:
        logger.warning("URLhaus: Request timed out")
    except aiohttp.ClientError as e:
        logger.warning("URLhaus: Client error: %s", e)
    except Exception as e:
        logger.warning("URLhaus: Error: %s", e)

    return collected
449
+
450
+
451
def _abusech_join_tags(tags: object) -> str:
    """Render a tag collection as ``"a, b, c"``; tolerate None and non-str items."""
    if isinstance(tags, str):
        return tags
    if not isinstance(tags, (list, tuple, set)):
        return ""
    return ", ".join(str(t) for t in tags if t)


def _abusech_page(link: str, source: str, lines: list[str]) -> dict:
    """Wrap rendered report *lines* in the page-shaped dict the extractor expects."""
    content = "\n".join(lines)
    return {
        "link": link,
        "url": link,
        "content": content,
        "text": content,
        "status": 200,
        "source": source,
        "via": "abusech_api",
    }


def abusech_to_pages(
    malwarebazaar_results: list[dict],
    threatfox_results: list[dict],
    urlhaus_results: list[dict],
) -> list[dict]:
    """Group abuse.ch results into page-shaped dicts (one page per feed).

    Args:
        malwarebazaar_results: Samples from ``fetch_malwarebazaar`` (first 20 used).
        threatfox_results: IOCs from ``fetch_threatfox`` (first 30 used).
        urlhaus_results: URL entries from ``fetch_urlhaus`` (first 20 used).

    Returns:
        Up to three page dicts, in the order MalwareBazaar, ThreatFox, URLhaus;
        feeds with no results are omitted.
    """
    pages: list[dict] = []

    if malwarebazaar_results:
        lines = ["MalwareBazaar Threat Intelligence Report\n"]
        for sample in malwarebazaar_results[:20]:
            # ``or`` rather than a .get default: the key is usually present
            # with a None/empty value, which a default would not replace.
            lines.append(f"Malware Family: {sample.get('malware_family') or 'Unknown'}")
            if sample.get("sha256"):
                lines.append(f"SHA256: {sample['sha256']}")
            tags = _abusech_join_tags(sample.get("tags"))
            if tags:
                lines.append(f"Tags: {tags}")
            if sample.get("reporter"):
                lines.append(f"Reporter: {sample['reporter']}")
            if sample.get("first_seen"):
                lines.append(f"First seen: {sample['first_seen']}")
            lines.append("")
        pages.append(_abusech_page("https://bazaar.abuse.ch/browse/", "malwarebazaar", lines))

    if threatfox_results:
        lines = ["ThreatFox IOC Intelligence Report\n"]
        for ioc in threatfox_results[:30]:
            lines.append(f"IOC Type: {ioc.get('ioc_type') or 'Unknown'}")
            lines.append(f"IOC Value: {ioc.get('ioc_value') or ''}")
            # Results from the "recent IOCs" mode may only carry the name
            # under ``malware``; fall back to it so the name is not lost.
            malware = ioc.get("malware_printable") or ioc.get("malware")
            if malware:
                lines.append(f"Malware: {malware}")
            confidence = ioc.get("confidence")
            if confidence and isinstance(confidence, (int, float)):
                lines.append(f"Confidence: {confidence:.0%}")
            tags = _abusech_join_tags(ioc.get("tags"))
            if tags:
                lines.append(f"Tags: {tags}")
            lines.append("")
        pages.append(_abusech_page("https://threatfox.abuse.ch/", "threatfox", lines))

    if urlhaus_results:
        lines = ["URLhaus Malicious URL Intelligence Report\n"]
        for url_entry in urlhaus_results[:20]:
            lines.append(f"URL: {url_entry.get('url') or ''}")
            lines.append(f"Threat: {url_entry.get('threat') or 'Unknown'}")
            tags = _abusech_join_tags(url_entry.get("tags"))
            if tags:
                lines.append(f"Tags: {tags}")
            lines.append("")
        pages.append(_abusech_page("https://urlhaus.abuse.ch/", "urlhaus", lines))

    return pages
538
+
539
+
540
+ _RANSOMWARE_LIVE_BASE = "https://api.ransomware.live/v2"
541
+ _RANSOMWARE_LIVE_HEADERS = {"User-Agent": "VoidAccess-OSINT/1.0", "Accept": "application/json"}
542
+
543
+
544
def _rl_extract_onion_urls(group: dict) -> list[str]:
    """Extract .onion leak-site URLs from a group dict (available sites first).

    Args:
        group: ransomware.live group record; its ``locations`` list is read,
            each entry contributing its ``fqdn`` and ``available`` fields.

    Returns:
        URLs for every location whose ``fqdn`` contains ``.onion``. Entries
        flagged ``available`` come first; relative order is otherwise kept.
        ``http://`` is prepended when the fqdn carries no scheme. Malformed
        entries (non-dict location, non-string fqdn) are skipped.
    """
    locations = group.get("locations") or []
    if not isinstance(locations, list):
        return []

    # Drop malformed entries up front so the sort key below cannot fail.
    entries = [loc for loc in locations if isinstance(loc, dict)]
    # Stable sort: False (available) sorts before True (unavailable).
    entries.sort(key=lambda loc: not loc.get("available", False))

    urls: list[str] = []
    for loc in entries:
        raw = loc.get("fqdn")
        fqdn = raw.strip() if isinstance(raw, str) else ""
        if ".onion" not in fqdn:
            continue
        # Match the scheme exactly; a bare startswith("http") would misread a
        # host such as "httpxyz.onion" as already having one.
        has_scheme = fqdn.startswith(("http://", "https://"))
        urls.append(fqdn if has_scheme else f"http://{fqdn}")
    return urls
557
+
558
+
559
async def fetch_ransomware_live(query: str) -> list[dict]:
    """Search ransomware.live for threat groups whose name contains *query*.

    Produces three kinds of intelligence per matched group:
      1. Group profile + TTPs (text for entity extraction)
      2. Leak-site .onion addresses (scrape seeds)
      3. Individual victim claim URLs (specific .onion post pages to scrape)

    Free public API — no key required.

    Args:
        query: Case-insensitive substring matched against group names. A blank
            query returns ``[]`` without any network access.

    Returns:
        One dict per matched group (at most five) with ``group``,
        ``description``, ``onion_urls``, ``claim_urls`` (max 30), ``victims``
        (max 50), ``ttps``, ``tools`` and ``victim_count``. Errors are logged
        and yield an empty or partial list; nothing is raised.
    """
    # Function-scope imports keep this block self-contained.
    import json
    from urllib.parse import quote

    q = (query or "").strip().lower()
    if not q:
        return []

    results: list[dict] = []
    timeout = aiohttp.ClientTimeout(total=25)

    try:
        async with aiohttp.ClientSession(
            headers=_RANSOMWARE_LIVE_HEADERS, timeout=timeout
        ) as session:
            # ── 1. Match groups from the full group index ──────────────────
            async with session.get(f"{_RANSOMWARE_LIVE_BASE}/groups") as resp:
                if resp.status != 200:
                    logger.warning("ransomware.live /groups HTTP %s", resp.status)
                    return []
                all_groups = await resp.json(content_type=None)

            matched_summary: list[dict] = [
                g
                for g in (all_groups if isinstance(all_groups, list) else [])
                if isinstance(g, dict) and q in (g.get("name") or "").lower()
            ]

            if not matched_summary:
                logger.info("ransomware.live: no groups matched %r", query)
                return []

            logger.info("ransomware.live: %d groups matched %r", len(matched_summary), query)

            # ── 2. Fetch full detail for the first five matches ────────────
            async def _fetch_group_detail(gname: str) -> Optional[dict]:
                if not gname:
                    return None
                try:
                    # The name comes from a remote payload; quote it so it
                    # cannot alter the request path.
                    detail_url = f"{_RANSOMWARE_LIVE_BASE}/group/{quote(gname, safe='')}"
                    async with session.get(detail_url) as r:
                        if r.status != 200:
                            return None
                        text = await r.text()
                    # Only attempt to parse bodies that look like JSON.
                    if text.strip()[:1] in ("[", "{"):
                        return json.loads(text)
                except Exception as exc:
                    # Best effort: the summary record is used as a fallback.
                    logger.debug(
                        "ransomware.live: group detail fetch failed for %r: %s", gname, exc
                    )
                return None

            top_groups = matched_summary[:5]
            details = await asyncio.gather(
                *(_fetch_group_detail(g.get("name", "")) for g in top_groups),
                return_exceptions=True,
            )

            group_map: dict[str, dict] = {}
            for g, detail in zip(top_groups, details):
                gname = g.get("name", "")
                # Detail fields override summary fields when available.
                group_map[gname] = {**g, **detail} if isinstance(detail, dict) else g

            # ── 3. Pull recent victims and filter by matched groups ────────
            recent_victims: list[dict] = []
            matched_names = {(g.get("name") or "").lower() for g in matched_summary}
            for endpoint in ("recentvictims", "recentcyberattacks"):
                try:
                    async with session.get(f"{_RANSOMWARE_LIVE_BASE}/{endpoint}") as r:
                        if r.status != 200:
                            continue
                        text = await r.text()
                    if text.strip()[:1] != "[":
                        continue
                    for v in json.loads(text):
                        if isinstance(v, dict) and (v.get("group") or "").lower() in matched_names:
                            recent_victims.append(v)
                except Exception as exc:
                    # Best effort: a failing feed must not discard the groups.
                    logger.debug("ransomware.live: %s fetch failed: %s", endpoint, exc)

            logger.info(
                "ransomware.live: %d recent victims for matched groups", len(recent_victims)
            )

            # ── 4. Assemble results ────────────────────────────────────────
            for gname, gdata in group_map.items():
                onion_urls = _rl_extract_onion_urls(gdata)

                group_victims = [
                    v for v in recent_victims
                    if (v.get("group") or "").lower() == gname.lower()
                ]

                # Claim URLs are individual victim post pages on the leak site.
                claim_urls = [
                    v["claim_url"]
                    for v in group_victims
                    if isinstance(v.get("claim_url"), str) and ".onion" in v["claim_url"]
                ]

                results.append({
                    "group": gname,
                    "description": gdata.get("description") or "",
                    "onion_urls": onion_urls,
                    "claim_urls": claim_urls[:30],
                    "victims": group_victims[:50],
                    "ttps": gdata.get("ttps") or [],
                    "tools": gdata.get("tools") or [],
                    "victim_count": gdata.get("_victim_count", 0),
                })

    except asyncio.TimeoutError:
        logger.warning("ransomware.live: request timed out")
    except aiohttp.ClientError as exc:
        logger.warning("ransomware.live: client error: %s", exc)
    except Exception as exc:
        logger.warning("ransomware.live: unexpected error: %s", exc)

    return results
676
+
677
+
678
def ransomwarelive_to_pages(groups: list[dict]) -> list[dict]:
    """Turn ransomware.live group records into page-shaped dicts.

    For every group the following pages are emitted, in this order:
      1. One text summary page (description, leak sites, up to 40 victims).
      2. One seed page per leak-site URL containing ``.onion``.
      3. One seed page per victim claim URL containing ``.onion`` (first 20).

    Seed pages carry ``"_scrape_seed": True``.
    """

    def _seed_page(target: str, body: str, heading: str, via: str) -> dict:
        # Stub page shared by leak-site and claim-URL seeds.
        return {
            "link": target,
            "url": target,
            "content": body,
            "text": body,
            "status": 200,
            "source": "ransomware_live",
            "title": heading,
            "via": via,
            "_scrape_seed": True,
        }

    pages: list[dict] = []

    for record in groups:
        gname = record.get("group", "Unknown")
        report: list[str] = [f"Ransomware Group Intelligence Report: {gname}"]

        description = record.get("description")
        if description:
            report.append(f"\nDescription: {description}")

        leak_sites = record.get("onion_urls", [])
        if leak_sites:
            report.append(f"\nLeak Site URLs: {', '.join(leak_sites)}")

        victims = record.get("victims", [])
        if victims:
            report.append(f"\nKnown Victims ({len(victims)} total):")
            for victim in victims[:40]:
                label = victim.get("victim") or victim.get("post_title") or victim.get("website") or ""
                domain = victim.get("domain") or victim.get("website") or ""
                when = victim.get("attackdate") or victim.get("published") or victim.get("date") or ""
                country = victim.get("country") or ""
                sector = victim.get("activity") or ""

                pieces = [f" - {label}"]
                if domain and domain != label:
                    pieces.append(f" ({domain})")
                if country:
                    pieces.append(f" [{country}]")
                if when:
                    pieces.append(f" {when}")
                if sector:
                    pieces.append(f" — {sector}")
                report.append("".join(pieces))

        claims = record.get("claim_urls", [])

        summary = "\n".join(report)
        profile_url = f"https://www.ransomware.live/group/{gname}"
        pages.append({
            "link": profile_url,
            "url": profile_url,
            "content": summary,
            "text": summary,
            "status": 200,
            "source": "ransomware_live",
            "title": f"ransomware.live — {gname}",
            "via": "ransomware_live_api",
        })

        # Seed pages so the scraper visits each leak site.
        pages.extend(
            _seed_page(
                site,
                f"{gname} ransomware group leak site: {site}",
                f"{gname} leak site",
                "ransomware_live_onion_seed",
            )
            for site in leak_sites
            if site and ".onion" in site
        )

        # Seed pages for individual victim posts on the leak sites.
        pages.extend(
            _seed_page(
                claim,
                f"{gname} ransomware victim post: {claim}",
                f"{gname} victim claim",
                "ransomware_live_claim_seed",
            )
            for claim in claims[:20]
            if claim and ".onion" in claim
        )

    return pages
767
+
768
+
769
async def _enrich_new_sources(query: str, entities: list[dict]) -> list[dict]:
    """
    Run the newer enrichment sources and return page-shaped dicts.

    Stages:
      1. CISA KEV + advisories (cisa.py), Shodan InternetDB (shodan.py) and
         VirusTotal (virustotal.py) are awaited concurrently under a single
         55-second deadline. If the deadline is exceeded an empty list is
         returned.
      2. Historical intel (historical_intel.py) runs afterwards, only for
         actor-like entities that none of the three sources reported on.
      3. A MITRE ATT&CK overlay page is appended per actor-like entity when
         ``entities`` contains actors but no CVE / MITRE_TECHNIQUE entity.

    Args:
        query: The investigation query; forwarded to the CISA source.
        entities: Pre-extracted entity dicts. The entity type is read from
            ``type`` or ``entity_type``; the name from ``value``,
            ``canonical_value`` or ``entity_value``.

    Returns:
        A list of page dicts (``link``, ``url``, ``content``, ``text``,
        ``status``, ``source``, ``via``). A failing source is logged and
        contributes nothing; it does not abort the other sources.
    """
    from sources.cisa import enrich_cisa
    from sources.shodan import enrich_shodan
    from sources.virustotal import enrich_virustotal
    from sources.historical_intel import enrich_historical

    async def _gather():
        return await asyncio.gather(
            enrich_cisa(query, entities),
            enrich_shodan(entities),
            enrich_virustotal(entities),
            return_exceptions=True,
        )

    try:
        packed = await asyncio.wait_for(_gather(), timeout=55.0)
    except asyncio.TimeoutError:
        logger.warning("_enrich_new_sources: deadline exceeded")
        return []

    # gather(return_exceptions=True) hands back the exception object in place
    # of a result; replace each failure with an empty list after logging it.
    normalized = []
    for label, outcome in zip(("CISA", "Shodan", "VirusTotal"), packed):
        if isinstance(outcome, Exception):
            logger.warning("%s enrichment failed: %s", label, outcome)
            outcome = []
        normalized.append(outcome)
    cisa_results, shodan_results, vt_results = normalized

    pages: list[dict] = []

    if cisa_results:
        pages.extend(_cisa_results_to_pages(cisa_results, query))
    if shodan_results:
        pages.extend(_shodan_results_to_pages(shodan_results))
    if vt_results:
        pages.extend(_vt_results_to_pages(vt_results))

    # NOTE(review): the historical fallback only runs when at least one of the
    # three sources returned data. If all three come back empty, no fallback
    # is attempted even though every actor is then unenriched -- confirm this
    # gate is intentional.
    if cisa_results or shodan_results or vt_results:
        unenriched = _group_unenriched_entities(
            entities, cisa_results, shodan_results, vt_results
        )
        if unenriched:
            # Keep the fallback best-effort like the other sources: a failure
            # here must not discard the pages already collected above.
            try:
                hist_results = await enrich_historical(unenriched)
            except Exception as hist_exc:
                logger.warning("Historical enrichment failed: %s", hist_exc)
                hist_results = []
            pages.extend(_historical_results_to_pages(hist_results or []))

    # Entity-based MITRE overlay: fires when the caller passes pre-extracted
    # entities that contain actors but zero CVE/MITRE_TECHNIQUE results.
    _actor_types = {"THREAT_ACTOR", "RANSOMWARE_GROUP", "MALWARE_FAMILY"}
    _cve_mitre_types = {"CVE", "MITRE_TECHNIQUE"}

    def _entity_type(ent: dict):
        return ent.get("type") or ent.get("entity_type", "")

    _actor_ents = [e for e in entities if _entity_type(e) in _actor_types]
    _has_cve_or_mitre = any(_entity_type(e) in _cve_mitre_types for e in entities)

    if _actor_ents and not _has_cve_or_mitre:
        from sources.historical_intel import get_techniques_for_actor

        for _actor_ent in _actor_ents:
            _actor_name = (
                _actor_ent.get("value")
                or _actor_ent.get("canonical_value")
                or _actor_ent.get("entity_value", "")
            )
            if not _actor_name:
                continue
            try:
                _techniques = await get_techniques_for_actor(_actor_name)
            except Exception as _exc:
                logger.warning("MITRE overlay: failed for '%s': %s", _actor_name, _exc)
                _techniques = []
            if not _techniques:
                continue
            logger.info(
                "MITRE overlay: added %d techniques for actor '%s'",
                len(_techniques),
                _actor_name,
            )
            _oc = (
                f"MITRE ATT&CK Overlay: Techniques associated with {_actor_name} "
                f"(source: mitre_attack_overlay)\n" + "\n".join(_techniques)
            )
            pages.append({
                "link": "https://attack.mitre.org/",
                "url": "https://attack.mitre.org/",
                "content": _oc,
                "text": _oc,
                "status": 200,
                "source": "mitre_attack_overlay",
                "via": "mitre_overlay",
            })

    return pages
871
+
872
+
873
+ def _cisa_results_to_pages(results: list[dict], query: str) -> list[dict]:
874
+ pages: list[dict] = []
875
+ kev_entries = [r for r in results if r.get("source") == "cisa_kev"]
876
+ adv_entries = [r for r in results if r.get("source") == "cisa_advisory"]
877
+
878
+ if kev_entries:
879
+ lines = ["CISA Known Exploited Vulnerabilities (KEV) Catalog\n"]
880
+ for r in kev_entries:
881
+ lines.append(f"CVE: {r.get('entity_value', '')}")
882
+ if r.get("vendor_project"):
883
+ lines.append(f" Vendor/Project: {r['vendor_project']}")
884
+ if r.get("product"):
885
+ lines.append(f" Product: {r['product']}")
886
+ if r.get("vulnerability_name"):
887
+ lines.append(f" Vulnerability: {r['vulnerability_name']}")
888
+ if r.get("date_added"):
889
+ lines.append(f" Date Added to KEV: {r['date_added']}")
890
+ if r.get("short_description"):
891
+ lines.append(f" Description: {r['short_description']}")
892
+ lines.append("")
893
+ pages.append({
894
+ "link": "https://www.cisa.gov/known-exploited-vulnerabilities-catalog",
895
+ "url": "https://www.cisa.gov/known-exploited-vulnerabilities-catalog",
896
+ "content": "\n".join(lines),
897
+ "text": "\n".join(lines),
898
+ "status": 200,
899
+ "source": "cisa_kev",
900
+ "via": "cisa_feed",
901
+ })
902
+
903
+ if adv_entries:
904
+ lines = ["CISA Cybersecurity Advisories\n"]
905
+ for r in adv_entries:
906
+ lines.append(f"Title: {r.get('advisory_title', '')}")
907
+ if r.get("advisory_url"):
908
+ lines.append(f" URL: {r['advisory_url']}")
909
+ if r.get("advisory_date"):
910
+ lines.append(f" Date: {r['advisory_date']}")
911
+ lines.append("")
912
+ pages.append({
913
+ "link": "https://www.cisa.gov/cybersecurity-advisories",
914
+ "url": "https://www.cisa.gov/cybersecurity-advisories",
915
+ "content": "\n".join(lines),
916
+ "text": "\n".join(lines),
917
+ "status": 200,
918
+ "source": "cisa_advisory",
919
+ "via": "cisa_feed",
920
+ })
921
+
922
+ return pages
923
+
924
+
925
+ def _shodan_results_to_pages(results: list[dict]) -> list[dict]:
926
+ pages: list[dict] = []
927
+ for r in results:
928
+ lines = [f"Shodan InternetDB: {r.get('entity_value', '')}\n"]
929
+ if r.get("open_ports"):
930
+ lines.append(f"Open Ports: {', '.join(str(p) for p in r['open_ports'])}")
931
+ if r.get("hostnames"):
932
+ lines.append(f"Hostnames: {', '.join(r['hostnames'])}")
933
+ if r.get("tags"):
934
+ lines.append(f"Tags: {', '.join(r['tags'])}")
935
+ if r.get("vulns"):
936
+ lines.append(f"Vulnerabilities: {', '.join(r['vulns'])}")
937
+ if r.get("correlated_cves"):
938
+ lines.append(f"Correlated CVEs (also extracted): {', '.join(r['correlated_cves'])}")
939
+ if r.get("high_confidence_c2"):
940
+ lines.append("** HIGH CONFIDENCE C2 **")
941
+ pages.append({
942
+ "link": f"https://internetdb.shodan.io/{r.get('entity_value', '')}",
943
+ "url": f"https://internetdb.shodan.io/{r.get('entity_value', '')}",
944
+ "content": "\n".join(lines),
945
+ "text": "\n".join(lines),
946
+ "status": 200,
947
+ "source": "shodan_internetdb",
948
+ "via": "shodan_api",
949
+ })
950
+ return pages
951
+
952
+
953
+ def _vt_results_to_pages(results: list[dict]) -> list[dict]:
954
+ pages: list[dict] = []
955
+ for r in results:
956
+ lines = [f"VirusTotal: {r.get('entity_value', '')}\n"]
957
+ lines.append(f"Detection: {r.get('malicious_count', 0)}/{r.get('total_engines', 0)} ({r.get('detection_ratio', 0):.0%})")
958
+ if r.get("suggested_threat_label"):
959
+ lines.append(f"Threat Label: {r['suggested_threat_label']}")
960
+ if r.get("first_seen"):
961
+ lines.append(f"First Seen: {r['first_seen']}")
962
+ if r.get("last_seen"):
963
+ lines.append(f"Last Seen: {r['last_seen']}")
964
+ if r.get("confirmed_malicious"):
965
+ lines.append("** CONFIRMED MALICIOUS **")
966
+ pages.append({
967
+ "link": f"https://www.virustotal.com/gui/file/{r.get('entity_value', '')}",
968
+ "url": f"https://www.virustotal.com/gui/file/{r.get('entity_value', '')}",
969
+ "content": "\n".join(lines),
970
+ "text": "\n".join(lines),
971
+ "status": 200,
972
+ "source": "virustotal",
973
+ "via": "virustotal_api",
974
+ })
975
+ return pages
976
+
977
+
978
+ def _group_unenriched_entities(
979
+ entities: list[dict],
980
+ cisa_results: list[dict],
981
+ shodan_results: list[dict],
982
+ vt_results: list[dict],
983
+ ) -> dict[str, list[dict]]:
984
+ """
985
+ Determine which THREAT_ACTOR / RANSOMWARE_GROUP / MALWARE_FAMILY entities
986
+ received zero enrichment results from CISA, Shodan, and VT.
987
+ Returns a dict mapping entity type -> list of entities with no enrichment.
988
+ """
989
+ fallback_types = {"THREAT_ACTOR", "RANSOMWARE_GROUP", "MALWARE_FAMILY"}
990
+ ent_by_type: dict[str, list[dict]] = {t: [] for t in fallback_types}
991
+
992
+ for e in entities:
993
+ et = e.get("type") or e.get("entity_type", "")
994
+ if et in fallback_types:
995
+ ent_by_type[et].append(e)
996
+
997
+ enriched_values: set[str] = set()
998
+ for r in cisa_results:
999
+ ev = r.get("entity_value", "")
1000
+ if ev:
1001
+ enriched_values.add(ev.lower())
1002
+ for r in shodan_results:
1003
+ ev = r.get("entity_value", "")
1004
+ if ev:
1005
+ enriched_values.add(ev.lower())
1006
+ for r in vt_results:
1007
+ ev = r.get("entity_value", "")
1008
+ if ev:
1009
+ enriched_values.add(ev.lower())
1010
+
1011
+ result: dict[str, list[dict]] = {}
1012
+ for et, ent_list in ent_by_type.items():
1013
+ unenriched = [
1014
+ ent for ent in ent_list
1015
+ if (ent.get("value") or ent.get("entity_value", "")).lower() not in enriched_values
1016
+ ]
1017
+ if unenriched:
1018
+ result[et] = unenriched
1019
+
1020
+ return result
1021
+
1022
+
1023
+ def _historical_results_to_pages(results: list[dict]) -> list[dict]:
1024
+ pages: list[dict] = []
1025
+ for r in results:
1026
+ src = r.get("source", "")
1027
+ lines = [f"Historical Intel: {r.get('entity_value', '')}\n"]
1028
+ if src == "mitre_attack":
1029
+ lines.append(f"MITRE ATT&CK ID: {r.get('mitre_id', '')}")
1030
+ lines.append(f"Name: {r.get('mitre_name', '')}")
1031
+ if r.get("aliases"):
1032
+ lines.append(f"Aliases: {', '.join(r['aliases'])}")
1033
+ if r.get("techniques"):
1034
+ lines.append(f"Techniques: {', '.join(r['techniques'])}")
1035
+ if r.get("description"):
1036
+ lines.append(f"Description: {r['description']}")
1037
+ pages.append({
1038
+ "link": f"https://attack.mitre.org/groups/{r.get('mitre_id', '')}",
1039
+ "url": f"https://attack.mitre.org/groups/{r.get('mitre_id', '')}",
1040
+ "content": "\n".join(lines),
1041
+ "text": "\n".join(lines),
1042
+ "status": 200,
1043
+ "source": "mitre_attack",
1044
+ "via": "mitre_cti",
1045
+ })
1046
+ elif src == "fbi_doj_press":
1047
+ lines.append(f"Title: {r.get('press_title', '')}")
1048
+ lines.append(f"Date: {r.get('press_date', '')}")
1049
+ pages.append({
1050
+ "link": r.get("press_url", ""),
1051
+ "url": r.get("press_url", ""),
1052
+ "content": "\n".join(lines),
1053
+ "text": "\n".join(lines),
1054
+ "status": 200,
1055
+ "source": "fbi_doj_press",
1056
+ "via": "fbi_rss",
1057
+ })
1058
+ elif src == "cisa_advisory_historical":
1059
+ lines.append(f"Title: {r.get('advisory_title', '')}")
1060
+ lines.append(f"URL: {r.get('advisory_url', '')}")
1061
+ lines.append(f"Date: {r.get('advisory_date', '')}")
1062
+ pages.append({
1063
+ "link": r.get("advisory_url", ""),
1064
+ "url": r.get("advisory_url", ""),
1065
+ "content": "\n".join(lines),
1066
+ "text": "\n".join(lines),
1067
+ "status": 200,
1068
+ "source": "cisa_advisory",
1069
+ "via": "cisa_feed",
1070
+ })
1071
+ return pages
1072
+
1073
+
1074
async def run_dns_enrichment(extracted_entities: list[dict]) -> dict:
    """
    Best-effort DNS/WHOIS enrichment over the extracted entities.

    Delegates to ``sources.dns_enrichment.enrich_with_dns`` and returns
    whatever it produces. If importing or running that helper raises, the
    error is logged and an empty result with the keys ``ip_enrichments``,
    ``domain_enrichments``, ``new_entities`` and ``infrastructure_clusters``
    is returned instead.
    """
    try:
        # Imported lazily so an import failure is handled like any other
        # enrichment error.
        from sources.dns_enrichment import enrich_with_dns
        outcome = await enrich_with_dns(extracted_entities)
    except Exception as err:
        logger.error("DNS enrichment error: %s", err)
        outcome = {
            "ip_enrichments": {},
            "domain_enrichments": {},
            "new_entities": [],
            "infrastructure_clusters": [],
        }
    return outcome
1090
+
1091
+
1092
async def enrich_investigation(
    query: str,
    otx_api_key: Optional[str] = None,
    entities: Optional[list[dict]] = None,
) -> list[dict]:
    """
    Run all threat intel sources in parallel; return page dicts for extraction.

    Sources:
    - OTX (AlienVault) — requires OTX_API_KEY
    - MalwareBazaar — free (ABUSECH_API_KEY improves rate limits)
    - ThreatFox — free
    - URLhaus — free
    - ransomware.live — free, no key required
    - CISA KEV + advisories — free, no key required (clearnet)
    - Shodan InternetDB — free, no key required (clearnet)
    - VirusTotal — requires VT_API_KEY (clearnet)

    Timing: the parallel fetch is capped at 59s via ``asyncio.wait_for``; if
    that deadline is exceeded an empty list is returned. When the MITRE
    overlay stage runs afterwards it has its own 20s cap, so the call can
    take up to roughly 79s in total.

    Args:
        query: Investigation query passed to every source.
        otx_api_key: OTX key; an empty string is passed on when omitted.
        entities: Optional pre-extracted entities forwarded to the
            entity-based sources; treated as an empty list when None.

    Returns:
        Page dicts from every source that succeeded. A source that raises is
        logged and contributes nothing.
    """
    logger.info("Starting threat intel enrichment for: %s", query)

    _entities = entities if entities is not None else []

    async def _gather():
        return await asyncio.gather(
            fetch_otx_pulses(query, otx_api_key or "", limit=20),
            fetch_malwarebazaar(query, limit=20),
            fetch_threatfox(query, limit=50),
            fetch_urlhaus(query, limit=20),
            fetch_ransomware_live(query),
            _enrich_new_sources(query, _entities),
            return_exceptions=True,
        )

    try:
        packed = await asyncio.wait_for(_gather(), timeout=59.0)
    except asyncio.TimeoutError:
        logger.warning("Enrichment: deadline exceeded (59s), returning empty")
        return []

    otx_pulses, mb_results, tf_results, uh_results, rl_groups, new_pages = packed

    # With return_exceptions=True a failed source shows up as an exception
    # object in its slot; log it and fall back to an empty result.
    if isinstance(otx_pulses, Exception):
        logger.warning("OTX failed: %s", otx_pulses)
        otx_pulses = []
    if isinstance(mb_results, Exception):
        logger.warning("MalwareBazaar failed: %s", mb_results)
        mb_results = []
    if isinstance(tf_results, Exception):
        logger.warning("ThreatFox failed: %s", tf_results)
        tf_results = []
    if isinstance(uh_results, Exception):
        logger.warning("URLhaus failed: %s", uh_results)
        uh_results = []
    if isinstance(rl_groups, Exception):
        logger.warning("ransomware.live failed: %s", rl_groups)
        rl_groups = []
    if isinstance(new_pages, Exception):
        logger.warning("New enrichment sources failed: %s", new_pages)
        new_pages = []

    pages: list[dict] = []

    for pulse in otx_pulses:
        page = otx_pulse_to_page(pulse)
        if page.get("content"):
            pages.append(page)

    pages.extend(abusech_to_pages(mb_results, tf_results, uh_results))
    pages.extend(ransomwarelive_to_pages(rl_groups))
    pages.extend(new_pages or [])

    # Page-scan MITRE overlay: extract actor names from ransomware.live / OTX results
    # and inject T-codes when no MITRE techniques appear in any enrichment page.
    # This fires without a pre-extracted entity list, covering the current pipeline.
    _overlay_actor_names: list[str] = []
    for _g in (rl_groups if isinstance(rl_groups, list) else []):
        _gname = _g.get("group", "")
        if _gname and _gname not in _overlay_actor_names:
            _overlay_actor_names.append(_gname)
    for _pulse in (otx_pulses if isinstance(otx_pulses, list) else []):
        for _mf in (_pulse.get("malware_families") or []):
            if isinstance(_mf, str):
                _mfname = _mf
            elif isinstance(_mf, dict):
                _mfname = _mf.get("display_name") or _mf.get("name", "")
            else:
                # Unexpected entry shape (e.g. null): nothing to name.
                continue
            if _mfname and _mfname not in _overlay_actor_names:
                _overlay_actor_names.append(_mfname)

    if _overlay_actor_names:
        _t_pattern = re.compile(r'\bT\d{4}(?:\.\d{3})?\b')
        # The trailing ``or ""`` keeps the scan safe when both fields are
        # empty or null.
        _t_found = any(
            _t_pattern.search(p.get("content") or p.get("text") or "")
            for p in pages
        )
        if not _t_found:
            from sources.historical_intel import get_techniques_for_actor

            OVERLAY_TIMEOUT = 20

            _q_lower = query.lower()
            # Prioritise first, then cap: capping first could drop an actor
            # that matches the query but sits beyond the tenth position.
            # sorted() is stable, so ties keep their discovery order.
            _prioritized = sorted(
                _overlay_actor_names,
                key=lambda a: 0 if a.lower() in _q_lower else 1,
            )[:10]

            async def _run_overlay():
                _results = []
                for _aname in _prioritized:
                    try:
                        _techs = await get_techniques_for_actor(_aname)
                    except Exception as _oexc:
                        logger.warning("MITRE overlay: failed for '%s': %s", _aname, _oexc)
                        _techs = []
                    if not _techs:
                        continue
                    logger.info(
                        "MITRE overlay: added %d techniques for actor '%s'",
                        len(_techs),
                        _aname,
                    )
                    _ocontent = (
                        f"MITRE ATT&CK Overlay: Techniques associated with {_aname} "
                        f"(source: mitre_attack_overlay)\n" + "\n".join(_techs)
                    )
                    _results.append({
                        "link": "https://attack.mitre.org/",
                        "url": "https://attack.mitre.org/",
                        "content": _ocontent,
                        "text": _ocontent,
                        "status": 200,
                        "source": "mitre_attack_overlay",
                        "via": "mitre_overlay",
                    })
                return _results

            try:
                _overlay_pages = await asyncio.wait_for(
                    _run_overlay(),
                    timeout=OVERLAY_TIMEOUT,
                )
                pages.extend(_overlay_pages)
            except asyncio.TimeoutError:
                logger.warning(
                    "MITRE overlay timed out after %ds — skipping",
                    OVERLAY_TIMEOUT,
                )

    total_onion_seeds = sum(1 for p in pages if p.get("_scrape_seed"))
    logger.info(
        "Enrichment complete: %s OTX pulses, %s MalwareBazaar, "
        "%s ThreatFox IOCs, %s URLhaus, %s ransomware.live groups "
        "(%s .onion seeds) → %s enrichment pages total",
        len(otx_pulses), len(mb_results), len(tf_results),
        len(uh_results), len(rl_groups), total_onion_seeds, len(pages),
    )

    return pages