voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,856 @@
1
+ """
2
+ sources/hash_reputation.py — File hash behavioral enrichment.
3
+
4
+ Enriches FILE_HASH_MD5, FILE_HASH_SHA1, FILE_HASH_SHA256 entities with
5
+ malware profiles from four sources:
6
+ - Hybrid Analysis: full behavioral sandbox (requires HYBRID_ANALYSIS_API_KEY)
7
+ - MalwareBazaar (abuse.ch): family classification — free, no auth
8
+ - ThreatFox (abuse.ch): IOC database and associated IOCs — free, no auth
9
+ - VirusTotal: AV detection data and sandbox network IOCs (requires VT_API_KEY)
10
+
11
+ Hashes are never suppressed — even a clean hash is a useful data point.
12
+ Cache TTL: 48 h (hashes are immutable).
13
+ Limit: MAX_HASHES = 50 per investigation (SHA256 → SHA1 → MD5 priority).
14
+
15
+ Public interface
16
+ ----------------
17
+ async query_hybrid_analysis(hash_value) → dict
18
+ async query_malwarebazaar(hash_value) → dict
19
+ async query_threatfox(hash_value) → dict
20
+ async query_virustotal_hash(hash_value) → dict
21
+ async check_hash_reputation(hash_value, hash_type, base_confidence) → dict
22
+ async enrich_hash_entities(extraction_results, investigation_id) → (results, stats)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import asyncio
28
+ import json
29
+ import logging
30
+ import os
31
+ import re
32
+ import time
33
+ from typing import Any
34
+
35
+ import aiohttp
36
+
37
logger = logging.getLogger(__name__)

# Caps on how much enrichment is performed per investigation / per hash.
MAX_HASHES = 50
MAX_IPS_PER_HASH = 10
MAX_DOMAINS_PER_HASH = 10

HASH_CACHE_TTL = 172800.0  # 48 hours, in seconds

MALWAREBAZAAR_URL = "https://mb-api.abuse.ch/api/v1/"
THREATFOX_URL = "https://threatfox-api.abuse.ch/api/v1/"
HA_BASE_URL = "https://www.hybrid-analysis.com/api/v2"
VT_BASE_URL = "https://www.virustotal.com/api/v3"

# In-memory per-hash cache: {hash_value: {"result": dict, "loaded_at": float}}
_hash_cache: dict[str, dict] = {}

# Processing priority: lower number = higher priority
HASH_TYPES = {
    "FILE_HASH_SHA256": 1,
    "FILE_HASH_SHA1": 2,
    "FILE_HASH_MD5": 3,
}

# One IPv4 octet in canonical decimal form: 0-255, ASCII digits only, no
# leading zeros. The previous pattern (\d{1,3} per octet) also accepted
# impossible addresses such as "999.1.1.1".
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])"
_IP_RE = re.compile(rf"^{_IPV4_OCTET}(?:\.{_IPV4_OCTET}){{3}}$")

# Hex digests of exactly 32 (MD5), 40 (SHA-1) or 64 (SHA-256) characters.
_HASH_RE = re.compile(r"^[0-9a-fA-F]{32}$|^[0-9a-fA-F]{40}$|^[0-9a-fA-F]{64}$")
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Helpers
66
+ # ---------------------------------------------------------------------------
67
+
68
def _is_valid_hash(value: str) -> bool:
    """Return True when *value*, ignoring surrounding whitespace, matches ``_HASH_RE``.

    Empty or ``None`` input is reported as invalid.
    """
    if not value:
        return False
    candidate = value.strip()
    return _HASH_RE.match(candidate) is not None
70
+
71
+
72
def _normalize_family(name: str) -> str:
    """Build a comparison slug from a malware family name.

    The name is lowercased, every run of characters outside ``a-z0-9`` is
    collapsed to a single underscore, leading/trailing underscores are
    removed, and the result is cut to at most 50 characters. A falsy *name*
    yields an empty string.
    """
    lowered = (name or "").lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    trimmed = collapsed.strip("_")
    return trimmed[:50]
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # Source: Hybrid Analysis
79
+ # ---------------------------------------------------------------------------
80
+
81
async def query_hybrid_analysis(hash_value: str) -> dict[str, Any]:
    """
    Look up *hash_value* on Hybrid Analysis via POST /search/hash.

    Requires the HYBRID_ANALYSIS_API_KEY environment variable; when it is
    unset or blank the lookup is skipped without any network traffic.

    Returns:
        ``{"found": True, "source": "hybrid_analysis", ...}`` with verdict,
        malware family, threat score, AV figures, file type, tags and the
        contacted IPs/domains (de-duplicated, capped at MAX_IPS_PER_HASH /
        MAX_DOMAINS_PER_HASH) taken from the first report in the response.
        On any failure ``{"found": False, "source": <reason>}`` where the
        reason is one of ``hybrid_analysis_skipped``, ``_auth_error``,
        ``_rate_limited``, ``_error`` or ``_not_found``.

    This function does not raise for HTTP, network or malformed-response
    problems; they are logged and reported through ``source``.
    """
    api_key = os.getenv("HYBRID_ANALYSIS_API_KEY", "").strip()
    if not api_key:
        logger.debug("hash_reputation: Hybrid Analysis skipped — no API key")
        return {"found": False, "source": "hybrid_analysis_skipped"}

    try:
        headers = {
            "api-key": api_key,
            "user-agent": "Falcon Sandbox",
        }
        timeout = aiohttp.ClientTimeout(total=20)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                f"{HA_BASE_URL}/search/hash",
                data={"hash": hash_value},
                headers=headers,
            ) as resp:
                if resp.status == 401:
                    logger.warning("hash_reputation: Hybrid Analysis — invalid API key")
                    return {"found": False, "source": "hybrid_analysis_auth_error"}
                if resp.status == 429:
                    logger.warning("hash_reputation: Hybrid Analysis — rate limited")
                    return {"found": False, "source": "hybrid_analysis_rate_limited"}
                if resp.status != 200:
                    logger.debug(
                        "hash_reputation: Hybrid Analysis → HTTP %s for %s",
                        resp.status, hash_value[:16],
                    )
                    return {"found": False, "source": "hybrid_analysis_error"}
                data = await resp.json()
    except Exception as exc:
        logger.debug("hash_reputation: Hybrid Analysis failed for %s: %s", hash_value[:16], exc)
        return {"found": False, "source": "hybrid_analysis_error"}

    if not data or not isinstance(data, list):
        return {"found": False, "source": "hybrid_analysis_not_found"}

    # Only the first report is used.
    report = data[0]
    if not isinstance(report, dict):
        # Malformed payload: report it as an error instead of raising
        # AttributeError from the .get() calls below.
        return {"found": False, "source": "hybrid_analysis_error"}

    # NOTE(review): the field names read below ("network", "total_av_detections",
    # "av_detect", ...) should be confirmed against the Hybrid Analysis v2
    # response schema; they are kept exactly as the original code read them.
    network = report.get("network") or {}
    if not isinstance(network, dict):
        network = {}

    def _list_of(key: str) -> list:
        """Return network[key] when it is a list, else an empty list."""
        value = network.get(key)
        return value if isinstance(value, list) else []

    contacted_ips: list[str] = []
    for host in _list_of("hosts"):
        ip = host.get("ip") if isinstance(host, dict) else host
        # Skip non-string values and duplicates so the cap below is spent on
        # distinct addresses.
        if isinstance(ip, str) and _IP_RE.match(ip) and ip not in contacted_ips:
            contacted_ips.append(ip)

    contacted_domains: list[str] = []
    for d in _list_of("domains"):
        if isinstance(d, str) and d and d not in contacted_domains:
            contacted_domains.append(d)
    for http_entry in _list_of("http")[:20]:
        if not isinstance(http_entry, dict):
            continue
        host = http_entry.get("host") or ""
        # HTTP hosts that are literal IPs are not domains.
        if (
            isinstance(host, str)
            and host
            and not _IP_RE.match(host)
            and host not in contacted_domains
        ):
            contacted_domains.append(host)

    return {
        "found": True,
        "source": "hybrid_analysis",
        "verdict": str(report.get("verdict") or "").lower(),
        "malware_family": report.get("vx_family") or "",
        "threat_score": report.get("threat_score"),
        "av_detections": report.get("total_av_detections"),
        "av_total": report.get("av_detect"),
        "file_type": report.get("type_short") or report.get("type") or "",
        "tags": list(report.get("tags") or []),
        "contacted_ips": contacted_ips[:MAX_IPS_PER_HASH],
        "contacted_domains": contacted_domains[:MAX_DOMAINS_PER_HASH],
    }
157
+
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Source: MalwareBazaar
161
+ # ---------------------------------------------------------------------------
162
+
163
async def query_malwarebazaar(hash_value: str) -> dict[str, Any]:
    """
    Look up *hash_value* on MalwareBazaar with a ``get_info`` POST.

    No API key is sent.
    NOTE(review): abuse.ch has announced that its APIs require an Auth-Key
    header — confirm whether unauthenticated requests are still accepted.

    Returns:
        ``{"found": True, "source": "malwarebazaar", ...}`` with malware
        family (the sample's ``signature``), file type, first-seen date,
        tags and SHA-256, taken from the first sample in the response.
        Otherwise ``{"found": False, "source": <reason>}`` where the reason
        is ``malwarebazaar_error`` (HTTP/network failure or malformed
        payload) or ``malwarebazaar_not_found``.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.post(
                MALWAREBAZAAR_URL,
                data={"query": "get_info", "hash": hash_value},
            ) as resp:
                if resp.status != 200:
                    logger.debug(
                        "hash_reputation: MalwareBazaar → HTTP %s for %s",
                        resp.status, hash_value[:16],
                    )
                    return {"found": False, "source": "malwarebazaar_error"}
                data = await resp.json()
    except Exception as exc:
        logger.debug("hash_reputation: MalwareBazaar failed for %s: %s", hash_value[:16], exc)
        return {"found": False, "source": "malwarebazaar_error"}

    # The parsing below runs outside the try block, so validate the shape
    # explicitly instead of letting AttributeError/KeyError escape.
    if not isinstance(data, dict):
        return {"found": False, "source": "malwarebazaar_error"}

    if data.get("query_status") != "ok":
        return {"found": False, "source": "malwarebazaar_not_found"}

    samples = data.get("data") or []
    if not samples:
        return {"found": False, "source": "malwarebazaar_not_found"}
    if not isinstance(samples, list) or not isinstance(samples[0], dict):
        return {"found": False, "source": "malwarebazaar_error"}

    sample = samples[0]
    return {
        "found": True,
        "source": "malwarebazaar",
        "malware_family": sample.get("signature") or "",
        "file_type": sample.get("file_type") or "",
        "first_seen": sample.get("first_seen") or "",
        "tags": list(sample.get("tags") or []),
        "sha256": sample.get("sha256_hash") or "",
    }
205
+
206
+
207
+ # ---------------------------------------------------------------------------
208
+ # Source: ThreatFox
209
+ # ---------------------------------------------------------------------------
210
+
211
async def query_threatfox(hash_value: str) -> dict[str, Any]:
    """
    Look up *hash_value* on ThreatFox with a ``search_ioc`` POST (JSON body).

    No API key is sent.

    Returns:
        ``{"found": True, "source": "threatfox", ...}`` with malware family,
        confidence level, first-seen date and tags from the first returned
        IOC, plus ``associated_iocs``: up to the first 20 returned IOCs whose
        value differs (case-insensitively) from the queried hash, each as
        ``{"ioc_type", "ioc_value", "malware"}``.
        Otherwise ``{"found": False, "source": <reason>}`` where the reason
        is ``threatfox_error`` (HTTP/network failure or malformed payload)
        or ``threatfox_not_found``.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.post(
                THREATFOX_URL,
                json={"query": "search_ioc", "search_term": hash_value},
            ) as resp:
                if resp.status != 200:
                    logger.debug(
                        "hash_reputation: ThreatFox → HTTP %s for %s",
                        resp.status, hash_value[:16],
                    )
                    return {"found": False, "source": "threatfox_error"}
                data = await resp.json()
    except Exception as exc:
        logger.debug("hash_reputation: ThreatFox failed for %s: %s", hash_value[:16], exc)
        return {"found": False, "source": "threatfox_error"}

    # The parsing below runs outside the try block, so validate the shape
    # explicitly instead of letting AttributeError/KeyError escape.
    if not isinstance(data, dict):
        return {"found": False, "source": "threatfox_error"}

    if data.get("query_status") != "ok":
        return {"found": False, "source": "threatfox_not_found"}

    iocs = data.get("data") or []
    if not iocs:
        return {"found": False, "source": "threatfox_not_found"}
    if not isinstance(iocs, list) or not isinstance(iocs[0], dict):
        return {"found": False, "source": "threatfox_error"}

    primary = iocs[0]
    queried = hash_value.lower()

    # Collect associated IOCs from the same response (exclude the queried hash)
    associated_iocs: list[dict] = []
    for item in iocs[:20]:
        if not isinstance(item, dict):
            continue
        ioc_type = item.get("ioc_type") or ""
        ioc_value = item.get("ioc") or ""
        if not isinstance(ioc_type, str) or not isinstance(ioc_value, str):
            continue
        if ioc_type and ioc_value and ioc_value.lower() != queried:
            associated_iocs.append({
                "ioc_type": ioc_type,
                "ioc_value": ioc_value,
                "malware": item.get("malware_printable") or item.get("malware") or "",
            })

    return {
        "found": True,
        "source": "threatfox",
        "malware_family": primary.get("malware_printable") or primary.get("malware") or "",
        "confidence_level": primary.get("confidence_level"),
        "first_seen": primary.get("first_seen") or "",
        "tags": list(primary.get("tags") or []),
        "associated_iocs": associated_iocs,
    }
266
+
267
+
268
+ # ---------------------------------------------------------------------------
269
+ # Source: VirusTotal (extended)
270
+ # ---------------------------------------------------------------------------
271
+
272
async def query_virustotal_hash(hash_value: str) -> dict[str, Any]:
    """
    Extended VirusTotal lookup: detection stats + optional behaviour network IOCs.

    GET /files/{hash} supplies the core data. GET /files/{hash}/behaviours is
    then attempted for sandbox network contacts and dropped files; any
    non-200 status or exception on that second request is ignored and the
    core result is returned with empty IOC lists.

    Requires the VT_API_KEY environment variable; when it is unset or blank
    the lookup is skipped without any network traffic.

    Returns:
        ``{"found": True, "source": "virustotal", ...}`` with ``malicious``
        and ``total`` engine counts, malware family, file type, first/last
        seen (stringified), and ``contacted_ips`` / ``contacted_domains`` /
        ``dropped_hashes`` in first-seen order, capped at MAX_IPS_PER_HASH,
        MAX_DOMAINS_PER_HASH and 5 respectively.
        Otherwise ``{"found": False, "source": <reason>}`` where the reason
        is one of ``virustotal_skipped``, ``_not_found``, ``_auth_error``,
        ``_rate_limited``, ``_timeout`` or ``_error``.
    """
    vt_key = os.getenv("VT_API_KEY", "").strip()
    if not vt_key:
        logger.debug("hash_reputation: VirusTotal skipped — no API key")
        return {"found": False, "source": "virustotal_skipped"}

    headers = {"x-apikey": vt_key}
    timeout = aiohttp.ClientTimeout(total=20)

    try:
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            async with session.get(f"{VT_BASE_URL}/files/{hash_value}") as resp:
                if resp.status == 404:
                    return {"found": False, "source": "virustotal_not_found"}
                if resp.status in (401, 403):
                    logger.warning("hash_reputation: VirusTotal — auth error")
                    return {"found": False, "source": "virustotal_auth_error"}
                if resp.status == 429:
                    logger.warning("hash_reputation: VirusTotal — rate limited")
                    return {"found": False, "source": "virustotal_rate_limited"}
                if resp.status != 200:
                    logger.debug(
                        "hash_reputation: VirusTotal → HTTP %s for %s",
                        resp.status, hash_value[:16],
                    )
                    return {"found": False, "source": "virustotal_error"}
                file_data = await resp.json()

            attr = file_data.get("data", {}).get("attributes", {})
            stats = attr.get("last_analysis_stats", {})
            malicious = stats.get("malicious", 0)
            total = sum(stats.values())

            # Extract malware family: prefer popular threat classification;
            # fall back to the first listed vendor that reports a result.
            threat_cls = attr.get("popular_threat_classification") or {}
            family = threat_cls.get("suggested_threat_label") or ""
            if not family:
                reliable_vendors = [
                    "Kaspersky", "Microsoft", "Symantec", "Norton", "Bitdefender", "ESET-NOD32",
                ]
                engine_results = attr.get("last_analysis_results") or {}
                for vendor in reliable_vendors:
                    vr = engine_results.get(vendor) or {}
                    if vr.get("result"):
                        family = vr["result"]
                        break

            result: dict[str, Any] = {
                "found": True,
                "source": "virustotal",
                "malicious": malicious,
                "total": total,
                "malware_family": family,
                "file_type": attr.get("type_description") or attr.get("type_tag") or "",
                "first_seen": str(attr.get("first_submission_date") or ""),
                "last_seen": str(attr.get("last_analysis_date") or ""),
                "contacted_ips": [],
                "contacted_domains": [],
                "dropped_hashes": [],
            }

            # Try behaviour sandbox data; any failure here leaves the core
            # result untouched.
            try:
                async with session.get(
                    f"{VT_BASE_URL}/files/{hash_value}/behaviours"
                ) as behav_resp:
                    if behav_resp.status == 200:
                        behav_data = await behav_resp.json()
                        # Insertion-ordered dicts act as ordered sets so the
                        # truncation below always keeps the same entries
                        # (a plain set has no stable order for strings).
                        all_ips: dict[str, None] = {}
                        all_domains: dict[str, None] = {}
                        all_hashes: dict[str, None] = {}
                        queried = hash_value.lower()

                        for behav in (behav_data.get("data") or [])[:3]:
                            if not isinstance(behav, dict):
                                continue
                            ba = behav.get("attributes") or {}

                            for entry in (ba.get("ip_traffic") or [])[:30]:
                                ip = (
                                    entry.get("destination_ip")
                                    if isinstance(entry, dict) else str(entry)
                                )
                                if isinstance(ip, str) and _IP_RE.match(ip):
                                    all_ips.setdefault(ip, None)

                            for entry in (ba.get("dns_lookups") or [])[:30]:
                                d = (
                                    entry.get("hostname")
                                    if isinstance(entry, dict) else str(entry)
                                )
                                # .onion hostnames are deliberately excluded.
                                if isinstance(d, str) and d and not d.endswith(".onion"):
                                    all_domains.setdefault(d, None)

                            for dropped in (ba.get("files_dropped") or [])[:10]:
                                if isinstance(dropped, dict):
                                    sha = dropped.get("sha256") or ""
                                    # Do not report the sample as its own dropped file.
                                    if isinstance(sha, str) and sha and sha.lower() != queried:
                                        all_hashes.setdefault(sha, None)

                        result["contacted_ips"] = list(all_ips)[:MAX_IPS_PER_HASH]
                        result["contacted_domains"] = list(all_domains)[:MAX_DOMAINS_PER_HASH]
                        result["dropped_hashes"] = list(all_hashes)[:5]
            except Exception as behav_exc:
                logger.debug(
                    "hash_reputation: VT behaviours unavailable for %s: %s",
                    hash_value[:16], behav_exc,
                )

            return result

    except asyncio.TimeoutError:
        logger.warning("hash_reputation: VirusTotal timeout for %s", hash_value[:16])
        return {"found": False, "source": "virustotal_timeout"}
    except Exception as exc:
        logger.debug("hash_reputation: VirusTotal failed for %s: %s", hash_value[:16], exc)
        return {"found": False, "source": "virustotal_error"}
397
+
398
+
399
+ # ---------------------------------------------------------------------------
400
+ # Core reputation check
401
+ # ---------------------------------------------------------------------------
402
+
403
def _make_enrichment_entity(
    entity_type: str,
    value: str,
    confidence: float,
    source: str,
    context_snippet: str,
) -> dict[str, Any]:
    """Build one ``new_entities`` record; ``canonical_value`` mirrors ``value``."""
    return {
        "entity_type": entity_type,
        "value": value,
        "canonical_value": value,
        "confidence": confidence,
        "source": source,
        "extraction_method": "enrich",
        "context_snippet": context_snippet,
    }


async def check_hash_reputation(
    hash_value: str,
    hash_type: str = "FILE_HASH_SHA256",
    base_confidence: float = 1.0,
) -> dict[str, Any]:
    """
    Run all four reputation checks for a single file hash concurrently.

    Args:
        hash_value: MD5, SHA-1 or SHA-256 hex digest to look up.
        hash_type: Entity type label copied into the result.
        base_confidence: Accepted for interface compatibility; this function
            does not currently read it.

    Returns a dict with keys:
        hash, hash_type, verdict, malware_families, threat_score,
        av_detections, av_total, file_type, first_seen, new_entities,
        tags, confidence_delta, suppress (always False for hashes)

    ``confidence_delta`` is +0.15 for a malicious Hybrid Analysis verdict and
    +0.10 each for a MalwareBazaar or ThreatFox hit, capped at 0.35.

    Results are cached in ``_hash_cache`` for HASH_CACHE_TTL seconds, keyed
    by *hash_value*. The returned dict is the cached object itself, so
    callers should treat it as read-only. An invalid hash returns the empty
    default result and is not cached.
    """
    # Serve from cache when fresh
    cached = _hash_cache.get(hash_value)
    if cached and (time.time() - cached["loaded_at"]) < HASH_CACHE_TTL:
        return cached["result"]

    result: dict[str, Any] = {
        "hash": hash_value,
        "hash_type": hash_type,
        "verdict": None,
        "malware_families": [],
        "threat_score": None,
        "av_detections": None,
        "av_total": None,
        "file_type": None,
        "first_seen": None,
        "new_entities": [],
        "tags": [],
        "confidence_delta": 0.0,
        "suppress": False,
    }

    if not _is_valid_hash(hash_value):
        return result

    outcomes = await asyncio.gather(
        query_hybrid_analysis(hash_value),
        query_malwarebazaar(hash_value),
        query_threatfox(hash_value),
        query_virustotal_hash(hash_value),
        return_exceptions=True,
    )

    # A failed lookup degrades to "not found" rather than failing the hash.
    # BaseException is checked because gather() can also hand back
    # CancelledError instances, which are not Exception subclasses.
    lookups: list[dict[str, Any]] = []
    for label, outcome in zip(("HA", "MB", "TF", "VT"), outcomes):
        if isinstance(outcome, BaseException):
            logger.debug(
                "hash_reputation: %s exception for %s: %s",
                label, hash_value[:16], outcome,
            )
            outcome = {"found": False}
        lookups.append(outcome)
    ha_result, mb_result, tf_result, vt_result = lookups

    short_hash = hash_value[:16]
    new_entities: list[dict[str, Any]] = result["new_entities"]
    tags: list[str] = result["tags"]

    # Track malware family agreement across sources {slug: count}
    family_count: dict[str, int] = {}
    family_names: dict[str, str] = {}

    def _add_family(name: str) -> None:
        """Count one source's vote for *name*; the first spelling seen is kept."""
        if not name:
            return
        slug = _normalize_family(name)
        if not slug:
            return
        family_count[slug] = family_count.get(slug, 0) + 1
        if slug not in family_names:
            family_names[slug] = name

    # ── Hybrid Analysis ───────────────────────────────────────────────────────
    if ha_result.get("found"):
        verdict = ha_result.get("verdict") or ""
        if "malicious" in verdict:
            result["verdict"] = "malicious"
            tags.append("hybrid_analysis_malicious")
            result["confidence_delta"] += 0.15
        elif "suspicious" in verdict:
            result["verdict"] = result["verdict"] or "suspicious"
            tags.append("hybrid_analysis_suspicious")
        elif "no" in verdict and "threat" in verdict:
            result["verdict"] = result["verdict"] or "no_specific_threat"
            tags.append("hybrid_analysis_clean")

        if ha_result.get("malware_family"):
            slug = _normalize_family(ha_result["malware_family"])
            _add_family(ha_result["malware_family"])
            if slug:
                tags.append(f"malware_family_{slug}")

        if ha_result.get("threat_score") is not None:
            result["threat_score"] = ha_result["threat_score"]
            tags.append(f"threat_score_{ha_result['threat_score']}")

        if ha_result.get("av_detections") is not None:
            result["av_detections"] = ha_result["av_detections"]
        if ha_result.get("av_total") is not None:
            result["av_total"] = ha_result["av_total"]

        if ha_result.get("file_type"):
            result["file_type"] = ha_result["file_type"]

        ha_snippet = f"Contacted by {short_hash}... (Hybrid Analysis sandbox)"
        for ip in (ha_result.get("contacted_ips") or [])[:MAX_IPS_PER_HASH]:
            new_entities.append(_make_enrichment_entity(
                "IP_ADDRESS", ip, 0.82, "hybrid_analysis", ha_snippet,
            ))
        for domain in (ha_result.get("contacted_domains") or [])[:MAX_DOMAINS_PER_HASH]:
            new_entities.append(_make_enrichment_entity(
                "DOMAIN", domain, 0.80, "hybrid_analysis", ha_snippet,
            ))

    # ── MalwareBazaar ─────────────────────────────────────────────────────────
    if mb_result.get("found"):
        result["confidence_delta"] += 0.10
        tags.append("malwarebazaar_confirmed")

        if mb_result.get("malware_family"):
            _add_family(mb_result["malware_family"])

        if mb_result.get("file_type") and not result["file_type"]:
            result["file_type"] = mb_result["file_type"]
        if mb_result.get("first_seen") and not result["first_seen"]:
            result["first_seen"] = mb_result["first_seen"]

    # ── ThreatFox ─────────────────────────────────────────────────────────────
    if tf_result.get("found"):
        result["confidence_delta"] += 0.10
        tags.append("threatfox_confirmed")

        if tf_result.get("malware_family"):
            _add_family(tf_result["malware_family"])
        if tf_result.get("first_seen") and not result["first_seen"]:
            result["first_seen"] = tf_result["first_seen"]

        # Associated IOCs as new entities
        tf_snippet = f"Associated IOC from ThreatFox with {short_hash}..."
        for ioc in (tf_result.get("associated_iocs") or [])[:5]:
            ioc_type = ioc.get("ioc_type") or ""
            ioc_value = ioc.get("ioc_value") or ""
            if not ioc_type or not ioc_value:
                continue
            entity_type = None
            if "ip" in ioc_type:
                # ThreatFox "ip:port" IOCs carry the port in the value
                # ("1.2.3.4:443"). Keep only the address so the IP_ADDRESS
                # entity matches IPs produced by the other sources, and drop
                # anything that is not a dotted-quad address.
                ioc_value = ioc_value.split(":", 1)[0]
                if not _IP_RE.match(ioc_value):
                    continue
                entity_type = "IP_ADDRESS"
            elif "domain" in ioc_type:
                entity_type = "DOMAIN"
            elif "sha256" in ioc_type:
                entity_type = "FILE_HASH_SHA256"
            elif "sha1" in ioc_type:
                entity_type = "FILE_HASH_SHA1"
            elif "md5" in ioc_type:
                entity_type = "FILE_HASH_MD5"
            if entity_type:
                new_entities.append(_make_enrichment_entity(
                    entity_type, ioc_value, 0.78, "threatfox", tf_snippet,
                ))

    # ── VirusTotal ────────────────────────────────────────────────────────────
    if vt_result.get("found"):
        if vt_result.get("malware_family"):
            _add_family(vt_result["malware_family"])

        # Hybrid Analysis figures win; VirusTotal only fills the gaps.
        if result["av_detections"] is None and vt_result.get("malicious") is not None:
            result["av_detections"] = vt_result["malicious"]
        if result["av_total"] is None and vt_result.get("total"):
            result["av_total"] = vt_result["total"]
        if vt_result.get("file_type") and not result["file_type"]:
            result["file_type"] = vt_result["file_type"]
        if vt_result.get("first_seen") and not result["first_seen"]:
            result["first_seen"] = vt_result["first_seen"]

        # Fallback verdict from VT when HA is unavailable
        # NOTE(review): the tags appended here carry the "hybrid_analysis_"
        # prefix even though the verdict comes from the VirusTotal detection
        # ratio. They are left unchanged because consumers outside this view
        # may key on them — confirm and rename if not.
        if result["verdict"] is None:
            mal = vt_result.get("malicious", 0) or 0
            tot = vt_result.get("total", 0) or 0
            if tot > 0:
                ratio = mal / tot
                if ratio > 0.5:
                    result["verdict"] = "malicious"
                    tags.append("hybrid_analysis_malicious")
                elif ratio > 0.1:
                    result["verdict"] = "suspicious"
                    tags.append("hybrid_analysis_suspicious")
                else:
                    result["verdict"] = "no_specific_threat"
                    tags.append("hybrid_analysis_clean")

        # Network IOCs from sandbox behaviours; skip values another source
        # has already contributed.
        vt_snippet = f"Contacted by {short_hash}... (VirusTotal sandbox)"
        existing_ips = {e["value"] for e in new_entities if e["entity_type"] == "IP_ADDRESS"}
        for ip in (vt_result.get("contacted_ips") or [])[:MAX_IPS_PER_HASH]:
            if ip not in existing_ips:
                new_entities.append(_make_enrichment_entity(
                    "IP_ADDRESS", ip, 0.82, "virustotal", vt_snippet,
                ))

        existing_domains = {e["value"] for e in new_entities if e["entity_type"] == "DOMAIN"}
        for domain in (vt_result.get("contacted_domains") or [])[:MAX_DOMAINS_PER_HASH]:
            if domain not in existing_domains:
                new_entities.append(_make_enrichment_entity(
                    "DOMAIN", domain, 0.80, "virustotal", vt_snippet,
                ))

        for sha256 in (vt_result.get("dropped_hashes") or []):
            new_entities.append(_make_enrichment_entity(
                "FILE_HASH_SHA256", sha256, 0.75, "virustotal",
                f"File dropped by {short_hash}... (VirusTotal sandbox)",
            ))

    # ── AV detection tag ──────────────────────────────────────────────────────
    if result["av_detections"] is not None and result["av_total"]:
        n = result["av_detections"]
        t = result["av_total"]
        tags.append(f"av_detections_{n}_of_{t}")

    # ── Aggregate malware families, most-agreed first ─────────────────────────
    result["malware_families"] = [
        family_names[slug]
        for slug, _ in sorted(family_count.items(), key=lambda x: -x[1])
    ]

    # ── MALWARE_FAMILY entity when confirmed by 2+ sources ────────────────────
    for slug, count in family_count.items():
        if count >= 2:
            display_name = family_names[slug]
            new_entities.append(_make_enrichment_entity(
                "MALWARE_FAMILY", display_name, 0.90, "hash_enrichment",
                (
                    f"Identified as {display_name} by {count} source(s) "
                    f"for hash {hash_value[:20]}"
                ),
            ))
            break  # one MALWARE_FAMILY entity per hash is enough

    result["confidence_delta"] = min(result["confidence_delta"], 0.35)

    _hash_cache[hash_value] = {"result": result, "loaded_at": time.time()}
    return result
687
+
688
+
689
+ # ---------------------------------------------------------------------------
690
+ # DB helpers (sync — called via asyncio.to_thread)
691
+ # ---------------------------------------------------------------------------
692
+
693
def _update_hash_entities_in_db(
    updates: list[tuple[str, str, float, list[str]]],
) -> None:
    """
    Update confidence and corroborating_sources for enriched hash entities.

    *updates* is a list of (entity_type, hash_value, new_confidence, tags).

    For each tuple the first ``Entity`` row matching (entity_type, value) is
    looked up; tuples without a matching row are skipped.  Confidence is only
    raised, never lowered.  Tags are appended to the JSON list stored in
    ``corroborating_sources`` without duplicating entries already present.

    A row whose stored ``corroborating_sources`` cannot be decoded as a JSON
    list is left untouched (its tag merge is skipped and a warning logged) so
    that a single bad row does not abort the updates for the remaining hashes.

    Does nothing when ``DATABASE_URL`` is unset or *updates* is empty.  This
    is a best-effort step: any other exception is logged as a warning and
    swallowed rather than propagated to the caller.
    """
    if not os.getenv("DATABASE_URL") or not updates:
        return
    try:
        # Imported lazily so the module can be loaded without the DB layer.
        from db.session import get_session
        from db.models import Entity

        with get_session() as session:
            for entity_type, hash_val, confidence, tags in updates:
                db_entity = session.query(Entity).filter(
                    Entity.entity_type == entity_type,
                    Entity.value == hash_val,
                ).first()
                if db_entity is None:
                    continue

                # Never lower a confidence that is already stored.
                if confidence > (db_entity.confidence or 0.0):
                    db_entity.confidence = confidence

                if not tags:
                    continue

                try:
                    existing = json.loads(db_entity.corroborating_sources or "[]")
                except (TypeError, ValueError) as exc:
                    # json.JSONDecodeError is a ValueError subclass.
                    logger.warning(
                        "hash_reputation: unreadable corroborating_sources "
                        "for %s, tags not merged: %s",
                        str(hash_val)[:16], exc,
                    )
                    continue
                if not isinstance(existing, list):
                    logger.warning(
                        "hash_reputation: corroborating_sources for %s is "
                        "not a JSON list, tags not merged",
                        str(hash_val)[:16],
                    )
                    continue

                for tag in tags:
                    if tag not in existing:
                        existing.append(tag)
                db_entity.corroborating_sources = json.dumps(existing)

            # Single commit once every update has been applied to the session.
            session.commit()
    except Exception as exc:
        logger.warning("hash_reputation: DB update failed: %s", exc)
726
+
727
+
728
+ # ---------------------------------------------------------------------------
729
+ # Pipeline integration
730
+ # ---------------------------------------------------------------------------
731
+
732
async def enrich_hash_entities(
    extraction_results: list,
    investigation_id: Any,
) -> tuple[list, dict]:
    """
    Post-extraction hash reputation enrichment step.

    Collects entities whose ``entity_type`` is a key of HASH_TYPES from
    *extraction_results*, de-duplicated by hash value.  Hashes are processed
    in HASH_TYPES priority order (lowest rank first) and capped at MAX_HASHES
    per call.  Entities whose value is missing, not a string, empty after
    stripping, or rejected by ``_is_valid_hash`` are ignored.

    All reputation checks run concurrently.  A check that fails (including
    one that was cancelled) is logged at debug level and skipped; it does not
    abort the remaining hashes.

    Updates confidence and corroborating_sources for existing hash entities
    in the DB.  New entities reported by the checks are NOT persisted or
    returned here: only their count is logged and reported in the stats
    (``new_entities_discovered``).

    *investigation_id* is accepted for pipeline-signature compatibility and
    is not used by this function.

    Returns (extraction_results, stats_dict).  *extraction_results* is
    returned unchanged.  When no hash entity is found, stats_dict is just
    ``{"hash_reputation": "ok_0_hashes"}``.
    """
    seen: dict[str, tuple[str, float]] = {}  # hash_value → (entity_type, confidence)

    for exr in extraction_results:
        # Tolerate results with no ``entities`` attribute or with it set to None.
        for entity in getattr(exr, "entities", None) or ():
            et = getattr(entity, "entity_type", "")
            if et not in HASH_TYPES:
                continue
            raw_value = getattr(entity, "value", "")
            # A present-but-None (or otherwise non-string) value cannot be a hash.
            hv = raw_value.strip() if isinstance(raw_value, str) else ""
            if not hv or not _is_valid_hash(hv):
                continue
            if hv not in seen:
                seen[hv] = (et, getattr(entity, "confidence", 1.0))
            else:
                existing_type, _ = seen[hv]
                # Keep the higher-priority (lower-rank) type for a repeated value.
                if HASH_TYPES.get(et, 99) < HASH_TYPES.get(existing_type, 99):
                    seen[hv] = (et, getattr(entity, "confidence", 1.0))

    if not seen:
        return extraction_results, {"hash_reputation": "ok_0_hashes"}

    # Sort by HASH_TYPES priority so the cap below drops the lowest-priority hashes
    sorted_hashes = sorted(
        seen.items(),
        key=lambda x: HASH_TYPES.get(x[1][0], 99),
    )

    if len(sorted_hashes) > MAX_HASHES:
        logger.info(
            "hash_reputation: capping to %d of %d unique hashes",
            MAX_HASHES, len(sorted_hashes),
        )
        sorted_hashes = sorted_hashes[:MAX_HASHES]

    logger.info("hash_reputation: checking %d unique hash(es)", len(sorted_hashes))

    # return_exceptions=True: one failing lookup must not cancel the others.
    rep_list = await asyncio.gather(
        *[
            check_hash_reputation(hv, ht, base_confidence=conf)
            for hv, (ht, conf) in sorted_hashes
        ],
        return_exceptions=True,
    )

    db_updates: list[tuple[str, str, float, list[str]]] = []
    all_new_entities: list[dict] = []
    stats = {
        "hashes_checked": len(sorted_hashes),
        "malicious": 0,
        "suspicious": 0,
        "clean": 0,
        "malware_families_found": 0,
        "new_entities_discovered": 0,
    }

    for (hv, (ht, base_conf)), rep in zip(sorted_hashes, rep_list):
        # BaseException, not Exception: asyncio.CancelledError derives from
        # BaseException and can also be delivered as a gather() result.
        if isinstance(rep, BaseException):
            logger.debug("hash_reputation: check raised for %s: %s", hv[:16], rep)
            continue

        verdict = rep.get("verdict")
        if verdict == "malicious":
            stats["malicious"] += 1
        elif verdict == "suspicious":
            stats["suspicious"] += 1
        elif verdict == "no_specific_threat":
            stats["clean"] += 1

        if rep.get("malware_families"):
            stats["malware_families_found"] += 1

        tags = rep.get("tags", [])
        # Confidence is clamped so it never exceeds 1.0.
        new_conf = min(base_conf + rep.get("confidence_delta", 0.0), 1.0)

        new_entities = rep.get("new_entities", [])
        all_new_entities.extend(new_entities)
        stats["new_entities_discovered"] += len(new_entities)

        # Only touch the DB when the check actually contributed something.
        if tags or rep.get("confidence_delta", 0) > 0:
            db_updates.append((ht, hv, new_conf, tags))

    if db_updates:
        # The DB helper is synchronous; run it off the event loop.
        await asyncio.to_thread(_update_hash_entities_in_db, db_updates)

    if all_new_entities:
        logger.info(
            "hash_reputation: %d new entities discovered (IPs, domains, families)",
            len(all_new_entities),
        )

    checked = stats["hashes_checked"]
    status = (
        f"ok_{checked}_hashes"
        f"_{stats['malicious']}_malicious"
        f"_{stats['suspicious']}_suspicious"
    )

    logger.info(
        "hash_reputation: done — %d checked, %d malicious, %d suspicious, "
        "%d clean, %d families, %d new entities",
        checked,
        stats["malicious"],
        stats["suspicious"],
        stats["clean"],
        stats["malware_families_found"],
        stats["new_entities_discovered"],
    )

    return extraction_results, {"hash_reputation": status, **stats}