voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,643 @@
1
+ """
2
+ sources/domain_reputation.py — Domain reputation enrichment.
3
+
4
+ Enriches extracted DOMAIN entities with infrastructure profiles from three sources:
5
+ - crt.sh (Certificate Transparency): subdomain enumeration — free, no auth
6
+ - URLScan.io: live scan data, malicious indicators, communicating IPs
7
+ - Wayback Machine (Internet Archive): historical snapshots for taken-down domains
8
+
9
+ All three sources queried concurrently per domain. Results are cached.
10
+ New subdomain/IP entities are returned in the result for pipeline reporting.
11
+ Existing DOMAIN entities get confidence and tag updates written to the DB.
12
+
13
+ Public interface
14
+ ----------------
15
+ async query_crt_sh(domain) → list[dict]
16
+ async query_urlscan(domain) → dict
17
+ async query_wayback(domain) → dict
18
+ async check_domain_reputation(domain, base_confidence=1.0) → dict
19
+ async enrich_domain_entities(extraction_results, investigation_id) → (results, stats)
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import json
26
+ import logging
27
+ import os
28
+ import re
29
+ import time
30
+ from typing import Any
31
+
32
+ import aiohttp
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
# Per-run caps: bound the number of outbound lookups and of derived entities.
MAX_DOMAINS = 30
MAX_SUBDOMAINS_PER_DOMAIN = 20
MAX_IPS_PER_DOMAIN = 5

# Endpoint templates; "{domain}" is substituted with str.format().
# The "%." prefix in the crt.sh query is the wildcard for "any name under".
CRT_SH_URL = "https://crt.sh/?q=%.{domain}&output=json"
URLSCAN_SEARCH_URL = "https://urlscan.io/api/v1/search/?q=domain:{domain}&size=5"
URLSCAN_SUBMIT_URL = "https://urlscan.io/api/v1/scan/"
# HTTPS, so the domains under investigation are not sent in clear text.
WAYBACK_CDX_URL = (
    "https://web.archive.org/cdx/search/cdx"
    "?url={domain}&output=json&limit=5&fl=timestamp,statuscode,mimetype"
)

# In-memory per-domain caches (module-level singletons, keyed by domain).
# Each entry carries a "loaded_at" epoch timestamp next to its payload
# ("subdomains" for crt.sh, "result" for URLScan and Wayback).
_crt_cache: dict[str, dict] = {}
_urlscan_cache: dict[str, dict] = {}
_wayback_cache: dict[str, dict] = {}

# Cache lifetimes in seconds.
CRT_CACHE_TTL = 86400.0      # 24 h
URLSCAN_CACHE_TTL = 21600.0  # 6 h
WAYBACK_CACHE_TTL = 86400.0  # 24 h
56
+
57
+ _DOMAIN_RE = re.compile(
58
+ r"^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+$"
59
+ )
60
+
61
+ _PRIVATE_SUFFIXES = (".local", ".internal", ".test", ".example", ".invalid", ".localhost")
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Helpers
66
+ # ---------------------------------------------------------------------------
67
+
68
+ def _is_onion(domain: str) -> bool:
69
+ return domain.lower().strip().endswith(".onion")
70
+
71
+
72
+ def _is_private_domain(domain: str) -> bool:
73
+ d = domain.lower().strip()
74
+ if d == "localhost":
75
+ return True
76
+ return any(d.endswith(s) for s in _PRIVATE_SUFFIXES)
77
+
78
+
79
+ def _is_valid_domain(value: str) -> bool:
80
+ if not value or len(value) < 4 or "." not in value:
81
+ return False
82
+ if value.endswith(".onion"):
83
+ return False
84
+ return bool(_DOMAIN_RE.match(value))
85
+
86
+
87
+ def _parse_wayback_timestamp(ts: str) -> str | None:
88
+ """Convert 14-char Wayback timestamp (YYYYMMDDHHmmss) to ISO date (YYYY-MM-DD)."""
89
+ try:
90
+ ts = ts.strip()
91
+ if len(ts) >= 8:
92
+ return f"{ts[:4]}-{ts[4:6]}-{ts[6:8]}"
93
+ return None
94
+ except Exception:
95
+ return None
96
+
97
+
98
+ def _is_established_domain(first_seen: str | None) -> bool:
99
+ """True if first Wayback snapshot is older than 5 years."""
100
+ if not first_seen:
101
+ return False
102
+ try:
103
+ from datetime import datetime, timezone
104
+ dt = datetime.fromisoformat(first_seen)
105
+ if dt.tzinfo is None:
106
+ dt = dt.replace(tzinfo=timezone.utc)
107
+ return (datetime.now(timezone.utc) - dt).days > 1825
108
+ except Exception:
109
+ return False
110
+
111
+
112
+ def _is_newly_observed(first_seen: str | None) -> bool:
113
+ """True if first Wayback snapshot is younger than 90 days."""
114
+ if not first_seen:
115
+ return False
116
+ try:
117
+ from datetime import datetime, timezone
118
+ dt = datetime.fromisoformat(first_seen)
119
+ if dt.tzinfo is None:
120
+ dt = dt.replace(tzinfo=timezone.utc)
121
+ return (datetime.now(timezone.utc) - dt).days < 90
122
+ except Exception:
123
+ return False
124
+
125
+
126
+ # ---------------------------------------------------------------------------
127
+ # crt.sh — Certificate Transparency
128
+ # ---------------------------------------------------------------------------
129
+
130
async def query_crt_sh(domain: str) -> list[dict]:
    """
    Query crt.sh for subdomains found in certificate transparency logs.

    Returns list of dicts with keys: name, first_seen, last_seen, issuer.
    Wildcards (*.example.com) and the parent domain itself are filtered out.
    Results capped at MAX_SUBDOMAINS_PER_DOMAIN. Cached for 24 h.

    Network/HTTP failures and unexpected payloads return an empty list and are
    not cached, so the next call retries.
    """
    # Certificate names are lower-cased below, so the parent must be normalized
    # the same way or a mixed-case input would never match any entry.
    parent = domain.strip().lower()

    cached = _crt_cache.get(parent)
    if cached and (time.time() - cached["loaded_at"]) < CRT_CACHE_TTL:
        return cached["subdomains"]

    url = CRT_SH_URL.format(domain=parent)
    try:
        timeout = aiohttp.ClientTimeout(connect=10, sock_read=120)
        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(url) as resp:
                if resp.status != 200:
                    logger.debug("domain_reputation: crt.sh %s → HTTP %s", domain, resp.status)
                    return []
                data = await resp.json(content_type=None)
    except Exception as exc:
        logger.debug("domain_reputation: crt.sh failed for %s: %s", domain, exc)
        return []

    if data is None:
        data = []
    if not isinstance(data, list):
        logger.debug("domain_reputation: crt.sh failed for %s: %s", domain, "unexpected payload")
        return []

    suffix = f".{parent}"
    seen: set[str] = set()
    results: list[dict] = []
    for entry in data:
        if len(results) >= MAX_SUBDOMAINS_PER_DOMAIN:
            break
        if not isinstance(entry, dict):
            continue
        raw = entry.get("name_value")
        if not isinstance(raw, str):
            continue
        # name_value may contain newline-separated entries
        for name in raw.strip().lower().split("\n"):
            if len(results) >= MAX_SUBDOMAINS_PER_DOMAIN:
                break
            name = name.strip()
            if not name or name.startswith("*"):
                continue
            # Requiring the ".<parent>" suffix also excludes the parent itself.
            if not name.endswith(suffix):
                continue
            if name in seen or not _is_valid_domain(name):
                continue
            seen.add(name)
            results.append({
                "name": name,
                "first_seen": entry.get("not_before", ""),
                "last_seen": entry.get("not_after", ""),
                "issuer": entry.get("issuer_name", ""),
            })

    _crt_cache[parent] = {"subdomains": results, "loaded_at": time.time()}
    logger.debug("domain_reputation: crt.sh %s → %d subdomains", domain, len(results))
    return results
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # URLScan.io
194
+ # ---------------------------------------------------------------------------
195
+
196
async def query_urlscan(domain: str) -> dict[str, Any]:
    """
    Query URLScan.io search API for the most recent scans of a domain.

    Returns dict: malicious, tags, categories, ips, technologies, screenshot_url.
    Uses URLSCAN_API_KEY env var if present (higher rate limits).
    Cached for 6 h.

    Rate-limit (HTTP 429) and server-error (5xx) responses, as well as
    network failures, return the empty result without caching it, so a
    transient outage is not remembered as "no findings" for six hours.
    """
    cached = _urlscan_cache.get(domain)
    if cached and (time.time() - cached["loaded_at"]) < URLSCAN_CACHE_TTL:
        return cached["result"]

    empty: dict[str, Any] = {
        "malicious": False,
        "tags": [],
        "categories": [],
        "ips": [],
        "technologies": [],
        "screenshot_url": None,
    }

    api_key = (os.getenv("URLSCAN_API_KEY") or "").strip()
    headers: dict[str, str] = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
    if api_key:
        headers["API-Key"] = api_key

    url = URLSCAN_SEARCH_URL.format(domain=domain)
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(url) as resp:
                if resp.status != 200:
                    logger.debug("domain_reputation: URLScan %s → HTTP %s", domain, resp.status)
                    transient = resp.status == 429 or resp.status >= 500
                    if not transient:
                        _urlscan_cache[domain] = {"result": empty, "loaded_at": time.time()}
                    return empty
                data = await resp.json()
    except Exception as exc:
        logger.debug("domain_reputation: URLScan failed for %s: %s", domain, exc)
        return empty

    scan_list = data.get("results") if isinstance(data, dict) else None
    if not scan_list or not isinstance(scan_list, list):
        _urlscan_cache[domain] = {"result": empty, "loaded_at": time.time()}
        return empty

    malicious = False
    all_tags: list[str] = []
    all_categories: list[str] = []
    seen_ips: list[str] = []
    all_tech: list[str] = []
    screenshot_url: str | None = None

    # Every nested lookup uses "or {}" / "or []": the API may send explicit
    # nulls, which dict.get's default argument does not replace.
    for scan in scan_list[:5]:
        if not isinstance(scan, dict):
            continue

        overall = (scan.get("verdicts") or {}).get("overall") or {}
        if overall.get("malicious"):
            malicious = True
        all_tags.extend(t for t in (overall.get("tags") or []) if isinstance(t, str))
        all_categories.extend(
            c for c in (overall.get("categories") or []) if isinstance(c, str)
        )

        for ip in (scan.get("lists") or {}).get("ips") or []:
            if len(seen_ips) >= MAX_IPS_PER_DOMAIN:
                break
            if isinstance(ip, str) and ip not in seen_ips:
                seen_ips.append(ip)

        processors = (scan.get("meta") or {}).get("processors") or {}
        wappa = processors.get("wappa") or {}
        for tech in wappa.get("data") or []:
            if not isinstance(tech, dict):
                continue
            name = tech.get("app") or tech.get("name") or ""
            if name and name not in all_tech:
                all_tech.append(name)

        if screenshot_url is None:
            screenshot_url = (scan.get("task") or {}).get("screenshotURL")

    result: dict[str, Any] = {
        "malicious": malicious,
        # dict.fromkeys de-duplicates while keeping first-seen order.
        "tags": list(dict.fromkeys(all_tags))[:10],
        "categories": list(dict.fromkeys(all_categories))[:5],
        "ips": seen_ips[:MAX_IPS_PER_DOMAIN],
        "technologies": all_tech[:10],
        "screenshot_url": screenshot_url,
    }

    _urlscan_cache[domain] = {"result": result, "loaded_at": time.time()}
    logger.debug(
        "domain_reputation: URLScan %s → malicious=%s, %d IPs",
        domain, malicious, len(result["ips"]),
    )
    return result
286
+
287
+
288
+ async def _submit_urlscan(domain: str, api_key: str) -> None:
289
+ """Submit a new public scan to URLScan.io — only when URLSCAN_SUBMIT=true."""
290
+ if (os.getenv("URLSCAN_SUBMIT") or "false").lower().strip() != "true":
291
+ return
292
+ if not api_key:
293
+ return
294
+ try:
295
+ payload = {"url": f"https://{domain}", "visibility": "public"}
296
+ headers = {"API-Key": api_key, "Content-Type": "application/json"}
297
+ timeout = aiohttp.ClientTimeout(total=15)
298
+ async with aiohttp.ClientSession(timeout=timeout) as session:
299
+ async with session.post(URLSCAN_SUBMIT_URL, json=payload, headers=headers) as resp:
300
+ if resp.status in (200, 201):
301
+ logger.debug("domain_reputation: URLScan scan submitted for %s", domain)
302
+ else:
303
+ logger.debug(
304
+ "domain_reputation: URLScan submit %s → HTTP %s", domain, resp.status
305
+ )
306
+ except Exception as exc:
307
+ logger.debug("domain_reputation: URLScan submit failed for %s: %s", domain, exc)
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Wayback Machine
312
+ # ---------------------------------------------------------------------------
313
+
314
async def _fetch_wayback_captures(
    session: "aiohttp.ClientSession",
    url: str,
    domain: str,
) -> list[tuple[str, str]] | None:
    """Fetch one CDX page as (timestamp, status_code) pairs; None on a non-200 reply."""
    async with session.get(url) as resp:
        if resp.status != 200:
            logger.debug("domain_reputation: Wayback %s → HTTP %s", domain, resp.status)
            return None
        rows = await resp.json(content_type=None)

    if not isinstance(rows, list):
        return []
    # CDX returns list-of-lists; first row is the header
    return [
        (str(row[0]), str(row[1]))
        for row in rows[1:]
        if isinstance(row, list) and len(row) >= 2
    ]


async def query_wayback(domain: str) -> dict[str, Any]:
    """
    Query the Wayback Machine CDX API for historical snapshots of a domain.

    Returns dict: exists, first_seen, last_seen, snapshot_url, likely_taken_down.
    A domain shows "likely_taken_down" when some capture returned 2xx
    and the most recent capture returned a 4xx status.
    Cached for 24 h.

    A positive CDX ``limit`` yields only the oldest captures. When that first
    page comes back full, a second request with the negated limit fetches the
    newest captures so that last_seen / snapshot_url / likely_taken_down
    describe the latest state. HTTP errors and network failures return the
    empty result without caching it.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    cached = _wayback_cache.get(domain)
    if cached and (time.time() - cached["loaded_at"]) < WAYBACK_CACHE_TTL:
        return cached["result"]

    empty: dict[str, Any] = {
        "exists": False,
        "first_seen": None,
        "last_seen": None,
        "snapshot_url": None,
        "likely_taken_down": False,
    }

    oldest_url = WAYBACK_CDX_URL.format(domain=domain)

    # Derive the "newest captures" URL from the configured template by
    # negating its limit parameter (CDX: limit=-N returns the last N rows).
    parts = urlsplit(oldest_url)
    params = parse_qsl(parts.query, keep_blank_values=True)
    page_size = 0
    for key, value in params:
        if key == "limit" and value.isdigit():
            page_size = int(value)
    newest_url: str | None = None
    if page_size > 0:
        newest_params = [(k, v) for k, v in params if k != "limit"]
        newest_params.append(("limit", str(-page_size)))
        newest_url = urlunsplit(
            parts._replace(query=urlencode(newest_params, safe=",:/"))
        )

    complete = True  # False when the newest-captures lookup could not be made
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            captures = await _fetch_wayback_captures(session, oldest_url, domain)
            if captures is None:
                return empty
            # A short first page already is the full history; skip request two.
            if newest_url and len(captures) >= page_size:
                try:
                    newest = await _fetch_wayback_captures(session, newest_url, domain)
                except Exception as exc:
                    logger.debug("domain_reputation: Wayback failed for %s: %s", domain, exc)
                    newest = None
                if newest is None:
                    complete = False
                else:
                    captures.extend(newest)
    except Exception as exc:
        logger.debug("domain_reputation: Wayback failed for %s: %s", domain, exc)
        return empty

    if not captures:
        _wayback_cache[domain] = {"result": empty, "loaded_at": time.time()}
        return empty

    # Sort timestamp/status pairs together so the "last status" really belongs
    # to the newest capture; the set() drops rows present on both pages.
    ordered = sorted(set(captures))
    oldest_ts = ordered[0][0]
    newest_ts, newest_status = ordered[-1]

    has_200 = any(status.startswith("2") for _, status in ordered)
    likely_taken_down = has_200 and newest_status.startswith("4")

    result: dict[str, Any] = {
        "exists": True,
        "first_seen": _parse_wayback_timestamp(oldest_ts),
        "last_seen": _parse_wayback_timestamp(newest_ts),
        "snapshot_url": f"https://web.archive.org/web/{newest_ts}/{domain}",
        "likely_taken_down": likely_taken_down,
    }

    if complete:
        _wayback_cache[domain] = {"result": result, "loaded_at": time.time()}
    logger.debug(
        "domain_reputation: Wayback %s → archived, taken_down=%s", domain, likely_taken_down
    )
    return result
389
+
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Core enrichment check
393
+ # ---------------------------------------------------------------------------
394
+
395
async def check_domain_reputation(
    domain: str,
    base_confidence: float = 1.0,
) -> dict[str, Any]:
    """
    Build a reputation profile for one domain from crt.sh, URLScan.io and Wayback.

    The three lookups run concurrently. A lookup that raises is logged at debug
    level and leaves its fields at their defaults. Onion and private/internal
    names are not queried at all and yield the default profile.

    *base_confidence* is part of the signature but is not read by this
    function; only the adjustment is reported, as ``confidence_delta``.

    Returns a dict with the keys:
        domain, crt_subdomains, urlscan_malicious, urlscan_tags, urlscan_ips,
        wayback_exists, wayback_first_seen, wayback_last_seen, likely_taken_down,
        new_entities, tags, confidence_delta
    """
    report: dict[str, Any] = {
        "domain": domain,
        "crt_subdomains": [],
        "urlscan_malicious": False,
        "urlscan_tags": [],
        "urlscan_ips": [],
        "wayback_exists": False,
        "wayback_first_seen": None,
        "wayback_last_seen": None,
        "likely_taken_down": False,
        "new_entities": [],
        "tags": [],
        "confidence_delta": 0.0,
    }

    if _is_onion(domain) or _is_private_domain(domain):
        return report

    ct_rows, scan_info, archive_info = await asyncio.gather(
        query_crt_sh(domain),
        query_urlscan(domain),
        query_wayback(domain),
        return_exceptions=True,
    )

    tag_list: list = report["tags"]
    discovered: list = report["new_entities"]

    # --- crt.sh ---
    if isinstance(ct_rows, Exception):
        logger.debug("domain_reputation: crt.sh error for %s: %s", domain, ct_rows)
    elif isinstance(ct_rows, list) and ct_rows:
        report["crt_subdomains"] = ct_rows
        tag_list.append("has_ct_history")
        tag_list.append(f"subdomain_count_{len(ct_rows)}")
        subdomain_names = [row.get("name", "") for row in ct_rows]
        discovered.extend(
            {
                "entity_type": "DOMAIN",
                "value": sub_name,
                "canonical_value": sub_name,
                "confidence": 0.70,
                "source": "crt_sh",
                "extraction_method": "domain_enrichment",
                "context_snippet": f"Subdomain of {domain} (certificate transparency logs)",
            }
            for sub_name in subdomain_names
            if sub_name
        )

    # --- URLScan.io ---
    if isinstance(scan_info, Exception):
        logger.debug("domain_reputation: URLScan error for %s: %s", domain, scan_info)
    elif isinstance(scan_info, dict):
        report["urlscan_malicious"] = scan_info.get("malicious", False)
        report["urlscan_tags"] = scan_info.get("tags", [])
        report["urlscan_ips"] = scan_info.get("ips", [])

        if scan_info.get("malicious"):
            tag_list.append("urlscan_malicious")
            report["confidence_delta"] += 0.10

        discovered.extend(
            {
                "entity_type": "IP_ADDRESS",
                "value": address,
                "canonical_value": address,
                "confidence": 0.72,
                "source": "urlscan",
                "extraction_method": "domain_enrichment",
                "context_snippet": f"IP communicating with {domain} (URLScan.io)",
            }
            for address in scan_info.get("ips", [])[:MAX_IPS_PER_DOMAIN]
        )

        # One "tech_<slug>" tag per detected technology (slug capped at 40 chars).
        for technology in scan_info.get("technologies", []):
            slug = re.sub(r"[^a-z0-9]+", "_", technology.lower())[:40]
            tag_list.append(f"tech_{slug}")

    # --- Wayback Machine ---
    if isinstance(archive_info, Exception):
        logger.debug("domain_reputation: Wayback error for %s: %s", domain, archive_info)
    elif isinstance(archive_info, dict):
        earliest = archive_info.get("first_seen")
        taken_down = archive_info.get("likely_taken_down", False)

        report["wayback_exists"] = archive_info.get("exists", False)
        report["wayback_first_seen"] = earliest
        report["wayback_last_seen"] = archive_info.get("last_seen")
        report["likely_taken_down"] = taken_down

        if archive_info.get("exists"):
            tag_list.append("wayback_archived")
            if archive_info.get("likely_taken_down"):
                tag_list.append("likely_taken_down")
            # "established" takes precedence; the two age tags are exclusive.
            if _is_established_domain(earliest):
                tag_list.append("established_domain")
            elif _is_newly_observed(earliest):
                tag_list.append("newly_observed_domain")

    return report
500
+
501
+
502
+ # ---------------------------------------------------------------------------
503
+ # DB helpers (sync — called via asyncio.to_thread)
504
+ # ---------------------------------------------------------------------------
505
+
506
+ def _update_domain_entities_in_db(
507
+ updates: list[tuple[str, float, list[str]]],
508
+ ) -> None:
509
+ """Update confidence and corroborating_sources for enriched DOMAIN entities."""
510
+ if not os.getenv("DATABASE_URL") or not updates:
511
+ return
512
+ try:
513
+ from db.session import get_session
514
+ from db.models import Entity
515
+
516
+ with get_session() as session:
517
+ for domain_val, confidence, tags in updates:
518
+ db_entity = session.query(Entity).filter(
519
+ Entity.entity_type == "DOMAIN",
520
+ Entity.value == domain_val,
521
+ ).first()
522
+ if db_entity is None:
523
+ continue
524
+ if confidence > (db_entity.confidence or 0.0):
525
+ db_entity.confidence = confidence
526
+ if tags:
527
+ existing: list = json.loads(db_entity.corroborating_sources or "[]")
528
+ for tag in tags:
529
+ if tag not in existing:
530
+ existing.append(tag)
531
+ db_entity.corroborating_sources = json.dumps(existing)
532
+ session.commit()
533
+ except Exception as exc:
534
+ logger.warning("domain_reputation: DB update failed: %s", exc)
535
+
536
+
537
+ # ---------------------------------------------------------------------------
538
+ # Pipeline integration
539
+ # ---------------------------------------------------------------------------
540
+
541
async def enrich_domain_entities(
    extraction_results: list,
    investigation_id: Any,
) -> tuple[list, dict]:
    """
    Post-extraction domain reputation enrichment step.

    Collects DOMAIN entities from *extraction_results*, skipping onion and
    private/internal domains as well as entities whose value is not a non-empty
    string. Queries crt.sh, URLScan.io, and Wayback Machine concurrently per
    domain (capped at MAX_DOMAINS, first-seen order).

    Updates confidence and tags for existing DOMAIN entities in the DB.
    New entities (subdomains, communicating IPs) are only counted and logged.

    *investigation_id* is accepted for pipeline compatibility; it is not read here.

    Returns (extraction_results, stats_dict). stats_dict always contains
    "domain_reputation" (status string) plus the counters domains_checked,
    ct_records, urlscan_malicious, wayback_archived, new_entities_discovered.
    """
    stats = {
        "domains_checked": 0,
        "ct_records": 0,
        "urlscan_malicious": 0,
        "wayback_archived": 0,
        "new_entities_discovered": 0,
    }

    # domain -> confidence of its first occurrence (dict keeps insertion order)
    seen: dict[str, float] = {}
    for exr in extraction_results:
        for entity in getattr(exr, "entities", []):
            if getattr(entity, "entity_type", "") != "DOMAIN":
                continue
            domain = getattr(entity, "value", None)
            # A missing/None/blank value cannot be looked up; skip, don't crash.
            if not isinstance(domain, str) or not domain.strip():
                continue
            if _is_onion(domain) or _is_private_domain(domain):
                continue
            if domain not in seen:
                confidence = getattr(entity, "confidence", 1.0)
                # An explicit None gets the same default as a missing attribute.
                if not isinstance(confidence, (int, float)):
                    confidence = 1.0
                seen[domain] = confidence

    unique_domains = list(seen)
    if not unique_domains:
        # Same shape as the normal return, with every counter at zero.
        return extraction_results, {"domain_reputation": "ok_0_domains", **stats}

    if len(unique_domains) > MAX_DOMAINS:
        logger.info(
            "domain_reputation: capping to %d of %d unique domains",
            MAX_DOMAINS, len(unique_domains),
        )
        unique_domains = unique_domains[:MAX_DOMAINS]

    logger.info("domain_reputation: enriching %d unique domain(s)", len(unique_domains))

    rep_list = await asyncio.gather(
        *[check_domain_reputation(d, base_confidence=seen[d]) for d in unique_domains],
        return_exceptions=True,
    )

    db_updates: list[tuple[str, float, list[str]]] = []
    all_new_entities: list[dict] = []
    stats["domains_checked"] = len(unique_domains)

    for domain, rep in zip(unique_domains, rep_list):
        if isinstance(rep, Exception):
            logger.debug("domain_reputation: check raised for %s: %s", domain, rep)
            continue

        delta = rep.get("confidence_delta", 0.0)
        new_conf = min(seen[domain] + delta, 1.0)  # confidence never exceeds 1.0
        tags = rep.get("tags", [])

        stats["ct_records"] += len(rep.get("crt_subdomains", []))
        if rep.get("urlscan_malicious"):
            stats["urlscan_malicious"] += 1
        if rep.get("wayback_exists"):
            stats["wayback_archived"] += 1

        new_entities = rep.get("new_entities", [])
        all_new_entities.extend(new_entities)
        stats["new_entities_discovered"] += len(new_entities)

        if tags or delta > 0:
            db_updates.append((domain, new_conf, tags))

    if db_updates:
        # The DB helper is synchronous; run it off the event loop.
        await asyncio.to_thread(_update_domain_entities_in_db, db_updates)

    checked = stats["domains_checked"]
    status = (
        f"ok_{checked}_domains"
        f"_{stats['ct_records']}_ct"
        f"_{stats['urlscan_malicious']}_malicious"
        f"_{stats['wayback_archived']}_archived"
    )

    if all_new_entities:
        logger.info(
            "domain_reputation: %d new entities discovered (subdomains + IPs)",
            len(all_new_entities),
        )

    logger.info(
        "domain_reputation: done — %d domains, %d CT records, %d malicious, %d archived",
        checked,
        stats["ct_records"],
        stats["urlscan_malicious"],
        stats["wayback_archived"],
    )

    return extraction_results, {"domain_reputation": status, **stats}