voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,557 @@
1
+ """
2
+ WHOIS/passive DNS enrichment using CIRCL pDNS, CIRCL pSSL, and RDAP.
3
+
4
+ Enriches extracted IP and domain entities with DNS history, WHOIS data,
5
+ and infrastructure overlap detection. Free, no auth required for CIRCL/RDAP.
6
+ """
7
+
8
+ import asyncio
9
+ import aiohttp
10
+ import ipaddress
11
+ import json
12
+ import logging
13
+ import os
14
+ import re
15
+ from datetime import datetime, timezone
16
+ from typing import Optional
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# CIRCL passive DNS / passive SSL query endpoints. The value being looked up
# is appended to these as a trailing path segment.
CIRCL_PDNS_URL: str = "https://www.circl.lu/pdns/query"
CIRCL_PSSL_URL: str = "https://www.circl.lu/v2pssl/query"

# RDAP (structured WHOIS) URL templates, filled in with str.format().
RDAP_IP_URL: str = "https://rdap.arin.net/registry/ip/{ip}"
RDAP_DOMAIN_URL: str = "https://rdap.org/domain/{domain}"

# Per-request total timeouts, in seconds (passed to aiohttp.ClientTimeout).
CIRCL_TIMEOUT: int = 15
WHOIS_TIMEOUT: int = 10

# Upper bounds on how many distinct IPs / domains are enriched per call.
MAX_IPS_TO_ENRICH: int = 20
MAX_DOMAINS_TO_ENRICH: int = 20

# Upper bound on related entities derived from each lookup result.
MAX_RELATED_PER_ENTITY: int = 5

# Pause, in seconds, applied after each entity's batch of lookups.
CIRCL_DELAY: float = 0.5
35
+
36
+
37
class DNSEnrichment:
    """
    Enrich IP and domain entities with passive DNS history, RDAP (WHOIS)
    data, passive SSL certificates and infrastructure overlap detection.

    Use as an async context manager: the HTTP session is created in
    ``__aenter__`` and closed in ``__aexit__``. While no session is open the
    lookup helpers return empty results instead of raising.

    NOTE(review): ``SECURITYTRAILS_API_KEY`` is read in ``__init__`` but no
    method in this class uses it.
    NOTE(review): no credentials are sent to CIRCL; CIRCL passive DNS/SSL is
    believed to require an account -- confirm these lookups actually succeed.
    """

    # Substrings looked for in the lowercased RDAP organisation name.
    _C2_HOSTERS = (
        "choopa", "vultr", "digitalocean", "linode",
        "frantech", "m247", "serverius", "combahton",
        "servermania", "sharktech",
    )
    # RDAP country codes that receive a ``country_<cc>`` tag.
    _FLAGGED_COUNTRIES = ("RU", "CN", "KP", "IR")
    # Substrings looked for in the lowercased registrant name.
    _PRIVACY_SERVICES = (
        "whoisguard", "privacyprotect", "perfect privacy",
        "domainsbyproxy", "withheld for privacy",
    )
    # Caps on how many upstream records are kept per lookup.
    _MAX_PDNS_RECORDS = 20
    _MAX_PSSL_CERTS = 10
    # One or more LDH labels followed by an alphabetic (or punycode) TLD.
    # Compiled once; every label may contain digits and inner hyphens.
    _DOMAIN_RE = re.compile(
        r"^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+"
        r"(?:[a-zA-Z]{2,63}|xn--[a-zA-Z0-9-]{1,59})$"
    )

    def __init__(self):
        self._session: Optional[aiohttp.ClientSession] = None
        self._st_key = os.getenv("SECURITYTRAILS_API_KEY", "").strip()

    async def __aenter__(self):
        """Open the shared HTTP session and return ``self``."""
        headers = {
            "User-Agent": "VoidAccess-OSINT/1.1 (security research)",
            "Accept": "application/json",
        }
        self._session = aiohttp.ClientSession(
            headers=headers,
            timeout=aiohttp.ClientTimeout(total=20),
        )
        return self

    async def __aexit__(self, *args):
        """Close the HTTP session, if one is open, and forget it."""
        if self._session:
            await self._session.close()
            # Drop the closed session so later lookups short-circuit to
            # empty results instead of failing on a closed connector.
            self._session = None

    async def enrich_entities(self, entities: list[dict]) -> dict:
        """
        Main entry point. Takes a list of extracted entities, enriches IPs
        and domains with DNS/WHOIS data.

        Each entity is a dict read through ``entity_type`` and
        ``canonical_value`` (falling back to ``value``). Only ``IP_ADDRESS``
        values that are public IPv4 addresses and ``DOMAIN`` values that pass
        ``_is_valid_domain`` are looked up; everything else is skipped.
        Values are whitespace-stripped, de-duplicated in first-seen order and
        capped at ``MAX_IPS_TO_ENRICH`` / ``MAX_DOMAINS_TO_ENRICH``.

        Returns:
            {
                "ip_enrichments": {ip: {...}},
                "domain_enrichments": {domain: {...}},
                "new_entities": [...],
                "infrastructure_clusters": [...],
            }
        """
        # dicts are used as ordered sets so that truncation to the MAX_*
        # limits is deterministic (a plain set would pick hash-ordered items).
        ip_set: dict = {}
        domain_set: dict = {}

        for entity in entities:
            etype = entity.get("entity_type", "")
            value = entity.get("canonical_value", "") or entity.get("value", "")
            if not isinstance(value, str):
                continue
            value = value.strip()
            if not value:
                continue

            if etype == "IP_ADDRESS":
                if self._is_valid_public_ip(value):
                    ip_set[value] = None
            elif etype == "DOMAIN":
                # Values are interpolated into request URLs, so never send
                # .onion names or malformed strings to the lookup services.
                if self._is_valid_domain(value):
                    domain_set[value] = None

        ips = list(ip_set)[:MAX_IPS_TO_ENRICH]
        domains = list(domain_set)[:MAX_DOMAINS_TO_ENRICH]

        if not ips and not domains:
            return {
                "ip_enrichments": {},
                "domain_enrichments": {},
                "new_entities": [],
                "infrastructure_clusters": [],
            }

        logger.info(
            "DNS enrichment: %d IPs, %d domains", len(ips), len(domains)
        )

        # At most three entities are being enriched at any moment.
        sem = asyncio.Semaphore(3)

        ip_tasks = [self._enrich_ip(ip, sem) for ip in ips]
        domain_tasks = [self._enrich_domain(domain, sem) for domain in domains]

        ip_results = await asyncio.gather(*ip_tasks, return_exceptions=True)
        domain_results = await asyncio.gather(*domain_tasks, return_exceptions=True)

        ip_enrichments = {}
        domain_enrichments = {}
        new_entities = []

        for ip, outcome in zip(ips, ip_results):
            if isinstance(outcome, dict):
                ip_enrichments[ip] = outcome
                new_entities.extend(outcome.get("new_entities", []))
            else:
                # gather() handed back the exception; record it instead of
                # dropping the entity without a trace.
                logger.debug("DNS enrichment failed for IP %s: %r", ip, outcome)

        for domain, outcome in zip(domains, domain_results):
            if isinstance(outcome, dict):
                domain_enrichments[domain] = outcome
                new_entities.extend(outcome.get("new_entities", []))
            else:
                logger.debug(
                    "DNS enrichment failed for domain %s: %r", domain, outcome
                )

        # De-duplicate discovered entities on (type, value), keeping the
        # first occurrence.
        seen: set[str] = set()
        unique_new = []
        for e in new_entities:
            key = f"{e['type']}:{e['value']}"
            if key not in seen:
                seen.add(key)
                unique_new.append(e)

        clusters = self._detect_infrastructure_clusters(ip_enrichments, domain_enrichments)

        logger.info(
            "DNS enrichment complete: %d new entities, %d clusters found",
            len(unique_new),
            len(clusters),
        )

        return {
            "ip_enrichments": ip_enrichments,
            "domain_enrichments": domain_enrichments,
            "new_entities": unique_new,
            "infrastructure_clusters": clusters,
        }

    async def _enrich_ip(self, ip: str, sem: asyncio.Semaphore) -> dict:
        """
        Look up passive DNS, RDAP and passive SSL data for one IP.

        Returns a dict with ``ip``, ``passive_dns``, ``whois``, ``ssl_certs``,
        ``new_entities`` (domains seen on the IP) and ``tags``.
        """
        async with sem:
            result: dict = {
                "ip": ip,
                "passive_dns": [],
                "whois": {},
                "ssl_certs": [],
                "new_entities": [],
                "tags": [],
            }

            pdns, whois, ssl = await asyncio.gather(
                self._circl_pdns_ip(ip),
                self._rdap_ip(ip),
                self._circl_pssl_ip(ip),
                return_exceptions=True,
            )

            # Sleep while still holding the semaphore slot so consecutive
            # batches of requests are spaced out by CIRCL_DELAY seconds.
            await asyncio.sleep(CIRCL_DELAY)

            if isinstance(pdns, list):
                result["passive_dns"] = pdns
                for record in pdns[:MAX_RELATED_PER_ENTITY]:
                    if not isinstance(record, dict):
                        continue
                    rrname = self._as_text(record.get("rrname")).rstrip(".")
                    if rrname and self._is_valid_domain(rrname):
                        result["new_entities"].append({
                            "type": "DOMAIN",
                            "value": rrname,
                            "source": "circl_pdns",
                            "context": f"Resolved to {ip} (passive DNS)",
                            "confidence": 0.75,
                        })
                if pdns:
                    result["tags"].append("has_pdns_history")

            if isinstance(whois, dict):
                result["whois"] = whois
                # RDAP fields may be missing or non-string; coerce before
                # calling string methods so one odd field cannot discard the
                # whole enrichment.
                org = self._as_text(whois.get("org")).lower()
                country = self._as_text(whois.get("country")).upper()
                for hoster in self._C2_HOSTERS:
                    if hoster in org:
                        result["tags"].append(f"c2_hoster_{hoster}")
                if country in self._FLAGGED_COUNTRIES:
                    result["tags"].append(f"country_{country.lower()}")

            if isinstance(ssl, list):
                result["ssl_certs"] = ssl
                for cert in ssl[:MAX_RELATED_PER_ENTITY]:
                    if not isinstance(cert, dict):
                        continue
                    cn = self._as_text(cert.get("cn"))
                    if cn and self._is_valid_domain(cn):
                        result["new_entities"].append({
                            "type": "DOMAIN",
                            "value": cn,
                            "source": "circl_pssl",
                            "context": f"SSL certificate on {ip}",
                            "confidence": 0.80,
                        })
                if ssl:
                    result["tags"].append("has_ssl_history")

            return result

    async def _enrich_domain(self, domain: str, sem: asyncio.Semaphore) -> dict:
        """
        Look up passive DNS and RDAP data for one domain.

        Returns a dict with ``domain``, ``passive_dns``, ``whois``,
        ``new_entities`` (public IPv4 addresses the domain resolved to, at
        most ``MAX_RELATED_PER_ENTITY``) and ``tags``.
        """
        async with sem:
            result: dict = {
                "domain": domain,
                "passive_dns": [],
                "whois": {},
                "new_entities": [],
                "tags": [],
            }

            pdns, whois = await asyncio.gather(
                self._circl_pdns_domain(domain),
                self._rdap_domain(domain),
                return_exceptions=True,
            )

            # Same spacing pause as in _enrich_ip, taken inside the semaphore.
            await asyncio.sleep(CIRCL_DELAY)

            if isinstance(pdns, list):
                result["passive_dns"] = pdns
                seen_ips: set[str] = set()
                for record in pdns:
                    if not isinstance(record, dict):
                        continue
                    rdata = record.get("rdata", "")
                    if self._is_valid_public_ip(rdata) and rdata not in seen_ips:
                        seen_ips.add(rdata)
                        result["new_entities"].append({
                            "type": "IP_ADDRESS",
                            "value": rdata,
                            "source": "circl_pdns",
                            "context": f"{domain} resolved to this IP (passive DNS)",
                            "confidence": 0.80,
                        })
                        if len(result["new_entities"]) >= MAX_RELATED_PER_ENTITY:
                            break
                if pdns:
                    result["tags"].append("has_pdns_history")

            if isinstance(whois, dict):
                result["whois"] = whois
                reg_date = self._as_text(whois.get("registered"))
                if reg_date:
                    # First seven characters, i.e. "YYYY-MM" for ISO dates.
                    result["tags"].append(f"registered_{reg_date[:7]}")

                registrant = self._as_text(whois.get("registrant")).lower()
                if any(svc in registrant for svc in self._PRIVACY_SERVICES):
                    result["tags"].append("privacy_protected")

                reg_dt = self._parse_registration_date(reg_date)
                if reg_dt is not None:
                    age_days = (datetime.now(timezone.utc) - reg_dt).days
                    if age_days < 30:
                        result["tags"].append("recently_registered")
                    elif age_days < 90:
                        result["tags"].append("new_domain")

            return result

    async def _circl_pdns_ip(self, ip: str) -> list:
        """Return up to 20 CIRCL passive DNS records for ``ip`` ([] on failure)."""
        return await self._circl_pdns_query(ip, "IP")

    async def _circl_pdns_domain(self, domain: str) -> list:
        """Return up to 20 CIRCL passive DNS records for ``domain`` ([] on failure)."""
        return await self._circl_pdns_query(domain, "domain")

    async def _circl_pdns_query(self, target: str, label: str) -> list:
        """
        Query CIRCL passive DNS for ``target`` and parse the response body as
        one JSON document per line.

        ``label`` only appears in the debug log message. Lines that are not
        valid JSON, and documents that are not JSON objects, are skipped.
        Returns [] when no session is open, on a non-200 status, or on any
        request error (best-effort lookup).
        """
        if not self._session:
            return []
        try:
            async with self._session.get(
                f"{CIRCL_PDNS_URL}/{target}",
                timeout=aiohttp.ClientTimeout(total=CIRCL_TIMEOUT),
            ) as resp:
                if resp.status != 200:
                    return []
                text = await resp.text()
        except Exception as e:
            logger.debug("CIRCL PDNS %s error %s: %s", label, target, e)
            return []

        records = []
        for line in text.strip().split("\n"):
            if not line.strip():
                continue
            try:
                parsed = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError; skip the bad line.
                continue
            # Callers use .get() on every record, so keep only objects.
            if isinstance(parsed, dict):
                records.append(parsed)
        return records[: self._MAX_PDNS_RECORDS]

    async def _circl_pssl_ip(self, ip: str) -> list:
        """
        Query CIRCL passive SSL for ``ip``.

        Returns a list of ``{"sha1", "cn", "subject"}`` dicts built from the
        first 10 top-level items of the JSON response, or [] on any failure.

        NOTE(review): this treats each top-level key as a certificate SHA-1
        and reads ``subjects["cn"]``; the real pSSL response may instead be
        keyed by IP with ``subjects`` keyed by fingerprint -- verify against
        the CIRCL API before relying on ``cn``.
        """
        if not self._session:
            return []
        try:
            async with self._session.get(
                f"{CIRCL_PSSL_URL}/{ip}",
                timeout=aiohttp.ClientTimeout(total=CIRCL_TIMEOUT),
            ) as resp:
                if resp.status != 200:
                    return []
                data = await resp.json()
        except Exception as e:
            logger.debug("CIRCL PSSL error %s: %s", ip, e)
            return []

        if not isinstance(data, dict):
            return []

        certs = []
        for sha1, cert_data in list(data.items())[: self._MAX_PSSL_CERTS]:
            if not isinstance(cert_data, dict):
                continue
            subjects = cert_data.get("subjects", {})
            cn = subjects.get("cn", []) if isinstance(subjects, dict) else ""
            if isinstance(cn, list):
                cn = cn[0] if cn else ""
            certs.append({"sha1": sha1, "cn": cn, "subject": subjects})
        return certs

    async def _rdap_ip(self, ip: str) -> dict:
        """
        Fetch RDAP registration data for ``ip``.

        Returns a dict that may contain ``org`` (``fn`` of the last entity
        that has one), ``country``, ``cidr``, ``network`` (only when the
        handle starts with ``NET-``) and ``raw_handle``; {} on any failure.
        """
        if not self._session:
            return {}
        try:
            async with self._session.get(
                RDAP_IP_URL.format(ip=ip),
                timeout=aiohttp.ClientTimeout(total=WHOIS_TIMEOUT),
            ) as resp:
                if resp.status != 200:
                    return {}
                data = await resp.json()

            if not isinstance(data, dict):
                return {}

            result: dict = {}

            for entity in data.get("entities") or []:
                name = self._vcard_full_name(entity)
                if name is not None:
                    # Later entities overwrite earlier ones.
                    result["org"] = name

            result["country"] = data.get("country", "")

            cidrs = data.get("cidr0_cidrs") or []
            if cidrs and isinstance(cidrs[0], dict):
                cidr = cidrs[0]
                result["cidr"] = (
                    f"{cidr.get('v4prefix', '')}/{cidr.get('length', '')}"
                )

            handle = self._as_text(data.get("handle"))
            if handle.startswith("NET-"):
                result["network"] = handle
            result["raw_handle"] = handle

            return result
        except Exception as e:
            logger.debug("RDAP IP error %s: %s", ip, e)
            return {}

    async def _rdap_domain(self, domain: str) -> dict:
        """
        Fetch RDAP registration data for ``domain``.

        Returns a dict with ``nameservers`` (lowercased, empty names dropped)
        and ``status``, plus ``registered`` / ``expires`` / ``updated`` and
        ``registrar`` / ``registrant`` when present; {} on any failure.
        """
        if not self._session:
            return {}
        try:
            async with self._session.get(
                RDAP_DOMAIN_URL.format(domain=domain),
                timeout=aiohttp.ClientTimeout(total=WHOIS_TIMEOUT),
            ) as resp:
                if resp.status != 200:
                    return {}
                data = await resp.json()

            if not isinstance(data, dict):
                return {}

            result: dict = {}

            event_fields = {
                "registration": "registered",
                "expiration": "expires",
                "last changed": "updated",
            }
            for event in data.get("events") or []:
                if not isinstance(event, dict):
                    continue
                field = event_fields.get(event.get("eventAction", ""))
                if field:
                    result[field] = event.get("eventDate", "")

            nameservers = []
            for ns in data.get("nameservers") or []:
                if not isinstance(ns, dict):
                    continue
                name = self._as_text(ns.get("ldhName")).lower()
                # An entry without ldhName would otherwise become "" and make
                # unrelated domains look like they share a nameserver.
                if name:
                    nameservers.append(name)
            result["nameservers"] = nameservers

            for entity in data.get("entities") or []:
                name = self._vcard_full_name(entity)
                if name is None:
                    continue
                roles = entity.get("roles") or []
                if "registrar" in roles:
                    result["registrar"] = name
                if "registrant" in roles:
                    result["registrant"] = name

            result["status"] = data.get("status", [])
            return result
        except Exception as e:
            logger.debug("RDAP domain error %s: %s", domain, e)
            return {}

    def _detect_infrastructure_clusters(
        self,
        ip_enrichments: dict,
        domain_enrichments: dict,
    ) -> list[dict]:
        """
        Find shared IP and shared nameserver clusters across investigated entities.

        A ``shared_ip`` cluster is emitted when an IP's passive DNS names at
        least two domains that are keys of ``domain_enrichments`` (domains
        listed in sorted order). A ``shared_nameserver`` cluster is emitted
        when at least two distinct investigated domains list the same
        non-empty nameserver.
        """
        clusters = []

        for ip, data in ip_enrichments.items():
            names: set[str] = set()
            for record in data.get("passive_dns", []):
                if not isinstance(record, dict):
                    continue
                rrname = self._as_text(record.get("rrname")).rstrip(".")
                if rrname:
                    names.add(rrname)

            # Sorted so the output does not depend on set iteration order.
            investigated = sorted(d for d in names if d in domain_enrichments)
            if len(investigated) >= 2:
                clusters.append({
                    "type": "shared_ip",
                    "ip": ip,
                    "domains": investigated,
                    "description": (
                        f"IP {ip} hosts multiple investigated domains: "
                        f"{', '.join(investigated)}"
                    ),
                })

        ns_to_domains: dict[str, list] = {}
        for domain, data in domain_enrichments.items():
            whois = data.get("whois") or {}
            for ns in whois.get("nameservers") or []:
                if not ns:
                    continue
                bucket = ns_to_domains.setdefault(ns, [])
                # A nameserver repeated within one domain's record must not
                # count that domain twice.
                if domain not in bucket:
                    bucket.append(domain)

        for ns, members in ns_to_domains.items():
            if len(members) >= 2:
                clusters.append({
                    "type": "shared_nameserver",
                    "nameserver": ns,
                    "domains": members,
                    "description": (
                        f"Domains sharing nameserver {ns}: "
                        f"{', '.join(members)}"
                    ),
                })

        return clusters

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` if it is a string, otherwise an empty string."""
        return value if isinstance(value, str) else ""

    @staticmethod
    def _vcard_full_name(entity):
        """
        Return the value of the first ``fn`` property in an RDAP entity's
        ``vcardArray``, or None when the entity has no usable one.
        """
        if not isinstance(entity, dict):
            return None
        vcards = entity.get("vcardArray", [None, []])
        if not (isinstance(vcards, list) and len(vcards) > 1):
            return None
        properties = vcards[1]
        if not isinstance(properties, list):
            return None
        for prop in properties:
            # Each property is [name, params, type, value].
            if isinstance(prop, list) and len(prop) >= 4 and prop[0] == "fn":
                return prop[3]
        return None

    @staticmethod
    def _parse_registration_date(value: str) -> Optional[datetime]:
        """
        Parse a registration timestamp into a timezone-aware datetime.

        Tries the standard library ISO-8601 parser first and falls back to
        ``dateutil`` when it is installed. Naive results are assumed to be
        UTC. Returns None when the value is empty or cannot be parsed.
        """
        if not isinstance(value, str) or not value.strip():
            return None
        text = value.strip()
        # Older fromisoformat() implementations reject a trailing "Z".
        iso_text = text[:-1] + "+00:00" if text[-1] in "Zz" else text
        try:
            parsed = datetime.fromisoformat(iso_text)
        except ValueError:
            try:
                from dateutil.parser import parse as parse_date

                parsed = parse_date(text)
            except (ImportError, ValueError, OverflowError, TypeError):
                return None
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def _is_valid_public_ip(self, value: str) -> bool:
        """
        Return True when ``value`` is a publicly routable IPv4 address.

        Private, loopback, multicast, reserved and shared (100.64.0.0/10)
        addresses, IPv6 addresses and non-string input all yield False.
        """
        if not value or not isinstance(value, str):
            return False
        try:
            ip = ipaddress.ip_address(value.strip())
        except ValueError:
            return False
        return (
            ip.version == 4
            and ip.is_global
            and not ip.is_private
            and not ip.is_loopback
            and not ip.is_multicast
            and not ip.is_reserved
        )

    def _is_valid_domain(self, value: str) -> bool:
        """
        Return True when ``value`` looks like a clearnet hostname.

        Requires at least one dot, 4-253 characters, LDH labels and an
        alphabetic (or punycode) TLD. ``.onion`` names and non-string input
        are rejected.
        """
        if not isinstance(value, str) or not 4 <= len(value) <= 253:
            return False
        if "." not in value:
            return False
        if value.lower().endswith(".onion"):
            return False
        return bool(self._DOMAIN_RE.match(value))
530
+
531
+
532
async def enrich_with_dns(entities: list[dict]) -> dict:
    """
    Module-level convenience wrapper around ``DNSEnrichment``.

    Returns an empty result structure when the ``DNS_ENRICHMENT_ENABLED``
    environment variable (default ``"true"``) is anything other than
    ``"true"`` ignoring case, or when ``entities`` is empty. Otherwise opens
    a ``DNSEnrichment`` context and returns its ``enrich_entities`` result.
    """

    def _nothing_found() -> dict:
        # Built fresh on every call so callers never share mutable state.
        return {
            "ip_enrichments": {},
            "domain_enrichments": {},
            "new_entities": [],
            "infrastructure_clusters": [],
        }

    flag = os.getenv("DNS_ENRICHMENT_ENABLED", "true")
    if flag.lower() != "true":
        logger.info("DNS enrichment disabled")
        return _nothing_found()

    if not entities:
        return _nothing_found()

    async with DNSEnrichment() as enricher:
        enriched = await enricher.enrich_entities(entities)
    return enriched