voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,635 @@
1
+ """
2
+ sources/email_reputation.py — Email reputation enrichment.
3
+
4
+ Enriches EMAIL_ADDRESS entities with identity attribution data from four sources:
5
+ - HaveIBeenPwned (HIBP): breach history and password exposure (requires HIBP_API_KEY)
6
+ - EmailRep.io: reputation scoring, disposable detection, platform presence
7
+ - Disposable domain blocklist: fast local check against known throwaway domains
8
+ - Domain cross-reference: email domain added as DOMAIN entity (custom domains only)
9
+
10
+ Email addresses extracted from dark web content are already public — they appeared
11
+ on dark web forums/markets. Querying HIBP and EmailRep is legitimate security research.
12
+
13
+ Public interface
14
+ ----------------
15
+ async is_disposable_domain(domain) → bool
16
+ async query_hibp(email) → dict
17
+ async query_emailrep(email) → dict
18
+ async check_email_reputation(email, base_confidence) → dict
19
+ async enrich_email_entities(extraction_results, investigation_id) → (results, stats)
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import json
26
+ import logging
27
+ import os
28
+ import re
29
+ import time
30
+ from typing import Any
31
+
32
+ import aiohttp
33
+
34
# Module-level logger; every message in this file is prefixed "email_reputation:".
logger = logging.getLogger(__name__)

# Upper bound on unique addresses checked per enrich_email_entities() call.
MAX_EMAILS = 30

# Remote endpoints queried by this module.
HIBP_BASE_URL = "https://haveibeenpwned.com/api/v3"
EMAILREP_BASE_URL = "https://emailrep.io"
DISPOSABLE_LIST_URL = (
    "https://raw.githubusercontent.com/disposable-email-domains/"
    "disposable-email-domains/master/disposable_email_blocklist.conf"
)

# Cache lifetimes in seconds (compared against time.time() deltas).
HIBP_CACHE_TTL = 86400.0  # 24 h
EMAILREP_CACHE_TTL = 43200.0  # 12 h
DISPOSABLE_LIST_CACHE_TTL = 86400.0  # 24 h

# Deliberately loose shape check: exactly one "@", no whitespace, and at least
# one "." in the part after the "@".  It is not an RFC 5322 validator.
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")

# Common free/privacy providers — a domain cross-reference for these reveals
# no attribution signal, so check_email_reputation() emits no DOMAIN entity.
_FREE_PROVIDERS: frozenset[str] = frozenset({
    "gmail.com", "googlemail.com",
    "yahoo.com", "yahoo.co.uk", "yahoo.fr",
    "hotmail.com", "hotmail.co.uk", "outlook.com", "live.com",
    "proton.me", "protonmail.com", "protonmail.ch",
    "tutanota.com", "tutanota.de", "tuta.io",
    "icloud.com", "me.com",
    "aol.com",
    "mail.com",
})

# In-memory per-email caches, keyed by the address exactly as passed in.
# Each entry has the shape {"result": <dict>, "loaded_at": <time.time() float>}.
_hibp_cache: dict[str, dict] = {}
_emailrep_cache: dict[str, dict] = {}

# Disposable domain set cache (module-level singleton).  "loaded_at" of 0.0
# together with the empty set forces a download on first use.
_disposable_cache: dict[str, Any] = {"domains": frozenset(), "loaded_at": 0.0}
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Helpers
73
+ # ---------------------------------------------------------------------------
74
+
75
def _is_valid_email(value: str) -> bool:
    """Return True when *value* is non-empty and, once stripped, matches ``_EMAIL_RE``."""
    if not value:
        return False
    return _EMAIL_RE.match(value.strip()) is not None
77
+
78
+
79
+ def _extract_domain(email: str) -> str:
80
+ """Return the domain portion of an email address."""
81
+ try:
82
+ return email.strip().split("@", 1)[1].lower()
83
+ except IndexError:
84
+ return ""
85
+
86
+
87
+ def _safe_log_email(email: str) -> str:
88
+ """Return privacy-safe log representation: first 3 chars + @domain."""
89
+ try:
90
+ local, domain = email.split("@", 1)
91
+ return f"{local[:3]}***@{domain}"
92
+ except Exception:
93
+ return "***@***"
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Source: Disposable domain blocklist
98
+ # ---------------------------------------------------------------------------
99
+
100
async def _load_disposable_list() -> frozenset[str]:
    """
    Return the disposable email domain blocklist, downloading it when needed.

    The list is kept in ``_disposable_cache`` and reused while it is non-empty
    and younger than ``DISPOSABLE_LIST_CACHE_TTL``.  On any download problem
    (non-200 status, network error, or a body that yields no domains) the
    previously cached set is returned unchanged, which is the empty set if
    nothing was ever loaded.

    NOTE(review): there is no lock around the refresh, so callers that miss
    the cache at the same time each perform their own download.
    """
    cache = _disposable_cache
    previous = frozenset(cache["domains"])
    if previous and time.time() - cache["loaded_at"] < DISPOSABLE_LIST_CACHE_TTL:
        return previous

    logger.info("email_reputation: Refreshing disposable domain blocklist")
    try:
        timeout = aiohttp.ClientTimeout(total=20)
        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(DISPOSABLE_LIST_URL) as resp:
                if resp.status != 200:
                    logger.warning(
                        "email_reputation: Disposable list returned HTTP %s", resp.status
                    )
                    return previous
                text = await resp.text()
    except Exception as exc:
        # Best-effort source: a failed refresh must not fail the enrichment.
        logger.warning("email_reputation: Disposable list fetch failed: %s", exc)
        return previous

    # One domain per line; blank lines and "#" comment lines are ignored.
    domains = frozenset(
        entry
        for entry in (raw.strip().lower() for raw in text.splitlines())
        if entry and not entry.startswith("#")
    )

    if not domains:
        # A 200 response with no usable lines must not wipe a good stale list.
        logger.warning(
            "email_reputation: Disposable list download contained no domains; "
            "keeping previous list"
        )
        return previous

    cache["domains"] = domains
    cache["loaded_at"] = time.time()
    logger.info("email_reputation: Disposable blocklist: %d domains loaded", len(domains))
    return domains
133
+
134
+
135
async def is_disposable_domain(domain: str) -> bool:
    """Report whether the lower-cased *domain* is an entry of the disposable blocklist."""
    known_disposable = await _load_disposable_list()
    normalized = domain.lower()
    return normalized in known_disposable
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Source: HaveIBeenPwned
143
+ # ---------------------------------------------------------------------------
144
+
145
def _summarize_hibp_breaches(breaches: list) -> dict[str, Any]:
    """
    Condense a HIBP breach list into the summary dict returned by query_hibp.

    Only ``Name`` (falling back to ``Title``), ``BreachDate`` and
    ``DataClasses`` are read from each entry.  Entries that are not dicts are
    skipped.
    """
    from datetime import datetime, timezone

    breach_names: list[str] = []
    breach_dates: list[str] = []
    password_exposed = False
    most_recent_breach = None
    most_recent_name = None

    for breach in breaches:
        if not isinstance(breach, dict):
            continue
        name = breach.get("Name") or breach.get("Title") or ""
        date = breach.get("BreachDate") or ""
        if not isinstance(name, str):
            name = ""
        if not isinstance(date, str):
            date = ""
        data_classes = breach.get("DataClasses") or []

        if name:
            breach_names.append(name)
        if date:
            breach_dates.append(date)
            # Keep the newest date together with the name from the SAME entry.
            # The two lists above can differ in length when an entry lacks a
            # name or a date, so they must not be indexed against each other.
            # YYYY-MM-DD sorts lexicographically; ">" keeps the first of ties.
            if most_recent_breach is None or date > most_recent_breach:
                most_recent_breach = date
                most_recent_name = name or None
        if any(isinstance(dc, str) and "password" in dc.lower() for dc in data_classes):
            password_exposed = True

    recently_breached = False
    if most_recent_breach:
        try:
            dt = datetime.fromisoformat(most_recent_breach)
        except ValueError:
            dt = None  # unparseable date: treat as "not recent"
        if dt is not None:
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            recently_breached = (datetime.now(timezone.utc) - dt).days < 365

    return {
        "found": True,
        "source": "hibp",
        "breach_count": len(breach_names),
        "breach_names": breach_names,
        "breach_dates": breach_dates,
        "password_exposed": password_exposed,
        "most_recent_breach": most_recent_breach,
        "most_recent_name": most_recent_name,
        "recently_breached": recently_breached,
    }


async def query_hibp(email: str) -> dict[str, Any]:
    """
    Query HIBP v3 ``breachedaccount/{email}`` for breach history.

    Requires the ``HIBP_API_KEY`` environment variable; without it the lookup
    is skipped and ``{"found": False, "source": "hibp_skipped"}`` is returned.

    Returns a dict that always has ``found`` and ``source``.  When ``found``
    is True it also carries ``breach_count``, ``breach_names``,
    ``breach_dates``, ``password_exposed``, ``most_recent_breach``,
    ``most_recent_name`` and ``recently_breached``.

    Successful lookups and HTTP 404 ("no breaches") are cached in
    ``_hibp_cache`` for ``HIBP_CACHE_TTL`` seconds.  Auth errors, rate
    limiting, other HTTP errors and network failures are not cached and never
    raise; they are reported through ``source``.
    """
    from urllib.parse import quote

    cached = _hibp_cache.get(email)
    if cached and (time.time() - cached["loaded_at"]) < HIBP_CACHE_TTL:
        return cached["result"]

    api_key = (os.getenv("HIBP_API_KEY") or "").strip()
    if not api_key:
        logger.debug("email_reputation: HIBP skipped — no API key")
        return {"found": False, "source": "hibp_skipped"}

    # The address comes from scraped content and is untrusted: percent-encode
    # it so "/", "?" or "#" in the local part cannot alter the request path.
    account = quote(email, safe="")

    try:
        headers = {
            "hibp-api-key": api_key,
            "User-Agent": "VoidAccess-OSINT",
        }
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(
                f"{HIBP_BASE_URL}/breachedaccount/{account}",
                headers=headers,
                params={"truncateResponse": "false"},
            ) as resp:
                if resp.status == 404:
                    # 404 is the "no breaches" answer and is cached like a hit.
                    result: dict[str, Any] = {"found": False, "source": "hibp_not_found"}
                    _hibp_cache[email] = {"result": result, "loaded_at": time.time()}
                    return result
                if resp.status == 401:
                    logger.warning("email_reputation: HIBP — invalid API key")
                    return {"found": False, "source": "hibp_auth_error"}
                if resp.status == 429:
                    logger.warning("email_reputation: HIBP — rate limited")
                    return {"found": False, "source": "hibp_rate_limited"}
                if resp.status != 200:
                    logger.debug(
                        "email_reputation: HIBP → HTTP %s for %s",
                        resp.status, _safe_log_email(email),
                    )
                    return {"found": False, "source": "hibp_error"}
                data = await resp.json()
    except Exception as exc:
        # Best-effort source: network/JSON failures degrade to "not found".
        logger.debug(
            "email_reputation: HIBP failed for %s: %s", _safe_log_email(email), exc
        )
        return {"found": False, "source": "hibp_error"}

    if not data or not isinstance(data, list):
        return {"found": False, "source": "hibp_not_found"}

    result = _summarize_hibp_breaches(data)
    _hibp_cache[email] = {"result": result, "loaded_at": time.time()}
    return result
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # Source: EmailRep.io
252
+ # ---------------------------------------------------------------------------
253
+
254
async def query_emailrep(email: str) -> dict[str, Any]:
    """
    Query EmailRep.io for reputation data about *email*.

    The optional ``EMAILREP_API_KEY`` environment variable is sent as the
    ``Key`` header when set; the lookup also runs without it.

    Returns a dict with the keys ``reputation``, ``suspicious``,
    ``references``, ``profiles``, ``disposable``, ``free_provider``,
    ``blacklisted``, ``malicious_activity`` and ``credentials_leaked``.
    On any HTTP error, network failure or unexpected payload a dict with the
    same keys and neutral values is returned instead; this function does not
    raise for those cases.  Only successful lookups are cached, for
    ``EMAILREP_CACHE_TTL`` seconds.
    """
    from urllib.parse import quote

    cached = _emailrep_cache.get(email)
    if cached and (time.time() - cached["loaded_at"]) < EMAILREP_CACHE_TTL:
        return cached["result"]

    api_key = (os.getenv("EMAILREP_API_KEY") or "").strip()
    headers: dict[str, str] = {
        "User-Agent": "VoidAccess-OSINT/1.1 (security research)",
        "Accept": "application/json",
    }
    if api_key:
        headers["Key"] = api_key

    # Neutral answer used for every failure path.
    empty: dict[str, Any] = {
        "reputation": None,
        "suspicious": False,
        "references": 0,
        "profiles": [],
        "disposable": False,
        "free_provider": False,
        "blacklisted": False,
        "malicious_activity": False,
        "credentials_leaked": False,
    }

    # The address comes from scraped content and is untrusted: percent-encode
    # it so "/", "?" or "#" in the local part cannot alter the request path.
    account = quote(email, safe="")

    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(f"{EMAILREP_BASE_URL}/{account}") as resp:
                if resp.status == 400:
                    logger.debug(
                        "email_reputation: EmailRep → HTTP 400 for %s (invalid email)",
                        _safe_log_email(email),
                    )
                    return empty
                if resp.status == 429:
                    logger.warning("email_reputation: EmailRep — rate limited")
                    return empty
                if resp.status != 200:
                    logger.debug(
                        "email_reputation: EmailRep → HTTP %s for %s",
                        resp.status, _safe_log_email(email),
                    )
                    return empty
                data = await resp.json()
    except Exception as exc:
        # Best-effort source: network/JSON failures degrade to the neutral answer.
        logger.debug(
            "email_reputation: EmailRep failed for %s: %s", _safe_log_email(email), exc
        )
        return empty

    if not isinstance(data, dict):
        # A JSON list/string/null body has no .get(); treat it as "no data".
        logger.debug(
            "email_reputation: EmailRep returned an unexpected payload for %s",
            _safe_log_email(email),
        )
        return empty

    attributes = data.get("details")
    if not isinstance(attributes, dict):
        attributes = {}

    result: dict[str, Any] = {
        "reputation": data.get("reputation"),
        "suspicious": bool(data.get("suspicious", False)),
        "references": data.get("references", 0),
        "profiles": list(attributes.get("profiles") or []),
        "disposable": bool(attributes.get("disposable", False)),
        "free_provider": bool(attributes.get("free_provider", False)),
        "blacklisted": bool(attributes.get("blacklisted", False)),
        "malicious_activity": bool(attributes.get("malicious_activity", False)),
        "credentials_leaked": bool(attributes.get("credentials_leaked", False)),
    }

    _emailrep_cache[email] = {"result": result, "loaded_at": time.time()}
    return result
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # Core reputation check
330
+ # ---------------------------------------------------------------------------
331
+
332
async def check_email_reputation(
    email: str,
    base_confidence: float = 1.0,
) -> dict[str, Any]:
    """
    Build the reputation record for a single email address.

    The three lookups (disposable blocklist, HIBP, EmailRep) run concurrently;
    the domain cross-reference is then derived locally from the address.  A
    lookup that raises is logged at debug level and treated as "no data".

    Parameters:
        email: address to check.  If it fails ``_is_valid_email`` the default
            (all-negative) record is returned and nothing is queried.
        base_confidence: accepted for interface compatibility; it is not read
            here.  The caller applies ``confidence_delta`` to its own base.

    Returns a dict with keys:
        email, breached, breach_count, breach_names, password_exposed,
        most_recent_breach, reputation, suspicious, disposable,
        malicious_activity, credentials_leaked, platforms,
        new_entities, tags, confidence_delta
    """
    result: dict[str, Any] = {
        "email": email,
        "breached": False,
        "breach_count": 0,
        "breach_names": [],
        "password_exposed": False,
        "most_recent_breach": None,
        "reputation": None,
        "suspicious": False,
        "disposable": False,
        "malicious_activity": False,
        "credentials_leaked": False,
        "platforms": [],
        "new_entities": [],
        "tags": [],
        "confidence_delta": 0.0,
    }

    if not _is_valid_email(email):
        return result

    domain = _extract_domain(email)
    if not domain:
        return result

    disposable_check, hibp_result, emailrep_result = await asyncio.gather(
        is_disposable_domain(domain),
        query_hibp(email),
        query_emailrep(email),
        return_exceptions=True,
    )

    if isinstance(disposable_check, Exception):
        logger.debug(
            "email_reputation: disposable check raised for %s: %s",
            _safe_log_email(email), disposable_check,
        )
        disposable_check = False

    if isinstance(hibp_result, Exception):
        logger.debug(
            "email_reputation: HIBP raised for %s: %s",
            _safe_log_email(email), hibp_result,
        )
        hibp_result = {"found": False}

    if isinstance(emailrep_result, Exception):
        logger.debug(
            "email_reputation: EmailRep raised for %s: %s",
            _safe_log_email(email), emailrep_result,
        )
        emailrep_result = {}

    # ── Disposable domain check ────────────────────────────────────────────────
    if disposable_check:
        result["disposable"] = True
        result["tags"].append("disposable_email")

    # ── HIBP ──────────────────────────────────────────────────────────────────
    if hibp_result.get("found"):
        count = hibp_result.get("breach_count", 0)
        result["breached"] = True
        result["breach_count"] = count
        # Copy: the source dict may be the object stored in the HIBP cache,
        # and callers must be free to mutate what they receive.
        result["breach_names"] = list(hibp_result.get("breach_names") or [])
        result["password_exposed"] = hibp_result.get("password_exposed", False)
        result["most_recent_breach"] = hibp_result.get("most_recent_breach")

        result["tags"].append("hibp_breached")
        result["tags"].append(f"hibp_breach_count_{count}")
        result["confidence_delta"] += 0.15

        if hibp_result.get("password_exposed"):
            result["tags"].append("hibp_password_exposed")

        if hibp_result.get("recently_breached"):
            result["tags"].append("recently_breached")
            name = hibp_result.get("most_recent_name") or ""
            if name:
                # Tag-safe slug: lower-case alphanumerics joined by "_", max 40 chars.
                slug = re.sub(r"[^a-z0-9]+", "_", name.lower())[:40]
                result["tags"].append(f"recent_breach_{slug}")

    # ── EmailRep.io ───────────────────────────────────────────────────────────
    if emailrep_result:
        result["reputation"] = emailrep_result.get("reputation")
        result["suspicious"] = emailrep_result.get("suspicious", False)
        # Copy for the same reason as breach_names above.
        result["platforms"] = list(emailrep_result.get("profiles") or [])

        if emailrep_result.get("disposable"):
            result["disposable"] = True
            if "disposable_email" not in result["tags"]:
                result["tags"].append("disposable_email")

        if emailrep_result.get("malicious_activity"):
            result["malicious_activity"] = True
            result["tags"].append("emailrep_malicious")
            result["confidence_delta"] += 0.10

        if emailrep_result.get("credentials_leaked"):
            result["credentials_leaked"] = True
            result["tags"].append("credentials_leaked")

        if emailrep_result.get("blacklisted"):
            result["tags"].append("email_blacklisted")

    # Apply disposable confidence penalty once regardless of which source flagged it
    if result["disposable"]:
        result["confidence_delta"] -= 0.10

    # ── Domain cross-reference (custom domains only) ───────────────────────────
    if domain and domain not in _FREE_PROVIDERS and not result["disposable"]:
        result["new_entities"].append({
            "entity_type": "DOMAIN",
            "value": domain,
            "canonical_value": domain,
            "confidence": 0.75,
            "source": "email_domain",
            "extraction_method": "enrich",
            "context_snippet": (
                f"Domain extracted from email entity {_safe_log_email(email)}"
            ),
        })

    return result
468
+
469
+
470
+ # ---------------------------------------------------------------------------
471
+ # DB helpers (sync — called via asyncio.to_thread)
472
+ # ---------------------------------------------------------------------------
473
+
474
def _update_email_entities_in_db(
    updates: list[tuple[str, float, list[str]]],
) -> None:
    """
    Persist enrichment results for EMAIL_ADDRESS entities.

    Each update is ``(email_value, confidence, tags)``.  For the first entity
    row matching the email value, the stored confidence is raised (never
    lowered) to *confidence*, and *tags* are merged without duplicates into
    the JSON list held in ``corroborating_sources``.

    Does nothing when ``DATABASE_URL`` is unset or *updates* is empty.  Any
    database error is logged as a warning and swallowed.
    """
    if not os.getenv("DATABASE_URL") or not updates:
        return
    try:
        # Imported lazily so the module can be loaded without the DB layer.
        from db.session import get_session
        from db.models import Entity

        with get_session() as session:
            for email_val, confidence, tags in updates:
                db_entity = session.query(Entity).filter(
                    Entity.entity_type == "EMAIL_ADDRESS",
                    Entity.value == email_val,
                ).first()
                if db_entity is None:
                    continue
                if confidence > (db_entity.confidence or 0.0):
                    db_entity.confidence = confidence
                if not tags:
                    continue

                # One row with a malformed column must not abort the whole
                # batch: skip the tag merge for that row and keep going.
                try:
                    existing = json.loads(db_entity.corroborating_sources or "[]")
                except (TypeError, ValueError):
                    existing = None
                if not isinstance(existing, list):
                    logger.warning(
                        "email_reputation: corroborating_sources for %s is not a "
                        "JSON list; tags not merged",
                        _safe_log_email(email_val),
                    )
                    continue

                for tag in tags:
                    if tag not in existing:
                        existing.append(tag)
                db_entity.corroborating_sources = json.dumps(existing)
            session.commit()
    except Exception as exc:
        logger.warning("email_reputation: DB update failed: %s", exc)
503
+
504
+
505
+ # ---------------------------------------------------------------------------
506
+ # Pipeline integration
507
+ # ---------------------------------------------------------------------------
508
+
509
async def enrich_email_entities(
    extraction_results: list,
    investigation_id: Any,
) -> tuple[list, dict]:
    """
    Post-extraction email reputation enrichment step (STEP 6.4).

    Collects the unique, valid EMAIL_ADDRESS entity values found in
    *extraction_results* (first occurrence wins for the base confidence),
    checks at most ``MAX_EMAILS`` of them concurrently, and writes the
    resulting confidence and tags to the database in a worker thread.

    The persisted confidence is ``base + confidence_delta`` clamped to the
    range 0.50–1.0; the floor exists because a disposable address may still
    be a real threat-actor address.

    Parameters:
        extraction_results: objects exposing an ``entities`` iterable whose
            items expose ``entity_type``, ``value`` and ``confidence``.
            Missing attributes, ``None`` values and non-string values are
            tolerated and skipped.
        investigation_id: used only as a log prefix.

    Returns ``(extraction_results, stats)``.  *extraction_results* is returned
    unmodified.  *stats* always contains ``email_reputation`` (a status
    string); when at least one address was checked it also contains the
    counters ``emails_checked``, ``breached``, ``password_exposed``,
    ``disposable``, ``malicious`` and ``new_entities_discovered``.
    """
    seen: dict[str, float] = {}
    for exr in extraction_results:
        for entity in getattr(exr, "entities", None) or []:
            if getattr(entity, "entity_type", "") != "EMAIL_ADDRESS":
                continue
            raw_value = getattr(entity, "value", "")
            if not isinstance(raw_value, str):
                continue  # None / non-text values cannot be an address
            email = raw_value.strip()
            if not email or not _is_valid_email(email):
                continue
            if email not in seen:
                confidence = getattr(entity, "confidence", 1.0)
                # A stored None would break the arithmetic below.
                seen[email] = 1.0 if confidence is None else confidence

    unique_emails = list(seen)
    if not unique_emails:
        return extraction_results, {"email_reputation": "ok_0_emails"}

    if len(unique_emails) > MAX_EMAILS:
        logger.info(
            "email_reputation: capping to %d of %d unique emails",
            MAX_EMAILS, len(unique_emails),
        )
        unique_emails = unique_emails[:MAX_EMAILS]

    logger.info("email_reputation: checking %d unique email(s)", len(unique_emails))

    rep_list = await asyncio.gather(
        *[
            check_email_reputation(e, base_confidence=seen[e])
            for e in unique_emails
        ],
        return_exceptions=True,
    )

    db_updates: list[tuple[str, float, list[str]]] = []
    all_new_entities: list[dict] = []
    stats: dict[str, Any] = {
        "emails_checked": len(unique_emails),
        "breached": 0,
        "password_exposed": 0,
        "disposable": 0,
        "malicious": 0,
        "new_entities_discovered": 0,
    }

    for email, rep in zip(unique_emails, rep_list):
        if isinstance(rep, Exception):
            logger.debug(
                "email_reputation: check raised for %s: %s",
                _safe_log_email(email), rep,
            )
            continue

        base_conf = seen[email]
        delta = rep.get("confidence_delta", 0.0)
        new_conf = max(0.50, min(base_conf + delta, 1.0))
        tags = rep.get("tags", [])

        if rep.get("breached"):
            stats["breached"] += 1
        if rep.get("password_exposed"):
            stats["password_exposed"] += 1
        if rep.get("disposable"):
            stats["disposable"] += 1
        if rep.get("malicious_activity"):
            stats["malicious"] += 1

        # High-value identity signal: breach history + malicious activity.
        # The condition now matches what the message asserts.
        if rep.get("breached") and rep.get("malicious_activity"):
            logger.info(
                "[%s] High-value email entity: %s — "
                "breach history + malicious activity confirmed",
                investigation_id,
                _safe_log_email(email),
            )

        new_entities = rep.get("new_entities", [])
        all_new_entities.extend(new_entities)
        stats["new_entities_discovered"] += len(new_entities)

        if tags or delta != 0.0:
            db_updates.append((email, new_conf, tags))

    if db_updates:
        # The DB helper is synchronous; keep it off the event loop.
        await asyncio.to_thread(_update_email_entities_in_db, db_updates)

    if all_new_entities:
        # NOTE(review): these entities are only counted and logged here; they
        # are neither added to extraction_results nor returned. Confirm
        # whether the caller is expected to receive them.
        logger.info(
            "email_reputation: %d new entities discovered (custom domains)",
            len(all_new_entities),
        )

    checked = stats["emails_checked"]
    status = (
        f"ok_{checked}_emails"
        f"_{stats['breached']}_breached"
        f"_{stats['disposable']}_disposable"
    )

    logger.info(
        "email_reputation: done — %d checked, %d breached, %d passwords exposed, "
        "%d disposable, %d malicious",
        checked,
        stats["breached"],
        stats["password_exposed"],
        stats["disposable"],
        stats["malicious"],
    )

    return extraction_results, {"email_reputation": status, **stats}