PyPI - voidaccess - Versions diffs - 1.3.0__py3-none-any.whl - Mend

voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

analysis/__init__.py +49 -0
analysis/opsec.py +454 -0
analysis/patterns.py +202 -0
analysis/temporal.py +201 -0
api/__init__.py +1 -0
api/auth.py +163 -0
api/main.py +509 -0
api/routes/__init__.py +1 -0
api/routes/admin.py +214 -0
api/routes/auth.py +157 -0
api/routes/entities.py +871 -0
api/routes/export.py +359 -0
api/routes/investigations.py +2567 -0
api/routes/monitors.py +405 -0
api/routes/search.py +157 -0
api/routes/settings.py +851 -0
auth/__init__.py +1 -0
auth/token_blacklist.py +108 -0
cli/__init__.py +3 -0
cli/adapters/__init__.py +1 -0
cli/adapters/sqlite.py +273 -0
cli/browser.py +376 -0
cli/commands/__init__.py +1 -0
cli/commands/configure.py +185 -0
cli/commands/enrich.py +154 -0
cli/commands/export.py +158 -0
cli/commands/investigate.py +601 -0
cli/commands/show.py +87 -0
cli/config.py +180 -0
cli/display.py +212 -0
cli/main.py +154 -0
cli/tor_detect.py +71 -0
config.py +180 -0
crawler/__init__.py +28 -0
crawler/dedup.py +97 -0
crawler/frontier.py +115 -0
crawler/spider.py +462 -0
crawler/utils.py +122 -0
db/__init__.py +47 -0
db/migrations/__init__.py +0 -0
db/migrations/env.py +80 -0
db/migrations/versions/0001_initial_schema.py +270 -0
db/migrations/versions/0002_add_investigation_status_column.py +27 -0
db/migrations/versions/0002_add_missing_tables.py +33 -0
db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
db/migrations/versions/0004_add_page_posted_at.py +41 -0
db/migrations/versions/0005_add_extraction_method.py +32 -0
db/migrations/versions/0006_add_monitor_alerts.py +26 -0
db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
db/migrations/versions/0008_add_users_table.py +47 -0
db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
db/migrations/versions/0013_add_graph_status.py +31 -0
db/migrations/versions/0015_add_progress_fields.py +41 -0
db/migrations/versions/0016_backfill_graph_status.py +33 -0
db/migrations/versions/0017_add_user_api_keys.py +44 -0
db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
db/migrations/versions/0019_add_content_safety_log.py +46 -0
db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
db/models.py +618 -0
db/queries.py +841 -0
db/session.py +270 -0
export/__init__.py +34 -0
export/misp.py +257 -0
export/sigma.py +342 -0
export/stix.py +418 -0
extractor/__init__.py +21 -0
extractor/llm_extract.py +372 -0
extractor/ner.py +512 -0
extractor/normalizer.py +638 -0
extractor/pipeline.py +401 -0
extractor/regex_patterns.py +325 -0
fingerprint/__init__.py +33 -0
fingerprint/profiler.py +240 -0
fingerprint/stylometry.py +249 -0
graph/__init__.py +73 -0
graph/builder.py +894 -0
graph/export.py +225 -0
graph/model.py +83 -0
graph/queries.py +297 -0
graph/visualize.py +178 -0
i18n/__init__.py +24 -0
i18n/detect.py +76 -0
i18n/query_expand.py +72 -0
i18n/translate.py +210 -0
monitor/__init__.py +27 -0
monitor/_db.py +74 -0
monitor/alerts.py +345 -0
monitor/config.py +118 -0
monitor/diff.py +75 -0
monitor/jobs.py +247 -0
monitor/scheduler.py +184 -0
scraper/__init__.py +0 -0
scraper/scrape.py +857 -0
scraper/scrape_js.py +272 -0
search/__init__.py +318 -0
search/circuit_breaker.py +240 -0
search/search.py +334 -0
sources/__init__.py +96 -0
sources/blockchain.py +444 -0
sources/cache.py +93 -0
sources/cisa.py +108 -0
sources/dns_enrichment.py +557 -0
sources/domain_reputation.py +643 -0
sources/email_reputation.py +635 -0
sources/engines.py +244 -0
sources/enrichment.py +1244 -0
sources/github_scraper.py +589 -0
sources/gitlab_scraper.py +624 -0
sources/hash_reputation.py +856 -0
sources/historical_intel.py +253 -0
sources/ip_reputation.py +521 -0
sources/paste_scraper.py +484 -0
sources/pastes.py +278 -0
sources/rss_scraper.py +576 -0
sources/seed_manager.py +373 -0
sources/seeds.py +368 -0
sources/shodan.py +103 -0
sources/telegram.py +199 -0
sources/virustotal.py +113 -0
utils/__init__.py +0 -0
utils/async_utils.py +89 -0
utils/content_safety.py +193 -0
utils/defang.py +94 -0
utils/encryption.py +34 -0
utils/ioc_freshness.py +124 -0
utils/user_keys.py +33 -0
vector/__init__.py +39 -0
vector/embedder.py +100 -0
vector/model_singleton.py +49 -0
vector/search.py +87 -0
vector/store.py +514 -0
voidaccess/__init__.py +0 -0
voidaccess/llm.py +717 -0
voidaccess/llm_utils.py +696 -0
voidaccess-1.3.0.dist-info/METADATA +395 -0
voidaccess-1.3.0.dist-info/RECORD +142 -0
voidaccess-1.3.0.dist-info/WHEEL +5 -0
voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
voidaccess-1.3.0.dist-info/top_level.txt +19 -0

sources/email_reputation.py ADDED Viewed

@@ -0,0 +1,635 @@
+"""
+sources/email_reputation.py — Email reputation enrichment.
+Enriches EMAIL_ADDRESS entities with identity attribution data from four sources:
+  - HaveIBeenPwned (HIBP): breach history and password exposure (requires HIBP_API_KEY)
+  - EmailRep.io: reputation scoring, disposable detection, platform presence
+  - Disposable domain blocklist: fast local check against known throwaway domains
+  - Domain cross-reference: email domain added as DOMAIN entity (custom domains only)
+Email addresses extracted from dark web content are already public — they appeared
+on dark web forums/markets. Querying HIBP and EmailRep is legitimate security research.
+Public interface
+----------------
+async is_disposable_domain(domain)                          → bool
+async query_hibp(email)                                     → dict
+async query_emailrep(email)                                 → dict
+async check_email_reputation(email, base_confidence)        → dict
+async enrich_email_entities(extraction_results, investigation_id) → (results, stats)
+"""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+import os
+import re
+import time
+from typing import Any
+import aiohttp
+logger = logging.getLogger(__name__)
+MAX_EMAILS = 30
+HIBP_BASE_URL = "https://haveibeenpwned.com/api/v3"
+EMAILREP_BASE_URL = "https://emailrep.io"
+DISPOSABLE_LIST_URL = (
+    "https://raw.githubusercontent.com/disposable-email-domains/"
+    "disposable-email-domains/master/disposable_email_blocklist.conf"
+)
+HIBP_CACHE_TTL = 86400.0      # 24 h
+EMAILREP_CACHE_TTL = 43200.0  # 12 h
+DISPOSABLE_LIST_CACHE_TTL = 86400.0  # 24 h
+_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
+# Common free/privacy providers — domain cross-reference reveals no attribution signal
+_FREE_PROVIDERS: frozenset[str] = frozenset({
+    "gmail.com", "googlemail.com",
+    "yahoo.com", "yahoo.co.uk", "yahoo.fr",
+    "hotmail.com", "hotmail.co.uk", "outlook.com", "live.com",
+    "proton.me", "protonmail.com", "protonmail.ch",
+    "tutanota.com", "tutanota.de", "tuta.io",
+    "icloud.com", "me.com",
+    "aol.com",
+    "mail.com",
+})
+# In-memory per-email caches
+_hibp_cache: dict[str, dict] = {}
+_emailrep_cache: dict[str, dict] = {}
+# Disposable domain set cache (module-level singleton)
+_disposable_cache: dict[str, Any] = {"domains": frozenset(), "loaded_at": 0.0}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _is_valid_email(value: str) -> bool:
+    return bool(value and _EMAIL_RE.match(value.strip()))
+def _extract_domain(email: str) -> str:
+    """Return the domain portion of an email address."""
+    try:
+        return email.strip().split("@", 1)[1].lower()
+    except IndexError:
+        return ""
+def _safe_log_email(email: str) -> str:
+    """Return privacy-safe log representation: first 3 chars + @domain."""
+    try:
+        local, domain = email.split("@", 1)
+        return f"{local[:3]}***@{domain}"
+    except Exception:
+        return "***@***"
+# ---------------------------------------------------------------------------
+# Source: Disposable domain blocklist
+# ---------------------------------------------------------------------------
+async def _load_disposable_list() -> frozenset[str]:
+    """Fetch and cache the disposable email domain blocklist (24 h TTL)."""
+    cache = _disposable_cache
+    if time.time() - cache["loaded_at"] < DISPOSABLE_LIST_CACHE_TTL and cache["domains"]:
+        return cache["domains"]  # type: ignore[return-value]
+    logger.info("email_reputation: Refreshing disposable domain blocklist")
+    try:
+        timeout = aiohttp.ClientTimeout(total=20)
+        headers = {"User-Agent": "VoidAccess-OSINT/1.1 (security research)"}
+        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
+            async with session.get(DISPOSABLE_LIST_URL) as resp:
+                if resp.status != 200:
+                    logger.warning(
+                        "email_reputation: Disposable list returned HTTP %s", resp.status
+                    )
+                    return frozenset(cache["domains"])
+                text = await resp.text()
+    except Exception as exc:
+        logger.warning("email_reputation: Disposable list fetch failed: %s", exc)
+        return frozenset(cache["domains"])
+    domains: set[str] = set()
+    for line in text.splitlines():
+        line = line.strip().lower()
+        if line and not line.startswith("#"):
+            domains.add(line)
+    frozen = frozenset(domains)
+    cache["domains"] = frozen
+    cache["loaded_at"] = time.time()
+    logger.info("email_reputation: Disposable blocklist: %d domains loaded", len(frozen))
+    return frozen
+async def is_disposable_domain(domain: str) -> bool:
+    """Return True if *domain* appears in the disposable email domain blocklist."""
+    blocklist = await _load_disposable_list()
+    return domain.lower() in blocklist
+# ---------------------------------------------------------------------------
+# Source: HaveIBeenPwned
+# ---------------------------------------------------------------------------
+async def query_hibp(email: str) -> dict[str, Any]:
+    """
+    Query HIBP v3 breachedaccount/{email} for breach history.
+    Requires HIBP_API_KEY. Without a key the check is skipped gracefully.
+    HIBP is a paid API ($3.50/month individual) — the most authoritative
+    source for email breach data.
+    Cached for 24 h.
+    """
+    cached = _hibp_cache.get(email)
+    if cached and (time.time() - cached["loaded_at"]) < HIBP_CACHE_TTL:
+        return cached["result"]
+    api_key = (os.getenv("HIBP_API_KEY") or "").strip()
+    if not api_key:
+        logger.debug("email_reputation: HIBP skipped — no API key")
+        return {"found": False, "source": "hibp_skipped"}
+    try:
+        headers = {
+            "hibp-api-key": api_key,
+            "User-Agent": "VoidAccess-OSINT",
+        }
+        timeout = aiohttp.ClientTimeout(total=15)
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(
+                f"{HIBP_BASE_URL}/breachedaccount/{email}",
+                headers=headers,
+                params={"truncateResponse": "false"},
+            ) as resp:
+                if resp.status == 404:
+                    result: dict[str, Any] = {"found": False, "source": "hibp_not_found"}
+                    _hibp_cache[email] = {"result": result, "loaded_at": time.time()}
+                    return result
+                if resp.status == 401:
+                    logger.warning("email_reputation: HIBP — invalid API key")
+                    return {"found": False, "source": "hibp_auth_error"}
+                if resp.status == 429:
+                    logger.warning("email_reputation: HIBP — rate limited")
+                    return {"found": False, "source": "hibp_rate_limited"}
+                if resp.status != 200:
+                    logger.debug(
+                        "email_reputation: HIBP → HTTP %s for %s",
+                        resp.status, _safe_log_email(email),
+                    )
+                    return {"found": False, "source": "hibp_error"}
+                data = await resp.json()
+    except Exception as exc:
+        logger.debug(
+            "email_reputation: HIBP failed for %s: %s", _safe_log_email(email), exc
+        )
+        return {"found": False, "source": "hibp_error"}
+    if not data or not isinstance(data, list):
+        return {"found": False, "source": "hibp_not_found"}
+    breach_names: list[str] = []
+    breach_dates: list[str] = []
+    password_exposed = False
+    for breach in data:
+        name = breach.get("Name") or breach.get("Title") or ""
+        date = breach.get("BreachDate") or ""
+        data_classes = breach.get("DataClasses") or []
+        if name:
+            breach_names.append(name)
+        if date:
+            breach_dates.append(date)
+        if any("password" in dc.lower() for dc in data_classes):
+            password_exposed = True
+    # YYYY-MM-DD sorts lexicographically — max gives most recent
+    most_recent_breach = max(breach_dates) if breach_dates else None
+    most_recent_name: str | None = None
+    if most_recent_breach and breach_dates:
+        idx = breach_dates.index(most_recent_breach)
+        most_recent_name = breach_names[idx] if idx < len(breach_names) else (breach_names[-1] if breach_names else None)
+    recently_breached = False
+    if most_recent_breach:
+        try:
+            from datetime import datetime, timezone
+            dt = datetime.fromisoformat(most_recent_breach)
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            recently_breached = (datetime.now(timezone.utc) - dt).days < 365
+        except Exception:
+            pass
+    result = {
+        "found": True,
+        "source": "hibp",
+        "breach_count": len(breach_names),
+        "breach_names": breach_names,
+        "breach_dates": breach_dates,
+        "password_exposed": password_exposed,
+        "most_recent_breach": most_recent_breach,
+        "most_recent_name": most_recent_name,
+        "recently_breached": recently_breached,
+    }
+    _hibp_cache[email] = {"result": result, "loaded_at": time.time()}
+    return result
+# ---------------------------------------------------------------------------
+# Source: EmailRep.io
+# ---------------------------------------------------------------------------
+async def query_emailrep(email: str) -> dict[str, Any]:
+    """
+    Query EmailRep.io for reputation data.
+    Optional EMAILREP_API_KEY increases rate limits — works without key.
+    Cached for 12 h.
+    """
+    cached = _emailrep_cache.get(email)
+    if cached and (time.time() - cached["loaded_at"]) < EMAILREP_CACHE_TTL:
+        return cached["result"]
+    api_key = (os.getenv("EMAILREP_API_KEY") or "").strip()
+    headers: dict[str, str] = {
+        "User-Agent": "VoidAccess-OSINT/1.1 (security research)",
+        "Accept": "application/json",
+    }
+    if api_key:
+        headers["Key"] = api_key
+    empty: dict[str, Any] = {
+        "reputation": None,
+        "suspicious": False,
+        "references": 0,
+        "profiles": [],
+        "disposable": False,
+        "free_provider": False,
+        "blacklisted": False,
+        "malicious_activity": False,
+        "credentials_leaked": False,
+    }
+    try:
+        timeout = aiohttp.ClientTimeout(total=15)
+        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
+            async with session.get(f"{EMAILREP_BASE_URL}/{email}") as resp:
+                if resp.status == 400:
+                    logger.debug(
+                        "email_reputation: EmailRep → HTTP 400 for %s (invalid email)",
+                        _safe_log_email(email),
+                    )
+                    return empty
+                if resp.status == 429:
+                    logger.warning("email_reputation: EmailRep — rate limited")
+                    return empty
+                if resp.status != 200:
+                    logger.debug(
+                        "email_reputation: EmailRep → HTTP %s for %s",
+                        resp.status, _safe_log_email(email),
+                    )
+                    return empty
+                data = await resp.json()
+    except Exception as exc:
+        logger.debug(
+            "email_reputation: EmailRep failed for %s: %s", _safe_log_email(email), exc
+        )
+        return empty
+    attributes = data.get("details") or {}
+    result: dict[str, Any] = {
+        "reputation": data.get("reputation"),
+        "suspicious": bool(data.get("suspicious", False)),
+        "references": data.get("references", 0),
+        "profiles": list(attributes.get("profiles") or []),
+        "disposable": bool(attributes.get("disposable", False)),
+        "free_provider": bool(attributes.get("free_provider", False)),
+        "blacklisted": bool(attributes.get("blacklisted", False)),
+        "malicious_activity": bool(attributes.get("malicious_activity", False)),
+        "credentials_leaked": bool(attributes.get("credentials_leaked", False)),
+    }
+    _emailrep_cache[email] = {"result": result, "loaded_at": time.time()}
+    return result
+# ---------------------------------------------------------------------------
+# Core reputation check
+# ---------------------------------------------------------------------------
+async def check_email_reputation(
+    email: str,
+    base_confidence: float = 1.0,
+) -> dict[str, Any]:
+    """
+    Run all four enrichment sources concurrently for a single email address.
+    Returns a structured result dict with keys:
+        email, breached, breach_count, breach_names, password_exposed,
+        most_recent_breach, reputation, suspicious, disposable,
+        malicious_activity, credentials_leaked, platforms,
+        new_entities, tags, confidence_delta
+    """
+    result: dict[str, Any] = {
+        "email": email,
+        "breached": False,
+        "breach_count": 0,
+        "breach_names": [],
+        "password_exposed": False,
+        "most_recent_breach": None,
+        "reputation": None,
+        "suspicious": False,
+        "disposable": False,
+        "malicious_activity": False,
+        "credentials_leaked": False,
+        "platforms": [],
+        "new_entities": [],
+        "tags": [],
+        "confidence_delta": 0.0,
+    }
+    if not _is_valid_email(email):
+        return result
+    domain = _extract_domain(email)
+    if not domain:
+        return result
+    disposable_check, hibp_result, emailrep_result = await asyncio.gather(
+        is_disposable_domain(domain),
+        query_hibp(email),
+        query_emailrep(email),
+        return_exceptions=True,
+    )
+    if isinstance(disposable_check, Exception):
+        logger.debug(
+            "email_reputation: disposable check raised for %s: %s",
+            _safe_log_email(email), disposable_check,
+        )
+        disposable_check = False
+    if isinstance(hibp_result, Exception):
+        logger.debug(
+            "email_reputation: HIBP raised for %s: %s",
+            _safe_log_email(email), hibp_result,
+        )
+        hibp_result = {"found": False}
+    if isinstance(emailrep_result, Exception):
+        logger.debug(
+            "email_reputation: EmailRep raised for %s: %s",
+            _safe_log_email(email), emailrep_result,
+        )
+        emailrep_result = {}
+    # ── Disposable domain check ────────────────────────────────────────────────
+    if disposable_check:
+        result["disposable"] = True
+        result["tags"].append("disposable_email")
+    # ── HIBP ──────────────────────────────────────────────────────────────────
+    if hibp_result.get("found"):
+        count = hibp_result.get("breach_count", 0)
+        result["breached"] = True
+        result["breach_count"] = count
+        result["breach_names"] = hibp_result.get("breach_names", [])
+        result["password_exposed"] = hibp_result.get("password_exposed", False)
+        result["most_recent_breach"] = hibp_result.get("most_recent_breach")
+        result["tags"].append("hibp_breached")
+        result["tags"].append(f"hibp_breach_count_{count}")
+        result["confidence_delta"] += 0.15
+        if hibp_result.get("password_exposed"):
+            result["tags"].append("hibp_password_exposed")
+        if hibp_result.get("recently_breached"):
+            result["tags"].append("recently_breached")
+            name = hibp_result.get("most_recent_name") or ""
+            if name:
+                slug = re.sub(r"[^a-z0-9]+", "_", name.lower())[:40]
+                result["tags"].append(f"recent_breach_{slug}")
+    # ── EmailRep.io ───────────────────────────────────────────────────────────
+    if emailrep_result:
+        result["reputation"] = emailrep_result.get("reputation")
+        result["suspicious"] = emailrep_result.get("suspicious", False)
+        result["platforms"] = emailrep_result.get("profiles", [])
+        if emailrep_result.get("disposable"):
+            result["disposable"] = True
+            if "disposable_email" not in result["tags"]:
+                result["tags"].append("disposable_email")
+        if emailrep_result.get("malicious_activity"):
+            result["malicious_activity"] = True
+            result["tags"].append("emailrep_malicious")
+            result["confidence_delta"] += 0.10
+        if emailrep_result.get("credentials_leaked"):
+            result["credentials_leaked"] = True
+            result["tags"].append("credentials_leaked")
+        if emailrep_result.get("blacklisted"):
+            result["tags"].append("email_blacklisted")
+    # Apply disposable confidence penalty once regardless of which source flagged it
+    if result["disposable"]:
+        result["confidence_delta"] -= 0.10
+    # ── Domain cross-reference (custom domains only) ───────────────────────────
+    if domain and domain not in _FREE_PROVIDERS and not result["disposable"]:
+        result["new_entities"].append({
+            "entity_type": "DOMAIN",
+            "value": domain,
+            "canonical_value": domain,
+            "confidence": 0.75,
+            "source": "email_domain",
+            "extraction_method": "enrich",
+            "context_snippet": (
+                f"Domain extracted from email entity {_safe_log_email(email)}"
+            ),
+        })
+    return result
+# ---------------------------------------------------------------------------
+# DB helpers (sync — called via asyncio.to_thread)
+# ---------------------------------------------------------------------------
+def _update_email_entities_in_db(
+    updates: list[tuple[str, float, list[str]]],
+) -> None:
+    """Update confidence and corroborating_sources for enriched EMAIL_ADDRESS entities."""
+    if not os.getenv("DATABASE_URL") or not updates:
+        return
+    try:
+        from db.session import get_session
+        from db.models import Entity
+        with get_session() as session:
+            for email_val, confidence, tags in updates:
+                db_entity = session.query(Entity).filter(
+                    Entity.entity_type == "EMAIL_ADDRESS",
+                    Entity.value == email_val,
+                ).first()
+                if db_entity is None:
+                    continue
+                if confidence > (db_entity.confidence or 0.0):
+                    db_entity.confidence = confidence
+                if tags:
+                    existing: list = json.loads(db_entity.corroborating_sources or "[]")
+                    for tag in tags:
+                        if tag not in existing:
+                            existing.append(tag)
+                    db_entity.corroborating_sources = json.dumps(existing)
+            session.commit()
+    except Exception as exc:
+        logger.warning("email_reputation: DB update failed: %s", exc)
+# ---------------------------------------------------------------------------
+# Pipeline integration
+# ---------------------------------------------------------------------------
+async def enrich_email_entities(
+    extraction_results: list,
+    investigation_id: Any,
+) -> tuple[list, dict]:
+    """
+    Post-extraction email reputation enrichment step (STEP 6.4).
+    Email addresses extracted from dark web content are already public —
+    they appeared on dark web forums/markets. Querying HIBP and EmailRep
+    constitutes legitimate security research.
+    Collects EMAIL_ADDRESS entities from *extraction_results*.
+    Caps at MAX_EMAILS = 30 per investigation.
+    Confidence floor: 0.50 (disposable addresses may still be real threat actor emails).
+    Confidence ceiling: 1.0.
+    Returns (extraction_results, stats_dict).
+    """
+    seen: dict[str, float] = {}
+    for exr in extraction_results:
+        for entity in getattr(exr, "entities", []):
+            if getattr(entity, "entity_type", "") != "EMAIL_ADDRESS":
+                continue
+            email = getattr(entity, "value", "").strip()
+            if not email or not _is_valid_email(email):
+                continue
+            if email not in seen:
+                seen[email] = getattr(entity, "confidence", 1.0)
+    unique_emails = list(seen.keys())
+    if not unique_emails:
+        return extraction_results, {"email_reputation": "ok_0_emails"}
+    if len(unique_emails) > MAX_EMAILS:
+        logger.info(
+            "email_reputation: capping to %d of %d unique emails",
+            MAX_EMAILS, len(unique_emails),
+        )
+        unique_emails = unique_emails[:MAX_EMAILS]
+    logger.info("email_reputation: checking %d unique email(s)", len(unique_emails))
+    rep_list = await asyncio.gather(
+        *[
+            check_email_reputation(e, base_confidence=seen[e])
+            for e in unique_emails
+        ],
+        return_exceptions=True,
+    )
+    db_updates: list[tuple[str, float, list[str]]] = []
+    all_new_entities: list[dict] = []
+    stats: dict[str, Any] = {
+        "emails_checked": len(unique_emails),
+        "breached": 0,
+        "password_exposed": 0,
+        "disposable": 0,
+        "malicious": 0,
+        "new_entities_discovered": 0,
+    }
+    for email, rep in zip(unique_emails, rep_list):
+        if isinstance(rep, Exception):
+            logger.debug(
+                "email_reputation: check raised for %s: %s",
+                _safe_log_email(email), rep,
+            )
+            continue
+        base_conf = seen[email]
+        delta = rep.get("confidence_delta", 0.0)
+        new_conf = max(0.50, min(base_conf + delta, 1.0))
+        tags = rep.get("tags", [])
+        if rep.get("breached"):
+            stats["breached"] += 1
+        if rep.get("password_exposed"):
+            stats["password_exposed"] += 1
+        if rep.get("disposable"):
+            stats["disposable"] += 1
+        if rep.get("malicious_activity"):
+            stats["malicious"] += 1
+            # High-value identity signal: breach history + malicious activity
+            if rep.get("password_exposed"):
+                domain = _extract_domain(email)
+                logger.info(
+                    "[%s] High-value email entity: %s — "
+                    "breach history + malicious activity confirmed",
+                    investigation_id,
+                    _safe_log_email(email),
+                )
+        new_entities = rep.get("new_entities", [])
+        all_new_entities.extend(new_entities)
+        stats["new_entities_discovered"] += len(new_entities)
+        if tags or delta != 0.0:
+            db_updates.append((email, new_conf, tags))
+    if db_updates:
+        await asyncio.to_thread(_update_email_entities_in_db, db_updates)
+    if all_new_entities:
+        logger.info(
+            "email_reputation: %d new entities discovered (custom domains)",
+            len(all_new_entities),
+        )
+    checked = stats["emails_checked"]
+    status = (
+        f"ok_{checked}_emails"
+        f"_{stats['breached']}_breached"
+        f"_{stats['disposable']}_disposable"
+    )
+    logger.info(
+        "email_reputation: done — %d checked, %d breached, %d passwords exposed, "
+        "%d disposable, %d malicious",
+        checked,
+        stats["breached"],
+        stats["password_exposed"],
+        stats["disposable"],
+        stats["malicious"],
+    )
+    return extraction_results, {"email_reputation": status, **stats}