voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
config.py ADDED
@@ -0,0 +1,180 @@
1
+ import os
2
+ import secrets
3
+ import logging
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def _clean_env(name, default=None):
    """
    Read environment variable *name* and return a tidied string.

    The value (or *default* when the variable is absent) is converted with
    ``str()`` and stripped of surrounding whitespace.  If what remains is
    wrapped in a matching pair of single or double quotes, the quotes are
    removed and the inner text is stripped again.

    Returns None only when the variable is absent and *default* is None.
    """
    raw = os.getenv(name, default)
    if raw is None:
        return None

    text = str(raw).strip()
    # Values pasted into .env files sometimes keep their quotes; unwrap one
    # matching pair.  A lone quote character (length 1) is left untouched.
    is_wrapped = (
        len(text) >= 2
        and text[0] == text[-1]
        and text[0] in ('"', "'")
    )
    return text[1:-1].strip() if is_wrapped else text
22
+
23
+ # Configuration variables loaded from the .env file
24
+ OPENAI_API_KEY = _clean_env("OPENAI_API_KEY")
25
+ GOOGLE_API_KEY = _clean_env("GOOGLE_API_KEY")
26
+ ANTHROPIC_API_KEY = _clean_env("ANTHROPIC_API_KEY")
27
+ OLLAMA_BASE_URL = _clean_env("OLLAMA_BASE_URL")
28
+ OPENROUTER_BASE_URL = _clean_env("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
29
+ OPENROUTER_API_KEY = _clean_env("OPENROUTER_API_KEY")
30
+ GROQ_API_KEY = _clean_env("GROQ_API_KEY", "")
31
+ LLAMA_CPP_BASE_URL = _clean_env("LLAMA_CPP_BASE_URL")
32
+
33
+ # Database
34
+ DATABASE_URL = _clean_env("DATABASE_URL")
35
+
36
+ # Tor proxy (configurable so Docker can point to a tor service by name)
37
+ TOR_PROXY_HOST = _clean_env("TOR_PROXY_HOST", "127.0.0.1")
38
+ TOR_PROXY_PORT = _clean_env("TOR_PROXY_PORT", "9050")
39
+
40
+ # Phase 1D — expanded sources (all optional; missing vars disable that source)
41
+ DARKSEARCH_API_KEY = _clean_env("DARKSEARCH_API_KEY") # optional free-tier key
42
+ TELEGRAM_API_ID = _clean_env("TELEGRAM_API_ID") # from my.telegram.org
43
+ TELEGRAM_API_HASH = _clean_env("TELEGRAM_API_HASH") # from my.telegram.org
44
+ TELEGRAM_PHONE = _clean_env("TELEGRAM_PHONE") # E.164, e.g. +12025551234
45
+
46
+ # Phase 4 — vector store + alert channels (all optional)
47
+ CHROMA_PERSIST_DIR = _clean_env("CHROMA_PERSIST_DIR", "./chroma_db")
48
+ TELEGRAM_BOT_TOKEN = _clean_env("TELEGRAM_BOT_TOKEN") # one-way alerts; separate from Telethon API vars
49
+ SMTP_HOST = _clean_env("SMTP_HOST")
50
+ SMTP_PORT = _clean_env("SMTP_PORT", "587")
51
+ SMTP_USER = _clean_env("SMTP_USER")
52
+ SMTP_PASS = _clean_env("SMTP_PASS")
53
+
54
+ # Phase 5 — REST API server
55
+ API_HOST = _clean_env("API_HOST", "0.0.0.0")
56
+ API_PORT = _clean_env("API_PORT", "8000")
57
+
58
+ # Phase 6 — advanced capabilities
59
+ DEEPL_API_KEY = _clean_env("DEEPL_API_KEY") # optional; translation
60
+ STYLOMETRY_THRESHOLD = _clean_env("STYLOMETRY_THRESHOLD", "0.85") # same-author detection
61
+
62
+ # LLM extraction cache (optional — defaults to enabled)
63
# Only the literal string "true" (any letter case) turns the cache off; an
# unset variable or any other value leaves the flag False.
DISABLE_EXTRACTION_CACHE = (_clean_env("DISABLE_EXTRACTION_CACHE") or "").lower() == "true"
68
+
69
+ # i18n / Query Expansion
70
+ # Languages to include in query expansion (comma-separated ISO 639-1 codes)
71
+ # Default: en, ru, zh (English, Russian, Chinese)
72
# Parse the comma-separated list, dropping blank items.  If nothing usable
# remains (variable empty, or only separators such as ","), fall back to the
# default languages rather than ending up with an empty list.
I18N_LANGUAGES = [
    lang.strip()
    for lang in (_clean_env("I18N_LANGUAGES", "en,ru,zh") or "").split(",")
    if lang.strip()
] or ["en", "ru", "zh"]
77
+
78
+ # Threat Intelligence API Keys
79
+ OTX_API_KEY = _clean_env("OTX_API_KEY", "") # AlienVault OTX — free at otx.alienvault.com
80
+ VT_API_KEY = _clean_env("VT_API_KEY", "") # VirusTotal — free tier at virustotal.com
81
+
82
+ # IP Reputation Enrichment (all optional — features degrade gracefully without keys)
83
+ ABUSEIPDB_API_KEY = _clean_env("ABUSEIPDB_API_KEY", "") # Community IP abuse reports
84
+ GREYNOISE_API_KEY = _clean_env("GREYNOISE_API_KEY", "") # Suppresses benign scanner IPs
85
+ C2_FEED_CACHE_TTL = _clean_env("C2_FEED_CACHE_TTL", "24") # Hours between feed refreshes
86
+
87
+ # Domain Reputation Enrichment
88
+ URLSCAN_API_KEY = _clean_env("URLSCAN_API_KEY", "")
89
+ SECURITYTRAILS_API_KEY = _clean_env("SECURITYTRAILS_API_KEY", "")
90
+
91
+ # Code Intelligence (GitHub / GitLab scraping)
92
+ GITHUB_TOKEN = _clean_env("GITHUB_TOKEN", "")
93
+ GITLAB_TOKEN = _clean_env("GITLAB_TOKEN", "")
94
+
95
+ # Hash Reputation Enrichment
96
+ HYBRID_ANALYSIS_API_KEY = _clean_env("HYBRID_ANALYSIS_API_KEY", "")
97
+
98
+ # Email Reputation Enrichment
99
+ HIBP_API_KEY = _clean_env("HIBP_API_KEY", "")
100
+ EMAILREP_API_KEY = _clean_env("EMAILREP_API_KEY", "")
101
+
102
+ SHODAN_RATE_LIMIT_DELAY = 1.0 # seconds between Shodan requests (InternetDB)
103
+ MAX_IPS_PER_INVESTIGATION = 50 # max IPs to query Shodan per investigation
104
+ MAX_HASHES_PER_INVESTIGATION = 20 # max file hashes to query VirusTotal per investigation
105
+
106
+ # Blockchain API Keys (optional — free tiers work without)
107
+ BLOCKCYPHER_TOKEN = _clean_env("BLOCKCYPHER_TOKEN", "")
108
+ ETHERSCAN_API_KEY = _clean_env("ETHERSCAN_API_KEY", "")
109
+
110
# Auth — REQUIRED in production. Generate with: python -c "import secrets; print(secrets.token_hex(32))"
_jwt_secret = _clean_env("JWT_SECRET")
# Reject an empty value as well as a missing one: `JWT_SECRET=` in a .env file
# yields "" from _clean_env, and an empty signing secret is as unsafe as none.
if not _jwt_secret:
    raise RuntimeError(
        "JWT_SECRET environment variable is not set. "
        "Generate a secure secret with: python -c \"import secrets; print(secrets.token_hex(32))\" "
        "and set it as JWT_SECRET in your .env file. "
        "Do NOT use a random secret — it will change on restart and invalidate all issued tokens."
    )
JWT_SECRET = _jwt_secret
120
+
121
+ # Token blacklist Redis (optional — omit to disable blacklist checks)
122
+ REDIS_URL = _clean_env("REDIS_URL")
123
+
124
+ # Playwright JS rendering for JavaScript-heavy .onion sites
125
+ # Set to False to disable (faster startup, lower memory usage)
126
+ # Requires: playwright installed + browsers downloaded
127
# A non-None default is supplied, so _clean_env always returns a string here
# and no None fallback is needed.  Only "true" (any letter case) enables
# Playwright; every other value disables it.
PLAYWRIGHT_ENABLED = _clean_env("PLAYWRIGHT_ENABLED", "true").lower() == "true"
132
+
133
+
134
+ DEFAULT_MODELS = {
135
+ "openrouter": "deepseek/deepseek-chat",
136
+ "openai": "gpt-4o-mini",
137
+ "anthropic": "claude-haiku-4-5-20251001",
138
+ "google": "gemini-1.5-flash",
139
+ "groq": "llama-3.3-70b-versatile",
140
+ "ollama": "llama3.2",
141
+ }
142
+
143
# Read through _clean_env like every other setting in this module so stray
# whitespace or wrapping quotes in the environment value are removed.
DEFAULT_MODEL = _clean_env(
    "DEFAULT_MODEL",
    "openrouter/deepseek/deepseek-chat",
)
147
+
148
+ REQUIRED_KEYS = [
149
+ "JWT_SECRET",
150
+ ]
151
+
152
+ OPTIONAL_KEYS = [
153
+ "OPENAI_API_KEY",
154
+ "GOOGLE_API_KEY",
155
+ "ANTHROPIC_API_KEY",
156
+ "OTX_API_KEY",
157
+ "DEEPL_API_KEY",
158
+ "DARKSEARCH_API_KEY",
159
+ "OLLAMA_BASE_URL",
160
+ "OPENROUTER_API_KEY",
161
+ "LLAMA_CPP_BASE_URL",
162
+ "BLOCKCYPHER_TOKEN",
163
+ "ETHERSCAN_API_KEY",
164
+ ]
165
+
166
+
167
def validate_config():
    """
    Check that required settings are present and note absent optional ones.

    Each name in REQUIRED_KEYS is re-read from the environment via
    _clean_env; a key that is missing or empty after cleaning counts as
    missing.  Optional keys that are missing or empty are only reported at
    DEBUG level.

    Raises:
        RuntimeError: if one or more required keys are missing or empty.
    """
    # An empty string is as unusable as an unset variable, so test
    # truthiness rather than comparing against None.
    missing_required = [key for key in REQUIRED_KEYS if not _clean_env(key)]
    if missing_required:
        raise RuntimeError(f"Missing required configuration keys: {', '.join(missing_required)}")

    for key in OPTIONAL_KEYS:
        if not _clean_env(key):
            # Plain format string with lazy %-args; no f-prefix is needed.
            logger.debug("Optional configuration key '%s' is not set - related features will be disabled", key)
178
+
179
+
180
+ validate_config()
crawler/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ crawler — Phase 1C recursive .onion spider.
3
+
4
+ Public interface:
5
+ CrawlResult dataclass — crawl statistics + scraped content
6
+ crawl() async fn — entry point; accepts seeds, query, and tuning params
7
+
8
+ Example
9
+ -------
10
+ import asyncio
11
+ from crawler import CrawlResult, crawl
12
+
13
+ result: CrawlResult = asyncio.run(crawl(
14
+ seed_urls=["http://exampleonionaddressv3aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.onion"],
15
+ query="ransomware affiliate recruitment",
16
+ max_depth=2,
17
+ max_pages=50,
18
+ min_relevance=0.3,
19
+ ))
20
+
21
+ print(result.pages_crawled, result.new_urls_discovered)
22
+ for page in result.results:
23
+ print(page["url"], page["content"][:200])
24
+ """
25
+
26
+ from crawler.spider import CrawlResult, crawl
27
+
28
+ __all__ = ["CrawlResult", "crawl"]
crawler/dedup.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ crawler/dedup.py — Two-level deduplication for the .onion crawler.
3
+
4
+ Level 1 — URL dedup:
5
+ In-memory set; never visits the same normalized URL twice within a run.
6
+
7
+ Level 2 — Content dedup:
8
+ SHA-256 of extracted text is checked against the pages table before a DB
9
+ write. If the hash already exists the write is skipped, but the URL is
10
+ still counted as visited (content was seen elsewhere — no need to store it
11
+ again, but crawling this URL was not wasted work).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import logging
18
+ from typing import Optional
19
+
20
+ _logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Level 1 — URL deduplication
25
+ # ---------------------------------------------------------------------------
26
+
27
class UrlDedup:
    """
    Remembers which URLs have already been handled during one crawl run.

    URLs are compared exactly as given; any normalization must be done by
    the caller before calling is_new() / mark_seen().  State is held in a
    plain in-memory set and no locking is performed.
    """

    def __init__(self) -> None:
        # Every URL passed to mark_seen() so far.
        self._seen: set[str] = set()

    def is_new(self, url: str) -> bool:
        """Report whether *url* has not yet been passed to mark_seen()."""
        if url in self._seen:
            return False
        return True

    def mark_seen(self, url: str) -> None:
        """Record *url*; later is_new(url) calls return False."""
        self._seen.add(url)

    def __len__(self) -> int:
        """Number of distinct URLs recorded so far."""
        return len(self._seen)
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Level 2 — Content deduplication
52
+ # ---------------------------------------------------------------------------
53
+
54
class ContentDedup:
    """
    Content-hash deduplication backed by the database pages table.

    Provides SHA-256 helpers for raw bytes and for text, plus a lookup that
    asks the database whether a given hash is already stored.

    NOTE(review): the original author states that hashing raw download
    bytes matches scrape.py's _persist_pages; that code is not visible
    here — confirm before relying on cross-pipeline hash equality.

    The lookup reports "not a duplicate" whenever no database URL is
    available or the check fails, so a missing database never stops a crawl.
    """

    @staticmethod
    def hash_bytes(data: bytes) -> str:
        """Return the SHA-256 hex digest of *data*."""
        return hashlib.sha256(data).hexdigest()

    @staticmethod
    def hash_text(text: str) -> str:
        """
        Return the SHA-256 hex digest of *text* encoded as UTF-8.

        Characters that cannot be encoded (e.g. lone surrogates) are
        replaced rather than raising.
        """
        return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()

    @staticmethod
    def is_duplicate(content_hash: str, db_url: Optional[str] = None) -> bool:
        """
        Return True if *content_hash* already exists in the pages table.

        *db_url*, when given, is used as the database URL; otherwise
        DATABASE_URL from the config module is used.  If neither is set the
        result is False.

        Any exception (DB unreachable, import error, etc.) is logged at
        DEBUG level and reported as False so the crawler never crashes on
        dedup.
        """
        try:
            target = db_url
            if not target:
                # Import config only when it is actually needed: importing
                # it can raise (it validates required settings at import
                # time), and that must not defeat an explicit db_url.
                from config import DATABASE_URL as _cfg_db_url
                target = _cfg_db_url
            if not target:
                return False
            from db.queries import get_page_by_hash
            from db.session import get_session
            with get_session(target) as session:
                return get_page_by_hash(session, content_hash) is not None
        except Exception as exc:
            _logger.debug("ContentDedup DB check failed: %s", exc)
            return False
crawler/frontier.py ADDED
@@ -0,0 +1,115 @@
1
+ """
2
+ crawler/frontier.py — Priority-queue crawl frontier with relevance scoring.
3
+
4
+ Each URL is scored by cosine similarity between "<url> <snippet>" and the
5
+ investigation query using the sentence-transformers all-MiniLM-L6-v2 model.
6
+ Higher score → popped sooner (max-priority implemented via negated scores on
7
+ a min-heap).
8
+
9
+ Uses vector.model_singleton for the shared SentenceTransformer instance.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import heapq
15
+ import logging
16
+ from typing import Optional, Tuple
17
+
18
+ import numpy as np
19
+
20
+ from vector.model_singleton import get_embedding_model
21
+
22
+ _logger = logging.getLogger(__name__)
23
+
24
+
25
def _get_model():
    """Fetch the shared embedding model via vector.model_singleton."""
    model = get_embedding_model()
    return model
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Cosine similarity
32
+ # ---------------------------------------------------------------------------
33
+
34
def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine similarity of *a* and *b*, limited to the range [0.0, 1.0].

    Negative similarities are reported as 0.0.  A tiny constant is added to
    the denominator so an all-zero vector gives 0.0 instead of a division
    by zero.
    """
    magnitude = np.linalg.norm(a) * np.linalg.norm(b)
    denom = float(magnitude) + 1e-10
    raw = np.dot(a, b) / denom
    return float(np.clip(raw, 0.0, 1.0))
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Frontier
42
+ # ---------------------------------------------------------------------------
43
+
44
class Frontier:
    """
    Relevance-ordered crawl queue.

    Entries are kept on a heapq min-heap as
    ``(-score, insertion_index, url, depth)`` tuples, so the entry with the
    largest score surfaces first and entries with equal scores come out in
    insertion order.

    No locking is performed; intended for use from a single asyncio event
    loop.
    """

    def __init__(self, query: str) -> None:
        self._query = query
        self._heap: list = []
        # Strictly increasing insertion index; it breaks score ties so the
        # heap never has to compare the url/depth fields.
        self._counter = 0
        # Filled in on first use by _query_emb().
        self._query_embedding: Optional[np.ndarray] = None

    # ------------------------------------------------------------------
    # Embedding helpers
    # ------------------------------------------------------------------

    def _query_emb(self) -> np.ndarray:
        """Embed the investigation query once and reuse the result."""
        cached = self._query_embedding
        if cached is None:
            cached = _get_model().encode(
                self._query, convert_to_numpy=True
            )
            self._query_embedding = cached
        return cached

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def score(self, url: str, snippet: str = "") -> float:
        """
        Rate how relevant *url* (plus optional *snippet*) is to the query.

        The text "<url> <snippet>" is embedded and compared with the query
        embedding through _cosine, giving a value in 0.0–1.0.  When no
        model is available, or embedding raises, the neutral value 0.5 is
        returned so crawling can continue.
        """
        text = f"{url} {snippet}".strip()
        try:
            model = _get_model()
            if model is not None:
                candidate = model.encode(text, convert_to_numpy=True)
                return _cosine(candidate, self._query_emb())
        except Exception as exc:
            _logger.debug("Frontier.score error: %s", exc)
        # Reached when the model is missing or scoring failed.
        return 0.5

    def push(self, url: str, depth: int, score: float) -> None:
        """
        Enqueue *url* at *depth* with an already computed *score*.

        Scoring is kept separate (see score()) so callers can discard
        low-relevance URLs before they are queued.
        """
        entry = (-score, self._counter, url, depth)
        heapq.heappush(self._heap, entry)
        self._counter += 1

    def pop(self) -> Tuple[str, int]:
        """
        Remove and return ``(url, depth)`` of the best-scoring entry.

        Raises IndexError when nothing is queued.
        """
        entry = heapq.heappop(self._heap)
        return entry[2], entry[3]

    def empty(self) -> bool:
        """Return True when no entries are queued."""
        return len(self._heap) == 0

    def __len__(self) -> int:
        """Number of queued entries."""
        return len(self._heap)