voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
config.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import secrets
|
|
3
|
+
import logging
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
|
|
6
|
+
load_dotenv()
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _clean_env(name, default=None):
    """Read environment variable *name* and return it as a tidied string.

    Args:
        name: Environment variable to look up.
        default: Fallback used when the variable is absent. Non-string
            defaults are converted with ``str()``.

    Returns:
        ``None`` when the variable is absent and no default was given;
        otherwise the value with surrounding whitespace removed and, if the
        whole value is wrapped in one matching pair of single or double
        quotes, with that pair (and any whitespace just inside it) removed.
    """
    raw = os.getenv(name, default)
    if raw is None:
        return None

    text = str(raw).strip()
    # Values pasted into .env sometimes keep their surrounding quotes; peel
    # off at most one matching pair.
    for quote in ('"', "'"):
        if len(text) >= 2 and text[0] == quote and text[-1] == quote:
            return text[1:-1].strip()
    return text
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
# Settings read from the environment (.env is loaded at import time).
# Every value below comes back from _clean_env, so it is either None or a
# string -- ports and thresholds are NOT converted to numbers here.
# ---------------------------------------------------------------------------

# LLM providers
OPENAI_API_KEY = _clean_env("OPENAI_API_KEY")
GOOGLE_API_KEY = _clean_env("GOOGLE_API_KEY")
ANTHROPIC_API_KEY = _clean_env("ANTHROPIC_API_KEY")
OLLAMA_BASE_URL = _clean_env("OLLAMA_BASE_URL")
OPENROUTER_BASE_URL = _clean_env("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
OPENROUTER_API_KEY = _clean_env("OPENROUTER_API_KEY")
GROQ_API_KEY = _clean_env("GROQ_API_KEY", "")
LLAMA_CPP_BASE_URL = _clean_env("LLAMA_CPP_BASE_URL")

# Database connection string
DATABASE_URL = _clean_env("DATABASE_URL")

# Tor proxy endpoint -- overridable so Docker can address a tor service by name
TOR_PROXY_HOST = _clean_env("TOR_PROXY_HOST", "127.0.0.1")
TOR_PROXY_PORT = _clean_env("TOR_PROXY_PORT", "9050")

# Phase 1D -- expanded sources (all optional; a missing variable disables that source)
DARKSEARCH_API_KEY = _clean_env("DARKSEARCH_API_KEY")  # optional free-tier key
TELEGRAM_API_ID = _clean_env("TELEGRAM_API_ID")  # from my.telegram.org
TELEGRAM_API_HASH = _clean_env("TELEGRAM_API_HASH")  # from my.telegram.org
TELEGRAM_PHONE = _clean_env("TELEGRAM_PHONE")  # E.164, e.g. +12025551234

# Phase 4 -- vector store + alert channels (all optional)
CHROMA_PERSIST_DIR = _clean_env("CHROMA_PERSIST_DIR", "./chroma_db")
TELEGRAM_BOT_TOKEN = _clean_env("TELEGRAM_BOT_TOKEN")  # one-way alerts; separate from the Telethon API vars
SMTP_HOST = _clean_env("SMTP_HOST")
SMTP_PORT = _clean_env("SMTP_PORT", "587")
SMTP_USER = _clean_env("SMTP_USER")
SMTP_PASS = _clean_env("SMTP_PASS")

# Phase 5 -- REST API server bind address
API_HOST = _clean_env("API_HOST", "0.0.0.0")
API_PORT = _clean_env("API_PORT", "8000")

# Phase 6 -- advanced capabilities
DEEPL_API_KEY = _clean_env("DEEPL_API_KEY")  # optional; translation
STYLOMETRY_THRESHOLD = _clean_env("STYLOMETRY_THRESHOLD", "0.85")  # same-author detection
|
|
61
|
+
|
|
62
|
+
# LLM extraction cache switch (optional). The cache stays enabled unless the
# variable is set to the string "true" (case-insensitive); an unset variable
# or any other value leaves the flag False.
DISABLE_EXTRACTION_CACHE = (_clean_env("DISABLE_EXTRACTION_CACHE") or "").lower() == "true"
|
|
68
|
+
|
|
69
|
+
# i18n / Query Expansion
# Languages to include in query expansion (comma-separated ISO 639-1 codes).
# Default: en, ru, zh (English, Russian, Chinese).
#
# Blank entries are dropped. If nothing usable remains -- the variable is
# empty, or holds only separators such as "," -- the default list is used
# rather than silently producing an empty list of languages.
I18N_LANGUAGES = [
    lang.strip()
    for lang in (_clean_env("I18N_LANGUAGES", "en,ru,zh") or "").split(",")
    if lang.strip()
] or ["en", "ru", "zh"]
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
# Enrichment credentials. Each key defaults to the empty string when unset.
# ---------------------------------------------------------------------------

# Threat intelligence
OTX_API_KEY = _clean_env("OTX_API_KEY", "")  # AlienVault OTX -- free at otx.alienvault.com
VT_API_KEY = _clean_env("VT_API_KEY", "")  # VirusTotal -- free tier at virustotal.com

# IP reputation (all optional -- features degrade gracefully without keys)
ABUSEIPDB_API_KEY = _clean_env("ABUSEIPDB_API_KEY", "")  # community IP abuse reports
GREYNOISE_API_KEY = _clean_env("GREYNOISE_API_KEY", "")  # suppresses benign scanner IPs
C2_FEED_CACHE_TTL = _clean_env("C2_FEED_CACHE_TTL", "24")  # hours between feed refreshes (kept as a string)

# Domain reputation
URLSCAN_API_KEY = _clean_env("URLSCAN_API_KEY", "")
SECURITYTRAILS_API_KEY = _clean_env("SECURITYTRAILS_API_KEY", "")

# Code intelligence (GitHub / GitLab scraping)
GITHUB_TOKEN = _clean_env("GITHUB_TOKEN", "")
GITLAB_TOKEN = _clean_env("GITLAB_TOKEN", "")

# Hash reputation
HYBRID_ANALYSIS_API_KEY = _clean_env("HYBRID_ANALYSIS_API_KEY", "")

# Email reputation
HIBP_API_KEY = _clean_env("HIBP_API_KEY", "")
EMAILREP_API_KEY = _clean_env("EMAILREP_API_KEY", "")

# Fixed per-investigation limits (not configurable through the environment)
SHODAN_RATE_LIMIT_DELAY = 1.0  # seconds between Shodan requests (InternetDB)
MAX_IPS_PER_INVESTIGATION = 50  # max IPs to query Shodan per investigation
MAX_HASHES_PER_INVESTIGATION = 20  # max file hashes to query VirusTotal per investigation

# Blockchain API keys (optional -- free tiers work without)
BLOCKCYPHER_TOKEN = _clean_env("BLOCKCYPHER_TOKEN", "")
ETHERSCAN_API_KEY = _clean_env("ETHERSCAN_API_KEY", "")
|
|
109
|
+
|
|
110
|
+
# Auth — REQUIRED in production. Generate with: python -c "import secrets; print(secrets.token_hex(32))"
_jwt_secret = _clean_env("JWT_SECRET")
# Reject an empty value as well as a missing one: "JWT_SECRET=" in .env yields
# the empty string, which is not a usable signing secret and must not slip
# through as if the variable had been configured.
if not _jwt_secret:
    raise RuntimeError(
        "JWT_SECRET environment variable is not set or is empty. "
        "Generate a secure secret with: python -c \"import secrets; print(secrets.token_hex(32))\" "
        "and set it as JWT_SECRET in your .env file. "
        "Do NOT use a random secret — it will change on restart and invalidate all issued tokens."
    )
JWT_SECRET = _jwt_secret
|
|
120
|
+
|
|
121
|
+
# Redis backing the token blacklist (optional -- leave unset to disable blacklist checks)
REDIS_URL = _clean_env("REDIS_URL")

# Playwright JS rendering for JavaScript-heavy .onion sites.
# Requires: playwright installed + browsers downloaded.
# Enabled by default; only the string "true" (case-insensitive) keeps it on,
# so any other value (e.g. "false") disables it for faster startup and lower
# memory usage.
PLAYWRIGHT_ENABLED = _clean_env("PLAYWRIGHT_ENABLED", "true").lower() == "true"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Model identifier used for each LLM provider, keyed by provider name.
DEFAULT_MODELS = dict(
    openrouter="deepseek/deepseek-chat",
    openai="gpt-4o-mini",
    anthropic="claude-haiku-4-5-20251001",
    google="gemini-1.5-flash",
    groq="llama-3.3-70b-versatile",
    ollama="llama3.2",
)
|
|
142
|
+
|
|
143
|
+
# Model selected when the caller does not choose one. Read through _clean_env
# like every other setting in this module so stray whitespace or quotes
# around the value are stripped instead of ending up in the model name.
DEFAULT_MODEL = _clean_env(
    "DEFAULT_MODEL",
    "openrouter/deepseek/deepseek-chat",
)
|
|
147
|
+
|
|
148
|
+
# Environment variables that validate_config() insists on.
REQUIRED_KEYS = ["JWT_SECRET"]

# Environment variables that validate_config() only reports (at debug level)
# when they are missing.
OPTIONAL_KEYS = [
    # hosted LLM / intel / translation / search credentials
    "OPENAI_API_KEY",
    "GOOGLE_API_KEY",
    "ANTHROPIC_API_KEY",
    "OTX_API_KEY",
    "DEEPL_API_KEY",
    "DARKSEARCH_API_KEY",
    # further LLM endpoints
    "OLLAMA_BASE_URL",
    "OPENROUTER_API_KEY",
    "LLAMA_CPP_BASE_URL",
    # blockchain explorers
    "BLOCKCYPHER_TOKEN",
    "ETHERSCAN_API_KEY",
]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def validate_config():
    """Check that required settings are present and note missing optional ones.

    A key counts as missing when its environment variable is unset or is
    empty after cleaning: an empty required value (for example
    ``JWT_SECRET=``) is no more usable than an absent one.

    Raises:
        RuntimeError: if any key in ``REQUIRED_KEYS`` is missing; the message
            lists every missing key.
    """
    missing_required = [key for key in REQUIRED_KEYS if not _clean_env(key)]
    if missing_required:
        raise RuntimeError(f"Missing required configuration keys: {', '.join(missing_required)}")

    for key in OPTIONAL_KEYS:
        if not _clean_env(key):
            # Plain %-style template (no f-prefix): logging fills in the
            # argument itself, and only when debug output is enabled.
            logger.debug("Optional configuration key '%s' is not set - related features will be disabled", key)


# Validate at import time so a misconfigured deployment fails immediately.
validate_config()
|
crawler/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crawler — Phase 1C recursive .onion spider.
|
|
3
|
+
|
|
4
|
+
Public interface:
|
|
5
|
+
CrawlResult dataclass — crawl statistics + scraped content
|
|
6
|
+
crawl() async fn — entry point; accepts seeds, query, and tuning params
|
|
7
|
+
|
|
8
|
+
Example
|
|
9
|
+
-------
|
|
10
|
+
import asyncio
|
|
11
|
+
from crawler import CrawlResult, crawl
|
|
12
|
+
|
|
13
|
+
result: CrawlResult = asyncio.run(crawl(
|
|
14
|
+
seed_urls=["http://exampleonionaddressv3aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.onion"],
|
|
15
|
+
query="ransomware affiliate recruitment",
|
|
16
|
+
max_depth=2,
|
|
17
|
+
max_pages=50,
|
|
18
|
+
min_relevance=0.3,
|
|
19
|
+
))
|
|
20
|
+
|
|
21
|
+
print(result.pages_crawled, result.new_urls_discovered)
|
|
22
|
+
for page in result.results:
|
|
23
|
+
print(page["url"], page["content"][:200])
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from crawler.spider import CrawlResult, crawl
|
|
27
|
+
|
|
28
|
+
__all__ = ["CrawlResult", "crawl"]
|
crawler/dedup.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crawler/dedup.py — Two-level deduplication for the .onion crawler.
|
|
3
|
+
|
|
4
|
+
Level 1 — URL dedup:
|
|
5
|
+
In-memory set; never visits the same normalized URL twice within a run.
|
|
6
|
+
|
|
7
|
+
Level 2 — Content dedup:
|
|
8
|
+
SHA-256 of extracted text is checked against the pages table before a DB
|
|
9
|
+
write. If the hash already exists the write is skipped, but the URL is
|
|
10
|
+
still counted as visited (content was seen elsewhere — no need to store it
|
|
11
|
+
again, but crawling this URL was not wasted work).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import hashlib
|
|
17
|
+
import logging
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
_logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Level 1 — URL deduplication
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
class UrlDedup:
    """
    Per-run record of URLs the crawler has already handled.

    Backed by a plain in-memory set, so nothing persists between runs.
    Intended for use from a single asyncio event loop; every method is
    synchronous.
    """

    def __init__(self) -> None:
        # URLs exactly as given by the caller; no normalization happens here.
        self._seen: set[str] = set()

    def is_new(self, url: str) -> bool:
        """Return True when *url* has not yet been marked as seen."""
        already_visited = url in self._seen
        return not already_visited

    def mark_seen(self, url: str) -> None:
        """Record *url* so later is_new() calls for it return False."""
        self._seen.add(url)

    def __len__(self) -> int:
        """Return the number of distinct URLs recorded so far."""
        return len(self._seen)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Level 2 — Content deduplication
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
class ContentDedup:
    """
    Content-hash deduplication backed by the database pages table.

    Uses SHA-256 of raw download bytes (same convention as scrape.py's
    _persist_pages so hashes are consistent across both pipelines).

    Falls back to "not a duplicate" when no database URL is available or the
    DB is unreachable — this keeps the crawler running even without a
    database.
    """

    @staticmethod
    def hash_bytes(data: bytes) -> str:
        """Return the SHA-256 hex digest of raw bytes."""
        return hashlib.sha256(data).hexdigest()

    @staticmethod
    def hash_text(text: str) -> str:
        """Return the SHA-256 hex digest of *text* encoded as UTF-8.

        Characters that cannot be encoded (e.g. lone surrogates) are replaced
        rather than raising, so any str can be hashed.
        """
        return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()

    @staticmethod
    def is_duplicate(content_hash: str, db_url: Optional[str] = None) -> bool:
        """
        Return True if *content_hash* already exists in the pages table.

        *db_url* overrides DATABASE_URL — used in tests to point at the
        test SQLite database without modifying the real config. When it is
        given, the config module is not imported at all, so the override
        keeps working even where config cannot be loaded.

        Any exception (DB unreachable, import error, etc.) is logged at
        debug level and reported as False so the crawler never crashes on
        dedup.
        """
        try:
            target = db_url
            if not target:
                # Deferred import: config runs its own validation when first
                # imported and may raise, so touch it only when there is no
                # explicit URL to use.
                from config import DATABASE_URL as _cfg_db_url
                target = _cfg_db_url
            if not target:
                return False
            from db.queries import get_page_by_hash
            from db.session import get_session
            with get_session(target) as session:
                return get_page_by_hash(session, content_hash) is not None
        except Exception as exc:
            _logger.debug("ContentDedup DB check failed: %s", exc)
            return False
|
crawler/frontier.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crawler/frontier.py — Priority-queue crawl frontier with relevance scoring.
|
|
3
|
+
|
|
4
|
+
Each URL is scored by cosine similarity between "<url> <snippet>" and the
|
|
5
|
+
investigation query using the sentence-transformers all-MiniLM-L6-v2 model.
|
|
6
|
+
Higher score → popped sooner (max-priority implemented via negated scores on
|
|
7
|
+
a min-heap).
|
|
8
|
+
|
|
9
|
+
Uses vector.model_singleton for the shared SentenceTransformer instance.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import heapq
|
|
15
|
+
import logging
|
|
16
|
+
from typing import Optional, Tuple
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from vector.model_singleton import get_embedding_model
|
|
21
|
+
|
|
22
|
+
_logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_model():
    """Fetch the shared embedding model from vector.model_singleton."""
    shared_model = get_embedding_model()
    return shared_model
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Cosine similarity
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of *a* and *b*, limited to [0.0, 1.0].

    Negative similarities are clipped to 0.0.
    """
    # The small epsilon keeps the division defined when either norm is zero.
    magnitude = float(np.linalg.norm(a) * np.linalg.norm(b)) + 1e-10
    similarity = np.dot(a, b) / magnitude
    return float(np.clip(similarity, 0.0, 1.0))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Frontier
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
class Frontier:
    """
    Crawl frontier that hands out the most relevant URL first.

    Entries live on a heapq min-heap keyed by the *negated* relevance score,
    so the highest score is popped first; entries with equal scores come out
    in insertion order.

    Not thread-safe — designed for a single asyncio event loop.
    """

    def __init__(self, query: str) -> None:
        self._query = query
        self._heap: list = []
        # Monotonically increasing tie-breaker: heap tuples are ordered by
        # (score, counter), so the URL strings themselves are never compared.
        self._counter = 0
        # Filled lazily by _query_emb() on first use.
        self._query_embedding: Optional[np.ndarray] = None

    # ------------------------------------------------------------------
    # Embedding helpers
    # ------------------------------------------------------------------

    def _query_emb(self) -> np.ndarray:
        """Return the query embedding, computing it on the first call only."""
        cached = self._query_embedding
        if cached is None:
            cached = _get_model().encode(self._query, convert_to_numpy=True)
            self._query_embedding = cached
        return cached

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def score(self, url: str, snippet: str = "") -> float:
        """
        Rate how relevant *url* (plus optional *snippet*) is to the query.

        The text "<url> <snippet>" is embedded and compared with the query
        embedding by cosine similarity, giving a value in 0.0–1.0. If no
        model is available or anything raises, the neutral score 0.5 is
        returned so the crawler degrades gracefully.
        """
        candidate = f"{url} {snippet}".strip()
        try:
            encoder = _get_model()
            if encoder is not None:
                vector = encoder.encode(candidate, convert_to_numpy=True)
                return _cosine(vector, self._query_emb())
        except Exception as exc:
            _logger.debug("Frontier.score error: %s", exc)
        return 0.5

    def push(self, url: str, depth: int, score: float) -> None:
        """
        Enqueue *url* at *depth* with an already-computed *score*.

        Scoring is kept separate (see Frontier.score()) so callers can drop
        URLs below their min_relevance before enqueueing them.
        """
        entry = (-score, self._counter, url, depth)
        self._counter += 1
        heapq.heappush(self._heap, entry)

    def pop(self) -> Tuple[str, int]:
        """
        Remove and return (url, depth) for the highest-scoring entry.

        Raises IndexError if the frontier is empty.
        """
        entry = heapq.heappop(self._heap)
        return entry[2], entry[3]

    def empty(self) -> bool:
        """Return True when nothing is queued."""
        return len(self._heap) == 0

    def __len__(self) -> int:
        """Return the number of queued entries."""
        return len(self._heap)
|