voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
crawler/spider.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crawler/spider.py — Async recursive .onion spider (Phase 1C).
|
|
3
|
+
|
|
4
|
+
Public API:
|
|
5
|
+
CrawlResult dataclass — returned by crawl()
|
|
6
|
+
crawl() async function — main entry point
|
|
7
|
+
|
|
8
|
+
All HTTP requests go through the Tor SOCKS5 proxy (TOR_PROXY_HOST /
|
|
9
|
+
TOR_PROXY_PORT from config.py). No clearnet requests to dark web targets.
|
|
10
|
+
|
|
11
|
+
Politeness rules (non-negotiable for Tor stability):
|
|
12
|
+
- Same domain → random 2–8 s delay between consecutive requests
|
|
13
|
+
- New domain → random 0.5–2 s delay on first access
|
|
14
|
+
- Per-domain concurrency cap: 3 simultaneous requests (asyncio.Semaphore)
|
|
15
|
+
- 1 MB download cap per page (identical to scrape.py)
|
|
16
|
+
|
|
17
|
+
Error handling:
|
|
18
|
+
- A failed page is logged, its source marked 'failed' in the DB, and the
|
|
19
|
+
crawl continues — a single bad page never terminates the run.
|
|
20
|
+
- Retry/backoff mirrors scrape.py: up to 3 retries (2 s / 4 s / 8 s),
|
|
21
|
+
no retry on 4xx responses.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import asyncio
|
|
27
|
+
import hashlib
|
|
28
|
+
import logging
|
|
29
|
+
import random
|
|
30
|
+
import time
|
|
31
|
+
from collections import defaultdict
|
|
32
|
+
from dataclasses import dataclass, field
|
|
33
|
+
from typing import Dict, List, Optional, Tuple
|
|
34
|
+
from urllib.parse import urlparse
|
|
35
|
+
|
|
36
|
+
import aiohttp
|
|
37
|
+
from aiohttp_socks import ProxyConnector
|
|
38
|
+
|
|
39
|
+
from config import TOR_PROXY_HOST, TOR_PROXY_PORT
|
|
40
|
+
from crawler.dedup import ContentDedup, UrlDedup
|
|
41
|
+
from crawler.frontier import Frontier
|
|
42
|
+
from crawler.utils import extract_onion_links, is_valid_onion, normalize_url
|
|
43
|
+
from scraper.scrape import _extract_text
|
|
44
|
+
|
|
45
|
+
_logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Constants (mirror scrape.py where applicable)
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
MAX_DOWNLOAD_BYTES = 1_000_000 # 1 MB hard cap
|
|
52
|
+
MAX_RETURN_CHARS = 2_000 # truncation in results list
|
|
53
|
+
MAX_RETRIES = 3
|
|
54
|
+
RETRY_DELAYS = (2.0, 4.0, 8.0) # seconds before retry 1, 2, 3
|
|
55
|
+
RETRYABLE_STATUS = {500, 502, 503, 504}
|
|
56
|
+
ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml", "text/plain")
|
|
57
|
+
|
|
58
|
+
_SAME_DOMAIN_DELAY = (2.0, 8.0) # seconds, random within range
|
|
59
|
+
_NEW_DOMAIN_DELAY = (0.5, 2.0) # seconds, random within range
|
|
60
|
+
_DOMAIN_MAX_CONCURRENT = 3 # asyncio.Semaphore value per domain
|
|
61
|
+
_GLOBAL_CONCURRENCY = 10 # max simultaneous page fetches overall
|
|
62
|
+
|
|
63
|
+
_USER_AGENT = (
|
|
64
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) "
|
|
65
|
+
"Gecko/20100101 Firefox/137.0"
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Return type
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
@dataclass
class CrawlResult:
    """
    Outcome of one crawl run, as returned by crawl().

    Every entry of *results* is a dict with the keys "url" and "content".
    The entries have the same shape as those produced by scrape_multiple(),
    so either source can feed the intelligence pipeline.
    """

    # Pages fetched and processed successfully.
    pages_crawled: int = 0
    # Pages whose fetch or processing failed.
    pages_failed: int = 0
    # Previously unseen URLs found in crawled pages.
    new_urls_discovered: int = 0
    # One {"url": ..., "content": ...} dict per crawled page.
    results: List[Dict] = field(default_factory=list)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# Spider
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
class Spider:
    """
    Recursive async .onion crawler.

    Instantiate once per crawl run; do not reuse across runs — the dedup
    state, counters and results list all live on the instance.
    """

    def __init__(
        self,
        seed_urls: List[str],
        query: str,
        max_depth: int = 2,
        max_pages: int = 200,
        min_relevance: float = 0.3,
    ) -> None:
        """
        Store the crawl limits and create fresh per-run state.

        seed_urls     — starting URLs; validated and enqueued by run()
        query         — handed to Frontier, which scores discovered links
        max_depth     — links on a page at this depth are not followed
        max_pages     — upper bound on pages dispatched during the run
        min_relevance — discovered links scoring below this are not enqueued
        """
        self.seed_urls = seed_urls
        self.query = query
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.min_relevance = min_relevance

        self._frontier = Frontier(query)
        self._url_dedup = UrlDedup()
        self._content_dedup = ContentDedup()

        # Per-domain politeness state
        self._domain_semaphores: Dict[str, asyncio.Semaphore] = defaultdict(
            lambda: asyncio.Semaphore(_DOMAIN_MAX_CONCURRENT)
        )
        self._domain_last_access: Dict[str, float] = {}
        self._timing_lock = asyncio.Lock()

        # Counters
        self._pages_crawled = 0
        self._pages_failed = 0
        self._new_urls_discovered = 0
        self._results: List[Dict] = []

    # ------------------------------------------------------------------
    # Politeness
    # ------------------------------------------------------------------

    async def _polite_delay(self, domain: str) -> None:
        """
        Compute and sleep the required inter-request delay for *domain*.

        The last-access timestamp is read and updated under _timing_lock;
        the sleep itself happens after the lock is released so other
        coroutines are not held up while this one waits.
        """
        async with self._timing_lock:
            last = self._domain_last_access.get(domain)
            now = time.monotonic()
            if last is None:
                delay = random.uniform(*_NEW_DOMAIN_DELAY)
            else:
                # *last* may lie in the future when another coroutine has
                # already reserved a slot; elapsed is then negative and this
                # request is queued behind that reservation.
                elapsed = now - last
                needed = random.uniform(*_SAME_DOMAIN_DELAY)
                delay = max(0.0, needed - elapsed)
            # Reserve the slot so concurrent coroutines don't both sleep 0
            self._domain_last_access[domain] = now + delay

        if delay > 0:
            await asyncio.sleep(delay)

    # ------------------------------------------------------------------
    # Fetch with retry (mirrors scrape.py's _fetch_one)
    # ------------------------------------------------------------------

    @staticmethod
    def _decode_body(raw_bytes: bytes, charset: Optional[str]) -> str:
        """
        Decode *raw_bytes* using *charset*, falling back to UTF-8.

        The charset comes from the server's Content-Type header and may be
        missing, misspelled or not a text encoding at all; none of those
        cases should cost us the page. Undecodable bytes are replaced.
        """
        try:
            return raw_bytes.decode(charset or "utf-8", errors="replace")
        except (LookupError, ValueError):
            # Unknown or malformed encoding name advertised by the server.
            return raw_bytes.decode("utf-8", errors="replace")

    async def _fetch(
        self,
        url: str,
        session: aiohttp.ClientSession,
    ) -> Optional[Tuple[bytes, str, str]]:
        """
        Fetch *url* with exponential-backoff retry.

        Returns (raw_bytes, html, extracted_text) on success. Returns None
        when the status is neither 200 nor retryable, when the content type
        is not allowed, or when every attempt failed with a client/timeout
        error or a retryable status. Network errors never propagate.
        """
        headers = {
            "User-Agent": _USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
        }
        last_exc: object = None

        for attempt in range(MAX_RETRIES + 1):
            if attempt > 0:
                # Clamp the index so raising MAX_RETRIES beyond the number
                # of configured delays reuses the last delay.
                delay_idx = min(attempt, len(RETRY_DELAYS)) - 1
                await asyncio.sleep(RETRY_DELAYS[delay_idx])

            try:
                async with session.get(url, headers=headers) as resp:
                    if resp.status in RETRYABLE_STATUS:
                        last_exc = f"HTTP {resp.status}"
                        continue

                    if resp.status != 200:
                        return None  # e.g. 4xx — not retried

                    # A missing Content-Type is tolerated; a present one
                    # must contain one of the allowed types.
                    ct = (resp.headers.get("Content-Type") or "").lower()
                    if ct and not any(t in ct for t in ALLOWED_CONTENT_TYPES):
                        return None

                    # Stream with 1 MB hard cap; the chunk that would cross
                    # the cap is dropped, so the body is truncated.
                    chunks: List[bytes] = []
                    total = 0
                    async for chunk in resp.content.iter_chunked(8192):
                        if not chunk:
                            continue
                        total += len(chunk)
                        if total > MAX_DOWNLOAD_BYTES:
                            break
                        chunks.append(chunk)

                    raw_bytes = b"".join(chunks)
                    html = self._decode_body(raw_bytes, resp.charset)
                    text = _extract_text(html)
                    return raw_bytes, html, text

            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                last_exc = exc

        _logger.debug("All retries exhausted for %s: %s", url, last_exc)
        return None

    # ------------------------------------------------------------------
    # DB helpers (pattern from scrape.py _persist_pages)
    # ------------------------------------------------------------------

    def _db_upsert_source(self, url: str, status: str) -> None:
        """
        Upsert the .onion domain for *url* into the sources table.

        Status rules:
          - 'active' and 'failed' are always written, overwriting whatever
            the row held before.
          - Any other status (in practice 'discovered') is written only when
            the row was newly created, so an existing source is never
            downgraded from 'active' to 'discovered'.

        Best-effort: does nothing when DATABASE_URL is unset or the db
        package cannot be imported; DB errors are logged at debug level.
        """
        try:
            from config import DATABASE_URL as _db_url
            if not _db_url:
                return
            from db.queries import get_or_create_source, update_source_status
            from db.session import get_session
        except ImportError:
            return

        try:
            hostname = (urlparse(url).hostname or "").lower()
            if not hostname.endswith(".onion"):
                return
            with get_session() as session:
                src, created = get_or_create_source(
                    session, hostname, source_type="crawled"
                )
                # Always apply terminal statuses; only apply 'discovered' to new rows
                if status in ("active", "failed") or created:
                    update_source_status(session, src.id, status)
        except Exception as exc:
            _logger.debug("DB source upsert failed url=%s status=%s: %s", url, status, exc)

    def _db_persist_page(
        self,
        url: str,
        raw_bytes: bytes,
        text: str,
        content_hash: str,
    ) -> None:
        """
        Write a successfully scraped page to the database.

        For .onion hosts the source row is created if needed, marked
        'active' and linked to the page; otherwise the page is stored with
        source_id=None. Best-effort, like _db_upsert_source: silently
        skipped without a configured database, and errors are only logged.
        """
        try:
            from config import DATABASE_URL as _db_url
            if not _db_url:
                return
            from db.queries import create_page, get_or_create_source, update_source_status
            from db.session import get_session
        except ImportError:
            return

        try:
            with get_session() as session:
                hostname = (urlparse(url).hostname or "").lower()
                source_id = None
                if hostname.endswith(".onion"):
                    src, _ = get_or_create_source(
                        session, hostname, source_type="crawled"
                    )
                    update_source_status(session, src.id, "active")
                    source_id = src.id

                create_page(
                    session,
                    url=url,
                    source_id=source_id,
                    cleaned_text=text,
                    raw_content_hash=content_hash,
                    byte_size=len(raw_bytes),
                )
        except Exception as exc:
            _logger.debug("DB page persist failed url=%s: %s", url, exc)

    # ------------------------------------------------------------------
    # Core page processing
    # ------------------------------------------------------------------

    async def _process_url(
        self,
        url: str,
        depth: int,
        session: aiohttp.ClientSession,
    ) -> None:
        """
        Fetch *url*, extract links, and update all state.

        Acquires the per-domain semaphore after the politeness delay so at
        most _DOMAIN_MAX_CONCURRENT fetches to the same domain run in
        parallel at any time. Any ordinary exception is caught, counted as
        a failed page and logged, so one bad page never ends the crawl.

        NOTE(review): the _db_* helpers use a synchronous session and are
        called directly from this coroutine — confirm that blocking the
        event loop for their duration is acceptable.
        """
        domain = (urlparse(url).hostname or url).lower()
        await self._polite_delay(domain)

        async with self._domain_semaphores[domain]:
            try:
                result = await self._fetch(url, session)

                if result is None:
                    self._pages_failed += 1
                    _logger.debug("Fetch returned None for %s", url)
                    self._db_upsert_source(url, "failed")
                    return

                raw_bytes, html, text = result
                content_hash = hashlib.sha256(raw_bytes).hexdigest()

                # Content dedup: skip DB write if hash already stored
                if not self._content_dedup.is_duplicate(content_hash):
                    self._db_persist_page(url, raw_bytes, text, content_hash)
                else:
                    # Source still reached successfully — keep status accurate
                    self._db_upsert_source(url, "active")

                self._pages_crawled += 1

                # Truncate content for the results list; the suffix counts
                # towards the MAX_RETURN_CHARS budget.
                body = text or ""
                if len(body) > MAX_RETURN_CHARS:
                    suffix = "...(truncated)"
                    available = MAX_RETURN_CHARS - len(suffix)
                    snippet = (body[:available] + suffix) if available > 0 else suffix
                else:
                    snippet = body

                self._results.append({"url": url, "content": snippet})

                # Extract and enqueue child links
                if depth < self.max_depth:
                    # Same page context for every link — compute it once.
                    context = body[:500]
                    for link in extract_onion_links(html, base_url=url):
                        normed = normalize_url(link)
                        if not normed:
                            continue
                        if self._url_dedup.is_new(normed):
                            self._new_urls_discovered += 1
                            self._url_dedup.mark_seen(normed)
                            self._db_upsert_source(normed, "discovered")

                            link_score = self._frontier.score(normed, context)
                            if link_score >= self.min_relevance:
                                self._frontier.push(normed, depth + 1, link_score)

            except Exception as exc:
                self._pages_failed += 1
                _logger.warning("Unexpected error processing %s: %s", url, exc, exc_info=True)
                self._db_upsert_source(url, "failed")

    # ------------------------------------------------------------------
    # Main crawl loop
    # ------------------------------------------------------------------

    async def run(self) -> CrawlResult:
        """
        Execute the full crawl and return a CrawlResult.

        Flow:
          1. Normalize and validate seed URLs → push to frontier (score=1.0)
          2. Open one Tor-proxied aiohttp session for the entire run
          3. Dispatch up to _GLOBAL_CONCURRENCY concurrent _process_url tasks
          4. Replenish tasks as each completes; stop when frontier is empty
             or max_pages total have been processed

        If the loop is interrupted (for example run() itself is cancelled),
        tasks still in flight are cancelled and awaited before the session
        closes, so none is left running against a closed session.
        """
        for url in self.seed_urls:
            normed = normalize_url(url)
            if not normed or not is_valid_onion(normed):
                _logger.warning("Skipping invalid seed URL: %s", url)
                continue
            if self._url_dedup.is_new(normed):
                self._url_dedup.mark_seen(normed)
                self._db_upsert_source(normed, "discovered")
                self._frontier.push(normed, depth=0, score=1.0)

        if self._frontier.empty():
            _logger.warning("No valid seed URLs; returning empty CrawlResult.")
            return CrawlResult()

        timeout = aiohttp.ClientTimeout(connect=10, sock_read=45)
        connector = ProxyConnector.from_url(
            f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}",
            rdns=True,
        )

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            active: set[asyncio.Task] = set()
            total_processed = 0

            try:
                while True:
                    # Fill task pool up to concurrency cap while pages remain
                    while (
                        not self._frontier.empty()
                        and len(active) < _GLOBAL_CONCURRENCY
                        and total_processed + len(active) < self.max_pages
                    ):
                        url, depth = self._frontier.pop()
                        task = asyncio.create_task(
                            self._process_url(url, depth, session),
                            name=f"crawl:{url}",
                        )
                        active.add(task)

                    if not active:
                        break  # frontier empty, nothing in flight

                    done, active = await asyncio.wait(
                        active, return_when=asyncio.FIRST_COMPLETED
                    )
                    total_processed += len(done)

                    # Propagate any unexpected task exceptions to the log
                    for t in done:
                        if t.cancelled():
                            # exception() would raise CancelledError here
                            _logger.debug("Task %s was cancelled", t.get_name())
                            continue
                        exc = t.exception()
                        if exc:
                            _logger.error("Task %s raised: %s", t.get_name(), exc)
            finally:
                # Empty after a normal finish; non-empty only on early exit.
                if active:
                    for t in active:
                        t.cancel()
                    await asyncio.gather(*active, return_exceptions=True)

        return CrawlResult(
            pages_crawled=self._pages_crawled,
            pages_failed=self._pages_failed,
            new_urls_discovered=self._new_urls_discovered,
            results=self._results,
        )
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
# Public module-level function
|
|
440
|
+
# ---------------------------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
async def crawl(
    seed_urls: List[str],
    query: str,
    max_depth: int = 2,
    max_pages: int = 200,
    min_relevance: float = 0.3,
) -> CrawlResult:
    """
    Crawl recursively from *seed_urls*, favouring links relevant to *query*.

    Builds a single-use Spider with the given limits and runs it. Every
    request goes through the Tor SOCKS5 proxy configured in
    TOR_PROXY_HOST / TOR_PROXY_PORT. Returns the run's CrawlResult.
    """
    # Spider's constructor takes these same five parameters in this order.
    return await Spider(
        seed_urls,
        query,
        max_depth,
        max_pages,
        min_relevance,
    ).run()
|
crawler/utils.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crawler/utils.py — Link extraction and URL helpers for the .onion crawler.
|
|
3
|
+
|
|
4
|
+
Public API:
|
|
5
|
+
extract_onion_links(html, base_url) → List[str]
|
|
6
|
+
is_valid_onion(url) → bool
|
|
7
|
+
normalize_url(url) → str
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from typing import List
|
|
14
|
+
from urllib.parse import urljoin, urlparse, urlunparse
|
|
15
|
+
|
|
16
|
+
from bs4 import BeautifulSoup
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Compiled regexes
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
# Base32 alphabet: a-z and 2-7 (RFC 4648)
|
|
23
|
+
# v2 onion: exactly 16 base32 chars (deprecated but still in the wild)
|
|
24
|
+
# v3 onion: exactly 56 base32 chars
|
|
25
|
+
_ONION_HOST_RE = re.compile(
|
|
26
|
+
r"^(?:[a-z2-7]{16}|[a-z2-7]{56})\.onion$",
|
|
27
|
+
re.IGNORECASE,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Public helpers
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
def extract_onion_links(html: str, base_url: str = "") -> List[str]:
    """
    Collect the .onion targets of every <a href> in *html* as absolute URLs.

    - Relative hrefs are resolved against *base_url* when one is given.
    - Each candidate is normalized, then kept only if is_valid_onion()
      accepts it.
    - The result preserves document order and holds each URL once.
    - Returns [] if the HTML cannot be parsed.
    """
    try:
        document = BeautifulSoup(html, "html.parser")
    except Exception:
        return []

    ordered: List[str] = []
    emitted: set[str] = set()

    for anchor in document.find_all("a", href=True):
        raw_href = str(anchor["href"]).strip()

        # Empty hrefs, in-page anchors and script pseudo-links are skipped.
        if raw_href == "" or raw_href.startswith("#"):
            continue
        if raw_href.lower().startswith("javascript:"):
            continue

        # Without a base URL the href is taken as-is.
        candidate = raw_href
        if base_url:
            try:
                candidate = urljoin(base_url, raw_href)
            except Exception:
                continue

        canonical = normalize_url(candidate)
        if canonical and canonical not in emitted and is_valid_onion(canonical):
            emitted.add(canonical)
            ordered.append(canonical)

    return ordered
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def is_valid_onion(url: str) -> bool:
    """
    Tell whether *url* is a syntactically valid .onion URL.

    The scheme has to be http or https, and the hostname must be a v2
    (16-char base32) or v3 (56-char base32) onion name. Port, path, and
    query do not affect the result. Returns False when the URL cannot be
    parsed.
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False

    if parts.scheme in ("http", "https"):
        host = (parts.hostname or "").lower()
        return _ONION_HOST_RE.match(host) is not None
    return False
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def normalize_url(url: str) -> str:
    """
    Return a canonical form of *url* suitable for deduplication.

    Transformations applied:
      - Lowercase scheme and host (and port); any "user:password@" prefix
        in the authority keeps its case, because userinfo is case-sensitive
      - Strip URL fragment (#…)
      - Strip trailing slashes from path (root "/" becomes empty)
      - Preserve query string and params unchanged

    Returns *url* unchanged when it cannot be parsed.
    """
    try:
        parsed = urlparse(url)
    except Exception:
        return url

    # Split on the last "@": everything before it is userinfo and must not
    # be case-folded. Without an "@", userinfo and separator are both empty.
    userinfo, separator, hostport = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{separator}{hostport.lower()}"

    # rstrip covers every case: "/" → "", "/a/b//" → "/a/b", "" → "".
    path = parsed.path.rstrip("/")

    # Rebuild without fragment
    return urlunparse(
        (parsed.scheme.lower(), netloc, path, parsed.params, parsed.query, "")
    )
|
db/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
db — persistent storage layer (Phase 1A).
|
|
3
|
+
|
|
4
|
+
Public surface:
|
|
5
|
+
Base — SQLAlchemy declarative base; import to create schema
|
|
6
|
+
Investigation — investigation run record
|
|
7
|
+
Source — every .onion domain ever seen
|
|
8
|
+
Page — every scraped page
|
|
9
|
+
Entity — structured intelligence artifact extracted from a page
|
|
10
|
+
EntityRelationship — link between two entities
|
|
11
|
+
investigation_sources — many-to-many junction table (Investigation <-> Source)
|
|
12
|
+
get_engine — create / retrieve a SQLAlchemy Engine
|
|
13
|
+
get_session_factory — return a sessionmaker bound to an engine
|
|
14
|
+
get_session — context-manager that yields a committed/rolled-back Session
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from db.models import (
|
|
18
|
+
Base,
|
|
19
|
+
Investigation,
|
|
20
|
+
Source,
|
|
21
|
+
Page,
|
|
22
|
+
Entity,
|
|
23
|
+
EntityRelationship,
|
|
24
|
+
investigation_sources,
|
|
25
|
+
SourceStatus,
|
|
26
|
+
SourceType,
|
|
27
|
+
EntityType,
|
|
28
|
+
RelationshipType,
|
|
29
|
+
)
|
|
30
|
+
from db.session import get_engine, get_session_factory, get_session
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"Base",
|
|
34
|
+
"Investigation",
|
|
35
|
+
"Source",
|
|
36
|
+
"Page",
|
|
37
|
+
"Entity",
|
|
38
|
+
"EntityRelationship",
|
|
39
|
+
"investigation_sources",
|
|
40
|
+
"SourceStatus",
|
|
41
|
+
"SourceType",
|
|
42
|
+
"EntityType",
|
|
43
|
+
"RelationshipType",
|
|
44
|
+
"get_engine",
|
|
45
|
+
"get_session_factory",
|
|
46
|
+
"get_session",
|
|
47
|
+
]
|
|
File without changes
|