voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
crawler/spider.py ADDED
@@ -0,0 +1,462 @@
1
+ """
2
+ crawler/spider.py — Async recursive .onion spider (Phase 1C).
3
+
4
+ Public API:
5
+ CrawlResult dataclass — returned by crawl()
6
+ crawl() async function — main entry point
7
+
8
+ All HTTP requests go through the Tor SOCKS5 proxy (TOR_PROXY_HOST /
9
+ TOR_PROXY_PORT from config.py). No clearnet requests to dark web targets.
10
+
11
+ Politeness rules (non-negotiable for Tor stability):
12
+ - Same domain → random 2–8 s delay between consecutive requests
13
+ - New domain → random 0.5–2 s delay on first access
14
+ - Per-domain concurrency cap: 3 simultaneous requests (asyncio.Semaphore)
15
+ - 1 MB download cap per page (identical to scrape.py)
16
+
17
+ Error handling:
18
+ - A failed page is logged, its source marked 'failed' in the DB, and the
19
+ crawl continues — a single bad page never terminates the run.
20
+ - Retry/backoff mirrors scrape.py: up to 3 retries (2 s / 4 s / 8 s),
21
+ no retry on 4xx responses.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import asyncio
27
+ import hashlib
28
+ import logging
29
+ import random
30
+ import time
31
+ from collections import defaultdict
32
+ from dataclasses import dataclass, field
33
+ from typing import Dict, List, Optional, Tuple
34
+ from urllib.parse import urlparse
35
+
36
+ import aiohttp
37
+ from aiohttp_socks import ProxyConnector
38
+
39
+ from config import TOR_PROXY_HOST, TOR_PROXY_PORT
40
+ from crawler.dedup import ContentDedup, UrlDedup
41
+ from crawler.frontier import Frontier
42
+ from crawler.utils import extract_onion_links, is_valid_onion, normalize_url
43
+ from scraper.scrape import _extract_text
44
+
45
_logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Tunables (kept in step with scrape.py where the two overlap)
# ---------------------------------------------------------------------------

# -- Download / result sizing ------------------------------------------------
MAX_DOWNLOAD_BYTES = 1_000_000  # hard cap on bytes read per page (1 MB)
MAX_RETURN_CHARS = 2_000        # max characters of text kept per result entry

# -- Retry policy ------------------------------------------------------------
MAX_RETRIES = 3
RETRY_DELAYS = (2.0, 4.0, 8.0)  # seconds slept before retry 1, 2 and 3
RETRYABLE_STATUS = {500, 502, 503, 504}

# -- Response filtering ------------------------------------------------------
ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml", "text/plain")

# -- Politeness / concurrency ------------------------------------------------
_SAME_DOMAIN_DELAY = (2.0, 8.0)  # (min, max) seconds between hits on one domain
_NEW_DOMAIN_DELAY = (0.5, 2.0)   # (min, max) seconds before the first hit
_DOMAIN_MAX_CONCURRENT = 3       # per-domain asyncio.Semaphore value
_GLOBAL_CONCURRENCY = 10         # max page fetches in flight overall

# -- Request identity --------------------------------------------------------
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) "
    "Gecko/20100101 Firefox/137.0"
)
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Return type
71
+ # ---------------------------------------------------------------------------
72
+
73
@dataclass
class CrawlResult:
    """
    Outcome of one finished crawl run.

    Each entry of *results* is a dict carrying the keys "url" and
    "content" — the same shape as an individual scrape_multiple() entry,
    so either can be fed to the intelligence pipeline.
    """

    # Pages fetched and processed successfully.
    pages_crawled: int = 0
    # Pages whose fetch or processing failed.
    pages_failed: int = 0
    # Previously unseen URLs found while extracting links.
    new_urls_discovered: int = 0
    # One {"url": ..., "content": ...} dict per crawled page.
    results: List[Dict] = field(default_factory=list)
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Spider
90
+ # ---------------------------------------------------------------------------
91
+
92
class Spider:
    """
    Recursive async .onion crawler.

    Instantiate once per crawl run; do not reuse across runs — the dedup
    helpers, per-domain timing state, counters and results list are created
    in __init__ and never reset.
    """

    def __init__(
        self,
        seed_urls: List[str],
        query: str,
        max_depth: int = 2,
        max_pages: int = 200,
        min_relevance: float = 0.3,
    ) -> None:
        """
        Store the crawl parameters and create empty crawl state.

        seed_urls     — starting URLs; normalised and validated in run()
        query         — handed to Frontier, which scores candidate links
        max_depth     — links are only followed from pages at depth < max_depth
        max_pages     — upper bound on the number of pages dispatched by run()
        min_relevance — minimum Frontier score a link needs to be enqueued
        """
        self.seed_urls = seed_urls
        self.query = query
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.min_relevance = min_relevance

        self._frontier = Frontier(query)
        self._url_dedup = UrlDedup()
        self._content_dedup = ContentDedup()

        # Per-domain politeness state
        self._domain_semaphores: Dict[str, asyncio.Semaphore] = defaultdict(
            lambda: asyncio.Semaphore(_DOMAIN_MAX_CONCURRENT)
        )
        self._domain_last_access: Dict[str, float] = {}
        self._timing_lock = asyncio.Lock()

        # Counters
        self._pages_crawled = 0
        self._pages_failed = 0
        self._new_urls_discovered = 0
        self._results: List[Dict] = []

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _decode_body(raw_bytes: bytes, charset: Optional[str]) -> str:
        """
        Decode *raw_bytes* using *charset*, falling back to UTF-8.

        The charset comes from the remote server's Content-Type header and
        may name a codec Python does not know; in that case bytes.decode()
        raises LookupError (or ValueError for a malformed name). Undecodable
        bytes are replaced rather than raising.
        """
        try:
            return raw_bytes.decode(charset or "utf-8", errors="replace")
        except (LookupError, ValueError):
            return raw_bytes.decode("utf-8", errors="replace")

    @staticmethod
    def _make_snippet(text: Optional[str], limit: int) -> str:
        """
        Return *text* cut to at most *limit* characters.

        Text longer than *limit* is shortened so that the result, including
        the "...(truncated)" marker, is exactly *limit* characters long. If
        *limit* is too small to hold the marker, the marker alone is returned.
        """
        content = text or ""
        if len(content) <= limit:
            return content
        suffix = "...(truncated)"
        keep = limit - len(suffix)
        return (content[:keep] + suffix) if keep > 0 else suffix

    # ------------------------------------------------------------------
    # Politeness
    # ------------------------------------------------------------------

    async def _polite_delay(self, domain: str) -> None:
        """
        Compute and sleep the required inter-request delay for *domain*.

        Uses _timing_lock to read/update last-access atomically in the
        event loop; the actual sleep happens outside the lock so other
        coroutines are not blocked.
        """
        async with self._timing_lock:
            last = self._domain_last_access.get(domain)
            now = time.monotonic()
            if last is None:
                delay = random.uniform(*_NEW_DOMAIN_DELAY)
            else:
                # *last* may lie in the future when another coroutine has
                # already reserved a slot; elapsed is then negative and
                # this request is queued behind that reservation.
                elapsed = now - last
                needed = random.uniform(*_SAME_DOMAIN_DELAY)
                delay = max(0.0, needed - elapsed)
            # Reserve the slot so concurrent coroutines don't both sleep 0
            self._domain_last_access[domain] = now + delay

        if delay > 0:
            await asyncio.sleep(delay)

    # ------------------------------------------------------------------
    # Fetch with retry (mirrors scrape.py's _fetch_one)
    # ------------------------------------------------------------------

    async def _fetch(
        self,
        url: str,
        session: aiohttp.ClientSession,
    ) -> Optional[Tuple[bytes, str, str]]:
        """
        Fetch *url* with exponential-backoff retry.

        Returns (raw_bytes, html, extracted_text) on success. Returns None
        when the response status is neither 200 nor retryable, when the
        Content-Type is present but not in ALLOWED_CONTENT_TYPES, or when
        all retries are exhausted. Client errors and timeouts are retried
        and never propagate.
        """
        headers = {
            "User-Agent": _USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
        }
        last_exc: object = None

        for attempt in range(MAX_RETRIES + 1):
            if attempt > 0:
                await asyncio.sleep(RETRY_DELAYS[attempt - 1])

            try:
                async with session.get(url, headers=headers) as resp:
                    if resp.status in RETRYABLE_STATUS:
                        last_exc = f"HTTP {resp.status}"
                        continue

                    if resp.status != 200:
                        return None  # e.g. 4xx — not retried

                    ct = (resp.headers.get("Content-Type") or "").lower()
                    if ct and not any(t in ct for t in ALLOWED_CONTENT_TYPES):
                        return None

                    # Stream with 1 MB hard cap; the chunk that would cross
                    # the cap is dropped, so the body is silently truncated.
                    chunks: List[bytes] = []
                    total = 0
                    async for chunk in resp.content.iter_chunked(8192):
                        if not chunk:
                            continue
                        total += len(chunk)
                        if total > MAX_DOWNLOAD_BYTES:
                            break
                        chunks.append(chunk)

                    raw_bytes = b"".join(chunks)
                    # An unknown server-declared charset must not abort the
                    # fetch, so decoding falls back to UTF-8.
                    html = self._decode_body(raw_bytes, resp.charset)
                    text = _extract_text(html)
                    return raw_bytes, html, text

            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                last_exc = exc

        _logger.debug("All retries exhausted for %s: %s", url, last_exc)
        return None

    # ------------------------------------------------------------------
    # DB helpers (pattern from scrape.py _persist_pages)
    # ------------------------------------------------------------------

    def _db_upsert_source(self, url: str, status: str) -> None:
        """
        Upsert the .onion domain for *url* into the sources table.

        Only sets *status* when the row is newly created; existing rows are
        left at their current status so we never downgrade 'active' → 'discovered'.
        If *status* is 'failed' or 'active' it is always applied (overwrite).

        Best-effort: does nothing when DATABASE_URL is unset, the db modules
        cannot be imported, or the hostname is not a .onion host. Database
        errors are logged at debug level and swallowed.
        """
        try:
            from config import DATABASE_URL as _db_url
            if not _db_url:
                return
            from db.queries import get_or_create_source, update_source_status
            from db.session import get_session
        except ImportError:
            return

        try:
            hostname = (urlparse(url).hostname or "").lower()
            if not hostname.endswith(".onion"):
                return
            with get_session() as session:
                src, created = get_or_create_source(
                    session, hostname, source_type="crawled"
                )
                # Always apply terminal statuses; only apply 'discovered' to new rows
                if status in ("active", "failed") or created:
                    update_source_status(session, src.id, status)
        except Exception as exc:
            _logger.debug("DB source upsert failed url=%s status=%s: %s", url, status, exc)

    def _db_persist_page(
        self,
        url: str,
        raw_bytes: bytes,
        text: str,
        content_hash: str,
    ) -> None:
        """
        Write a successfully scraped page to the database.

        For .onion hosts the source row is created if needed and marked
        'active'; other hosts are stored with source_id=None. Best-effort:
        does nothing when DATABASE_URL is unset or the db modules cannot be
        imported, and database errors are logged at debug level and swallowed.
        """
        try:
            from config import DATABASE_URL as _db_url
            if not _db_url:
                return
            from db.queries import create_page, get_or_create_source, update_source_status
            from db.session import get_session
        except ImportError:
            return

        try:
            with get_session() as session:
                hostname = (urlparse(url).hostname or "").lower()
                source_id = None
                if hostname.endswith(".onion"):
                    src, _ = get_or_create_source(
                        session, hostname, source_type="crawled"
                    )
                    update_source_status(session, src.id, "active")
                    source_id = src.id

                create_page(
                    session,
                    url=url,
                    source_id=source_id,
                    cleaned_text=text,
                    raw_content_hash=content_hash,
                    byte_size=len(raw_bytes),
                )
        except Exception as exc:
            _logger.debug("DB page persist failed url=%s: %s", url, exc)

    # ------------------------------------------------------------------
    # Core page processing
    # ------------------------------------------------------------------

    async def _process_url(
        self,
        url: str,
        depth: int,
        session: aiohttp.ClientSession,
    ) -> None:
        """
        Fetch *url*, extract links, and update all state.

        Acquires the per-domain semaphore after the politeness delay so at
        most _DOMAIN_MAX_CONCURRENT fetches to the same domain run in
        parallel at any time. Any exception raised while processing is
        logged and counted as a failed page; it never propagates.
        """
        domain = (urlparse(url).hostname or url).lower()
        await self._polite_delay(domain)

        async with self._domain_semaphores[domain]:
            try:
                result = await self._fetch(url, session)

                if result is None:
                    self._pages_failed += 1
                    _logger.debug("Fetch returned None for %s", url)
                    self._db_upsert_source(url, "failed")
                    return

                raw_bytes, html, text = result
                content_hash = hashlib.sha256(raw_bytes).hexdigest()

                # Content dedup: skip DB write if hash already stored
                if not self._content_dedup.is_duplicate(content_hash):
                    self._db_persist_page(url, raw_bytes, text, content_hash)
                else:
                    # Source still reached successfully — keep status accurate
                    self._db_upsert_source(url, "active")

                self._pages_crawled += 1

                # Truncate content for the results list
                self._results.append(
                    {"url": url, "content": self._make_snippet(text, MAX_RETURN_CHARS)}
                )

                # Extract and enqueue child links
                if depth < self.max_depth:
                    page_context = (text or "")[:500]
                    for link in extract_onion_links(html, base_url=url):
                        normed = normalize_url(link)
                        if not normed:
                            continue
                        # Only first-time URLs are counted, recorded and
                        # considered for the frontier, so a URL that has
                        # already been seen is never enqueued again.
                        if not self._url_dedup.is_new(normed):
                            continue
                        self._new_urls_discovered += 1
                        self._url_dedup.mark_seen(normed)
                        self._db_upsert_source(normed, "discovered")

                        link_score = self._frontier.score(normed, page_context)
                        if link_score >= self.min_relevance:
                            self._frontier.push(normed, depth + 1, link_score)

            except Exception as exc:
                self._pages_failed += 1
                _logger.warning("Unexpected error processing %s: %s", url, exc, exc_info=True)
                self._db_upsert_source(url, "failed")

    # ------------------------------------------------------------------
    # Main crawl loop
    # ------------------------------------------------------------------

    async def run(self) -> CrawlResult:
        """
        Execute the full crawl and return a CrawlResult.

        Flow:
        1. Normalize and validate seed URLs → push to frontier (score=1.0)
        2. Open one Tor-proxied aiohttp session for the entire run
        3. Dispatch up to _GLOBAL_CONCURRENCY concurrent _process_url tasks
        4. Replenish tasks as each completes; stop when frontier is empty
           or max_pages total have been processed

        If the loop exits abnormally (e.g. run() itself is cancelled), tasks
        still in flight are cancelled and awaited before the session closes.
        """
        for url in self.seed_urls:
            normed = normalize_url(url)
            if not normed or not is_valid_onion(normed):
                _logger.warning("Skipping invalid seed URL: %s", url)
                continue
            if self._url_dedup.is_new(normed):
                self._url_dedup.mark_seen(normed)
                self._db_upsert_source(normed, "discovered")
                self._frontier.push(normed, depth=0, score=1.0)

        if self._frontier.empty():
            _logger.warning("No valid seed URLs; returning empty CrawlResult.")
            return CrawlResult()

        timeout = aiohttp.ClientTimeout(connect=10, sock_read=45)
        connector = ProxyConnector.from_url(
            f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}",
            rdns=True,
        )

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            active: set[asyncio.Task] = set()
            total_processed = 0

            try:
                while True:
                    # Fill task pool up to concurrency cap while pages remain
                    while (
                        not self._frontier.empty()
                        and len(active) < _GLOBAL_CONCURRENCY
                        and total_processed + len(active) < self.max_pages
                    ):
                        url, depth = self._frontier.pop()
                        task = asyncio.create_task(
                            self._process_url(url, depth, session),
                            name=f"crawl:{url}",
                        )
                        active.add(task)

                    if not active:
                        break  # frontier empty, nothing in flight

                    done, active = await asyncio.wait(
                        active, return_when=asyncio.FIRST_COMPLETED
                    )
                    total_processed += len(done)

                    # Propagate any unexpected task exceptions to the log
                    for t in done:
                        if t.cancelled():
                            # Task.exception() would raise CancelledError here
                            _logger.error("Task %s was cancelled", t.get_name())
                            continue
                        exc = t.exception()
                        if exc:
                            _logger.error("Task %s raised: %s", t.get_name(), exc)
            finally:
                # Empty on the normal exit path. On an abnormal exit, make
                # sure no task keeps using the session after it is closed.
                for t in active:
                    t.cancel()
                if active:
                    await asyncio.gather(*active, return_exceptions=True)

        return CrawlResult(
            pages_crawled=self._pages_crawled,
            pages_failed=self._pages_failed,
            new_urls_discovered=self._new_urls_discovered,
            results=self._results,
        )
436
+
437
+
438
+ # ---------------------------------------------------------------------------
439
+ # Public module-level function
440
+ # ---------------------------------------------------------------------------
441
+
442
async def crawl(
    seed_urls: List[str],
    query: str,
    max_depth: int = 2,
    max_pages: int = 200,
    min_relevance: float = 0.3,
) -> CrawlResult:
    """
    Crawl recursively outward from *seed_urls*, favouring links relevant to *query*.

    Convenience wrapper: builds a single-use Spider with the given limits
    and awaits its run(). Every request goes through the Tor SOCKS5 proxy
    configured by TOR_PROXY_HOST / TOR_PROXY_PORT. Returns the resulting
    CrawlResult dataclass.
    """
    return await Spider(
        seed_urls=seed_urls,
        query=query,
        max_depth=max_depth,
        max_pages=max_pages,
        min_relevance=min_relevance,
    ).run()
crawler/utils.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ crawler/utils.py — Link extraction and URL helpers for the .onion crawler.
3
+
4
+ Public API:
5
+ extract_onion_links(html, base_url) → List[str]
6
+ is_valid_onion(url) → bool
7
+ normalize_url(url) → str
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from typing import List
14
+ from urllib.parse import urljoin, urlparse, urlunparse
15
+
16
+ from bs4 import BeautifulSoup
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Compiled regexes
20
+ # ---------------------------------------------------------------------------
21
+
22
+ # Base32 alphabet: a-z and 2-7 (RFC 4648)
23
+ # v2 onion: exactly 16 base32 chars (deprecated but still in the wild)
24
+ # v3 onion: exactly 56 base32 chars
25
# Matches a complete hostname made of one base32 label (16 chars for v2,
# 56 chars for v3) followed by ".onion"; anchored at both ends.
_ONION_HOST_RE = re.compile(r"^(?:[a-z2-7]{16}|[a-z2-7]{56})\.onion$", flags=re.IGNORECASE)
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Public helpers
33
+ # ---------------------------------------------------------------------------
34
+
35
def extract_onion_links(html: str, base_url: str = "") -> List[str]:
    """
    Collect the .onion targets of every <a href> in *html* as absolute URLs.

    - Relative hrefs are joined onto *base_url* when one is given.
    - Empty hrefs, pure fragments ("#...") and "javascript:" links are skipped.
    - Each candidate is normalised, then kept only if is_valid_onion() accepts it.
    - The returned list has no duplicates and keeps first-seen order.
    - Never raises — a parse failure yields [].
    """
    try:
        document = BeautifulSoup(html, "html.parser")
    except Exception:
        return []

    # dict keys preserve insertion order, so one structure provides both
    # the membership test and the ordered output.
    collected: dict[str, None] = {}

    for anchor in document.find_all("a", href=True):
        raw_href = str(anchor["href"]).strip()
        if not raw_href:
            continue
        if raw_href.startswith("#") or raw_href.lower().startswith("javascript:"):
            continue

        # Resolve against the page URL when we have one
        candidate = raw_href
        if base_url:
            try:
                candidate = urljoin(base_url, raw_href)
            except Exception:
                continue

        canonical = normalize_url(candidate)
        if canonical and canonical not in collected and is_valid_onion(canonical):
            collected[canonical] = None

    return list(collected)
75
+
76
+
77
def is_valid_onion(url: str) -> bool:
    """
    Tell whether *url* is a syntactically valid .onion URL.

    The scheme has to be http or https and the hostname has to match
    _ONION_HOST_RE (a 16-char v2 or 56-char v3 base32 label plus ".onion").
    Port, path and query do not affect the result. A URL that cannot be
    parsed is reported as invalid.
    """
    try:
        parts = urlparse(url)
    except Exception:
        return False

    if parts.scheme in ("http", "https"):
        host = (parts.hostname or "").lower()
        return _ONION_HOST_RE.match(host) is not None
    return False
94
+
95
+
96
def normalize_url(url: str) -> str:
    """
    Return a canonical form of *url* suitable for deduplication.

    Transformations applied:
    - Lowercase scheme and host (and port text); any "user:password@"
      userinfo is kept verbatim because it is case-sensitive
    - Strip URL fragment (#…)
    - Strip trailing slashes from path (root "/" becomes empty)
    - Preserve query string and params unchanged

    If *url* cannot be parsed it is returned unchanged.
    """
    try:
        parsed = urlparse(url)
    except Exception:
        return url

    netloc = parsed.netloc
    if isinstance(netloc, str) and "@" in netloc:
        # Lowercase only the host[:port] part; lowercasing the whole netloc
        # would corrupt case-sensitive credentials.
        userinfo, _, host_port = netloc.rpartition("@")
        netloc = f"{userinfo}@{host_port.lower()}"
    else:
        netloc = netloc.lower()

    # rstrip turns "/" into "" and removes trailing slashes from longer paths
    path = parsed.path
    if path:
        path = path.rstrip("/")

    # Rebuild without fragment
    return urlunparse(
        (parsed.scheme.lower(), netloc, path, parsed.params, parsed.query, "")
    )
db/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ db — persistent storage layer (Phase 1A).
3
+
4
+ Public surface:
5
+ Base — SQLAlchemy declarative base; import to create schema
6
+ Investigation — investigation run record
7
+ Source — every .onion domain ever seen
8
+ Page — every scraped page
9
+ Entity — structured intelligence artifact extracted from a page
10
+ EntityRelationship — link between two entities
11
+ investigation_sources — many-to-many junction table (Investigation <-> Source)
12
+ get_engine — create / retrieve a SQLAlchemy Engine
13
+ get_session_factory — return a sessionmaker bound to an engine
14
+ get_session — context-manager that yields a committed/rolled-back Session
15
+ """
16
+
17
+ from db.models import (
18
+ Base,
19
+ Investigation,
20
+ Source,
21
+ Page,
22
+ Entity,
23
+ EntityRelationship,
24
+ investigation_sources,
25
+ SourceStatus,
26
+ SourceType,
27
+ EntityType,
28
+ RelationshipType,
29
+ )
30
+ from db.session import get_engine, get_session_factory, get_session
31
+
32
# Names re-exported by ``from db import *``.
__all__ = [
    # ORM base and models / tables (from db.models)
    "Base",
    "Investigation",
    "Source",
    "Page",
    "Entity",
    "EntityRelationship",
    "investigation_sources",
    # Further names imported from db.models
    "SourceStatus",
    "SourceType",
    "EntityType",
    "RelationshipType",
    # Engine / session helpers (from db.session)
    "get_engine",
    "get_session_factory",
    "get_session",
]
File without changes