voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
scraper/scrape.py ADDED
@@ -0,0 +1,857 @@
1
+ """
2
+ scrape.py — async .onion / clearnet page fetcher for VoidAccess.
3
+
4
+ Public API (unchanged from Phase 0 — ui.py compatibility guaranteed):
5
+ scrape_multiple(urls_data, max_workers=5) -> Dict[str, str]
6
+ scrape_single(url_data, ...) -> Tuple[str, str]
7
+ get_tor_session() -> requests.Session
8
+
9
+ Internals rewritten in Phase 1B:
10
+ ThreadPoolExecutor + requests → asyncio + aiohttp-socks
11
+ BeautifulSoup-only extraction → trafilatura first, BeautifulSoup fallback
12
+ hardcoded 127.0.0.1:9050 → TOR_PROXY_HOST / TOR_PROXY_PORT from config
13
+ no retry → bounded retry driven by MAX_RETRIES / RETRY_DELAYS (currently one retry after 1 s)
14
+ no DB persistence → pages written to Phase 1A db/ layer when DATABASE_URL is set
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import hashlib
21
+ import ipaddress
22
+ import logging
23
+ import random
24
+ import re
25
+ import warnings
26
+ from datetime import datetime, timezone
27
+ from typing import Dict, List, Optional, Tuple
28
+ from urllib.parse import urlparse, urlunparse
29
+
30
+ import aiohttp
31
+ import requests
32
+ import trafilatura
33
+ from aiohttp_socks import ProxyConnector
34
+ from bs4 import BeautifulSoup
35
+ from requests.adapters import HTTPAdapter
36
+ from urllib3.util.retry import Retry
37
+
38
+ from config import TOR_PROXY_HOST, TOR_PROXY_PORT, PLAYWRIGHT_ENABLED
39
+
40
+ warnings.filterwarnings("ignore")
41
+
42
+ _logger = logging.getLogger(__name__)
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Module-level constants (identical to Phase 0 — ui.py depends on these)
46
+ # ---------------------------------------------------------------------------
47
+
48
+ USER_AGENTS = [
49
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
50
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
51
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
52
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
53
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
54
+ "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
55
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
56
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
57
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
58
+ ]
59
+
60
+ MAX_DOWNLOAD_BYTES = 1_000_000
61
+ MAX_EXTRACTED_TEXT_CHARS = 50_000
62
+ MAX_RETURN_CHARS = 15_000
63
+ ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml", "text/plain")
64
+
65
+ # Retry configuration
66
+ MAX_RETRIES = 1
67
+ RETRY_DELAYS = (1.0,) # seconds before attempt 1
68
+ RETRYABLE_STATUS = {500, 502, 503, 504}
69
+
70
+ # Tor circuit error patterns - indicates circuit failure, not URL failure
71
+ SOCKS_ERRORS = (
72
+ "SOCKS5",
73
+ "socks5",
74
+ "Host unreachable",
75
+ "Connection refused",
76
+ "General SOCKS",
77
+ "circuit",
78
+ "Tor circuit",
79
+ )
80
+
81
# Internal / loopback / link-local ranges — clearnet fetches to these are
# refused (SSRF prevention). Membership tests against a network of the other
# IP version simply evaluate to False, so v4 and v6 entries can share one list.
_BLOCKED_IP_RANGES = [
    ipaddress.ip_network("0.0.0.0/8"),       # "this host"; 0.0.0.0 is commonly routed to the local machine
    ipaddress.ip_network("10.0.0.0/8"),      # RFC 1918 private
    ipaddress.ip_network("100.64.0.0/10"),   # RFC 6598 shared (carrier-grade NAT) space
    ipaddress.ip_network("172.16.0.0/12"),   # RFC 1918 private
    ipaddress.ip_network("192.168.0.0/16"),  # RFC 1918 private
    ipaddress.ip_network("127.0.0.0/8"),     # IPv4 loopback
    ipaddress.ip_network("169.254.0.0/16"),  # IPv4 link-local (cloud metadata lives here)
    ipaddress.ip_network("::/128"),          # IPv6 unspecified address
    ipaddress.ip_network("::1/128"),         # IPv6 loopback
    ipaddress.ip_network("fc00::/7"),        # IPv6 unique-local
    ipaddress.ip_network("fe80::/10"),       # IPv6 link-local
]

# Hostnames refused outright, before any DNS resolution is attempted.
_BLOCKED_HOSTNAMES = frozenset(
    {
        "localhost",
        "metadata.google.internal",
        "169.254.169.254",
    }
)
99
+
100
# Common HTML timestamp patterns (forums / JSON-LD).
# Each entry is (regex with exactly one capture group, format tag). The tag is
# either a strptime format string or one of the special markers "iso"
# (datetime.fromisoformat) and "unix10" (10-digit epoch seconds).
_TIMESTAMP_PATTERNS = [
    (r'<time[^>]+datetime="([^"]+)"', "iso"),
    (r"[Pp]osted[:\s]+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})", "%Y-%m-%d %H:%M:%S"),
    (r"[Dd]ate[:\s]+(\d{2}/\d{2}/\d{4})", "%d/%m/%Y"),
    (r'data-timestamp="(\d{10})"', "unix10"),
    (
        r'"datePublished":\s*"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})"',
        "%Y-%m-%dT%H:%M:%S",
    ),
]

# Anything older than this is treated as parse noise, not a real post date.
_MIN_POST_TIMESTAMP = datetime(2010, 1, 1, tzinfo=timezone.utc)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _parse_timestamp_value(value: str, fmt: str) -> datetime:
    """
    Convert one captured timestamp string into a timezone-aware UTC datetime.

    ``fmt`` is a format tag from ``_TIMESTAMP_PATTERNS``. Naive results are
    assumed to be UTC. Raises ValueError / OverflowError / OSError when the
    value cannot be parsed; the caller treats those as "try the next pattern".
    """
    if fmt == "iso":
        # fromisoformat() only understands a literal "Z" suffix on newer
        # Python versions, so spell the UTC offset out explicitly.
        iso_text = (value[:-1] + "+00:00") if value.endswith("Z") else value
        parsed = datetime.fromisoformat(iso_text[:32])
    elif fmt == "unix10":
        parsed = datetime.fromtimestamp(int(value), tz=timezone.utc)
    else:
        # The strptime formats used here never need more than 19 characters.
        parsed = datetime.strptime(value[:19], fmt)

    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def extract_post_timestamp(html: str) -> Optional[datetime]:
    """
    Attempt to extract the original post timestamp from raw HTML.

    Patterns are tried in ``_TIMESTAMP_PATTERNS`` order; for each pattern only
    the first regex match is considered. A candidate is accepted only when it
    lies between 2010-01-01 UTC and the current time — this window is applied
    uniformly to every format, including epoch timestamps.

    Returns a timezone-aware UTC datetime if found, None if not extractable.
    Never raises — all failures return None.
    """
    try:
        if not html:
            return None

        now = datetime.now(timezone.utc)
        for pattern, fmt in _TIMESTAMP_PATTERNS:
            match = re.search(pattern, html)
            if not match:
                continue
            try:
                candidate = _parse_timestamp_value(match.group(1).strip(), fmt)
            except (ValueError, OverflowError, OSError, TypeError):
                continue
            if _MIN_POST_TIMESTAMP <= candidate <= now:
                return candidate

        return None
    except Exception:
        # Contract: callers rely on this helper never raising.
        return None
171
+
172
+
173
def _is_internal_ip(ip) -> bool:
    """
    Return True when *ip* (an ``ipaddress`` address object) must not be fetched.

    An address is internal if it falls in ``_BLOCKED_IP_RANGES`` or if the
    standard library classifies it as private, loopback, link-local,
    multicast, reserved or unspecified. IPv4-mapped IPv6 addresses
    (``::ffff:a.b.c.d``) are judged by the IPv4 address they wrap.
    """
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped
    if any(ip in blocked_range for blocked_range in _BLOCKED_IP_RANGES):
        return True
    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_reserved
        or ip.is_unspecified
    )


def is_safe_url(url: str) -> bool:
    """
    Return False if URL targets internal/reserved addresses (SSRF prevention).

    Rules, in order:
      * ``.onion`` hostnames are always allowed (Tor handles routing).
      * URLs without a hostname are rejected.
      * Hostnames listed in ``_BLOCKED_HOSTNAMES`` are rejected (a trailing
        root dot such as ``localhost.`` is ignored for this comparison).
      * The hostname itself (if it is an IP literal) and every address it
        resolves to — IPv4 and IPv6 — must be outside the internal ranges.
      * A hostname that cannot be resolved is allowed; the fetch will fail on
        its own.

    Any unexpected error returns False (fail closed).

    NOTE: this performs a blocking DNS lookup, and it checks the addresses
    seen at validation time only — it cannot protect against a name that
    resolves differently when the actual request is made.
    """
    try:
        hostname = (urlparse(url).hostname or "").strip().lower()
        if hostname.endswith(".onion"):
            return True

        hostname = hostname.rstrip(".")
        if not hostname:
            _logger.warning("SSRF blocked URL without a hostname")
            return False
        if hostname in _BLOCKED_HOSTNAMES:
            _logger.warning("SSRF blocked hostname: %s", hostname)
            return False

        candidates = [hostname]
        try:
            import socket

            # getaddrinfo returns every A/AAAA record, not just the first
            # IPv4 address, so one internal record is enough to block.
            for info in socket.getaddrinfo(hostname, None):
                candidates.append(info[4][0])
        except Exception:
            pass  # unresolvable: only the literal hostname is checked

        for candidate in dict.fromkeys(candidates):  # de-duplicate, keep order
            try:
                # Drop an IPv6 zone id ("fe80::1%eth0") before parsing.
                ip = ipaddress.ip_address(candidate.split("%", 1)[0])
            except ValueError:
                continue  # a DNS name, not an IP literal
            if _is_internal_ip(ip):
                _logger.warning("SSRF blocked IP %s (from %s)", candidate, hostname)
                return False
        return True
    except Exception:
        return False
208
+
209
+
210
def validate_urls_for_scraping(
    url_dicts: List[dict],
) -> Tuple[List[dict], List[str]]:
    """
    Filter URL dicts before scraping. Returns (safe_dicts, blocked_url_strings).

    Each entry is normally a dict with a ``"link"`` key; non-dict entries are
    treated as the URL itself via ``str()``. Entries with a missing or empty
    link are reported as blocked (as an empty string), so the second element
    of the result only ever contains strings. Entries that pass ``is_safe_url``
    are returned unchanged and in their original order.
    """
    safe: List[dict] = []
    blocked: List[str] = []
    for url_dict in url_dicts:
        if isinstance(url_dict, dict):
            link = str(url_dict.get("link") or "")
        else:
            link = str(url_dict)
        if link and is_safe_url(link):
            safe.append(url_dict)
        else:
            blocked.append(link)
    if blocked:
        _logger.warning(
            "SSRF prevention blocked %d URLs: %s",
            len(blocked),
            blocked[:5],
        )
    return safe, blocked
231
+
232
def _normalize_url_data(url_data) -> Tuple[str, str]:
    """
    Pull ``(url, title)`` out of a search-result dict.

    Anything that is not a dict yields ``("", "Untitled")``. Both values are
    whitespace-stripped; a missing or blank title becomes ``"Untitled"``.
    """
    if isinstance(url_data, dict):
        link = str(url_data.get("link") or "").strip()
        label = str(url_data.get("title") or "Untitled").strip()
        return link, (label if label else "Untitled")
    return "", "Untitled"
239
+
240
+
241
def is_onion_url(url: str) -> bool:
    """
    Tell whether *url* points at a ``.onion`` host (and so must go via Tor).

    The comparison is case-insensitive. Input that cannot be parsed is
    reported as not-onion rather than raising.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return False
        return host.lower().endswith(".onion")
    except Exception:
        return False
249
+
250
+
251
def normalize_url(url: str) -> str:
    """
    Canonicalise a URL so that storage and dedup compare like with like.

    Delegates to ``crawler.utils.normalize_url`` when that module can be
    imported. Otherwise a local fallback lower-cases the scheme and netloc,
    strips trailing slashes from the path and drops the fragment.
    """
    try:
        from crawler.utils import normalize_url as shared_normalizer
        return shared_normalizer(url)
    except ImportError:
        pass

    parts = urlparse(url)
    cleaned_path = parts.path.rstrip("/") if parts.path else ""
    canonical = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
        path=cleaned_path,
        fragment="",
    )
    return urlunparse(canonical)
265
+
266
+
267
def classify_urls(urls: List[dict]) -> Tuple[List[dict], List[dict]]:
    """
    Partition *urls* into ``(onion, clearnet)`` lists, preserving input order.

    Onion entries need the Tor proxy; everything else is fetched directly.
    Entries whose link is missing or malformed end up in the clearnet list.
    Non-dict entries are classified by their ``str()`` form.
    """
    buckets: Dict[bool, List[dict]] = {True: [], False: []}
    for entry in urls:
        if isinstance(entry, dict):
            link = entry.get("link", "")
        else:
            link = str(entry)
        # is_onion_url always returns a real bool, so it can index the buckets.
        buckets[is_onion_url(link)].append(entry)
    return buckets[True], buckets[False]
282
+
283
+
284
def _is_onion(url: str) -> bool:
    """Private alias for :func:`is_onion_url` — True for ``.onion`` hosts."""
    targets_onion = is_onion_url(url)
    return targets_onion
287
+
288
+
289
def _build_proxy_url() -> str:
    """
    Build the Tor SOCKS proxy URL used by ``requests`` / urllib3.

    The ``socks5h`` scheme makes PySocks resolve hostnames at the proxy
    (remote DNS), which ``.onion`` addresses require.

    Do not hand this URL to ``aiohttp_socks``: its parser
    (``python_socks.parse_proxy_url``) rejects ``socks5h``. Use
    :func:`_tor_aiohttp_connector` for aiohttp instead.
    """
    host, port = TOR_PROXY_HOST, TOR_PROXY_PORT
    return f"socks5h://{host}:{port}"
298
+
299
+
300
def _tor_aiohttp_connector() -> ProxyConnector:
    """
    Create an aiohttp-socks connector that tunnels through the Tor proxy.

    ``rdns=True`` asks the proxy to resolve hostnames, matching what the
    ``socks5h`` scheme does for ``requests``.
    """
    proxy_address = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    pool_limits = {"limit": 20, "limit_per_host": 10}
    return ProxyConnector.from_url(proxy_address, rdns=True, **pool_limits)
308
+
309
+
310
def _direct_tcp_connector() -> aiohttp.TCPConnector:
    """Create a pooled, non-proxied TCP connector for clearnet fetches."""
    pool_limits = {"limit": 30, "limit_per_host": 10}
    return aiohttp.TCPConnector(**pool_limits)
316
+
317
+
318
+ _tor_session: Optional[aiohttp.ClientSession] = None
319
+ _direct_session: Optional[aiohttp.ClientSession] = None
320
+
321
+
322
def get_tor_session_cached() -> aiohttp.ClientSession:
    """
    Hand back the module-wide Tor-proxied aiohttp session.

    A new session is built when none exists yet or the cached one has been
    closed; otherwise the existing one is reused.
    """
    global _tor_session
    reusable = _tor_session is not None and not _tor_session.closed
    if not reusable:
        _tor_session = aiohttp.ClientSession(
            connector=_tor_aiohttp_connector(),
            timeout=aiohttp.ClientTimeout(connect=3, sock_read=5),
        )
    return _tor_session
332
+
333
+
334
def get_direct_session_cached() -> aiohttp.ClientSession:
    """
    Hand back the module-wide direct (non-proxied) aiohttp session.

    A new session is built when none exists yet or the cached one has been
    closed; otherwise the existing one is reused.
    """
    global _direct_session
    reusable = _direct_session is not None and not _direct_session.closed
    if not reusable:
        _direct_session = aiohttp.ClientSession(
            connector=_direct_tcp_connector(),
            timeout=aiohttp.ClientTimeout(connect=5, sock_read=25),
        )
    return _direct_session
344
+
345
+
346
async def close_cached_sessions() -> None:
    """
    Shut down whichever cached aiohttp sessions are currently open.

    Intended to be awaited once at application shutdown. Each session that is
    still open is closed and its module-level slot cleared.
    """
    global _tor_session, _direct_session

    tor_open = bool(_tor_session) and not _tor_session.closed
    if tor_open:
        await _tor_session.close()
        _tor_session = None

    direct_open = bool(_direct_session) and not _direct_session.closed
    if direct_open:
        await _direct_session.close()
        _direct_session = None
355
+
356
+
357
async def _reset_tor_session_on_error() -> None:
    """
    Drop the cached Tor session after a circuit failure.

    The open session (if any) is closed on a best-effort basis and the cache
    slot is cleared, so the next ``get_tor_session_cached()`` call builds a
    fresh session.
    """
    global _tor_session
    stale = _tor_session
    if stale is not None and not stale.closed:
        try:
            await stale.close()
        except Exception:
            # Closing is best-effort; the reference is discarded regardless.
            pass
    _tor_session = None
366
+
367
+
368
+ # ---------------------------------------------------------------------------
369
+ # Content extraction
370
+ # ---------------------------------------------------------------------------
371
+
372
def _extract_text(html: str) -> str:
    """
    Reduce an HTML document to its readable text.

    Strategy:
      1. Ask trafilatura for the main content (comments excluded, tables
         included). A non-blank result is used as-is.
      2. If trafilatura yields nothing or raises, fall back to BeautifulSoup:
         drop ``<script>``/``<style>`` elements and collapse all whitespace
         runs in the remaining text to single spaces.

    The result is always cut to at most ``MAX_EXTRACTED_TEXT_CHARS``.
    """
    try:
        main_content = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=True,
        )
        if main_content and main_content.strip():
            return main_content[:MAX_EXTRACTED_TEXT_CHARS]
    except Exception:
        # Parser failure inside trafilatura: use the BeautifulSoup path below.
        pass

    soup = BeautifulSoup(html, "html.parser")
    for node in soup(["script", "style"]):
        node.extract()
    words = soup.get_text(separator=" ").split()
    collapsed = " ".join(words)
    return collapsed[:MAX_EXTRACTED_TEXT_CHARS]
394
+
395
+
396
def _score_content_quality(text: str) -> str:
    """
    Bucket scraped text by length so callers can prioritise richer pages.

    Returns:
        "empty"  - fewer than 100 chars (likely a failed fetch), or falsy input
        "thin"   - 100 to 499 chars
        "medium" - 500 to 1999 chars
        "rich"   - 2000 chars or more
    """
    size = len(text) if text else 0
    # Ascending exclusive upper bounds; the first bound above `size` wins.
    for upper_bound, label in ((100, "empty"), (500, "thin"), (2000, "medium")):
        if size < upper_bound:
            return label
    return "rich"
414
+
415
+
416
+ # ---------------------------------------------------------------------------
417
+ # Async core — fetch with retry
418
+ # ---------------------------------------------------------------------------
419
+
420
async def _download_once(
    url: str,
    headers: Dict[str, str],
    via_tor: bool,
) -> Tuple[str, Optional[bytes], Optional[str]]:
    """
    Perform a single GET and read at most MAX_DOWNLOAD_BYTES of the body.

    A dedicated ClientSession (Tor-proxied when *via_tor* is true, direct
    otherwise) is opened for the request and closed afterwards.

    Returns ``(outcome, body, detail)``:
      - ``("ok", raw_bytes, encoding)`` on HTTP 200 with an allowed content type
      - ``("retry", None, "HTTP <status>")`` for a status in RETRYABLE_STATUS
      - ``("fail", None, None)`` for any other status or a disallowed
        Content-Type

    Network errors propagate to the caller.
    """
    connector = _tor_aiohttp_connector() if via_tor else _direct_tcp_connector()
    timeout = aiohttp.ClientTimeout(connect=5, sock_read=5 if via_tor else 25)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as local_session:
        async with local_session.get(url, headers=headers) as resp:
            if resp.status in RETRYABLE_STATUS:
                return "retry", None, f"HTTP {resp.status}"

            if resp.status != 200:
                return "fail", None, None

            # A missing Content-Type header is tolerated; a present one must
            # mention one of the allowed types.
            content_type = (resp.headers.get("Content-Type") or "").lower()
            if content_type and not any(
                t in content_type for t in ALLOWED_CONTENT_TYPES
            ):
                return "fail", None, None

            chunks: List[bytes] = []
            bytes_read = 0
            async for chunk in resp.content.iter_chunked(8192):
                if not chunk:
                    continue
                bytes_read += len(chunk)
                if bytes_read > MAX_DOWNLOAD_BYTES:
                    # The chunk that crosses the cap is discarded whole.
                    break
                chunks.append(chunk)

            return "ok", b"".join(chunks), resp.charset or "utf-8"


async def _fetch_one(
    session: aiohttp.ClientSession,
    url_data: dict,
    semaphore: asyncio.Semaphore,
) -> Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]:
    """
    Fetch a single URL, retrying up to MAX_RETRIES times on transient failure.

    Args:
        session: accepted for interface compatibility; it is not used — every
            attempt opens its own short-lived session (see ``_download_once``).
        url_data: search-result dict with ``"link"`` and optional ``"title"``.
        semaphore: concurrency limiter; held for the whole retry loop.

    Returns:
        (url, display_text, raw_bytes, db_text, posted_at)
        - display_text: "{title} - {extracted_text}" — returned in the public dict
        - raw_bytes: raw downloaded content (for SHA-256 hash + DB byte_size)
        - db_text: extracted text only, no title prefix — stored in Page.cleaned_text
        - posted_at: extracted from HTML when possible, else None

    On any unrecoverable failure returns (url, title, None, None, None).
    Failures never propagate as exceptions — graceful degradation is preserved.

    Retry policy: statuses in RETRYABLE_STATUS and exceptions are retried
    after the matching RETRY_DELAYS entry (the last delay is reused if there
    are more retries than delays). For .onion fetches, an error whose text
    matches SOCKS_ERRORS is treated as a Tor circuit failure: the cached Tor
    session is reset and the URL is given up immediately.
    """
    url, title = _normalize_url_data(url_data)
    if not url:
        return "", title, None, None, None

    failure = (url, title, None, None, None)

    # is_safe_url may do a blocking DNS lookup; run it off the event loop so
    # concurrent fetches are not stalled while it resolves.
    if not await asyncio.to_thread(is_safe_url, url):
        _logger.warning("SSRF blocked fetch: %s", url)
        return failure

    try:
        from utils.content_safety import is_blocked_url
        url_blocked, _reason = is_blocked_url(url)
        if url_blocked:
            # Only a hash of the URL is logged, never the URL itself.
            _logger.warning(
                "URL blocked — prohibited content. URL hash: %s",
                hashlib.sha256(url.encode()).hexdigest()[:16],
            )
            return failure
    except Exception as exc:
        # Best-effort filter: if it is unavailable or errors, scraping proceeds.
        _logger.debug("Content-safety check unavailable: %s", exc)

    if urlparse(url).scheme not in ("http", "https"):
        return failure

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
    }
    via_tor = is_onion_url(url)
    short_url = url[:40]

    last_exc: object = None

    async with semaphore:
        for attempt in range(MAX_RETRIES + 1):  # first try + MAX_RETRIES retries
            if attempt > 0 and RETRY_DELAYS:
                # Clamp so raising MAX_RETRIES past len(RETRY_DELAYS) cannot
                # raise IndexError.
                delay_index = min(attempt - 1, len(RETRY_DELAYS) - 1)
                await asyncio.sleep(RETRY_DELAYS[delay_index])

            try:
                # Hard overall cap per attempt, on top of the per-socket timeouts.
                outcome, raw_bytes, detail = await asyncio.wait_for(
                    _download_once(url, headers, via_tor), timeout=10.0
                )

                if outcome == "retry":
                    last_exc = detail
                    continue
                if outcome == "fail":
                    return failure

                try:
                    html = raw_bytes.decode(detail or "utf-8", errors="replace")
                except LookupError:
                    # Server advertised a charset Python does not know.
                    html = raw_bytes.decode("utf-8", errors="replace")

                db_text = _extract_text(html)
                posted_at = extract_post_timestamp(html)
                display_text = f"{title} - {db_text}" if db_text else title

                # --- Playwright fallback for JS-rendered pages ---
                if PLAYWRIGHT_ENABLED and db_text and len(db_text) < 300:
                    # Import lazily to avoid import errors when playwright not installed
                    try:
                        from scraper.scrape_js import fetch_with_playwright, is_js_rendered

                        if is_js_rendered(html, db_text):
                            _logger.debug(
                                "Playwright fallback triggered for %s...",
                                short_url,
                            )
                            js_result = await fetch_with_playwright(
                                url=url,
                                tor_proxy_host=TOR_PROXY_HOST,
                                tor_proxy_port=TOR_PROXY_PORT,
                            )
                            js_text = js_result.get("content", "")
                            # Use JS result only if it got more content.
                            # raw_bytes intentionally stays the originally
                            # downloaded payload.
                            if js_text and len(js_text) > len(db_text):
                                html = js_result.get("raw_html", html)
                                db_text = js_text
                                posted_at = js_result.get("posted_at", posted_at)
                                display_text = f"{title} - {db_text}" if db_text else title
                                _logger.info(
                                    "Playwright improved content: %d chars from %s...",
                                    len(db_text),
                                    short_url,
                                )
                    except ImportError:
                        # Playwright not installed - skip silently
                        pass
                    except Exception as e:
                        # Keep original aiohttp result if Playwright fails
                        _logger.debug("Playwright fallback failed: %s", e)

                return url, display_text, raw_bytes, db_text, posted_at

            except Exception as exc:
                error_str = str(exc)
                lowered = error_str.lower()
                # Circuit-failure heuristics only make sense for Tor fetches;
                # a clearnet "Connection refused" is an ordinary, retryable error.
                if via_tor and any(marker.lower() in lowered for marker in SOCKS_ERRORS):
                    _logger.warning(
                        "Tor circuit error for %s: %s",
                        url[:50],
                        error_str[:100],
                    )
                    await _reset_tor_session_on_error()
                    return failure
                last_exc = exc

    # All retries exhausted (%r so that message-less errors such as
    # TimeoutError still show up in the log).
    _logger.debug("All retries exhausted for url=%s: %r", url, last_exc)
    return failure
590
+
591
+
592
+ # ---------------------------------------------------------------------------
593
+ # Async orchestrator
594
+ # ---------------------------------------------------------------------------
595
+
596
async def _gather_all(
    unique_urls_data: List[dict],
    max_workers: int,
) -> List[Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]]:
    """
    Fetch every entry concurrently and return one result row per input entry.

    .onion entries are fetched with a concurrency limit of ``max_workers``;
    clearnet entries with a fixed limit of 15. Both groups run at the same
    time. The returned list follows the order of ``unique_urls_data``; an
    entry with no fetched row gets a ``(url, title, None, None, None)``
    placeholder.
    """
    onion_items, clearnet_items = classify_urls(unique_urls_data)
    _logger.warning(
        "Scraping %d onion URLs (via Tor) + %d clearnet URLs (direct)",
        len(onion_items),
        len(clearnet_items),
    )

    tor_limit = asyncio.Semaphore(max_workers)
    clearnet_limit = asyncio.Semaphore(15)

    async def fetch_batch(batch, session_factory, limiter):
        """Fetch one group of entries; map each non-empty URL to its row."""
        if not batch:
            return {}
        # The session is only obtained when there is something to fetch.
        shared_session = session_factory()
        fetched = await asyncio.gather(
            *[_fetch_one(shared_session, entry, limiter) for entry in batch]
        )
        by_url = {}
        for row in fetched:
            if row[0]:
                by_url[row[0]] = row
        return by_url

    tor_map, clearnet_map = await asyncio.gather(
        fetch_batch(onion_items, get_tor_session_cached, tor_limit),
        fetch_batch(clearnet_items, get_direct_session_cached, clearnet_limit),
    )

    merged: List[
        Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]
    ] = []
    for entry in unique_urls_data:
        url, _title = _normalize_url_data(entry)
        if not url:
            merged.append(("", _title, None, None, None))
            continue
        placeholder = (url, _title, None, None, None)
        source_map = tor_map if is_onion_url(url) else clearnet_map
        merged.append(source_map.get(url, placeholder))

    # Count rows that actually carry downloaded bytes, split by network.
    tor_ok = 0
    clear_ok = 0
    for row in merged:
        if not (row[0] and row[2]):
            continue
        if is_onion_url(row[0]):
            tor_ok += 1
        else:
            clear_ok += 1
    _logger.warning(
        "Total scraped: %d pages (%d onion, %d clearnet) with stored content",
        tor_ok + clear_ok,
        tor_ok,
        clear_ok,
    )

    return merged
681
+
682
+
683
+ # ---------------------------------------------------------------------------
684
+ # DB persistence (synchronous; scrape_multiple runs it in a worker thread via asyncio.to_thread)
685
+ # ---------------------------------------------------------------------------
686
+
687
def _persist_pages(
    items: List[
        Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]
    ],
) -> None:
    """
    Store successfully scraped pages in the database, best effort.

    Does nothing when:
      - ``config.DATABASE_URL`` is empty, or
      - the config / db modules cannot be imported.

    Per item:
      - rows without a URL or without raw bytes are skipped;
      - the SHA-256 of the raw bytes is the dedup key — content that is
        already stored (under any URL) is not inserted again;
      - .onion pages are linked to a source record for their hostname,
        clearnet pages are stored with ``source_id=None``.

    Each item uses its own session, and any exception for an item is logged
    at debug level and swallowed so one bad row cannot affect the others or
    the scraping pipeline.
    """
    try:
        from config import DATABASE_URL as configured_db_url  # re-import for testability
        if not configured_db_url:
            return
        from db.queries import create_page, get_or_create_source, get_page_by_hash
        from db.session import get_session
    except ImportError:
        return

    for url, _display, raw_bytes, db_text, posted_at in items:
        if not (raw_bytes and url):
            continue

        digest = hashlib.sha256(raw_bytes).hexdigest()

        try:
            with get_session() as session:
                already_stored = get_page_by_hash(session, digest)
                if already_stored:
                    continue

                host = (urlparse(url).hostname or "").lower()
                if host.endswith(".onion"):
                    source, _created = get_or_create_source(session, host)
                    source_id = source.id
                else:
                    source_id = None

                create_page(
                    session,
                    url=url,
                    source_id=source_id,
                    cleaned_text=db_text,
                    raw_content_hash=digest,
                    byte_size=len(raw_bytes),
                    posted_at=posted_at,
                )
        except Exception as exc:
            # Deliberately swallowed (uniqueness violations, connection
            # errors, ...): persistence must never break scraping.
            _logger.debug("DB persist failed url=%s: %s", url, exc)
743
+
744
+
745
+ # ---------------------------------------------------------------------------
746
+ # Public API
747
+ # ---------------------------------------------------------------------------
748
+
749
async def scrape_multiple(urls_data, max_workers: int = 5) -> Dict[str, str]:
    """
    Scrape a list of URLs concurrently and return a dict mapping URL → content.

    Args:
        urls_data: list/tuple of search-result dicts (``"link"``, ``"title"``).
            Any other type yields an empty dict.
        max_workers: concurrency limit for .onion fetches, clamped to 1..16.

    Pipeline:
        1. Deduplicate input URLs (first occurrence wins)
        2. Drop URLs rejected by the SSRF filter
        3. await _gather_all(...) — async fetch
        4. Truncate each result to MAX_RETURN_CHARS
        5. Write pages to DB if DATABASE_URL is configured
        6. Return {url: content} dict

    URLs whose fetch failed still appear in the result, mapped to their title.
    URLs removed by the SSRF filter do not appear at all.
    """
    if not isinstance(urls_data, (list, tuple)):
        return {}

    max_workers = max(1, min(int(max_workers), 16))

    # Deduplicate by URL (preserve first occurrence)
    unique_urls_data: List[dict] = []
    seen_links: set = set()
    for item in urls_data:
        url, title = _normalize_url_data(item)
        if not url or url in seen_links:
            continue
        seen_links.add(url)
        unique_urls_data.append({"link": url, "title": title})

    # The SSRF filter resolves hostnames with blocking DNS calls; run it in a
    # worker thread so the event loop is not stalled for the whole batch.
    safe_urls, blocked = await asyncio.to_thread(
        validate_urls_for_scraping, unique_urls_data
    )
    if blocked:
        _logger.warning("SSRF: blocked %d unsafe URLs from scrape batch", len(blocked))
    unique_urls_data = safe_urls

    if not unique_urls_data:
        return {}

    # Async fetch phase
    raw_results = await _gather_all(unique_urls_data, max_workers)

    # Assemble public dict with MAX_RETURN_CHARS truncation
    suffix = "...(truncated)"
    results: Dict[str, str] = {}
    db_items: List[
        Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]
    ] = []

    for url, display_text, raw_bytes, db_text, posted_at in raw_results:
        if not url:
            continue
        if len(display_text) > MAX_RETURN_CHARS:
            # Reserve room for the suffix so the total never exceeds the cap.
            available = MAX_RETURN_CHARS - len(suffix)
            if available > 0:
                display_text = display_text[:available] + suffix
            else:
                display_text = suffix[:MAX_RETURN_CHARS]
        results[url] = display_text
        db_items.append((url, display_text, raw_bytes, db_text, posted_at))

    # DB persistence phase (synchronous DB layer, so kept off the event loop)
    await asyncio.to_thread(_persist_pages, db_items)

    return results
811
+
812
+
813
async def scrape_single(
    url_data,
    rotate: bool = False,
    rotate_interval: int = 5,
    control_port: int = 9051,
    control_password: Optional[str] = None,
) -> Tuple[str, str]:
    """
    Scrape one URL by delegating to :func:`scrape_multiple`.

    Returns ``(url, content)``. When the entry has no usable URL the result is
    ``("", title)``; when scraping produced nothing for the URL the content
    falls back to the title.

    ``rotate``, ``rotate_interval``, ``control_port`` and ``control_password``
    are accepted for signature compatibility and currently ignored.
    # TODO: Tor circuit rotation — Phase 1C
    """
    url, title = _normalize_url_data(url_data)
    if url:
        scraped = await scrape_multiple([url_data], max_workers=1)
        return url, scraped.get(url, title)
    return "", title
832
+
833
+
834
def get_tor_session() -> requests.Session:
    """
    Build a synchronous ``requests.Session`` that routes through the Tor proxy.

    The session retries up to three times (total / read / connect) with a 0.5
    backoff factor on HTTP 500, 502, 503 and 504, for both http and https.
    The proxy address comes from ``_build_proxy_url()``, i.e. from
    TOR_PROXY_HOST / TOR_PROXY_PORT in config.

    Kept for backward compatibility with health.py and search.py.
    """
    retry_policy = Retry(
        total=3,
        read=3,
        connect=3,
        backoff_factor=0.5,
        status_forcelist=[500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for scheme_prefix in ("http://", "https://"):
        session.mount(scheme_prefix, retrying_adapter)

    tor_proxy = _build_proxy_url()
    session.proxies = {"http": tor_proxy, "https": tor_proxy}
    return session