voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
scraper/scrape.py
ADDED
|
@@ -0,0 +1,857 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrape.py — async .onion / clearnet page fetcher for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Public API (unchanged from Phase 0 — ui.py compatibility guaranteed):
|
|
5
|
+
scrape_multiple(urls_data, max_workers=5) -> Dict[str, str]
|
|
6
|
+
scrape_single(url_data, ...) -> Tuple[str, str]
|
|
7
|
+
get_tor_session() -> requests.Session
|
|
8
|
+
|
|
9
|
+
Internals rewritten in Phase 1B:
|
|
10
|
+
ThreadPoolExecutor + requests → asyncio + aiohttp-socks
|
|
11
|
+
BeautifulSoup-only extraction → trafilatura first, BeautifulSoup fallback
|
|
12
|
+
hardcoded 127.0.0.1:9050 → TOR_PROXY_HOST / TOR_PROXY_PORT from config
|
|
13
|
+
no retry → bounded retry with a fixed delay (MAX_RETRIES / RETRY_DELAYS; currently 1 retry after 1 s)
|
|
14
|
+
no DB persistence → pages written to Phase 1A db/ layer when DATABASE_URL is set
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import hashlib
|
|
21
|
+
import ipaddress
|
|
22
|
+
import logging
|
|
23
|
+
import random
|
|
24
|
+
import re
|
|
25
|
+
import warnings
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from typing import Dict, List, Optional, Tuple
|
|
28
|
+
from urllib.parse import urlparse, urlunparse
|
|
29
|
+
|
|
30
|
+
import aiohttp
|
|
31
|
+
import requests
|
|
32
|
+
import trafilatura
|
|
33
|
+
from aiohttp_socks import ProxyConnector
|
|
34
|
+
from bs4 import BeautifulSoup
|
|
35
|
+
from requests.adapters import HTTPAdapter
|
|
36
|
+
from urllib3.util.retry import Retry
|
|
37
|
+
|
|
38
|
+
from config import TOR_PROXY_HOST, TOR_PROXY_PORT, PLAYWRIGHT_ENABLED
|
|
39
|
+
|
|
40
|
+
# NOTE(review): this suppresses every warning category for the whole process,
# not only warnings raised by this module — confirm that is intended.
warnings.filterwarnings("ignore")

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Module-level constants (identical to Phase 0 — ui.py depends on these)
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
# Pool of browser User-Agent strings; _fetch_one picks one at random per call.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
]

# Download cap in bytes: _fetch_one stops reading a response body once the
# running total exceeds this value.
MAX_DOWNLOAD_BYTES = 1_000_000
# Extracted text is truncated to this many characters by _extract_text.
MAX_EXTRACTED_TEXT_CHARS = 50_000
# NOTE(review): not referenced in the visible part of this module; presumably
# the cap applied to text handed back to callers — confirm at the use site.
MAX_RETURN_CHARS = 15_000
# A response is accepted when its Content-Type header is empty or contains one
# of these values as a substring.
ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml", "text/plain")
|
|
64
|
+
|
|
65
|
+
# Retry configuration
# MAX_RETRIES counts the extra attempts made after the first one: _fetch_one
# iterates over range(MAX_RETRIES + 1).
MAX_RETRIES = 1
# Seconds slept before retry k is RETRY_DELAYS[k - 1], so this tuple needs at
# least MAX_RETRIES entries.
RETRY_DELAYS = (1.0,)
# HTTP status codes that cause _fetch_one to try again instead of giving up.
RETRYABLE_STATUS = {500, 502, 503, 504}

# Tor circuit error patterns - indicates circuit failure, not URL failure
# _fetch_one compares these case-insensitively as substrings of str(exception).
SOCKS_ERRORS = (
    "SOCKS5",
    "socks5",
    "Host unreachable",
    "Connection refused",
    "General SOCKS",
    "circuit",
    "Tor circuit",
)
|
|
80
|
+
|
|
81
|
+
# Internal / link-local ranges — block clearnet fetches (SSRF prevention)
# is_safe_url rejects any URL whose host (literal or resolved) falls in one of
# these networks.
_BLOCKED_IP_RANGES = [
    ipaddress.ip_network("10.0.0.0/8"),       # RFC 1918 private
    ipaddress.ip_network("172.16.0.0/12"),    # RFC 1918 private
    ipaddress.ip_network("192.168.0.0/16"),   # RFC 1918 private
    ipaddress.ip_network("127.0.0.0/8"),      # IPv4 loopback
    ipaddress.ip_network("169.254.0.0/16"),   # IPv4 link-local
    ipaddress.ip_network("::1/128"),          # IPv6 loopback
    ipaddress.ip_network("fc00::/7"),         # IPv6 unique local addresses
]

# Hostnames rejected by exact (lower-cased) match before any DNS lookup.
_BLOCKED_HOSTNAMES = frozenset(
    {
        "localhost",
        "metadata.google.internal",
        "169.254.169.254",
    }
)
|
|
99
|
+
|
|
100
|
+
# Common HTML timestamp patterns (forums / JSON-LD)
# Each entry is (regex, kind). Group 1 of the regex is the timestamp text and
# `kind` tells extract_post_timestamp how to parse it: the sentinel "iso"
# (datetime.fromisoformat), the sentinel "unix10" (10-digit epoch seconds), or
# otherwise a strptime format string. Patterns are tried in list order.
_TIMESTAMP_PATTERNS = [
    (r'<time[^>]+datetime="([^"]+)"', "iso"),
    (r"[Pp]osted[:\s]+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})", "%Y-%m-%d %H:%M:%S"),
    (r"[Dd]ate[:\s]+(\d{2}/\d{2}/\d{4})", "%d/%m/%Y"),
    (r'data-timestamp="(\d{10})"', "unix10"),
    (
        r'"datePublished":\s*"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})"',
        "%Y-%m-%dT%H:%M:%S",
    ),
]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# Helpers
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def extract_post_timestamp(html: str) -> Optional[datetime]:
    """
    Best-effort recovery of a page's original posting time from raw HTML.

    Every (regex, kind) pair in _TIMESTAMP_PATTERNS is tried in order. The
    first candidate that parses and passes its sanity check is returned as a
    timezone-aware UTC datetime. Naive values are assumed to be UTC.

    Sanity checks:
      - "unix10" values must lie strictly between 1_000_000_000 and
        9_999_999_999 seconds since the epoch.
      - every other kind must fall between 2010-01-01 UTC and "now".

    Returns None when nothing usable is found. Never raises.
    """

    def _within_window(candidate: datetime) -> bool:
        # Reject timestamps before 2010 or in the future.
        return datetime(2010, 1, 1, tzinfo=timezone.utc) <= candidate <= datetime.now(
            timezone.utc
        )

    try:
        if not html:
            return None

        for regex, kind in _TIMESTAMP_PATTERNS:
            try:
                found = re.search(regex, html)
                if found is None:
                    continue
                raw = found.group(1).strip()

                if kind == "unix10":
                    epoch = int(raw)
                    if 1_000_000_000 < epoch < 9_999_999_999:
                        return datetime.fromtimestamp(epoch, tz=timezone.utc)
                    continue

                if kind == "iso":
                    text = raw.replace("Z", "+00:00")
                    # Without a "T" separator in the leading date-time part the
                    # untouched captured value is parsed instead.
                    if len(text) >= 19 and "T" not in text[:19]:
                        text = raw
                    parsed = datetime.fromisoformat(text[:32])
                    if parsed.tzinfo is None:
                        parsed = parsed.replace(tzinfo=timezone.utc)
                    else:
                        parsed = parsed.astimezone(timezone.utc)
                else:
                    # `kind` is a strptime format; only the first 19 characters
                    # of the captured value are parsed.
                    parsed = datetime.strptime(raw[:19], kind)
                    if parsed.tzinfo is None:
                        parsed = parsed.replace(tzinfo=timezone.utc)

                if _within_window(parsed):
                    return parsed
            except (ValueError, OverflowError, OSError, TypeError):
                # Unparseable candidate: move on to the next pattern.
                continue

        return None
    except Exception:
        return None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def is_safe_url(url: str) -> bool:
    """
    Return False if URL targets internal/reserved addresses (SSRF prevention).

    .onion hostnames are always allowed (Tor handles routing).

    Checks, in order:
      1. hostname (lower-cased, trailing dots removed) against
         _BLOCKED_HOSTNAMES;
      2. the hostname itself, if it is an IP literal, plus every address that
         ``socket.getaddrinfo`` returns for it (IPv4 *and* IPv6), against
         _BLOCKED_IP_RANGES and against the ``ipaddress`` loopback / private /
         link-local / unspecified classifications. IPv4-mapped IPv6 addresses
         (``::ffff:a.b.c.d``) are unwrapped first so they cannot bypass the
         IPv4 rules.

    A hostname that does not resolve is treated as safe, as before: the later
    fetch will simply fail. Any unexpected error returns False (fail closed).

    NOTE(review): the DNS lookup is blocking and this function is called from
    async code. The answer can also differ from what the HTTP client resolves
    at connect time (DNS rebinding) — confirm whether that matters here.
    """
    import socket  # kept function-local, as in the original implementation

    def _blocking_reason(ip):
        """Return what makes `ip` internal, or None when it is acceptable."""
        mapped = getattr(ip, "ipv4_mapped", None)
        if mapped is not None:
            ip = mapped
        for network in _BLOCKED_IP_RANGES:
            if ip in network:
                return network
        if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_unspecified:
            return "reserved address space"
        return None

    try:
        hostname = (urlparse(url).hostname or "").strip()
        lowered = hostname.lower()
        if lowered.endswith(".onion"):
            return True
        # "localhost." is the same host as "localhost".
        if lowered.rstrip(".") in _BLOCKED_HOSTNAMES:
            _logger.warning("SSRF blocked hostname: %s", hostname)
            return False

        candidates = [hostname]
        try:
            for info in socket.getaddrinfo(hostname, None):
                address = info[4][0]
                if address not in candidates:
                    candidates.append(address)
        except Exception:
            # Unresolvable host: only the literal hostname is checked.
            pass

        for ip_str in candidates:
            try:
                # Drop an IPv6 zone suffix ("%eth0") before parsing.
                ip = ipaddress.ip_address(ip_str.split("%", 1)[0])
            except ValueError:
                continue  # not an IP literal (plain DNS name)
            reason = _blocking_reason(ip)
            if reason is not None:
                _logger.warning(
                    "SSRF blocked IP %s (from %s) in %s", ip_str, hostname, reason
                )
                return False
        return True
    except Exception:
        return False
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def validate_urls_for_scraping(
    url_dicts: List[dict],
) -> Tuple[List[dict], List[str]]:
    """
    Filter URL dicts before scraping. Returns (safe_dicts, blocked_url_strings).

    Each entry is normally a dict with a "link" key; non-dict entries are
    converted with ``str()``. An entry passes when its link is non-empty and
    ``is_safe_url`` accepts it. Safe entries are returned unchanged and in
    input order.

    The blocked list always contains strings. An entry with no usable link is
    reported by its ``repr`` so the log line stays meaningful. Previously such
    a dict was passed to ``is_safe_url`` and appended to the blocked list as a
    dict, contradicting the declared return type.
    """
    safe: List[dict] = []
    blocked: List[str] = []
    for url_dict in url_dicts:
        if isinstance(url_dict, dict):
            raw_link = url_dict.get("link")
            link = "" if raw_link is None else str(raw_link)
        else:
            link = str(url_dict)

        if link and is_safe_url(link):
            safe.append(url_dict)
        else:
            blocked.append(link or repr(url_dict))

    if blocked:
        # Only the first five are logged to keep the line short.
        _logger.warning(
            "SSRF prevention blocked %d URLs: %s",
            len(blocked),
            blocked[:5],
        )
    return safe, blocked
|
|
231
|
+
|
|
232
|
+
def _normalize_url_data(url_data) -> Tuple[str, str]:
|
|
233
|
+
"""Extract (url, title) from a search result dict."""
|
|
234
|
+
if not isinstance(url_data, dict):
|
|
235
|
+
return "", "Untitled"
|
|
236
|
+
url = str(url_data.get("link") or "").strip()
|
|
237
|
+
title = str(url_data.get("title") or "Untitled").strip() or "Untitled"
|
|
238
|
+
return url, title
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def is_onion_url(url: str) -> bool:
    """
    Tell whether `url` points at a Tor hidden service.

    True when the parsed hostname ends with ".onion" (case-insensitive).
    Unparseable input or a URL without a hostname yields False.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return False
        return host.lower().endswith(".onion")
    except Exception:
        return False
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def normalize_url(url: str) -> str:
    """
    Canonicalise a URL so the same page is stored and deduplicated once.

    Delegates to ``crawler.utils.normalize_url`` when that module can be
    imported. Otherwise a local fallback lower-cases the scheme and netloc,
    strips trailing slashes from the path and drops the fragment; params and
    query are kept as they are.
    """
    try:
        from crawler.utils import normalize_url as _shared_normalize

        return _shared_normalize(url)
    except ImportError:
        pieces = urlparse(url)
        cleaned = pieces._replace(
            scheme=pieces.scheme.lower(),
            netloc=pieces.netloc.lower(),
            path=pieces.path.rstrip("/"),
            fragment="",
        )
        return urlunparse(cleaned)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def classify_urls(urls: List[dict]) -> Tuple[List[dict], List[dict]]:
    """
    Partition search results by transport.

    Returns (onion_entries, clearnet_entries), each in input order. An entry
    goes to the onion list when ``is_onion_url`` accepts its "link" (or its
    ``str()`` form for non-dict entries). Everything else, including malformed
    URLs, is treated as clearnet.
    """
    via_tor: List[dict] = []
    direct: List[dict] = []
    for entry in urls:
        if isinstance(entry, dict):
            target = entry.get("link", "")
        else:
            target = str(entry)
        bucket = via_tor if is_onion_url(target) else direct
        bucket.append(entry)
    return via_tor, direct
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _is_onion(url: str) -> bool:
    """
    Return True if the URL targets a .onion hostname.

    Private alias that simply delegates to :func:`is_onion_url`.
    """
    return is_onion_url(url)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _build_proxy_url() -> str:
    """
    Build the Tor SOCKS proxy URL used by ``requests`` / urllib3.

    The ``socks5h`` scheme asks PySocks to resolve hostnames at the proxy,
    which ``.onion`` addresses require.

    Do not pass this URL to ``aiohttp_socks``: its
    ``python_socks.parse_proxy_url`` rejects the ``socks5h`` scheme. Use
    :func:`_tor_aiohttp_connector` for aiohttp instead.
    """
    proxy_url = f"socks5h://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    return proxy_url
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _tor_aiohttp_connector() -> ProxyConnector:
    """
    Create an aiohttp-socks connector that routes through the Tor proxy.

    Uses plain ``socks5`` with ``rdns=True`` so hostnames are resolved at the
    proxy (same effect as ``socks5h``). The pool is capped at 20 connections,
    10 per host.
    """
    proxy_endpoint = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    pool_options = {"rdns": True, "limit": 20, "limit_per_host": 10}
    return ProxyConnector.from_url(proxy_endpoint, **pool_options)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _direct_tcp_connector() -> aiohttp.TCPConnector:
    """
    Create a pooled connector for clearnet fetches that bypass Tor.

    The pool is capped at 30 connections, 10 per host.
    """
    pool = aiohttp.TCPConnector(limit=30, limit_per_host=10)
    return pool
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# Module-level session caches. Both start as None, are filled lazily by
# get_tor_session_cached() / get_direct_session_cached(), and are cleared
# again by close_cached_sessions().
_tor_session: Optional[aiohttp.ClientSession] = None
_direct_session: Optional[aiohttp.ClientSession] = None
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def get_tor_session_cached() -> aiohttp.ClientSession:
    """
    Hand out the shared Tor-proxied aiohttp session.

    A new session (connect timeout 3 s, socket-read timeout 5 s) is created
    when none is cached or the cached one is closed. Otherwise the cached
    session is reused.
    """
    global _tor_session
    reusable = _tor_session is not None and not _tor_session.closed
    if not reusable:
        _tor_session = aiohttp.ClientSession(
            connector=_tor_aiohttp_connector(),
            timeout=aiohttp.ClientTimeout(connect=3, sock_read=5),
        )
    return _tor_session
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def get_direct_session_cached() -> aiohttp.ClientSession:
    """
    Hand out the shared direct (non-Tor) aiohttp session.

    A new session (connect timeout 5 s, socket-read timeout 25 s) is created
    when none is cached or the cached one is closed. Otherwise the cached
    session is reused.
    """
    global _direct_session
    reusable = _direct_session is not None and not _direct_session.closed
    if not reusable:
        _direct_session = aiohttp.ClientSession(
            connector=_direct_tcp_connector(),
            timeout=aiohttp.ClientTimeout(connect=5, sock_read=25),
        )
    return _direct_session
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
async def close_cached_sessions() -> None:
    """
    Shut down both cached sessions; intended to be awaited at shutdown.

    A session that is still open is closed and its cache slot is cleared.
    The Tor session is handled first, then the direct one.
    """
    global _tor_session, _direct_session

    if _tor_session:
        if not _tor_session.closed:
            await _tor_session.close()
            _tor_session = None

    if _direct_session:
        if not _direct_session.closed:
            await _direct_session.close()
            _direct_session = None
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
async def _reset_tor_session_on_error() -> None:
    """
    Drop the cached Tor session after a circuit error.

    An open session is closed first; errors raised by the close are ignored.
    The cache slot is then cleared so the next get_tor_session_cached() call
    builds a fresh session.
    """
    global _tor_session
    stale = _tor_session
    if stale is not None and not stale.closed:
        try:
            await stale.close()
        except Exception:
            pass
    _tor_session = None
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# ---------------------------------------------------------------------------
|
|
369
|
+
# Content extraction
|
|
370
|
+
# ---------------------------------------------------------------------------
|
|
371
|
+
|
|
372
|
+
def _extract_text(html: str) -> str:
    """
    Reduce an HTML document to its readable text.

    trafilatura is tried first, with comments excluded and tables included.
    If it raises or returns nothing but whitespace, the document is parsed
    with BeautifulSoup instead: <script> and <style> elements are removed and
    the remaining text is collapsed to single spaces.

    The result is always cut to MAX_EXTRACTED_TEXT_CHARS characters.
    """
    try:
        primary = trafilatura.extract(html, include_comments=False, include_tables=True)
        if primary and primary.strip():
            return primary[:MAX_EXTRACTED_TEXT_CHARS]
    except Exception:
        # trafilatura / lxml failure: use the BeautifulSoup path below.
        pass

    soup = BeautifulSoup(html, "html.parser")
    for unwanted in soup(["script", "style"]):
        unwanted.extract()
    words = soup.get_text(separator=" ").split()
    collapsed = " ".join(words)
    return collapsed[:MAX_EXTRACTED_TEXT_CHARS]
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _score_content_quality(text: str) -> str:
|
|
397
|
+
"""
|
|
398
|
+
Score scraped content quality for prioritization.
|
|
399
|
+
|
|
400
|
+
Returns:
|
|
401
|
+
"empty" - < 100 chars (likely failed fetch)
|
|
402
|
+
"thin" - 100-500 chars (minimal content)
|
|
403
|
+
"medium" - 500-2000 chars (decent content)
|
|
404
|
+
"rich" - > 2000 chars (full content)
|
|
405
|
+
"""
|
|
406
|
+
length = len(text) if text else 0
|
|
407
|
+
if length < 100:
|
|
408
|
+
return "empty"
|
|
409
|
+
if length < 500:
|
|
410
|
+
return "thin"
|
|
411
|
+
if length < 2000:
|
|
412
|
+
return "medium"
|
|
413
|
+
return "rich"
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# ---------------------------------------------------------------------------
|
|
417
|
+
# Async core — fetch with retry
|
|
418
|
+
# ---------------------------------------------------------------------------
|
|
419
|
+
|
|
420
|
+
async def _fetch_one(
    session: aiohttp.ClientSession,
    url_data: dict,
    semaphore: asyncio.Semaphore,
) -> Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]:
    """
    Fetch a single URL, retrying transient failures up to MAX_RETRIES times.

    Args:
        session: accepted for interface compatibility. The current
            implementation opens a fresh ClientSession per attempt and does
            not use this argument.
        url_data: search-result dict with "link" and optional "title".
        semaphore: concurrency limiter held for the whole fetch, retries
            included.

    Returns:
        (url, display_text, raw_bytes, db_text, posted_at)
        - display_text: "{title} - {extracted_text}" — returned in the public dict
        - raw_bytes:    raw downloaded content (for SHA-256 hash + DB byte_size)
        - db_text:      extracted text only, no title prefix — stored in Page.cleaned_text
        - posted_at:    extracted from HTML when possible, else None

    On any unrecoverable failure returns (url, title, None, None, None).
    Failures never propagate as exceptions — graceful degradation is preserved.

    Retry policy: a response status in RETRYABLE_STATUS, or any exception
    whose text matches no SOCKS_ERRORS pattern, leads to another attempt after
    the delay from RETRY_DELAYS. A SOCKS_ERRORS match resets the cached Tor
    session and gives up immediately. Each attempt is capped at 10 seconds
    overall by asyncio.wait_for.

    NOTE(review): aiohttp follows redirects by default and redirect targets
    are not re-checked with is_safe_url here — confirm that is acceptable.
    """
    url, title = _normalize_url_data(url_data)
    if not url:
        return "", title, None, None, None

    failure = (url, title, None, None, None)

    if not is_safe_url(url):
        _logger.warning("SSRF blocked fetch: %s", url)
        return failure

    try:
        from utils.content_safety import is_blocked_url

        url_blocked, _reason = is_blocked_url(url)
        if url_blocked:
            # Only a hash prefix is logged so the prohibited URL is not written out.
            _logger.warning(
                "URL blocked — prohibited content. URL hash: %s",
                hashlib.sha256(url.encode()).hexdigest()[:16],
            )
            return failure
    except Exception as exc:
        # Best effort: a missing or failing safety module must not stop the
        # scrape, but leave a trace instead of swallowing the error silently.
        _logger.debug("Content-safety check skipped: %s", exc)

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return failure

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
    }
    via_tor = is_onion_url(url)

    async def _download():
        """Run one HTTP GET; return (outcome, payload, encoding).

        outcome is "ok" (payload = body bytes), "retry" (payload = reason
        text) or "fail" (payload = None).
        """
        connector = _tor_aiohttp_connector() if via_tor else _direct_tcp_connector()
        async with aiohttp.ClientSession(
            connector=connector,
            timeout=aiohttp.ClientTimeout(connect=5, sock_read=5 if via_tor else 25),
        ) as local_session:
            async with local_session.get(url, headers=headers) as resp:
                if resp.status in RETRYABLE_STATUS:
                    return "retry", f"HTTP {resp.status}", None

                if resp.status != 200:
                    return "fail", None, None

                content_type = (resp.headers.get("Content-Type") or "").lower()
                if content_type and not any(
                    t in content_type for t in ALLOWED_CONTENT_TYPES
                ):
                    return "fail", None, None

                chunks: List[bytes] = []
                bytes_read = 0
                async for chunk in resp.content.iter_chunked(8192):
                    if not chunk:
                        continue
                    bytes_read += len(chunk)
                    # The chunk that crosses the cap is dropped entirely.
                    if bytes_read > MAX_DOWNLOAD_BYTES:
                        break
                    chunks.append(chunk)

                return "ok", b"".join(chunks), resp.charset or "utf-8"

    last_exc: object = None

    async with semaphore:
        for attempt in range(MAX_RETRIES + 1):  # first attempt + MAX_RETRIES retries
            if attempt > 0 and RETRY_DELAYS:
                # Reuse the last delay if MAX_RETRIES outgrows RETRY_DELAYS,
                # rather than raising IndexError out of this function.
                await asyncio.sleep(
                    RETRY_DELAYS[min(attempt - 1, len(RETRY_DELAYS) - 1)]
                )

            try:
                outcome, payload, encoding = await asyncio.wait_for(
                    _download(), timeout=10.0
                )

                if outcome == "retry":
                    last_exc = payload
                    continue
                if outcome == "fail":
                    return failure

                raw_bytes = payload
                try:
                    html = raw_bytes.decode(encoding, errors="replace")
                except LookupError:
                    # The server advertised a charset Python does not know;
                    # decode as UTF-8 instead of failing the whole fetch.
                    html = raw_bytes.decode("utf-8", errors="replace")

                db_text = _extract_text(html)
                posted_at = extract_post_timestamp(html)
                display_text = f"{title} - {db_text}" if db_text else title

                # --- Playwright fallback for JS-rendered pages ---
                if PLAYWRIGHT_ENABLED and db_text and len(db_text) < 300:
                    # Import lazily to avoid import errors when playwright not installed
                    try:
                        from scraper.scrape_js import fetch_with_playwright, is_js_rendered

                        if is_js_rendered(html, db_text):
                            _logger.debug(
                                "Playwright fallback triggered for %s...",
                                url[:40],
                            )
                            js_result = await fetch_with_playwright(
                                url=url,
                                tor_proxy_host=TOR_PROXY_HOST,
                                tor_proxy_port=TOR_PROXY_PORT,
                            )
                            # Use JS result if it got more content
                            js_text = js_result.get("content")
                            if js_text and len(js_text) > len(db_text):
                                html = js_result.get("raw_html", html)
                                db_text = js_text
                                posted_at = js_result.get("posted_at", posted_at)
                                display_text = f"{title} - {db_text}" if db_text else title
                                _logger.info(
                                    "Playwright improved content: %d chars from %s...",
                                    len(db_text),
                                    url[:40],
                                )
                    except ImportError:
                        # Playwright not installed - skip silently
                        pass
                    except Exception as e:
                        # Keep original aiohttp result if Playwright fails
                        _logger.debug("Playwright fallback failed: %s", e)

                return url, display_text, raw_bytes, db_text, posted_at

            except Exception as exc:
                # Covers aiohttp.ClientError and asyncio.TimeoutError as well;
                # the original had two handlers with identical bodies.
                error_str = str(exc)
                if any(err.lower() in error_str.lower() for err in SOCKS_ERRORS):
                    _logger.warning(
                        "Tor circuit error for %s: %s",
                        url[:50],
                        error_str[:100],
                    )
                    await _reset_tor_session_on_error()
                    return failure
                last_exc = exc

    # All retries exhausted
    _logger.debug("All retries exhausted for url=%s: %s", url, last_exc)
    return failure
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
# ---------------------------------------------------------------------------
|
|
593
|
+
# Async orchestrator
|
|
594
|
+
# ---------------------------------------------------------------------------
|
|
595
|
+
|
|
596
|
+
async def _gather_all(
    unique_urls_data: List[dict],
    max_workers: int,
) -> List[Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]]:
    """
    Fetch every URL concurrently and return one result row per input entry.

    Entries are split with classify_urls. The .onion batch is limited to
    `max_workers` concurrent fetches and the clearnet batch to 15; both
    batches run at the same time. Rows come back in the order of
    `unique_urls_data`. An entry without a usable URL, or whose fetch produced
    no row, gets a placeholder row with None content.
    """
    onion_urls, clearnet_urls = classify_urls(unique_urls_data)
    _logger.warning(
        "Scraping %d onion URLs (via Tor) + %d clearnet URLs (direct)",
        len(onion_urls),
        len(clearnet_urls),
    )

    tor_limiter = asyncio.Semaphore(max_workers)
    direct_limiter = asyncio.Semaphore(15)

    async def _run_batch(batch, session_factory, limiter):
        # Fetch one batch and index the rows by URL; rows with an empty URL
        # are dropped. The session is only created for a non-empty batch.
        if not batch:
            return {}
        shared_session = session_factory()
        rows = await asyncio.gather(
            *[_fetch_one(shared_session, entry, limiter) for entry in batch]
        )
        return {row[0]: row for row in rows if row[0]}

    tor_map, clearnet_map = await asyncio.gather(
        _run_batch(onion_urls, get_tor_session_cached, tor_limiter),
        _run_batch(clearnet_urls, get_direct_session_cached, direct_limiter),
    )

    merged: List[
        Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]
    ] = []
    for item in unique_urls_data:
        url, _title = _normalize_url_data(item)
        if not url:
            merged.append(("", _title, None, None, None))
            continue
        lookup = tor_map if is_onion_url(url) else clearnet_map
        merged.append(lookup.get(url, (url, _title, None, None, None)))

    # Count rows that actually carry downloaded bytes, per transport.
    tor_ok = 0
    clear_ok = 0
    for row in merged:
        if not (row[0] and row[2]):
            continue
        if is_onion_url(row[0]):
            tor_ok += 1
        else:
            clear_ok += 1

    _logger.warning(
        "Total scraped: %d pages (%d onion, %d clearnet) with stored content",
        tor_ok + clear_ok,
        tor_ok,
        clear_ok,
    )

    return merged
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
# ---------------------------------------------------------------------------
|
|
684
|
+
# DB persistence (runs synchronously after asyncio.run() returns)
|
|
685
|
+
# ---------------------------------------------------------------------------
|
|
686
|
+
|
|
687
|
+
def _persist_pages(
    items: List[
        Tuple[str, str, Optional[bytes], Optional[str], Optional[datetime]]
    ],
) -> None:
    """
    Store scraped pages that carry raw content in the database.

    Each element of ``items`` is a 5-tuple
    ``(url, display_text, raw_bytes, db_text, posted_at)``; the display text
    is not written to the database.

    Returns without doing anything when:
    - ``config.DATABASE_URL`` is falsy
    - ``config`` or the ``db`` helpers cannot be imported

    Per item:
    - entries with an empty URL or no raw bytes are skipped
    - the SHA-256 hex digest of the raw bytes is looked up with
      ``get_page_by_hash``; a hit means nothing is inserted for that item
    - for hostnames ending in ``.onion`` a source row is fetched or created
      and its id is attached to the page; other hosts get ``source_id=None``
    - every item uses its own ``get_session()`` block and its own
      try/except, so an exception on one item is logged at debug level and
      the loop carries on with the next item
    """
    try:
        from config import DATABASE_URL as _db_url  # re-import for testability
        if not _db_url:
            return
        from db.queries import create_page, get_or_create_source, get_page_by_hash
        from db.session import get_session
    except ImportError:
        return

    for page_url, _shown, payload, text_for_db, when_posted in items:
        if not (payload and page_url):
            continue

        digest = hashlib.sha256(payload).hexdigest()

        try:
            with get_session() as db:
                # Only insert when no stored page already has this digest.
                if not get_page_by_hash(db, digest):
                    host = (urlparse(page_url).hostname or "").lower()

                    origin_id = None
                    if host.endswith(".onion"):
                        origin, _unused = get_or_create_source(db, host)
                        origin_id = origin.id

                    create_page(
                        db,
                        url=page_url,
                        source_id=origin_id,
                        cleaned_text=text_for_db,
                        raw_content_hash=digest,
                        byte_size=len(payload),
                        posted_at=when_posted,
                    )
        except Exception as err:
            # Deliberately best-effort: a database problem must not break
            # the scraping pipeline, so the error is only logged.
            _logger.debug("DB persist failed url=%s: %s", page_url, err)
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
# ---------------------------------------------------------------------------
|
|
746
|
+
# Public API
|
|
747
|
+
# ---------------------------------------------------------------------------
|
|
748
|
+
|
|
749
|
+
async def scrape_multiple(urls_data, max_workers: int = 5) -> Dict[str, str]:
    """
    Fetch a batch of URLs concurrently and return ``{url: display_text}``.

    ``urls_data`` must be a list or tuple; any other type yields ``{}``.
    ``max_workers`` is coerced with ``int()`` and clamped to the range 1..16.

    Steps:
      1. Normalise every item with ``_normalize_url_data`` and keep the first
         occurrence of each non-empty URL.
      2. Pass the survivors through ``validate_urls_for_scraping``; blocked
         entries are counted in a warning and dropped.
      3. ``await _gather_all(...)`` to fetch the remaining URLs.
      4. Shorten any display text longer than ``MAX_RETURN_CHARS`` so that it
         ends in ``"...(truncated)"``.
      5. Hand the rows to ``_persist_pages`` on a worker thread.
      6. Return the ``{url: display_text}`` mapping.
    """
    if not isinstance(urls_data, (list, tuple)):
        return {}

    max_workers = min(max(int(max_workers), 1), 16)

    # A dict keeps insertion order, so it gives first-occurrence dedup.
    first_seen = {}
    for entry in urls_data:
        link, heading = _normalize_url_data(entry)
        if link and link not in first_seen:
            first_seen[link] = {"link": link, "title": heading}

    allowed, rejected = validate_urls_for_scraping(list(first_seen.values()))
    if rejected:
        _logger.warning("SSRF: blocked %d unsafe URLs from scrape batch", len(rejected))

    if not allowed:
        return {}

    # Async fetch phase
    fetched = await _gather_all(allowed, max_workers)

    # Build the public mapping, truncating over-long display text
    marker = "...(truncated)"
    by_url = {}
    to_store = []
    for page_url, shown, payload, text_for_db, when_posted in fetched:
        if not page_url:
            continue
        if len(shown) > MAX_RETURN_CHARS:
            room = MAX_RETURN_CHARS - len(marker)
            # If the limit is shorter than the marker, return a cut-down marker.
            shown = (shown[:room] + marker) if room > 0 else marker[:MAX_RETURN_CHARS]
        by_url[page_url] = shown
        to_store.append((page_url, shown, payload, text_for_db, when_posted))

    # DB persistence phase (off the event loop thread)
    await asyncio.to_thread(_persist_pages, to_store)

    return by_url
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
async def scrape_single(
    url_data,
    rotate: bool = False,
    rotate_interval: int = 5,
    control_port: int = 9051,
    control_password: Optional[str] = None,
) -> Tuple[str, str]:
    """
    Scrape one URL by delegating to ``scrape_multiple`` with a single worker.

    ``rotate``, ``rotate_interval``, ``control_port`` and ``control_password``
    are accepted but never read by this function.
    TODO: Tor circuit rotation — Phase 1C

    Returns ``(url, text)``. When the normalised URL is empty the result is
    ``("", title)``. When the batch result has no entry for the URL, the
    title is returned in place of the text.
    """
    target, label = _normalize_url_data(url_data)
    if target:
        scraped = await scrape_multiple([url_data], max_workers=1)
        return target, scraped.get(target, label)
    return "", label
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def get_tor_session() -> requests.Session:
    """
    Build a ``requests.Session`` whose HTTP and HTTPS traffic goes through
    the proxy URL returned by ``_build_proxy_url()``.

    The session retries up to three times (total / read / connect) with a
    0.5 backoff factor, and also retries on 500, 502, 503 and 504 responses.

    NOTE(review): the original docstring describes the proxy as the Tor
    SOCKS5 proxy configured via TOR_PROXY_HOST / TOR_PROXY_PORT, and says
    this function is kept for backward compatibility with health.py and
    search.py. Neither is visible from this block; confirm against
    ``_build_proxy_url`` and the callers.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            read=3,
            connect=3,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )
    )

    tor_session = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("http://", "https://"):
        tor_session.mount(scheme, retrying_adapter)

    tor_proxy = _build_proxy_url()
    tor_session.proxies = {"http": tor_proxy, "https": tor_proxy}
    return tor_session
|