voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
scraper/scrape_js.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Playwright-based JavaScript renderer for dark web content.
|
|
3
|
+
|
|
4
|
+
Used as a fallback when aiohttp returns empty content from JS-heavy sites.
|
|
5
|
+
Routes traffic through Tor SOCKS5 proxy same as the main scraper.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Optional
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Playwright browser instance — shared across scrape calls
|
|
15
|
+
_BROWSER = None
|
|
16
|
+
_BROWSER_LOCK = None
|
|
17
|
+
|
|
18
|
+
# Maximum time to wait for page content (ms)
|
|
19
|
+
PAGE_TIMEOUT_MS = 30_000 # 30 seconds
|
|
20
|
+
|
|
21
|
+
# Selectors to wait for — indicates page has rendered
|
|
22
|
+
CONTENT_SELECTORS = [
|
|
23
|
+
"article",
|
|
24
|
+
"main",
|
|
25
|
+
".post",
|
|
26
|
+
".thread",
|
|
27
|
+
".message",
|
|
28
|
+
"#content",
|
|
29
|
+
".content",
|
|
30
|
+
"[role='main']",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
# JS app markers for detection
|
|
34
|
+
JS_APP_MARKERS = [
|
|
35
|
+
'id="app"',
|
|
36
|
+
'id="root"',
|
|
37
|
+
'id="__next"',
|
|
38
|
+
"ng-app",
|
|
39
|
+
"data-reactroot",
|
|
40
|
+
"window.__INITIAL_STATE__",
|
|
41
|
+
"window.__NUXT__",
|
|
42
|
+
"<script>window.location",
|
|
43
|
+
# Dark web forum specific
|
|
44
|
+
"Dread",
|
|
45
|
+
"phpBB",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_js_rendered(html: str, extracted_text: Optional[str]) -> bool:
    """
    Guess whether a page needs a real browser to produce its content.

    The check only runs when static extraction came back short; a page
    that already yielded 300+ characters is never flagged.

    Args:
        html: Raw HTML as fetched without JavaScript execution.
        extracted_text: Text extracted from that HTML. ``None`` (what an
            extractor returns when it finds nothing) is treated as empty.

    Returns:
        True when the extracted text is shorter than 300 characters AND
        either the markup is script-heavy (more than 3 ``<script`` tags and
        fewer content tags than script tags) or it contains one of
        JS_APP_MARKERS (case-insensitive). False otherwise, including when
        ``html`` is empty.
    """
    # Extractors hand back None on failure; len(None) would raise.
    if len(extracted_text or "") >= 300:
        return False  # Already got content, no need for JS

    if not html:
        return False

    html_lower = html.lower()

    # Script-to-content ratio. These are plain substring counts, so "<p"
    # also counts tags such as <pre>; it is a heuristic, not a parser.
    script_count = html_lower.count("<script")
    content_count = sum(html_lower.count(tag) for tag in ("<p", "<div", "<article"))
    if script_count > 3 and content_count < script_count:
        return True

    # Framework / forum fingerprints in the raw markup
    return any(marker.lower() in html_lower for marker in JS_APP_MARKERS)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def get_browser(tor_proxy_host: str = "tor", tor_proxy_port: int = 9050):
    """
    Get or create the shared Playwright Chromium browser.

    The browser is launched headless with a SOCKS5 proxy pointing at
    ``tor_proxy_host:tor_proxy_port`` and cached in the module-level
    ``_BROWSER``. While the cached browser reports itself connected it is
    returned as-is, so the proxy arguments only take effect on the call that
    actually launches it.

    Args:
        tor_proxy_host: Hostname of the Tor SOCKS5 proxy.
        tor_proxy_port: Port of the Tor SOCKS5 proxy.

    Returns:
        The connected Playwright browser instance.

    Raises:
        Exception: Whatever Playwright raises when it cannot be imported,
            started or launched; the error is logged and re-raised.
    """
    global _BROWSER, _BROWSER_LOCK

    # Created lazily so importing this module does not require asyncio setup.
    if _BROWSER_LOCK is None:
        import asyncio

        _BROWSER_LOCK = asyncio.Lock()

    async with _BROWSER_LOCK:
        if _BROWSER is not None:
            try:
                if _BROWSER.is_connected():
                    return _BROWSER
            except Exception:
                # A broken handle is treated the same as a disconnected one.
                pass

        try:
            from playwright.async_api import async_playwright

            playwright = await async_playwright().start()

            try:
                _BROWSER = await playwright.chromium.launch(
                    headless=True,
                    proxy={
                        "server": f"socks5://{tor_proxy_host}:{tor_proxy_port}",
                    },
                    args=[
                        "--no-sandbox",
                        "--disable-setuid-sandbox",
                        "--disable-dev-shm-usage",
                        "--disable-gpu",
                        "--no-first-run",
                        "--no-zygote",
                        "--single-process",
                        # Privacy — match Tor Browser fingerprint loosely
                        "--disable-blink-features=AutomationControlled",
                    ],
                )
            except BaseException:
                # The driver process was started but no browser came up:
                # stop it so a failed (or cancelled) launch does not leak it.
                try:
                    await playwright.stop()
                except Exception:
                    logger.debug("Failed to stop Playwright driver", exc_info=True)
                raise

            logger.info("Playwright browser launched (via Tor proxy)")
            return _BROWSER

        except Exception as e:
            logger.error(f"Failed to launch Playwright browser: {e}")
            raise
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
async def fetch_with_playwright(
    url: str,
    tor_proxy_host: str = "tor",
    tor_proxy_port: int = 9050,
    timeout_ms: int = PAGE_TIMEOUT_MS,
) -> dict:
    """
    Fetch a URL with the shared headless Chromium and extract its text.

    A fresh browser context is created per call and always closed again.
    Failures never propagate: they are logged and reported in the returned
    dict's ``error`` field.

    Args:
        url: Page to load.
        tor_proxy_host: Passed to get_browser().
        tor_proxy_port: Passed to get_browser().
        timeout_ms: Navigation timeout for ``page.goto``. The selector
            wait that follows uses its own fixed per-selector timeout and is
            not bounded by this value.

    Returns:
        Dict with keys ``link``, ``content`` (at most 15000 chars),
        ``raw_html``, ``status`` (0 if no response object was returned),
        ``posted_at``, ``via`` (always ``"playwright"``) and ``error``
        (``None`` on success, else the first 100 chars of the exception).
    """
    result = {
        "link": url,
        "content": "",
        "raw_html": "",
        "status": 0,
        "posted_at": None,
        "via": "playwright",
        "error": None,
    }
    url_preview = url[:40]

    page = None
    context = None
    try:
        import trafilatura
        from scraper.scrape import extract_post_timestamp

        browser = await get_browser(tor_proxy_host, tor_proxy_port)

        # Create a new browser context per request (isolation)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; rv:109.0) "
                "Gecko/20100101 Firefox/115.0"
            ),
            # JavaScript must stay on: rendering it is the point of this path
            java_script_enabled=True,
            bypass_csp=False,
        )

        # Block images, fonts, media — we only need text content
        await context.route(
            "**/*.{png,jpg,jpeg,gif,svg,ico,woff,woff2,ttf,mp4,webm}",
            lambda route: route.abort(),
        )

        page = await context.new_page()

        # Navigate to URL
        response = await page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")

        if response:
            result["status"] = response.status

        # Wait for content to appear. Selectors are tried one after another,
        # each with up to 5s, stopping at the first that becomes visible.
        content_appeared = False
        for selector in CONTENT_SELECTORS:
            try:
                await page.wait_for_selector(
                    selector,
                    timeout=5000,  # 5s per selector attempt
                    state="visible",
                )
                content_appeared = True
                break
            except Exception:
                continue

        if not content_appeared:
            # No known content selector found — wait a fixed time
            # for JS to do whatever it does
            await page.wait_for_timeout(3000)

        # Extract rendered HTML
        raw_html = await page.content()
        result["raw_html"] = raw_html

        # Extract text with trafilatura (same as aiohttp scraper)
        content = trafilatura.extract(
            raw_html,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        ) or ""

        # Fallback: read visible text from the DOM when trafilatura found
        # little or nothing. This mutates the live page, which is safe only
        # because raw_html has already been captured above.
        if len(content) < 100:
            dom_text = await page.evaluate(
                """() => {
                    const body = document.body;
                    if (!body) { return ''; }
                    const scripts = body.querySelectorAll('script, style, nav, header, footer');
                    scripts.forEach(s => s.remove());
                    return body.innerText || body.textContent || '';
                }"""
            )
            dom_text = dom_text.strip() if dom_text else ""
            # Never trade the short extraction for an even shorter fallback.
            if len(dom_text) > len(content):
                content = dom_text

        result["content"] = content[:15000]  # Cap at 15k chars

        # Extract post timestamp from rendered HTML
        result["posted_at"] = extract_post_timestamp(raw_html)

        logger.debug(
            f"Playwright scraped {url_preview}... "
            f"→ {len(result['content'])} chars, status={result['status']}"
        )

    except Exception as e:
        result["error"] = str(e)[:100]
        logger.warning(
            f"Playwright failed for {url_preview}...: {e}"
        )

    finally:
        # Best-effort cleanup; a failed close must not mask the result.
        if page:
            try:
                await page.close()
            except Exception:
                pass
        if context:
            try:
                await context.close()
            except Exception:
                pass

    return result
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
async def close_browser():
    """
    Shut down the shared browser. Call on app shutdown.

    The module-level ``_BROWSER`` is cleared whether or not closing
    succeeds; a failure to close is logged rather than raised. Calling this
    when no browser exists is a no-op.
    """
    global _BROWSER

    # Detach the handle before awaiting so the global never points at a
    # browser that is in the middle of being closed.
    browser, _BROWSER = _BROWSER, None
    if browser is None:
        return

    try:
        await browser.close()
        logger.info("Playwright browser closed")
    except Exception as e:
        # Shutdown is best-effort, but the failure should be visible.
        logger.warning(f"Error while closing Playwright browser: {e}")
|
search/__init__.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import random
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import aiohttp
|
|
11
|
+
import requests
|
|
12
|
+
from aiohttp_socks import ProxyConnector
|
|
13
|
+
from bs4 import BeautifulSoup
|
|
14
|
+
|
|
15
|
+
from config import TOR_PROXY_HOST, TOR_PROXY_PORT
|
|
16
|
+
from search.circuit_breaker import record_failure, record_success, is_open
|
|
17
|
+
from utils.async_utils import run_async
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
ENGINE_TIMEOUT = 45
|
|
22
|
+
|
|
23
|
+
ENGINE_WEIGHTS = {
|
|
24
|
+
"darksearch": 1.0,
|
|
25
|
+
"ahmia": 0.9,
|
|
26
|
+
"torch": 0.7,
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _normalize_for_dedup(url: str) -> str:
|
|
31
|
+
url = url.lower().rstrip("/")
|
|
32
|
+
url = url.replace("https://", "http://")
|
|
33
|
+
return url
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
USER_AGENTS = [
|
|
37
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
|
|
38
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
|
|
39
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
|
|
40
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
|
|
41
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
|
|
42
|
+
"Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
|
|
43
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
|
|
44
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
|
|
45
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54"
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
SEARCH_ENGINES = [
|
|
49
|
+
{"name": "Ahmia (Clearnet Proxy)", "url": "https://ahmia.fi/search/?q={query}"},
|
|
50
|
+
{"name": "DarkSearch (API)", "url": "https://darksearch.io/api/search?query={query}"},
|
|
51
|
+
{"name": "Ahmia", "url": "http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?q={query}"},
|
|
52
|
+
{"name": "OnionLand", "url": "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion/search?q={query}"},
|
|
53
|
+
{"name": "Torgle", "url": "http://iy3544gmoeclh5de6gez2256v6pjh4omhpqdh2wpeeppjtvqmjhkfwad.onion/torgle/?query={query}"},
|
|
54
|
+
{"name": "Amnesia", "url": "http://amnesia7u5odx5xbwtpnqk3edybgud5bmiagu75bnqx2crntw5kry7ad.onion/search?query={query}"},
|
|
55
|
+
{"name": "Kaizer", "url": "http://kaizerwfvp5gxu6cppibp7jhcqptavq3iqef66wbxenh6a2fklibdvid.onion/search?q={query}"},
|
|
56
|
+
{"name": "Anima", "url": "http://anima4ffe27xmakwnseih3ic2y7y3l6e7fucwk4oerdn4odf7k74tbid.onion/search?q={query}"},
|
|
57
|
+
{"name": "Tornado", "url": "http://tornadoxn3viscgz647shlysdy7ea5zqzwda7hierekeuokh5eh5b3qd.onion/search?q={query}"},
|
|
58
|
+
{"name": "TorNet", "url": "http://tornetupfu7gcgidt33ftnungxzyfq2pygui5qdoyss34xbgx2qruzid.onion/search?q={query}"},
|
|
59
|
+
{"name": "Torland", "url": "http://torlbmqwtudkorme6prgfpmsnile7ug2zm4u3ejpcncxuhpu4k2j4kyd.onion/index.php?a=search&q={query}"},
|
|
60
|
+
{"name": "Find Tor", "url": "http://findtorroveq5wdnipkaojfpqulxnkhblymc7aramjzajcvpptd4rjqd.onion/search?q={query}"},
|
|
61
|
+
{"name": "Excavator", "url": "http://2fd6cemt4gmccflhm6imvdfvli3nf7zn6rfrwpsy7uhxrgbypvwf5fad.onion/search?query={query}"},
|
|
62
|
+
{"name": "Onionway", "url": "http://oniwayzz74cv2puhsgx4dpjwieww4wdphsydqvf5q7eyz4myjvyw26ad.onion/search.php?s={query}"},
|
|
63
|
+
{"name": "Tor66", "url": "http://tor66sewebgixwhcqfnp5inzp5x5uohhdy3kvtnyfxc2e5mxiuh34iid.onion/search?q={query}"},
|
|
64
|
+
{"name": "OSS", "url": "http://3fzh7yuupdfyjhwt3ugzqqof6ulbcl27ecev33knxe3u7goi3vfn2qqd.onion/oss/index.php?search={query}"},
|
|
65
|
+
{"name": "Torgol", "url": "http://torgolnpeouim56dykfob6jh5r2ps2j73enc42s2um4ufob3ny4fcdyd.onion/?q={query}"},
|
|
66
|
+
{"name": "The Deep Searches", "url": "http://searchgf7gdtauh7bhnbyed4ivxqmuoat3nm6zfrg3ymkq6mtnpye3ad.onion/search?q={query}"},
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
DEFAULT_SEARCH_ENGINES = [e["url"] for e in SEARCH_ENGINES]
|
|
70
|
+
|
|
71
|
+
_ONION_URL_RE = re.compile(r'https?:\/\/[a-z0-9.]+\.onion', re.IGNORECASE)
|
|
72
|
+
|
|
73
|
+
MAX_CONCURRENT = 10
|
|
74
|
+
SEARCH_TIMEOUT = 60
|
|
75
|
+
ENGINE_RETRY_COUNT = 2
|
|
76
|
+
|
|
77
|
+
_ENGINE_STATUS: dict[str, dict] = {}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class EngineResult:
    """Outcome of querying a single search engine."""

    # Engine name as given in the engine's configuration entry
    name: str
    # Link records collected from the engine's response
    links: list[dict]
    # Short failure description; None when no error was recorded
    error: Optional[str] = None
    # Elapsed time in milliseconds; 0 unless the producer sets it
    took_ms: int = 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _get_tor_session():
    """Return a new ``requests`` session whose HTTP and HTTPS traffic uses the Tor SOCKS proxy."""
    # Same socks5h proxy URL for both schemes
    proxy_url = f"socks5h://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    tor_session = requests.Session()
    tor_session.proxies = dict.fromkeys(("http", "https"), proxy_url)
    return tor_session


# Public alias used by health.py
get_tor_session = _get_tor_session
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _is_onion_url(url: str) -> bool:
    """Report whether ``url`` contains an http(s) ``.onion`` address matched by ``_ONION_URL_RE``."""
    return _ONION_URL_RE.search(url) is not None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _tor_aiohttp_connector() -> ProxyConnector:
    """Create an aiohttp-socks connector for the Tor SOCKS5 proxy (remote DNS, pooled connections)."""
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    # rdns=True asks the proxy to resolve hostnames; the limits cap the pool
    # at 10 connections overall and 2 per host.
    connector_options = {"rdns": True, "limit": 10, "limit_per_host": 2}
    return ProxyConnector.from_url(proxy_url, **connector_options)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
_search_session: Optional[aiohttp.ClientSession] = None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get_search_session() -> aiohttp.ClientSession:
    """
    Return the module-wide aiohttp session that talks through the Tor proxy.

    A new session is built when none is cached or the cached one is closed;
    otherwise the cached session is reused.
    """
    global _search_session

    cached = _search_session
    if cached is not None and not cached.closed:
        return cached

    # NOTE(review): the session is cached across calls; confirm that callers
    # always use it from the same event loop.
    _search_session = aiohttp.ClientSession(
        connector=_tor_aiohttp_connector(),
        timeout=aiohttp.ClientTimeout(total=SEARCH_TIMEOUT),
    )
    return _search_session
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
async def close_search_session() -> None:
    """Close the cached search session, if one is open, and forget it. Call on shutdown."""
    global _search_session

    open_session = _search_session
    if not open_session or open_session.closed:
        return  # nothing to close

    await open_session.close()
    _search_session = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
async def fetch_with_timeout(
    url: str,
    session: Optional[aiohttp.ClientSession] = None,
) -> aiohttp.ClientResponse:
    """
    GET ``url`` with a total timeout of SEARCH_TIMEOUT seconds.

    Uses ``session`` when given, otherwise the cached search session. The
    response object is returned directly, not inside a context manager.
    """
    active_session = get_search_session() if session is None else session
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    return await active_session.get(url, timeout=request_timeout)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
async def _fetch_engine(
    engine: dict,
    query: str,
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> EngineResult:
    """
    Query one search engine and collect the links it returns.

    The request is made while holding ``semaphore``. Non-200 responses,
    timeouts and other exceptions are retried up to ENGINE_RETRY_COUNT times
    with a linearly growing pause (0.5s, 1.0s, ...). Nothing is raised for
    request failures; they are reported through ``EngineResult.error``.

    Args:
        engine: Mapping with ``"name"`` and a ``"url"`` template containing
            ``{query}``.
        query: Search string substituted into the template as-is (it is not
            URL-encoded here).
        session: aiohttp session used for the request.
        semaphore: Limits how many engines are fetched at once.

    Returns:
        EngineResult with the parsed links, or an empty link list plus an
        error string. ``took_ms`` is the time spent after the semaphore was
        acquired, including retries.
    """
    import json

    url = engine["url"].format(query=query)
    name = engine["name"]

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)

    async with semaphore:
        started = time.monotonic()

        def finish(links: list, error: Optional[str] = None) -> EngineResult:
            # Single place that stamps the elapsed time onto the result.
            elapsed_ms = int((time.monotonic() - started) * 1000)
            return EngineResult(name=name, links=links, error=error, took_ms=elapsed_ms)

        for attempt in range(ENGINE_RETRY_COUNT + 1):
            retries_left = attempt < ENGINE_RETRY_COUNT
            backoff = 0.5 * (attempt + 1)
            try:
                async with session.get(url, headers=headers, timeout=request_timeout) as resp:
                    if resp.status != 200:
                        if retries_left:
                            await asyncio.sleep(backoff)
                            continue
                        return finish([], f"HTTP {resp.status}")

                    text = await resp.text()

                    # The DarkSearch endpoint answers with JSON, not HTML.
                    if "darksearch.io/api" in url:
                        try:
                            data = json.loads(text)
                            links = [
                                {"title": hit.get("title", "No Title"), "link": hit.get("onion")}
                                for hit in data.get("data", [])
                                if hit.get("onion")
                            ]
                            return finish(links)
                        except Exception as e:
                            return finish([], f"JSON parse: {e}")

                    return finish(_parse_html_links(text, url))

            except asyncio.TimeoutError:
                if retries_left:
                    await asyncio.sleep(backoff)
                    continue
                return finish([], "timeout")
            except Exception as e:
                if retries_left:
                    await asyncio.sleep(backoff)
                    continue
                return finish([], str(e))

        # Safety net: every path of the final attempt returns above.
        return finish([], "max retries")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _parse_html_links(html: str, base_url: str) -> list[dict]:
    """
    Pull ``.onion`` links out of a search engine's HTML result page.

    For every ``<a>`` tag, addresses matching ``_ONION_URL_RE`` are taken
    from the ``href``; if none match but the href still contains ``.onion``,
    the raw href is used instead. A link is kept only when it does not
    contain the substring ``"search"`` and the anchor text is longer than
    3 characters.

    Args:
        html: Response body to parse.
        base_url: URL the HTML came from; used only in the log message when
            parsing fails.

    Returns:
        List of ``{"title": ..., "link": ...}`` dicts. Parsing errors are
        logged and whatever was collected up to that point is returned.
    """
    links = []
    try:
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all('a'):
            href = a.get('href', '')
            title = a.get_text(strip=True)

            found = _ONION_URL_RE.findall(href)
            if not found and ".onion" in href:
                found = [href]

            for link in found:
                if "search" not in link and len(title) > 3:
                    links.append({"title": title, "link": link})
    except Exception as e:
        # Best-effort parsing: keep partial results, but leave a trace.
        logger.debug(f"Failed to parse links from {base_url}: {e}")
    return links
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
async def _search_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[EngineResult]:
    """
    Query every engine in SEARCH_ENGINES concurrently.

    Engines whose circuit breaker is open are skipped. Each remaining engine
    waits for one of ``max_workers`` slots and is then given ENGINE_TIMEOUT
    seconds; the timeout clock starts only once the slot is held, so time
    spent queued does not count against the engine. Outcomes are reported to
    the circuit breaker: failures via record_failure (except errors
    containing ``"HTTP 4"``), successes via record_success.

    Args:
        query: Search string passed to each engine.
        max_workers: Maximum number of engines fetched at the same time.

    Returns:
        One EngineResult per engine that produced a result object; tasks
        that raised unexpectedly are logged and omitted.
    """
    semaphore = asyncio.Semaphore(max_workers)
    search_session = get_search_session()

    async def run_engine(engine: dict) -> EngineResult:
        name = engine["name"]
        if await is_open(name):
            logger.warning(f"Skipping unhealthy engine: {name}")
            return EngineResult(name=name, links=[], error="circuit_open")

        async def fetch_with_engine_session():
            # The concurrency slot is already held by run_engine, so
            # _fetch_engine gets a private, uncontended semaphore. Passing
            # the shared one would make each engine take two slots.
            result = await _fetch_engine(engine, query, search_session, asyncio.Semaphore(1))
            if result.error:
                # 4xx answers mean the engine is reachable; don't trip the breaker.
                if "HTTP 4" not in result.error:
                    await record_failure(name)
                logger.warning(f"Engine {name} failed: {result.error}")
            else:
                await record_success(name)
                if not result.links:
                    logger.warning(f"Engine {name} returned 0 results")
            return result

        # Take the slot first: the timeout must measure the engine, not the queue.
        async with semaphore:
            try:
                return await asyncio.wait_for(fetch_with_engine_session(), timeout=ENGINE_TIMEOUT)
            except asyncio.TimeoutError:
                await record_failure(name)
                logger.warning(f"Engine {name} timed out")
                return EngineResult(name=name, links=[], error="timeout")
            except Exception as e:
                await record_failure(name)
                logger.warning(f"Engine {name} exception: {e}")
                return EngineResult(name=name, links=[], error=str(e))

    tasks = [run_engine(e) for e in SEARCH_ENGINES]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    processed: list[EngineResult] = []
    for r in results:
        if isinstance(r, Exception):
            logger.warning(f"Engine task exception: {r}")
            continue
        processed.append(r)

    return processed
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def get_search_results_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """
    Search all engines and return weighted, de-duplicated links.

    Despite its name this is a regular (non-``async``) function: it hands
    the search coroutine to ``run_async`` and works on the returned results.

    Every link dict is annotated in place with ``source_engine`` and
    ``source_weight``. The weight is the ENGINE_WEIGHTS entry whose key is
    a substring of the lower-cased engine name, or 0.5 when none matches.

    Args:
        query: Search string passed to every engine.
        max_workers: Maximum number of engines fetched concurrently.

    Returns:
        Unique links ordered by descending ``source_weight``. When several
        engines return the same link, the highest-weight copy is kept.
    """
    start = time.monotonic()

    results = run_async(_search_async(query, max_workers))

    all_links = []
    for result in results:
        engine_name = result.name.lower()
        weight = next(
            (value for known, value in ENGINE_WEIGHTS.items() if known in engine_name),
            0.5,
        )
        for link in result.links:
            link["source_engine"] = result.name
            link["source_weight"] = weight
            all_links.append(link)
        status = "ok" if not result.error else result.error
        logger.debug(f"Engine {result.name}: {len(result.links)} links ({status})")

    # Order by weight BEFORE de-duplicating: _dedupe_links keeps the first
    # occurrence, so the highest-weight engine wins for shared links. The sort
    # is stable, so equal weights keep their engine order.
    all_links.sort(key=lambda r: r.get("source_weight", 0.5), reverse=True)
    unique = _dedupe_links(all_links)

    elapsed = (time.monotonic() - start) * 1000
    logger.info(f"Search completed: {len(unique)} unique links in {elapsed:.0f}ms")

    return unique
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _dedupe_links(links: list[dict]) -> list[dict]:
    """
    Drop link dicts whose normalized ``"link"`` value was already seen.

    The first occurrence of each normalized URL is kept, in input order.
    Entries whose normalized link is empty are discarded.
    """
    seen_keys: set[str] = set()
    kept = []
    for entry in links:
        key = _normalize_for_dedup(entry.get("link", ""))
        if not key or key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(entry)
    return kept
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def get_search_results(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Backward-compatible entry point; delegates unchanged to get_search_results_async."""
    unique_links = get_search_results_async(query, max_workers)
    return unique_links
|