voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
scraper/scrape_js.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ Playwright-based JavaScript renderer for dark web content.
3
+
4
+ Used as a fallback when aiohttp returns empty content from JS-heavy sites.
5
+ Routes traffic through Tor SOCKS5 proxy same as the main scraper.
6
+ """
7
+
8
+ import logging
9
+ from typing import Optional
10
+ from datetime import datetime, timezone
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Playwright browser instance — shared across scrape calls
15
+ _BROWSER = None
16
+ _BROWSER_LOCK = None
17
+
18
+ # Maximum time to wait for page content (ms)
19
+ PAGE_TIMEOUT_MS = 30_000 # 30 seconds
20
+
21
+ # Selectors to wait for — indicates page has rendered
22
+ CONTENT_SELECTORS = [
23
+ "article",
24
+ "main",
25
+ ".post",
26
+ ".thread",
27
+ ".message",
28
+ "#content",
29
+ ".content",
30
+ "[role='main']",
31
+ ]
32
+
33
+ # JS app markers for detection
34
+ JS_APP_MARKERS = [
35
+ 'id="app"',
36
+ 'id="root"',
37
+ 'id="__next"',
38
+ "ng-app",
39
+ "data-reactroot",
40
+ "window.__INITIAL_STATE__",
41
+ "window.__NUXT__",
42
+ "<script>window.location",
43
+ # Dark web forum specific
44
+ "Dread",
45
+ "phpBB",
46
+ ]
47
+
48
+
49
def is_js_rendered(html: str, extracted_text: str) -> bool:
    """
    Decide whether a page looks like a JS-rendered app that needs a browser.

    Args:
        html: Raw HTML of the page. Empty or None means there is nothing
            to inspect and the answer is False.
        extracted_text: Text already extracted from the page. None is
            treated the same as an empty string.

    Returns:
        True only when the extracted text is short (< 300 chars) AND the HTML
        either contains one of JS_APP_MARKERS (compared case-insensitively) or
        has more than three "<script" tags that outnumber its "<p" / "<div" /
        "<article" tags combined.
    """
    # Treat a missing extraction (None) like empty text instead of raising
    # TypeError from len(None).
    if len(extracted_text or "") >= 300:
        return False  # Already got content, no need for JS

    if not html:
        return False

    html_lower = html.lower()

    # Any known framework/forum marker is enough on its own.
    if any(marker.lower() in html_lower for marker in JS_APP_MARKERS):
        return True

    # Otherwise fall back to the script-to-content ratio. These are plain
    # prefix counts, so "<p" also counts tags such as "<pre".
    script_count = html_lower.count("<script")
    content_count = sum(html_lower.count(tag) for tag in ("<p", "<div", "<article"))

    return script_count > 3 and content_count < script_count
77
+
78
+
79
async def get_browser(tor_proxy_host: str = "tor", tor_proxy_port: int = 9050):
    """
    Get or create the shared Playwright Chromium browser instance.

    The browser is launched with a SOCKS5 proxy pointing at the given Tor
    host/port. It is launched once and reused: when a connected browser is
    already cached it is returned as-is, so the proxy arguments only take
    effect on a call that actually launches a new browser.

    Args:
        tor_proxy_host: Hostname of the Tor SOCKS5 proxy.
        tor_proxy_port: Port of the Tor SOCKS5 proxy.

    Returns:
        The shared Playwright browser object.

    Raises:
        Exception: Whatever the Playwright import, driver start or browser
            launch raised; it is logged and re-raised unchanged.
    """
    global _BROWSER, _BROWSER_LOCK

    # The lock is created lazily on first use rather than at import time.
    if _BROWSER_LOCK is None:
        import asyncio

        _BROWSER_LOCK = asyncio.Lock()

    async with _BROWSER_LOCK:
        if _BROWSER is not None:
            try:
                if _BROWSER.is_connected():
                    return _BROWSER
            except Exception:
                # A broken handle is treated like a disconnected browser:
                # fall through and launch a fresh one.
                pass

        playwright = None
        try:
            from playwright.async_api import async_playwright

            playwright = await async_playwright().start()

            _BROWSER = await playwright.chromium.launch(
                headless=True,
                proxy={
                    "server": f"socks5://{tor_proxy_host}:{tor_proxy_port}",
                },
                args=[
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--no-first-run",
                    "--no-zygote",
                    "--single-process",
                    # Privacy — match Tor Browser fingerprint loosely
                    "--disable-blink-features=AutomationControlled",
                ],
            )
            logger.info("Playwright browser launched (via Tor proxy)")
            return _BROWSER

        except Exception as e:
            logger.error(f"Failed to launch Playwright browser: {e}")
            # The driver was started but no browser came up: stop it so a
            # failed launch does not leave an orphaned driver behind.
            if playwright is not None:
                try:
                    await playwright.stop()
                except Exception as stop_error:
                    logger.debug(f"Failed to stop Playwright driver: {stop_error}")
            raise
128
+
129
+
130
async def fetch_with_playwright(
    url: str,
    tor_proxy_host: str = "tor",
    tor_proxy_port: int = 9050,
    timeout_ms: int = PAGE_TIMEOUT_MS,
) -> dict:
    """
    Fetch a URL using Playwright (headless Chromium through Tor).

    Navigates to the page, waits for one of CONTENT_SELECTORS to become
    visible (or a fixed 3 s pause when none does), then extracts text from
    the rendered HTML. This function never raises: any failure is logged and
    reported through the "error" key of the returned dict.

    Args:
        url: Page to fetch.
        tor_proxy_host: Tor SOCKS5 host, forwarded to get_browser().
        tor_proxy_port: Tor SOCKS5 port, forwarded to get_browser().
        timeout_ms: Navigation timeout for page.goto(), in milliseconds.
            The selector waits that follow are not bounded by it (up to 5 s
            per selector).

    Returns:
        dict with keys:
            link      - the requested URL
            content   - extracted text, capped at 15000 chars ("" on failure)
            raw_html  - rendered HTML ("" if it was never captured)
            status    - HTTP status of the navigation response, 0 if unknown
            posted_at - result of extract_post_timestamp(raw_html), else None
            via       - always "playwright"
            error     - first 100 chars of the exception text, None on success
    """
    result = {
        "link": url,
        "content": "",
        "raw_html": "",
        "status": 0,
        "posted_at": None,
        "via": "playwright",
        "error": None,
    }

    page = None
    context = None
    try:
        # Imported lazily so the module can be loaded without these packages.
        import trafilatura
        from scraper.scrape import extract_post_timestamp

        browser = await get_browser(tor_proxy_host, tor_proxy_port)

        # Create a new browser context per request (isolation)
        context = await browser.new_context(
            # NOTE(review): a Firefox UA on a Chromium engine — confirm this
            # mismatch is intentional.
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; rv:109.0) "
                "Gecko/20100101 Firefox/115.0"
            ),
            java_script_enabled=True,
            bypass_csp=False,
        )

        # Block images, fonts, media — we only need text content
        await context.route(
            "**/*.{png,jpg,jpeg,gif,svg,ico,woff,woff2,ttf,mp4,webm}",
            lambda route: route.abort(),
        )

        page = await context.new_page()

        response = await page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")

        if response:
            result["status"] = response.status

        # Wait for content to appear: try each selector in turn and stop at
        # the first one that becomes visible.
        content_appeared = False
        for selector in CONTENT_SELECTORS:
            try:
                await page.wait_for_selector(
                    selector,
                    timeout=5000,  # 5s per selector attempt
                    state="visible",
                )
                content_appeared = True
                break
            except Exception:
                continue

        if not content_appeared:
            # No known content selector found — wait a fixed time
            # for JS to do whatever it does
            await page.wait_for_timeout(3000)

        # Capture the rendered HTML before the fallback below mutates the DOM.
        raw_html = await page.content()
        result["raw_html"] = raw_html

        # Extract text with trafilatura
        content = trafilatura.extract(
            raw_html,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        ) or ""

        # Fallback: read the visible text directly when trafilatura found
        # little or nothing.
        if len(content) < 100:
            visible_text = await page.evaluate(
                """() => {
                    const body = document.body;
                    if (!body) return '';
                    const scripts = body.querySelectorAll('script, style, nav, header, footer');
                    scripts.forEach(s => s.remove());
                    return body.innerText || body.textContent || '';
                }"""
            )
            visible_text = visible_text.strip() if visible_text else ""
            # Only take the fallback when it actually yields more text, so a
            # short-but-real extraction is never replaced by something emptier.
            if len(visible_text) > len(content):
                content = visible_text

        result["content"] = content[:15000]  # Cap at 15k chars

        # Extract post timestamp from rendered HTML
        result["posted_at"] = extract_post_timestamp(raw_html)

        logger.debug(
            f"Playwright scraped {url[:40]}... "
            f"→ {len(result['content'])} chars, status={result['status']}"
        )

    except Exception as e:
        result["error"] = str(e)[:100]
        logger.warning(
            f"Playwright failed for {url[:40]}...: {e}"
        )

    finally:
        # Best-effort cleanup: a failure to close must not mask the result.
        if page:
            try:
                await page.close()
            except Exception:
                pass
        if context:
            try:
                await context.close()
            except Exception:
                pass

    return result
260
+
261
+
262
async def close_browser():
    """
    Shut down the shared browser. Call on app shutdown.

    Safe to call when no browser was ever launched. A failure while closing
    is logged and otherwise ignored; the cached handle is cleared either way.
    """
    global _BROWSER

    if _BROWSER is None:
        return

    try:
        await _BROWSER.close()
        logger.info("Playwright browser closed")
    except Exception as e:
        # Shutdown is best-effort, but leave a trace instead of hiding it.
        logger.debug(f"Ignoring error while closing Playwright browser: {e}")
    finally:
        _BROWSER = None
search/__init__.py ADDED
@@ -0,0 +1,318 @@
1
+ import asyncio
2
+ import logging
3
+ import random
4
+ import re
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+ import aiohttp
11
+ import requests
12
+ from aiohttp_socks import ProxyConnector
13
+ from bs4 import BeautifulSoup
14
+
15
+ from config import TOR_PROXY_HOST, TOR_PROXY_PORT
16
+ from search.circuit_breaker import record_failure, record_success, is_open
17
+ from utils.async_utils import run_async
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ ENGINE_TIMEOUT = 45
22
+
23
+ ENGINE_WEIGHTS = {
24
+ "darksearch": 1.0,
25
+ "ahmia": 0.9,
26
+ "torch": 0.7,
27
+ }
28
+
29
+
30
+ def _normalize_for_dedup(url: str) -> str:
31
+ url = url.lower().rstrip("/")
32
+ url = url.replace("https://", "http://")
33
+ return url
34
+
35
+
36
+ USER_AGENTS = [
37
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
38
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
39
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
40
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
41
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
42
+ "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
43
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
44
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
45
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54"
46
+ ]
47
+
48
+ SEARCH_ENGINES = [
49
+ {"name": "Ahmia (Clearnet Proxy)", "url": "https://ahmia.fi/search/?q={query}"},
50
+ {"name": "DarkSearch (API)", "url": "https://darksearch.io/api/search?query={query}"},
51
+ {"name": "Ahmia", "url": "http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?q={query}"},
52
+ {"name": "OnionLand", "url": "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion/search?q={query}"},
53
+ {"name": "Torgle", "url": "http://iy3544gmoeclh5de6gez2256v6pjh4omhpqdh2wpeeppjtvqmjhkfwad.onion/torgle/?query={query}"},
54
+ {"name": "Amnesia", "url": "http://amnesia7u5odx5xbwtpnqk3edybgud5bmiagu75bnqx2crntw5kry7ad.onion/search?query={query}"},
55
+ {"name": "Kaizer", "url": "http://kaizerwfvp5gxu6cppibp7jhcqptavq3iqef66wbxenh6a2fklibdvid.onion/search?q={query}"},
56
+ {"name": "Anima", "url": "http://anima4ffe27xmakwnseih3ic2y7y3l6e7fucwk4oerdn4odf7k74tbid.onion/search?q={query}"},
57
+ {"name": "Tornado", "url": "http://tornadoxn3viscgz647shlysdy7ea5zqzwda7hierekeuokh5eh5b3qd.onion/search?q={query}"},
58
+ {"name": "TorNet", "url": "http://tornetupfu7gcgidt33ftnungxzyfq2pygui5qdoyss34xbgx2qruzid.onion/search?q={query}"},
59
+ {"name": "Torland", "url": "http://torlbmqwtudkorme6prgfpmsnile7ug2zm4u3ejpcncxuhpu4k2j4kyd.onion/index.php?a=search&q={query}"},
60
+ {"name": "Find Tor", "url": "http://findtorroveq5wdnipkaojfpqulxnkhblymc7aramjzajcvpptd4rjqd.onion/search?q={query}"},
61
+ {"name": "Excavator", "url": "http://2fd6cemt4gmccflhm6imvdfvli3nf7zn6rfrwpsy7uhxrgbypvwf5fad.onion/search?query={query}"},
62
+ {"name": "Onionway", "url": "http://oniwayzz74cv2puhsgx4dpjwieww4wdphsydqvf5q7eyz4myjvyw26ad.onion/search.php?s={query}"},
63
+ {"name": "Tor66", "url": "http://tor66sewebgixwhcqfnp5inzp5x5uohhdy3kvtnyfxc2e5mxiuh34iid.onion/search?q={query}"},
64
+ {"name": "OSS", "url": "http://3fzh7yuupdfyjhwt3ugzqqof6ulbcl27ecev33knxe3u7goi3vfn2qqd.onion/oss/index.php?search={query}"},
65
+ {"name": "Torgol", "url": "http://torgolnpeouim56dykfob6jh5r2ps2j73enc42s2um4ufob3ny4fcdyd.onion/?q={query}"},
66
+ {"name": "The Deep Searches", "url": "http://searchgf7gdtauh7bhnbyed4ivxqmuoat3nm6zfrg3ymkq6mtnpye3ad.onion/search?q={query}"},
67
+ ]
68
+
69
+ DEFAULT_SEARCH_ENGINES = [e["url"] for e in SEARCH_ENGINES]
70
+
71
+ _ONION_URL_RE = re.compile(r'https?:\/\/[a-z0-9.]+\.onion', re.IGNORECASE)
72
+
73
+ MAX_CONCURRENT = 10
74
+ SEARCH_TIMEOUT = 60
75
+ ENGINE_RETRY_COUNT = 2
76
+
77
+ _ENGINE_STATUS: dict[str, dict] = {}
78
+
79
+
80
@dataclass
class EngineResult:
    """Outcome of querying one search engine."""

    # Name of the engine this result belongs to.
    name: str
    # Result entries; producers in this module build dicts with
    # "title" and "link" keys.
    links: list[dict]
    # Short failure description; None when no error was recorded.
    error: Optional[str] = None
    # Elapsed time in milliseconds when the producer records it, else 0.
    took_ms: int = 0
86
+
87
+
88
def _get_tor_session():
    """Build a new requests session that sends HTTP and HTTPS through Tor.

    Both schemes use the same socks5h:// proxy URL built from TOR_PROXY_HOST
    and TOR_PROXY_PORT. A fresh session is created on every call.
    """
    # socks5h (rather than socks5) leaves hostname resolution to the proxy.
    proxy_url = f"socks5h://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    tor_session = requests.Session()
    tor_session.proxies = dict.fromkeys(("http", "https"), proxy_url)
    return tor_session
95
+
96
+
97
+ # Public alias used by health.py
98
+ get_tor_session = _get_tor_session
99
+
100
+
101
def _is_onion_url(url: str) -> bool:
    """Report whether an http(s) .onion address occurs anywhere in ``url``."""
    return _ONION_URL_RE.search(url) is not None
103
+
104
+
105
def _tor_aiohttp_connector() -> ProxyConnector:
    """Create an aiohttp-socks connector for the Tor SOCKS5 proxy.

    The connector is built with remote DNS (rdns=True) and a pool capped at
    10 connections in total and 2 per host.
    """
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    pool_options = {"rdns": True, "limit": 10, "limit_per_host": 2}
    return ProxyConnector.from_url(proxy_url, **pool_options)
113
+
114
+
115
+ _search_session: Optional[aiohttp.ClientSession] = None
116
+
117
+
118
def get_search_session() -> aiohttp.ClientSession:
    """Return the module-wide aiohttp session that talks through Tor.

    The cached session is reused while it is open; when there is none, or it
    has been closed, a new one is created with a fresh Tor connector and a
    total timeout of SEARCH_TIMEOUT seconds.
    """
    global _search_session

    cached = _search_session
    if cached is not None and not cached.closed:
        return cached

    _search_session = aiohttp.ClientSession(
        connector=_tor_aiohttp_connector(),
        timeout=aiohttp.ClientTimeout(total=SEARCH_TIMEOUT),
    )
    return _search_session
128
+
129
+
130
async def close_search_session() -> None:
    """Close the cached search session, if one is open, and forget it.

    Intended to be called on shutdown.
    """
    global _search_session

    active = _search_session
    if active and not active.closed:
        await active.close()
    _search_session = None
136
+
137
+
138
async def fetch_with_timeout(
    url: str,
    session: Optional[aiohttp.ClientSession] = None,
) -> aiohttp.ClientResponse:
    """Issue a GET for ``url`` with a total timeout of SEARCH_TIMEOUT seconds.

    Args:
        url: Address to fetch.
        session: Session to use; the cached Tor search session is used when
            this is None.

    Returns:
        The response object from ``session.get``. It is returned without a
        context manager, so the caller is presumably responsible for
        releasing it — confirm against callers.
    """
    active_session = get_search_session() if session is None else session
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    response = await active_session.get(url, timeout=request_timeout)
    return response
146
+
147
+
148
async def _fetch_engine(
    engine: dict,
    query: str,
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> EngineResult:
    """Query one search engine and return the links it reported.

    The engine URL template is filled with ``query`` as given (no escaping is
    applied here). The request is retried up to ENGINE_RETRY_COUNT times on a
    non-200 status, a timeout or any other exception, sleeping 0.5 s, 1.0 s, …
    between attempts. The semaphore is held for the whole sequence, retries
    and sleeps included.

    Args:
        engine: Dict with a "name" and a "url" template containing "{query}".
        query: Search text substituted into the template.
        session: aiohttp session used for the request.
        semaphore: Limits how many engines are fetched concurrently.

    Returns:
        An EngineResult. ``links`` is empty and ``error`` is set ("HTTP <code>",
        "timeout", "JSON parse: …", or the exception text) when the engine
        could not be read. ``took_ms`` is the time spent after the semaphore
        was acquired, retries included.
    """
    import json

    url = engine["url"].format(query=query)
    name = engine["name"]
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)

    async with semaphore:
        started = time.monotonic()

        def finish(links: list, error: Optional[str] = None) -> EngineResult:
            # Single exit point so every result carries its elapsed time.
            elapsed_ms = int((time.monotonic() - started) * 1000)
            return EngineResult(name=name, links=links, error=error, took_ms=elapsed_ms)

        for attempt in range(ENGINE_RETRY_COUNT + 1):
            can_retry = attempt < ENGINE_RETRY_COUNT
            try:
                async with session.get(url, headers=headers, timeout=request_timeout) as resp:
                    if resp.status != 200:
                        if can_retry:
                            await asyncio.sleep(0.5 * (attempt + 1))
                            continue
                        return finish([], f"HTTP {resp.status}")

                    text = await resp.text()

                    # The DarkSearch endpoint answers with JSON, not HTML.
                    if "darksearch.io/api" in url:
                        try:
                            data = json.loads(text)
                            links = [
                                {"title": hit.get("title", "No Title"), "link": hit.get("onion")}
                                for hit in data.get("data", [])
                                if hit.get("onion")
                            ]
                        except Exception as e:
                            # A malformed payload is not retried.
                            return finish([], f"JSON parse: {e}")
                        return finish(links)

                    return finish(_parse_html_links(text, url))

            except asyncio.TimeoutError:
                if can_retry:
                    await asyncio.sleep(0.5 * (attempt + 1))
                    continue
                return finish([], "timeout")
            except Exception as e:
                if can_retry:
                    await asyncio.sleep(0.5 * (attempt + 1))
                    continue
                return finish([], str(e))

        # Defensive fallback: every path through the last attempt returns above.
        return finish([], "max retries")
204
+
205
+
206
def _parse_html_links(html: str, base_url: str) -> list[dict]:
    """Collect .onion links from the anchors of a result page.

    For every ``<a>`` tag, the onion URLs matched by _ONION_URL_RE inside its
    href are taken (the pattern covers scheme and host only). When the regex
    finds nothing but the href still contains ".onion", the raw href is used
    instead. A link is kept only if it does not contain "search" and the
    anchor text is longer than 3 characters.

    Args:
        html: Markup of the result page.
        base_url: Not used by this function.

    Returns:
        List of {"title", "link"} dicts. Parsing errors are swallowed, so
        whatever was collected before the error is still returned.
    """
    collected = []
    try:
        anchors = BeautifulSoup(html, "html.parser").find_all('a')
        for anchor in anchors:
            href = anchor.get('href', '')
            title = anchor.get_text(strip=True)

            candidates = _ONION_URL_RE.findall(href)
            if not candidates and ".onion" in href:
                candidates = [href]

            for candidate in candidates:
                if "search" in candidate or len(title) <= 3:
                    continue
                collected.append({"title": title, "link": candidate})
    except Exception:
        pass
    return collected
224
+
225
+
226
async def _search_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[EngineResult]:
    """Query every engine in SEARCH_ENGINES concurrently.

    Engines whose circuit breaker is open are skipped. Each remaining engine
    is fetched through the shared search session under an overall
    ENGINE_TIMEOUT, and the outcome is reported to the circuit breaker.

    Args:
        query: Search text passed to every engine.
        max_workers: Maximum number of engines fetched at the same time.

    Returns:
        One EngineResult per engine, in SEARCH_ENGINES order. Engines whose
        task ended in an exception are logged and left out.
    """
    semaphore = asyncio.Semaphore(max_workers)
    search_session = get_search_session()

    async def run_engine(engine: dict) -> EngineResult:
        name = engine["name"]
        if await is_open(name):
            logger.warning(f"Skipping unhealthy engine: {name}")
            return EngineResult(name=name, links=[], error="circuit_open")

        async def fetch_with_engine_session():
            result = await _fetch_engine(engine, query, search_session, semaphore)
            if result.error:
                # Errors containing "HTTP 4" (4xx responses) do not count
                # against the engine's circuit breaker.
                if "HTTP 4" not in result.error:
                    await record_failure(name)
                logger.warning(f"Engine {name} failed: {result.error}")
            else:
                await record_success(name)
                if not result.links:
                    logger.warning(f"Engine {name} returned 0 results")
            return result

        try:
            # This outer limit covers the time spent waiting for the
            # semaphore and all retries inside _fetch_engine.
            return await asyncio.wait_for(fetch_with_engine_session(), timeout=ENGINE_TIMEOUT)
        except asyncio.TimeoutError:
            await record_failure(name)
            logger.warning(f"Engine {name} timed out")
            return EngineResult(name=name, links=[], error="timeout")
        except Exception as e:
            await record_failure(name)
            logger.warning(f"Engine {name} exception: {e}")
            return EngineResult(name=name, links=[], error=str(e))

    tasks = [run_engine(e) for e in SEARCH_ENGINES]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    processed: list[EngineResult] = []
    for r in results:
        # With return_exceptions=True the results may include BaseException
        # instances such as CancelledError, not only Exception subclasses.
        if isinstance(r, BaseException):
            logger.warning(f"Engine task exception: {r}")
            continue
        processed.append(r)

    return processed
272
+
273
+
274
def get_search_results_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Run the multi-engine search and return weighted, de-duplicated links.

    Despite its name this is a plain (non-async) function: it hands the
    search coroutine to run_async() and works on the result it returns.

    Each link dict is tagged in place with "source_engine" and
    "source_weight". The weight comes from the first ENGINE_WEIGHTS key found
    inside the lower-cased engine name, or 0.5 when none matches. Duplicates
    are then removed and the remainder is sorted by weight, highest first.

    Args:
        query: Search text.
        max_workers: Maximum number of engines queried concurrently.

    Returns:
        List of link dicts ordered by descending "source_weight".
    """
    started_at = time.monotonic()

    engine_results = run_async(_search_async(query, max_workers))

    collected = []
    for engine_result in engine_results:
        lowered_name = engine_result.name.lower()
        weight = next(
            (value for key, value in ENGINE_WEIGHTS.items() if key in lowered_name),
            0.5,
        )
        for entry in engine_result.links:
            entry.update(source_engine=engine_result.name, source_weight=weight)
            collected.append(entry)
        status = engine_result.error or "ok"
        logger.debug(f"Engine {engine_result.name}: {len(engine_result.links)} links ({status})")

    unique = sorted(
        _dedupe_links(collected),
        key=lambda entry: entry.get("source_weight", 0.5),
        reverse=True,
    )

    elapsed = (time.monotonic() - started_at) * 1000
    logger.info(f"Search completed: {len(unique)} unique links in {elapsed:.0f}ms")

    return unique
302
+
303
+
304
def _dedupe_links(links: list[dict]) -> list[dict]:
    """Drop entries whose normalized "link" was already seen.

    The first occurrence of each link wins and the input order is kept.
    Entries whose link normalizes to an empty string are dropped as well.
    """
    seen_keys: set[str] = set()
    kept = []
    for entry in links:
        key = _normalize_for_dedup(entry.get("link", ""))
        if not key or key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(entry)
    return kept
314
+
315
+
316
def get_search_results(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Backward-compatible entry point; delegates to get_search_results_async()."""
    links = get_search_results_async(query, max_workers)
    return links