voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Circuit breaker for search engine resilience using Redis.
|
|
3
|
+
|
|
4
|
+
Provides shared, persistent state across Uvicorn workers:
|
|
5
|
+
- circuit:{engine_name}:failures — integer counter
|
|
6
|
+
- circuit:{engine_name}:last_success — Unix timestamp
- circuit:{engine_name}:last_failure — Unix timestamp written when the circuit opens
|
|
7
|
+
- circuit:{engine_name}:state — "closed" | "open" | "half_open"
|
|
8
|
+
|
|
9
|
+
Gracefully degrades to in-memory dict if Redis is unavailable.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import time
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
import redis.asyncio as redis
|
|
17
|
+
|
|
18
|
+
from config import REDIS_URL
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Number of failures recorded since the last success at which a circuit opens.
FAILURE_THRESHOLD = 8
# Seconds a circuit stays "open" before is_open() moves it to "half_open".
OPEN_DURATION_SECONDS = 900
# Compared against the time since the last failure while a circuit is half-open.
HALF_OPEN_TEST_INTERVAL = 60
# NOTE(review): not referenced by any function visible in this module.
HALF_OPEN_MAX_ATTEMPTS = 2

# Prefix shared by every Redis key this module reads or writes.
CIRCUIT_PREFIX = "circuit:"

# Created lazily by _get_redis() and released by close().
_pool: Optional[redis.ConnectionPool] = None
_redis_client: Optional[redis.Redis] = None
# Set to True only after a successful Redis ping in _get_redis().
_circuit_breaker_enabled = False

# In-memory fallback state, keyed by engine name; used when Redis is unavailable.
_engine_failures: dict[str, int] = {}
_engine_last_success: dict[str, float] = {}
_engine_state: dict[str, str] = {}
_engine_open_time: dict[str, float] = {}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def _get_redis() -> Optional[redis.Redis]:
    """Return the shared Redis client, creating and pinging it on first use.

    Returns:
        The cached client, or ``None`` when ``REDIS_URL`` is not configured or
        the connection attempt fails. In both ``None`` cases
        ``_circuit_breaker_enabled`` is set to ``False`` so callers use the
        in-memory fallback.
    """
    global _pool, _redis_client, _circuit_breaker_enabled

    if REDIS_URL is None:
        _circuit_breaker_enabled = False
        logger.warning("REDIS_URL not configured - circuit breaker using in-memory fallback")
        return None

    if _redis_client is not None:
        return _redis_client

    # Build into locals first so the module globals are only published once
    # the ping has succeeded.
    pool: Optional[redis.ConnectionPool] = None
    try:
        pool = redis.ConnectionPool.from_url(
            REDIS_URL,
            decode_responses=True,
        )
        client = redis.Redis(connection_pool=pool)
        await client.ping()
    except Exception as e:
        logger.warning("Failed to connect to Redis: %s - circuit breaker using in-memory fallback", e)
        if pool is not None:
            # Release the half-initialised pool instead of leaving it behind;
            # the next call builds a fresh one.
            try:
                await pool.disconnect()
            except Exception:
                logger.debug("Failed to disconnect unused Redis pool", exc_info=True)
        _pool = None
        _redis_client = None
        _circuit_breaker_enabled = False
        return None

    _pool = pool
    _redis_client = client
    _circuit_breaker_enabled = True
    logger.info("Circuit breaker enabled via Redis")
    return _redis_client
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def record_failure(engine_name: str) -> None:
    """
    Record a failure for the given engine. Opens circuit after FAILURE_THRESHOLD failures.

    Uses Redis when available; otherwise, or when any Redis call raises, the
    failure is recorded in the in-memory fallback instead.

    Args:
        engine_name: Name used to build the engine's ``circuit:`` keys.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        _fallback_record_failure(engine_name)
        return

    failure_key = f"{CIRCUIT_PREFIX}{engine_name}:failures"
    state_key = f"{CIRCUIT_PREFIX}{engine_name}:state"
    last_failure_key = f"{CIRCUIT_PREFIX}{engine_name}:last_failure"

    try:
        failures = await client.incr(failure_key)
        logger.debug("Engine %s failures: %s", engine_name, failures)

        if failures >= FAILURE_THRESHOLD:
            # Write the timestamp before the state: if the second write fails,
            # the circuit is never left "open" without the timestamp that
            # is_open() needs to time it out.
            await client.set(last_failure_key, str(time.time()))
            await client.set(state_key, "open")
            logger.warning("Circuit opened for %s after %s failures", engine_name, failures)
    except Exception as e:
        # Lazy %-args: an engine name containing "%" cannot break formatting.
        logger.error("Failed to record failure for %s: %s", engine_name, e)
        _fallback_record_failure(engine_name)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def record_success(engine_name: str) -> None:
    """
    Record a success for the given engine. Resets failure counter and closes circuit.

    Writes the failure counter, state and last-success timestamp to Redis; when
    Redis is unavailable or a write raises, the in-memory fallback is updated
    instead.
    """
    redis_client = await _get_redis()

    redis_usable = redis_client is not None and _circuit_breaker_enabled
    if not redis_usable:
        _fallback_record_success(engine_name)
        return

    try:
        key_base = f"{CIRCUIT_PREFIX}{engine_name}"

        await redis_client.set(f"{key_base}:failures", "0")
        await redis_client.set(f"{key_base}:state", "closed")
        await redis_client.set(f"{key_base}:last_success", str(time.time()))
        logger.debug(f"Circuit closed for {engine_name}")
    except Exception as e:
        logger.error(f"Failed to record success for {engine_name}: %s", e)
        _fallback_record_success(engine_name)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
async def is_open(engine_name: str) -> bool:
    """
    Check if circuit is open for the given engine.
    Auto-transitions from open -> half_open after OPEN_DURATION_SECONDS.

    "closed" and "half_open" both report ``False`` so requests go through;
    record_success() is what moves a half-open circuit back to "closed".

    Args:
        engine_name: Name used to build the engine's ``circuit:`` keys.

    Returns:
        ``True`` only while the circuit is "open" and its open period has not
        elapsed. Falls back to the in-memory state when Redis is unavailable
        or a Redis call raises.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        return _fallback_is_open(engine_name)

    state_key = f"{CIRCUIT_PREFIX}{engine_name}:state"
    last_failure_key = f"{CIRCUIT_PREFIX}{engine_name}:last_failure"

    try:
        state = await client.get(state_key) or "closed"
        if state != "open":
            # No write here: re-setting "half_open" could overwrite a
            # concurrent transition to "closed".
            return False

        last_failure = await client.get(last_failure_key)
        # A missing timestamp counts as "opened at time 0", matching
        # _fallback_is_open(); otherwise the circuit would stay open forever.
        opened_at = float(last_failure) if last_failure else 0.0
        if time.time() - opened_at < OPEN_DURATION_SECONDS:
            return True

        await client.set(state_key, "half_open")
        logger.info("Circuit for %s transitioned to half_open", engine_name)
        return False
    except Exception as e:
        logger.error("Failed to check circuit state for %s: %s", engine_name, e)
        return _fallback_is_open(engine_name)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
async def get_all_states() -> dict:
    """
    Get the current state of all circuit breakers.
    Returns dict mapping engine_name to {state, failures, last_success}.

    ``failures`` is an int; ``state`` and ``last_success`` are returned as read
    from Redis (``last_success`` is ``None`` when never recorded). When Redis
    is unavailable, or any call raises, the in-memory fallback snapshot is
    returned instead and partial Redis results are discarded.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        return _fallback_get_all_states()

    state_suffix = ":state"
    result = {}
    try:
        # SCAN-based iteration avoids the single blocking KEYS call.
        async for key in client.scan_iter(match=f"{CIRCUIT_PREFIX}*{state_suffix}"):
            # Strip only the leading prefix and trailing suffix, so an engine
            # name that itself contains "circuit:" or ":state" is preserved.
            engine_name = key.removeprefix(CIRCUIT_PREFIX).removesuffix(state_suffix)
            state = await client.get(key) or "closed"
            failures = await client.get(f"{CIRCUIT_PREFIX}{engine_name}:failures") or "0"
            last_success = await client.get(f"{CIRCUIT_PREFIX}{engine_name}:last_success")

            result[engine_name] = {
                "state": state,
                "failures": int(failures),
                "last_success": last_success,
            }
    except Exception as e:
        logger.error("Failed to get circuit states: %s", e)
        return _fallback_get_all_states()

    return result
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _fallback_record_failure(engine_name: str) -> None:
    """Count a failure in process memory and open the circuit at the threshold."""
    count = _engine_failures.get(engine_name, 0) + 1
    _engine_failures[engine_name] = count

    if count < FAILURE_THRESHOLD:
        return

    _engine_state[engine_name] = "open"
    _engine_open_time[engine_name] = time.time()
    logger.warning(f"[Fallback] Circuit opened for {engine_name}")
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _fallback_record_success(engine_name: str) -> None:
    """Reset the in-memory failure count, stamp the success time and close the circuit."""
    _engine_failures.update({engine_name: 0})
    _engine_last_success.update({engine_name: time.time()})
    _engine_state.update({engine_name: "closed"})
    logger.debug(f"[Fallback] Circuit closed for {engine_name}")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _fallback_is_open(engine_name: str) -> bool:
    """Report whether the in-memory circuit for *engine_name* is open.

    An "open" circuit whose open period has elapsed is moved to "half_open"
    and reported as not open. Every other state reports ``False``.
    """
    if _engine_state.get(engine_name, "closed") != "open":
        return False

    # A missing open time defaults to 0, so the circuit counts as expired.
    opened_at = _engine_open_time.get(engine_name, 0)
    if time.time() - opened_at < OPEN_DURATION_SECONDS:
        return True

    _engine_state[engine_name] = "half_open"
    logger.info(f"[Fallback] Circuit for {engine_name} transitioned to half_open")
    return False
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _fallback_get_all_states() -> dict:
    """Snapshot the in-memory circuit data for every engine that has a state."""
    return {
        name: {
            "state": current_state,
            "failures": _engine_failures.get(name, 0),
            "last_success": str(_engine_last_success.get(name, 0)),
        }
        for name, current_state in _engine_state.items()
    }
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def close() -> None:
    """Close the Redis client, then disconnect the connection pool.

    Each global is reset to ``None`` only after its own close call returns.
    """
    global _pool, _redis_client

    client = _redis_client
    if client is not None:
        await client.aclose()
        _redis_client = None

    pool = _pool
    if pool is not None:
        await pool.disconnect()
        _pool = None
|
search/search.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import random
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import aiohttp
|
|
11
|
+
import requests
|
|
12
|
+
from aiohttp_socks import ProxyConnector
|
|
13
|
+
from bs4 import BeautifulSoup
|
|
14
|
+
|
|
15
|
+
from config import TOR_PROXY_HOST, TOR_PROXY_PORT
|
|
16
|
+
from search.circuit_breaker import record_failure, record_success, is_open
|
|
17
|
+
from utils.async_utils import run_async
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Seconds allowed for one engine's whole fetch (asyncio.wait_for in _search_async).
ENGINE_TIMEOUT = 30

# Ranking weights, matched by substring against the lower-cased engine name in
# get_search_results_async(); engines matching no key get 0.5 there.
ENGINE_WEIGHTS = {
    "darksearch": 1.0,
    "ahmia": 0.9,
    "torch": 0.7,
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _normalize_for_dedup(url: str) -> str:
|
|
31
|
+
url = url.lower().rstrip("/")
|
|
32
|
+
url = url.replace("https://", "http://")
|
|
33
|
+
return url
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Browser User-Agent strings; _fetch_engine() picks one at random per call.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54"
]

# Engines queried by _search_async(). Each "url" is a str.format template with
# a single {query} placeholder; "name" is also the circuit-breaker key.
SEARCH_ENGINES = [
    # confirmed working (zero failures in QA Run 5)
    {"name": "Ahmia (Clearnet Proxy)", "url": "https://ahmia.fi/search/?q={query}"},
    {"name": "Ahmia", "url": "http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?q={query}"},
    {"name": "Torland", "url": "http://torlbmqwtudkorme6prgfpmsnile7ug2zm4u3ejpcncxuhpu4k2j4kyd.onion/index.php?a=search&q={query}"},
    {"name": "OnionLand", "url": "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion/search?q={query}"},
    {"name": "Find Tor", "url": "http://findtorroveq5wdnipkaojfpqulxnkhblymc7aramjzajcvpptd4rjqd.onion/search?q={query}"},
    {"name": "TorNet", "url": "http://tornetupfu7gcgidt33ftnungxzyfq2pygui5qdoyss34xbgx2qruzid.onion/search?q={query}"},
    {"name": "Excavator", "url": "http://2fd6cemt4gmccflhm6imvdfvli3nf7zn6rfrwpsy7uhxrgbypvwf5fad.onion/search?query={query}"},
    # unverified - may be intermittent (2 failures in QA Run 5)
    {"name": "Torgle", "url": "http://iy3544gmoeclh5de6gez2256v6pjh4omhpqdh2wpeeppjtvqmjhkfwad.onion/torgle/?query={query}"},
    {"name": "The Deep Searches", "url": "http://searchgf7gdtauh7bhnbyed4ivxqmuoat3nm6zfrg3ymkq6mtnpye3ad.onion/search?q={query}"},
    {"name": "Torgol", "url": "http://torgolnpeouim56dykfob6jh5r2ps2j73enc42s2um4ufob3ny4fcdyd.onion/?q={query}"},
    {"name": "Onionway", "url": "http://oniwayzz74cv2puhsgx4dpjwieww4wdphsydqvf5q7eyz4myjvyw26ad.onion/search.php?s={query}"},
    {"name": "Tor66", "url": "http://tor66sewebgixwhcqfnp5inzp5x5uohhdy3kvtnyfxc2e5mxiuh34iid.onion/search?q={query}"},
]

# URL templates only, in the same order as SEARCH_ENGINES.
DEFAULT_SEARCH_ENGINES = [e["url"] for e in SEARCH_ENGINES]

# Matches an http(s) URL whose host ends in ".onion", with an optional path.
_ONION_URL_RE = re.compile(r'https?://[a-z0-9._-]+\.onion(?:/[^\s"\'<>]*)?', re.IGNORECASE)

# Default size of the semaphore that bounds concurrent engine fetches.
MAX_CONCURRENT = 10
# Total timeout, in seconds, for a single HTTP request.
SEARCH_TIMEOUT = 30
# Extra attempts after the first; _fetch_engine() tries ENGINE_RETRY_COUNT + 1 times.
ENGINE_RETRY_COUNT = 2

# NOTE(review): not referenced by any function visible in this module.
_ENGINE_STATUS: dict[str, dict] = {}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class EngineResult:
    """Outcome of querying one search engine."""

    # Engine name, copied from the SEARCH_ENGINES entry.
    name: str
    # Result links; the producers in this module emit dicts with "title" and "link" keys.
    links: list[dict]
    # Failure description (e.g. "timeout", "HTTP 503"); None on success.
    error: Optional[str] = None
    # Elapsed time in milliseconds; stays 0 when the producer does not record it.
    took_ms: int = 0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _get_tor_session():
    """Create a requests session whose HTTP and HTTPS traffic uses the configured socks5h proxy."""
    proxy_url = f"socks5h://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    tor_session = requests.Session()
    tor_session.proxies = {"http": proxy_url, "https": proxy_url}
    return tor_session
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _is_onion_url(url: str) -> bool:
    """Return True when *url* contains an http(s) .onion URL anywhere in it."""
    return _ONION_URL_RE.search(url) is not None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _tor_aiohttp_connector() -> ProxyConnector:
    """Build an aiohttp-socks connector for the configured SOCKS5 proxy.

    DNS is resolved through the proxy (``rdns=True``); the pool is capped at
    10 connections overall and 2 per host.
    """
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    connector_options = {"rdns": True, "limit": 10, "limit_per_host": 2}
    return ProxyConnector.from_url(proxy_url, **connector_options)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def fetch_with_timeout(
    url: str,
    session: aiohttp.ClientSession,
) -> aiohttp.ClientResponse:
    """GET *url* on *session* with a total timeout of SEARCH_TIMEOUT seconds.

    The response is returned as-is, not inside a context manager.
    """
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    response = await session.get(url, timeout=request_timeout)
    return response
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
async def _fetch_engine(
    engine: dict,
    query: str,
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> EngineResult:
    """Query one search engine and parse its result links.

    Makes up to ``ENGINE_RETRY_COUNT + 1`` attempts, backing off
    ``0.5 * attempt_number`` seconds after any non-200 status, timeout or other
    exception. Failures are reported via ``EngineResult.error`` rather than
    raised.

    Args:
        engine: Dict with "name" and a "url" template containing ``{query}``.
        query: Search text substituted into the template as given.
        session: Shared aiohttp session used for the request.
        semaphore: Bounds how many engines are fetched concurrently.

    Returns:
        An ``EngineResult`` whose ``took_ms`` is the time spent in this call,
        including any wait for the semaphore.
    """
    import json

    url = engine["url"].format(query=query)
    name = engine["name"]
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    started = time.monotonic()

    def _result(links: list, error: Optional[str] = None) -> EngineResult:
        """Build the EngineResult, stamping the elapsed time."""
        took_ms = int((time.monotonic() - started) * 1000)
        return EngineResult(name=name, links=links, error=error, took_ms=took_ms)

    async with semaphore:
        for attempt in range(ENGINE_RETRY_COUNT + 1):
            try:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)) as resp:
                    if resp.status != 200:
                        failure = f"HTTP {resp.status}"
                    else:
                        text = await resp.text()

                        # DarkSearch's API returns JSON rather than an HTML page.
                        if "darksearch.io/api" in url:
                            try:
                                data = json.loads(text)
                                links = [
                                    {"title": hit.get("title", "No Title"), "link": hit.get("onion")}
                                    for hit in data.get("data", [])
                                    if hit.get("onion")
                                ]
                            except Exception as e:
                                return _result([], f"JSON parse: {e}")
                            return _result(links)

                        return _result(_parse_html_links(text, url))
            except asyncio.TimeoutError:
                failure = "timeout"
            except Exception as e:
                failure = str(e)

            # Reached only after a failed attempt. Backing off here, outside
            # the response context, releases the connection before the sleep.
            if attempt < ENGINE_RETRY_COUNT:
                await asyncio.sleep(0.5 * (attempt + 1))
                continue
            return _result([], failure)

        # Only reachable if the retry loop runs zero times.
        return _result([], "max retries")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _parse_html_links(html: str, base_url: str) -> list[dict]:
    """Extract .onion result links from a search engine result page.

    Handles three common formats:
    - Direct href:     <a href="http://x.onion/path">
    - Redirect param:  <a href="/results?url=http://x.onion/path">
    - Plain text:      URLs mentioned in body text but not hyperlinked

    Links whose host equals the host of *base_url* are skipped, and links are
    de-duplicated case-insensitively, ignoring trailing slashes. Parsing is
    best-effort: on an unexpected error the links gathered so far are returned
    and the error is logged at debug level.

    Args:
        html: Raw page markup.
        base_url: URL the page was fetched from.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts. Anchor titles are
        truncated to 200 characters; plain-text matches use their host as the
        title.
    """
    from urllib.parse import urlparse, parse_qs, unquote  # noqa: PLC0415

    links: list[dict] = []
    seen: set[str] = set()
    base_host = (urlparse(base_url).hostname or "").lower()

    def _add(url: str, title: Optional[str] = None) -> None:
        """Record *url* unless it is on the engine's own host or already seen."""
        host = (urlparse(url).hostname or "").lower()
        if host == base_host:
            return
        norm = url.lower().rstrip("/")
        if norm in seen:
            return
        seen.add(norm)
        links.append({"title": host if title is None else title, "link": url})

    try:
        soup = BeautifulSoup(html, "html.parser")

        for a in soup.find_all("a"):
            href = (a.get("href") or "").strip()
            title = a.get_text(strip=True)
            if not href or len(title) < 3:
                continue
            title = title[:200]

            # 1. Direct absolute .onion URL in href
            direct_matches = _ONION_URL_RE.findall(href)
            for match in direct_matches:
                _add(match, title)

            # 2. .onion URL hidden in a query parameter (redirect, url, link, site)
            #    and only visible after decoding.
            if ".onion" in href and not direct_matches:
                try:
                    qs = parse_qs(urlparse(href).query)
                    for param in ("url", "redirect", "link", "site", "address", "q"):
                        for val in qs.get(param, []):
                            decoded = unquote(val)
                            for match in _ONION_URL_RE.findall(decoded):
                                _add(match, title)
                except Exception:
                    # One malformed href must not abort the page.
                    logger.debug("Skipping unparsable href %r", href, exc_info=True)

        # 3. Any .onion URLs in the raw HTML text not captured via <a> tags
        for match in _ONION_URL_RE.findall(html):
            _add(match)

    except Exception:
        logger.debug("Link extraction aborted for %s", base_url, exc_info=True)

    return links
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
async def _search_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[EngineResult]:
    """Query every engine in SEARCH_ENGINES concurrently through the Tor proxy.

    Engines whose circuit breaker is open are skipped with
    ``error="circuit_open"``. Each engine is capped at ENGINE_TIMEOUT seconds.
    Successes and failures are reported to the circuit breaker, except that
    errors containing "HTTP 4" are logged but not counted as failures.

    Args:
        query: Search text passed to each engine.
        max_workers: Maximum number of engines fetched at the same time.

    Returns:
        One ``EngineResult`` per engine that produced a result; engine tasks
        that ended in an exception or cancellation are logged and omitted.
    """
    semaphore = asyncio.Semaphore(max_workers)

    connector = _tor_aiohttp_connector()
    async with aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=SEARCH_TIMEOUT),
    ) as session:

        async def fetch_and_record(engine: dict) -> EngineResult:
            """Fetch one engine and report the outcome to the circuit breaker."""
            name = engine["name"]
            result = await _fetch_engine(engine, query, session, semaphore)
            if result.error:
                if "HTTP 4" not in result.error:
                    await record_failure(name)
                logger.warning(f"Engine {name} failed: {result.error}")
            else:
                await record_success(name)
                if not result.links:
                    logger.warning(f"Engine {name} returned 0 results")
            return result

        async def run_engine(engine: dict) -> EngineResult:
            """Apply the circuit-breaker gate and overall timeout to one engine."""
            name = engine["name"]
            if await is_open(name):
                logger.warning(f"Skipping unhealthy engine: {name}")
                return EngineResult(name=name, links=[], error="circuit_open")

            try:
                return await asyncio.wait_for(fetch_and_record(engine), timeout=ENGINE_TIMEOUT)
            except asyncio.TimeoutError:
                await record_failure(name)
                logger.warning(f"Engine {name} timed out")
                return EngineResult(name=name, links=[], error="timeout")
            except Exception as e:
                await record_failure(name)
                logger.warning(f"Engine {name} exception: {e}")
                return EngineResult(name=name, links=[], error=str(e))

        tasks = [run_engine(e) for e in SEARCH_ENGINES]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        processed: list[EngineResult] = []
        for r in results:
            # BaseException, not Exception: with return_exceptions=True a
            # cancelled child is returned as a CancelledError, which must not
            # reach callers as if it were an EngineResult.
            if isinstance(r, BaseException):
                logger.warning(f"Engine task exception: {r}")
                continue
            processed.append(r)

        return processed
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def get_search_results_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Run the async engine search via run_async() and return ranked, de-duplicated links.

    Every link dict is tagged in place with "source_engine" and
    "source_weight". The result is de-duplicated and sorted by weight, highest
    first.
    """
    started = time.monotonic()

    engine_results = run_async(_search_async(query, max_workers))

    collected = []
    for engine_result in engine_results:
        lowered_name = engine_result.name.lower()
        # First ENGINE_WEIGHTS key found inside the engine name wins; else 0.5.
        weight = next(
            (value for known, value in ENGINE_WEIGHTS.items() if known in lowered_name),
            0.5,
        )
        for link in engine_result.links:
            link["source_engine"] = engine_result.name
            link["source_weight"] = weight
            collected.append(link)
        status = engine_result.error if engine_result.error else "ok"
        logger.debug(f"Engine {engine_result.name}: {len(engine_result.links)} links ({status})")

    unique = sorted(
        _dedupe_links(collected),
        key=lambda item: item.get("source_weight", 0.5),
        reverse=True,
    )

    elapsed = (time.monotonic() - started) * 1000
    logger.info(f"Search completed: {len(unique)} unique links in {elapsed:.0f}ms")

    return unique
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _dedupe_links(links: list[dict]) -> list[dict]:
    """Keep the first dict for each normalized "link" value, dropping entries with an empty link."""
    kept: list[dict] = []
    known_keys: set[str] = set()
    for entry in links:
        key = _normalize_for_dedup(entry.get("link", ""))
        if not key or key in known_keys:
            continue
        known_keys.add(key)
        kept.append(entry)
    return kept
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def get_search_results(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Backward-compatible alias that delegates to get_search_results_async()."""
    links = get_search_results_async(query, max_workers)
    return links
|
sources/__init__.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources — Phase 1D expanded source coverage + threat intelligence enrichment.
|
|
3
|
+
|
|
4
|
+
Public API:
|
|
5
|
+
collect_all_sources(query, ...) async — unified aggregator
|
|
6
|
+
enrich_investigation(query, otx_api_key) async — threat intel enrichment
|
|
7
|
+
|
|
8
|
+
Sub-modules:
|
|
9
|
+
engines.py — DarkSearch JSON API + OnionSearch HTML scraping
|
|
10
|
+
seeds.py — curated .onion seed URL list
|
|
11
|
+
pastes.py — .onion paste site monitor
|
|
12
|
+
telegram.py — Telegram public channel monitor (clearnet, optional)
|
|
13
|
+
enrichment.py — AlienVault OTX + Abuse.ch threat intelligence
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import logging
|
|
20
|
+
from typing import Dict, List, Optional
|
|
21
|
+
|
|
22
|
+
from sources.enrichment import enrich_investigation
|
|
23
|
+
|
|
24
|
+
_logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
__all__ = ["collect_all_sources", "enrich_investigation"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def collect_all_sources(
    query: str,
    include_telegram: bool = False,
    telegram_channels: Optional[List[str]] = None,
    seed_categories: Optional[List[str]] = None,
) -> Dict:
    """
    Gather every Phase 1D intelligence source for *query* into one dict.

    The two search engines (DarkSearch, OnionSearch) and the paste monitor are
    awaited together with asyncio.gather(). Telegram is fetched afterwards and
    only when *include_telegram* is true. Seeds are read synchronously last.

    Args:
        query:             investigation query string.
        include_telegram:  when True, also fetch matching Telegram messages.
        telegram_channels: channel usernames to monitor; ignored unless
                           include_telegram is True.
        seed_categories:   seed categories to include (e.g. ["forum",
                           "index"]); None or empty returns all categories.

    Returns dict with keys:
        "search_results"   list[dict]  — DarkSearch results followed by OnionSearch results
        "paste_results"    list[dict]  — from the paste site monitor
        "telegram_results" list[dict]  — from Telegram (empty if skipped)
        "seed_urls"        list[dict]  — from seeds.py, de-duplicated by "url"
                                         when categories are given
    """
    from sources.engines import search_darksearch, search_onionsearch
    from sources.pastes import fetch_recent_pastes
    from sources.seeds import get_seeds

    # --- Search + pastes run concurrently -----------------------------------
    engine_batch = asyncio.gather(
        search_darksearch(query),
        search_onionsearch(query),
    )
    engine_hits, paste_results = await asyncio.gather(
        engine_batch,
        fetch_recent_pastes(query),
    )
    darksearch_results, onionsearch_results = engine_hits
    search_results: List[dict] = darksearch_results + onionsearch_results

    # --- Telegram (optional, sequential after gather) -----------------------
    telegram_results: List[dict] = []
    if include_telegram:
        from sources.telegram import fetch_telegram_messages

        channels = telegram_channels or []
        telegram_results = await fetch_telegram_messages(
            channel_usernames=channels,
            query=query,
        )

    # --- Seeds (synchronous) ------------------------------------------------
    if not seed_categories:
        seeds = get_seeds()
    else:
        # Dict insertion order keeps the first seed seen for each URL.
        first_by_url: Dict = {}
        for category in seed_categories:
            for seed in get_seeds(category=category):
                first_by_url.setdefault(seed["url"], seed)
        seeds = list(first_by_url.values())

    return {
        "search_results": search_results,
        "paste_results": paste_results,
        "telegram_results": telegram_results,
        "seed_urls": seeds,
    }
|