voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,240 @@
1
+ """
2
+ Circuit breaker for search engine resilience using Redis.
3
+
4
+ Provides shared, persistent state across Uvicorn workers:
5
+ - circuit:{engine_name}:failures — integer counter
6
+ - circuit:{engine_name}:last_success — Unix timestamp
7
+ - circuit:{engine_name}:state — "closed" | "open" | "half_open"
8
+
9
+ Gracefully degrades to in-memory dict if Redis is unavailable.
10
+ """
11
+
12
+ import logging
13
+ import time
14
+ from typing import Optional
15
+
16
+ import redis.asyncio as redis
17
+
18
+ from config import REDIS_URL
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ FAILURE_THRESHOLD = 8
23
+ OPEN_DURATION_SECONDS = 900
24
+ HALF_OPEN_TEST_INTERVAL = 60
25
+ HALF_OPEN_MAX_ATTEMPTS = 2
26
+
27
+ CIRCUIT_PREFIX = "circuit:"
28
+
29
+ _pool: Optional[redis.ConnectionPool] = None
30
+ _redis_client: Optional[redis.Redis] = None
31
+ _circuit_breaker_enabled = False
32
+
33
+ _engine_failures: dict[str, int] = {}
34
+ _engine_last_success: dict[str, float] = {}
35
+ _engine_state: dict[str, str] = {}
36
+ _engine_open_time: dict[str, float] = {}
37
+
38
+
39
async def _get_redis() -> Optional[redis.Redis]:
    """Return the shared Redis client, creating and verifying it on first use.

    Returns:
        The cached client once a ``PING`` has succeeded, or ``None`` when
        ``REDIS_URL`` is not configured or the connection attempt failed.
        In both ``None`` cases ``_circuit_breaker_enabled`` is set to False so
        callers use the in-memory fallback.
    """
    global _pool, _redis_client, _circuit_breaker_enabled

    if REDIS_URL is None:
        _circuit_breaker_enabled = False
        logger.warning("REDIS_URL not configured - circuit breaker using in-memory fallback")
        return None

    if _redis_client is not None:
        return _redis_client

    # Build into locals first so the module globals are only published after
    # the connection has been verified.
    pool = None
    try:
        pool = redis.ConnectionPool.from_url(
            REDIS_URL,
            decode_responses=True,
        )
        client = redis.Redis(connection_pool=pool)
        await client.ping()
    except Exception as e:
        logger.warning("Failed to connect to Redis: %s - circuit breaker using in-memory fallback", e)
        _redis_client = None
        _pool = None
        _circuit_breaker_enabled = False
        if pool is not None:
            # Do not leave a half-initialised pool behind; the next call
            # creates a fresh one.
            try:
                await pool.disconnect()
            except Exception:
                logger.debug("Ignoring error while discarding Redis pool", exc_info=True)
        return None

    _pool = pool
    _redis_client = client
    _circuit_breaker_enabled = True
    logger.info("Circuit breaker enabled via Redis")
    return _redis_client
63
+
64
+
65
async def record_failure(engine_name: str) -> None:
    """Record a failure for *engine_name*; open its circuit at FAILURE_THRESHOLD.

    Uses the Redis counter ``circuit:{engine_name}:failures`` when Redis is
    available, otherwise (or when a Redis call raises) the in-memory fallback.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        _fallback_record_failure(engine_name)
        return

    try:
        failure_key = f"{CIRCUIT_PREFIX}{engine_name}:failures"
        state_key = f"{CIRCUIT_PREFIX}{engine_name}:state"
        last_failure_key = f"{CIRCUIT_PREFIX}{engine_name}:last_failure"

        failures = await client.incr(failure_key)
        logger.debug("Engine %s failures: %s", engine_name, failures)

        if failures >= FAILURE_THRESHOLD:
            # Write the timestamp before the state so a concurrent is_open()
            # never observes an "open" circuit without its opening time.
            await client.set(last_failure_key, str(time.time()))
            await client.set(state_key, "open")
            logger.warning("Circuit opened for %s after %s failures", engine_name, failures)
    except Exception as e:
        # Pure %-style arguments: the engine name is never interpreted as
        # part of the format string.
        logger.error("Failed to record failure for %s: %s", engine_name, e)
        _fallback_record_failure(engine_name)
89
+
90
+
91
async def record_success(engine_name: str) -> None:
    """Record a success for *engine_name*: reset its failures and close the circuit.

    Writes ``failures`` = "0", ``state`` = "closed" and ``last_success`` = now
    under the ``circuit:{engine_name}:`` prefix when Redis is available;
    otherwise (or when a Redis call raises) updates the in-memory fallback.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        _fallback_record_success(engine_name)
        return

    try:
        failure_key = f"{CIRCUIT_PREFIX}{engine_name}:failures"
        state_key = f"{CIRCUIT_PREFIX}{engine_name}:state"
        success_key = f"{CIRCUIT_PREFIX}{engine_name}:last_success"

        await client.set(failure_key, "0")
        await client.set(state_key, "closed")
        await client.set(success_key, str(time.time()))
        logger.debug("Circuit closed for %s", engine_name)
    except Exception as e:
        # Pure %-style arguments: the engine name is never interpreted as
        # part of the format string.
        logger.error("Failed to record success for %s: %s", engine_name, e)
        _fallback_record_success(engine_name)
113
+
114
+
115
async def is_open(engine_name: str) -> bool:
    """Return True when the circuit for *engine_name* is open (engine should be skipped).

    An open circuit moves to ``half_open`` once OPEN_DURATION_SECONDS have
    passed since the recorded ``last_failure`` timestamp, after which requests
    are allowed again. ``closed`` and ``half_open`` circuits always return
    False. Falls back to the in-memory state when Redis is unavailable or a
    Redis call raises.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        return _fallback_is_open(engine_name)

    try:
        state_key = f"{CIRCUIT_PREFIX}{engine_name}:state"
        last_failure_key = f"{CIRCUIT_PREFIX}{engine_name}:last_failure"

        state = await client.get(state_key) or "closed"
        if state != "open":
            # "closed" and "half_open" both let traffic through.
            return False

        last_failure = await client.get(last_failure_key)
        # A missing timestamp is treated as "opened long ago" (same default as
        # the in-memory fallback); otherwise the circuit could never leave
        # the open state, because skipped engines never record a success.
        opened_at = float(last_failure) if last_failure else 0.0
        if time.time() - opened_at >= OPEN_DURATION_SECONDS:
            await client.set(state_key, "half_open")
            logger.info("Circuit for %s transitioned to half_open", engine_name)
            return False
        return True
    except Exception as e:
        logger.error("Failed to check circuit state for %s: %s", engine_name, e)
        return _fallback_is_open(engine_name)
155
+
156
+
157
async def get_all_states() -> dict:
    """Return the current state of every known circuit breaker.

    Returns:
        Dict mapping engine name to ``{"state", "failures", "last_success"}``.
        ``last_success`` is the stored timestamp string, or ``None`` when no
        success has been recorded. Falls back to the in-memory state when
        Redis is unavailable or a Redis call raises.
    """
    client = await _get_redis()

    if client is None or not _circuit_breaker_enabled:
        return _fallback_get_all_states()

    state_suffix = ":state"
    result = {}
    try:
        # SCAN iterates incrementally instead of blocking the server like KEYS.
        # It may yield a key more than once; the dict assignment absorbs that.
        async for key in client.scan_iter(match=f"{CIRCUIT_PREFIX}*{state_suffix}"):
            # Strip only the leading prefix and trailing suffix so engine
            # names containing those substrings are preserved.
            engine_name = key.removeprefix(CIRCUIT_PREFIX).removesuffix(state_suffix)
            state = await client.get(key) or "closed"
            failures = await client.get(f"{CIRCUIT_PREFIX}{engine_name}:failures") or "0"
            last_success = await client.get(f"{CIRCUIT_PREFIX}{engine_name}:last_success")

            result[engine_name] = {
                "state": state,
                "failures": int(failures),
                "last_success": last_success,
            }
    except Exception as e:
        logger.error("Failed to get circuit states: %s", e)
        return _fallback_get_all_states()

    return result
186
+
187
+
188
def _fallback_record_failure(engine_name: str) -> None:
    """In-memory variant of record_failure: bump the counter, open at threshold."""
    count = _engine_failures.get(engine_name, 0) + 1
    _engine_failures[engine_name] = count

    if count < FAILURE_THRESHOLD:
        return

    _engine_state[engine_name] = "open"
    _engine_open_time[engine_name] = time.time()
    logger.warning(f"[Fallback] Circuit opened for {engine_name}")
194
+
195
+
196
def _fallback_record_success(engine_name: str) -> None:
    """In-memory variant of record_success: reset the counter and close the circuit."""
    now = time.time()
    _engine_failures[engine_name] = 0
    _engine_last_success[engine_name] = now
    _engine_state[engine_name] = "closed"
    logger.debug(f"[Fallback] Circuit closed for {engine_name}")
201
+
202
+
203
def _fallback_is_open(engine_name: str) -> bool:
    """In-memory variant of is_open.

    Only the "open" state can return True; it flips to "half_open" (and
    returns False) once OPEN_DURATION_SECONDS have elapsed since it opened.
    """
    if _engine_state.get(engine_name, "closed") != "open":
        # closed and half_open both allow requests
        return False

    opened_at = _engine_open_time.get(engine_name, 0)
    if time.time() - opened_at < OPEN_DURATION_SECONDS:
        return True

    _engine_state[engine_name] = "half_open"
    logger.info(f"[Fallback] Circuit for {engine_name} transitioned to half_open")
    return False
218
+
219
+
220
def _fallback_get_all_states() -> dict:
    """In-memory variant of get_all_states.

    Returns:
        Dict mapping engine name to ``{"state", "failures", "last_success"}``.
        ``last_success`` is the timestamp as a string, or ``None`` when the
        engine has never succeeded; this matches the Redis-backed path, which
        yields ``None`` for a missing key rather than a fake "0" timestamp.
    """
    result = {}
    for engine_name, state in _engine_state.items():
        last_success = _engine_last_success.get(engine_name)
        result[engine_name] = {
            "state": state,
            "failures": _engine_failures.get(engine_name, 0),
            "last_success": None if last_success is None else str(last_success),
        }
    return result
229
+
230
+
231
async def close() -> None:
    """Shut down the cached Redis client and its connection pool, if present.

    Each module-level handle is reset to ``None`` after it has been released,
    so a later _get_redis() call builds a fresh connection.
    """
    global _pool, _redis_client

    # Release the client first, then the pool it was created on.
    if _redis_client is not None:
        await _redis_client.aclose()
        _redis_client = None

    if _pool is not None:
        await _pool.disconnect()
        _pool = None
search/search.py ADDED
@@ -0,0 +1,334 @@
1
+ import asyncio
2
+ import logging
3
+ import random
4
+ import re
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+ import aiohttp
11
+ import requests
12
+ from aiohttp_socks import ProxyConnector
13
+ from bs4 import BeautifulSoup
14
+
15
+ from config import TOR_PROXY_HOST, TOR_PROXY_PORT
16
+ from search.circuit_breaker import record_failure, record_success, is_open
17
+ from utils.async_utils import run_async
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ ENGINE_TIMEOUT = 30
22
+
23
+ ENGINE_WEIGHTS = {
24
+ "darksearch": 1.0,
25
+ "ahmia": 0.9,
26
+ "torch": 0.7,
27
+ }
28
+
29
+
30
+ def _normalize_for_dedup(url: str) -> str:
31
+ url = url.lower().rstrip("/")
32
+ url = url.replace("https://", "http://")
33
+ return url
34
+
35
+
36
+ USER_AGENTS = [
37
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
38
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
39
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
40
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
41
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
42
+ "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
43
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
44
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
45
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54"
46
+ ]
47
+
48
+ SEARCH_ENGINES = [
49
+ # confirmed working (zero failures in QA Run 5)
50
+ {"name": "Ahmia (Clearnet Proxy)", "url": "https://ahmia.fi/search/?q={query}"},
51
+ {"name": "Ahmia", "url": "http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?q={query}"},
52
+ {"name": "Torland", "url": "http://torlbmqwtudkorme6prgfpmsnile7ug2zm4u3ejpcncxuhpu4k2j4kyd.onion/index.php?a=search&q={query}"},
53
+ {"name": "OnionLand", "url": "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion/search?q={query}"},
54
+ {"name": "Find Tor", "url": "http://findtorroveq5wdnipkaojfpqulxnkhblymc7aramjzajcvpptd4rjqd.onion/search?q={query}"},
55
+ {"name": "TorNet", "url": "http://tornetupfu7gcgidt33ftnungxzyfq2pygui5qdoyss34xbgx2qruzid.onion/search?q={query}"},
56
+ {"name": "Excavator", "url": "http://2fd6cemt4gmccflhm6imvdfvli3nf7zn6rfrwpsy7uhxrgbypvwf5fad.onion/search?query={query}"},
57
+ # unverified - may be intermittent (2 failures in QA Run 5)
58
+ {"name": "Torgle", "url": "http://iy3544gmoeclh5de6gez2256v6pjh4omhpqdh2wpeeppjtvqmjhkfwad.onion/torgle/?query={query}"},
59
+ {"name": "The Deep Searches", "url": "http://searchgf7gdtauh7bhnbyed4ivxqmuoat3nm6zfrg3ymkq6mtnpye3ad.onion/search?q={query}"},
60
+ {"name": "Torgol", "url": "http://torgolnpeouim56dykfob6jh5r2ps2j73enc42s2um4ufob3ny4fcdyd.onion/?q={query}"},
61
+ {"name": "Onionway", "url": "http://oniwayzz74cv2puhsgx4dpjwieww4wdphsydqvf5q7eyz4myjvyw26ad.onion/search.php?s={query}"},
62
+ {"name": "Tor66", "url": "http://tor66sewebgixwhcqfnp5inzp5x5uohhdy3kvtnyfxc2e5mxiuh34iid.onion/search?q={query}"},
63
+ ]
64
+
65
+ DEFAULT_SEARCH_ENGINES = [e["url"] for e in SEARCH_ENGINES]
66
+
67
+ _ONION_URL_RE = re.compile(r'https?://[a-z0-9._-]+\.onion(?:/[^\s"\'<>]*)?', re.IGNORECASE)
68
+
69
+ MAX_CONCURRENT = 10
70
+ SEARCH_TIMEOUT = 30
71
+ ENGINE_RETRY_COUNT = 2
72
+
73
+ _ENGINE_STATUS: dict[str, dict] = {}
74
+
75
+
76
@dataclass
class EngineResult:
    """Outcome of querying a single search engine."""

    # Engine display name (the "name" entry of a SEARCH_ENGINES item).
    name: str
    # Result entries; the fetch code stores dicts with "title" and "link" keys.
    links: list[dict]
    # Short failure description, or None when the request succeeded.
    error: Optional[str] = None
    # Defaults to 0.  NOTE(review): presumably elapsed milliseconds - confirm where it is set.
    took_ms: int = 0
82
+
83
+
84
def _get_tor_session():
    """Create a ``requests.Session`` whose http and https traffic uses the Tor SOCKS proxy.

    The ``socks5h`` scheme makes the proxy resolve hostnames.
    """
    proxy_url = f"socks5h://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    tor_session = requests.Session()
    tor_session.proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    return tor_session
91
+
92
+
93
def _is_onion_url(url: str) -> bool:
    """Return True when *url* contains an http(s) ``.onion`` URL anywhere in it."""
    return _ONION_URL_RE.search(url) is not None
95
+
96
+
97
def _tor_aiohttp_connector() -> ProxyConnector:
    """Build an aiohttp-socks connector for the Tor SOCKS5 proxy.

    Remote DNS is enabled (``rdns=True``) and the pool is capped at 10
    connections overall and 2 per host.
    """
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    connector = ProxyConnector.from_url(
        proxy_url,
        rdns=True,
        limit=10,
        limit_per_host=2,
    )
    return connector
105
+
106
+
107
async def fetch_with_timeout(
    url: str,
    session: aiohttp.ClientSession,
) -> aiohttp.ClientResponse:
    """GET *url* through *session* with a total timeout of SEARCH_TIMEOUT seconds.

    The response object is returned as-is; this function does not read or
    release it.
    """
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    response = await session.get(url, timeout=request_timeout)
    return response
113
+
114
+
115
async def _fetch_engine(
    engine: dict,
    query: str,
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> EngineResult:
    """Query one search engine and return the links parsed from its response.

    The request is retried up to ENGINE_RETRY_COUNT extra times on a non-200
    status, a timeout, or any other exception, sleeping 0.5 s * attempt number
    between attempts. The semaphore is held for the whole retry sequence.

    Args:
        engine: dict with "name" and a "url" template containing ``{query}``.
        query: search string substituted into the template as-is (no
            URL-encoding is applied here).
        session: aiohttp session used for the request.
        semaphore: limits how many engines are fetched concurrently.

    Returns:
        EngineResult with ``links`` on success, or an empty ``links`` list and
        ``error`` set to "HTTP <status>", "timeout", "JSON parse: ..." or the
        exception text.
    """
    url = engine["url"].format(query=query)
    name = engine["name"]

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    request_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)

    async with semaphore:
        for attempt in range(ENGINE_RETRY_COUNT + 1):
            try:
                async with session.get(url, headers=headers, timeout=request_timeout) as resp:
                    if resp.status == 200:
                        text = await resp.text()

                        if "darksearch.io/api" in url:
                            # JSON API: a malformed payload is not retried.
                            try:
                                import json
                                data = json.loads(text)
                                links = [
                                    {"title": hit.get("title", "No Title"), "link": hit.get("onion")}
                                    for hit in data.get("data", [])
                                    if hit.get("onion")
                                ]
                                return EngineResult(name=name, links=links)
                            except Exception as e:
                                return EngineResult(name=name, links=[], error=f"JSON parse: {e}")

                        links = _parse_html_links(text, url)
                        return EngineResult(name=name, links=links)

                    error = f"HTTP {resp.status}"
            except asyncio.TimeoutError:
                error = "timeout"
            except Exception as e:
                error = str(e)

            # Single back-off point for every failure kind; the response has
            # already been released by the time we sleep.
            if attempt >= ENGINE_RETRY_COUNT:
                return EngineResult(name=name, links=[], error=error)
            await asyncio.sleep(0.5 * (attempt + 1))

    # Only reachable if the retry loop runs zero times.
    return EngineResult(name=name, links=[], error="max retries")
171
+
172
+
173
def _parse_html_links(html: str, base_url: str) -> list[dict]:
    """Extract .onion result links from a search engine result page.

    Handles three common formats:
    - Direct href: <a href="http://x.onion/path">
    - Redirect param: <a href="/results?url=http%3A%2F%2Fx.onion%2Fpath">
    - Plain text: URLs mentioned in the raw HTML but not hyperlinked

    Links pointing at the search engine's own host are dropped, and links are
    de-duplicated case-insensitively ignoring a trailing "/".

    Args:
        html: raw HTML of the result page.
        base_url: URL the page was fetched from; its host is excluded.

    Returns:
        List of ``{"title", "link"}`` dicts. Parsing is best-effort: on an
        unexpected error the links collected so far are returned.
    """
    from urllib.parse import urlparse, parse_qs, unquote  # noqa: PLC0415

    links: list[dict] = []
    seen: set[str] = set()
    base_host = (urlparse(base_url).hostname or "").lower()

    def _add(url: str, title: Optional[str] = None) -> None:
        """Record *url* unless it is the engine's own host or already seen.

        When *title* is None the link's host name is used as the title.
        """
        host = (urlparse(url).hostname or "").lower()
        if host == base_host:
            return
        norm = url.lower().rstrip("/")
        if norm in seen:
            return
        seen.add(norm)
        links.append({"title": host if title is None else title[:200], "link": url})

    try:
        soup = BeautifulSoup(html, "html.parser")

        for a in soup.find_all("a"):
            href = (a.get("href") or "").strip()
            title = a.get_text(strip=True)
            if not href or len(title) < 3:
                continue

            # 1. Direct absolute .onion URL in href
            for match in _ONION_URL_RE.findall(href):
                _add(match, title)

            # 2. .onion URL hidden (URL-encoded) in a query parameter
            if ".onion" in href and not _ONION_URL_RE.search(href):
                try:
                    qs = parse_qs(urlparse(href).query)
                    for param in ("url", "redirect", "link", "site", "address", "q"):
                        for val in qs.get(param, []):
                            decoded = unquote(val)
                            for match in _ONION_URL_RE.findall(decoded):
                                _add(match, title)
                except Exception:
                    # One malformed href must not abort the whole page.
                    logger.debug("Skipping unparsable redirect href %r", href, exc_info=True)

        # 3. Any .onion URLs in the raw HTML text not captured via <a> tags
        for match in _ONION_URL_RE.findall(html):
            _add(match)

    except Exception:
        # Best-effort parser: keep what was collected, but leave a trace.
        logger.debug("Failed to fully parse result page from %s", base_url, exc_info=True)

    return links
234
+
235
+
236
async def _search_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[EngineResult]:
    """Query every engine in SEARCH_ENGINES concurrently through the Tor proxy.

    Engines whose circuit breaker is open are skipped. Each engine is bounded
    by ENGINE_TIMEOUT seconds overall, and its outcome is reported to the
    circuit breaker (HTTP 4xx errors are not counted as failures).

    Args:
        query: search string passed to each engine.
        max_workers: maximum number of engines fetched at the same time.

    Returns:
        One EngineResult per engine whose task completed; tasks that raised
        or were cancelled are logged and omitted.
    """
    semaphore = asyncio.Semaphore(max_workers)

    connector = _tor_aiohttp_connector()
    async with aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=SEARCH_TIMEOUT),
    ) as session:

        async def run_engine(engine: dict) -> EngineResult:
            name = engine["name"]
            if await is_open(name):
                logger.warning(f"Skipping unhealthy engine: {name}")
                return EngineResult(name=name, links=[], error="circuit_open")

            async def fetch_with_engine_session():
                result = await _fetch_engine(engine, query, session, semaphore)
                if result.error:
                    # Client errors (HTTP 4xx) do not count against engine health.
                    if "HTTP 4" not in result.error:
                        await record_failure(name)
                    logger.warning(f"Engine {name} failed: {result.error}")
                else:
                    await record_success(name)
                    if not result.links:
                        logger.warning(f"Engine {name} returned 0 results")
                return result

            try:
                return await asyncio.wait_for(fetch_with_engine_session(), timeout=ENGINE_TIMEOUT)
            except asyncio.TimeoutError:
                await record_failure(name)
                logger.warning(f"Engine {name} timed out")
                return EngineResult(name=name, links=[], error="timeout")
            except Exception as e:
                await record_failure(name)
                logger.warning(f"Engine {name} exception: {e}")
                return EngineResult(name=name, links=[], error=str(e))

        tasks = [run_engine(e) for e in SEARCH_ENGINES]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    processed: list[EngineResult] = []
    for r in results:
        # BaseException, not Exception: gather(return_exceptions=True) can
        # hand back CancelledError, which is not an Exception subclass and
        # would otherwise be returned to callers as if it were a result.
        if isinstance(r, BaseException):
            logger.warning(f"Engine task exception: {r}")
            continue
        processed.append(r)

    return processed
287
+
288
+
289
def get_search_results_async(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Run the engine search via ``run_async`` and return merged, ranked links.

    Every link dict is tagged in place with ``source_engine`` and
    ``source_weight`` (the first ENGINE_WEIGHTS key found in the lower-cased
    engine name, else 0.5), de-duplicated, and sorted by weight, highest first.
    """
    started = time.monotonic()

    engine_results = run_async(_search_async(query, max_workers))

    collected = []
    for engine_result in engine_results:
        lowered_name = engine_result.name.lower()
        weight = next(
            (value for key, value in ENGINE_WEIGHTS.items() if key in lowered_name),
            0.5,
        )
        for entry in engine_result.links:
            entry["source_engine"] = engine_result.name
            entry["source_weight"] = weight
            collected.append(entry)
        status = engine_result.error if engine_result.error else "ok"
        logger.debug(f"Engine {engine_result.name}: {len(engine_result.links)} links ({status})")

    unique = _dedupe_links(collected)
    unique.sort(key=lambda item: item.get("source_weight", 0.5), reverse=True)

    elapsed = (time.monotonic() - started) * 1000
    logger.info(f"Search completed: {len(unique)} unique links in {elapsed:.0f}ms")

    return unique
318
+
319
+
320
def _dedupe_links(links: list[dict]) -> list[dict]:
    """Return *links* without duplicates, keeping the first entry per normalized URL.

    Entries whose normalized "link" value is empty are dropped.
    """
    kept: list[dict] = []
    known_keys: set[str] = set()
    for entry in links:
        key = _normalize_for_dedup(entry.get("link", ""))
        if not key or key in known_keys:
            continue
        known_keys.add(key)
        kept.append(entry)
    return kept
330
+
331
+
332
def get_search_results(query: str, max_workers: int = MAX_CONCURRENT) -> list[dict]:
    """Backward-compatible alias that delegates to get_search_results_async."""
    merged_links = get_search_results_async(query, max_workers)
    return merged_links
sources/__init__.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ sources — Phase 1D expanded source coverage + threat intelligence enrichment.
3
+
4
+ Public API:
5
+ collect_all_sources(query, ...) async — unified aggregator
6
+ enrich_investigation(query, otx_api_key) async — threat intel enrichment
7
+
8
+ Sub-modules:
9
+ engines.py — DarkSearch JSON API + OnionSearch HTML scraping
10
+ seeds.py — curated .onion seed URL list
11
+ pastes.py — .onion paste site monitor
12
+ telegram.py — Telegram public channel monitor (clearnet, optional)
13
+ enrichment.py — AlienVault OTX + Abuse.ch threat intelligence
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import logging
20
+ from typing import Dict, List, Optional
21
+
22
+ from sources.enrichment import enrich_investigation
23
+
24
+ _logger = logging.getLogger(__name__)
25
+
26
+ __all__ = ["collect_all_sources", "enrich_investigation"]
27
+
28
+
29
async def collect_all_sources(
    query: str,
    include_telegram: bool = False,
    telegram_channels: Optional[List[str]] = None,
    seed_categories: Optional[List[str]] = None,
) -> Dict:
    """
    Gather every Phase 1D intelligence source for *query* into one dict.

    The two search engines and the paste monitor are awaited together with
    asyncio.gather(). Telegram is fetched afterwards, and only when
    *include_telegram* is true.

    Args:
        query:             investigation query string.
        include_telegram:  if True, also fetch matching Telegram messages.
        telegram_channels: channel usernames to monitor; only used when
                           include_telegram=True (None is sent as []).
        seed_categories:   seed categories to include (e.g. ["forum",
                           "index"]); None or empty returns all categories.

    Returns dict with keys:
        "search_results"   list[dict] — DarkSearch results followed by OnionSearch results
        "paste_results"    list[dict] — from paste site monitor
        "telegram_results" list[dict] — from Telegram (empty if skipped)
        "seed_urls"        list[dict] — from seeds.py, first occurrence per URL
    """
    from sources.engines import search_darksearch, search_onionsearch
    from sources.pastes import fetch_recent_pastes
    from sources.seeds import get_seeds

    # --- Search engines + pastes, all awaited concurrently ------------------
    darksearch_hits, onionsearch_hits, paste_results = await asyncio.gather(
        search_darksearch(query),
        search_onionsearch(query),
        fetch_recent_pastes(query),
    )
    search_results: List[dict] = darksearch_hits + onionsearch_hits

    # --- Telegram (optional, runs after the gather) -------------------------
    telegram_results: List[dict] = []
    if include_telegram:
        from sources.telegram import fetch_telegram_messages
        telegram_results = await fetch_telegram_messages(
            channel_usernames=telegram_channels or [],
            query=query,
        )

    # --- Seeds (synchronous) ------------------------------------------------
    if not seed_categories:
        seeds = get_seeds()
    else:
        # Insertion-ordered dict keeps the first seed seen for each URL.
        seeds_by_url: Dict[str, dict] = {}
        for category in seed_categories:
            for seed in get_seeds(category=category):
                seeds_by_url.setdefault(seed["url"], seed)
        seeds = list(seeds_by_url.values())

    return {
        "search_results": search_results,
        "paste_results": paste_results,
        "telegram_results": telegram_results,
        "seed_urls": seeds,
    }