voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/engines.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/engines.py — Additional dark web search engines not in search.py.
|
|
3
|
+
|
|
4
|
+
search.py handles 16 engines via the legacy thread-pool path (public API
|
|
5
|
+
unchanged for ui.py compatibility). This module adds engines that need
|
|
6
|
+
special handling:
|
|
7
|
+
|
|
8
|
+
• DarkSearch — JSON REST API, paginated, optional API key
|
|
9
|
+
• OnionSearch — HTML scraping of Torch and Haystack onion search engines
|
|
10
|
+
|
|
11
|
+
Both go through the Tor SOCKS5 proxy (TOR_PROXY_HOST / TOR_PROXY_PORT).
|
|
12
|
+
|
|
13
|
+
Public API:
|
|
14
|
+
async def search_darksearch(query, pages=2) -> list[dict]
|
|
15
|
+
async def search_onionsearch(query) -> list[dict]
|
|
16
|
+
|
|
17
|
+
Each returns list[dict] with keys: title, url, snippet, source.
|
|
18
|
+
Empty list on any error — never raises.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
import re
|
|
25
|
+
from typing import List
|
|
26
|
+
from urllib.parse import quote_plus
|
|
27
|
+
|
|
28
|
+
import aiohttp
|
|
29
|
+
from aiohttp_socks import ProxyConnector
|
|
30
|
+
from bs4 import BeautifulSoup
|
|
31
|
+
|
|
32
|
+
from config import DARKSEARCH_API_KEY, TOR_PROXY_HOST, TOR_PROXY_PORT
|
|
33
|
+
|
|
34
|
+
_logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# DarkSearch REST endpoint. HTTPS is used because search_darksearch() may
# attach DARKSEARCH_API_KEY as a Bearer token; over plain HTTP that header
# would be readable by the Tor exit relay.
_DARKSEARCH_API = "https://darksearch.io/api/search"

# Torch and Haystack — specifically called out in concept.md; not in search.py
# Each "url" is a template: the literal "{query}" placeholder is replaced with
# the URL-encoded query by search_onionsearch().
_ONIONSEARCH_ENGINES = [
    {
        "name": "Torch",
        "url": (
            "http://torchdeedp3i2jigzjdmfpn5ttjhthh5wbmda2rr3jvqjg5p77c54dqd"
            ".onion/search?query={query}"
        ),
    },
    {
        "name": "Haystack",
        "url": (
            "http://haystak5njsmn2hqkewecpaxetahtwhsbsa64jom2k22z5afxhnpxfid"
            ".onion/?q={query}"
        ),
    },
]

# Timeout shared by every session in this module: 15 s to obtain a
# connection, 45 s allowed between reads of data from the peer.
_TIMEOUT = aiohttp.ClientTimeout(connect=15, sock_read=45)

# Matches an http(s) URL whose host is a bare .onion name of 16–56 base32
# characters, followed by any path/query up to whitespace, a quote or an
# angle bracket. Case-insensitive.
_ONION_RE = re.compile(r"https?://[a-z2-7]{16,56}\.onion[^\s\"'<>]*", re.IGNORECASE)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Shared helpers
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def _tor_connector() -> ProxyConnector:
    """Build a SOCKS5 connector aimed at the configured Tor proxy.

    The proxy address comes from TOR_PROXY_HOST / TOR_PROXY_PORT. A new
    connector is created on every call.
    """
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    # rdns=True asks the proxy to resolve hostnames (needed for .onion names,
    # which local DNS cannot resolve).
    return ProxyConnector.from_url(proxy_url, rdns=True)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _ua() -> str:
|
|
76
|
+
return (
|
|
77
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) "
|
|
78
|
+
"Gecko/20100101 Firefox/137.0"
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
# DarkSearch JSON API
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
async def search_darksearch(query: str, pages: int = 2) -> List[dict]:
    """
    Query the DarkSearch JSON API and return up to *pages* pages of results.

    Routed through Tor for anonymity even though darksearch.io is clearnet.
    Uses DARKSEARCH_API_KEY as Authorization header when configured.

    Args:
        query: Search terms, sent as the ``query`` request parameter.
        pages: Maximum number of result pages to request, starting at page 1.
            Values below 1 yield ``[]`` without opening a connection.

    Returns:
        list[dict] with keys: title, url, snippet, source. Snippets are
        truncated to 500 characters. Pagination stops at the first failing
        page; results collected before the failure are still returned.
        Returns [] when nothing could be fetched — network and parse errors
        are logged at DEBUG level, never raised.
    """
    # Nothing to fetch: do not build a Tor connector and session for nothing.
    if isinstance(pages, int) and pages < 1:
        return []

    results: List[dict] = []
    headers = {"User-Agent": _ua(), "Accept": "application/json"}
    if DARKSEARCH_API_KEY:
        headers["Authorization"] = f"Bearer {DARKSEARCH_API_KEY}"

    try:
        connector = _tor_connector()
        async with aiohttp.ClientSession(
            connector=connector, timeout=_TIMEOUT
        ) as session:
            for page in range(1, pages + 1):
                params = {"query": query, "page": page}
                try:
                    async with session.get(
                        _DARKSEARCH_API, params=params, headers=headers
                    ) as resp:
                        if resp.status != 200:
                            _logger.debug(
                                "DarkSearch page %d returned HTTP %d", page, resp.status
                            )
                            break
                        # content_type=None: decode the body as JSON whatever
                        # Content-Type the server declared.
                        data = await resp.json(content_type=None)
                except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as exc:
                    # ValueError covers a body that is not valid JSON.
                    _logger.debug("DarkSearch page %d error: %s", page, exc)
                    break

                if not isinstance(data, dict):
                    _logger.debug(
                        "DarkSearch page %d returned an unexpected payload", page
                    )
                    break

                items = data.get("data")
                if not isinstance(items, list):
                    items = []
                for item in items:
                    # Skip malformed entries instead of aborting the page.
                    if not isinstance(item, dict):
                        continue
                    link = str(item.get("link") or "").strip()
                    if not link:
                        continue
                    results.append(
                        {
                            "title": str(item.get("title") or "").strip(),
                            "url": link,
                            "snippet": str(
                                item.get("description") or ""
                            ).strip()[:500],
                            "source": "DarkSearch",
                        }
                    )

                # Stop early if we've reached the last page. A missing or
                # unparsable last_page is treated as "this is the last page".
                try:
                    last = int(data.get("last_page") or page)
                except (TypeError, ValueError):
                    last = page
                if page >= last:
                    break
    except Exception as exc:
        # Top-level boundary: this function promises never to raise.
        _logger.debug("DarkSearch session error: %s", exc)

    return results
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
# OnionSearch HTML scraping (Torch + Haystack)
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
|
|
150
|
+
async def search_onionsearch(query: str) -> List[dict]:
    """
    Scrape the Torch and Haystack .onion search engines for *query*.

    Every engine listed in _ONIONSEARCH_ENGINES is requested in turn through
    the Tor proxy. Links are pulled out of each result page by
    _parse_onion_links(); the combined list is de-duplicated by URL before
    being returned. Entries from this path carry an empty snippet.

    Returns list[dict] with keys: title, url, snippet, source.
    Returns [] on any error; an engine that fails is skipped and results from
    the engines that did respond are still included.
    """
    collected: List[dict] = []
    quoted_query = quote_plus(query)

    try:
        tor = _tor_connector()
        async with aiohttp.ClientSession(connector=tor, timeout=_TIMEOUT) as session:
            for engine in _ONIONSEARCH_ENGINES:
                target = engine["url"].replace("{query}", quoted_query)
                name = engine["name"]
                try:
                    async with session.get(
                        target, headers={"User-Agent": _ua()}
                    ) as resp:
                        if resp.status == 200:
                            # Undecodable bytes are replaced rather than raising.
                            page_html = await resp.text(errors="replace")
                            collected.extend(_parse_onion_links(page_html, name))
                        else:
                            _logger.debug(
                                "%s returned HTTP %d", name, resp.status
                            )
                except Exception as exc:
                    # One engine failing must not stop the remaining engines.
                    _logger.debug("%s fetch error: %s", name, exc)
    except Exception as exc:
        _logger.debug("OnionSearch session error: %s", exc)

    return _deduplicate(collected)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _parse_onion_links(html: str, source_name: str) -> List[dict]:
    """
    Extract .onion links + anchor text from an HTML results page.

    Anchors whose href starts with a URL matching _ONION_RE are collected,
    using the anchor text as the title. URLs containing "search" and anchors
    with fewer than 3 characters of text are skipped. When this structured
    pass yields nothing, every _ONION_RE match in the raw HTML is used
    instead, with the URL itself as the title.

    Args:
        html: Raw HTML of a search-engine results page.
        source_name: Value stored under the "source" key of every result.

    Returns:
        list[dict] with keys: title, url, snippet (always ""), source.
        URLs are unique within the returned list.
    """
    # Punctuation that may cling to the end of a matched URL.
    trailing = ".,;)'\""

    items: List[dict] = []
    seen: set[str] = set()

    try:
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all("a", href=True):
            href = str(tag["href"]).strip()
            match = _ONION_RE.match(href)
            if not match:
                continue
            url = match.group(0).rstrip(trailing)
            if url in seen or "search" in url.lower():
                continue
            title = tag.get_text(strip=True)
            if len(title) < 3:
                continue
            seen.add(url)
            items.append(
                {"title": title, "url": url, "snippet": "", "source": source_name}
            )
    except Exception as exc:
        # Best-effort parsing: keep going to the regex fallback, but leave a
        # trace instead of silently discarding the failure.
        _logger.debug("%s HTML parse error: %s", source_name, exc)

    # Regex fallback when structured parsing yields nothing
    if not items:
        for url in _ONION_RE.findall(html):
            url = url.rstrip(trailing)
            if url not in seen and "search" not in url.lower():
                seen.add(url)
                items.append(
                    {"title": url, "url": url, "snippet": "", "source": source_name}
                )

    return items
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _deduplicate(results: List[dict]) -> List[dict]:
|
|
234
|
+
seen: set[str] = set()
|
|
235
|
+
out: List[dict] = []
|
|
236
|
+
for r in results:
|
|
237
|
+
if r["url"] not in seen:
|
|
238
|
+
seen.add(r["url"])
|
|
239
|
+
out.append(r)
|
|
240
|
+
return out
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# asyncio is used inside search_darksearch (asyncio.TimeoutError). Importing it
# this late works because the name is only looked up when that coroutine runs.
|
|
244
|
+
import asyncio # noqa: E402
|