voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/seed_manager.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/seed_manager.py — Curated .onion seed list manager.
|
|
3
|
+
|
|
4
|
+
Maintains a JSON-backed catalogue of known-active dark-web addresses
|
|
5
|
+
organized by category (ransomware leak sites, hacker forums, carding shops,
|
|
6
|
+
search engines, etc.).
|
|
7
|
+
|
|
8
|
+
At investigation time, get_relevant_seeds(query) scores each seed against
|
|
9
|
+
the user query using tag and name matching, and returns the top-N most
|
|
10
|
+
relevant entries. Those seed URLs are injected into the scrape queue
|
|
11
|
+
ahead of the search-engine fan-out so that known intelligence sources are
|
|
12
|
+
always visited for an applicable query.
|
|
13
|
+
|
|
14
|
+
The seed JSON lives at data/onion_seeds.json and is community-editable.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
import aiohttp
|
|
27
|
+
import aiohttp_socks
|
|
28
|
+
|
|
29
|
+
from utils.content_safety import is_blocked_url
|
|
30
|
+
|
|
31
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Seed catalogue path: the data/ directory two levels up from this file
# (i.e. next to sources/), file name onion_seeds.json.
SEED_FILE = Path(__file__).resolve().parents[1].joinpath("data", "onion_seeds.json")
# SOCKS5 proxy URL used when checking seed reachability over Tor.
TOR_PROXY = "socks5://127.0.0.1:9050"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SeedManager:
    """
    Manages the curated .onion seed list.
    Provides relevance matching and availability checking.

    Seeds are read from SEED_FILE and held in memory as a flat list of
    dicts. Each dict is the on-disk seed entry plus two derived keys:
    "category" (the category name) and "category_tags" (that category's
    "tags" list).
    """

    def __init__(self) -> None:
        self._seeds: list[dict] = []
        # Set once load() has run, whether or not it produced any seeds.
        self._loaded: bool = False

    def load(self) -> None:
        """
        Load seeds from JSON file.

        A missing or unparseable file leaves the manager with an empty seed
        list; the problem is logged, not raised. Category or seed entries
        that are not JSON objects are skipped with a warning so that one
        malformed entry does not discard the rest of the catalogue.
        """
        if not SEED_FILE.exists():
            logger.warning("Seed file not found: %s", SEED_FILE)
            self._seeds = []
            self._loaded = True
            return

        try:
            data = json.loads(SEED_FILE.read_text(encoding="utf-8"))
            seeds: list[dict] = []

            for category, cat_data in data.get("categories", {}).items():
                if not isinstance(cat_data, dict):
                    logger.warning("Skipping malformed seed category: %s", category)
                    continue
                cat_tags = cat_data.get("tags", [])
                for seed in cat_data.get("seeds", []):
                    if not isinstance(seed, dict):
                        logger.warning(
                            "Skipping malformed seed entry in category: %s",
                            category,
                        )
                        continue
                    seeds.append({
                        **seed,
                        "category": category,
                        "category_tags": cat_tags,
                    })

            self._seeds = seeds
            logger.info(
                "Loaded %d seeds from %s",
                len(self._seeds),
                SEED_FILE,
            )
        except Exception as e:
            logger.error("Failed to load seeds: %s", e)
            self._seeds = []

        self._loaded = True

    @staticmethod
    def _score_seed(seed: dict, search_text: str) -> int:
        """
        Score one seed against an already lower-cased search text.

        +3 for every tag (seed or category) that occurs in the text, +2 for
        every query word longer than 3 characters that occurs in the seed
        name, +1 if the seed status is "active". Seeds whose category name
        or tags contain "search" get a minimum score of 1.
        """
        raw_tags = (*(seed.get("tags") or []), *(seed.get("category_tags") or []))
        # Empty / whitespace-only tags are dropped: "" is a substring of any
        # text, and the search text always contains a space, so such tags
        # would otherwise match every query.
        tags = [t.lower() for t in raw_tags if isinstance(t, str) and t.strip()]

        score = 3 * sum(1 for tag in tags if tag in search_text)

        name = str(seed.get("name") or "").lower()
        score += 2 * sum(
            1 for word in search_text.split() if len(word) > 3 and word in name
        )

        if seed.get("status") == "active":
            score += 1

        # Search engines always get a base score so generic queries still
        # get a directory to crawl.
        category = str(seed.get("category") or "")
        if "search" in category or "search" in tags:
            score = max(score, 1)

        return score

    def get_relevant_seeds(
        self,
        query: str,
        refined_query: str = "",
        max_seeds: int = 10,
    ) -> list[dict]:
        """
        Return seeds relevant to a query.
        Uses tag matching and keyword scoring.

        Seeds whose URL is rejected by is_blocked_url() are never returned.
        The result holds at most max_seeds entries (none when max_seeds is
        zero or negative), highest score first; ties keep catalogue order.
        """
        if not self._loaded:
            self.load()

        if not self._seeds:
            return []

        search_text = f"{query} {refined_query}".lower()

        scored: list[tuple[int, dict]] = []
        for seed in self._seeds:
            # Skip content-safety blocked URLs
            blocked, _ = is_blocked_url(seed.get("url", ""))
            if blocked:
                continue

            score = self._score_seed(seed, search_text)
            if score > 0:
                scored.append((score, seed))

        scored.sort(key=lambda x: x[0], reverse=True)
        # Clamp so a negative limit cannot turn into an "all but last" slice.
        limit = max(max_seeds, 0)
        results = [s for _, s in scored[:limit]]

        logger.info(
            "Seed matching: %d relevant seeds for query '%s'",
            len(results),
            query[:50],
        )

        return results

    async def check_seed_availability(
        self,
        url: str,
        timeout: int = 15,
    ) -> bool:
        """
        Check if a seed URL is reachable over Tor.
        Returns True if reachable, False otherwise.

        The request goes through TOR_PROXY. Any HTTP status below 500 counts
        as reachable; any exception (connection failure, timeout, ...) is
        reported as unreachable.
        """
        try:
            connector = aiohttp_socks.ProxyConnector.from_url(TOR_PROXY)
            async with aiohttp.ClientSession(connector=connector) as session:
                async with session.get(
                    url,
                    timeout=aiohttp.ClientTimeout(total=timeout),
                    headers={"User-Agent": "Mozilla/5.0 (compatible)"},
                    ssl=False,
                ) as resp:
                    return resp.status < 500
        except Exception:
            return False

    async def validate_seeds(self, concurrency: int = 5) -> dict:
        """
        Check which seeds are currently reachable.
        Updates status in the JSON file.
        Returns summary of results.

        At most `concurrency` checks run at once (values below 1 are treated
        as 1). Seeds without a URL, and seeds whose URL is rejected by
        is_blocked_url(), are neither contacted nor counted. The returned
        dict has the keys "checked", "active" and "dead".
        """
        if not self._loaded:
            self.load()

        if not self._seeds:
            return {"checked": 0, "active": 0, "dead": 0}

        # Semaphore(0) would block every task forever, so enforce >= 1.
        sem = asyncio.Semaphore(max(1, concurrency))
        results = {"active": 0, "dead": 0, "checked": 0}

        async def check_one(seed: dict) -> None:
            url = seed.get("url", "")
            if not url:
                return

            # Stay consistent with get_relevant_seeds/add_discovered_seed:
            # never send a request to a content-safety blocked URL.
            blocked, _ = is_blocked_url(url)
            if blocked:
                return

            async with sem:
                is_up = await self.check_seed_availability(url)

            results["checked"] += 1
            if is_up:
                results["active"] += 1
                seed["status"] = "active"
                seed["last_seen"] = datetime.now(timezone.utc).isoformat()
            else:
                results["dead"] += 1
                seed["status"] = "unreachable"

            logger.debug(
                "Seed %s %s",
                "ok" if is_up else "down",
                seed.get("name", url[:30]),
            )

        await asyncio.gather(*[check_one(s) for s in self._seeds])

        # Persist status updates back to disk
        self._save_status_updates()

        logger.info(
            "Seed validation: %d/%d active",
            results["active"],
            results["checked"],
        )

        return results

    def add_discovered_seed(
        self,
        url: str,
        name: str,
        tags: list[str],
        category: str = "discovered",
    ) -> bool:
        """
        Add a newly discovered onion URL to seeds.
        Called by the pipeline when new onions are found.
        Returns True if added, False if duplicate or blocked.

        An empty URL is rejected as well. The return value reflects the
        in-memory list only: a failed write to disk is logged by _save()
        and does not change the result.

        NOTE(review): _save() only persists seeds whose category is
        "discovered", so a seed added with another category stays in memory
        only - confirm whether that is intended.
        """
        if not self._loaded:
            self.load()

        if not url:
            return False

        if any(s.get("url") == url for s in self._seeds):
            return False

        blocked, _ = is_blocked_url(url)
        if blocked:
            return False

        new_seed = {
            "name": name,
            "url": url,
            "tags": list(tags or []),
            "category": category,
            "category_tags": [category],
            "status": "discovered",
            "added": datetime.now(timezone.utc).date().isoformat(),
        }

        self._seeds.append(new_seed)
        self._save()

        logger.info("Added new seed: %s", url[:50])
        return True

    def summary(self) -> dict:
        """
        Return counts grouped by category and status.

        The result has the keys "total", "by_category", "by_status" and
        "last_validated" (the greatest "last_seen" string among the seeds,
        or None when no seed has one).
        """
        if not self._loaded:
            self.load()

        by_category: dict[str, int] = {}
        by_status: dict[str, int] = {}
        last_validated: Optional[str] = None

        for seed in self._seeds:
            cat = seed.get("category", "unknown")
            by_category[cat] = by_category.get(cat, 0) + 1
            status = seed.get("status", "unknown")
            by_status[status] = by_status.get(status, 0) + 1
            seen = seed.get("last_seen")
            # Timestamps are compared as strings; ignore non-string values
            # so a hand-edited file cannot raise a TypeError here.
            if (
                seen
                and isinstance(seen, str)
                and (last_validated is None or seen > last_validated)
            ):
                last_validated = seen

        return {
            "total": len(self._seeds),
            "by_category": by_category,
            "by_status": by_status,
            "last_validated": last_validated,
        }

    def list_seeds(self) -> list[dict]:
        """Return a snapshot of every seed (admin view)."""
        if not self._loaded:
            self.load()
        # Shallow copies: callers can rebind keys without touching our state.
        return [dict(s) for s in self._seeds]

    def _load_raw(self, strict: bool = False) -> dict:
        """
        Load the on-disk file structure (preserving category metadata).

        When the file does not exist, a fresh empty skeleton is returned.
        When it exists but cannot be read or parsed, the default behaviour
        is to log a warning and return the skeleton; with strict=True the
        error is re-raised instead, so that save paths never replace an
        existing (merely broken) file with an empty catalogue.
        """
        if SEED_FILE.exists():
            try:
                return json.loads(SEED_FILE.read_text(encoding="utf-8"))
            except Exception as e:
                if strict:
                    raise
                logger.warning("Could not parse existing seed file: %s", e)
        return {
            "version": "1.0.0",
            "last_updated": datetime.now(timezone.utc).date().isoformat(),
            "description": "Curated list of known dark web addresses for VoidAccess intelligence seeding",
            "categories": {},
        }

    @staticmethod
    def _write_raw(data: dict) -> None:
        """Write the file structure to SEED_FILE via a temp file + rename."""
        # Writing next to the target and renaming over it means an
        # interrupted write cannot leave a truncated seed file behind.
        tmp_path = SEED_FILE.with_name(SEED_FILE.name + ".tmp")
        try:
            tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
            tmp_path.replace(SEED_FILE)
        except Exception:
            tmp_path.unlink(missing_ok=True)
            raise

    @staticmethod
    def _merge_discovered(categories: dict, discovered: list[dict]) -> int:
        """
        Append seeds missing from categories["discovered"]; return how many.

        Creates the "discovered" bucket (and its "seeds" list) when absent.
        Only name, url, tags, status and added are written; the in-memory
        "category"/"category_tags" keys are not.
        """
        bucket = categories.setdefault(
            "discovered",
            {
                "description": "Auto-discovered during investigations",
                "tags": ["discovered"],
                "seeds": [],
            },
        )
        # A hand-edited bucket may lack the "seeds" list entirely.
        bucket_seeds = bucket.setdefault("seeds", [])
        existing_urls = {s.get("url") for s in bucket_seeds if isinstance(s, dict)}

        added = 0
        for s in discovered:
            url = s.get("url")
            if not url or url in existing_urls:
                continue
            bucket_seeds.append({
                "name": s.get("name", ""),
                "url": url,
                "tags": s.get("tags", []),
                "status": s.get("status", "discovered"),
                "added": s.get(
                    "added", datetime.now(timezone.utc).date().isoformat()
                ),
            })
            existing_urls.add(url)
            added += 1
        return added

    def _save_status_updates(self) -> None:
        """
        Persist status/last_seen changes for known seeds back to disk.

        Only entries already present on disk are touched, matched by
        (category, url). Failures are logged, not raised; an unparseable
        seed file is left untouched.
        """
        try:
            data = self._load_raw(strict=True)
            categories = data.setdefault("categories", {})

            # Build a (category, url) -> in-memory seed map
            updates = {(s.get("category"), s.get("url")): s for s in self._seeds}

            for cat_name, cat_data in categories.items():
                if not isinstance(cat_data, dict):
                    continue
                for seed in cat_data.get("seeds", []):
                    if not isinstance(seed, dict):
                        continue
                    in_mem = updates.get((cat_name, seed.get("url")))
                    if in_mem is None:
                        continue
                    if "status" in in_mem:
                        seed["status"] = in_mem["status"]
                    if "last_seen" in in_mem:
                        seed["last_seen"] = in_mem["last_seen"]

            data["last_updated"] = datetime.now(timezone.utc).date().isoformat()
            self._write_raw(data)
        except Exception as e:
            logger.error("Failed to save seed status updates: %s", e)

    def _save(self) -> None:
        """
        Save current seeds (including discovered ones) back to JSON.

        Only in-memory seeds in the "discovered" category are merged into
        the file; all other on-disk content is preserved as read. Failures
        are logged, not raised; an unparseable seed file is left untouched
        rather than being replaced by an empty catalogue.
        """
        try:
            data = self._load_raw(strict=True)
            categories = data.setdefault("categories", {})

            # Add discovered seeds to their category bucket
            discovered = [s for s in self._seeds if s.get("category") == "discovered"]
            if discovered:
                self._merge_discovered(categories, discovered)

            data["last_updated"] = datetime.now(timezone.utc).date().isoformat()
            self._write_raw(data)
        except Exception as e:
            logger.error("Failed to save seeds: %s", e)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
# ---------------------------------------------------------------------------
# Global singleton
# ---------------------------------------------------------------------------

# Shared SeedManager instance; stays None until get_seed_manager() creates it.
_seed_manager: Optional[SeedManager] = None
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def get_seed_manager() -> SeedManager:
    """
    Return the shared module-level SeedManager.

    The first call creates the instance, stores it in _seed_manager and
    calls load() on it; later calls return that same instance.
    """
    global _seed_manager
    if _seed_manager is not None:
        return _seed_manager

    # Publish the instance before loading, so it is retained either way.
    _seed_manager = SeedManager()
    _seed_manager.load()
    return _seed_manager
|