voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142):
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,373 @@
1
+ """
2
+ sources/seed_manager.py — Curated .onion seed list manager.
3
+
4
+ Maintains a JSON-backed catalogue of known-active dark-web addresses
5
+ organized by category (ransomware leak sites, hacker forums, carding shops,
6
+ search engines, etc.).
7
+
8
+ At investigation time, get_relevant_seeds(query) scores each seed against
9
+ the user query using tag and name matching, and returns the top-N most
10
+ relevant entries. Those seed URLs are injected into the scrape queue
11
+ ahead of the search-engine fan-out so that known intelligence sources are
12
+ always visited for an applicable query.
13
+
14
+ The seed JSON lives at data/onion_seeds.json and is community-editable.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import json
21
+ import logging
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+ from typing import Optional
25
+
26
+ import aiohttp
27
+ import aiohttp_socks
28
+
29
+ from utils.content_safety import is_blocked_url
30
+
31
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# The seed file lives in voidaccess/data/onion_seeds.json (sibling of sources/).
# It is resolved relative to this file: <grandparent directory>/data/onion_seeds.json.
SEED_FILE = Path(__file__).resolve().parent.parent / "data" / "onion_seeds.json"
# SOCKS5 proxy URL handed to aiohttp_socks when probing seed availability.
# NOTE(review): presumably a locally running Tor daemon's SOCKS port — confirm.
TOR_PROXY = "socks5://127.0.0.1:9050"
36
+
37
+
38
class SeedManager:
    """
    Manages the curated .onion seed list.

    Seeds are loaded from SEED_FILE into a flat in-memory list of dicts. Each
    dict is the on-disk seed entry plus two derived keys, "category" and
    "category_tags", taken from the category the seed was listed under.

    Provides relevance matching, availability checking over the TOR_PROXY
    SOCKS proxy, and persistence of newly added seeds and status changes.
    """

    # Keys that exist only on in-memory seeds; they are implied by the
    # category bucket on disk and therefore never written back.
    _DERIVED_KEYS = ("category", "category_tags")

    def __init__(self) -> None:
        """Create an empty manager; nothing is read from disk until load()."""
        self._seeds: list[dict] = []
        self._loaded: bool = False

    def load(self) -> None:
        """
        Load seeds from JSON file.

        Never raises: a missing or unreadable file leaves the manager with an
        empty seed list. The manager is marked as loaded in every case so the
        lazy-loading methods do not retry on each call.
        """
        if not SEED_FILE.exists():
            logger.warning("Seed file not found: %s", SEED_FILE)
            self._seeds = []
            self._loaded = True
            return

        try:
            data = json.loads(SEED_FILE.read_text(encoding="utf-8"))
            seeds: list[dict] = []

            for category, cat_data in data.get("categories", {}).items():
                # One malformed category or entry must not discard the rest
                # of the catalogue, so skip it instead of failing the load.
                if not isinstance(cat_data, dict):
                    logger.warning("Skipping malformed seed category: %s", category)
                    continue
                category_tags = cat_data.get("tags", [])
                for seed in cat_data.get("seeds", []):
                    if not isinstance(seed, dict):
                        continue
                    seeds.append({
                        **seed,
                        "category": category,
                        "category_tags": category_tags,
                    })

            self._seeds = seeds
            logger.info(
                "Loaded %d seeds from %s",
                len(self._seeds),
                SEED_FILE,
            )
            self._loaded = True

        except Exception as e:
            logger.error("Failed to load seeds: %s", e)
            self._seeds = []
            self._loaded = True

    @staticmethod
    def _score_seed(seed: dict, search_text: str) -> int:
        """
        Return the relevance score of one seed for an already-lowercased query.

        Scoring:
          * +3 for every seed/category tag that occurs in the query text,
          * +2 for every query word longer than 3 characters that occurs in
            the seed name,
          * +1 if the seed matched at all and its status is "active",
          * search-engine seeds never score below 1.

        A score of 0 means "not relevant".
        """
        all_tags = list(seed.get("tags") or []) + list(seed.get("category_tags") or [])
        lowered_tags = [str(tag).lower() for tag in all_tags]

        # Empty tags are ignored: "" is a substring of every query and would
        # otherwise make the seed match everything.
        score = 3 * sum(1 for tag in lowered_tags if tag and tag in search_text)

        # Score by name match (only words longer than 3 chars)
        name = str(seed.get("name") or "").lower()
        score += 2 * sum(
            1 for word in search_text.split() if len(word) > 3 and word in name
        )

        # Boost known-active seeds, but only those that already matched.
        # Boosting unconditionally would give every active seed a positive
        # score and make it "relevant" to every query.
        if score > 0 and seed.get("status") == "active":
            score += 1

        # Always include search engines with a base score so generic
        # queries still get a directory to crawl.
        category = str(seed.get("category") or "")
        if "search" in category or "search" in lowered_tags:
            score = max(score, 1)

        return score

    def get_relevant_seeds(
        self,
        query: str,
        refined_query: str = "",
        max_seeds: int = 10,
    ) -> list[dict]:
        """
        Return seeds relevant to a query.
        Uses tag matching and keyword scoring.

        Args:
            query: The user's original query.
            refined_query: Optional reformulated query; matched together
                with ``query``.
            max_seeds: Maximum number of seeds to return; values below 1
                yield an empty list.

        Returns:
            Up to ``max_seeds`` seed dicts, highest score first. Seeds whose
            URL is rejected by the content-safety filter are never returned.
        """
        if not self._loaded:
            self.load()

        if not self._seeds:
            return []

        search_text = f"{query} {refined_query}".lower()

        scored: list[tuple[int, dict]] = []
        for seed in self._seeds:
            # Skip content-safety blocked URLs
            blocked, _ = is_blocked_url(seed.get("url", ""))
            if blocked:
                continue

            score = self._score_seed(seed, search_text)
            if score > 0:
                scored.append((score, seed))

        # list.sort is stable, so equally scored seeds keep catalogue order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        # Clamp so a negative limit cannot slice from the end of the list.
        results = [seed for _, seed in scored[:max(max_seeds, 0)]]

        logger.info(
            "Seed matching: %d relevant seeds for query '%s'",
            len(results),
            query[:50],
        )

        return results

    async def check_seed_availability(
        self,
        url: str,
        timeout: int = 15,
    ) -> bool:
        """
        Check if a seed URL is reachable over Tor.

        Issues a single GET through the TOR_PROXY SOCKS proxy with a total
        timeout of ``timeout`` seconds.

        Returns True if a response with a status below 500 was received,
        False otherwise (including any connection, proxy or timeout error —
        this probe is best-effort and never raises).
        """
        try:
            connector = aiohttp_socks.ProxyConnector.from_url(TOR_PROXY)
            async with aiohttp.ClientSession(connector=connector) as session:
                async with session.get(
                    url,
                    timeout=aiohttp.ClientTimeout(total=timeout),
                    headers={"User-Agent": "Mozilla/5.0 (compatible)"},
                    ssl=False,
                ) as resp:
                    return resp.status < 500
        except Exception:
            return False

    async def validate_seeds(self, concurrency: int = 5) -> dict:
        """
        Check which seeds are currently reachable.
        Updates status in the JSON file.

        Seeds without a URL, and seeds whose URL is rejected by the
        content-safety filter, are neither contacted nor counted.

        Args:
            concurrency: Maximum number of simultaneous probes; values below
                1 are treated as 1.

        Returns:
            Summary dict with the keys "checked", "active" and "dead".
        """
        if not self._loaded:
            self.load()

        if not self._seeds:
            return {"checked": 0, "active": 0, "dead": 0}

        # Semaphore(0) would block every probe forever and a negative value
        # raises ValueError, so clamp to at least one slot.
        sem = asyncio.Semaphore(max(1, concurrency))
        results = {"active": 0, "dead": 0, "checked": 0}

        async def check_one(seed: dict) -> None:
            async with sem:
                url = seed.get("url", "")
                if not url:
                    return

                # Stay consistent with get_relevant_seeds/add_discovered_seed:
                # blocked URLs must never be fetched, not even for a probe.
                blocked, _ = is_blocked_url(url)
                if blocked:
                    return

                is_up = await self.check_seed_availability(url)

                results["checked"] += 1
                if is_up:
                    results["active"] += 1
                    seed["status"] = "active"
                    seed["last_seen"] = datetime.now(timezone.utc).isoformat()
                else:
                    results["dead"] += 1
                    seed["status"] = "unreachable"

                logger.debug(
                    "Seed %s %s",
                    "ok" if is_up else "down",
                    seed.get("name", url[:30]),
                )

        await asyncio.gather(*[check_one(s) for s in self._seeds])

        # Persist status updates back to disk
        self._save_status_updates()

        logger.info(
            "Seed validation: %d/%d active",
            results["active"],
            results["checked"],
        )

        return results

    def add_discovered_seed(
        self,
        url: str,
        name: str,
        tags: list[str],
        category: str = "discovered",
    ) -> bool:
        """
        Add a newly discovered onion URL to seeds.
        Called by the pipeline when new onions are found.

        The seed is appended to the in-memory list and the catalogue is
        saved immediately. A failed save is logged but does not undo the
        in-memory addition.

        Returns True if added, False if the URL is empty, a duplicate, or
        blocked by the content-safety filter.
        """
        if not self._loaded:
            self.load()

        if not url:
            return False

        existing_urls = {s.get("url") for s in self._seeds}
        if url in existing_urls:
            return False

        blocked, _ = is_blocked_url(url)
        if blocked:
            return False

        new_seed = {
            "name": name,
            "url": url,
            "tags": list(tags),
            "category": category,
            "category_tags": [category],
            "status": "discovered",
            "added": datetime.now(timezone.utc).date().isoformat(),
        }

        self._seeds.append(new_seed)
        self._save()

        logger.info("Added new seed: %s", url[:50])
        return True

    def summary(self) -> dict:
        """
        Return counts grouped by category and status.

        Returns:
            Dict with "total", "by_category", "by_status" and
            "last_validated" (the greatest "last_seen" value across all
            seeds, or None when no seed has one). Seeds missing a category
            or status are counted under "unknown".
        """
        if not self._loaded:
            self.load()

        by_category: dict[str, int] = {}
        by_status: dict[str, int] = {}
        last_validated: Optional[str] = None

        for seed in self._seeds:
            cat = seed.get("category", "unknown")
            by_category[cat] = by_category.get(cat, 0) + 1
            status = seed.get("status", "unknown")
            by_status[status] = by_status.get(status, 0) + 1
            seen = seed.get("last_seen")
            # Timestamps are compared as plain strings.
            if seen and (last_validated is None or seen > last_validated):
                last_validated = seen

        return {
            "total": len(self._seeds),
            "by_category": by_category,
            "by_status": by_status,
            "last_validated": last_validated,
        }

    def list_seeds(self) -> list[dict]:
        """
        Return a snapshot of every seed (admin view).

        Each seed dict is shallow-copied, so adding or replacing keys on the
        returned dicts does not affect the manager's own state.
        """
        if not self._loaded:
            self.load()
        return [dict(s) for s in self._seeds]

    def _load_raw(self, strict: bool = False) -> dict:
        """
        Load the on-disk file structure (preserving category metadata).

        Args:
            strict: When False (default), an unparseable file is logged and
                an empty catalogue skeleton is returned. When True, the
                parse error is re-raised instead. Save paths use strict mode
                so that a temporarily unreadable file is never overwritten
                with an empty catalogue.

        Returns:
            The parsed file content, or a fresh empty skeleton when the file
            does not exist (or could not be parsed and ``strict`` is False).
        """
        if SEED_FILE.exists():
            try:
                return json.loads(SEED_FILE.read_text(encoding="utf-8"))
            except Exception as e:
                if strict:
                    raise
                logger.warning("Could not parse existing seed file: %s", e)
        return {
            "version": "1.0.0",
            "last_updated": datetime.now(timezone.utc).date().isoformat(),
            "description": "Curated list of known dark web addresses for VoidAccess intelligence seeding",
            "categories": {},
        }

    def _write_raw(self, data: dict) -> None:
        """
        Write the catalogue to SEED_FILE, replacing the old file in one step.

        The JSON is first written to a sibling temporary file which is then
        renamed over SEED_FILE, so an interrupted write cannot leave a
        truncated catalogue behind.
        """
        SEED_FILE.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = SEED_FILE.with_name(SEED_FILE.name + ".tmp")
        tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        tmp_path.replace(SEED_FILE)

    def _save_status_updates(self) -> None:
        """
        Persist status/last_seen changes for known seeds back to disk.

        The file is re-read first and only entries that still exist on disk
        are touched, so edits made to the file by others are preserved.
        Failures are logged, never raised.
        """
        try:
            data = self._load_raw(strict=True)
            categories = data.setdefault("categories", {})

            # Build a (category, url) → in-memory seed map
            updates = {(s.get("category"), s.get("url")): s for s in self._seeds}

            for cat_name, cat_data in categories.items():
                if not isinstance(cat_data, dict):
                    continue
                for seed in cat_data.get("seeds", []):
                    if not isinstance(seed, dict):
                        continue
                    in_mem = updates.get((cat_name, seed.get("url")))
                    if in_mem is None:
                        continue
                    if "status" in in_mem:
                        seed["status"] = in_mem["status"]
                    if "last_seen" in in_mem:
                        seed["last_seen"] = in_mem["last_seen"]

            data["last_updated"] = datetime.now(timezone.utc).date().isoformat()
            self._write_raw(data)
        except Exception as e:
            logger.error("Failed to save seed status updates: %s", e)

    def _save(self) -> None:
        """
        Save current seeds (including discovered ones) back to JSON.

        Every in-memory seed that is missing from its category bucket on
        disk is appended to that bucket (the bucket is created if needed),
        so seeds added under any category survive a restart. Entries already
        on disk are left untouched. Failures are logged, never raised.
        """
        try:
            data = self._load_raw(strict=True)
            categories = data.setdefault("categories", {})

            # URLs already present on disk, collected lazily per category.
            known_urls: dict[str, set] = {}

            for seed in self._seeds:
                url = seed.get("url")
                if not url:
                    continue
                category = seed.get("category") or "discovered"

                bucket = categories.get(category)
                if bucket is None:
                    bucket = categories[category] = {
                        "description": "Auto-discovered during investigations",
                        "tags": list(seed.get("category_tags") or [category]),
                        "seeds": [],
                    }
                entries = bucket.setdefault("seeds", [])

                if category not in known_urls:
                    known_urls[category] = {
                        entry.get("url") for entry in entries if isinstance(entry, dict)
                    }
                if url in known_urls[category]:
                    continue

                entries.append({
                    key: value
                    for key, value in seed.items()
                    if key not in self._DERIVED_KEYS
                })
                known_urls[category].add(url)

            data["last_updated"] = datetime.now(timezone.utc).date().isoformat()
            self._write_raw(data)
        except Exception as e:
            logger.error("Failed to save seeds: %s", e)
358
+
359
+
360
+ # ---------------------------------------------------------------------------
361
+ # Global singleton
362
+ # ---------------------------------------------------------------------------
363
+
364
# Lazily created shared instance; populated by get_seed_manager().
_seed_manager: Optional[SeedManager] = None


def get_seed_manager() -> SeedManager:
    """
    Return the process-wide SeedManager, loading on first access.

    The first call creates the manager and loads the seed file; every later
    call returns that same instance without reloading.
    """
    global _seed_manager
    # Fast path: the shared instance already exists.
    if _seed_manager is not None:
        return _seed_manager

    # Publish the instance before loading, then populate it from disk.
    _seed_manager = SeedManager()
    _seed_manager.load()
    return _seed_manager