voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/engines.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ sources/engines.py — Additional dark web search engines not in search.py.
3
+
4
+ search.py handles 16 engines via the legacy thread-pool path (public API
5
+ unchanged for ui.py compatibility). This module adds engines that need
6
+ special handling:
7
+
8
+ • DarkSearch — JSON REST API, paginated, optional API key
9
+ • OnionSearch — HTML scraping of Torch and Haystack onion search engines
10
+
11
+ Both go through the Tor SOCKS5 proxy (TOR_PROXY_HOST / TOR_PROXY_PORT).
12
+
13
+ Public API:
14
+ async def search_darksearch(query, pages=2) -> list[dict]
15
+ async def search_onionsearch(query) -> list[dict]
16
+
17
+ Each returns list[dict] with keys: title, url, snippet, source.
18
+ Empty list on any error — never raises.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+ import re
25
+ from typing import List
26
+ from urllib.parse import quote_plus
27
+
28
+ import aiohttp
29
+ from aiohttp_socks import ProxyConnector
30
+ from bs4 import BeautifulSoup
31
+
32
+ from config import DARKSEARCH_API_KEY, TOR_PROXY_HOST, TOR_PROXY_PORT
33
+
34
_logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# DarkSearch JSON search endpoint.
# HTTPS on purpose: search_darksearch() attaches DARKSEARCH_API_KEY as a
# Bearer token, and the request leaves the Tor network through an exit node.
# Over plain http both the token and the query would be readable there.
_DARKSEARCH_API = "https://darksearch.io/api/search"

# Torch and Haystack — specifically called out in concept.md; not in search.py.
# Each "url" is a template: the literal "{query}" marker is replaced with the
# URL-encoded search term by search_onionsearch().
_ONIONSEARCH_ENGINES = [
    {
        "name": "Torch",
        "url": (
            "http://torchdeedp3i2jigzjdmfpn5ttjhthh5wbmda2rr3jvqjg5p77c54dqd"
            ".onion/search?query={query}"
        ),
    },
    {
        "name": "Haystack",
        "url": (
            "http://haystak5njsmn2hqkewecpaxetahtwhsbsa64jom2k22z5afxhnpxfid"
            ".onion/?q={query}"
        ),
    },
]
59
+
60
# Timeout budget shared by every session in this module (aiohttp expresses
# these in seconds): 15 for establishing a connection, 45 between socket reads.
_TIMEOUT = aiohttp.ClientTimeout(sock_read=45, connect=15)

# An http(s) URL whose host is a bare 16-56 character base32 label followed by
# ".onion", plus whatever trails it up to whitespace, a quote or an angle
# bracket. Matching is case-insensitive.
_ONION_RE = re.compile(
    r"https?://[a-z2-7]{16,56}\.onion[^\s\"'<>]*",
    flags=re.IGNORECASE,
)
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Shared helpers
66
+ # ---------------------------------------------------------------------------
67
+
68
def _tor_connector() -> ProxyConnector:
    """Build a new connector that sends traffic through the Tor SOCKS5 proxy.

    The proxy address comes from TOR_PROXY_HOST / TOR_PROXY_PORT. ``rdns=True``
    is passed so that hostname resolution is delegated to the proxy
    (presumably required for .onion hosts — see aiohttp_socks docs).
    """
    proxy_url = f"socks5://{TOR_PROXY_HOST}:{TOR_PROXY_PORT}"
    return ProxyConnector.from_url(proxy_url, rdns=True)
73
+
74
+
75
def _ua() -> str:
    """Return the fixed desktop-Firefox User-Agent string used by this module."""
    product_tokens = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0)",
        "Gecko/20100101 Firefox/137.0",
    )
    return " ".join(product_tokens)
80
+
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # DarkSearch JSON API
84
+ # ---------------------------------------------------------------------------
85
+
86
def _darksearch_page_results(data: object) -> List[dict]:
    """Turn one decoded DarkSearch response page into normalised result dicts.

    Anything that does not have the expected shape (payload not a dict,
    ``data`` not a list, an entry that is not a dict, an entry without a
    link) is skipped instead of raising, so one bad entry cannot discard the
    rest of the page.
    """
    if not isinstance(data, dict):
        return []
    raw_items = data.get("data")
    if not isinstance(raw_items, list):
        return []

    page_results: List[dict] = []
    for item in raw_items:
        if not isinstance(item, dict):
            continue
        link = str(item.get("link") or "").strip()
        if not link:
            continue
        page_results.append(
            {
                "title": str(item.get("title") or "").strip(),
                "url": link,
                # Snippets are capped at 500 characters.
                "snippet": str(item.get("description") or "").strip()[:500],
                "source": "DarkSearch",
            }
        )
    return page_results


def _darksearch_last_page(data: object, current_page: int) -> int:
    """Return the ``last_page`` value of a response, or *current_page*.

    A missing, falsy or non-numeric ``last_page`` falls back to
    *current_page*, which makes the caller stop paginating.
    """
    raw_last = data.get("last_page") if isinstance(data, dict) else None
    if not raw_last:
        return current_page
    try:
        return int(raw_last)
    except (TypeError, ValueError):
        return current_page


async def search_darksearch(query: str, pages: int = 2) -> List[dict]:
    """
    Query the DarkSearch JSON API and return up to *pages* pages of results.

    Requests go through the Tor SOCKS5 proxy (see ``_tor_connector``).
    DARKSEARCH_API_KEY is sent as a Bearer ``Authorization`` header when
    configured.

    Args:
        query: Search term, sent as the ``query`` request parameter.
        pages: Maximum number of result pages to request, starting at page 1.

    Returns:
        list[dict] with keys: title, url, snippet, source. Pagination stops
        at the first non-200 response, network/decode error, or once the
        page reported as ``last_page`` has been read; results gathered up to
        that point are still returned. Returns [] when nothing was gathered.
    """
    # Imported locally so this function does not depend on where (or whether)
    # the module-level import of asyncio appears in the file.
    import asyncio

    results: List[dict] = []
    headers = {"User-Agent": _ua(), "Accept": "application/json"}
    if DARKSEARCH_API_KEY:
        headers["Authorization"] = f"Bearer {DARKSEARCH_API_KEY}"

    try:
        async with aiohttp.ClientSession(
            connector=_tor_connector(), timeout=_TIMEOUT
        ) as session:
            for page in range(1, pages + 1):
                params = {"query": query, "page": page}
                try:
                    async with session.get(
                        _DARKSEARCH_API, params=params, headers=headers
                    ) as resp:
                        if resp.status != 200:
                            _logger.debug(
                                "DarkSearch page %d returned HTTP %d",
                                page,
                                resp.status,
                            )
                            break
                        # content_type=None: decode the body as JSON whatever
                        # Content-Type the server declares.
                        data = await resp.json(content_type=None)
                except (
                    aiohttp.ClientError,
                    asyncio.TimeoutError,
                    ValueError,  # malformed JSON / undecodable body
                ) as exc:
                    _logger.debug("DarkSearch page %d error: %s", page, exc)
                    break

                results.extend(_darksearch_page_results(data))

                # Stop early if we've reached the last page
                if page >= _darksearch_last_page(data, page):
                    break
    except Exception as exc:
        # Boundary handler: this function must never raise to its caller.
        _logger.debug("DarkSearch session error: %s", exc)

    return results
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # OnionSearch HTML scraping (Torch + Haystack)
148
+ # ---------------------------------------------------------------------------
149
+
150
async def search_onionsearch(query: str) -> List[dict]:
    """
    Fetch the Torch and Haystack result pages over Tor and collect their links.

    For every engine in ``_ONIONSEARCH_ENGINES`` the URL template is filled
    with the URL-encoded *query*, the page is downloaded, and
    ``_parse_onion_links`` extracts the .onion links from it. The entries
    carry an empty ``snippet``.

    Returns list[dict] with keys: title, url, snippet, source, de-duplicated
    by URL. An engine that fails or answers with a non-200 status is skipped;
    results from the engines that worked are still returned.
    """
    collected: List[dict] = []
    term = quote_plus(query)

    try:
        async with aiohttp.ClientSession(
            connector=_tor_connector(), timeout=_TIMEOUT
        ) as session:
            for spec in _ONIONSEARCH_ENGINES:
                target = spec["url"].replace("{query}", term)
                engine_name = spec["name"]
                try:
                    async with session.get(
                        target, headers={"User-Agent": _ua()}
                    ) as resp:
                        if resp.status == 200:
                            # Undecodable bytes are replaced, not raised on.
                            body = await resp.text(errors="replace")
                            collected.extend(
                                _parse_onion_links(body, engine_name)
                            )
                        else:
                            _logger.debug(
                                "%s returned HTTP %d", engine_name, resp.status
                            )
                except Exception as exc:
                    # One broken engine must not stop the others.
                    _logger.debug("%s fetch error: %s", engine_name, exc)
    except Exception as exc:
        _logger.debug("OnionSearch session error: %s", exc)

    return _deduplicate(collected)
189
+
190
+
191
def _parse_onion_links(html: str, source_name: str) -> List[dict]:
    """
    Extract .onion links + anchor text from an HTML results page.

    Structured pass: every ``<a href>`` whose href starts with a URL matching
    ``_ONION_RE`` becomes one entry, titled with the anchor text. Anchors with
    fewer than 3 characters of text are skipped, as are URLs containing
    "search" (presumably the engine's own links — confirm).

    Falls back to regex extraction over the raw HTML if the structured pass
    yields nothing; fallback entries use the URL itself as title.

    Args:
        html: Raw HTML of a result page.
        source_name: Engine name stored in each entry's ``source`` key.

    Returns:
        list[dict] with keys: title, url, snippet (always ""), source.
        URLs are unique within the returned list.
    """
    items: List[dict] = []
    seen: set[str] = set()

    try:
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all("a", href=True):
            href = str(tag["href"]).strip()
            match = _ONION_RE.match(href)
            if not match:
                continue
            # Drop punctuation the greedy URL pattern may have swallowed.
            url = match.group(0).rstrip(".,;)'\"")
            if url in seen or "search" in url.lower():
                continue
            title = tag.get_text(strip=True)
            if len(title) < 3:
                continue
            seen.add(url)
            items.append(
                {"title": title, "url": url, "snippet": "", "source": source_name}
            )
    except Exception as exc:
        # Best-effort: keep what was collected so far, but leave a trace
        # instead of swallowing the failure silently.
        _logger.debug("%s HTML parse error: %s", source_name, exc)

    # Regex fallback when structured parsing yields nothing
    if not items:
        for url in _ONION_RE.findall(html):
            url = url.rstrip(".,;)'\"")
            if url not in seen and "search" not in url.lower():
                seen.add(url)
                items.append(
                    {"title": url, "url": url, "snippet": "", "source": source_name}
                )

    return items
231
+
232
+
233
def _deduplicate(results: List[dict]) -> List[dict]:
    """Return *results* with repeated URLs removed.

    The first entry seen for each ``"url"`` value is kept, and the original
    relative order is preserved. The input list is not modified.
    """
    first_by_url: dict = {}
    for entry in results:
        # setdefault only stores the entry when the URL is new.
        first_by_url.setdefault(entry["url"], entry)
    return list(first_by_url.values())
241
+
242
+
243
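+ # asyncio is used inside search_darksearch. The name is only looked up when that coroutine runs, so importing it at the bottom of the module works; no circular import is involved, and this could equally live with the top-of-file imports.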
244
+ import asyncio # noqa: E402