voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/seeds.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/seeds.py — Curated seed URL list for the recursive crawler.
|
|
3
|
+
|
|
4
|
+
SEED_URLS is a hardcoded list of known high-value .onion starting points —
|
|
5
|
+
forums, indexes, directories, and paste sites that are commonly accessible
|
|
6
|
+
and useful as entry points for threat-intelligence crawling.
|
|
7
|
+
|
|
8
|
+
These are starting points only: the crawler follows their links recursively.
|
|
9
|
+
None are assumed to have any particular content; they are known *link hubs*.
|
|
10
|
+
|
|
11
|
+
Addresses were current at time of writing (2025). .onion addresses change
|
|
12
|
+
frequently; the crawler handles unreachable seeds gracefully.
|
|
13
|
+
|
|
14
|
+
Public API:
|
|
15
|
+
SEED_URLS — full list of seed dicts
|
|
16
|
+
get_seeds(category, language, query) -> list[dict] — filtered view
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from typing import List, Optional
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Curated seed list (≥ 20 entries required by spec)
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Each entry: url, category, description, language
|
|
30
|
+
# category: "search" | "index" | "forum" | "paste" | "market_index"
|
|
31
|
+
# language: "en" | "ru" | "multi"
|
|
32
|
+
|
|
33
|
+
SEED_URLS: List[dict] = [
|
|
34
|
+
# ── Search engines ──────────────────────────────────────────────────────
|
|
35
|
+
{
|
|
36
|
+
"url": "http://torchdeedp3i2jigzjdmfpn5ttjhthh5wbmda2rr3jvqjg5p77c54dqd.onion",
|
|
37
|
+
"category": "search",
|
|
38
|
+
"description": "Torch — one of the oldest and largest dark web search engines",
|
|
39
|
+
"language": "en",
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"url": "http://haystak5njsmn2hqkewecpaxetahtwhsbsa64jom2k22z5afxhnpxfid.onion",
|
|
43
|
+
"category": "search",
|
|
44
|
+
"description": "Haystack — indexes millions of onion pages, fast results",
|
|
45
|
+
"language": "en",
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
"url": "http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion",
|
|
49
|
+
"category": "search",
|
|
50
|
+
"description": "DuckDuckGo official Tor hidden service — clearnet search over Tor",
|
|
51
|
+
"language": "en",
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"url": "http://darksearch7bvmqn2sp7gokxbz7gvx5sflhkblekdxs5pfxypufksgfyd.onion",
|
|
55
|
+
"category": "search",
|
|
56
|
+
"description": "DarkSearch — dark web search engine with JSON API",
|
|
57
|
+
"language": "en",
|
|
58
|
+
},
|
|
59
|
+
# ── Indexes / Directories ────────────────────────────────────────────────
|
|
60
|
+
{
|
|
61
|
+
"url": "http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf6otjiycgwqbym2qad.onion/wiki/index.php/Main_Page",
|
|
62
|
+
"category": "index",
|
|
63
|
+
"description": "The Hidden Wiki — primary community .onion link directory",
|
|
64
|
+
"language": "en",
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"url": "http://darkfailenbsdla5mal2mxn2uz66od5vtzd5qozslagrfzachha3f3id.onion",
|
|
68
|
+
"category": "index",
|
|
69
|
+
"description": "dark.fail — curated directory of verified, working onion sites",
|
|
70
|
+
"language": "en",
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"url": "http://danielas3rtn54uwmofdo3x2bsdifr47huasnmbgqzfrec5ubupvtpid.onion",
|
|
74
|
+
"category": "index",
|
|
75
|
+
"description": "Daniel's Hosting — index of hundreds of hosted onion services",
|
|
76
|
+
"language": "en",
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"url": "http://bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion",
|
|
80
|
+
"category": "index",
|
|
81
|
+
"description": "BBC News Tor mirror — official BBC onion service for censorship bypass",
|
|
82
|
+
"language": "en",
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"url": "http://p53lf57qovyuvwsc6xnrppyply3vtqm7l6pcobkmyqsiofyeznfu5uqd.onion",
|
|
86
|
+
"category": "index",
|
|
87
|
+
"description": "ProPublica Tor mirror — investigative journalism, primary source links",
|
|
88
|
+
"language": "en",
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"url": "http://sdolvtfhatvsysc6l34d65ymdwxcujausv7k5jk4cy5ttzhjoi6fzvyd.onion",
|
|
92
|
+
"category": "index",
|
|
93
|
+
"description": "SecureDrop directory — whistleblower submission platform index",
|
|
94
|
+
"language": "en",
|
|
95
|
+
},
|
|
96
|
+
# ── Forums ───────────────────────────────────────────────────────────────
|
|
97
|
+
{
|
|
98
|
+
"url": "http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion",
|
|
99
|
+
"category": "forum",
|
|
100
|
+
"description": "Dread — dark web Reddit equivalent, hub for market discussion and news",
|
|
101
|
+
"language": "en",
|
|
102
|
+
},
|
|
103
|
+
{
|
|
104
|
+
"url": "http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion",
|
|
105
|
+
"category": "forum",
|
|
106
|
+
"description": "Endchan — decentralized imageboard, uncensored discussion boards",
|
|
107
|
+
"language": "en",
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
"url": "http://4usoivrpy52lmc4mgn2h34cmfiltslesthr56yttv2pxudd3dapqciyd.onion",
|
|
111
|
+
"category": "forum",
|
|
112
|
+
"description": "8chan/8kun — decentralized anonymous forum, various topic boards",
|
|
113
|
+
"language": "en",
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
"url": "http://crpxfhcgaaqxnpqgcmgrk2uupxrjyqrlc3dnlrgidcjbpq5zxkafbvid.onion",
|
|
117
|
+
"category": "forum",
|
|
118
|
+
"description": "CryptBB — cybercrime forum focusing on hacking and exploit trading",
|
|
119
|
+
"language": "en",
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"url": "http://gg6zxtreajiijztyy5g6bt5o6l3qu32nrg7eulyemlnhbh6tl7r2vyad.onion",
|
|
123
|
+
"category": "forum",
|
|
124
|
+
"description": "XSS.is Tor mirror — Russian-language cybercrime and vulnerability forum",
|
|
125
|
+
"language": "ru",
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
"url": "http://exploitivzcm5dawzhe6c32bbylyggbjvh5dyvsvb5lkuz5ptmunkmqd.onion",
|
|
129
|
+
"category": "forum",
|
|
130
|
+
"description": "Exploit.in Tor mirror — Russian exploit marketplace and forum",
|
|
131
|
+
"language": "ru",
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
"url": "http://ransomwr3tsydeii.onion",
|
|
135
|
+
"category": "forum",
|
|
136
|
+
"description": "RansomWatch aggregator mirror — tracks ransomware group leak sites",
|
|
137
|
+
"language": "en",
|
|
138
|
+
},
|
|
139
|
+
# ── Paste sites ──────────────────────────────────────────────────────────
|
|
140
|
+
{
|
|
141
|
+
"url": "http://depastedihryjugl7sxhstlqjmqbedofrm3r5vynzw7rl7mwkv4zmcid.onion",
|
|
142
|
+
"category": "paste",
|
|
143
|
+
"description": "DeepPaste — dark web paste service, frequently used for leaks",
|
|
144
|
+
"language": "en",
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
"url": "http://zgjnkivynuasfwog7rkkphv5gdtyrcaxp4ihczgyuep2ulokhmuuduuqd.onion",
|
|
148
|
+
"category": "paste",
|
|
149
|
+
"description": "PrivateBin .onion instance — anonymous encrypted paste sharing",
|
|
150
|
+
"language": "en",
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"url": "http://protonirockerxow.onion",
|
|
154
|
+
"category": "paste",
|
|
155
|
+
"description": "ProtonMail Tor mirror — encrypted email, often linked to paste leaks",
|
|
156
|
+
"language": "multi",
|
|
157
|
+
},
|
|
158
|
+
# ── Market indexes (aggregators only — not markets themselves) ────────────
|
|
159
|
+
{
|
|
160
|
+
"url": "http://darknetlidvrsli6iso7my54rjayjursyw637aypb6qambkoepmyq2yd.onion",
|
|
161
|
+
"category": "market_index",
|
|
162
|
+
"description": "Darknet market index — lists active markets and their mirror links",
|
|
163
|
+
"language": "en",
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
"url": "http://dark2web.com.onion",
|
|
167
|
+
"category": "market_index",
|
|
168
|
+
"description": "Dark2Web — aggregator that reviews and indexes dark web markets",
|
|
169
|
+
"language": "en",
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"url": "http://dgdtaovql5oo7ait.onion",
|
|
173
|
+
"category": "market_index",
|
|
174
|
+
"description": "Tor Metrics onion — statistics on the Tor network and onion services",
|
|
175
|
+
"language": "en",
|
|
176
|
+
},
|
|
177
|
+
# ── Multi-language / Russian-language index ───────────────────────────────
|
|
178
|
+
{
|
|
179
|
+
"url": "http://rutorc6mqdinc4cz.onion",
|
|
180
|
+
"category": "index",
|
|
181
|
+
"description": "RuTor — Russian-language dark web link directory and index",
|
|
182
|
+
"language": "ru",
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
"url": "http://omgomgomg5j4yrr47fishp4rdwxkn3vkpbxbouys33ew74h6hq47qad.onion",
|
|
186
|
+
"category": "market_index",
|
|
187
|
+
"description": "OMG!OMG! market — large multi-language dark web marketplace index",
|
|
188
|
+
"language": "multi",
|
|
189
|
+
},
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
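# Query-aware topic seeds (intended: verified-stable .onion only; small curated set). NOTE: the 16-character hosts listed below are v2 .onion addresses, which current Tor releases no longer resolve — replace them with v3 (56-character) addresses.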
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
TOPIC_SEEDS: dict[str, List[dict]] = {
|
|
198
|
+
"bitcoin": [
|
|
199
|
+
{
|
|
200
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
201
|
+
"category": "index",
|
|
202
|
+
"language": "en",
|
|
203
|
+
"description": "dark.fail index — query-aware bitcoin/crypto seed",
|
|
204
|
+
},
|
|
205
|
+
],
|
|
206
|
+
"ransomware": [
|
|
207
|
+
{
|
|
208
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
209
|
+
"category": "index",
|
|
210
|
+
"language": "en",
|
|
211
|
+
"description": "dark.fail index — query-aware ransomware seed",
|
|
212
|
+
},
|
|
213
|
+
{
|
|
214
|
+
"url": "http://ransomwr3tsydeii.onion",
|
|
215
|
+
"category": "forum",
|
|
216
|
+
"language": "en",
|
|
217
|
+
"description": "RansomWatch — query-aware ransomware seed",
|
|
218
|
+
},
|
|
219
|
+
],
|
|
220
|
+
"malware": [
|
|
221
|
+
{
|
|
222
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
223
|
+
"category": "index",
|
|
224
|
+
"language": "en",
|
|
225
|
+
"description": "dark.fail index — query-aware malware seed",
|
|
226
|
+
},
|
|
227
|
+
],
|
|
228
|
+
"credentials": [
|
|
229
|
+
{
|
|
230
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
231
|
+
"category": "index",
|
|
232
|
+
"language": "en",
|
|
233
|
+
"description": "dark.fail index — query-aware credentials seed",
|
|
234
|
+
},
|
|
235
|
+
],
|
|
236
|
+
"drugs": [
|
|
237
|
+
{
|
|
238
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
239
|
+
"category": "index",
|
|
240
|
+
"language": "en",
|
|
241
|
+
"description": "dark.fail index — query-aware seed (limited)",
|
|
242
|
+
},
|
|
243
|
+
],
|
|
244
|
+
"hacking": [
|
|
245
|
+
{
|
|
246
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
247
|
+
"category": "index",
|
|
248
|
+
"language": "en",
|
|
249
|
+
"description": "dark.fail index — query-aware hacking seed",
|
|
250
|
+
},
|
|
251
|
+
],
|
|
252
|
+
"fraud": [
|
|
253
|
+
{
|
|
254
|
+
"url": "http://darkfailllnkf4vf.onion",
|
|
255
|
+
"category": "index",
|
|
256
|
+
"language": "en",
|
|
257
|
+
"description": "dark.fail index — query-aware fraud seed",
|
|
258
|
+
},
|
|
259
|
+
],
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
TOPIC_KEYWORDS: dict[str, List[str]] = {
    "bitcoin": [
        "bitcoin", "btc", "wallet", "crypto", "cryptocurrency", "blockchain",
    ],
    "ransomware": [
        "ransomware", "lockbit", "alphv", "blackcat", "conti", "revil", "ryuk",
        "extortion",
    ],
    "malware": [
        "malware", "rat", "trojan", "backdoor", "botnet", "rootkit", "keylogger",
        "stealer",
    ],
    "credentials": [
        "credentials", "password", "login", "account", "breach", "leak", "dump",
        "combo",
    ],
    "drugs": ["drug", "narcotic", "cannabis", "opioid"],
    "hacking": [
        "hacking", "exploit", "vulnerability", "cve", "0day", "zero-day", "shell",
        "access",
    ],
    "fraud": [
        "fraud", "carding", "cc", "credit card", "ssn", "identity", "fake",
        "counterfeit",
    ],
}

# Keywords of this length or shorter ("cc", "rat", "btc", "cve", "ssn") are
# matched as whole tokens only.  As plain substrings they fire inside ordinary
# words: "cc" occurs in "access"/"account", "rat" occurs in "operation".
_SHORT_KEYWORD_MAX_LEN = 3


def _keyword_in_query(keyword: str, query_lower: str) -> bool:
    """Return True when *keyword* occurs in the already lower-cased query.

    Longer keywords keep substring semantics, so inflected and compound forms
    ("drugs", "exploits", "infostealer") still match.  Short keywords must
    stand alone as a token, optionally followed by a plural "s" ("RATs").
    """
    if len(keyword) > _SHORT_KEYWORD_MAX_LEN:
        return keyword in query_lower

    # Function-scope import: this module does not import ``re`` at file level.
    import re

    pattern = rf"(?<![a-z0-9]){re.escape(keyword)}s?(?![a-z0-9])"
    return re.search(pattern, query_lower) is not None


def detect_query_topics(query: str) -> List[str]:
    """
    Analyze a query string and return the topic categories it mentions.

    Matching is case-insensitive.  Topics are returned in the order they are
    declared in TOPIC_KEYWORDS, not in the order they appear in the query.

    Args:
        query: free-text investigation query; an empty or None query yields [].

    Returns:
        A new list of TOPIC_KEYWORDS keys having at least one matching keyword.

    Examples:
        "lockbit ransomware bitcoin payments" → ["bitcoin", "ransomware"]
        "CVE-2024-1234 exploit kit"           → ["hacking"]
        "stolen credentials combo list"       → ["credentials"]
        "initial access broker"               → ["hacking"]   (no "cc" hit)
    """
    if not query:
        return []

    query_lower = query.lower()
    return [
        topic
        for topic, keywords in TOPIC_KEYWORDS.items()
        if any(_keyword_in_query(keyword, query_lower) for keyword in keywords)
    ]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ---------------------------------------------------------------------------
|
|
310
|
+
# Public filter function
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def get_seeds(
    category: Optional[str] = None,
    language: Optional[str] = None,
    query: Optional[str] = None,
) -> List[dict]:
    """
    Return the curated seed list, optionally filtered and topic-extended.

    Args:
        category: one of "search", "index", "forum", "paste", "market_index",
                  or None to return all categories.
        language: "en", "ru", "multi", or None to return all languages.
        query:    optional investigation query; when it matches topic keywords
                  (see detect_query_topics) the matching TOPIC_SEEDS entries
                  are appended after the curated seeds.

    The *category* and *language* filters apply to both the curated seeds and
    the topic-specific seeds.  Topic seeds whose URL is already present are
    skipped, so every URL appears at most once in the result.

    Returns:
        A new list of new dicts (shallow copies): neither SEED_URLS nor
        TOPIC_SEEDS can be mutated through the returned value.
    """

    def _passes_filters(seed: dict) -> bool:
        # A filter left as None accepts every seed.
        if category is not None and seed.get("category") != category:
            return False
        if language is not None and seed.get("language") != language:
            return False
        return True

    seeds = [dict(seed) for seed in SEED_URLS if _passes_filters(seed)]

    if not query:
        return seeds

    detected_topics = detect_query_topics(query)
    if not detected_topics:
        return seeds

    logger.info("Query topics detected: %s", detected_topics)

    # One "seen" set handles both duplicates between topics and duplicates
    # of seeds that are already in the curated result.
    seen_urls = {seed.get("url") for seed in seeds}
    new_seeds: List[dict] = []
    for topic in detected_topics:
        for seed in TOPIC_SEEDS.get(topic, []):
            url = seed.get("url")
            if not url or url in seen_urls or not _passes_filters(seed):
                continue
            seen_urls.add(url)
            new_seeds.append(dict(seed))

    logger.info(
        "Adding %d topic-specific seeds for: %s",
        len(new_seeds),
        detected_topics,
    )
    return seeds + new_seeds
|
sources/shodan.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/shodan.py — Shodan InternetDB integration for C2 infrastructure.
|
|
3
|
+
|
|
4
|
+
No API key required. Queries https://internetdb.shodan.io/{ip} for each
|
|
5
|
+
extracted IP_ADDRESS entity and returns open ports, vulnerabilities,
|
|
6
|
+
tags, and hostnames. Tags are used to flag high-confidence C2.
|
|
7
|
+
|
|
8
|
+
Rate-limited: max 1 request/second, max 50 IPs per investigation.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
import aiohttp
|
|
17
|
+
|
|
18
|
+
from config import MAX_IPS_PER_INVESTIGATION, SHODAN_RATE_LIMIT_DELAY
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_SHODAN_INTERNETDB = "https://internetdb.shodan.io"
|
|
23
|
+
|
|
24
|
+
_C2_TAGS = {"c2", "cobalt-strike", "metasploit", "malware"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def enrich_shodan_ip(ip_address: str, extracted_cves: set[str]) -> dict | None:
|
|
28
|
+
"""
|
|
29
|
+
Query Shodan InternetDB for *ip_address*.
|
|
30
|
+
|
|
31
|
+
Returns a dict with open_ports, vulns, tags, hostnames, and
|
|
32
|
+
high_confidence_c2 flag, or None on error / no data.
|
|
33
|
+
"""
|
|
34
|
+
try:
|
|
35
|
+
timeout = aiohttp.ClientTimeout(total=5)
|
|
36
|
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
37
|
+
async with session.get(f"{_SHODAN_INTERNETDB}/{ip_address}") as resp:
|
|
38
|
+
if resp.status == 404:
|
|
39
|
+
return None
|
|
40
|
+
if resp.status != 200:
|
|
41
|
+
logger.warning("Shodan InternetDB: HTTP %s for %s", resp.status, ip_address)
|
|
42
|
+
return None
|
|
43
|
+
data = await resp.json()
|
|
44
|
+
except asyncio.TimeoutError:
|
|
45
|
+
logger.warning("Shodan InternetDB: timeout for %s", ip_address)
|
|
46
|
+
return None
|
|
47
|
+
except Exception as e:
|
|
48
|
+
logger.warning("Shodan InternetDB: error for %s: %s", ip_address, e)
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
raw_tags = [t.lower() for t in data.get("tags") or []]
|
|
52
|
+
high_confidence_c2 = bool(raw_tags and set(raw_tags) & _C2_TAGS)
|
|
53
|
+
|
|
54
|
+
vulns = data.get("vulns") or {}
|
|
55
|
+
cve_set = set(vulns.keys())
|
|
56
|
+
correlated_cves = cve_set & extracted_cves if extracted_cves else set()
|
|
57
|
+
|
|
58
|
+
return {
|
|
59
|
+
"source": "shodan_internetdb",
|
|
60
|
+
"entity_type": "IP_ADDRESS",
|
|
61
|
+
"entity_value": ip_address,
|
|
62
|
+
"open_ports": data.get("ports") or [],
|
|
63
|
+
"vulns": list(cve_set),
|
|
64
|
+
"correlated_cves": list(correlated_cves),
|
|
65
|
+
"tags": raw_tags,
|
|
66
|
+
"hostnames": data.get("hostnames") or [],
|
|
67
|
+
"high_confidence_c2": high_confidence_c2,
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
async def enrich_shodan(entities: list[dict]) -> list[dict]:
|
|
72
|
+
"""
|
|
73
|
+
For each IP_ADDRESS entity in *entities*, query Shodan InternetDB.
|
|
74
|
+
|
|
75
|
+
Rate-limited to SHODAN_RATE_LIMIT_DELAY between requests.
|
|
76
|
+
Capped at MAX_IPS_PER_INVESTIGATION IPs.
|
|
77
|
+
"""
|
|
78
|
+
ip_entities = [
|
|
79
|
+
e for e in entities
|
|
80
|
+
if (e.get("type") or e.get("entity_type", "")) == "IP_ADDRESS"
|
|
81
|
+
and (e.get("value") or e.get("entity_value", ""))
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
extracted_cves: set[str] = {
|
|
85
|
+
e.get("value") or e.get("entity_value", "")
|
|
86
|
+
for e in entities
|
|
87
|
+
if (e.get("type") or e.get("entity_type", "")) == "CVE_NUMBER"
|
|
88
|
+
and (e.get("value") or e.get("entity_value", ""))
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
ips_to_query = [
|
|
92
|
+
ip_ent.get("value") or ip_ent.get("entity_value", "")
|
|
93
|
+
for ip_ent in ip_entities
|
|
94
|
+
][:MAX_IPS_PER_INVESTIGATION]
|
|
95
|
+
|
|
96
|
+
results: list[dict] = []
|
|
97
|
+
for ip in ips_to_query:
|
|
98
|
+
result = await enrich_shodan_ip(ip, extracted_cves)
|
|
99
|
+
if result is not None:
|
|
100
|
+
results.append(result)
|
|
101
|
+
await asyncio.sleep(SHODAN_RATE_LIMIT_DELAY)
|
|
102
|
+
|
|
103
|
+
return results
|
sources/telegram.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/telegram.py — Telegram public channel monitor via Telethon.
|
|
3
|
+
|
|
4
|
+
Telegram is clearnet (NOT routed through Tor) but carries enormous threat
|
|
5
|
+
actor activity in public groups and channels.
|
|
6
|
+
|
|
7
|
+
Credentials are loaded from config.py and treated as optional:
|
|
8
|
+
TELEGRAM_API_ID integer app id from my.telegram.org
|
|
9
|
+
TELEGRAM_API_HASH string hash from my.telegram.org
|
|
10
|
+
TELEGRAM_PHONE E.164 phone number (for initial interactive session auth)
|
|
11
|
+
|
|
12
|
+
If any credential is missing the function returns [] immediately with a
|
|
13
|
+
warning — Telegram is always optional and must never block the pipeline.
|
|
14
|
+
|
|
15
|
+
Initial session setup requires running an interactive auth once (Telethon
|
|
16
|
+
sends a verification code to the phone). Subsequent calls reuse the saved
|
|
17
|
+
session file ("voidaccess_telegram.session" in the working directory).
|
|
18
|
+
|
|
19
|
+
Public API:
|
|
20
|
+
async def fetch_telegram_messages(
|
|
21
|
+
channel_usernames, query, limit_per_channel=100
|
|
22
|
+
) -> list[dict]
|
|
23
|
+
|
|
24
|
+
Each result dict has: channel, message_id, text, date, url.
|
|
25
|
+
Matching messages are also persisted to the DB pages table.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import hashlib
|
|
31
|
+
import logging
|
|
32
|
+
from datetime import timezone
|
|
33
|
+
from typing import List, Optional
|
|
34
|
+
from urllib.parse import urlparse
|
|
35
|
+
|
|
36
|
+
_logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Lazy Telethon import — keeps the module importable even if telethon is not
|
|
40
|
+
# installed (tests can still mock it; real calls will fail with ImportError
|
|
41
|
+
# which is caught and returns []).
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
def _import_telethon():
|
|
45
|
+
"""Import Telethon; raise ImportError with a clear message if missing."""
|
|
46
|
+
try:
|
|
47
|
+
from telethon import TelegramClient
|
|
48
|
+
from telethon.errors import SessionPasswordNeededError
|
|
49
|
+
return TelegramClient, SessionPasswordNeededError
|
|
50
|
+
except ImportError as exc:
|
|
51
|
+
raise ImportError(
|
|
52
|
+
"telethon is required for Telegram integration. "
|
|
53
|
+
"Install it with: pip install telethon"
|
|
54
|
+
) from exc
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# Helpers
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
def _matches(text: str, query: str) -> bool:
|
|
62
|
+
"""Case-insensitive: every whitespace-separated query term must appear."""
|
|
63
|
+
text_lower = text.lower()
|
|
64
|
+
return all(term in text_lower for term in query.lower().split())
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _t_me_url(channel: str, message_id: int) -> str:
|
|
68
|
+
return f"https://t.me/{channel.lstrip('@')}/{message_id}"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _persist_message(url: str, text: str) -> None:
|
|
72
|
+
"""Write a matching Telegram message to the DB pages table. Silent on failure."""
|
|
73
|
+
try:
|
|
74
|
+
from config import DATABASE_URL as _db_url
|
|
75
|
+
if not _db_url:
|
|
76
|
+
return
|
|
77
|
+
from db.queries import create_page, get_page_by_hash
|
|
78
|
+
from db.session import get_session
|
|
79
|
+
except ImportError:
|
|
80
|
+
return
|
|
81
|
+
|
|
82
|
+
content_hash = hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
|
|
83
|
+
try:
|
|
84
|
+
with get_session() as session:
|
|
85
|
+
if get_page_by_hash(session, content_hash):
|
|
86
|
+
return
|
|
87
|
+
# Telegram messages have no .onion source — source_id stays None
|
|
88
|
+
create_page(
|
|
89
|
+
session,
|
|
90
|
+
url=url,
|
|
91
|
+
source_id=None,
|
|
92
|
+
cleaned_text=text,
|
|
93
|
+
raw_content_hash=content_hash,
|
|
94
|
+
byte_size=len(text.encode("utf-8", errors="replace")),
|
|
95
|
+
)
|
|
96
|
+
except Exception as exc:
|
|
97
|
+
_logger.debug("Telegram DB persist failed url=%s: %s", url, exc)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
# Public API
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
def _mask_phone(phone) -> str:
    """Return *phone* with all but its last four characters masked, for logging."""
    if not phone:
        return "<not set>"
    shown = str(phone)
    return "*" * max(len(shown) - 4, 0) + shown[-4:]


async def fetch_telegram_messages(
    channel_usernames: List[str],
    query: str,
    limit_per_channel: int = 100,
) -> List[dict]:
    """
    Fetch recent messages from public Telegram channels/groups and return
    those that keyword-match *query*.

    Args:
        channel_usernames: list of "@handle" or "username" strings.
        query:             investigation query; all space-separated terms
                           must appear in the message text.
        limit_per_channel: max messages to fetch per channel before filtering.

    Returns list[dict] with keys: channel, message_id, text, date, url.
    ``date`` is converted to UTC, or None when the message has no date.
    Each matching message is also handed to _persist_message.

    Returns [] (with a warning) when credentials are missing or invalid,
    Telethon is not installed, or the saved session is not authorized.
    Errors for a single channel are logged and that channel is skipped;
    the function never raises.
    """
    # Lazy import credentials here so config changes in tests propagate
    try:
        from config import TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_PHONE
    except ImportError:
        _logger.warning("config.py not importable; skipping Telegram.")
        return []

    if not TELEGRAM_API_ID or not TELEGRAM_API_HASH:
        _logger.warning(
            "TELEGRAM_API_ID and TELEGRAM_API_HASH are required for Telegram "
            "integration. Set them in .env and restart."
        )
        return []

    try:
        api_id = int(TELEGRAM_API_ID)
    except (ValueError, TypeError):
        _logger.warning("TELEGRAM_API_ID must be an integer. Skipping Telegram.")
        return []

    try:
        TelegramClient, _ = _import_telethon()
    except ImportError as exc:
        _logger.warning("%s", exc)
        return []

    results: List[dict] = []

    try:
        # "voidaccess_telegram" names the on-disk session file
        # (voidaccess_telegram.session) that stores the one-time interactive auth.
        client = TelegramClient("voidaccess_telegram", api_id, TELEGRAM_API_HASH)

        # Use connect() rather than "async with client": entering the context
        # manager calls start(), which prompts on stdin for a phone number and
        # login code when the session is unauthorized and would block the
        # pipeline before the authorization check below could run.
        await client.connect()
        try:
            if not await client.is_user_authorized():
                _logger.warning(
                    "Telegram session not authorized. Run interactive auth once: "
                    "the client will send a code to TELEGRAM_PHONE=%s",
                    _mask_phone(TELEGRAM_PHONE),
                )
                return []

            for raw_channel in channel_usernames:
                try:
                    channel = raw_channel.lstrip("@")
                    if not channel:
                        continue

                    async for msg in client.iter_messages(
                        channel, limit=limit_per_channel
                    ):
                        text = msg.text or ""
                        if not text or not _matches(text, query):
                            continue

                        url = _t_me_url(channel, msg.id)
                        date = (
                            msg.date.astimezone(timezone.utc)
                            if msg.date
                            else None
                        )

                        results.append(
                            {
                                "channel": channel,
                                "message_id": msg.id,
                                "text": text,
                                "date": date,
                                "url": url,
                            }
                        )
                        _persist_message(url, text)

                except Exception as exc:
                    # One unreachable or invalid channel must not stop the rest.
                    _logger.warning(
                        "Telegram channel %s fetch failed: %s", raw_channel, exc
                    )
        finally:
            await client.disconnect()

    except Exception as exc:
        _logger.warning("Telegram client error: %s", exc)

    return results
|