voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/seeds.py ADDED
@@ -0,0 +1,368 @@
1
+ """
2
+ sources/seeds.py — Curated seed URL list for the recursive crawler.
3
+
4
+ SEED_URLS is a hardcoded list of known high-value .onion starting points —
5
+ forums, indexes, directories, and paste sites that are commonly accessible
6
+ and useful as entry points for threat-intelligence crawling.
7
+
8
+ These are starting points only: the crawler follows their links recursively.
9
+ None are assumed to have any particular content; they are known *link hubs*.
10
+
11
+ Addresses were current at time of writing (2025). .onion addresses change
12
+ frequently; the crawler handles unreachable seeds gracefully.
13
+
14
+ Public API:
15
+ SEED_URLS — full list of seed dicts
16
+ get_seeds(category, language, query) -> list[dict] — filtered view
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import logging
22
+ from typing import List, Optional
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Curated seed list (≥ 20 entries required by spec)
28
+ # ---------------------------------------------------------------------------
29
+ # Each entry: url, category, description, language
30
+ # category: "search" | "index" | "forum" | "paste" | "market_index"
31
+ # language: "en" | "ru" | "multi"
32
+
33
# Static list of crawler entry points. Every entry is a dict with exactly the
# keys "url", "category", "description" and "language"; get_seeds() filters on
# "category" and "language" and de-duplicates on "url".
#
# NOTE(review): v3 onion hostnames are 56 base32 characters long. Several
# entries below use 16-character (v2-format) hostnames; v2 onion services are
# no longer supported by current Tor releases, so those seeds are presumably
# unreachable — confirm and replace them with v3 addresses.
SEED_URLS: List[dict] = [
    # ── Search engines ──────────────────────────────────────────────────────
    {
        "url": "http://torchdeedp3i2jigzjdmfpn5ttjhthh5wbmda2rr3jvqjg5p77c54dqd.onion",
        "category": "search",
        "description": "Torch — one of the oldest and largest dark web search engines",
        "language": "en",
    },
    {
        "url": "http://haystak5njsmn2hqkewecpaxetahtwhsbsa64jom2k22z5afxhnpxfid.onion",
        "category": "search",
        "description": "Haystack — indexes millions of onion pages, fast results",
        "language": "en",
    },
    {
        "url": "http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion",
        "category": "search",
        "description": "DuckDuckGo official Tor hidden service — clearnet search over Tor",
        "language": "en",
    },
    {
        "url": "http://darksearch7bvmqn2sp7gokxbz7gvx5sflhkblekdxs5pfxypufksgfyd.onion",
        "category": "search",
        "description": "DarkSearch — dark web search engine with JSON API",
        "language": "en",
    },
    # ── Indexes / Directories ────────────────────────────────────────────────
    {
        "url": "http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf6otjiycgwqbym2qad.onion/wiki/index.php/Main_Page",
        "category": "index",
        "description": "The Hidden Wiki — primary community .onion link directory",
        "language": "en",
    },
    {
        "url": "http://darkfailenbsdla5mal2mxn2uz66od5vtzd5qozslagrfzachha3f3id.onion",
        "category": "index",
        "description": "dark.fail — curated directory of verified, working onion sites",
        "language": "en",
    },
    {
        "url": "http://danielas3rtn54uwmofdo3x2bsdifr47huasnmbgqzfrec5ubupvtpid.onion",
        "category": "index",
        "description": "Daniel's Hosting — index of hundreds of hosted onion services",
        "language": "en",
    },
    # NOTE(review): the next two entries are news mirrors filed under "index";
    # confirm they are intended as link hubs.
    {
        "url": "http://bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion",
        "category": "index",
        "description": "BBC News Tor mirror — official BBC onion service for censorship bypass",
        "language": "en",
    },
    {
        "url": "http://p53lf57qovyuvwsc6xnrppyply3vtqm7l6pcobkmyqsiofyeznfu5uqd.onion",
        "category": "index",
        "description": "ProPublica Tor mirror — investigative journalism, primary source links",
        "language": "en",
    },
    {
        "url": "http://sdolvtfhatvsysc6l34d65ymdwxcujausv7k5jk4cy5ttzhjoi6fzvyd.onion",
        "category": "index",
        "description": "SecureDrop directory — whistleblower submission platform index",
        "language": "en",
    },
    # ── Forums ───────────────────────────────────────────────────────────────
    {
        "url": "http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion",
        "category": "forum",
        "description": "Dread — dark web Reddit equivalent, hub for market discussion and news",
        "language": "en",
    },
    {
        "url": "http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion",
        "category": "forum",
        "description": "Endchan — decentralized imageboard, uncensored discussion boards",
        "language": "en",
    },
    {
        "url": "http://4usoivrpy52lmc4mgn2h34cmfiltslesthr56yttv2pxudd3dapqciyd.onion",
        "category": "forum",
        "description": "8chan/8kun — decentralized anonymous forum, various topic boards",
        "language": "en",
    },
    {
        "url": "http://crpxfhcgaaqxnpqgcmgrk2uupxrjyqrlc3dnlrgidcjbpq5zxkafbvid.onion",
        "category": "forum",
        "description": "CryptBB — cybercrime forum focusing on hacking and exploit trading",
        "language": "en",
    },
    {
        "url": "http://gg6zxtreajiijztyy5g6bt5o6l3qu32nrg7eulyemlnhbh6tl7r2vyad.onion",
        "category": "forum",
        "description": "XSS.is Tor mirror — Russian-language cybercrime and vulnerability forum",
        "language": "ru",
    },
    {
        "url": "http://exploitivzcm5dawzhe6c32bbylyggbjvh5dyvsvb5lkuz5ptmunkmqd.onion",
        "category": "forum",
        "description": "Exploit.in Tor mirror — Russian exploit marketplace and forum",
        "language": "ru",
    },
    {
        # NOTE(review): 16-character (v2-format) hostname — presumably
        # unreachable; also described as an aggregator but categorised "forum".
        "url": "http://ransomwr3tsydeii.onion",
        "category": "forum",
        "description": "RansomWatch aggregator mirror — tracks ransomware group leak sites",
        "language": "en",
    },
    # ── Paste sites ──────────────────────────────────────────────────────────
    {
        "url": "http://depastedihryjugl7sxhstlqjmqbedofrm3r5vynzw7rl7mwkv4zmcid.onion",
        "category": "paste",
        "description": "DeepPaste — dark web paste service, frequently used for leaks",
        "language": "en",
    },
    {
        "url": "http://zgjnkivynuasfwog7rkkphv5gdtyrcaxp4ihczgyuep2ulokhmuuduuqd.onion",
        "category": "paste",
        "description": "PrivateBin .onion instance — anonymous encrypted paste sharing",
        "language": "en",
    },
    {
        # NOTE(review): 16-character (v2-format) hostname — presumably
        # unreachable; the description is an e-mail service, not a paste site.
        "url": "http://protonirockerxow.onion",
        "category": "paste",
        "description": "ProtonMail Tor mirror — encrypted email, often linked to paste leaks",
        "language": "multi",
    },
    # ── Market indexes (aggregators only — not markets themselves) ────────────
    {
        "url": "http://darknetlidvrsli6iso7my54rjayjursyw637aypb6qambkoepmyq2yd.onion",
        "category": "market_index",
        "description": "Darknet market index — lists active markets and their mirror links",
        "language": "en",
    },
    {
        # NOTE(review): "dark2web.com.onion" is not a base32 onion hostname of
        # either length — looks like a placeholder; verify or remove.
        "url": "http://dark2web.com.onion",
        "category": "market_index",
        "description": "Dark2Web — aggregator that reviews and indexes dark web markets",
        "language": "en",
    },
    {
        # NOTE(review): 16-character (v2-format) hostname — presumably
        # unreachable; described as network statistics, yet categorised
        # "market_index".
        "url": "http://dgdtaovql5oo7ait.onion",
        "category": "market_index",
        "description": "Tor Metrics onion — statistics on the Tor network and onion services",
        "language": "en",
    },
    # ── Multi-language / Russian-language index ───────────────────────────────
    {
        # NOTE(review): 16-character (v2-format) hostname — presumably unreachable.
        "url": "http://rutorc6mqdinc4cz.onion",
        "category": "index",
        "description": "RuTor — Russian-language dark web link directory and index",
        "language": "ru",
    },
    {
        "url": "http://omgomgomg5j4yrr47fishp4rdwxkn3vkpbxbouys33ew74h6hq47qad.onion",
        "category": "market_index",
        "description": "OMG!OMG! market — large multi-language dark web marketplace index",
        "language": "multi",
    },
]
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # Query-aware topic seeds (verified-stable .onion only; small curated set)
195
+ # ---------------------------------------------------------------------------
196
+
197
# Extra seeds keyed by topic name. The keys mirror those of TOPIC_KEYWORDS;
# get_seeds() appends a topic's entries when detect_query_topics() reports that
# topic for the caller's query. Entries use the same four keys as SEED_URLS.
#
# NOTE(review): every topic currently lists the same 16-character (v2-format)
# host "darkfailllnkf4vf.onion", and "ransomware" adds a second v2-format host.
# v2 onion services are no longer supported by current Tor releases, so these
# seeds are presumably unreachable. SEED_URLS already carries a 56-character
# dark.fail address — confirm and switch the topic seeds to v3 addresses.
TOPIC_SEEDS: dict[str, List[dict]] = {
    "bitcoin": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware bitcoin/crypto seed",
        },
    ],
    "ransomware": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware ransomware seed",
        },
        {
            # Same URL as the RansomWatch entry in SEED_URLS, so get_seeds()
            # drops it as a duplicate whenever that entry survives filtering.
            "url": "http://ransomwr3tsydeii.onion",
            "category": "forum",
            "language": "en",
            "description": "RansomWatch — query-aware ransomware seed",
        },
    ],
    "malware": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware malware seed",
        },
    ],
    "credentials": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware credentials seed",
        },
    ],
    "drugs": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware seed (limited)",
        },
    ],
    "hacking": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware hacking seed",
        },
    ],
    "fraud": [
        {
            "url": "http://darkfailllnkf4vf.onion",
            "category": "index",
            "language": "en",
            "description": "dark.fail index — query-aware fraud seed",
        },
    ],
}
261
+
262
# Lower-case trigger keywords per topic. Dict order is significant:
# detect_query_topics() reports topics in this order.
TOPIC_KEYWORDS: dict[str, List[str]] = {
    "bitcoin": [
        "bitcoin", "btc", "wallet", "crypto", "cryptocurrency", "blockchain",
    ],
    "ransomware": [
        "ransomware", "lockbit", "alphv", "blackcat", "conti", "revil", "ryuk",
        "extortion",
    ],
    "malware": [
        "malware", "rat", "trojan", "backdoor", "botnet", "rootkit", "keylogger",
        "stealer",
    ],
    "credentials": [
        "credentials", "password", "login", "account", "breach", "leak", "dump",
        "combo",
    ],
    "drugs": ["drug", "narcotic", "cannabis", "opioid"],
    "hacking": [
        "hacking", "exploit", "vulnerability", "cve", "0day", "zero-day", "shell",
        "access",
    ],
    "fraud": [
        "fraud", "carding", "cc", "credit card", "ssn", "identity", "fake",
        "counterfeit",
    ],
}

# Keywords of at most this many characters ("rat", "cc", "cve", "btc", "ssn")
# must match as a whole word; longer keywords only need to start a word.
_SHORT_KEYWORD_MAX_LEN = 3


def _keyword_in_query(keyword: str, query_lower: str) -> bool:
    """Return True when *keyword* occurs in *query_lower* at a word start.

    A plain substring test tagged "corporate strategy" as malware (via "rat")
    and every "access" query as fraud (via "cc"). Matching is therefore
    anchored on ASCII alphanumeric boundaries:

    * the keyword must not be preceded by a letter or digit;
    * short keywords must additionally end the word (an optional plural
      "s" is tolerated), while longer keywords may be followed by a suffix
      so that "leaked", "drugs" or "exploits" still match.
    """
    import re  # local import: `re` is not among this module's top-level imports

    pattern = r"(?<![a-z0-9])" + re.escape(keyword)
    if len(keyword) <= _SHORT_KEYWORD_MAX_LEN:
        pattern += r"s?(?![a-z0-9])"
    return re.search(pattern, query_lower) is not None


def detect_query_topics(query: str) -> List[str]:
    """
    Analyze a query string and return relevant topic categories.

    Matching is case-insensitive and word-boundary aware (see
    ``_keyword_in_query``). Topics are returned in TOPIC_KEYWORDS order, each
    at most once. An empty or missing query yields an empty list.

    Examples:
        "lockbit ransomware bitcoin payments" → ["bitcoin", "ransomware"]
        "CVE-2024-1234 exploit kit"           → ["hacking"]
        "stolen credentials combo list"       → ["credentials"]
        "corporate strategy"                  → []
    """
    if not query:
        return []

    query_lower = query.lower()
    return [
        topic
        for topic, keywords in TOPIC_KEYWORDS.items()
        if any(_keyword_in_query(keyword, query_lower) for keyword in keywords)
    ]
307
+
308
+
309
+ # ---------------------------------------------------------------------------
310
+ # Public filter function
311
+ # ---------------------------------------------------------------------------
312
+
313
+
314
def get_seeds(
    category: Optional[str] = None,
    language: Optional[str] = None,
    query: Optional[str] = None,
) -> List[dict]:
    """
    Return the curated seed list, optionally filtered by *category* and/or *language*.

    Args:
        category: one of "search", "index", "forum", "paste", "market_index",
            or None to return all categories.
        language: "en", "ru", "multi", or None to return all languages.
        query: optional investigation query; adds topic-specific seeds when
            keywords match. Topic seeds are subject to the same *category*
            and *language* filters as the base list and are de-duplicated by
            URL, both among themselves and against the base list.

    Returns:
        A new list of new dicts: neither SEED_URLS / TOPIC_SEEDS nor their
        entries are shared with the caller, so the result may be mutated
        freely.
    """

    def _wanted(seed: dict) -> bool:
        # A filter set to None accepts everything.
        if category is not None and seed.get("category") != category:
            return False
        if language is not None and seed.get("language") != language:
            return False
        return True

    seeds = [seed for seed in SEED_URLS if _wanted(seed)]

    if query:
        detected_topics = detect_query_topics(query)
        if detected_topics:
            # Log calls are kept at WARNING to preserve the existing log output.
            logger.warning("Query topics detected: %s", detected_topics)

            seen_urls = {seed.get("url") for seed in seeds}
            new_seeds: List[dict] = []
            for topic in detected_topics:
                for seed in TOPIC_SEEDS.get(topic, []):
                    url = seed.get("url")
                    # Previously only the category filter was applied here, so
                    # e.g. language="ru" still returned English topic seeds.
                    if not url or url in seen_urls or not _wanted(seed):
                        continue
                    seen_urls.add(url)
                    new_seeds.append(seed)

            logger.warning(
                "Adding %d topic-specific seeds for: %s",
                len(new_seeds),
                detected_topics,
            )
            seeds.extend(new_seeds)

    # Copy each entry so callers cannot mutate the module-level seed dicts.
    return [dict(seed) for seed in seeds]
sources/shodan.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ sources/shodan.py — Shodan InternetDB integration for C2 infrastructure.
3
+
4
+ No API key required. Queries https://internetdb.shodan.io/{ip} for each
5
+ extracted IP_ADDRESS entity and returns open ports, vulnerabilities,
6
+ tags, and hostnames. Tags are used to flag high-confidence C2.
7
+
8
+ Rate-limited: max 1 request/second, max 50 IPs per investigation.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import logging
15
+
16
+ import aiohttp
17
+
18
+ from config import MAX_IPS_PER_INVESTIGATION, SHODAN_RATE_LIMIT_DELAY
19
+
20
logger = logging.getLogger(__name__)

_SHODAN_INTERNETDB = "https://internetdb.shodan.io"

# Tags that mark a host as likely command-and-control infrastructure.
_C2_TAGS = {"c2", "cobalt-strike", "metasploit", "malware"}


async def enrich_shodan_ip(ip_address: str, extracted_cves: set[str]) -> dict | None:
    """
    Query Shodan InternetDB for *ip_address*.

    Args:
        ip_address: IPv4/IPv6 literal. Anything that does not parse as an IP
            address is rejected without a network request, because the value
            comes from extracted (untrusted) content and is placed in the
            request path.
        extracted_cves: CVE identifiers found elsewhere in the investigation;
            used to compute ``correlated_cves`` (compared case-insensitively).

    Returns:
        A dict with ``source``, ``entity_type``, ``entity_value``,
        ``open_ports``, ``vulns``, ``correlated_cves``, ``tags``,
        ``hostnames`` and the ``high_confidence_c2`` flag, or None on an
        invalid address, HTTP error, timeout, or unusable payload.
        ``vulns`` and ``correlated_cves`` are sorted for stable output.
    """
    import ipaddress  # local import: not part of this module's top-level imports

    try:
        lookup_ip = str(ipaddress.ip_address(str(ip_address).strip()))
    except ValueError:
        logger.debug("Shodan InternetDB: skipping invalid IP %r", ip_address)
        return None

    try:
        timeout = aiohttp.ClientTimeout(total=5)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(f"{_SHODAN_INTERNETDB}/{lookup_ip}") as resp:
                if resp.status == 404:
                    # InternetDB has no record for this address.
                    return None
                if resp.status != 200:
                    logger.warning("Shodan InternetDB: HTTP %s for %s", resp.status, ip_address)
                    return None
                data = await resp.json()
    except asyncio.TimeoutError:
        logger.warning("Shodan InternetDB: timeout for %s", ip_address)
        return None
    except Exception as e:
        logger.warning("Shodan InternetDB: error for %s: %s", ip_address, e)
        return None

    if not isinstance(data, dict):
        logger.warning("Shodan InternetDB: unexpected payload for %s", ip_address)
        return None

    raw_tags = [str(t).lower() for t in data.get("tags") or []]
    high_confidence_c2 = bool(_C2_TAGS.intersection(raw_tags))

    # InternetDB returns "vulns" as a JSON array of CVE ids. The previous
    # `vulns.keys()` raised AttributeError for every host that had any.
    # Iterating works for a list and, defensively, for a mapping's keys too.
    cve_set = {str(v) for v in (data.get("vulns") or [])}
    wanted = {c.upper() for c in (extracted_cves or ()) if isinstance(c, str)}
    correlated_cves = {c for c in cve_set if c.upper() in wanted}

    return {
        "source": "shodan_internetdb",
        "entity_type": "IP_ADDRESS",
        "entity_value": ip_address,
        "open_ports": data.get("ports") or [],
        "vulns": sorted(cve_set),
        "correlated_cves": sorted(correlated_cves),
        "tags": raw_tags,
        "hostnames": data.get("hostnames") or [],
        "high_confidence_c2": high_confidence_c2,
    }
69
+
70
+
71
async def enrich_shodan(entities: list[dict]) -> list[dict]:
    """
    For each IP_ADDRESS entity in *entities*, query Shodan InternetDB.

    Entities may use either the ``type``/``value`` or the
    ``entity_type``/``entity_value`` key pair. CVE_NUMBER entities are
    collected and passed along so each IP result can report correlated CVEs.

    Duplicate IPs are queried once (first occurrence wins) so they do not
    use up the MAX_IPS_PER_INVESTIGATION cap. Requests are spaced
    SHODAN_RATE_LIMIT_DELAY seconds apart; no delay follows the final request.

    Returns:
        The non-None results of ``enrich_shodan_ip`` in query order.
    """

    def _kind(entity: dict) -> str:
        return entity.get("type") or entity.get("entity_type", "")

    def _value(entity: dict) -> str:
        return entity.get("value") or entity.get("entity_value", "")

    entities = entities or []

    extracted_cves: set[str] = {
        _value(e) for e in entities if _kind(e) == "CVE_NUMBER" and _value(e)
    }

    # dict.fromkeys() de-duplicates while preserving first-seen order.
    unique_ips = list(
        dict.fromkeys(
            _value(e) for e in entities if _kind(e) == "IP_ADDRESS" and _value(e)
        )
    )
    ips_to_query = unique_ips[:MAX_IPS_PER_INVESTIGATION]

    results: list[dict] = []
    for index, ip in enumerate(ips_to_query):
        if index:
            # Rate-limit between consecutive requests only.
            await asyncio.sleep(SHODAN_RATE_LIMIT_DELAY)
        result = await enrich_shodan_ip(ip, extracted_cves)
        if result is not None:
            results.append(result)

    return results
sources/telegram.py ADDED
@@ -0,0 +1,199 @@
1
+ """
2
+ sources/telegram.py — Telegram public channel monitor via Telethon.
3
+
4
+ Telegram is clearnet (NOT routed through Tor) but carries enormous threat
5
+ actor activity in public groups and channels.
6
+
7
+ Credentials are loaded from config.py and treated as optional:
8
+ TELEGRAM_API_ID integer app id from my.telegram.org
9
+ TELEGRAM_API_HASH string hash from my.telegram.org
10
+ TELEGRAM_PHONE E.164 phone number (for initial interactive session auth)
11
+
12
+ If any credential is missing the function returns [] immediately with a
13
+ warning — Telegram is always optional and must never block the pipeline.
14
+
15
+ Initial session setup requires running an interactive auth once (Telethon
16
+ sends a verification code to the phone). Subsequent calls reuse the saved
17
+ session file ("voidaccess_telegram.session" in the working directory).
18
+
19
+ Public API:
20
+ async def fetch_telegram_messages(
21
+ channel_usernames, query, limit_per_channel=100
22
+ ) -> list[dict]
23
+
24
+ Each result dict has: channel, message_id, text, date, url.
25
+ Matching messages are also persisted to the DB pages table.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import hashlib
31
+ import logging
32
+ from datetime import timezone
33
+ from typing import List, Optional
34
+ from urllib.parse import urlparse
35
+
36
+ _logger = logging.getLogger(__name__)
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Lazy Telethon import — keeps the module importable even if telethon is not
40
+ # installed (tests can still mock it; real calls will fail with ImportError
41
+ # which is caught and returns []).
42
+ # ---------------------------------------------------------------------------
43
+
44
+ def _import_telethon():
45
+ """Import Telethon; raise ImportError with a clear message if missing."""
46
+ try:
47
+ from telethon import TelegramClient
48
+ from telethon.errors import SessionPasswordNeededError
49
+ return TelegramClient, SessionPasswordNeededError
50
+ except ImportError as exc:
51
+ raise ImportError(
52
+ "telethon is required for Telegram integration. "
53
+ "Install it with: pip install telethon"
54
+ ) from exc
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Helpers
59
+ # ---------------------------------------------------------------------------
60
+
61
+ def _matches(text: str, query: str) -> bool:
62
+ """Case-insensitive: every whitespace-separated query term must appear."""
63
+ text_lower = text.lower()
64
+ return all(term in text_lower for term in query.lower().split())
65
+
66
+
67
+ def _t_me_url(channel: str, message_id: int) -> str:
68
+ return f"https://t.me/{channel.lstrip('@')}/{message_id}"
69
+
70
+
71
+ def _persist_message(url: str, text: str) -> None:
72
+ """Write a matching Telegram message to the DB pages table. Silent on failure."""
73
+ try:
74
+ from config import DATABASE_URL as _db_url
75
+ if not _db_url:
76
+ return
77
+ from db.queries import create_page, get_page_by_hash
78
+ from db.session import get_session
79
+ except ImportError:
80
+ return
81
+
82
+ content_hash = hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
83
+ try:
84
+ with get_session() as session:
85
+ if get_page_by_hash(session, content_hash):
86
+ return
87
+ # Telegram messages have no .onion source — source_id stays None
88
+ create_page(
89
+ session,
90
+ url=url,
91
+ source_id=None,
92
+ cleaned_text=text,
93
+ raw_content_hash=content_hash,
94
+ byte_size=len(text.encode("utf-8", errors="replace")),
95
+ )
96
+ except Exception as exc:
97
+ _logger.debug("Telegram DB persist failed url=%s: %s", url, exc)
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Public API
102
+ # ---------------------------------------------------------------------------
103
+
104
async def fetch_telegram_messages(
    channel_usernames: List[str],
    query: str,
    limit_per_channel: int = 100,
) -> List[dict]:
    """
    Fetch recent messages from public Telegram channels/groups and return
    those that keyword-match *query*.

    Args:
        channel_usernames: list of "@handle" or "username" strings. Empty
                           names are skipped.
        query:             investigation query; all space-separated terms
                           must appear in the message text.
        limit_per_channel: max messages to fetch per channel before filtering.

    Returns list[dict] with keys: channel, message_id, text, date, url.
    Each returned message is also handed to ``_persist_message``.
    Returns [] immediately (with a warning) when credentials are not set,
    Telethon is missing, or the saved session is not authorized.
    Telethon errors per channel are logged and skipped; the function never
    raises.
    """
    # Lazy import credentials here so config changes in tests propagate
    try:
        from config import TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_PHONE
    except ImportError:
        _logger.warning("config.py not importable; skipping Telegram.")
        return []

    if not TELEGRAM_API_ID or not TELEGRAM_API_HASH:
        _logger.warning(
            "TELEGRAM_API_ID and TELEGRAM_API_HASH are required for Telegram "
            "integration. Set them in .env and restart."
        )
        return []

    try:
        api_id = int(TELEGRAM_API_ID)
    except (ValueError, TypeError):
        _logger.warning("TELEGRAM_API_ID must be an integer. Skipping Telegram.")
        return []

    try:
        TelegramClient, SessionPasswordNeededError = _import_telethon()
    except ImportError as exc:
        _logger.warning("%s", exc)
        return []

    results: List[dict] = []
    client = None

    try:
        # "voidaccess_telegram" is the session name; Telethon stores it as
        # voidaccess_telegram.session so later runs reuse the saved login.
        client = TelegramClient("voidaccess_telegram", api_id, TELEGRAM_API_HASH)

        # Connect explicitly instead of `async with client`: the context
        # manager calls start(), which prompts on stdin for a phone number and
        # code when the session is unauthorized. That would stall the pipeline
        # and made the authorization check below unreachable in that state.
        await client.connect()

        if not await client.is_user_authorized():
            _logger.warning(
                "Telegram session not authorized. Run interactive auth once: "
                "the client will send a code to TELEGRAM_PHONE=%s",
                TELEGRAM_PHONE or "<not set>",
            )
            return results

        for raw_channel in channel_usernames or []:
            channel = raw_channel.lstrip("@")
            if not channel:
                continue
            try:
                async for msg in client.iter_messages(
                    channel, limit=limit_per_channel
                ):
                    text = msg.text or ""
                    if not text or not _matches(text, query):
                        continue

                    url = _t_me_url(channel, msg.id)
                    date = (
                        msg.date.astimezone(timezone.utc)
                        if msg.date
                        else None
                    )

                    results.append(
                        {
                            "channel": channel,
                            "message_id": msg.id,
                            "text": text,
                            "date": date,
                            "url": url,
                        }
                    )
                    _persist_message(url, text)

            except Exception as exc:
                # One unreachable/private channel must not abort the others.
                _logger.debug(
                    "Telegram channel %s fetch failed: %s", channel, exc
                )

    except Exception as exc:
        _logger.warning("Telegram client error: %s", exc)

    finally:
        # Always release the connection and the session-file lock.
        if client is not None:
            try:
                await client.disconnect()
            except Exception as exc:
                _logger.debug("Telegram disconnect failed: %s", exc)

    return results