voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/blockchain.py ADDED
@@ -0,0 +1,444 @@
1
+ """
2
+ Blockchain transaction lookup for extracted wallet addresses.
3
+ Queries free APIs to get transaction history and connected addresses.
4
+
5
+ Supports: Bitcoin (BlockCypher), Ethereum (Etherscan)
6
+ Monero: privacy coin, no public lookup possible
7
+ """
8
+
9
+ import asyncio
10
+ import aiohttp
11
+ import logging
12
+ import uuid
13
+ from typing import Optional, List, Dict, Any
14
+ from datetime import datetime, timezone
15
+
16
+ from db.models import Entity, EntityRelationship, RelationshipType
17
+ from db.queries import upsert_entity_canonical
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ BLOCKCYPHER_BASE = "https://api.blockcypher.com/v1/btc/main"
22
+ ETHERSCAN_BASE = "https://api.etherscan.io/api"
23
+
24
+ # Reasonable caps to avoid hammering free APIs
25
+ MAX_TRANSACTIONS_PER_WALLET = 50
26
+ MAX_CONNECTED_ADDRESSES = 10 # How many counterparty addresses to extract
27
+
28
# Entity type constants to match extractor/regex_patterns.py
BITCOIN_ADDRESS = "BITCOIN_ADDRESS"
ETHEREUM_ADDRESS = "ETHEREUM_ADDRESS"
MONERO_ADDRESS = "MONERO_ADDRESS"

# Alphabets used to reject strings that merely share a prefix and a length
# with a real address (e.g. a sentence that happens to start with "1").
_BASE58_CHARS = frozenset(
    "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
)
_BECH32_CHARS = frozenset("qpzry9x8gf2tvdw0s3jn54khce6mua7l")
_HEX_CHARS = frozenset("0123456789abcdefABCDEF")


def detect_wallet_type(address: str) -> Optional[str]:
    """
    Detect cryptocurrency type from address format.

    The check is purely structural: prefix, length and character set.
    No checksum is verified, so a positive result means "looks like an
    address of this kind", not "is a valid address".

    Args:
        address: Candidate address; surrounding whitespace is ignored.

    Returns:
        BITCOIN_ADDRESS, ETHEREUM_ADDRESS, MONERO_ADDRESS, or None when the
        value is not a string or matches none of the known formats.
    """
    if not isinstance(address, str):
        return None
    address = address.strip()

    # Bitcoin legacy (1...) and P2SH (3...): Base58, 25-34 characters.
    if (
        address.startswith(("1", "3"))
        and 25 <= len(address) <= 34
        and _BASE58_CHARS.issuperset(address)
    ):
        return BITCOIN_ADDRESS

    # Bitcoin SegWit (bc1...): bech32 data part, at most 90 characters overall.
    if (
        address.startswith("bc1")
        and 42 <= len(address) <= 90
        and _BECH32_CHARS.issuperset(address[3:])
    ):
        return BITCOIN_ADDRESS

    # Ethereum: 0x prefix followed by exactly 40 hex digits.
    if (
        address.startswith("0x")
        and len(address) == 42
        and _HEX_CHARS.issuperset(address[2:])
    ):
        return ETHEREUM_ADDRESS

    # Monero: standard/integrated addresses start with 4, subaddresses with 8;
    # Base58, 95-106 characters.
    if (
        address.startswith(("4", "8"))
        and 95 <= len(address) <= 106
        and _BASE58_CHARS.issuperset(address)
    ):
        return MONERO_ADDRESS

    return None
57
+
58
+
59
async def lookup_bitcoin_address(
    address: str,
    api_token: str = "",
) -> dict:
    """
    Look up Bitcoin address via BlockCypher API.

    Args:
        address: Bitcoin address to query. It is percent-encoded before being
            placed in the request path.
        api_token: Optional BlockCypher token, sent as the ``token`` query
            parameter when non-empty.

    Returns:
        A dict that always has the keys ``address``, ``wallet_type``,
        ``total_received_btc``, ``total_sent_btc``, ``balance_btc``,
        ``transaction_count``, ``first_seen``, ``last_seen``,
        ``connected_addresses``, ``lookup_successful`` and ``error``.
        ``connected_addresses`` holds one entry per significant transaction
        (``direction``, ``amount``, ``tx_hash``, ``confirmed``). The entries
        carry no counterparty address: only the transaction references of the
        address endpoint are read, never the full transactions.

    Request and parsing errors are caught and reported through ``error``
    (``"timeout"``, ``"rate_limited"``, ``"http_<status>"`` or the truncated
    exception text) instead of being raised.
    """
    from urllib.parse import quote

    satoshis_per_btc = 100_000_000

    result = {
        "address": address,
        "wallet_type": BITCOIN_ADDRESS,
        "total_received_btc": 0.0,
        "total_sent_btc": 0.0,
        "balance_btc": 0.0,
        "transaction_count": 0,
        "first_seen": None,
        "last_seen": None,
        "connected_addresses": [],
        "lookup_successful": False,
        "error": None,
    }

    params = {"limit": MAX_TRANSACTIONS_PER_WALLET}
    if api_token:
        params["token"] = api_token

    try:
        # The address typically originates from scraped text; encode it so a
        # malformed value cannot change the request path or query string.
        encoded_address = quote(address, safe="")
        url = f"{BLOCKCYPHER_BASE}/addrs/{encoded_address}"

        connector = aiohttp.TCPConnector(ssl=True)
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            async with session.get(url, params=params) as resp:
                if resp.status == 429:
                    logger.warning("BlockCypher rate limited for %s...", address[:12])
                    result["error"] = "rate_limited"
                    return result

                if resp.status == 404:
                    # Treated as a valid address with no transactions
                    result["lookup_successful"] = True
                    result["error"] = "no_transactions"
                    return result

                if resp.status != 200:
                    result["error"] = f"http_{resp.status}"
                    return result

                data = await resp.json()

        # Satoshis to BTC conversion. Dividing by the integer factor gives the
        # correctly rounded float; multiplying by 1/1e8 does not always.
        result["total_received_btc"] = (data.get("total_received") or 0) / satoshis_per_btc
        result["total_sent_btc"] = (data.get("total_sent") or 0) / satoshis_per_btc
        result["balance_btc"] = (data.get("final_balance") or 0) / satoshis_per_btc
        result["transaction_count"] = data.get("n_tx", 0)
        result["lookup_successful"] = True

        # "or []" also covers keys that are present but null.
        txrefs = (data.get("txrefs") or []) + (data.get("unconfirmed_txrefs") or [])

        # Confirmation times of confirmed refs give first/last seen.
        times = [tx["confirmed"] for tx in txrefs if tx.get("confirmed")]
        if times:
            result["first_seen"] = min(times)
            result["last_seen"] = max(times)

        # Keyed by tx hash so that a transaction appearing in several refs is
        # reported once (the last ref wins).
        connected = {}
        for tx in txrefs[:MAX_TRANSACTIONS_PER_WALLET]:
            value_btc = (tx.get("value") or 0) / satoshis_per_btc
            tx_hash = tx.get("tx_hash", "")

            # Only significant transactions
            if value_btc > 0.001 and tx_hash:
                # tx_input_n >= 0 means this address was an input (sending)
                is_sender = tx.get("tx_input_n", -1) >= 0
                connected[tx_hash] = {
                    "direction": "sent" if is_sender else "received",
                    "amount": round(value_btc, 8),
                    "tx_hash": tx_hash,
                    "confirmed": tx.get("confirmed", ""),
                }

        result["connected_addresses"] = list(connected.values())[:MAX_CONNECTED_ADDRESSES]

    except asyncio.TimeoutError:
        result["error"] = "timeout"
        logger.warning("BlockCypher timeout for %s...", address[:12])
    except Exception as e:
        result["error"] = str(e)[:100]
        logger.warning("BlockCypher lookup failed for %s...: %s", address[:12], e)

    return result
164
+
165
+
166
async def lookup_ethereum_address(
    address: str,
    api_key: str = "",
) -> dict:
    """
    Look up Ethereum address via Etherscan API.

    Two requests are made: the current balance, then the most recent
    transactions (at most MAX_TRANSACTIONS_PER_WALLET, newest first).

    Args:
        address: Ethereum address to query.
        api_key: Etherscan API key. Without one no request is made and the
            result carries ``error="no_api_key"``.

    Returns:
        A dict that always has the keys ``address``, ``wallet_type``,
        ``balance_eth``, ``transaction_count``, ``first_seen``, ``last_seen``,
        ``connected_addresses``, ``lookup_successful`` and ``error``.
        ``transaction_count`` counts the transactions returned by this
        request, so it is capped by the page size; ``first_seen`` is the
        oldest timestamp within that page.

    Request and parsing errors are caught and reported through ``error``
    instead of being raised.
    """
    result = {
        "address": address,
        "wallet_type": ETHEREUM_ADDRESS,
        "balance_eth": 0.0,
        "transaction_count": 0,
        "first_seen": None,
        "last_seen": None,
        "connected_addresses": [],
        "lookup_successful": False,
        "error": None,
    }

    if not api_key:
        result["error"] = "no_api_key"
        return result

    wei_per_eth = 10**18
    own_address = address.lower()

    try:
        connector = aiohttp.TCPConnector(ssl=True)
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:

            # Get balance
            async with session.get(
                ETHERSCAN_BASE,
                params={
                    "module": "account",
                    "action": "balance",
                    "address": address,
                    "tag": "latest",
                    "apikey": api_key,
                },
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    if data.get("status") == "1":
                        result["balance_eth"] = int(data.get("result", 0)) / wei_per_eth

            # Get transactions
            async with session.get(
                ETHERSCAN_BASE,
                params={
                    "module": "account",
                    "action": "txlist",
                    "address": address,
                    "startblock": 0,
                    "endblock": 99999999,
                    "page": 1,
                    "offset": MAX_TRANSACTIONS_PER_WALLET,
                    "sort": "desc",
                    "apikey": api_key,
                },
            ) as resp:
                if resp.status != 200:
                    result["error"] = f"http_{resp.status}"
                    return result

                data = await resp.json()
                status = data.get("status")
                txs = data.get("result")

                # NOTE(review): Etherscan is expected to answer an address
                # without history with status "0" and this message; that is a
                # successful lookup with zero transactions, not a failure.
                no_history = status == "0" and data.get("message") == "No transactions found"
                if no_history:
                    txs = []

                if not (status == "1" or no_history) or not isinstance(txs, list):
                    result["error"] = "api_error"
                    return result

                result["transaction_count"] = len(txs)
                result["lookup_successful"] = True

                connected = []
                for tx in txs:
                    # "to" may be empty or null (e.g. contract creation).
                    is_incoming = (tx.get("to") or "").lower() == own_address
                    counterparty = tx.get("from") if is_incoming else tx.get("to")
                    value_eth = int(tx.get("value") or 0) / wei_per_eth

                    if counterparty and counterparty.lower() != own_address and value_eth > 0.001:
                        connected.append({
                            "address": counterparty,
                            "direction": "received" if is_incoming else "sent",
                            "amount": round(value_eth, 6),
                            "tx_hash": tx.get("hash", ""),
                            "confirmed": tx.get("timeStamp", ""),
                        })

                result["connected_addresses"] = connected[:MAX_CONNECTED_ADDRESSES]

                if txs:
                    # Sorted desc, so first is newest
                    result["last_seen"] = txs[0].get("timeStamp", "")
                    result["first_seen"] = txs[-1].get("timeStamp", "")

    except asyncio.TimeoutError:
        result["error"] = "timeout"
        logger.warning("Etherscan timeout for %s...", address[:12])
    except Exception as e:
        result["error"] = str(e)[:100]
        logger.warning("Etherscan lookup failed for %s...: %s", address[:12], e)

    return result
264
+
265
+
266
async def lookup_wallet(
    address: str,
    blockcypher_token: str = "",
    etherscan_key: str = "",
) -> dict:
    """
    Classify *address* and delegate to the matching chain-specific lookup.

    Bitcoin addresses go to BlockCypher, Ethereum addresses to Etherscan.
    Monero and unrecognized addresses are answered locally with a failed
    lookup result; no request is made for them.
    """
    kind = detect_wallet_type(address)

    if kind == BITCOIN_ADDRESS:
        return await lookup_bitcoin_address(address, blockcypher_token)

    if kind == ETHEREUM_ADDRESS:
        return await lookup_ethereum_address(address, etherscan_key)

    if kind == MONERO_ADDRESS:
        monero_result = {
            "address": address,
            "wallet_type": MONERO_ADDRESS,
            "lookup_successful": False,
            "error": "monero_privacy_coin",
            "note": "Monero transactions are private and cannot be looked up without view key",
        }
        return monero_result

    unknown_result = {
        "address": address,
        "wallet_type": "unknown",
        "lookup_successful": False,
        "error": "unrecognized_format",
    }
    return unknown_result
295
+
296
+
297
def _apply_wallet_summary(wallet_entity: Any, lookup_result: dict) -> None:
    """Copy the balance summary and first-seen time onto the entity when unset."""
    wallet_type = lookup_result.get("wallet_type", "")
    tx_count = lookup_result.get("transaction_count", 0)

    if wallet_type == BITCOIN_ADDRESS:
        balance = lookup_result.get("balance_btc", 0)
        summary = f"BTC Balance: {balance:.4f} BTC, Transactions: {tx_count}"
    else:
        balance = lookup_result.get("balance_eth", 0)
        summary = f"ETH Balance: {balance:.4f} ETH, Transactions: {tx_count}"

    # Never overwrite context that is already present on the entity.
    if not wallet_entity.historical_context:
        wallet_entity.historical_context = summary

    first_seen_val = lookup_result.get("first_seen")
    if not first_seen_val or wallet_entity.first_seen:
        return

    try:
        if isinstance(first_seen_val, int) or (
            isinstance(first_seen_val, str) and first_seen_val.isdigit()
        ):
            # Unix timestamp (the Etherscan lookup reports "timeStamp" values)
            wallet_entity.first_seen = datetime.fromtimestamp(int(first_seen_val), tz=timezone.utc)
        elif isinstance(first_seen_val, str):
            # BlockCypher ISO format
            wallet_entity.first_seen = datetime.fromisoformat(first_seen_val.replace("Z", "+00:00"))
    except (ValueError, OverflowError, OSError) as exc:
        # Best effort: an unparseable timestamp must not fail the enrichment.
        logger.debug("Ignoring unparseable first_seen %r: %s", first_seen_val, exc)


def _link_connected_addresses(
    session: Any,
    investigation_id: uuid.UUID,
    wallet_entity: Any,
    address: str,
    lookup_result: dict,
    stats: dict,
) -> None:
    """Upsert counterparty entities and add PAID_TO edges that do not exist yet."""
    wallet_type = lookup_result.get("wallet_type", "")

    for conn in lookup_result.get("connected_addresses", []):
        conn_address = conn.get("address")
        # Entries without a counterparty address cannot be linked.
        if not conn_address or conn_address.lower() == address.lower():
            continue

        # Detect type for counterparty; fall back to the wallet's own type
        conn_type = detect_wallet_type(conn_address) or wallet_type

        conn_entity, _ = upsert_entity_canonical(
            session=session,
            investigation_id=investigation_id,
            entity_type=conn_type,
            entity_value=conn_address,
            confidence=0.95,
            context_snippet=f"Related to {address[:12]} via blockchain transaction",
            extraction_method="blockchain_api",
        )
        stats["connected_wallets_found"] += 1

        # The edge always points from payer to payee.
        if conn.get("direction", "sent") == "received":
            source_id, target_id = conn_entity.id, wallet_entity.id
        else:
            source_id, target_id = wallet_entity.id, conn_entity.id

        existing = session.query(EntityRelationship).filter_by(
            entity_a_id=source_id,
            entity_b_id=target_id,
            relationship_type=RelationshipType.PAID_TO.value,
        ).first()
        if existing:
            continue

        rel_kwargs = {
            "entity_a_id": source_id,
            "entity_b_id": target_id,
            "relationship_type": RelationshipType.PAID_TO.value,
            "confidence": 0.95,
        }
        # Pass metadata_json only when the model declares it; handing the
        # keyword over with a None value would defeat the hasattr() guard.
        if hasattr(EntityRelationship, "metadata_json"):
            rel_kwargs["metadata_json"] = {
                "amount": conn.get("amount"),
                "currency": "BTC" if wallet_type == BITCOIN_ADDRESS else "ETH",
                "tx_hash": conn.get("tx_hash"),
            }

        session.add(EntityRelationship(**rel_kwargs))
        stats["edges_created"] += 1


async def enrich_wallets_for_investigation(
    investigation_id: uuid.UUID,
    session: Any,
    blockchain_token_placeholder: None = None,
    blockcypher_token: str = "",
    etherscan_key: str = "",
    max_wallets: int = 10,
) -> dict:
    """
    For all crypto wallet entities in an investigation:
    1. Look up transaction data from blockchain APIs
    2. Store transaction metadata on the entity
    3. Create connected address entities and PAID_TO relationships

    Args:
        investigation_id: Investigation whose wallet entities are enriched.
        session: Database session used for querying, flushing and the final
            commit.
        blockcypher_token: Optional BlockCypher token for Bitcoin lookups.
        etherscan_key: Etherscan API key for Ethereum lookups.
        max_wallets: Maximum number of wallet entities to process.

    Returns:
        Counters: ``wallets_looked_up``, ``successful_lookups``,
        ``edges_created``, ``connected_wallets_found`` and ``errors``.
        Monero wallets are looked up but counted neither as success nor as
        error.
    """
    stats = {
        "wallets_looked_up": 0,
        "successful_lookups": 0,
        "edges_created": 0,
        "connected_wallets_found": 0,
        "errors": 0,
    }

    # Get all wallet entities for this investigation
    wallets = (
        session.query(Entity)
        .filter(
            Entity.investigation_id == investigation_id,
            Entity.entity_type.in_([BITCOIN_ADDRESS, ETHEREUM_ADDRESS, MONERO_ADDRESS]),
            Entity.value.isnot(None),
        )
        .limit(max_wallets)
        .all()
    )

    if not wallets:
        return stats

    logger.warning("Blockchain enrichment: %d wallets to process", len(wallets))

    for wallet_entity in wallets:
        address = wallet_entity.value.strip()
        stats["wallets_looked_up"] += 1

        try:
            lookup_result = await lookup_wallet(
                address=address,
                blockcypher_token=blockcypher_token,
                etherscan_key=etherscan_key,
            )

            if lookup_result.get("lookup_successful"):
                stats["successful_lookups"] += 1
                _apply_wallet_summary(wallet_entity, lookup_result)
                _link_connected_addresses(
                    session, investigation_id, wallet_entity, address, lookup_result, stats
                )
                session.flush()
            elif lookup_result.get("error") != "monero_privacy_coin":
                stats["errors"] += 1

        except Exception as e:
            stats["errors"] += 1
            logger.warning("Wallet enrichment failed for %s: %s", address[:12], e)

        # Respect rate limits. This runs after failed lookups too: backing off
        # matters most right after a rate-limit response.
        await asyncio.sleep(0.4)

    session.commit()
    return stats
sources/cache.py ADDED
@@ -0,0 +1,93 @@
1
+ """
2
+ sources/cache.py — Simple file-based TTL cache for external feeds.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ import time
11
+ from pathlib import Path
12
+ from typing import Any, Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ _memory_cache: dict[str, tuple[Any, float]] = {}
17
+
18
+
19
class CachedFeed:
    """
    Fetch a remote JSON feed and cache it to *cache_path* for *ttl_seconds*.

    Results are also kept in the module-level in-memory cache, keyed by URL.

    On every call to :meth:`fetch`:
    - If a fresh in-memory entry or cache file exists, return cached data.
    - If stale or missing: fetch from *url*, save, and return.
    - If fetch fails but stale data exists (memory first, then the cache
      file): return it with a warning.
    - If fetch fails and no cache exists: log a warning and return None.
    """

    def __init__(self, url: str, cache_path: str, ttl_seconds: int):
        self.url = url
        self.cache_path = Path(cache_path)
        self.ttl_seconds = ttl_seconds

    def _cache_mtime(self) -> Optional[float]:
        """Return the cache file's modification time, or None if unavailable."""
        try:
            return self.cache_path.stat().st_mtime
        except OSError:
            return None

    def _is_fresh(self) -> bool:
        """Return True when the cache file exists and is younger than the TTL."""
        mtime = self._cache_mtime()
        return mtime is not None and (time.time() - mtime) < self.ttl_seconds

    async def fetch(self) -> Optional[dict | list]:
        """Return the feed content, from cache when fresh, else from *url*.

        Returns None only when the download fails and no cached copy exists.
        """
        import aiohttp

        now = time.time()
        cache_key = self.url

        entry = _memory_cache.get(cache_key)
        if entry is not None and (now - entry[1]) < self.ttl_seconds:
            return entry[0]

        mtime = self._cache_mtime()
        if mtime is not None and (now - mtime) < self.ttl_seconds:
            try:
                with self.cache_path.open("r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # Unreadable or corrupt cache file: fall through and re-download.
                logger.warning("CachedFeed: unreadable cache %s: %s", self.cache_path, e)
            else:
                # Stamp with the file's age, not "now"; otherwise data loaded
                # from disk would be served for up to twice the TTL.
                _memory_cache[cache_key] = (data, mtime)
                return data

        try:
            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.url) as resp:
                    if resp.status != 200:
                        logger.warning("CachedFeed: HTTP %s for %s", resp.status, self.url)
                        return self._fallback()
                    data = await resp.json(content_type=None)
        except Exception as e:
            logger.warning("CachedFeed: fetch failed for %s: %s", self.url, e)
            return self._fallback()

        _memory_cache[cache_key] = (data, now)
        # Persisting is best effort: a read-only or full disk must not discard
        # a feed that was downloaded successfully.
        self._write_cache(data)
        return data

    def _fallback(self) -> Optional[dict | list]:
        """Return stale data after a failed download: memory first, then disk."""
        entry = _memory_cache.get(self.url)
        if entry is not None:
            logger.warning("CachedFeed: returning stale memory cache for %s", self.url)
            return entry[0]
        return self._stale_cache()

    def _write_cache(self, data: Any) -> None:
        """Write *data* to the cache file via a temporary file and a rename."""
        tmp_path = self.cache_path.with_name(f"{self.cache_path.name}.{os.getpid()}.tmp")
        try:
            self.cache_path.parent.mkdir(parents=True, exist_ok=True)
            with tmp_path.open("w", encoding="utf-8") as f:
                json.dump(data, f)
            # Rename over the target so readers never see a half-written file.
            os.replace(tmp_path, self.cache_path)
        except (OSError, TypeError, ValueError) as e:
            logger.warning("CachedFeed: could not write cache %s: %s", self.cache_path, e)
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def _stale_cache(self) -> Optional[dict | list]:
        """Return the cache file's content regardless of age, or None."""
        try:
            with self.cache_path.open("r", encoding="utf-8") as f:
                logger.warning("CachedFeed: falling back to stale cache %s", self.cache_path)
                return json.load(f)
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            logger.warning("CachedFeed: unreadable stale cache %s: %s", self.cache_path, e)
            return None
sources/cisa.py ADDED
@@ -0,0 +1,108 @@
1
+ """
2
+ sources/cisa.py — CISA KEV catalog + CISA advisories feed enrichment.
3
+
4
+ Fetches two CISA feeds (clearnet, not through Tor):
5
+ 1. Known Exploited Vulnerabilities (KEV) catalog — 24-hour TTL cache
6
+ 2. Cybersecurity advisories index — 6-hour TTL cache
7
+
8
+ For CVE entities: checks if they appear in the KEV catalog and marks them
9
+ as actively exploited.
10
+ For THREAT_ACTOR / RANSOMWARE_GROUP / MALWARE_FAMILY entities: searches
11
+ advisory titles and tags for name matches.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from typing import Optional
18
+
19
+ from sources.cache import CachedFeed
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ CISA_KEV_URL = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
24
+ CISA_ADVISORIES_URL = "https://www.cisa.gov/cybersecurity-advisories/all.json"
25
+
26
+ _KEV_CACHE = "/tmp/voidaccess_cisa_kev.json"
27
+ _ADVISORIES_CACHE = "/tmp/voidaccess_cisa_advisories.json"
28
+
29
+ _kev_feed = CachedFeed(CISA_KEV_URL, _KEV_CACHE, ttl_seconds=86400)
30
+ _adv_feed = CachedFeed(CISA_ADVISORIES_URL, _ADVISORIES_CACHE, ttl_seconds=21600)
31
+
32
+
33
async def enrich_cisa_cve(cve_id: str) -> list[dict]:
    """
    Check if *cve_id* appears in the CISA KEV catalog.

    The comparison ignores case and surrounding whitespace. An empty or
    missing *cve_id* returns an empty list without fetching the feed; this
    also prevents it from matching catalog entries that lack a ``cveID``.

    Returns a list with one EnrichmentResult dict if found, empty list otherwise.
    """
    wanted = (cve_id or "").strip().upper()
    if not wanted:
        return []

    data = await _kev_feed.fetch()
    if data is None:
        return []

    # The feed is either a bare list or an object holding "vulnerabilities".
    kev_list = data if isinstance(data, list) else data.get("vulnerabilities", [])
    for entry in kev_list or []:
        if not isinstance(entry, dict):
            continue
        if (entry.get("cveID") or "").upper() == wanted:
            return [{
                "source": "cisa_kev",
                "entity_type": "CVE_NUMBER",
                "entity_value": cve_id,
                "is_actively_exploited": True,
                "vendor_project": entry.get("vendorProject", ""),
                "product": entry.get("product", ""),
                "vulnerability_name": entry.get("vulnerabilityName", ""),
                "date_added": entry.get("dateAdded", ""),
                "short_description": entry.get("shortDescription", ""),
            }]
    return []
58
+
59
+
60
async def enrich_cisa_advisories(entity_value: str, entity_type: str) -> list[dict]:
    """
    Search CISA advisories for *entity_value* matching THREAT_ACTOR,
    RANSOMWARE_GROUP, or MALWARE_FAMILY.

    Matching is a case-insensitive substring test against each advisory's
    title and tags. Other entity types and empty values return an empty list
    without fetching the feed; an empty needle would otherwise match every
    advisory.

    Returns one result dict per matching advisory.
    """
    if entity_type not in ("THREAT_ACTOR", "RANSOMWARE_GROUP", "MALWARE_FAMILY"):
        return []

    q = (entity_value or "").strip().lower()
    if not q:
        return []

    data = await _adv_feed.fetch()
    if data is None:
        return []

    # The feed is either a bare list or an object holding "items".
    advisories = data if isinstance(data, list) else data.get("items", [])
    results = []

    for adv in advisories or []:
        if not isinstance(adv, dict):
            continue
        title = (adv.get("title") or "").lower()
        raw_tags = adv.get("tags") or []
        if isinstance(raw_tags, str):
            tags = raw_tags.lower()
        else:
            # str() keeps non-string tag values from breaking the join.
            tags = " ".join(str(tag) for tag in raw_tags).lower()
        if q in title or q in tags:
            results.append({
                "source": "cisa_advisory",
                "entity_type": entity_type,
                "entity_value": entity_value,
                "advisory_title": adv.get("title", ""),
                "advisory_url": adv.get("url", ""),
                "advisory_date": adv.get("datePublished", ""),
            })
    return results
89
+
90
+
91
async def enrich_cisa(query: str, entities: list[dict]) -> list[dict]:
    """
    Run CISA enrichment over a list of extracted entities.

    CVE entities are checked against the KEV catalog; THREAT_ACTOR,
    RANSOMWARE_GROUP and MALWARE_FAMILY entities are searched for in the
    advisories feed. Entities of any other type, or without a value, are
    skipped. *query* is accepted but not used here.

    Each entity may spell its fields as ``type``/``value`` or as
    ``entity_type``/``entity_value``.
    """
    advisory_types = ("THREAT_ACTOR", "RANSOMWARE_GROUP", "MALWARE_FAMILY")
    collected: list[dict] = []

    for entity in entities:
        kind = entity.get("type") or entity.get("entity_type", "")
        value = entity.get("value") or entity.get("entity_value", "")

        if not value:
            continue

        if kind == "CVE_NUMBER":
            matches = await enrich_cisa_cve(value)
            collected.extend(matches)
        elif kind in advisory_types:
            matches = await enrich_cisa_advisories(value, kind)
            collected.extend(matches)

    return collected