voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
sources/virustotal.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sources/virustotal.py — VirusTotal hash enrichment (file hash lookup).
|
|
3
|
+
|
|
4
|
+
Requires VT_API_KEY in config. Free tier: 4 requests/minute.
|
|
5
|
+
Max 20 hashes per investigation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
import aiohttp
|
|
15
|
+
|
|
16
|
+
from config import VT_API_KEY
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
_VT_BASE = "https://www.virustotal.com/api/v3"
|
|
21
|
+
_VT_HASH_LIMIT = 20
|
|
22
|
+
_VT_RATE_LIMIT_DELAY = 15.0
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _is_enabled() -> bool:
    """Return True when VT_API_KEY is set to a non-blank value.

    A key object without a ``strip`` method (for example ``None``) counts as
    "not configured".
    """
    stripper = getattr(VT_API_KEY, "strip", None)
    if stripper is None:
        return False
    return bool(stripper())
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def _fetch_hash(hash_value: str, session: aiohttp.ClientSession) -> Optional[dict]:
    """Fetch the VirusTotal file report for a single hash.

    Args:
        hash_value: The file hash to look up; it is placed in the URL path as-is.
        session: An open aiohttp session used to issue the GET request.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. Every failure
        mode (404, 401, 429, any other status, timeout, any other exception)
        maps to ``None``; 401, 429, timeouts and exceptions are logged as
        warnings, while 404 and other statuses are silent.
    """
    # Only these two statuses get a log line; every other non-200 is silent.
    status_warnings = {
        401: "VirusTotal: invalid API key",
        429: "VirusTotal: rate limited",
    }
    try:
        request = session.get(
            f"{_VT_BASE}/files/{hash_value}",
            headers={"x-apikey": VT_API_KEY.strip()},
            timeout=aiohttp.ClientTimeout(total=15),
        )
        async with request as resp:
            status = resp.status
            if status == 200:
                return await resp.json()
            message = status_warnings.get(status)
            if message is not None:
                logger.warning(message)
            return None
    except asyncio.TimeoutError:
        # Only a prefix of the hash is logged.
        logger.warning("VirusTotal: timeout for hash %s", hash_value[:16])
        return None
    except Exception as e:
        logger.warning("VirusTotal: error for hash %s: %s", hash_value[:16], e)
        return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def enrich_virustotal(entities: list[dict]) -> list[dict]:
    """
    Query VirusTotal for file-hash entities and return detection stats.

    Only entities typed FILE_HASH_MD5 / FILE_HASH_SHA1 / FILE_HASH_SHA256 with
    a non-empty value are considered. The type is read from "type" (falling
    back to "entity_type") and the value from "value" (falling back to
    "entity_value"). At most _VT_HASH_LIMIT distinct hashes are queried, one
    request at a time, with _VT_RATE_LIMIT_DELAY seconds between requests.

    Args:
        entities: Entity dicts to scan for file hashes.

    Returns:
        One result dict per hash for which a report was retrieved. Hashes
        whose lookup failed are omitted. Returns an empty list when no API
        key is configured. The "entity_type" field of each result carries the
        lowercase algorithm name ("md5", "sha1", "sha256").
    """
    if not _is_enabled():
        logger.debug("VirusTotal skipped — no API key configured")
        return []

    hash_type_map = {
        "FILE_HASH_MD5": "md5",
        "FILE_HASH_SHA1": "sha1",
        "FILE_HASH_SHA256": "sha256",
    }

    # Collect distinct hashes in first-seen order so that repeated entities do
    # not burn the per-investigation quota on identical lookups.
    hashes_to_query: dict[str, str] = {}
    for entity in entities:
        hash_type = entity.get("type") or entity.get("entity_type", "")
        hash_val = entity.get("value") or entity.get("entity_value", "")
        if hash_type not in hash_type_map or not hash_val:
            continue
        hashes_to_query.setdefault(hash_val, hash_type)
        if len(hashes_to_query) >= _VT_HASH_LIMIT:
            break

    results: list[dict] = []
    async with aiohttp.ClientSession() as session:
        for index, (hash_val, hash_type) in enumerate(hashes_to_query.items()):
            # Pace requests *between* lookups only: every request (successful
            # or not) is followed by a delay before the next one, but nothing
            # is waited for after the final lookup.
            if index:
                await asyncio.sleep(_VT_RATE_LIMIT_DELAY)

            data = await _fetch_hash(hash_val, session)
            if not isinstance(data, dict):
                continue

            # "or {}" guards against explicit JSON nulls, which .get(key, {})
            # would pass through and crash on.
            attr = (data.get("data") or {}).get("attributes") or {}
            stats = attr.get("last_analysis_stats") or {}
            mal = stats.get("malicious", 0)
            total = sum(stats.values())
            detection_ratio = mal / total if total > 0 else 0.0
            classification = attr.get("popular_threat_classification") or {}

            results.append({
                "source": "virustotal",
                # NOTE(review): this emits "md5"/"sha1"/"sha256", not the
                # FILE_HASH_* entity type; the "FILE_HASH" fallback is never
                # reached because hash_type was filtered against the map.
                "entity_type": hash_type_map.get(hash_type, "FILE_HASH"),
                "entity_value": hash_val,
                "malicious_count": mal,
                "total_engines": total,
                "detection_ratio": detection_ratio,
                "suggested_threat_label": classification.get("suggested_threat_label", ""),
                "first_seen": attr.get("creation_date", ""),
                "last_seen": attr.get("last_analysis_date", ""),
                "confirmed_malicious": detection_ratio > 0.5,
            })

    if results:
        logger.info("VirusTotal: %d results", len(results))
    return results
|
utils/__init__.py
ADDED
|
File without changes
|
utils/async_utils.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Async utilities for safely running coroutines in various contexts.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
11
|
+
from typing import Any, Coroutine, TypeVar
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_T = TypeVar("_T")

# Thread pool shared by every run_async() call that has to leave the current
# thread. Created lazily on first use; _executor_lock guards its creation.
_executor: ThreadPoolExecutor | None = None
_executor_lock = threading.Lock()


def _get_executor() -> ThreadPoolExecutor:
    """Return the shared thread pool, creating it on first use."""
    global _executor
    if _executor is None:
        with _executor_lock:
            # Re-check under the lock: another thread may have created the
            # pool between the unlocked test above and acquiring the lock.
            if _executor is None:
                _executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="async_utils_")
    return _executor


def run_async(coro: Coroutine[Any, Any, _T]) -> _T:
    """
    Run a coroutine to completion from synchronous code and return its result.

    - If an event loop is already running in the calling thread (e.g. inside
      APScheduler jobs or pytest-asyncio), the coroutine is executed on a
      worker thread with its own event loop and the caller blocks until it
      finishes.
    - Otherwise the coroutine is run directly with ``asyncio.run``.

    Note: the worker pool has 4 threads and the caller blocks while waiting,
    so deeply nested or highly concurrent calls made *from* coroutines that
    are themselves running on pool threads can exhaust the pool.

    Args:
        coro: The coroutine to run

    Returns:
        The result of the coroutine

    Raises:
        Whatever the coroutine raises; RuntimeError from ``asyncio.run`` is
        re-raised unless it reports an already-running loop.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop is not None:
        return _run_in_thread(coro)

    try:
        return asyncio.run(coro)
    except RuntimeError as e:
        # Fallback for a loop that reports itself as already running even
        # though get_running_loop() found none.
        if "already running" in str(e).lower():
            return _run_in_thread(coro)
        raise


def _run_in_thread(coro: Coroutine[Any, Any, _T]) -> _T:
    """
    Run a coroutine on a pool thread with its own event loop.

    Blocks the calling thread until the coroutine finishes, then returns its
    result or re-raises its exception. The result is delivered through the
    future returned by ``submit``, which also captures BaseException
    subclasses such as ``asyncio.CancelledError``; the caller therefore never
    waits on a future that nobody resolves.
    """

    def _run() -> _T:
        local_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(local_loop)
        try:
            return local_loop.run_until_complete(coro)
        finally:
            # Pool threads are reused: do not leave a closed loop installed
            # as this thread's current event loop.
            asyncio.set_event_loop(None)
            local_loop.close()

    return _get_executor().submit(_run).result()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def run_async_optional(coro: Coroutine[Any, Any, _T] | None) -> _T | None:
    """Return None when no coroutine is given; otherwise run it via run_async."""
    return None if coro is None else run_async(coro)
|
utils/content_safety.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""
|
|
2
|
+
utils/content_safety.py — Mandatory content safety filters for VoidAccess.
|
|
3
|
+
|
|
4
|
+
Operates at multiple layers: query intake, URL filtering, content scanning,
|
|
5
|
+
and post-extraction entity value filtering.
|
|
6
|
+
Never logs actual prohibited content — only event metadata.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
_logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Blocklists
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
BLOCKED_TERMS = [
|
|
23
|
+
# CSAM related
|
|
24
|
+
"child porn", "cp porn", "lolita", "pedo",
|
|
25
|
+
"pedophil", "childporn", "child sex", "minor sex",
|
|
26
|
+
"underage sex", "jailbait", "preteen sex",
|
|
27
|
+
"child abuse material", "csam", "child model",
|
|
28
|
+
"hurtcore", "daisy's destruction",
|
|
29
|
+
# Gore related
|
|
30
|
+
"gore site", "gore video", "snuff film",
|
|
31
|
+
"murder video", "execution video", "beheading video",
|
|
32
|
+
"torture video", "bestgore", "livegore",
|
|
33
|
+
"watchpeopledie", "realsnuff",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
BLOCKED_PATTERNS = [
|
|
37
|
+
r'\bcp\b.{0,20}\bonion\b', # "cp" near "onion"
|
|
38
|
+
r'\bchild.{0,10}\bnaked\b',
|
|
39
|
+
r'\bminor.{0,10}\bnaked\b',
|
|
40
|
+
r'\bkid.{0,10}\bporn\b',
|
|
41
|
+
r'\bteen.{0,10}\bporn\b',
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
BLOCKED_URL_TERMS = [
|
|
45
|
+
"pedo", "loli", "jailbait", "childporn",
|
|
46
|
+
"hurtcore", "csam", "bestgore", "livegore",
|
|
47
|
+
"watchpeople", "realsnuff", "daisy",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
CONTENT_BLOCKLIST = [
|
|
51
|
+
"child pornography", "child porn",
|
|
52
|
+
"child sexual abuse", "csam",
|
|
53
|
+
"snuff film", "murder porn",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Entity value blocklist — applied after extraction, before DB storage
|
|
58
|
+
# Only checked against text-based entity types (not technical IOCs)
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
ENTITY_VALUE_BLOCKLIST: list[str] = [
|
|
62
|
+
# Adult content categories
|
|
63
|
+
"porn", "blowjob", "bdsm", "hardcore",
|
|
64
|
+
"xxx", "nude", "nudes", "naked", "escort",
|
|
65
|
+
"onlyfans", "cam girl", "sex tape",
|
|
66
|
+
"adult content", "adult site",
|
|
67
|
+
# Gore/violence
|
|
68
|
+
"snuff", "gore", "murder video",
|
|
69
|
+
"execution video", "beheading",
|
|
70
|
+
# Exploitation
|
|
71
|
+
"jailbait", "pedo", "csam",
|
|
72
|
+
"child", "minor",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
# Entity types where prohibited content can appear as names/labels.
|
|
76
|
+
# Technical IOC types (hashes, IPs, CVEs, wallets, onion URLs) are
|
|
77
|
+
# intentionally omitted — they cannot contain prohibited content.
|
|
78
|
+
_TEXT_ENTITY_TYPES: frozenset[str] = frozenset({
|
|
79
|
+
"ORGANIZATION_NAME",
|
|
80
|
+
"THREAT_ACTOR_HANDLE",
|
|
81
|
+
"PERSON_NAME",
|
|
82
|
+
"MALWARE_FAMILY",
|
|
83
|
+
})
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Public API
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def is_blocked_query(query: str) -> tuple[bool, str]:
    """
    Decide whether a search query must be rejected.

    The lower-cased query is tested first for any BLOCKED_TERMS substring and
    then against each BLOCKED_PATTERNS regular expression.

    Returns (is_blocked, reason); reason is empty when the query is allowed.
    The query itself is never logged.
    """
    lowered = query.lower()
    hit = any(term in lowered for term in BLOCKED_TERMS) or any(
        re.search(pattern, lowered, re.IGNORECASE) for pattern in BLOCKED_PATTERNS
    )
    if hit:
        return True, "Query contains prohibited content"
    return False, ""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def is_blocked_entity_value(entity_type: str, value: str) -> bool:
    """
    Tell whether an extracted entity value must be discarded before storage.

    The check applies only to the entity types listed in _TEXT_ENTITY_TYPES
    (ORGANIZATION_NAME, THREAT_ACTOR_HANDLE, PERSON_NAME, MALWARE_FAMILY).
    Any other type, including technical IOC types such as FILE_HASH_*,
    IP_ADDRESS, CVE, ONION_URL or wallet addresses, is always allowed.

    Matching is a case-insensitive substring test against
    ENTITY_VALUE_BLOCKLIST; a missing value is treated as an empty string.
    The value itself is never logged.
    """
    if entity_type not in _TEXT_ENTITY_TYPES:
        return False

    lowered = (value or "").lower()
    return any(term in lowered for term in ENTITY_VALUE_BLOCKLIST)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def is_blocked_url(url: str) -> tuple[bool, str]:
    """
    Decide whether a URL must be excluded from scraping.

    The lower-cased URL is blocked when it contains any BLOCKED_URL_TERMS
    substring. Returns (is_blocked, reason); reason is empty when allowed.
    """
    lowered = url.lower()
    if any(term in lowered for term in BLOCKED_URL_TERMS):
        return True, "URL blocked — prohibited content"
    return False, ""
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def sanitize_content(text: str) -> tuple[str, bool]:
    """
    Screen scraped text against CONTENT_BLOCKLIST.

    Returns (sanitized_text, was_flagged). When any blocklist term occurs in
    the lower-cased text the result is ("", True), so the original text is
    not passed on. Otherwise the input is returned unchanged with False;
    empty or missing input is returned as-is.
    """
    if text:
        lowered = text.lower()
        if any(term in lowered for term in CONTENT_BLOCKLIST):
            return "", True
    return text, False
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def log_content_safety_event(
    event_type: str,
    content_hash: Optional[str] = None,
    user_id: Optional[int] = None,
) -> None:
    """
    Record a content-safety block event in the database for operator review.

    Does nothing when the DATABASE_URL environment variable is unset. Any
    failure (import, session, commit) is caught and reported at debug level
    only, so the calling pipeline is never interrupted.

    event_type: one of "query_blocked", "url_blocked", "content_blocked"
    content_hash: SHA-256 hex prefix (≤16 chars) of the blocked item, for correlation only.
    user_id: stored on the event unchanged; may be None.
    """
    try:
        import os

        if os.getenv("DATABASE_URL"):
            # Imported lazily so that a missing or misconfigured DB layer is
            # handled by the except clause below.
            from db.session import get_session
            from db.models import ContentSafetyEvent
            from datetime import datetime, timezone

            with get_session() as db_session:
                db_session.add(
                    ContentSafetyEvent(
                        event_type=event_type,
                        user_id=user_id,
                        content_hash=content_hash,
                        timestamp=datetime.now(timezone.utc),
                    )
                )
                db_session.commit()
    except Exception as exc:
        _logger.debug("content_safety: DB log failed (non-critical): %s", exc)
|
utils/defang.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def defang_url(url: str) -> str:
    """
    Defang a URL for safe sharing in reports.

    http://example.com/path -> hxxp://example[.]com/path

    The scheme is neutralised (http -> hxxp, https -> hxxps, ftp -> fxp) and
    every dot in the host part is replaced with "[.]". The host is whatever
    follows an optional leading "scheme://" (or "//") up to the first "/";
    the path after it is left untouched. Bare hosts and schemeless URLs such
    as "example.onion/forum/index.php" are handled the same way.

    Empty or missing input is returned unchanged.
    """
    if not url:
        return url
    url = url.replace("http://", "hxxp://")
    url = url.replace("https://", "hxxps://")
    url = url.replace("ftp://", "fxp://")

    # Locate the host explicitly rather than by counting "/" separators:
    # a schemeless URL with several path segments would otherwise have a
    # path segment defanged while its host stayed live.
    prefix_match = re.match(r"(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//", url)
    prefix = prefix_match.group() if prefix_match else ""
    host, slash, tail = url[len(prefix):].partition("/")
    return prefix + host.replace(".", "[.]") + slash + tail
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def defang_ip(ip: str) -> str:
    """
    Defang an IP address by bracketing its last dot.

    1.2.3.4 -> 1.2.3[.]4

    Input without any dot, and empty or missing input, is returned unchanged.
    """
    if not ip:
        return ip
    head, dot, tail = ip.rpartition(".")
    return f"{head}[.]{tail}" if dot else ip
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def defang_email(email: str) -> str:
    """
    Defang an email address.

    user@example.com -> user[@]example[.]com

    Every "@" becomes "[@]" and every dot after the first "@" becomes "[.]";
    dots in the local part are kept. Input without "@", and empty or missing
    input, is returned unchanged.
    """
    if not email:
        return email
    local_part, at_sign, remainder = email.partition("@")
    if not at_sign:
        return email
    remainder = remainder.replace("@", "[@]").replace(".", "[.]")
    return f"{local_part}[@]{remainder}"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def defang_value(entity_type: str, value: str) -> str:
    """
    Return the display-safe form of an entity value, chosen by entity type.

    ONION_URL and DOMAIN values go through defang_url, IP_ADDRESS through
    defang_ip and EMAIL_ADDRESS through defang_email. Values of any other
    type are returned unchanged.
    """
    if entity_type == "ONION_URL" or entity_type == "DOMAIN":
        return defang_url(value)
    if entity_type == "IP_ADDRESS":
        return defang_ip(value)
    if entity_type == "EMAIL_ADDRESS":
        return defang_email(value)
    return value
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def defang_text(text: str) -> str:
    """
    Neutralise URL schemes and IPv4-looking addresses inside free text.

    "http://" and "https://" become "hxxp://" and "hxxps://"; host names are
    not altered. Any dotted quad of 1-3 digit groups gets its last dot
    bracketed (1.2.3.4 -> 1.2.3[.]4). Intended for report summaries and
    context snippets. Empty or missing input is returned unchanged.
    """
    if not text:
        return text

    def _neutralise_scheme(match):
        # The match is "http://" or "https://": swap the leading "http".
        return "hxxp" + match.group()[4:]

    without_schemes = re.sub(r'https?://', _neutralise_scheme, text)

    return re.sub(
        r'\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})'
        r'\.(\d{1,3})\b',
        r'\1.\2.\3[.]\4',
        without_schemes,
    )
|
utils/encryption.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Encryption utilities for user API keys.
|
|
3
|
+
|
|
4
|
+
Uses Fernet (AES-128-CBC) with a key derived from JWT_SECRET so that
|
|
5
|
+
no new secret needs to be distributed — only the existing JWT_SECRET
|
|
6
|
+
is required.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import hashlib
|
|
11
|
+
|
|
12
|
+
from cryptography.fernet import Fernet
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _get_fernet() -> Fernet:
    """Build a Fernet cipher keyed by the SHA-256 digest of JWT_SECRET.

    The 32-byte digest is url-safe base64 encoded to form the Fernet key.
    Because the key is derived from JWT_SECRET, data encrypted under one
    JWT_SECRET cannot be decrypted after that value changes.
    """
    # Imported here rather than at module level, as in the original.
    from config import JWT_SECRET

    digest = hashlib.sha256(JWT_SECRET.encode()).digest()
    return Fernet(base64.urlsafe_b64encode(digest))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def encrypt_api_key(plaintext: str) -> str:
    """Encrypt an API key and return the Fernet token as text.

    Empty or missing input yields an empty string without touching the cipher.
    """
    if plaintext:
        token = _get_fernet().encrypt(plaintext.encode())
        return token.decode()
    return ""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def decrypt_api_key(ciphertext: str) -> str:
    """Decrypt a Fernet token produced by encrypt_api_key.

    Returns an empty string for empty or missing input, and also whenever
    decryption fails for any reason; failures are not reported to the caller.
    """
    if ciphertext:
        try:
            recovered = _get_fernet().decrypt(ciphertext.encode())
            return recovered.decode()
        except Exception:
            # Deliberate best-effort: any failure is reported as "no key".
            pass
    return ""
|
utils/ioc_freshness.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from datetime import datetime, timedelta, timezone
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FreshnessTag(str, Enum):
    """Freshness classification of an entity based on its last-seen age.

    Members are also strings (the class mixes in ``str``), so each member
    compares equal to its lowercase value.
    """

    # Age is within the entity type's "fresh" threshold.
    FRESH = "fresh"
    # Age is past "fresh" but within the "aging" threshold.
    AGING = "aging"
    # Age is past "aging" but within the "stale" threshold.
    STALE = "stale"
    # Age exceeds the "stale" threshold.
    EXPIRED = "expired"
    # No last-seen timestamp was available.
    UNKNOWN = "unknown"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Per-entity-type age limits, in days since last seen. An entity is "fresh"
# up to the "fresh" limit, "aging" up to the "aging" limit, "stale" up to the
# "stale" limit and expired beyond that. Types not listed use "DEFAULT".
FRESHNESS_THRESHOLDS = {
    "IP_ADDRESS": {
        "fresh": 14,
        "aging": 30,
        "stale": 90,
    },
    "DOMAIN": {
        "fresh": 30,
        "aging": 90,
        "stale": 180,
    },
    "ONION_URL": {
        "fresh": 60,
        "aging": 180,
        "stale": 365,
    },
    "FILE_HASH_MD5": {
        "fresh": 365,
        "aging": 730,
        "stale": 1825,
    },
    # SHA-1 hashes are a file-hash entity type like MD5 and SHA-256 and get
    # the same limits instead of silently falling back to DEFAULT.
    "FILE_HASH_SHA1": {
        "fresh": 365,
        "aging": 730,
        "stale": 1825,
    },
    "FILE_HASH_SHA256": {
        "fresh": 365,
        "aging": 730,
        "stale": 1825,
    },
    "CVE": {
        "fresh": 365,
        "aging": 730,
        "stale": 1825,
    },
    "BITCOIN_ADDRESS": {
        "fresh": 90,
        "aging": 180,
        "stale": 365,
    },
    # NOTE(review): other modules use the entity type "THREAT_ACTOR_HANDLE";
    # confirm whether "THREAT_ACTOR" is ever passed here.
    "THREAT_ACTOR": {
        "fresh": 90,
        "aging": 365,
        "stale": 730,
    },
    "DEFAULT": {
        "fresh": 30,
        "aging": 90,
        "stale": 180,
    },
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_freshness_tag(
    entity_type: str,
    last_seen_at: datetime | None,
    first_seen_at: datetime | None = None,
) -> FreshnessTag:
    """
    Classify how recently an entity was observed.

    The age in whole days since ``last_seen_at`` is compared with the limits
    in FRESHNESS_THRESHOLDS for ``entity_type`` (or the "DEFAULT" limits when
    the type is not listed). A naive ``last_seen_at`` is treated as UTC.

    Returns FreshnessTag.UNKNOWN when ``last_seen_at`` is missing.
    ``first_seen_at`` is accepted but not used.
    """
    if not last_seen_at:
        return FreshnessTag.UNKNOWN

    limits = FRESHNESS_THRESHOLDS.get(
        entity_type,
        FRESHNESS_THRESHOLDS["DEFAULT"],
    )

    # Aware and naive datetimes cannot be subtracted, so naive input is
    # stamped as UTC first.
    seen = last_seen_at
    if seen.tzinfo is None:
        seen = seen.replace(tzinfo=timezone.utc)
    age_days = (datetime.now(timezone.utc) - seen).days

    # Bands are checked from youngest to oldest; the first limit that the
    # age does not exceed decides the tag.
    bands = (
        ("fresh", FreshnessTag.FRESH),
        ("aging", FreshnessTag.AGING),
        ("stale", FreshnessTag.STALE),
    )
    for limit_key, tag in bands:
        if age_days <= limits[limit_key]:
            return tag
    return FreshnessTag.EXPIRED
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def get_freshness_display(tag: FreshnessTag) -> dict:
    """
    Get display config for a freshness tag.

    Returns a dict with "label", "color" and "description" keys. A tag that
    is not a known FreshnessTag gets the same entry as FreshnessTag.UNKNOWN,
    so every result has the same three keys.
    """
    display = {
        FreshnessTag.FRESH: {
            "label": "Fresh",
            "color": "green",
            "description": "Recently observed",
        },
        FreshnessTag.AGING: {
            "label": "Aging",
            "color": "yellow",
            "description": "Observed 1-3 months ago",
        },
        FreshnessTag.STALE: {
            "label": "Stale",
            "color": "orange",
            "description": "Observed 3-6 months ago — verify before use",
        },
        FreshnessTag.EXPIRED: {
            "label": "Expired",
            "color": "red",
            "description": "Observed over 6 months ago — likely inactive",
        },
        FreshnessTag.UNKNOWN: {
            "label": "Unknown",
            "color": "gray",
            "description": "No date information available",
        },
    }
    # Fall back to the full UNKNOWN entry (same label and color as before)
    # so callers reading "description" never hit a missing key.
    return display.get(tag, display[FreshnessTag.UNKNOWN])
|
utils/user_keys.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Per-user API key resolution with fallback chain.
|
|
3
|
+
|
|
4
|
+
resolve_api_key checks the user's personal key first, then falls back to
|
|
5
|
+
the server-level environment variable in config.py.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from sqlalchemy import select as sa_select
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
|
|
11
|
+
from db.models import UserApiKey
|
|
12
|
+
from utils.encryption import decrypt_api_key
|
|
13
|
+
import config as _config
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def get_user_key(user_id: int, key_name: str, session: AsyncSession) -> str | None:
    """Return the user's stored key named ``key_name``, decrypted.

    Looks up the UserApiKey row matching both ``user_id`` and ``key_name``.
    Returns None when no such row exists; otherwise returns the result of
    decrypt_api_key on the row's ``encrypted_value``.
    """
    query = sa_select(UserApiKey).where(
        UserApiKey.user_id == user_id,
        UserApiKey.key_name == key_name,
    )
    row = (await session.execute(query)).scalar_one_or_none()
    return decrypt_api_key(row.encrypted_value) if row else None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def resolve_api_key(user_id: int, key_name: str, session: AsyncSession) -> str:
    """Resolve an API key: the user's personal key first, then server config.

    Falls back to the attribute named ``key_name`` on the config module when
    the user has no usable personal key, and to an empty string when that
    attribute is missing or falsy.
    """
    personal_key = await get_user_key(user_id, key_name, session)
    # NOTE(review): key_name is looked up on the config module as-is, so any
    # config attribute can be read this way; confirm that callers only pass
    # trusted, fixed key names.
    return personal_key or getattr(_config, key_name, "") or ""
|