voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
analysis/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analysis — Temporal, behavioral pattern, and OPSEC failure analysis.
|
|
3
|
+
|
|
4
|
+
Public interface
|
|
5
|
+
---------------
|
|
6
|
+
from analysis.temporal import build_activity_timeline, compute_activity_stats
|
|
7
|
+
from analysis.temporal import detect_anomalies, detect_silence_breaks
|
|
8
|
+
from analysis.patterns import check_exit_scam_pattern, check_law_enforcement_pattern
|
|
9
|
+
from analysis.patterns import check_new_actor_pattern, run_all_patterns
|
|
10
|
+
from analysis.opsec import detect_timezone_leak, detect_language_switch
|
|
11
|
+
from analysis.opsec import detect_clearnet_slip, detect_pgp_reuse
|
|
12
|
+
from analysis.opsec import run_full_opsec_analysis
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from analysis.opsec import (
|
|
16
|
+
detect_clearnet_slip,
|
|
17
|
+
detect_language_switch,
|
|
18
|
+
detect_pgp_reuse,
|
|
19
|
+
detect_timezone_leak,
|
|
20
|
+
run_full_opsec_analysis,
|
|
21
|
+
)
|
|
22
|
+
from analysis.patterns import (
|
|
23
|
+
check_exit_scam_pattern,
|
|
24
|
+
check_law_enforcement_pattern,
|
|
25
|
+
check_new_actor_pattern,
|
|
26
|
+
run_all_patterns,
|
|
27
|
+
)
|
|
28
|
+
from analysis.temporal import (
|
|
29
|
+
build_activity_timeline,
|
|
30
|
+
compute_activity_stats,
|
|
31
|
+
detect_anomalies,
|
|
32
|
+
detect_silence_breaks,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Names re-exported as the package's public interface; grouped below by the
# submodule each one is imported from.
__all__ = [
    # analysis.temporal
    "build_activity_timeline",
    "compute_activity_stats",
    "detect_anomalies",
    "detect_silence_breaks",
    # analysis.patterns
    "check_exit_scam_pattern",
    "check_law_enforcement_pattern",
    "check_new_actor_pattern",
    "run_all_patterns",
    # analysis.opsec
    "detect_timezone_leak",
    "detect_language_switch",
    "detect_clearnet_slip",
    "detect_pgp_reuse",
    "run_full_opsec_analysis",
]
|
analysis/opsec.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analysis/opsec.py — Detects operational security failures in threat actor
|
|
3
|
+
communications that inadvertently reveal real-world identity or location.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Known clearnet URL regex — captures domain from http(s) URLs.
# Group 1 is the hostname (it must end in a dot plus at least two letters);
# an optional path/query/fragment may follow. A ":port" suffix is not part of
# the pattern, so a match stops right after the hostname in that case.
_HTTP_URL_RE = re.compile(
    r"https?://([a-zA-Z0-9][-a-zA-Z0-9.]*\.[a-zA-Z]{2,})(?:[/?#][^\s]*)?",
    re.IGNORECASE,
)

# Structured data patterns to strip before language detection
# Strings starting with "1" or "3" followed by 25-34 characters from an
# alphabet that omits 0, O, I and l (other address formats are not matched).
_BITCOIN_RE = re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b")
# "0x" followed by exactly 40 hexadecimal digits.
_ETH_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
# CVE identifiers: "CVE-", a 4-digit year, then a 4-7 digit sequence number.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
# Any http(s) URL up to the next whitespace character.
_URL_RE = re.compile(r"https?://\S+")
# Bare .onion hostnames with exactly 56 characters drawn from a-z / 2-7.
_ONION_RE = re.compile(r"\b[a-z2-7]{56}\.onion\b", re.IGNORECASE)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _strip_non_linguistic(text: str) -> str:
    """Replace URLs, wallet addresses, CVE IDs and .onion hosts with a space.

    Used to clean a post before language detection so that structured tokens
    are not handed to the detector.
    """
    # Applied in a fixed order: URLs first, then Bitcoin, Ethereum, CVE, onion.
    for pattern in (_URL_RE, _BITCOIN_RE, _ETH_RE, _CVE_RE, _ONION_RE):
        text = pattern.sub(" ", text)
    return text
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def detect_timezone_leak(texts_with_timestamps: list[dict]) -> dict:
    """
    Analyze posting time distribution to infer actor timezone.

    Input: list of {"text": str, "timestamp": datetime}. Entries whose
    timestamp is missing or None are skipped. Timezone-aware timestamps are
    converted to UTC; naive timestamps are used as-is (assumed to already be
    UTC — TODO confirm against callers).

    If 80%+ of posts fall within a 6-hour window: infer timezone. The centre
    of that window is assumed to correspond to ~13:00 local time.

    Returns:
        detected: bool
        probable_timezone_offset: str | None  (e.g. "UTC+3")
        confidence: float  (share of posts inside the best window, 3 decimals)
        posting_hours: list[int]  (UTC hour of every usable post)
        peak_window: str | None  (e.g. "09:00-15:00 UTC")
    """
    # Function-scope import: the module's top-level imports do not pull in
    # anything from ``datetime``.
    from datetime import timezone

    def _no_leak(confidence: float, hours: list[int]) -> dict:
        # Shared shape for every "nothing inferred" outcome.
        return {
            "detected": False,
            "probable_timezone_offset": None,
            "confidence": confidence,
            "posting_hours": hours,
            "peak_window": None,
        }

    if not texts_with_timestamps:
        return _no_leak(0.0, [])

    posting_hours: list[int] = []
    for entry in texts_with_timestamps:
        ts = entry.get("timestamp")
        if ts is None:
            continue
        if hasattr(ts, "utcoffset") and ts.utcoffset() is not None:
            # Normalize aware timestamps to UTC. ``astimezone(tz=None)`` would
            # convert to the *host's* local zone and make the result depend on
            # where the analysis happens to run.
            ts = ts.astimezone(timezone.utc)
        posting_hours.append(ts.hour)

    if not posting_hours:
        return _no_leak(0.0, [])

    total = len(posting_hours)
    per_hour = Counter(posting_hours)

    # Slide a 6-hour window (wrapping past midnight) over the day and keep the
    # earliest start hour with the highest post count.
    best_start = 0
    best_count = 0
    for start in range(24):
        count = sum(per_hour[(start + step) % 24] for step in range(6))
        if count > best_count:
            best_count = count
            best_start = start

    coverage = best_count / total
    if coverage < 0.80:
        return _no_leak(round(coverage, 3), posting_hours)

    end_hour = (best_start + 6) % 24
    peak_window = f"{best_start:02d}:00-{end_hour:02d}:00 UTC"

    # Infer timezone: assume actor is active during 09:00-17:00 local, so the
    # window centre maps to ~13:00 local time.
    window_center_utc = (best_start + 3) % 24
    offset_raw = 13 - window_center_utc
    # Fold the offset into the [-12, +12] range.
    if offset_raw > 12:
        offset_raw -= 24
    elif offset_raw < -12:
        offset_raw += 24

    return {
        "detected": True,
        "probable_timezone_offset": f"UTC{offset_raw:+d}",
        "confidence": round(coverage, 3),
        "posting_hours": posting_hours,
        "peak_window": peak_window,
    }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def detect_language_switch(texts: list[str]) -> dict:
    """
    Detect if an actor switches between languages across posts.

    Each text is cleaned with ``_strip_non_linguistic`` and skipped when it is
    empty or shorter than 50 characters after cleaning. The most frequent
    detected language is treated as primary; every text detected as a
    different language counts as a switch.

    Every return path — including the one taken when the optional
    ``langdetect`` package is not installed — yields the same set of keys.

    Returns:
        detected: bool
        languages_found: list[str]  (ISO 639-1 codes)
        primary_language: str | None
        switch_count: int
        switched_texts_indices: list[int]
    """
    # Built fresh per call; returned from whichever "nothing found" path hits.
    nothing_found = {
        "detected": False,
        "languages_found": [],
        "primary_language": None,
        "switch_count": 0,
        "switched_texts_indices": [],
    }

    try:
        from langdetect import detect as ld_detect
    except ImportError:
        # Optional dependency missing: report "no switch" with the full
        # documented shape so callers can index the result safely.
        return nothing_found

    if not texts:
        return nothing_found

    detected_langs: list[Optional[str]] = []
    for text in texts:
        if not text:
            detected_langs.append(None)
            continue
        clean_text = _strip_non_linguistic(text)
        if len(clean_text) < 50:
            # Too little text for a trustworthy guess.
            detected_langs.append(None)
            continue
        try:
            detected_langs.append(ld_detect(clean_text))
        except Exception:
            # Deliberately best-effort: a detector failure on one post must
            # not abort the analysis of the others.
            detected_langs.append(None)

    counter = Counter(lang for lang in detected_langs if lang is not None)
    if not counter:
        return nothing_found

    primary_lang, _ = counter.most_common(1)[0]
    switched_indices = [
        i
        for i, lang in enumerate(detected_langs)
        if lang is not None and lang != primary_lang
    ]

    return {
        "detected": bool(switched_indices),
        "languages_found": list(counter),
        "primary_language": primary_lang,
        "switch_count": len(switched_indices),
        "switched_texts_indices": switched_indices,
    }
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def detect_clearnet_slip(texts: list[str]) -> dict:
    """
    Collect http(s) URLs whose host is not a .onion address.

    Falsy texts are skipped. For every clearnet URL the last two labels of
    its hostname are recorded as the "platform".

    Returns:
        detected: bool
        clearnet_urls: list[str]  (matched URL text, in order of appearance)
        platforms: list[str]  (sorted, de-duplicated, e.g. ["reddit.com"])
    """
    found_urls: list[str] = []
    base_domains: set[str] = set()

    for body in texts:
        if not body:
            continue
        for hit in _HTTP_URL_RE.finditer(body):
            host = hit.group(1).lower()
            if host.endswith(".onion"):
                continue
            found_urls.append(hit.group(0))
            # Base domain = last two dot-separated labels of the hostname.
            labels = host.rstrip(".").split(".")
            base_domains.add(".".join(labels[-2:]) if len(labels) >= 2 else host)

    return {
        "detected": bool(found_urls),
        "clearnet_urls": found_urls,
        "platforms": sorted(base_domains),
    }
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def detect_pgp_reuse(
    pgp_fingerprints: list[str],
    sources: Optional[list[str]] = None,
) -> dict:
    """
    Check if the same PGP fingerprint appears across multiple source domains,
    or multiple times in the fingerprint list.

    When *sources* is provided with the same length as *pgp_fingerprints*,
    reuse is detected if the same fingerprint maps to more than one source.
    Fingerprints and sources are paired by position *before* blank
    fingerprints are dropped, so an empty entry never shifts the pairing of
    the entries after it.

    When *sources* is omitted or length mismatches, reuse is detected when
    any fingerprint appears more than once in *pgp_fingerprints*.

    Fingerprints are compared after stripping surrounding whitespace; falsy
    or whitespace-only entries are ignored.

    Returns:
        detected: bool
        reused_fingerprints: list[str]
        cross_platform_exposure: list[dict]  ({"fingerprint", "sources"})
        forum_count: int  (with sources: largest number of distinct sources
            seen for one fingerprint; without: 2 if reuse was found, else 0)
        fingerprint: str | None  (first reused fingerprint, if any)
    """

    def _clean(raw: object) -> str:
        # Normalize one fingerprint; "" means "ignore this entry".
        return str(raw).strip() if raw else ""

    if not pgp_fingerprints:
        return {
            "detected": False,
            "reused_fingerprints": [],
            "cross_platform_exposure": [],
            "forum_count": 0,
            "fingerprint": None,
        }

    if sources is not None and len(sources) == len(pgp_fingerprints):
        fp_to_sources: dict[str, set[str]] = {}
        # Zip the raw lists so each fingerprint keeps its own source even
        # when earlier entries are blank and get skipped.
        for raw_fp, src in zip(pgp_fingerprints, sources):
            fp = _clean(raw_fp)
            if not fp:
                continue
            fp_to_sources.setdefault(fp, set()).add(src or "")

        reused: list[str] = []
        cross_platform: list[dict] = []
        for fp, srcs in fp_to_sources.items():
            if len(srcs) > 1:
                reused.append(fp)
                cross_platform.append({"fingerprint": fp, "sources": sorted(srcs)})

        return {
            "detected": bool(reused),
            "reused_fingerprints": reused,
            "cross_platform_exposure": cross_platform,
            "forum_count": max((len(s) for s in fp_to_sources.values()), default=0),
            "fingerprint": reused[0] if reused else None,
        }

    # No usable source information: fall back to plain duplicate detection.
    occurrences = Counter(fp for fp in map(_clean, pgp_fingerprints) if fp)
    dupes = [fp for fp, n in occurrences.items() if n > 1]
    return {
        "detected": bool(dupes),
        "reused_fingerprints": dupes,
        "cross_platform_exposure": [],
        # A duplicate implies at least two appearances; with no duplicate
        # there is nothing to count (matches the empty-input result).
        "forum_count": 2 if dupes else 0,
        "fingerprint": dupes[0] if dupes else None,
    }
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def run_full_opsec_analysis(
    handle: str,
    texts_with_timestamps: list[dict],
    pgp_fingerprints: Optional[list[str]] = None,
    pgp_sources: Optional[list[str]] = None,
) -> dict:
    """
    Run every OPSEC check for one actor and merge the results.

    The timezone, language-switch and clearnet checks always run; the PGP
    reuse check runs only when more than one fingerprint is supplied.
    ``opsec_score`` starts at 100 (best) and loses 25 / 15 / 15 / 20 points
    for a timezone leak, language switch, clearnet slip and PGP reuse
    respectively; ``risk_level`` is the band that score falls into.

    Returns a dict with: handle, timezone_leak, language_switch,
    clearnet_slips, pgp_reuse (raw per-check results), findings (list of
    finding dicts), opsec_score, risk_level, and the legacy 0-1 risk_score
    (higher = worse).
    """
    texts = [entry.get("text", "") for entry in texts_with_timestamps]

    tz_result = detect_timezone_leak(texts_with_timestamps)
    lang_result = detect_language_switch(texts)
    clearnet_result = detect_clearnet_slip(texts)

    findings: list[dict] = []
    score = 100

    if tz_result.get("detected"):
        # Enrich the timezone result and temper its confidence when the
        # sample is small or the content is English.
        primary_language = lang_result.get("primary_language", "unknown")
        data_points = len(texts_with_timestamps)
        raw_conf = float(tz_result.get("confidence", 0.5))
        tz_result["data_points"] = data_points
        tz_result["primary_language_correlation"] = primary_language

        too_few_points = data_points < 20
        if too_few_points or primary_language == "en":
            tz_result["confidence"] = round(raw_conf * 0.5, 3)
            tz_result["confidence_level"] = "low"
            if too_few_points:
                tz_result["note"] = "Insufficient data for reliable timezone inference"
            else:
                tz_result["note"] = "Timezone leak is LOW confidence for English content"
        elif raw_conf >= 0.85:
            tz_result["confidence_level"] = "high"
        else:
            tz_result["confidence_level"] = "medium"

        # Reads the (possibly halved) confidence written just above.
        conf = float(tz_result.get("confidence", 0.5))
        if conf >= 0.4:
            severity = tz_result.get("confidence_level", "high")
        else:
            severity = "low"
        tz_finding = {
            "type": "timezone_leak",
            "severity": severity,
            "description": (
                f"Timezone leak: probable {tz_result.get('probable_timezone_offset', 'unknown')}"
            ),
            "evidence": (
                f"Activity window: {tz_result.get('peak_window', 'unknown')} "
                f"(confidence {conf:.0%})"
            ),
            "first_detected": None,
        }
        findings.append(tz_finding)
        score -= 25

    if lang_result.get("detected"):
        langs = lang_result.get("languages_found", [])
        lang_finding = {
            "type": "language_switch",
            "severity": "medium",
            "description": (
                f"{lang_result.get('switch_count', 0)} language switch(es) detected"
            ),
            "evidence": (
                f"Primary: {lang_result.get('primary_language', 'unknown')}. "
                f"Also: {', '.join(str(code) for code in langs if code != lang_result.get('primary_language'))}"
            ),
            "first_detected": None,
        }
        findings.append(lang_finding)
        score -= 15

    if clearnet_result.get("detected"):
        platforms = clearnet_result.get("platforms", [])
        clearnet_finding = {
            "type": "clearnet_slip",
            "severity": "high",
            "description": (
                f"{len(clearnet_result.get('clearnet_urls', []))} clearnet URL(s) posted"
            ),
            "evidence": f"Platforms: {', '.join(str(p) for p in platforms[:5])}",
            "first_detected": None,
        }
        findings.append(clearnet_finding)
        score -= 15

    pgp_result: dict = {"detected": False}
    if pgp_fingerprints and len(pgp_fingerprints) > 1:
        pgp_result = detect_pgp_reuse(pgp_fingerprints, pgp_sources)
        if pgp_result.get("detected"):
            fp_short = (pgp_result.get("fingerprint") or "")[:16]
            pgp_finding = {
                "type": "pgp_reuse",
                "severity": "high",
                "description": (
                    f"Same PGP key used across {pgp_result.get('forum_count', 2)} forums"
                ),
                "evidence": f"Key {fp_short}... reused",
                "first_detected": None,
            }
            findings.append(pgp_finding)
            score -= 20

    score = max(0, score)

    # Map the score onto its risk band (first threshold reached wins).
    for floor, label in ((80, "LOW"), (60, "MEDIUM"), (40, "HIGH")):
        if score >= floor:
            risk_level = label
            break
    else:
        risk_level = "CRITICAL"

    # Legacy normalized risk_score (0–1), higher = worse — for backward compatibility
    tz_component = (
        float(tz_result.get("confidence", 0.5)) if tz_result.get("detected") else 0.0
    )
    if lang_result.get("detected"):
        switched = lang_result.get("switch_count", 0)
        lang_component = min(1.0, switched / max(len(texts), 1))
    else:
        lang_component = 0.0
    clearnet_component = 1.0 if clearnet_result.get("detected") else 0.0

    legacy_scores: list[float] = [tz_component, lang_component, clearnet_component]
    risk_score = sum(legacy_scores) / len(legacy_scores)
    if pgp_result.get("detected"):
        risk_score = min(1.0, risk_score + 0.2)

    return {
        "handle": handle,
        "timezone_leak": tz_result,
        "language_switch": lang_result,
        "clearnet_slips": clearnet_result,
        "pgp_reuse": pgp_result,
        "findings": findings,
        "opsec_score": score,
        "risk_level": risk_level,
        "risk_score": round(risk_score, 3),
    }
|
analysis/patterns.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analysis/patterns.py — Pattern library for known behavioral signatures.
|
|
3
|
+
|
|
4
|
+
Heuristic rules derived from threat intelligence research for detecting
|
|
5
|
+
exit scams, law enforcement actions, and new actor emergence.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from datetime import date, timedelta
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from analysis.temporal import ( # noqa: E402 — imported at module level for patchability
|
|
15
|
+
build_activity_timeline,
|
|
16
|
+
compute_activity_stats,
|
|
17
|
+
detect_anomalies,
|
|
18
|
+
detect_silence_breaks,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def check_exit_scam_pattern(timeline: list[dict]) -> dict:
    """
    Flag a sharp activity drop that may precede an exit scam.

    Compares the average ``count`` of entries dated within the 14 days up to
    the newest entry against the average of entries in the 14 days before
    that. A drop above 60% is "high" risk, above 30% "medium", otherwise
    "low". Confidence grows with the number of timeline entries (capped at
    30 entries) and is scaled down for the medium and low verdicts.

    Returns {"risk": "high"|"medium"|"low", "confidence": float, "reason": str}.
    """
    if not timeline:
        return {"risk": "low", "confidence": 0.0, "reason": "Insufficient data"}

    entries = sorted(timeline, key=lambda x: x["date"])
    if len(entries) < 2:
        return {"risk": "low", "confidence": 0.1, "reason": "Insufficient historical data"}

    newest = entries[-1]["date"]
    recent_floor = newest - timedelta(days=14)
    prior_floor = recent_floor - timedelta(days=14)

    # Split entries into the latest 14-day window and the one preceding it;
    # anything older than both windows is ignored.
    recent_counts = []
    prior_counts = []
    for entry in entries:
        day = entry["date"]
        if day > recent_floor:
            recent_counts.append(entry["count"])
        elif day > prior_floor:
            prior_counts.append(entry["count"])

    if not prior_counts:
        return {
            "risk": "low",
            "confidence": 0.1,
            "reason": "No prior 14-day baseline available",
        }

    recent_avg = sum(recent_counts) / len(recent_counts) if recent_counts else 0.0
    prior_avg = sum(prior_counts) / len(prior_counts)
    if prior_avg == 0.0:
        return {"risk": "low", "confidence": 0.2, "reason": "No prior activity baseline"}

    drop_ratio = 1.0 - (recent_avg / prior_avg)
    # more data → higher confidence
    confidence = min(1.0, len(entries) / 30.0)

    if drop_ratio > 0.60:
        reason = (
            f"Activity dropped {drop_ratio:.0%} over the last 14 days "
            f"(from {prior_avg:.1f} to {recent_avg:.1f} posts/day)"
        )
        return {"risk": "high", "confidence": round(confidence, 3), "reason": reason}

    if drop_ratio > 0.30:
        reason = f"Moderate activity decline of {drop_ratio:.0%} over the last 14 days"
        return {"risk": "medium", "confidence": round(confidence * 0.7, 3), "reason": reason}

    return {
        "risk": "low",
        "confidence": round(confidence * 0.5, 3),
        "reason": "Activity levels are stable",
    }
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_law_enforcement_pattern(timeline: list[dict]) -> dict:
    """
    Flag abrupt silence following a run of recorded activity.

    Measures the number of days between today and the newest timeline entry.
    With at least 5 entries on record, 7+ silent days is "high" risk and 3+
    silent days is "medium"; everything else is "low". Confidence scales with
    the number of entries (capped at 20 entries).

    Returns {"risk": "high"|"medium"|"low", "confidence": float, "reason": str}.
    """
    if not timeline or len(timeline) < 2:
        return {"risk": "low", "confidence": 0.0, "reason": "Insufficient data"}

    entries = sorted(timeline, key=lambda x: x["date"])
    entry_total = len(entries)

    # Silence is measured from the most recent entry up to the current date.
    idle_days = (date.today() - entries[-1]["date"]).days

    # "Sustained" prior activity means at least 5 data points.
    sustained = entry_total >= 5
    base_confidence = min(1.0, entry_total / 20.0)

    if sustained and idle_days >= 7:
        reason = (
            f"Complete silence for {idle_days} days after sustained activity "
            f"({entry_total} active days on record)"
        )
        return {
            "risk": "high",
            "confidence": round(min(base_confidence + 0.3, 1.0), 3),
            "reason": reason,
        }

    if sustained and idle_days >= 3:
        return {
            "risk": "medium",
            "confidence": round(base_confidence * 0.6, 3),
            "reason": f"Reduced activity for {idle_days} days",
        }

    return {
        "risk": "low",
        "confidence": round(base_confidence * 0.3, 3),
        "reason": "Activity pattern appears normal",
    }
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def check_new_actor_pattern(
    entity_value: str,
    entity_type: str,
) -> dict:
    """
    Report whether an entity was first seen fewer than 7 days ago.

    Builds the entity's activity timeline and derives ``first_seen`` and
    ``active_days`` from its stats. Any error raised along the way is logged
    at debug level and answered with the "not new" default.

    Returns {"is_new": bool, "first_seen": date | None, "days_active": int}.
    """
    not_new = {"is_new": False, "first_seen": None, "days_active": 0}

    try:
        timeline = build_activity_timeline(entity_value, entity_type)
        if not timeline:
            return not_new

        stats = compute_activity_stats(timeline)
        first_seen = stats.get("first_seen")
        days_active = int(stats.get("active_days", 0))

        # Without a first_seen date the entity is never considered new.
        is_new = first_seen is not None and (date.today() - first_seen).days < 7

        return {
            "is_new": is_new,
            "first_seen": first_seen,
            "days_active": days_active,
        }
    except Exception as exc:
        logger.debug("check_new_actor_pattern: error (%s)", exc)
        return not_new
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def run_all_patterns(
    entity_value: str,
    entity_type: str,
) -> dict:
    """
    Evaluate every behavioral pattern check for one entity.

    Builds the activity timeline once and feeds it to the exit-scam,
    law-enforcement, anomaly and silence-break checks; the new-actor check is
    called with the entity identifiers. If anything raises, the error is
    logged at debug level and a neutral result is returned.

    Returns combined dict with keys: exit_scam, law_enforcement, new_actor,
    anomalies, silence_breaks.
    """
    try:
        timeline = build_activity_timeline(entity_value, entity_type)

        # Evaluated in the same order as the keys of the returned dict.
        exit_scam = check_exit_scam_pattern(timeline)
        law_enforcement = check_law_enforcement_pattern(timeline)
        new_actor = check_new_actor_pattern(entity_value, entity_type)
        anomalies = detect_anomalies(timeline)
        silence_breaks = detect_silence_breaks(timeline)

        return {
            "exit_scam": exit_scam,
            "law_enforcement": law_enforcement,
            "new_actor": new_actor,
            "anomalies": anomalies,
            "silence_breaks": silence_breaks,
        }
    except Exception as exc:
        logger.debug("run_all_patterns: error (%s)", exc)
        neutral = {
            "exit_scam": {"risk": "low", "confidence": 0.0, "reason": "Error"},
            "law_enforcement": {"risk": "low", "confidence": 0.0, "reason": "Error"},
            "new_actor": {"is_new": False, "first_seen": None, "days_active": 0},
            "anomalies": [],
            "silence_breaks": [],
        }
        return neutral
|