voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
analysis/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ analysis — Temporal, behavioral pattern, and OPSEC failure analysis.
3
+
4
+ Public interface
5
+ ---------------
6
+ from analysis.temporal import build_activity_timeline, compute_activity_stats
7
+ from analysis.temporal import detect_anomalies, detect_silence_breaks
8
+ from analysis.patterns import check_exit_scam_pattern, check_law_enforcement_pattern
9
+ from analysis.patterns import check_new_actor_pattern, run_all_patterns
10
+ from analysis.opsec import detect_timezone_leak, detect_language_switch
11
+ from analysis.opsec import detect_clearnet_slip, detect_pgp_reuse
12
+ from analysis.opsec import run_full_opsec_analysis
13
+ """
14
+
15
+ from analysis.opsec import (
16
+ detect_clearnet_slip,
17
+ detect_language_switch,
18
+ detect_pgp_reuse,
19
+ detect_timezone_leak,
20
+ run_full_opsec_analysis,
21
+ )
22
+ from analysis.patterns import (
23
+ check_exit_scam_pattern,
24
+ check_law_enforcement_pattern,
25
+ check_new_actor_pattern,
26
+ run_all_patterns,
27
+ )
28
+ from analysis.temporal import (
29
+ build_activity_timeline,
30
+ compute_activity_stats,
31
+ detect_anomalies,
32
+ detect_silence_breaks,
33
+ )
34
+
35
# Names exported by ``from analysis import *``, grouped by the submodule
# each one is imported from.
__all__ = [
    # analysis.temporal
    "build_activity_timeline",
    "compute_activity_stats",
    "detect_anomalies",
    "detect_silence_breaks",
    # analysis.patterns
    "check_exit_scam_pattern",
    "check_law_enforcement_pattern",
    "check_new_actor_pattern",
    "run_all_patterns",
    # analysis.opsec
    "detect_timezone_leak",
    "detect_language_switch",
    "detect_clearnet_slip",
    "detect_pgp_reuse",
    "run_full_opsec_analysis",
]
analysis/opsec.py ADDED
@@ -0,0 +1,454 @@
1
+ """
2
+ analysis/opsec.py — Detects operational security failures in threat actor
3
+ communications that inadvertently reveal real-world identity or location.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import re
10
+ from collections import Counter
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# http(s) URL matcher. Group 1 captures the host name, which must end in a
# dot followed by at least two letters; an optional path/query/fragment
# tail is consumed up to the next whitespace.
_HTTP_URL_RE = re.compile(
    r"https?://([a-zA-Z0-9][-a-zA-Z0-9.]*\.[a-zA-Z]{2,})(?:[/?#][^\s]*)?",
    flags=re.IGNORECASE,
)

# Structured (non-prose) tokens that are blanked out before language
# detection so they do not skew the result.
_URL_RE = re.compile(r"https?://\S+")
# Strings starting with "1" or "3" followed by 25-34 characters from the
# listed alphabet (presumably legacy Bitcoin addresses).
_BITCOIN_RE = re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b")
# "0x" followed by exactly 40 hex digits.
_ETH_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
# CVE identifiers such as CVE-2021-12345.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
# 56-character .onion host names.
_ONION_RE = re.compile(r"\b[a-z2-7]{56}\.onion\b", flags=re.IGNORECASE)
27
+
28
+
29
def _strip_non_linguistic(text: str) -> str:
    """Replace URLs, wallet addresses, CVE IDs and .onion hosts with a space.

    The substitutions run in a fixed order (URLs first) so that the result
    contains only the remaining text for language detection.
    """
    for pattern in (_URL_RE, _BITCOIN_RE, _ETH_RE, _CVE_RE, _ONION_RE):
        text = pattern.sub(" ", text)
    return text
37
+
38
+
39
def detect_timezone_leak(texts_with_timestamps: list[dict]) -> dict:
    """
    Analyze posting time distribution to infer actor timezone.

    Input: list of {"text": str, "timestamp": datetime}. Entries without a
    timestamp are ignored. Timezone-aware timestamps are converted to UTC;
    naive timestamps are used as-is (i.e. treated as already being in UTC).

    If 80%+ of posts fall within a single 6-hour window, the centre of that
    window is assumed to be ~13:00 local time and a UTC offset is derived
    from it.

    Returns:
        detected: bool
        probable_timezone_offset: str | None  (e.g. "UTC+3")
        confidence: float  (share of posts inside the best window, 0-1)
        posting_hours: list[int]  (UTC hour of every usable post)
        peak_window: str | None  (e.g. "09:00-15:00 UTC")
    """
    # Local import: the module header does not import anything from datetime.
    from datetime import timezone

    def _no_leak(confidence: float, hours: list[int]) -> dict:
        # Shared shape for every "nothing detected" outcome.
        return {
            "detected": False,
            "probable_timezone_offset": None,
            "confidence": confidence,
            "posting_hours": hours,
            "peak_window": None,
        }

    if not texts_with_timestamps:
        return _no_leak(0.0, [])

    posting_hours: list[int] = []
    for entry in texts_with_timestamps:
        ts = entry.get("timestamp")
        if ts is None:
            continue
        if hasattr(ts, "utcoffset") and ts.utcoffset() is not None:
            # astimezone(tz=None) would convert to the *machine's local*
            # zone; the hours below are reported as UTC, so convert to UTC
            # explicitly.
            ts = ts.astimezone(timezone.utc)
        posting_hours.append(ts.hour)

    if not posting_hours:
        return _no_leak(0.0, [])

    total = len(posting_hours)
    hour_counts = Counter(posting_hours)

    # Slide a 6-hour window (wrapping past midnight) over the day and keep
    # the earliest start hour with the highest post count.
    best_start = 0
    best_count = 0
    for start in range(24):
        count = sum(hour_counts[(start + step) % 24] for step in range(6))
        if count > best_count:
            best_count = count
            best_start = start

    coverage = best_count / total
    if coverage < 0.80:
        return _no_leak(round(coverage, 3), posting_hours)

    end_hour = (best_start + 6) % 24
    peak_window = f"{best_start:02d}:00-{end_hour:02d}:00 UTC"

    # Assume the middle of the activity window is about 13:00 local time,
    # then fold the resulting offset into the -12..+12 range.
    window_center_utc = (best_start + 3) % 24
    offset_raw = 13 - window_center_utc
    if offset_raw > 12:
        offset_raw -= 24
    elif offset_raw < -12:
        offset_raw += 24

    # Negative numbers already carry their sign.
    tz_str = f"UTC+{offset_raw}" if offset_raw >= 0 else f"UTC{offset_raw}"

    return {
        "detected": True,
        "probable_timezone_offset": tz_str,
        "confidence": round(coverage, 3),
        "posting_hours": posting_hours,
        "peak_window": peak_window,
    }
130
+
131
+
132
def detect_language_switch(texts: list[str]) -> dict:
    """
    Detect if an actor switches between languages across posts.

    Each text is cleaned of URLs, wallet addresses, CVE IDs and .onion hosts
    and, if at least 50 characters of content remain, classified with the
    optional ``langdetect`` package. The most frequent language is treated
    as primary; every text classified differently counts as a switch.

    If ``langdetect`` is not installed, or no text can be classified, the
    full result structure is still returned with ``detected`` set to False.

    Returns:
        detected: bool
        languages_found: list[str]  (ISO 639-1 codes)
        primary_language: str | None
        switch_count: int
        switched_texts_indices: list[int]
    """

    def _nothing_found() -> dict:
        # Every early exit uses the same, fully populated shape so callers
        # can index the documented keys unconditionally.
        return {
            "detected": False,
            "languages_found": [],
            "primary_language": None,
            "switch_count": 0,
            "switched_texts_indices": [],
        }

    if not texts:
        return _nothing_found()

    try:
        from langdetect import detect as ld_detect
    except ImportError:
        # Optional dependency missing: language analysis is unavailable.
        return _nothing_found()

    detected_langs: list = []
    for text in texts:
        if not text:
            detected_langs.append(None)
            continue
        # Strip so that the whitespace left behind by removed tokens does
        # not count toward the minimum-length requirement.
        clean_text = _strip_non_linguistic(text).strip()
        if len(clean_text) < 50:
            detected_langs.append(None)
            continue
        try:
            detected_langs.append(ld_detect(clean_text))
        except Exception:
            # Best effort: a text the detector cannot classify is skipped.
            detected_langs.append(None)

    valid_langs = [lang for lang in detected_langs if lang is not None]
    if not valid_langs:
        return _nothing_found()

    counter = Counter(valid_langs)
    primary_lang, _ = counter.most_common(1)[0]

    switched_indices = [
        i
        for i, lang in enumerate(detected_langs)
        if lang is not None and lang != primary_lang
    ]

    return {
        "detected": bool(switched_indices),
        "languages_found": list(counter),
        "primary_language": primary_lang,
        "switch_count": len(switched_indices),
        "switched_texts_indices": switched_indices,
    }
200
+
201
+
202
def detect_clearnet_slip(texts: list[str]) -> dict:
    """
    Collect http(s) URLs whose host does not end in ``.onion``.

    Empty or missing texts are skipped. For every clearnet URL the last two
    labels of its host name are recorded as the "platform".

    Returns:
        detected: bool
        clearnet_urls: list[str]  (in order of appearance, duplicates kept)
        platforms: list[str]  (sorted, unique; e.g. ["reddit.com", "youtube.com"])
    """
    urls: list[str] = []
    base_domains: set[str] = set()

    for body in texts:
        if not body:
            continue
        for hit in _HTTP_URL_RE.finditer(body):
            host = hit.group(1).lower()
            if host.endswith(".onion"):
                continue
            urls.append(hit.group(0))
            labels = host.rstrip(".").split(".")
            # Fall back to the whole host when it has fewer than two labels.
            base_domains.add(".".join(labels[-2:]) if len(labels) >= 2 else host)

    return {
        "detected": bool(urls),
        "clearnet_urls": urls,
        "platforms": sorted(base_domains),
    }
236
+
237
+
238
def detect_pgp_reuse(
    pgp_fingerprints: list[str],
    sources: Optional[list[str]] = None,
) -> dict:
    """
    Check if the same PGP fingerprint appears across multiple sources,
    or multiple times in the fingerprint list.

    Fingerprints are compared after stripping surrounding whitespace; blank
    or missing fingerprints are ignored.

    When *sources* is provided with the same length as *pgp_fingerprints*,
    entry ``i`` of *sources* belongs to entry ``i`` of *pgp_fingerprints*
    and reuse is detected if one fingerprint maps to more than one source.

    When *sources* is omitted or its length differs, reuse is detected when
    any fingerprint appears more than once in *pgp_fingerprints*.

    Returns:
        detected: bool
        reused_fingerprints: list[str]
        cross_platform_exposure: list[dict]  ({"fingerprint", "sources"};
            only populated when sources are used)
        forum_count: int  (with sources: largest number of distinct sources
            seen for any single fingerprint; without sources: 2 when reuse
            is detected, otherwise 0)
        fingerprint: str | None  (first reused fingerprint, if any)
    """
    if not pgp_fingerprints:
        return {
            "detected": False,
            "reused_fingerprints": [],
            "cross_platform_exposure": [],
            "forum_count": 0,
            "fingerprint": None,
        }

    def _clean(fp) -> str:
        # Normalise one raw entry; falsy or blank values become "".
        return str(fp).strip() if fp else ""

    if sources is not None and len(sources) == len(pgp_fingerprints):
        fp_to_sources: dict[str, set[str]] = {}
        # Pair each fingerprint with its source BEFORE dropping blanks, so a
        # blank entry cannot shift later fingerprints onto the wrong source.
        for raw_fp, src in zip(pgp_fingerprints, sources):
            fp = _clean(raw_fp)
            if not fp:
                continue
            fp_to_sources.setdefault(fp, set()).add(src or "")

        reused: list[str] = []
        cross_platform: list[dict] = []
        for fp, srcs in fp_to_sources.items():
            if len(srcs) > 1:
                reused.append(fp)
                cross_platform.append({"fingerprint": fp, "sources": sorted(srcs)})

        return {
            "detected": bool(reused),
            "reused_fingerprints": reused,
            "cross_platform_exposure": cross_platform,
            "forum_count": max((len(s) for s in fp_to_sources.values()), default=0),
            "fingerprint": reused[0] if reused else None,
        }

    normalized = [fp for fp in map(_clean, pgp_fingerprints) if fp]
    dupes = [fp for fp, n in Counter(normalized).items() if n > 1]
    return {
        "detected": bool(dupes),
        "reused_fingerprints": dupes,
        "cross_platform_exposure": [],
        # Without source information a repeat is reported as two sightings;
        # when nothing repeats there is no reuse to count.
        "forum_count": 2 if dupes else 0,
        "fingerprint": dupes[0] if dupes else None,
    }
302
+
303
+
304
def run_full_opsec_analysis(
    handle: str,
    texts_with_timestamps: list[dict],
    pgp_fingerprints: Optional[list[str]] = None,
    pgp_sources: Optional[list[str]] = None,
) -> dict:
    """
    Run every OPSEC check for one actor and merge the results.

    The timezone, language-switch and clearnet checks always run; the PGP
    reuse check runs only when more than one fingerprint is supplied.

    Scoring: ``opsec_score`` starts at 100 and loses 25 (timezone leak),
    15 (language switch), 15 (clearnet slip) and 20 (PGP reuse) per finding;
    ``risk_level`` is LOW (>= 80), MEDIUM (>= 60), HIGH (>= 40) or CRITICAL.
    ``risk_score`` is the legacy 0-1 value (higher = worse): the mean of the
    three non-PGP components, plus 0.2 (capped at 1.0) on PGP reuse.

    Returns a dict with keys handle, timezone_leak, language_switch,
    clearnet_slips, pgp_reuse, findings, opsec_score, risk_level, risk_score.
    """
    texts = [item.get("text", "") for item in texts_with_timestamps]

    tz_result = detect_timezone_leak(texts_with_timestamps)
    lang_result = detect_language_switch(texts)
    clearnet_result = detect_clearnet_slip(texts)

    findings: list[dict] = []
    score = 100

    # --- timezone leak -----------------------------------------------------
    if tz_result.get("detected"):
        main_lang = lang_result.get("primary_language", "unknown")
        n_points = len(texts_with_timestamps)
        raw_conf = float(tz_result.get("confidence", 0.5))
        tz_result["data_points"] = n_points
        tz_result["primary_language_correlation"] = main_lang

        too_few = n_points < 20
        if main_lang == "en" or too_few:
            # Small samples and English-language content both halve the
            # reported confidence.
            tz_result["confidence"] = round(raw_conf * 0.5, 3)
            tz_result["confidence_level"] = "low"
            if too_few:
                tz_result["note"] = "Insufficient data for reliable timezone inference"
            else:
                tz_result["note"] = "Timezone leak is LOW confidence for English content"
        elif raw_conf >= 0.85:
            tz_result["confidence_level"] = "high"
        else:
            tz_result["confidence_level"] = "medium"

        conf = float(tz_result.get("confidence", 0.5))
        if conf >= 0.4:
            severity = tz_result.get("confidence_level", "high")
        else:
            severity = "low"
        findings.append(
            {
                "type": "timezone_leak",
                "severity": severity,
                "description": (
                    f"Timezone leak: probable {tz_result.get('probable_timezone_offset', 'unknown')}"
                ),
                "evidence": (
                    f"Activity window: {tz_result.get('peak_window', 'unknown')} "
                    f"(confidence {conf:.0%})"
                ),
                "first_detected": None,
            }
        )
        score -= 25

    # --- language switching ------------------------------------------------
    if lang_result.get("detected"):
        langs = lang_result.get("languages_found", [])
        secondary = ", ".join(
            str(code) for code in langs if code != lang_result.get("primary_language")
        )
        findings.append(
            {
                "type": "language_switch",
                "severity": "medium",
                "description": (
                    f"{lang_result.get('switch_count', 0)} language switch(es) detected"
                ),
                "evidence": (
                    f"Primary: {lang_result.get('primary_language', 'unknown')}. "
                    f"Also: {secondary}"
                ),
                "first_detected": None,
            }
        )
        score -= 15

    # --- clearnet slips ----------------------------------------------------
    if clearnet_result.get("detected"):
        platforms = clearnet_result.get("platforms", [])
        shown_platforms = ", ".join(str(p) for p in platforms[:5])
        findings.append(
            {
                "type": "clearnet_slip",
                "severity": "high",
                "description": (
                    f"{len(clearnet_result.get('clearnet_urls', []))} clearnet URL(s) posted"
                ),
                "evidence": f"Platforms: {shown_platforms}",
                "first_detected": None,
            }
        )
        score -= 15

    # --- PGP key reuse -----------------------------------------------------
    pgp_result: dict = {"detected": False}
    if pgp_fingerprints and len(pgp_fingerprints) > 1:
        pgp_result = detect_pgp_reuse(pgp_fingerprints, pgp_sources)
        if pgp_result.get("detected"):
            fp_short = (pgp_result.get("fingerprint") or "")[:16]
            findings.append(
                {
                    "type": "pgp_reuse",
                    "severity": "high",
                    "description": (
                        f"Same PGP key used across {pgp_result.get('forum_count', 2)} forums"
                    ),
                    "evidence": f"Key {fp_short}... reused",
                    "first_detected": None,
                }
            )
            score -= 20

    score = max(0, score)

    for floor, label in ((80, "LOW"), (60, "MEDIUM"), (40, "HIGH")):
        if score >= floor:
            risk_level = label
            break
    else:
        risk_level = "CRITICAL"

    # Legacy normalized risk_score (0–1), higher = worse — kept for
    # backward compatibility.
    if tz_result.get("detected"):
        tz_component = float(tz_result.get("confidence", 0.5))
    else:
        tz_component = 0.0

    if lang_result.get("detected"):
        switched = lang_result.get("switch_count", 0)
        lang_component = min(1.0, switched / max(len(texts), 1))
    else:
        lang_component = 0.0

    clearnet_component = 1.0 if clearnet_result.get("detected") else 0.0

    components = [tz_component, lang_component, clearnet_component]
    risk_score = sum(components) / len(components)
    if pgp_result.get("detected"):
        risk_score = min(1.0, risk_score + 0.2)

    return {
        "handle": handle,
        "timezone_leak": tz_result,
        "language_switch": lang_result,
        "clearnet_slips": clearnet_result,
        "pgp_reuse": pgp_result,
        "findings": findings,
        "opsec_score": score,
        "risk_level": risk_level,
        "risk_score": round(risk_score, 3),
    }
analysis/patterns.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ analysis/patterns.py — Pattern library for known behavioral signatures.
3
+
4
+ Heuristic rules derived from threat intelligence research for detecting
5
+ exit scams, law enforcement actions, and new actor emergence.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from datetime import date, timedelta
12
+ from typing import Any
13
+
14
+ from analysis.temporal import ( # noqa: E402 — imported at module level for patchability
15
+ build_activity_timeline,
16
+ compute_activity_stats,
17
+ detect_anomalies,
18
+ detect_silence_breaks,
19
+ )
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def check_exit_scam_pattern(timeline: list[dict]) -> dict:
    """
    Check if a marketplace/forum shows exit scam warning signs.

    *timeline* is a list of ``{"date": ..., "count": ...}`` entries. The 14
    days ending at the most recent entry are compared with the 14 days
    before that. Activity is measured in posts per *calendar* day, so days
    that are simply absent from the timeline count as zero activity instead
    of being skipped.

    Criteria: a drop of more than 60% is "high" risk, more than 30% is
    "medium", anything else is "low". Confidence grows with the number of
    timeline entries (saturating at 30) and is scaled down for the weaker
    verdicts.

    Returns {"risk": "high"|"medium"|"low", "confidence": float, "reason": str}.
    """
    if not timeline:
        return {"risk": "low", "confidence": 0.0, "reason": "Insufficient data"}

    sorted_entries = sorted(timeline, key=lambda x: x["date"])

    if len(sorted_entries) < 2:
        return {"risk": "low", "confidence": 0.1, "reason": "Insufficient historical data"}

    window_days = 14
    window = timedelta(days=window_days)
    first_date = sorted_entries[0]["date"]
    last_date = sorted_entries[-1]["date"]
    cutoff_recent = last_date - window
    cutoff_prior = cutoff_recent - window

    recent_total = sum(
        e["count"] for e in sorted_entries if e["date"] > cutoff_recent
    )
    prior_counts = [
        e["count"]
        for e in sorted_entries
        if cutoff_prior < e["date"] <= cutoff_recent
    ]

    if not prior_counts:
        return {
            "risk": "low",
            "confidence": 0.1,
            "reason": "No prior 14-day baseline available",
        }

    # The data may begin part-way through the prior window; only the days
    # from the first entry onward can serve as a baseline. (The lower bound
    # is exclusive, hence the one-day shift.)
    prior_start = max(cutoff_prior, first_date - timedelta(days=1))
    prior_days = max((cutoff_recent - prior_start).days, 1)

    # Per-calendar-day rates: a fall in the number of active days now shows
    # up as a drop even if each remaining active day is as busy as before.
    recent_avg = recent_total / window_days
    prior_avg = sum(prior_counts) / prior_days

    if prior_avg == 0.0:
        return {"risk": "low", "confidence": 0.2, "reason": "No prior activity baseline"}

    drop_ratio = 1.0 - (recent_avg / prior_avg)
    # More data → higher confidence.
    confidence = min(1.0, len(sorted_entries) / 30.0)

    if drop_ratio > 0.60:
        return {
            "risk": "high",
            "confidence": round(confidence, 3),
            "reason": (
                f"Activity dropped {drop_ratio:.0%} over the last 14 days "
                f"(from {prior_avg:.1f} to {recent_avg:.1f} posts/day)"
            ),
        }
    if drop_ratio > 0.30:
        return {
            "risk": "medium",
            "confidence": round(confidence * 0.7, 3),
            "reason": (
                f"Moderate activity decline of {drop_ratio:.0%} over the last 14 days"
            ),
        }
    return {
        "risk": "low",
        "confidence": round(confidence * 0.5, 3),
        "reason": "Activity levels are stable",
    }
94
+
95
+
96
+ def check_law_enforcement_pattern(timeline: list[dict]) -> dict:
97
+ """
98
+ Check for sudden complete silence after sustained activity.
99
+
100
+ Criteria: zero activity for 7+ consecutive days after a period of daily activity.
101
+ Returns {"risk": "high"|"medium"|"low", "confidence": float, "reason": str}.
102
+ """
103
+ if not timeline or len(timeline) < 2:
104
+ return {"risk": "low", "confidence": 0.0, "reason": "Insufficient data"}
105
+
106
+ sorted_entries = sorted(timeline, key=lambda x: x["date"])
107
+
108
+ # Check for silence in the last N calendar days relative to last seen
109
+ last_date = sorted_entries[-1]["date"]
110
+ today = date.today()
111
+ days_since_last = (today - last_date).days
112
+
113
+ # Check if there was sustained prior activity (at least 5 data points)
114
+ has_sustained = len(sorted_entries) >= 5
115
+ confidence_base = min(1.0, len(sorted_entries) / 20.0)
116
+
117
+ if days_since_last >= 7 and has_sustained:
118
+ return {
119
+ "risk": "high",
120
+ "confidence": round(min(confidence_base + 0.3, 1.0), 3),
121
+ "reason": (
122
+ f"Complete silence for {days_since_last} days after sustained activity "
123
+ f"({len(sorted_entries)} active days on record)"
124
+ ),
125
+ }
126
+ elif days_since_last >= 3 and has_sustained:
127
+ return {
128
+ "risk": "medium",
129
+ "confidence": round(confidence_base * 0.6, 3),
130
+ "reason": f"Reduced activity for {days_since_last} days",
131
+ }
132
+ else:
133
+ return {
134
+ "risk": "low",
135
+ "confidence": round(confidence_base * 0.3, 3),
136
+ "reason": "Activity pattern appears normal",
137
+ }
138
+
139
+
140
def check_new_actor_pattern(
    entity_value: str,
    entity_type: str,
) -> dict:
    """
    Report whether an entity was first seen less than 7 days ago.

    The entity's activity timeline is built and summarised; ``first_seen``
    and ``active_days`` are taken from those statistics. Any exception
    raised along the way is logged at debug level and reported as
    "not new" with no data.

    Returns {"is_new": bool, "first_seen": date | None, "days_active": int}.
    """

    def _unknown() -> dict:
        # Result used when there is no timeline or the lookup failed.
        return {"is_new": False, "first_seen": None, "days_active": 0}

    try:
        history = build_activity_timeline(entity_value, entity_type)
        if not history:
            return _unknown()

        summary = compute_activity_stats(history)
        first_seen = summary.get("first_seen")
        days_active = int(summary.get("active_days", 0))

        if first_seen is None:
            is_new = False
        else:
            age_in_days = (date.today() - first_seen).days
            is_new = age_in_days < 7

        return {
            "is_new": is_new,
            "first_seen": first_seen,
            "days_active": days_active,
        }
    except Exception as exc:
        logger.debug("check_new_actor_pattern: error (%s)", exc)
        return _unknown()
172
+
173
+
174
def run_all_patterns(
    entity_value: str,
    entity_type: str,
) -> dict:
    """
    Run every pattern check for one entity and collect the results.

    The activity timeline is built once and handed to the timeline-based
    checks; the new-actor check looks the entity up itself. If anything
    raises, the error is logged at debug level and neutral placeholder
    results are returned instead.

    Returns a dict with keys: exit_scam, law_enforcement, new_actor,
    anomalies, silence_breaks.
    """
    try:
        history = build_activity_timeline(entity_value, entity_type)

        # Evaluated in the same order as the keys of the result.
        exit_scam = check_exit_scam_pattern(history)
        law_enforcement = check_law_enforcement_pattern(history)
        new_actor = check_new_actor_pattern(entity_value, entity_type)
        anomalies = detect_anomalies(history)
        silence_breaks = detect_silence_breaks(history)
    except Exception as exc:
        logger.debug("run_all_patterns: error (%s)", exc)
        return {
            "exit_scam": {"risk": "low", "confidence": 0.0, "reason": "Error"},
            "law_enforcement": {"risk": "low", "confidence": 0.0, "reason": "Error"},
            "new_actor": {"is_new": False, "first_seen": None, "days_active": 0},
            "anomalies": [],
            "silence_breaks": [],
        }

    return {
        "exit_scam": exit_scam,
        "law_enforcement": law_enforcement,
        "new_actor": new_actor,
        "anomalies": anomalies,
        "silence_breaks": silence_breaks,
    }