voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor/regex_patterns.py — Pre-compiled regex patterns for entity extraction.
|
|
3
|
+
|
|
4
|
+
All patterns are compiled at module load time. No pattern is ever compiled
|
|
5
|
+
inside a function call.
|
|
6
|
+
|
|
7
|
+
Public interface
|
|
8
|
+
----------------
|
|
9
|
+
extract_all(text) → dict[str, list[str]]
|
|
10
|
+
extract_type(text, entity_type) → list[str] raises ValueError on unknown type
|
|
11
|
+
|
|
12
|
+
Entity type constants are exported so callers can use them symbolically
|
|
13
|
+
(e.g. regex_patterns.BITCOIN_ADDRESS) rather than raw strings.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import ipaddress
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Entity type constants
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
# Symbolic entity-type names. Each constant's value is its own name, so the
# strings that appear as result keys match the identifiers used in code.
BITCOIN_ADDRESS = "BITCOIN_ADDRESS"
ETHEREUM_ADDRESS = "ETHEREUM_ADDRESS"
MONERO_ADDRESS = "MONERO_ADDRESS"
ONION_URL = "ONION_URL"
EMAIL_ADDRESS = "EMAIL_ADDRESS"
PGP_KEY_BLOCK = "PGP_KEY_BLOCK"
CVE_NUMBER = "CVE_NUMBER"
IP_ADDRESS = "IP_ADDRESS"
PHONE_NUMBER = "PHONE_NUMBER"
PASTE_URL = "PASTE_URL"
FILE_HASH_MD5 = "FILE_HASH_MD5"
FILE_HASH_SHA1 = "FILE_HASH_SHA1"
FILE_HASH_SHA256 = "FILE_HASH_SHA256"
MITRE_TECHNIQUE = "MITRE_TECHNIQUE"

# Immutable set of every entity type declared above.
ENTITY_TYPES: frozenset[str] = frozenset(
    (
        BITCOIN_ADDRESS,
        ETHEREUM_ADDRESS,
        MONERO_ADDRESS,
        ONION_URL,
        EMAIL_ADDRESS,
        PGP_KEY_BLOCK,
        CVE_NUMBER,
        IP_ADDRESS,
        PHONE_NUMBER,
        PASTE_URL,
        FILE_HASH_MD5,
        FILE_HASH_SHA1,
        FILE_HASH_SHA256,
        MITRE_TECHNIQUE,
    )
)
|
|
59
|
+
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
# Pre-compiled regex patterns
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
# Bitcoin — three address formats, each anchored with \b on both sides:
#   Bech32 (native segwit): "bc1" followed by 25-62 further characters
#   P2PKH legacy:           "1" followed by 25-34 base58 characters
#   P2SH:                   "3" followed by 25-34 base58 characters
# The quantifiers count the characters AFTER the prefix, so a legacy match is
# 26-35 characters long in total.
# NOTE(review): the class used after "bc1" ([a-zA-HJ-NP-Z0-9]) looks wider than
# the Bech32 alphabet, so it may over-match — confirm whether that is intended.
_BITCOIN_RE = re.compile(
    r"\b(?:"
    r"bc1[a-zA-HJ-NP-Z0-9]{25,62}"
    r"|1[a-km-zA-HJ-NP-Z1-9]{25,34}"
    r"|3[a-km-zA-HJ-NP-Z1-9]{25,34}"
    r")\b"
)

# Ethereum — "0x" followed by exactly 40 hex characters; the \b anchors keep the
# pattern from matching inside a longer hex blob.
_ETHEREUM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")

# Monero — "4", then one character from [0-9AB], then 93 base58 characters
# (95 characters in total), word-bounded.
_MONERO_RE = re.compile(r"\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b")
|
|
81
|
+
|
|
82
|
+
# Onion URLs — the scheme-prefixed alternative (http/https plus optional path)
# is listed before the bare-hostname alternative. Alternation is tried left to
# right, so the longer form wins when both could match at the same position.
# Hostname labels are 16-56 characters from [a-z2-7]; matching is
# case-insensitive. Neither alternative has a leading \b.
_ONION_RE = re.compile(
    r"https?://[a-z2-7]{16,56}\.onion(?:/[^\s\"'<>]*)?"
    r"|[a-z2-7]{16,56}\.onion(?:/[^\s\"'<>]*)?",
    re.IGNORECASE,
)

# Email — simplified RFC 5322. Leading/trailing-dot and consecutive-dot
# validation is done in _is_valid_email() rather than in the regex itself
# to keep the pattern readable. The regex only requires an alphanumeric first
# character on each side of "@" and a final label of two or more letters.
_EMAIL_RE = re.compile(
    r"\b[a-zA-Z0-9][a-zA-Z0-9._%+\-]*@[a-zA-Z0-9][a-zA-Z0-9.\-]*\.[a-zA-Z]{2,}\b"
)
|
|
96
|
+
|
|
97
|
+
# PGP — full armored public key block. DOTALL lets "." cross newlines and the
# lazy ".*?" stops at the first END marker, so two blocks in one text are
# returned as two matches rather than one span.
_PGP_BLOCK_RE = re.compile(
    r"-----BEGIN PGP PUBLIC KEY BLOCK-----.*?-----END PGP PUBLIC KEY BLOCK-----",
    re.DOTALL,
)

# PGP — bare fingerprints (case-insensitive), in two layouts:
#   colon-separated: 20 groups of exactly 2 hex chars
#       e.g. AB:CD:EF:01:23:45:67:89:AB:CD:EF:01:23:45:67:89:AB:CD:EF:01
#   grouped: 10 groups of exactly 4 hex chars, each optionally preceded by a
#       single whitespace character, e.g. ABCD 1234 ABCD 1234 ...
# Both repetitions are non-capturing so that Pattern.findall() returns whole
# fingerprints; a capturing group would make it return only the last group.
_PGP_FINGERPRINT_RE = re.compile(
    r"\b[0-9A-Fa-f]{2}(?::[0-9A-Fa-f]{2}){19}\b|"
    r"\b[0-9A-F]{4}(?:\s?[0-9A-F]{4}){9}\b",
    re.IGNORECASE,
)

# PGP — 40 hex chars introduced by the keyword "fingerprint", with at most 50
# whitespace/colon characters in between. The keyword is matched
# case-insensitively, like the other keyword patterns in this module, and the
# trailing \b prevents a longer hex string (e.g. a 64-char digest) from being
# reported as a truncated 40-char fingerprint.
_PGP_CONTEXT_RE = re.compile(
    r"fingerprint[\s:]{0,50}[0-9A-Fa-f]{40}\b",
    re.IGNORECASE,
)
|
|
116
|
+
|
|
117
|
+
# MD5 — exactly 32 hex chars, word-bounded. The \b anchors reject runs that are
# part of a longer hex or alphanumeric token; any 32-hex token matches.
_FILE_HASH_MD5_RE = re.compile(r"\b[0-9a-fA-F]{32}\b")

# SHA1 — exactly 40 hex chars, word-bounded. _extract_pgp() also uses this
# pattern to drop bare 40-hex strings from its fingerprint matches.
_FILE_HASH_SHA1_RE = re.compile(r"\b[0-9a-fA-F]{40}\b")

# SHA256 — exactly 64 hex chars, word-bounded
_FILE_HASH_SHA256_RE = re.compile(r"\b[0-9a-fA-F]{64}\b")
|
|
125
|
+
|
|
126
|
+
# CVE — "CVE-", a 4-digit year, "-", then a 4-7 digit ID; word-bounded and
# case-insensitive.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b", re.IGNORECASE)

# MITRE ATT&CK technique — "T" + 4 digits, optionally "." + 3 sub-technique
# digits, e.g. T1486, T1071.001, T1059.003 (word-bounded, case-insensitive)
_MITRE_TECHNIQUE_RE = re.compile(r"\bT\d{4}(?:\.\d{3})?\b", re.IGNORECASE)
|
|
132
|
+
|
|
133
|
+
# IPv4 — four dot-separated octets, word-bounded. Each octet is limited to
# 0-255 by the alternation 25[0-5] | 2[0-4]\d | [01]?\d\d?; note that this
# alternation also accepts zero-padded octets such as "001".
# RFC1918/loopback filtering happens in _is_public_ip() — not in regex.
_IP_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
)

# Phone — E.164 style: "+", one digit 1-9, then 6-14 further digits (7-15
# digits in total). Only the end of the number is word-bounded.
_PHONE_RE = re.compile(r"\+[1-9]\d{6,14}\b")
|
|
141
|
+
|
|
142
|
+
# Paste site URLs — known domains only, full URL required: http/https scheme,
# optional "www.", one of the domains below, "/", then everything up to the
# next whitespace, quote or angle bracket. Matching is case-insensitive.
_PASTE_DOMAINS = (
    r"(?:pastebin\.com|rentry\.co|ghostbin\.com|paste\.ee"
    r"|hastebin\.com|privatebin\.net|bin\.bini\.monster)"
)
# _PASTE_DOMAINS is already regex-escaped, so it is interpolated verbatim.
_PASTE_RE = re.compile(
    rf"https?://(?:www\.)?{_PASTE_DOMAINS}/[^\s\"'<>]*",
    re.IGNORECASE,
)
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Private IP ranges to exclude (RFC1918 + loopback)
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
# Networks whose addresses _is_public_ip() rejects: the three RFC1918 ranges
# followed by the IPv4 loopback range.
_PRIVATE_NETS = [
    ipaddress.ip_network(cidr)
    for cidr in (
        "10.0.0.0/8",
        "172.16.0.0/12",
        "192.168.0.0/16",
        "127.0.0.0/8",
    )
]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _is_public_ip(addr: str) -> bool:
    """Return True if *addr* parses as an IP address outside every _PRIVATE_NETS range.

    Strings that ``ipaddress.ip_address`` rejects yield False. Only the
    networks listed in ``_PRIVATE_NETS`` are excluded; no other special-purpose
    range is checked here.
    """
    try:
        parsed = ipaddress.ip_address(addr)
    except ValueError:
        # Not a parseable address at all.
        return False

    for network in _PRIVATE_NETS:
        if parsed in network:
            return False
    return True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _is_valid_email(email: str) -> bool:
    """Return True when *email* splits cleanly at "@" and has a sane dot layout.

    Rejects:
      * strings with no "@", or with nothing before or after the first "@";
      * a local part or domain containing consecutive dots;
      * a local part or domain that starts or ends with a dot.

    This is a post-filter for _EMAIL_RE matches, not a full RFC 5322 check.
    """
    local, separator, domain = email.partition("@")

    # partition() returns an empty separator when "@" is absent; without both
    # sides there is no address to validate.
    if not separator or not local or not domain:
        return False

    for part in (local, domain):
        if ".." in part or part.startswith(".") or part.endswith("."):
            return False
    return True
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _findall(pattern: re.Pattern, text: str) -> list[str]:
    """Collect the full text of every non-overlapping match of *pattern* in *text*.

    Always yields whole matches, regardless of any capturing groups in the
    pattern.
    """
    hits = pattern.finditer(text)
    return [hit[0] for hit in hits]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _dedup(values) -> list[str]:
    """Drop repeated items from *values*, keeping each item's first position.

    Accepts any iterable of hashable items, including generators.
    """
    # dict keys are unique and keep insertion order, which gives exactly
    # "first occurrence wins" semantics.
    return list(dict.fromkeys(values))
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Per-type extractor functions (dispatched through _EXTRACTORS by extract_all and extract_type)
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
def _extract_bitcoin(text: str) -> list[str]:
    """Return the distinct _BITCOIN_RE matches in *text*, in first-seen order."""
    candidates = _findall(_BITCOIN_RE, text)
    return _dedup(candidates)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _extract_ethereum(text: str) -> list[str]:
    """Return the distinct _ETHEREUM_RE matches in *text*, in first-seen order."""
    candidates = _findall(_ETHEREUM_RE, text)
    return _dedup(candidates)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _extract_monero(text: str) -> list[str]:
    """Return the distinct _MONERO_RE matches in *text*, in first-seen order."""
    candidates = _findall(_MONERO_RE, text)
    return _dedup(candidates)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _extract_onion(text: str) -> list[str]:
    """Return the distinct _ONION_RE matches in *text*, in first-seen order.

    Matches are returned exactly as written, so the same host with and without
    a scheme or path yields separate entries.
    """
    candidates = _findall(_ONION_RE, text)
    return _dedup(candidates)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _extract_email(text: str) -> list[str]:
    """Return distinct _EMAIL_RE matches that also pass _is_valid_email()."""
    plausible = filter(_is_valid_email, _findall(_EMAIL_RE, text))
    return _dedup(plausible)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _extract_pgp(text: str) -> list[str]:
    """Return distinct PGP artefacts found in *text*.

    The result lists, in this order:
      1. armored public key blocks (_PGP_BLOCK_RE),
      2. bare fingerprints (_PGP_FINGERPRINT_RE),
      3. keyword-introduced fingerprints (_PGP_CONTEXT_RE); these keep the
         matched keyword prefix as part of the returned string.

    Items from (1) and (2) that are also a _FILE_HASH_SHA1_RE match are
    dropped, so an unlabelled 40-hex string is left to the SHA1 extractor.
    Items from (3) are never filtered.
    """
    sha1_like = set(_findall(_FILE_HASH_SHA1_RE, text))

    unlabelled = _findall(_PGP_BLOCK_RE, text) + _findall(_PGP_FINGERPRINT_RE, text)
    collected = [hit for hit in unlabelled if hit not in sha1_like]

    # Keyword context is the disambiguator, so these bypass the SHA1 filter.
    collected.extend(_findall(_PGP_CONTEXT_RE, text))
    return _dedup(collected)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _extract_md5(text: str) -> list[str]:
    """Return the distinct _FILE_HASH_MD5_RE matches in *text*, in first-seen order."""
    candidates = _findall(_FILE_HASH_MD5_RE, text)
    return _dedup(candidates)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _extract_sha1(text: str) -> list[str]:
    """Return the distinct _FILE_HASH_SHA1_RE matches in *text*, in first-seen order."""
    candidates = _findall(_FILE_HASH_SHA1_RE, text)
    return _dedup(candidates)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _extract_sha256(text: str) -> list[str]:
    """Return the distinct _FILE_HASH_SHA256_RE matches in *text*, in first-seen order."""
    candidates = _findall(_FILE_HASH_SHA256_RE, text)
    return _dedup(candidates)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _extract_cve(text: str) -> list[str]:
    """Return distinct CVE identifiers from *text*, upper-cased before deduplication."""
    normalized = map(str.upper, _findall(_CVE_RE, text))
    return _dedup(normalized)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _extract_mitre(text: str) -> list[str]:
    """Return distinct _MITRE_TECHNIQUE_RE matches from *text*, upper-cased before deduplication."""
    normalized = map(str.upper, _findall(_MITRE_TECHNIQUE_RE, text))
    return _dedup(normalized)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _extract_ip(text: str) -> list[str]:
    """Return distinct _IP_RE matches in *text* that _is_public_ip() accepts."""
    routable = filter(_is_public_ip, _findall(_IP_RE, text))
    return _dedup(routable)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _extract_phone(text: str) -> list[str]:
    """Return the distinct _PHONE_RE matches in *text*, in first-seen order."""
    candidates = _findall(_PHONE_RE, text)
    return _dedup(candidates)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _extract_paste(text: str) -> list[str]:
    """Return the distinct _PASTE_RE matches in *text*, in first-seen order."""
    candidates = _findall(_PASTE_RE, text)
    return _dedup(candidates)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# Dispatch table: entity-type constant -> function taking the text and
# returning the list of matches for that type. extract_all() iterates this
# mapping in insertion order, and extract_type() looks a single entry up.
# Keep its keys in step with ENTITY_TYPES, which extract_type() reports as the
# list of valid types.
_EXTRACTORS: dict[str, object] = {
    BITCOIN_ADDRESS: _extract_bitcoin,
    ETHEREUM_ADDRESS: _extract_ethereum,
    MONERO_ADDRESS: _extract_monero,
    ONION_URL: _extract_onion,
    EMAIL_ADDRESS: _extract_email,
    PGP_KEY_BLOCK: _extract_pgp,
    FILE_HASH_MD5: _extract_md5,
    FILE_HASH_SHA1: _extract_sha1,
    FILE_HASH_SHA256: _extract_sha256,
    CVE_NUMBER: _extract_cve,
    MITRE_TECHNIQUE: _extract_mitre,
    IP_ADDRESS: _extract_ip,
    PHONE_NUMBER: _extract_phone,
    PASTE_URL: _extract_paste,
}
|
|
290
|
+
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
# Public interface
|
|
293
|
+
# ---------------------------------------------------------------------------
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def extract_all(text: str) -> dict[str, list[str]]:
    """
    Run all entity patterns against *text*.

    Returns a dict keyed by entity-type constant. Every key in ENTITY_TYPES is
    always present; types with no matches map to an empty list. Never raises.

    Each extractor is isolated: if one fails, the error is logged with its
    traceback, that type maps to an empty list, and the remaining extractors
    still run.
    """
    result: dict[str, list[str]] = {}
    for entity_type, extractor in _EXTRACTORS.items():
        try:
            result[entity_type] = extractor(text)  # type: ignore[operator]
        except Exception:
            # Top-level boundary: honour the "never raises" contract without
            # letting one bad extractor discard everyone else's results.
            logger.exception(
                "extract_all encountered an unexpected error (entity type %s)",
                entity_type,
            )
            result[entity_type] = []

    # Guarantee a key for any declared type that has no registered extractor.
    for entity_type in ENTITY_TYPES:
        result.setdefault(entity_type, [])
    return result
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def extract_type(text: str, entity_type: str) -> list[str]:
    """
    Run only the extractor registered for *entity_type* against *text*.

    Raises ValueError when *entity_type* has no entry in _EXTRACTORS; the
    message lists the names in ENTITY_TYPES.
    """
    extractor = _EXTRACTORS.get(entity_type)
    if extractor is None:
        raise ValueError(
            f"Unknown entity type {entity_type!r}. "
            f"Valid types: {sorted(ENTITY_TYPES)}"
        )
    return extractor(text)  # type: ignore[operator]
|
fingerprint/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fingerprint — Writing style fingerprinting for threat actor identification.
|
|
3
|
+
|
|
4
|
+
Public interface
|
|
5
|
+
----------------
|
|
6
|
+
from fingerprint.stylometry import extract_style_vector, compute_similarity, are_likely_same_author
|
|
7
|
+
from fingerprint.profiler import build_actor_profile, update_profile, match_against_profiles
|
|
8
|
+
from fingerprint.profiler import load_profiles_from_db, save_profile_to_db
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from fingerprint.profiler import (
|
|
12
|
+
build_actor_profile,
|
|
13
|
+
load_profiles_from_db,
|
|
14
|
+
match_against_profiles,
|
|
15
|
+
save_profile_to_db,
|
|
16
|
+
update_profile,
|
|
17
|
+
)
|
|
18
|
+
from fingerprint.stylometry import (
|
|
19
|
+
are_likely_same_author,
|
|
20
|
+
compute_similarity,
|
|
21
|
+
extract_style_vector,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Public API of the package: the three stylometry helpers followed by the five
# profiler helpers, matching the names imported above.
__all__ = [
    "extract_style_vector",
    "compute_similarity",
    "are_likely_same_author",
    "build_actor_profile",
    "update_profile",
    "match_against_profiles",
    "load_profiles_from_db",
    "save_profile_to_db",
]
|
fingerprint/profiler.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fingerprint/profiler.py — Builds and maintains style profiles for threat actors.
|
|
3
|
+
|
|
4
|
+
A profile is the mean style vector across all posts attributed to a handle.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
from fingerprint.stylometry import compute_similarity, extract_style_vector
|
|
14
|
+
from vector import store as vector_store
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# Default ``threshold`` for match_against_profiles(), which forwards it
# unchanged to vector_store.match_actor_profiles().
SIMILARITY_THRESHOLD = 0.82
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Internal helpers
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
def _mean_vector(vectors: list[dict]) -> dict | None:
    """Average a list of style vectors feature by feature.

    The first vector acts as the template: only its keys are averaged, and the
    type of its value decides how a key is treated.

    * Keys starting with "_" are skipped.
    * Dict-valued features are averaged per sub-key, over the vectors that
      actually contain that sub-key.
    * Any other feature is averaged over the vectors holding an int/float for
      it; if none do, the result is 0.0.

    Returns None for an empty input list.
    """
    if not vectors:
        return None

    template = vectors[0]
    averaged: dict[str, Any] = {}

    for name, sample in template.items():
        if name.startswith("_"):
            continue

        if isinstance(sample, dict):
            # Gather this feature's sub-dicts once, in input order.
            nested = [
                vec[name]
                for vec in vectors
                if name in vec and isinstance(vec[name], dict)
            ]
            sub_names: set[str] = set()
            for mapping in nested:
                sub_names.update(mapping.keys())

            sub_means: dict[str, float] = {}
            for sub_name in sub_names:
                observed = [
                    mapping[sub_name] for mapping in nested if sub_name in mapping
                ]
                sub_means[sub_name] = (
                    sum(observed) / len(observed) if observed else 0.0
                )
            averaged[name] = sub_means
            continue

        numbers = [
            float(vec[name])
            for vec in vectors
            if name in vec and isinstance(vec[name], (int, float))
        ]
        averaged[name] = sum(numbers) / len(numbers) if numbers else 0.0

    return averaged
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Public interface
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
def build_actor_profile(texts: list[str]) -> dict | None:
    """
    Build a profile as the mean style vector of the usable *texts*.

    A text is usable when it is at least 100 characters long and
    extract_style_vector() returns a vector for it. Returns None when no text
    is usable.

    The returned profile carries two bookkeeping keys:
    ``_sample_count`` (number of usable texts) and ``_total_chars`` (combined
    length of every non-empty text, including those too short to be sampled).
    """
    extracted = (
        extract_style_vector(sample)
        for sample in texts
        if sample and len(sample) >= 100
    )
    usable = [vec for vec in extracted if vec is not None]
    if not usable:
        return None

    profile = _mean_vector(usable)
    if profile is None:
        return None

    profile["_sample_count"] = len(usable)
    profile["_total_chars"] = sum(len(sample) for sample in texts if sample)
    return profile
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def update_profile(existing_profile: dict, new_texts: list[str]) -> dict:
    """
    Fold new text samples into an existing profile with a weighted running mean.

    The old profile is weighted by its ``_sample_count`` (1 if the key is
    absent) and the mean of the new vectors by the number of usable new texts,
    so historical texts need not be stored.

    Returns *existing_profile* itself when no new text is usable (at least 100
    characters and accepted by extract_style_vector()); otherwise a new dict.
    For dict-valued features, a sub-key missing on one side counts as 0.0.
    """
    fresh_vectors: list[dict] = [
        vec
        for vec in (
            extract_style_vector(sample)
            for sample in new_texts
            if sample and len(sample) >= 100
        )
        if vec is not None
    ]
    if not fresh_vectors:
        return existing_profile

    old_weight = int(existing_profile.get("_sample_count", 1))
    new_weight = len(fresh_vectors)
    combined_weight = old_weight + new_weight

    fresh_mean = _mean_vector(fresh_vectors)
    if fresh_mean is None:
        return existing_profile

    def blend(previous: float, incoming: float) -> float:
        """Weighted mean of an old and a new value."""
        return (previous * old_weight + incoming * new_weight) / combined_weight

    merged: dict[str, Any] = {}
    for name in set(existing_profile.keys()) | set(fresh_mean.keys()):
        previous = existing_profile.get(name)

        # Bookkeeping keys are carried over; the two counters are rewritten below.
        if name.startswith("_"):
            merged[name] = previous
            continue

        incoming = fresh_mean.get(name)
        if previous is None and incoming is None:
            continue
        if previous is None:
            merged[name] = incoming
        elif incoming is None:
            merged[name] = previous
        elif isinstance(previous, dict) and isinstance(incoming, dict):
            merged[name] = {
                sub_name: blend(
                    float(previous.get(sub_name, 0.0)),
                    float(incoming.get(sub_name, 0.0)),
                )
                for sub_name in set(previous.keys()) | set(incoming.keys())
            }
        elif isinstance(previous, (int, float)) and isinstance(incoming, (int, float)):
            merged[name] = blend(float(previous), float(incoming))
        else:
            # Mismatched value kinds: keep what the profile already had.
            merged[name] = previous

    merged["_sample_count"] = combined_weight
    added_chars = sum(len(sample) for sample in new_texts if sample)
    merged["_total_chars"] = existing_profile.get("_total_chars", 0) + added_chars
    return merged
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def match_against_profiles(
    style_vector: dict,
    top_k: int = 10,
    threshold: float = SIMILARITY_THRESHOLD,
) -> list[dict]:
    """
    Look up stored actor profiles similar to *style_vector*.

    Thin wrapper: forwards all three arguments, by keyword, to
    vector_store.match_actor_profiles() and returns its result unchanged.

    NOTE(review): the earlier docstring described the store as a ChromaDB
    approximate-nearest-neighbour search; that lives in the vector package and
    cannot be confirmed from this module.
    """
    matches = vector_store.match_actor_profiles(
        style_vector=style_vector,
        top_k=top_k,
        threshold=threshold,
    )
    return matches
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def save_profile_to_db(
    profile: dict,
    canonical_value: str,
    entity_type: str,
    session: Any,
) -> bool:
    """
    Insert or update the ActorStyleProfile row for an actor, then push the
    same vector to the vector store.

    The row is identified by (canonical_value, entity_type). Keys of *profile*
    starting with "_" are stripped from the stored vector; ``_sample_count``
    and ``_total_chars`` are written to their own columns (0 when absent).

    Returns True on success. Any exception is logged and reported as False.

    NOTE(review): this function neither commits nor rolls back *session*;
    a new row is only flushed to obtain its id. Transaction handling is
    presumably the caller's job — confirm.
    """
    try:
        from db.models import ActorStyleProfile

        lookup = session.query(ActorStyleProfile).filter(
            ActorStyleProfile.canonical_value == canonical_value,
            ActorStyleProfile.entity_type == entity_type,
        )
        row = lookup.first()

        sample_count = int(profile.get("_sample_count", 0))
        total_chars = int(profile.get("_total_chars", 0))

        features = {}
        for name, value in profile.items():
            if not name.startswith("_"):
                features[name] = value

        if not row:
            row = ActorStyleProfile(
                canonical_value=canonical_value,
                entity_type=entity_type,
                style_vector=features,
                sample_count=sample_count,
                total_chars=total_chars,
                last_updated=datetime.now(timezone.utc),
            )
            session.add(row)
            # Flush so the new row has its id before it is used below.
            session.flush()
        else:
            row.style_vector = features
            row.sample_count = sample_count
            row.total_chars = total_chars
            row.last_updated = datetime.now(timezone.utc)

        vector_store.upsert_actor_profile(
            actor_id=row.id,
            style_vector=features,
            username=canonical_value,
            platform=entity_type,
        )
    except Exception as exc:
        logger.error("save_profile_to_db failed: %s", exc)
        return False
    return True
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def load_profiles_from_db(session: Any) -> dict[str, dict]:
    """
    Read every ActorStyleProfile row and return {canonical_value: style_vector}.

    Rows are keyed by canonical_value only, so if two rows share a
    canonical_value under different entity types, the one read last wins.
    Any exception is logged and an empty dict is returned.
    """
    try:
        from db.models import ActorStyleProfile

        rows = session.query(ActorStyleProfile).all()
        return {row.canonical_value: row.style_vector for row in rows}
    except Exception as exc:
        logger.error("load_profiles_from_db failed: %s", exc)
        return {}
|