voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,325 @@
1
+ """
2
+ extractor/regex_patterns.py — Pre-compiled regex patterns for entity extraction.
3
+
4
+ All patterns are compiled at module load time. No pattern is ever compiled
5
+ inside a function call.
6
+
7
+ Public interface
8
+ ----------------
9
+ extract_all(text) → dict[str, list[str]]
10
+ extract_type(text, entity_type) → list[str] raises ValueError on unknown type
11
+
12
+ Entity type constants are exported so callers can use them symbolically
13
+ (e.g. regex_patterns.BITCOIN_ADDRESS) rather than raw strings.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import ipaddress
19
+ import logging
20
+ import re
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Entity type constants
26
+ # ---------------------------------------------------------------------------
27
+
28
# Symbolic names for every entity type this module can extract. The value of
# each constant is identical to its own name.

# Cryptocurrency wallet addresses
BITCOIN_ADDRESS = "BITCOIN_ADDRESS"
ETHEREUM_ADDRESS = "ETHEREUM_ADDRESS"
MONERO_ADDRESS = "MONERO_ADDRESS"

# Network locations, contact identifiers and references
ONION_URL = "ONION_URL"
EMAIL_ADDRESS = "EMAIL_ADDRESS"
PGP_KEY_BLOCK = "PGP_KEY_BLOCK"
CVE_NUMBER = "CVE_NUMBER"
IP_ADDRESS = "IP_ADDRESS"
PHONE_NUMBER = "PHONE_NUMBER"
PASTE_URL = "PASTE_URL"

# File hashes
FILE_HASH_MD5 = "FILE_HASH_MD5"
FILE_HASH_SHA1 = "FILE_HASH_SHA1"
FILE_HASH_SHA256 = "FILE_HASH_SHA256"

# MITRE ATT&CK technique identifiers
MITRE_TECHNIQUE = "MITRE_TECHNIQUE"

# Immutable set of all entity types above; used for membership checks and to
# guarantee that extract_all() reports every type.
ENTITY_TYPES: frozenset[str] = frozenset((
    BITCOIN_ADDRESS,
    ETHEREUM_ADDRESS,
    MONERO_ADDRESS,
    ONION_URL,
    EMAIL_ADDRESS,
    PGP_KEY_BLOCK,
    CVE_NUMBER,
    IP_ADDRESS,
    PHONE_NUMBER,
    PASTE_URL,
    FILE_HASH_MD5,
    FILE_HASH_SHA1,
    FILE_HASH_SHA256,
    MITRE_TECHNIQUE,
))
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Pre-compiled regex patterns
62
+ # ---------------------------------------------------------------------------
63
+
64
# Bitcoin — three formats, all word-bounded:
#   Bech32 / Bech32m (native segwit, taproot): "bc1" followed by 25-62
#     characters from the BIP-173 data charset, i.e. alphanumerics WITHOUT
#     "1", "b", "i" and "o". BIP-173 permits an address to be written entirely
#     in lowercase or entirely in uppercase, but never in mixed case, hence
#     the two separate branches instead of a case-insensitive flag (which
#     would also loosen the case-sensitive base58 branches below).
#   P2PKH legacy: "1" followed by 25-34 base58 characters
#   P2SH:         "3" followed by 25-34 base58 characters
_BITCOIN_RE = re.compile(
    r"\b(?:"
    r"bc1[ac-hj-np-z02-9]{25,62}"
    r"|BC1[AC-HJ-NP-Z02-9]{25,62}"
    r"|1[a-km-zA-HJ-NP-Z1-9]{25,34}"
    r"|3[a-km-zA-HJ-NP-Z1-9]{25,34}"
    r")\b"
)
75
+
76
# Ethereum: the literal prefix "0x" and exactly 40 hexadecimal digits. The
# trailing word boundary rejects longer hex blobs.
_ETHEREUM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")

# Monero: 95 characters in total — a leading "4", one character out of
# [0-9AB], then 93 base58 characters.
_MONERO_RE = re.compile(r"\b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b")

# Onion addresses: a 16-56 character base32 label, ".onion", and an optional
# path. The scheme-prefixed alternative is listed first so that a full URL is
# matched as a whole rather than as a bare hostname.
_ONION_HOST_AND_PATH = r"[a-z2-7]{16,56}\.onion(?:/[^\s\"'<>]*)?"
_ONION_RE = re.compile(
    r"https?://" + _ONION_HOST_AND_PATH + r"|" + _ONION_HOST_AND_PATH,
    re.IGNORECASE,
)

# Email: deliberately loose approximation of RFC 5322. Dot-placement rules
# (no leading, trailing or doubled dots) are enforced afterwards by
# _is_valid_email() so the expression itself stays short.
_EMAIL_RE = re.compile(
    r"\b[a-zA-Z0-9][a-zA-Z0-9._%+\-]*@[a-zA-Z0-9][a-zA-Z0-9.\-]*\.[a-zA-Z]{2,}\b"
)
96
+
97
# PGP — complete ASCII-armored public key block. DOTALL lets "." span the
# newlines inside the block; the lazy quantifier stops at the first END
# marker so that several blocks in one text are matched separately.
_PGP_BLOCK_RE = re.compile(
    r"-----BEGIN PGP PUBLIC KEY BLOCK-----.*?-----END PGP PUBLIC KEY BLOCK-----",
    re.DOTALL,
)

# PGP — bare fingerprint in one of two layouts:
#   * 20 colon-separated groups of exactly 2 hex digits,
#     e.g. AB:CD:EF:01:23:45:67:89:AB:CD:EF:01:23:45:67:89:AB:CD:EF:01
#   * 10 groups of exactly 4 hex digits, each optionally separated by a
#     single whitespace character, e.g. ABCD 1234 ABCD 1234 ...
# The second layout also matches an unbroken 40-hex-digit string, which is
# indistinguishable from a SHA-1 hash without further context.
# The repeated group is non-capturing so that findall() on this pattern
# yields whole matches.
_PGP_FINGERPRINT_RE = re.compile(
    r"\b[0-9A-Fa-f]{2}(?::[0-9A-Fa-f]{2}){19}\b|"
    r"\b[0-9A-F]{4}(?:\s?[0-9A-F]{4}){9}\b",
    re.IGNORECASE,
)

# PGP — the keyword "fingerprint" followed by at most 50 whitespace/colon
# characters and then exactly 40 hex digits. Case-insensitive so that the
# common capitalised form "Fingerprint:" is recognised; the trailing word
# boundary prevents a longer hex run (e.g. a SHA-256) from being truncated
# to its first 40 digits.
_PGP_CONTEXT_RE = re.compile(
    r"fingerprint[\s:]{0,50}[0-9A-Fa-f]{40}\b",
    re.IGNORECASE,
)
116
+
117
# File hashes: fixed-length hexadecimal runs delimited by word boundaries.
#   MD5    = 32 hex digits
#   SHA-1  = 40 hex digits (also consulted by the PGP extractor to discard
#            ambiguous bare 40-digit matches)
#   SHA-256 = 64 hex digits
_FILE_HASH_MD5_RE = re.compile(r"\b[0-9a-fA-F]{32}\b")
_FILE_HASH_SHA1_RE = re.compile(r"\b[0-9a-fA-F]{40}\b")
_FILE_HASH_SHA256_RE = re.compile(r"\b[0-9a-fA-F]{64}\b")

# CVE identifier: "CVE-", a four-digit year, "-", then 4 to 7 digits.
# Matched case-insensitively.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b", re.IGNORECASE)

# MITRE ATT&CK technique: "T" plus four digits, optionally followed by a dot
# and a three-digit sub-technique (T1486, T1071.001). Case-insensitive.
_MITRE_TECHNIQUE_RE = re.compile(r"\bT\d{4}(?:\.\d{3})?\b", re.IGNORECASE)

# IPv4 dotted quad with every octet limited to 0-255. Whether the address is
# private or loopback is decided later by _is_public_ip(), not here.
_OCTET = r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)"
_IP_RE = re.compile(r"\b(?:" + _OCTET + r"\.){3}" + _OCTET + r"\b")

# Phone number in E.164 style: "+", a non-zero digit, then 6-14 more digits.
_PHONE_RE = re.compile(r"\+[1-9]\d{6,14}\b")

# Paste-site links: only an allow-list of known hosts is recognised, and the
# match must be a full http(s) URL (an optional "www." prefix is tolerated).
_PASTE_DOMAINS = "(?:" + "|".join((
    r"pastebin\.com",
    r"rentry\.co",
    r"ghostbin\.com",
    r"paste\.ee",
    r"hastebin\.com",
    r"privatebin\.net",
    r"bin\.bini\.monster",
)) + ")"
_PASTE_RE = re.compile(
    r"https?://(?:www\.)?" + _PASTE_DOMAINS + r"/[^\s\"'<>]*",
    re.IGNORECASE,
)
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # Private IP ranges to exclude (RFC1918 + loopback)
154
+ # ---------------------------------------------------------------------------
155
+
156
# Networks whose members are never reported: the three RFC 1918 private
# ranges followed by the loopback range.
_PRIVATE_NETS = [
    ipaddress.ip_network(cidr)
    for cidr in ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16", "127.0.0.0/8")
]


def _is_public_ip(addr: str) -> bool:
    """Tell whether *addr* parses as an IP address outside ``_PRIVATE_NETS``.

    Returns False when *addr* cannot be parsed by ``ipaddress.ip_address`` or
    when the parsed address belongs to any network in ``_PRIVATE_NETS``;
    True otherwise.
    """
    try:
        parsed = ipaddress.ip_address(addr)
    except ValueError:
        return False
    for network in _PRIVATE_NETS:
        if parsed in network:
            return False
    return True
171
+
172
+
173
def _is_valid_email(email: str) -> bool:
    """Check the dot placement of a regex-matched email candidate.

    The string is split at the first "@". The candidate is rejected (False)
    if either side contains two consecutive dots, or begins or ends with a
    dot; otherwise True is returned.
    """
    local, _, domain = email.partition("@")
    for part in (local, domain):
        if ".." in part or part.startswith(".") or part.endswith("."):
            return False
    return True
183
+
184
+
185
def _findall(pattern: re.Pattern, text: str) -> list[str]:
    """Collect the whole-match text of each non-overlapping hit of *pattern*.

    Unlike ``pattern.findall``, capturing groups never change what is
    returned: each element is always the complete match (group 0).
    """
    hits: list[str] = []
    for match in pattern.finditer(text):
        hits.append(match.group(0))
    return hits
188
+
189
+
190
def _dedup(values) -> list[str]:
    """Drop repeated items from *values*, keeping each first occurrence in order.

    *values* may be any iterable (including a generator) of hashable items.
    """
    # dict keys are unique and remember insertion order, so this keeps exactly
    # the first appearance of every value.
    return list(dict.fromkeys(values))
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Per-type extractor functions (used by extract_all and extract_type)
203
+ # ---------------------------------------------------------------------------
204
+
205
def _extract_bitcoin(text: str) -> list[str]:
    """Return the distinct Bitcoin address matches in *text*, first-seen order."""
    hits = _findall(_BITCOIN_RE, text)
    return _dedup(hits)


def _extract_ethereum(text: str) -> list[str]:
    """Return the distinct Ethereum address matches in *text*, first-seen order."""
    hits = _findall(_ETHEREUM_RE, text)
    return _dedup(hits)


def _extract_monero(text: str) -> list[str]:
    """Return the distinct Monero address matches in *text*, first-seen order."""
    hits = _findall(_MONERO_RE, text)
    return _dedup(hits)


def _extract_onion(text: str) -> list[str]:
    """Return the distinct .onion URL/hostname matches in *text*, first-seen order."""
    hits = _findall(_ONION_RE, text)
    return _dedup(hits)


def _extract_email(text: str) -> list[str]:
    """Return the distinct email matches in *text* that pass _is_valid_email()."""
    candidates = _findall(_EMAIL_RE, text)
    return _dedup(filter(_is_valid_email, candidates))
223
+
224
+
225
def _extract_pgp(text: str) -> list[str]:
    """Extract PGP key material from *text*.

    Three sources are combined, in this order, and de-duplicated while
    preserving first occurrence:

    1. Complete ASCII-armored public key blocks.
    2. Bare fingerprints (colon-separated or 4-digit groups). A match that is
       also a plain 40-hex-digit SHA-1 candidate is dropped, because without
       context it cannot be told apart from a file hash.
    3. Fingerprints introduced by the keyword "fingerprint". Only the 40 hex
       digits are returned — not the keyword and separators that preceded
       them — so the same key is reported identically however it was
       introduced.
    """
    sha1_like = set(_findall(_FILE_HASH_SHA1_RE, text))

    # Armored blocks can never collide with a 40-hex string, so no filtering.
    found: list[str] = list(_findall(_PGP_BLOCK_RE, text))

    found.extend(
        fingerprint
        for fingerprint in _findall(_PGP_FINGERPRINT_RE, text)
        if fingerprint not in sha1_like
    )

    # _PGP_CONTEXT_RE always ends with exactly 40 hex digits; slice them off
    # the end to discard the leading "fingerprint" keyword and separators.
    found.extend(hit[-40:] for hit in _findall(_PGP_CONTEXT_RE, text))

    return _dedup(found)
240
+
241
+
242
def _extract_md5(text: str) -> list[str]:
    """Return the distinct 32-hex-digit (MD5-shaped) matches in *text*."""
    hits = _findall(_FILE_HASH_MD5_RE, text)
    return _dedup(hits)


def _extract_sha1(text: str) -> list[str]:
    """Return the distinct 40-hex-digit (SHA-1-shaped) matches in *text*."""
    hits = _findall(_FILE_HASH_SHA1_RE, text)
    return _dedup(hits)


def _extract_sha256(text: str) -> list[str]:
    """Return the distinct 64-hex-digit (SHA-256-shaped) matches in *text*."""
    hits = _findall(_FILE_HASH_SHA256_RE, text)
    return _dedup(hits)


def _extract_cve(text: str) -> list[str]:
    """Return the distinct CVE identifiers in *text*, upper-cased."""
    hits = _findall(_CVE_RE, text)
    return _dedup(map(str.upper, hits))


def _extract_mitre(text: str) -> list[str]:
    """Return the distinct MITRE ATT&CK technique IDs in *text*, upper-cased."""
    hits = _findall(_MITRE_TECHNIQUE_RE, text)
    return _dedup(map(str.upper, hits))


def _extract_ip(text: str) -> list[str]:
    """Return the distinct IPv4 matches in *text* accepted by _is_public_ip()."""
    candidates = _findall(_IP_RE, text)
    return _dedup(filter(_is_public_ip, candidates))


def _extract_phone(text: str) -> list[str]:
    """Return the distinct phone-number matches in *text*, first-seen order."""
    hits = _findall(_PHONE_RE, text)
    return _dedup(hits)


def _extract_paste(text: str) -> list[str]:
    """Return the distinct paste-site URL matches in *text*, first-seen order."""
    hits = _findall(_PASTE_RE, text)
    return _dedup(hits)
272
+
273
+
274
# Dispatch table: entity-type constant -> extractor function taking the text
# and returning the list of matches. extract_all() walks it in this order and
# extract_type() looks a single entry up by key.
_EXTRACTORS: dict[str, object] = dict((
    (BITCOIN_ADDRESS, _extract_bitcoin),
    (ETHEREUM_ADDRESS, _extract_ethereum),
    (MONERO_ADDRESS, _extract_monero),
    (ONION_URL, _extract_onion),
    (EMAIL_ADDRESS, _extract_email),
    (PGP_KEY_BLOCK, _extract_pgp),
    (FILE_HASH_MD5, _extract_md5),
    (FILE_HASH_SHA1, _extract_sha1),
    (FILE_HASH_SHA256, _extract_sha256),
    (CVE_NUMBER, _extract_cve),
    (MITRE_TECHNIQUE, _extract_mitre),
    (IP_ADDRESS, _extract_ip),
    (PHONE_NUMBER, _extract_phone),
    (PASTE_URL, _extract_paste),
))
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # Public interface
293
+ # ---------------------------------------------------------------------------
294
+
295
+
296
def extract_all(text: str) -> dict[str, list[str]]:
    """
    Run all entity patterns against *text*.

    Returns a dict keyed by entity-type constant. Every key is always present;
    types with no matches map to an empty list. Never raises.

    Each extractor is guarded individually: if one fails, the failure is
    logged with its traceback, that entity type maps to an empty list, and
    the remaining extractors still run.
    """
    result: dict[str, list[str]] = {}
    for entity_type, extractor in _EXTRACTORS.items():
        try:
            result[entity_type] = extractor(text)  # type: ignore[operator]
        except Exception:
            # Isolate the failure so one bad extractor cannot discard the
            # results of every extractor that comes after it.
            logger.exception("extract_all encountered an unexpected error")
            result[entity_type] = []
    # Guarantee a key for every declared type, even one without an extractor.
    for entity_type in ENTITY_TYPES:
        result.setdefault(entity_type, [])
    return result
312
+
313
+
314
def extract_type(text: str, entity_type: str) -> list[str]:
    """
    Run only the extractor registered for *entity_type* against *text*.

    Returns that extractor's list of matches.

    Raises ValueError if *entity_type* has no registered extractor; the
    message lists the valid type names in sorted order.
    """
    extractor = _EXTRACTORS.get(entity_type)
    if extractor is None:
        raise ValueError(
            f"Unknown entity type {entity_type!r}. "
            f"Valid types: {sorted(ENTITY_TYPES)}"
        )
    return extractor(text)  # type: ignore[operator]
@@ -0,0 +1,33 @@
1
+ """
2
+ fingerprint — Writing style fingerprinting for threat actor identification.
3
+
4
+ Public interface
5
+ ---------------
6
+ from fingerprint.stylometry import extract_style_vector, compute_similarity, are_likely_same_author
7
+ from fingerprint.profiler import build_actor_profile, update_profile, match_against_profiles
8
+ from fingerprint.profiler import load_profiles_from_db, save_profile_to_db
9
+ """
10
+
11
+ from fingerprint.profiler import (
12
+ build_actor_profile,
13
+ load_profiles_from_db,
14
+ match_against_profiles,
15
+ save_profile_to_db,
16
+ update_profile,
17
+ )
18
+ from fingerprint.stylometry import (
19
+ are_likely_same_author,
20
+ compute_similarity,
21
+ extract_style_vector,
22
+ )
23
+
24
# Names re-exported as the package's public API; each one is imported above
# from the submodule noted in the group comments.
__all__ = [
    # from fingerprint.stylometry
    "extract_style_vector",
    "compute_similarity",
    "are_likely_same_author",
    # from fingerprint.profiler
    "build_actor_profile",
    "update_profile",
    "match_against_profiles",
    "load_profiles_from_db",
    "save_profile_to_db",
]
@@ -0,0 +1,240 @@
1
+ """
2
+ fingerprint/profiler.py — Builds and maintains style profiles for threat actors.
3
+
4
+ A profile is the mean style vector across all posts attributed to a handle.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from datetime import datetime, timezone
11
+ from typing import Any, Optional
12
+
13
+ from fingerprint.stylometry import compute_similarity, extract_style_vector
14
+ from vector import store as vector_store
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Default value of the `threshold` argument of match_against_profiles(),
# which forwards it to the vector store's profile search.
SIMILARITY_THRESHOLD = 0.82
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Internal helpers
23
+ # ---------------------------------------------------------------------------
24
+
25
def _mean_vector(vectors: list[dict]) -> dict | None:
    """Average a list of style vectors feature by feature.

    Returns None for an empty list. Otherwise the result has one entry per
    key of the FIRST vector (keys starting with "_" are skipped; keys that
    only appear in later vectors are ignored):

    * If the first vector's value is a dict, the result is a dict holding,
      for every sub-key seen in any vector's dict for that key, the mean over
      just those vectors that contain the sub-key.
    * Otherwise the result is the mean of the int/float values found under
      that key across all vectors, or 0.0 when there are none.
    """
    if not vectors:
        return None

    def _average(numbers: list) -> float:
        return sum(numbers) / len(numbers) if numbers else 0.0

    averaged: dict[str, Any] = {}
    for name, reference in vectors[0].items():
        if name.startswith("_"):
            continue

        if isinstance(reference, dict):
            # Nested feature: gather the sub-dicts that actually exist.
            nested = [vec[name] for vec in vectors if isinstance(vec.get(name), dict)]
            subkeys: set[str] = set()
            for mapping in nested:
                subkeys.update(mapping.keys())
            averaged[name] = {
                subkey: _average(
                    [mapping[subkey] for mapping in nested if subkey in mapping]
                )
                for subkey in subkeys
            }
        else:
            # Scalar feature: non-numeric values are left out of the mean.
            scalars = [
                float(vec[name])
                for vec in vectors
                if isinstance(vec.get(name), (int, float))
            ]
            averaged[name] = _average(scalars)

    return averaged
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Public interface
64
+ # ---------------------------------------------------------------------------
65
+
66
def build_actor_profile(texts: list[str]) -> dict | None:
    """
    Build a style profile as the mean style vector of the given texts.

    Only non-empty texts of at least 100 characters are vectorised, and
    texts for which extract_style_vector() returns None are skipped.
    Returns None when nothing usable remains.

    The returned profile carries two bookkeeping keys:
      "_sample_count" — number of vectors that went into the mean;
      "_total_chars"  — combined length of every non-empty input text,
                        including those too short to be sampled.
    """
    long_enough = (t for t in texts if t and len(t) >= 100)
    vectors = [vec for vec in map(extract_style_vector, long_enough) if vec is not None]
    if not vectors:
        return None

    profile = _mean_vector(vectors)
    if profile is None:
        return None

    profile["_sample_count"] = len(vectors)
    profile["_total_chars"] = sum(map(len, filter(None, texts)))
    return profile
88
+
89
+
90
def update_profile(existing_profile: dict, new_texts: list[str]) -> dict:
    """
    Fold new text samples into an existing profile via a weighted mean.

    The old profile is weighted by its "_sample_count" (treated as 1 when the
    key is absent) and the mean of the new samples by their number, so the
    historical texts are not needed. Only non-empty texts of at least 100
    characters that yield a style vector count as samples.

    Returns *existing_profile* itself, unchanged, when no new sample is
    usable; otherwise a new dict with updated "_sample_count" and
    "_total_chars" (the latter grows by the length of every non-empty new
    text, sampled or not).
    """
    fresh_vectors = [
        vec
        for vec in (
            extract_style_vector(t) for t in new_texts if t and len(t) >= 100
        )
        if vec is not None
    ]
    if not fresh_vectors:
        return existing_profile

    old_count = int(existing_profile.get("_sample_count", 1))
    added_count = len(fresh_vectors)
    combined_count = old_count + added_count

    fresh_mean = _mean_vector(fresh_vectors)
    if fresh_mean is None:
        return existing_profile

    def _blend(old: float, new: float) -> float:
        # Sample-count-weighted average of the old and new means.
        return (old * old_count + new * added_count) / combined_count

    merged: dict[str, Any] = {}
    for key in set(existing_profile.keys()) | set(fresh_mean.keys()):
        previous = existing_profile.get(key)
        if key.startswith("_"):
            # Bookkeeping keys are carried over; the two counters are
            # overwritten after the loop.
            merged[key] = previous
            continue

        incoming = fresh_mean.get(key)
        if previous is None and incoming is None:
            continue
        if previous is None:
            merged[key] = incoming
        elif incoming is None:
            merged[key] = previous
        elif isinstance(previous, dict) and isinstance(incoming, dict):
            # A sub-key missing on one side contributes 0.0 for that side.
            merged[key] = {
                subkey: _blend(
                    float(previous.get(subkey, 0.0)),
                    float(incoming.get(subkey, 0.0)),
                )
                for subkey in set(previous.keys()) | set(incoming.keys())
            }
        elif isinstance(previous, (int, float)) and isinstance(incoming, (int, float)):
            merged[key] = _blend(float(previous), float(incoming))
        else:
            # Mismatched or unsupported value types: keep the old value.
            merged[key] = previous

    merged["_sample_count"] = combined_count
    merged["_total_chars"] = existing_profile.get("_total_chars", 0) + sum(
        len(t) for t in new_texts if t
    )
    return merged
147
+
148
+
149
def match_against_profiles(
    style_vector: dict,
    top_k: int = 10,
    threshold: float = SIMILARITY_THRESHOLD,
) -> list[dict]:
    """
    Look up stored actor profiles that resemble *style_vector*.

    This is a thin wrapper: the three arguments are forwarded unchanged, by
    keyword, to ``vector_store.match_actor_profiles`` and its result is
    returned as-is.

    NOTE(review): the search is presumably an approximate-nearest-neighbour
    query against ChromaDB rather than a full scan — confirm in vector/store.
    """
    search_args = {
        "style_vector": style_vector,
        "top_k": top_k,
        "threshold": threshold,
    }
    return vector_store.match_actor_profiles(**search_args)
165
+
166
+
167
def save_profile_to_db(
    profile: dict,
    canonical_value: str,
    entity_type: str,
    session: Any,
) -> bool:
    """
    Insert or update the style profile row for an actor and mirror it to the
    vector store.

    The row is identified by the pair (*canonical_value*, *entity_type*).
    The profile's "_sample_count" and "_total_chars" are stored in their own
    columns; all other "_"-prefixed keys are stripped from the stored vector.

    This function does not commit *session*; it only flushes when a new row
    is added.

    Returns True on success. On any exception the error is logged together
    with its traceback and False is returned; nothing is raised.
    """
    try:
        from db.models import ActorStyleProfile

        existing = (
            session.query(ActorStyleProfile)
            .filter(
                ActorStyleProfile.canonical_value == canonical_value,
                ActorStyleProfile.entity_type == entity_type,
            )
            .first()
        )

        sample_count = int(profile.get("_sample_count", 0))
        total_chars = int(profile.get("_total_chars", 0))

        # Bookkeeping keys live in dedicated columns, not inside the vector.
        cleaned_vector = {k: v for k, v in profile.items() if not k.startswith("_")}

        actor_id = None
        if existing:
            existing.style_vector = cleaned_vector
            existing.sample_count = sample_count
            existing.total_chars = total_chars
            existing.last_updated = datetime.now(timezone.utc)
            actor_id = existing.id
        else:
            new_profile = ActorStyleProfile(
                canonical_value=canonical_value,
                entity_type=entity_type,
                style_vector=cleaned_vector,
                sample_count=sample_count,
                total_chars=total_chars,
                last_updated=datetime.now(timezone.utc),
            )
            session.add(new_profile)
            # Flush before reading .id — presumably the primary key is
            # assigned by the database on insert (confirm against the model).
            session.flush()
            actor_id = new_profile.id

        vector_store.upsert_actor_profile(
            actor_id=actor_id,
            style_vector=cleaned_vector,
            username=canonical_value,
            platform=entity_type,
        )

        return True
    except Exception as exc:
        # exc_info keeps the traceback in the log; str(exc) alone is often
        # not enough to diagnose a database or vector-store failure.
        logger.error("save_profile_to_db failed: %s", exc, exc_info=True)
        return False
225
+
226
+
227
def load_profiles_from_db(session: Any) -> dict[str, dict]:
    """
    Load all stored style profiles from the DB.

    Returns {canonical_value: style_vector}. The mapping is keyed by
    canonical_value alone, so if several rows share a canonical_value the
    row read last wins.

    On any exception the error is logged together with its traceback and an
    empty dict is returned; nothing is raised.
    """
    try:
        from db.models import ActorStyleProfile

        return {
            row.canonical_value: row.style_vector
            for row in session.query(ActorStyleProfile).all()
        }
    except Exception as exc:
        # exc_info keeps the traceback so a failed load can be diagnosed.
        logger.error("load_profiles_from_db failed: %s", exc, exc_info=True)
        return {}