voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
vector/store.py
ADDED
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChromaDB persistence for page embeddings (collection: voidaccess_pages).
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# If you migrate from an older collection name, delete ./chroma_db (or CHROMA_PERSIST_DIR)
|
|
6
|
+
# and re-run ingestion so the collection is recreated with the new name.
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
14
|
+
from typing import Any
|
|
15
|
+
from urllib.parse import urlparse, urlunparse
|
|
16
|
+
|
|
17
|
+
import config
|
|
18
|
+
|
|
19
|
+
from . import embedder
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
_COLLECTION: Any = None
|
|
24
|
+
_CLIENT: Any = None
|
|
25
|
+
|
|
26
|
+
DEFAULT_PERSIST_DIR = "./chroma_db"
|
|
27
|
+
COLLECTION_NAME = "voidaccess_pages"
|
|
28
|
+
ACTOR_PROFILE_COLLECTION = "actor_style_profiles"
|
|
29
|
+
|
|
30
|
+
_ACTOR_COLLECTION: Any = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _persist_dir() -> str:
    """
    Resolve the directory in which ChromaDB persists its data.

    ``config.CHROMA_PERSIST_DIR`` is used when it holds a truthy value;
    otherwise the ``CHROMA_PERSIST_DIR`` environment variable is consulted.
    Missing, empty, or whitespace-only values resolve to
    ``DEFAULT_PERSIST_DIR``.
    """
    configured = getattr(config, "CHROMA_PERSIST_DIR", None)
    if not configured:
        configured = os.getenv("CHROMA_PERSIST_DIR", DEFAULT_PERSIST_DIR)
    candidate = configured if configured else DEFAULT_PERSIST_DIR
    cleaned = candidate.strip()
    # A value made only of whitespace strips down to "" and falls back.
    return cleaned if cleaned else DEFAULT_PERSIST_DIR
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _page_id_str(page_id: int | None) -> str | None:
|
|
41
|
+
if page_id is None:
|
|
42
|
+
return None
|
|
43
|
+
return str(page_id)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_collection():
    """
    Return the module-wide Chroma collection used for page embeddings.

    The collection is opened lazily on first use and then cached in the
    module globals ``_COLLECTION`` / ``_CLIENT``. ``None`` is returned, after
    a warning is logged, when chromadb cannot be imported or the persistent
    store cannot be opened. Failures are not cached, so a later call retries.
    """
    global _COLLECTION, _CLIENT
    if _COLLECTION is None:
        try:
            import chromadb  # noqa: PLC0415
        except ImportError:
            logger.warning("chromadb not installed; vector store disabled")
            return None
        try:
            store_path = os.path.abspath(_persist_dir())
            os.makedirs(store_path, exist_ok=True)
            _CLIENT = chromadb.PersistentClient(path=store_path)
            _COLLECTION = _CLIENT.get_or_create_collection(name=COLLECTION_NAME)
        except Exception as exc:
            logger.warning("Failed to open ChromaDB: %s", exc)
            # Drop any half-initialised state so the next call starts clean.
            _COLLECTION = None
            _CLIENT = None
    return _COLLECTION
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_actor_collection():
    """
    Return the module-wide Chroma collection for actor style profiles.

    The collection is created with ``hnsw:space`` set to ``cosine`` and is
    cached in ``_ACTOR_COLLECTION``. The persistent client in ``_CLIENT`` is
    reused when one is already open, otherwise a new one is created and
    stored there.

    Returns ``None`` (after logging a warning) when chromadb is not
    installed or the collection cannot be opened.
    """
    # _CLIENT must be declared global too: it is assigned below, and without
    # the declaration Python treats it as a local, so the ``_CLIENT is None``
    # check raises UnboundLocalError and the collection can never be opened.
    global _ACTOR_COLLECTION, _CLIENT
    if _ACTOR_COLLECTION is not None:
        return _ACTOR_COLLECTION
    try:
        import chromadb  # noqa: PLC0415
    except ImportError:
        logger.warning("chromadb not installed; actor vector store disabled")
        return None
    try:
        path = os.path.abspath(_persist_dir())
        os.makedirs(path, exist_ok=True)
        if _CLIENT is None:
            _CLIENT = chromadb.PersistentClient(path=path)
        _ACTOR_COLLECTION = _CLIENT.get_or_create_collection(
            name=ACTOR_PROFILE_COLLECTION,
            metadata={"hnsw:space": "cosine"},
        )
    except Exception as exc:
        logger.warning("Failed to open actor ChromaDB collection: %s", exc)
        _ACTOR_COLLECTION = None
    return _ACTOR_COLLECTION
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _stable_id(page_url: str) -> str:
|
|
100
|
+
return hashlib.sha256(page_url.encode("utf-8")).hexdigest()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _normalize_url(url: str) -> str:
|
|
104
|
+
"""
|
|
105
|
+
Normalize URL for consistent cache lookups.
|
|
106
|
+
|
|
107
|
+
Uses crawler.utils.normalize_url for consistency with scraper.
|
|
108
|
+
Falls back to basic normalization if crawler.utils unavailable.
|
|
109
|
+
"""
|
|
110
|
+
try:
|
|
111
|
+
from crawler.utils import normalize_url
|
|
112
|
+
return normalize_url(url)
|
|
113
|
+
except ImportError:
|
|
114
|
+
pass
|
|
115
|
+
try:
|
|
116
|
+
parsed = urlparse(url)
|
|
117
|
+
scheme = parsed.scheme.lower()
|
|
118
|
+
netloc = parsed.netloc.lower()
|
|
119
|
+
path = parsed.path
|
|
120
|
+
if path and path != "/":
|
|
121
|
+
path = path.rstrip("/")
|
|
122
|
+
elif path == "/":
|
|
123
|
+
path = ""
|
|
124
|
+
return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))
|
|
125
|
+
except Exception:
|
|
126
|
+
return url
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _flatten_metadata(
    url: str,
    page_id: int | None,
    ts: str,
    extra: dict | None,
) -> dict[str, Any]:
    """
    Build the flat metadata dict stored alongside a page embedding.

    The result always carries ``url`` and ``timestamp``; ``page_id`` is added
    (as a string) when one is given. Entries from *extra* are merged on top:
    ``None`` values are skipped, str/int/float/bool values are kept as they
    are, and anything else is converted with ``str()``. Keys are always
    converted to strings, and an *extra* key that matches a base key
    replaces it.
    """
    flat: dict[str, Any] = {"url": url, "timestamp": ts}
    page_id_text = _page_id_str(page_id)
    if page_id_text is not None:
        flat["page_id"] = page_id_text
    for key, value in (extra or {}).items():
        if value is None:
            continue
        is_scalar = isinstance(value, (str, int, float, bool))
        flat[str(key)] = value if is_scalar else str(value)
    return flat
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def upsert_page(
    page_url: str,
    text: str,
    metadata: dict | None = None,
    page_id: int | None = None,
) -> bool:
    """
    Embed *text* and upsert it into the page collection.

    The record id is the SHA-256 hex digest of *page_url*. The stored
    document is limited to the first 8000 characters of *text*, and the
    metadata carries the URL, a UTC ISO-8601 timestamp, the optional
    *page_id*, and the flattened *metadata* entries.

    Returns ``True`` on success. Returns ``False`` when the collection is
    unavailable, no embedding could be produced, or the upsert fails (the
    failure is logged as a warning).
    """
    collection = get_collection()
    if collection is None:
        return False
    vector = embedder.embed_text(text)
    if vector is None:
        return False
    try:
        record_id = _stable_id(page_url)
        stored_at = datetime.now(timezone.utc).isoformat()
        record_meta = _flatten_metadata(page_url, page_id, stored_at, metadata)
        collection.upsert(
            ids=[record_id],
            embeddings=[vector],
            metadatas=[record_meta],
            documents=[text[:8000]],
        )
    except Exception as exc:
        logger.warning("upsert_page failed: %s", exc)
        return False
    return True
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def search_similar(
    query_text: str,
    n_results: int = 10,
    where: dict | None = None,
    offset: int = 0,
) -> list[dict]:
    """
    Run a semantic search over the page collection.

    Args:
        query_text: Text to embed and search for.
        n_results: Maximum number of hits to return (values below 1 are
            treated as 1).
        where: Optional Chroma metadata filter passed through to ``query``.
        offset: Number of leading hits to skip, for pagination. Negative
            values are treated as 0.

    Returns:
        A list of ``{"url", "page_id", "distance", "metadata"}`` dicts in the
        order Chroma returns them. The list is empty when the collection is
        unavailable, no embedding could be produced, or the query fails (the
        failure is logged as a warning).
    """
    col = get_collection()
    if col is None:
        return []
    emb = embedder.embed_text(query_text)
    if emb is None:
        return []
    try:
        limit = max(1, int(n_results))
        # Clamp the offset: a negative value would shrink the number of rows
        # requested and produce negative indices below, returning rows
        # wrapped from the end of the result list.
        skip = max(0, int(offset))
        res = col.query(
            query_embeddings=[emb],
            n_results=skip + limit,
            where=where,
            include=["distances", "metadatas"],
        )
        ids = (res.get("ids") or [[]])[0]
        dists = (res.get("distances") or [[]])[0]
        metas = (res.get("metadatas") or [[]])[0]

        out: list[dict] = []
        for i in range(skip, min(len(ids), skip + limit)):
            m = metas[i] if i < len(metas) and metas[i] else {}
            md = dict(m) if isinstance(m, dict) else {}
            raw_pid = md.get("page_id")
            page_id_out: int | None = None
            if raw_pid is not None:
                # page_id is stored as a string; convert back where possible.
                try:
                    page_id_out = int(raw_pid)
                except (TypeError, ValueError):
                    page_id_out = None
            out.append(
                {
                    "url": md.get("url", ""),
                    "page_id": page_id_out,
                    "distance": float(dists[i]) if i < len(dists) else 0.0,
                    "metadata": md,
                }
            )
        return out
    except Exception as exc:
        logger.warning("search_similar failed: %s", exc)
        return []
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def count_pages() -> int:
    """
    Return the number of records in the page collection.

    Yields 0 when the collection is unavailable or counting fails.
    """
    collection = get_collection()
    if collection is None:
        return 0
    try:
        total = int(collection.count())
    except Exception:
        total = 0
    return total
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def is_duplicate(text: str, threshold: float = 0.05) -> bool:
    """
    Report whether *text* is a near-duplicate of an already stored page.

    The single nearest neighbour is fetched, and the text counts as a
    duplicate when that neighbour's distance is strictly below *threshold*.
    Returns ``False`` when the collection is unavailable or no neighbour
    is found.
    """
    if get_collection() is None:
        return False
    nearest = search_similar(text, n_results=1)
    if nearest:
        return float(nearest[0]["distance"]) < threshold
    return False
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def get_collection_stats() -> dict:
    """
    Summarize the page collection.

    Returns a dict with ``total_documents`` (0 when the collection is
    unavailable or counting fails) and ``persist_directory`` (the absolute
    path of the configured persistence directory).
    """
    document_total = 0
    collection = get_collection()
    if collection is not None:
        try:
            document_total = int(collection.count())
        except Exception:
            document_total = 0
    storage_path = os.path.abspath(_persist_dir())
    return {
        "total_documents": document_total,
        "persist_directory": storage_path,
    }
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def get_cached_page(url: str, max_age_hours: int = 24) -> dict | None:
    """
    Look up a previously stored page by its normalized URL.

    The first record whose ``url`` metadata equals the normalized URL is
    used. A hit is returned as ``{link, content, status, cached, cached_at}``
    only when its timestamp (``timestamp`` or ``scraped_at`` metadata) is no
    older than *max_age_hours* and the stored document holds at least 100
    characters. A record without any timestamp is not age-checked.

    Returns ``None`` when the collection is unavailable, the URL normalizes
    to an empty string, nothing suitable is found, or the lookup raises
    (logged at debug level).
    """
    collection = get_collection()
    if collection is None:
        return None
    lookup_url = _normalize_url(url)
    if not lookup_url:
        return None
    try:
        found = collection.get(
            where={"url": lookup_url},
            include=["documents", "metadatas"],
        )
        if not found["ids"]:
            return None

        first_meta = found["metadatas"][0]
        body = found["documents"][0]

        cached_at = first_meta.get("timestamp") or first_meta.get("scraped_at") or ""
        if cached_at:
            when = datetime.fromisoformat(cached_at)
            if when.tzinfo is None:
                # Naive timestamps are interpreted as UTC.
                when = when.replace(tzinfo=timezone.utc)
            elapsed = datetime.now(timezone.utc) - when
            if elapsed.total_seconds() / 3600 > max_age_hours:
                return None

        if not body or len(body) < 100:
            return None

        return {
            "link": lookup_url,
            "content": body,
            "status": 200,
            "cached": True,
            "cached_at": cached_at,
        }
    except Exception as exc:
        logger.debug("Vector cache lookup failed for %s: %s", url, exc)
        return None
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def store_page(url: str, content: str, metadata: dict | None = None) -> bool:
    """
    Store a scraped page so later cache lookups can find it.

    Content shorter than 100 characters is rejected. The URL is normalized
    before it is handed to ``upsert_page``, which also stores the embedding.

    Returns ``True`` when the page was stored, ``False`` otherwise.
    """
    too_short = not content or len(content) < 100
    if too_short:
        return False
    target_url = _normalize_url(url)
    if target_url:
        return upsert_page(page_url=target_url, text=content, metadata=metadata)
    return False
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def bulk_check_cache(
    urls: list[str],
    max_age_hours: int = 24,
) -> tuple[list[dict], list[str]]:
    """
    Check multiple URLs against the cache with one ChromaDB ``get`` call.

    Each URL is looked up under the id of its normalized form (the id that
    ``store_page`` writes) and, as a fallback, under the id of the URL
    exactly as given. An entry counts as a hit when its timestamp
    (``timestamp`` or ``scraped_at`` metadata) is within *max_age_hours* and
    its document holds at least 100 characters. Entries without a timestamp
    are not age-checked; entries whose timestamp cannot be parsed are
    treated as stale, matching ``get_cached_page``.

    Args:
        urls: URLs to check. Duplicates are collapsed.
        max_age_hours: Maximum accepted age of a cached entry, in hours.

    Returns:
        ``(cached_pages, uncached_urls)``: page dicts
        ``{link, content, status, cached, cached_at}`` for the hits (``link``
        is the URL as passed in), and the URLs that still need to be
        scraped. When the collection is unavailable or the lookup fails,
        every URL is reported as uncached.
    """
    if not urls:
        return [], []

    collection = get_collection()
    if collection is None:
        return [], list(urls)

    # Candidate ids per URL, in order of preference.
    url_to_ids: dict[str, list[str]] = {}
    for url in urls:
        try:
            normalized = _normalize_url(url)
        except Exception:
            normalized = ""
        candidates: list[str] = []
        if normalized:
            candidates.append(_stable_id(normalized))
        raw_id = _stable_id(url)
        if raw_id not in candidates:
            candidates.append(raw_id)
        url_to_ids[url] = candidates

    # Several URLs may share an id once normalized; query each id only once.
    ids = list(
        dict.fromkeys(doc_id for group in url_to_ids.values() for doc_id in group)
    )

    try:
        results = collection.get(ids=ids, include=["documents", "metadatas"])
    except Exception as exc:
        logger.warning("Bulk cache lookup failed: %s", exc)
        return [], list(urls)

    found_ids = results.get("ids") or []
    metadatas = results.get("metadatas") or []
    documents = results.get("documents") or []
    idx_map = {doc_id: i for i, doc_id in enumerate(found_ids)}

    cached_pages: list[dict] = []
    uncached_urls: list[str] = []
    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)

    for url, candidates in url_to_ids.items():
        hit: dict | None = None
        for doc_id in candidates:
            idx = idx_map.get(doc_id)
            if idx is None:
                continue

            metadata = (metadatas[idx] if idx < len(metadatas) else None) or {}
            content = documents[idx] if idx < len(documents) else None

            ts_str = metadata.get("timestamp") or metadata.get("scraped_at") or ""
            if ts_str:
                try:
                    stored_at = datetime.fromisoformat(ts_str)
                except (ValueError, TypeError):
                    # Age cannot be determined, so do not trust the entry.
                    continue
                if stored_at.tzinfo is None:
                    # Naive timestamps are interpreted as UTC.
                    stored_at = stored_at.replace(tzinfo=timezone.utc)
                if stored_at < cutoff:
                    continue

            if not content or len(content) < 100:
                continue

            hit = {
                "link": url,
                "content": content,
                "status": 200,
                "cached": True,
                "cached_at": ts_str,
            }
            break

        if hit is None:
            uncached_urls.append(url)
        else:
            cached_pages.append(hit)

    return cached_pages, uncached_urls
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _dict_to_flat_vector(vector_dict: dict) -> list[float]:
|
|
421
|
+
"""Flatten a style vector dict into a list of floats for ChromaDB."""
|
|
422
|
+
flat: list[float] = []
|
|
423
|
+
for key in sorted(vector_dict.keys()):
|
|
424
|
+
val = vector_dict[key]
|
|
425
|
+
if isinstance(val, dict):
|
|
426
|
+
for subkey in sorted(val.keys()):
|
|
427
|
+
flat.append(float(val.get(subkey, 0.0)))
|
|
428
|
+
else:
|
|
429
|
+
flat.append(float(val) if val is not None else 0.0)
|
|
430
|
+
return flat
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def upsert_actor_profile(
    actor_id: int,
    style_vector: dict,
    username: str | None = None,
    platform: str | None = None,
) -> bool:
    """
    Upsert an actor's style profile vector into the actor collection.

    The record id is ``str(actor_id)`` and the embedding is the flattened
    *style_vector*. The metadata holds ``actor_id`` plus ``username`` and
    ``platform`` when they are provided.

    Returns ``True`` on success. Returns ``False`` when the collection is
    unavailable, the vector is empty, or the upsert fails (logged as a
    warning).
    """
    collection = get_actor_collection()
    if collection is None or not style_vector:
        return False
    try:
        embedding = _dict_to_flat_vector(style_vector)
        if not embedding:
            return False
        profile_id = str(actor_id)
        record: dict[str, Any] = {"actor_id": profile_id}
        for label, value in (("username", username), ("platform", platform)):
            if value is not None:
                record[label] = str(value)
        collection.upsert(
            ids=[profile_id],
            embeddings=[embedding],
            metadatas=[record],
        )
    except Exception as exc:
        logger.warning("upsert_actor_profile failed: %s", exc)
        return False
    return True
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def match_actor_profiles(
    style_vector: dict,
    top_k: int = 10,
    threshold: float = 0.85,
) -> list[dict]:
    """
    Find stored actor profiles whose style vector resembles *style_vector*.

    Up to *top_k* neighbours are queried from the actor collection, and
    similarity is computed as ``1.0 - distance``. Only neighbours with
    similarity of at least *threshold* are returned, each as
    ``{actor_id, similarity}`` plus ``username`` / ``platform`` when the
    stored metadata carries them.

    Returns an empty list when the collection is unavailable, the vector is
    empty, or the query fails (logged as a warning).
    """
    collection = get_actor_collection()
    if collection is None or not style_vector:
        return []
    try:
        query_vec = _dict_to_flat_vector(style_vector)
        if not query_vec:
            return []
        response = collection.query(
            query_embeddings=[query_vec],
            n_results=top_k,
            include=["distances", "metadatas"],
        )
        found_ids = (response.get("ids") or [[]])[0]
        found_dists = (response.get("distances") or [[]])[0]
        found_metas = (response.get("metadatas") or [[]])[0]

        hits: list[dict] = []
        for actor_key, distance, info in zip(found_ids, found_dists, found_metas):
            if actor_key is None:
                continue
            score = 1.0 - float(distance)
            if score >= threshold:
                entry: dict[str, Any] = {
                    "actor_id": int(actor_key),
                    "similarity": score,
                }
                if isinstance(info, dict):
                    # Copy over the optional descriptive fields, if stored.
                    for field in ("username", "platform"):
                        if field in info:
                            entry[field] = info[field]
                hits.append(entry)
        return hits
    except Exception as exc:
        logger.warning("match_actor_profiles failed: %s", exc)
        return []
|
voidaccess/__init__.py
ADDED
|
File without changes
|