voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
api/routes/monitors.py ADDED
@@ -0,0 +1,405 @@
1
+ """
2
+ api/routes/monitors.py — Monitor/watch management endpoints.
3
+
4
+ GET /monitors — list all watches from monitors.yaml
5
+ POST /monitors — create a new watch (writes to monitors.yaml)
6
+ DELETE /monitors/{watch_name} — delete a watch from monitors.yaml
7
+ POST /monitors/{watch_name}/trigger — trigger a specific watch immediately
8
+ GET /monitors/status — job status for all scheduled watches
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import logging
15
+ import os
16
+
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ from fastapi import APIRouter, Depends, HTTPException, Query
21
+ from filelock import FileLock
22
+ from pydantic import BaseModel
23
+
24
+ # Cross-platform file lock strategy:
25
+ # - Uses `filelock` library (works on Linux/Windows/macOS)
26
+ # - Replaces fcntl.flock() which is Linux-only and silently failed on Windows
27
+ # - FileLock creates a .lock file alongside monitors.yaml for inter-process locking
28
+ # - Provides thread-safety for concurrent config writes across deployments
29
+
30
+ from db.queries import (
31
+ acknowledge_alerts,
32
+ get_alert_counts_by_monitor,
33
+ get_alerts_for_monitor,
34
+ get_monitor_stats,
35
+ get_unacknowledged_alert_count,
36
+ )
37
+ from api.auth import require_password_not_reset_pending, CurrentUser
38
+ from db.session import get_session
39
+
40
+ logger = logging.getLogger(__name__)
41
+ router = APIRouter()
42
+
43
+ # Module-level scheduler reference (populated externally if running with scheduler)
44
+ _scheduler = None
45
+
46
+
47
def _get_monitor_config_path() -> Path:
    """Resolve the location of monitors.yaml.

    A non-empty ``MONITORS_CONFIG_PATH`` environment variable wins; otherwise
    the path is ``data/monitors.yaml`` under the third parent directory of
    this module's resolved file path.
    """
    override = os.getenv("MONITORS_CONFIG_PATH")
    if not override:
        base_dir = Path(__file__).resolve().parents[2]
        return base_dir / "data" / "monitors.yaml"
    return Path(override)
53
+
54
+
55
def _ensure_monitors_yaml_exists() -> None:
    """Create a default, empty monitors.yaml if none exists yet.

    Best-effort: any failure (unwritable directory, PyYAML not importable,
    ...) is logged as a warning and otherwise ignored.
    """
    path = _get_monitor_config_path()
    if path.exists():
        return
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        import yaml

        # Mode "x" creates the file exclusively, so a file written by another
        # process between the exists() check above and this open() is never
        # overwritten with the empty default.
        with open(path, "x", encoding="utf-8") as fh:
            fh.write(yaml.dump({"watches": []}, default_flow_style=False))
        logger.info("Created default monitors.yaml at %s", path)
    except FileExistsError:
        # Someone else created the file first; nothing left to do.
        pass
    except Exception as e:
        logger.warning("Could not create monitors.yaml: %s", e)
69
+
70
+
71
+ _ensure_monitors_yaml_exists()
72
+
73
+
74
def set_scheduler(scheduler) -> None:
    """Inject the APScheduler instance into this module.

    Stores *scheduler* in the module-level ``_scheduler`` variable, which the
    status and trigger routes pass on to the ``monitor.scheduler`` helpers.
    """
    global _scheduler
    _scheduler = scheduler
78
+
79
+
80
+ _monitors_lock = asyncio.Lock()
81
+
82
+
83
async def _load_monitors_no_lock() -> list[dict]:
    """Read the ``watches`` list from monitors.yaml without taking any lock.

    If the file is missing, an empty default file is written (failures are
    only logged) and ``[]`` is returned. Read or parse errors, a non-mapping
    document, or a non-list ``watches`` value also yield ``[]``. Callers are
    expected to do their own locking.
    """
    config_path = _get_monitor_config_path()

    if config_path.exists():
        try:
            import yaml
            raw_text = await asyncio.to_thread(config_path.read_text, encoding="utf-8")
            document = yaml.safe_load(raw_text)
            if not document or not isinstance(document, dict):
                return []
            entries = document.get("watches", [])
            if isinstance(entries, list):
                return entries
            return []
        except Exception as e:
            logger.error(f"Failed to load monitors.yaml: {e}")
            return []

    # File does not exist yet: seed it with an empty watch list.
    import yaml
    try:
        default_text = yaml.dump({"watches": []}, default_flow_style=False)
        await asyncio.to_thread(
            config_path.write_text,
            default_text,
            encoding="utf-8",
        )
    except Exception as e:
        logger.error(f"Failed to create default monitors.yaml: {e}")
    return []
109
+
110
+
111
async def _save_monitors_no_lock(watches: list[dict]) -> None:
    """Write *watches* to monitors.yaml without taking any lock.

    The YAML is written to a sibling ``.tmp`` file, fsynced, and then moved
    over the real file with ``os.replace`` so readers never observe a
    half-written document. Callers are expected to do their own locking.

    Raises:
        Whatever the underlying file operations raise; the temp file is
        removed before the error propagates.
    """
    import yaml
    path = _get_monitor_config_path()
    content = yaml.dump({"watches": watches}, default_flow_style=False, allow_unicode=True)

    def _sync_save() -> None:
        tmp_path = path.with_suffix(".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(content)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp_path, path)
        except BaseException:
            # Do not leave a partial temp file behind on a failed save.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

    # File I/O is blocking; keep it off the event loop.
    await asyncio.to_thread(_sync_save)
126
+
127
+
128
async def _load_monitors() -> list[dict]:
    """Load the watch list while holding ``_monitors_lock``.

    ``_monitors_lock`` is an ``asyncio.Lock``: it serialises coroutines on
    this process's event loop. It is not a thread or inter-process lock.
    """
    async with _monitors_lock:
        return await _load_monitors_no_lock()
132
+
133
+
134
async def _save_monitors(watches: list[dict]) -> None:
    """Save the watch list while holding ``_monitors_lock``.

    ``_monitors_lock`` is an ``asyncio.Lock``: it serialises coroutines on
    this process's event loop. It is not a thread or inter-process lock.
    """
    async with _monitors_lock:
        await _save_monitors_no_lock(watches)
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Request models
142
+ # ---------------------------------------------------------------------------
143
+
144
+
145
class AcknowledgeAlertsBody(BaseModel):
    # Optional request body for POST /{monitor_name}/alerts/acknowledge.
    # alert_ids is handed to acknowledge_alerts unchanged; the route docstring
    # describes an empty body (None) as "acknowledge all".
    alert_ids: list[int] | None = None
147
+
148
+
149
class CreateMonitorRequest(BaseModel):
    # Request body for POST /monitors. Cross-field validation (type/query/url,
    # interval_hours, alert_on, name) is done in create_monitor, not here.
    name: str
    type: str  # "keyword" | "url"
    query: Optional[str] = None  # required by create_monitor when type == "keyword"
    url: Optional[str] = None  # required by create_monitor when type == "url"
    interval_hours: float = 48.0  # create_monitor rejects values below 0.5
    alert_on: str = "new_results"  # create_monitor accepts: new_results, any_change, any_appearance
    webhook_url: Optional[str] = None
    telegram_chat_id: Optional[str] = None
    email: Optional[str] = None
    enabled: bool = True
160
+
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # Routes
164
+ # ---------------------------------------------------------------------------
165
+
166
+
167
@router.get("")
async def list_monitors() -> list[dict]:
    """
    Return all watches defined in monitors.yaml with aggregate stats from DB.

    Each watch mapping is merged with the mapping returned by
    ``get_monitor_stats`` for its name (stats win on key collisions).
    Entries in the YAML list that are not mappings are skipped, matching the
    ``isinstance(w, dict)`` guards used by the create and delete routes,
    instead of failing the whole request.
    """
    watches = await _load_monitors()
    if not watches:
        return watches

    with get_session() as session:
        return [
            {**watch, **get_monitor_stats(session, watch.get("name", ""))}
            for watch in watches
            if isinstance(watch, dict)
        ]
184
+
185
+
186
@router.get("/alerts/count")
async def get_alert_count() -> dict:
    """
    Report unacknowledged alert counts: the overall total plus the
    per-monitor breakdown returned by ``get_alert_counts_by_monitor``.
    Used by MonitorNavBadge for the live count.
    """
    with get_session() as session:
        payload: dict = {}
        payload["total_unacknowledged"] = get_unacknowledged_alert_count(session)
        payload["by_monitor"] = get_alert_counts_by_monitor(session)
        return payload
199
+
200
+
201
@router.get("/status")
async def monitors_status() -> list[dict]:
    """List name / next run / last run for every job the scheduler reports.

    Timestamps are ISO-8601 strings, or None when the job has no value.
    Any failure is logged as a warning and yields an empty list.
    """

    def _iso_or_none(job: dict, key: str):
        # Same truthiness rule as the inline form: falsy/missing -> None.
        return job[key].isoformat() if job.get(key) else None

    try:
        from monitor.scheduler import get_job_status  # noqa: PLC0415

        return [
            {
                "name": job.get("name"),
                "next_run_time": _iso_or_none(job, "next_run_time"),
                "last_run_time": _iso_or_none(job, "last_run_time"),
            }
            for job in get_job_status(_scheduler)
        ]
    except Exception as exc:
        logger.warning("monitors_status failed: %s", exc)
        return []
225
+
226
+
227
@router.get("/{monitor_name}/alerts")
async def get_monitor_alerts(
    monitor_name: str,
    limit: int = Query(20, ge=1, le=200),
    include_acknowledged: bool = Query(True),
) -> dict:
    """
    Alert history for one monitor, serialised to plain JSON-friendly dicts.
    Used by MonitorDetail inline panel.
    """

    def _serialise(alert) -> dict:
        # Built while the session is still open so attribute access works.
        return {
            "id": alert.id,
            "triggered_at": alert.triggered_at.isoformat(),
            "change_type": alert.change_type,
            "summary": alert.summary,
            "severity": str(alert.severity),
            "entity_count_delta": alert.entity_count_delta,
            "delivered": alert.delivered,
            "delivery_channels": alert.delivery_channels or [],
            "acknowledged": alert.acknowledged,
            "acknowledged_at": (
                alert.acknowledged_at.isoformat() if alert.acknowledged_at else None
            ),
            "diff_data": alert.diff_data,
        }

    with get_session() as session:
        alerts = get_alerts_for_monitor(
            session,
            monitor_name=monitor_name,
            limit=limit,
            include_acknowledged=include_acknowledged,
        )
        serialised = [_serialise(a) for a in alerts]
        return {
            "monitor_name": monitor_name,
            "alerts": serialised,
            "total": len(alerts),
        }
266
+
267
+
268
@router.post("/{monitor_name}/alerts/acknowledge")
async def acknowledge_monitor_alerts(
    monitor_name: str,
    body: AcknowledgeAlertsBody | None = None,
) -> dict:
    """
    Mark alerts as acknowledged and report how many were affected.
    Body: {"alert_ids": [1, 2, 3]} or empty body to acknowledge all.
    """
    if body:
        alert_ids = body.alert_ids
    else:
        alert_ids = None

    with get_session() as session:
        acknowledged = acknowledge_alerts(session, monitor_name, alert_ids)
        return {"acknowledged": acknowledged}
281
+
282
+
283
@router.post("")
async def create_monitor(
    req: CreateMonitorRequest,
    current_user: CurrentUser = Depends(require_password_not_reset_pending),
) -> dict:
    """Create a new watch and append it to monitors.yaml.

    Args:
        req: Watch definition. ``query``/``url``/``name`` are stripped before
            validation, so whitespace-only values are rejected.
        current_user: Resolved by ``require_password_not_reset_pending``; only
            used to gate access.

    Returns:
        ``{"created": True, "name": <stripped name>}``.

    Raises:
        HTTPException: 422 for invalid input, 409 if a watch with the same
            name exists, 500 if the config file cannot be read or written.
    """
    if req.type not in ("keyword", "url"):
        raise HTTPException(status_code=422, detail="type must be 'keyword' or 'url'")
    query = (req.query or "").strip()
    url = (req.url or "").strip()
    if req.type == "keyword" and not query:
        raise HTTPException(status_code=422, detail="query is required for keyword watches")
    if req.type == "url" and not url:
        raise HTTPException(status_code=422, detail="url is required for url watches")
    if req.interval_hours < 0.5:
        raise HTTPException(status_code=422, detail="interval_hours must be >= 0.5")
    valid_alert_on = {"new_results", "any_change", "any_appearance"}
    if req.alert_on not in valid_alert_on:
        raise HTTPException(
            status_code=422,
            detail=f"alert_on must be one of {sorted(valid_alert_on)}",
        )
    name = (req.name or "").strip()
    if not name:
        raise HTTPException(status_code=422, detail="name is required")

    def _sync_create(path: Path) -> str:
        """Read-modify-write monitors.yaml under the file lock; runs in a worker thread."""
        import yaml

        path.parent.mkdir(parents=True, exist_ok=True)
        with FileLock(str(path) + ".lock", timeout=10):
            try:
                content = path.read_text(encoding="utf-8")
            except FileNotFoundError:
                content = ""

            # A parse error propagates (-> 500) rather than being treated as
            # "no watches", so a corrupt file is never silently overwritten.
            data = yaml.safe_load(content) if content else None
            watches = data.get("watches") if isinstance(data, dict) else None
            if not isinstance(watches, list):
                watches = []

            if any(isinstance(w, dict) and w.get("name") == name for w in watches):
                return "duplicate"

            entry: dict = {
                "name": name,
                "type": req.type,
                "interval_hours": req.interval_hours,
                "alert_on": req.alert_on,
                "enabled": req.enabled,
                "webhook_url": req.webhook_url or None,
                "telegram_chat_id": req.telegram_chat_id or None,
                "email": req.email or None,
            }
            if req.type == "keyword":
                entry["query"] = query
            else:
                entry["url"] = url
            watches.append(entry)

            # Write to a temp file and rename it into place, as
            # _save_monitors_no_lock does: truncating the live file in place
            # would leave it empty or partial if the process died mid-write.
            tmp_path = path.with_suffix(".tmp")
            try:
                with open(tmp_path, "w", encoding="utf-8") as f:
                    f.write(
                        yaml.dump(
                            {"watches": watches},
                            default_flow_style=False,
                            allow_unicode=True,
                        )
                    )
                    f.flush()
                    os.fsync(f.fileno())
                os.replace(tmp_path, path)
            except BaseException:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
                raise
            return "ok"

    try:
        path = _get_monitor_config_path()
        # delete_monitor serialises on _monitors_lock; take it here too so a
        # create cannot interleave with a delete's load/save in this process.
        # The FileLock inside _sync_create covers other processes.
        async with _monitors_lock:
            res = await asyncio.to_thread(_sync_create, path)
    except Exception as exc:
        logger.error("create_monitor failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    if res == "duplicate":
        raise HTTPException(
            status_code=409, detail=f"Monitor {name!r} already exists"
        )
    return {"created": True, "name": name}
365
+
366
+
367
@router.delete("/{watch_name}")
async def delete_monitor(watch_name: str) -> dict:
    """Remove a watch from monitors.yaml by name.

    The read-filter-write cycle runs under the same ``<path>.lock`` file lock
    that create_monitor uses, so a delete cannot overwrite a watch created
    concurrently by another request or process.

    Returns:
        ``{"deleted": True, "name": watch_name}``.

    Raises:
        HTTPException: 404 if no watch has that name (or the file is missing),
            500 if the config file cannot be read, parsed, or written.
    """
    # NOTE(review): unlike create_monitor, this route declares no auth
    # dependency of its own — confirm it is enforced where the router is mounted.

    def _sync_delete(path: Path) -> bool:
        """Return True if a watch was removed; runs in a worker thread."""
        import yaml

        if not path.parent.is_dir():
            return False
        with FileLock(str(path) + ".lock", timeout=10):
            try:
                content = path.read_text(encoding="utf-8")
            except FileNotFoundError:
                return False

            data = yaml.safe_load(content)
            watches = data.get("watches") if isinstance(data, dict) else None
            if not isinstance(watches, list):
                return False

            remaining = [
                w for w in watches
                if not (isinstance(w, dict) and w.get("name") == watch_name)
            ]
            if len(remaining) == len(watches):
                return False

            # Atomic replace: write a temp file, fsync, then rename into place.
            tmp_path = path.with_suffix(".tmp")
            try:
                with open(tmp_path, "w", encoding="utf-8") as f:
                    f.write(
                        yaml.dump(
                            {"watches": remaining},
                            default_flow_style=False,
                            allow_unicode=True,
                        )
                    )
                    f.flush()
                    os.fsync(f.fileno())
                os.replace(tmp_path, path)
            except BaseException:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
                raise
            return True

    try:
        path = _get_monitor_config_path()
        async with _monitors_lock:
            removed = await asyncio.to_thread(_sync_delete, path)
    except Exception as exc:
        logger.error("delete_monitor failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    if not removed:
        raise HTTPException(status_code=404, detail=f"Watch {watch_name!r} not found")
    return {"deleted": True, "name": watch_name}
386
+
387
+
388
@router.post("/{watch_name}/trigger")
async def trigger_monitor(watch_name: str) -> dict:
    """Run one watch right away.

    Responds 404 when ``get_watch_by_name`` finds nothing. Any other failure
    is logged as a warning and reported as ``"triggered": False``.
    """
    try:
        from monitor.config import get_watch_by_name  # noqa: PLC0415
        from monitor.scheduler import trigger_job_now  # noqa: PLC0415

        if get_watch_by_name(watch_name) is None:
            raise HTTPException(status_code=404, detail=f"Watch {watch_name!r} not found")

        fired = trigger_job_now(_scheduler, watch_name)
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("trigger_monitor failed: %s", exc)
        return {"triggered": False, "watch_name": watch_name}
    return {"triggered": fired, "watch_name": watch_name}
api/routes/search.py ADDED
@@ -0,0 +1,157 @@
1
+ """
2
+ api/routes/search.py — Semantic and full-text search endpoints.
3
+
4
+ POST /search/semantic — vector similarity search against scraped pages
5
+ POST /search/entities — full-text search across entity values in DB
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from typing import Optional
13
+
14
+ from fastapi import APIRouter, Depends, HTTPException
15
+ from pydantic import BaseModel
16
+ from api.auth import CurrentUser, get_current_user
17
+
18
+ logger = logging.getLogger(__name__)
19
+ router = APIRouter()
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Request schemas
24
+ # ---------------------------------------------------------------------------
25
+
26
+
27
class SemanticSearchRequest(BaseModel):
    # Request body for POST /search/semantic.
    query: str  # free-text query handed to the vector search
    n_results: int = 10  # requested number of hits
    offset: int = 0  # requested paging offset
31
+
32
+
33
class EntitySearchRequest(BaseModel):
    # Request body for POST /search/entities.
    query: str  # substring matched against Entity.value
    entity_types: Optional[list[str]] = None  # when non-empty, restricts Entity.entity_type
    offset: int = 0  # search_entities floors this at 0
    limit: int = 50  # search_entities clamps this to 1..200
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Routes
42
+ # ---------------------------------------------------------------------------
43
+
44
+
45
@router.post("/semantic")
async def semantic_search(
    body: SemanticSearchRequest,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return semantically similar pages from the vector store, restricted to
    pages whose ``metadata.investigation_id`` belongs to the current user.

    Pagination: ``n_results`` is clamped to 1..200 and ``offset`` floored at 0
    (same rules as search_entities). The store is asked for
    ``offset + n_results`` hits, these are filtered to the user's
    investigations, and the requested window is sliced from what remains.
    Because filtering happens after retrieval, a page can hold fewer than
    ``n_results`` items.

    Returns:
        ``{"items", "total", "offset", "n_results"}`` where ``total`` is the
        number of retrieved hits visible to this user. Any failure is logged
        and produces an empty result instead of an error response.
    """
    n_results = max(1, min(body.n_results, 200))
    offset = max(0, body.offset)

    try:
        from vector.search import find_related_pages

        # Ask for enough hits to cover the requested window.
        results = find_related_pages(body.query, n_results=offset + n_results)
        if not isinstance(results, list):
            results = []

        # Without a database there is no way to establish ownership, so the
        # ID set stays empty and nothing is returned (fail closed).
        user_inv_ids: set[str] = set()
        if os.getenv("DATABASE_URL"):
            try:
                from db.session import get_session  # noqa: PLC0415
                from db.models import Investigation  # noqa: PLC0415

                with get_session() as session:
                    rows = (
                        session.query(Investigation.id)
                        .filter(Investigation.user_id == current_user.user.id)
                        .all()
                    )
                    user_inv_ids = {str(r[0]) for r in rows}
            except Exception as exc:
                logger.warning("semantic_search: failed to load user inv IDs: %s", exc)

        # `or {}` tolerates hits whose metadata is present but None.
        visible = [
            r for r in results
            if isinstance(r, dict)
            and str((r.get("metadata") or {}).get("investigation_id", "")) in user_inv_ids
        ]

        return {
            "items": visible[offset:offset + n_results],
            # Count only what this user may see; the store-wide page count
            # would expose the size of other users' data.
            "total": len(visible),
            "offset": offset,
            "n_results": n_results,
        }
    except Exception as exc:
        logger.warning("semantic_search failed: %s", exc)
        return {"items": [], "total": 0, "offset": offset, "n_results": n_results}
95
+
96
+
97
@router.post("/entities")
async def search_entities(
    body: EntitySearchRequest,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Substring search across entity values in DB, limited to entities that
    belong to (or are linked to) the current user's investigations.
    Optionally filter by entity_types list.
    Supports pagination via offset/limit (limit clamped to 1..200).

    Returns:
        ``{"items", "total", "offset", "limit"}``. The same envelope, with no
        items, is returned when ``DATABASE_URL`` is unset or the query fails,
        so the response always matches the declared ``dict`` return type.
    """
    limit = max(1, min(body.limit, 200))
    offset = max(0, body.offset)
    empty = {"items": [], "total": 0, "offset": offset, "limit": limit}

    if not os.getenv("DATABASE_URL"):
        return empty
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415
        import sqlalchemy as sa  # noqa: PLC0415

        with get_session() as session:
            user_inv_ids = (
                session.query(Investigation.id)
                .filter(Investigation.user_id == current_user.user.id)
                .subquery()
            )
            linked_entity_ids = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id.in_(user_inv_ids))
                .subquery()
            )
            q = session.query(Entity).filter(
                sa.or_(
                    Entity.investigation_id.in_(user_inv_ids),
                    Entity.id.in_(linked_entity_ids),
                ),
                # autoescape makes "%" and "_" in the user's query match
                # literally instead of acting as LIKE wildcards.
                Entity.value.contains(body.query, autoescape=True),
            )
            if body.entity_types:
                q = q.filter(Entity.entity_type.in_(body.entity_types))
            total = q.count()
            entities = q.order_by(Entity.created_at.desc()).offset(offset).limit(limit).all()
            return {
                "items": [
                    {
                        "id": str(e.id),
                        "entity_type": e.entity_type,
                        "value": e.value,
                        "confidence": e.confidence,
                        "investigation_id": str(e.investigation_id) if e.investigation_id else None,
                        "created_at": e.created_at.isoformat() if e.created_at else None,
                    }
                    for e in entities
                ],
                "total": total,
                "offset": offset,
                "limit": limit,
            }
    except Exception as exc:
        logger.warning("search_entities failed: %s", exc)
        return empty