voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
api/routes/entities.py ADDED
@@ -0,0 +1,871 @@
1
"""
api/routes/entities.py — Entity query endpoints.

GET /entities                                  — paginated entity list with filters
GET /entities/{entity_id}                      — single entity full profile
GET /entities/{entity_id}/neighbors            — graph neighbors (sigma.js graph page)
GET /entities/{entity_id}/related              — DB-based related entities for profile page
GET /entities/{entity_id}/export/stix          — export single entity as STIX 2.1
GET /entities/{entity_id}/export/json          — export single entity as JSON
GET /entities/{entity_id}/analysis/stylometry  — writing-style analysis for the entity
GET /entities/{entity_id}/analysis/opsec       — OPSEC failure analysis for the entity
"""
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ import os
17
+ import uuid
18
+ from datetime import datetime
19
+ from typing import Optional
20
+
21
+ from fastapi import APIRouter, Depends, HTTPException, Query
22
+ from fastapi.responses import Response
23
+ from api.auth import CurrentUser, get_current_user
24
+
25
+ logger = logging.getLogger(__name__)
26
+ router = APIRouter()
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Routes
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
@router.get("")
async def list_entities(
    entity_type: Optional[str] = Query(default=None, description="Filter by entity type"),
    value_contains: Optional[str] = Query(default=None, description="Filter by value substring"),
    since: Optional[str] = Query(default=None, description="ISO datetime lower bound for created_at"),
    limit: int = Query(default=20, ge=1, le=100),
    offset: int = Query(default=0, ge=0),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """Return paginated entities matching optional filters.

    Only entities reachable from the caller's investigations are listed:
    either the entity's own ``investigation_id`` belongs to the user, or an
    ``InvestigationEntityLink`` row ties it to one of the user's
    investigations.

    Returns:
        ``{"items": [...], "total": int, "skip": int, "limit": int}``. The
        same shape (with no items) is returned when ``DATABASE_URL`` is unset
        or when the query fails, so clients never have to special-case a
        bare list.

    Raises:
        HTTPException: 422 when ``since`` is not a valid ISO datetime.
    """
    # One empty-page shape for both the "no database" and the "query failed"
    # paths; it echoes the requested paging values instead of fixed ones.
    empty_page = {"items": [], "total": 0, "skip": offset, "limit": limit}

    if not os.getenv("DATABASE_URL"):
        return empty_page
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415
        import sqlalchemy as sa  # noqa: PLC0415

        since_dt: Optional[datetime] = None
        if since:
            try:
                since_dt = datetime.fromisoformat(since)
            except ValueError:
                raise HTTPException(
                    status_code=422, detail="Invalid 'since' datetime format"
                ) from None

        with get_session() as session:
            user_inv_ids = (
                session.query(Investigation.id)
                .filter(Investigation.user_id == current_user.user.id)
                .subquery()
            )
            linked_entity_ids = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id.in_(user_inv_ids))
                .subquery()
            )
            q = session.query(Entity).filter(
                sa.or_(
                    Entity.investigation_id.in_(user_inv_ids),
                    Entity.id.in_(linked_entity_ids),
                )
            ).distinct()
            if entity_type:
                q = q.filter(Entity.entity_type == entity_type)
            if value_contains:
                # autoescape: treat "%" and "_" typed by the user as literal
                # characters rather than LIKE wildcards.
                q = q.filter(Entity.value.contains(value_contains, autoescape=True))
            if since_dt:
                q = q.filter(Entity.created_at >= since_dt)
            total = q.count()
            entities = (
                q.order_by(Entity.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            return {
                "items": [
                    {
                        "id": str(e.id),
                        "entity_type": e.entity_type,
                        "canonical_value": e.canonical_value,
                        "value": e.canonical_value or e.value,
                        "confidence": e.confidence,
                        "context_snippet": e.context_snippet,
                        "context": e.context,
                        "investigation_id": str(e.investigation_id) if e.investigation_id else None,
                        "created_at": e.created_at.isoformat() if e.created_at else None,
                    }
                    for e in entities
                ],
                "total": total,
                "skip": offset,
                "limit": limit,
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("list_entities failed: %s", exc)
        return empty_page
112
+
113
+
114
@router.get("/{entity_id}/export/stix")
async def export_entity_stix(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """Export a single entity as a downloadable STIX 2.1 bundle.

    The entity is converted with the first converter from ``export.stix``
    that returns a truthy object (threat actor, then malware, then
    indicator). When none applies an empty bundle is produced; when the STIX
    conversion itself fails the raw entity dict is returned as JSON instead.

    Raises:
        HTTPException: 422 for a malformed ID, 404 when the entity does not
            exist or is not accessible to the caller, 500 on any other error.
    """
    eid = _parse_uuid(entity_id)
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            try:
                from export.stix import (  # noqa: PLC0415
                    bundle_to_json,
                    entity_to_stix_indicator,
                    entity_to_stix_malware,
                    entity_to_stix_threat_actor,
                )
                import stix2  # noqa: PLC0415

                stix_obj = (
                    entity_to_stix_threat_actor(entity)
                    or entity_to_stix_malware(entity)
                    or entity_to_stix_indicator(entity)
                )
                if stix_obj:
                    bundle = stix2.Bundle(objects=[stix_obj], spec_version="2.1")
                    json_str = bundle_to_json(bundle)
                else:
                    json_str = json.dumps({
                        "type": "bundle",
                        "spec_version": "2.1",
                        "id": f"bundle--{uuid.uuid4()}",
                        "objects": [],
                    })
            except Exception as exc:
                logger.warning("STIX export for entity %s failed, falling back to raw JSON: %s", eid, exc)
                # default=str matches export_entity_json and keeps the
                # fallback from failing on non-JSON-native column values.
                json_str = json.dumps(_entity_to_dict(entity), indent=2, default=str)

        # Use the canonical UUID text, not the raw path segment: uuid.UUID()
        # accepts alternative spellings (braces, "urn:uuid:" prefix, ...) that
        # should not be echoed into a response header.
        filename = f"voidaccess_entity_{eid}_stix.json"
        return Response(
            content=json_str,
            media_type="application/json",
            headers={"Content-Disposition": f'attachment; filename="{filename}"'},
        )
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("export_entity_stix failed: %s", exc)
        raise HTTPException(status_code=500, detail="Export failed")
165
+
166
+
167
@router.get("/{entity_id}/export/json")
async def export_entity_json(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """Export a single entity, with its appearances, as a JSON download.

    Raises:
        HTTPException: 422 for a malformed ID, 404 when the entity does not
            exist or is not accessible to the caller, 500 on any other error.
    """
    eid = _parse_uuid(entity_id)
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_appearances  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            appearances = get_entity_appearances(session, eid, current_user.user.id)
            data = _entity_to_dict(entity)
            data["appearances"] = appearances
            json_str = json.dumps(data, indent=2, default=str)

        # Use the canonical UUID text, not the raw path segment: uuid.UUID()
        # accepts alternative spellings (braces, "urn:uuid:" prefix, ...) that
        # should not be echoed into a response header.
        filename = f"voidaccess_entity_{eid}.json"
        return Response(
            content=json_str,
            media_type="application/json",
            headers={"Content-Disposition": f'attachment; filename="{filename}"'},
        )
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("export_entity_json failed: %s", exc)
        raise HTTPException(status_code=500, detail="Export failed")
201
+
202
+
203
@router.get("/{entity_id}/related")
async def get_entity_related(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return DB-based related entities for the profile page mini-graph.

    Reads the EntityRelationship table directly and returns DB UUIDs for
    navigation. Each neighbor appears once; when several relationships
    connect the same pair, the one with the highest confidence is reported.

    Returns:
        ``{"entity": {...}, "neighbors": [...]}``. On an unexpected error the
        entity stub contains only the requested id and ``neighbors`` is empty.

    Raises:
        HTTPException: 422 for a malformed ID, 503 when no database is
            configured, 404 when the entity is missing or not accessible.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, EntityRelationship  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            rels = (
                session.query(EntityRelationship)
                .filter(
                    (EntityRelationship.entity_a_id == eid)
                    | (EntityRelationship.entity_b_id == eid)
                )
                .all()
            )

            def _other_end(rel):
                """Return the id on the far side of *rel* relative to ``eid``."""
                return rel.entity_b_id if rel.entity_a_id == eid else rel.entity_a_id

            neighbor_ids = {_other_end(rel) for rel in rels}

            # Keyed by the raw primary-key value (not its string form).
            neighbors_by_id: dict = {}
            if neighbor_ids:
                neighbor_entities = (
                    session.query(Entity)
                    .filter(Entity.id.in_(neighbor_ids))
                    .all()
                )
                neighbors_by_id = {ne.id: ne for ne in neighbor_entities}

            neighbors: dict[str, dict] = {}
            for rel in rels:
                other = neighbors_by_id.get(_other_end(rel))
                if other is None:
                    continue
                key = str(other.id)
                known = neighbors.get(key)
                # A missing confidence counts as 0 for ranking only, so a
                # NULL value cannot raise TypeError and wipe the whole result.
                if known is not None and (rel.confidence or 0) <= (known["strength"] or 0):
                    continue
                neighbors[key] = {
                    "id": key,
                    "entity_type": other.entity_type,
                    "value": other.value,
                    "confidence": other.confidence,
                    "relationship_type": rel.relationship_type,
                    "strength": rel.confidence,
                }

            return {
                "entity": {
                    "id": str(entity.id),
                    "entity_type": entity.entity_type,
                    "value": entity.value,
                    "confidence": entity.confidence,
                },
                "neighbors": list(neighbors.values()),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_entity_related failed: %s", exc)
        return {"entity": {"id": entity_id}, "neighbors": []}
281
+
282
+
283
@router.get("/{entity_id}/analysis/stylometry")
async def get_stylometry_analysis(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Run stylometric analysis on all text attributed to this entity.

    Collects page text (or, failing that, context snippets) for every entity
    row sharing this entity's canonical value or raw value, builds a
    writing-style profile via ``fingerprint.profiler.build_actor_profile``,
    and returns its scalar features, notable deviations from a fixed
    baseline, and similar actors.

    Returns ``{"error": "insufficient_data", ...}`` (not a 500) when fewer
    than 3 samples or fewer than 500 characters are available, and
    ``{"error": "analysis_failed", ...}`` on an unexpected error.

    Raises:
        HTTPException: 422 for a malformed ID, 503 when no database is
            configured, 404 when the entity is missing or not accessible to
            the caller.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        import asyncio  # noqa: PLC0415

        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from fingerprint.profiler import build_actor_profile  # noqa: PLC0415

        # Reference values each feature is compared against when deriving
        # "notable traits" below.
        BASELINE = {
            "avg_word_length": 4.8,
            "avg_sentence_length": 12.1,
            "punctuation_density": 0.12,
            "uppercase_ratio": 0.09,
            "vocabulary_richness": 0.52,
            "digit_ratio": 0.04,
            "avg_paragraph_length": 3.5,
            "exclamation_ratio": 0.05,
            "question_ratio": 0.08,
        }

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            # Same ownership gate as the other per-entity endpoints; without
            # it any authenticated user could analyse any entity by UUID.
            _assert_entity_accessible(session, eid, current_user.user.id)

            canonical = entity.canonical_value or entity.value.lower()

            # NOTE(review): this alias lookup is not restricted to the
            # caller's investigations — confirm whether that is intended.
            related = (
                session.query(Entity)
                .filter(
                    (Entity.canonical_value == canonical)
                    | (Entity.value == entity.value)
                )
                .options(joinedload(Entity.page))
                .all()
            )

            texts: list[str] = []
            for e in related:
                if e.page and e.page.cleaned_text and len((e.page.cleaned_text or "").strip()) >= 100:
                    texts.append(e.page.cleaned_text[:3000].strip())
                elif e.context_snippet and len((e.context_snippet or "").strip()) >= 50:
                    texts.append(e.context_snippet.strip())

            text_samples = len(texts)
            total_chars = sum(len(t) for t in texts)
            # Routine per-request statistic, not a warning condition.
            logger.info(
                "Stylometry: %s samples, %s total chars (min 3 samples and 500 chars for MEDIUM confidence)",
                text_samples,
                total_chars,
            )

            if text_samples < 3 or total_chars < 500:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "text_samples": text_samples,
                    "total_chars": total_chars,
                    "chars": total_chars,
                    "message": (
                        f"Insufficient text volume for stylometry "
                        f"({text_samples} samples, {total_chars} chars)"
                    ),
                }

            profile = build_actor_profile(texts)
            if profile is None:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "text_samples": text_samples,
                    "total_chars": 0,
                    "message": "Text samples too short for analysis (minimum 100 characters each)",
                }

            # Keys starting with "_" are profile metadata, not features.
            scalar_features = {
                k: round(float(v), 4)
                for k, v in profile.items()
                if not k.startswith("_") and isinstance(v, (int, float))
            }

            sample_count = int(profile.get("_sample_count", text_samples))
            confidence = "low"
            if sample_count >= 5 and total_chars >= 2000:
                confidence = "medium"
            if sample_count >= 10 and total_chars >= 5000:
                confidence = "high"

            notable_traits: list[str] = []
            for feat, baseline in BASELINE.items():
                val = scalar_features.get(feat)
                if val is None or baseline == 0:
                    continue
                deviation = (val - baseline) / baseline
                # Only report features at least 50% away from the baseline.
                if abs(deviation) >= 0.5:
                    direction = "above" if deviation > 0 else "below"
                    pct = abs(round(deviation * 100))
                    feat_label = feat.replace("_", " ").title()
                    notable_traits.append(
                        f"{feat_label}: {pct}% {direction} baseline ({val:.2f} vs {baseline})"
                    )

            # Cross-actor matching is best-effort: a failure here must not
            # discard the profile computed above.
            similar_actors = []
            try:
                if profile and text_samples >= 3:
                    # The matcher opens its own session, so it runs in a
                    # worker thread rather than inside this coroutine.
                    similar_actors = await asyncio.to_thread(
                        _find_similar_actors,
                        profile=profile,
                        canonical_value=canonical,
                        entity_type=entity.entity_type,
                    )
            except Exception as e:
                logger.warning("Similar actor matching failed: %s", e)

            return {
                "entity_id": entity_id,
                "text_samples": sample_count,
                "total_chars": total_chars,
                "profile": scalar_features,
                "confidence": confidence,
                "notable_traits": notable_traits,
                "similar_actors": similar_actors,
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_stylometry_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
429
+
430
+
431
@router.get("/{entity_id}/analysis/opsec")
async def get_opsec_analysis(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Run OPSEC failure analysis for this entity across all their appearances.

    Collects text plus a timestamp for every entity row sharing this entity's
    canonical value or raw value, gathers PGP key entities from the same
    investigations, and passes them to
    ``analysis.opsec.run_full_opsec_analysis``. The response carries the
    resulting ``opsec_score``, ``risk_level`` and ``findings``.

    Returns ``{"error": "insufficient_data", ...}`` (not a 500) when no text
    is available, and ``{"error": "analysis_failed", ...}`` on an unexpected
    error.

    Raises:
        HTTPException: 422 for a malformed ID, 503 when no database is
            configured, 404 when the entity is missing or not accessible to
            the caller.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from urllib.parse import urlparse  # noqa: PLC0415

        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from analysis.opsec import run_full_opsec_analysis  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            # Same ownership gate as the other per-entity endpoints; without
            # it any authenticated user could analyse any entity by UUID.
            _assert_entity_accessible(session, eid, current_user.user.id)

            canonical = entity.canonical_value or entity.value.lower()
            # NOTE(review): this alias lookup is not restricted to the
            # caller's investigations — confirm whether that is intended.
            related = (
                session.query(Entity)
                .filter(
                    (Entity.canonical_value == canonical)
                    | (Entity.value == entity.value)
                )
                .options(joinedload(Entity.page))
                .all()
            )

            texts_with_timestamps: list[dict] = []
            for e in related:
                # Prefer the page text (capped at 8000 chars); fall back to
                # the extraction snippet.
                text = ""
                if e.page and e.page.cleaned_text and len((e.page.cleaned_text or "").strip()) >= 20:
                    text = (e.page.cleaned_text or "")[:8000].strip()
                elif e.context_snippet and len((e.context_snippet or "").strip()) >= 20:
                    text = (e.context_snippet or "").strip()
                if len(text) < 20:
                    continue
                # Timestamp preference: page posted_at, then page
                # scrape_timestamp, then the entity row's created_at.
                ts = e.created_at
                if e.page:
                    if e.page.posted_at:
                        ts = e.page.posted_at
                    elif e.page.scrape_timestamp:
                        ts = e.page.scrape_timestamp
                texts_with_timestamps.append({"text": text, "timestamp": ts})

            inv_ids = list({e.investigation_id for e in related if e.investigation_id})
            pgp_fingerprints: list[str] = []
            pgp_sources: list[str] = []
            if inv_ids:
                pgp_rows = (
                    session.query(Entity)
                    .filter(
                        Entity.entity_type.in_(("PGP_KEY_BLOCK", "pgp_key")),
                        Entity.investigation_id.in_(inv_ids),
                    )
                    .options(joinedload(Entity.page))
                    .all()
                )
                for row in pgp_rows:
                    v = (row.value or "").strip()
                    if not v:
                        continue
                    pgp_fingerprints.append(v)
                    # Source host of the page the key was found on; empty
                    # when unknown so both lists stay index-aligned.
                    dom = ""
                    if row.page and row.page.url:
                        dom = urlparse(row.page.url).hostname or ""
                    pgp_sources.append(dom)

            if not texts_with_timestamps:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "message": "No text data available for OPSEC analysis",
                    "opsec_score": None,
                    "risk_level": None,
                    "findings": [],
                    "pages_analyzed": 0,
                }

            # Only pass sources when they pair up one-to-one with fingerprints.
            src_ok = len(pgp_sources) == len(pgp_fingerprints) and bool(pgp_fingerprints)
            result = run_full_opsec_analysis(
                entity.value,
                texts_with_timestamps,
                pgp_fingerprints=pgp_fingerprints or None,
                pgp_sources=pgp_sources if src_ok else None,
            )

            findings = list(result.get("findings", []))
            opsec_score = int(result.get("opsec_score", 100))
            risk_raw = str(result.get("risk_level", "LOW")).upper()

            return {
                "entity_id": entity_id,
                "opsec_score": opsec_score,
                "risk_level": risk_raw,
                "findings": findings,
                "pages_analyzed": len(texts_with_timestamps),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_opsec_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
547
+
548
+
549
@router.get("/{entity_id}")
async def get_entity(
    entity_id: str,
    defang: bool = True,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """Return full entity profile including appearances.

    The base fields come from ``_entity_to_dict``; on top of that the
    response adds the source URL, seed flag, appearances, freshness
    information, source-corroboration data and blockchain context. When
    ``defang`` is true (the default) the value, canonical value and context
    are passed through the ``utils.defang`` helpers before being returned.

    Raises:
        HTTPException: 503 when no database is configured, 422 for a
            malformed ID, 404 when the entity is missing or not accessible
            to the caller, 500 on any other error.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    eid = _parse_uuid(entity_id)

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_appearances  # noqa: PLC0415
        from utils.ioc_freshness import get_freshness_tag, get_freshness_display  # noqa: PLC0415
        from utils.defang import defang_value, defang_text  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            # The two relationship lookups below are best-effort extras: a
            # failure falls back to the default but is now logged instead of
            # being silently discarded.
            source_url = ""
            try:
                if entity.page:
                    source_url = entity.page.url or ""
            except Exception as exc:
                logger.debug("get_entity: could not resolve source page for %s: %s", eid, exc)

            is_seed = False
            try:
                if entity.investigation:
                    is_seed = bool(entity.investigation.is_seed)
            except Exception as exc:
                logger.debug("get_entity: could not resolve investigation for %s: %s", eid, exc)

            appearances = get_entity_appearances(session, eid, current_user.user.id)

            freshness_tag = get_freshness_tag(
                entity.entity_type,
                entity.last_seen_at,
                entity.first_seen_at,
            )
            freshness_display = get_freshness_display(freshness_tag)

            display_value = entity.value
            display_canonical = entity.canonical_value
            display_context = entity.context
            if defang:
                display_value = defang_value(entity.entity_type, entity.value or "")
                if entity.canonical_value:
                    display_canonical = defang_value(entity.entity_type, entity.canonical_value)
                if entity.context:
                    display_context = defang_text(entity.context)

            # A malformed stored JSON string should not turn the whole
            # profile into a 500; fall back to the same default used when
            # the column is empty.
            try:
                corroborating_sources = json.loads(
                    entity.corroborating_sources or '["dark_web_scrape"]'
                )
            except (TypeError, ValueError) as exc:
                logger.warning("get_entity: bad corroborating_sources for %s: %s", eid, exc)
                corroborating_sources = ["dark_web_scrape"]

            source_count = entity.source_count or 1

            return {
                **_entity_to_dict(entity),
                "value": display_value,
                "canonical_value": display_canonical,
                "context": display_context,
                "source_url": source_url,
                "is_seed": is_seed,
                "appearances": appearances,
                "appearance_count": len(appearances),
                "first_seen_at": entity.first_seen_at.isoformat() if entity.first_seen_at else None,
                "last_seen_at": entity.last_seen_at.isoformat() if entity.last_seen_at else None,
                "freshness_tag": freshness_tag.value,
                "freshness_label": freshness_display["label"],
                "freshness_color": freshness_display["color"],
                "source_count": source_count,
                "corroborating_sources": corroborating_sources,
                "cross_referenced": source_count > 1,
                "defanged": defang,
                "blockchain_data": {
                    "wallet_type": entity.entity_type if entity.entity_type in ("BITCOIN_ADDRESS", "ETHEREUM_ADDRESS", "MONERO_ADDRESS") else None,
                    "historical_context": entity.historical_context,
                    "first_seen_blockchain": entity.first_seen.isoformat() if entity.first_seen else None,
                },
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_entity failed: %s", exc)
        raise HTTPException(status_code=500, detail="Internal error")
635
+
636
+
637
@router.get("/{entity_id}/neighbors")
async def get_entity_neighbors(
    entity_id: str,
    hops: int = Query(default=1, ge=1, le=5),
    edge_types: Optional[str] = Query(
        default=None,
        description="Comma-separated list of edge types to filter",
    ),
    investigation_id: Optional[str] = Query(
        default=None,
        description="Scope to a specific investigation",
    ),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return neighbors of an entity using targeted SQL queries.

    Calls ``db.queries.get_entity_neighbors_db`` per node instead of building
    the full graph. With ``hops > 1`` the search expands breadth-first from
    the direct neighbors, visiting each node at most once; ``edge_types``
    (when given) filters the edges at every hop.

    Returns:
        ``{"entity_id": ..., "hops": ..., "neighbors": [...]}``. On an
        unexpected error ``neighbors`` is empty.

    Raises:
        HTTPException: 422 for a malformed entity or investigation ID, 503
            when no database is configured, 404 when the entity is missing
            or not accessible to the caller.
    """
    entity_uuid = _parse_uuid(entity_id)

    inv_uuid: Optional[uuid.UUID] = None
    if investigation_id:
        try:
            inv_uuid = uuid.UUID(investigation_id)
        except ValueError:
            raise HTTPException(
                status_code=422, detail="Invalid investigation_id format"
            ) from None

    edge_type_list: Optional[list[str]] = None
    if edge_types:
        edge_type_list = [t.strip() for t in edge_types.split(",") if t.strip()]

    def _edge_allowed(edge: dict) -> bool:
        """Return True when no edge filter is active or *edge* matches it."""
        return not edge_type_list or edge.get("relationship_type") in edge_type_list

    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_neighbors_db  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=entity_uuid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            # Same ownership gate as the other per-entity endpoints.
            _assert_entity_accessible(session, entity_uuid, current_user.user.id)

            neighbors = [
                n
                for n in get_entity_neighbors_db(
                    entity_id=entity_uuid,
                    investigation_id=inv_uuid,
                    session=session,
                )
                if _edge_allowed(n)
            ]

            if hops > 1:
                # De-duplicate while keeping order: several edges may point
                # at the same neighbor.
                frontier = list(
                    dict.fromkeys(uuid.UUID(str(n["neighbor_id"])) for n in neighbors)
                )
                # "visited" means "already discovered". Nodes are marked when
                # first reached, and the frontier itself is always expanded —
                # otherwise no hop beyond the first could ever run.
                visited = {entity_uuid, *frontier}

                for _ in range(1, hops):
                    next_frontier = []
                    for node_id in frontier:
                        for edge in get_entity_neighbors_db(
                            entity_id=node_id,
                            investigation_id=inv_uuid,
                            session=session,
                        ):
                            if not _edge_allowed(edge):
                                continue
                            far_id = uuid.UUID(str(edge["neighbor_id"]))
                            if far_id in visited:
                                continue
                            visited.add(far_id)
                            next_frontier.append(far_id)
                            neighbors.append(edge)
                    if not next_frontier:
                        break
                    frontier = next_frontier

            return {
                "entity_id": entity_id,
                "hops": hops,
                "neighbors": neighbors,
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_entity_neighbors failed: %s", exc)
        return {"entity_id": entity_id, "hops": hops, "neighbors": []}
732
+
733
+
734
+ # ---------------------------------------------------------------------------
735
+ # Helpers
736
+ # ---------------------------------------------------------------------------
737
+
738
+
739
def _parse_uuid(entity_id: str) -> uuid.UUID:
    """Convert a path parameter to a UUID, or fail the request with HTTP 422."""
    try:
        parsed = uuid.UUID(entity_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid entity ID format")
    return parsed
744
+
745
+
746
def _assert_entity_accessible(session, entity_id: uuid.UUID, user_id: int) -> None:
    """Raise HTTP 404 unless the user can reach the entity.

    An entity is reachable when its own ``investigation_id`` belongs to one
    of the user's investigations, or when an ``InvestigationEntityLink`` row
    ties it to one of them. A 404 is raised either way the lookup comes up
    empty, so callers cannot tell a missing entity from a foreign one.
    """
    import sqlalchemy as sa  # noqa: PLC0415
    from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415

    # Investigations owned by this user.
    owned_investigations = (
        session.query(Investigation.id)
        .filter(Investigation.user_id == user_id)
        .subquery()
    )
    # Entities attached to those investigations through the link table.
    linked_entities = (
        session.query(InvestigationEntityLink.entity_id)
        .filter(InvestigationEntityLink.investigation_id.in_(owned_investigations))
        .subquery()
    )
    reachable = sa.or_(
        Entity.investigation_id.in_(owned_investigations),
        Entity.id.in_(linked_entities),
    )

    hit = session.query(Entity.id).filter(reachable, Entity.id == entity_id).first()
    if hit is not None:
        return
    raise HTTPException(status_code=404, detail="Entity not found")
774
+
775
+
776
+ def _entity_to_dict(entity) -> dict: # type: ignore[type-arg]
777
+ return {
778
+ "id": str(entity.id),
779
+ "entity_type": entity.entity_type,
780
+ "value": entity.value,
781
+ "canonical_value": entity.canonical_value,
782
+ "confidence": entity.confidence,
783
+ "context": entity.context,
784
+ "context_snippet": entity.context_snippet,
785
+ "historical_context": entity.historical_context,
786
+ "first_seen": entity.first_seen.isoformat() if entity.first_seen else None,
787
+ "last_seen": entity.last_seen.isoformat() if entity.last_seen else None,
788
+ "investigation_id": str(entity.investigation_id) if entity.investigation_id else None,
789
+ "created_at": entity.created_at.isoformat() if entity.created_at else None,
790
+ "extraction_method": getattr(entity, "extraction_method", None),
791
+ }
792
+
793
+
794
+ def _get_entity_value(entity_id: str) -> Optional[str]:
795
+ """Look up entity.value by UUID from DB."""
796
+ if not os.getenv("DATABASE_URL"):
797
+ return None
798
+ try:
799
+ from db.session import get_session # noqa: PLC0415
800
+ from db.models import Entity # noqa: PLC0415
801
+
802
+ eid = uuid.UUID(entity_id)
803
+ with get_session() as session:
804
+ entity = session.query(Entity).filter_by(id=eid).first()
805
+ if entity:
806
+ return entity.value
807
+ return None
808
+ except Exception:
809
+ return None
810
+
811
+
812
+ def _resolve_graph_node_id(graph, entity_value: str) -> Optional[str]:
813
+ """Resolve graph node by exact value, then by handle@domain prefix."""
814
+ if graph is None:
815
+ return None
816
+ if graph.has_node(entity_value):
817
+ return entity_value
818
+
819
+ prefix = f"{entity_value}@"
820
+ for node_id in graph.nodes:
821
+ if isinstance(node_id, str) and node_id.startswith(prefix):
822
+ return node_id
823
+ return None
824
+
825
+
826
def _find_similar_actors(
    profile,
    canonical_value: str,
    entity_type: str,
    threshold: float = 0.82,
    top_k: int = 5,
) -> list[dict]:
    """
    Look up other actors whose writing style resembles ``profile``.

    Delegates to ``fingerprint.profiler.match_against_profiles``, passing
    ``canonical_value`` as the value to exclude, and formats the first
    ``top_k`` matches it returns for the API response.
    """
    from fingerprint.profiler import match_against_profiles
    from db.session import get_session

    def _to_api(match: dict) -> dict:
        # "similarity" is preferred; "score" is the fallback key.
        raw_score = match.get("similarity", match.get("score", 0))
        return {
            "canonical_value": match.get("canonical_value") or match.get("entity_id"),
            "entity_type": match.get("entity_type", entity_type),
            "similarity_score": round(float(raw_score), 3),
            "confidence": _score_to_confidence(float(raw_score)),
            "matching_features": match.get("matching_features", []),
            "profile_sample_count": match.get("sample_count", 0),
        }

    with get_session() as session:
        candidates = match_against_profiles(
            profile=profile,
            session=session,
            threshold=threshold,
            exclude_canonical=canonical_value,  # Don't match self
        )

    return [_to_api(candidate) for candidate in candidates[:top_k]]
864
+
865
+
866
+ def _score_to_confidence(score: float) -> str:
867
+ if score >= 0.90:
868
+ return "high"
869
+ if score >= 0.80:
870
+ return "medium"
871
+ return "low"