voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
api/routes/entities.py
ADDED
|
@@ -0,0 +1,871 @@
|
|
|
1
|
+
"""
|
|
2
|
+
api/routes/entities.py — Entity query endpoints.
|
|
3
|
+
|
|
4
|
+
GET /entities — paginated entity list with filters
|
|
5
|
+
GET /entities/{entity_id} — single entity full profile
|
|
6
|
+
GET /entities/{entity_id}/neighbors — graph neighbors (sigma.js graph page)
|
|
7
|
+
GET /entities/{entity_id}/related — DB-based related entities for profile page
|
|
8
|
+
GET /entities/{entity_id}/export/stix — export single entity as STIX 2.1
|
|
9
|
+
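GET /entities/{entity_id}/export/json — export single entity as JSON
GET /entities/{entity_id}/analysis/stylometry — writing-style analysis for an entity
GET /entities/{entity_id}/analysis/opsec — OPSEC failure analysis for an entity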
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import uuid
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
22
|
+
from fastapi.responses import Response
|
|
23
|
+
from api.auth import CurrentUser, get_current_user
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
router = APIRouter()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Routes
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@router.get("")
async def list_entities(
    entity_type: Optional[str] = Query(default=None, description="Filter by entity type"),
    value_contains: Optional[str] = Query(default=None, description="Filter by value substring"),
    since: Optional[str] = Query(default=None, description="ISO datetime lower bound for created_at"),
    limit: int = Query(default=20, ge=1, le=100),
    offset: int = Query(default=0, ge=0),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """Return one page of the caller's entities, newest first.

    Only entities tied to one of the caller's investigations are considered,
    either directly (``Entity.investigation_id``) or through an
    ``InvestigationEntityLink`` row.

    Args:
        entity_type: Exact ``Entity.entity_type`` to match, if given.
        value_contains: Literal substring that ``Entity.value`` must contain.
        since: ISO datetime; only entities created at or after it are kept.
        limit: Page size (1-100).
        offset: Number of rows to skip before the page starts.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        ``{"items": [...], "total": int, "skip": int, "limit": int}``.
        An empty page with the same shape is returned when no database is
        configured or when the query fails.

    Raises:
        HTTPException: 422 when ``since`` is not a valid ISO datetime.
    """
    # Every "nothing to show" path returns the same envelope as the success
    # path, so the declared ``dict`` return type always holds and the paging
    # values echo what the caller actually asked for.
    empty_page = {"items": [], "total": 0, "skip": offset, "limit": limit}

    if not os.getenv("DATABASE_URL"):
        return empty_page
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415
        import sqlalchemy as sa  # noqa: PLC0415

        since_dt: Optional[datetime] = None
        if since:
            try:
                since_dt = datetime.fromisoformat(since)
            except ValueError:
                raise HTTPException(status_code=422, detail="Invalid 'since' datetime format")

        with get_session() as session:
            # Investigations owned by the caller.
            user_inv_ids = (
                session.query(Investigation.id)
                .filter(Investigation.user_id == current_user.user.id)
                .subquery()
            )
            # Entities attached to those investigations through the link table.
            linked_entity_ids = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id.in_(user_inv_ids))
                .subquery()
            )
            q = session.query(Entity).filter(
                sa.or_(
                    Entity.investigation_id.in_(user_inv_ids),
                    Entity.id.in_(linked_entity_ids),
                )
            ).distinct()
            if entity_type:
                q = q.filter(Entity.entity_type == entity_type)
            if value_contains:
                # autoescape keeps "%" and "_" in user input from acting as
                # LIKE wildcards, so this is a true substring match.
                q = q.filter(Entity.value.contains(value_contains, autoescape=True))
            if since_dt:
                q = q.filter(Entity.created_at >= since_dt)

            # Count before paging so "total" reflects the whole filtered set.
            total = q.count()
            entities = (
                q.order_by(Entity.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            return {
                "items": [
                    {
                        "id": str(e.id),
                        "entity_type": e.entity_type,
                        "canonical_value": e.canonical_value,
                        "value": e.canonical_value or e.value,
                        "confidence": e.confidence,
                        "context_snippet": e.context_snippet,
                        "context": e.context,
                        "investigation_id": str(e.investigation_id) if e.investigation_id else None,
                        "created_at": e.created_at.isoformat() if e.created_at else None,
                    }
                    for e in entities
                ],
                "total": total,
                "skip": offset,
                "limit": limit,
            }
    except HTTPException:
        raise
    except Exception as exc:
        # Best effort: a failing listing degrades to an empty page, not a 500.
        logger.warning("list_entities failed: %s", exc)
        return empty_page
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@router.get("/{entity_id}/export/stix")
async def export_entity_stix(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """Export single entity as a STIX 2.1 bundle.

    The entity is converted with the first ``export.stix`` converter that
    yields an object (threat actor, then malware, then indicator). If none
    does, an empty bundle is produced. If the STIX conversion itself fails,
    the raw entity dict is served instead.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        A JSON ``Response`` with an attachment ``Content-Disposition`` header.

    Raises:
        HTTPException: 404 when the entity does not exist; 500 when the export
            fails for any other reason. ``_parse_uuid`` and
            ``_assert_entity_accessible`` are defined elsewhere in this module
            and presumably raise their own HTTP errors — confirm there.
    """
    eid = _parse_uuid(entity_id)
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            try:
                from export.stix import entity_to_stix_indicator, entity_to_stix_threat_actor, entity_to_stix_malware, bundle_to_json  # noqa: PLC0415
                import stix2  # noqa: PLC0415

                # First converter that returns something wins.
                stix_obj = (
                    entity_to_stix_threat_actor(entity)
                    or entity_to_stix_malware(entity)
                    or entity_to_stix_indicator(entity)
                )
                if stix_obj:
                    bundle = stix2.Bundle(objects=[stix_obj], spec_version="2.1")
                    json_str = bundle_to_json(bundle)
                else:
                    # No converter applies: still hand back a well-formed, empty bundle.
                    json_str = json.dumps({
                        "type": "bundle",
                        "spec_version": "2.1",
                        "id": f"bundle--{uuid.uuid4()}",
                        "objects": [],
                    })
            except Exception as exc:
                logger.warning("STIX export for entity %s failed, falling back to raw JSON: %s", entity_id, exc)
                # default=str matches export_entity_json, so values that are not
                # JSON-native do not turn the fallback itself into a 500.
                json_str = json.dumps(_entity_to_dict(entity), indent=2, default=str)

            filename = f"voidaccess_entity_{entity_id}_stix.json"
            return Response(
                content=json_str,
                media_type="application/json",
                headers={"Content-Disposition": f'attachment; filename="{filename}"'},
            )
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("export_entity_stix failed: %s", exc)
        raise HTTPException(status_code=500, detail="Export failed")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@router.get("/{entity_id}/export/json")
async def export_entity_json(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """Serve one entity, together with its appearances, as a JSON download.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        A JSON ``Response`` carrying an attachment ``Content-Disposition`` header.

    Raises:
        HTTPException: 404 when no entity has this id; 500 on any other failure.
    """
    eid = _parse_uuid(entity_id)
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_appearances  # noqa: PLC0415

        with get_session() as session:
            record = session.query(Entity).filter_by(id=eid).first()
            if record is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            seen_in = get_entity_appearances(session, eid, current_user.user.id)
            payload = _entity_to_dict(record)
            payload["appearances"] = seen_in
            # default=str stringifies anything json cannot encode natively.
            body = json.dumps(payload, indent=2, default=str)

            disposition = f'attachment; filename="voidaccess_entity_{entity_id}.json"'
            return Response(
                content=body,
                media_type="application/json",
                headers={"Content-Disposition": disposition},
            )
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.warning("export_entity_json failed: %s", exc)
        raise HTTPException(status_code=500, detail="Export failed")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@router.get("/{entity_id}/related")
async def get_entity_related(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return DB-based related entities for the profile page mini-graph.
    Uses EntityRelationship table directly — returns DB UUIDs for navigation.

    Each neighbor appears once, described by the relationship with the highest
    confidence among those connecting it to the requested entity.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        ``{"entity": {...}, "neighbors": [...]}``. On an unexpected failure a
        stub ``{"entity": {"id": entity_id}, "neighbors": []}`` is returned.

    Raises:
        HTTPException: 503 when no database is configured; 404 when the entity
            does not exist.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity, EntityRelationship  # noqa: PLC0415

        def _other_end(rel):
            """Return the id at the far end of *rel* from the requested entity."""
            return rel.entity_b_id if rel.entity_a_id == eid else rel.entity_a_id

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            # Relationships are undirected here: the entity may sit on either side.
            rels = (
                session.query(EntityRelationship)
                .filter(
                    (EntityRelationship.entity_a_id == eid)
                    | (EntityRelationship.entity_b_id == eid)
                )
                .all()
            )

            neighbor_ids = {_other_end(rel) for rel in rels}

            # One query for all neighbors instead of one per relationship.
            # NOTE(review): neighbor rows are loaded without a per-neighbor
            # access check — confirm that is intended.
            neighbors_map: dict = {}
            if neighbor_ids:
                neighbor_entities = (
                    session.query(Entity)
                    .filter(Entity.id.in_(neighbor_ids))
                    .all()
                )
                neighbors_map = {ne.id: ne for ne in neighbor_entities}

            neighbors: dict[str, dict] = {}
            for rel in rels:
                other = neighbors_map.get(_other_end(rel))
                if other is None:
                    continue
                key = str(other.id)
                current = neighbors.get(key)
                # A missing confidence ranks as 0 so that one NULL value cannot
                # raise TypeError and blank out the whole neighbor list.
                if current is None or (rel.confidence or 0) > (current["strength"] or 0):
                    neighbors[key] = {
                        "id": str(other.id),
                        "entity_type": other.entity_type,
                        "value": other.value,
                        "confidence": other.confidence,
                        "relationship_type": rel.relationship_type,
                        "strength": rel.confidence,
                    }

            return {
                "entity": {
                    "id": str(entity.id),
                    "entity_type": entity.entity_type,
                    "value": entity.value,
                    "confidence": entity.confidence,
                },
                "neighbors": list(neighbors.values()),
            }
    except HTTPException:
        raise
    except Exception as exc:
        # Best effort: the profile page still renders without its mini-graph.
        logger.warning("get_entity_related failed: %s", exc)
        return {"entity": {"id": entity_id}, "neighbors": []}
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@router.get("/{entity_id}/analysis/stylometry")
async def get_stylometry_analysis(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Run stylometric analysis on all text attributed to this entity.

    Collects page text (or, failing that, context snippets) for every entity
    row sharing this entity's canonical value or raw value, builds a
    writing-style profile via ``fingerprint.profiler.build_actor_profile``,
    and returns its scalar features plus the traits that deviate from the
    baseline by at least 50%.

    Returns {"error": "insufficient_data"} (not 500) when fewer than 3 text
    samples or fewer than 500 characters are available, or when no profile
    can be built. Any other failure returns {"error": "analysis_failed"}.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        current_user: Authenticated caller, injected by FastAPI.

    Raises:
        HTTPException: 503 when no database is configured; 404 when the entity
            does not exist.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from fingerprint.profiler import build_actor_profile  # noqa: PLC0415

        # Reference values each profile feature is compared against when
        # looking for notable traits.
        BASELINE = {
            "avg_word_length": 4.8,
            "avg_sentence_length": 12.1,
            "punctuation_density": 0.12,
            "uppercase_ratio": 0.09,
            "vocabulary_richness": 0.52,
            "digit_ratio": 0.04,
            "avg_paragraph_length": 3.5,
            "exclamation_ratio": 0.05,
            "question_ratio": 0.08,
        }

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            # Same ownership gate as the other single-entity endpoints; without
            # it any authenticated user could analyse any entity by id.
            _assert_entity_accessible(session, eid, current_user.user.id)

            canonical = entity.canonical_value or entity.value.lower()

            related = (
                session.query(Entity)
                .filter(
                    (Entity.canonical_value == canonical)
                    | (Entity.value == entity.value)
                )
                .options(joinedload(Entity.page))
                .all()
            )

            # Prefer the page text (capped at 3000 chars); fall back to the
            # shorter context snippet when the page has too little text.
            texts: list[str] = []
            for row in related:
                if row.page and row.page.cleaned_text and len((row.page.cleaned_text or "").strip()) >= 100:
                    texts.append(row.page.cleaned_text[:3000].strip())
                elif row.context_snippet and len((row.context_snippet or "").strip()) >= 50:
                    texts.append(row.context_snippet.strip())

            text_samples = len(texts)
            total_chars = sum(len(t) for t in texts)
            # Routine diagnostics, so INFO rather than WARNING.
            logger.info(
                "Stylometry: %s samples, %s total chars (analysis requires at least 3 samples and 500 chars)",
                text_samples,
                total_chars,
            )

            if text_samples < 3 or total_chars < 500:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "text_samples": text_samples,
                    "total_chars": total_chars,
                    "chars": total_chars,
                    "message": (
                        f"Insufficient text volume for stylometry "
                        f"({text_samples} samples, {total_chars} chars)"
                    ),
                }

            profile = build_actor_profile(texts)
            if profile is None:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "text_samples": text_samples,
                    "total_chars": 0,
                    "message": "Text samples too short for analysis (minimum 100 characters each)",
                }

            # Keys starting with "_" are profile metadata, not features.
            scalar_features = {
                k: round(float(v), 4)
                for k, v in profile.items()
                if not k.startswith("_") and isinstance(v, (int, float))
            }

            sample_count = int(profile.get("_sample_count", text_samples))
            confidence = "low"
            if sample_count >= 5 and total_chars >= 2000:
                confidence = "medium"
            if sample_count >= 10 and total_chars >= 5000:
                confidence = "high"

            notable_traits: list[str] = []
            for feat, baseline in BASELINE.items():
                val = scalar_features.get(feat)
                if val is None or baseline == 0:
                    continue
                deviation = (val - baseline) / baseline
                if abs(deviation) >= 0.5:
                    direction = "above" if deviation > 0 else "below"
                    pct = abs(round(deviation * 100))
                    feat_label = feat.replace("_", " ").title()
                    notable_traits.append(
                        f"{feat_label}: {pct}% {direction} baseline ({val:.2f} vs {baseline})"
                    )

            # Cross-actor matching is optional: a failure here must not sink
            # the rest of the analysis.
            similar_actors = []
            try:
                import asyncio  # noqa: PLC0415

                if profile and text_samples >= 3:
                    similar_actors = await asyncio.to_thread(
                        _find_similar_actors,
                        profile=profile,
                        canonical_value=canonical,
                        entity_type=entity.entity_type,
                    )
            except Exception as match_exc:
                logger.warning("Similar actor matching failed: %s", match_exc)

            return {
                "entity_id": entity_id,
                "text_samples": sample_count,
                "total_chars": total_chars,
                "profile": scalar_features,
                "confidence": confidence,
                "notable_traits": notable_traits,
                "similar_actors": similar_actors,
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_stylometry_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
@router.get("/{entity_id}/analysis/opsec")
async def get_opsec_analysis(
    entity_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Run OPSEC failure analysis for this entity across all their appearances.

    Collects texts + timestamps for every entity row sharing this entity's
    canonical value or raw value, plus PGP key entities found in the same
    investigations, then hands them to
    ``analysis.opsec.run_full_opsec_analysis``.

    Returns {"error": "insufficient_data"} (not 500) when no text is available.
    Any other failure returns {"error": "analysis_failed"}.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        A dict with ``entity_id``, ``opsec_score``, ``risk_level``,
        ``findings`` and ``pages_analyzed``.

    Raises:
        HTTPException: 503 when no database is configured; 404 when the entity
            does not exist.
    """
    eid = _parse_uuid(entity_id)
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from urllib.parse import urlparse  # noqa: PLC0415

        from sqlalchemy.orm import joinedload  # noqa: PLC0415

        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from analysis.opsec import run_full_opsec_analysis  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            # Same ownership gate as the other single-entity endpoints; without
            # it any authenticated user could analyse any entity by id.
            _assert_entity_accessible(session, eid, current_user.user.id)

            canonical = entity.canonical_value or entity.value.lower()
            related = (
                session.query(Entity)
                .filter(
                    (Entity.canonical_value == canonical)
                    | (Entity.value == entity.value)
                )
                .options(joinedload(Entity.page))
                .all()
            )

            texts_with_timestamps: list[dict] = []
            for row in related:
                # Prefer the page text (capped at 8000 chars), then the snippet.
                text = ""
                if row.page and row.page.cleaned_text and len((row.page.cleaned_text or "").strip()) >= 20:
                    text = (row.page.cleaned_text or "")[:8000].strip()
                elif row.context_snippet and len((row.context_snippet or "").strip()) >= 20:
                    text = (row.context_snippet or "").strip()
                if len(text) < 20:
                    continue
                # Timestamp preference: page posted_at, then page
                # scrape_timestamp, then the entity row's created_at.
                ts = row.created_at
                if row.page:
                    if row.page.posted_at:
                        ts = row.page.posted_at
                    elif row.page.scrape_timestamp:
                        ts = row.page.scrape_timestamp
                texts_with_timestamps.append({"text": text, "timestamp": ts})

            # PGP keys are gathered from every investigation the related rows belong to.
            inv_ids = list({row.investigation_id for row in related if row.investigation_id})
            pgp_fingerprints: list[str] = []
            pgp_sources: list[str] = []
            if inv_ids:
                pgp_rows = (
                    session.query(Entity)
                    .filter(
                        Entity.entity_type.in_(("PGP_KEY_BLOCK", "pgp_key")),
                        Entity.investigation_id.in_(inv_ids),
                    )
                    .options(joinedload(Entity.page))
                    .all()
                )
                for pgp_row in pgp_rows:
                    v = (pgp_row.value or "").strip()
                    if not v:
                        continue
                    pgp_fingerprints.append(v)
                    # Source host is kept index-aligned with the fingerprint
                    # list; an empty string marks an unknown source.
                    dom = ""
                    if pgp_row.page and pgp_row.page.url:
                        dom = urlparse(pgp_row.page.url).hostname or ""
                    pgp_sources.append(dom)

            if not texts_with_timestamps:
                return {
                    "entity_id": entity_id,
                    "error": "insufficient_data",
                    "message": "No text data available for OPSEC analysis",
                    "opsec_score": None,
                    "risk_level": None,
                    "findings": [],
                    "pages_analyzed": 0,
                }

            # Only pass sources when they line up one-to-one with the fingerprints.
            src_ok = len(pgp_sources) == len(pgp_fingerprints) and bool(pgp_fingerprints)
            result = run_full_opsec_analysis(
                entity.value,
                texts_with_timestamps,
                pgp_fingerprints=pgp_fingerprints or None,
                pgp_sources=pgp_sources if src_ok else None,
            )

            findings = list(result.get("findings", []))
            opsec_score = int(result.get("opsec_score", 100))
            risk_raw = str(result.get("risk_level", "LOW")).upper()

            return {
                "entity_id": entity_id,
                "opsec_score": opsec_score,
                "risk_level": risk_raw,
                "findings": findings,
                "pages_analyzed": len(texts_with_timestamps),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_opsec_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
@router.get("/{entity_id}")
async def get_entity(
    entity_id: str,
    defang: bool = True,
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """Return full entity profile including appearances.

    Args:
        entity_id: Entity UUID as a string; validated by ``_parse_uuid``.
        defang: When true (default), ``value``, ``canonical_value`` and
            ``context`` are passed through ``utils.defang`` before being
            returned.
        current_user: Authenticated caller, injected by FastAPI.

    Returns:
        The ``_entity_to_dict`` fields extended with source URL, seed flag,
        appearances, freshness, corroboration and blockchain details.

    Raises:
        HTTPException: 503 when no database is configured; 404 when the entity
            does not exist; 500 on any other failure.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    eid = _parse_uuid(entity_id)

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_appearances  # noqa: PLC0415
        from utils.ioc_freshness import get_freshness_tag, get_freshness_display  # noqa: PLC0415
        from utils.defang import defang_value, defang_text  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=eid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")
            _assert_entity_accessible(session, eid, current_user.user.id)

            # The two relationship lookups below are optional extras: if either
            # fails, the profile is still served with the default value, but
            # the failure is logged instead of vanishing.
            source_url = ""
            try:
                if entity.page:
                    source_url = entity.page.url or ""
            except Exception as exc:
                logger.debug("get_entity: could not read page for %s: %s", entity_id, exc)

            is_seed = False
            try:
                if entity.investigation:
                    is_seed = bool(entity.investigation.is_seed)
            except Exception as exc:
                logger.debug("get_entity: could not read investigation for %s: %s", entity_id, exc)

            appearances = get_entity_appearances(session, eid, current_user.user.id)

            freshness_tag = get_freshness_tag(
                entity.entity_type,
                entity.last_seen_at,
                entity.first_seen_at,
            )
            freshness_display = get_freshness_display(freshness_tag)

            display_value = entity.value
            display_canonical = entity.canonical_value
            display_context = entity.context
            if defang:
                display_value = defang_value(entity.entity_type, entity.value or "")
                if entity.canonical_value:
                    display_canonical = defang_value(entity.entity_type, entity.canonical_value)
                if entity.context:
                    display_context = defang_text(entity.context)

            # corroborating_sources is decoded as JSON text. One malformed
            # stored value should not take the whole profile down, so fall
            # back to the same default that is used for an empty value.
            try:
                corroborating_sources = json.loads(
                    entity.corroborating_sources or '["dark_web_scrape"]'
                )
            except (TypeError, ValueError) as exc:
                logger.warning(
                    "get_entity: invalid corroborating_sources for %s: %s", entity_id, exc
                )
                corroborating_sources = ["dark_web_scrape"]

            source_count = entity.source_count or 1

            return {
                **_entity_to_dict(entity),
                "value": display_value,
                "canonical_value": display_canonical,
                "context": display_context,
                "source_url": source_url,
                "is_seed": is_seed,
                "appearances": appearances,
                "appearance_count": len(appearances),
                "first_seen_at": entity.first_seen_at.isoformat() if entity.first_seen_at else None,
                "last_seen_at": entity.last_seen_at.isoformat() if entity.last_seen_at else None,
                "freshness_tag": freshness_tag.value,
                "freshness_label": freshness_display["label"],
                "freshness_color": freshness_display["color"],
                "source_count": source_count,
                "corroborating_sources": corroborating_sources,
                "cross_referenced": source_count > 1,
                "defanged": defang,
                "blockchain_data": {
                    "wallet_type": entity.entity_type if entity.entity_type in ("BITCOIN_ADDRESS", "ETHEREUM_ADDRESS", "MONERO_ADDRESS") else None,
                    "historical_context": entity.historical_context,
                    "first_seen_blockchain": entity.first_seen.isoformat() if entity.first_seen else None,
                }
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_entity failed: %s", exc)
        raise HTTPException(status_code=500, detail="Internal error")
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
@router.get("/{entity_id}/neighbors")
async def get_entity_neighbors(
    entity_id: str,
    hops: int = Query(default=1, ge=1, le=5),
    edge_types: Optional[str] = Query(
        default=None,
        description="Comma-separated list of edge types to filter",
    ),
    investigation_id: Optional[str] = Query(
        default=None,
        description="Scope to a specific investigation",
    ),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return the neighbors of an entity within ``hops`` steps.

    Neighbors are fetched per entity through ``get_entity_neighbors_db``
    instead of building the full graph. For ``hops > 1`` the traversal is
    breadth-first: each level expands the nodes reached on the previous level
    and keeps every edge that leads to a node not seen on an earlier level.

    Args:
        entity_id: UUID string of the entity to start from.
        hops: Maximum traversal depth (1-5).
        edge_types: Optional comma-separated relationship types; when given,
            only edges whose ``relationship_type`` is listed are returned and
            followed, at every hop.
        investigation_id: Optional UUID string passed through to the neighbor
            query to scope it to one investigation.
        current_user: Authenticated user resolved by ``get_current_user``.

    Returns:
        A dict with ``entity_id``, ``hops`` and ``neighbors`` (list of edge
        rows). If the lookup fails unexpectedly the error is logged and an
        empty ``neighbors`` list is returned.

    Raises:
        HTTPException: 422 for a malformed ``entity_id`` / ``investigation_id``,
            503 when ``DATABASE_URL`` is not set, 404 when the entity does not
            exist.
    """
    # NOTE(review): current_user only enforces authentication here; no
    # per-user ownership check (cf. _assert_entity_accessible) is applied to
    # the entity or investigation — confirm this is intended.
    try:
        entity_uuid = uuid.UUID(entity_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid entity ID format")

    inv_uuid: Optional[uuid.UUID] = None
    if investigation_id:
        try:
            inv_uuid = uuid.UUID(investigation_id)
        except ValueError:
            raise HTTPException(status_code=422, detail="Invalid investigation_id format")

    edge_type_list: Optional[list[str]] = None
    if edge_types:
        edge_type_list = [t.strip() for t in edge_types.split(",") if t.strip()]

    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    try:
        from db.session import get_session  # noqa: PLC0415
        from db.models import Entity  # noqa: PLC0415
        from db.queries import get_entity_neighbors_db  # noqa: PLC0415

        with get_session() as session:
            entity = session.query(Entity).filter_by(id=entity_uuid).first()
            if entity is None:
                raise HTTPException(status_code=404, detail="Entity not found")

            def _fetch_edges(node_id: uuid.UUID) -> list:
                """Return node_id's neighbor rows, limited to the requested edge types."""
                rows = get_entity_neighbors_db(
                    entity_id=node_id,
                    investigation_id=inv_uuid,
                    session=session,
                )
                if edge_type_list:
                    return [
                        row for row in rows
                        if row.get("relationship_type") in edge_type_list
                    ]
                return list(rows)

            neighbors = _fetch_edges(entity_uuid)

            if hops > 1:
                # Nodes reached so far, de-duplicated in discovery order.
                frontier = list(
                    dict.fromkeys(uuid.UUID(n["neighbor_id"]) for n in neighbors)
                )
                # "seen" holds nodes from completed levels only. First-hop
                # nodes are in it because they are already part of the result,
                # but they must still be expanded — marking them as visited
                # before expansion is what previously made hops > 1 a no-op.
                seen = {entity_uuid, *frontier}

                for _ in range(1, hops):
                    next_frontier: list = []
                    reached_this_level: set = set()
                    for node_id in frontier:
                        for edge in _fetch_edges(node_id):
                            target = uuid.UUID(edge["neighbor_id"])
                            if target in seen:
                                # Edge leads back to an earlier level.
                                continue
                            neighbors.append(edge)
                            if target not in reached_this_level:
                                reached_this_level.add(target)
                                next_frontier.append(target)
                    if not next_frontier:
                        break
                    seen |= reached_this_level
                    frontier = next_frontier

            return {
                "entity_id": entity_id,
                "hops": hops,
                "neighbors": neighbors,
            }
    except HTTPException:
        raise
    except Exception as exc:
        # Best-effort endpoint: degrade to an empty result instead of a 500.
        logger.warning("get_entity_neighbors failed: %s", exc)
        return {"entity_id": entity_id, "hops": hops, "neighbors": []}
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
# ---------------------------------------------------------------------------
|
|
735
|
+
# Helpers
|
|
736
|
+
# ---------------------------------------------------------------------------
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def _parse_uuid(entity_id: str) -> uuid.UUID:
|
|
740
|
+
try:
|
|
741
|
+
return uuid.UUID(entity_id)
|
|
742
|
+
except ValueError:
|
|
743
|
+
raise HTTPException(status_code=422, detail="Invalid entity ID format")
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _assert_entity_accessible(session, entity_id: uuid.UUID, user_id: int) -> None:
    """Verify that ``entity_id`` is reachable by ``user_id``.

    An entity counts as reachable when it either belongs directly to one of
    the user's investigations (``Entity.investigation_id``) or is attached to
    one through an ``InvestigationEntityLink`` row.

    Raises:
        HTTPException: 404 when no such entity is found for this user.
    """
    import sqlalchemy as sa  # noqa: PLC0415
    from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415

    # IDs of every investigation owned by the user.
    owned_investigations = (
        session.query(Investigation.id)
        .filter(Investigation.user_id == user_id)
        .subquery()
    )
    # Entities linked into any of those investigations.
    linked_entities = (
        session.query(InvestigationEntityLink.entity_id)
        .filter(InvestigationEntityLink.investigation_id.in_(owned_investigations))
        .subquery()
    )
    reachable = sa.or_(
        Entity.investigation_id.in_(owned_investigations),
        Entity.id.in_(linked_entities),
    )

    match = session.query(Entity.id).filter(reachable, Entity.id == entity_id).first()
    if match is not None:
        return
    raise HTTPException(status_code=404, detail="Entity not found")
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def _entity_to_dict(entity) -> dict: # type: ignore[type-arg]
|
|
777
|
+
return {
|
|
778
|
+
"id": str(entity.id),
|
|
779
|
+
"entity_type": entity.entity_type,
|
|
780
|
+
"value": entity.value,
|
|
781
|
+
"canonical_value": entity.canonical_value,
|
|
782
|
+
"confidence": entity.confidence,
|
|
783
|
+
"context": entity.context,
|
|
784
|
+
"context_snippet": entity.context_snippet,
|
|
785
|
+
"historical_context": entity.historical_context,
|
|
786
|
+
"first_seen": entity.first_seen.isoformat() if entity.first_seen else None,
|
|
787
|
+
"last_seen": entity.last_seen.isoformat() if entity.last_seen else None,
|
|
788
|
+
"investigation_id": str(entity.investigation_id) if entity.investigation_id else None,
|
|
789
|
+
"created_at": entity.created_at.isoformat() if entity.created_at else None,
|
|
790
|
+
"extraction_method": getattr(entity, "extraction_method", None),
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _get_entity_value(entity_id: str) -> Optional[str]:
|
|
795
|
+
"""Look up entity.value by UUID from DB."""
|
|
796
|
+
if not os.getenv("DATABASE_URL"):
|
|
797
|
+
return None
|
|
798
|
+
try:
|
|
799
|
+
from db.session import get_session # noqa: PLC0415
|
|
800
|
+
from db.models import Entity # noqa: PLC0415
|
|
801
|
+
|
|
802
|
+
eid = uuid.UUID(entity_id)
|
|
803
|
+
with get_session() as session:
|
|
804
|
+
entity = session.query(Entity).filter_by(id=eid).first()
|
|
805
|
+
if entity:
|
|
806
|
+
return entity.value
|
|
807
|
+
return None
|
|
808
|
+
except Exception:
|
|
809
|
+
return None
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def _resolve_graph_node_id(graph, entity_value: str) -> Optional[str]:
|
|
813
|
+
"""Resolve graph node by exact value, then by handle@domain prefix."""
|
|
814
|
+
if graph is None:
|
|
815
|
+
return None
|
|
816
|
+
if graph.has_node(entity_value):
|
|
817
|
+
return entity_value
|
|
818
|
+
|
|
819
|
+
prefix = f"{entity_value}@"
|
|
820
|
+
for node_id in graph.nodes:
|
|
821
|
+
if isinstance(node_id, str) and node_id.startswith(prefix):
|
|
822
|
+
return node_id
|
|
823
|
+
return None
|
|
824
|
+
|
|
825
|
+
|
|
826
|
+
def _find_similar_actors(
    profile,
    canonical_value: str,
    entity_type: str,
    threshold: float = 0.82,
    top_k: int = 5,
) -> list[dict]:
    """
    Look up other actors whose writing style resembles ``profile``.

    Delegates matching to ``match_against_profiles``, passing
    ``canonical_value`` as ``exclude_canonical`` so the entity does not match
    itself, then formats at most ``top_k`` matches for the API response.
    Matches keep the order in which ``match_against_profiles`` returns them
    (presumably best-first — confirm against that function).
    """
    from fingerprint.profiler import match_against_profiles
    from db.session import get_session

    with get_session() as session:
        candidates = match_against_profiles(
            profile=profile,
            session=session,
            threshold=threshold,
            exclude_canonical=canonical_value,  # Don't match self
        )

        # Shape each match for the API response.
        formatted: list[dict] = []
        for candidate in candidates[:top_k]:
            # Matches may carry the score under either key.
            raw_score = candidate.get("similarity", candidate.get("score", 0))
            numeric_score = float(raw_score)
            label = candidate.get("canonical_value") or candidate.get("entity_id")
            formatted.append(
                {
                    "canonical_value": label,
                    "entity_type": candidate.get("entity_type", entity_type),
                    "similarity_score": round(numeric_score, 3),
                    "confidence": _score_to_confidence(numeric_score),
                    "matching_features": candidate.get("matching_features", []),
                    "profile_sample_count": candidate.get("sample_count", 0),
                }
            )

    return formatted
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
def _score_to_confidence(score: float) -> str:
|
|
867
|
+
if score >= 0.90:
|
|
868
|
+
return "high"
|
|
869
|
+
if score >= 0.80:
|
|
870
|
+
return "medium"
|
|
871
|
+
return "low"
|