voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
db/queries.py
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common query helpers for the VoidAccess database layer.
|
|
3
|
+
|
|
4
|
+
All functions accept a SQLAlchemy Session as their first argument so callers
|
|
5
|
+
control transaction boundaries. None of these helpers call session.commit()
|
|
6
|
+
— that is the caller's responsibility (or the get_session() context manager's).
|
|
7
|
+
|
|
8
|
+
Where a helper needs an intermediate ID before the transaction is committed,
|
|
9
|
+
it calls session.flush() to write the row without finalising the transaction.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import uuid
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from typing import List, Optional, Tuple
|
|
17
|
+
|
|
18
|
+
from sqlalchemy.orm import Session
|
|
19
|
+
from sqlalchemy import func
|
|
20
|
+
import sqlalchemy as sa
|
|
21
|
+
|
|
22
|
+
from db.models import (
|
|
23
|
+
Entity,
|
|
24
|
+
EntityRelationship,
|
|
25
|
+
Investigation,
|
|
26
|
+
MonitorAlert,
|
|
27
|
+
Page,
|
|
28
|
+
RelationshipType,
|
|
29
|
+
Source,
|
|
30
|
+
SourceStatus,
|
|
31
|
+
SourceType,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def db_health_check(session: Session) -> bool:
    """Probe the database with ``SELECT 1`` and report whether it answered.

    Any exception raised while running the probe is swallowed and reported
    as ``False``; this function never raises.
    """
    try:
        session.execute(sa.text("SELECT 1"))
    except Exception:
        alive = False
    else:
        alive = True
    return alive
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Investigation helpers
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
def get_investigation_by_id_or_run(
    session: Session,
    id_or_run: uuid.UUID,
) -> Optional[Investigation]:
    """Look up an investigation by either of its two identifiers.

    The given UUID is compared against both ``Investigation.id`` and
    ``Investigation.run_id``; the first matching row is returned, or ``None``
    when neither column matches.
    """
    matches_either_key = sa.or_(
        Investigation.id == id_or_run,
        Investigation.run_id == id_or_run,
    )
    return session.query(Investigation).filter(matches_either_key).first()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def count_distinct_pages_for_investigation(
    session: Session,
    investigation_id: uuid.UUID,
) -> int:
    """Return how many distinct ``page_id`` values back this investigation's entities.

    An entity is counted when its ``investigation_id`` column equals
    *investigation_id*, or when an ``InvestigationEntityLink`` row ties it to
    *investigation_id*. A missing/empty aggregate result is reported as ``0``.
    """
    from db.models import InvestigationEntityLink  # noqa: PLC0415

    # IDs of entities attached to this investigation through the link table.
    linked_entity_ids = (
        session.query(InvestigationEntityLink.entity_id)
        .filter(InvestigationEntityLink.investigation_id == investigation_id)
        .subquery()
    )
    owned_or_linked = sa.or_(
        Entity.investigation_id == investigation_id,
        Entity.id.in_(linked_entity_ids),
    )
    distinct_pages = sa.func.count(sa.distinct(Entity.page_id))
    page_count = session.query(distinct_pages).filter(owned_or_linked).scalar()
    return int(page_count) if page_count else 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def create_investigation(
    session: Session,
    query: str,
    refined_query: Optional[str] = None,
    model_used: Optional[str] = None,
    preset: Optional[str] = None,
    summary: Optional[str] = None,
    user_id: Optional[int] = None,
) -> Investigation:
    """Add a new ``Investigation`` to the session and flush it.

    All arguments are passed straight through to the model constructor.
    The row is flushed (not committed), so the transaction stays under the
    caller's control.
    """
    investigation = Investigation(
        query=query,
        refined_query=refined_query,
        model_used=model_used,
        preset=preset,
        summary=summary,
        user_id=user_id,
    )
    session.add(investigation)
    # Flush so database-generated columns are populated before returning.
    session.flush()
    return investigation
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def get_investigation_by_run_id(
    session: Session, run_id: uuid.UUID
) -> Optional[Investigation]:
    """Fetch the first investigation whose ``run_id`` equals *run_id*, else ``None``."""
    by_run_id = session.query(Investigation).filter(Investigation.run_id == run_id)
    return by_run_id.first()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get_recent_investigations(
    session: Session, limit: int = 20
) -> List[Investigation]:
    """List investigations ordered by ``created_at`` descending, capped at *limit* rows."""
    newest_first = session.query(Investigation).order_by(
        Investigation.created_at.desc()
    )
    return newest_first.limit(limit).all()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def update_investigation_summary(
    session: Session, investigation_id: uuid.UUID, summary: str
) -> None:
    """Overwrite the ``summary`` column of the investigation with the given id.

    Issues a bulk UPDATE through the query API; nothing is committed here and
    no error is raised when no row matches.
    """
    target = session.query(Investigation).filter(
        Investigation.id == investigation_id
    )
    target.update({"summary": summary})
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# ---------------------------------------------------------------------------
|
|
140
|
+
# Source helpers
|
|
141
|
+
# ---------------------------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
def get_or_create_source(
    session: Session,
    onion_address: str,
    source_type: str = SourceType.SEARCH_RESULT.value,
) -> Tuple[Source, bool]:
    """
    Fetch the Source for *onion_address*, inserting one when none exists.

    Returns ``(source, created)``; *created* is True only when a new row was
    added. New rows start with status ``SourceStatus.UNKNOWN`` and are
    flushed, not committed, so the caller keeps transaction control.

    The address is matched and stored exactly as given — any normalisation
    (trailing slashes, letter case) must be done by the caller.
    """
    found = (
        session.query(Source)
        .filter(Source.onion_address == onion_address)
        .first()
    )
    if not found:
        fresh = Source(
            onion_address=onion_address,
            source_type=source_type,
            status=SourceStatus.UNKNOWN.value,
        )
        session.add(fresh)
        session.flush()
        return fresh, True
    return found, False
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def update_source_status(
    session: Session, source_id: uuid.UUID, status: str
) -> None:
    """Set a Source's ``status`` and stamp ``last_seen`` with the current UTC time.

    Runs as a bulk UPDATE filtered on the source id; nothing is committed.
    """
    target = session.query(Source).filter(Source.id == source_id)
    changes = {
        "status": status,
        "last_seen": datetime.now(timezone.utc),
    }
    target.update(changes)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def link_source_to_investigation(
    session: Session, investigation: Investigation, source: Source
) -> None:
    """Append *source* to ``investigation.sources`` unless it is already there.

    *session* is accepted for signature parity with the other helpers; it is
    not used by this function.
    """
    attached = investigation.sources
    if source in attached:
        return
    attached.append(source)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# Page helpers
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def create_page(
    session: Session,
    url: str,
    source_id: Optional[uuid.UUID] = None,
    cleaned_text: Optional[str] = None,
    raw_content_hash: Optional[str] = None,
    byte_size: Optional[int] = None,
    language: Optional[str] = None,
    posted_at: Optional[datetime] = None,
) -> Page:
    """Add a new ``Page`` built from the given fields and flush it.

    The row is flushed (not committed) so its id is available to the caller
    while the transaction remains open.
    """
    new_page = Page(
        url=url,
        source_id=source_id,
        cleaned_text=cleaned_text,
        raw_content_hash=raw_content_hash,
        byte_size=byte_size,
        language=language,
        posted_at=posted_at,
    )
    session.add(new_page)
    session.flush()
    return new_page
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def get_page_by_url(session: Session, url: str) -> Optional[Page]:
    """Fetch the first Page whose ``url`` equals *url* exactly, else ``None``."""
    by_url = session.query(Page).filter(Page.url == url)
    return by_url.first()
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def get_page_by_hash(session: Session, content_hash: str) -> Optional[Page]:
    """
    Fetch the first Page whose ``raw_content_hash`` equals *content_hash*.

    Returns ``None`` when no page carries that hash. Per the original note,
    the crawler uses this to avoid re-scraping identical content.
    """
    by_hash = session.query(Page).filter(Page.raw_content_hash == content_hash)
    return by_hash.first()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def get_pages_for_source(
    session: Session, source_id: uuid.UUID, limit: int = 100
) -> List[Page]:
    """List up to *limit* pages of one source, ordered by ``scrape_timestamp`` descending."""
    pages_of_source = session.query(Page).filter(Page.source_id == source_id)
    newest_first = pages_of_source.order_by(Page.scrape_timestamp.desc())
    return newest_first.limit(limit).all()
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
# Entity helpers
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
def create_entity(
    session: Session,
    page_id: uuid.UUID,
    entity_type: str,
    value: str,
    confidence: float = 1.0,
    context: Optional[str] = None,
    investigation_id: Optional[uuid.UUID] = None,
) -> Entity:
    """Add a new ``Entity`` and flush it so its id is populated.

    *context* is stored in the model's ``context_snippet`` column. This
    helper does not set ``canonical_value``.
    """
    new_entity = Entity(
        page_id=page_id,
        investigation_id=investigation_id,
        entity_type=entity_type,
        value=value,
        confidence=confidence,
        context_snippet=context,
    )
    session.add(new_entity)
    session.flush()
    return new_entity
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _link_entity_to_investigation(
    session: Session, entity_id: uuid.UUID, investigation_id: uuid.UUID
) -> None:
    """Ensure an ``InvestigationEntityLink`` exists for (entity, investigation).

    Does nothing when a matching link is already visible to a query or is
    already pending in the session; otherwise adds a new link to the session
    without flushing.
    """
    from db.models import InvestigationEntityLink

    # First look for a link the database query can already see.
    persisted = (
        session.query(InvestigationEntityLink)
        .filter_by(entity_id=entity_id, investigation_id=investigation_id)
        .first()
    )
    if persisted:
        return

    # Then look at objects added to the session but not yet flushed. Per the
    # original note, the session runs with autoflush=False, so such objects
    # are invisible to the query above and a duplicate would only surface as
    # a unique-constraint violation at flush time.
    already_pending = any(
        isinstance(pending, InvestigationEntityLink)
        and pending.entity_id == entity_id
        and pending.investigation_id == investigation_id
        for pending in session.new
    )
    if already_pending:
        return

    session.add(
        InvestigationEntityLink(
            entity_id=entity_id,
            investigation_id=investigation_id,
        )
    )
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def upsert_entity_canonical(
    session: Session,
    investigation_id: uuid.UUID,
    entity_type: str,
    entity_value: str,
    confidence: float,
    source_page_id: Optional[uuid.UUID] = None,
    context_snippet: str = "",
    extraction_method: Optional[str] = None,
) -> tuple[Entity, bool]:
    """
    Insert or update an entity using canonical deduplication.

    Dedup strategy:
    1. Compute the canonical form of ``entity_value`` for ``entity_type``.
    2. Look for an entity with the same (entity_type, canonical_value) in
       ANY investigation (global dedup), preferring the highest confidence.
    3. If found: raise its confidence to the new value when that is higher
       (or when none is stored), keep the longer context snippet, fill in a
       missing extraction method, refresh ``last_seen`` / ``last_seen_at``,
       and link it to this investigation when it is owned by another one.
    4. If not found: insert a new entity, flush, and link it.

    Args:
        session: Open session; this function flushes but never commits.
        investigation_id: Investigation the extraction belongs to.
        entity_type: Entity type label used for matching.
        entity_value: Raw extracted value.
        confidence: Confidence of this extraction.
        source_page_id: Page the value was extracted from, if known.
        context_snippet: Surrounding text for the value.
        extraction_method: Name of the extractor that produced the value.

    Returns:
        ``(entity, was_created)``.
    """
    from extractor.normalizer import canonicalize_entity_value

    canonical = canonicalize_entity_value(entity_type, entity_value)
    # One timestamp for the whole upsert so paired columns (first_seen /
    # last_seen, last_seen / last_seen_at) carry exactly the same instant.
    now = datetime.now(timezone.utc)

    # Look for existing entity with same canonical form (any investigation)
    existing = (
        session.query(Entity)
        .filter(
            Entity.entity_type == entity_type,
            Entity.canonical_value == canonical,
        )
        .order_by(Entity.confidence.desc())  # Prefer highest confidence existing
        .first()
    )

    if existing:
        # A stored NULL confidence cannot be ordered against a float; treat
        # it as "no confidence yet" and adopt the new value.
        if existing.confidence is None or confidence > existing.confidence:
            existing.confidence = confidence
        # Keep whichever context snippet is longer.
        if context_snippet and len(context_snippet) > len(existing.context_snippet or ""):
            existing.context_snippet = context_snippet
        if extraction_method and not existing.extraction_method:
            existing.extraction_method = extraction_method
        existing.last_seen = now
        # last_seen_at is the freshness-tracking column.
        existing.last_seen_at = now
        # Entities owned by another investigation are attached via the link table.
        if existing.investigation_id != investigation_id:
            _link_entity_to_investigation(session, existing.id, investigation_id)
        return existing, False

    entity = Entity(
        investigation_id=investigation_id,
        entity_type=entity_type,
        value=entity_value,
        canonical_value=canonical,
        confidence=confidence,
        context_snippet=context_snippet,
        page_id=source_page_id,
        extraction_method=extraction_method,
        first_seen=now,
        last_seen=now,
    )
    # NOTE(review): last_seen_at is not set on insert, only on update —
    # confirm the model supplies a default if freshness tracking needs it.
    session.add(entity)
    session.flush()  # populate entity.id before creating the link
    _link_entity_to_investigation(session, entity.id, investigation_id)
    return entity, True
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def cross_reference_with_seeds(session: Session, investigation_id: uuid.UUID) -> int:
    """
    Match this investigation's entities against entities of seed investigations.

    For every entity owned by a non-seed investigation with id
    *investigation_id*, look for a seed entity with the same
    (entity_type, canonical_value). On a match:

    * copy the seed's ``context_snippet`` into ``historical_context`` when the
      entity has none yet;
    * pull ``first_seen`` back to the earlier of the two timestamps;
    * add a ``LIKELY_SAME_ACTOR`` relationship (confidence 0.90) between the
      two entities unless one already exists in either direction.

    The session is flushed before returning; nothing is committed.

    Returns:
        Number of investigation entities that matched a seed entity
        (counted even when the relationship already existed).
    """
    inv_entities = (
        session.query(Entity)
        .join(Investigation, Entity.investigation_id == Investigation.id)
        .filter(
            Entity.investigation_id == investigation_id,
            Investigation.is_seed == False,  # noqa: E712 - SQL comparison
        )
        .all()
    )
    if not inv_entities:
        return 0

    canonical_keys = [
        (ent.entity_type, ent.canonical_value)
        for ent in inv_entities
        if ent.canonical_value
    ]
    if not canonical_keys:
        return 0

    # De-duplicate (order-preserving) so the IN lists carry each value once.
    entity_types = list(dict.fromkeys(etype for etype, _ in canonical_keys))
    canonical_values = list(dict.fromkeys(cvalue for _, cvalue in canonical_keys))

    # The two IN filters are independent, so this may over-fetch seeds whose
    # (type, value) pair is not an actual key; the tuple-keyed map below only
    # ever gets looked up with real keys.
    seed_entities = (
        session.query(Entity)
        .join(Investigation, Entity.investigation_id == Investigation.id)
        .filter(
            Entity.entity_type.in_(entity_types),
            Entity.canonical_value.in_(canonical_values),
            Investigation.is_seed == True,  # noqa: E712 - SQL comparison
        )
        .all()
    )

    # First seed seen for a key wins.
    seed_map: dict[tuple, Entity] = {}
    for seed in seed_entities:
        seed_map.setdefault((seed.entity_type, seed.canonical_value), seed)

    entity_ids = [ent.id for ent in inv_entities]
    seed_ids = [seed.id for seed in seed_entities]

    # Relationships that already connect one of our entities to one of the seeds.
    existing_rels = (
        session.query(EntityRelationship)
        .filter(
            sa.or_(
                EntityRelationship.entity_a_id.in_(entity_ids),
                EntityRelationship.entity_b_id.in_(entity_ids),
            ),
            sa.or_(
                EntityRelationship.entity_a_id.in_(seed_ids),
                EntityRelationship.entity_b_id.in_(seed_ids),
            ),
        )
        .all()
    )

    # Store both orientations so the lookup is direction-agnostic.
    existing_rel_set: set[tuple] = set()
    for rel in existing_rels:
        existing_rel_set.add((rel.entity_a_id, rel.entity_b_id))
        existing_rel_set.add((rel.entity_b_id, rel.entity_a_id))

    matches = 0
    now = datetime.now(timezone.utc)

    for ent in inv_entities:
        seed_match = seed_map.get((ent.entity_type, ent.canonical_value))
        if not seed_match:
            continue

        if not ent.historical_context:
            ent.historical_context = seed_match.context_snippet
        # NOTE(review): min() raises TypeError if one timestamp is naive and
        # the other timezone-aware — confirm the columns are tz-aware.
        ent.first_seen = min(
            ent.first_seen or now,
            seed_match.first_seen or now,
        )

        rel_key = (ent.id, seed_match.id)
        if rel_key not in existing_rel_set:
            session.add(
                EntityRelationship(
                    entity_a_id=ent.id,
                    entity_b_id=seed_match.id,
                    relationship_type=RelationshipType.LIKELY_SAME_ACTOR.value,
                    source_page_id=ent.page_id,
                    confidence=0.90,
                )
            )
            existing_rel_set.add(rel_key)
            existing_rel_set.add((seed_match.id, ent.id))
        matches += 1

    session.flush()
    return matches
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def get_entities_by_type(
    session: Session,
    entity_type: str,
    limit: int = 200,
) -> List[Entity]:
    """List at most *limit* entities of *entity_type*, ordered by ``created_at`` descending."""
    of_type = session.query(Entity).filter(Entity.entity_type == entity_type)
    newest_first = of_type.order_by(Entity.created_at.desc())
    return newest_first.limit(limit).all()
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def get_entities_by_value(
    session: Session,
    value: str,
) -> List[Entity]:
    """List every Entity whose raw ``value`` column equals *value* (no limit applied)."""
    exact_match = session.query(Entity).filter_by(value=value)
    return exact_match.all()
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def get_entities_for_investigation(
    session: Session,
    investigation_id: uuid.UUID,
    entity_type: Optional[str] = None,
) -> List[Entity]:
    """List entities whose ``investigation_id`` column equals *investigation_id*.

    When *entity_type* is truthy the result is further restricted to that
    type. Rows are ordered by ``created_at`` descending. Only the entity's
    own ``investigation_id`` column is consulted; the
    ``InvestigationEntityLink`` table is not queried here.
    """
    criteria = {"investigation_id": investigation_id}
    if entity_type:
        criteria["entity_type"] = entity_type
    matching = session.query(Entity).filter_by(**criteria)
    return matching.order_by(Entity.created_at.desc()).all()
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
# ---------------------------------------------------------------------------
|
|
509
|
+
# EntityRelationship helpers
|
|
510
|
+
# ---------------------------------------------------------------------------
|
|
511
|
+
|
|
512
|
+
def create_entity_relationship(
    session: Session,
    entity_a_id: uuid.UUID,
    entity_b_id: uuid.UUID,
    relationship_type: str,
    source_page_id: Optional[uuid.UUID] = None,
    confidence: float = 1.0,
) -> EntityRelationship:
    """Add an ``EntityRelationship`` edge between two entities and flush it.

    No duplicate check is performed; the edge is flushed (not committed) so
    its id is available to the caller.
    """
    edge = EntityRelationship(
        entity_a_id=entity_a_id,
        entity_b_id=entity_b_id,
        relationship_type=relationship_type,
        source_page_id=source_page_id,
        confidence=confidence,
    )
    session.add(edge)
    session.flush()
    return edge
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def get_relationships_for_entity(
    session: Session,
    entity_id: uuid.UUID,
) -> List[EntityRelationship]:
    """List every edge that has *entity_id* as ``entity_a_id`` or ``entity_b_id``."""
    touches_entity = sa.or_(
        EntityRelationship.entity_a_id == entity_id,
        EntityRelationship.entity_b_id == entity_id,
    )
    return session.query(EntityRelationship).filter(touches_entity).all()
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _collect_entity_neighbors(
    session: Session,
    entity_id: uuid.UUID,
    investigation_id: Optional[uuid.UUID],
) -> List[dict]:
    """Run the neighbor query on *session* and fold rows into one dict per neighbor."""
    # NOTE(review): Entity is both a selected entity and an explicit, unaliased
    # join target below — verify the SQL this generates on the target backend.
    query = (
        session.query(
            Entity.id.label("neighbor_id"),
            Entity.entity_type,
            Entity.value,
            EntityRelationship.relationship_type,
            EntityRelationship.confidence,
            EntityRelationship.source_page_id,
            EntityRelationship.entity_a_id,
        )
        .join(
            EntityRelationship,
            (EntityRelationship.entity_a_id == entity_id)
            | (EntityRelationship.entity_b_id == entity_id),
        )
        .join(
            Entity,
            sa.or_(
                Entity.id == EntityRelationship.entity_a_id,
                Entity.id == EntityRelationship.entity_b_id,
            ),
        )
        # Drop the row where the "neighbor" is the entity itself.
        .filter(Entity.id != entity_id)
    )

    if investigation_id is not None:
        query = query.filter(EntityRelationship.investigation_id == investigation_id)

    # Several edges may lead to the same neighbor; keep the most confident one.
    neighbors: dict[str, dict] = {}
    for row in query.all():
        key = str(row.neighbor_id)
        best = neighbors.get(key)
        if best is None or row.confidence > best["confidence"]:
            neighbors[key] = {
                "neighbor_id": key,
                "entity_type": row.entity_type,
                "value": row.value,
                "relationship_type": row.relationship_type,
                "confidence": row.confidence,
                "source_page_id": str(row.source_page_id) if row.source_page_id else None,
            }

    return list(neighbors.values())


def get_entity_neighbors_db(
    entity_id: uuid.UUID,
    investigation_id: Optional[uuid.UUID] = None,
    session: Optional[Session] = None,
) -> List[dict]:
    """
    Return direct neighbors of an entity with relationship metadata.

    Neighbors are resolved with a SQL query over ``Entity`` and
    ``EntityRelationship``; when several relationships lead to the same
    neighbor, only the one with the highest confidence is reported.

    Args:
        entity_id: UUID of the entity to find neighbors for
        investigation_id: Optional scope; when given, only relationships whose
            ``investigation_id`` matches are considered
        session: Optional existing DB session. When omitted, a session is
            obtained from ``db.session.get_session()`` for the duration of
            the call.

    Returns:
        List of dicts with: neighbor_id, entity_type, value, relationship_type,
        confidence, source_page_id
    """
    if session is not None:
        # Caller-owned session: never closed or committed here.
        return _collect_entity_neighbors(session, entity_id, investigation_id)

    from db.session import get_session as _get_session

    # Enter AND exit the context manager so its own cleanup runs, instead of
    # calling __enter__() by hand and only closing the session afterwards.
    with _get_session() as owned_session:
        return _collect_entity_neighbors(owned_session, entity_id, investigation_id)
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def get_entity_appearances(
    session: Session,
    entity_id: uuid.UUID,
    user_id: int,
) -> List[dict]:
    """
    List the investigations of *user_id* in which the given entity appears.

    Candidate investigations are the entity's own ``investigation_id`` (when
    set) plus every investigation referenced by an ``InvestigationEntityLink``
    row for the entity. Only candidates whose ``user_id`` equals *user_id*
    are reported.

    Returns a list of ``{investigation_id, run_id, query, created_at}`` dicts
    sorted by the ISO-formatted ``created_at`` string, newest first.
    """
    from db.models import InvestigationEntityLink  # noqa: PLC0415

    candidate_ids = []
    owner = session.query(Entity).filter_by(id=entity_id).first()
    if owner and owner.investigation_id:
        candidate_ids.append(owner.investigation_id)

    entity_links = (
        session.query(InvestigationEntityLink)
        .filter_by(entity_id=entity_id)
        .all()
    )
    for entity_link in entity_links:
        if entity_link.investigation_id not in candidate_ids:
            candidate_ids.append(entity_link.investigation_id)

    appearances: dict[str, dict] = {}
    if candidate_ids:
        # Ownership filter: investigations of other users are dropped here.
        owned = (
            session.query(Investigation)
            .filter(Investigation.id.in_(candidate_ids))
            .filter(Investigation.user_id == user_id)
            .all()
        )
        by_id = {inv.id: inv for inv in owned}
        for candidate_id in candidate_ids:
            inv = by_id.get(candidate_id)
            if not inv:
                continue
            created = inv.created_at.isoformat() if inv.created_at else None
            appearances[str(inv.id)] = {
                "investigation_id": str(inv.id),
                "run_id": str(inv.run_id),
                "query": inv.query,
                "created_at": created,
            }

    # Entries without a timestamp sort as the empty string, i.e. last.
    return sorted(
        appearances.values(),
        key=lambda item: item.get("created_at") or "",
        reverse=True,
    )
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
# ---------------------------------------------------------------------------
|
|
677
|
+
# Monitor alerts
|
|
678
|
+
# ---------------------------------------------------------------------------
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def create_monitor_alert(
    session: Session,
    monitor_name: str,
    change_type: str,
    summary: str,
    diff_data: Optional[dict] = None,
    severity: str = "info",
    entity_count_delta: int = 0,
    delivery_channels: Optional[List[str]] = None,
) -> MonitorAlert:
    """
    Store a new ``MonitorAlert`` stamped with the current UTC time.

    Missing *diff_data* / *delivery_channels* are stored as an empty dict /
    list. ``delivered`` is set to True exactly when *delivery_channels* is
    non-empty. The row is flushed and refreshed, not committed.
    """
    channels = delivery_channels or []
    new_alert = MonitorAlert(
        monitor_name=monitor_name,
        triggered_at=datetime.now(timezone.utc),
        change_type=change_type,
        summary=summary,
        diff_data=diff_data or {},
        severity=severity,
        entity_count_delta=entity_count_delta,
        delivered=bool(delivery_channels),
        delivery_channels=channels,
    )
    session.add(new_alert)
    session.flush()
    # Reload the row so database-populated columns are present on the object.
    session.refresh(new_alert)
    return new_alert
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def get_alerts_for_monitor(
    session: Session,
    monitor_name: str,
    limit: int = 20,
    include_acknowledged: bool = True,
) -> List[MonitorAlert]:
    """
    Fetch at most ``limit`` alerts belonging to ``monitor_name``.

    Results are ordered by ``triggered_at`` descending (newest first). When
    ``include_acknowledged`` is False, only alerts whose ``acknowledged``
    flag is false are returned.
    """
    criteria = [MonitorAlert.monitor_name == monitor_name]
    if not include_acknowledged:
        criteria.append(MonitorAlert.acknowledged.is_(False))

    newest_first = MonitorAlert.triggered_at.desc()
    return (
        session.query(MonitorAlert)
        .filter(*criteria)
        .order_by(newest_first)
        .limit(limit)
        .all()
    )
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def get_unacknowledged_alert_count(session: Session) -> int:
    """
    Count every alert, across all monitors, that is not yet acknowledged.

    Used for the nav badge. Always returns an int; a missing/None scalar
    result is reported as 0.
    """
    still_pending = MonitorAlert.acknowledged.is_(False)
    total = session.query(func.count(MonitorAlert.id)).filter(still_pending).scalar()
    if not total:
        return 0
    return int(total)
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def get_alert_counts_by_monitor(session: Session) -> dict[str, int]:
    """
    Return ``{monitor_name: unacknowledged_count}`` for monitors with pending alerts.

    Only monitors that have at least one unacknowledged alert appear in the
    result (the query groups the unacknowledged rows by monitor name).
    Used to show per-monitor alert badges in the table.
    """
    rows = (
        session.query(
            MonitorAlert.monitor_name,
            func.count(MonitorAlert.id).label("count"),
        )
        .filter(MonitorAlert.acknowledged.is_(False))
        .group_by(MonitorAlert.monitor_name)
        .all()
    )
    # Unpack each row positionally instead of reading ``row.count``: result
    # rows are sequence-like, and on SQLAlchemy 1.4+ the attribute ``count``
    # resolves to the inherited ``Sequence.count`` method rather than the
    # labelled column, which makes ``int(row.count)`` raise TypeError.
    return {name: int(pending) for name, pending in rows}
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def acknowledge_alerts(
    session: Session,
    monitor_name: str,
    alert_ids: Optional[List[int]] = None,
) -> int:
    """
    Mark unacknowledged alerts of ``monitor_name`` as acknowledged.

    Args:
        session: Active database session; it is flushed, not committed.
        monitor_name: Monitor whose alerts are to be acknowledged.
        alert_ids: Restrict the update to these alert ids. ``None`` means
            "all unacknowledged alerts for the monitor". An empty
            collection means "no alerts" and acknowledges nothing.

    Returns:
        Number of alerts that were acknowledged.
    """
    # An explicit empty selection must not fall through to the
    # "acknowledge everything" path reserved for ``None``.
    if alert_ids is not None and not alert_ids:
        return 0

    query = (
        session.query(MonitorAlert)
        .filter(MonitorAlert.monitor_name == monitor_name)
        .filter(MonitorAlert.acknowledged.is_(False))
    )
    if alert_ids is not None:
        query = query.filter(MonitorAlert.id.in_(alert_ids))

    now = datetime.now(timezone.utc)
    # Bulk UPDATE; synchronize_session=False skips reconciling MonitorAlert
    # objects already loaded in this session with the new values.
    count = query.update(
        {"acknowledged": True, "acknowledged_at": now},
        synchronize_session=False,
    )
    session.flush()
    return int(count)
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
# ---------------------------------------------------------------------------
|
|
784
|
+
# Monitor stats
|
|
785
|
+
# ---------------------------------------------------------------------------
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def get_monitor_stats(session: Session, monitor_name: str) -> dict:
    """
    Summarise a monitor from its stored alerts.

    Returns a dict with:
        last_run_at: ISO timestamp of the most recent alert, or None
        last_run_status: change_type of the most recent alert, or None
        total_runs: number of alerts recorded for this monitor
        last_entity_count: entity_count_delta of the most recent alert
            (0 when there is no alert or the value is falsy)
    """
    for_this_monitor = MonitorAlert.monitor_name == monitor_name

    newest = (
        session.query(MonitorAlert)
        .filter(for_this_monitor)
        .order_by(MonitorAlert.triggered_at.desc())
        .limit(1)
        .first()
    )
    alert_total = (
        session.query(func.count(MonitorAlert.id)).filter(for_this_monitor).scalar()
        or 0
    )

    last_run_at = None
    last_run_status = None
    if newest:
        last_run_status = newest.change_type
        if newest.triggered_at:
            last_run_at = newest.triggered_at.isoformat()

    # getattr with a default keeps this safe when no alert exists (newest is None).
    last_entity_count = getattr(newest, "entity_count_delta", 0) or 0

    return {
        "last_run_at": last_run_at,
        "last_run_status": last_run_status,
        "total_runs": int(alert_total),
        "last_entity_count": last_entity_count,
    }
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
def update_entity_source_count(
    session: Session,
    entity_id: uuid.UUID,
    source_name: str,
) -> None:
    """
    Record ``source_name`` as a corroborating source of an entity.

    ``corroborating_sources`` is read as a JSON-encoded list (an empty value
    is treated as ``[]``). If ``source_name`` is not already listed it is
    appended, the list is written back as JSON, and ``source_count`` is set
    to the new list length. Does nothing when the entity is not found.
    """
    import json

    record = session.query(Entity).filter_by(id=entity_id).first()
    if not record:
        return

    raw_sources = record.corroborating_sources or "[]"
    sources = json.loads(raw_sources)

    already_listed = source_name in sources
    if not already_listed:
        sources.append(source_name)
        record.source_count = len(sources)
        record.corroborating_sources = json.dumps(sources)

    session.flush()
|