voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
db/queries.py ADDED
@@ -0,0 +1,841 @@
1
+ """
2
+ Common query helpers for the VoidAccess database layer.
3
+
4
+ All functions accept a SQLAlchemy Session as their first argument so callers
5
+ control transaction boundaries. None of these helpers call session.commit()
6
+ — that is the caller's responsibility (or the get_session() context manager's).
7
+
8
+ Where a helper needs an intermediate ID before the transaction is committed,
9
+ it calls session.flush() to write the row without finalising the transaction.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from typing import List, Optional, Tuple
17
+
18
+ from sqlalchemy.orm import Session
19
+ from sqlalchemy import func
20
+ import sqlalchemy as sa
21
+
22
+ from db.models import (
23
+ Entity,
24
+ EntityRelationship,
25
+ Investigation,
26
+ MonitorAlert,
27
+ Page,
28
+ RelationshipType,
29
+ Source,
30
+ SourceStatus,
31
+ SourceType,
32
+ )
33
+
34
+
35
def db_health_check(session: Session) -> bool:
    """Probe the database with a trivial ``SELECT 1`` statement.

    Args:
        session: Session used to issue the heartbeat query.

    Returns:
        True if the statement executed without raising; False if any
        exception was raised while issuing it.
    """
    try:
        session.execute(sa.text("SELECT 1"))
    except Exception:
        # Every failure is reported as "unhealthy" instead of being propagated.
        return False
    else:
        return True
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Investigation helpers
46
+ # ---------------------------------------------------------------------------
47
+
48
def get_investigation_by_id_or_run(
    session: Session,
    id_or_run: uuid.UUID,
) -> Optional[Investigation]:
    """Look up an investigation by primary key or by ``run_id``.

    Args:
        session: Session to query with.
        id_or_run: UUID compared against both ``Investigation.id`` and
            ``Investigation.run_id``.

    Returns:
        The first row matching either column, or None when nothing matches.
    """
    matches_either_column = sa.or_(
        Investigation.id == id_or_run,
        Investigation.run_id == id_or_run,
    )
    lookup = session.query(Investigation).filter(matches_either_column)
    return lookup.first()
60
+
61
+
62
def count_distinct_pages_for_investigation(
    session: Session,
    investigation_id: uuid.UUID,
) -> int:
    """Count the distinct ``page_id`` values of an investigation's entities.

    An entity is counted when its ``investigation_id`` equals
    *investigation_id* or when an ``InvestigationEntityLink`` row ties it to
    that investigation.

    Args:
        session: Session to query with.
        investigation_id: Investigation whose contributing pages are counted.

    Returns:
        The number of distinct page ids, or 0 when the query yields nothing.
    """
    from db.models import InvestigationEntityLink  # noqa: PLC0415

    # Entity ids attached to this investigation through the junction table.
    linked_entity_ids = (
        session.query(InvestigationEntityLink.entity_id)
        .filter(InvestigationEntityLink.investigation_id == investigation_id)
        .subquery()
    )
    owned_or_linked = sa.or_(
        Entity.investigation_id == investigation_id,
        Entity.id.in_(linked_entity_ids),
    )
    distinct_page_count = sa.func.count(sa.distinct(Entity.page_id))
    total = session.query(distinct_page_count).filter(owned_or_linked).scalar()
    return int(total) if total else 0
86
+
87
+
88
def create_investigation(
    session: Session,
    query: str,
    refined_query: Optional[str] = None,
    model_used: Optional[str] = None,
    preset: Optional[str] = None,
    summary: Optional[str] = None,
    user_id: Optional[int] = None,
) -> Investigation:
    """Add a new Investigation row and flush it.

    The row is flushed, not committed, so the caller keeps control of the
    transaction.

    Args:
        session: Session the row is added to.
        query: Stored in the ``query`` column.
        refined_query: Stored in the ``refined_query`` column.
        model_used: Stored in the ``model_used`` column.
        preset: Stored in the ``preset`` column.
        summary: Stored in the ``summary`` column.
        user_id: Stored in the ``user_id`` column.

    Returns:
        The flushed Investigation instance.
    """
    column_values = {
        "query": query,
        "refined_query": refined_query,
        "model_used": model_used,
        "preset": preset,
        "summary": summary,
        "user_id": user_id,
    }
    investigation = Investigation(**column_values)
    session.add(investigation)
    session.flush()
    return investigation
109
+
110
+
111
def get_investigation_by_run_id(
    session: Session, run_id: uuid.UUID
) -> Optional[Investigation]:
    """Fetch the first Investigation whose ``run_id`` equals *run_id*.

    Returns None when no row matches.
    """
    by_run_id = session.query(Investigation).filter(Investigation.run_id == run_id)
    return by_run_id.first()
116
+
117
+
118
def get_recent_investigations(
    session: Session, limit: int = 20
) -> List[Investigation]:
    """List investigations ordered by ``created_at`` descending.

    Args:
        session: Session to query with.
        limit: Maximum number of rows returned.

    Returns:
        At most *limit* Investigation rows, newest first.
    """
    newest_first = session.query(Investigation).order_by(
        Investigation.created_at.desc()
    )
    return newest_first.limit(limit).all()
128
+
129
+
130
def update_investigation_summary(
    session: Session, investigation_id: uuid.UUID, summary: str
) -> None:
    """Overwrite the ``summary`` column of one investigation.

    Issues a bulk UPDATE filtered on the primary key. Nothing is committed
    here.
    """
    target_rows = session.query(Investigation).filter(
        Investigation.id == investigation_id
    )
    target_rows.update({"summary": summary})
137
+
138
+
139
+ # ---------------------------------------------------------------------------
140
+ # Source helpers
141
+ # ---------------------------------------------------------------------------
142
+
143
def get_or_create_source(
    session: Session,
    onion_address: str,
    source_type: str = SourceType.SEARCH_RESULT.value,
) -> Tuple[Source, bool]:
    """Fetch the Source for *onion_address*, inserting one if it is missing.

    The address is matched and stored exactly as given; this function does
    no normalisation. New rows are flushed, not committed, and start with
    status ``SourceStatus.UNKNOWN``.

    Args:
        session: Session to query and add to.
        onion_address: Address used both for lookup and for the new row.
        source_type: Source type assigned only when a row is created.

    Returns:
        ``(source, created)`` where *created* is True if a row was inserted.
    """
    found = (
        session.query(Source)
        .filter(Source.onion_address == onion_address)
        .first()
    )
    if not found:
        fresh = Source(
            onion_address=onion_address,
            source_type=source_type,
            status=SourceStatus.UNKNOWN.value,
        )
        session.add(fresh)
        session.flush()
        return fresh, True
    return found, False
167
+
168
+
169
def update_source_status(
    session: Session, source_id: uuid.UUID, status: str
) -> None:
    """Set a Source's ``status`` and stamp ``last_seen`` with the current UTC time.

    Issues a bulk UPDATE filtered on the primary key. Nothing is committed
    here.
    """
    changes = {
        "status": status,
        "last_seen": datetime.now(timezone.utc),
    }
    session.query(Source).filter(Source.id == source_id).update(changes)
179
+
180
+
181
def link_source_to_investigation(
    session: Session, investigation: Investigation, source: Source
) -> None:
    """Append *source* to ``investigation.sources`` unless it is already there.

    The *session* argument is accepted but not used by this helper.
    """
    attached_sources = investigation.sources
    if source in attached_sources:
        return
    attached_sources.append(source)
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Page helpers
191
+ # ---------------------------------------------------------------------------
192
+
193
def create_page(
    session: Session,
    url: str,
    source_id: Optional[uuid.UUID] = None,
    cleaned_text: Optional[str] = None,
    raw_content_hash: Optional[str] = None,
    byte_size: Optional[int] = None,
    language: Optional[str] = None,
    posted_at: Optional[datetime] = None,
) -> Page:
    """Add a new Page row and flush it.

    Each argument after *session* is stored in the Page column of the same
    name. The row is flushed, not committed.

    Returns:
        The flushed Page instance.
    """
    column_values = {
        "url": url,
        "source_id": source_id,
        "cleaned_text": cleaned_text,
        "raw_content_hash": raw_content_hash,
        "byte_size": byte_size,
        "language": language,
        "posted_at": posted_at,
    }
    new_page = Page(**column_values)
    session.add(new_page)
    session.flush()
    return new_page
216
+
217
+
218
def get_page_by_url(session: Session, url: str) -> Optional[Page]:
    """Fetch the first Page whose ``url`` equals *url* exactly, or None."""
    exact_url = session.query(Page).filter(Page.url == url)
    return exact_url.first()
221
+
222
+
223
def get_page_by_hash(session: Session, content_hash: str) -> Optional[Page]:
    """Fetch the first Page whose ``raw_content_hash`` equals *content_hash*.

    Returns None when no page carries that hash. The original docstring
    notes that the crawler uses this to skip re-scraping identical content.
    """
    same_hash = session.query(Page).filter(Page.raw_content_hash == content_hash)
    return same_hash.first()
229
+
230
+
231
def get_pages_for_source(
    session: Session, source_id: uuid.UUID, limit: int = 100
) -> List[Page]:
    """List pages of one source ordered by ``scrape_timestamp`` descending.

    Args:
        session: Session to query with.
        source_id: Value matched against ``Page.source_id``.
        limit: Maximum number of rows returned.

    Returns:
        At most *limit* Page rows, most recently scraped first.
    """
    pages_of_source = session.query(Page).filter(Page.source_id == source_id)
    newest_first = pages_of_source.order_by(Page.scrape_timestamp.desc())
    return newest_first.limit(limit).all()
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # Entity helpers
246
+ # ---------------------------------------------------------------------------
247
+
248
def create_entity(
    session: Session,
    page_id: uuid.UUID,
    entity_type: str,
    value: str,
    confidence: float = 1.0,
    context: Optional[str] = None,
    investigation_id: Optional[uuid.UUID] = None,
) -> Entity:
    """Add a new Entity row and flush it.

    *context* is stored in the ``context_snippet`` column; the other
    arguments map to the columns of the same name. ``canonical_value`` is
    not set by this helper. The row is flushed, not committed.

    Returns:
        The flushed Entity instance.
    """
    column_values = {
        "page_id": page_id,
        "investigation_id": investigation_id,
        "entity_type": entity_type,
        "value": value,
        "confidence": confidence,
        "context_snippet": context,
    }
    new_entity = Entity(**column_values)
    session.add(new_entity)
    session.flush()
    return new_entity
269
+
270
+
271
def _link_entity_to_investigation(
    session: Session, entity_id: uuid.UUID, investigation_id: uuid.UUID
) -> None:
    """Ensure an InvestigationEntityLink exists for the given pair.

    Does nothing if a matching link is already returned by a query or is
    already pending in ``session.new``; otherwise adds a new link to the
    session without flushing it.
    """
    from db.models import InvestigationEntityLink

    # First look for a link that the database query can already see.
    stored_link = (
        session.query(InvestigationEntityLink)
        .filter(
            InvestigationEntityLink.entity_id == entity_id,
            InvestigationEntityLink.investigation_id == investigation_id,
        )
        .first()
    )
    if stored_link:
        return

    # Then look at objects added to the session but not yet flushed. The
    # original author notes the session runs with autoflush=False, so such
    # objects are missed by the query above and a duplicate would raise a
    # UNIQUE violation when the batch is flushed.
    pending_duplicate = any(
        isinstance(candidate, InvestigationEntityLink)
        and candidate.entity_id == entity_id
        and candidate.investigation_id == investigation_id
        for candidate in session.new
    )
    if pending_duplicate:
        return

    session.add(
        InvestigationEntityLink(
            entity_id=entity_id,
            investigation_id=investigation_id,
        )
    )
299
+
300
+
301
def upsert_entity_canonical(
    session: Session,
    investigation_id: uuid.UUID,
    entity_type: str,
    entity_value: str,
    confidence: float,
    source_page_id: Optional[uuid.UUID] = None,
    context_snippet: str = "",
    extraction_method: Optional[str] = None,
) -> tuple[Entity, bool]:
    """Insert or update an entity using canonical deduplication.

    Dedup strategy:
        1. Compute the canonical value with ``canonicalize_entity_value``.
        2. Look for any entity with the same ``(entity_type, canonical_value)``
           regardless of investigation (global dedup), preferring the row
           with the highest confidence.
        3. If found: raise its confidence to the new value when higher, keep
           the longer context snippet, fill ``extraction_method`` if empty,
           refresh ``last_seen`` / ``last_seen_at``, and link it to this
           investigation when it is owned by a different one.
        4. If not found: insert a new entity owned by this investigation,
           flush it, and link it.

    Args:
        session: Session to query and add to; nothing is committed here.
        investigation_id: Investigation the entity is being recorded for.
        entity_type: Entity type, part of the dedup key.
        entity_value: Raw value as extracted.
        confidence: Confidence of this extraction.
        source_page_id: Stored as ``page_id`` on newly created entities only.
        context_snippet: Surrounding text for this extraction.
        extraction_method: Label for how the value was extracted.

    Returns:
        ``(entity, was_created)``.
    """
    from extractor.normalizer import canonicalize_entity_value

    canonical = canonicalize_entity_value(entity_type, entity_value)
    # One timestamp for the whole call so every field written below agrees.
    now = datetime.now(timezone.utc)

    # Look for an existing entity with the same canonical form (any investigation).
    existing = (
        session.query(Entity)
        .filter(
            Entity.entity_type == entity_type,
            Entity.canonical_value == canonical,
        )
        .order_by(Entity.confidence.desc())  # prefer highest-confidence row
        .first()
    )

    if existing is None:
        entity = Entity(
            investigation_id=investigation_id,
            entity_type=entity_type,
            value=entity_value,
            canonical_value=canonical,
            confidence=confidence,
            context_snippet=context_snippet,
            page_id=source_page_id,
            extraction_method=extraction_method,
            first_seen=now,
            last_seen=now,
        )
        session.add(entity)
        session.flush()  # populate entity.id before creating the link
        _link_entity_to_investigation(session, entity.id, investigation_id)
        return entity, True

    # A stored NULL confidence would make the ">" comparison raise TypeError,
    # so treat it as "lower than anything".
    if existing.confidence is None or confidence > existing.confidence:
        existing.confidence = confidence
    # Keep whichever context snippet is longer.
    if context_snippet and len(context_snippet) > len(existing.context_snippet or ""):
        existing.context_snippet = context_snippet
    if extraction_method and not existing.extraction_method:
        existing.extraction_method = extraction_method
    existing.last_seen = now
    # last_seen_at is the freshness-tracking counterpart of last_seen.
    existing.last_seen_at = now
    # Entities owned by another investigation are attached via the link table.
    if existing.investigation_id != investigation_id:
        _link_entity_to_investigation(session, existing.id, investigation_id)
    return existing, False
373
+
374
+
375
def cross_reference_with_seeds(session: Session, investigation_id: uuid.UUID) -> int:
    """Match an investigation's entities against seed-investigation entities.

    Entities are matched on ``(entity_type, canonical_value)``. For every
    match the investigation entity receives the seed's context snippet as
    ``historical_context`` (only if it has none yet), its ``first_seen`` is
    moved back to the earlier of the two timestamps, and a
    ``LIKELY_SAME_ACTOR`` relationship with confidence 0.90 is added unless
    an edge between the two entities already exists in either direction.

    Args:
        session: Session to query and add to; flushed but not committed.
        investigation_id: Investigation whose entities are cross-referenced.
            Entities of investigations flagged ``is_seed`` are skipped.

    Returns:
        The number of investigation entities that matched a seed entity,
        whether or not a new relationship had to be created.
    """
    inv_entities = (
        session.query(Entity)
        .join(Investigation, Entity.investigation_id == Investigation.id)
        .filter(
            Entity.investigation_id == investigation_id,
            Investigation.is_seed.is_(False),
        )
        .all()
    )
    if not inv_entities:
        return 0

    canonical_keys = {
        (ent.entity_type, ent.canonical_value)
        for ent in inv_entities
        if ent.canonical_value
    }
    if not canonical_keys:
        return 0

    # The two IN filters select a superset of candidates; exact pairing on
    # (type, canonical value) is done through seed_map below. Lists are
    # de-duplicated to keep the bound-parameter count down.
    entity_types = list({etype for etype, _ in canonical_keys})
    canonical_values = list({cval for _, cval in canonical_keys})

    seed_entities = (
        session.query(Entity)
        .join(Investigation, Entity.investigation_id == Investigation.id)
        .filter(
            Entity.entity_type.in_(entity_types),
            Entity.canonical_value.in_(canonical_values),
            Investigation.is_seed.is_(True),
        )
        .all()
    )

    # First seed seen for each key wins.
    seed_map: dict[tuple, Entity] = {}
    for seed in seed_entities:
        seed_map.setdefault((seed.entity_type, seed.canonical_value), seed)

    entity_ids = [ent.id for ent in inv_entities]
    seed_ids = [seed.id for seed in seed_entities]

    # Edges that already connect one of our entities with one of the seeds.
    existing_rels = (
        session.query(EntityRelationship)
        .filter(
            sa.or_(
                EntityRelationship.entity_a_id.in_(entity_ids),
                EntityRelationship.entity_b_id.in_(entity_ids),
            ),
            sa.or_(
                EntityRelationship.entity_a_id.in_(seed_ids),
                EntityRelationship.entity_b_id.in_(seed_ids),
            ),
        )
        .all()
    )

    # Record both directions so the lookup is orientation-independent.
    existing_rel_set: set[tuple] = set()
    for rel in existing_rels:
        existing_rel_set.add((rel.entity_a_id, rel.entity_b_id))
        existing_rel_set.add((rel.entity_b_id, rel.entity_a_id))

    matches = 0
    now = datetime.now(timezone.utc)

    for ent in inv_entities:
        seed_match = seed_map.get((ent.entity_type, ent.canonical_value))
        if not seed_match:
            continue

        if not ent.historical_context:
            ent.historical_context = seed_match.context_snippet
        # A missing timestamp on either side falls back to "now".
        ent.first_seen = min(
            ent.first_seen or now,
            seed_match.first_seen or now,
        )

        rel_key = (ent.id, seed_match.id)
        if rel_key not in existing_rel_set:
            session.add(
                EntityRelationship(
                    entity_a_id=ent.id,
                    entity_b_id=seed_match.id,
                    relationship_type=RelationshipType.LIKELY_SAME_ACTOR.value,
                    source_page_id=ent.page_id,
                    confidence=0.90,
                )
            )
            existing_rel_set.add(rel_key)
            existing_rel_set.add((seed_match.id, ent.id))
        matches += 1

    session.flush()
    return matches
471
+
472
+
473
def get_entities_by_type(
    session: Session,
    entity_type: str,
    limit: int = 200,
) -> List[Entity]:
    """List entities of one type ordered by ``created_at`` descending.

    Args:
        session: Session to query with.
        entity_type: Value matched against ``Entity.entity_type``.
        limit: Maximum number of rows returned.

    Returns:
        At most *limit* Entity rows, most recently created first.
    """
    of_type = session.query(Entity).filter(Entity.entity_type == entity_type)
    newest_first = of_type.order_by(Entity.created_at.desc())
    return newest_first.limit(limit).all()
486
+
487
+
488
def get_entities_by_value(
    session: Session,
    value: str,
) -> List[Entity]:
    """List every Entity whose ``value`` column equals *value* exactly."""
    exact_value = Entity.value == value
    return session.query(Entity).filter(exact_value).all()
494
+
495
+
496
def get_entities_for_investigation(
    session: Session,
    investigation_id: uuid.UUID,
    entity_type: Optional[str] = None,
) -> List[Entity]:
    """List entities whose ``investigation_id`` equals *investigation_id*.

    Only the ``Entity.investigation_id`` column is consulted; entities
    attached through ``InvestigationEntityLink`` are not included.

    Args:
        session: Session to query with.
        investigation_id: Value matched against ``Entity.investigation_id``.
        entity_type: When truthy, additionally restricts to this entity type.

    Returns:
        Matching Entity rows ordered by ``created_at`` descending.
    """
    conditions = [Entity.investigation_id == investigation_id]
    if entity_type:
        conditions.append(Entity.entity_type == entity_type)
    selection = session.query(Entity).filter(*conditions)
    return selection.order_by(Entity.created_at.desc()).all()
506
+
507
+
508
+ # ---------------------------------------------------------------------------
509
+ # EntityRelationship helpers
510
+ # ---------------------------------------------------------------------------
511
+
512
def create_entity_relationship(
    session: Session,
    entity_a_id: uuid.UUID,
    entity_b_id: uuid.UUID,
    relationship_type: str,
    source_page_id: Optional[uuid.UUID] = None,
    confidence: float = 1.0,
) -> EntityRelationship:
    """Add a new EntityRelationship edge and flush it.

    Each argument after *session* is stored in the column of the same name.
    The row is flushed, not committed.

    Returns:
        The flushed EntityRelationship instance.
    """
    column_values = {
        "entity_a_id": entity_a_id,
        "entity_b_id": entity_b_id,
        "relationship_type": relationship_type,
        "source_page_id": source_page_id,
        "confidence": confidence,
    }
    edge = EntityRelationship(**column_values)
    session.add(edge)
    session.flush()
    return edge
531
+
532
+
533
def get_relationships_for_entity(
    session: Session,
    entity_id: uuid.UUID,
) -> List[EntityRelationship]:
    """List every edge that has *entity_id* as ``entity_a_id`` or ``entity_b_id``."""
    on_either_end = sa.or_(
        EntityRelationship.entity_a_id == entity_id,
        EntityRelationship.entity_b_id == entity_id,
    )
    return session.query(EntityRelationship).filter(on_either_end).all()
546
+
547
+
548
def get_entity_neighbors_db(
    entity_id: uuid.UUID,
    investigation_id: Optional[uuid.UUID] = None,
    session: Optional[Session] = None,
) -> List[dict]:
    """Return direct neighbors of an entity with relationship metadata.

    Uses one SQL query joining relationships to entities; no NetworkX graph
    is built. When several edges lead to the same neighbor, only the one
    with the highest confidence is reported.

    Args:
        entity_id: UUID of the entity to find neighbors for.
        investigation_id: When given, only edges whose
            ``EntityRelationship.investigation_id`` equals it are considered.
        session: Existing session to use. When omitted, one is obtained from
            ``db.session.get_session()`` and released when the call returns.

    Returns:
        List of dicts with keys ``neighbor_id``, ``entity_type``, ``value``,
        ``relationship_type``, ``confidence`` and ``source_page_id`` (ids are
        rendered as strings; ``source_page_id`` may be None).
    """
    if session is None:
        from db.session import get_session as _get_session

        # Use the context manager properly so its exit logic runs; calling
        # __enter__() by hand and only closing the session bypasses it.
        with _get_session() as owned_session:
            return get_entity_neighbors_db(entity_id, investigation_id, owned_session)

    touches_entity = sa.or_(
        EntityRelationship.entity_a_id == entity_id,
        EntityRelationship.entity_b_id == entity_id,
    )
    other_end = sa.or_(
        Entity.id == EntityRelationship.entity_a_id,
        Entity.id == EntityRelationship.entity_b_id,
    )

    # Start from the relationship table and join Entity exactly once.
    # Joining Entity onto a query that already selects FROM Entity (without
    # an alias) puts the same table in the FROM clause twice.
    query = (
        session.query(
            Entity.id.label("neighbor_id"),
            Entity.entity_type,
            Entity.value,
            EntityRelationship.relationship_type,
            EntityRelationship.confidence,
            EntityRelationship.source_page_id,
        )
        .select_from(EntityRelationship)
        .join(Entity, other_end)
        .filter(touches_entity)
        # Drop the row where the joined entity is the queried entity itself.
        .filter(Entity.id != entity_id)
    )

    if investigation_id is not None:
        query = query.filter(EntityRelationship.investigation_id == investigation_id)

    neighbors: dict[str, dict] = {}
    for row in query.all():
        key = str(row.neighbor_id)
        best = neighbors.get(key)
        # Treat a NULL confidence as 0 so the comparison cannot raise.
        if best is None or (row.confidence or 0) > (best.get("confidence") or 0):
            neighbors[key] = {
                "neighbor_id": key,
                "entity_type": row.entity_type,
                "value": row.value,
                "relationship_type": row.relationship_type,
                "confidence": row.confidence,
                "source_page_id": str(row.source_page_id) if row.source_page_id else None,
            }

    return list(neighbors.values())
623
+
624
+
625
def get_entity_appearances(
    session: Session,
    entity_id: uuid.UUID,
    user_id: int,
) -> List[dict]:
    """List the investigations of one user in which an entity appears.

    Candidate investigations are the entity's own ``investigation_id`` plus
    every investigation attached through ``InvestigationEntityLink``. Only
    candidates whose ``user_id`` equals *user_id* are reported.

    Returns:
        Dicts with keys ``investigation_id``, ``run_id``, ``query`` and
        ``created_at`` (ISO string or None), sorted by ``created_at``
        descending.
    """
    from db.models import InvestigationEntityLink  # noqa: PLC0415

    owner = session.query(Entity).filter(Entity.id == entity_id).first()
    candidate_ids = []
    if owner and owner.investigation_id:
        candidate_ids.append(owner.investigation_id)

    entity_links = (
        session.query(InvestigationEntityLink)
        .filter(InvestigationEntityLink.entity_id == entity_id)
        .all()
    )
    for entity_link in entity_links:
        if entity_link.investigation_id in candidate_ids:
            continue
        candidate_ids.append(entity_link.investigation_id)

    collected: dict[str, dict] = {}
    if candidate_ids:
        # Restrict the candidates to investigations owned by the given user.
        visible = (
            session.query(Investigation)
            .filter(
                Investigation.id.in_(candidate_ids),
                Investigation.user_id == user_id,
            )
            .all()
        )
        by_id = {inv.id: inv for inv in visible}
        for candidate_id in candidate_ids:
            inv = by_id.get(candidate_id)
            if not inv:
                continue
            created = inv.created_at.isoformat() if inv.created_at else None
            collected[str(inv.id)] = {
                "investigation_id": str(inv.id),
                "run_id": str(inv.run_id),
                "query": inv.query,
                "created_at": created,
            }

    # Rows without a timestamp sort as the empty string, i.e. last.
    return sorted(
        collected.values(),
        key=lambda item: item.get("created_at") or "",
        reverse=True,
    )
674
+
675
+
676
+ # ---------------------------------------------------------------------------
677
+ # Monitor alerts
678
+ # ---------------------------------------------------------------------------
679
+
680
+
681
def create_monitor_alert(
    session: Session,
    monitor_name: str,
    change_type: str,
    summary: str,
    diff_data: Optional[dict] = None,
    severity: str = "info",
    entity_count_delta: int = 0,
    delivery_channels: Optional[List[str]] = None,
) -> MonitorAlert:
    """Persist a new MonitorAlert stamped with the current UTC time.

    ``diff_data`` and ``delivery_channels`` default to an empty dict and an
    empty list; ``delivered`` is set to True exactly when
    *delivery_channels* is non-empty. The row is flushed and refreshed, not
    committed.

    Returns:
        The refreshed MonitorAlert instance.
    """
    channels = delivery_channels or []
    column_values = {
        "monitor_name": monitor_name,
        "triggered_at": datetime.now(timezone.utc),
        "change_type": change_type,
        "summary": summary,
        "diff_data": diff_data or {},
        "severity": severity,
        "entity_count_delta": entity_count_delta,
        "delivered": bool(delivery_channels),
        "delivery_channels": channels,
    }
    new_alert = MonitorAlert(**column_values)
    session.add(new_alert)
    session.flush()
    session.refresh(new_alert)
    return new_alert
710
+
711
+
712
def get_alerts_for_monitor(
    session: Session,
    monitor_name: str,
    limit: int = 20,
    include_acknowledged: bool = True,
) -> List[MonitorAlert]:
    """List a monitor's alerts ordered by ``triggered_at`` descending.

    Args:
        session: Session to query with.
        monitor_name: Value matched against ``MonitorAlert.monitor_name``.
        limit: Maximum number of rows returned.
        include_acknowledged: When False, only unacknowledged alerts are
            returned.

    Returns:
        At most *limit* MonitorAlert rows, newest first.
    """
    conditions = [MonitorAlert.monitor_name == monitor_name]
    if not include_acknowledged:
        conditions.append(MonitorAlert.acknowledged.is_(False))
    newest_first = (
        session.query(MonitorAlert)
        .filter(*conditions)
        .order_by(MonitorAlert.triggered_at.desc())
    )
    return newest_first.limit(limit).all()
727
+
728
+
729
def get_unacknowledged_alert_count(session: Session) -> int:
    """Count unacknowledged alerts across all monitors.

    The original docstring notes this feeds the nav badge.

    Returns:
        The count, or 0 when the query yields nothing.
    """
    unacknowledged = MonitorAlert.acknowledged.is_(False)
    total = session.query(func.count(MonitorAlert.id)).filter(unacknowledged).scalar()
    return int(total) if total else 0
737
+
738
+
739
def get_alert_counts_by_monitor(session: Session) -> dict[str, int]:
    """Count unacknowledged alerts per monitor.

    Monitors without unacknowledged alerts do not appear in the result. The
    original docstring notes this feeds the per-monitor badges in the table.

    Returns:
        ``{monitor_name: unacknowledged_count}``.
    """
    rows = (
        session.query(
            MonitorAlert.monitor_name,
            func.count(MonitorAlert.id).label("count"),
        )
        .filter(MonitorAlert.acknowledged.is_(False))
        .group_by(MonitorAlert.monitor_name)
        .all()
    )
    # Unpack by position: on tuple-like result rows the attribute ``count``
    # resolves to the sequence method ``count()``, not to the labelled
    # column, so ``int(row.count)`` would raise TypeError.
    return {name: int(total) for name, total in rows}
754
+
755
+
756
def acknowledge_alerts(
    session: Session,
    monitor_name: str,
    alert_ids: Optional[List[int]] = None,
) -> int:
    """Mark a monitor's unacknowledged alerts as acknowledged.

    Args:
        session: Session used for the bulk UPDATE; flushed, not committed.
        monitor_name: Only alerts of this monitor are touched.
        alert_ids: Restricts the update to these alert ids. ``None`` means
            ALL unacknowledged alerts of the monitor. An empty collection
            means "nothing selected" and acknowledges nothing.

    Returns:
        The number of alerts that were acknowledged.
    """
    # Only None means "all". An explicitly empty selection must not fall
    # through to the unrestricted update.
    if alert_ids is not None and not alert_ids:
        return 0

    query = (
        session.query(MonitorAlert)
        .filter(MonitorAlert.monitor_name == monitor_name)
        .filter(MonitorAlert.acknowledged.is_(False))
    )
    if alert_ids is not None:
        query = query.filter(MonitorAlert.id.in_(alert_ids))

    now = datetime.now(timezone.utc)
    count = query.update(
        {"acknowledged": True, "acknowledged_at": now},
        # Bulk update; in-session objects are not synchronised.
        synchronize_session=False,
    )
    session.flush()
    return int(count)
781
+
782
+
783
+ # ---------------------------------------------------------------------------
784
+ # Monitor stats
785
+ # ---------------------------------------------------------------------------
786
+
787
+
788
def get_monitor_stats(session: Session, monitor_name: str) -> dict:
    """Summarise a monitor from its stored alerts.

    Returns:
        A dict with:
            last_run_at: ISO timestamp of the most recent alert, or None.
            last_run_status: ``change_type`` of the most recent alert, or None.
            total_runs: number of alerts stored for this monitor.
            last_entity_count: ``entity_count_delta`` of the most recent
                alert, or 0 when there is none.
    """
    for_monitor = MonitorAlert.monitor_name == monitor_name

    newest = (
        session.query(MonitorAlert)
        .filter(for_monitor)
        .order_by(MonitorAlert.triggered_at.desc())
        .limit(1)
        .first()
    )
    alert_total = (
        session.query(func.count(MonitorAlert.id)).filter(for_monitor).scalar() or 0
    )

    if newest and newest.triggered_at:
        last_run_at = newest.triggered_at.isoformat()
    else:
        last_run_at = None
    last_run_status = newest.change_type if newest else None
    # getattr with a default also covers the "no alert yet" case (newest is None).
    last_entity_count = getattr(newest, "entity_count_delta", 0) or 0

    return {
        "last_run_at": last_run_at,
        "last_run_status": last_run_status,
        "total_runs": int(alert_total),
        "last_entity_count": last_entity_count,
    }
818
+
819
+
820
def update_entity_source_count(
    session: Session,
    entity_id: uuid.UUID,
    source_name: str,
) -> None:
    """Record *source_name* as a corroborating source of an entity.

    ``corroborating_sources`` is read as a JSON-encoded list. When the name
    is not yet in it, the name is appended, the list is written back as
    JSON and ``source_count`` is set to the list's length. Does nothing when
    the entity does not exist; otherwise the session is flushed.
    """
    import json

    target = session.query(Entity).filter(Entity.id == entity_id).first()
    if not target:
        return

    # An empty / NULL column is treated as an empty list.
    known_sources = json.loads(target.corroborating_sources or "[]")
    already_recorded = source_name in known_sources
    if not already_recorded:
        known_sources.append(source_name)
        target.corroborating_sources = json.dumps(known_sources)
        target.source_count = len(known_sources)

    session.flush()