voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
db/session.py ADDED
@@ -0,0 +1,270 @@
1
+ """
2
+ SQLAlchemy engine and session factory.
3
+
4
+ Usage (application code)
5
+ ------------------------
6
+ from db.session import get_session
7
+
8
+ with get_session() as session:
9
+ session.add(some_object)
10
+ # commits on exit, rolls back on exception
11
+
12
+ For async code, prefer get_async_session() with async with:
13
+ ------------------------
14
+ from db.session import get_async_session
15
+
16
+ async with get_async_session() as session:
17
+ session.add(some_object)  # add() is synchronous; only commit() is awaited
18
+ await session.commit()
19
+
20
+ For short-lived async operations, use async_session_scope():
21
+ ------------------------
22
+ from db.session import async_session_scope
23
+
24
+ async with async_session_scope() as session:
25
+ # session is auto-committed on exit, rolled back on exception
26
+ await session.execute(...)
27
+
28
+ Usage (testing — pass an explicit URL to avoid needing DATABASE_URL in env)
29
+ ---------------------------------------------------------------------------
30
+ from db.session import get_engine, get_session_factory
31
+ from db.models import Base
32
+
33
+ engine = get_engine("sqlite:///:memory:")
34
+ Base.metadata.create_all(engine)
35
+ Session = get_session_factory("sqlite:///:memory:")
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ from functools import lru_cache
41
+
42
+ from contextlib import asynccontextmanager, contextmanager
43
+ from typing import AsyncGenerator, Generator, Optional
44
+
45
+ from sqlalchemy import create_engine, Engine
46
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
47
+ from sqlalchemy.orm import sessionmaker, Session
48
+
49
+ import config
50
+
51
+ _async_engine_cache: dict[str, "AsyncEngine"] = {}
52
+
53
+
54
@lru_cache(maxsize=8)
def _get_engine_cached(target_url: str) -> Engine:
    """
    Build (and memoise per URL) the synchronous Engine for *target_url*.

    URLs starting with ``sqlite`` get ``check_same_thread=False`` and no
    pool sizing arguments; every other URL gets the fixed pool settings
    below. ``pool_pre_ping`` is enabled in both cases.
    """
    if target_url.startswith("sqlite"):
        dialect_kwargs = {"connect_args": {"check_same_thread": False}}
    else:
        dialect_kwargs = {
            "pool_size": 20,
            "max_overflow": 40,
            "pool_timeout": 30,
            "pool_recycle": 1800,
        }

    return create_engine(target_url, pool_pre_ping=True, **dialect_kwargs)
75
+
76
+
77
def get_engine(url: Optional[str] = None) -> Engine:
    """
    Return the cached SQLAlchemy Engine for *url*.

    When *url* is empty or None, ``config.DATABASE_URL`` is used instead.
    Engines are memoised by ``_get_engine_cached`` (an ``lru_cache`` with
    ``maxsize=8``), so at most eight distinct URLs are kept; note that an
    entry evicted by the cache is simply dropped, not disposed.

    Raises:
        RuntimeError: if neither *url* nor ``config.DATABASE_URL`` is set.
    """
    resolved_url = url if url else config.DATABASE_URL
    if resolved_url:
        return _get_engine_cached(resolved_url)

    raise RuntimeError(
        "DATABASE_URL is not configured.\n"
        "Add it to your .env file, e.g.:\n"
        " DATABASE_URL=postgresql://voidaccess:voidaccess@localhost:5433/voidaccess"
    )
97
+
98
+
99
def release_engine(url: Optional[str] = None) -> None:
    """
    Dispose the engine for *url* and reset the engine cache.

    *url* defaults to ``config.DATABASE_URL``; when neither is set the call
    is a no-op. The engine's connection pool is disposed, then the whole
    ``_get_engine_cached`` cache is cleared -- ``lru_cache`` cannot drop a
    single key, so engines cached for *other* URLs are forgotten as well
    (they are not disposed here). Intended for test teardown.

    Disposal is best-effort: a failure is logged rather than raised so that
    teardown code keeps running.
    """
    # Function-scope import: this module has no module-level logger.
    import logging

    target_url = url or config.DATABASE_URL
    if not target_url:
        return

    try:
        get_engine(target_url).dispose()
    except Exception:
        # The URL is deliberately not logged: it may embed credentials.
        logging.getLogger(__name__).warning(
            "release_engine: failed to dispose engine", exc_info=True
        )

    _get_engine_cached.cache_clear()
114
+
115
+
116
def get_async_engine(url: Optional[str] = None) -> "AsyncEngine":
    """
    Return the cached AsyncEngine for *url* (default: ``config.DATABASE_URL``).

    ``postgresql://`` is rewritten to ``postgresql+asyncpg://`` and
    ``sqlite://`` to ``sqlite+aiosqlite://``; any other URL is passed
    through unchanged. Engines are cached in ``_async_engine_cache`` keyed
    by the *original* (unconverted) URL.

    Raises:
        RuntimeError: if neither *url* nor ``config.DATABASE_URL`` is set.
    """
    target_url = url or config.DATABASE_URL
    if not target_url:
        raise RuntimeError(
            "DATABASE_URL is not configured.\n"
            "Add it to your .env file, e.g.:\n"
            " DATABASE_URL=postgresql://voidaccess:voidaccess@localhost:5433/voidaccess"
        )

    cached = _async_engine_cache.get(target_url)
    if cached is not None:
        return cached

    if target_url.startswith("postgresql://"):
        async_url = target_url.replace("postgresql://", "postgresql+asyncpg://", 1)
    elif target_url.startswith("sqlite://"):
        async_url = target_url.replace("sqlite://", "sqlite+aiosqlite://", 1)
    else:
        async_url = target_url

    # Decide on the URL scheme only (same rule as the sync engine). A plain
    # substring test would also match "sqlite" inside a database name, host
    # or password of a non-SQLite URL and apply the wrong engine options.
    if async_url.startswith("sqlite"):
        engine = create_async_engine(
            async_url,
            pool_pre_ping=True,
            connect_args={"check_same_thread": False},
        )
    else:
        engine = create_async_engine(
            async_url,
            pool_pre_ping=True,
            pool_size=20,
            max_overflow=40,
            pool_timeout=30,
            pool_recycle=1800,
        )

    _async_engine_cache[target_url] = engine
    return engine
162
+
163
+
164
# Strong references to in-flight dispose tasks; the event loop itself only
# keeps weak references, so an unreferenced task could be collected early.
_pending_async_disposals: set = set()


def release_async_engine(url: Optional[str] = None) -> None:
    """
    Remove the async engine for *url* from the cache and dispose it.

    *url* defaults to ``config.DATABASE_URL``. Nothing happens when no
    engine is cached for that URL.

    ``AsyncEngine.dispose()`` is a coroutine, so simply calling it (as this
    function previously did) never disposes anything. Because this function
    is synchronous, the coroutine is driven in one of two ways:

    * no event loop running in this thread -> run it to completion with
      ``asyncio.run``;
    * called from inside a running loop -> schedule it as a task on that
      loop (disposal then completes shortly after this function returns).
    """
    import asyncio

    target_url = url or config.DATABASE_URL
    engine = _async_engine_cache.pop(target_url, None)
    if engine is None:
        return

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # NOTE(review): connections opened on a different, already-closed
        # loop may not close cleanly from here -- confirm for asyncpg.
        asyncio.run(engine.dispose())
        return

    task = loop.create_task(engine.dispose())
    _pending_async_disposals.add(task)
    task.add_done_callback(_pending_async_disposals.discard)
174
+
175
+
176
def get_session_factory(url: Optional[str] = None) -> sessionmaker:
    """Build a ``sessionmaker`` bound to the engine that ``get_engine(url)`` returns.

    Sessions produced by the factory have autoflush and autocommit disabled.
    """
    return sessionmaker(bind=get_engine(url), autoflush=False, autocommit=False)
180
+
181
+
182
def get_async_session_factory(url: Optional[str] = None) -> async_sessionmaker:
    """Build an ``async_sessionmaker`` bound to ``get_async_engine(url)``.

    Sessions are created with ``autoflush=False`` and
    ``expire_on_commit=False``.
    """
    return async_sessionmaker(
        bind=get_async_engine(url),
        autoflush=False,
        expire_on_commit=False,
    )
186
+
187
+
188
@contextmanager
def get_session(url: Optional[str] = None) -> Generator[Session, None, None]:
    """
    Yield a synchronous Session with commit/rollback handled for the caller.

    On a clean exit from the ``with`` body the session is committed. If the
    body (or the commit itself) raises an ``Exception`` the session is rolled
    back and the exception re-raised. The session is closed in every case.

    Example::

        with get_session() as session:
            session.add(entity)
        # committed here
    """
    db_session: Session = get_session_factory(url)()
    try:
        yield db_session
        # Kept inside the try so a failing commit is rolled back as well.
        db_session.commit()
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
210
+
211
+
212
def get_db(url: Optional[str] = None) -> Generator[Session, None, None]:
    """
    Generator dependency (FastAPI style) that yields one database session.

    The session is closed once the consumer is done with it. Unlike
    ``get_session`` this does not commit or roll back.

    Usage: db: Session = Depends(get_db)
    """
    request_session = get_session_factory(url)()
    try:
        yield request_session
    finally:
        request_session.close()
224
+
225
+
226
@asynccontextmanager
async def get_async_session(url: Optional[str] = None) -> AsyncGenerator[AsyncSession, None]:
    """
    Async context manager that yields an AsyncSession.

    No commit or rollback is performed here; the caller decides when to
    commit. The session's own ``async with`` block ends when this context
    manager exits.

    Usage::

        async with get_async_session() as session:
            session.add(entity)
            await session.commit()
    """
    async with get_async_session_factory(url)() as active_session:
        yield active_session
242
+
243
+
244
@asynccontextmanager
async def async_session_scope(
    url: Optional[str] = None,
) -> AsyncGenerator[AsyncSession, None]:
    """
    Async counterpart of ``get_session`` for short-lived units of work.

    The session is committed when the ``async with`` body finishes without
    error. If the body (or the commit) raises an ``Exception`` the session
    is rolled back and the exception re-raised. The session's own context
    manager is exited afterwards in either case.

    Example::

        async with async_session_scope() as session:
            result = await session.execute(select(Investigation))
        # committed here -- no explicit commit needed
    """
    session_factory = get_async_session_factory(url)
    async with session_factory() as scoped_session:
        try:
            yield scoped_session
            # Inside the try so that a failed commit triggers the rollback.
            await scoped_session.commit()
        except Exception:
            await scoped_session.rollback()
            raise
export/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """
2
+ export — Phase 5 intelligence export module.
3
+
4
+ Re-exports the public API from stix, misp, and sigma sub-modules.
5
+ """
6
+
7
+ from export.stix import (
8
+ bundle_to_dict,
9
+ bundle_to_json,
10
+ investigation_to_stix_bundle,
11
+ )
12
+ from export.misp import (
13
+ investigation_to_misp_event,
14
+ misp_event_to_json,
15
+ )
16
+ from export.sigma import (
17
+ entities_to_sigma_rules,
18
+ export_sigma_rules,
19
+ sigma_rule_to_yaml,
20
+ )
21
+
22
# Public API of the export package, one row per originating sub-module.
__all__ = [
    "investigation_to_stix_bundle", "bundle_to_json", "bundle_to_dict",  # stix
    "investigation_to_misp_event", "misp_event_to_json",  # misp
    "entities_to_sigma_rules", "sigma_rule_to_yaml", "export_sigma_rules",  # sigma
]
export/misp.py ADDED
@@ -0,0 +1,257 @@
1
+ """
2
+ export/misp.py — Generates MISP event JSON from a VoidAccess investigation.
3
+
4
+ MISP format is constructed directly as a dict — no MISP library required.
5
+ The format follows the MISP standard event structure as documented at
6
+ https://www.misp-standard.org/rfc/misp-core-format.html
7
+
8
+ Public interface
9
+ ----------------
10
+ investigation_to_misp_event(investigation_id) → dict
11
+ misp_event_to_json(event) → str
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import logging
18
+ import os
19
+ from datetime import datetime, timezone
20
+ from typing import Any, Optional
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Entity type → MISP attribute mapping
26
+ # ---------------------------------------------------------------------------
27
+
28
def _misp_attr(attr_type: str, category: str, to_ids: bool) -> dict:
    """Return one mapping entry: MISP attribute type, category and IDS flag."""
    return {"type": attr_type, "category": category, "to_ids": to_ids}


# Entity type (as stored on extracted entities) -> MISP attribute settings.
# Entity types missing from this table are skipped by the exporter.
#
# NOTE(review): MISP appears to define a dedicated "xmr" type for Monero
# addresses, and "malware-type" may not be an accepted type under the
# "Antivirus detection" category -- verify both against MISP describeTypes
# before changing; values below are kept exactly as they were.
_MISP_ATTR_MAP: dict[str, dict] = {
    # Cryptocurrency addresses
    "BITCOIN_ADDRESS": _misp_attr("btc", "Financial fraud", True),
    "ETHEREUM_ADDRESS": _misp_attr("other", "Financial fraud", True),
    "MONERO_ADDRESS": _misp_attr("other", "Financial fraud", True),
    # Network indicators
    "EMAIL_ADDRESS": _misp_attr("email-src", "Network activity", False),
    "ONION_URL": _misp_attr("url", "Network activity", True),
    "IP_ADDRESS": _misp_attr("ip-dst", "Network activity", True),
    # Contextual intelligence (never flagged for IDS)
    "CVE_NUMBER": _misp_attr("vulnerability", "External analysis", False),
    "MALWARE_FAMILY": _misp_attr("malware-type", "Antivirus detection", False),
    "RANSOMWARE_GROUP": _misp_attr("malware-type", "Antivirus detection", False),
    "THREAT_ACTOR_HANDLE": _misp_attr("threat-actor", "Attribution", False),
}
80
+
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # Public interface
84
+ # ---------------------------------------------------------------------------
85
+
86
+
87
def investigation_to_misp_event(
    investigation_id: Any,
    entity_ids: Optional[list[str]] = None,
) -> dict:
    """
    Assemble a MISP-style event dict for one investigation.

    The investigation and its entities come from
    ``_load_investigation_and_entities``; *entity_ids*, when given, is
    forwarded to it to restrict which entities are exported. Each entity
    whose ``entity_type`` has an entry in ``_MISP_ATTR_MAP`` becomes one
    attribute; entities of other types are left out.

    When the loader yields no investigation (not found, or a load error it
    handled itself) a minimal ``"Not found"`` event with an empty attribute
    list is returned.
    """
    investigation, entities = _load_investigation_and_entities(
        investigation_id, entity_ids=entity_ids
    )
    if investigation is None:
        return {"Event": {"info": "Not found", "Attribute": []}}

    date_str = _utc_date_str(investigation.created_at)
    query = getattr(investigation, "query", "") or ""

    # Pair every entity with its mapping once, then keep only mapped ones.
    paired = ((item, _MISP_ATTR_MAP.get(item.entity_type)) for item in entities)
    attributes: list[dict] = [
        {
            "type": spec["type"],
            "category": spec["category"],
            "value": item.value,
            "comment": f"Source: {item.source_url}" if item.source_url else "Source: unknown",
            "to_ids": spec["to_ids"],
        }
        for item, spec in paired
        if spec is not None
    ]

    event_body = {
        "info": f"VoidAccess Investigation: {query}",
        "date": date_str,
        "threat_level_id": "2",  # Medium
        "analysis": "2",  # Completed
        "distribution": "0",  # Your organisation only
        "Attribute": attributes,
    }
    return {"Event": event_body}
136
+
137
+
138
def misp_event_to_json(event: dict) -> str:
    """
    Serialise a MISP event dict to pretty-printed JSON (2-space indent).

    Values that JSON cannot encode natively are converted with ``str``. If
    serialisation still fails, a warning is logged and the JSON for an empty
    ``"Not found"`` event is returned instead.
    """
    try:
        rendered = json.dumps(event, indent=2, default=str)
    except Exception as exc:
        logger.warning("misp_event_to_json failed: %s", exc)
        placeholder = {"Event": {"info": "Not found", "Attribute": []}}
        rendered = json.dumps(placeholder, indent=2)
    return rendered
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # Internal helpers
151
+ # ---------------------------------------------------------------------------
152
+
153
+
154
def _load_investigation_and_entities(
    investigation_id: Any,
    entity_ids: Optional[list[str]] = None,
):
    """
    Fetch an investigation and its entities, converted to NormalizedEntity.

    Entities are selected when they reference the investigation directly or
    when their id appears in ``InvestigationEntityLink`` for it. If
    *entity_ids* is non-empty, only entities whose id parses to one of those
    UUIDs are kept (unparseable ids are ignored).

    Returns ``(investigation, entities)``. Returns ``(None, [])`` when the
    ``DATABASE_URL`` environment variable is unset, the id is not a valid
    UUID, the investigation does not exist, or any exception occurs (the
    exception is logged as a warning).
    """
    if not os.getenv("DATABASE_URL"):
        return None, []

    try:
        # Imported lazily so a missing DB layer is reported as a load failure.
        from db.session import get_session  # noqa: PLC0415
        from db.queries import get_investigation_by_id_or_run  # noqa: PLC0415
        from db.models import Entity, InvestigationEntityLink  # noqa: PLC0415
        from extractor.normalizer import NormalizedEntity  # noqa: PLC0415

        inv_uuid = _coerce_uuid(investigation_id)
        if inv_uuid is None:
            return None, []

        # None means "no filter"; an empty set means "filter everything out".
        wanted_ids = None
        if entity_ids:
            parsed_ids = (_coerce_uuid(raw_id) for raw_id in entity_ids)
            wanted_ids = frozenset(pid for pid in parsed_ids if pid is not None)

        def _page_url(row) -> str:
            """Best-effort URL of the entity's page; empty string if unavailable."""
            try:
                if row.page:
                    return row.page.url or ""
            except Exception:
                pass
            return ""

        with get_session() as session:
            investigation = get_investigation_by_id_or_run(session, inv_uuid)
            if investigation is None:
                return None, []

            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == investigation.id)
                .subquery()
            )
            owned_or_linked = (
                (Entity.investigation_id == investigation.id)
                | Entity.id.in_(linked_ids_subq)
            )
            rows = session.query(Entity).filter(owned_or_linked).all()

            if wanted_ids is not None:
                rows = [row for row in rows if row.id in wanted_ids]

            normalized: list[NormalizedEntity] = []
            for row in rows:
                page_url = _page_url(row)
                normalized.append(
                    NormalizedEntity(
                        entity_type=row.entity_type,
                        value=row.canonical_value or row.value,
                        confidence=row.confidence,
                        source_url=page_url,
                        page_id=row.page_id,
                        context_snippet=row.context_snippet or "",
                    )
                )

            # Detach loaded objects before the session context exits.
            session.expunge_all()
            return investigation, normalized

    except Exception as exc:
        logger.warning("_load_investigation_and_entities failed: %s", exc)
        return None, []
237
+
238
+
239
def _coerce_uuid(value: Any):
    """Return *value* as a ``uuid.UUID``, or None if it cannot be parsed.

    An existing UUID instance is returned as-is; anything else is converted
    via ``str(value)`` first.
    """
    import uuid as _uuid

    if isinstance(value, _uuid.UUID):
        return value

    try:
        parsed = _uuid.UUID(str(value))
    except (ValueError, AttributeError):
        parsed = None
    return parsed
248
+
249
+
250
def _utc_date_str(dt: Optional[Any]) -> str:
    """Return *dt* formatted as ``YYYY-MM-DD``.

    Falls back to today's date (UTC) when *dt* is None or cannot be
    formatted with ``strftime``. *dt* itself is formatted as given; no
    timezone conversion is applied to it.
    """
    if dt is not None:
        try:
            return dt.strftime("%Y-%m-%d")
        except Exception:
            # Not a usable date-like value: use the fallback below.
            pass
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")