voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,2567 @@
1
+ """
2
+ api/routes/investigations.py — Investigation management endpoints.
3
+
4
+ POST /investigations — trigger an investigation (background task)
5
+ GET /investigations — list recent investigations
6
+ GET /investigations/{id} — get single investigation
7
+ GET /investigations/{id}/entities — list entities for investigation
8
+ GET /investigations/{id}/graph — graph JSON for investigation
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import concurrent.futures
15
+ import csv
16
+ import hashlib
17
+ import io
18
+ import logging
19
+ import os
20
+ import uuid
21
+ from typing import Any, Optional
22
+
23
+ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request, Response
24
+ from fastapi.responses import StreamingResponse
25
+ from pydantic import BaseModel, Field, validator
26
+ from sqlalchemy import select as sa_select
27
+ from crawler import crawl
28
+ from sources.seeds import get_seeds
29
+ from sources.seed_manager import get_seed_manager
30
+ from sources.paste_scraper import scrape_paste_sites
31
+ from sources.github_scraper import scrape_github
32
+ from sources.gitlab_scraper import scrape_gitlab
33
+ from sources.rss_scraper import scrape_rss_feeds
34
+
35
+ # Paste-site hostnames used for counting paste-sourced pages in responses.
36
+ PASTE_SITE_HOSTNAMES = (
37
+ "pastebin.com",
38
+ "rentry.co",
39
+ "dpaste.org",
40
+ "paste.ee",
41
+ )
42
+
43
# Opt-out toggles for the optional clearnet scrapers. Each one consults the
# environment at call time (not at import) so tests can monkey-patch the
# variable without re-importing this module.
def _env_toggle_on(var_name: str) -> bool:
    """Return True when *var_name* is unset or equals "true" (any letter case).

    Every other value, including the empty string and "1", counts as off.
    """
    raw_value = os.getenv(var_name, "true")
    return raw_value.lower() == "true"


def _paste_scraping_enabled() -> bool:
    """Tell whether the paste-site scraper is on (PASTE_SCRAPING_ENABLED)."""
    return _env_toggle_on("PASTE_SCRAPING_ENABLED")


def _github_scraping_enabled() -> bool:
    """Tell whether the GitHub scraper is on (GITHUB_SCRAPING_ENABLED)."""
    return _env_toggle_on("GITHUB_SCRAPING_ENABLED")


def _gitlab_scraping_enabled() -> bool:
    """Tell whether the GitLab scraper is on (GITLAB_SCRAPING_ENABLED)."""
    return _env_toggle_on("GITLAB_SCRAPING_ENABLED")


def _rss_scraping_enabled() -> bool:
    """Tell whether the RSS feed scraper is on (RSS_FEEDS_ENABLED)."""
    return _env_toggle_on("RSS_FEEDS_ENABLED")
59
+ from api.auth import CurrentUser, get_current_user, require_password_not_reset_pending
60
+ import json
61
+
62
+ logger = logging.getLogger(__name__)
63
+ logger.setLevel(logging.DEBUG)
64
+ router = APIRouter()
65
+
66
+ # In-process cache: investigation_id (str) → infrastructure clusters list.
67
+ # Populated during the pipeline run; read by the GET detail endpoint.
68
+ _infra_cluster_cache: dict[str, list] = {}
69
+
70
+ # In-process cache: investigation_id (str) → sources_used status dict.
71
+ # Populated during the pipeline run; read by the GET detail endpoint.
72
+ _sources_used_cache: dict[str, dict] = {}
73
+
74
# Cooperative cancellation registry: investigation_id (str) -> True once a
# cancel has been requested. The pipeline polls it at its checkpoints and
# removes the entry when it honours the request.
# The dict is per-process: in a multi-process deployment every worker owns its
# own copy, so cancellation only takes effect when the pipeline task runs in
# the same process that served the cancel HTTP request (true for a
# single-worker FastAPI/uvicorn setup).
_cancel_flags: dict[str, bool] = {}


def _is_cancelled(investigation_id: str) -> bool:
    """Report whether a cancel request is pending for *investigation_id*."""
    if investigation_id in _cancel_flags:
        return _cancel_flags[investigation_id]
    return False


def _set_cancelled(investigation_id: str) -> None:
    """Record a cancel request for *investigation_id*."""
    _cancel_flags.update({investigation_id: True})


def _clear_cancel_flag(investigation_id: str) -> None:
    """Drop the cancel request for *investigation_id*; a no-op when none exists."""
    if investigation_id in _cancel_flags:
        del _cancel_flags[investigation_id]
92
+
93
+
94
async def _check_cancelled(inv_uuid: uuid.UUID, investigation_id: str) -> bool:
    """Honour a pending cancel request for the given investigation.

    When a cancel flag is set for *investigation_id*, the flag is cleared, the
    Investigation row identified by *inv_uuid* gets status "cancelled", and
    True is returned. Without a pending request nothing is touched and the
    result is False.
    """
    if _is_cancelled(investigation_id):
        # Consume the flag before writing to the DB, as the pipeline only
        # needs to see the request once.
        _clear_cancel_flag(investigation_id)
        logger.info("[%s] Cancellation flag detected — stopping pipeline cleanly", inv_uuid)

        from db.models import Investigation
        from db.session import get_session

        with get_session() as db:
            matching_rows = db.query(Investigation).filter_by(id=inv_uuid)
            matching_rows.update({"status": "cancelled"})
            db.commit()
        return True
    return False
106
+
107
+ # ---------------------------------------------------------------------------
108
+ # Rate limiting (shared key_func with api/main.py; enforcement via app.state.limiter)
109
+ # ---------------------------------------------------------------------------
110
+
111
_DISABLE_RATE_LIMIT = os.getenv("DISABLE_RATE_LIMIT", "false").lower() == "true"

# Stays None when rate limiting is switched off or slowapi is not installed;
# _rate_limit() then hands out a pass-through decorator.
_limiter: "Limiter | None" = None
if not _DISABLE_RATE_LIMIT:
    try:
        from slowapi import Limiter
        from slowapi.util import get_remote_address
        _limiter = Limiter(key_func=get_remote_address)
    except ImportError:
        pass


def _rate_limit(limit_string: str):
    """Build the decorator enforcing *limit_string* on an endpoint.

    With no limiter available the returned decorator gives back the function
    it receives, unchanged.
    """
    def _unlimited(endpoint):
        return endpoint

    return _unlimited if _limiter is None else _limiter.limit(limit_string)
129
+
130
+
131
+ STEP_LABELS = {
132
+ 1: "Refining query",
133
+ 2: "Searching dark web",
134
+ 3: "Filtering results",
135
+ 4: "Scraping pages",
136
+ 5: "Extracting entities",
137
+ 6: "Enriching intelligence",
138
+ 7: "Building graph",
139
+ 8: "Generating summary",
140
+ 9: "Finalizing results",
141
+ }
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # Request / response schemas
146
+ # ---------------------------------------------------------------------------
147
+
148
+
149
class InvestigationRequest(BaseModel):
    """Request body accepted when triggering an investigation."""

    # Free-text search query; length limits are declared on the field and the
    # validator below additionally rejects whitespace-only input.
    query: str = Field(..., min_length=3, max_length=500, description="Search query (3-500 chars)")
    # Identifier of the LLM used for this run.
    model: str = Field(default="openrouter/deepseek/deepseek-chat", description="LLM model ID to use")
    # Whether the recursive crawler stage should run.
    run_crawler: bool = False

    @validator("query")
    def query_not_whitespace(cls, v: str) -> str:
        """Trim the query and reject it when fewer than 3 characters remain."""
        trimmed = v.strip()
        if trimmed == "":
            raise ValueError("Query cannot be empty or whitespace")
        if len(trimmed) < 3:
            raise ValueError("Query must be at least 3 characters")
        return trimmed
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Helper: load investigation from DB
165
+ # ---------------------------------------------------------------------------
166
+
167
+
168
# Display label reported for each paste-site hostname.
_PASTE_SOURCE_LABELS = {
    "pastebin.com": "Pastebin",
    "rentry.co": "Rentry",
    "dpaste.org": "dpaste",
    "paste.ee": "paste.ee",
}


def _match_paste_host(url: str, hostnames) -> Optional[str]:
    """Return the entry of *hostnames* that hosts *url*, or None.

    The comparison is made against the URL's parsed hostname (exact match or
    sub-domain such as ``www.``), never against the path or query string, so
    ``https://example.com/?ref=pastebin.com`` and ``notpastebin.com`` do not
    match. URLs without a scheme are accepted.
    """
    from urllib.parse import urlsplit

    candidate = url.strip().lower()
    if "://" not in candidate:
        # urlsplit only fills in the network location after a "//" marker.
        candidate = "//" + candidate
    try:
        host = urlsplit(candidate).hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        return None
    for known in hostnames:
        if host == known or host.endswith("." + known):
            return known
    return None


def _count_paste_pages_for_investigation(session, internal_id) -> tuple[int, list[str]]:
    """
    Count distinct paste-site pages observed for *internal_id* and return the
    list of paste sources that contributed at least one page.

    Implementation: paste pages are persisted as rows in the `pages` table
    with their paste-site URL, and entities extracted from those pages are
    linked back to the investigation via Entity.investigation_id. We join
    Entity → Page and filter by hostname instead of adding a DB column.

    Returns:
        ``(page_count, source_labels)`` where *source_labels* is sorted.
        ``(0, [])`` is returned when the query fails; the failure is logged
        at debug level and not raised.
    """
    try:
        from db.models import Entity, Page

        rows = (
            session.query(Page.url)
            .join(Entity, Entity.page_id == Page.id)
            .filter(Entity.investigation_id == internal_id)
            .distinct()
            .all()
        )
    except Exception as exc:
        logger.debug("paste-page count failed: %s", exc)
        return 0, []

    paste_urls: set[str] = set()
    sources_used: set[str] = set()
    for (url,) in rows:
        if not url:
            continue
        host = _match_paste_host(url, PASTE_SITE_HOSTNAMES)
        if host is None:
            continue
        paste_urls.add(url)
        # Fall back to the bare hostname when no display label is registered.
        sources_used.add(_PASTE_SOURCE_LABELS.get(host, host))
    return len(paste_urls), sorted(sources_used)
209
+
210
+
211
def _get_db_investigation(investigation_id: str) -> Any:
    """Load one investigation plus its derived counters as a plain dict.

    Args:
        investigation_id: UUID string, looked up through
            ``get_investigation_by_id_or_run``.

    Returns:
        A dict holding the investigation's stored fields, entity /
        relationship / page counts, paste-site statistics and the cached
        infrastructure clusters and sources-used status for this process.

    Raises:
        HTTPException: 503 when DATABASE_URL is not set, 422 when
            *investigation_id* is not a valid UUID string, 404 when no
            investigation matches, 500 for any other failure.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.queries import (  # noqa: PLC0415
            count_distinct_pages_for_investigation,
            get_investigation_by_id_or_run,
        )

        from sqlalchemy import func  # noqa: PLC0415
        from db.models import Entity, EntityRelationship, InvestigationEntityLink  # noqa: PLC0415

        # Only a malformed ID may produce the 422 below. A ValueError raised
        # later by the DB layer is a server-side fault and must reach the
        # generic 500 handler instead of being reported as a bad ID.
        try:
            inv_uuid = uuid.UUID(investigation_id)
        except ValueError:
            raise HTTPException(
                status_code=422, detail="Invalid investigation ID format"
            ) from None

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            pages_crawled = count_distinct_pages_for_investigation(session, inv.id)
            paste_pages_found, paste_sources_used = _count_paste_pages_for_investigation(
                session, inv.id
            )

            # Entity IDs for this investigation = own entities + junction-table links
            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            entity_subq = (
                session.query(Entity.id)
                .filter(
                    (Entity.investigation_id == inv.id)
                    | Entity.id.in_(linked_ids_subq)
                )
                .subquery()
            )
            entity_count = int(
                session.query(func.count()).select_from(entity_subq).scalar() or 0
            )
            # A relationship counts when either endpoint belongs to the investigation.
            relationship_count = int(
                session.query(func.count(EntityRelationship.id))
                .filter(
                    (EntityRelationship.entity_a_id.in_(entity_subq))
                    | (EntityRelationship.entity_b_id.in_(entity_subq))
                )
                .scalar()
                or 0
            )
            return {
                "id": str(inv.id),
                "run_id": str(inv.run_id),
                "query": inv.query,
                "refined_query": inv.refined_query,
                "model_used": inv.model_used,
                "preset": inv.preset,
                "summary": inv.summary,
                "status": inv.status,
                "graph_status": getattr(inv, "graph_status", "pending"),
                "created_at": inv.created_at.isoformat() if inv.created_at else None,
                "current_step": inv.current_step or 0,
                # NOTE(review): hard-coded; STEP_LABELS in this module has 9 entries — confirm.
                "total_steps": 13,
                "current_step_label": inv.current_step_label or "",
                "entity_count": entity_count,
                "relationship_count": relationship_count,
                "page_count": pages_crawled,
                "pages_crawled": pages_crawled,  # keep for compat
                "paste_pages_found": paste_pages_found,
                "paste_sources_used": paste_sources_used,
                # The caches may be keyed by either the requested ID or the
                # row's own ID, so both are tried.
                "infrastructure_clusters": _infra_cluster_cache.get(investigation_id, _infra_cluster_cache.get(str(inv.id), [])),
                "sources_used": _sources_used_cache.get(str(inv.id), _sources_used_cache.get(investigation_id, {})),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("_get_db_investigation failed: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        ) from exc
294
+
295
+
296
async def _update_investigation_status(
    investigation_id: uuid.UUID,
    status: str,
    model_used: Optional[str] = None,
    summary: Optional[str] = None,
) -> None:
    """Write a new status for one investigation using its own short session.

    ``model_used`` and ``summary`` are written only when a value is supplied;
    passing None leaves the stored column untouched.
    """
    from db.session import get_session
    from db.models import Investigation

    optional_columns = {"model_used": model_used, "summary": summary}
    updates: dict[str, Any] = {"status": status}
    updates.update(
        {column: value for column, value in optional_columns.items() if value is not None}
    )

    with get_session() as db:
        target = db.query(Investigation).filter_by(id=investigation_id)
        target.update(updates)
        db.commit()
314
+
315
+
316
async def _update_progress(
    investigation_id: uuid.UUID,
    step: Optional[int] = None,
    entity_count: Optional[int] = None,
    scraped_pages: Optional[dict] = None,
    label: Optional[str] = None,
) -> None:
    """Best-effort update of an investigation's progress fields.

    Only the arguments that are not None are applied:

    * ``step`` sets ``current_step`` and the step label (``label`` if given,
      otherwise the STEP_LABELS entry, otherwise "Processing").
    * ``label`` without ``step`` replaces just the label.
    * ``entity_count`` is stored as is.
    * ``scraped_pages`` contributes its length as ``page_count``.

    Any exception is logged as a warning and swallowed; a missing
    investigation row is silently ignored.
    """
    try:
        from db.session import get_session
        from db.models import Investigation

        with get_session() as db:
            record = db.query(Investigation).filter_by(id=investigation_id).first()
            if record is None:
                return

            if step is None:
                if label is not None:
                    record.current_step_label = label
            else:
                record.current_step = step
                record.current_step_label = (
                    STEP_LABELS.get(step, "Processing") if label is None else label
                )

            if entity_count is not None:
                record.entity_count = entity_count
            if scraped_pages is not None:
                record.page_count = len(scraped_pages)
            db.commit()
    except Exception as e:
        logger.warning("[%s] _update_progress failed (non-critical): %s", investigation_id, e)
344
+
345
+
346
async def _get_investigation_model_choice(model: Optional[str]) -> tuple[str, Any]:
    """Pick the model for a run and return it with the available choices.

    The requested *model* wins when it is non-empty; otherwise the configured
    ``DEFAULT_MODEL`` is used, and failing that a built-in default.

    Raises:
        RuntimeError: when ``get_model_choices()`` yields nothing.
    """
    from db.session import get_session
    from voidaccess.llm_utils import get_model_choices
    import config as config_module

    # NOTE(review): the session is opened but not referenced in this block —
    # confirm whether get_model_choices() depends on it before removing.
    with get_session() as session:
        model_choices = get_model_choices()
        if not model_choices:
            raise RuntimeError("No LLM models available")

        if model:
            selected_model = model
        else:
            selected_model = (
                config_module.DEFAULT_MODEL
                or "openrouter/deepseek/deepseek-chat"
            )
        return selected_model, model_choices
362
+
363
+
364
+ # ---------------------------------------------------------------------------
365
+ # Background task: run investigation pipeline
366
+ # ---------------------------------------------------------------------------
367
+
368
+
369
def _parse_rate_limit_reset(exc: Exception) -> float:
    """Return how many seconds to wait before retrying after a 429 error.

    The text of *exc* is searched for an ``X-RateLimit-Reset`` entry carrying
    a 13-digit epoch-milliseconds value. The header name is matched
    case-insensitively and may be single-quoted, double-quoted or unquoted,
    so both Python-repr and JSON renderings of the error payload are
    recognised.

    Returns:
        Seconds until the reset time plus a 1 s buffer, never less than 5.0;
        65.0 when no reset timestamp is found (long enough to outlast a
        one-minute rate-limit window).
    """
    import re
    import time

    found = re.search(
        r"""["']?X-RateLimit-Reset["']?\s*:\s*["']?(\d{13})""",
        str(exc),
        flags=re.IGNORECASE,
    )
    if found is None:
        # Fallback: 65s to outlast a 60s/1-min rate-limit window
        return 65.0

    reset_epoch_seconds = int(found.group(1)) / 1000.0
    wait = reset_epoch_seconds - time.time() + 1.0  # +1s buffer
    return max(wait, 5.0)
381
+
382
+
383
async def _llm_with_backoff(fn, *args, max_retries: int = 4, investigation_id: "uuid.UUID | None" = None, **kwargs):
    """Call the blocking function *fn* in a worker thread, retrying on 429s.

    An exception whose text contains "429" triggers a wait (length taken from
    ``_parse_rate_limit_reset``) and another attempt, up to *max_retries*
    attempts in total. Any other exception, or a 429 on the last attempt, is
    re-raised unchanged. When *investigation_id* is given, each wait is also
    published as the investigation's progress label.

    Raises:
        RuntimeError: when the loop finishes without an attempt returning or
            raising (i.e. *max_retries* is not positive).
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await asyncio.to_thread(fn, *args, **kwargs)
        except Exception as exc:
            rate_limited = "429" in str(exc)
            if not rate_limited or attempt >= final_attempt:
                raise

            wait_secs = _parse_rate_limit_reset(exc)
            logger.info(
                "[LLM] Rate limit hit — waiting %.0fs before retry (attempt %d/%d)",
                wait_secs, attempt + 1, max_retries,
            )
            if investigation_id is not None:
                await _update_progress(investigation_id, label=f"Rate limited — retrying in {wait_secs:.0f}s...")
            await asyncio.sleep(wait_secs)
    raise RuntimeError("LLM max retries exceeded")
401
+
402
+
403
+ async def _run_investigation_task(
404
+ investigation_id: str, run_id: str, query: str, model: str, run_crawler: bool
405
+ ) -> None:
406
+ """
407
+ Background task that runs the investigation pipeline.
408
+
409
+ The investigation DB record already exists (created by the HTTP handler) with
410
+ status "pending". This task updates status → processing → completed/failed.
411
+
412
+ CRITICAL: Each DB operation uses its own short-lived session that commits
413
+ and closes immediately. No session is held open across asyncio.to_thread()
414
+ calls, which prevents SQLAlchemy session state corruption and connection
415
+ pool exhaustion.
416
+
417
+ Errors are logged — never propagated to the caller.
418
+ """
419
+ try:
420
+ if not os.getenv("DATABASE_URL"):
421
+ logger.warning("Background investigation: DATABASE_URL not set, skipping persist")
422
+ return
423
+
424
+ from db.models import Investigation
425
+ from db.session import get_session, get_async_session
426
+ from db.queries import update_investigation_summary
427
+ from voidaccess.llm import filter_results, generate_summary, get_llm, refine_query
428
+ from voidaccess.llm_utils import get_model_choices
429
+ from search.search import _search_async as _search_engines_async, _dedupe_links as _search_dedupe, ENGINE_WEIGHTS as _engine_weights
430
+ from scraper.scrape import scrape_multiple, validate_urls_for_scraping
431
+ from extractor import extract_entities_from_pages
432
+
433
+ inv_uuid = uuid.UUID(investigation_id)
434
+
435
+ async with get_async_session() as session:
436
+ result = await session.execute(
437
+ sa_select(Investigation).where(Investigation.id == inv_uuid)
438
+ )
439
+ inv_record = result.scalar_one_or_none()
440
+ inv_user_id = inv_record.user_id if inv_record else None
441
+
442
+ resolved_keys = {}
443
+ if inv_user_id is not None:
444
+ async with get_async_session() as session:
445
+ from utils.user_keys import resolve_api_key
446
+ for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY",
447
+ "OPENROUTER_API_KEY", "GROQ_API_KEY", "OTX_API_KEY", "VT_API_KEY"):
448
+ resolved_keys[key_name] = await resolve_api_key(inv_user_id, key_name, session)
449
+
450
+ # ===== STEP 0: Get model choice and mark as processing =====
451
+ selected_model, _ = await _get_investigation_model_choice(model)
452
+ logger.info(
453
+ "Investigation %s: using model '%s'",
454
+ inv_uuid,
455
+ selected_model,
456
+ )
457
+ await _update_investigation_status(inv_uuid, "processing", model_used=selected_model)
458
+ await _update_progress(inv_uuid, 0)
459
+ logger.info("[%s] Starting investigation: %s", inv_uuid, query)
460
+
461
+ # ===== STEP 1: Query refinement (no session held) =====
462
+ logger.info("[%s] STEP 1: Refining query...", inv_uuid)
463
+ llm_client = None
464
+ refined_query = query
465
+ try:
466
+ llm_client = get_llm(selected_model, api_keys=resolved_keys)
467
+ refined_query = await _llm_with_backoff(refine_query, llm_client, query, investigation_id=inv_uuid)
468
+ logger.info("[%s] Refined query: %s", inv_uuid, refined_query)
469
+ except Exception as exc:
470
+ logger.exception("[%s] Query refinement failed, using original query: %s", inv_uuid, exc)
471
+ refined_query = query
472
+
473
+ def _persist_refined_query():
474
+ with get_session() as session:
475
+ inv = session.query(Investigation).filter_by(id=inv_uuid).first()
476
+ if inv:
477
+ inv.refined_query = refined_query
478
+ session.commit()
479
+
480
+ await asyncio.to_thread(_persist_refined_query)
481
+ await _update_progress(inv_uuid, 1)
482
+ if await _check_cancelled(inv_uuid, investigation_id):
483
+ return
484
+
485
+ # ===== STEP 1.5: Multilingual Query Expansion (no session held) =====
486
+ logger.info("[%s] STEP 1.5: Expanding query to multiple languages...", inv_uuid)
487
+ expanded_queries: dict[str, str] = {"en": refined_query}
488
+ try:
489
+ from i18n.query_expand import expand_query
490
+
491
+ expansion = expand_query(refined_query)
492
+
493
+ if expansion and isinstance(expansion, dict) and len(expansion) > 1:
494
+ expanded_queries = expansion
495
+ lang_count = len(expanded_queries)
496
+ logger.info(
497
+ "[%s] Query expanded to %d languages: %s",
498
+ inv_uuid,
499
+ lang_count,
500
+ list(expanded_queries.keys()),
501
+ )
502
+ else:
503
+ logger.info("[%s] Query expansion returned no results, using English only", inv_uuid)
504
+
505
+ except ImportError:
506
+ logger.info("[%s] i18n module not available, using English only", inv_uuid)
507
+ except Exception as e:
508
+ logger.info("[%s] Query expansion failed (non-fatal): %s", inv_uuid, e)
509
+
510
+ # ===== SEED URL INJECTION (runs before search engine fan-out) =====
511
+ # Curated, known-active .onion intelligence sources are checked first
512
+ # so we always visit relevant leak sites/forums even if search engines
513
+ # don't surface them. These bypass the LLM filter.
514
+ relevant_seeds: list[dict] = []
515
+ try:
516
+ seed_manager = get_seed_manager()
517
+ relevant_seeds = seed_manager.get_relevant_seeds(
518
+ query=query,
519
+ refined_query=refined_query or "",
520
+ max_seeds=10,
521
+ )
522
+ except Exception as exc:
523
+ logger.info("[%s] Seed manager unavailable (non-fatal): %s", inv_uuid, exc)
524
+ relevant_seeds = []
525
+
526
+ seed_urls: list[dict] = []
527
+ if relevant_seeds:
528
+ for s in relevant_seeds:
529
+ url = s.get("url") or ""
530
+ if not url:
531
+ continue
532
+ seed_urls.append({
533
+ "link": url,
534
+ "title": s.get("name", "Seed source"),
535
+ "source": "seed",
536
+ "source_type": "seed",
537
+ "seed_category": s.get("category", "unknown"),
538
+ "seed_tags": s.get("tags", []),
539
+ })
540
+ categories = sorted({s.get("category", "unknown") for s in relevant_seeds})
541
+ logger.info(
542
+ "[%s] Injecting %d seed URLs into scrape queue (categories: %s)",
543
+ inv_uuid,
544
+ len(seed_urls),
545
+ categories,
546
+ )
547
+ await _update_progress(
548
+ inv_uuid,
549
+ step=2,
550
+ label=f"Checking {len(seed_urls)} known intelligence sources + searching Tor engines",
551
+ )
552
+ else:
553
+ logger.info("[%s] No relevant seeds for query", inv_uuid)
554
+
555
+ # ===== STEP 2, 3.5, 4: Parallel Pipeline =====
556
+ logger.info("[%s] STEP 2/3.5/4: Launching Search, Enrichment, and Crawler concurrently...", inv_uuid)
557
+
558
+ async def run_search_and_filter() -> list:
559
+ logger.info("[%s] STEP 2: Searching dark web...", inv_uuid)
560
+
561
+ async def search_single_language(lang_code: str, q: str) -> list[dict]:
562
+ search_query = q.replace(" ", "+")
563
+ logger.info("[%s] Searching [%s]: %s...", inv_uuid, lang_code, search_query[:60])
564
+ try:
565
+ engine_results = await _search_engines_async(search_query)
566
+ all_links: list[dict] = []
567
+ for er in engine_results:
568
+ weight = 0.5
569
+ for known in _engine_weights:
570
+ if known in er.name.lower():
571
+ weight = _engine_weights[known]
572
+ break
573
+ for link in er.links:
574
+ link["source_engine"] = er.name
575
+ link["source_weight"] = weight
576
+ all_links.append(link)
577
+ lang_results = _search_dedupe(all_links)
578
+ lang_results.sort(key=lambda r: r.get("source_weight", 0.5), reverse=True)
579
+ for result in lang_results:
580
+ result["search_language"] = lang_code
581
+ return lang_results
582
+ except Exception as e:
583
+ logger.info("[%s] [%s] search failed: %s", inv_uuid, lang_code, e)
584
+ return []
585
+
586
+ search_tasks = [
587
+ search_single_language(lang, q)
588
+ for lang, q in expanded_queries.items()
589
+ ]
590
+ try:
591
+ results_by_language = await asyncio.wait_for(
592
+ asyncio.gather(*search_tasks, return_exceptions=True),
593
+ timeout=180,
594
+ )
595
+ except asyncio.TimeoutError:
596
+ logger.warning("[%s] Multilingual search timed out after 180s, using partial results", inv_uuid)
597
+ results_by_language = []
598
+
599
+ all_search_results = []
600
+ seen_urls = set()
601
+ for lang_results in results_by_language:
602
+ if isinstance(lang_results, Exception):
603
+ continue
604
+ for result in lang_results:
605
+ url = result.get("link", "")
606
+ normalized = url.lower().rstrip("/").replace("https://", "http://")
607
+ if normalized and normalized not in seen_urls:
608
+ seen_urls.add(normalized)
609
+ all_search_results.append(result)
610
+
611
+ search_results = all_search_results
612
+ logger.info("[%s] Total search results: %d (from %d languages)", inv_uuid, len(search_results), len(expanded_queries))
613
+
614
+ if not search_results:
615
+ logger.info("[%s] WARNING: No search results from any language", inv_uuid)
616
+
617
+ logger.info("[%s] STEP 3: Filtering results...", inv_uuid)
618
+ if llm_client is None:
619
+ filtered_results = list(search_results[:100])
620
+ logger.info("[%s] LLM unavailable; fallback to top %s search results", inv_uuid, len(filtered_results))
621
+ else:
622
+ try:
623
+ filtered_results = await _llm_with_backoff(filter_results, llm_client, refined_query, search_results, investigation_id=inv_uuid)
624
+ except Exception as exc:
625
+ logger.exception("[%s] Filter step failed, falling back: %s", inv_uuid, exc)
626
+ filtered_results = list(search_results[:100])
627
+ logger.info("[%s] Filtered to %s results", inv_uuid, len(filtered_results))
628
+
629
+ _urls_to_scrape = list(filtered_results)
630
+ if len(_urls_to_scrape) < 100:
631
+ current_links = {res.get("link") for res in _urls_to_scrape if res.get("link")}
632
+ for res in search_results:
633
+ if res.get("link") not in current_links:
634
+ _urls_to_scrape.append(res)
635
+ current_links.add(res.get("link"))
636
+ if len(_urls_to_scrape) >= 150:
637
+ break
638
+ return _urls_to_scrape
639
+
640
async def run_enrichment() -> list:
    """Gather threat-intel enrichment pages for the raw and refined queries.

    The raw ``query`` is always looked up; ``refined_query`` is added only
    when it is non-empty and differs from ``query`` after stripping and
    lower-casing. Pages are de-duplicated on their ``url`` (falling back to
    ``link``, then to the empty string).

    Returns:
        The de-duplicated pages in discovery order. A per-query timeout or
        ``Exception`` only skips that query; an ``Exception`` outside the
        per-query loop yields an empty list.
    """
    logger.info("[%s] STEP 3.5: Running threat intel enrichment...", inv_uuid)
    try:
        from sources.enrichment import enrich_investigation

        candidate_queries = [query]
        if refined_query and refined_query.strip().lower() != query.strip().lower():
            candidate_queries.append(refined_query)

        collected: list = []
        known_urls: set = set()
        for current_query in candidate_queries:
            try:
                # Hard 60s cap per enrichment query (the original note says
                # individual requests already carry their own 30s timeouts).
                lookup = enrich_investigation(
                    query=current_query,
                    otx_api_key=resolved_keys.get("OTX_API_KEY") or "",
                )
                pages = await asyncio.wait_for(lookup, timeout=60)
                for page in pages:
                    page_url = page.get("url") or page.get("link") or ""
                    if page_url in known_urls:
                        continue
                    known_urls.add(page_url)
                    collected.append(page)
            except asyncio.TimeoutError:
                logger.warning("[%s] Enrichment query '%s' timed out after 60s", inv_uuid, current_query)
            except Exception as exc:
                logger.info("[%s] Enrichment batch failed for '%s': %s", inv_uuid, current_query, exc)

        logger.info(
            "[%s] Enrichment: %s pages (tried %s queries)",
            inv_uuid,
            len(collected),
            len(candidate_queries),
        )
        return collected
    except Exception as exc:
        logger.info("[%s] Enrichment failed (non-fatal): %s", inv_uuid, exc)
    # Reached only when the outer handler fired.
    return []
676
+
677
async def run_crawler_task() -> list:
    """Run the bounded recursive crawler and return the URLs it discovered.

    Does nothing when ``run_crawler`` is falsy. Otherwise seeds are loaded
    with ``get_seeds`` in a worker thread and handed to ``crawl``.

    Returns:
        A list of ``{"link": ..., "title": "Crawler discovery"}`` dicts, one
        per dict result that has a truthy ``url``. Empty when the crawler is
        disabled, times out, or raises an ``Exception``.
    """
    if not run_crawler:
        logger.info("[%s] STEP 4: Crawler disabled", inv_uuid)
        return []

    try:
        logger.info("[%s] STEP 4: Running recursive crawler...", inv_uuid)
        seeds = await asyncio.to_thread(get_seeds, category="index", query=refined_query)
        start_urls = []
        for seed in seeds:
            if seed.get("url"):
                start_urls.append(seed["url"])

        # max_depth=1 and max_pages=20 keep the crawl bounded; the 120s hard
        # cap stops a stalled crawl from holding up the rest of the pipeline.
        crawl_job = crawl(seed_urls=start_urls, query=refined_query, max_depth=1, max_pages=20)
        outcome = await asyncio.wait_for(crawl_job, timeout=120)
        logger.info(
            "[%s] Crawler: %s pages, %s failed",
            inv_uuid,
            outcome.pages_crawled,
            outcome.pages_failed,
        )

        discovered = []
        for item in outcome.results:
            if isinstance(item, dict) and item.get("url"):
                discovered.append({"link": item.get("url", ""), "title": "Crawler discovery"})
        return discovered
    except asyncio.TimeoutError:
        logger.warning("[%s] Crawler timed out after 120s, continuing without crawler results", inv_uuid)
    except Exception as exc:
        logger.exception("[%s] Crawler failed: %s", inv_uuid, str(exc))
    # Both failure paths fall through to an empty result.
    return []
700
+
701
async def run_paste_scraping_task() -> list:
    """Sweep clearnet paste sites (Pastebin, dpaste, paste.ee, Rentry).

    Opt-out via ``PASTE_SCRAPING_ENABLED=false``. The result cap is read from
    ``PASTE_MAX_RESULTS`` and falls back to 15 when the variable is unset,
    empty, or not an integer.

    Returns:
        The pages returned by ``scrape_paste_sites``, or an empty list when
        the sweep is disabled, exceeds 120s, or raises an ``Exception``.
    """
    if not _paste_scraping_enabled():
        logger.info("[%s] Paste sites: disabled via env var", inv_uuid)
        return []

    try:
        result_cap = int(os.getenv("PASTE_MAX_RESULTS", "15") or 15)
    except ValueError:
        result_cap = 15

    try:
        sweep = scrape_paste_sites(
            query=query,
            refined_query=refined_query or "",
            max_results=result_cap,
        )
        pastes = await asyncio.wait_for(sweep, timeout=120)
        logger.info("[%s] Paste sites: %d pastes found", inv_uuid, len(pastes))
        return pastes
    except asyncio.TimeoutError:
        logger.warning("[%s] Paste scraping timed out after 120s", inv_uuid)
    except Exception as exc:
        logger.info("[%s] Paste scraping failed (non-fatal): %s", inv_uuid, exc)
    # Both failure paths fall through to an empty result.
    return []
732
+
733
async def run_github_scraping_task() -> list:
    """Sweep clearnet GitHub (code search + repo READMEs).

    Opt-out via ``GITHUB_SCRAPING_ENABLED=false``. The result cap is read
    from ``GITHUB_MAX_RESULTS`` and falls back to 15 when the variable is
    unset, empty, or not an integer.

    Returns:
        The pages returned by ``scrape_github``, or an empty list when the
        sweep is disabled, exceeds 180s, or raises an ``Exception``.
    """
    if not _github_scraping_enabled():
        logger.info("[%s] GitHub: disabled via env var", inv_uuid)
        return []

    try:
        result_cap = int(os.getenv("GITHUB_MAX_RESULTS", "15") or 15)
    except ValueError:
        result_cap = 15

    try:
        sweep = scrape_github(
            query=query,
            refined_query=refined_query or "",
            max_results=result_cap,
        )
        files = await asyncio.wait_for(sweep, timeout=180)
        logger.info("[%s] GitHub: %d files found", inv_uuid, len(files))
        return files
    except asyncio.TimeoutError:
        logger.warning("[%s] GitHub scraping timed out after 180s", inv_uuid)
    except Exception as exc:
        logger.info("[%s] GitHub scraping failed (non-fatal): %s", inv_uuid, exc)
    # Both failure paths fall through to an empty result.
    return []
764
+
765
async def run_gitlab_scraping_task() -> list:
    """Sweep clearnet GitLab (code search + project READMEs).

    Opt-out via ``GITLAB_SCRAPING_ENABLED=false``. The result cap is read
    from ``GITLAB_MAX_RESULTS`` and falls back to 15 when the variable is
    unset, empty, or not an integer.

    Returns:
        The pages returned by ``scrape_gitlab``, or an empty list when the
        sweep is disabled, exceeds 180s, or raises an ``Exception``.
    """
    if not _gitlab_scraping_enabled():
        logger.info("[%s] GitLab: disabled via env var", inv_uuid)
        return []

    try:
        result_cap = int(os.getenv("GITLAB_MAX_RESULTS", "15") or 15)
    except ValueError:
        result_cap = 15

    try:
        sweep = scrape_gitlab(
            query=query,
            refined_query=refined_query or "",
            max_results=result_cap,
        )
        hits = await asyncio.wait_for(sweep, timeout=180)
        logger.info("[%s] GitLab: %d results found", inv_uuid, len(hits))
        return hits
    except asyncio.TimeoutError:
        logger.warning("[%s] GitLab scraping timed out after 180s", inv_uuid)
    except Exception as exc:
        logger.info("[%s] GitLab scraping failed (non-fatal): %s", inv_uuid, exc)
    # Both failure paths fall through to an empty result.
    return []
796
+
797
async def run_rss_scraping_task() -> list:
    """Collect articles from RSS feeds for the current query.

    Skipped when ``_rss_scraping_enabled()`` is falsy. The article cap is
    read from ``RSS_MAX_ARTICLES`` and falls back to 20 when the variable is
    unset, empty, or not an integer.

    Returns:
        The pages returned by ``scrape_rss_feeds``, or an empty list when the
        sweep is disabled, exceeds 120s, or raises an ``Exception``.
    """
    if not _rss_scraping_enabled():
        logger.info("[%s] RSS feeds: disabled via env var", inv_uuid)
        return []

    try:
        article_cap = int(os.getenv("RSS_MAX_ARTICLES", "20") or 20)
    except ValueError:
        article_cap = 20

    try:
        sweep = scrape_rss_feeds(
            query=query,
            refined_query=refined_query or "",
            max_results=article_cap,
        )
        articles = await asyncio.wait_for(sweep, timeout=120)
        logger.info("[%s] RSS feeds: %d articles found", inv_uuid, len(articles))
        return articles
    except asyncio.TimeoutError:
        logger.warning("[%s] RSS scraping timed out after 120s", inv_uuid)
    except Exception as exc:
        logger.info("[%s] RSS scraping failed (non-fatal): %s", inv_uuid, exc)
    # Both failure paths fall through to an empty result.
    return []
822
+
823
+ # Hard 5-minute cap on the entire parallel phase (search + enrichment +
824
+ # crawler + paste scraping + github scraping + gitlab scraping + RSS
825
+ # feeds). Each inner function also has its own timeout so partial
826
+ # results are preserved even if only one hangs.
827
+ # return_exceptions=True ensures one failing task never cancels the others.
828
+ try:
829
+ _gr = await asyncio.wait_for(
830
+ asyncio.gather(
831
+ run_search_and_filter(),
832
+ run_enrichment(),
833
+ run_crawler_task(),
834
+ run_paste_scraping_task(),
835
+ run_github_scraping_task(),
836
+ run_gitlab_scraping_task(),
837
+ run_rss_scraping_task(),
838
+ return_exceptions=True,
839
+ ),
840
+ timeout=300,
841
+ )
842
+ except asyncio.TimeoutError:
843
+ logger.warning("[%s] Parallel phase hit 300s hard cap — using empty results", inv_uuid)
844
+ _gr = [[], [], [], [], [], [], []]
845
+
846
+ _source_errors: set[str] = set()
847
+
848
+ if isinstance(_gr[0], Exception):
849
+ logger.warning("[%s] Search+filter task raised: %s", inv_uuid, _gr[0])
850
+ _source_errors.add("tor_search")
851
+ search_urls = []
852
+ else:
853
+ search_urls = _gr[0]
854
+
855
+ if isinstance(_gr[1], Exception):
856
+ logger.warning("[%s] Enrichment task raised: %s", inv_uuid, _gr[1])
857
+ _source_errors.add("enrichment")
858
+ enrichment_pages = []
859
+ else:
860
+ enrichment_pages = _gr[1]
861
+
862
+ if isinstance(_gr[2], Exception):
863
+ logger.warning("[%s] Crawler task raised: %s", inv_uuid, _gr[2])
864
+ crawler_urls = []
865
+ else:
866
+ crawler_urls = _gr[2]
867
+
868
+ if isinstance(_gr[3], Exception):
869
+ logger.warning("[%s] Paste scraping task raised: %s", inv_uuid, _gr[3])
870
+ _source_errors.add("paste_sites")
871
+ paste_pages = []
872
+ else:
873
+ paste_pages = _gr[3]
874
+
875
+ if isinstance(_gr[4], Exception):
876
+ logger.warning("[%s] GitHub scraping task raised: %s", inv_uuid, _gr[4])
877
+ _source_errors.add("github")
878
+ github_pages = []
879
+ else:
880
+ github_pages = _gr[4]
881
+
882
+ if isinstance(_gr[5], Exception):
883
+ logger.warning("[%s] GitLab scraping task raised: %s", inv_uuid, _gr[5])
884
+ _source_errors.add("gitlab")
885
+ gitlab_pages = []
886
+ else:
887
+ gitlab_pages = _gr[5]
888
+
889
+ if isinstance(_gr[6], Exception):
890
+ logger.warning("[%s] RSS scraping task raised: %s", inv_uuid, _gr[6])
891
+ _source_errors.add("rss_feeds")
892
+ rss_pages = []
893
+ else:
894
+ rss_pages = _gr[6]
895
+
896
+ await _update_progress(inv_uuid, 2)
897
+ if await _check_cancelled(inv_uuid, investigation_id):
898
+ return
899
+
900
+ if paste_pages:
901
+ paste_sources_used = sorted({
902
+ p.get("source_name") for p in paste_pages
903
+ if p.get("source_name")
904
+ })
905
+ await _update_progress(
906
+ inv_uuid,
907
+ label=(
908
+ f"Found {len(paste_pages)} paste site results "
909
+ f"({', '.join(paste_sources_used)})"
910
+ ),
911
+ )
912
+
913
+ # ── sources_used: record which sources ran and what they returned ──────
914
+ _otx_key = (resolved_keys.get("OTX_API_KEY") or "").strip()
915
+ _vt_key = os.getenv("VT_API_KEY", "").strip()
916
+ _st_key = os.getenv("SECURITYTRAILS_API_KEY", "").strip()
917
+
918
def _src_status(count: int, error_key: str | None = None) -> str:
    """Build the ``sources_used`` status string for one source.

    Args:
        count: Number of results attributed to the source.
        error_key: Optional key to look up in ``_source_errors``.

    Returns:
        ``"error"`` when ``error_key`` is truthy and present in
        ``_source_errors``; otherwise ``"ok_<count>_results"``, with any
        count that is not greater than zero reported as ``0``.
    """
    if not error_key or error_key not in _source_errors:
        reported = count if count > 0 else 0
        return f"ok_{reported}_results"
    return "error"
922
+
923
+ sources_used: dict[str, str] = {}
924
+
925
+ # Keyed sources — show "skipped_no_key" when the key is absent
926
+ if not _otx_key:
927
+ sources_used["otx"] = "skipped_no_key"
928
+ else:
929
+ n = sum(1 for p in enrichment_pages if p.get("source") == "alienvault_otx")
930
+ sources_used["otx"] = _src_status(n, "enrichment")
931
+
932
+ if not _vt_key:
933
+ sources_used["virustotal"] = "skipped_no_key"
934
+ else:
935
+ n = sum(1 for p in enrichment_pages if p.get("source") == "virustotal")
936
+ sources_used["virustotal"] = _src_status(n, "enrichment")
937
+
938
+ sources_used["securitytrails"] = "skipped_no_key" if not _st_key else "skipped_not_implemented"
939
+
940
+ # Free enrichment sources
941
+ for _skey, _psrc in [
942
+ ("malwarebazaar", "malwarebazaar"),
943
+ ("threatfox", "threatfox"),
944
+ ("urlhaus", "urlhaus"),
945
+ ]:
946
+ n = sum(1 for p in enrichment_pages if p.get("source") == _psrc)
947
+ sources_used[_skey] = _src_status(n, "enrichment")
948
+
949
+ _rl_n = sum(
950
+ 1 for p in enrichment_pages
951
+ if p.get("source") == "ransomware_live" and not p.get("_scrape_seed")
952
+ )
953
+ sources_used["ransomware_live"] = _src_status(_rl_n, "enrichment")
954
+
955
+ _cisa_n = sum(1 for p in enrichment_pages if p.get("source") in ("cisa_kev", "cisa_advisory"))
956
+ sources_used["cisa"] = _src_status(_cisa_n, "enrichment")
957
+
958
+ _shodan_n = sum(1 for p in enrichment_pages if p.get("source") == "shodan_internetdb")
959
+ sources_used["shodan"] = _src_status(_shodan_n, "enrichment")
960
+
961
+ # Tor search
962
+ if "tor_search" in _source_errors:
963
+ sources_used["tor_search"] = "error"
964
+ else:
965
+ n = len(search_urls)
966
+ sources_used["tor_search"] = f"ok_{n}_pages" if n > 0 else "ok_0_pages"
967
+
968
+ # Clearnet scrapers
969
+ if not _github_scraping_enabled():
970
+ sources_used["github"] = "skipped_disabled"
971
+ elif "github" in _source_errors:
972
+ sources_used["github"] = "error"
973
+ else:
974
+ sources_used["github"] = _src_status(len(github_pages))
975
+
976
+ if not _gitlab_scraping_enabled():
977
+ sources_used["gitlab"] = "skipped_disabled"
978
+ elif "gitlab" in _source_errors:
979
+ sources_used["gitlab"] = "error"
980
+ else:
981
+ sources_used["gitlab"] = _src_status(len(gitlab_pages))
982
+
983
+ if not _paste_scraping_enabled():
984
+ sources_used["paste_sites"] = "skipped_disabled"
985
+ elif "paste_sites" in _source_errors:
986
+ sources_used["paste_sites"] = "error"
987
+ else:
988
+ sources_used["paste_sites"] = _src_status(len(paste_pages))
989
+
990
+ if not _rss_scraping_enabled():
991
+ sources_used["rss_feeds"] = "skipped_disabled"
992
+ elif "rss_feeds" in _source_errors:
993
+ sources_used["rss_feeds"] = "error"
994
+ else:
995
+ sources_used["rss_feeds"] = _src_status(len(rss_pages))
996
+
997
+ # DNS, domain, hash, and email reputation placeholders — updated after those steps complete
998
+ sources_used["circl_pdns"] = "pending"
999
+ sources_used["domain_reputation"] = "pending"
1000
+ sources_used["hash_reputation"] = "pending"
1001
+ sources_used["email_reputation"] = "pending"
1002
+ _sources_used_cache[investigation_id] = sources_used
1003
+ # ── end sources_used ──────────────────────────────────────────────────
1004
+
1005
+ if len(search_urls) < 2:
1006
+ logger.warning(
1007
+ "[%s] Filtered results too small (%s INTELLIGENCE pages). "
1008
+ "Query may have returned only directory/index pages. "
1009
+ "Try a more specific query.",
1010
+ inv_uuid,
1011
+ len(search_urls),
1012
+ )
1013
+ no_result_summary = (
1014
+ f"Investigation for '{refined_query}' completed but found insufficient "
1015
+ f"intelligence content. Only {len(search_urls)} qualifying page(s) remained "
1016
+ f"after filtering out directory/index pages. This suggests the query "
1017
+ f"returned primarily link aggregators or marketplace indexes rather than "
1018
+ f"actual threat intelligence content. Try a more specific, targeted query "
1019
+ f"(e.g., specific malware names, actor handles, or infrastructure indicators) "
1020
+ f"instead of broad topic searches."
1021
+ )
1022
+ with get_session() as session:
1023
+ session.query(Investigation).filter_by(id=inv_uuid).update(
1024
+ {"status": "completed_no_results", "summary": no_result_summary, "graph_status": "no_data"}
1025
+ )
1026
+ session.commit()
1027
+ logger.info("[%s] Investigation COMPLETED_NO_RESULTS (run_id=%s)", inv_uuid, run_id)
1028
+ return
1029
+
1030
+ # Seed .onion leak-site URLs discovered by enrichment (e.g. ransomware.live)
1031
+ # into the scrape queue so they get visited even if search engines didn't find them
1032
+ enrichment_onion_seeds = [
1033
+ {"link": p.get("link") or p.get("url"), "title": p.get("title", "Enrichment seed")}
1034
+ for p in enrichment_pages
1035
+ if p.get("_scrape_seed") and ".onion" in (p.get("link") or p.get("url") or "")
1036
+ ]
1037
+ if enrichment_onion_seeds:
1038
+ logger.info(
1039
+ "[%s] Adding %d .onion seeds from enrichment to scrape queue",
1040
+ inv_uuid, len(enrichment_onion_seeds),
1041
+ )
1042
+
1043
+ # Seed URLs go first — they're known intelligence sources and skip the LLM filter
1044
+ all_urls_to_scrape = seed_urls + search_urls + crawler_urls + enrichment_onion_seeds
1045
+ logger.info(
1046
+ "[%s] Total URLs to scrape: %s (%s seeds + %s search + %s crawler + %s enrichment)",
1047
+ inv_uuid,
1048
+ len(all_urls_to_scrape),
1049
+ len(seed_urls),
1050
+ len(search_urls),
1051
+ len(crawler_urls),
1052
+ len(enrichment_onion_seeds),
1053
+ )
1054
+
1055
+ if enrichment_pages:
1056
+ try:
1057
+ from vector.store import store_page
1058
+ for ep in enrichment_pages:
1059
+ u = ep.get("url") or ep.get("link") or ""
1060
+ t = ep.get("text") or ep.get("content") or ""
1061
+ if u and t:
1062
+ store_page(url=u, content=t, metadata={"source": ep.get("source", "enrichment")})
1063
+ except Exception:
1064
+ pass
1065
+
1066
+ # ===== STEP 4.5: Vector Cache Lookup (no session held) =====
1067
+ logger.info(
1068
+ "[%s] STEP 4.5: Checking vector cache for %d URLs...",
1069
+ inv_uuid,
1070
+ len(all_urls_to_scrape),
1071
+ )
1072
+ cached_dict: dict = {}
1073
+ uncached_url_dicts = list(all_urls_to_scrape)
1074
+ try:
1075
+ from vector.store import bulk_check_cache
1076
+
1077
+ url_strings = [
1078
+ u.get("link", u) if isinstance(u, dict) else str(u)
1079
+ for u in all_urls_to_scrape
1080
+ ]
1081
+ cached_pages_list, urls_needing_scrape = bulk_check_cache(
1082
+ url_strings, max_age_hours=24
1083
+ )
1084
+ cached_dict = {p["link"]: p["content"] for p in cached_pages_list}
1085
+ uncached_set = set(urls_needing_scrape)
1086
+ uncached_url_dicts = [
1087
+ u for u in all_urls_to_scrape
1088
+ if (u.get("link", u) if isinstance(u, dict) else str(u))
1089
+ in uncached_set
1090
+ ]
1091
+ logger.info(
1092
+ "[%s] Cache: %d hits, %d misses (need Tor)",
1093
+ inv_uuid,
1094
+ len(cached_dict),
1095
+ len(uncached_url_dicts),
1096
+ )
1097
+ except Exception as exc:
1098
+ logger.info("[%s] Cache check failed (non-fatal): %s", inv_uuid, exc)
1099
+ cached_dict = {}
1100
+ uncached_url_dicts = list(all_urls_to_scrape)
1101
+
1102
+ # ===== STEP 5: Scraping (no session held) =====
1103
+ uncached_url_dicts, ssrf_blocked = validate_urls_for_scraping(uncached_url_dicts)
1104
+ if ssrf_blocked:
1105
+ logger.info(
1106
+ "[%s] SSRF: blocked %d unsafe URLs",
1107
+ inv_uuid,
1108
+ len(ssrf_blocked),
1109
+ )
1110
+ logger.info(
1111
+ "[%s] STEP 5: Scraping %d URLs (skipped %d cached)...",
1112
+ inv_uuid,
1113
+ len(uncached_url_dicts),
1114
+ len(cached_dict),
1115
+ )
1116
+ freshly_scraped = await scrape_multiple(uncached_url_dicts, max_workers=12)
1117
+ await _update_progress(inv_uuid, 4, scraped_pages=freshly_scraped)
1118
+ if await _check_cancelled(inv_uuid, investigation_id):
1119
+ return
1120
+
1121
+ # ===== STEP 5.5: Store new pages in vector cache (no session held) =====
1122
+ try:
1123
+ from vector.store import store_page
1124
+
1125
+ stored_count = 0
1126
+ for page_url, page_text in freshly_scraped.items():
1127
+ if page_text and len(page_text) > 100:
1128
+ if store_page(url=page_url, content=page_text, metadata={"source": "scraper"}):
1129
+ stored_count += 1
1130
+ logger.info("[%s] Stored %d new pages in vector cache", inv_uuid, stored_count)
1131
+ except Exception as exc:
1132
+ logger.info("[%s] Cache store failed (non-fatal): %s", inv_uuid, exc)
1133
+
1134
+ scraped_pages = {**cached_dict, **freshly_scraped}
1135
+
1136
+ # ===== STEP 5.75: Content safety scan (Layer 4) =====
1137
+ from utils.content_safety import sanitize_content, log_content_safety_event
1138
+ clean_pages: dict[str, str] = {}
1139
+ blocked_count = 0
1140
+ for page_url, page_text in scraped_pages.items():
1141
+ clean_text, was_flagged = sanitize_content(page_text)
1142
+ if was_flagged:
1143
+ blocked_count += 1
1144
+ url_hash = hashlib.sha256(page_url.encode()).hexdigest()[:16]
1145
+ logger.warning(
1146
+ "[%s] Page content blocked — prohibited content. Page hash: %s",
1147
+ inv_uuid,
1148
+ url_hash,
1149
+ )
1150
+ log_content_safety_event(
1151
+ event_type="content_blocked",
1152
+ content_hash=url_hash,
1153
+ user_id=inv_user_id,
1154
+ )
1155
+ else:
1156
+ clean_pages[page_url] = clean_text
1157
+ if blocked_count > 0:
1158
+ logger.warning(
1159
+ "[%s] Blocked %d pages for prohibited content",
1160
+ inv_uuid,
1161
+ blocked_count,
1162
+ )
1163
+ scraped_pages = clean_pages
1164
+
1165
+ scraped_count = len(scraped_pages)
1166
+ logger.info(
1167
+ "[%s] Total for extraction: %d pages (%d cached + %d fresh, %d blocked)",
1168
+ inv_uuid,
1169
+ scraped_count,
1170
+ len(cached_dict),
1171
+ len(freshly_scraped),
1172
+ blocked_count,
1173
+ )
1174
+
1175
+ page_records = [
1176
+ {"url": page_url, "text": page_text, "content": page_text}
1177
+ for page_url, page_text in scraped_pages.items()
1178
+ ]
1179
+
1180
+ if enrichment_pages:
1181
+ enrichment_count = 0
1182
+ for ep in enrichment_pages:
1183
+ u = ep.get("url") or ep.get("link") or ""
1184
+ t = ep.get("text") or ep.get("content") or ""
1185
+ if u and (t or "").strip():
1186
+ page_records.append({"url": u, "text": t, "content": t})
1187
+ enrichment_count += 1
1188
+
1189
+ logger.info(
1190
+ "[%s] Total pages for extraction: %s (%s scraped + %s enrichment)",
1191
+ inv_uuid,
1192
+ len(page_records),
1193
+ scraped_count,
1194
+ enrichment_count,
1195
+ )
1196
+ else:
1197
+ logger.info(
1198
+ "[%s] Total pages for extraction: %s (%s scraped + 0 enrichment)",
1199
+ inv_uuid,
1200
+ len(page_records),
1201
+ scraped_count,
1202
+ )
1203
+
1204
+ # Paste-site pages already have fetched text — bypass scraping and
1205
+ # add them directly to the extraction pool, marked with their source.
1206
+ if paste_pages:
1207
+ paste_added = 0
1208
+ for pp in paste_pages:
1209
+ u = pp.get("url") or ""
1210
+ t = pp.get("text_content") or ""
1211
+ if u and t.strip():
1212
+ page_records.append({
1213
+ "url": u,
1214
+ "text": t,
1215
+ "content": t,
1216
+ "source_type": "paste_site",
1217
+ "source_name": pp.get("source_name"),
1218
+ })
1219
+ paste_added += 1
1220
+ logger.info(
1221
+ "[%s] Added %d paste-site pages to extraction pool",
1222
+ inv_uuid,
1223
+ paste_added,
1224
+ )
1225
+
1226
+ # GitHub pages already have fetched text — bypass scraping and add
1227
+ # them directly to the extraction pool, marked source_type="github".
1228
+ if github_pages:
1229
+ github_added = 0
1230
+ for gp in github_pages:
1231
+ u = gp.get("url") or ""
1232
+ t = gp.get("text_content") or ""
1233
+ if u and t.strip():
1234
+ page_records.append({
1235
+ "url": u,
1236
+ "text": t,
1237
+ "content": t,
1238
+ "source_type": "github",
1239
+ "source_name": gp.get("source_name", "GitHub"),
1240
+ })
1241
+ github_added += 1
1242
+ logger.info(
1243
+ "[%s] Added %d GitHub pages to extraction pool",
1244
+ inv_uuid,
1245
+ github_added,
1246
+ )
1247
+ else:
1248
+ logger.info("[%s] GitHub: no results", inv_uuid)
1249
+
1250
+ # GitLab pages already have fetched text — bypass scraping and add
1251
+ # them directly to the extraction pool, marked source_type="gitlab".
1252
+ if gitlab_pages:
1253
+ gitlab_added = 0
1254
+ for glp in gitlab_pages:
1255
+ u = glp.get("url") or ""
1256
+ t = glp.get("text_content") or ""
1257
+ if u and t.strip():
1258
+ page_records.append({
1259
+ "url": u,
1260
+ "text": t,
1261
+ "content": t,
1262
+ "source_type": "gitlab",
1263
+ "source_name": glp.get("source_name", "GitLab"),
1264
+ })
1265
+ gitlab_added += 1
1266
+ logger.info(
1267
+ "[%s] Added %d GitLab pages to extraction pool",
1268
+ inv_uuid,
1269
+ gitlab_added,
1270
+ )
1271
+ else:
1272
+ logger.info("[%s] GitLab: no results", inv_uuid)
1273
+
1274
+ # RSS feed articles are pre-fetched — bypass scraping, add directly
1275
+ # to the extraction pool marked source_type="rss_feed".
1276
+ if rss_pages:
1277
+ rss_added = 0
1278
+ for rp in rss_pages:
1279
+ u = rp.get("url") or ""
1280
+ t = rp.get("text_content") or ""
1281
+ if u and t.strip():
1282
+ page_records.append({
1283
+ "url": u,
1284
+ "text": t,
1285
+ "content": t,
1286
+ "source_type": "rss_feed",
1287
+ "source_name": rp.get("source_name", "RSS Feed"),
1288
+ "title": rp.get("title", ""),
1289
+ "published_at": rp.get("published_at", ""),
1290
+ })
1291
+ rss_added += 1
1292
+ contributing_feeds = sorted({
1293
+ rp.get("source_name", "unknown") for rp in rss_pages
1294
+ if rp.get("source_name")
1295
+ })
1296
+ logger.info(
1297
+ "[%s] Added %d RSS articles to extraction pool (feeds: %s)",
1298
+ inv_uuid,
1299
+ rss_added,
1300
+ contributing_feeds,
1301
+ )
1302
+ else:
1303
+ logger.info("[%s] RSS feeds: no relevant articles", inv_uuid)
1304
+
1305
+ non_empty_records = [r for r in page_records if len((r.get("text") or "").strip()) > 100]
1306
+ logger.info("[%s] Non-empty pages (>100 chars): %s", inv_uuid, len(non_empty_records))
1307
+ if not non_empty_records:
1308
+ first_length = len(page_records[0].get("text", "")) if page_records else 0
1309
+ logger.info("[%s] WARNING: All scraped pages are empty/short", inv_uuid)
1310
+ logger.info("[%s] First page content length: %s", inv_uuid, first_length)
1311
+
1312
+ # ===== STEP 5.7: Detect content languages (no session held) =====
1313
+ try:
1314
+ from i18n.detect import detect_language
1315
+
1316
+ lang_distribution: dict[str, int] = {}
1317
+ for page in page_records:
1318
+ text = page.get("content") or page.get("text") or ""
1319
+ if len(text) >= 50:
1320
+ lang = detect_language(text[:500])
1321
+ if lang:
1322
+ lang_distribution[lang] = lang_distribution.get(lang, 0) + 1
1323
+
1324
+ if lang_distribution:
1325
+ total_pages = sum(lang_distribution.values())
1326
+ non_english = {k: v for k, v in lang_distribution.items() if k != "en"}
1327
+ logger.info(
1328
+ "[%s] Content languages: %s (%d/%d non-English pages)",
1329
+ inv_uuid,
1330
+ lang_distribution,
1331
+ sum(non_english.values()),
1332
+ total_pages,
1333
+ )
1334
+ except Exception as e:
1335
+ logger.info("[%s] Language detection failed (non-fatal): %s", inv_uuid, e)
1336
+
1337
+ # ===== STEP 6: Entity extraction (no session held) =====
1338
+ logger.info("[%s] STEP 6: Extracting entities...", inv_uuid)
1339
+ extraction_input = non_empty_records if non_empty_records else page_records
1340
+ try:
1341
+ extraction_results = await extract_entities_from_pages(
1342
+ extraction_input,
1343
+ investigation_id=inv_uuid,
1344
+ llm=llm_client,
1345
+ run_llm_extraction=True,
1346
+ )
1347
+ total_entities = sum(r.entity_count for r in extraction_results)
1348
+ logger.info("[%s] Extracted %s entities", inv_uuid, total_entities)
1349
+ if total_entities == 0:
1350
+ logger.info("[%s] WARNING: No entities extracted", inv_uuid)
1351
+ logger.info(
1352
+ "[%s] Pages passed to extractor: %s",
1353
+ inv_uuid,
1354
+ len(extraction_input),
1355
+ )
1356
+ except Exception as exc:
1357
+ logger.exception("[%s] Extraction failed: %s", inv_uuid, str(exc))
1358
+ extraction_results = []
1359
+ total_entities = 0
1360
+
1361
+ await _update_progress(inv_uuid, 5, entity_count=total_entities)
1362
+ if await _check_cancelled(inv_uuid, investigation_id):
1363
+ return
1364
+
1365
+ # ===== STEP 6.1: IP Reputation Enrichment =====
1366
+ # Runs after entities are in DB but before the entity cap is applied.
1367
+ # Suppresses GreyNoise-benign IPs and boosts confidence for confirmed C2s.
1368
+ logger.info("[%s] STEP 6.1: Running IP reputation enrichment...", inv_uuid)
1369
+ try:
1370
+ from sources.ip_reputation import enrich_ip_entities as _enrich_ips
1371
+
1372
+ extraction_results, _ip_stats = await asyncio.wait_for(
1373
+ _enrich_ips(extraction_results, inv_uuid),
1374
+ timeout=60,
1375
+ )
1376
+ total_entities = sum(r.entity_count for r in extraction_results)
1377
+ sources_used["ip_reputation"] = _ip_stats.get("ip_reputation", "ok_0_ips")
1378
+ _sources_used_cache[investigation_id] = sources_used
1379
+ logger.info(
1380
+ "[%s] IP reputation: %d checked, %d suppressed, %d C2 confirmed, %d abuse",
1381
+ inv_uuid,
1382
+ _ip_stats.get("checked", 0),
1383
+ _ip_stats.get("suppressed", 0),
1384
+ _ip_stats.get("c2_confirmed", 0),
1385
+ _ip_stats.get("abuse_confirmed", 0),
1386
+ )
1387
+ except asyncio.TimeoutError:
1388
+ logger.warning("[%s] IP reputation enrichment timed out after 60s", inv_uuid)
1389
+ sources_used["ip_reputation"] = "error_timeout"
1390
+ _sources_used_cache[investigation_id] = sources_used
1391
+ except Exception as _ip_exc:
1392
+ logger.info("[%s] IP reputation enrichment failed (non-fatal): %s", inv_uuid, _ip_exc)
1393
+ sources_used["ip_reputation"] = "error"
1394
+ _sources_used_cache[investigation_id] = sources_used
1395
+
1396
+ # ===== STEP 6.5: Cross-reference against seed data (short-lived session) =====
1397
+ logger.info("[%s] STEP 6.5: Cross-referencing with historical data...", inv_uuid)
1398
+ try:
1399
+ from db.queries import cross_reference_with_seeds
1400
+
1401
+ with get_session() as session:
1402
+ seed_matches = cross_reference_with_seeds(session, inv_uuid)
1403
+ logger.info("[%s] Found %s historical matches", inv_uuid, seed_matches)
1404
+ except Exception as e:
1405
+ logger.info("[%s] Cross-reference failed (non-fatal): %s", inv_uuid, e)
1406
+
1407
+ # ===== STEP 6.6: Build Stylometry Profiles (wrapped in to_thread with own session) =====
1408
+ logger.info(f"[{inv_uuid}] STEP 6.6: Building actor style profiles...")
1409
+ try:
1410
+ profiles_built = await asyncio.to_thread(
1411
+ _build_investigation_profiles,
1412
+ inv_uuid,
1413
+ )
1414
+ logger.info(f"[{inv_uuid}] Built {profiles_built} actor profiles")
1415
+ except Exception as e:
1416
+ logger.info(f"[{inv_uuid}] Profile building failed (non-fatal): {e}")
1417
+
1418
+ # ===== STEP 6.7: Blockchain Wallet Enrichment (wrapped in to_thread with own session) =====
1419
+ logger.info(f"[{inv_uuid}] STEP 6.7: Enriching wallet entities...")
1420
+ try:
1421
+ from sources.blockchain import enrich_wallets_for_investigation
1422
+ from config import BLOCKCYPHER_TOKEN, ETHERSCAN_API_KEY
1423
+
1424
+ blockchain_stats = await asyncio.to_thread(
1425
+ _enrich_wallets_sync,
1426
+ inv_uuid,
1427
+ BLOCKCYPHER_TOKEN,
1428
+ ETHERSCAN_API_KEY,
1429
+ )
1430
+
1431
+ logger.info(
1432
+ f"[{inv_uuid}] Blockchain enrichment: "
1433
+ f"{blockchain_stats['successful_lookups']}/{blockchain_stats['wallets_looked_up']} lookups successful, "
1434
+ f"{blockchain_stats['edges_created']} PAID_TO edges created, "
1435
+ f"{blockchain_stats['connected_wallets_found']} connected wallets found"
1436
+ )
1437
+ except Exception as e:
1438
+ logger.info(f"[{inv_uuid}] Blockchain enrichment failed (non-fatal): {e}")
1439
+
1440
+ await _update_progress(inv_uuid, 6)
1441
+
1442
+ # ===== STEP 6.8: DNS/WHOIS Enrichment (no session held) =====
1443
+ logger.info("[%s] STEP 6.8: Running DNS/WHOIS enrichment...", inv_uuid)
1444
+ try:
1445
+ from sources.enrichment import run_dns_enrichment
1446
+
1447
+ # Build a flat list of entity dicts from extraction results for DNS lookup.
1448
+ # NormalizedEntity dataclasses are converted to the dict format expected by
1449
+ # enrich_with_dns (entity_type + canonical_value/value).
1450
+ extracted_entities_for_dns: list[dict] = []
1451
+ for _r in extraction_results:
1452
+ for _e in getattr(_r, "entities", []):
1453
+ if hasattr(_e, "entity_type"):
1454
+ extracted_entities_for_dns.append({
1455
+ "entity_type": _e.entity_type,
1456
+ "canonical_value": _e.value,
1457
+ "value": _e.value,
1458
+ "confidence": _e.confidence,
1459
+ })
1460
+ elif isinstance(_e, dict):
1461
+ extracted_entities_for_dns.append(_e)
1462
+
1463
+ dns_results = await asyncio.wait_for(
1464
+ run_dns_enrichment(extracted_entities_for_dns),
1465
+ timeout=120,
1466
+ )
1467
+
1468
+ new_dns_entities = dns_results.get("new_entities", [])
1469
+ if new_dns_entities:
1470
+ logger.info(
1471
+ "[%s] DNS enrichment: %d new entities discovered",
1472
+ inv_uuid,
1473
+ len(new_dns_entities),
1474
+ )
1475
+
1476
+ clusters = dns_results.get("infrastructure_clusters", [])
1477
+ if clusters:
1478
+ logger.info(
1479
+ "[%s] Infrastructure clusters found: %d",
1480
+ inv_uuid,
1481
+ len(clusters),
1482
+ )
1483
+ for cluster in clusters:
1484
+ logger.info("[%s] %s", inv_uuid, cluster["description"])
1485
+ _infra_cluster_cache[investigation_id] = clusters
1486
+
1487
+ _dns_ent_count = len(new_dns_entities)
1488
+ sources_used["circl_pdns"] = (
1489
+ f"ok_{_dns_ent_count}_enrichments" if _dns_ent_count > 0 else "ok_0_enrichments"
1490
+ )
1491
+ _sources_used_cache[investigation_id] = sources_used
1492
+
1493
+ except asyncio.TimeoutError:
1494
+ logger.warning("[%s] DNS enrichment timed out after 120s", inv_uuid)
1495
+ sources_used["circl_pdns"] = "error"
1496
+ _sources_used_cache[investigation_id] = sources_used
1497
+ except Exception as _dns_exc:
1498
+ logger.info("[%s] DNS enrichment failed (non-fatal): %s", inv_uuid, _dns_exc)
1499
+ sources_used["circl_pdns"] = "error"
1500
+ _sources_used_cache[investigation_id] = sources_used
1501
+
1502
+ # ===== STEP 6.2: Domain Reputation Enrichment =====
1503
+ # Runs after DNS enrichment. Enriches DOMAIN entities with:
1504
+ # crt.sh (subdomain enumeration via certificate transparency)
1505
+ # URLScan.io (live scan data, malicious indicators, communicating IPs)
1506
+ # Wayback Machine (historical snapshots for taken-down domains)
1507
+ # Non-fatal: if all three sources fail for a domain, entity is unchanged.
1508
+ logger.info("[%s] STEP 6.2: Running domain reputation enrichment...", inv_uuid)
1509
+ try:
1510
+ from sources.domain_reputation import enrich_domain_entities as _enrich_domains
1511
+
1512
+ extraction_results, _dom_stats = await asyncio.wait_for(
1513
+ _enrich_domains(extraction_results, inv_uuid),
1514
+ timeout=120,
1515
+ )
1516
+ sources_used["domain_reputation"] = _dom_stats.get(
1517
+ "domain_reputation", "ok_0_domains"
1518
+ )
1519
+ _sources_used_cache[investigation_id] = sources_used
1520
+ logger.info(
1521
+ "[%s] Domain reputation: %d domains, %d CT records, %d malicious, %d archived",
1522
+ inv_uuid,
1523
+ _dom_stats.get("domains_checked", 0),
1524
+ _dom_stats.get("ct_records", 0),
1525
+ _dom_stats.get("urlscan_malicious", 0),
1526
+ _dom_stats.get("wayback_archived", 0),
1527
+ )
1528
+ except asyncio.TimeoutError:
1529
+ logger.warning("[%s] Domain reputation enrichment timed out after 120s", inv_uuid)
1530
+ sources_used["domain_reputation"] = "error_timeout"
1531
+ _sources_used_cache[investigation_id] = sources_used
1532
+ except Exception as _dom_exc:
1533
+ logger.info("[%s] Domain reputation enrichment failed (non-fatal): %s", inv_uuid, _dom_exc)
1534
+ sources_used["domain_reputation"] = "error"
1535
+ _sources_used_cache[investigation_id] = sources_used
1536
+
1537
+ # ===== STEP 6.3: Hash Reputation Enrichment =====
1538
+ # Runs after domain reputation. Enriches FILE_HASH_* entities with:
1539
+ # Hybrid Analysis (behavioral sandbox — requires HYBRID_ANALYSIS_API_KEY)
1540
+ # MalwareBazaar (family classification — free, no auth)
1541
+ # ThreatFox (IOC database — free, no auth)
1542
+ # VirusTotal extended (AV detections + sandbox IOCs — requires VT_API_KEY)
1543
+ # Hashes are never suppressed. Non-fatal: 90s timeout.
1544
+ logger.info("[%s] STEP 6.3: Running hash reputation enrichment...", inv_uuid)
1545
+ try:
1546
+ from sources.hash_reputation import enrich_hash_entities as _enrich_hashes
1547
+
1548
+ extraction_results, _hash_stats = await asyncio.wait_for(
1549
+ _enrich_hashes(extraction_results, inv_uuid),
1550
+ timeout=90,
1551
+ )
1552
+ sources_used["hash_reputation"] = _hash_stats.get("hash_reputation", "ok_0_hashes")
1553
+ _sources_used_cache[investigation_id] = sources_used
1554
+ logger.info(
1555
+ "[%s] Hash reputation: %d checked, %d malicious, %d suspicious, "
1556
+ "%d families, %d new entities",
1557
+ inv_uuid,
1558
+ _hash_stats.get("hashes_checked", 0),
1559
+ _hash_stats.get("malicious", 0),
1560
+ _hash_stats.get("suspicious", 0),
1561
+ _hash_stats.get("malware_families_found", 0),
1562
+ _hash_stats.get("new_entities_discovered", 0),
1563
+ )
1564
+ except asyncio.TimeoutError:
1565
+ logger.warning("[%s] Hash reputation enrichment timed out after 90s", inv_uuid)
1566
+ sources_used["hash_reputation"] = "error_timeout"
1567
+ _sources_used_cache[investigation_id] = sources_used
1568
+ except Exception as _hash_exc:
1569
+ logger.info("[%s] Hash reputation enrichment failed (non-fatal): %s", inv_uuid, _hash_exc)
1570
+ sources_used["hash_reputation"] = "error"
1571
+ _sources_used_cache[investigation_id] = sources_used
1572
+
1573
+ # ===== STEP 6.4: Email Reputation Enrichment =====
1574
+ # Runs after hash reputation. Enriches EMAIL_ADDRESS entities with:
1575
+ # HIBP (breach history — requires HIBP_API_KEY, paid $3.50/mo)
1576
+ # EmailRep.io (reputation scoring — works without key)
1577
+ # Disposable domain blocklist (local check, no auth)
1578
+ # Domain cross-reference (custom email domains added as DOMAIN entities)
1579
+ # Non-fatal: 60s timeout.
1580
+ logger.info("[%s] STEP 6.4: Running email reputation enrichment...", inv_uuid)
1581
+ try:
1582
+ from sources.email_reputation import enrich_email_entities as _enrich_emails
1583
+
1584
+ extraction_results, _email_stats = await asyncio.wait_for(
1585
+ _enrich_emails(extraction_results, inv_uuid),
1586
+ timeout=60,
1587
+ )
1588
+ sources_used["email_reputation"] = _email_stats.get(
1589
+ "email_reputation", "ok_0_emails"
1590
+ )
1591
+ _sources_used_cache[investigation_id] = sources_used
1592
+ logger.info(
1593
+ "[%s] Email reputation: %d checked, %d breached, %d passwords exposed, "
1594
+ "%d disposable, %d malicious",
1595
+ inv_uuid,
1596
+ _email_stats.get("emails_checked", 0),
1597
+ _email_stats.get("breached", 0),
1598
+ _email_stats.get("password_exposed", 0),
1599
+ _email_stats.get("disposable", 0),
1600
+ _email_stats.get("malicious", 0),
1601
+ )
1602
+ except asyncio.TimeoutError:
1603
+ logger.warning("[%s] Email reputation enrichment timed out after 60s", inv_uuid)
1604
+ sources_used["email_reputation"] = "error_timeout"
1605
+ _sources_used_cache[investigation_id] = sources_used
1606
+ except Exception as _email_exc:
1607
+ logger.info(
1608
+ "[%s] Email reputation enrichment failed (non-fatal): %s", inv_uuid, _email_exc
1609
+ )
1610
+ sources_used["email_reputation"] = "error"
1611
+ _sources_used_cache[investigation_id] = sources_used
1612
+
1613
+ # ===== STEP 7: Graph building (wrapped in to_thread with own session) =====
1614
+ logger.info("[%s] STEP 7: Building graph...", inv_uuid)
1615
+ try:
1616
+ from graph.builder import build_graph_from_db, persist_graph_edges
1617
+
1618
+ graph_obj = await asyncio.to_thread(build_graph_from_db, investigation_id=inv_uuid)
1619
+ node_count = len(graph_obj.nodes())
1620
+ edge_count = len(graph_obj.edges())
1621
+ logger.info(
1622
+ "[%s] Graph: %s nodes, %s edges",
1623
+ inv_uuid,
1624
+ node_count,
1625
+ edge_count,
1626
+ )
1627
+
1628
+ try:
1629
+ persist_result = await asyncio.to_thread(
1630
+ _persist_graph_edges_sync,
1631
+ graph_obj,
1632
+ inv_uuid,
1633
+ )
1634
+ graph_status = persist_result.get("status", "written")
1635
+ edges_written = persist_result.get("edges_written", 0)
1636
+ logger.info(
1637
+ "[%s] Graph edges persisted: %s (%s)",
1638
+ inv_uuid,
1639
+ edges_written,
1640
+ graph_status,
1641
+ )
1642
+
1643
+ new_graph_status = "skipped_overflow" if graph_status == "skipped_overflow" else "built"
1644
+ with get_session() as session:
1645
+ session.query(Investigation).filter_by(id=inv_uuid).update(
1646
+ {"graph_status": new_graph_status}
1647
+ )
1648
+ session.commit()
1649
+ except Exception as e:
1650
+ logger.info("[%s] Edge persistence failed (non-fatal): %s", inv_uuid, e)
1651
+
1652
+ except Exception as exc:
1653
+ logger.exception("[%s] Graph building failed: %s", inv_uuid, str(exc))
1654
+
1655
+ await _update_progress(inv_uuid, 7)
1656
+
1657
+ # ===== STEP 8: Summary (no session held) =====
1658
+ logger.info("[%s] STEP 8: Generating summary (%d pages available)...", inv_uuid, len(page_records))
1659
+ if llm_client is None:
1660
+ summary = (
1661
+ f"Investigation completed without LLM summary. "
1662
+ f"Scraped {scraped_count} pages; extracted {total_entities} entities."
1663
+ )
1664
+ else:
1665
+ try:
1666
+ summary_entities = []
1667
+ if extraction_results:
1668
+ for result in extraction_results:
1669
+ summary_entities.extend(result.entities)
1670
+
1671
+ summary = await _llm_with_backoff(
1672
+ generate_summary,
1673
+ llm=llm_client,
1674
+ query=refined_query,
1675
+ content=page_records,
1676
+ entities=summary_entities if summary_entities else None,
1677
+ investigation_id=inv_uuid,
1678
+ )
1679
+ logger.info("[%s] Summary generated (%d chars)", inv_uuid, len(summary or ""))
1680
+ except Exception as exc:
1681
+ logger.exception("[%s] Summary generation failed, using fallback summary: %s", inv_uuid, exc)
1682
+ summary = (
1683
+ f"Investigation complete for '{refined_query}'. "
1684
+ f"Analysis pipeline completed successfully, but summary generation failed: {exc}."
1685
+ )
1686
+
1687
+ logger.info("[%s] Summary preview: %s", inv_uuid, (summary or "")[:100])
1688
+
1689
+ await _update_progress(inv_uuid, 8)
1690
+
1691
+ # ===== Final: Update summary and mark completed (short-lived session) =====
1692
+ with get_session() as session:
1693
+ update_investigation_summary(session, inv_uuid, summary)
1694
+ session.query(Investigation).filter_by(id=inv_uuid).update(
1695
+ {"status": "completed"}
1696
+ )
1697
+ session.commit()
1698
+ await _update_progress(inv_uuid, 9)
1699
+ logger.info("[%s] Investigation COMPLETED (run_id=%s)", inv_uuid, run_id)
1700
+
1701
+ except Exception as exc:
1702
+ logger.exception("[%s] Investigation FAILED with exception: %s", investigation_id, exc)
1703
+ try:
1704
+ from db.models import Investigation
1705
+ from db.session import get_session
1706
+
1707
+ with get_session() as session:
1708
+ session.query(Investigation).filter_by(id=uuid.UUID(investigation_id)).update(
1709
+ {"status": "failed", "summary": f"Investigation failed: {exc!s}"[:500]}
1710
+ )
1711
+ session.commit()
1712
+ except Exception as update_exc:
1713
+ logger.warning("Failed to persist investigation failure status: %s", update_exc)
1714
+
1715
+
1716
def _enrich_wallets_sync(investigation_id, blockcypher_token, etherscan_key):
    """Run blockchain wallet enrichment inside a session owned by this call.

    Synchronous helper: it opens its own DB session via ``get_session()``,
    delegates to ``enrich_wallets_for_investigation`` (capped at 10 wallets)
    and hands back whatever that function returns.
    """
    from sources.blockchain import enrich_wallets_for_investigation
    from db.session import get_session

    # Everything except the session is known up front; the session is bound
    # only for the duration of the call below.
    enrichment_kwargs = {
        "investigation_id": investigation_id,
        "blockcypher_token": blockcypher_token,
        "etherscan_key": etherscan_key,
        "max_wallets": 10,
    }
    with get_session() as db_session:
        outcome = enrich_wallets_for_investigation(
            session=db_session,
            **enrichment_kwargs,
        )
    return outcome
1729
+
1730
+
1731
def _persist_graph_edges_sync(graph_obj, investigation_id):
    """Persist the edges of ``graph_obj`` using a session owned by this call.

    Synchronous helper: opens its own DB session via ``get_session()`` and
    returns the result of ``persist_graph_edges`` unchanged.
    """
    from graph.builder import persist_graph_edges
    from db.session import get_session

    with get_session() as db_session:
        outcome = persist_graph_edges(graph_obj, investigation_id, db_session)
    return outcome
1742
+
1743
+
1744
+ # ---------------------------------------------------------------------------
1745
+ # Routes
1746
+ # ---------------------------------------------------------------------------
1747
+
1748
+
1749
@router.post("")
@_rate_limit("3/minute")
async def create_investigation(
    request: Request,
    body: InvestigationRequest,
    background_tasks: BackgroundTasks,
    current_user: CurrentUser = Depends(require_password_not_reset_pending),
) -> dict:
    """Queue a new investigation and return its run identifier.

    Flow:
      1. The query is screened with ``is_blocked_query``; a blocked query is
         logged (only a truncated SHA-256 of the query is recorded) and
         rejected with HTTP 400.
      2. When ``DATABASE_URL`` is set, the investigation row is written
         before the response is sent, with status ``"pending"``; a failure
         to persist yields HTTP 500. Without a database a random
         investigation id is generated instead.
      3. ``_run_investigation_task`` is scheduled as a background task.

    Returns a dict with ``run_id``, ``status`` (always ``"pending"``) and the
    submitted ``query``.
    """
    from utils.content_safety import is_blocked_query, log_content_safety_event

    is_prohibited, _reason = is_blocked_query(body.query)
    if is_prohibited:
        logger.warning(
            "Investigation blocked — prohibited content detected. User: %s",
            current_user.user.id,
        )
        query_digest = hashlib.sha256(body.query.encode()).hexdigest()
        log_content_safety_event(
            event_type="query_blocked",
            content_hash=query_digest[:16],
            user_id=current_user.user.id,
        )
        rejection = {
            "error": "prohibited_content",
            "message": (
                "This query cannot be processed. VoidAccess is intended "
                "for legitimate security research only."
            ),
            "code": "CONTENT_BLOCKED",
        }
        raise HTTPException(status_code=400, detail=rejection)

    run_id = str(uuid.uuid4())

    if not os.getenv("DATABASE_URL"):
        # No database configured: the pipeline still needs an identifier.
        investigation_id = str(uuid.uuid4())
    else:
        try:
            from db.session import get_session
            from db.queries import create_investigation as db_create

            with get_session() as db_session:
                record = db_create(
                    db_session,
                    query=body.query,
                    user_id=current_user.user.id,
                )
                record.run_id = uuid.UUID(run_id)
                record.status = "pending"
                db_session.commit()
                investigation_id = str(record.id)
        except Exception as exc:
            logger.exception("Failed to create investigation record: %s", exc)
            raise HTTPException(
                status_code=500,
                detail=f"Could not persist investigation: {exc!s}"[:300],
            )

    task_kwargs = {
        "investigation_id": investigation_id,
        "run_id": run_id,
        "query": body.query,
        "model": body.model,
        "run_crawler": body.run_crawler,
    }
    background_tasks.add_task(_run_investigation_task, **task_kwargs)
    return {"run_id": run_id, "status": "pending", "query": body.query}
1819
+
1820
+
1821
@router.get("")
async def list_investigations(
    limit: int = Query(default=20, ge=1, le=200),
    offset: int = Query(default=0, ge=0),
    current_user: "CurrentUser" = Depends(get_current_user),
) -> list[dict]:
    """Return a paginated list of the caller's investigation summaries.

    Seed investigations (``is_seed``) are excluded and results are ordered
    newest first. Returns an empty list when ``DATABASE_URL`` is not set or
    when the query fails (the failure is logged, not raised).
    """
    if not os.getenv("DATABASE_URL"):
        return []
    try:
        from db.session import get_session
        from db.models import Investigation

        # The other routes in this module identify the caller through
        # ``current_user.user.id``. This handler used ``current_user.id``;
        # if that attribute is absent the AttributeError is swallowed by the
        # broad ``except`` below and the user silently gets an empty list.
        owner_id = current_user.user.id

        with get_session() as session:
            invs = (
                session.query(Investigation)
                .filter(Investigation.is_seed == False)  # noqa: E712 - SQL expression, not a Python comparison
                .filter(Investigation.user_id == owner_id)
                .order_by(Investigation.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            return [
                {
                    "id": str(inv.id),
                    "run_id": str(inv.run_id),
                    "query": inv.query,
                    "status": inv.status,
                    "model_used": inv.model_used,
                    "created_at": inv.created_at.isoformat() if inv.created_at else None,
                    "entity_count": inv.entity_count or 0,
                    "page_count": inv.page_count or 0,
                }
                for inv in invs
            ]
    except Exception as exc:
        # Best-effort listing: log the failure and degrade to an empty page.
        logger.exception("list_investigations failed: %s", exc)
        return []
1860
+
1861
+
1862
@router.post("/{investigation_id}/cancel")
async def cancel_investigation(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(require_password_not_reset_pending),
) -> dict:
    """Flag an investigation for cooperative cancellation.

    Only a flag is set here (via ``_set_cancelled``); the response is the
    current investigation record, so callers should poll the status until it
    becomes ``cancelled``.

    Raises:
        HTTPException 503: no database is configured.
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no matching investigation.
        HTTPException 403: the investigation belongs to another user.
        HTTPException 409: the investigation is already in a terminal state.
        HTTPException 500: any other failure while looking it up.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    from db.session import get_session
    from db.models import Investigation
    from db.queries import get_investigation_by_id_or_run

    terminal_states = {"completed", "failed", "cancelled", "completed_no_results"}

    try:
        with get_session() as db_session:
            record = get_investigation_by_id_or_run(db_session, inv_uuid)
            if record is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            if str(record.user_id) != str(current_user.user.id):
                raise HTTPException(status_code=403, detail="Forbidden")
            if record.status in terminal_states:
                raise HTTPException(
                    status_code=409,
                    detail=f"Investigation cannot be cancelled (current status: {record.status})",
                )
            # Flag both the identifier the client supplied and the row's own
            # id - the pipeline task uses inv.id.
            for cancel_key in (investigation_id, str(record.id)):
                _set_cancelled(cancel_key)
            logger.info(
                "[%s] Cancellation requested by user %s",
                inv_uuid,
                current_user.user.id,
            )
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("cancel_investigation failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}"[:300])

    return _get_db_investigation(investigation_id)
1913
+
1914
+
1915
@router.get("/{investigation_id}/progress")
async def investigation_progress(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(get_current_user),
) -> StreamingResponse:
    """
    SSE stream of investigation pipeline progress.

    The investigation row is polled every 5 seconds (at most 360 polls). An
    event is emitted whenever the step or status changes, and a final event
    carrying ``done: True`` is sent once a terminal status is observed.

    Raises:
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no investigation with that id.
        HTTPException 403: the investigation belongs to another user.
    """
    from db.session import get_async_session
    from db.models import Investigation

    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    # Verify existence and ownership before opening the stream
    async with get_async_session() as session:
        result = await session.execute(sa_select(Investigation).where(Investigation.id == inv_uuid))
        inv_check = result.scalar_one_or_none()
        if inv_check is None:
            raise HTTPException(status_code=404, detail="Investigation not found")
        if str(inv_check.user_id) != str(current_user.user.id):
            raise HTTPException(status_code=403, detail="Forbidden")

    async def event_stream():
        last_step = None
        last_status = None
        timeout_count = 0
        max_timeout = 360
        data: dict = {}

        while timeout_count < max_timeout:
            try:
                async with get_async_session() as session:
                    result = await session.execute(
                        sa_select(Investigation).where(Investigation.id == inv_uuid)
                    )
                    inv = result.scalar_one_or_none()
            except Exception as poll_exc:
                # A failed poll still ends the stream, but it is no longer
                # silent: record why the client was disconnected.
                logger.warning(
                    "[%s] Progress stream closed: polling the investigation failed: %s",
                    inv_uuid,
                    poll_exc,
                )
                break

            if inv is None:
                yield f"data: {json.dumps({'error': 'not_found'})}\n\n"
                break

            step = inv.current_step or 0
            label = inv.current_step_label or ""
            status = inv.status

            # Only emit when something the client displays has changed.
            if step != last_step or status != last_status:
                data = {
                    "step": step,
                    "total_steps": 13,
                    "label": label,
                    "progress": int((step / 13) * 100),
                    "status": status,
                    "entity_count": inv.entity_count or 0,
                    "page_count": inv.page_count or 0,
                }
                yield f"data: {json.dumps(data)}\n\n"
                last_step = step
                last_status = status

            if status in ("completed", "failed", "completed_no_results", "cancelled"):
                yield f"data: {json.dumps({**data, 'done': True})}\n\n"
                break

            timeout_count += 1
            await asyncio.sleep(5)

        yield ": stream closed\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
1997
+
1998
+
1999
@router.get("/{investigation_id}/analysis/temporal")
async def get_temporal_analysis(investigation_id: str) -> dict:
    """
    Compute activity-over-time statistics for an investigation.

    Timestamps are the ``posted_at`` values of the pages referenced by the
    investigation's entities; pages without ``posted_at`` are ignored. The
    response contains per-hour and per-weekday counts, anomalies, silence
    breaks, the peak hour/day and the covered timespan.

    When there is too little data the handler returns a 200 payload with
    ``{"error": "insufficient_data", ...}`` instead of failing; unexpected
    errors are reported as ``{"error": "analysis_failed", ...}``.

    NOTE(review): unlike the sibling routes in this module, this handler
    declares no ``current_user`` dependency and performs no ownership check -
    confirm whether that is intentional.

    Raises:
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 503: no database is configured.
        HTTPException 404: no matching investigation.
    """
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    def _insufficient(message: str, **extra) -> dict:
        # Shared shape of every "not enough data" response.
        return {
            "investigation_id": investigation_id,
            "error": "insufficient_data",
            "message": message,
            **extra,
        }

    try:
        from db.session import get_session
        from db.models import Entity, Page
        from db.queries import get_investigation_by_id_or_run
        from collections import defaultdict
        from analysis.temporal import detect_anomalies, detect_silence_breaks, Z_SCORE_THRESHOLD

        with get_session() as db_session:
            record = get_investigation_by_id_or_run(db_session, inv_uuid)
            if record is None:
                raise HTTPException(status_code=404, detail="Investigation not found")

            entity_rows = (
                db_session.query(Entity)
                .filter(Entity.investigation_id == record.id)
                .all()
            )
            if not entity_rows:
                return _insufficient("No entities found for this investigation")

            page_ids = list(
                {row.page_id for row in entity_rows if row.page_id is not None}
            )
            if not page_ids:
                return _insufficient("No page timestamps available")

            page_rows = db_session.query(Page).filter(Page.id.in_(page_ids)).all()
            timestamps = [
                page.posted_at for page in page_rows if page.posted_at is not None
            ]
            skipped_no_posted_at = len(page_rows) - len(timestamps)
            if skipped_no_posted_at > 0:
                logger.debug(
                    "Temporal analysis: skipped %d pages due to missing posted_at (using content timestamp, not scrape time)",
                    skipped_no_posted_at,
                )

            if len(timestamps) < 3:
                return _insufficient(
                    f"Only {len(timestamps)} timestamps available (minimum 3)",
                    data_points=len(timestamps),
                )

            # One pass over the timestamps fills all three histograms.
            hour_hits: dict[int, int] = defaultdict(int)
            weekday_hits: dict[int, int] = defaultdict(int)
            daily_counts: dict = defaultdict(int)
            for stamp in timestamps:
                hour_hits[stamp.hour] += 1
                weekday_hits[stamp.weekday()] += 1
                daily_counts[stamp.date()] += 1

            activity_by_hour = {
                str(hour): int(hour_hits.get(hour, 0)) for hour in range(24)
            }
            day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
            activity_by_day = {
                name: int(weekday_hits.get(index, 0))
                for index, name in enumerate(day_names)
            }

            peak_hour_key = max(activity_by_hour, key=activity_by_hour.get, default=None)
            peak_day_key = max(activity_by_day, key=activity_by_day.get, default=None)

            all_dates = sorted(daily_counts)
            timeline = [
                {"date": day, "count": daily_counts[day]} for day in all_dates
            ]

            anomalies = []
            for found in detect_anomalies(timeline, z_threshold=Z_SCORE_THRESHOLD):
                direction = "spike" if found["z_score"] > 0 else "drop"
                anomalies.append(
                    {
                        "date": str(found["date"]),
                        "count": found["count"],
                        "z_score": round(found["z_score"], 2),
                        "type": found["type"],
                        "description": (
                            f"Activity {direction}: "
                            f"z-score {found['z_score']:.1f}"
                        ),
                    }
                )

            silence_breaks = []
            for gap in detect_silence_breaks(timeline, silence_days=7):
                if gap["gap_days"] >= 14:
                    significance = "high"
                else:
                    significance = "medium"
                silence_breaks.append(
                    {
                        "before": str(gap["silent_from"]),
                        "after": str(gap["silent_to"]),
                        "gap_days": gap["gap_days"],
                        "significance": significance,
                    }
                )

            if len(all_dates) >= 2:
                timespan_days = (all_dates[-1] - all_dates[0]).days
            else:
                timespan_days = 0

            return {
                "investigation_id": investigation_id,
                "activity_by_hour": activity_by_hour,
                "activity_by_day": activity_by_day,
                "anomalies": anomalies,
                "silence_breaks": silence_breaks,
                "peak_hour": int(peak_hour_key) if peak_hour_key is not None else None,
                "peak_day": peak_day_key,
                "total_timespan_days": timespan_days,
                "data_points": len(timestamps),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.warning("get_temporal_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
2135
+
2136
+
2137
@router.get("/{investigation_id}")
async def get_investigation(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(get_current_user),
) -> dict:
    """Return the investigation record produced by ``_get_db_investigation``.

    When a database is configured the investigation is first looked up to
    enforce existence and ownership.

    Raises:
        HTTPException 404: no matching investigation.
        HTTPException 403: the investigation belongs to another user.
        HTTPException 422: a ``ValueError`` was raised during the lookup
            (e.g. ``investigation_id`` is not a valid UUID).
    """
    if not os.getenv("DATABASE_URL"):
        # Nothing to verify against; defer entirely to the shared loader.
        return _get_db_investigation(investigation_id)

    try:
        from db.session import get_session
        from db.queries import get_investigation_by_id_or_run

        inv_uuid = uuid.UUID(investigation_id)
        with get_session() as db_session:
            record = get_investigation_by_id_or_run(db_session, inv_uuid)
            if record is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            owner, caller = str(record.user_id), str(current_user.user.id)
            if owner != caller:
                raise HTTPException(status_code=403, detail="Forbidden")
    except HTTPException:
        raise
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    return _get_db_investigation(investigation_id)
2159
+
2160
+
2161
@router.get("/{investigation_id}/entities")
async def get_investigation_entities(
    investigation_id: str,
    entity_type: Optional[str] = Query(default=None),
    min_confidence: float = Query(default=0.75, ge=0.0, le=1.0),
    limit: int = Query(default=20, ge=1, le=100),
    offset: int = Query(default=0, ge=0),
    defang: bool = Query(default=True),
    freshness_exclude: Optional[str] = Query(default=None),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """Return paginated entities for an investigation.

    Entities are those owned by the investigation plus those attached through
    ``InvestigationEntityLink``, optionally filtered by ``entity_type`` and
    ``min_confidence`` and ordered newest first. With ``defang`` enabled the
    value and context are passed through the defang helpers. Passing
    ``freshness_exclude="expired"`` drops entities tagged as expired.

    Note that ``total`` is the database count taken before the prohibited-value
    and freshness filters run, so a page may hold fewer items than expected.

    Raises:
        HTTPException 503: no database is configured.
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no matching investigation.
        HTTPException 403: the investigation belongs to another user.
        HTTPException 500: any other failure (logged with traceback).
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    # Validate the ID on its own. Previously a blanket ``except ValueError``
    # around the whole handler reported *any* ValueError - including a JSON
    # decode failure on stored data - as "Invalid investigation ID format".
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    try:
        from db.session import get_session
        from db.models import Entity, InvestigationEntityLink
        from db.queries import get_investigation_by_id_or_run
        from graph.builder import _make_node_id
        from utils.ioc_freshness import get_freshness_tag, get_freshness_display
        from utils.defang import defang_value, defang_text

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            if str(inv.user_id) != str(current_user.user.id):
                raise HTTPException(status_code=403, detail="Forbidden")

            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            query = session.query(Entity).filter(
                (Entity.investigation_id == inv.id)
                | Entity.id.in_(linked_ids_subq)
            )
            if entity_type:
                query = query.filter(Entity.entity_type == entity_type)
            if min_confidence > 0.0:
                query = query.filter(Entity.confidence >= min_confidence)

            total = query.count()
            entities = (
                query.order_by(Entity.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )

            # Safety net: filter prohibited entity values from the response.
            # Catches values that may have been stored before FIX 2 was deployed.
            from utils.content_safety import is_blocked_entity_value as _is_blocked_ev
            entities = [
                e for e in entities
                if not _is_blocked_ev(e.entity_type, e.value)
            ]

            out: list[dict] = []
            for e in entities:
                source_url = ""
                try:
                    if e.page:
                        source_url = e.page.url or ""
                except Exception:
                    # Best effort: a missing/unloadable page just means no URL.
                    pass

                freshness_tag = get_freshness_tag(
                    e.entity_type,
                    e.last_seen_at,
                    e.first_seen_at,
                )

                if freshness_exclude == "expired" and freshness_tag.value == "expired":
                    continue

                graph_node_id = _make_node_id(e.entity_type, e.value, source_url)

                display_value = e.value
                display_context = e.context
                if defang:
                    display_value = defang_value(e.entity_type, e.value or "")
                    if e.context:
                        display_context = defang_text(e.context)

                freshness_display = get_freshness_display(freshness_tag)

                # One corrupt row must not fail the whole page: fall back to
                # the same default used when the column is empty.
                try:
                    corroborating_sources = json.loads(
                        e.corroborating_sources or '["dark_web_scrape"]'
                    )
                except ValueError:
                    logger.warning(
                        "Entity %s has malformed corroborating_sources; using default",
                        e.id,
                    )
                    corroborating_sources = ["dark_web_scrape"]

                out.append(
                    {
                        "id": str(e.id),
                        "entity_type": e.entity_type,
                        "canonical_value": e.canonical_value,
                        "value": display_value,
                        "confidence": e.confidence,
                        "context_snippet": e.context_snippet,
                        "context": display_context,
                        "created_at": e.created_at.isoformat() if e.created_at else None,
                        "first_seen": e.first_seen.isoformat() if e.first_seen else None,
                        "last_seen": e.last_seen.isoformat() if e.last_seen else None,
                        "first_seen_at": e.first_seen_at.isoformat() if e.first_seen_at else None,
                        "last_seen_at": e.last_seen_at.isoformat() if e.last_seen_at else None,
                        "freshness_tag": freshness_tag.value,
                        "freshness_label": freshness_display["label"],
                        "freshness_color": freshness_display["color"],
                        "source_count": e.source_count or 1,
                        "corroborating_sources": corroborating_sources,
                        "cross_referenced": (e.source_count or 1) > 1,
                        "graph_node_id": graph_node_id,
                        "defanged": defang,
                    }
                )
            return {"items": out, "total": total, "skip": offset, "limit": limit}
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("get_investigation_entities failed: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        )
2286
+
2287
+
2288
@router.get("/{investigation_id}/entities/export/csv")
async def export_investigation_entities_csv(
    investigation_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """
    Export entities for an investigation as a CSV file download.

    Returns CSV with columns: entity_type, canonical_value, confidence,
    occurrence_count, first_seen_page, context_snippet

    ``occurrence_count`` is always written as 1. Entities whose value is
    rejected by ``is_blocked_entity_value`` are omitted, matching the JSON
    entities endpoint.

    Raises:
        HTTPException 503: no database is configured.
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no matching investigation.
        HTTPException 403: the investigation belongs to another user.
        HTTPException 500: any other failure (logged with traceback).
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    try:
        from db.session import get_session
        from db.models import Entity, InvestigationEntityLink
        from db.queries import get_investigation_by_id_or_run
        from utils.content_safety import is_blocked_entity_value as _is_blocked_ev

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            if str(inv.user_id) != str(current_user.user.id):
                raise HTTPException(status_code=403, detail="Forbidden")

            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            entities = (
                session.query(Entity)
                .filter(
                    (Entity.investigation_id == inv.id)
                    | Entity.id.in_(linked_ids_subq)
                )
                .all()
            )

            # Same safety net as the JSON entities endpoint: prohibited
            # values must not leave the system through the export either.
            entities = [
                e for e in entities
                if not _is_blocked_ev(e.entity_type, e.value)
            ]

            output = io.StringIO()
            writer = csv.writer(output)
            writer.writerow([
                "entity_type",
                "canonical_value",
                "confidence",
                "occurrence_count",
                "first_seen_page",
                "context_snippet",
            ])

            for e in entities:
                source_url = ""
                try:
                    if e.page:
                        source_url = e.page.url or ""
                except Exception:
                    # Best effort: a missing/unloadable page just means no URL.
                    pass
                # Flatten line breaks so each entity stays on one visual row.
                context = (
                    (e.context_snippet or "")
                    .replace("\n", " ")
                    .replace("\r", " ")
                    .strip()
                )
                # NOTE(review): cells are written verbatim; values that start
                # with "=", "+", "-" or "@" may be interpreted as formulas by
                # spreadsheet software - decide whether to neutralise them.
                writer.writerow([
                    e.entity_type,
                    e.canonical_value or e.value,
                    e.confidence,
                    1,
                    source_url,
                    context[:500],
                ])

            csv_content = output.getvalue()

            # Use the parsed UUID rather than the raw path parameter:
            # ``uuid.UUID`` accepts non-canonical spellings (braces, "urn:uuid:"
            # prefix), which should not be echoed into a response header.
            return Response(
                content=csv_content,
                media_type="text/csv",
                headers={
                    "Content-Disposition": f"attachment; filename=voidaccess_{inv_uuid}_entities.csv"
                },
            )
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("export_investigation_entities_csv failed: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        )
2383
+
2384
+
2385
# Default value and upper bound of the ``max_nodes`` query parameter accepted
# by the investigation graph endpoint defined below.
MAX_GRAPH_NODES = 500
2386
+
2387
+
2388
+ @router.get("/{investigation_id}/graph")
2389
+ async def get_investigation_graph(
2390
+ investigation_id: str,
2391
+ force_rebuild: bool = False,
2392
+ max_nodes: int = Query(default=MAX_GRAPH_NODES, ge=1, le=MAX_GRAPH_NODES),
2393
+ min_confidence: float = Query(default=0.75, ge=0.0, le=1.0),
2394
+ ) -> dict:
2395
+ """
2396
+ Return graph JSON for the investigation.
2397
+
2398
+ Requires investigation_id (now enforced - no more global graph).
2399
+ Uses persisted edges from the DB with O(1) lookup.
2400
+
2401
+ Use ?force_rebuild=true to recompute from scratch.
2402
+ Use ?max_nodes=N to limit node count (default 500, max 500).
2403
+ Use ?min_confidence=N to filter nodes/edges by confidence (default 0.75).
2404
+ Returns 400 if node count exceeds max_nodes - filter by entity type first.
2405
+ Returns 200 with {"graph_status": "skipped_overflow", ...} if graph was skipped due to size.
2406
+ """
2407
+ try:
2408
+ inv_uuid = uuid.UUID(investigation_id)
2409
+ except ValueError:
2410
+ raise HTTPException(status_code=422, detail="Invalid investigation ID format")
2411
+
2412
+ try:
2413
+ from db.session import get_session
2414
+ from db.queries import get_investigation_by_id_or_run
2415
+ from graph.builder import build_graph_from_db, build_graph_from_db_cached
2416
+ from graph.export import to_json
2417
+ from db.models import EntityRelationship, Entity
2418
+ from sqlalchemy import func
2419
+
2420
+ with get_session() as session:
2421
+ inv = get_investigation_by_id_or_run(session, inv_uuid)
2422
+ if inv is None:
2423
+ raise HTTPException(status_code=404, detail="Investigation not found")
2424
+ internal_id = inv.id
2425
+ graph_status = getattr(inv, "graph_status", "pending")
2426
+
2427
+ if graph_status == "skipped_overflow":
2428
+ entity_count = (
2429
+ session.query(func.count(Entity.id))
2430
+ .filter(Entity.investigation_id == internal_id)
2431
+ .scalar() or 0
2432
+ )
2433
+ return {
2434
+ "graph_status": "skipped_overflow",
2435
+ "message": "Graph too large to render. Use the entity list or download the CSV export instead.",
2436
+ "total_entities": entity_count,
2437
+ "nodes": [],
2438
+ "edges": [],
2439
+ }
2440
+
2441
+ persisted_edge_count = (
2442
+ session.query(func.count(EntityRelationship.id))
2443
+ .filter(EntityRelationship.investigation_id == internal_id)
2444
+ .scalar() or 0
2445
+ )
2446
+
2447
+ total_entity_count = (
2448
+ session.query(func.count(Entity.id))
2449
+ .filter(Entity.investigation_id == internal_id)
2450
+ .scalar() or 0
2451
+ )
2452
+
2453
+ if persisted_edge_count > 0 and not force_rebuild:
2454
+ logger.debug(
2455
+ "Graph cache hit: %s edges from DB for investigation %s",
2456
+ persisted_edge_count,
2457
+ investigation_id,
2458
+ )
2459
+ graph = build_graph_from_db_cached(investigation_id=internal_id)
2460
+ else:
2461
+ graph = build_graph_from_db(investigation_id=internal_id)
2462
+
2463
+ node_count = len(graph.nodes)
2464
+ if node_count > max_nodes:
2465
+ raise HTTPException(
2466
+ status_code=400,
2467
+ detail=(
2468
+ f"Graph has {node_count} nodes, exceeds max_nodes={max_nodes}. "
2469
+ "Filter by entity type first using the /entities endpoint "
2470
+ "with entity_type filter, then rebuild the graph."
2471
+ ),
2472
+ )
2473
+
2474
+ graph_data = to_json(graph)
2475
+
2476
+ nodes_to_keep = set()
2477
+ total_entities = len(graph_data["nodes"])
2478
+ for node in graph_data["nodes"]:
2479
+ node_confidence = node.get("confidence", 0.0)
2480
+ if node_confidence >= min_confidence:
2481
+ nodes_to_keep.add(node["id"])
2482
+
2483
+ filtered_nodes = [n for n in graph_data["nodes"] if n["id"] in nodes_to_keep]
2484
+ filtered_edges = [
2485
+ e for e in graph_data["edges"]
2486
+ if e["source"] in nodes_to_keep and e["target"] in nodes_to_keep
2487
+ ]
2488
+
2489
+ return {
2490
+ "graph_status": graph_status,
2491
+ "total_entities": total_entities,
2492
+ "filtered_entities": len(filtered_nodes),
2493
+ "min_confidence": min_confidence,
2494
+ "nodes": filtered_nodes,
2495
+ "edges": filtered_edges,
2496
+ }
2497
+ except HTTPException:
2498
+ raise
2499
+ except Exception as exc:
2500
+ logger.warning("get_investigation_graph failed: %s", exc)
2501
+ return {"nodes": [], "edges": []}
2502
+
2503
+
2504
def _build_investigation_profiles(investigation_id) -> int:
    """
    Build/update style profiles for the actor-like entities of an investigation.

    Entities of type THREAT_ACTOR, THREAT_ACTOR_HANDLE, MALWARE_FAMILY and
    RANSOMWARE_GROUP that have a canonical_value are selected from the given
    investigation. For each distinct (canonical_value, entity_type) pair the
    context snippets of at least 50 characters are collected from every
    Entity row with the same type and canonical value (this lookup is not
    restricted to the investigation), and a profile is built and saved when
    at least 2 snippets totalling at least 200 characters are available.

    A failure while building or saving one profile is logged at debug level
    and that entity is skipped.

    NOTE: This function creates its own session - never pass a session
    across thread boundaries.

    Returns the number of profiles that were built and saved.
    """
    from db.models import Entity
    from db.session import get_session
    from fingerprint.profiler import build_actor_profile, save_profile_to_db
    from sqlalchemy import func

    profiled_types = [
        "THREAT_ACTOR",
        "THREAT_ACTOR_HANDLE",
        "MALWARE_FAMILY",
        "RANSOMWARE_GROUP",
    ]
    min_snippet_chars = 50   # shorter snippets are ignored
    min_snippets = 2         # need at least this many usable snippets
    min_total_chars = 200    # ...and this much text overall

    saved = 0
    with get_session() as session:
        actors = (
            session.query(Entity.canonical_value, Entity.entity_type)
            .filter(
                Entity.investigation_id == investigation_id,
                Entity.entity_type.in_(profiled_types),
                Entity.canonical_value.isnot(None),
            )
            .distinct()
            .all()
        )

        for canonical_value, entity_type in actors:
            rows = (
                session.query(Entity.context_snippet)
                .filter(
                    Entity.entity_type == entity_type,
                    Entity.canonical_value == canonical_value,
                    Entity.context_snippet.isnot(None),
                    func.length(Entity.context_snippet) >= min_snippet_chars,
                )
                .all()
            )

            snippets = [row[0] for row in rows if row[0]]
            total_chars = sum(len(snippet) for snippet in snippets)
            if len(snippets) < min_snippets or total_chars < min_total_chars:
                continue

            try:
                profile = build_actor_profile(snippets)
                if profile:
                    save_profile_to_db(
                        profile=profile,
                        canonical_value=canonical_value,
                        entity_type=entity_type,
                        session=session,
                    )
                    saved += 1
            except Exception as exc:
                # Best effort: one bad profile must not abort the others.
                # NOTE(review): if save_profile_to_db can leave the session
                # in a failed state, later iterations may also fail - confirm.
                logger.debug("Profile build failed for %s: %s", canonical_value, exc)
                continue

        session.commit()

    return saved