voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,601 @@
1
+ """
2
+ cli/commands/investigate.py — voidaccess investigate "<query>"
3
+
4
+ Orchestrates the existing pipeline modules (search, sources, scraper,
5
+ extractor, llm) from a fresh async entry point. Re-implements the
6
+ sequencing that api.routes.investigations._run_investigation_task did
7
+ under FastAPI — minus auth, SSE, rate limiting, Postgres.
8
+
9
+ Outputs
10
+ ~/.voidaccess/results/<slug>-<YYYYMMDD-HHMMSS>.json
11
+ ~/.voidaccess/results/<slug>-<YYYYMMDD-HHMMSS>.md
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import json
18
+ import logging
19
+ import os
20
+ import re
21
+ import time
22
+ import uuid
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any, Optional
26
+
27
+ import typer
28
+ from rich.console import Console
29
+
30
+ console = Console()
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Typer entry point
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
def run(
    query: str = typer.Argument(..., help="Investigation query (e.g. 'LockBit ransomware')"),
    output: Optional[Path] = typer.Option(None, "--output", help="Override output directory"),
    model: Optional[str] = typer.Option(None, "--model", help="Override LLM model"),
    no_tor: bool = typer.Option(False, "--no-tor", help="Clearnet-only mode (skip Tor)"),
    no_llm: bool = typer.Option(False, "--no-llm", help="Skip LLM (query refinement, filtering, summary)"),
    depth: str = typer.Option("normal", "--depth", help="shallow | normal | deep"),
    fmt: str = typer.Option("both", "--format", help="json | md | both"),
    quiet: bool = typer.Option(False, "--quiet", help="No live display; print final summary only"),
) -> None:
    """Run an investigation: query → search → scrape → extract → enrich → report."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short; implementation notes live in comments instead.
    #
    # Validates the options, resolves the output directory, then drives the
    # async pipeline to completion. Exit code 2 signals a usage/config
    # problem, 130 a Ctrl-C.
    from cli import config as cli_config

    cli_config.apply_env()
    if quiet:
        # Silence everything below ERROR on the root logger.
        logging.getLogger().setLevel(logging.ERROR)

    # An LLM must be configured unless the caller opted out with --no-llm.
    if not (cli_config.is_configured() or no_llm):
        console.print("[yellow]No LLM configured.[/yellow] Run [bold]voidaccess configure[/bold] first, or pass --no-llm.")
        raise typer.Exit(code=2)

    # Choice-style options are validated by hand, depth first, then format.
    option_checks = (
        (depth, ("shallow", "normal", "deep"), f"[red]Invalid depth:[/red] {depth}"),
        (fmt, ("json", "md", "both"), f"[red]Invalid format:[/red] {fmt}"),
    )
    for chosen, allowed, complaint in option_checks:
        if chosen not in allowed:
            console.print(complaint)
            raise typer.Exit(code=2)

    if output:
        out_dir = Path(output).expanduser()
    else:
        out_dir = cli_config.get_output_dir()
    out_dir.mkdir(parents=True, exist_ok=True)

    pipeline = _run_investigation(
        query=query,
        out_dir=out_dir,
        model=model,
        no_tor=no_tor,
        no_llm=no_llm,
        depth=depth,
        fmt=fmt,
        quiet=quiet,
    )
    try:
        asyncio.run(pipeline)
    except KeyboardInterrupt:
        console.print("\n[yellow]Interrupted by user.[/yellow]")
        raise typer.Exit(code=130)
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Pipeline orchestrator
89
+ # ---------------------------------------------------------------------------
90
+
91
+
92
# Tuning knobs selected by --depth. Columns of the table below:
#   top_n               - how many filtered search links are kept for scraping
#   max_workers         - worker count handed to scrape_multiple
#   extract_concurrency - max_concurrent handed to extract_entities_from_pages
DEPTH_PRESETS = {
    name: {
        "top_n": top_n,
        "max_workers": max_workers,
        "extract_concurrency": extract_concurrency,
    }
    for name, top_n, max_workers, extract_concurrency in (
        ("shallow", 10, 3, 3),
        ("normal", 20, 5, 5),
        ("deep", 40, 8, 6),
    )
}
97
+
98
+
99
async def _run_investigation(
    query: str,
    out_dir: Path,
    model: Optional[str],
    no_tor: bool,
    no_llm: bool,
    depth: str,
    fmt: str,
    quiet: bool,
) -> None:
    """Run every pipeline stage for *query* and write the report files.

    Stages, in order: Tor preflight, optional LLM setup, query refinement,
    search fan-out (Tor engines plus paste/GitHub/GitLab/RSS side sources),
    result filtering, scraping, entity extraction, enrichment, co-occurrence
    graph building, summary generation, and finally updating the
    investigation row and writing ``<slug>-<timestamp>.json`` / ``.md`` into
    *out_dir*. A failing stage is reported through the display (and, for
    sources, recorded in ``sources_used``) without aborting the run.

    Args:
        query: Raw investigation query as typed by the user.
        out_dir: Directory that receives the report files.
        model: LLM model override; ``None`` uses the configured model.
        no_tor: Skip Tor detection and the Tor search engines.
        no_llm: Skip refinement, LLM filtering, LLM extraction and summary.
        depth: Key into ``DEPTH_PRESETS``.
        fmt: ``"json"``, ``"md"`` or ``"both"``.
        quiet: Forwarded to ``InvestigationDisplay``.

    Returns:
        ``None``. Returns early, before any investigation row is created,
        when Tor is required but not detected.
    """
    import inspect

    from cli import config as cli_config
    from cli.adapters import sqlite as sqlite_adapter
    from cli.display import InvestigationDisplay
    from cli.tor_detect import detect_tor, tor_unavailable_message

    cfg = cli_config.load_config()
    preset = DEPTH_PRESETS[depth]
    display = InvestigationDisplay(quiet=quiet)
    display.start(query)

    # --- DB init ----------------------------------------------------------
    sqlite_adapter.init_db()

    # --- Tor preflight ----------------------------------------------------
    if not no_tor:
        status = detect_tor()
        if status.proxy_url:
            # Downstream modules pick the proxy up from the environment.
            os.environ["TOR_PROXY_HOST"] = status.host or "127.0.0.1"
            os.environ["TOR_PROXY_PORT"] = str(status.port or 9050)
        else:
            display.error(tor_unavailable_message())
            return

    # --- LLM instance -----------------------------------------------------
    llm = None
    chosen_model = model or cli_config.get_llm_model(cfg)
    if not no_llm:
        try:
            from voidaccess.llm import get_llm
            llm = get_llm(chosen_model)
        except Exception as exc:
            display.update_step("Refining query", "fail", f"LLM init failed: {exc}")
            llm = None
    # Why an LLM-backed step was skipped: by request, or because init failed.
    llm_skip_reason = "--no-llm" if no_llm else "LLM unavailable"

    # --- Create investigation row -----------------------------------------
    investigation_id = sqlite_adapter.save_investigation(
        query=query,
        model_used=chosen_model if llm is not None else None,
        status="running",
    )
    inv_uuid = uuid.UUID(investigation_id)

    sources_used: dict[str, dict[str, Any]] = {}

    # --- Step 1 — refine query -------------------------------------------
    display.update_step("Refining query", "active")
    refined = query
    if llm is not None:
        try:
            from voidaccess.llm import refine_query
            refined = await asyncio.to_thread(refine_query, llm, query) or query
        except Exception as exc:
            display.update_step("Refining query", "fail", str(exc))
            refined = query
        else:
            display.update_step("Refining query", "ok", f"→ {refined!r}")
    else:
        display.update_step("Refining query", "skip", llm_skip_reason)
    sqlite_adapter.update_investigation(investigation_id, {"refined_query": refined})

    # --- Step 2 — search fan-out -----------------------------------------
    display.update_step("Searching dark web", "active")
    search_links: list[dict] = []

    if not no_tor:
        try:
            from search import get_search_results_async
            display.update_substep("Searching dark web", "Tor engines", "active")
            found = await asyncio.to_thread(get_search_results_async, refined)
            # If the search entry point is a coroutine function, calling it in
            # the worker thread only creates the coroutine; await it here so
            # we end up with the actual results either way.
            if inspect.isawaitable(found):
                found = await found
            search_links = found or []
            display.update_substep("Searching dark web", "Tor engines", "ok")
            sources_used["tor_search"] = {"status": "ok", "count": len(search_links)}
        except Exception as exc:
            display.update_substep("Searching dark web", "Tor engines", "fail")
            sources_used["tor_search"] = {"status": "fail", "error": str(exc)}
    else:
        display.update_substep("Searching dark web", "Tor engines", "skip")
        sources_used["tor_search"] = {"status": "skipped"}

    # Parallel clearnet sources
    async def _safe(coro_factory, label, key):
        """Run one side source, recording its outcome; never raises."""
        display.update_substep("Searching dark web", label, "active")
        try:
            res = await coro_factory()
            display.update_substep("Searching dark web", label, "ok")
            sources_used[key] = {"status": "ok", "count": len(res) if res else 0}
            return res or []
        except Exception as exc:
            display.update_substep("Searching dark web", label, "fail")
            sources_used[key] = {"status": "fail", "error": str(exc)}
            return []

    paste_pages, github_pages, gitlab_pages, rss_pages = await asyncio.gather(
        _safe(lambda: _scrape_pastes(refined), "Paste sites", "paste_sites"),
        _safe(lambda: _scrape_github(refined), "GitHub", "github"),
        _safe(lambda: _scrape_gitlab(refined), "GitLab", "gitlab"),
        _safe(lambda: _scrape_rss(refined), "RSS feeds", "rss"),
    )

    display.update_step("Searching dark web", "ok", f"{len(search_links)} links + side sources")

    # --- Step 3 — filter results ------------------------------------------
    display.update_step("Filtering results", "active")
    top_n = preset["top_n"]
    if llm is not None and search_links:
        try:
            from voidaccess.llm import filter_results
            filtered_links = await asyncio.to_thread(filter_results, llm, refined, search_links) or search_links
            filtered_links = filtered_links[:top_n]
            display.update_step("Filtering results", "ok", f"top {len(filtered_links)}")
        except Exception as exc:
            display.update_step("Filtering results", "fail", str(exc))
            filtered_links = search_links[:top_n]
    else:
        # No LLM ranking available (or nothing to rank): keep the first top_n.
        filtered_links = (search_links or [])[:top_n]
        display.update_step(
            "Filtering results",
            "skip" if llm is None else "ok",
            f"{len(filtered_links)} kept",
        )

    # --- Step 4 — scrape pages -------------------------------------------
    display.update_step("Scraping pages", "active")
    scraped_pages: list[dict] = []
    if filtered_links:
        try:
            from scraper.scrape import scrape_multiple

            # scrape_multiple does its own batching and exposes no per-URL
            # callback, so as a best effort only the first URL is shown
            # while the batch runs.
            display.update_current_url(filtered_links[0].get("link") or "")
            try:
                results = await scrape_multiple(filtered_links, max_workers=preset["max_workers"])
            finally:
                # Clear the indicator even when scraping raises.
                display.update_current_url("")
            for url, text in results.items():
                if text:
                    scraped_pages.append({"url": url, "text": text, "source": "tor_search"})
            display.update_step("Scraping pages", "ok", f"{len(scraped_pages)} pages")
        except Exception as exc:
            display.update_step("Scraping pages", "fail", str(exc))
    else:
        display.update_step("Scraping pages", "skip", "no links")

    # Merge in clearnet pages (paste/github/gitlab/rss)
    for extra in (paste_pages, github_pages, gitlab_pages, rss_pages):
        for page in extra:
            url = page.get("url") or page.get("link")
            text = page.get("text") or page.get("content") or page.get("cleaned_text") or ""
            if not url or not text:
                continue
            scraped_pages.append({"url": url, "text": text, "source": page.get("source", "clearnet")})

    # Resolve page_ids from DB (scrape_multiple persisted .onion pages).
    # Best effort like every other stage: a lookup failure must not abort the run.
    try:
        page_ids = await asyncio.to_thread(_lookup_page_ids, [p["url"] for p in scraped_pages])
    except Exception as exc:
        console.print(f"[grey50]page id lookup skipped: {exc}[/grey50]")
        page_ids = {}
    for page in scraped_pages:
        pid = page_ids.get(page["url"])
        if pid is not None:
            page["page_id"] = pid

    # --- Step 5 — extract entities ---------------------------------------
    display.update_step("Extracting entities", "active")
    extraction_results = []
    try:
        from extractor.pipeline import extract_entities_from_pages
        extraction_results = await extract_entities_from_pages(
            pages=scraped_pages,
            investigation_id=inv_uuid,
            llm=llm,
            run_llm_extraction=llm is not None,
            max_concurrent=preset["extract_concurrency"],
        )
        total_entities = sum(len(r.entity_ids) for r in extraction_results)
        display.update_step("Extracting entities", "ok", f"{total_entities} entities")
    except Exception as exc:
        display.update_step("Extracting entities", "fail", str(exc))

    # --- Step 6 — enrich intelligence ------------------------------------
    display.update_step("Enriching intelligence", "active")
    enrichment_pages: list[dict] = []
    try:
        from sources.enrichment import enrich_investigation as _enrich_inv
        otx_key = os.getenv("OTX_API_KEY", "") or ""
        # Build entity dicts for sources that take them
        entity_dicts = sqlite_adapter.get_entities(investigation_id)
        enrichment_pages = await _enrich_inv(refined, otx_api_key=otx_key, entities=entity_dicts)

        # IP reputation pass — re-uses sources.ip_reputation
        try:
            from sources.ip_reputation import enrich_ip_entities
            await enrich_ip_entities(extraction_results, investigation_id=inv_uuid)
        except Exception as ip_exc:
            console.print(f"[grey50]ip_reputation skipped: {ip_exc}[/grey50]")

        sources_used["enrichment"] = {"status": "ok", "count": len(enrichment_pages)}
        display.update_step("Enriching intelligence", "ok", f"{len(enrichment_pages)} pages added")
    except Exception as exc:
        sources_used["enrichment"] = {"status": "fail", "error": str(exc)}
        display.update_step("Enriching intelligence", "fail", str(exc))

    # Run extraction over enrichment pages too (regex/NER only, no LLM pass)
    if enrichment_pages:
        try:
            from extractor.pipeline import extract_entities_from_pages as _extr2
            await _extr2(
                pages=enrichment_pages,
                investigation_id=inv_uuid,
                llm=None,
                run_llm_extraction=False,
                max_concurrent=preset["extract_concurrency"],
            )
        except Exception as exc:
            console.print(f"[grey50]Enrichment extraction failed: {exc}[/grey50]")

    # --- Step 7 — build graph (co-occurrence) ----------------------------
    display.update_step("Building graph", "active")
    try:
        edges_written = await asyncio.to_thread(_build_cooccurrence_edges, investigation_id)
        display.update_step("Building graph", "ok", f"{edges_written} edges")
    except Exception as exc:
        display.update_step("Building graph", "fail", str(exc))

    # --- Step 8 — summary -------------------------------------------------
    display.update_step("Generating summary", "active")
    summary_text = ""
    if llm is not None:
        try:
            from voidaccess.llm import generate_summary
            # Cap the prompt: first 10 pages, 5000 characters each.
            corpus = "\n\n".join(p["text"][:5000] for p in scraped_pages[:10])
            if corpus:
                summary_text = await asyncio.to_thread(
                    generate_summary, llm, refined, corpus, "threat_intel"
                )
            display.update_step("Generating summary", "ok")
        except Exception as exc:
            display.update_step("Generating summary", "fail", str(exc))
    else:
        display.update_step("Generating summary", "skip", llm_skip_reason)

    # --- Step 9 — finalize & write outputs --------------------------------
    display.update_step("Finalizing results", "active")
    final_entities = sqlite_adapter.get_entities(investigation_id)
    final_relationships = sqlite_adapter.get_relationships(investigation_id)
    sqlite_adapter.update_investigation(
        investigation_id,
        {
            "status": "completed",
            "summary": summary_text or None,
            "entity_count": len(final_entities),
            "page_count": len(scraped_pages),
            "current_step": 9,
            "current_step_label": "Completed",
        },
    )

    # One clock reading so the file names and created_at agree.
    finished_at = datetime.now(timezone.utc)
    slug = _slugify(query)
    ts = finished_at.strftime("%Y%m%d-%H%M%S")
    json_path = out_dir / f"{slug}-{ts}.json"
    md_path = out_dir / f"{slug}-{ts}.md"

    payload = {
        "id": investigation_id,
        "query": query,
        "refined_query": refined,
        "model_used": chosen_model if llm is not None else None,
        "created_at": finished_at.isoformat(),
        "summary": summary_text,
        "sources_used": sources_used,
        "entities": final_entities,
        "relationships": final_relationships,
        "pages_scraped": [{"url": p["url"], "source": p.get("source", "")} for p in scraped_pages],
    }

    if fmt in ("json", "both"):
        json_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
    if fmt in ("md", "both"):
        md_path.write_text(_render_markdown(payload), encoding="utf-8")

    display.update_step("Finalizing results", "ok")

    # An IP counts as C2 when its corroborating sources mention "c2".
    c2_count = sum(
        1 for e in final_entities
        if e["entity_type"] == "ip_address"
        and "c2" in (e.get("corroborating_sources") or "").lower()
    )

    display.complete(
        {
            "entity_count": len(final_entities),
            "page_count": len(scraped_pages),
            "c2_ips": c2_count,
            "sources_used": sum(1 for v in sources_used.values() if v.get("status") == "ok"),
            "report_path": str(md_path) if fmt in ("md", "both") else None,
            "data_path": str(json_path) if fmt in ("json", "both") else None,
        }
    )
416
+
417
+
418
+ # ---------------------------------------------------------------------------
419
+ # Side-source helpers (each gracefully degrades if module missing/disabled)
420
+ # ---------------------------------------------------------------------------
421
+
422
+
423
+ async def _scrape_pastes(query: str) -> list[dict]:
424
+ try:
425
+ from sources.paste_scraper import scrape_paste_sites
426
+ except Exception:
427
+ return []
428
+ if os.getenv("PASTE_SCRAPING_ENABLED", "true").lower() != "true":
429
+ return []
430
+ try:
431
+ return await scrape_paste_sites(query) or []
432
+ except Exception:
433
+ return []
434
+
435
+
436
+ async def _scrape_github(query: str) -> list[dict]:
437
+ try:
438
+ from sources.github_scraper import scrape_github
439
+ except Exception:
440
+ return []
441
+ if os.getenv("GITHUB_SCRAPING_ENABLED", "true").lower() != "true":
442
+ return []
443
+ try:
444
+ return await scrape_github(query) or []
445
+ except Exception:
446
+ return []
447
+
448
+
449
+ async def _scrape_gitlab(query: str) -> list[dict]:
450
+ try:
451
+ from sources.gitlab_scraper import scrape_gitlab
452
+ except Exception:
453
+ return []
454
+ if os.getenv("GITLAB_SCRAPING_ENABLED", "true").lower() != "true":
455
+ return []
456
+ try:
457
+ return await scrape_gitlab(query) or []
458
+ except Exception:
459
+ return []
460
+
461
+
462
+ async def _scrape_rss(query: str) -> list[dict]:
463
+ try:
464
+ from sources.rss_scraper import scrape_rss_feeds
465
+ except Exception:
466
+ return []
467
+ if os.getenv("RSS_FEEDS_ENABLED", "true").lower() != "true":
468
+ return []
469
+ try:
470
+ return await scrape_rss_feeds(query) or []
471
+ except Exception:
472
+ return []
473
+
474
+
475
+ # ---------------------------------------------------------------------------
476
+ # DB helpers
477
+ # ---------------------------------------------------------------------------
478
+
479
+
480
+ def _lookup_page_ids(urls: list[str]) -> dict[str, uuid.UUID]:
481
+ if not urls:
482
+ return {}
483
+ try:
484
+ from db.models import Page
485
+ from db.session import get_session
486
+ except Exception:
487
+ return {}
488
+ out: dict[str, uuid.UUID] = {}
489
+ with get_session() as session:
490
+ rows = session.query(Page).filter(Page.url.in_(urls)).all()
491
+ for r in rows:
492
+ out[r.url] = r.id
493
+ return out
494
+
495
+
496
+ def _build_cooccurrence_edges(investigation_id: str) -> int:
497
+ """Generate CO_APPEARED_ON edges for entities sharing a page."""
498
+ try:
499
+ from db.models import Entity
500
+ from db.session import get_session
501
+ except Exception:
502
+ return 0
503
+ from cli.adapters.sqlite import save_relationships
504
+
505
+ edges: list[dict] = []
506
+ inv_uuid = uuid.UUID(investigation_id)
507
+
508
+ with get_session() as session:
509
+ rows = (
510
+ session.query(Entity.id, Entity.page_id)
511
+ .filter(Entity.investigation_id == inv_uuid)
512
+ .all()
513
+ )
514
+ by_page: dict[uuid.UUID, list[uuid.UUID]] = {}
515
+ for ent_id, page_id in rows:
516
+ if page_id is None:
517
+ continue
518
+ by_page.setdefault(page_id, []).append(ent_id)
519
+
520
+ for ents in by_page.values():
521
+ if len(ents) < 2:
522
+ continue
523
+ for i in range(len(ents)):
524
+ for j in range(i + 1, len(ents)):
525
+ edges.append(
526
+ {
527
+ "entity_a_id": str(ents[i]),
528
+ "entity_b_id": str(ents[j]),
529
+ "relationship_type": "CO_APPEARED_ON",
530
+ "confidence": 0.8,
531
+ }
532
+ )
533
+ return save_relationships(investigation_id, edges)
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # Markdown rendering
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
+ def _slugify(s: str) -> str:
542
+ s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
543
+ return s[:50] or "investigation"
544
+
545
+
546
+ def _render_markdown(payload: dict[str, Any]) -> str:
547
+ lines: list[str] = []
548
+ lines.append(f"# Investigation: {payload['query']}")
549
+ lines.append(
550
+ f"**Date:** {payload['created_at']} | **Model:** {payload.get('model_used') or '—'}"
551
+ )
552
+ if payload.get("refined_query") and payload["refined_query"] != payload["query"]:
553
+ lines.append(f"**Refined:** {payload['refined_query']}")
554
+ lines.append("")
555
+ lines.append("## Summary")
556
+ lines.append(payload.get("summary") or "_(no summary — LLM disabled or unavailable)_")
557
+ lines.append("")
558
+
559
+ entities = payload.get("entities", [])
560
+ by_type: dict[str, list[dict]] = {}
561
+ for e in entities:
562
+ by_type.setdefault(e["entity_type"], []).append(e)
563
+
564
+ c2_ips = [
565
+ e for e in entities
566
+ if e["entity_type"] == "ip_address"
567
+ and (e.get("corroborating_sources") or "").lower().find("c2") >= 0
568
+ ]
569
+ lines.append("## Key findings")
570
+ lines.append(f"- {len(c2_ips)} confirmed C2 IP addresses")
571
+ lines.append(
572
+ f"- {len(by_type.get('ransomware_group', []))} ransomware group(s) identified"
573
+ )
574
+ lines.append(f"- {len(by_type.get('onion_url', []))} .onion URLs mapped")
575
+ lines.append(f"- {len(entities)} entities total")
576
+ lines.append("")
577
+
578
+ lines.append(f"## Entities ({len(entities)} total)")
579
+ for etype in sorted(by_type.keys()):
580
+ rows = by_type[etype]
581
+ lines.append(f"\n### {etype} ({len(rows)})")
582
+ lines.append("| Value | Confidence | Method | Tags |")
583
+ lines.append("|---|---|---|---|")
584
+ for r in rows[:50]:
585
+ tags = (r.get("corroborating_sources") or "").replace("|", "/")
586
+ val = (r.get("canonical_value") or r.get("value") or "").replace("|", "/")
587
+ conf = r.get("confidence")
588
+ lines.append(
589
+ f"| {val} | {conf:.2f} | {r.get('extraction_method') or ''} | {tags} |"
590
+ )
591
+ if len(rows) > 50:
592
+ lines.append(f"\n_…and {len(rows) - 50} more (see JSON)_")
593
+ lines.append("")
594
+
595
+ lines.append("## Sources used")
596
+ for name, info in payload.get("sources_used", {}).items():
597
+ glyph = "✓" if info.get("status") == "ok" else ("↷" if info.get("status") == "skipped" else "✗")
598
+ detail = f" ({info.get('count', 0)} results)" if "count" in info else ""
599
+ lines.append(f"- {glyph} {name}{detail}")
600
+
601
+ return "\n".join(lines) + "\n"
cli/commands/show.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ cli/commands/show.py — Launch the Textual entity browser.
3
+
4
+ Argument can be:
5
+ a path to a saved .json investigation file
6
+ an investigation id (UUID stored in SQLite)
7
+ omitted → interactive picker over recent runs
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ import typer
17
+ from rich.console import Console
18
+ from rich.table import Table
19
+
20
+ console = Console()
21
+
22
+
23
def run(
    target: Optional[str] = typer.Argument(
        None, help="Investigation id or path to a .json export"
    ),
) -> None:
    """Open the entity browser TUI."""
    # NOTE: the docstring above doubles as the command's help text.
    #
    # *target* is resolved in this order: no target -> interactive picker;
    # an existing path ending in ".json" -> load that export; anything else
    # -> treat it as an investigation id and load it from SQLite.
    # Exits with code 1 when nothing usable can be loaded.
    from cli import config as cli_config
    cli_config.apply_env()

    data: Optional[dict] = None

    if target is None:
        target = _pick_recent()
        if target is None:
            console.print("[yellow]No investigations found. Run `voidaccess investigate` first.[/yellow]")
            raise typer.Exit(code=1)

    candidate_path = Path(target).expanduser()
    if candidate_path.exists() and candidate_path.suffix == ".json":
        # A hand-edited, truncated or unreadable export should produce a
        # clean CLI error, not a traceback.
        try:
            data = json.loads(candidate_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            console.print(f"[red]Could not read export:[/red] {candidate_path} ({exc})")
            raise typer.Exit(code=1) from exc
        if not isinstance(data, dict):
            console.print(f"[red]Not an investigation export:[/red] {candidate_path}")
            raise typer.Exit(code=1)
    else:
        from cli.adapters import sqlite as sqlite_adapter
        sqlite_adapter.init_db()
        # Fall back to the raw target when it cannot be resolved to a
        # stored id; the lookup below then reports it as unknown.
        resolved = sqlite_adapter.resolve_investigation_id(target) or target
        data = sqlite_adapter.investigation_to_export_dict(resolved)
        if not data or not data.get("investigation"):
            console.print(f"[red]Unknown investigation:[/red] {target}")
            raise typer.Exit(code=1)

    from cli.browser import EntityBrowserApp
    app = EntityBrowserApp(data=data)
    app.run()
55
+
56
+
57
def _pick_recent() -> Optional[str]:
    """Show the 20 most recent investigations and let the user pick one.

    Returns the ``id`` of the chosen row, or ``None`` when there are no
    investigations or the answer is not a listed number.
    """
    from cli.adapters import sqlite as sqlite_adapter

    sqlite_adapter.init_db()
    recent = sqlite_adapter.list_investigations(limit=20)
    if not recent:
        return None

    table = Table(title="Recent investigations")
    table.add_column("#", style="cyan", justify="right")
    table.add_column("Query")
    table.add_column("Status")
    table.add_column("Entities", justify="right")
    table.add_column("Date")
    for number, row in enumerate(recent, start=1):
        # Clip long queries and cut the timestamp to its first 19 characters.
        query_cell = (row["query"] or "")[:50]
        status_cell = row["status"] or ""
        entities_cell = str(row["entity_count"])
        date_cell = (row["created_at"] or "")[:19]
        table.add_row(str(number), query_cell, status_cell, entities_cell, date_cell)
    console.print(table)

    from rich.prompt import Prompt
    answer = Prompt.ask("Pick #", default="1")
    try:
        # The table is numbered from 1; convert to a list position.
        position = int(answer) - 1
        chosen = recent[position]["id"] if 0 <= position < len(recent) else None
    except ValueError:
        chosen = None
    return chosen