voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli/commands/investigate.py — voidaccess investigate "<query>"
|
|
3
|
+
|
|
4
|
+
Orchestrates the existing pipeline modules (search, sources, scraper,
|
|
5
|
+
extractor, llm) from a fresh async entry point. Re-implements the
|
|
6
|
+
sequencing that api.routes.investigations._run_investigation_task did
|
|
7
|
+
under FastAPI — minus auth, SSE, rate limiting, Postgres.
|
|
8
|
+
|
|
9
|
+
Outputs
|
|
10
|
+
~/.voidaccess/results/<slug>-<YYYYMMDD-HHMMSS>.json
|
|
11
|
+
~/.voidaccess/results/<slug>-<YYYYMMDD-HHMMSS>.md
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import time
|
|
22
|
+
import uuid
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Optional
|
|
26
|
+
|
|
27
|
+
import typer
|
|
28
|
+
from rich.console import Console
|
|
29
|
+
|
|
30
|
+
console = Console()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Typer entry point
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run(
    query: str = typer.Argument(..., help="Investigation query (e.g. 'LockBit ransomware')"),
    output: Optional[Path] = typer.Option(None, "--output", help="Override output directory"),
    model: Optional[str] = typer.Option(None, "--model", help="Override LLM model"),
    no_tor: bool = typer.Option(False, "--no-tor", help="Clearnet-only mode (skip Tor)"),
    no_llm: bool = typer.Option(False, "--no-llm", help="Skip LLM (query refinement, filtering, summary)"),
    depth: str = typer.Option("normal", "--depth", help="shallow | normal | deep"),
    fmt: str = typer.Option("both", "--format", help="json | md | both"),
    quiet: bool = typer.Option(False, "--quiet", help="No live display; print final summary only"),
) -> None:
    """Run an investigation: query → search → scrape → extract → enrich → report."""
    # The docstring above is deliberately left untouched: Typer normally shows
    # a command's docstring as its --help text, so it is user-facing output.
    #
    # Flow: apply configured environment, validate the options (exit code 2 on
    # any validation failure), make sure the output directory exists, then
    # drive the async pipeline to completion.  Ctrl-C exits with code 130.
    from cli import config as cli_config

    cli_config.apply_env()
    if quiet:
        # Quiet mode: only errors from the root logger get through.
        logging.getLogger().setLevel(logging.ERROR)

    llm_configured = cli_config.is_configured()
    if not (llm_configured or no_llm):
        console.print("[yellow]No LLM configured.[/yellow] Run [bold]voidaccess configure[/bold] first, or pass --no-llm.")
        raise typer.Exit(code=2)

    allowed_depths = ("shallow", "normal", "deep")
    if depth not in allowed_depths:
        console.print(f"[red]Invalid depth:[/red] {depth}")
        raise typer.Exit(code=2)

    allowed_formats = ("json", "md", "both")
    if fmt not in allowed_formats:
        console.print(f"[red]Invalid format:[/red] {fmt}")
        raise typer.Exit(code=2)

    # An explicit --output wins over the configured default directory.
    if output:
        out_dir = Path(output).expanduser()
    else:
        out_dir = cli_config.get_output_dir()
    out_dir.mkdir(parents=True, exist_ok=True)

    pipeline_kwargs = {
        "query": query,
        "out_dir": out_dir,
        "model": model,
        "no_tor": no_tor,
        "no_llm": no_llm,
        "depth": depth,
        "fmt": fmt,
        "quiet": quiet,
    }
    try:
        asyncio.run(_run_investigation(**pipeline_kwargs))
    except KeyboardInterrupt:
        console.print("\n[yellow]Interrupted by user.[/yellow]")
        raise typer.Exit(code=130)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# Pipeline orchestrator
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Tuning knobs per --depth value.  In this module "top_n" caps how many search
# links are kept for scraping, "max_workers" is handed to scrape_multiple, and
# "extract_concurrency" is handed to extract_entities_from_pages.
DEPTH_PRESETS = {
    depth_name: {
        "top_n": top_n,
        "max_workers": max_workers,
        "extract_concurrency": extract_concurrency,
    }
    for depth_name, top_n, max_workers, extract_concurrency in (
        ("shallow", 10, 3, 3),
        ("normal", 20, 5, 5),
        ("deep", 40, 8, 6),
    )
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def _run_investigation(
    query: str,
    out_dir: Path,
    model: Optional[str],
    no_tor: bool,
    no_llm: bool,
    depth: str,
    fmt: str,
    quiet: bool,
) -> None:
    """Run the investigation pipeline end to end and write the result files.

    Steps, in order: refine the query, search (Tor engines plus clearnet side
    sources), filter links, scrape pages, extract entities, enrich, build
    co-occurrence edges, summarise, then persist and write the JSON/Markdown
    output.  The calls into search, scraping, extraction, enrichment, graph
    building and the LLM are each wrapped so that a failure is shown on the
    display and the run carries on with whatever data it already has.

    Args:
        query: The user's investigation query.
        out_dir: Directory the result files are written to (not created here).
        model: LLM model override; falls back to the configured model.
        no_tor: Skip Tor detection and the Tor search engines.
        no_llm: Skip every LLM-backed step.
        depth: Key into ``DEPTH_PRESETS``.
        fmt: ``"json"``, ``"md"`` or ``"both"``.
        quiet: Passed through to ``InvestigationDisplay``.

    Returns:
        None.  Returns early, after reporting an error on the display, when
        Tor is requested but ``detect_tor()`` reports no proxy.
    """
    from cli import config as cli_config
    from cli.adapters import sqlite as sqlite_adapter
    from cli.display import InvestigationDisplay
    from cli.tor_detect import detect_tor, tor_unavailable_message

    cfg = cli_config.load_config()
    preset = DEPTH_PRESETS[depth]
    display = InvestigationDisplay(quiet=quiet)
    display.start(query)

    # --- DB init ----------------------------------------------------------
    sqlite_adapter.init_db()

    # --- Tor preflight ----------------------------------------------------
    if not no_tor:
        status = detect_tor()
        if status.proxy_url:
            # Export the detected proxy through the environment for the
            # modules that run later in the pipeline.
            os.environ["TOR_PROXY_HOST"] = status.host or "127.0.0.1"
            os.environ["TOR_PROXY_PORT"] = str(status.port or 9050)
        else:
            display.error(tor_unavailable_message())
            return

    # --- LLM instance -----------------------------------------------------
    llm = None
    llm_init_error: Optional[str] = None
    chosen_model = model or cli_config.get_llm_model(cfg)
    if not no_llm:
        try:
            from voidaccess.llm import get_llm
            llm = get_llm(chosen_model)
        except Exception as exc:
            llm_init_error = f"LLM init failed: {exc}"
            display.update_step("Refining query", "fail", llm_init_error)
            llm = None
    # Shown on LLM-backed steps that are skipped; distinguishes a user opt-out
    # from an LLM that failed to initialise.
    llm_skip_reason = "--no-llm" if no_llm else "LLM unavailable"

    # --- Create investigation row -----------------------------------------
    investigation_id = sqlite_adapter.save_investigation(
        query=query,
        model_used=chosen_model if llm is not None else None,
        status="running",
    )
    inv_uuid = uuid.UUID(investigation_id)

    sources_used: dict[str, dict[str, Any]] = {}

    # --- Step 1 — refine query -------------------------------------------
    display.update_step("Refining query", "active")
    refined = query
    if llm is not None:
        try:
            from voidaccess.llm import refine_query
            refined = await asyncio.to_thread(refine_query, llm, query) or query
        except Exception as exc:
            display.update_step("Refining query", "fail", str(exc))
            refined = query
        else:
            display.update_step("Refining query", "ok", f"→ {refined!r}")
    elif llm_init_error is not None:
        # Keep the real failure visible instead of relabelling it "--no-llm".
        display.update_step("Refining query", "fail", llm_init_error)
    else:
        display.update_step("Refining query", "skip", "--no-llm")
    sqlite_adapter.update_investigation(investigation_id, {"refined_query": refined})

    # --- Step 2 — search fan-out -----------------------------------------
    display.update_step("Searching dark web", "active")
    search_links: list[dict] = []

    if not no_tor:
        try:
            from search import get_search_results_async
            display.update_substep("Searching dark web", "Tor engines", "active")
            tor_results = await asyncio.to_thread(get_search_results_async, refined)
            # Normalise before binding: an empty/None or non-iterable result
            # must never reach the unguarded len()/slicing further down.
            search_links = list(tor_results or [])
            display.update_substep("Searching dark web", "Tor engines", "ok")
            sources_used["tor_search"] = {"status": "ok", "count": len(search_links)}
        except Exception as exc:
            display.update_substep("Searching dark web", "Tor engines", "fail")
            sources_used["tor_search"] = {"status": "fail", "error": str(exc)}
    else:
        display.update_substep("Searching dark web", "Tor engines", "skip")
        sources_used["tor_search"] = {"status": "skipped"}

    # Parallel clearnet sources
    async def _safe(coro_factory, label, key):
        """Await one side source, recording its outcome; never raises Exception."""
        display.update_substep("Searching dark web", label, "active")
        try:
            res = await coro_factory()
            display.update_substep("Searching dark web", label, "ok")
            sources_used[key] = {"status": "ok", "count": len(res) if res else 0}
            return res or []
        except Exception as exc:
            display.update_substep("Searching dark web", label, "fail")
            sources_used[key] = {"status": "fail", "error": str(exc)}
            return []

    paste_pages, github_pages, gitlab_pages, rss_pages = await asyncio.gather(
        _safe(lambda: _scrape_pastes(refined), "Paste sites", "paste_sites"),
        _safe(lambda: _scrape_github(refined), "GitHub", "github"),
        _safe(lambda: _scrape_gitlab(refined), "GitLab", "gitlab"),
        _safe(lambda: _scrape_rss(refined), "RSS feeds", "rss"),
    )

    display.update_step("Searching dark web", "ok", f"{len(search_links)} links + side sources")

    # --- Step 3 — filter results ------------------------------------------
    display.update_step("Filtering results", "active")
    top_n = preset["top_n"]
    if llm is not None and search_links:
        try:
            from voidaccess.llm import filter_results
            filtered_links = await asyncio.to_thread(filter_results, llm, refined, search_links) or search_links
            filtered_links = filtered_links[:top_n]
            display.update_step("Filtering results", "ok", f"top {len(filtered_links)}")
        except Exception as exc:
            display.update_step("Filtering results", "fail", str(exc))
            filtered_links = search_links[:top_n]
    else:
        # No LLM ranking available (or nothing to rank): keep the first top_n.
        filtered_links = search_links[:top_n]
        display.update_step(
            "Filtering results",
            "skip" if llm is None else "ok",
            f"{len(filtered_links)} kept",
        )

    # --- Step 4 — scrape pages -------------------------------------------
    display.update_step("Scraping pages", "active")
    scraped_pages: list[dict] = []
    if filtered_links:
        try:
            from scraper.scrape import scrape_multiple

            # scrape_multiple exposes no per-URL callback here, so the display
            # just shows the first URL while the whole batch runs.
            display.update_current_url(filtered_links[0].get("link") or "")
            results = await scrape_multiple(filtered_links, max_workers=preset["max_workers"])
            display.update_current_url("")
            for url, text in results.items():
                if text:
                    scraped_pages.append({"url": url, "text": text, "source": "tor_search"})
            display.update_step("Scraping pages", "ok", f"{len(scraped_pages)} pages")
        except Exception as exc:
            display.update_step("Scraping pages", "fail", str(exc))
    else:
        display.update_step("Scraping pages", "skip", "no links")

    # Merge in clearnet pages (paste/github/gitlab/rss)
    for extra in (paste_pages, github_pages, gitlab_pages, rss_pages):
        for page in extra:
            url = page.get("url") or page.get("link")
            text = page.get("text") or page.get("content") or page.get("cleaned_text") or ""
            if not url or not text:
                continue
            scraped_pages.append({"url": url, "text": text, "source": page.get("source", "clearnet")})

    # Attach the stored page id to every page that already has a Page row.
    page_ids = await asyncio.to_thread(_lookup_page_ids, [p["url"] for p in scraped_pages])
    for page in scraped_pages:
        pid = page_ids.get(page["url"])
        if pid is not None:
            page["page_id"] = pid

    # --- Step 5 — extract entities ---------------------------------------
    display.update_step("Extracting entities", "active")
    extraction_results = []
    try:
        from extractor.pipeline import extract_entities_from_pages
        extraction_results = await extract_entities_from_pages(
            pages=scraped_pages,
            investigation_id=inv_uuid,
            llm=llm,
            run_llm_extraction=llm is not None,
            max_concurrent=preset["extract_concurrency"],
        )
        total_entities = sum(len(r.entity_ids) for r in extraction_results)
        display.update_step("Extracting entities", "ok", f"{total_entities} entities")
    except Exception as exc:
        display.update_step("Extracting entities", "fail", str(exc))

    # --- Step 6 — enrich intelligence ------------------------------------
    display.update_step("Enriching intelligence", "active")
    enrichment_pages: list[dict] = []
    try:
        from sources.enrichment import enrich_investigation as _enrich_inv
        otx_key = os.getenv("OTX_API_KEY", "") or ""
        # Build entity dicts for sources that take them
        entity_dicts = sqlite_adapter.get_entities(investigation_id)
        enrichment_pages = await _enrich_inv(refined, otx_api_key=otx_key, entities=entity_dicts)

        # IP reputation pass — re-uses sources.ip_reputation.  Its failure is
        # reported but does not fail the enrichment step as a whole.
        try:
            from sources.ip_reputation import enrich_ip_entities
            await enrich_ip_entities(extraction_results, investigation_id=inv_uuid)
        except Exception as ip_exc:
            console.print(f"[grey50]ip_reputation skipped: {ip_exc}[/grey50]")

        sources_used["enrichment"] = {"status": "ok", "count": len(enrichment_pages)}
        display.update_step("Enriching intelligence", "ok", f"{len(enrichment_pages)} pages added")
    except Exception as exc:
        sources_used["enrichment"] = {"status": "fail", "error": str(exc)}
        display.update_step("Enriching intelligence", "fail", str(exc))

    # Run extraction over enrichment pages too (never with the LLM).
    if enrichment_pages:
        try:
            from extractor.pipeline import extract_entities_from_pages as _extr2
            await _extr2(
                pages=enrichment_pages,
                investigation_id=inv_uuid,
                llm=None,
                run_llm_extraction=False,
                max_concurrent=preset["extract_concurrency"],
            )
        except Exception as exc:
            console.print(f"[grey50]Enrichment extraction failed: {exc}[/grey50]")

    # --- Step 7 — build graph (co-occurrence) ----------------------------
    display.update_step("Building graph", "active")
    try:
        edges_written = await asyncio.to_thread(_build_cooccurrence_edges, investigation_id)
        display.update_step("Building graph", "ok", f"{edges_written} edges")
    except Exception as exc:
        display.update_step("Building graph", "fail", str(exc))

    # --- Step 8 — summary -------------------------------------------------
    display.update_step("Generating summary", "active")
    summary_text = ""
    if llm is not None:
        try:
            from voidaccess.llm import generate_summary
            # Bound the prompt: first 10 pages, 5000 characters of each.
            corpus = "\n\n".join(p["text"][:5000] for p in scraped_pages[:10])
            if corpus:
                summary_text = await asyncio.to_thread(
                    generate_summary, llm, refined, corpus, "threat_intel"
                )
                display.update_step("Generating summary", "ok")
            else:
                # Nothing was scraped, so no summary was produced.
                display.update_step("Generating summary", "skip", "no page text")
        except Exception as exc:
            display.update_step("Generating summary", "fail", str(exc))
    else:
        display.update_step("Generating summary", "skip", llm_skip_reason)

    # --- Step 9 — finalize & write outputs --------------------------------
    display.update_step("Finalizing results", "active")
    final_entities = sqlite_adapter.get_entities(investigation_id)
    final_relationships = sqlite_adapter.get_relationships(investigation_id)
    sqlite_adapter.update_investigation(
        investigation_id,
        {
            "status": "completed",
            "summary": summary_text or None,
            "entity_count": len(final_entities),
            "page_count": len(scraped_pages),
            "current_step": 9,
            "current_step_label": "Completed",
        },
    )

    # One timestamp for both the file names and the payload so they agree.
    finished_at = datetime.now(timezone.utc)
    slug = _slugify(query)
    ts = finished_at.strftime("%Y%m%d-%H%M%S")
    json_path = out_dir / f"{slug}-{ts}.json"
    md_path = out_dir / f"{slug}-{ts}.md"

    payload = {
        "id": investigation_id,
        "query": query,
        "refined_query": refined,
        "model_used": chosen_model if llm is not None else None,
        "created_at": finished_at.isoformat(),
        "summary": summary_text,
        "sources_used": sources_used,
        "entities": final_entities,
        "relationships": final_relationships,
        "pages_scraped": [{"url": p["url"], "source": p.get("source", "")} for p in scraped_pages],
    }

    write_json = fmt in ("json", "both")
    write_md = fmt in ("md", "both")
    if write_json:
        json_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
    if write_md:
        md_path.write_text(_render_markdown(payload), encoding="utf-8")

    display.update_step("Finalizing results", "ok")

    # An IP counts as C2 when its corroborating sources mention "c2".
    c2_count = sum(
        1 for e in final_entities
        if e.get("entity_type") == "ip_address"
        and "c2" in (e.get("corroborating_sources") or "").lower()
    )

    display.complete(
        {
            "entity_count": len(final_entities),
            "page_count": len(scraped_pages),
            "c2_ips": c2_count,
            "sources_used": sum(1 for v in sources_used.values() if v.get("status") == "ok"),
            "report_path": str(md_path) if write_md else None,
            "data_path": str(json_path) if write_json else None,
        }
    )
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# ---------------------------------------------------------------------------
|
|
419
|
+
# Side-source helpers (each gracefully degrades if module missing/disabled)
|
|
420
|
+
# ---------------------------------------------------------------------------
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
async def _scrape_pastes(query: str) -> list[dict]:
|
|
424
|
+
try:
|
|
425
|
+
from sources.paste_scraper import scrape_paste_sites
|
|
426
|
+
except Exception:
|
|
427
|
+
return []
|
|
428
|
+
if os.getenv("PASTE_SCRAPING_ENABLED", "true").lower() != "true":
|
|
429
|
+
return []
|
|
430
|
+
try:
|
|
431
|
+
return await scrape_paste_sites(query) or []
|
|
432
|
+
except Exception:
|
|
433
|
+
return []
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
async def _scrape_github(query: str) -> list[dict]:
|
|
437
|
+
try:
|
|
438
|
+
from sources.github_scraper import scrape_github
|
|
439
|
+
except Exception:
|
|
440
|
+
return []
|
|
441
|
+
if os.getenv("GITHUB_SCRAPING_ENABLED", "true").lower() != "true":
|
|
442
|
+
return []
|
|
443
|
+
try:
|
|
444
|
+
return await scrape_github(query) or []
|
|
445
|
+
except Exception:
|
|
446
|
+
return []
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
async def _scrape_gitlab(query: str) -> list[dict]:
|
|
450
|
+
try:
|
|
451
|
+
from sources.gitlab_scraper import scrape_gitlab
|
|
452
|
+
except Exception:
|
|
453
|
+
return []
|
|
454
|
+
if os.getenv("GITLAB_SCRAPING_ENABLED", "true").lower() != "true":
|
|
455
|
+
return []
|
|
456
|
+
try:
|
|
457
|
+
return await scrape_gitlab(query) or []
|
|
458
|
+
except Exception:
|
|
459
|
+
return []
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
async def _scrape_rss(query: str) -> list[dict]:
|
|
463
|
+
try:
|
|
464
|
+
from sources.rss_scraper import scrape_rss_feeds
|
|
465
|
+
except Exception:
|
|
466
|
+
return []
|
|
467
|
+
if os.getenv("RSS_FEEDS_ENABLED", "true").lower() != "true":
|
|
468
|
+
return []
|
|
469
|
+
try:
|
|
470
|
+
return await scrape_rss_feeds(query) or []
|
|
471
|
+
except Exception:
|
|
472
|
+
return []
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
# ---------------------------------------------------------------------------
|
|
476
|
+
# DB helpers
|
|
477
|
+
# ---------------------------------------------------------------------------
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def _lookup_page_ids(urls: list[str]) -> dict[str, uuid.UUID]:
|
|
481
|
+
if not urls:
|
|
482
|
+
return {}
|
|
483
|
+
try:
|
|
484
|
+
from db.models import Page
|
|
485
|
+
from db.session import get_session
|
|
486
|
+
except Exception:
|
|
487
|
+
return {}
|
|
488
|
+
out: dict[str, uuid.UUID] = {}
|
|
489
|
+
with get_session() as session:
|
|
490
|
+
rows = session.query(Page).filter(Page.url.in_(urls)).all()
|
|
491
|
+
for r in rows:
|
|
492
|
+
out[r.url] = r.id
|
|
493
|
+
return out
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _build_cooccurrence_edges(investigation_id: str) -> int:
|
|
497
|
+
"""Generate CO_APPEARED_ON edges for entities sharing a page."""
|
|
498
|
+
try:
|
|
499
|
+
from db.models import Entity
|
|
500
|
+
from db.session import get_session
|
|
501
|
+
except Exception:
|
|
502
|
+
return 0
|
|
503
|
+
from cli.adapters.sqlite import save_relationships
|
|
504
|
+
|
|
505
|
+
edges: list[dict] = []
|
|
506
|
+
inv_uuid = uuid.UUID(investigation_id)
|
|
507
|
+
|
|
508
|
+
with get_session() as session:
|
|
509
|
+
rows = (
|
|
510
|
+
session.query(Entity.id, Entity.page_id)
|
|
511
|
+
.filter(Entity.investigation_id == inv_uuid)
|
|
512
|
+
.all()
|
|
513
|
+
)
|
|
514
|
+
by_page: dict[uuid.UUID, list[uuid.UUID]] = {}
|
|
515
|
+
for ent_id, page_id in rows:
|
|
516
|
+
if page_id is None:
|
|
517
|
+
continue
|
|
518
|
+
by_page.setdefault(page_id, []).append(ent_id)
|
|
519
|
+
|
|
520
|
+
for ents in by_page.values():
|
|
521
|
+
if len(ents) < 2:
|
|
522
|
+
continue
|
|
523
|
+
for i in range(len(ents)):
|
|
524
|
+
for j in range(i + 1, len(ents)):
|
|
525
|
+
edges.append(
|
|
526
|
+
{
|
|
527
|
+
"entity_a_id": str(ents[i]),
|
|
528
|
+
"entity_b_id": str(ents[j]),
|
|
529
|
+
"relationship_type": "CO_APPEARED_ON",
|
|
530
|
+
"confidence": 0.8,
|
|
531
|
+
}
|
|
532
|
+
)
|
|
533
|
+
return save_relationships(investigation_id, edges)
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
# ---------------------------------------------------------------------------
|
|
537
|
+
# Markdown rendering
|
|
538
|
+
# ---------------------------------------------------------------------------
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _slugify(s: str) -> str:
|
|
542
|
+
s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
|
|
543
|
+
return s[:50] or "investigation"
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def _render_markdown(payload: dict[str, Any]) -> str:
|
|
547
|
+
lines: list[str] = []
|
|
548
|
+
lines.append(f"# Investigation: {payload['query']}")
|
|
549
|
+
lines.append(
|
|
550
|
+
f"**Date:** {payload['created_at']} | **Model:** {payload.get('model_used') or '—'}"
|
|
551
|
+
)
|
|
552
|
+
if payload.get("refined_query") and payload["refined_query"] != payload["query"]:
|
|
553
|
+
lines.append(f"**Refined:** {payload['refined_query']}")
|
|
554
|
+
lines.append("")
|
|
555
|
+
lines.append("## Summary")
|
|
556
|
+
lines.append(payload.get("summary") or "_(no summary — LLM disabled or unavailable)_")
|
|
557
|
+
lines.append("")
|
|
558
|
+
|
|
559
|
+
entities = payload.get("entities", [])
|
|
560
|
+
by_type: dict[str, list[dict]] = {}
|
|
561
|
+
for e in entities:
|
|
562
|
+
by_type.setdefault(e["entity_type"], []).append(e)
|
|
563
|
+
|
|
564
|
+
c2_ips = [
|
|
565
|
+
e for e in entities
|
|
566
|
+
if e["entity_type"] == "ip_address"
|
|
567
|
+
and (e.get("corroborating_sources") or "").lower().find("c2") >= 0
|
|
568
|
+
]
|
|
569
|
+
lines.append("## Key findings")
|
|
570
|
+
lines.append(f"- {len(c2_ips)} confirmed C2 IP addresses")
|
|
571
|
+
lines.append(
|
|
572
|
+
f"- {len(by_type.get('ransomware_group', []))} ransomware group(s) identified"
|
|
573
|
+
)
|
|
574
|
+
lines.append(f"- {len(by_type.get('onion_url', []))} .onion URLs mapped")
|
|
575
|
+
lines.append(f"- {len(entities)} entities total")
|
|
576
|
+
lines.append("")
|
|
577
|
+
|
|
578
|
+
lines.append(f"## Entities ({len(entities)} total)")
|
|
579
|
+
for etype in sorted(by_type.keys()):
|
|
580
|
+
rows = by_type[etype]
|
|
581
|
+
lines.append(f"\n### {etype} ({len(rows)})")
|
|
582
|
+
lines.append("| Value | Confidence | Method | Tags |")
|
|
583
|
+
lines.append("|---|---|---|---|")
|
|
584
|
+
for r in rows[:50]:
|
|
585
|
+
tags = (r.get("corroborating_sources") or "").replace("|", "/")
|
|
586
|
+
val = (r.get("canonical_value") or r.get("value") or "").replace("|", "/")
|
|
587
|
+
conf = r.get("confidence")
|
|
588
|
+
lines.append(
|
|
589
|
+
f"| {val} | {conf:.2f} | {r.get('extraction_method') or ''} | {tags} |"
|
|
590
|
+
)
|
|
591
|
+
if len(rows) > 50:
|
|
592
|
+
lines.append(f"\n_…and {len(rows) - 50} more (see JSON)_")
|
|
593
|
+
lines.append("")
|
|
594
|
+
|
|
595
|
+
lines.append("## Sources used")
|
|
596
|
+
for name, info in payload.get("sources_used", {}).items():
|
|
597
|
+
glyph = "✓" if info.get("status") == "ok" else ("↷" if info.get("status") == "skipped" else "✗")
|
|
598
|
+
detail = f" ({info.get('count', 0)} results)" if "count" in info else ""
|
|
599
|
+
lines.append(f"- {glyph} {name}{detail}")
|
|
600
|
+
|
|
601
|
+
return "\n".join(lines) + "\n"
|
cli/commands/show.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli/commands/show.py — Launch the Textual entity browser.
|
|
3
|
+
|
|
4
|
+
Argument can be:
|
|
5
|
+
a path to a saved .json investigation file
|
|
6
|
+
an investigation id (UUID stored in SQLite)
|
|
7
|
+
omitted → interactive picker over recent runs
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
import typer
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.table import Table
|
|
19
|
+
|
|
20
|
+
console = Console()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run(
    target: Optional[str] = typer.Argument(
        None, help="Investigation id or path to a .json export"
    ),
) -> None:
    """Open the entity browser TUI."""
    # The docstring above is deliberately left untouched: Typer normally shows
    # a command's docstring as its --help text, so it is user-facing output.
    from cli import config as cli_config
    cli_config.apply_env()

    # No argument: let the user choose from the recent runs.
    if target is None:
        target = _pick_recent()
        if target is None:
            console.print("[yellow]No investigations found. Run `voidaccess investigate` first.[/yellow]")
            raise typer.Exit(code=1)

    data: Optional[dict] = None
    export_file = Path(target).expanduser()
    is_json_export = export_file.exists() and export_file.suffix == ".json"

    if not is_json_export:
        # Anything that is not an existing .json file is treated as an
        # investigation id and looked up in SQLite.
        from cli.adapters import sqlite as sqlite_adapter
        sqlite_adapter.init_db()
        investigation_id = sqlite_adapter.resolve_investigation_id(target) or target
        data = sqlite_adapter.investigation_to_export_dict(investigation_id)
        # NOTE(review): this check covers database lookups only — confirm a
        # loaded .json file does not need the same validation.
        if not (data and data.get("investigation")):
            console.print(f"[red]Unknown investigation:[/red] {target}")
            raise typer.Exit(code=1)
    else:
        data = json.loads(export_file.read_text(encoding="utf-8"))

    from cli.browser import EntityBrowserApp
    browser = EntityBrowserApp(data=data)
    browser.run()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _pick_recent() -> Optional[str]:
    """Print a table of recent investigations and ask which one to open.

    Returns the ``"id"`` of the chosen row, or ``None`` when there are no
    investigations, the answer is not an integer, or the number is out of
    range.
    """
    from cli.adapters import sqlite as sqlite_adapter
    sqlite_adapter.init_db()
    recent = sqlite_adapter.list_investigations(limit=20)
    if not recent:
        return None

    table = Table(title="Recent investigations")
    column_specs = (
        ("#", {"style": "cyan", "justify": "right"}),
        ("Query", {}),
        ("Status", {}),
        ("Entities", {"justify": "right"}),
        ("Date", {}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for number, row in enumerate(recent, start=1):
        query_text = (row["query"] or "")[:50]
        # First 19 characters of the stored timestamp.
        created = (row["created_at"] or "")[:19]
        table.add_row(
            str(number),
            query_text,
            row["status"] or "",
            str(row["entity_count"]),
            created,
        )
    console.print(table)

    from rich.prompt import Prompt
    answer = Prompt.ask("Pick #", default="1")
    try:
        position = int(answer) - 1
    except ValueError:
        return None
    # The table is numbered from 1; anything outside it selects nothing.
    if position < 0 or position >= len(recent):
        return None
    return recent[position]["id"]
|