voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,2567 @@
|
|
|
1
|
+
"""
|
|
2
|
+
api/routes/investigations.py — Investigation management endpoints.
|
|
3
|
+
|
|
4
|
+
POST /investigations — trigger an investigation (background task)
|
|
5
|
+
GET /investigations — list recent investigations
|
|
6
|
+
GET /investigations/{id} — get single investigation
|
|
7
|
+
GET /investigations/{id}/entities — list entities for investigation
|
|
8
|
+
GET /investigations/{id}/graph — graph JSON for investigation
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import concurrent.futures
|
|
15
|
+
import csv
|
|
16
|
+
import hashlib
|
|
17
|
+
import io
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import uuid
|
|
21
|
+
from typing import Any, Optional
|
|
22
|
+
|
|
23
|
+
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request, Response
|
|
24
|
+
from fastapi.responses import StreamingResponse
|
|
25
|
+
from pydantic import BaseModel, Field, validator
|
|
26
|
+
from sqlalchemy import select as sa_select
|
|
27
|
+
from crawler import crawl
|
|
28
|
+
from sources.seeds import get_seeds
|
|
29
|
+
from sources.seed_manager import get_seed_manager
|
|
30
|
+
from sources.paste_scraper import scrape_paste_sites
|
|
31
|
+
from sources.github_scraper import scrape_github
|
|
32
|
+
from sources.gitlab_scraper import scrape_gitlab
|
|
33
|
+
from sources.rss_scraper import scrape_rss_feeds
|
|
34
|
+
|
|
35
|
+
# Paste-site hostnames used for counting paste-sourced pages in responses.
|
|
36
|
+
# Matched in this order when classifying a page URL as a paste-site page.
PASTE_SITE_HOSTNAMES = ("pastebin.com", "rentry.co", "dpaste.org", "paste.ee")
|
|
42
|
+
|
|
43
|
+
# Opt-out toggle for the parallel paste site scraper (read at task time so
|
|
44
|
+
# tests can monkey-patch the env var without re-importing this module).
|
|
45
|
+
def _paste_scraping_enabled() -> bool:
    """Report whether the paste-site scraper is switched on.

    ``PASTE_SCRAPING_ENABLED`` is read on every call (default ``"true"``);
    only the case-insensitive value ``"true"`` enables the scraper.
    """
    raw_flag = os.environ.get("PASTE_SCRAPING_ENABLED", "true")
    return raw_flag.lower() == "true"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _github_scraping_enabled() -> bool:
    """Report whether the GitHub scraper is switched on.

    ``GITHUB_SCRAPING_ENABLED`` is read on every call (default ``"true"``);
    only the case-insensitive value ``"true"`` enables the scraper.
    """
    raw_flag = os.environ.get("GITHUB_SCRAPING_ENABLED", "true")
    return raw_flag.lower() == "true"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _gitlab_scraping_enabled() -> bool:
    """Report whether the GitLab scraper is switched on.

    ``GITLAB_SCRAPING_ENABLED`` is read on every call (default ``"true"``);
    only the case-insensitive value ``"true"`` enables the scraper.
    """
    raw_flag = os.environ.get("GITLAB_SCRAPING_ENABLED", "true")
    return raw_flag.lower() == "true"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _rss_scraping_enabled() -> bool:
    """Report whether the RSS feed scraper is switched on.

    ``RSS_FEEDS_ENABLED`` is read on every call (default ``"true"``); only
    the case-insensitive value ``"true"`` enables the scraper.
    """
    raw_flag = os.environ.get("RSS_FEEDS_ENABLED", "true")
    return raw_flag.lower() == "true"
|
|
59
|
+
from api.auth import CurrentUser, get_current_user, require_password_not_reset_pending
|
|
60
|
+
import json
|
|
61
|
+
|
|
62
|
+
logger = logging.getLogger(__name__)
|
|
63
|
+
logger.setLevel(logging.DEBUG)
|
|
64
|
+
router = APIRouter()
|
|
65
|
+
|
|
66
|
+
# In-process cache: investigation_id (str) → infrastructure clusters list.
|
|
67
|
+
# Populated during the pipeline run; read by the GET detail endpoint.
|
|
68
|
+
_infra_cluster_cache: dict[str, list] = {}
|
|
69
|
+
|
|
70
|
+
# In-process cache: investigation_id (str) → sources_used status dict.
|
|
71
|
+
# Populated during the pipeline run; read by the GET detail endpoint.
|
|
72
|
+
_sources_used_cache: dict[str, dict] = {}
|
|
73
|
+
|
|
74
|
+
# Cooperative cancellation flags: investigation_id (str) → True when cancel requested.
|
|
75
|
+
# Checked at pipeline checkpoints; cleared once the pipeline honours the request.
|
|
76
|
+
# NOTE: the flags are process-local (each worker has its own dict). A cancel
# request is only honoured when it is handled by the same process that runs the
# pipeline task, which is true for single-worker FastAPI/uvicorn. In a
# multi-process deployment a request that lands on another worker sets a flag
# the running pipeline never sees.
|
|
79
|
+
_cancel_flags: dict[str, bool] = {}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _is_cancelled(investigation_id: str) -> bool:
    """Return the cancel flag stored for *investigation_id*, or False if none is set."""
    try:
        return _cancel_flags[investigation_id]
    except KeyError:
        return False
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _set_cancelled(investigation_id: str) -> None:
    """Record a cancellation request for *investigation_id* in the in-process flag map."""
    _cancel_flags.update({investigation_id: True})
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _clear_cancel_flag(investigation_id: str) -> None:
    """Drop the cancel flag for *investigation_id*; a missing flag is not an error."""
    try:
        del _cancel_flags[investigation_id]
    except KeyError:
        pass
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def _check_cancelled(inv_uuid: uuid.UUID, investigation_id: str) -> bool:
    """
    Honour a pending cancellation request, if there is one.

    When a cancel flag is set for *investigation_id* the flag is cleared, the
    investigation row identified by *inv_uuid* is updated to status
    ``"cancelled"`` and True is returned. Otherwise nothing is touched and
    False is returned.
    """
    if _is_cancelled(investigation_id):
        # Clear first so the request is consumed exactly once.
        _clear_cancel_flag(investigation_id)
        logger.info("[%s] Cancellation flag detected — stopping pipeline cleanly", inv_uuid)

        from db.models import Investigation
        from db.session import get_session

        with get_session() as db:
            target_rows = db.query(Investigation).filter_by(id=inv_uuid)
            target_rows.update({"status": "cancelled"})
            db.commit()
        return True

    return False
|
|
106
|
+
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
# Rate limiting (shared key_func with api/main.py; enforcement via app.state.limiter)
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
# Evaluated once at import time: only the case-insensitive value "true" disables limiting.
_DISABLE_RATE_LIMIT = os.environ.get("DISABLE_RATE_LIMIT", "false").lower() == "true"

# Stays None when rate limiting is disabled or slowapi cannot be imported.
_limiter: "Limiter | None" = None
if not _DISABLE_RATE_LIMIT:
    try:
        from slowapi import Limiter
        from slowapi.util import get_remote_address

        _limiter = Limiter(key_func=get_remote_address)
    except ImportError:
        _limiter = None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _rate_limit(limit_string: str):
    """
    Build a decorator that applies *limit_string* through the module limiter.

    When no limiter is configured (``_limiter`` is None) the returned
    decorator hands the wrapped function back unchanged.
    """
    if _limiter is not None:
        return _limiter.limit(limit_string)

    def _passthrough(f):
        return f

    return _passthrough
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Default progress label for each pipeline step, keyed by 1-based step number.
STEP_LABELS = dict(
    enumerate(
        (
            "Refining query",
            "Searching dark web",
            "Filtering results",
            "Scraping pages",
            "Extracting entities",
            "Enriching intelligence",
            "Building graph",
            "Generating summary",
            "Finalizing results",
        ),
        start=1,
    )
)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# Request / response schemas
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class InvestigationRequest(BaseModel):
    # Request body for triggering an investigation.
    # (Kept as comments: a class docstring would be picked up as the model's
    # schema description.)

    # Free-text search query, 3-500 characters before trimming.
    query: str = Field(..., min_length=3, max_length=500, description="Search query (3-500 chars)")
    # Identifier of the LLM model to run the pipeline with.
    model: str = Field(default="openrouter/deepseek/deepseek-chat", description="LLM model ID to use")
    # Crawler opt-in flag; off by default.
    run_crawler: bool = False

    @validator("query")
    def query_not_whitespace(cls, v: str) -> str:
        # Trim once, reject blank or too-short results, return the trimmed text.
        trimmed = v.strip()
        if not trimmed:
            raise ValueError("Query cannot be empty or whitespace")
        if len(trimmed) < 3:
            raise ValueError("Query must be at least 3 characters")
        return trimmed
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# Helper: load investigation from DB
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _count_paste_pages_for_investigation(session, internal_id) -> tuple[int, list[str]]:
    """
    Count distinct paste-site pages observed for *internal_id* and return the
    list of paste sources that contributed at least one page.

    The distinct ``Page.url`` values joined to entities of the investigation
    are fetched, and each URL is classified by its *hostname*: a URL counts
    when the hostname equals an entry of ``PASTE_SITE_HOSTNAMES`` or is a
    subdomain of one. Matching the hostname rather than the whole URL text
    keeps look-alike domains and paste hostnames that merely appear in a path
    or query string from being counted.

    Args:
        session: DB session used to run the query.
        internal_id: Value compared against ``Entity.investigation_id``.

    Returns:
        ``(number_of_distinct_paste_urls, sorted_source_labels)``. On any
        query failure the error is logged at debug level and ``(0, [])`` is
        returned.
    """
    from urllib.parse import urlsplit  # function-scope import keeps the block self-contained

    try:
        from db.models import Entity, Page

        rows = (
            session.query(Page.url)
            .join(Entity, Entity.page_id == Page.id)
            .filter(Entity.investigation_id == internal_id)
            .distinct()
            .all()
        )
    except Exception as exc:
        logger.debug("paste-page count failed: %s", exc)
        return 0, []

    # Display label per paste hostname; built once instead of on every match.
    source_labels = {
        "pastebin.com": "Pastebin",
        "rentry.co": "Rentry",
        "dpaste.org": "dpaste",
        "paste.ee": "paste.ee",
    }

    paste_urls: set[str] = set()
    sources_used: set[str] = set()
    for (url,) in rows:
        if not url:
            continue
        candidate = url.strip().lower()
        try:
            parts = urlsplit(candidate)
            if not parts.netloc:
                # urlsplit only recognises a host after "//", so scheme-less
                # values such as "pastebin.com/abc" are re-parsed with one.
                parts = urlsplit(f"//{candidate}")
            hostname = parts.hostname or ""
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): cannot be a paste page.
            continue
        for host in PASTE_SITE_HOSTNAMES:
            if hostname == host or hostname.endswith(f".{host}"):
                paste_urls.add(url)
                # Fall back to the hostname itself if no label is defined.
                sources_used.add(source_labels.get(host, host))
                break
    return len(paste_urls), sorted(sources_used)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _get_db_investigation(investigation_id: str) -> Any:
    """
    Return the investigation as a plain dict, or raise an HTTPException.

    Args:
        investigation_id: Investigation identifier as a UUID string; it is
            handed to ``get_investigation_by_id_or_run`` for the lookup.

    Returns:
        A dict with the investigation's stored fields, entity / relationship /
        page counts, paste-site statistics, and whatever the in-process
        infrastructure-cluster and sources-used caches hold for it.

    Raises:
        HTTPException: 503 when ``DATABASE_URL`` is not set; 422 when
            *investigation_id* is not a valid UUID; 404 when no investigation
            is found; 500 for any other failure (logged with traceback).
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        from db.session import get_session  # noqa: PLC0415
        from db.queries import (  # noqa: PLC0415
            count_distinct_pages_for_investigation,
            get_investigation_by_id_or_run,
        )

        from sqlalchemy import func  # noqa: PLC0415
        from db.models import Entity, EntityRelationship, InvestigationEntityLink  # noqa: PLC0415

        # Only a malformed ID should produce the 422. Scoping the ValueError
        # handler to the parse keeps ValueErrors raised by the DB/query code
        # from being reported as "Invalid investigation ID format".
        try:
            inv_uuid = uuid.UUID(investigation_id)
        except ValueError as exc:
            raise HTTPException(
                status_code=422, detail="Invalid investigation ID format"
            ) from exc

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            pages_crawled = count_distinct_pages_for_investigation(session, inv.id)
            paste_pages_found, paste_sources_used = _count_paste_pages_for_investigation(
                session, inv.id
            )

            # Entity IDs for this investigation = own entities + junction-table links
            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            entity_subq = (
                session.query(Entity.id)
                .filter(
                    (Entity.investigation_id == inv.id)
                    | Entity.id.in_(linked_ids_subq)
                )
                .subquery()
            )
            entity_count = int(
                session.query(func.count()).select_from(entity_subq).scalar() or 0
            )
            # A relationship counts when either endpoint belongs to the investigation.
            relationship_count = int(
                session.query(func.count(EntityRelationship.id))
                .filter(
                    (EntityRelationship.entity_a_id.in_(entity_subq))
                    | (EntityRelationship.entity_b_id.in_(entity_subq))
                )
                .scalar()
                or 0
            )
            return {
                "id": str(inv.id),
                "run_id": str(inv.run_id),
                "query": inv.query,
                "refined_query": inv.refined_query,
                "model_used": inv.model_used,
                "preset": inv.preset,
                "summary": inv.summary,
                "status": inv.status,
                "graph_status": getattr(inv, "graph_status", "pending"),
                "created_at": inv.created_at.isoformat() if inv.created_at else None,
                "current_step": inv.current_step or 0,
                # NOTE(review): hard-coded; STEP_LABELS defines 9 labels — confirm intended total.
                "total_steps": 13,
                "current_step_label": inv.current_step_label or "",
                "entity_count": entity_count,
                "relationship_count": relationship_count,
                "page_count": pages_crawled,
                "pages_crawled": pages_crawled,  # keep for compat
                "paste_pages_found": paste_pages_found,
                "paste_sources_used": paste_sources_used,
                # Caches are looked up under both the caller-supplied ID and the row's own ID.
                "infrastructure_clusters": _infra_cluster_cache.get(investigation_id, _infra_cluster_cache.get(str(inv.id), [])),
                "sources_used": _sources_used_cache.get(str(inv.id), _sources_used_cache.get(investigation_id, {})),
            }
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("_get_db_investigation failed: %s", exc)
        # NOTE(review): the exception text (truncated to 500 chars) is returned
        # to the client; consider a generic message if that leaks internals.
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        ) from exc
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
async def _update_investigation_status(
    investigation_id: uuid.UUID,
    status: str,
    model_used: Optional[str] = None,
    summary: Optional[str] = None,
) -> None:
    """
    Write a new status onto the investigation row, using its own session.

    ``model_used`` and ``summary`` are included in the update only when they
    are not None; the change is committed before the session is released.
    """
    from db.session import get_session
    from db.models import Investigation

    with get_session() as db:
        changes: dict[str, Any] = {"status": status}
        for column, value in (("model_used", model_used), ("summary", summary)):
            if value is not None:
                changes[column] = value
        matching = db.query(Investigation).filter_by(id=investigation_id)
        matching.update(changes)
        db.commit()
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
async def _update_progress(
    investigation_id: uuid.UUID,
    step: Optional[int] = None,
    entity_count: Optional[int] = None,
    scraped_pages: Optional[dict] = None,
    label: Optional[str] = None,
) -> None:
    """
    Best-effort update of the investigation's progress fields.

    Only the arguments that are not None are applied. When *step* is given
    without *label*, the label comes from ``STEP_LABELS`` (``"Processing"``
    for unknown steps). ``page_count`` is set to ``len(scraped_pages)``.
    Any exception is logged as a warning and swallowed.
    """
    try:
        from db.session import get_session
        from db.models import Investigation

        with get_session() as db:
            record = db.query(Investigation).filter_by(id=investigation_id).first()
            if record is None:
                return

            if step is None:
                # Label-only update: the step number is left as it is.
                if label is not None:
                    record.current_step_label = label
            else:
                record.current_step = step
                if label is None:
                    record.current_step_label = STEP_LABELS.get(step, "Processing")
                else:
                    record.current_step_label = label

            if entity_count is not None:
                record.entity_count = entity_count
            if scraped_pages is not None:
                record.page_count = len(scraped_pages)
            db.commit()
    except Exception as e:
        logger.warning("[%s] _update_progress failed (non-critical): %s", investigation_id, e)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
async def _get_investigation_model_choice(model: Optional[str]) -> tuple[str, Any]:
    """
    Resolve the model to run with and return it alongside the available choices.

    Preference order: the *model* argument, then ``config.DEFAULT_MODEL``,
    then ``"openrouter/deepseek/deepseek-chat"``.

    Raises:
        RuntimeError: when ``get_model_choices()`` returns nothing.
    """
    from db.session import get_session
    from voidaccess.llm_utils import get_model_choices
    import config as config_module

    # NOTE(review): the session object is never used inside the block;
    # the context is still entered so behaviour matches the original.
    with get_session():
        available = get_model_choices()
        if not available:
            raise RuntimeError("No LLM models available")

        chosen = model
        if not chosen:
            chosen = config_module.DEFAULT_MODEL
        if not chosen:
            chosen = "openrouter/deepseek/deepseek-chat"
        return chosen, available
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# ---------------------------------------------------------------------------
|
|
365
|
+
# Background task: run investigation pipeline
|
|
366
|
+
# ---------------------------------------------------------------------------
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _parse_rate_limit_reset(exc: Exception) -> float:
    """
    Work out how many seconds to wait after a 429 error.

    The text of *exc* is searched for an ``X-RateLimit-Reset`` entry holding a
    13-digit epoch-milliseconds timestamp. The header name is matched
    case-insensitively and may be wrapped in single or double quotes (or
    none), so lower-cased and JSON-style header dumps are recognised as well
    as the Python-repr form.

    Args:
        exc: The exception raised by the LLM call.

    Returns:
        Seconds until the reset time plus a 1 s buffer, never less than 5.0;
        65.0 when no reset timestamp can be found.
    """
    import re
    import time

    found = re.search(
        r"""['"]?X-RateLimit-Reset['"]?\s*:\s*['"]?(\d{13})""",
        str(exc),
        flags=re.IGNORECASE,
    )
    if found is None:
        # Fallback: 65s to outlast a 60s/1-min rate-limit window
        return 65.0

    reset_epoch_seconds = int(found.group(1)) / 1000.0
    wait = reset_epoch_seconds - time.time() + 1.0  # +1s buffer
    return max(wait, 5.0)
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
async def _llm_with_backoff(fn, *args, max_retries: int = 4, investigation_id: "uuid.UUID | None" = None, **kwargs):
    """
    Call the synchronous *fn* in a worker thread, retrying after rate-limit errors.

    An exception whose text contains ``"429"`` triggers a wait (length from
    ``_parse_rate_limit_reset``) and another attempt, unless it was the last
    allowed attempt. Every other exception is re-raised immediately. When
    *investigation_id* is given, the wait is also written to the
    investigation's progress label.

    Raises:
        RuntimeError: when the loop ends without a call having been made
            (``max_retries`` of zero or less).
    """
    attempt = 0
    while attempt < max_retries:
        try:
            return await asyncio.to_thread(fn, *args, **kwargs)
        except Exception as exc:
            # Not a rate-limit error, or no attempts left: let it propagate.
            if "429" not in str(exc) or attempt >= max_retries - 1:
                raise
            wait_secs = _parse_rate_limit_reset(exc)
            logger.info(
                "[LLM] Rate limit hit — waiting %.0fs before retry (attempt %d/%d)",
                wait_secs, attempt + 1, max_retries,
            )
            if investigation_id is not None:
                await _update_progress(investigation_id, label=f"Rate limited — retrying in {wait_secs:.0f}s...")
            await asyncio.sleep(wait_secs)
        attempt += 1
    raise RuntimeError("LLM max retries exceeded")
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
async def _run_investigation_task(
|
|
404
|
+
investigation_id: str, run_id: str, query: str, model: str, run_crawler: bool
|
|
405
|
+
) -> None:
|
|
406
|
+
"""
|
|
407
|
+
Background task that runs the investigation pipeline.
|
|
408
|
+
|
|
409
|
+
The investigation DB record already exists (created by the HTTP handler) with
|
|
410
|
+
status "pending". This task updates status → processing → completed/failed.
|
|
411
|
+
|
|
412
|
+
CRITICAL: Each DB operation uses its own short-lived session that commits
|
|
413
|
+
and closes immediately. No session is held open across asyncio.to_thread()
|
|
414
|
+
calls, which prevents SQLAlchemy session state corruption and connection
|
|
415
|
+
pool exhaustion.
|
|
416
|
+
|
|
417
|
+
Errors are logged — never propagated to the caller.
|
|
418
|
+
"""
|
|
419
|
+
try:
|
|
420
|
+
if not os.getenv("DATABASE_URL"):
|
|
421
|
+
logger.warning("Background investigation: DATABASE_URL not set, skipping persist")
|
|
422
|
+
return
|
|
423
|
+
|
|
424
|
+
from db.models import Investigation
|
|
425
|
+
from db.session import get_session, get_async_session
|
|
426
|
+
from db.queries import update_investigation_summary
|
|
427
|
+
from voidaccess.llm import filter_results, generate_summary, get_llm, refine_query
|
|
428
|
+
from voidaccess.llm_utils import get_model_choices
|
|
429
|
+
from search.search import _search_async as _search_engines_async, _dedupe_links as _search_dedupe, ENGINE_WEIGHTS as _engine_weights
|
|
430
|
+
from scraper.scrape import scrape_multiple, validate_urls_for_scraping
|
|
431
|
+
from extractor import extract_entities_from_pages
|
|
432
|
+
|
|
433
|
+
inv_uuid = uuid.UUID(investigation_id)
|
|
434
|
+
|
|
435
|
+
async with get_async_session() as session:
|
|
436
|
+
result = await session.execute(
|
|
437
|
+
sa_select(Investigation).where(Investigation.id == inv_uuid)
|
|
438
|
+
)
|
|
439
|
+
inv_record = result.scalar_one_or_none()
|
|
440
|
+
inv_user_id = inv_record.user_id if inv_record else None
|
|
441
|
+
|
|
442
|
+
resolved_keys = {}
|
|
443
|
+
if inv_user_id is not None:
|
|
444
|
+
async with get_async_session() as session:
|
|
445
|
+
from utils.user_keys import resolve_api_key
|
|
446
|
+
for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY",
|
|
447
|
+
"OPENROUTER_API_KEY", "GROQ_API_KEY", "OTX_API_KEY", "VT_API_KEY"):
|
|
448
|
+
resolved_keys[key_name] = await resolve_api_key(inv_user_id, key_name, session)
|
|
449
|
+
|
|
450
|
+
# ===== STEP 0: Get model choice and mark as processing =====
|
|
451
|
+
selected_model, _ = await _get_investigation_model_choice(model)
|
|
452
|
+
logger.info(
|
|
453
|
+
"Investigation %s: using model '%s'",
|
|
454
|
+
inv_uuid,
|
|
455
|
+
selected_model,
|
|
456
|
+
)
|
|
457
|
+
await _update_investigation_status(inv_uuid, "processing", model_used=selected_model)
|
|
458
|
+
await _update_progress(inv_uuid, 0)
|
|
459
|
+
logger.info("[%s] Starting investigation: %s", inv_uuid, query)
|
|
460
|
+
|
|
461
|
+
# ===== STEP 1: Query refinement (no session held) =====
|
|
462
|
+
logger.info("[%s] STEP 1: Refining query...", inv_uuid)
|
|
463
|
+
llm_client = None
|
|
464
|
+
refined_query = query
|
|
465
|
+
try:
|
|
466
|
+
llm_client = get_llm(selected_model, api_keys=resolved_keys)
|
|
467
|
+
refined_query = await _llm_with_backoff(refine_query, llm_client, query, investigation_id=inv_uuid)
|
|
468
|
+
logger.info("[%s] Refined query: %s", inv_uuid, refined_query)
|
|
469
|
+
except Exception as exc:
|
|
470
|
+
logger.exception("[%s] Query refinement failed, using original query: %s", inv_uuid, exc)
|
|
471
|
+
refined_query = query
|
|
472
|
+
|
|
473
|
+
def _persist_refined_query():
|
|
474
|
+
with get_session() as session:
|
|
475
|
+
inv = session.query(Investigation).filter_by(id=inv_uuid).first()
|
|
476
|
+
if inv:
|
|
477
|
+
inv.refined_query = refined_query
|
|
478
|
+
session.commit()
|
|
479
|
+
|
|
480
|
+
await asyncio.to_thread(_persist_refined_query)
|
|
481
|
+
await _update_progress(inv_uuid, 1)
|
|
482
|
+
if await _check_cancelled(inv_uuid, investigation_id):
|
|
483
|
+
return
|
|
484
|
+
|
|
485
|
+
# ===== STEP 1.5: Multilingual Query Expansion (no session held) =====
|
|
486
|
+
logger.info("[%s] STEP 1.5: Expanding query to multiple languages...", inv_uuid)
|
|
487
|
+
expanded_queries: dict[str, str] = {"en": refined_query}
|
|
488
|
+
try:
|
|
489
|
+
from i18n.query_expand import expand_query
|
|
490
|
+
|
|
491
|
+
expansion = expand_query(refined_query)
|
|
492
|
+
|
|
493
|
+
if expansion and isinstance(expansion, dict) and len(expansion) > 1:
|
|
494
|
+
expanded_queries = expansion
|
|
495
|
+
lang_count = len(expanded_queries)
|
|
496
|
+
logger.info(
|
|
497
|
+
"[%s] Query expanded to %d languages: %s",
|
|
498
|
+
inv_uuid,
|
|
499
|
+
lang_count,
|
|
500
|
+
list(expanded_queries.keys()),
|
|
501
|
+
)
|
|
502
|
+
else:
|
|
503
|
+
logger.info("[%s] Query expansion returned no results, using English only", inv_uuid)
|
|
504
|
+
|
|
505
|
+
except ImportError:
|
|
506
|
+
logger.info("[%s] i18n module not available, using English only", inv_uuid)
|
|
507
|
+
except Exception as e:
|
|
508
|
+
logger.info("[%s] Query expansion failed (non-fatal): %s", inv_uuid, e)
|
|
509
|
+
|
|
510
|
+
# ===== SEED URL INJECTION (runs before search engine fan-out) =====
|
|
511
|
+
# Curated, known-active .onion intelligence sources are checked first
|
|
512
|
+
# so we always visit relevant leak sites/forums even if search engines
|
|
513
|
+
# don't surface them. These bypass the LLM filter.
|
|
514
|
+
relevant_seeds: list[dict] = []
|
|
515
|
+
try:
|
|
516
|
+
seed_manager = get_seed_manager()
|
|
517
|
+
relevant_seeds = seed_manager.get_relevant_seeds(
|
|
518
|
+
query=query,
|
|
519
|
+
refined_query=refined_query or "",
|
|
520
|
+
max_seeds=10,
|
|
521
|
+
)
|
|
522
|
+
except Exception as exc:
|
|
523
|
+
logger.info("[%s] Seed manager unavailable (non-fatal): %s", inv_uuid, exc)
|
|
524
|
+
relevant_seeds = []
|
|
525
|
+
|
|
526
|
+
seed_urls: list[dict] = []
|
|
527
|
+
if relevant_seeds:
|
|
528
|
+
for s in relevant_seeds:
|
|
529
|
+
url = s.get("url") or ""
|
|
530
|
+
if not url:
|
|
531
|
+
continue
|
|
532
|
+
seed_urls.append({
|
|
533
|
+
"link": url,
|
|
534
|
+
"title": s.get("name", "Seed source"),
|
|
535
|
+
"source": "seed",
|
|
536
|
+
"source_type": "seed",
|
|
537
|
+
"seed_category": s.get("category", "unknown"),
|
|
538
|
+
"seed_tags": s.get("tags", []),
|
|
539
|
+
})
|
|
540
|
+
categories = sorted({s.get("category", "unknown") for s in relevant_seeds})
|
|
541
|
+
logger.info(
|
|
542
|
+
"[%s] Injecting %d seed URLs into scrape queue (categories: %s)",
|
|
543
|
+
inv_uuid,
|
|
544
|
+
len(seed_urls),
|
|
545
|
+
categories,
|
|
546
|
+
)
|
|
547
|
+
await _update_progress(
|
|
548
|
+
inv_uuid,
|
|
549
|
+
step=2,
|
|
550
|
+
label=f"Checking {len(seed_urls)} known intelligence sources + searching Tor engines",
|
|
551
|
+
)
|
|
552
|
+
else:
|
|
553
|
+
logger.info("[%s] No relevant seeds for query", inv_uuid)
|
|
554
|
+
|
|
555
|
+
# ===== STEP 2, 3.5, 4: Parallel Pipeline =====
|
|
556
|
+
logger.info("[%s] STEP 2/3.5/4: Launching Search, Enrichment, and Crawler concurrently...", inv_uuid)
|
|
557
|
+
|
|
558
|
+
async def run_search_and_filter() -> list:
    """Search the Tor engines for every expanded query, then filter the hits.

    One search runs per entry of ``expanded_queries`` (language code -> query
    text). Results are merged and de-duplicated across languages, passed
    through the LLM relevance filter when ``llm_client`` is available, and
    finally topped up with unfiltered results when the filter kept fewer than
    100 entries (the top-up stops once 150 entries are queued).

    Returns:
        The list of search-result dicts that should be scraped.
    """
    logger.info("[%s] STEP 2: Searching dark web...", inv_uuid)

    async def search_single_language(lang_code: str, q: str) -> list[dict]:
        """Query the engines for one language variant.

        Any failure is logged and turned into an empty list, so one broken
        language never aborts the others.
        """
        search_query = q.replace(" ", "+")
        logger.info("[%s] Searching [%s]: %s...", inv_uuid, lang_code, search_query[:60])
        try:
            engine_results = await _search_engines_async(search_query)
            all_links: list[dict] = []
            for er in engine_results:
                engine_name = er.name.lower()
                # First known engine whose key appears in the engine name
                # decides the weight; unknown engines get a neutral 0.5.
                weight = next(
                    (
                        _engine_weights[known]
                        for known in _engine_weights
                        if known in engine_name
                    ),
                    0.5,
                )
                for link in er.links:
                    link["source_engine"] = er.name
                    link["source_weight"] = weight
                    all_links.append(link)
            lang_results = _search_dedupe(all_links)
            lang_results.sort(key=lambda r: r.get("source_weight", 0.5), reverse=True)
            for result in lang_results:
                result["search_language"] = lang_code
            return lang_results
        except Exception as e:
            logger.info("[%s] [%s] search failed: %s", inv_uuid, lang_code, e)
            return []

    search_tasks = [
        asyncio.create_task(search_single_language(lang, q))
        for lang, q in expanded_queries.items()
    ]
    results_by_language: list = []
    try:
        # asyncio.wait() rejects an empty collection, so only wait when there
        # is at least one language to search.
        if search_tasks:
            # Unlike wait_for(gather(...)), wait() leaves finished tasks
            # intact on timeout, so languages that already completed are
            # kept instead of being thrown away with the slow ones.
            done, pending = await asyncio.wait(search_tasks, timeout=180)
            if pending:
                logger.warning("[%s] Multilingual search timed out after 180s, using partial results", inv_uuid)
            # Walk the original task list to keep the language order stable.
            for task in search_tasks:
                if task in done and not task.cancelled() and task.exception() is None:
                    results_by_language.append(task.result())
    finally:
        # Covers both the timeout path and cancellation of this coroutine:
        # never leave a search running in the background.
        for task in search_tasks:
            if not task.done():
                task.cancel()

    all_search_results = []
    seen_urls = set()
    for lang_results in results_by_language:
        for result in lang_results:
            # "or" guards against an explicit None link, which would
            # otherwise crash on .lower().
            url = result.get("link") or ""
            normalized = url.lower().rstrip("/").replace("https://", "http://")
            if normalized and normalized not in seen_urls:
                seen_urls.add(normalized)
                all_search_results.append(result)

    search_results = all_search_results
    logger.info("[%s] Total search results: %d (from %d languages)", inv_uuid, len(search_results), len(expanded_queries))

    if not search_results:
        logger.info("[%s] WARNING: No search results from any language", inv_uuid)

    logger.info("[%s] STEP 3: Filtering results...", inv_uuid)
    if llm_client is None:
        filtered_results = list(search_results[:100])
        logger.info("[%s] LLM unavailable; fallback to top %s search results", inv_uuid, len(filtered_results))
    else:
        try:
            filtered_results = await _llm_with_backoff(filter_results, llm_client, refined_query, search_results, investigation_id=inv_uuid)
        except Exception as exc:
            logger.exception("[%s] Filter step failed, falling back: %s", inv_uuid, exc)
            filtered_results = list(search_results[:100])
    logger.info("[%s] Filtered to %s results", inv_uuid, len(filtered_results))

    _urls_to_scrape = list(filtered_results)
    if len(_urls_to_scrape) < 100:
        # The filter kept few pages: back-fill with unfiltered results that
        # are not queued yet, stopping once 150 entries are queued.
        current_links = {res.get("link") for res in _urls_to_scrape if res.get("link")}
        for res in search_results:
            if res.get("link") not in current_links:
                _urls_to_scrape.append(res)
                current_links.add(res.get("link"))
                if len(_urls_to_scrape) >= 150:
                    break
    return _urls_to_scrape
|
|
639
|
+
|
|
640
|
+
async def run_enrichment() -> list:
    """Collect threat-intel enrichment pages for the original and refined query.

    The refined query is only tried when it differs from the original
    (compared case-insensitively, ignoring surrounding whitespace). Each
    query is capped at 60 seconds. Pages are de-duplicated on their
    ``url``/``link`` value. Every failure is logged and treated as non-fatal.

    Returns:
        The de-duplicated list of enrichment page dicts (possibly empty).
    """
    logger.info("[%s] STEP 3.5: Running threat intel enrichment...", inv_uuid)
    try:
        from sources.enrichment import enrich_investigation

        refinement_differs = (
            refined_query
            and refined_query.strip().lower() != query.strip().lower()
        )
        queries_to_enrich = [query, refined_query] if refinement_differs else [query]

        collected: list = []
        known_urls: set = set()
        for eq in queries_to_enrich:
            try:
                # Hard 60s cap per enrichment query — individual requests already have 30s timeouts
                lookup = enrich_investigation(
                    query=eq,
                    otx_api_key=resolved_keys.get("OTX_API_KEY") or "",
                )
                batch = await asyncio.wait_for(lookup, timeout=60)
                for page in batch:
                    page_url = page.get("url") or page.get("link") or ""
                    if page_url in known_urls:
                        continue
                    known_urls.add(page_url)
                    collected.append(page)
            except asyncio.TimeoutError:
                logger.warning("[%s] Enrichment query '%s' timed out after 60s", inv_uuid, eq)
            except Exception as exc:
                logger.info("[%s] Enrichment batch failed for '%s': %s", inv_uuid, eq, exc)

        logger.info("[%s] Enrichment: %s pages (tried %s queries)", inv_uuid, len(collected), len(queries_to_enrich))
        return collected
    except Exception as exc:
        logger.info("[%s] Enrichment failed (non-fatal): %s", inv_uuid, exc)
        return []
|
|
676
|
+
|
|
677
|
+
async def run_crawler_task() -> list:
    """Run the bounded recursive crawler and return the URLs it discovered.

    Does nothing when ``run_crawler`` is falsy. Seeds come from
    ``get_seeds(category="index", ...)``, fetched in a worker thread. A
    timeout or any other failure is logged and yields an empty list.

    Returns:
        A list of ``{"link": ..., "title": "Crawler discovery"}`` dicts.
    """
    if not run_crawler:
        logger.info("[%s] STEP 4: Crawler disabled", inv_uuid)
        return []
    try:
        logger.info("[%s] STEP 4: Running recursive crawler...", inv_uuid)
        index_seeds = await asyncio.to_thread(get_seeds, category="index", query=refined_query)
        start_urls = []
        for seed in index_seeds:
            if seed.get("url"):
                start_urls.append(seed["url"])
        # max_depth=1 and max_pages=20 keep the crawler bounded;
        # 120s hard cap prevents dead Tor circuits from stalling the pipeline
        crawl_job = crawl(seed_urls=start_urls, query=refined_query, max_depth=1, max_pages=20)
        crawler_result = await asyncio.wait_for(crawl_job, timeout=120)
        logger.info("[%s] Crawler: %s pages, %s failed", inv_uuid, crawler_result.pages_crawled, crawler_result.pages_failed)
        discoveries = []
        for item in crawler_result.results:
            # Only dict entries that carry a non-empty URL are usable.
            if isinstance(item, dict) and item.get("url"):
                discoveries.append({"link": item.get("url", ""), "title": "Crawler discovery"})
        return discoveries
    except asyncio.TimeoutError:
        logger.warning("[%s] Crawler timed out after 120s, continuing without crawler results", inv_uuid)
        return []
    except Exception as exc:
        logger.exception("[%s] Crawler failed: %s", inv_uuid, str(exc))
        return []
|
|
700
|
+
|
|
701
|
+
async def run_paste_scraping_task() -> list:
    """Sweep clearnet paste sites (Pastebin, dpaste, paste.ee, Rentry).

    Opt-out via PASTE_SCRAPING_ENABLED=false. The result cap is read from
    PASTE_MAX_RESULTS (15 when unset, empty, or not an integer). The sweep is
    capped at 120 seconds; a timeout or any other failure is logged and
    yields an empty list.

    Returns:
        The pages returned by ``scrape_paste_sites``, or ``[]``.
    """
    if not _paste_scraping_enabled():
        logger.info("[%s] Paste sites: disabled via env var", inv_uuid)
        return []

    raw_limit = os.getenv("PASTE_MAX_RESULTS", "15")
    try:
        limit = int(raw_limit or 15)
    except ValueError:
        limit = 15

    try:
        sweep = scrape_paste_sites(
            query=query,
            refined_query=refined_query or "",
            max_results=limit,
        )
        found = await asyncio.wait_for(sweep, timeout=120)
        logger.info("[%s] Paste sites: %d pastes found", inv_uuid, len(found))
        return found
    except asyncio.TimeoutError:
        logger.warning("[%s] Paste scraping timed out after 120s", inv_uuid)
        return []
    except Exception as exc:
        logger.info("[%s] Paste scraping failed (non-fatal): %s", inv_uuid, exc)
        return []
|
|
732
|
+
|
|
733
|
+
async def run_github_scraping_task() -> list:
    """Sweep clearnet GitHub — code search + repo READMEs.

    Opt-out via GITHUB_SCRAPING_ENABLED=false. The result cap is read from
    GITHUB_MAX_RESULTS (15 when unset, empty, or not an integer). The sweep
    is capped at 180 seconds; a timeout or any other failure is logged and
    yields an empty list.

    Returns:
        The pages returned by ``scrape_github``, or ``[]``.
    """
    if not _github_scraping_enabled():
        logger.info("[%s] GitHub: disabled via env var", inv_uuid)
        return []

    raw_limit = os.getenv("GITHUB_MAX_RESULTS", "15")
    try:
        limit = int(raw_limit or 15)
    except ValueError:
        limit = 15

    try:
        sweep = scrape_github(
            query=query,
            refined_query=refined_query or "",
            max_results=limit,
        )
        found = await asyncio.wait_for(sweep, timeout=180)
        logger.info("[%s] GitHub: %d files found", inv_uuid, len(found))
        return found
    except asyncio.TimeoutError:
        logger.warning("[%s] GitHub scraping timed out after 180s", inv_uuid)
        return []
    except Exception as exc:
        logger.info("[%s] GitHub scraping failed (non-fatal): %s", inv_uuid, exc)
        return []
|
|
764
|
+
|
|
765
|
+
async def run_gitlab_scraping_task() -> list:
    """Sweep clearnet GitLab — code search + project READMEs.

    Opt-out via GITLAB_SCRAPING_ENABLED=false. The result cap is read from
    GITLAB_MAX_RESULTS (15 when unset, empty, or not an integer). The sweep
    is capped at 180 seconds; a timeout or any other failure is logged and
    yields an empty list.

    Returns:
        The pages returned by ``scrape_gitlab``, or ``[]``.
    """
    if not _gitlab_scraping_enabled():
        logger.info("[%s] GitLab: disabled via env var", inv_uuid)
        return []

    raw_limit = os.getenv("GITLAB_MAX_RESULTS", "15")
    try:
        limit = int(raw_limit or 15)
    except ValueError:
        limit = 15

    try:
        sweep = scrape_gitlab(
            query=query,
            refined_query=refined_query or "",
            max_results=limit,
        )
        found = await asyncio.wait_for(sweep, timeout=180)
        logger.info("[%s] GitLab: %d results found", inv_uuid, len(found))
        return found
    except asyncio.TimeoutError:
        logger.warning("[%s] GitLab scraping timed out after 180s", inv_uuid)
        return []
    except Exception as exc:
        logger.info("[%s] GitLab scraping failed (non-fatal): %s", inv_uuid, exc)
        return []
|
|
796
|
+
|
|
797
|
+
async def run_rss_scraping_task() -> list:
    """Collect articles from the RSS feed scraper.

    Skipped when ``_rss_scraping_enabled()`` is false. The article cap is
    read from RSS_MAX_ARTICLES (20 when unset, empty, or not an integer).
    The sweep is capped at 120 seconds; a timeout or any other failure is
    logged and yields an empty list.

    Returns:
        The pages returned by ``scrape_rss_feeds``, or ``[]``.
    """
    if not _rss_scraping_enabled():
        logger.info("[%s] RSS feeds: disabled via env var", inv_uuid)
        return []

    raw_limit = os.getenv("RSS_MAX_ARTICLES", "20")
    try:
        limit = int(raw_limit or 20)
    except ValueError:
        limit = 20

    try:
        sweep = scrape_rss_feeds(
            query=query,
            refined_query=refined_query or "",
            max_results=limit,
        )
        found = await asyncio.wait_for(sweep, timeout=120)
        logger.info("[%s] RSS feeds: %d articles found", inv_uuid, len(found))
        return found
    except asyncio.TimeoutError:
        logger.warning("[%s] RSS scraping timed out after 120s", inv_uuid)
        return []
    except Exception as exc:
        logger.info("[%s] RSS scraping failed (non-fatal): %s", inv_uuid, exc)
        return []
|
|
822
|
+
|
|
823
|
+
# Hard 5-minute cap on the entire parallel phase (search + enrichment +
|
|
824
|
+
# crawler + paste scraping + github scraping + gitlab scraping + RSS
|
|
825
|
+
# feeds). Each inner function also has its own timeout so partial
|
|
826
|
+
# results are preserved even if only one hangs.
|
|
827
|
+
# return_exceptions=True ensures one failing task never cancels the others.
|
|
828
|
+
try:
|
|
829
|
+
_gr = await asyncio.wait_for(
|
|
830
|
+
asyncio.gather(
|
|
831
|
+
run_search_and_filter(),
|
|
832
|
+
run_enrichment(),
|
|
833
|
+
run_crawler_task(),
|
|
834
|
+
run_paste_scraping_task(),
|
|
835
|
+
run_github_scraping_task(),
|
|
836
|
+
run_gitlab_scraping_task(),
|
|
837
|
+
run_rss_scraping_task(),
|
|
838
|
+
return_exceptions=True,
|
|
839
|
+
),
|
|
840
|
+
timeout=300,
|
|
841
|
+
)
|
|
842
|
+
except asyncio.TimeoutError:
|
|
843
|
+
logger.warning("[%s] Parallel phase hit 300s hard cap — using empty results", inv_uuid)
|
|
844
|
+
_gr = [[], [], [], [], [], [], []]
|
|
845
|
+
|
|
846
|
+
_source_errors: set[str] = set()
|
|
847
|
+
|
|
848
|
+
if isinstance(_gr[0], Exception):
|
|
849
|
+
logger.warning("[%s] Search+filter task raised: %s", inv_uuid, _gr[0])
|
|
850
|
+
_source_errors.add("tor_search")
|
|
851
|
+
search_urls = []
|
|
852
|
+
else:
|
|
853
|
+
search_urls = _gr[0]
|
|
854
|
+
|
|
855
|
+
if isinstance(_gr[1], Exception):
|
|
856
|
+
logger.warning("[%s] Enrichment task raised: %s", inv_uuid, _gr[1])
|
|
857
|
+
_source_errors.add("enrichment")
|
|
858
|
+
enrichment_pages = []
|
|
859
|
+
else:
|
|
860
|
+
enrichment_pages = _gr[1]
|
|
861
|
+
|
|
862
|
+
if isinstance(_gr[2], Exception):
|
|
863
|
+
logger.warning("[%s] Crawler task raised: %s", inv_uuid, _gr[2])
|
|
864
|
+
crawler_urls = []
|
|
865
|
+
else:
|
|
866
|
+
crawler_urls = _gr[2]
|
|
867
|
+
|
|
868
|
+
if isinstance(_gr[3], Exception):
|
|
869
|
+
logger.warning("[%s] Paste scraping task raised: %s", inv_uuid, _gr[3])
|
|
870
|
+
_source_errors.add("paste_sites")
|
|
871
|
+
paste_pages = []
|
|
872
|
+
else:
|
|
873
|
+
paste_pages = _gr[3]
|
|
874
|
+
|
|
875
|
+
if isinstance(_gr[4], Exception):
|
|
876
|
+
logger.warning("[%s] GitHub scraping task raised: %s", inv_uuid, _gr[4])
|
|
877
|
+
_source_errors.add("github")
|
|
878
|
+
github_pages = []
|
|
879
|
+
else:
|
|
880
|
+
github_pages = _gr[4]
|
|
881
|
+
|
|
882
|
+
if isinstance(_gr[5], Exception):
|
|
883
|
+
logger.warning("[%s] GitLab scraping task raised: %s", inv_uuid, _gr[5])
|
|
884
|
+
_source_errors.add("gitlab")
|
|
885
|
+
gitlab_pages = []
|
|
886
|
+
else:
|
|
887
|
+
gitlab_pages = _gr[5]
|
|
888
|
+
|
|
889
|
+
if isinstance(_gr[6], Exception):
|
|
890
|
+
logger.warning("[%s] RSS scraping task raised: %s", inv_uuid, _gr[6])
|
|
891
|
+
_source_errors.add("rss_feeds")
|
|
892
|
+
rss_pages = []
|
|
893
|
+
else:
|
|
894
|
+
rss_pages = _gr[6]
|
|
895
|
+
|
|
896
|
+
await _update_progress(inv_uuid, 2)
|
|
897
|
+
if await _check_cancelled(inv_uuid, investigation_id):
|
|
898
|
+
return
|
|
899
|
+
|
|
900
|
+
if paste_pages:
|
|
901
|
+
paste_sources_used = sorted({
|
|
902
|
+
p.get("source_name") for p in paste_pages
|
|
903
|
+
if p.get("source_name")
|
|
904
|
+
})
|
|
905
|
+
await _update_progress(
|
|
906
|
+
inv_uuid,
|
|
907
|
+
label=(
|
|
908
|
+
f"Found {len(paste_pages)} paste site results "
|
|
909
|
+
f"({', '.join(paste_sources_used)})"
|
|
910
|
+
),
|
|
911
|
+
)
|
|
912
|
+
|
|
913
|
+
# ── sources_used: record which sources ran and what they returned ──────
|
|
914
|
+
_otx_key = (resolved_keys.get("OTX_API_KEY") or "").strip()
|
|
915
|
+
_vt_key = os.getenv("VT_API_KEY", "").strip()
|
|
916
|
+
_st_key = os.getenv("SECURITYTRAILS_API_KEY", "").strip()
|
|
917
|
+
|
|
918
|
+
def _src_status(count: int, error_key: str | None = None) -> str:
    """Build the status string recorded for one source.

    Args:
        count: Number of results the source contributed.
        error_key: Optional key to look up in ``_source_errors``.

    Returns:
        ``"error"`` when ``error_key`` is set and present in
        ``_source_errors``; otherwise ``"ok_<count>_results"``, with
        non-positive counts reported as ``"ok_0_results"``.
    """
    failed = bool(error_key) and error_key in _source_errors
    if failed:
        return "error"
    # Clamp so that zero and negative counts both read "ok_0_results".
    reported = count if count > 0 else 0
    return f"ok_{reported}_results"
|
|
922
|
+
|
|
923
|
+
sources_used: dict[str, str] = {}
|
|
924
|
+
|
|
925
|
+
# Keyed sources — show "skipped_no_key" when the key is absent
|
|
926
|
+
if not _otx_key:
|
|
927
|
+
sources_used["otx"] = "skipped_no_key"
|
|
928
|
+
else:
|
|
929
|
+
n = sum(1 for p in enrichment_pages if p.get("source") == "alienvault_otx")
|
|
930
|
+
sources_used["otx"] = _src_status(n, "enrichment")
|
|
931
|
+
|
|
932
|
+
if not _vt_key:
|
|
933
|
+
sources_used["virustotal"] = "skipped_no_key"
|
|
934
|
+
else:
|
|
935
|
+
n = sum(1 for p in enrichment_pages if p.get("source") == "virustotal")
|
|
936
|
+
sources_used["virustotal"] = _src_status(n, "enrichment")
|
|
937
|
+
|
|
938
|
+
sources_used["securitytrails"] = "skipped_no_key" if not _st_key else "skipped_not_implemented"
|
|
939
|
+
|
|
940
|
+
# Free enrichment sources
|
|
941
|
+
for _skey, _psrc in [
|
|
942
|
+
("malwarebazaar", "malwarebazaar"),
|
|
943
|
+
("threatfox", "threatfox"),
|
|
944
|
+
("urlhaus", "urlhaus"),
|
|
945
|
+
]:
|
|
946
|
+
n = sum(1 for p in enrichment_pages if p.get("source") == _psrc)
|
|
947
|
+
sources_used[_skey] = _src_status(n, "enrichment")
|
|
948
|
+
|
|
949
|
+
_rl_n = sum(
|
|
950
|
+
1 for p in enrichment_pages
|
|
951
|
+
if p.get("source") == "ransomware_live" and not p.get("_scrape_seed")
|
|
952
|
+
)
|
|
953
|
+
sources_used["ransomware_live"] = _src_status(_rl_n, "enrichment")
|
|
954
|
+
|
|
955
|
+
_cisa_n = sum(1 for p in enrichment_pages if p.get("source") in ("cisa_kev", "cisa_advisory"))
|
|
956
|
+
sources_used["cisa"] = _src_status(_cisa_n, "enrichment")
|
|
957
|
+
|
|
958
|
+
_shodan_n = sum(1 for p in enrichment_pages if p.get("source") == "shodan_internetdb")
|
|
959
|
+
sources_used["shodan"] = _src_status(_shodan_n, "enrichment")
|
|
960
|
+
|
|
961
|
+
# Tor search
|
|
962
|
+
if "tor_search" in _source_errors:
|
|
963
|
+
sources_used["tor_search"] = "error"
|
|
964
|
+
else:
|
|
965
|
+
n = len(search_urls)
|
|
966
|
+
sources_used["tor_search"] = f"ok_{n}_pages" if n > 0 else "ok_0_pages"
|
|
967
|
+
|
|
968
|
+
# Clearnet scrapers
|
|
969
|
+
if not _github_scraping_enabled():
|
|
970
|
+
sources_used["github"] = "skipped_disabled"
|
|
971
|
+
elif "github" in _source_errors:
|
|
972
|
+
sources_used["github"] = "error"
|
|
973
|
+
else:
|
|
974
|
+
sources_used["github"] = _src_status(len(github_pages))
|
|
975
|
+
|
|
976
|
+
if not _gitlab_scraping_enabled():
|
|
977
|
+
sources_used["gitlab"] = "skipped_disabled"
|
|
978
|
+
elif "gitlab" in _source_errors:
|
|
979
|
+
sources_used["gitlab"] = "error"
|
|
980
|
+
else:
|
|
981
|
+
sources_used["gitlab"] = _src_status(len(gitlab_pages))
|
|
982
|
+
|
|
983
|
+
if not _paste_scraping_enabled():
|
|
984
|
+
sources_used["paste_sites"] = "skipped_disabled"
|
|
985
|
+
elif "paste_sites" in _source_errors:
|
|
986
|
+
sources_used["paste_sites"] = "error"
|
|
987
|
+
else:
|
|
988
|
+
sources_used["paste_sites"] = _src_status(len(paste_pages))
|
|
989
|
+
|
|
990
|
+
if not _rss_scraping_enabled():
|
|
991
|
+
sources_used["rss_feeds"] = "skipped_disabled"
|
|
992
|
+
elif "rss_feeds" in _source_errors:
|
|
993
|
+
sources_used["rss_feeds"] = "error"
|
|
994
|
+
else:
|
|
995
|
+
sources_used["rss_feeds"] = _src_status(len(rss_pages))
|
|
996
|
+
|
|
997
|
+
# DNS, domain, hash, and email reputation placeholders — updated after those steps complete
|
|
998
|
+
sources_used["circl_pdns"] = "pending"
|
|
999
|
+
sources_used["domain_reputation"] = "pending"
|
|
1000
|
+
sources_used["hash_reputation"] = "pending"
|
|
1001
|
+
sources_used["email_reputation"] = "pending"
|
|
1002
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1003
|
+
# ── end sources_used ──────────────────────────────────────────────────
|
|
1004
|
+
|
|
1005
|
+
if len(search_urls) < 2:
|
|
1006
|
+
logger.warning(
|
|
1007
|
+
"[%s] Filtered results too small (%s INTELLIGENCE pages). "
|
|
1008
|
+
"Query may have returned only directory/index pages. "
|
|
1009
|
+
"Try a more specific query.",
|
|
1010
|
+
inv_uuid,
|
|
1011
|
+
len(search_urls),
|
|
1012
|
+
)
|
|
1013
|
+
no_result_summary = (
|
|
1014
|
+
f"Investigation for '{refined_query}' completed but found insufficient "
|
|
1015
|
+
f"intelligence content. Only {len(search_urls)} qualifying page(s) remained "
|
|
1016
|
+
f"after filtering out directory/index pages. This suggests the query "
|
|
1017
|
+
f"returned primarily link aggregators or marketplace indexes rather than "
|
|
1018
|
+
f"actual threat intelligence content. Try a more specific, targeted query "
|
|
1019
|
+
f"(e.g., specific malware names, actor handles, or infrastructure indicators) "
|
|
1020
|
+
f"instead of broad topic searches."
|
|
1021
|
+
)
|
|
1022
|
+
with get_session() as session:
|
|
1023
|
+
session.query(Investigation).filter_by(id=inv_uuid).update(
|
|
1024
|
+
{"status": "completed_no_results", "summary": no_result_summary, "graph_status": "no_data"}
|
|
1025
|
+
)
|
|
1026
|
+
session.commit()
|
|
1027
|
+
logger.info("[%s] Investigation COMPLETED_NO_RESULTS (run_id=%s)", inv_uuid, run_id)
|
|
1028
|
+
return
|
|
1029
|
+
|
|
1030
|
+
# Seed .onion leak-site URLs discovered by enrichment (e.g. ransomware.live)
|
|
1031
|
+
# into the scrape queue so they get visited even if search engines didn't find them
|
|
1032
|
+
enrichment_onion_seeds = [
|
|
1033
|
+
{"link": p.get("link") or p.get("url"), "title": p.get("title", "Enrichment seed")}
|
|
1034
|
+
for p in enrichment_pages
|
|
1035
|
+
if p.get("_scrape_seed") and ".onion" in (p.get("link") or p.get("url") or "")
|
|
1036
|
+
]
|
|
1037
|
+
if enrichment_onion_seeds:
|
|
1038
|
+
logger.info(
|
|
1039
|
+
"[%s] Adding %d .onion seeds from enrichment to scrape queue",
|
|
1040
|
+
inv_uuid, len(enrichment_onion_seeds),
|
|
1041
|
+
)
|
|
1042
|
+
|
|
1043
|
+
# Seed URLs go first — they're known intelligence sources and skip the LLM filter
|
|
1044
|
+
all_urls_to_scrape = seed_urls + search_urls + crawler_urls + enrichment_onion_seeds
|
|
1045
|
+
logger.info(
|
|
1046
|
+
"[%s] Total URLs to scrape: %s (%s seeds + %s search + %s crawler + %s enrichment)",
|
|
1047
|
+
inv_uuid,
|
|
1048
|
+
len(all_urls_to_scrape),
|
|
1049
|
+
len(seed_urls),
|
|
1050
|
+
len(search_urls),
|
|
1051
|
+
len(crawler_urls),
|
|
1052
|
+
len(enrichment_onion_seeds),
|
|
1053
|
+
)
|
|
1054
|
+
|
|
1055
|
+
if enrichment_pages:
|
|
1056
|
+
try:
|
|
1057
|
+
from vector.store import store_page
|
|
1058
|
+
for ep in enrichment_pages:
|
|
1059
|
+
u = ep.get("url") or ep.get("link") or ""
|
|
1060
|
+
t = ep.get("text") or ep.get("content") or ""
|
|
1061
|
+
if u and t:
|
|
1062
|
+
store_page(url=u, content=t, metadata={"source": ep.get("source", "enrichment")})
|
|
1063
|
+
except Exception:
|
|
1064
|
+
pass
|
|
1065
|
+
|
|
1066
|
+
# ===== STEP 4.5: Vector Cache Lookup (no session held) =====
|
|
1067
|
+
logger.info(
|
|
1068
|
+
"[%s] STEP 4.5: Checking vector cache for %d URLs...",
|
|
1069
|
+
inv_uuid,
|
|
1070
|
+
len(all_urls_to_scrape),
|
|
1071
|
+
)
|
|
1072
|
+
cached_dict: dict = {}
|
|
1073
|
+
uncached_url_dicts = list(all_urls_to_scrape)
|
|
1074
|
+
try:
|
|
1075
|
+
from vector.store import bulk_check_cache
|
|
1076
|
+
|
|
1077
|
+
url_strings = [
|
|
1078
|
+
u.get("link", u) if isinstance(u, dict) else str(u)
|
|
1079
|
+
for u in all_urls_to_scrape
|
|
1080
|
+
]
|
|
1081
|
+
cached_pages_list, urls_needing_scrape = bulk_check_cache(
|
|
1082
|
+
url_strings, max_age_hours=24
|
|
1083
|
+
)
|
|
1084
|
+
cached_dict = {p["link"]: p["content"] for p in cached_pages_list}
|
|
1085
|
+
uncached_set = set(urls_needing_scrape)
|
|
1086
|
+
uncached_url_dicts = [
|
|
1087
|
+
u for u in all_urls_to_scrape
|
|
1088
|
+
if (u.get("link", u) if isinstance(u, dict) else str(u))
|
|
1089
|
+
in uncached_set
|
|
1090
|
+
]
|
|
1091
|
+
logger.info(
|
|
1092
|
+
"[%s] Cache: %d hits, %d misses (need Tor)",
|
|
1093
|
+
inv_uuid,
|
|
1094
|
+
len(cached_dict),
|
|
1095
|
+
len(uncached_url_dicts),
|
|
1096
|
+
)
|
|
1097
|
+
except Exception as exc:
|
|
1098
|
+
logger.info("[%s] Cache check failed (non-fatal): %s", inv_uuid, exc)
|
|
1099
|
+
cached_dict = {}
|
|
1100
|
+
uncached_url_dicts = list(all_urls_to_scrape)
|
|
1101
|
+
|
|
1102
|
+
# ===== STEP 5: Scraping (no session held) =====
|
|
1103
|
+
uncached_url_dicts, ssrf_blocked = validate_urls_for_scraping(uncached_url_dicts)
|
|
1104
|
+
if ssrf_blocked:
|
|
1105
|
+
logger.info(
|
|
1106
|
+
"[%s] SSRF: blocked %d unsafe URLs",
|
|
1107
|
+
inv_uuid,
|
|
1108
|
+
len(ssrf_blocked),
|
|
1109
|
+
)
|
|
1110
|
+
logger.info(
|
|
1111
|
+
"[%s] STEP 5: Scraping %d URLs (skipped %d cached)...",
|
|
1112
|
+
inv_uuid,
|
|
1113
|
+
len(uncached_url_dicts),
|
|
1114
|
+
len(cached_dict),
|
|
1115
|
+
)
|
|
1116
|
+
freshly_scraped = await scrape_multiple(uncached_url_dicts, max_workers=12)
|
|
1117
|
+
await _update_progress(inv_uuid, 4, scraped_pages=freshly_scraped)
|
|
1118
|
+
if await _check_cancelled(inv_uuid, investigation_id):
|
|
1119
|
+
return
|
|
1120
|
+
|
|
1121
|
+
# ===== STEP 5.5: Store new pages in vector cache (no session held) =====
|
|
1122
|
+
try:
|
|
1123
|
+
from vector.store import store_page
|
|
1124
|
+
|
|
1125
|
+
stored_count = 0
|
|
1126
|
+
for page_url, page_text in freshly_scraped.items():
|
|
1127
|
+
if page_text and len(page_text) > 100:
|
|
1128
|
+
if store_page(url=page_url, content=page_text, metadata={"source": "scraper"}):
|
|
1129
|
+
stored_count += 1
|
|
1130
|
+
logger.info("[%s] Stored %d new pages in vector cache", inv_uuid, stored_count)
|
|
1131
|
+
except Exception as exc:
|
|
1132
|
+
logger.info("[%s] Cache store failed (non-fatal): %s", inv_uuid, exc)
|
|
1133
|
+
|
|
1134
|
+
scraped_pages = {**cached_dict, **freshly_scraped}
|
|
1135
|
+
|
|
1136
|
+
# ===== STEP 5.75: Content safety scan (Layer 4) =====
|
|
1137
|
+
from utils.content_safety import sanitize_content, log_content_safety_event
|
|
1138
|
+
clean_pages: dict[str, str] = {}
|
|
1139
|
+
blocked_count = 0
|
|
1140
|
+
for page_url, page_text in scraped_pages.items():
|
|
1141
|
+
clean_text, was_flagged = sanitize_content(page_text)
|
|
1142
|
+
if was_flagged:
|
|
1143
|
+
blocked_count += 1
|
|
1144
|
+
url_hash = hashlib.sha256(page_url.encode()).hexdigest()[:16]
|
|
1145
|
+
logger.warning(
|
|
1146
|
+
"[%s] Page content blocked — prohibited content. Page hash: %s",
|
|
1147
|
+
inv_uuid,
|
|
1148
|
+
url_hash,
|
|
1149
|
+
)
|
|
1150
|
+
log_content_safety_event(
|
|
1151
|
+
event_type="content_blocked",
|
|
1152
|
+
content_hash=url_hash,
|
|
1153
|
+
user_id=inv_user_id,
|
|
1154
|
+
)
|
|
1155
|
+
else:
|
|
1156
|
+
clean_pages[page_url] = clean_text
|
|
1157
|
+
if blocked_count > 0:
|
|
1158
|
+
logger.warning(
|
|
1159
|
+
"[%s] Blocked %d pages for prohibited content",
|
|
1160
|
+
inv_uuid,
|
|
1161
|
+
blocked_count,
|
|
1162
|
+
)
|
|
1163
|
+
scraped_pages = clean_pages
|
|
1164
|
+
|
|
1165
|
+
scraped_count = len(scraped_pages)
|
|
1166
|
+
logger.info(
|
|
1167
|
+
"[%s] Total for extraction: %d pages (%d cached + %d fresh, %d blocked)",
|
|
1168
|
+
inv_uuid,
|
|
1169
|
+
scraped_count,
|
|
1170
|
+
len(cached_dict),
|
|
1171
|
+
len(freshly_scraped),
|
|
1172
|
+
blocked_count,
|
|
1173
|
+
)
|
|
1174
|
+
|
|
1175
|
+
page_records = [
|
|
1176
|
+
{"url": page_url, "text": page_text, "content": page_text}
|
|
1177
|
+
for page_url, page_text in scraped_pages.items()
|
|
1178
|
+
]
|
|
1179
|
+
|
|
1180
|
+
if enrichment_pages:
|
|
1181
|
+
enrichment_count = 0
|
|
1182
|
+
for ep in enrichment_pages:
|
|
1183
|
+
u = ep.get("url") or ep.get("link") or ""
|
|
1184
|
+
t = ep.get("text") or ep.get("content") or ""
|
|
1185
|
+
if u and (t or "").strip():
|
|
1186
|
+
page_records.append({"url": u, "text": t, "content": t})
|
|
1187
|
+
enrichment_count += 1
|
|
1188
|
+
|
|
1189
|
+
logger.info(
|
|
1190
|
+
"[%s] Total pages for extraction: %s (%s scraped + %s enrichment)",
|
|
1191
|
+
inv_uuid,
|
|
1192
|
+
len(page_records),
|
|
1193
|
+
scraped_count,
|
|
1194
|
+
enrichment_count,
|
|
1195
|
+
)
|
|
1196
|
+
else:
|
|
1197
|
+
logger.info(
|
|
1198
|
+
"[%s] Total pages for extraction: %s (%s scraped + 0 enrichment)",
|
|
1199
|
+
inv_uuid,
|
|
1200
|
+
len(page_records),
|
|
1201
|
+
scraped_count,
|
|
1202
|
+
)
|
|
1203
|
+
|
|
1204
|
+
# Paste-site pages already have fetched text — bypass scraping and
|
|
1205
|
+
# add them directly to the extraction pool, marked with their source.
|
|
1206
|
+
if paste_pages:
|
|
1207
|
+
paste_added = 0
|
|
1208
|
+
for pp in paste_pages:
|
|
1209
|
+
u = pp.get("url") or ""
|
|
1210
|
+
t = pp.get("text_content") or ""
|
|
1211
|
+
if u and t.strip():
|
|
1212
|
+
page_records.append({
|
|
1213
|
+
"url": u,
|
|
1214
|
+
"text": t,
|
|
1215
|
+
"content": t,
|
|
1216
|
+
"source_type": "paste_site",
|
|
1217
|
+
"source_name": pp.get("source_name"),
|
|
1218
|
+
})
|
|
1219
|
+
paste_added += 1
|
|
1220
|
+
logger.info(
|
|
1221
|
+
"[%s] Added %d paste-site pages to extraction pool",
|
|
1222
|
+
inv_uuid,
|
|
1223
|
+
paste_added,
|
|
1224
|
+
)
|
|
1225
|
+
|
|
1226
|
+
# GitHub pages already have fetched text — bypass scraping and add
|
|
1227
|
+
# them directly to the extraction pool, marked source_type="github".
|
|
1228
|
+
if github_pages:
|
|
1229
|
+
github_added = 0
|
|
1230
|
+
for gp in github_pages:
|
|
1231
|
+
u = gp.get("url") or ""
|
|
1232
|
+
t = gp.get("text_content") or ""
|
|
1233
|
+
if u and t.strip():
|
|
1234
|
+
page_records.append({
|
|
1235
|
+
"url": u,
|
|
1236
|
+
"text": t,
|
|
1237
|
+
"content": t,
|
|
1238
|
+
"source_type": "github",
|
|
1239
|
+
"source_name": gp.get("source_name", "GitHub"),
|
|
1240
|
+
})
|
|
1241
|
+
github_added += 1
|
|
1242
|
+
logger.info(
|
|
1243
|
+
"[%s] Added %d GitHub pages to extraction pool",
|
|
1244
|
+
inv_uuid,
|
|
1245
|
+
github_added,
|
|
1246
|
+
)
|
|
1247
|
+
else:
|
|
1248
|
+
logger.info("[%s] GitHub: no results", inv_uuid)
|
|
1249
|
+
|
|
1250
|
+
# GitLab pages already have fetched text — bypass scraping and add
|
|
1251
|
+
# them directly to the extraction pool, marked source_type="gitlab".
|
|
1252
|
+
if gitlab_pages:
|
|
1253
|
+
gitlab_added = 0
|
|
1254
|
+
for glp in gitlab_pages:
|
|
1255
|
+
u = glp.get("url") or ""
|
|
1256
|
+
t = glp.get("text_content") or ""
|
|
1257
|
+
if u and t.strip():
|
|
1258
|
+
page_records.append({
|
|
1259
|
+
"url": u,
|
|
1260
|
+
"text": t,
|
|
1261
|
+
"content": t,
|
|
1262
|
+
"source_type": "gitlab",
|
|
1263
|
+
"source_name": glp.get("source_name", "GitLab"),
|
|
1264
|
+
})
|
|
1265
|
+
gitlab_added += 1
|
|
1266
|
+
logger.info(
|
|
1267
|
+
"[%s] Added %d GitLab pages to extraction pool",
|
|
1268
|
+
inv_uuid,
|
|
1269
|
+
gitlab_added,
|
|
1270
|
+
)
|
|
1271
|
+
else:
|
|
1272
|
+
logger.info("[%s] GitLab: no results", inv_uuid)
|
|
1273
|
+
|
|
1274
|
+
# RSS feed articles are pre-fetched — bypass scraping, add directly
|
|
1275
|
+
# to the extraction pool marked source_type="rss_feed".
|
|
1276
|
+
if rss_pages:
|
|
1277
|
+
rss_added = 0
|
|
1278
|
+
for rp in rss_pages:
|
|
1279
|
+
u = rp.get("url") or ""
|
|
1280
|
+
t = rp.get("text_content") or ""
|
|
1281
|
+
if u and t.strip():
|
|
1282
|
+
page_records.append({
|
|
1283
|
+
"url": u,
|
|
1284
|
+
"text": t,
|
|
1285
|
+
"content": t,
|
|
1286
|
+
"source_type": "rss_feed",
|
|
1287
|
+
"source_name": rp.get("source_name", "RSS Feed"),
|
|
1288
|
+
"title": rp.get("title", ""),
|
|
1289
|
+
"published_at": rp.get("published_at", ""),
|
|
1290
|
+
})
|
|
1291
|
+
rss_added += 1
|
|
1292
|
+
contributing_feeds = sorted({
|
|
1293
|
+
rp.get("source_name", "unknown") for rp in rss_pages
|
|
1294
|
+
if rp.get("source_name")
|
|
1295
|
+
})
|
|
1296
|
+
logger.info(
|
|
1297
|
+
"[%s] Added %d RSS articles to extraction pool (feeds: %s)",
|
|
1298
|
+
inv_uuid,
|
|
1299
|
+
rss_added,
|
|
1300
|
+
contributing_feeds,
|
|
1301
|
+
)
|
|
1302
|
+
else:
|
|
1303
|
+
logger.info("[%s] RSS feeds: no relevant articles", inv_uuid)
|
|
1304
|
+
|
|
1305
|
+
non_empty_records = [r for r in page_records if len((r.get("text") or "").strip()) > 100]
|
|
1306
|
+
logger.info("[%s] Non-empty pages (>100 chars): %s", inv_uuid, len(non_empty_records))
|
|
1307
|
+
if not non_empty_records:
|
|
1308
|
+
first_length = len(page_records[0].get("text", "")) if page_records else 0
|
|
1309
|
+
logger.info("[%s] WARNING: All scraped pages are empty/short", inv_uuid)
|
|
1310
|
+
logger.info("[%s] First page content length: %s", inv_uuid, first_length)
|
|
1311
|
+
|
|
1312
|
+
# ===== STEP 5.7: Detect content languages (no session held) =====
|
|
1313
|
+
try:
|
|
1314
|
+
from i18n.detect import detect_language
|
|
1315
|
+
|
|
1316
|
+
lang_distribution: dict[str, int] = {}
|
|
1317
|
+
for page in page_records:
|
|
1318
|
+
text = page.get("content") or page.get("text") or ""
|
|
1319
|
+
if len(text) >= 50:
|
|
1320
|
+
lang = detect_language(text[:500])
|
|
1321
|
+
if lang:
|
|
1322
|
+
lang_distribution[lang] = lang_distribution.get(lang, 0) + 1
|
|
1323
|
+
|
|
1324
|
+
if lang_distribution:
|
|
1325
|
+
total_pages = sum(lang_distribution.values())
|
|
1326
|
+
non_english = {k: v for k, v in lang_distribution.items() if k != "en"}
|
|
1327
|
+
logger.info(
|
|
1328
|
+
"[%s] Content languages: %s (%d/%d non-English pages)",
|
|
1329
|
+
inv_uuid,
|
|
1330
|
+
lang_distribution,
|
|
1331
|
+
sum(non_english.values()),
|
|
1332
|
+
total_pages,
|
|
1333
|
+
)
|
|
1334
|
+
except Exception as e:
|
|
1335
|
+
logger.info("[%s] Language detection failed (non-fatal): %s", inv_uuid, e)
|
|
1336
|
+
|
|
1337
|
+
# ===== STEP 6: Entity extraction (no session held) =====
|
|
1338
|
+
logger.info("[%s] STEP 6: Extracting entities...", inv_uuid)
|
|
1339
|
+
extraction_input = non_empty_records if non_empty_records else page_records
|
|
1340
|
+
try:
|
|
1341
|
+
extraction_results = await extract_entities_from_pages(
|
|
1342
|
+
extraction_input,
|
|
1343
|
+
investigation_id=inv_uuid,
|
|
1344
|
+
llm=llm_client,
|
|
1345
|
+
run_llm_extraction=True,
|
|
1346
|
+
)
|
|
1347
|
+
total_entities = sum(r.entity_count for r in extraction_results)
|
|
1348
|
+
logger.info("[%s] Extracted %s entities", inv_uuid, total_entities)
|
|
1349
|
+
if total_entities == 0:
|
|
1350
|
+
logger.info("[%s] WARNING: No entities extracted", inv_uuid)
|
|
1351
|
+
logger.info(
|
|
1352
|
+
"[%s] Pages passed to extractor: %s",
|
|
1353
|
+
inv_uuid,
|
|
1354
|
+
len(extraction_input),
|
|
1355
|
+
)
|
|
1356
|
+
except Exception as exc:
|
|
1357
|
+
logger.exception("[%s] Extraction failed: %s", inv_uuid, str(exc))
|
|
1358
|
+
extraction_results = []
|
|
1359
|
+
total_entities = 0
|
|
1360
|
+
|
|
1361
|
+
await _update_progress(inv_uuid, 5, entity_count=total_entities)
|
|
1362
|
+
if await _check_cancelled(inv_uuid, investigation_id):
|
|
1363
|
+
return
|
|
1364
|
+
|
|
1365
|
+
# ===== STEP 6.1: IP Reputation Enrichment =====
|
|
1366
|
+
# Runs after entities are in DB but before the entity cap is applied.
|
|
1367
|
+
# Suppresses GreyNoise-benign IPs and boosts confidence for confirmed C2s.
|
|
1368
|
+
logger.info("[%s] STEP 6.1: Running IP reputation enrichment...", inv_uuid)
|
|
1369
|
+
try:
|
|
1370
|
+
from sources.ip_reputation import enrich_ip_entities as _enrich_ips
|
|
1371
|
+
|
|
1372
|
+
extraction_results, _ip_stats = await asyncio.wait_for(
|
|
1373
|
+
_enrich_ips(extraction_results, inv_uuid),
|
|
1374
|
+
timeout=60,
|
|
1375
|
+
)
|
|
1376
|
+
total_entities = sum(r.entity_count for r in extraction_results)
|
|
1377
|
+
sources_used["ip_reputation"] = _ip_stats.get("ip_reputation", "ok_0_ips")
|
|
1378
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1379
|
+
logger.info(
|
|
1380
|
+
"[%s] IP reputation: %d checked, %d suppressed, %d C2 confirmed, %d abuse",
|
|
1381
|
+
inv_uuid,
|
|
1382
|
+
_ip_stats.get("checked", 0),
|
|
1383
|
+
_ip_stats.get("suppressed", 0),
|
|
1384
|
+
_ip_stats.get("c2_confirmed", 0),
|
|
1385
|
+
_ip_stats.get("abuse_confirmed", 0),
|
|
1386
|
+
)
|
|
1387
|
+
except asyncio.TimeoutError:
|
|
1388
|
+
logger.warning("[%s] IP reputation enrichment timed out after 60s", inv_uuid)
|
|
1389
|
+
sources_used["ip_reputation"] = "error_timeout"
|
|
1390
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1391
|
+
except Exception as _ip_exc:
|
|
1392
|
+
logger.info("[%s] IP reputation enrichment failed (non-fatal): %s", inv_uuid, _ip_exc)
|
|
1393
|
+
sources_used["ip_reputation"] = "error"
|
|
1394
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1395
|
+
|
|
1396
|
+
# ===== STEP 6.5: Cross-reference against seed data (short-lived session) =====
|
|
1397
|
+
logger.info("[%s] STEP 6.5: Cross-referencing with historical data...", inv_uuid)
|
|
1398
|
+
try:
|
|
1399
|
+
from db.queries import cross_reference_with_seeds
|
|
1400
|
+
|
|
1401
|
+
with get_session() as session:
|
|
1402
|
+
seed_matches = cross_reference_with_seeds(session, inv_uuid)
|
|
1403
|
+
logger.info("[%s] Found %s historical matches", inv_uuid, seed_matches)
|
|
1404
|
+
except Exception as e:
|
|
1405
|
+
logger.info("[%s] Cross-reference failed (non-fatal): %s", inv_uuid, e)
|
|
1406
|
+
|
|
1407
|
+
# ===== STEP 6.6: Build Stylometry Profiles (wrapped in to_thread with own session) =====
|
|
1408
|
+
logger.info(f"[{inv_uuid}] STEP 6.6: Building actor style profiles...")
|
|
1409
|
+
try:
|
|
1410
|
+
profiles_built = await asyncio.to_thread(
|
|
1411
|
+
_build_investigation_profiles,
|
|
1412
|
+
inv_uuid,
|
|
1413
|
+
)
|
|
1414
|
+
logger.info(f"[{inv_uuid}] Built {profiles_built} actor profiles")
|
|
1415
|
+
except Exception as e:
|
|
1416
|
+
logger.info(f"[{inv_uuid}] Profile building failed (non-fatal): {e}")
|
|
1417
|
+
|
|
1418
|
+
# ===== STEP 6.7: Blockchain Wallet Enrichment (wrapped in to_thread with own session) =====
|
|
1419
|
+
logger.info(f"[{inv_uuid}] STEP 6.7: Enriching wallet entities...")
|
|
1420
|
+
try:
|
|
1421
|
+
from sources.blockchain import enrich_wallets_for_investigation
|
|
1422
|
+
from config import BLOCKCYPHER_TOKEN, ETHERSCAN_API_KEY
|
|
1423
|
+
|
|
1424
|
+
blockchain_stats = await asyncio.to_thread(
|
|
1425
|
+
_enrich_wallets_sync,
|
|
1426
|
+
inv_uuid,
|
|
1427
|
+
BLOCKCYPHER_TOKEN,
|
|
1428
|
+
ETHERSCAN_API_KEY,
|
|
1429
|
+
)
|
|
1430
|
+
|
|
1431
|
+
logger.info(
|
|
1432
|
+
f"[{inv_uuid}] Blockchain enrichment: "
|
|
1433
|
+
f"{blockchain_stats['successful_lookups']}/{blockchain_stats['wallets_looked_up']} lookups successful, "
|
|
1434
|
+
f"{blockchain_stats['edges_created']} PAID_TO edges created, "
|
|
1435
|
+
f"{blockchain_stats['connected_wallets_found']} connected wallets found"
|
|
1436
|
+
)
|
|
1437
|
+
except Exception as e:
|
|
1438
|
+
logger.info(f"[{inv_uuid}] Blockchain enrichment failed (non-fatal): {e}")
|
|
1439
|
+
|
|
1440
|
+
await _update_progress(inv_uuid, 6)
|
|
1441
|
+
|
|
1442
|
+
# ===== STEP 6.8: DNS/WHOIS Enrichment (no session held) =====
|
|
1443
|
+
logger.info("[%s] STEP 6.8: Running DNS/WHOIS enrichment...", inv_uuid)
|
|
1444
|
+
try:
|
|
1445
|
+
from sources.enrichment import run_dns_enrichment
|
|
1446
|
+
|
|
1447
|
+
# Build a flat list of entity dicts from extraction results for DNS lookup.
|
|
1448
|
+
# NormalizedEntity dataclasses are converted to the dict format expected by
|
|
1449
|
+
# enrich_with_dns (entity_type + canonical_value/value).
|
|
1450
|
+
extracted_entities_for_dns: list[dict] = []
|
|
1451
|
+
for _r in extraction_results:
|
|
1452
|
+
for _e in getattr(_r, "entities", []):
|
|
1453
|
+
if hasattr(_e, "entity_type"):
|
|
1454
|
+
extracted_entities_for_dns.append({
|
|
1455
|
+
"entity_type": _e.entity_type,
|
|
1456
|
+
"canonical_value": _e.value,
|
|
1457
|
+
"value": _e.value,
|
|
1458
|
+
"confidence": _e.confidence,
|
|
1459
|
+
})
|
|
1460
|
+
elif isinstance(_e, dict):
|
|
1461
|
+
extracted_entities_for_dns.append(_e)
|
|
1462
|
+
|
|
1463
|
+
dns_results = await asyncio.wait_for(
|
|
1464
|
+
run_dns_enrichment(extracted_entities_for_dns),
|
|
1465
|
+
timeout=120,
|
|
1466
|
+
)
|
|
1467
|
+
|
|
1468
|
+
new_dns_entities = dns_results.get("new_entities", [])
|
|
1469
|
+
if new_dns_entities:
|
|
1470
|
+
logger.info(
|
|
1471
|
+
"[%s] DNS enrichment: %d new entities discovered",
|
|
1472
|
+
inv_uuid,
|
|
1473
|
+
len(new_dns_entities),
|
|
1474
|
+
)
|
|
1475
|
+
|
|
1476
|
+
clusters = dns_results.get("infrastructure_clusters", [])
|
|
1477
|
+
if clusters:
|
|
1478
|
+
logger.info(
|
|
1479
|
+
"[%s] Infrastructure clusters found: %d",
|
|
1480
|
+
inv_uuid,
|
|
1481
|
+
len(clusters),
|
|
1482
|
+
)
|
|
1483
|
+
for cluster in clusters:
|
|
1484
|
+
logger.info("[%s] %s", inv_uuid, cluster["description"])
|
|
1485
|
+
_infra_cluster_cache[investigation_id] = clusters
|
|
1486
|
+
|
|
1487
|
+
_dns_ent_count = len(new_dns_entities)
|
|
1488
|
+
sources_used["circl_pdns"] = (
|
|
1489
|
+
f"ok_{_dns_ent_count}_enrichments" if _dns_ent_count > 0 else "ok_0_enrichments"
|
|
1490
|
+
)
|
|
1491
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1492
|
+
|
|
1493
|
+
except asyncio.TimeoutError:
|
|
1494
|
+
logger.warning("[%s] DNS enrichment timed out after 120s", inv_uuid)
|
|
1495
|
+
sources_used["circl_pdns"] = "error"
|
|
1496
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1497
|
+
except Exception as _dns_exc:
|
|
1498
|
+
logger.info("[%s] DNS enrichment failed (non-fatal): %s", inv_uuid, _dns_exc)
|
|
1499
|
+
sources_used["circl_pdns"] = "error"
|
|
1500
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1501
|
+
|
|
1502
|
+
# ===== STEP 6.2: Domain Reputation Enrichment =====
|
|
1503
|
+
# Runs after DNS enrichment. Enriches DOMAIN entities with:
|
|
1504
|
+
# crt.sh (subdomain enumeration via certificate transparency)
|
|
1505
|
+
# URLScan.io (live scan data, malicious indicators, communicating IPs)
|
|
1506
|
+
# Wayback Machine (historical snapshots for taken-down domains)
|
|
1507
|
+
# Non-fatal: if all three sources fail for a domain, entity is unchanged.
|
|
1508
|
+
logger.info("[%s] STEP 6.2: Running domain reputation enrichment...", inv_uuid)
|
|
1509
|
+
try:
|
|
1510
|
+
from sources.domain_reputation import enrich_domain_entities as _enrich_domains
|
|
1511
|
+
|
|
1512
|
+
extraction_results, _dom_stats = await asyncio.wait_for(
|
|
1513
|
+
_enrich_domains(extraction_results, inv_uuid),
|
|
1514
|
+
timeout=120,
|
|
1515
|
+
)
|
|
1516
|
+
sources_used["domain_reputation"] = _dom_stats.get(
|
|
1517
|
+
"domain_reputation", "ok_0_domains"
|
|
1518
|
+
)
|
|
1519
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1520
|
+
logger.info(
|
|
1521
|
+
"[%s] Domain reputation: %d domains, %d CT records, %d malicious, %d archived",
|
|
1522
|
+
inv_uuid,
|
|
1523
|
+
_dom_stats.get("domains_checked", 0),
|
|
1524
|
+
_dom_stats.get("ct_records", 0),
|
|
1525
|
+
_dom_stats.get("urlscan_malicious", 0),
|
|
1526
|
+
_dom_stats.get("wayback_archived", 0),
|
|
1527
|
+
)
|
|
1528
|
+
except asyncio.TimeoutError:
|
|
1529
|
+
logger.warning("[%s] Domain reputation enrichment timed out after 120s", inv_uuid)
|
|
1530
|
+
sources_used["domain_reputation"] = "error_timeout"
|
|
1531
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1532
|
+
except Exception as _dom_exc:
|
|
1533
|
+
logger.info("[%s] Domain reputation enrichment failed (non-fatal): %s", inv_uuid, _dom_exc)
|
|
1534
|
+
sources_used["domain_reputation"] = "error"
|
|
1535
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1536
|
+
|
|
1537
|
+
# ===== STEP 6.3: Hash Reputation Enrichment =====
|
|
1538
|
+
# Runs after domain reputation. Enriches FILE_HASH_* entities with:
|
|
1539
|
+
# Hybrid Analysis (behavioral sandbox — requires HYBRID_ANALYSIS_API_KEY)
|
|
1540
|
+
# MalwareBazaar (family classification — free, no auth)
|
|
1541
|
+
# ThreatFox (IOC database — free, no auth)
|
|
1542
|
+
# VirusTotal extended (AV detections + sandbox IOCs — requires VT_API_KEY)
|
|
1543
|
+
# Hashes are never suppressed. Non-fatal: 90s timeout.
|
|
1544
|
+
logger.info("[%s] STEP 6.3: Running hash reputation enrichment...", inv_uuid)
|
|
1545
|
+
try:
|
|
1546
|
+
from sources.hash_reputation import enrich_hash_entities as _enrich_hashes
|
|
1547
|
+
|
|
1548
|
+
extraction_results, _hash_stats = await asyncio.wait_for(
|
|
1549
|
+
_enrich_hashes(extraction_results, inv_uuid),
|
|
1550
|
+
timeout=90,
|
|
1551
|
+
)
|
|
1552
|
+
sources_used["hash_reputation"] = _hash_stats.get("hash_reputation", "ok_0_hashes")
|
|
1553
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1554
|
+
logger.info(
|
|
1555
|
+
"[%s] Hash reputation: %d checked, %d malicious, %d suspicious, "
|
|
1556
|
+
"%d families, %d new entities",
|
|
1557
|
+
inv_uuid,
|
|
1558
|
+
_hash_stats.get("hashes_checked", 0),
|
|
1559
|
+
_hash_stats.get("malicious", 0),
|
|
1560
|
+
_hash_stats.get("suspicious", 0),
|
|
1561
|
+
_hash_stats.get("malware_families_found", 0),
|
|
1562
|
+
_hash_stats.get("new_entities_discovered", 0),
|
|
1563
|
+
)
|
|
1564
|
+
except asyncio.TimeoutError:
|
|
1565
|
+
logger.warning("[%s] Hash reputation enrichment timed out after 90s", inv_uuid)
|
|
1566
|
+
sources_used["hash_reputation"] = "error_timeout"
|
|
1567
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1568
|
+
except Exception as _hash_exc:
|
|
1569
|
+
logger.info("[%s] Hash reputation enrichment failed (non-fatal): %s", inv_uuid, _hash_exc)
|
|
1570
|
+
sources_used["hash_reputation"] = "error"
|
|
1571
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1572
|
+
|
|
1573
|
+
# ===== STEP 6.4: Email Reputation Enrichment =====
|
|
1574
|
+
# Runs after hash reputation. Enriches EMAIL_ADDRESS entities with:
|
|
1575
|
+
# HIBP (breach history — requires HIBP_API_KEY, paid $3.50/mo)
|
|
1576
|
+
# EmailRep.io (reputation scoring — works without key)
|
|
1577
|
+
# Disposable domain blocklist (local check, no auth)
|
|
1578
|
+
# Domain cross-reference (custom email domains added as DOMAIN entities)
|
|
1579
|
+
# Non-fatal: 60s timeout.
|
|
1580
|
+
logger.info("[%s] STEP 6.4: Running email reputation enrichment...", inv_uuid)
|
|
1581
|
+
try:
|
|
1582
|
+
from sources.email_reputation import enrich_email_entities as _enrich_emails
|
|
1583
|
+
|
|
1584
|
+
extraction_results, _email_stats = await asyncio.wait_for(
|
|
1585
|
+
_enrich_emails(extraction_results, inv_uuid),
|
|
1586
|
+
timeout=60,
|
|
1587
|
+
)
|
|
1588
|
+
sources_used["email_reputation"] = _email_stats.get(
|
|
1589
|
+
"email_reputation", "ok_0_emails"
|
|
1590
|
+
)
|
|
1591
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1592
|
+
logger.info(
|
|
1593
|
+
"[%s] Email reputation: %d checked, %d breached, %d passwords exposed, "
|
|
1594
|
+
"%d disposable, %d malicious",
|
|
1595
|
+
inv_uuid,
|
|
1596
|
+
_email_stats.get("emails_checked", 0),
|
|
1597
|
+
_email_stats.get("breached", 0),
|
|
1598
|
+
_email_stats.get("password_exposed", 0),
|
|
1599
|
+
_email_stats.get("disposable", 0),
|
|
1600
|
+
_email_stats.get("malicious", 0),
|
|
1601
|
+
)
|
|
1602
|
+
except asyncio.TimeoutError:
|
|
1603
|
+
logger.warning("[%s] Email reputation enrichment timed out after 60s", inv_uuid)
|
|
1604
|
+
sources_used["email_reputation"] = "error_timeout"
|
|
1605
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1606
|
+
except Exception as _email_exc:
|
|
1607
|
+
logger.info(
|
|
1608
|
+
"[%s] Email reputation enrichment failed (non-fatal): %s", inv_uuid, _email_exc
|
|
1609
|
+
)
|
|
1610
|
+
sources_used["email_reputation"] = "error"
|
|
1611
|
+
_sources_used_cache[investigation_id] = sources_used
|
|
1612
|
+
|
|
1613
|
+
# ===== STEP 7: Graph building (wrapped in to_thread with own session) =====
|
|
1614
|
+
logger.info("[%s] STEP 7: Building graph...", inv_uuid)
|
|
1615
|
+
try:
|
|
1616
|
+
from graph.builder import build_graph_from_db, persist_graph_edges
|
|
1617
|
+
|
|
1618
|
+
graph_obj = await asyncio.to_thread(build_graph_from_db, investigation_id=inv_uuid)
|
|
1619
|
+
node_count = len(graph_obj.nodes())
|
|
1620
|
+
edge_count = len(graph_obj.edges())
|
|
1621
|
+
logger.info(
|
|
1622
|
+
"[%s] Graph: %s nodes, %s edges",
|
|
1623
|
+
inv_uuid,
|
|
1624
|
+
node_count,
|
|
1625
|
+
edge_count,
|
|
1626
|
+
)
|
|
1627
|
+
|
|
1628
|
+
try:
|
|
1629
|
+
persist_result = await asyncio.to_thread(
|
|
1630
|
+
_persist_graph_edges_sync,
|
|
1631
|
+
graph_obj,
|
|
1632
|
+
inv_uuid,
|
|
1633
|
+
)
|
|
1634
|
+
graph_status = persist_result.get("status", "written")
|
|
1635
|
+
edges_written = persist_result.get("edges_written", 0)
|
|
1636
|
+
logger.info(
|
|
1637
|
+
"[%s] Graph edges persisted: %s (%s)",
|
|
1638
|
+
inv_uuid,
|
|
1639
|
+
edges_written,
|
|
1640
|
+
graph_status,
|
|
1641
|
+
)
|
|
1642
|
+
|
|
1643
|
+
new_graph_status = "skipped_overflow" if graph_status == "skipped_overflow" else "built"
|
|
1644
|
+
with get_session() as session:
|
|
1645
|
+
session.query(Investigation).filter_by(id=inv_uuid).update(
|
|
1646
|
+
{"graph_status": new_graph_status}
|
|
1647
|
+
)
|
|
1648
|
+
session.commit()
|
|
1649
|
+
except Exception as e:
|
|
1650
|
+
logger.info("[%s] Edge persistence failed (non-fatal): %s", inv_uuid, e)
|
|
1651
|
+
|
|
1652
|
+
except Exception as exc:
|
|
1653
|
+
logger.exception("[%s] Graph building failed: %s", inv_uuid, str(exc))
|
|
1654
|
+
|
|
1655
|
+
await _update_progress(inv_uuid, 7)
|
|
1656
|
+
|
|
1657
|
+
# ===== STEP 8: Summary (no session held) =====
|
|
1658
|
+
logger.info("[%s] STEP 8: Generating summary (%d pages available)...", inv_uuid, len(page_records))
|
|
1659
|
+
if llm_client is None:
|
|
1660
|
+
summary = (
|
|
1661
|
+
f"Investigation completed without LLM summary. "
|
|
1662
|
+
f"Scraped {scraped_count} pages; extracted {total_entities} entities."
|
|
1663
|
+
)
|
|
1664
|
+
else:
|
|
1665
|
+
try:
|
|
1666
|
+
summary_entities = []
|
|
1667
|
+
if extraction_results:
|
|
1668
|
+
for result in extraction_results:
|
|
1669
|
+
summary_entities.extend(result.entities)
|
|
1670
|
+
|
|
1671
|
+
summary = await _llm_with_backoff(
|
|
1672
|
+
generate_summary,
|
|
1673
|
+
llm=llm_client,
|
|
1674
|
+
query=refined_query,
|
|
1675
|
+
content=page_records,
|
|
1676
|
+
entities=summary_entities if summary_entities else None,
|
|
1677
|
+
investigation_id=inv_uuid,
|
|
1678
|
+
)
|
|
1679
|
+
logger.info("[%s] Summary generated (%d chars)", inv_uuid, len(summary or ""))
|
|
1680
|
+
except Exception as exc:
|
|
1681
|
+
logger.exception("[%s] Summary generation failed, using fallback summary: %s", inv_uuid, exc)
|
|
1682
|
+
summary = (
|
|
1683
|
+
f"Investigation complete for '{refined_query}'. "
|
|
1684
|
+
f"Analysis pipeline completed successfully, but summary generation failed: {exc}."
|
|
1685
|
+
)
|
|
1686
|
+
|
|
1687
|
+
logger.info("[%s] Summary preview: %s", inv_uuid, (summary or "")[:100])
|
|
1688
|
+
|
|
1689
|
+
await _update_progress(inv_uuid, 8)
|
|
1690
|
+
|
|
1691
|
+
# ===== Final: Update summary and mark completed (short-lived session) =====
|
|
1692
|
+
with get_session() as session:
|
|
1693
|
+
update_investigation_summary(session, inv_uuid, summary)
|
|
1694
|
+
session.query(Investigation).filter_by(id=inv_uuid).update(
|
|
1695
|
+
{"status": "completed"}
|
|
1696
|
+
)
|
|
1697
|
+
session.commit()
|
|
1698
|
+
await _update_progress(inv_uuid, 9)
|
|
1699
|
+
logger.info("[%s] Investigation COMPLETED (run_id=%s)", inv_uuid, run_id)
|
|
1700
|
+
|
|
1701
|
+
except Exception as exc:
|
|
1702
|
+
logger.exception("[%s] Investigation FAILED with exception: %s", investigation_id, exc)
|
|
1703
|
+
try:
|
|
1704
|
+
from db.models import Investigation
|
|
1705
|
+
from db.session import get_session
|
|
1706
|
+
|
|
1707
|
+
with get_session() as session:
|
|
1708
|
+
session.query(Investigation).filter_by(id=uuid.UUID(investigation_id)).update(
|
|
1709
|
+
{"status": "failed", "summary": f"Investigation failed: {exc!s}"[:500]}
|
|
1710
|
+
)
|
|
1711
|
+
session.commit()
|
|
1712
|
+
except Exception as update_exc:
|
|
1713
|
+
logger.warning("Failed to persist investigation failure status: %s", update_exc)
|
|
1714
|
+
|
|
1715
|
+
|
|
1716
|
+
def _enrich_wallets_sync(investigation_id, blockcypher_token, etherscan_key):
    """Run blockchain wallet enrichment from synchronous code.

    Opens a dedicated database session through ``get_session`` and forwards
    it, together with the investigation id and both API credentials, to
    ``sources.blockchain.enrich_wallets_for_investigation``. The lookup is
    capped at 10 wallets. Whatever the enrichment function returns is handed
    back unchanged.
    """
    from sources.blockchain import enrich_wallets_for_investigation as _enrich
    from db.session import get_session as _open_session

    with _open_session() as db_session:
        stats = _enrich(
            investigation_id=investigation_id,
            session=db_session,
            blockcypher_token=blockcypher_token,
            etherscan_key=etherscan_key,
            max_wallets=10,
        )
        return stats
|
|
1729
|
+
|
|
1730
|
+
|
|
1731
|
+
def _persist_graph_edges_sync(graph_obj, investigation_id):
    """Persist graph edges from synchronous code.

    Opens a dedicated database session through ``get_session`` and passes the
    graph, the investigation id and that session positionally to
    ``graph.builder.persist_graph_edges``. The callee's result is returned
    unchanged.
    """
    from graph.builder import persist_graph_edges as _persist
    from db.session import get_session as _open_session

    with _open_session() as db_session:
        outcome = _persist(graph_obj, investigation_id, db_session)
        return outcome
|
|
1742
|
+
|
|
1743
|
+
|
|
1744
|
+
# ---------------------------------------------------------------------------
|
|
1745
|
+
# Routes
|
|
1746
|
+
# ---------------------------------------------------------------------------
|
|
1747
|
+
|
|
1748
|
+
|
|
1749
|
+
@router.post("")
@_rate_limit("3/minute")
async def create_investigation(
    request: Request,
    body: InvestigationRequest,
    background_tasks: BackgroundTasks,
    current_user: CurrentUser = Depends(require_password_not_reset_pending),
) -> dict:
    """Queue a new investigation and return its run identifier.

    The query is first screened with ``is_blocked_query``. A blocked query is
    logged (only a truncated SHA-256 digest of the text is recorded) and
    rejected with HTTP 400.

    When ``DATABASE_URL`` is set, the investigation row is created and
    committed before the response is returned, so a record already exists
    while the background pipeline runs; a failure to persist yields HTTP 500.
    Without ``DATABASE_URL`` a random investigation id is generated and
    nothing is persisted here.

    Returns a dict with ``run_id``, ``status`` ("pending") and ``query``.
    """
    from utils.content_safety import is_blocked_query, log_content_safety_event

    is_prohibited, _reason = is_blocked_query(body.query)
    if is_prohibited:
        requester_id = current_user.user.id
        logger.warning(
            "Investigation blocked — prohibited content detected. User: %s",
            requester_id,
        )
        # Only a short digest of the query is stored, never the raw text.
        query_digest = hashlib.sha256(body.query.encode()).hexdigest()[:16]
        log_content_safety_event(
            event_type="query_blocked",
            content_hash=query_digest,
            user_id=requester_id,
        )
        rejection = {
            "error": "prohibited_content",
            "message": (
                "This query cannot be processed. VoidAccess is intended "
                "for legitimate security research only."
            ),
            "code": "CONTENT_BLOCKED",
        }
        raise HTTPException(status_code=400, detail=rejection)

    run_id = str(uuid.uuid4())

    if not os.getenv("DATABASE_URL"):
        # No database configured: use a throwaway id for the pipeline.
        investigation_id = str(uuid.uuid4())
    else:
        try:
            from db.session import get_session
            from db.queries import create_investigation as db_create

            with get_session() as db_session:
                record = db_create(
                    db_session,
                    query=body.query,
                    user_id=current_user.user.id,
                )
                record.run_id = uuid.UUID(run_id)
                record.status = "pending"
                db_session.commit()
                investigation_id = str(record.id)
        except Exception as exc:
            logger.exception("Failed to create investigation record: %s", exc)
            failure_detail = f"Could not persist investigation: {exc!s}"[:300]
            raise HTTPException(status_code=500, detail=failure_detail)

    task_kwargs = {
        "investigation_id": investigation_id,
        "run_id": run_id,
        "query": body.query,
        "model": body.model,
        "run_crawler": body.run_crawler,
    }
    background_tasks.add_task(_run_investigation_task, **task_kwargs)

    return {"run_id": run_id, "status": "pending", "query": body.query}
|
|
1819
|
+
|
|
1820
|
+
|
|
1821
|
+
@router.get("")
async def list_investigations(
    limit: int = Query(default=20, ge=1, le=200),
    offset: int = Query(default=0, ge=0),
    current_user: "CurrentUser" = Depends(get_current_user),
) -> list[dict]:
    """Return a paginated list of the caller's investigation summaries.

    Only non-seed investigations owned by the authenticated user are
    returned, newest first.

    Args:
        limit: Maximum number of rows to return (1-200).
        offset: Number of rows to skip.
        current_user: Authenticated caller, resolved by ``get_current_user``.

    Returns:
        A list of summary dicts. The list is empty when ``DATABASE_URL`` is
        not set, and also when the query fails (the failure is logged).
    """
    if not os.getenv("DATABASE_URL"):
        return []
    try:
        from db.session import get_session
        from db.models import Investigation

        # The other routes in this module read the user id via
        # ``current_user.user.id``; the previous ``current_user.id`` here was
        # inconsistent with them and, if that attribute is absent, made this
        # endpoint fall into the except branch and always return [].
        owner_id = current_user.user.id

        with get_session() as session:
            invs = (
                session.query(Investigation)
                # ``== False`` builds a SQL expression here; it is not a
                # Python truthiness comparison.
                .filter(Investigation.is_seed == False)  # noqa: E712
                .filter(Investigation.user_id == owner_id)
                .order_by(Investigation.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            return [
                {
                    "id": str(inv.id),
                    "run_id": str(inv.run_id),
                    "query": inv.query,
                    "status": inv.status,
                    "model_used": inv.model_used,
                    "created_at": inv.created_at.isoformat() if inv.created_at else None,
                    "entity_count": inv.entity_count or 0,
                    "page_count": inv.page_count or 0,
                }
                for inv in invs
            ]
    except Exception as exc:
        # Best-effort endpoint: log the failure and return an empty list
        # rather than surfacing a 500.
        logger.exception("list_investigations failed: %s", exc)
        return []
|
|
1860
|
+
|
|
1861
|
+
|
|
1862
|
+
@router.post("/{investigation_id}/cancel")
async def cancel_investigation(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(require_password_not_reset_pending),
) -> dict:
    """Flag a running investigation for cooperative cancellation.

    The cancellation flag is set and the current investigation record is
    returned right away; the pipeline may still be running, so callers should
    poll the status until it becomes 'cancelled'.

    Raises:
        HTTPException 503: ``DATABASE_URL`` is not set.
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no matching investigation.
        HTTPException 403: the investigation belongs to another user.
        HTTPException 409: the investigation is already in a terminal state.
        HTTPException 500: any other failure while looking up the record.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    from db.session import get_session
    from db.models import Investigation
    from db.queries import get_investigation_by_id_or_run

    finished_states = ("completed", "failed", "cancelled", "completed_no_results")

    try:
        with get_session() as db_session:
            record = get_investigation_by_id_or_run(db_session, inv_uuid)
            if record is None:
                raise HTTPException(status_code=404, detail="Investigation not found")

            caller_id = current_user.user.id
            if str(record.user_id) != str(caller_id):
                raise HTTPException(status_code=403, detail="Forbidden")

            if record.status in finished_states:
                conflict_msg = (
                    f"Investigation cannot be cancelled (current status: {record.status})"
                )
                raise HTTPException(status_code=409, detail=conflict_msg)

            # Flag both the id as supplied (which may be a run_id) and the
            # row's primary key, since the pipeline task is keyed by inv.id.
            for flag_key in (investigation_id, str(record.id)):
                _set_cancelled(flag_key)

            logger.info(
                "[%s] Cancellation requested by user %s",
                inv_uuid,
                current_user.user.id,
            )
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client untouched.
        raise
    except Exception as exc:
        logger.exception("cancel_investigation failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}"[:300])

    return _get_db_investigation(investigation_id)
|
|
1913
|
+
|
|
1914
|
+
|
|
1915
|
+
@router.get("/{investigation_id}/progress")
async def investigation_progress(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(get_current_user),
) -> StreamingResponse:
    """
    SSE stream of investigation pipeline progress.

    The investigation row is polled every 5 seconds (at most 360 polls) and
    an event is emitted whenever the step or status changes. When a terminal
    status is reached, a final event carrying ``done: True`` is sent and the
    stream ends.

    Raises:
        HTTPException 422: ``investigation_id`` is not a valid UUID.
        HTTPException 404: no investigation with that id.
        HTTPException 403: the investigation belongs to another user.
    """
    from db.session import get_async_session
    from db.models import Investigation

    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    # Verify existence and ownership before opening the stream
    async with get_async_session() as session:
        result = await session.execute(sa_select(Investigation).where(Investigation.id == inv_uuid))
        inv_check = result.scalar_one_or_none()
        if inv_check is None:
            raise HTTPException(status_code=404, detail="Investigation not found")
        if str(inv_check.user_id) != str(current_user.user.id):
            raise HTTPException(status_code=403, detail="Forbidden")

    total_steps = 13
    poll_interval_seconds = 5
    max_polls = 360
    terminal_statuses = ("completed", "failed", "completed_no_results", "cancelled")

    async def event_stream():
        """Yield SSE frames until a terminal status, an error, or the poll cap."""
        last_step = None
        last_status = None
        poll_count = 0
        data: dict = {}

        while poll_count < max_polls:
            try:
                # A fresh short-lived session per poll; none is held while sleeping.
                async with get_async_session() as session:
                    result = await session.execute(
                        sa_select(Investigation).where(Investigation.id == inv_uuid)
                    )
                    inv = result.scalar_one_or_none()
            except Exception as exc:
                # Previously this ended the stream silently; log the cause so
                # a dropped progress stream can be diagnosed.
                logger.warning(
                    "[%s] Progress stream closed: status poll failed: %s",
                    inv_uuid,
                    exc,
                )
                break

            if inv is None:
                yield f"data: {json.dumps({'error': 'not_found'})}\n\n"
                break

            step = inv.current_step or 0
            label = inv.current_step_label or ""
            status = inv.status

            # Emit only when something changed since the last event.
            if step != last_step or status != last_status:
                data = {
                    "step": step,
                    "total_steps": total_steps,
                    "label": label,
                    "progress": int((step / total_steps) * 100),
                    "status": status,
                    "entity_count": inv.entity_count or 0,
                    "page_count": inv.page_count or 0,
                }
                yield f"data: {json.dumps(data)}\n\n"
                last_step = step
                last_status = status

            if status in terminal_statuses:
                yield f"data: {json.dumps({**data, 'done': True})}\n\n"
                break

            poll_count += 1
            await asyncio.sleep(poll_interval_seconds)

        yield ": stream closed\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
|
|
1997
|
+
|
|
1998
|
+
|
|
1999
|
+
@router.get("/{investigation_id}/analysis/temporal")
async def get_temporal_analysis(investigation_id: str) -> dict:
    """
    Summarise when the pages behind an investigation were posted.

    Pages are found through the ``page_id`` of entities whose
    ``investigation_id`` matches the investigation. Only pages with a
    non-null ``posted_at`` contribute a timestamp; the rest are counted
    and reported at debug level.

    Returns:
        On success, a dict with ``activity_by_hour`` (keys "0".."23"),
        ``activity_by_day`` (weekday names), ``anomalies``,
        ``silence_breaks``, ``peak_hour``, ``peak_day``,
        ``total_timespan_days`` and ``data_points``.
        When there are no entities, no page ids, or fewer than three
        timestamps, a dict with ``"error": "insufficient_data"``.
        On any unexpected exception, a dict with
        ``"error": "analysis_failed"`` (returned, not raised).

    Raises:
        HTTPException: 422 for a malformed UUID, 503 when ``DATABASE_URL``
            is unset, 404 when the investigation does not exist.

    NOTE(review): unlike ``get_investigation`` this function takes no
    ``current_user`` dependency and performs no ownership check itself;
    confirm access control is enforced elsewhere.
    """
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    def _insufficient(message: str, **extra) -> dict:
        # Common shape of every "not enough data" reply.
        return {
            "investigation_id": investigation_id,
            "error": "insufficient_data",
            "message": message,
            **extra,
        }

    try:
        from db.session import get_session
        from db.models import Entity, Page
        from db.queries import get_investigation_by_id_or_run
        from collections import defaultdict
        from analysis.temporal import detect_anomalies, detect_silence_breaks, Z_SCORE_THRESHOLD

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")

            entities = (
                session.query(Entity)
                .filter(Entity.investigation_id == inv.id)
                .all()
            )
            if not entities:
                return _insufficient("No entities found for this investigation")

            # De-duplicate page ids; entities without a page are ignored.
            page_ids = list(
                {ent.page_id for ent in entities if ent.page_id is not None}
            )
            if not page_ids:
                return _insufficient("No page timestamps available")

            pages = session.query(Page).filter(Page.id.in_(page_ids)).all()
            timestamps = [pg.posted_at for pg in pages if pg.posted_at is not None]
            skipped_no_posted_at = len(pages) - len(timestamps)
            if skipped_no_posted_at > 0:
                logger.debug(
                    "Temporal analysis: skipped %d pages due to missing posted_at (using content timestamp, not scrape time)",
                    skipped_no_posted_at,
                )

            if len(timestamps) < 3:
                return _insufficient(
                    f"Only {len(timestamps)} timestamps available (minimum 3)",
                    data_points=len(timestamps),
                )

            # One pass over the timestamps fills all three histograms.
            hour_counts = [0] * 24
            weekday_counts = [0] * 7
            daily_counts: dict = defaultdict(int)
            for ts in timestamps:
                hour_counts[ts.hour] += 1
                weekday_counts[ts.weekday()] += 1
                daily_counts[ts.date()] += 1

            day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
            activity_by_hour = {str(hour): hits for hour, hits in enumerate(hour_counts)}
            activity_by_day = {
                day_names[idx]: hits for idx, hits in enumerate(weekday_counts)
            }

            # max() returns the first key on ties, i.e. earliest hour / weekday.
            peak_hour_key = max(activity_by_hour, key=activity_by_hour.get, default=None)
            peak_day_key = max(activity_by_day, key=activity_by_day.get, default=None)

            all_dates = sorted(daily_counts)
            timeline = [
                {"date": day, "count": daily_counts[day]} for day in all_dates
            ]

            anomalies_raw = detect_anomalies(timeline, z_threshold=Z_SCORE_THRESHOLD)
            anomalies = [
                {
                    "date": str(a["date"]),
                    "count": a["count"],
                    "z_score": round(a["z_score"], 2),
                    "type": a["type"],
                    "description": (
                        f"Activity {'spike' if a['z_score'] > 0 else 'drop'}: "
                        f"z-score {a['z_score']:.1f}"
                    ),
                }
                for a in anomalies_raw
            ]

            silence_raw = detect_silence_breaks(timeline, silence_days=7)
            silence_breaks = [
                {
                    "before": str(s["silent_from"]),
                    "after": str(s["silent_to"]),
                    "gap_days": s["gap_days"],
                    "significance": "high" if s["gap_days"] >= 14 else "medium",
                }
                for s in silence_raw
            ]

            if len(all_dates) >= 2:
                timespan_days = (all_dates[-1] - all_dates[0]).days
            else:
                timespan_days = 0

            return {
                "investigation_id": investigation_id,
                "activity_by_hour": activity_by_hour,
                "activity_by_day": activity_by_day,
                "anomalies": anomalies,
                "silence_breaks": silence_breaks,
                "peak_hour": int(peak_hour_key) if peak_hour_key is not None else None,
                "peak_day": peak_day_key,
                "total_timespan_days": timespan_days,
                "data_points": len(timestamps),
            }
    except HTTPException:
        raise
    except Exception as exc:
        # Deliberately reported in the body rather than as a 500.
        logger.warning("get_temporal_analysis failed: %s", exc)
        return {"error": "analysis_failed", "message": str(exc)[:300]}
|
|
2135
|
+
|
|
2136
|
+
|
|
2137
|
+
@router.get("/{investigation_id}")
async def get_investigation(
    investigation_id: str,
    current_user: "CurrentUser" = Depends(get_current_user),
) -> dict:
    """
    Return the investigation record built by ``_get_db_investigation``.

    When ``DATABASE_URL`` is set, the investigation is first looked up and
    its ``user_id`` is compared with the authenticated user's id. When it
    is not set, that check is skipped and the call is delegated directly.

    Raises:
        HTTPException: 404 when the investigation does not exist, 403 when
            it belongs to another user, 422 when a ``ValueError`` is raised
            during the check (typically a malformed UUID).
    """
    if not os.getenv("DATABASE_URL"):
        return _get_db_investigation(investigation_id)

    try:
        from db.session import get_session
        from db.queries import get_investigation_by_id_or_run

        requested_id = uuid.UUID(investigation_id)
        with get_session() as db:
            record = get_investigation_by_id_or_run(db, requested_id)
            if record is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            # Compare as strings so differing id types still match.
            owner_id = str(record.user_id)
            caller_id = str(current_user.user.id)
            if owner_id != caller_id:
                raise HTTPException(status_code=403, detail="Forbidden")
    except HTTPException:
        raise
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    return _get_db_investigation(investigation_id)
|
|
2159
|
+
|
|
2160
|
+
|
|
2161
|
+
@router.get("/{investigation_id}/entities")
async def get_investigation_entities(
    investigation_id: str,
    entity_type: Optional[str] = Query(default=None),
    min_confidence: float = Query(default=0.75, ge=0.0, le=1.0),
    limit: int = Query(default=20, ge=1, le=100),
    offset: int = Query(default=0, ge=0),
    defang: bool = Query(default=True),
    freshness_exclude: Optional[str] = Query(default=None),
    current_user: CurrentUser = Depends(get_current_user),
) -> dict:
    """
    Return one page of entities for an investigation owned by the caller.

    Entities are those whose ``investigation_id`` matches, plus those
    attached through ``InvestigationEntityLink``. They are ordered by
    ``created_at`` descending.

    Args:
        investigation_id: UUID string of the investigation.
        entity_type: If given, keep only entities of exactly this type.
        min_confidence: Minimum confidence; applied only when > 0.0.
        limit: Page size.
        offset: Number of rows to skip.
        defang: When true, ``value`` and ``context`` are passed through
            ``defang_value`` / ``defang_text`` before being returned.
        freshness_exclude: When equal to ``"expired"``, entities whose
            freshness tag is ``"expired"`` are omitted.
        current_user: Authenticated user; must own the investigation.

    Returns:
        ``{"items": [...], "total": int, "skip": offset, "limit": limit}``.

    Raises:
        HTTPException: 503 when ``DATABASE_URL`` is unset, 422 for a
            malformed UUID, 404 when the investigation does not exist,
            403 when it belongs to another user, 500 on any other error.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    # Parse the id on its own so that ONLY a malformed UUID maps to 422.
    # Any other ValueError raised further down (json.JSONDecodeError is a
    # ValueError subclass) must not be reported as an invalid id.
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    try:
        from db.session import get_session
        from db.models import Entity, InvestigationEntityLink
        from db.queries import get_investigation_by_id_or_run
        from graph.builder import _make_node_id
        from utils.ioc_freshness import get_freshness_tag, get_freshness_display
        from utils.defang import defang_value, defang_text

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            if str(inv.user_id) != str(current_user.user.id):
                raise HTTPException(status_code=403, detail="Forbidden")

            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            query = session.query(Entity).filter(
                (Entity.investigation_id == inv.id)
                | Entity.id.in_(linked_ids_subq)
            )
            if entity_type:
                query = query.filter(Entity.entity_type == entity_type)
            if min_confidence > 0.0:
                query = query.filter(Entity.confidence >= min_confidence)

            # NOTE: ``total`` is counted before the blocked-value and
            # freshness filters below, so it can exceed the number of rows
            # a client can actually page through.
            total = query.count()
            entities = (
                query.order_by(Entity.created_at.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )

            # Safety net: filter prohibited entity values from the response.
            # Catches values that may have been stored before FIX 2 was deployed.
            from utils.content_safety import is_blocked_entity_value as _is_blocked_ev
            entities = [
                e for e in entities
                if not _is_blocked_ev(e.entity_type, e.value)
            ]

            out: list[dict] = []
            for e in entities:
                # Best effort: a missing or unloadable page just means an
                # empty source URL, never a failed request.
                source_url = ""
                try:
                    if e.page:
                        source_url = e.page.url or ""
                except Exception:
                    pass

                freshness_tag = get_freshness_tag(
                    e.entity_type,
                    e.last_seen_at,
                    e.first_seen_at,
                )

                if freshness_exclude == "expired" and freshness_tag.value == "expired":
                    continue

                graph_node_id = _make_node_id(e.entity_type, e.value, source_url)

                display_value = e.value
                display_context = e.context
                if defang:
                    display_value = defang_value(e.entity_type, e.value or "")
                    if e.context:
                        display_context = defang_text(e.context)

                freshness_display = get_freshness_display(freshness_tag)

                # One corrupt JSON column must not fail the whole page:
                # fall back to the same default used for an empty column.
                try:
                    corroborating_sources = json.loads(
                        e.corroborating_sources or '["dark_web_scrape"]'
                    )
                except ValueError:
                    logger.warning(
                        "Entity %s has malformed corroborating_sources; using default",
                        e.id,
                    )
                    corroborating_sources = ["dark_web_scrape"]

                out.append(
                    {
                        "id": str(e.id),
                        "entity_type": e.entity_type,
                        "canonical_value": e.canonical_value,
                        "value": display_value,
                        "confidence": e.confidence,
                        "context_snippet": e.context_snippet,
                        "context": display_context,
                        "created_at": e.created_at.isoformat() if e.created_at else None,
                        "first_seen": e.first_seen.isoformat() if e.first_seen else None,
                        "last_seen": e.last_seen.isoformat() if e.last_seen else None,
                        "first_seen_at": e.first_seen_at.isoformat() if e.first_seen_at else None,
                        "last_seen_at": e.last_seen_at.isoformat() if e.last_seen_at else None,
                        "freshness_tag": freshness_tag.value,
                        "freshness_label": freshness_display["label"],
                        "freshness_color": freshness_display["color"],
                        "source_count": e.source_count or 1,
                        "corroborating_sources": corroborating_sources,
                        "cross_referenced": (e.source_count or 1) > 1,
                        "graph_node_id": graph_node_id,
                        "defanged": defang,
                    }
                )
            return {"items": out, "total": total, "skip": offset, "limit": limit}
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("get_investigation_entities failed: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        )
|
|
2286
|
+
|
|
2287
|
+
|
|
2288
|
+
@router.get("/{investigation_id}/entities/export/csv")
async def export_investigation_entities_csv(
    investigation_id: str,
    current_user: CurrentUser = Depends(get_current_user),
) -> Response:
    """
    Export an investigation's entities as a CSV file download.

    Entities are those whose ``investigation_id`` matches, plus those
    attached through ``InvestigationEntityLink``. Entities rejected by
    ``is_blocked_entity_value`` are left out, matching the JSON entities
    endpoint.

    Columns: entity_type, canonical_value, confidence, occurrence_count,
    first_seen_page, context_snippet. ``canonical_value`` falls back to the
    raw value when empty. ``occurrence_count`` is always written as 1.
    ``context_snippet`` has newlines replaced by spaces and is cut to 500
    characters.

    Raises:
        HTTPException: 503 when ``DATABASE_URL`` is unset, 422 for a
            malformed UUID, 404 when the investigation does not exist,
            403 when it belongs to another user, 500 on any other error.
    """
    if not os.getenv("DATABASE_URL"):
        raise HTTPException(status_code=503, detail="Database not configured")

    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    try:
        from db.session import get_session
        from db.models import Entity, InvestigationEntityLink
        from db.queries import get_investigation_by_id_or_run
        from utils.content_safety import is_blocked_entity_value as _is_blocked_ev

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            if str(inv.user_id) != str(current_user.user.id):
                raise HTTPException(status_code=403, detail="Forbidden")

            linked_ids_subq = (
                session.query(InvestigationEntityLink.entity_id)
                .filter(InvestigationEntityLink.investigation_id == inv.id)
                .subquery()
            )
            entities = (
                session.query(Entity)
                .filter(
                    (Entity.investigation_id == inv.id)
                    | Entity.id.in_(linked_ids_subq)
                )
                .all()
            )

            output = io.StringIO()
            writer = csv.writer(output)
            writer.writerow([
                "entity_type",
                "canonical_value",
                "confidence",
                "occurrence_count",
                "first_seen_page",
                "context_snippet",
            ])

            # NOTE(review): cells are written verbatim. Values starting with
            # "=", "+", "-" or "@" may be evaluated as formulas by
            # spreadsheet applications; consider neutralising them if
            # analysts open this export in such tools.
            for e in entities:
                # Same safety net as the JSON entity listing: never export
                # values that the content-safety check prohibits.
                if _is_blocked_ev(e.entity_type, e.value):
                    continue

                # Best effort: a missing or unloadable page yields an empty
                # first_seen_page cell rather than a failed export.
                source_url = ""
                try:
                    if e.page:
                        source_url = e.page.url or ""
                except Exception:
                    pass

                # Keep every record on one physical line.
                context = (
                    (e.context_snippet or "")
                    .replace("\n", " ")
                    .replace("\r", " ")
                    .strip()
                )
                writer.writerow([
                    e.entity_type,
                    e.canonical_value or e.value,
                    e.confidence,
                    1,
                    source_url,
                    context[:500],
                ])

            csv_content = output.getvalue()

            return Response(
                content=csv_content,
                media_type="text/csv",
                headers={
                    "Content-Disposition": f"attachment; filename=voidaccess_{investigation_id}_entities.csv"
                },
            )
    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("export_investigation_entities_csv failed: %s", exc)
        raise HTTPException(
            status_code=500,
            detail=f"Internal error: {exc!s}"[:500],
        )
|
|
2383
|
+
|
|
2384
|
+
|
|
2385
|
+
# Cap on graph size: used as both the default and the upper bound of the
# ``max_nodes`` query parameter of the graph endpoint below.
MAX_GRAPH_NODES = 500
|
|
2386
|
+
|
|
2387
|
+
|
|
2388
|
+
@router.get("/{investigation_id}/graph")
async def get_investigation_graph(
    investigation_id: str,
    force_rebuild: bool = False,
    max_nodes: int = Query(default=MAX_GRAPH_NODES, ge=1, le=MAX_GRAPH_NODES),
    min_confidence: float = Query(default=0.75, ge=0.0, le=1.0),
) -> dict:
    """
    Return graph JSON (nodes and edges) for one investigation.

    When relationship rows already exist for the investigation and
    ``force_rebuild`` is false, the graph comes from
    ``build_graph_from_db_cached``; otherwise from ``build_graph_from_db``.

    Args:
        investigation_id: UUID string of the investigation.
        force_rebuild: Skip the persisted-edge path and rebuild.
        max_nodes: Largest graph (before confidence filtering) that will
            be returned.
        min_confidence: Nodes below this confidence are dropped, together
            with every edge touching a dropped node. A node whose
            confidence is missing or ``None`` counts as 0.0.

    Returns:
        A dict with ``graph_status``, ``total_entities``,
        ``filtered_entities``, ``min_confidence``, ``nodes`` and ``edges``.
        When the stored status is ``"skipped_overflow"``, an empty graph
        with a message and the entity count instead.
        On any unexpected exception, ``{"nodes": [], "edges": []}``.

    Raises:
        HTTPException: 422 for a malformed UUID, 404 when the investigation
            does not exist, 400 when the graph has more than ``max_nodes``
            nodes.

    NOTE(review): unlike ``get_investigation`` this function takes no
    ``current_user`` dependency and performs no ownership check itself;
    confirm access control is enforced elsewhere.
    """
    try:
        inv_uuid = uuid.UUID(investigation_id)
    except ValueError:
        raise HTTPException(status_code=422, detail="Invalid investigation ID format")

    try:
        from db.session import get_session
        from db.queries import get_investigation_by_id_or_run
        from graph.builder import build_graph_from_db, build_graph_from_db_cached
        from graph.export import to_json
        from db.models import EntityRelationship, Entity
        from sqlalchemy import func

        with get_session() as session:
            inv = get_investigation_by_id_or_run(session, inv_uuid)
            if inv is None:
                raise HTTPException(status_code=404, detail="Investigation not found")
            # Copy plain values out so nothing below depends on the ORM row.
            internal_id = inv.id
            graph_status = getattr(inv, "graph_status", "pending")

            if graph_status == "skipped_overflow":
                entity_count = (
                    session.query(func.count(Entity.id))
                    .filter(Entity.investigation_id == internal_id)
                    .scalar() or 0
                )
                return {
                    "graph_status": "skipped_overflow",
                    "message": "Graph too large to render. Use the entity list or download the CSV export instead.",
                    "total_entities": entity_count,
                    "nodes": [],
                    "edges": [],
                }

            persisted_edge_count = (
                session.query(func.count(EntityRelationship.id))
                .filter(EntityRelationship.investigation_id == internal_id)
                .scalar() or 0
            )

        if persisted_edge_count > 0 and not force_rebuild:
            logger.debug(
                "Graph cache hit: %s edges from DB for investigation %s",
                persisted_edge_count,
                investigation_id,
            )
            graph = build_graph_from_db_cached(investigation_id=internal_id)
        else:
            graph = build_graph_from_db(investigation_id=internal_id)

        node_count = len(graph.nodes)
        if node_count > max_nodes:
            raise HTTPException(
                status_code=400,
                detail=(
                    f"Graph has {node_count} nodes, exceeds max_nodes={max_nodes}. "
                    "Filter by entity type first using the /entities endpoint "
                    "with entity_type filter, then rebuild the graph."
                ),
            )

        graph_data = to_json(graph)
        total_entities = len(graph_data["nodes"])

        # ``or 0.0`` covers both a missing key and an explicit None, which
        # would otherwise raise TypeError on the comparison and end up as
        # an empty graph via the broad handler below.
        nodes_to_keep = {
            node["id"]
            for node in graph_data["nodes"]
            if (node.get("confidence") or 0.0) >= min_confidence
        }

        filtered_nodes = [n for n in graph_data["nodes"] if n["id"] in nodes_to_keep]
        # An edge survives only if both of its endpoints survived.
        filtered_edges = [
            e for e in graph_data["edges"]
            if e["source"] in nodes_to_keep and e["target"] in nodes_to_keep
        ]

        return {
            "graph_status": graph_status,
            "total_entities": total_entities,
            "filtered_entities": len(filtered_nodes),
            "min_confidence": min_confidence,
            "nodes": filtered_nodes,
            "edges": filtered_edges,
        }
    except HTTPException:
        raise
    except Exception as exc:
        # Deliberate best effort: a failed build yields an empty graph.
        logger.warning("get_investigation_graph failed: %s", exc)
        return {"nodes": [], "edges": []}
|
|
2502
|
+
|
|
2503
|
+
|
|
2504
|
+
def _build_investigation_profiles(investigation_id) -> int:
    """
    Build and store style profiles for the actors seen in an investigation.

    Every distinct (canonical_value, entity_type) pair in the investigation
    whose type is THREAT_ACTOR, THREAT_ACTOR_HANDLE, MALWARE_FAMILY or
    RANSOMWARE_GROUP is considered. For each pair, context snippets of at
    least 50 characters are gathered from all entities with the same type
    and canonical value; this query is not limited to the investigation.
    Pairs with fewer than two snippets or fewer than 200 characters in
    total are skipped.

    A failure while building or saving one profile is logged at debug level
    and does not stop the remaining pairs.

    NOTE: This function creates its own session - never pass a session
    across thread boundaries.

    Returns:
        The number of profiles handed to ``save_profile_to_db``.
    """
    from db.models import Entity
    from db.session import get_session
    from fingerprint.profiler import build_actor_profile, save_profile_to_db
    from sqlalchemy import func

    profiled_types = ["THREAT_ACTOR", "THREAT_ACTOR_HANDLE", "MALWARE_FAMILY", "RANSOMWARE_GROUP"]
    count = 0

    with get_session() as session:
        actor_query = session.query(Entity.canonical_value, Entity.entity_type).filter(
            Entity.investigation_id == investigation_id,
            Entity.entity_type.in_(profiled_types),
            Entity.canonical_value.isnot(None),
        )
        actors = actor_query.distinct().all()

        for canonical_value, entity_type in actors:
            snippet_rows = (
                session.query(Entity.context_snippet)
                .filter(
                    Entity.entity_type == entity_type,
                    Entity.canonical_value == canonical_value,
                    Entity.context_snippet.isnot(None),
                    func.length(Entity.context_snippet) >= 50,
                )
                .all()
            )

            # Rows are 1-tuples; drop anything empty.
            text_list = [row[0] for row in snippet_rows if row[0]]

            # Too little material to profile.
            if len(text_list) < 2:
                continue
            if sum(map(len, text_list)) < 200:
                continue

            try:
                profile = build_actor_profile(text_list)
                if profile:
                    save_profile_to_db(
                        profile=profile,
                        canonical_value=canonical_value,
                        entity_type=entity_type,
                        session=session,
                    )
                    count += 1
            except Exception as e:
                logger.debug(f"Profile build failed for {canonical_value}: {e}")
                continue

        session.commit()

    return count
|