voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,372 @@
1
+ """
2
+ extractor/llm_extract.py — LLM-assisted entity extraction.
3
+
4
+ Runs AFTER regex and NER — only on text chunks that already contain at least
5
+ one entity (to avoid wasting API calls on irrelevant content).
6
+
7
+ Accepts an *llm* object (any LangChain chat model) as a parameter — does not
8
+ instantiate LLMs internally.
9
+
10
+ Public interface
11
+ ----------------
12
+ async extract_with_llm(text, llm, existing_entities, max_chunk_chars, page_hash, disable_cache) → dict[str, list[str]]
13
+
14
+ Configuration
15
+ -------------
16
+ - Set DISABLE_EXTRACTION_CACHE=true in .env to disable caching entirely
17
+ - Use --no-cache CLI flag to bypass cache for a specific run
18
+ - Cache TTL is 30 days
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import os
27
+ from datetime import datetime, timezone, timedelta
28
+ from typing import Optional
29
+
30
+ from config import DISABLE_EXTRACTION_CACHE
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
# Lifetime of a cached extraction result, in days; added to the current UTC
# time to compute a row's expires_at when it is saved.
_CACHE_TTL_DAYS = 30
# Default chunk size, in characters, used by extract_with_llm when the caller
# does not pass max_chunk_chars.
_DEFAULT_MAX_CHUNK_CHARS = 12000
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Prompt template
39
+ # ---------------------------------------------------------------------------
40
+
41
# Instruction prompt sent to the LLM for every chunk. The single "{chunk}"
# placeholder is filled in with str.format(), so the chunk text follows the
# instructions verbatim. The JSON keys requested here are the keys that
# _LLM_KEY_TO_TYPE looks up in the response, so the two must stay in sync.
# NOTE(review): the "CVE:" and "MITRE_TECHNIQUE:" paragraphs use labels that
# differ from the requested keys (cve_identifiers, mitre_techniques) — confirm
# the model does not emit those labels as keys instead.
_PROMPT_TEMPLATE = (
    "You are a threat intelligence analyst. Extract structured entities from the "
    "following dark web content. Return ONLY valid JSON with these keys: "
    "crypto_wallets, threat_actor_handles, malware_names, dates, urls, "
    "cve_identifiers, mitre_techniques, file_hashes_md5, file_hashes_sha1, file_hashes_sha256. "
    "Each key maps to a list of strings. If none found, use empty list. "
    "Do not include any text outside the JSON object.\n\n"
    "CRITICAL: File hashes (MD5, SHA1, SHA256) must be extracted in their complete, "
    "untruncated form. MD5 hashes are exactly 32 hex characters. "
    "SHA1 hashes are exactly 40 hex characters. "
    "SHA256 hashes are exactly 64 hex characters. "
    "If a hash appears truncated in the source text (e.g. 'a3f8b2...'), "
    "do NOT extract it — skip truncated hashes entirely.\n\n"
    "CVE: Common Vulnerabilities and Exposures identifiers in format CVE-YYYY-NNNNN. "
    "Extract the complete ID including year and number.\n\n"
    "MITRE_TECHNIQUE: MITRE ATT&CK technique identifiers in format TNNNN "
    "or TNNNN.NNN (sub-techniques). These map to adversary tactics and are "
    "critical for detection engineering.\n\n"
    "Content:\n{chunk}"
)
61
+
62
# Map LLM output keys → internal entity type constants.
# Keys are the JSON keys requested in the prompt; values are the entity-type
# names under which extract_with_llm files the strings returned for that key.
# NOTE(review): every "crypto_wallets" value is labelled BITCOIN_ADDRESS and
# every "urls" value ONION_URL, whatever the LLM actually returned — confirm
# that downstream normalization rejects non-Bitcoin wallets and clearnet URLs.
_LLM_KEY_TO_TYPE: dict[str, str] = {
    "crypto_wallets": "BITCOIN_ADDRESS",
    "threat_actor_handles": "THREAT_ACTOR_HANDLE",
    "malware_names": "MALWARE_FAMILY",
    "dates": "DATE",
    "urls": "ONION_URL",
    "cve_identifiers": "CVE_NUMBER",
    "mitre_techniques": "MITRE_TECHNIQUE",
    "file_hashes_md5": "FILE_HASH_MD5",
    "file_hashes_sha1": "FILE_HASH_SHA1",
    "file_hashes_sha256": "FILE_HASH_SHA256",
}
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Cache layer
78
+ # ---------------------------------------------------------------------------
79
+
80
+ def _get_cache_disabled(flag: Optional[bool] = None) -> bool:
81
+ """Check if cache should be disabled (CLI flag overrides env var)."""
82
+ if flag is True:
83
+ return True
84
+ return DISABLE_EXTRACTION_CACHE
85
+
86
+
87
+ def _compute_page_hash(content: str) -> str:
88
+ """Compute SHA-256 hash of page content for cache key."""
89
+ return hashlib.sha256(content.encode()).hexdigest()
90
+
91
+
92
+ def _load_from_cache(page_hash: str) -> Optional[dict[str, list[str]]]:
93
+ """Load cached extraction results from database if not expired."""
94
+ if not os.getenv("DATABASE_URL"):
95
+ return None
96
+
97
+ try:
98
+ from sqlalchemy import text
99
+ from db.session import get_session
100
+
101
+ with get_session() as session:
102
+ result = session.execute(
103
+ text("""
104
+ SELECT entities_json, expires_at
105
+ FROM page_extraction_cache
106
+ WHERE page_hash = :page_hash
107
+ """),
108
+ {"page_hash": page_hash}
109
+ ).fetchone()
110
+
111
+ if result is None:
112
+ return None
113
+
114
+ entities_json, expires_at = result
115
+ if expires_at.tzinfo is None:
116
+ expires_at = expires_at.replace(tzinfo=timezone.utc)
117
+
118
+ if expires_at < datetime.now(timezone.utc):
119
+ logger.debug("Cache expired for page_hash=%s", page_hash[:16])
120
+ return None
121
+
122
+ logger.info("Cache HIT for page_hash=%s", page_hash[:16])
123
+ return json.loads(entities_json)
124
+
125
+ except Exception as exc:
126
+ logger.warning("Cache lookup failed: %s", exc)
127
+ return None
128
+
129
+
130
+ def _save_to_cache(page_hash: str, entities: dict[str, list[str]]) -> None:
131
+ """Store extraction results in cache with 30-day TTL."""
132
+ if not os.getenv("DATABASE_URL"):
133
+ return
134
+
135
+ try:
136
+ from sqlalchemy import text
137
+ from db.session import get_session
138
+
139
+ entities_json = json.dumps(entities)
140
+ expires_at = datetime.now(timezone.utc) + timedelta(days=_CACHE_TTL_DAYS)
141
+
142
+ with get_session() as session:
143
+ session.execute(
144
+ text("""
145
+ INSERT INTO page_extraction_cache (page_hash, entities_json, extracted_at, expires_at)
146
+ VALUES (:page_hash, :entities_json, :extracted_at, :expires_at)
147
+ ON CONFLICT (page_hash) DO UPDATE SET
148
+ entities_json = EXCLUDED.entities_json,
149
+ extracted_at = EXCLUDED.extracted_at,
150
+ expires_at = EXCLUDED.expires_at
151
+ """),
152
+ {
153
+ "page_hash": page_hash,
154
+ "entities_json": entities_json,
155
+ "extracted_at": datetime.now(timezone.utc),
156
+ "expires_at": expires_at,
157
+ }
158
+ )
159
+ session.commit()
160
+
161
+ logger.info("Cache saved for page_hash=%s", page_hash[:16])
162
+
163
+ except Exception as exc:
164
+ logger.warning("Cache save failed: %s", exc)
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Public interface
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
async def extract_with_llm(
    text: str,
    llm,
    existing_entities: dict[str, list[str]],
    max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
    page_hash: Optional[str] = None,
    disable_cache: Optional[bool] = None,
) -> dict[str, list[str]]:
    """
    Augment *existing_entities* with entities found by the LLM.

    - If *llm* is None, returns *existing_entities* unchanged.
    - Only processes text when *existing_entities* has at least one value
      (to avoid API calls on irrelevant pages).
    - Splits text into overlapping chunks of *max_chunk_chars* with a 200-char
      overlap to avoid splitting entities at boundaries.
    - Merges and deduplicates results from every chunk into a copy of
      *existing_entities*.  LLM values are stripped of surrounding whitespace;
      null and blank values are dropped.
    - Uses content-hash caching to skip LLM calls for identical content.  A
      run in which no chunk produced a usable response is NOT cached, so a
      transient LLM failure is not pinned for the whole cache TTL.
    - Per-value chunk occurrence counts are logged at debug level only; they
      are not part of the return value.
    - A chunk whose response is invalid or not a JSON object contributes no
      results rather than aborting the whole extraction.
    - Unexpected errors while chunking, calling the LLM or merging are logged
      and the (filtered) existing entities are returned instead of raising.

    Args:
        text: Page content to analyse.
        llm: Object exposing an async ``ainvoke(prompt)`` method, or None.
        existing_entities: Entity type → values already found by regex/NER.
        max_chunk_chars: Maximum characters per LLM request.
        page_hash: Cache key; computed from *text* when omitted.
        disable_cache: ``True`` bypasses the cache for this call; otherwise
            the module-level setting decides.

    Returns:
        Entity type → deduplicated list of values.
    """
    if llm is None:
        return existing_entities

    # Skip expensive LLM calls if regex/NER found nothing at all
    if not any(existing_entities.values()):
        return existing_entities

    # Determine page hash for caching
    if page_hash is None:
        page_hash = _compute_page_hash(text)

    cache_enabled = not _get_cache_disabled(disable_cache)

    # Check cache first (unless disabled)
    if cache_enabled:
        cached = _load_from_cache(page_hash)
        # None means a miss; any other non-dict payload is malformed and is
        # treated as a miss too instead of crashing in the merge helper.
        if isinstance(cached, dict):
            return _merge_existing_and_cached(existing_entities, cached)

    # Filter blocked entities before LLM to avoid processing noise
    # Only apply to NER types (regex types have precise patterns, skip blocklist)
    try:
        from extractor.normalizer import is_blocked_entity, _REGEX_TYPES

        filtered: dict[str, list[str]] = {}
        for entity_type, values in existing_entities.items():
            if entity_type in _REGEX_TYPES:
                filtered[entity_type] = list(values)
            else:
                kept = [v for v in values if not is_blocked_entity(entity_type, v)]
                if kept:
                    filtered[entity_type] = kept
        if not filtered:
            # Still cache the empty result to avoid repeated LLM calls
            if cache_enabled:
                _save_to_cache(page_hash, {})
            return existing_entities
        existing_entities = filtered
    except ImportError:
        pass

    try:
        chunks = _chunk_text(text, max_chunk_chars, overlap=200)

        # entity type → value → number of chunks the value appeared in
        entity_occurrences: dict[str, dict[str, int]] = {
            entity_type: {} for entity_type in _LLM_KEY_TO_TYPE.values()
        }

        result: dict[str, list[str]] = {k: list(v) for k, v in existing_entities.items()}
        usable_responses = 0

        for chunk in chunks:
            chunk_result = await _extract_chunk(chunk, llm)
            if not isinstance(chunk_result, dict):
                # e.g. the model answered with a JSON list; skip this chunk
                # only, keeping what the other chunks contributed.
                logger.warning(
                    "LLM returned a non-object payload (%s); chunk skipped",
                    type(chunk_result).__name__,
                )
                continue
            if chunk_result:
                usable_responses += 1

            for llm_key, entity_type in _LLM_KEY_TO_TYPE.items():
                new_values = chunk_result.get(llm_key, [])
                if not isinstance(new_values, list):
                    continue

                # Normalize once so counting and merging see the same strings;
                # drop nulls (str(None) would become the entity "None").
                cleaned = [
                    s
                    for s in (str(v).strip() for v in new_values if v is not None)
                    if s
                ]

                # Count each value at most once per chunk so count/total_chunks
                # stays within [0, 1].
                counts = entity_occurrences[entity_type]
                for value in dict.fromkeys(cleaned):
                    counts[value] = counts.get(value, 0) + 1

                merged = result.get(entity_type, [])
                merged.extend(cleaned)
                result[entity_type] = _dedup(merged)

        # Store result in cache (even if the LLM found nothing), but not when
        # every chunk failed: that would hide the failure until the TTL ends.
        if cache_enabled and usable_responses:
            _save_to_cache(page_hash, result)

        # Add confidence info via logging (could be extended to return metadata)
        _log_confidence_stats(entity_occurrences, len(chunks))

        return result

    except Exception:
        logger.exception("extract_with_llm encountered an unexpected error")
        return existing_entities
273
+
274
+
275
+ def _merge_existing_and_cached(
276
+ existing: dict[str, list[str]],
277
+ cached: dict[str, list[str]],
278
+ ) -> dict[str, list[str]]:
279
+ """
280
+ Merge cached entities with existing ones.
281
+ Existing entities (from regex/NER) take precedence.
282
+ """
283
+ merged = dict(cached)
284
+ for entity_type, values in existing.items():
285
+ if entity_type in merged:
286
+ # Dedupe and prefer existing values
287
+ merged[entity_type] = _dedup(list(values) + merged[entity_type])
288
+ else:
289
+ merged[entity_type] = list(values)
290
+ return merged
291
+
292
+
293
+ def _log_confidence_stats(
294
+ entity_occurrences: dict[str, dict[str, int]],
295
+ total_chunks: int,
296
+ ) -> None:
297
+ """Log confidence statistics for extracted entities."""
298
+ for entity_type, counts in entity_occurrences.items():
299
+ if not counts:
300
+ continue
301
+ for value, count in counts.items():
302
+ if count > 1:
303
+ confidence = count / total_chunks
304
+ logger.debug(
305
+ "Entity %s=%s found in %d/%d chunks (confidence=%.2f)",
306
+ entity_type, value[:20], count, total_chunks, confidence
307
+ )
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Internal helpers
312
+ # ---------------------------------------------------------------------------
313
+
314
+
315
+ def _chunk_text(text: str, max_chars: int, overlap: int) -> list[str]:
316
+ """
317
+ Split *text* into chunks of at most *max_chars* with *overlap* char overlap.
318
+
319
+ The last chunk may be shorter. Single chunks are returned as-is without
320
+ copying.
321
+ """
322
+ if len(text) <= max_chars:
323
+ return [text]
324
+
325
+ chunks: list[str] = []
326
+ start = 0
327
+ while start < len(text):
328
+ end = min(start + max_chars, len(text))
329
+ chunks.append(text[start:end])
330
+ if end == len(text):
331
+ break
332
+ start = end - overlap
333
+ return chunks
334
+
335
+
336
async def _extract_chunk(chunk: str, llm) -> dict:
    """
    Send one chunk to the LLM and return the parsed JSON object.

    The response text is taken from ``response.content`` when that attribute
    exists, otherwise from ``str(response)``.  Markdown code fences around the
    payload are removed.  If the remaining text is not valid JSON, the span
    from the first ``{`` to the last ``}`` is tried as a fallback, which
    covers single-line fences and short prose around the object.

    Returns an empty dict if the LLM returns invalid JSON, returns JSON that
    is not an object (list, string, number, ...), or any other error occurs.
    Never raises.
    """
    try:
        prompt = _PROMPT_TEMPLATE.format(chunk=chunk)
        response = await llm.ainvoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
        content = content.strip()

        # Strip markdown code fences if the LLM wrapped output in them
        if content.startswith("```"):
            lines = content.split("\n", 1)
            if len(lines) > 1:
                content = lines[1]
            content = content.rsplit("```", 1)[0].strip()

        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Fallback: isolate the outermost {...} span.  Re-raise the
            # original error when there is no such span to try.
            first, last = content.find("{"), content.rfind("}")
            if first == -1 or last <= first:
                raise
            parsed = json.loads(content[first:last + 1])

        if not isinstance(parsed, dict):
            # Callers look keys up with .get(); anything else is unusable.
            logger.warning(
                "LLM returned JSON of type %s instead of an object for chunk (len=%d)",
                type(parsed).__name__,
                len(chunk),
            )
            return {}

        return parsed

    except json.JSONDecodeError as exc:
        logger.warning("LLM returned invalid JSON for chunk (len=%d): %s", len(chunk), exc)
        return {}
    except Exception as exc:
        logger.warning("LLM chunk extraction failed: %s", exc)
        return {}
363
+
364
+
365
+ def _dedup(values) -> list[str]:
366
+ seen: set[str] = set()
367
+ result: list[str] = []
368
+ for v in values:
369
+ if v not in seen:
370
+ seen.add(v)
371
+ result.append(v)
372
+ return result