voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
extractor/pipeline.py ADDED
@@ -0,0 +1,401 @@
1
+ """
2
+ extractor/pipeline.py — Pipeline orchestrator for entity extraction.
3
+
4
+ Single entry point that the rest of the system calls. Runs:
5
+ 1. Regex extraction (extractor/regex_patterns.py)
6
+ 2. NER extraction (extractor/ner.py)
7
+ 3. LLM extraction (extractor/llm_extract.py) — optional
8
+ 4. Normalisation (extractor/normalizer.py)
9
+ 5. DB persistence (extractor/normalizer.merge_with_db)
10
+
11
+ Public interface
12
+ ----------------
13
+ async extract_entities_from_page(...) → ExtractionResult
14
+ async extract_entities_from_pages(...) → list[ExtractionResult]
15
+
16
+ ExtractionResult is a dataclass exported through extractor/__init__.py.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import asyncio
22
+ import logging
23
+ from dataclasses import dataclass, field
24
+ from typing import Optional
25
+ import uuid
26
+
27
+ from extractor.regex_patterns import extract_all as _regex_extract_all
28
+ from extractor.ner import extract_named_entities as _ner_extract
29
+ from extractor.llm_extract import extract_with_llm as _llm_extract
30
+ from extractor.normalizer import normalize_entities as _normalize, merge_with_db as _merge_db, NormalizedEntity, resolve_entity_type_conflicts as _resolve_conflicts
31
+
32
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Upper bound on how many entities of a given type _apply_per_type_caps keeps.
# Entity types that are not listed here have no per-type limit.
PER_TYPE_CAPS = dict(
    (
        ("ORGANIZATION_NAME", 50),
        ("PERSON_NAME", 30),
        ("LOCATION", 20),
        ("THREAT_ACTOR_HANDLE", 80),
    )
)
40
+
41
# Priority tiers used as the secondary sort key when the global entity cap is
# applied: a lower tier number ranks ahead of a higher one.
_ENTITY_TYPE_PRIORITY = {
    1: frozenset((
        "CVE",
        "CVE_NUMBER",
        "IP_ADDRESS",
        "IPV6_ADDRESS",
        "FILE_HASH",
        "FILE_HASH_MD5",
        "FILE_HASH_SHA1",
        "FILE_HASH_SHA256",
        "FILE_HASH_SHA512",
        "ONION_URL",
        "DOMAIN",
        "DOMAIN_NAME",
    )),
    2: frozenset((
        "MALWARE_FAMILY",
        "RANSOMWARE_GROUP",
        "THREAT_ACTOR",
        "THREAT_ACTOR_HANDLE",
    )),
    3: frozenset((
        "BITCOIN_ADDRESS",
        "MONERO_ADDRESS",
        "ETHEREUM_ADDRESS",
        "WALLET",
    )),
    4: frozenset(("EMAIL_ADDRESS", "PGP_KEY_BLOCK")),
    5: frozenset(("ORGANIZATION_NAME", "PERSON_NAME")),
}


def _type_priority(entity_type: str) -> int:
    """Return the priority tier of *entity_type*, or 99 if no tier lists it."""
    matching_tiers = (
        tier
        for tier, members in _ENTITY_TYPE_PRIORITY.items()
        if entity_type in members
    )
    # Tiers are scanned in definition order; the first hit wins.
    return next(matching_tiers, 99)
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Result dataclass
58
+ # ---------------------------------------------------------------------------
59
+
60
+
61
@dataclass
class ExtractionResult:
    """Outcome of running the extraction pipeline on a single page."""

    # URL of the page the pipeline was run on.
    page_url: str
    # Number of normalised entities produced for the page.
    entity_count: int
    # Normalised entity count keyed by entity type.
    entities_by_type: dict[str, int] = field(default_factory=dict)
    # Values returned by the DB merge step; empty when nothing was persisted.
    entity_ids: list = field(default_factory=list)
    # Non-fatal failures collected while processing the page.
    errors: list[str] = field(default_factory=list)
    # Normalised entity objects attached to this page's result.
    entities: list = field(default_factory=list)
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Public interface
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
async def extract_entities_from_page(
    page_text: str,
    page_url: str,
    page_id: Optional[int] = None,
    investigation_id: Optional[uuid.UUID] = None,
    llm=None,
    run_llm_extraction: bool = False,
    disable_cache: Optional[bool] = None,
    persist: bool = True,
) -> ExtractionResult:
    """
    Extract, normalise and (optionally) persist the entities of one page.

    Stages run in order: regex, NER, optional LLM, normalisation, DB persist.
    Every stage has its own exception guard: a failing stage is logged, noted
    in ExtractionResult.errors as "<stage>: <exception>", and the remaining
    stages still run on whatever was gathered so far.

    The LLM stage only runs when run_llm_extraction is true and an llm object
    is supplied. With persist=False the DB stage is skipped and the normalised
    entities are handed back in ExtractionResult.entities instead (the batch
    entry point uses this to cap entities before writing).
    """
    errors: list[str] = []

    def _stage_failed(log_format: str, tag: str, exc: Exception) -> None:
        # Log the failure and remember it under the stage's short tag.
        logger.error(log_format, page_url, exc)
        errors.append(f"{tag}: {exc}")

    # Stage 1 — regex patterns.
    try:
        regex_entities = _regex_extract_all(page_text)
    except Exception as exc:
        _stage_failed("Regex extraction failed for %s: %s", "regex", exc)
        regex_entities = {}

    # Stage 2 — named-entity recognition.
    try:
        ner_entities = _ner_extract(page_text)
    except Exception as exc:
        _stage_failed("NER extraction failed for %s: %s", "ner", exc)
        ner_entities = {}

    # Fold the NER hits into the regex hits. For a type both stages reported,
    # regex values come first and duplicates are removed.
    combined: dict[str, list[str]] = dict(regex_entities)
    for kind, found in ner_entities.items():
        combined[kind] = (
            _dedup(combined[kind] + found) if kind in combined else list(found)
        )

    # Stage 3 — LLM (optional). On failure `combined` keeps its previous value.
    if run_llm_extraction and llm is not None:
        try:
            import hashlib

            page_hash = None
            if page_text:
                page_hash = hashlib.sha256(page_text.encode()).hexdigest()
            combined = await _llm_extract(
                page_text,
                llm,
                combined,
                page_hash=page_hash,
                disable_cache=disable_cache,
            )
        except Exception as exc:
            _stage_failed("LLM extraction failed for %s: %s", "llm", exc)

    # Stage 4 — normalisation.
    try:
        normalized = _normalize(combined, page_url, page_id, page_text=page_text)
    except Exception as exc:
        _stage_failed("Normalization failed for %s: %s", "normalize", exc)
        normalized = []

    # Tally the normalised entities per type.
    per_type: dict[str, int] = {}
    for item in normalized:
        kind = item.entity_type
        per_type[kind] = per_type.get(kind, 0) + 1

    result = ExtractionResult(
        page_url=page_url,
        entity_count=len(normalized),
        entities_by_type=per_type,
        entity_ids=[],
        errors=errors,
    )

    if not persist:
        # Caller will persist later; hand back the entities themselves.
        result.entities = normalized
        return result

    # Stage 5 — DB persist. entity_ids stays empty when the merge fails.
    try:
        result.entity_ids = _merge_db(normalized, investigation_id)
    except Exception as exc:
        _stage_failed("DB persist failed for %s: %s", "db", exc)

    return result
186
+
187
+
188
async def extract_entities_from_pages(
    pages: list[dict],
    investigation_id: Optional[uuid.UUID] = None,
    llm=None,
    run_llm_extraction: bool = False,
    max_concurrent: int = 5,
    disable_cache: Optional[bool] = None,
    entity_cap: int = 400,
) -> list[ExtractionResult]:
    """
    Run extraction concurrently across a list of pages, then persist in one batch.

    Each page dict should carry a "url" key (a missing URL falls back to "").
    Page text is read from the "text", "content" or "cleaned_text" key, using
    the first one that holds a non-empty value; "page_id" is passed through
    when present.

    Pages are extracted with persist=False under a semaphore that allows at
    most *max_concurrent* pages at a time (values below 1 are treated as 1).
    A page that fails never blocks the others; its failure is recorded in its
    own ExtractionResult.errors.

    The collected entities then go through type-conflict resolution, the
    content-safety filter and apply_entity_cap (limit *entity_cap*) before a
    single DB merge. Afterwards every result's ``entities`` holds only the
    entities that survived those filters, and ``entity_ids`` holds the ids the
    DB merge produced for them. ``entity_count`` and ``entities_by_type`` are
    left as computed during per-page extraction. If the batch merge fails, the
    error is logged and added as "db: <exception>" to the errors of every
    result that had entities to persist.

    Returns one ExtractionResult per input page, in input order.
    """
    # Semaphore(0) would block every page forever and a negative value raises,
    # so never allow fewer than one page at a time.
    semaphore = asyncio.Semaphore(max(1, max_concurrent))

    async def _process(page: dict) -> ExtractionResult:
        async with semaphore:
            url = page.get("url", "")
            text = (
                page.get("text")
                or page.get("content")
                or page.get("cleaned_text")
                or ""
            )
            try:
                return await extract_entities_from_page(
                    page_text=text,
                    page_url=url,
                    page_id=page.get("page_id"),
                    investigation_id=investigation_id,
                    llm=llm,
                    run_llm_extraction=run_llm_extraction,
                    disable_cache=disable_cache,
                    persist=False,
                )
            except Exception as exc:
                logger.error("Page processing failed for %s: %s", url, exc)
                return ExtractionResult(
                    page_url=url,
                    entity_count=0,
                    entities_by_type={},
                    entity_ids=[],
                    errors=[str(exc)],
                )

    results = list(await asyncio.gather(*[_process(p) for p in pages]))

    all_normalized: list[NormalizedEntity] = []
    for result in results:
        all_normalized.extend(result.entities)

    if not all_normalized:
        return results

    all_normalized = _resolve_conflicts(all_normalized)

    # -----------------------------------------------------------------------
    # Content safety: drop prohibited entity values before capping/storing.
    # The actual value is never logged — only type and count.
    # -----------------------------------------------------------------------
    from utils.content_safety import is_blocked_entity_value as _is_blocked_entity_value

    clean_entities: list[NormalizedEntity] = []
    blocked_entity_count = 0
    for _ent in all_normalized:
        if _is_blocked_entity_value(_ent.entity_type, _ent.value):
            blocked_entity_count += 1
            logger.debug(
                "Entity value blocked — prohibited content: type=%s",
                _ent.entity_type,
            )
        else:
            clean_entities.append(_ent)
    if blocked_entity_count > 0:
        logger.info(
            "Blocked %d entities for prohibited content",
            blocked_entity_count,
        )

    capped_entities, _original_count = apply_entity_cap(
        clean_entities, cap=entity_cap, investigation_id=investigation_id
    )

    # Hand each page only the entities that survived filtering and capping.
    # This happens before (and regardless of) the DB write so that blocked or
    # dropped entities never leak back to the caller through result.entities.
    entities_by_url: dict[str, list[NormalizedEntity]] = {}
    for ent in capped_entities:
        entities_by_url.setdefault(ent.source_url, []).append(ent)
    for result in results:
        # Copy so results that share a URL do not share one list object.
        result.entities = list(entities_by_url.get(result.page_url, []))

    if not capped_entities:
        return results

    try:
        merged_ids = _merge_db(capped_entities, investigation_id)
        # NOTE(review): assumes _merge_db returns one id per entity, in input
        # order — confirm against extractor/normalizer.merge_with_db.
        ids_by_url: dict[str, list] = {}
        for ent, eid in zip(capped_entities, merged_ids):
            ids_by_url.setdefault(ent.source_url, []).append(eid)
    except Exception as exc:
        logger.error("Batch entity persist failed: %s", exc)
        # Surface the failure on every page that had something to persist,
        # using the same "db: ..." tag as the single-page pipeline.
        for result in results:
            if result.entities:
                result.errors.append(f"db: {exc}")
        return results

    for result in results:
        result.entity_ids = list(ids_by_url.get(result.page_url, []))

    return results
297
+
298
+
299
+ # ---------------------------------------------------------------------------
300
+ # Entity cap logic
301
+ # ---------------------------------------------------------------------------
302
+
303
def _occurrence_count(entity: NormalizedEntity, all_entities: list[NormalizedEntity]) -> int:
    """Return how many items in *all_entities* share *entity*'s type and value."""
    wanted_type = entity.entity_type
    wanted_value = entity.value
    return sum(
        1
        for candidate in all_entities
        if candidate.entity_type == wanted_type and candidate.value == wanted_value
    )
310
+
311
+
312
def _apply_per_type_caps(
    entities: list[NormalizedEntity],
    caps: dict = PER_TYPE_CAPS,
) -> list[NormalizedEntity]:
    """
    Keep at most ``caps[type]`` entities of each capped type, in input order.

    Entity types missing from *caps* are never limited. Running this before
    the global cap stops a single high-volume type from using up the whole
    entity budget. Each dropped entity is reported at debug level.
    """
    unlimited = float("inf")
    kept: list[NormalizedEntity] = []
    kept_per_type: dict[str, int] = {}

    for candidate in entities:
        kind = candidate.entity_type
        limit = caps.get(kind, unlimited)
        already_kept = kept_per_type.get(kind, 0)
        if already_kept >= limit:
            logger.debug(f"Per-type cap: {kind} capped at {limit}")
            continue
        kept.append(candidate)
        kept_per_type[kind] = already_kept + 1

    return kept
336
+
337
+
338
def apply_entity_cap(
    entities: list[NormalizedEntity],
    cap: int = 400,
    investigation_id: Optional[uuid.UUID] = None,
) -> tuple[list[NormalizedEntity], int]:
    """
    Apply quality-based entity filtering and a hard cap.

    Steps:
    a) Drop every entity whose confidence is below 0.80.
    b) Apply the per-type sub-caps (see _apply_per_type_caps).
    c) If more than *cap* entities remain, keep the best *cap*, ranked by:
       - confidence (primary, descending)
       - entity type priority (secondary, ascending — lower number ranks first)
       - occurrence count of the same type+value among the confident
         entities (tertiary, descending)
    d) Log a warning whenever a step removes entities.

    The input entities are not modified. When the hard cap is not exceeded the
    survivors keep their input order.

    Args:
        entities: Normalised entities to filter.
        cap: Maximum number of entities to return.
        investigation_id: Only used in the cap warning message.

    Returns:
        (capped_entities, original_count), where original_count is
        len(entities) before any filtering.
    """
    original_count = len(entities)

    # Step a: confidence filter.
    confident = [e for e in entities if e.confidence >= 0.80]
    removed_confidence = original_count - len(confident)
    if removed_confidence:
        logger.warning(
            "Entity confidence filter removed %s low-confidence entities",
            removed_confidence,
        )

    # Count occurrences per (type, value) in one pass. A side table is used
    # instead of a temporary attribute on each entity, so entities that get
    # dropped later are not left carrying stray state.
    # NOTE(review): assumes entity values are hashable (e.g. str) — confirm.
    occurrences: dict[tuple, int] = {}
    for ent in confident:
        key = (ent.entity_type, ent.value)
        occurrences[key] = occurrences.get(key, 0) + 1

    # Step b: per-type sub-caps.
    survivors = _apply_per_type_caps(confident)

    # Step c: rank and cut only when the hard cap is actually exceeded.
    if len(survivors) > cap:
        ranked = sorted(
            survivors,
            key=lambda e: (
                -e.confidence,
                _type_priority(e.entity_type),
                -occurrences[(e.entity_type, e.value)],
            ),
        )
        survivors = ranked[:cap]
        logger.warning(
            "Entity cap applied: %s entities reduced to %s for investigation %s",
            original_count,
            len(survivors),
            investigation_id,
        )

    return survivors, original_count
387
+
388
+
389
+ # ---------------------------------------------------------------------------
390
+ # Internal helper
391
+ # ---------------------------------------------------------------------------
392
+
393
+
394
def _dedup(values) -> list[str]:
    """Return the items of *values* with duplicates removed, first occurrence kept.

    Order of first appearance is preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order, which is exactly an
    # order-preserving de-duplication.
    return list(dict.fromkeys(values))