voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
vector/__init__.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ vector — Phase 4 embedding storage and semantic search.
3
+
4
+ Re-exports the public API from embedder, store, and search.
5
+ """
6
+
7
+ from vector.embedder import embed_batch, embed_text, get_embedder
8
+ from vector.search import (
9
+ cross_investigation_recall,
10
+ find_pages_similar_to,
11
+ find_related_pages,
12
+ )
13
+ from vector.store import (
14
+ bulk_check_cache,
15
+ get_cached_page,
16
+ get_collection,
17
+ get_collection_stats,
18
+ is_duplicate,
19
+ search_similar,
20
+ store_page,
21
+ upsert_page,
22
+ )
23
+
24
+ __all__ = [
25
+ "get_embedder",
26
+ "embed_text",
27
+ "embed_batch",
28
+ "get_collection",
29
+ "upsert_page",
30
+ "search_similar",
31
+ "is_duplicate",
32
+ "get_collection_stats",
33
+ "find_related_pages",
34
+ "find_pages_similar_to",
35
+ "cross_investigation_recall",
36
+ "get_cached_page",
37
+ "store_page",
38
+ "bulk_check_cache",
39
+ ]
vector/embedder.py ADDED
@@ -0,0 +1,100 @@
1
+ """
2
+ Local embedding generation via sentence-transformers (all-MiniLM-L6-v2).
3
+
4
+ Uses the shared model singleton from vector.model_singleton.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ from vector.model_singleton import get_embedding_model
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ _MAX_TOKENS = 512
16
+
17
+
18
def get_embedder():
    """Hand back whatever ``get_embedding_model()`` yields.

    Per the shared-singleton module this is the process-wide
    SentenceTransformer instance, or ``None`` when no model is available.
    """
    shared_model = get_embedding_model()
    return shared_model
21
+
22
+
23
+ def _truncate_to_model_limit(text: str, model) -> str:
24
+ if not text:
25
+ return text
26
+ try:
27
+ tok = getattr(model, "tokenizer", None)
28
+ if tok is None:
29
+ return text[:50000]
30
+ encoded = tok.encode(
31
+ text,
32
+ add_special_tokens=True,
33
+ truncation=True,
34
+ max_length=_MAX_TOKENS,
35
+ )
36
+ return tok.decode(encoded, skip_special_tokens=True)
37
+ except Exception:
38
+ return text[:50000]
39
+
40
+
41
def embed_text(text: str) -> list[float] | None:
    """Embed a single piece of text.

    Returns the embedding as a plain list of Python floats.  ``None`` is
    returned when *text* is empty or whitespace-only, when no embedding
    model is available, or when encoding fails (the failure is logged as a
    warning).
    """
    has_content = bool(text) and bool(str(text).strip())
    if not has_content:
        return None

    encoder = get_embedder()
    if encoder is None:
        return None

    try:
        clipped = _truncate_to_model_limit(str(text), encoder)
        vector = encoder.encode(
            clipped,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        return [float(component) for component in vector.tolist()]
    except Exception as exc:
        logger.warning("embed_text failed: %s", exc)
        return None
62
+
63
+
64
def embed_batch(texts: list[str]) -> list[list[float] | None]:
    """Embed many texts with a single model call.

    The returned list lines up index-for-index with *texts*.  Blank or
    whitespace-only entries map to ``None``.  If the model is unavailable,
    or the batch encode raises, every entry is ``None`` (an encode failure
    is logged as a warning).
    """
    if not texts:
        return []

    encoder = get_embedder()
    if encoder is None:
        return [None for _ in texts]

    # Clip only the entries that carry text, remembering their positions.
    # Dict insertion order keeps the positions ascending.
    clipped_by_pos: dict[int, str] = {}
    for pos, raw in enumerate(texts):
        if raw and str(raw).strip():
            clipped_by_pos[pos] = _truncate_to_model_limit(str(raw), encoder)

    results: list[list[float] | None] = [None] * len(texts)
    if not clipped_by_pos:
        return results

    positions = list(clipped_by_pos)
    try:
        vectors = encoder.encode(
            [clipped_by_pos[pos] for pos in positions],
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        for slot, pos in enumerate(positions):
            results[pos] = [float(value) for value in vectors[slot].tolist()]
    except Exception as exc:
        logger.warning("embed_batch failed: %s", exc)
        # Discard any rows filled in before the failure.
        for pos in positions:
            results[pos] = None
    return results
vector/model_singleton.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Thread-safe singleton for the SentenceTransformer embedding model.
3
+
4
+ Loads all-MiniLM-L6-v2 lazily on first use, then reuses the same instance
5
+ across all consumers. This eliminates the ~80 MB duplicate model weight
6
+ problem when multiple modules each instantiate their own model at load time.
7
+
8
+ Import torch and SentenceTransformer INSIDE the getter function to avoid
9
+ the 2-5 second startup delay from torch enumerating CUDA devices.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import threading
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ _model: "SentenceTransformer | None" = None
20
+ _lock = threading.Lock()
21
+
22
+
23
def get_embedding_model() -> "SentenceTransformer | None":
    """
    Return the shared SentenceTransformer instance, or None if it cannot load.

    The model is loaded on the first call and the same instance is returned
    afterwards.  ``_model`` is checked once without the lock and again after
    acquiring ``_lock`` so concurrent first callers load it only once.

    Any failure while importing the dependencies or constructing the model
    is logged as a warning and results in ``None``; because nothing is
    cached in that case, the next call attempts the load again.
    """
    global _model

    # Fast path: already loaded, no lock needed.
    if _model is not None:
        return _model

    with _lock:
        # Another thread may have finished loading while we waited.
        if _model is not None:
            return _model

        try:
            # Heavy imports stay inside the function (see module docstring)
            # and inside the try, so a missing dependency yields None
            # instead of an ImportError escaping to callers that only
            # check for None.
            import torch  # noqa: F401, PLC0415 - imported inside function per M-6
            from sentence_transformers import SentenceTransformer  # noqa: PLC0415

            _model = SentenceTransformer("all-MiniLM-L6-v2")
            logger.info("Loaded embedding model all-MiniLM-L6-v2 (singleton)")
        except Exception as exc:
            logger.warning("Failed to load embedding model: %s", exc)
            _model = None

    return _model
vector/search.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Higher-level semantic search built on vector/store.py.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from typing import Any
9
+
10
+ from . import store
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def find_related_pages(query: str, n_results: int = 10) -> list[dict]:
    """Run a semantic search for *query* via ``store.search_similar``.

    *n_results* is forwarded unchanged; the store's result list is returned
    as-is.
    """
    matches = store.search_similar(query, n_results=n_results)
    return matches
18
+
19
+
20
def find_pages_similar_to(reference_url: str, n_results: int = 10) -> list[dict]:
    """
    Find pages similar to *reference_url* using its stored embedding.

    The reference entry is looked up by the SHA-256 hex digest of the URL
    (presumably the id scheme used when pages are stored — confirm against
    vector/store.py).  Its embedding is then used as the query vector.

    Returns up to ``max(1, n_results)`` dicts with keys ``url``,
    ``page_id`` (int or None), ``distance`` and ``metadata``; the reference
    entry itself is excluded.  Returns [] if the collection is unavailable,
    the URL is not in the collection, or any error occurs (logged as a
    warning).
    """
    import hashlib

    col = store.get_collection()
    if col is None:
        return []
    try:
        pid = hashlib.sha256(reference_url.encode("utf-8")).hexdigest()
        got = col.get(ids=[pid], include=["embeddings"])
        embs = got.get("embeddings")
        # Use explicit None/len checks rather than truthiness: if the
        # backend returns an array type, `embs or []` / `not embs` raises
        # "truth value of an array is ambiguous", which the except below
        # would turn into a silent empty result.
        if embs is None or len(embs) == 0 or embs[0] is None:
            return []
        # Plain Python floats work whether the row is a list or an array.
        emb = [float(x) for x in embs[0]]
        n = max(1, int(n_results))
        res = col.query(
            query_embeddings=[emb],
            # Ask for one extra hit because the reference page is expected
            # to match itself and is skipped below.
            n_results=n + 1,
            include=["distances", "metadatas"],
        )
        ids = (res.get("ids") or [[]])[0]
        dists = (res.get("distances") or [[]])[0]
        metas = (res.get("metadatas") or [[]])[0]
        out: list[dict] = []
        for i, _eid in enumerate(ids):
            if _eid == pid:
                continue
            m = metas[i] if i < len(metas) and metas[i] else {}
            md = dict(m) if isinstance(m, dict) else {}
            raw_pid = md.get("page_id")
            page_id_out: int | None = None
            if raw_pid is not None:
                try:
                    page_id_out = int(raw_pid)
                except (TypeError, ValueError):
                    page_id_out = None
            dist_f = float(dists[i]) if i < len(dists) else 0.0
            out.append(
                {
                    "url": md.get("url", ""),
                    "page_id": page_id_out,
                    "distance": dist_f,
                    "metadata": md,
                }
            )
            if len(out) >= n:
                break
        return out
    except Exception as exc:
        logger.warning("find_pages_similar_to failed: %s", exc)
        return []
74
+
75
+
76
def cross_investigation_recall(
    query: str,
    exclude_investigation_id: int | None = None,
) -> list[dict]:
    """
    Search for pages similar to *query* across investigations.

    When *exclude_investigation_id* is given, a ``$ne`` filter on the
    ``investigation_id`` metadata field (compared as a string) is passed to
    the store so that investigation's pages are left out.  Always requests
    10 results.
    """
    if exclude_investigation_id is None:
        metadata_filter: dict[str, Any] | None = None
    else:
        metadata_filter = {
            "investigation_id": {"$ne": str(exclude_investigation_id)},
        }
    return store.search_similar(query, n_results=10, where=metadata_filter)