voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
vector/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
vector — Phase 4 embedding storage and semantic search.
|
|
3
|
+
|
|
4
|
+
Re-exports the public API from embedder, store, and search.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from vector.embedder import embed_batch, embed_text, get_embedder
|
|
8
|
+
from vector.search import (
|
|
9
|
+
cross_investigation_recall,
|
|
10
|
+
find_pages_similar_to,
|
|
11
|
+
find_related_pages,
|
|
12
|
+
)
|
|
13
|
+
from vector.store import (
|
|
14
|
+
bulk_check_cache,
|
|
15
|
+
get_cached_page,
|
|
16
|
+
get_collection,
|
|
17
|
+
get_collection_stats,
|
|
18
|
+
is_duplicate,
|
|
19
|
+
search_similar,
|
|
20
|
+
store_page,
|
|
21
|
+
upsert_page,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Names re-exported by ``import vector`` / ``from vector import *``.
# Each entry corresponds to one of the imports at the top of this module.
__all__ = [
    # from vector.embedder
    "get_embedder",
    "embed_text",
    "embed_batch",
    # from vector.store
    "get_collection",
    "upsert_page",
    "search_similar",
    "is_duplicate",
    "get_collection_stats",
    # from vector.search
    "find_related_pages",
    "find_pages_similar_to",
    "cross_investigation_recall",
    # from vector.store
    "get_cached_page",
    "store_page",
    "bulk_check_cache",
]
|
vector/embedder.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Local embedding generation via sentence-transformers (all-MiniLM-L6-v2).
|
|
3
|
+
|
|
4
|
+
Uses the shared model singleton from vector.model_singleton.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from vector.model_singleton import get_embedding_model
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_MAX_TOKENS = 512
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_embedder():
    """
    Fetch the embedding model shared through ``vector.model_singleton``.

    Thin pass-through to ``get_embedding_model()``: whatever that call returns
    (a model instance, or ``None`` when no model is available) is handed
    straight back to the caller.
    """
    shared_model = get_embedding_model()
    return shared_model
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _truncate_to_model_limit(text: str, model) -> str:
    """
    Shorten *text* so it fits the embedding model's token budget.

    When *model* exposes a ``tokenizer`` attribute, the text is encoded with
    truncation at ``_MAX_TOKENS`` and decoded back to a string (special tokens
    skipped). If there is no tokenizer, or anything in that round-trip raises,
    the first 50000 characters of *text* are returned instead.

    Falsy *text* is returned unchanged.
    """
    if not text:
        return text

    try:
        tokenizer = getattr(model, "tokenizer", None)
        if tokenizer is not None:
            token_ids = tokenizer.encode(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=_MAX_TOKENS,
            )
            return tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        # Best-effort: a tokenizer problem must not prevent embedding, so
        # fall through to the plain character cap below.
        pass

    return text[:50000]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def embed_text(text: str) -> list[float] | None:
    """
    Embed a single piece of text with the shared model.

    The text is first clipped by ``_truncate_to_model_limit`` and then passed
    to the model's ``encode``; the resulting vector (384-dim for the
    configured model) is returned as a plain list of Python floats.

    Returns ``None`` when *text* is empty or whitespace-only, when no model is
    available, or when truncation/encoding raises (logged as a warning).
    """
    if not text or not str(text).strip():
        return None

    encoder = get_embedder()
    if encoder is None:
        return None

    try:
        clipped = _truncate_to_model_limit(str(text), encoder)
        vector = encoder.encode(
            clipped,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        return list(map(float, vector.tolist()))
    except Exception as exc:
        logger.warning("embed_text failed: %s", exc)
        return None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def embed_batch(texts: list[str]) -> list[list[float] | None]:
    """
    Embed several texts in one model call.

    Args:
        texts: Input strings. Entries that are falsy or whitespace-only are
            not sent to the model.

    Returns:
        A list parallel to *texts*. Each slot holds the embedding as a list of
        Python floats, or ``None`` for an empty entry. If no model is
        available, or truncation/encoding raises (logged as a warning), every
        slot is ``None``. An empty *texts* yields ``[]``.
    """
    if not texts:
        return []

    out: list[list[float] | None] = [None] * len(texts)

    # Decide what actually needs encoding BEFORE touching the model, so a
    # batch of only-empty strings never triggers the (expensive) model load.
    live_idx = [i for i, t in enumerate(texts) if t and str(t).strip()]
    if not live_idx:
        return out

    model = get_embedder()
    if model is None:
        return out

    try:
        batch_in = [_truncate_to_model_limit(str(texts[i]), model) for i in live_idx]
        encoded = model.encode(
            batch_in,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        # Index explicitly (rather than zip) so a short result raises and is
        # reported, instead of silently leaving trailing slots empty.
        vectors = [
            [float(x) for x in encoded[j].tolist()] for j in range(len(live_idx))
        ]
    except Exception as exc:
        logger.warning("embed_batch failed: %s", exc)
        return out

    # Only publish results once the whole batch converted cleanly.
    for row_idx, vec in zip(live_idx, vectors):
        out[row_idx] = vec
    return out
|
vector/model_singleton.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Thread-safe singleton for the SentenceTransformer embedding model.
|
|
3
|
+
|
|
4
|
+
Loads all-MiniLM-L6-v2 lazily on first use, then reuses the same instance
|
|
5
|
+
across all consumers. This eliminates the ~80 MB duplicate model weight
|
|
6
|
+
problem when multiple modules each instantiate their own model at load time.
|
|
7
|
+
|
|
8
|
+
Import torch and SentenceTransformer INSIDE the getter function to avoid
|
|
9
|
+
the 2-5 second startup delay from torch enumerating CUDA devices.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import threading
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)

# Process-wide cache for the loaded model; assigned only while holding _lock.
_model: "SentenceTransformer | None" = None
_lock = threading.Lock()


def get_embedding_model() -> "SentenceTransformer | None":
    """
    Return the shared SentenceTransformer instance, loading it on first use.

    The heavy ``torch`` / ``sentence_transformers`` imports happen inside this
    function so that merely importing the module stays cheap.

    Returns:
        The cached model, or ``None`` when the dependencies cannot be imported
        or the model fails to load (logged as a warning). A failed load is
        not cached, so a later call tries again.
    """
    global _model

    # Fast path: once loaded, no locking is needed.
    if _model is not None:
        return _model

    with _lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _model is not None:
            return _model

        try:
            # The imports live inside the try so a missing or broken
            # installation yields None, as callers expect, rather than
            # raising ImportError.
            import torch  # noqa: F401, PLC0415 - imported inside function per M-6
            from sentence_transformers import SentenceTransformer  # noqa: PLC0415

            _model = SentenceTransformer("all-MiniLM-L6-v2")
            logger.info("Loaded embedding model all-MiniLM-L6-v2 (singleton)")
        except Exception as exc:
            logger.warning("Failed to load embedding model: %s", exc)
            _model = None

    return _model
|
vector/search.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Higher-level semantic search built on vector/store.py.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from . import store
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def find_related_pages(query: str, n_results: int = 10) -> list[dict]:
    """
    Semantic search over stored pages for *query*.

    Delegates to ``store.search_similar`` with the requested *n_results* and
    returns its result unchanged.
    """
    matches = store.search_similar(query, n_results=n_results)
    return matches
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _first_batch(result: dict, key: str):
    """Return the first row of ``result[key]``, or ``[]`` if absent or empty.

    Uses explicit ``None``/``len`` checks instead of truthiness so it also
    works if the value is a numpy array, whose truth value raises ValueError.
    """
    rows = result.get(key)
    if rows is None or len(rows) == 0:
        return []
    return rows[0]


def find_pages_similar_to(reference_url: str, n_results: int = 10) -> list[dict]:
    """
    Find stored pages whose embeddings are nearest to that of *reference_url*.

    The reference page is looked up by the SHA-256 hex digest of its URL
    (NOTE(review): presumably the same ID scheme used when pages are stored;
    confirm against vector/store.py). Its stored embedding is then used as
    the query vector, and the reference page itself is dropped from the hits.

    Args:
        reference_url: URL of the page to use as the query.
        n_results: Maximum number of neighbours to return; values below 1 are
            treated as 1.

    Returns:
        Up to *n_results* dicts with keys ``url``, ``page_id`` (int or
        ``None``), ``distance`` and ``metadata``. Returns ``[]`` when the
        collection is unavailable, the URL has no stored embedding, or any
        error occurs (logged as a warning).
    """
    import hashlib

    col = store.get_collection()
    if col is None:
        return []
    try:
        pid = hashlib.sha256(reference_url.encode("utf-8")).hexdigest()
        got = col.get(ids=[pid], include=["embeddings"])
        embs = got.get("embeddings")
        # No truthiness tests here: the embeddings container may be a numpy
        # array, for which ``embs or []`` / ``not embs`` raise ValueError.
        if embs is None or len(embs) == 0 or embs[0] is None:
            return []
        emb = [float(x) for x in embs[0]]

        n = max(1, int(n_results))
        res = col.query(
            query_embeddings=[emb],
            # Ask for one extra hit because the reference page matches itself.
            n_results=n + 1,
            include=["distances", "metadatas"],
        )
        ids = _first_batch(res, "ids")
        dists = _first_batch(res, "distances")
        metas = _first_batch(res, "metadatas")

        out: list[dict] = []
        for i, hit_id in enumerate(ids):
            if hit_id == pid:
                continue
            meta = metas[i] if i < len(metas) and metas[i] else {}
            md = dict(meta) if isinstance(meta, dict) else {}

            # page_id is reported as an int when the metadata value converts.
            page_id_out: int | None = None
            raw_pid = md.get("page_id")
            if raw_pid is not None:
                try:
                    page_id_out = int(raw_pid)
                except (TypeError, ValueError):
                    page_id_out = None

            out.append(
                {
                    "url": md.get("url", ""),
                    "page_id": page_id_out,
                    "distance": float(dists[i]) if i < len(dists) else 0.0,
                    "metadata": md,
                }
            )
            if len(out) >= n:
                break
        return out
    except Exception as exc:
        logger.warning("find_pages_similar_to failed: %s", exc)
        return []
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def cross_investigation_recall(
    query: str,
    exclude_investigation_id: int | None = None,
    n_results: int = 10,
) -> list[dict]:
    """
    Search for pages similar to *query* across investigations.

    Args:
        query: Free-text query forwarded to ``store.search_similar``.
        exclude_investigation_id: When given, hits whose ``investigation_id``
            metadata equals this ID are filtered out via a ``$ne`` clause.
        n_results: Maximum number of hits to request. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        Whatever ``store.search_similar`` returns for the query and filter.
    """
    where: dict[str, Any] | None = None
    if exclude_investigation_id is not None:
        # The ID is compared as a string.
        # NOTE(review): presumably investigation_id metadata is stored as a
        # string; confirm against vector/store.py.
        where = {"investigation_id": {"$ne": str(exclude_investigation_id)}}
    return store.search_similar(query, n_results=n_results, where=where)
|