voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fingerprint/stylometry.py — Writing style feature extraction and similarity.
|
|
3
|
+
|
|
4
|
+
Identifies when the same person posts under different handles on different
|
|
5
|
+
forums, based on HOW they write rather than WHAT they write.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
import string
|
|
13
|
+
from collections import Counter
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
# Twenty common English function words. The premise (per the original author)
# is that an author's usage rate of these words is nearly impossible to change
# consciously. The list order fixes the key order of the
# ``function_word_freq`` feature built in extract_style_vector().
_FUNCTION_WORDS = [
    "the", "a", "an", "and", "but", "or", "if", "in", "on", "at",
    "to", "for", "of", "with", "is", "are", "was", "were", "be", "have",
]
|
|
21
|
+
|
|
22
|
+
# Splits on whitespace that follows a sentence-ending punctuation mark
# (".", "!" or "?"). The lookbehind keeps the punctuation attached to the
# sentence it ends; only the whitespace run is consumed.
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")

# Patterns for structured data to detect non-natural-language text

# Base58 string of 26-35 characters whose first character is "1" or "3".
# NOTE(review): addresses starting with "bc1" cannot match this pattern —
# confirm whether that omission is intended.
_BITCOIN_RE = re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b")
# "0x" followed by exactly 40 hexadecimal digits.
_ETH_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
# "CVE-", a four-digit year, then a 4-7 digit sequence number.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
# http:// or https:// followed by everything up to the next whitespace.
_URL_RE = re.compile(r"https?://\S+")
# 56 characters drawn from [a-z2-7] followed by ".onion", case-insensitive.
_ADDRESS_RE = re.compile(r"\b[a-z2-7]{56}\.onion\b", re.IGNORECASE)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _is_natural_language(text: str) -> bool:
|
|
34
|
+
"""Returns True if text contains enough natural language for stylometry."""
|
|
35
|
+
words = text.split()
|
|
36
|
+
if len(words) < 10:
|
|
37
|
+
return False
|
|
38
|
+
structured_count = 0
|
|
39
|
+
structured_count += len(_BITCOIN_RE.findall(text))
|
|
40
|
+
structured_count += len(_ETH_RE.findall(text))
|
|
41
|
+
structured_count += len(_CVE_RE.findall(text))
|
|
42
|
+
structured_count += len(_URL_RE.findall(text))
|
|
43
|
+
structured_count += len(_ADDRESS_RE.findall(text))
|
|
44
|
+
if structured_count / len(words) > 0.5:
|
|
45
|
+
return False
|
|
46
|
+
return True
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _split_sentences(text: str) -> list[str]:
    """Split stripped *text* with ``_SENTENCE_RE`` and drop blank pieces."""
    pieces = _SENTENCE_RE.split(text.strip())
    # str.strip returns "" (falsy) for whitespace-only pieces, filtering them.
    return list(filter(str.strip, pieces))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _split_paragraphs(text: str) -> list[str]:
|
|
55
|
+
parts = re.split(r"\n\s*\n", text.strip())
|
|
56
|
+
return [p for p in parts if p.strip()]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _get_words(text: str) -> list[str]:
|
|
60
|
+
return re.findall(r"\b\w+\b", text)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_style_vector(text: str) -> dict | None:
    """
    Extract a fixed set of stylometric features from *text*.

    Returns None when *text* is empty or shorter than 100 characters (too
    short to be meaningful), when ``_is_natural_language`` rejects it (too few
    words, or primarily structured data such as wallets, URLs, CVEs), or when
    it contains no word tokens. Never raises: any unexpected error also
    yields None.

    On success the returned dict has these keys:

    avg_word_length       mean length of words made only of ASCII letters
    avg_sentence_length   mean word tokens per sentence
    vocabulary_richness   type-token ratio (distinct lowercased words / words)
    punctuation_density   ``string.punctuation`` characters / all characters
    uppercase_ratio       uppercase letters / all alphabetic characters
    digit_ratio           digit characters / all characters
    function_word_freq    {word: count / total words} for ``_FUNCTION_WORDS``
    avg_paragraph_length  mean sentences per paragraph
    exclamation_ratio     number of "!" characters / number of sentences
    question_ratio        number of "?" characters / number of sentences
    char_ngram_freq       {3-gram: count / total 3-grams} for the 50 most
                          common character 3-grams of the lowercased text
    """
    try:
        if not text or len(text) < 100:
            return None

        if not _is_natural_language(text):
            return None

        words = _get_words(text)
        if not words:
            return None

        # Both denominators are non-zero past the guards above.
        total_words = len(words)
        total_chars = len(text)

        # avg_word_length
        alpha_words = re.findall(r"\b[a-zA-Z]+\b", text)
        avg_word_length = (
            sum(len(w) for w in alpha_words) / len(alpha_words)
            if alpha_words
            else 0.0
        )

        # avg_sentence_length (words per sentence); fall back to treating the
        # whole text as one sentence so the sentence count is never zero.
        sentences = _split_sentences(text) or [text]
        num_sentences = len(sentences)
        avg_sentence_length = (
            sum(len(_get_words(s)) for s in sentences) / num_sentences
        )

        # One lowercased tally feeds both vocabulary_richness and
        # function_word_freq, instead of rescanning the word list once per
        # function word.
        word_counts = Counter(w.lower() for w in words)

        # vocabulary_richness — type-token ratio
        vocabulary_richness = len(word_counts) / total_words

        # punctuation_density
        punctuation = frozenset(string.punctuation)
        punct_count = sum(1 for c in text if c in punctuation)
        punctuation_density = punct_count / total_chars

        # uppercase_ratio — only alphabetic characters are considered, so the
        # isalpha() test must stay on the uppercase count as well.
        alpha_count = sum(1 for c in text if c.isalpha())
        upper_count = sum(1 for c in text if c.isalpha() and c.isupper())
        uppercase_ratio = upper_count / alpha_count if alpha_count else 0.0

        # digit_ratio
        digit_ratio = sum(1 for c in text if c.isdigit()) / total_chars

        # function_word_freq — a missing word counts as 0 in the Counter
        function_word_freq: dict[str, float] = {
            fw: word_counts[fw] / total_words for fw in _FUNCTION_WORDS
        }

        # avg_paragraph_length — mean sentences per paragraph; a paragraph
        # with no detectable sentence still counts as one.
        paragraphs = _split_paragraphs(text)
        if paragraphs:
            para_sent_counts = [
                max(len(_split_sentences(p)), 1) for p in paragraphs
            ]
            avg_paragraph_length = sum(para_sent_counts) / len(para_sent_counts)
        else:
            avg_paragraph_length = float(num_sentences)

        # exclamation_ratio and question_ratio
        exclamation_ratio = text.count("!") / num_sentences
        question_ratio = text.count("?") / num_sentences

        # char_ngram_freq — top-50 character 3-grams. The text has at least
        # 100 characters, so there is always at least one 3-gram.
        text_lower = text.lower()
        total_ngrams = len(text_lower) - 2
        ngram_counter = Counter(
            text_lower[i : i + 3] for i in range(total_ngrams)
        )
        char_ngram_freq: dict[str, float] = {
            ngram: count / total_ngrams
            for ngram, count in ngram_counter.most_common(50)
        }

        return {
            "avg_word_length": float(avg_word_length),
            "avg_sentence_length": float(avg_sentence_length),
            "vocabulary_richness": float(vocabulary_richness),
            "punctuation_density": float(punctuation_density),
            "uppercase_ratio": float(uppercase_ratio),
            "digit_ratio": float(digit_ratio),
            "function_word_freq": function_word_freq,
            "avg_paragraph_length": float(avg_paragraph_length),
            "exclamation_ratio": float(exclamation_ratio),
            "question_ratio": float(question_ratio),
            "char_ngram_freq": char_ngram_freq,
        }

    except Exception:
        # Deliberate best-effort: the documented contract is "never raises",
        # so malformed input (e.g. a non-str argument) yields None.
        return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
# Vector alignment helpers
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
def _aligned_flatten(
|
|
175
|
+
vector_a: dict, vector_b: dict
|
|
176
|
+
) -> tuple[list[float], list[float]]:
|
|
177
|
+
"""
|
|
178
|
+
Flatten two style vectors into aligned float arrays.
|
|
179
|
+
|
|
180
|
+
For scalar keys: use the value from each vector (0.0 if missing).
|
|
181
|
+
For nested-dict keys (function_word_freq, char_ngram_freq): use the
|
|
182
|
+
union of subkeys, with 0.0 for any subkey missing in one vector.
|
|
183
|
+
"""
|
|
184
|
+
flat_a: list[float] = []
|
|
185
|
+
flat_b: list[float] = []
|
|
186
|
+
|
|
187
|
+
all_keys = sorted(set(vector_a.keys()) | set(vector_b.keys()))
|
|
188
|
+
|
|
189
|
+
for key in all_keys:
|
|
190
|
+
val_a = vector_a.get(key, 0.0)
|
|
191
|
+
val_b = vector_b.get(key, 0.0)
|
|
192
|
+
|
|
193
|
+
if isinstance(val_a, dict) or isinstance(val_b, dict):
|
|
194
|
+
dict_a = val_a if isinstance(val_a, dict) else {}
|
|
195
|
+
dict_b = val_b if isinstance(val_b, dict) else {}
|
|
196
|
+
all_subkeys = sorted(set(dict_a.keys()) | set(dict_b.keys()))
|
|
197
|
+
for subkey in all_subkeys:
|
|
198
|
+
flat_a.append(float(dict_a.get(subkey, 0.0)))
|
|
199
|
+
flat_b.append(float(dict_b.get(subkey, 0.0)))
|
|
200
|
+
else:
|
|
201
|
+
flat_a.append(float(val_a) if val_a else 0.0)
|
|
202
|
+
flat_b.append(float(val_b) if val_b else 0.0)
|
|
203
|
+
|
|
204
|
+
return flat_a, flat_b
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
208
|
+
if not a or not b or len(a) != len(b):
|
|
209
|
+
return 0.0
|
|
210
|
+
dot = sum(x * y for x, y in zip(a, b))
|
|
211
|
+
mag_a = math.sqrt(sum(x * x for x in a))
|
|
212
|
+
mag_b = math.sqrt(sum(y * y for y in b))
|
|
213
|
+
if mag_a == 0.0 or mag_b == 0.0:
|
|
214
|
+
return 0.0
|
|
215
|
+
return dot / (mag_a * mag_b)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def compute_similarity(vector_a: dict, vector_b: dict) -> float:
    """
    Cosine similarity between two style vectors, clamped to 0.0–1.0.

    Both vectors are flattened into aligned float lists (expanding the nested
    function_word_freq and char_ngram_freq dicts) before the cosine is taken.
    Returns 0.0 if either vector is None, empty, not a dict, or otherwise
    malformed. Never raises.
    """
    try:
        usable = all(
            candidate and isinstance(candidate, dict)
            for candidate in (vector_a, vector_b)
        )
        if not usable:
            return 0.0

        left, right = _aligned_flatten(vector_a, vector_b)
        score = _cosine_similarity(left, right)
        # Clamp into [0.0, 1.0]: cap from above first, then floor at zero.
        capped = min(1.0, score)
        return float(max(0.0, capped))
    except Exception:
        return 0.0
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def are_likely_same_author(
    vector_a: dict,
    vector_b: dict,
    threshold: float = 0.85,
) -> tuple[bool, float]:
    """
    Compare two style vectors and report whether they clear *threshold*.

    Returns ``(is_match, similarity_score)`` where ``is_match`` is True when
    the score from ``compute_similarity`` is at least *threshold*.

    The default threshold of 0.85 is conservative — only flag high-confidence
    matches.
    """
    similarity = compute_similarity(vector_a, vector_b)
    is_match = similarity >= threshold
    return is_match, similarity
|
graph/__init__.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph — Phase 3 graph relationship mapping module.
|
|
3
|
+
|
|
4
|
+
Exports all public symbols from the five sub-modules:
|
|
5
|
+
model — data definitions (node/edge types, dataclasses)
|
|
6
|
+
builder — graph construction from DB + manual mutation helpers
|
|
7
|
+
queries — pure query functions over a NetworkX graph
|
|
8
|
+
export — serialisation to GraphML, JSON, Gephi CSV
|
|
9
|
+
visualize — interactive HTML visualisation via pyvis
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from graph.model import (
|
|
13
|
+
GraphEdge,
|
|
14
|
+
GraphNode,
|
|
15
|
+
EDGE_TYPES,
|
|
16
|
+
NODE_TYPES,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from graph.builder import (
|
|
20
|
+
add_entity_to_graph,
|
|
21
|
+
add_relationship,
|
|
22
|
+
build_graph_from_db,
|
|
23
|
+
infer_relationships,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from graph.queries import (
|
|
27
|
+
find_co_occurring_entities,
|
|
28
|
+
find_high_degree_nodes,
|
|
29
|
+
find_nodes_by_type,
|
|
30
|
+
get_actor_profile,
|
|
31
|
+
get_neighbors,
|
|
32
|
+
get_new_nodes_since,
|
|
33
|
+
get_shortest_path,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
from graph.export import (
|
|
37
|
+
summary_stats,
|
|
38
|
+
to_graphml,
|
|
39
|
+
to_json,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
from graph.visualize import (
|
|
43
|
+
build_pyvis_network,
|
|
44
|
+
get_html_string,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# Names re-exported by the ``graph`` package, grouped by the sub-module they
# are imported from at the top of this file.
__all__ = [
    # model
    "GraphNode",
    "GraphEdge",
    "NODE_TYPES",
    "EDGE_TYPES",
    # builder
    "build_graph_from_db",
    "add_entity_to_graph",
    "add_relationship",
    "infer_relationships",
    # queries
    "get_neighbors",
    "find_nodes_by_type",
    "find_co_occurring_entities",
    "get_new_nodes_since",
    "find_high_degree_nodes",
    "get_shortest_path",
    "get_actor_profile",
    # export
    "to_json",
    "to_graphml",
    "summary_stats",
    # visualize
    "build_pyvis_network",
    "get_html_string",
]
|