voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,249 @@
1
+ """
2
+ fingerprint/stylometry.py — Writing style feature extraction and similarity.
3
+
4
+ Identifies when the same person posts under different handles on different
5
+ forums, based on HOW they write rather than WHAT they write.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ import re
12
+ import string
13
+ from collections import Counter
14
+ from typing import Optional
15
+
16
# Twenty common English function words. Their relative frequencies are used
# as style features because function-word usage is nearly impossible to
# change consciously.
_FUNCTION_WORDS = [
    "the", "a", "an", "and", "but", "or", "if", "in", "on", "at",
    "to", "for", "of", "with", "is", "are", "was", "were", "be", "have",
]

# Splits on whitespace that follows a sentence-ending punctuation mark
# (".", "!" or "?"); the lookbehind keeps the punctuation attached to the
# sentence it terminates.
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")

# Patterns for structured data to detect non-natural-language text
# Base58-style token starting with "1" or "3", 26-35 characters in total.
_BITCOIN_RE = re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b")
# "0x" followed by exactly 40 hexadecimal digits.
_ETH_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
# CVE identifier: "CVE-", a four-digit year, then four to seven digits.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")
# "http://" or "https://" followed by any run of non-whitespace characters.
_URL_RE = re.compile(r"https?://\S+")
# 56 characters drawn from a-z / 2-7 followed by ".onion", case-insensitive.
# NOTE(review): presumably Tor v3 hidden-service hostnames — confirm.
_ADDRESS_RE = re.compile(r"\b[a-z2-7]{56}\.onion\b", re.IGNORECASE)
31
+
32
+
33
def _is_natural_language(text: str) -> bool:
    """
    Decide whether *text* holds enough prose to be worth fingerprinting.

    Text is rejected when it has fewer than 10 whitespace-separated tokens,
    or when the combined number of Bitcoin, Ethereum, CVE, URL and .onion
    pattern matches exceeds half of the token count.
    """
    token_count = len(text.split())
    if token_count < 10:
        return False

    # Each pattern is counted independently, so a token matched by two
    # patterns contributes twice to the total.
    detectors = (_BITCOIN_RE, _ETH_RE, _CVE_RE, _URL_RE, _ADDRESS_RE)
    structured_hits = sum(len(pattern.findall(text)) for pattern in detectors)

    return structured_hits / token_count <= 0.5
47
+
48
+
49
def _split_sentences(text: str) -> list[str]:
    """Split *text* into sentences, discarding whitespace-only fragments."""
    sentences: list[str] = []
    for fragment in _SENTENCE_RE.split(text.strip()):
        if fragment.strip():
            sentences.append(fragment)
    return sentences
52
+
53
+
54
def _split_paragraphs(text: str) -> list[str]:
    """
    Split *text* into paragraphs separated by one or more blank lines.

    A blank line may contain whitespace. Whitespace-only chunks are dropped.
    """
    chunks = re.split(r"\n\s*\n", text.strip())
    # str.strip returns "" (falsy) for whitespace-only chunks.
    return list(filter(str.strip, chunks))
57
+
58
+
59
def _get_words(text: str) -> list[str]:
    """Return every maximal run of word characters in *text*, in order."""
    return [match.group(0) for match in re.finditer(r"\b\w+\b", text)]
61
+
62
+
63
def extract_style_vector(text: str) -> dict | None:
    """
    Extract a fixed set of stylometric features from *text*.

    The result holds nine float features (``avg_word_length``,
    ``avg_sentence_length``, ``vocabulary_richness``, ``punctuation_density``,
    ``uppercase_ratio``, ``digit_ratio``, ``avg_paragraph_length``,
    ``exclamation_ratio``, ``question_ratio``) and two nested dicts:
    ``function_word_freq`` (one entry per word in ``_FUNCTION_WORDS``) and
    ``char_ngram_freq`` (the 50 most common lower-cased character 3-grams).

    Returns None for text shorter than 100 characters (too short to be
    meaningful), if text is primarily structured data (wallets, URLs, CVEs),
    if it contains no word tokens, or if anything unexpected goes wrong.
    Never raises.
    """
    try:
        if not text or len(text) < 100:
            return None

        if not _is_natural_language(text):
            return None

        words = _get_words(text)
        if not words:
            return None

        # Both are guaranteed non-zero past the guards above.
        total_words = len(words)
        text_length = len(text)

        # avg_word_length — measured over purely alphabetic ASCII tokens
        alpha_words = re.findall(r"\b[a-zA-Z]+\b", text)
        avg_word_length = (
            sum(map(len, alpha_words)) / len(alpha_words)
            if alpha_words
            else 0.0
        )

        # avg_sentence_length (words per sentence); fall back to treating
        # the whole text as one sentence if the splitter yields nothing.
        sentences = _split_sentences(text) or [text]
        num_sentences = len(sentences)
        avg_sentence_length = (
            sum(len(_get_words(s)) for s in sentences) / num_sentences
        )

        # One case-folded frequency table serves both the type-token ratio
        # and the function-word lookups, instead of rescanning the token
        # list once per function word.
        word_counts = Counter(w.lower() for w in words)

        # vocabulary_richness — type-token ratio
        vocabulary_richness = len(word_counts) / total_words

        # punctuation_density
        punct_count = sum(1 for c in text if c in string.punctuation)
        punctuation_density = punct_count / text_length

        # uppercase_ratio — uppercase letters as a share of all letters
        alpha_count = sum(1 for c in text if c.isalpha())
        upper_count = sum(1 for c in text if c.isalpha() and c.isupper())
        uppercase_ratio = upper_count / alpha_count if alpha_count else 0.0

        # digit_ratio
        digit_count = sum(1 for c in text if c.isdigit())
        digit_ratio = digit_count / text_length

        # function_word_freq — frequency of each of the 20 function words
        # (Counter yields 0 for words that never occur)
        function_word_freq: dict[str, float] = {
            fw: word_counts[fw] / total_words for fw in _FUNCTION_WORDS
        }

        # avg_paragraph_length — mean sentences per paragraph, counting
        # every paragraph as at least one sentence
        paragraphs = _split_paragraphs(text)
        if paragraphs:
            para_sent_counts = [
                max(len(_split_sentences(p)), 1) for p in paragraphs
            ]
            avg_paragraph_length = sum(para_sent_counts) / len(para_sent_counts)
        else:
            avg_paragraph_length = float(num_sentences)

        # exclamation_ratio and question_ratio — raw character counts per
        # sentence, so values above 1.0 are possible ("what?!?!")
        exclamation_ratio = text.count("!") / num_sentences
        question_ratio = text.count("?") / num_sentences

        # char_ngram_freq — top-50 character 3-grams
        text_lower = text.lower()
        ngram_counter = Counter(
            text_lower[i : i + 3] for i in range(len(text_lower) - 2)
        )
        total_ngrams = sum(ngram_counter.values())
        char_ngram_freq: dict[str, float] = {
            ngram: count / total_ngrams if total_ngrams else 0.0
            for ngram, count in ngram_counter.most_common(50)
        }

        return {
            "avg_word_length": float(avg_word_length),
            "avg_sentence_length": float(avg_sentence_length),
            "vocabulary_richness": float(vocabulary_richness),
            "punctuation_density": float(punctuation_density),
            "uppercase_ratio": float(uppercase_ratio),
            "digit_ratio": float(digit_ratio),
            "function_word_freq": function_word_freq,
            "avg_paragraph_length": float(avg_paragraph_length),
            "exclamation_ratio": float(exclamation_ratio),
            "question_ratio": float(question_ratio),
            "char_ngram_freq": char_ngram_freq,
        }

    except Exception:
        # Deliberate best-effort: feature extraction must never break the
        # caller, so any failure (e.g. non-str input) maps to "no vector".
        return None
168
+
169
+
170
+ # ---------------------------------------------------------------------------
171
+ # Vector alignment helpers
172
+ # ---------------------------------------------------------------------------
173
+
174
def _aligned_flatten(
    vector_a: dict, vector_b: dict
) -> tuple[list[float], list[float]]:
    """
    Turn two style vectors into equal-length, position-aligned float lists.

    Keys are visited in sorted order over the union of both vectors. A key
    whose value is a dict in at least one vector expands into one slot per
    subkey (sorted union of subkeys, 0.0 where a subkey is absent; a
    non-dict counterpart is treated as an empty dict). Any other key
    contributes a single slot, with missing or falsy values stored as 0.0.
    """

    def _scalar(value) -> float:
        # None, 0, "" and absent keys all collapse to 0.0.
        return float(value) if value else 0.0

    left: list[float] = []
    right: list[float] = []

    for key in sorted({*vector_a, *vector_b}):
        item_a = vector_a.get(key, 0.0)
        item_b = vector_b.get(key, 0.0)
        a_nested = isinstance(item_a, dict)
        b_nested = isinstance(item_b, dict)

        if not (a_nested or b_nested):
            left.append(_scalar(item_a))
            right.append(_scalar(item_b))
            continue

        sub_a = item_a if a_nested else {}
        sub_b = item_b if b_nested else {}
        for subkey in sorted({*sub_a, *sub_b}):
            left.append(float(sub_a.get(subkey, 0.0)))
            right.append(float(sub_b.get(subkey, 0.0)))

    return left, right
205
+
206
+
207
def _cosine_similarity(a: list[float], b: list[float]) -> float:
    """
    Return the cosine similarity of two equal-length float sequences.

    Returns 0.0 when either sequence is empty, the lengths differ, either
    vector has zero magnitude, or the result is not a finite number
    (which happens when an input contains NaN or infinity).
    """
    if not a or not b or len(a) != len(b):
        return 0.0

    dot = sum(x * y for x, y in zip(a, b))
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(y * y for y in b))

    denominator = mag_a * mag_b
    if denominator == 0.0:
        return 0.0

    similarity = dot / denominator
    # NaN/inf components poison the arithmetic; report "no similarity"
    # rather than handing a non-finite score to callers that clamp it.
    if not math.isfinite(similarity):
        return 0.0
    return similarity
216
+
217
+
218
def compute_similarity(vector_a: dict, vector_b: dict) -> float:
    """
    Cosine similarity between two style vectors (0.0–1.0).

    Handles nested function_word_freq and char_ngram_freq dicts by
    flattening both vectors into aligned float arrays. Returns 0.0 if
    either vector is None, empty, not a dict, or malformed (including
    vectors whose values produce a NaN/infinite score). Never raises.

    NOTE(review): vectors from extract_style_vector mix per-word and
    per-sentence averages with ratios bounded by 1, and no scaling is
    applied here, so the larger-magnitude features likely dominate the
    score — confirm thresholds against real data.
    """
    try:
        if not isinstance(vector_a, dict) or not isinstance(vector_b, dict):
            return 0.0
        if not vector_a or not vector_b:
            return 0.0

        flat_a, flat_b = _aligned_flatten(vector_a, vector_b)
        raw = _cosine_similarity(flat_a, flat_b)

        # min(1.0, nan) evaluates to 1.0, so without this check a vector
        # containing NaN/inf would be clamped into a perfect match.
        if not math.isfinite(raw):
            return 0.0

        return float(max(0.0, min(1.0, raw)))
    except Exception:
        # Deliberate best-effort: any malformed input scores as "no match".
        return 0.0
236
+
237
+
238
def are_likely_same_author(
    vector_a: dict,
    vector_b: dict,
    threshold: float = 0.85,
) -> tuple[bool, float]:
    """
    Compare two style vectors and report whether they clear *threshold*.

    Returns a ``(is_match, similarity)`` pair, where ``similarity`` is the
    value of ``compute_similarity`` and ``is_match`` is True when that
    value is greater than or equal to *threshold*.

    The 0.85 default is intended to be conservative, so that only
    high-confidence matches are flagged.
    """
    similarity = compute_similarity(vector_a, vector_b)
    is_match = similarity >= threshold
    return is_match, similarity
graph/__init__.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ graph — Phase 3 graph relationship mapping module.
3
+
4
+ Exports all public symbols from the five sub-modules:
5
+ model — data definitions (node/edge types, dataclasses)
6
+ builder — graph construction from DB + manual mutation helpers
7
+ queries — pure query functions over a NetworkX graph
8
+ export — serialisation to GraphML, JSON, Gephi CSV
9
+ visualize — interactive HTML visualisation via pyvis
10
+ """
11
+
12
+ from graph.model import (
13
+ GraphEdge,
14
+ GraphNode,
15
+ EDGE_TYPES,
16
+ NODE_TYPES,
17
+ )
18
+
19
+ from graph.builder import (
20
+ add_entity_to_graph,
21
+ add_relationship,
22
+ build_graph_from_db,
23
+ infer_relationships,
24
+ )
25
+
26
+ from graph.queries import (
27
+ find_co_occurring_entities,
28
+ find_high_degree_nodes,
29
+ find_nodes_by_type,
30
+ get_actor_profile,
31
+ get_neighbors,
32
+ get_new_nodes_since,
33
+ get_shortest_path,
34
+ )
35
+
36
+ from graph.export import (
37
+ summary_stats,
38
+ to_graphml,
39
+ to_json,
40
+ )
41
+
42
+ from graph.visualize import (
43
+ build_pyvis_network,
44
+ get_html_string,
45
+ )
46
+
47
+ __all__ = [
48
+ # model
49
+ "GraphNode",
50
+ "GraphEdge",
51
+ "NODE_TYPES",
52
+ "EDGE_TYPES",
53
+ # builder
54
+ "build_graph_from_db",
55
+ "add_entity_to_graph",
56
+ "add_relationship",
57
+ "infer_relationships",
58
+ # queries
59
+ "get_neighbors",
60
+ "find_nodes_by_type",
61
+ "find_co_occurring_entities",
62
+ "get_new_nodes_since",
63
+ "find_high_degree_nodes",
64
+ "get_shortest_path",
65
+ "get_actor_profile",
66
+ # export
67
+ "to_json",
68
+ "to_graphml",
69
+ "summary_stats",
70
+ # visualize
71
+ "build_pyvis_network",
72
+ "get_html_string",
73
+ ]