voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
extractor/normalizer.py
ADDED
|
@@ -0,0 +1,638 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor/normalizer.py — Entity deduplication and canonical record merging.
|
|
3
|
+
|
|
4
|
+
The same wallet address may appear in 50 pages; it gets one NormalizedEntity
|
|
5
|
+
per call to normalize_entities() (deduped by canonical value within that call).
|
|
6
|
+
merge_with_db() upserts records to the DB and returns the assigned IDs.
|
|
7
|
+
|
|
8
|
+
Public interface
|
|
9
|
+
----------------
|
|
10
|
+
normalize_entities(raw_entities, page_url, page_id) → list[NormalizedEntity]
|
|
11
|
+
merge_with_db(entities, investigation_id) → list (DB IDs / empty)
|
|
12
|
+
resolve_entity_type_conflicts(entities) → list (deduped by canonical value)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import unicodedata
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from typing import Optional, List
|
|
25
|
+
import uuid
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Canonical type priority for conflict resolution
|
|
31
|
+
# Lower number = higher specificity, wins in conflicts
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
# Specificity of each entity type; the lowest number wins when one value is
# tagged with several types. Both spellings used across this module are listed
# (e.g. "CVE" and "CVE_NUMBER", "ETH_ADDRESS" and "ETHEREUM_ADDRESS") so that
# the long-form names do not silently fall through to DEFAULT_PRIORITY.
TYPE_PRIORITY = {
    "CVE": 1,
    "CVE_NUMBER": 1,
    "MITRE_TECHNIQUE": 1,
    "FILE_HASH_SHA256": 1,
    "FILE_HASH_SHA1": 1,
    "FILE_HASH_MD5": 1,
    "IP_ADDRESS": 1,
    "ONION_URL": 1,
    "BITCOIN_ADDRESS": 2,
    "MONERO_ADDRESS": 2,
    "ETH_ADDRESS": 2,
    "ETHEREUM_ADDRESS": 2,
    "RANSOMWARE_GROUP": 3,
    "THREAT_ACTOR": 3,
    "THREAT_ACTOR_HANDLE": 3,
    "MALWARE_FAMILY": 3,
    "EMAIL_ADDRESS": 4,
    "PGP_KEY_BLOCK": 4,
    "DOMAIN": 5,
    "ORGANIZATION_NAME": 6,
    "PERSON_NAME": 6,
    "LOCATION": 7,
}
# Priority assigned to any type that is not listed in TYPE_PRIORITY.
DEFAULT_PRIORITY = 99

# Tiebreak order when types have equal priority (earlier entry wins).
# Aliases sit directly after the name they mirror, so the relative order of
# the pre-existing names is unchanged.
TIEBREAK_ORDER = [
    "RANSOMWARE_GROUP",
    "THREAT_ACTOR",
    "THREAT_ACTOR_HANDLE",
    "MALWARE_FAMILY",
    "FILE_HASH_SHA256",
    "FILE_HASH_SHA1",
    "FILE_HASH_MD5",
    "CVE",
    "CVE_NUMBER",
    "MITRE_TECHNIQUE",
    "IP_ADDRESS",
    "ONION_URL",
    "EMAIL_ADDRESS",
    "PGP_KEY_BLOCK",
    "BITCOIN_ADDRESS",
    "MONERO_ADDRESS",
    "ETH_ADDRESS",
    "ETHEREUM_ADDRESS",
    "DOMAIN",
    "ORGANIZATION_NAME",
    "PERSON_NAME",
    "LOCATION",
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _get_priority(entity_type: str) -> int:
    """Look up the specificity rank of *entity_type*; unlisted types get DEFAULT_PRIORITY."""
    try:
        return TYPE_PRIORITY[entity_type]
    except KeyError:
        return DEFAULT_PRIORITY
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _get_tiebreak_rank(entity_type: str) -> int:
    """Return the position of *entity_type* in TIEBREAK_ORDER.

    Types that are not listed rank after every listed type.
    """
    if entity_type in TIEBREAK_ORDER:
        return TIEBREAK_ORDER.index(entity_type)
    return len(TIEBREAK_ORDER)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def resolve_entity_type_conflicts(entities: list) -> list:
    """
    Collapse entities that share a value down to a single, most specific type.

    Entities are grouped by their lowercased ``value``. Inside a group the
    highest-confidence entity is kept for each type (the first one seen wins a
    confidence tie). If more than one type is left, the type with the lowest
    ``(_get_priority, _get_tiebreak_rank)`` pair wins.

    Args:
        entities: objects exposing ``value``, ``entity_type`` and ``confidence``.

    Returns:
        One entity per distinct lowercased value, in first-seen order.
    """
    groups: dict[str, list] = {}
    for entity in entities:
        groups.setdefault(entity.value.lower(), []).append(entity)

    resolved = []
    for group in groups.values():
        # Keep the most confident candidate for every type in this group.
        best_by_type: dict = {}
        for entity in group:
            current = best_by_type.get(entity.entity_type)
            if current is None or entity.confidence > current.confidence:
                best_by_type[entity.entity_type] = entity

        if len(best_by_type) == 1:
            resolved.extend(best_by_type.values())
            continue

        # sorted() is stable, so types with identical keys keep first-seen order.
        ranked_types = sorted(best_by_type, key=_type_specificity_key)
        winner_type = ranked_types[0]
        winner = best_by_type[winner_type]

        # Lazy %-formatting: the message is only built when DEBUG is enabled.
        logger.debug(
            "Type conflict: '%s' resolved from %s to %s",
            winner.value,
            ranked_types,
            winner_type,
        )
        resolved.append(winner)

    return resolved


def _type_specificity_key(entity_type: str) -> tuple:
    """Sort key for conflict resolution: priority first, tiebreak rank second."""
    return (_get_priority(entity_type), _get_tiebreak_rank(entity_type))
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _validate_hash_length(entity_type: str, value: str) -> bool:
    """Check that *value* has the length and hex alphabet required by its hash type.

    Entity types other than the three FILE_HASH_* types are always accepted.
    """
    expected = {
        "FILE_HASH_MD5": (32, r"[0-9a-fA-F]{32}"),
        "FILE_HASH_SHA1": (40, r"[0-9a-fA-F]{40}"),
        "FILE_HASH_SHA256": (64, r"[0-9a-fA-F]{64}"),
    }.get(entity_type)
    if expected is None:
        return True

    length, hex_pattern = expected
    if len(value) != length:
        return False
    return re.fullmatch(hex_pattern, value) is not None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Compiled once at import time. An onion host label is base32 (a-z, 2-7) and is
# either 16 characters (v2 services) or 56 characters (v3 services); lengths in
# between are not valid addresses. An optional http(s) scheme and an optional
# path are allowed.
_ONION_URL_RE = re.compile(
    r"^(https?://)?(?:[a-z2-7]{56}|[a-z2-7]{16})\.onion(/.*)?$"
)


def _validate_onion_url(value: str) -> bool:
    """Return True only if *value* is a well-formed .onion address.

    The check is case-insensitive and ignores surrounding whitespace. The host
    must be a bare 16- or 56-character base32 label directly followed by
    ``.onion``; subdomains and explicit ports are rejected.
    """
    candidate = value.lower().strip()
    return _ONION_URL_RE.match(candidate) is not None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# Confidence scores by extraction source (inferred from entity_type)
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
# Entity types treated as regex-extracted: _confidence_for() gives them 1.0,
# _extraction_method_for() labels them "regex", and normalize_entities() skips
# the noise blocklist for them.
_REGEX_TYPES: frozenset[str] = frozenset({
    "BITCOIN_ADDRESS",
    "ETHEREUM_ADDRESS",
    "MONERO_ADDRESS",
    "ONION_URL",
    "EMAIL_ADDRESS",
    "PGP_KEY_BLOCK",
    "CVE_NUMBER",
    "FILE_HASH_MD5",
    "FILE_HASH_SHA1",
    "FILE_HASH_SHA256",
    "IP_ADDRESS",
    "PHONE_NUMBER",
    "PASTE_URL",
    "MITRE_TECHNIQUE",
})

# Entity types treated as NER-extracted: confidence 0.85, method "NER".
# Any type in neither set is scored 0.75 and labelled "LLM".
_NER_TYPES: frozenset[str] = frozenset({
    "THREAT_ACTOR_HANDLE",
    "MALWARE_FAMILY",
    "RANSOMWARE_GROUP",
    "ORGANIZATION_NAME",
})
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _confidence_for(entity_type: str) -> float:
    """Return the confidence score implied by the extraction source of *entity_type*."""
    for type_group, score in ((_REGEX_TYPES, 1.0), (_NER_TYPES, 0.85)):
        if entity_type in type_group:
            return score
    # Neither regex nor NER: scored as the least certain source.
    return 0.75
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _extraction_method_for(entity_type: str) -> str:
    """Return the extraction-method label ("regex", "NER" or "LLM") for *entity_type*."""
    if entity_type in _REGEX_TYPES:
        method = "regex"
    elif entity_type in _NER_TYPES:
        method = "NER"
    else:
        method = "LLM"
    return method
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _context_snippet(page_text: str, needle: str, max_len: int = 2000) -> str:
    """Cut a window of at most *max_len* characters out of *page_text* around *needle*.

    The needle is searched case-sensitively first and case-insensitively as a
    fallback. Returns "" when either argument is empty, when the needle is not
    found, or when anything goes wrong.
    """
    try:
        if not (page_text and needle):
            return ""

        position = page_text.find(needle)
        if position < 0:
            # Exact match failed; retry ignoring case before giving up.
            position = page_text.lower().find(needle.lower())
            if position < 0:
                return ""

        # The window starts half a window before the match, clamped to the text.
        window_start = max(0, position - max_len // 2)
        window_end = min(len(page_text), window_start + max_len)
        return page_text[window_start:window_end].strip()
    except Exception:
        return ""
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ---------------------------------------------------------------------------
|
|
228
|
+
# Blocklist (NER / LLM only — regex types bypass; see normalize_entities)
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
|
|
231
|
+
# Lowercase terms that is_blocked_entity() rejects for every entity type.
# A few entries (e.g. "interface", "system", "network", "service", "vendor")
# are listed more than once; that is harmless because this is a set.
ENTITY_BLOCKLIST: frozenset[str] = frozenset({
    "bitcoin", "btc", "ethereum", "eth", "monero", "xmr", "litecoin", "ltc",
    "dogecoin", "doge", "dash", "zcash", "zec", "ripple", "xrp", "usdt",
    "tether", "usdc", "bnb", "solana", "sol",
    "darknet", "dark web", "darkweb", "deep web", "tor", "onion",
    "marketplace", "market", "shop", "store", "vendor",
    "interface", "server", "client", "host", "system", "network",
    "database", "application", "service", "api", "endpoint",
    "stop", "start", "end", "new", "old", "free", "paid", "pro", "basic",
    "admin", "user", "root", "guest", "test", "demo",
    "h4ck3r", "h4cker", "hax0r", "haxor", "1337", "leet", "elite",
    "noob", "n00b", "script", "scriptkiddie", "skid",
    "vproxy", "proxychains", "nmap", "metasploit", "burpsuite",
    "cobalt", "covenant", "empire", "mimikatz", "lazagne", "pypykatz",
    "identities", "identity", "workflows", "workflow", "process",
    "processes", "services", "service", "systems", "system",
    "network", "networks", "access", "accounts", "account",
    "platform", "platforms", "solution", "solutions",
    "interface", "interfaces", "backend", "frontend",
    "resources", "resource", "project", "projects",
    "community", "communities", "member", "members",
    "moderator", "administrator", "operator", "staff", "support",
    "customer", "vendor", "buyer", "seller", "trader",
    "dropper", "loader", "stager", "payload", "beacon",
})

# Lowercase names rejected only when the entity type is THREAT_ACTOR_HANDLE
# (see is_blocked_entity).
KNOWN_TOOLS: frozenset[str] = frozenset({
    "nmap", "metasploit", "cobaltstr", "cobaltstrike", "empire",
    "covenant", "brute", "hydra", "sqlmap", "nikto", "burp",
    "wireshark", "tcpdump", "netcat", "nc", "vproxy", "proxifier",
    "tor", "torbrowser", "onionbrowser", "i2p", "freenet",
    "kali", "parrot", "blackarch", "backtrack",
})

# Matches generic "hacker"-style spellings such as "hack", "hacker" or
# "h4ck3r"; applied to lowercased THREAT_ACTOR_HANDLE values only.
LEET_GENERIC = re.compile(r"^h[4a][ck]+[3e]?r?$")


# Minimum value length per entity type; is_blocked_entity() rejects shorter
# values and falls back to a minimum of 3 for types not listed here.
ENTITY_MIN_LENGTH: dict[str, int] = {
    "THREAT_ACTOR_HANDLE": 4,
    "MALWARE_FAMILY": 3,
    "RANSOMWARE_GROUP": 4,
    "ORGANIZATION_NAME": 4,
    "BITCOIN_ADDRESS": 10,
    "ETHEREUM_ADDRESS": 10,
    "MONERO_ADDRESS": 10,
    "ONION_URL": 16,
    "EMAIL_ADDRESS": 6,
    "CVE_NUMBER": 9,
    "IP_ADDRESS": 7,
    "PGP_KEY_BLOCK": 8,
    "PASTE_URL": 10,
}
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def normalize_wallet_value(value: str) -> str:
    """Trim a wallet address and, if it starts with "0x", fold it to lowercase.

    Addresses without the lowercase "0x" prefix keep their original casing.
    """
    trimmed = value.strip()
    return trimmed.lower() if trimmed.startswith("0x") else trimmed
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def is_blocked_entity(entity_type: str, entity_value: str) -> bool:
    """
    Decide whether an NER/LLM entity is noise that should be dropped.

    A value is blocked when, after lowercasing and trimming, it is a blocklisted
    term, a known tool name or generic leet-speak handle (THREAT_ACTOR_HANDLE
    only), shorter than the minimum length for its type, or purely numeric once
    dots and commas are removed. Regex-extracted entities must not use this —
    their patterns are precise.
    """
    needle = entity_value.lower().strip()

    if needle in ENTITY_BLOCKLIST:
        return True

    is_handle = entity_type == "THREAT_ACTOR_HANDLE"
    if is_handle and (needle in KNOWN_TOOLS or LEET_GENERIC.match(needle)):
        return True

    if len(needle) < ENTITY_MIN_LENGTH.get(entity_type, 3):
        return True

    # Values such as "1.000,50" carry no entity information.
    digits_only = needle.replace(".", "").replace(",", "")
    return digits_only.isnumeric()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
# NormalizedEntity dataclass
|
|
322
|
+
# ---------------------------------------------------------------------------
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
@dataclass
class NormalizedEntity:
    """A single deduplicated entity as produced by normalize_entities()."""

    entity_type: str
    value: str  # normalized value returned by _normalize_value()
    confidence: float
    source_url: str
    page_id: Optional[uuid.UUID]
    context_snippet: str = ""  # surrounding page text, "" when unavailable
    extraction_method: str = ""  # "regex", "NER" or "LLM" when set
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ---------------------------------------------------------------------------
|
|
337
|
+
# Normalization rules per entity type
|
|
338
|
+
# ---------------------------------------------------------------------------
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _normalize_value(entity_type: str, value: str) -> str:
    """
    Return the normalized form of *value* for a given *entity_type*.

    Surrounding whitespace is removed for every type, so padded and unpadded
    spellings of the same value normalize identically. Per type:

    - BITCOIN_ADDRESS: "bc1"-prefixed addresses are lowercased, others keep case.
    - ETHEREUM_ADDRESS: delegated to ``_eth_checksum``.
    - EMAIL_ADDRESS and FILE_HASH_*: lowercased.
    - CVE_NUMBER and MITRE_TECHNIQUE: uppercased.
    - ONION_URL: ``crawler.utils.normalize_url`` when it can be imported and
      succeeds, otherwise lowercased.
    - anything else: internal whitespace runs collapsed to one space.

    On any error the value is returned stripped of leading/trailing whitespace.
    """
    try:
        cleaned = value.strip()

        if entity_type == "BITCOIN_ADDRESS":
            return cleaned.lower() if cleaned.lower().startswith("bc1") else cleaned

        if entity_type == "ETHEREUM_ADDRESS":
            return _eth_checksum(cleaned)

        if entity_type == "EMAIL_ADDRESS":
            return cleaned.lower()

        if entity_type in ("CVE_NUMBER", "MITRE_TECHNIQUE"):
            return cleaned.upper()

        if entity_type in ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256"):
            return cleaned.lower()

        if entity_type == "ONION_URL":
            try:
                # Imported lazily; fall back to lowercasing if the import or
                # the call fails.
                from crawler.utils import normalize_url

                return normalize_url(cleaned)
            except Exception:
                return cleaned.lower()

        return re.sub(r"\s+", " ", cleaned)

    except Exception:
        return value.strip()
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
# ---------------------------------------------------------------------------
|
|
384
|
+
# Web3 availability check (import once at module load)
|
|
385
|
+
# ---------------------------------------------------------------------------
|
|
386
|
+
|
|
387
|
+
# Probe for a usable web3 install once, at import time. Besides importing the
# package, one checksum conversion is attempted so that a failure in that call
# also marks web3 as unavailable. _eth_checksum() consults WEB3_AVAILABLE.
try:
    from web3 import Web3

    Web3.to_checksum_address("0x" + "0" * 40)
    WEB3_AVAILABLE = True
except Exception:
    WEB3_AVAILABLE = False
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _eth_checksum(addr: str) -> str:
    """
    Convert an Ethereum address to its checksummed form via web3.

    Returns "" for an empty address. The trimmed address is returned lowercased
    instead when it is not a 42-character "0x"-prefixed string, when web3 is
    unavailable, or when the conversion raises.
    """
    if not addr:
        return ""

    candidate = addr.strip()
    well_formed = candidate.startswith("0x") and len(candidate) == 42
    if not (well_formed and WEB3_AVAILABLE):
        return candidate.lower()

    try:
        from web3 import Web3

        return Web3.to_checksum_address(candidate)
    except Exception:
        # Covers invalid addresses (ValueError) and any other failure alike.
        return candidate.lower()
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def canonicalize_entity_value(entity_type: str, value: str) -> str:
    """
    Build the deduplication key for an entity value.

    The result is meant for comparison only — NOT for display; callers keep the
    original casing/formatting separately. Rules depend on *entity_type*:
    actor/malware/forum names are reduced to lowercase ASCII word characters,
    "0x" wallets and Monero-shaped addresses are lowercased, CVE ids are
    uppercased with whitespace runs turned into "-", hashes and e-mails are
    lowercased, MITRE ids uppercased, onion URLs lowercased with the trailing
    "/" dropped and https rewritten to http, PGP keys hashed with SHA-256, and
    every other type lowercased and cut to 1024 characters.
    """
    if not value:
        return (value or "").lower().strip()

    text = value.strip()

    if entity_type in (
        "THREAT_ACTOR", "MALWARE", "FORUM",
        "THREAT_ACTOR_HANDLE", "MALWARE_FAMILY", "RANSOMWARE_GROUP",
    ):
        # Strip accents, then every separator and non-word character.
        decomposed = unicodedata.normalize("NFKD", text)
        ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
        collapsed = re.sub(r"[\s\-_\.]", "", ascii_only.lower())
        return re.sub(r"[^\w]", "", collapsed)

    if entity_type in ("WALLET", "BITCOIN_ADDRESS", "ETHEREUM_ADDRESS", "MONERO_ADDRESS"):
        hex_prefixed = text.startswith("0x")
        monero_shaped = text.startswith("4") and len(text) in (95, 106)
        return text.lower() if (hex_prefixed or monero_shaped) else text.strip()

    if entity_type in ("CVE", "CVE_NUMBER"):
        return re.sub(r"\s+", "-", text.upper().strip())

    if entity_type in ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256"):
        return text.lower()

    if entity_type == "MITRE_TECHNIQUE":
        return text.upper().strip()

    if entity_type in ("EMAIL", "EMAIL_ADDRESS"):
        return text.lower().strip()

    if entity_type == "ONION_URL":
        without_slash = text.lower().rstrip("/")
        return re.sub(r"^https://", "http://", without_slash)

    if entity_type in ("PGP_KEY", "PGP_KEY_BLOCK"):
        compact = re.sub(r"\s+", "", text).upper()
        return "pgp:" + hashlib.sha256(compact.encode()).hexdigest()

    # Unknown types: case-fold only, and bound the key length.
    return text.lower().strip()[:1024]
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def are_same_entity(type_a: str, value_a: str, type_b: str, value_b: str) -> bool:
    """Tell whether two (type, value) pairs denote the same entity.

    They match only when the types are equal and the canonical forms of both
    values are equal.
    """
    if type_a == type_b:
        canonical_a = canonicalize_entity_value(type_a, value_a)
        canonical_b = canonicalize_entity_value(type_b, value_b)
        return canonical_a == canonical_b
    return False
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
# ---------------------------------------------------------------------------
|
|
484
|
+
# Public interface
|
|
485
|
+
# ---------------------------------------------------------------------------
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def normalize_entities(
    raw_entities: dict[str, list[str]],
    page_url: str,
    page_id: Optional[uuid.UUID] = None,
    page_text: Optional[str] = None,
) -> list[NormalizedEntity]:
    """
    Convert raw extraction results into deduplicated NormalizedEntity records.

    For every ``entity_type -> values`` pair, each value is trimmed, validated
    (hash length / onion format), normalized with ``_normalize_value`` and, for
    types outside ``_REGEX_TYPES``, checked against the noise blocklist. Only
    the first occurrence of each ``(entity_type, normalized value)`` is kept.

    Args:
        raw_entities: mapping of entity type to the raw strings extracted.
        page_url: stored as ``source_url`` on every record.
        page_id: stored as ``page_id`` on every record.
        page_text: optional page body used to build ``context_snippet``.

    Returns:
        The surviving entities in input order.
    """
    seen_keys: set[str] = set()
    result: list[NormalizedEntity] = []
    filtered = {"tool": 0, "generic": 0, "leet": 0, "noise": 0}
    hash_types = ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256")

    for entity_type, values in raw_entities.items():
        confidence = _confidence_for(entity_type)
        method = _extraction_method_for(entity_type)

        for raw_value in values:
            if not raw_value or not raw_value.strip():
                continue

            # Validate the trimmed text so padding around an otherwise valid
            # value does not get it rejected.
            candidate = raw_value.strip()

            if entity_type in hash_types and not _validate_hash_length(entity_type, candidate):
                logger.debug(
                    "Hash length validation failed for %s=%s", entity_type, candidate
                )
                continue

            if entity_type == "ONION_URL" and not _validate_onion_url(candidate):
                logger.debug("ONION_URL discarded (not a valid onion address): %r", candidate)
                continue

            canonical = _normalize_value(entity_type, candidate)
            if not canonical:
                continue

            # Regex-extracted types are precise and bypass the blocklist.
            if entity_type not in _REGEX_TYPES and is_blocked_entity(entity_type, canonical):
                filtered[_blocked_category(entity_type, canonical.lower())] += 1
                logger.debug("Filtered blocked entity: %s=%s", entity_type, canonical)
                continue

            dedup_key = f"{entity_type}::{canonical}"
            if dedup_key in seen_keys:
                continue
            seen_keys.add(dedup_key)

            snip = ""
            if page_text:
                # Normalization may have altered the text (e.g. collapsed
                # whitespace), so fall back to the spelling found on the page.
                snip = _context_snippet(page_text, canonical) or _context_snippet(
                    page_text, candidate
                )

            result.append(
                NormalizedEntity(
                    entity_type=entity_type,
                    value=canonical,
                    confidence=confidence,
                    source_url=page_url,
                    page_id=page_id,
                    context_snippet=snip,
                    extraction_method=method,
                )
            )

    total_filtered = sum(filtered.values())
    if total_filtered:
        logger.warning(
            "Entity blocklist filtered %d entities "
            "(tool_names=%d, generic_terms=%d, "
            "leet_generic=%d, NER/LLM noise=%d)",
            total_filtered,
            filtered["tool"],
            filtered["generic"],
            filtered["leet"],
            filtered["noise"],
        )

    return result


def _blocked_category(entity_type: str, value_lower: str) -> str:
    """Name the statistics bucket for a value rejected by is_blocked_entity()."""
    if entity_type == "THREAT_ACTOR_HANDLE":
        if value_lower in KNOWN_TOOLS:
            return "tool"
        if LEET_GENERIC.match(value_lower):
            return "leet"
    if value_lower in ENTITY_BLOCKLIST:
        return "generic"
    return "noise"
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def merge_with_db(
    entities: list[NormalizedEntity],
    investigation_id: Optional[uuid.UUID] = None,
) -> list:
    """
    Store *entities* through ``upsert_entity_canonical`` and collect their IDs.

    Returns ``[]`` without touching the database when ``DATABASE_URL`` is unset
    or *entities* is empty, and also returns ``[]`` (after logging a warning)
    if any step raises. The page for each ``source_url`` is looked up, created
    if missing, and cached for the rest of the call.

    Returns:
        ``str(id)`` of the stored row for each input entity, in input order.
    """
    if not os.getenv("DATABASE_URL"):
        logger.warning(
            "DATABASE_URL not set — skipping DB persist (%d entities)", len(entities)
        )
        return []
    if not entities:
        return []

    ids: list = []
    new_count = dedup_count = 0

    try:
        from db.session import get_session
        from db.queries import upsert_entity_canonical, create_page, get_page_by_url

        with get_session() as session:
            pages_by_url: dict[str, object] = {}

            for record in entities:
                source = record.source_url
                if source not in pages_by_url:
                    # First sighting of this URL: reuse the stored page or create one.
                    found = get_page_by_url(session, source)
                    pages_by_url[source] = (
                        create_page(session, url=source) if found is None else found
                    )
                page = pages_by_url[source]

                db_entity, created = upsert_entity_canonical(
                    session=session,
                    investigation_id=investigation_id,
                    entity_type=record.entity_type,
                    entity_value=record.value,
                    confidence=record.confidence,
                    source_page_id=page.id,
                    context_snippet=record.context_snippet,
                    extraction_method=record.extraction_method or None,
                )

                if created:
                    new_count += 1
                else:
                    dedup_count += 1
                ids.append(str(db_entity.id))

            session.commit()
            if investigation_id:
                logger.warning(
                    f"[{investigation_id}] Entity dedup: {new_count} new, {dedup_count} merged with existing"
                )

    except Exception as exc:
        logger.warning("merge_with_db failed: %s", exc)
        return []

    return ids
|