voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
@@ -0,0 +1,638 @@
1
+ """
2
+ extractor/normalizer.py — Entity deduplication and canonical record merging.
3
+
4
+ The same wallet address may appear in 50 pages; it gets one NormalizedEntity
5
+ per call to normalize_entities() (deduped by canonical value within that call).
6
+ merge_with_db() upserts records to the DB and returns the assigned IDs.
7
+
8
+ Public interface
9
+ ----------------
10
+ normalize_entities(raw_entities, page_url, page_id) → list[NormalizedEntity]
11
+ merge_with_db(entities, investigation_id) → list (DB IDs / empty)
12
+ resolve_entity_type_conflicts(entities) → list (deduped by canonical value)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import logging
19
+ import os
20
+ import re
21
+ import unicodedata
22
+ from dataclasses import dataclass, field
23
+ from datetime import datetime, timezone
24
+ from typing import Optional, List
25
+ import uuid
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Canonical type priority for conflict resolution
31
+ # Lower number = higher specificity, wins in conflicts
32
+ # ---------------------------------------------------------------------------
33
+
34
# Both naming schemes that appear in this module are listed: the short names
# ("CVE", "ETH_ADDRESS", "THREAT_ACTOR") and the long names used by the
# regex/NER type tables ("CVE_NUMBER", "ETHEREUM_ADDRESS",
# "THREAT_ACTOR_HANDLE"). Without the long names those types would fall back
# to DEFAULT_PRIORITY and lose every conflict.
TYPE_PRIORITY = {
    "CVE": 1,
    "CVE_NUMBER": 1,
    "MITRE_TECHNIQUE": 1,
    "FILE_HASH_SHA256": 1,
    "FILE_HASH_SHA1": 1,
    "FILE_HASH_MD5": 1,
    "IP_ADDRESS": 1,
    "ONION_URL": 1,
    "BITCOIN_ADDRESS": 2,
    "MONERO_ADDRESS": 2,
    "ETH_ADDRESS": 2,
    "ETHEREUM_ADDRESS": 2,
    "RANSOMWARE_GROUP": 3,
    "THREAT_ACTOR": 3,
    "THREAT_ACTOR_HANDLE": 3,
    "MALWARE_FAMILY": 3,
    "EMAIL_ADDRESS": 4,
    "PGP_KEY_BLOCK": 4,
    "DOMAIN": 5,
    "ORGANIZATION_NAME": 6,
    "PERSON_NAME": 6,
    "LOCATION": 7,
}
# Priority assigned to any entity type that is not listed above.
DEFAULT_PRIORITY = 99

# Tiebreak order when types have equal priority (earlier entry wins).
# Each long-name alias sits directly after its short-name counterpart so the
# relative order of the pre-existing entries is unchanged.
TIEBREAK_ORDER = [
    "RANSOMWARE_GROUP",
    "THREAT_ACTOR",
    "THREAT_ACTOR_HANDLE",
    "MALWARE_FAMILY",
    "FILE_HASH_SHA256",
    "FILE_HASH_SHA1",
    "FILE_HASH_MD5",
    "CVE",
    "CVE_NUMBER",
    "MITRE_TECHNIQUE",
    "IP_ADDRESS",
    "ONION_URL",
    "EMAIL_ADDRESS",
    "PGP_KEY_BLOCK",
    "BITCOIN_ADDRESS",
    "MONERO_ADDRESS",
    "ETH_ADDRESS",
    "ETHEREUM_ADDRESS",
    "DOMAIN",
    "ORGANIZATION_NAME",
    "PERSON_NAME",
    "LOCATION",
]
79
+
80
+
81
def _get_priority(entity_type: str) -> int:
    """Return the TYPE_PRIORITY rank of *entity_type* (lower wins conflicts).

    Types missing from TYPE_PRIORITY get DEFAULT_PRIORITY.
    """
    if entity_type in TYPE_PRIORITY:
        return TYPE_PRIORITY[entity_type]
    return DEFAULT_PRIORITY
83
+
84
+
85
def _get_tiebreak_rank(entity_type: str) -> int:
    """Return the position of *entity_type* in TIEBREAK_ORDER.

    Types that are not listed rank after every listed type, i.e. they get
    ``len(TIEBREAK_ORDER)``.
    """
    if entity_type not in TIEBREAK_ORDER:
        return len(TIEBREAK_ORDER)
    return TIEBREAK_ORDER.index(entity_type)
90
+
91
+
92
def resolve_entity_type_conflicts(entities: list) -> list:
    """
    Collapse entities that share a value down to one entity per value.

    Entities are grouped by their lower-cased ``value``. Within a group:
      - duplicates of the same type keep the highest-confidence entity
        (the first one seen wins a confidence tie);
      - if several types remain, the type with the lowest TYPE_PRIORITY wins,
        and equal priorities are settled by TIEBREAK_ORDER.

    Returns the surviving entities, one per distinct lower-cased value, in
    first-seen order of those values.
    """

    def _specificity(type_name):
        return (_get_priority(type_name), _get_tiebreak_rank(type_name))

    # Bucket by case-insensitive value; dict order keeps first-seen ordering.
    buckets: dict[str, list] = {}
    for item in entities:
        buckets.setdefault(item.value.lower(), []).append(item)

    survivors = []
    for members in buckets.values():
        if len(members) == 1:
            survivors.append(members[0])
            continue

        # Best (highest-confidence) representative for each type in the bucket.
        best_per_type = {}
        for item in members:
            kind = item.entity_type
            if kind not in best_per_type or item.confidence > best_per_type[kind].confidence:
                best_per_type[kind] = item

        conflicting_types = list(best_per_type)
        if len(conflicting_types) == 1:
            survivors.append(best_per_type[conflicting_types[0]])
            continue

        # Most specific type first; the sorted list is also what gets logged.
        conflicting_types = sorted(conflicting_types, key=_specificity)
        winner_type = conflicting_types[0]
        winner = best_per_type[winner_type]

        logger.debug(
            f"Type conflict: '{winner.value}' resolved from {conflicting_types} to {winner_type}"
        )
        survivors.append(winner)

    return survivors
142
+
143
+
144
+ def _validate_hash_length(entity_type: str, value: str) -> bool:
145
+ """Validate that a hash entity has the correct length for its type."""
146
+ if entity_type == "FILE_HASH_MD5":
147
+ return len(value) == 32 and re.fullmatch(r"[0-9a-fA-F]{32}", value) is not None
148
+ elif entity_type == "FILE_HASH_SHA1":
149
+ return len(value) == 40 and re.fullmatch(r"[0-9a-fA-F]{40}", value) is not None
150
+ elif entity_type == "FILE_HASH_SHA256":
151
+ return len(value) == 64 and re.fullmatch(r"[0-9a-fA-F]{64}", value) is not None
152
+ return True
153
+
154
+
155
+ def _validate_onion_url(value: str) -> bool:
156
+ """Return True only if value is a valid .onion address."""
157
+ value = value.lower().strip()
158
+ if not value.endswith(".onion") and ".onion/" not in value:
159
+ return False
160
+ _ONION_PATTERN = re.compile(r'^(https?://)?[a-z2-7]{16,56}\.onion(/.*)?$')
161
+ return bool(_ONION_PATTERN.match(value))
162
+
163
+
164
+ # ---------------------------------------------------------------------------
165
+ # Confidence scores by extraction source (inferred from entity_type)
166
+ # ---------------------------------------------------------------------------
167
+
168
+ _REGEX_TYPES: frozenset[str] = frozenset({
169
+ "BITCOIN_ADDRESS",
170
+ "ETHEREUM_ADDRESS",
171
+ "MONERO_ADDRESS",
172
+ "ONION_URL",
173
+ "EMAIL_ADDRESS",
174
+ "PGP_KEY_BLOCK",
175
+ "CVE_NUMBER",
176
+ "FILE_HASH_MD5",
177
+ "FILE_HASH_SHA1",
178
+ "FILE_HASH_SHA256",
179
+ "IP_ADDRESS",
180
+ "PHONE_NUMBER",
181
+ "PASTE_URL",
182
+ "MITRE_TECHNIQUE",
183
+ })
184
+
185
+ _NER_TYPES: frozenset[str] = frozenset({
186
+ "THREAT_ACTOR_HANDLE",
187
+ "MALWARE_FAMILY",
188
+ "RANSOMWARE_GROUP",
189
+ "ORGANIZATION_NAME",
190
+ })
191
+
192
+
193
def _confidence_for(entity_type: str) -> float:
    """Return the confidence score implied by *entity_type*.

    1.0 for types in _REGEX_TYPES, 0.85 for types in _NER_TYPES, and 0.75
    for everything else.
    """
    for known_types, score in ((_REGEX_TYPES, 1.0), (_NER_TYPES, 0.85)):
        if entity_type in known_types:
            return score
    return 0.75
199
+
200
+
201
def _extraction_method_for(entity_type: str) -> str:
    """Return the extraction-method label implied by *entity_type*.

    "regex" for types in _REGEX_TYPES, "NER" for types in _NER_TYPES, and
    "LLM" for everything else.
    """
    for known_types, label in ((_REGEX_TYPES, "regex"), (_NER_TYPES, "NER")):
        if entity_type in known_types:
            return label
    return "LLM"
207
+
208
+
209
+ def _context_snippet(page_text: str, needle: str, max_len: int = 2000) -> str:
210
+ """Return a window of *page_text* around *needle* for analyst / stylometry context."""
211
+ try:
212
+ if not page_text or not needle:
213
+ return ""
214
+ idx = page_text.find(needle)
215
+ if idx < 0:
216
+ idx = page_text.lower().find(needle.lower())
217
+ if idx < 0:
218
+ return ""
219
+ half = max_len // 2
220
+ start = max(0, idx - half)
221
+ end = min(len(page_text), start + max_len)
222
+ return page_text[start:end].strip()
223
+ except Exception:
224
+ return ""
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # Blocklist (NER / LLM only — regex types bypass; see normalize_entities)
229
+ # ---------------------------------------------------------------------------
230
+
231
# Lower-cased values that are always rejected as noise for NER/LLM entities.
# Each term is listed once.
ENTITY_BLOCKLIST: frozenset[str] = frozenset((
    # currency names and tickers
    "bitcoin", "btc", "ethereum", "eth", "monero", "xmr", "litecoin", "ltc",
    "dogecoin", "doge", "dash", "zcash", "zec", "ripple", "xrp", "usdt",
    "tether", "usdc", "bnb", "solana", "sol",
    # darknet / market vocabulary
    "darknet", "dark web", "darkweb", "deep web", "tor", "onion",
    "marketplace", "market", "shop", "store", "vendor",
    # generic IT nouns
    "interface", "server", "client", "host", "system", "network",
    "database", "application", "service", "api", "endpoint",
    # generic adjectives / verbs / plan names
    "stop", "start", "end", "new", "old", "free", "paid", "pro", "basic",
    # generic account names
    "admin", "user", "root", "guest", "test", "demo",
    # generic hacker slang
    "h4ck3r", "h4cker", "hax0r", "haxor", "1337", "leet", "elite",
    "noob", "n00b", "script", "scriptkiddie", "skid",
    # tool names
    "vproxy", "proxychains", "nmap", "metasploit", "burpsuite",
    "cobalt", "covenant", "empire", "mimikatz", "lazagne", "pypykatz",
    # business / platform vocabulary
    "identities", "identity", "workflows", "workflow", "process",
    "processes", "services", "systems",
    "networks", "access", "accounts", "account",
    "platform", "platforms", "solution", "solutions",
    "interfaces", "backend", "frontend",
    "resources", "resource", "project", "projects",
    "community", "communities", "member", "members",
    # roles
    "moderator", "administrator", "operator", "staff", "support",
    "customer", "buyer", "seller", "trader",
    # malware component nouns
    "dropper", "loader", "stager", "payload", "beacon",
))

# Lower-cased tool names that must not be accepted as THREAT_ACTOR_HANDLE values.
KNOWN_TOOLS: frozenset[str] = frozenset((
    "nmap", "metasploit", "cobaltstr", "cobaltstrike", "empire",
    "covenant", "brute", "hydra", "sqlmap", "nikto", "burp",
    "wireshark", "tcpdump", "netcat", "nc", "vproxy", "proxifier",
    "tor", "torbrowser", "onionbrowser", "i2p", "freenet",
    "kali", "parrot", "blackarch", "backtrack",
))

# Matches generic "hacker"-style leetspeak handles such as "h4ck3r".
LEET_GENERIC = re.compile(r"^h[4a][ck]+[3e]?r?$")


# Minimum accepted value length per entity type; types not listed here use
# the caller's default.
ENTITY_MIN_LENGTH: dict[str, int] = {
    # NER types
    "THREAT_ACTOR_HANDLE": 4,
    "MALWARE_FAMILY": 3,
    "RANSOMWARE_GROUP": 4,
    "ORGANIZATION_NAME": 4,
    # wallets
    "BITCOIN_ADDRESS": 10,
    "ETHEREUM_ADDRESS": 10,
    "MONERO_ADDRESS": 10,
    # other indicators
    "ONION_URL": 16,
    "EMAIL_ADDRESS": 6,
    "CVE_NUMBER": 9,
    "IP_ADDRESS": 7,
    "PGP_KEY_BLOCK": 8,
    "PASTE_URL": 10,
}
283
+
284
+
285
def normalize_wallet_value(value: str) -> str:
    """Strip a wallet address and lower-case it when it starts with "0x".

    Addresses without the "0x" prefix are returned stripped but otherwise
    untouched.
    """
    trimmed = value.strip()
    return trimmed.lower() if trimmed.startswith("0x") else trimmed
291
+
292
+
293
def is_blocked_entity(entity_type: str, entity_value: str) -> bool:
    """
    Decide whether an NER/LLM entity is noise and should be dropped.

    The value is lower-cased and stripped, then rejected when any of these hold:
      - it is in ENTITY_BLOCKLIST;
      - the type is THREAT_ACTOR_HANDLE and it is a known tool name or
        matches the generic leetspeak pattern;
      - it is shorter than the minimum length for its type (default 3);
      - it is purely numeric once "." and "," are removed.

    Not meant for regex-extracted entities, whose patterns are already precise.
    """
    needle = entity_value.lower().strip()

    if needle in ENTITY_BLOCKLIST:
        return True

    if entity_type == "THREAT_ACTOR_HANDLE" and (
        needle in KNOWN_TOOLS or LEET_GENERIC.match(needle)
    ):
        return True

    if len(needle) < ENTITY_MIN_LENGTH.get(entity_type, 3):
        return True

    # Bare numbers such as "1,000" or "3.14" are never meaningful entities.
    digits_only = needle.replace(".", "").replace(",", "")
    return digits_only.isnumeric()
318
+
319
+
320
+ # ---------------------------------------------------------------------------
321
+ # NormalizedEntity dataclass
322
+ # ---------------------------------------------------------------------------
323
+
324
+
325
@dataclass
class NormalizedEntity:
    """A single normalized entity observation taken from one page."""

    # Entity type label, e.g. "EMAIL_ADDRESS".
    entity_type: str
    # Normalized value for this type.
    value: str
    # Confidence score attached to the observation.
    confidence: float
    # URL of the page the entity was extracted from.
    source_url: str
    # Identifier of that page, when one is known.
    page_id: Optional[uuid.UUID]
    # Surrounding page text; empty when no page text was supplied.
    context_snippet: str = ""
    # Extraction-method label; empty when unknown.
    extraction_method: str = ""
334
+
335
+
336
+ # ---------------------------------------------------------------------------
337
+ # Normalization rules per entity type
338
+ # ---------------------------------------------------------------------------
339
+
340
+
341
+ def _normalize_value(entity_type: str, value: str) -> str:
342
+ """
343
+ Return the canonical form of *value* for a given *entity_type*.
344
+ Never raises — on any error returns the value stripped of leading/trailing
345
+ whitespace.
346
+ """
347
+ try:
348
+ if entity_type == "BITCOIN_ADDRESS":
349
+ if value.lower().startswith("bc1"):
350
+ return value.lower()
351
+ return value
352
+
353
+ if entity_type == "ETHEREUM_ADDRESS":
354
+ return _eth_checksum(value)
355
+
356
+ if entity_type == "EMAIL_ADDRESS":
357
+ return value.lower()
358
+
359
+ if entity_type == "CVE_NUMBER":
360
+ return value.upper()
361
+
362
+ if entity_type == "MITRE_TECHNIQUE":
363
+ return value.upper()
364
+
365
+ if entity_type in ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256"):
366
+ return value.lower()
367
+
368
+ if entity_type == "ONION_URL":
369
+ try:
370
+ from crawler.utils import normalize_url
371
+ return normalize_url(value)
372
+ except Exception:
373
+ parsed_lower = value.lower()
374
+ return parsed_lower
375
+
376
+ stripped = value.strip()
377
+ return re.sub(r"\s+", " ", stripped)
378
+
379
+ except Exception:
380
+ return value.strip()
381
+
382
+
383
+ # ---------------------------------------------------------------------------
384
+ # Web3 availability check (import once at module load)
385
+ # ---------------------------------------------------------------------------
386
+
387
try:
    from web3 import Web3

    # Probe the exact call this module relies on: a web3 install that
    # imports but cannot checksum an address counts as unavailable.
    Web3.to_checksum_address("0x" + "0" * 40)
except Exception:
    WEB3_AVAILABLE = False
else:
    WEB3_AVAILABLE = True
394
+
395
+
396
+ def _eth_checksum(addr: str) -> str:
397
+ """
398
+ Apply EIP-55 mixed-case checksum encoding to an Ethereum address.
399
+ Falls back to lowercase if web3 is unavailable or checksum fails.
400
+ """
401
+ if not addr:
402
+ return ""
403
+
404
+ addr = addr.strip()
405
+ if not addr.startswith("0x") or len(addr) != 42:
406
+ return addr.lower()
407
+
408
+ if not WEB3_AVAILABLE:
409
+ return addr.lower()
410
+
411
+ try:
412
+ from web3 import Web3
413
+
414
+ return Web3.to_checksum_address(addr)
415
+ except ValueError:
416
+ return addr.lower()
417
+ except Exception:
418
+ return addr.lower()
419
+
420
+
421
def canonicalize_entity_value(entity_type: str, value: str) -> str:
    """
    Build the deduplication key for an entity value.

    The key is only used for comparing entities; the original value is what
    should be shown to users. Rules by type:
      - actor / malware / forum names: ASCII-folded, lower-cased, with every
        separator and non-word character removed;
      - wallets: lower-cased when they start with "0x", or start with "4" and
        are 95 or 106 characters long; otherwise left as-is;
      - CVE ids: upper-cased, whitespace runs turned into "-";
      - file hashes and emails: lower-cased;
      - MITRE techniques: upper-cased;
      - onion URLs: lower-cased, trailing "/" removed, https:// mapped to http://;
      - PGP keys: "pgp:" + SHA-256 of the whitespace-free, upper-cased text;
      - anything else: lower-cased and truncated to 1024 characters.

    A falsy *value* yields "".
    """
    if not value:
        return ""

    text = value.strip()

    if entity_type in (
        "THREAT_ACTOR", "MALWARE", "FORUM",
        "THREAT_ACTOR_HANDLE", "MALWARE_FAMILY", "RANSOMWARE_GROUP",
    ):
        # Drop accents and any other non-ASCII characters before comparing names.
        ascii_only = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
        without_separators = re.sub(r"[\s\-_\.]", "", ascii_only.lower())
        return re.sub(r"[^\w]", "", without_separators)

    if entity_type in ("WALLET", "BITCOIN_ADDRESS", "ETHEREUM_ADDRESS", "MONERO_ADDRESS"):
        hex_style = text.startswith("0x")
        long_four_style = text.startswith("4") and len(text) in (95, 106)
        if hex_style or long_four_style:
            return text.lower()
        return text.strip()

    if entity_type in ("CVE", "CVE_NUMBER"):
        return re.sub(r"\s+", "-", text.upper().strip())

    if entity_type in ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256"):
        return text.lower()

    if entity_type == "MITRE_TECHNIQUE":
        return text.upper().strip()

    if entity_type in ("EMAIL", "EMAIL_ADDRESS"):
        return text.lower().strip()

    if entity_type == "ONION_URL":
        return re.sub(r"^https://", "http://", text.lower().rstrip("/"))

    if entity_type in ("PGP_KEY", "PGP_KEY_BLOCK"):
        # Hash the key so the dedup key stays short regardless of key size.
        compact = re.sub(r"\s+", "", text).upper()
        return "pgp:" + hashlib.sha256(compact.encode()).hexdigest()

    return text.lower().strip()[:1024]
474
+
475
+
476
def are_same_entity(type_a: str, value_a: str, type_b: str, value_b: str) -> bool:
    """Report whether two (type, value) pairs denote the same entity.

    Entities of different types never match; entities of the same type match
    when their canonical dedup keys are equal.
    """
    if type_a != type_b:
        return False
    key_a = canonicalize_entity_value(type_a, value_a)
    key_b = canonicalize_entity_value(type_b, value_b)
    return key_a == key_b
481
+
482
+
483
+ # ---------------------------------------------------------------------------
484
+ # Public interface
485
+ # ---------------------------------------------------------------------------
486
+
487
+
488
def _blocked_reason(entity_type: str, value_lower: str) -> str:
    """Name the tally bucket ("tool", "leet", "generic" or "noise") for a blocked value."""
    if entity_type == "THREAT_ACTOR_HANDLE":
        if value_lower in KNOWN_TOOLS:
            return "tool"
        if LEET_GENERIC.match(value_lower):
            return "leet"
    if value_lower in ENTITY_BLOCKLIST:
        return "generic"
    return "noise"


def normalize_entities(
    raw_entities: dict[str, list[str]],
    page_url: str,
    page_id: Optional[uuid.UUID] = None,
    page_text: Optional[str] = None,
) -> list[NormalizedEntity]:
    """
    Convert raw extraction results into deduplicated NormalizedEntity records.

    Args:
        raw_entities: mapping of entity type to the raw values extracted for it.
        page_url: URL recorded as the source of every returned entity.
        page_id: optional page identifier recorded on every returned entity.
        page_text: optional page text used to cut a context snippet per entity.

    Returns:
        One NormalizedEntity per distinct (entity type, normalized value) pair,
        in first-seen order.

    Per value, in order: non-string and blank values are skipped; the value is
    stripped; hash and onion values are validated; the value is normalized;
    types outside _REGEX_TYPES go through the noise blocklist; duplicates are
    dropped. A single summary line is logged if the blocklist removed anything.
    """
    seen_keys: set[str] = set()
    normalized: list[NormalizedEntity] = []
    filtered = {"tool": 0, "generic": 0, "leet": 0, "noise": 0}

    for entity_type, values in raw_entities.items():
        confidence = _confidence_for(entity_type)
        method = _extraction_method_for(entity_type)
        is_hash = entity_type in ("FILE_HASH_MD5", "FILE_HASH_SHA1", "FILE_HASH_SHA256")

        for raw_value in values:
            # Extractor output is not guaranteed to be all strings; a stray
            # number or None must not abort normalization of the whole page.
            if not isinstance(raw_value, str):
                continue
            # Strip before validating so padded values are judged on content.
            raw_value = raw_value.strip()
            if not raw_value:
                continue

            if is_hash and not _validate_hash_length(entity_type, raw_value):
                logger.debug(
                    "Hash length validation failed for %s=%s", entity_type, raw_value
                )
                continue

            if entity_type == "ONION_URL" and not _validate_onion_url(raw_value):
                logger.debug("ONION_URL discarded (not a valid onion address): %r", raw_value)
                continue

            canonical = _normalize_value(entity_type, raw_value)
            if not canonical:
                continue

            # Regex-extracted types bypass the blocklist: their patterns are precise.
            if entity_type not in _REGEX_TYPES and is_blocked_entity(entity_type, canonical):
                filtered[_blocked_reason(entity_type, canonical.lower())] += 1
                logger.debug("Filtered blocked entity: %s=%s", entity_type, canonical)
                continue

            dedup_key = f"{entity_type}::{canonical}"
            if dedup_key in seen_keys:
                continue
            seen_keys.add(dedup_key)

            snippet = _context_snippet(page_text, canonical) if page_text else ""
            normalized.append(
                NormalizedEntity(
                    entity_type=entity_type,
                    value=canonical,
                    confidence=confidence,
                    source_url=page_url,
                    page_id=page_id,
                    context_snippet=snippet,
                    extraction_method=method,
                )
            )

    total_filtered = sum(filtered.values())
    if total_filtered:
        logger.warning(
            "Entity blocklist filtered %d entities "
            "(tool_names=%d, generic_terms=%d, "
            "leet_generic=%d, NER/LLM noise=%d)",
            total_filtered,
            filtered["tool"],
            filtered["generic"],
            filtered["leet"],
            filtered["noise"],
        )

    return normalized
570
+
571
+
572
def merge_with_db(
    entities: list[NormalizedEntity],
    investigation_id: Optional[uuid.UUID] = None,
) -> list:
    """
    Upsert each entity to the DB entities table using canonical deduplication.

    Args:
        entities: normalized entities to persist.
        investigation_id: optional investigation the entities belong to; when
            set, a new/merged summary line is logged after the commit.

    Returns:
        The DB-assigned entity IDs as strings, one per input entity, in input
        order. Returns [] when there is nothing to persist, when DATABASE_URL
        is not set, or when any step of the persist fails.
    """
    # Nothing to do: checked first so an empty (or None) batch neither logs a
    # "skipping" warning nor trips over len() below.
    if not entities:
        return []

    if not os.getenv("DATABASE_URL"):
        logger.warning(
            "DATABASE_URL not set — skipping DB persist (%d entities)", len(entities)
        )
        return []

    ids: list = []
    new_count = 0
    dedup_count = 0

    try:
        # Imported lazily so this module can be used without the DB layer.
        from db.session import get_session
        from db.queries import upsert_entity_canonical, create_page, get_page_by_url

        with get_session() as session:
            # One page lookup/creation per distinct source URL in this batch.
            page_cache: dict[str, object] = {}

            for entity in entities:
                url = entity.source_url
                if url not in page_cache:
                    page = get_page_by_url(session, url)
                    if page is None:
                        page = create_page(session, url=url)
                    page_cache[url] = page

                page = page_cache[url]

                db_entity, created = upsert_entity_canonical(
                    session=session,
                    investigation_id=investigation_id,
                    entity_type=entity.entity_type,
                    entity_value=entity.value,
                    confidence=entity.confidence,
                    source_page_id=page.id,
                    context_snippet=entity.context_snippet,
                    extraction_method=entity.extraction_method or None,
                )

                if created:
                    new_count += 1
                else:
                    dedup_count += 1

                ids.append(str(db_entity.id))

            session.commit()
            if investigation_id:
                logger.warning(
                    f"[{investigation_id}] Entity dedup: {new_count} new, {dedup_count} merged with existing"
                )

    except Exception as exc:
        # Best effort: persistence problems must not break extraction, but the
        # traceback is kept so the failure can be diagnosed.
        # NOTE(review): assumes get_session() discards uncommitted work when
        # the block exits with an exception — confirm against db.session.
        logger.warning("merge_with_db failed: %s", exc, exc_info=True)
        return []

    return ids