voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
extractor/ner.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor/ner.py — Named Entity Recognition for entities without fixed patterns.
|
|
3
|
+
|
|
4
|
+
Uses spaCy (en_core_web_sm) as a lazily loaded module-level singleton. If the
model is not installed the module still imports cleanly — organization-name
extraction is skipped and a warning is logged, while the dictionary-based
malware matching and the heuristic handle detection keep working.
|
|
7
|
+
|
|
8
|
+
Uses a bundled dictionary of 200+ malware family names for MALWARE_FAMILY and
|
|
9
|
+
RANSOMWARE_GROUP detection (word-bounded, case-insensitive).
|
|
10
|
+
|
|
11
|
+
Public interface
|
|
12
|
+
----------------
|
|
13
|
+
extract_named_entities(text) → dict[str, list[str]]
|
|
14
|
+
load_malware_dictionary() → set[str]
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# NER entity type constants (supplements regex_patterns constants)
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
# Entity-type labels. extract_named_entities() uses these four strings as the
# keys of the dict it returns.
THREAT_ACTOR_HANDLE = "THREAT_ACTOR_HANDLE"
MALWARE_FAMILY = "MALWARE_FAMILY"
RANSOMWARE_GROUP = "RANSOMWARE_GROUP"
ORGANIZATION_NAME = "ORGANIZATION_NAME"
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Malware family dictionary
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
_MALWARE_DICT: set[str] = {
|
|
38
|
+
# Ransomware families — active and historical
|
|
39
|
+
"LockBit", "LockBit 2.0", "LockBit 3.0",
|
|
40
|
+
"BlackCat", "ALPHV",
|
|
41
|
+
"Cl0p", "Clop",
|
|
42
|
+
"REvil", "Sodinokibi",
|
|
43
|
+
"Conti",
|
|
44
|
+
"BlackMatter",
|
|
45
|
+
"Hive",
|
|
46
|
+
"Vice Society",
|
|
47
|
+
"Play",
|
|
48
|
+
"Royal",
|
|
49
|
+
"Akira",
|
|
50
|
+
"BlackSuit",
|
|
51
|
+
"Avaddon",
|
|
52
|
+
"DarkSide",
|
|
53
|
+
"Maze",
|
|
54
|
+
"Ryuk",
|
|
55
|
+
"Egregor",
|
|
56
|
+
"Babuk",
|
|
57
|
+
"DoppelPaymer",
|
|
58
|
+
"MedusaLocker",
|
|
59
|
+
"Prometheus",
|
|
60
|
+
"Grief",
|
|
61
|
+
"Ragnar Locker",
|
|
62
|
+
"RagnarLocker",
|
|
63
|
+
"Cuba",
|
|
64
|
+
"BlackBasta",
|
|
65
|
+
"Black Basta",
|
|
66
|
+
"Yanluowang",
|
|
67
|
+
"Quantum",
|
|
68
|
+
"Monti",
|
|
69
|
+
"Nokoyawa",
|
|
70
|
+
"Trigona",
|
|
71
|
+
"Rhysida",
|
|
72
|
+
"Hunters International",
|
|
73
|
+
"Cactus",
|
|
74
|
+
"INC Ransom",
|
|
75
|
+
"Meow",
|
|
76
|
+
"MedusaBIG",
|
|
77
|
+
"KillSec",
|
|
78
|
+
"Dispossessor",
|
|
79
|
+
"Eldorado",
|
|
80
|
+
"SenSayQ",
|
|
81
|
+
"RansomHub",
|
|
82
|
+
"DragonForce",
|
|
83
|
+
"Scattered Spider",
|
|
84
|
+
"Dark Angels",
|
|
85
|
+
"8Base",
|
|
86
|
+
"Qilin",
|
|
87
|
+
"Fog",
|
|
88
|
+
"Lynx",
|
|
89
|
+
"Cicada3301",
|
|
90
|
+
"Embargo",
|
|
91
|
+
"Karakurt",
|
|
92
|
+
"LV",
|
|
93
|
+
"Entropy",
|
|
94
|
+
"Vice",
|
|
95
|
+
"Zeppelin",
|
|
96
|
+
"Dharma",
|
|
97
|
+
"Phobos",
|
|
98
|
+
"Xorist",
|
|
99
|
+
"Globeimposter",
|
|
100
|
+
"Makop",
|
|
101
|
+
"Stop",
|
|
102
|
+
"Djvu",
|
|
103
|
+
"WannaCry",
|
|
104
|
+
"WannaCryptor",
|
|
105
|
+
"Petya",
|
|
106
|
+
"NotPetya",
|
|
107
|
+
"GoldenEye",
|
|
108
|
+
"BadRabbit",
|
|
109
|
+
"SamSam",
|
|
110
|
+
"Cerber",
|
|
111
|
+
"Locky",
|
|
112
|
+
"CryptoLocker",
|
|
113
|
+
"TeslaCrypt",
|
|
114
|
+
"Cryptowall",
|
|
115
|
+
"Jigsaw",
|
|
116
|
+
"Philadelphia",
|
|
117
|
+
"Stampado",
|
|
118
|
+
"Shade",
|
|
119
|
+
"Troldesh",
|
|
120
|
+
"Reveton",
|
|
121
|
+
"KeRanger",
|
|
122
|
+
"Erebus",
|
|
123
|
+
"Satan",
|
|
124
|
+
"GandCrab",
|
|
125
|
+
"Scarab",
|
|
126
|
+
"GlobeImposter",
|
|
127
|
+
"Sodinokibi",
|
|
128
|
+
# RATs — remote access trojans
|
|
129
|
+
"AsyncRAT",
|
|
130
|
+
"QuasarRAT",
|
|
131
|
+
"Quasar",
|
|
132
|
+
"NjRAT",
|
|
133
|
+
"njRAT",
|
|
134
|
+
"DarkComet",
|
|
135
|
+
"Remcos",
|
|
136
|
+
"NetWire",
|
|
137
|
+
"XWorm",
|
|
138
|
+
"Warzone",
|
|
139
|
+
"Warzone RAT",
|
|
140
|
+
"Agent Tesla",
|
|
141
|
+
"AgentTesla",
|
|
142
|
+
"BitRAT",
|
|
143
|
+
"RevengeRAT",
|
|
144
|
+
"Orcus",
|
|
145
|
+
"Gh0st",
|
|
146
|
+
"Gh0stRAT",
|
|
147
|
+
"Havoc",
|
|
148
|
+
"Sliver",
|
|
149
|
+
"Cobalt Strike",
|
|
150
|
+
"CobaltStrike",
|
|
151
|
+
"Metasploit",
|
|
152
|
+
"Empire",
|
|
153
|
+
"PowerShell Empire",
|
|
154
|
+
"Mythic",
|
|
155
|
+
"Brute Ratel",
|
|
156
|
+
"BruteRatel",
|
|
157
|
+
"PoshC2",
|
|
158
|
+
"Covenant",
|
|
159
|
+
"Merlin",
|
|
160
|
+
"SILENTTRINITY",
|
|
161
|
+
"Nishang",
|
|
162
|
+
"Pupy",
|
|
163
|
+
"Koadic",
|
|
164
|
+
# Stealers — credential and data theft
|
|
165
|
+
"RedLine",
|
|
166
|
+
"Raccoon",
|
|
167
|
+
"Raccoon Stealer",
|
|
168
|
+
"Vidar",
|
|
169
|
+
"Mars",
|
|
170
|
+
"Aurora",
|
|
171
|
+
"Lumma",
|
|
172
|
+
"Lumma Stealer",
|
|
173
|
+
"LummaC2",
|
|
174
|
+
"AZORult",
|
|
175
|
+
"Azorult",
|
|
176
|
+
"FormBook",
|
|
177
|
+
"Snake Keylogger",
|
|
178
|
+
"SnakeKeylogger",
|
|
179
|
+
"HawkEye",
|
|
180
|
+
"Predator",
|
|
181
|
+
"Predator the Thief",
|
|
182
|
+
"Ducktail",
|
|
183
|
+
"Rhadamanthys",
|
|
184
|
+
"WhiteSnake",
|
|
185
|
+
"Atomic Stealer",
|
|
186
|
+
"AMOS",
|
|
187
|
+
"StealC",
|
|
188
|
+
"Meduza",
|
|
189
|
+
"MetaStealer",
|
|
190
|
+
"RisePro",
|
|
191
|
+
"Mystic",
|
|
192
|
+
"CryptBot",
|
|
193
|
+
"Cryptbot",
|
|
194
|
+
"Panda Stealer",
|
|
195
|
+
"BlackGuard",
|
|
196
|
+
"Titan Stealer",
|
|
197
|
+
"Erbium",
|
|
198
|
+
"Eternity Stealer",
|
|
199
|
+
"Oski",
|
|
200
|
+
"Krypton Stealer",
|
|
201
|
+
"Luca Stealer",
|
|
202
|
+
"Spectre Stealer",
|
|
203
|
+
# Loaders — malware delivery mechanisms
|
|
204
|
+
"SmokeLoader",
|
|
205
|
+
"Smoke Loader",
|
|
206
|
+
"IcedID",
|
|
207
|
+
"Emotet",
|
|
208
|
+
"QakBot",
|
|
209
|
+
"Qakbot",
|
|
210
|
+
"Bumblebee",
|
|
211
|
+
"GootLoader",
|
|
212
|
+
"PrivateLoader",
|
|
213
|
+
"GuLoader",
|
|
214
|
+
"CloudEyE",
|
|
215
|
+
"DanaBot",
|
|
216
|
+
"Amadey",
|
|
217
|
+
"RCSession",
|
|
218
|
+
"PureCrypter",
|
|
219
|
+
"DonutLoader",
|
|
220
|
+
"ModiLoader",
|
|
221
|
+
"AiBotLoader",
|
|
222
|
+
"Loader",
|
|
223
|
+
"SystemBC",
|
|
224
|
+
"Matanbuchus",
|
|
225
|
+
"Gozi",
|
|
226
|
+
"DBatLoader",
|
|
227
|
+
"MalDoc",
|
|
228
|
+
"XLoader",
|
|
229
|
+
"FormBook",
|
|
230
|
+
"MoqHao",
|
|
231
|
+
"Pikabot",
|
|
232
|
+
"Darkgate",
|
|
233
|
+
"DarkGate",
|
|
234
|
+
"Latrodectus",
|
|
235
|
+
"WarmCookie",
|
|
236
|
+
# Banking trojans
|
|
237
|
+
"TrickBot",
|
|
238
|
+
"Trickbot",
|
|
239
|
+
"Dridex",
|
|
240
|
+
"Ursnif",
|
|
241
|
+
"ZLoader",
|
|
242
|
+
"Zloader",
|
|
243
|
+
"Gozi",
|
|
244
|
+
"ISFB",
|
|
245
|
+
"Ramnit",
|
|
246
|
+
"Qbot",
|
|
247
|
+
"QBot",
|
|
248
|
+
"Shylock",
|
|
249
|
+
"Kronos",
|
|
250
|
+
"Zeus",
|
|
251
|
+
"SpyEye",
|
|
252
|
+
"Carbanak",
|
|
253
|
+
"FIN7",
|
|
254
|
+
"Valak",
|
|
255
|
+
"BazarLoader",
|
|
256
|
+
"BazarBackdoor",
|
|
257
|
+
"IcedID",
|
|
258
|
+
"TaurusLoader",
|
|
259
|
+
"Bookworm",
|
|
260
|
+
"Casbaneiro",
|
|
261
|
+
"Mekotio",
|
|
262
|
+
"Grandoreiro",
|
|
263
|
+
"Javali",
|
|
264
|
+
"Vizom",
|
|
265
|
+
# APT / nation-state tools
|
|
266
|
+
"PlugX",
|
|
267
|
+
"ShadowPad",
|
|
268
|
+
"Winnti",
|
|
269
|
+
"Flame",
|
|
270
|
+
"Shamoon",
|
|
271
|
+
"BlackEnergy",
|
|
272
|
+
"GreyEnergy",
|
|
273
|
+
"Industroyer",
|
|
274
|
+
"Stuxnet",
|
|
275
|
+
"Turla",
|
|
276
|
+
"Snake",
|
|
277
|
+
"ComRAT",
|
|
278
|
+
"Duqu",
|
|
279
|
+
"Gauss",
|
|
280
|
+
"MiniFlame",
|
|
281
|
+
"Regin",
|
|
282
|
+
"ProjectSauron",
|
|
283
|
+
"EternalBlue",
|
|
284
|
+
"DoublePulsar",
|
|
285
|
+
"WannaMine",
|
|
286
|
+
# Post-exploitation / red team tools
|
|
287
|
+
"Mimikatz",
|
|
288
|
+
"BloodHound",
|
|
289
|
+
"SharpHound",
|
|
290
|
+
"Responder",
|
|
291
|
+
"Impacket",
|
|
292
|
+
"LaZagne",
|
|
293
|
+
"Rubeus",
|
|
294
|
+
"Certify",
|
|
295
|
+
"Seatbelt",
|
|
296
|
+
"PowerView",
|
|
297
|
+
"PowerSploit",
|
|
298
|
+
"Nmap",
|
|
299
|
+
"Metasploit",
|
|
300
|
+
"Burp Suite",
|
|
301
|
+
"SQLMap",
|
|
302
|
+
"Nikto",
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
# Ransomware group subset (active RaaS operators)
|
|
306
|
+
_RANSOMWARE_DICT: set[str] = {
|
|
307
|
+
"LockBit", "LockBit 2.0", "LockBit 3.0",
|
|
308
|
+
"BlackCat", "ALPHV",
|
|
309
|
+
"Cl0p", "Clop",
|
|
310
|
+
"REvil", "Sodinokibi",
|
|
311
|
+
"Conti",
|
|
312
|
+
"BlackMatter",
|
|
313
|
+
"Hive",
|
|
314
|
+
"Vice Society",
|
|
315
|
+
"Play",
|
|
316
|
+
"Royal",
|
|
317
|
+
"Akira",
|
|
318
|
+
"BlackSuit",
|
|
319
|
+
"Avaddon",
|
|
320
|
+
"DarkSide",
|
|
321
|
+
"Maze",
|
|
322
|
+
"Ryuk",
|
|
323
|
+
"Egregor",
|
|
324
|
+
"Babuk",
|
|
325
|
+
"DoppelPaymer",
|
|
326
|
+
"MedusaLocker",
|
|
327
|
+
"Prometheus",
|
|
328
|
+
"Grief",
|
|
329
|
+
"Ragnar Locker",
|
|
330
|
+
"RagnarLocker",
|
|
331
|
+
"Cuba",
|
|
332
|
+
"BlackBasta",
|
|
333
|
+
"Black Basta",
|
|
334
|
+
"Yanluowang",
|
|
335
|
+
"Quantum",
|
|
336
|
+
"Monti",
|
|
337
|
+
"Nokoyawa",
|
|
338
|
+
"Trigona",
|
|
339
|
+
"Rhysida",
|
|
340
|
+
"Hunters International",
|
|
341
|
+
"Cactus",
|
|
342
|
+
"INC Ransom",
|
|
343
|
+
"KillSec",
|
|
344
|
+
"Dispossessor",
|
|
345
|
+
"Eldorado",
|
|
346
|
+
"SenSayQ",
|
|
347
|
+
"RansomHub",
|
|
348
|
+
"DragonForce",
|
|
349
|
+
"Scattered Spider",
|
|
350
|
+
"Dark Angels",
|
|
351
|
+
"8Base",
|
|
352
|
+
"Qilin",
|
|
353
|
+
"Fog",
|
|
354
|
+
"Lynx",
|
|
355
|
+
"Cicada3301",
|
|
356
|
+
"Embargo",
|
|
357
|
+
"Karakurt",
|
|
358
|
+
"GandCrab",
|
|
359
|
+
"SamSam",
|
|
360
|
+
"WannaCry",
|
|
361
|
+
"NotPetya",
|
|
362
|
+
"Petya",
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
# ---------------------------------------------------------------------------
|
|
366
|
+
# Build compiled patterns from the dictionaries (at module load time)
|
|
367
|
+
# ---------------------------------------------------------------------------
|
|
368
|
+
|
|
369
|
+
def _build_pattern(names: set[str]) -> re.Pattern:
|
|
370
|
+
"""Build a word-bounded alternation pattern sorted longest-first."""
|
|
371
|
+
sorted_names = sorted(names, key=len, reverse=True)
|
|
372
|
+
alternation = "|".join(re.escape(name) for name in sorted_names)
|
|
373
|
+
return re.compile(rf"\b(?:{alternation})\b", re.IGNORECASE)
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# Compiled once when the module is imported; extract_named_entities() reuses
# these patterns on every call.
_MALWARE_RE = _build_pattern(_MALWARE_DICT)
_RANSOMWARE_RE = _build_pattern(_RANSOMWARE_DICT)
|
|
378
|
+
|
|
379
|
+
# ---------------------------------------------------------------------------
|
|
380
|
+
# Heuristic threat-actor handle detection
|
|
381
|
+
# Context patterns: "posted by X", "user X", "alias X", "known as X", etc.
|
|
382
|
+
# Handle: 3–30 chars, may contain underscores / dots / hyphens but not
|
|
383
|
+
# starting or ending with them; must not be a plain email address.
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
_HANDLE_CHAR = r"[a-zA-Z0-9][a-zA-Z0-9_.\-]{1,28}[a-zA-Z0-9]"
|
|
387
|
+
_HANDLE_RE = re.compile(
|
|
388
|
+
r"(?:"
|
|
389
|
+
r"posted\s+by|user\s+|alias\s+|known\s+as|by\s+user"
|
|
390
|
+
r"|from\s+user|handle\s+|nickname\s+|nick\s+"
|
|
391
|
+
r"|op\s+is|author\s+|authored\s+by|written\s+by"
|
|
392
|
+
r")\s*(" + _HANDLE_CHAR + r")",
|
|
393
|
+
re.IGNORECASE,
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
# Generic words that the handle heuristic may capture by mistake.
# extract_named_entities() discards a captured handle when its lower-cased
# form is a member of this set, so entries must be lower-case.
_COMMON_WORDS: frozenset[str] = frozenset({
    "admin", "moderator", "user", "guest", "anon", "anonymous",
    "unknown", "nobody", "someone", "anyone", "everyone",
    "the", "and", "not", "for", "with", "that", "this",
})

# Threat-related keywords that gate spaCy ORG extraction: the spaCy pass only
# runs when at least one of these occurs as a substring of the lower-cased
# input text (substring test, so "data" also matches inside longer words).
_THREAT_CONTEXT: frozenset[str] = frozenset({
    "breach", "leak", "attack", "ransom", "victim", "target",
    "compromised", "hacked", "stolen", "exfiltrated", "extorted",
    "encrypted", "infected", "malware", "ransomware", "exploit",
    "vulnerability", "phishing", "credentials", "data",
})
|
|
410
|
+
|
|
411
|
+
# ---------------------------------------------------------------------------
|
|
412
|
+
# spaCy singleton — loaded lazily on first call, never reloaded
|
|
413
|
+
# ---------------------------------------------------------------------------
|
|
414
|
+
|
|
415
|
+
# Cached spaCy pipeline; remains None until _get_nlp() loads it, and stays
# None if loading fails.
_nlp = None
# Set to True by the first _get_nlp() call so that loading is attempted only
# once per process, whether or not it succeeded.
_nlp_attempted = False
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _get_nlp():
    """Return the shared spaCy pipeline, loading it on the first call.

    The load is attempted exactly once. If importing spaCy or loading
    ``en_core_web_sm`` fails for any reason, a warning is logged and this
    call and every later call return ``None``.
    """
    global _nlp, _nlp_attempted
    if not _nlp_attempted:
        # Flip the flag before trying so a failed load is never retried.
        _nlp_attempted = True
        try:
            import spacy  # noqa: PLC0415

            pipeline = spacy.load("en_core_web_sm")
            _nlp = pipeline
            logger.info("spaCy en_core_web_sm loaded successfully")
        except Exception as load_error:
            logger.warning(
                "spaCy model en_core_web_sm is not available — NER will be skipped. "
                "Install with: python -m spacy download en_core_web_sm. Error: %s",
                load_error,
            )
            _nlp = None
    return _nlp
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
# Public interface
|
|
440
|
+
# ---------------------------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def load_malware_dictionary() -> set[str]:
    """Return a fresh copy of the bundled malware-name set.

    A copy is handed out so callers can mutate the result without affecting
    the module-level dictionary.
    """
    return _MALWARE_DICT.copy()
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def extract_named_entities(text: str) -> dict[str, list[str]]:
    """Collect entities in *text* that have no fixed regex pattern.

    The result always has the four keys THREAT_ACTOR_HANDLE, MALWARE_FAMILY,
    RANSOMWARE_GROUP and ORGANIZATION_NAME, each mapped to a list of unique
    strings in first-seen order (per the original docstring, the same format
    as regex_patterns.extract_all()).

    Dictionary-based malware/ransomware matching and heuristic handle
    detection do not need spaCy. Organization names are only extracted when
    the spaCy model is available and the text contains a threat-context
    keyword.

    Never raises: any error is logged and whatever was collected up to that
    point is returned.
    """
    entities: dict[str, list[str]] = {
        label: []
        for label in (
            THREAT_ACTOR_HANDLE,
            MALWARE_FAMILY,
            RANSOMWARE_GROUP,
            ORGANIZATION_NAME,
        )
    }

    try:
        # Dictionary lookups: keep the matched text exactly as it appears.
        entities[MALWARE_FAMILY] = _dedup(
            [hit.group(0) for hit in _MALWARE_RE.finditer(text)]
        )
        entities[RANSOMWARE_GROUP] = _dedup(
            [hit.group(0) for hit in _RANSOMWARE_RE.finditer(text)]
        )

        # Handles: the word following a context phrase, minus generic words.
        candidates = [hit.group(1).strip() for hit in _HANDLE_RE.finditer(text)]
        entities[THREAT_ACTOR_HANDLE] = _dedup(
            name
            for name in candidates
            if not (name.lower() in _COMMON_WORDS or "@" in name)
        )

        # Organizations: spaCy ORG entities, only for threat-related text.
        pipeline = _get_nlp()
        if pipeline is None:
            return entities
        lowered = text.lower()
        if not any(keyword in lowered for keyword in _THREAT_CONTEXT):
            return entities
        parsed = pipeline(text[:100_000])  # cap for performance
        entities[ORGANIZATION_NAME] = _dedup(
            span.text.strip() for span in parsed.ents if span.label_ == "ORG"
        )

    except Exception:
        logger.exception("extract_named_entities encountered an unexpected error")

    return entities
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# ---------------------------------------------------------------------------
|
|
501
|
+
# Internal helpers
|
|
502
|
+
# ---------------------------------------------------------------------------
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _dedup(values) -> list[str]:
|
|
506
|
+
seen: set[str] = set()
|
|
507
|
+
result: list[str] = []
|
|
508
|
+
for v in values:
|
|
509
|
+
if v not in seen:
|
|
510
|
+
seen.add(v)
|
|
511
|
+
result.append(v)
|
|
512
|
+
return result
|