txt2detection 1.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of txt2detection might be problematic. Click here for more details.
- txt2detection/__init__.py +2 -0
- txt2detection/__main__.py +343 -0
- txt2detection/ai_extractor/__init__.py +16 -0
- txt2detection/ai_extractor/anthropic.py +12 -0
- txt2detection/ai_extractor/base.py +72 -0
- txt2detection/ai_extractor/deepseek.py +20 -0
- txt2detection/ai_extractor/gemini.py +18 -0
- txt2detection/ai_extractor/openai.py +18 -0
- txt2detection/ai_extractor/openrouter.py +20 -0
- txt2detection/ai_extractor/prompts.py +121 -0
- txt2detection/ai_extractor/utils.py +21 -0
- txt2detection/attack_navigator.py +68 -0
- txt2detection/bundler.py +418 -0
- txt2detection/config/detection_languages.yaml +14 -0
- txt2detection/credential_checker.py +82 -0
- txt2detection/models.py +427 -0
- txt2detection/observables.py +161 -0
- txt2detection/utils.py +100 -0
- txt2detection-1.0.15.dist-info/METADATA +230 -0
- txt2detection-1.0.15.dist-info/RECORD +23 -0
- txt2detection-1.0.15.dist-info/WHEEL +4 -0
- txt2detection-1.0.15.dist-info/entry_points.txt +2 -0
- txt2detection-1.0.15.dist-info/licenses/LICENSE +202 -0
txt2detection/models.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import typing
|
|
5
|
+
import uuid
|
|
6
|
+
import requests
|
|
7
|
+
from slugify import slugify
|
|
8
|
+
from datetime import date as dt_date
|
|
9
|
+
from typing import Any, ClassVar, List, Literal, Optional, Union
|
|
10
|
+
from uuid import UUID
|
|
11
|
+
from stix2extensions.data_source import DataSource
|
|
12
|
+
|
|
13
|
+
import jsonschema
|
|
14
|
+
from pydantic import BaseModel, Field, computed_field, field_validator
|
|
15
|
+
from pydantic_core import PydanticCustomError, core_schema
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
from stix2 import (
|
|
19
|
+
MarkingDefinition,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
if typing.TYPE_CHECKING:
|
|
24
|
+
from txt2detection.bundler import Bundler
|
|
25
|
+
|
|
26
|
+
# Namespace used to derive deterministic UUIDv5 identifiers for objects
# produced by this package (see make_identity in utils.py).
UUID_NAMESPACE = uuid.UUID("a4d70b75-6f4a-5d19-9137-da863edd33d7")

# SIGMA tags must be lowercase "namespace.value" (e.g. "attack.t1059",
# "tlp.red"); enforced by SigmaTag._validate below.
TAG_PATTERN = re.compile(r"^[a-z0-9_-]+\.[a-z0-9._-]+$")

# Maps MITRE ATT&CK tactic slugs (as they appear in SIGMA "attack.*"
# tags) to ATT&CK tactic IDs; used by BaseDetection.mitre_attack_ids.
MITRE_TACTIC_MAP = {
    "initial-access": "TA0001",
    "execution": "TA0002",
    "persistence": "TA0003",
    "privilege-escalation": "TA0004",
    "defense-evasion": "TA0005",
    "credential-access": "TA0006",
    "discovery": "TA0007",
    "lateral-movement": "TA0008",
    "collection": "TA0009",
    "exfiltration": "TA0010",
    "command-and-control": "TA0011",
    "impact": "TA0040",
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TLP_LEVEL(enum.Enum):
    """TLP v2.0 marking levels.

    Each member's value is the canonical STIX 2.1 MarkingDefinition for
    that level (the fixed ids and the property-extension payload come
    from the TLP 2.0 extension definition).
    """

    CLEAR = MarkingDefinition(
        spec_version="2.1",
        id="marking-definition--94868c89-83c2-464b-929b-a1a8aa3c8487",
        created="2022-10-01T00:00:00.000Z",
        definition_type="TLP:CLEAR",
        extensions={
            "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
                "extension_type": "property-extension",
                "tlp_2_0": "clear",
            }
        },
    )
    GREEN = MarkingDefinition(
        spec_version="2.1",
        id="marking-definition--bab4a63c-aed9-4cf5-a766-dfca5abac2bb",
        created="2022-10-01T00:00:00.000Z",
        definition_type="TLP:GREEN",
        extensions={
            "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
                "extension_type": "property-extension",
                "tlp_2_0": "green",
            }
        },
    )
    AMBER = MarkingDefinition(
        spec_version="2.1",
        id="marking-definition--55d920b0-5e8b-4f79-9ee9-91f868d9b421",
        created="2022-10-01T00:00:00.000Z",
        definition_type="TLP:AMBER",
        extensions={
            "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
                "extension_type": "property-extension",
                "tlp_2_0": "amber",
            }
        },
    )
    AMBER_STRICT = MarkingDefinition(
        spec_version="2.1",
        id="marking-definition--939a9414-2ddd-4d32-a0cd-375ea402b003",
        created="2022-10-01T00:00:00.000Z",
        definition_type="TLP:AMBER+STRICT",
        extensions={
            "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
                "extension_type": "property-extension",
                "tlp_2_0": "amber+strict",
            }
        },
    )
    RED = MarkingDefinition(
        spec_version="2.1",
        id="marking-definition--e828b379-4e03-4974-9ac4-e53a884c97c1",
        created="2022-10-01T00:00:00.000Z",
        definition_type="TLP:RED",
        extensions={
            "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
                "extension_type": "property-extension",
                "tlp_2_0": "red",
            }
        },
    )

    @classmethod
    def levels(cls):
        # Lookup keyed by the normalised level name (lowercase, "_"-joined),
        # as produced by get() below.
        return dict(
            clear=cls.CLEAR,
            green=cls.GREEN,
            amber=cls.AMBER,
            amber_strict=cls.AMBER_STRICT,
            red=cls.RED,
        )

    @classmethod
    def values(cls):
        # The underlying MarkingDefinition objects, in ascending sensitivity.
        return [
            cls.CLEAR.value,
            cls.GREEN.value,
            cls.AMBER.value,
            cls.AMBER_STRICT.value,
            cls.RED.value,
        ]

    @classmethod
    def get(cls, level: "str|TLP_LEVEL"):
        """Resolve *level* to a member.

        Accepts an existing member or a name in any of the common
        spellings ("AMBER+STRICT", "amber-strict", "amber_strict").
        Raises Exception for an unknown name.
        """
        if isinstance(level, cls):
            return level
        level = level.lower()
        level = level.replace("+", "_").replace("-", "_")
        if level not in cls.levels():
            raise Exception(f"unsupported tlp level: `{level}`")
        return cls.levels()[level]

    @property
    def name(self):
        # Shadows Enum.name to expose the lowercase member name (e.g.
        # "amber_strict"); used when composing "tlp.*" SIGMA tags.
        return super().name.lower()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class Statuses(enum.StrEnum):
|
|
144
|
+
stable = enum.auto()
|
|
145
|
+
test = enum.auto()
|
|
146
|
+
experimental = enum.auto()
|
|
147
|
+
deprecated = enum.auto()
|
|
148
|
+
unsupported = enum.auto()
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class Level(enum.StrEnum):
|
|
152
|
+
informational = enum.auto()
|
|
153
|
+
low = enum.auto()
|
|
154
|
+
medium = enum.auto()
|
|
155
|
+
high = enum.auto()
|
|
156
|
+
critical = enum.auto()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class SigmaTag(str):
    """A str subtype validated as a SIGMA tag ("namespace.value",
    lowercase — see TAG_PATTERN).

    Integrates with pydantic v2 through the core-schema hooks below, so
    fields annotated `SigmaTag` are validated automatically.
    """

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        _source: type[Any],
        _handler,
    ) -> core_schema.CoreSchema:
        # Validate as a plain str first, then apply the tag-format check.
        return core_schema.no_info_after_validator_function(
            cls._validate, core_schema.str_schema()
        )

    @classmethod
    def __get_pydantic_json_schema__(cls, core_schema: core_schema.CoreSchema, handler):
        # Advertise the tag pattern in generated JSON schema output.
        field_schema = handler(core_schema)
        field_schema.update(
            type="string", pattern=TAG_PATTERN.pattern, format="sigma-tag"
        )
        return field_schema

    @classmethod
    def _validate(cls, input_value: str, /) -> str:
        # Reject anything that is not "namespace.value" per TAG_PATTERN.
        if not TAG_PATTERN.match(input_value):
            raise PydanticCustomError(
                "value_error",
                "value is not a valid SIGMA tag: {reason}",
                {
                    "reason": f"Must be in format namespace.value and match pattern {TAG_PATTERN.pattern}"
                },
            )
        return input_value
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class RelatedRule(BaseModel):
    """One entry of a SIGMA rule's `related` list, linking this rule to
    another rule by UUID."""

    # UUID of the related rule.
    id: UUID
    # Relationship kind (closed set per the SIGMA specification).
    type: Literal["derived", "obsolete", "merged", "renamed", "similar"]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class BaseDetection(BaseModel):
    """Shared fields and behaviour for a SIGMA detection rule, whether
    AI-generated (AIDetection) or parsed from YAML (SigmaRuleDetection)."""

    title: str
    description: str
    detection: dict
    logsource: dict
    status: Statuses = Statuses.experimental
    falsepositives: list[str]
    tags: list[str]
    level: Level
    # Private (non-pydantic) attributes backing detection_id / scratch data.
    _custom_id = None
    _extra_data: dict
    # NOTE(review): this performs an HTTP GET at class-definition time, so
    # importing this module fails without network access — consider lazy
    # loading. TODO confirm this is acceptable for the deployment model.
    sigma_json_schema: ClassVar = requests.get(
        "https://github.com/SigmaHQ/sigma-specification/raw/refs/heads/main/json-schema/sigma-detection-rule-schema.json"
    ).json()

    def model_post_init(self, __context):
        # Normalise tags to a list (model may arrive with tags=None) and
        # initialise the private scratch dict.
        self.tags = self.tags or []
        self._extra_data = dict()
        return super().model_post_init(__context)

    @property
    def detection_id(self):
        # Explicit custom id wins, then a subclass-provided `id` field;
        # otherwise a fresh uuid4 (NOTE: a new value on every access when
        # neither is set).
        return str(self._custom_id or getattr(self, "id", None) or uuid.uuid4())

    @detection_id.setter
    def detection_id(self, custom_id):
        # Accepts either a bare uuid or a STIX id ("indicator--<uuid>");
        # only the portion after the last "--" is kept.
        self._custom_id = custom_id.split("--")[-1]

    @property
    def tlp_level(self):
        # Derived from the "tlp.*" entry in tags (None when absent).
        return tlp_from_tags(self.tags)

    @tlp_level.setter
    def tlp_level(self, level):
        set_tlp_level_in_tags(self.tags, level)

    def set_labels(self, labels):
        # Append report-level labels onto this rule's tags.
        self.tags.extend(labels)

    def set_extra_data_from_bundler(self, bundler: "Bundler"):
        # Subclasses override; the base class is effectively abstract.
        raise NotImplementedError("this class should no longer be in use")

    def make_rule(self, bundler: "Bundler"):
        """Render this detection as SIGMA rule YAML.

        Pulls report metadata from *bundler*, de-duplicates tags, drops
        falsy fields, validates against the SIGMA JSON schema, then dumps.
        """
        self.set_extra_data_from_bundler(bundler)
        # De-duplicate while preserving insertion order.
        self.tags = list(dict.fromkeys(self.tags))

        rule = dict(
            id=self.detection_id,
            **self.model_dump(
                exclude=["indicator_types", "id"], mode="json", by_alias=True
            ),
        )
        # Strip falsy fields (None, "", empty lists/dicts) from the rule.
        for k, v in list(rule.items()):
            if not v:
                rule.pop(k, None)

        self.validate_rule_with_json_schema(rule)
        # date/modified are re-inserted after validation as date objects so
        # YAML serialises them natively. NOTE(review): they therefore bypass
        # the schema check above — confirm intended.
        if getattr(self, "date", 0):
            rule.update(date=self.date)
        if getattr(self, "modified", 0):
            rule.update(modified=self.modified)
        return yaml.dump(rule, sort_keys=False, indent=4)

    def validate_rule_with_json_schema(self, rule):
        # Raises jsonschema.ValidationError when the rule does not conform
        # to the SIGMA detection-rule schema fetched at import time.
        jsonschema.validate(
            rule,
            self.sigma_json_schema,
        )

    @property
    def external_references(self):
        # STIX external_references entries for level/status/license (only
        # the attributes actually set on this instance).
        refs = []
        for attr in ["level", "status", "license"]:
            if attr_val := getattr(self, attr, None):
                refs.append(dict(source_name=f"sigma-{attr}", description=attr_val))
        return refs

    @property
    def mitre_attack_ids(self):
        # Extract ATT&CK ids from "attack.*" tags: known tactic slugs map
        # through MITRE_TACTIC_MAP, anything else is upper-cased
        # (e.g. "attack.t1059" -> "T1059").
        retval = []
        for i, label in enumerate(self.tags):
            label = label.replace("_", "-").lower()
            namespace, _, label_id = label.partition(".")
            if namespace == "attack":
                retval.append(MITRE_TACTIC_MAP.get(label_id, label_id.upper()))
        return retval

    @property
    def cve_ids(self):
        # Build "CVE-<rest>" identifiers from "cve.<rest>" tags.
        retval = []
        for label in self.tags:
            namespace, _, label_id = label.partition(".")
            if namespace == "cve":
                retval.append(namespace.upper() + "-" + label_id)
        return retval

    def make_data_source(self):
        # Map the SIGMA logsource block onto a stix2extensions DataSource.
        return DataSource(
            category=self.logsource.get("category"),
            product=self.logsource.get("product"),
            service=self.logsource.get("service"),
            definition=self.logsource.get("definition"),
        )
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class AIDetection(BaseDetection):
    """A detection as emitted by the AI extractor; carries the extra
    `indicator_types` field that is not part of a SIGMA rule proper."""

    indicator_types: list[str] = Field(default_factory=list)

    def to_sigma_rule_detection(self, bundler):
        """Re-validate this AI output as a full SigmaRuleDetection.

        Stamps date/modified from the bundler's report and assigns a
        fresh uuid4 id. Raises ValueError (wrapping the validation error
        and the offending payload) when validation fails.
        """
        rule_dict = {
            **self.model_dump(exclude=["indicator_types"]),
            **dict(
                date=bundler.report.created.date(),
                modified=bundler.report.modified.date(),
                id=uuid.uuid4(),
            ),
        }
        try:
            return SigmaRuleDetection.model_validate(rule_dict)
        except Exception as e:
            raise ValueError(
                dict(message="validate ai output failed", error=e, content=rule_dict)
            )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class SigmaRuleDetection(BaseDetection):
    """A full SIGMA rule per the SIGMA specification's field set; used both
    for rules parsed from YAML and for validated AI output."""

    title: str
    # Rule UUID; may be absent for anonymous rules.
    id: Optional[UUID] = None
    related: Optional[list[RelatedRule]] = None
    name: Optional[str] = None
    taxonomy: Optional[str] = None
    status: Optional[Statuses] = None
    description: Optional[str] = None
    license: Optional[str] = None
    author: Optional[str] = None
    references: Optional[List[str]] = Field(default_factory=list)
    date: Optional["dt_date"] = Field(alias="date", default=None)
    modified: Optional["dt_date"] = None
    logsource: dict
    detection: dict
    fields: Optional[List[str]] = None
    falsepositives: Optional[List[str]] = None
    level: Optional[Level] = None
    tags: Optional[List[SigmaTag]] = Field(default_factory=list)
    scope: Optional[List[str]] = None
    # Backing store for the indicator_types property (not a pydantic field).
    _indicator_types: list = None

    @property
    def detection_id(self):
        # Unlike the base class, this is always the rule's own id.
        return str(self.id)

    @property
    def indicator_types(self):
        return self._indicator_types

    @indicator_types.setter
    def indicator_types(self, types):
        self._indicator_types = types

    @detection_id.setter
    def detection_id(self, new_id):
        # Changing an existing id records the old id as a "renamed" entry
        # in `related`, per the SIGMA spec's rule-relationship mechanism.
        if self.id and str(self.id) != str(new_id):
            self.related = self.related or []
            self.related.append(RelatedRule(id=self.id, type="renamed"))
        self.id = new_id

    @field_validator("tags", mode="after")
    @classmethod
    def validate_tlp(cls, tags: list[str]):
        # A rule may carry at most one tag in the "tlp" namespace.
        tlps = []
        for tag in tags:
            if tag.startswith("tlp."):
                tlps.append(tag)
        if len(tlps) > 1:
            raise ValueError(
                f"tag must not contain more than one tag in tlp namespace. Got {tlps}"
            )
        return tags

    @field_validator("modified", mode="after")
    @classmethod
    def validate_modified(cls, modified, info):
        # Drop `modified` when identical to `date` — presumably to avoid
        # emitting a redundant field in the rendered rule; TODO confirm
        # against the SIGMA spec.
        if info.data.get("date") == modified:
            return None
        return modified

    def set_extra_data_from_bundler(self, bundler: "Bundler"):
        """Copy report-level metadata (date, labels, TLP, author, license,
        references) from *bundler* onto this rule. No-op when *bundler*
        is falsy."""
        if not bundler:
            return

        if not self.date:
            # Local import avoids a circular import with .utils.
            from .utils import as_date

            self.date = as_date(bundler.created)

        self.set_labels(bundler.labels)
        self.tlp_level = bundler.tlp_level.name
        self.author = bundler.report.created_by_ref
        self.license = bundler.license
        self.references = bundler.reference_urls
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
class DetectionContainer(BaseModel):
    """Structured extractor response: a success flag plus the rules."""

    success: bool
    detections: list[Union[BaseDetection, AIDetection, SigmaRuleDetection]]
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
class DataContainer(BaseModel):
    """Aggregated txt2detection output carried between pipeline stages."""

    detections: DetectionContainer
    # NOTE(review): annotated non-Optional but defaulted to None — pydantic
    # accepts the None default yet rejects explicit None input; confirm
    # whether Optional[...] was intended for these two fields.
    navigator_layer: dict = Field(default=None)
    observables: list[dict] = Field(default=None)
    # Id -> name maps for referenced CVEs and ATT&CK objects.
    cves: dict[str, str] = Field(default_factory=dict)
    attacks: dict[str, str] = Field(default_factory=dict)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def tlp_from_tags(tags: list[SigmaTag]):
    """Return the TLP_LEVEL encoded in *tags*, or None.

    Scans for the first tag in the "tlp" namespace and resolves its level
    via TLP_LEVEL.get. A "tlp.*" tag naming an unknown level is skipped
    instead of raising (the original propagated TLP_LEVEL.get's Exception,
    crashing on e.g. "tlp.unknown").
    """
    for tag in tags:
        ns, _, level = tag.partition(".")
        if ns != "tlp":
            continue
        try:
            return TLP_LEVEL.get(level.replace("-", "_"))
        except Exception:
            # Unrecognised TLP level — ignore this tag and keep scanning.
            continue
    return None
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def set_tlp_level_in_tags(tags: list[SigmaTag], level):
    """Replace any existing "tlp.*" tag in *tags* with one for *level*.

    Mutates *tags* in place and returns it. Underscores in *level* are
    normalised to hyphens (e.g. "amber_strict" -> "tlp.amber-strict").

    Fix: the original removed items from *tags* while iterating it with
    enumerate(), which skips the element following each removal — adjacent
    "tlp." tags survived. Rebuilding the list in place avoids that.
    """
    level = str(level)
    tags[:] = [tag for tag in tags if not tag.startswith("tlp.")]
    tags.append("tlp." + level.replace("_", "-"))
    return tags
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import re, validators
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
from stix2 import parse as parse_stix, parse_observable
|
|
4
|
+
|
|
5
|
+
# Mapping of key regex patterns to STIX observable types
|
|
6
|
+
# Mapping of key regex patterns to STIX observable types.
# Matched against SIGMA detection field names (e.g. "DestinationIp") to
# decide which observable types a field's value may contain.
STIX_PATTERNS_KEYS = {
    "ipv4-addr": r"(?i)\b(ip|ipv4)\b",
    "ipv6-addr": r"(?i)\bipv6\b",
    "email-addr": r"(?i)\bemail\b",
    "url": r"(?i)\b(url|uri)\b",
    "directory": r"(?i)\b(directory|path)\b",
    "domain-name": r"(?i)\bdomain\b",
    "hostname": r"(?i)\bhost\b",
    "file.hashes.MD5": r"(?i)\bmd5\b",
    "file.hashes.SHA-1": r"(?i)\bsha1\b",
    "file.hashes.SHA-256": r"(?i)\bsha256\b",
    "file.hashes.SHA-512": r"(?i)\bsha512\b",
    "file.hashes.SSDEEP": r"(?i)\bssdeep\b",
    "mac-addr": r"(?i)\bmac\b",
    "user-account": r"(?i)\buser\b",
    "windows-registry-key": r"(?i)\bregistry\b",
    "x509-certificate": r"(?i)\bx509\b",
}

# Mapping of value regex patterns to STIX observable types.
# These are deliberately loose candidate matchers; filter_out() performs
# the strict validation before a candidate is accepted as a match.
STIX_PATTERNS_VALUES = {
    "ipv4-addr": [r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.|$)){4}\b"],
    "ipv6-addr": [r"\b(?:[A-F0-9]{1,4}:){7}[A-F0-9]{1,4}\b"],
    "email-addr": [r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"],
    "url": [r"\bhttps?://[^\s/$.?#].[^\x00\s]*\b"],
    "directory": [r"(?:[A-Za-z]:)?(?:\\\\[^\\\\:*?\"<>|\r\n]+)+\\\\?"],
    "domain-name": [r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b"],
    "hostname": [r"\b[a-zA-Z0-9-]{1,63}(\.[a-zA-Z0-9-]{1,63})*\b"],
    "file.hashes.MD5": [r"\b[a-fA-F0-9]{32}\b"],
    "file.hashes.SHA-1": [r"\b[a-fA-F0-9]{40}\b"],
    "file.hashes.SHA-256": [r"\b[a-fA-F0-9]{64}\b"],
    "file.hashes.SHA-512": [r"\b[a-fA-F0-9]{128}\b"],
    "file.hashes.SSDEEP": [r"\b\d{1,}:[A-Za-z0-9/+]{10,}:[A-Za-z0-9/+]{10,}\b"],
    "mac-addr": [r"\b([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})\b"],
    "user-account": [r"\b[A-Za-z0-9._%-]{3,}\\\\?[A-Za-z0-9._%-]{3,}\b"],
    "windows-registry-key": [r"HK\w{0,2}_[A-Z_]+\\.*"],
    "x509-certificate": [r"-----BEGIN CERTIFICATE-----.+?-----END CERTIFICATE-----"],
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def filter_out(type, value: str):
|
|
47
|
+
match type:
|
|
48
|
+
case "ipv4-addr":
|
|
49
|
+
return validators.ipv4(value)
|
|
50
|
+
case "ipv6-addr":
|
|
51
|
+
return validators.ipv6(value)
|
|
52
|
+
case "email-addr":
|
|
53
|
+
return validators.email(value)
|
|
54
|
+
case "url":
|
|
55
|
+
return validators.url(value)
|
|
56
|
+
case "domain-name":
|
|
57
|
+
return validators.domain(value, consider_tld=True)
|
|
58
|
+
case "file.hashes.MD5":
|
|
59
|
+
return validators.hashes.md5(value)
|
|
60
|
+
case "file.hashes.SHA-1":
|
|
61
|
+
return validators.hashes.sha1(value)
|
|
62
|
+
case "file.hashes.SHA-256":
|
|
63
|
+
return validators.hashes.sha256(value)
|
|
64
|
+
case "file.hashes.SHA-512":
|
|
65
|
+
return validators.hashes.sha512(value)
|
|
66
|
+
case "file.hashes.SSDEEP":
|
|
67
|
+
pass
|
|
68
|
+
case "mac-addr":
|
|
69
|
+
return validators.mac_address(value)
|
|
70
|
+
case "user-account":
|
|
71
|
+
pass
|
|
72
|
+
|
|
73
|
+
case "windows-registry-key":
|
|
74
|
+
print(value)
|
|
75
|
+
ns, _, _ = value.partition("\\")
|
|
76
|
+
return ns in [
|
|
77
|
+
"HKEY_CLASSES_ROOT",
|
|
78
|
+
"HKCR",
|
|
79
|
+
"HKEY_CURRENT_USER",
|
|
80
|
+
"HKCU",
|
|
81
|
+
"HKEY_LOCAL_MACHINE",
|
|
82
|
+
"HKLM",
|
|
83
|
+
"HKEY_USERS",
|
|
84
|
+
"HKU",
|
|
85
|
+
"HKEY_CURRENT_CONFIG",
|
|
86
|
+
"HKCC",
|
|
87
|
+
"HKEY_PERFORMANCE_DATA",
|
|
88
|
+
"HKEY_DYN_DATA",
|
|
89
|
+
]
|
|
90
|
+
case _:
|
|
91
|
+
return False
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def find_stix_observables(
    detection: Any, matches: "List[tuple[str, str]] | None" = None
) -> "List[tuple[str, str]]":
    """Recursively collect (stix_type, value) observable candidates from a
    SIGMA detection structure (nested dicts, lists, and strings).

    A candidate is recorded when its value matches one of the type's
    value regexes AND passes filter_out(); for dict entries the key must
    additionally match the type's key regex.
    """
    # Mutable default avoided: a fresh accumulator per top-level call.
    if matches is None:
        matches = []

    if isinstance(detection, dict):
        for key, value in detection.items():
            for stix_type, key_pattern in STIX_PATTERNS_KEYS.items():
                value_patterns = STIX_PATTERNS_VALUES.get(stix_type, [])
                if re.search(key_pattern, key, re.IGNORECASE):
                    for pattern in value_patterns:
                        if isinstance(value, str) and re.search(
                            pattern, value, re.IGNORECASE
                        ):
                            if filter_out(stix_type, value):
                                matches.append((stix_type, value))
                    # NOTE(review): value is recursed here for every key
                    # pattern that matches AND once more below, so the same
                    # observable can be appended multiple times — confirm
                    # whether de-duplication happens downstream or whether
                    # this inner recursion should be removed.
                    find_stix_observables(value, matches)
            find_stix_observables(value, matches)
    elif isinstance(detection, list):
        for item in detection:
            find_stix_observables(item, matches)
    elif isinstance(detection, str):
        # Bare strings have no key context, so every value pattern is tried.
        for stix_type, value_patterns in STIX_PATTERNS_VALUES.items():
            for pattern in value_patterns:
                if re.search(pattern, detection, re.IGNORECASE):
                    if filter_out(stix_type, detection):
                        matches.append((stix_type, detection))
    return matches
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def to_stix_object(observable_type: str, value):
    """Build a STIX 2.1 cyber-observable from a (type, value) pair found
    by find_stix_observables.

    Returns None for types that have no case below (hostname, directory,
    user-account, x509-certificate).
    """
    match observable_type:
        # Simple value-based SCOs: the type IS the STIX object type.
        case (
            "ipv4-addr"
            | "ipv6-addr"
            | "email-addr"
            | "url"
            | "domain-name"
            | "mac-addr"
        ):
            return parse_observable(
                dict(
                    type=observable_type,
                    value=value,
                    spec_version="2.1",
                )
            )
        # Hash pseudo-types become a `file` object keyed by the hash name
        # (the part after the last "." of the pseudo-type).
        case (
            "file.hashes.MD5"
            | "file.hashes.SHA-1"
            | "file.hashes.SHA-256"
            | "file.hashes.SHA-512"
            | "file.hashes.SSDEEP"
        ):
            _, _, hash_type = observable_type.rpartition(".")
            return parse_observable(
                dict(type="file", spec_version="2.1", hashes={hash_type: value})
            )

        # Registry keys use `key` instead of `value`.
        case "windows-registry-key":
            return parse_observable(
                dict(
                    type=observable_type,
                    spec_version="2.1",
                    key=value,
                )
            )
    return None
|
txt2detection/utils.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from datetime import date, datetime
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from types import SimpleNamespace
|
|
4
|
+
import uuid
|
|
5
|
+
import requests
|
|
6
|
+
from .ai_extractor import ALL_AI_EXTRACTORS, BaseAIExtractor, ModelError
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
import enum
|
|
10
|
+
import logging
|
|
11
|
+
import requests
|
|
12
|
+
from stix2 import Identity
|
|
13
|
+
|
|
14
|
+
from .models import UUID_NAMESPACE
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DetectionLanguage(SimpleNamespace):
    """Attribute bag describing one detection-language entry — presumably
    populated from config/detection_languages.yaml; confirm at call sites."""

    pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def parse_model(value: str):
    """Instantiate an AI extractor from a "provider[:model]" string.

    Raises NotImplementedError for an unknown provider name, and
    ModelError (chained to the original error) when the provider class
    fails to initialize.
    """
    provider_key, sep, model_name = value.partition(":")
    if provider_key not in ALL_AI_EXTRACTORS:
        raise NotImplementedError(
            f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
        )
    extractor_cls = ALL_AI_EXTRACTORS[provider_key]
    try:
        # A ":" in the input selects an explicit model; otherwise the
        # provider's default model is used.
        return extractor_cls(model=model_name) if sep else extractor_cls()
    except Exception as e:
        raise ModelError(f"Unable to initialize model `{value}`") from e
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def make_identity(name, namespace=None, created_by_ref=None, object_marking_refs=None):
    """Build a deterministic STIX Identity for *name*.

    The id is a UUIDv5 of *name* under *namespace* (default
    UUID_NAMESPACE), so the same name always yields the same identity.
    Falls back to the bundler's default identity as creator and to
    TLP:CLEAR plus the package marking when no markings are given.
    """
    # Local import avoids a circular import with .bundler.
    from .bundler import Bundler

    if isinstance(namespace, str):
        namespace = uuid.UUID(namespace)
    ns = namespace or UUID_NAMESPACE

    default_markings = [
        "marking-definition--94868c89-83c2-464b-929b-a1a8aa3c8487",
        "marking-definition--a4d70b75-6f4a-5d19-9137-da863edd33d7",
    ]
    epoch = datetime(2020, 1, 1)

    return Identity(
        id="identity--" + str(uuid.uuid5(ns, f"{name}")),
        name=name,
        created_by_ref=created_by_ref or Bundler.default_identity.id,
        created=epoch,
        modified=epoch,
        object_marking_refs=object_marking_refs or default_markings,
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def validate_token_count(max_tokens, input, extractor: BaseAIExtractor):
    """Raise when *input* exceeds *max_tokens* under *extractor*'s tokenizer.

    Logs the configured limit and the measured count; returns None on
    success.
    """
    logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
    used = extractor.count_tokens(input)
    logging.info("TOKEN COUNT FOR %s: %d", extractor.extractor_name, used)
    if used > max_tokens:
        raise Exception(
            f"{extractor.extractor_name}: input_file token count ({used}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
        )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@lru_cache(maxsize=5)
def get_licenses(date):
    """Fetch the SPDX license list and return a licenseId -> name map.

    *date* is not used in the request; it only acts as a cache key so the
    list is re-fetched at most once per distinct date (see valid_licenses).
    """
    url = "https://github.com/spdx/license-list-data/raw/refs/heads/main/json/licenses.json"
    licenses = requests.get(url).json()["licenses"]
    return {entry["licenseId"]: entry["name"] for entry in licenses}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def valid_licenses():
    """Return the SPDX license map, cached per calendar day."""
    today = datetime.now().date().isoformat()
    return get_licenses(today)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def remove_rule_specific_tags(tags):
    """Drop tags in the rule-specific namespaces (attack, cve, tlp).

    Returns the surviving tags as a new list; *tags* is not mutated.
    """
    rule_namespaces = ("attack", "cve", "tlp")
    return [tag for tag in tags if tag.partition(".")[0] not in rule_namespaces]
|
|
87
|
+
|
|
88
|
+
@lru_cache()
def load_stix_object_from_url(url):
    """Download and JSON-decode a STIX object, caching per URL."""
    return requests.get(url).json()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def as_date(d: "date|datetime"):
    """Coerce *d* to a plain date; datetimes lose their time component."""
    return d.date() if isinstance(d, datetime) else d
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# NOTE(review): duplicates the member names of models.Statuses — keep the
# two in sync (or derive this list from the enum) if statuses ever change.
STATUSES = ["stable", "test", "experimental", "deprecated", "unsupported"]
|