txt2detection 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of txt2detection might be problematic. Click here for more details.

@@ -0,0 +1,427 @@
1
+ import enum
2
+ import json
3
+ import re
4
+ import typing
5
+ import uuid
6
+ import requests
7
+ from slugify import slugify
8
+ from datetime import date as dt_date
9
+ from typing import Any, ClassVar, List, Literal, Optional, Union
10
+ from uuid import UUID
11
+ from stix2extensions import DataSource
12
+
13
+ import jsonschema
14
+ from pydantic import BaseModel, Field, computed_field, field_validator
15
+ from pydantic_core import PydanticCustomError, core_schema
16
+ import yaml
17
+
18
+ from stix2 import (
19
+ MarkingDefinition,
20
+ )
21
+
22
+
23
+ if typing.TYPE_CHECKING:
24
+ from txt2detection.bundler import Bundler
25
+
26
# Namespace used for deterministic UUIDv5 generation across the package.
UUID_NAMESPACE = uuid.UUID("a4d70b75-6f4a-5d19-9137-da863edd33d7")

# SIGMA tag format: `namespace.value` — lowercase letters, digits, `._-` only.
TAG_PATTERN = re.compile(r"^[a-z0-9_-]+\.[a-z0-9._-]+$")

# Maps MITRE ATT&CK tactic slugs (as used in `attack.*` SIGMA tags) to
# their tactic IDs; technique ids in tags are passed through upper-cased
# by BaseDetection.mitre_attack_ids.
MITRE_TACTIC_MAP = {
    "initial-access": "TA0001",
    "execution": "TA0002",
    "persistence": "TA0003",
    "privilege-escalation": "TA0004",
    "defense-evasion": "TA0005",
    "credential-access": "TA0006",
    "discovery": "TA0007",
    "lateral-movement": "TA0008",
    "collection": "TA0009",
    "exfiltration": "TA0010",
    "command-and-control": "TA0011",
    "impact": "TA0040",
}
44
+
45
+
46
+ class TLP_LEVEL(enum.Enum):
47
+ CLEAR = MarkingDefinition(
48
+ spec_version="2.1",
49
+ id="marking-definition--94868c89-83c2-464b-929b-a1a8aa3c8487",
50
+ created="2022-10-01T00:00:00.000Z",
51
+ definition_type="TLP:CLEAR",
52
+ extensions={
53
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
54
+ "extension_type": "property-extension",
55
+ "tlp_2_0": "clear",
56
+ }
57
+ },
58
+ )
59
+ GREEN = MarkingDefinition(
60
+ spec_version="2.1",
61
+ id="marking-definition--bab4a63c-aed9-4cf5-a766-dfca5abac2bb",
62
+ created="2022-10-01T00:00:00.000Z",
63
+ definition_type="TLP:GREEN",
64
+ extensions={
65
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
66
+ "extension_type": "property-extension",
67
+ "tlp_2_0": "green",
68
+ }
69
+ },
70
+ )
71
+ AMBER = MarkingDefinition(
72
+ spec_version="2.1",
73
+ id="marking-definition--55d920b0-5e8b-4f79-9ee9-91f868d9b421",
74
+ created="2022-10-01T00:00:00.000Z",
75
+ definition_type="TLP:AMBER",
76
+ extensions={
77
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
78
+ "extension_type": "property-extension",
79
+ "tlp_2_0": "amber",
80
+ }
81
+ },
82
+ )
83
+ AMBER_STRICT = MarkingDefinition(
84
+ spec_version="2.1",
85
+ id="marking-definition--939a9414-2ddd-4d32-a0cd-375ea402b003",
86
+ created="2022-10-01T00:00:00.000Z",
87
+ definition_type="TLP:AMBER+STRICT",
88
+ extensions={
89
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
90
+ "extension_type": "property-extension",
91
+ "tlp_2_0": "amber+strict",
92
+ }
93
+ },
94
+ )
95
+ RED = MarkingDefinition(
96
+ spec_version="2.1",
97
+ id="marking-definition--e828b379-4e03-4974-9ac4-e53a884c97c1",
98
+ created="2022-10-01T00:00:00.000Z",
99
+ definition_type="TLP:RED",
100
+ extensions={
101
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
102
+ "extension_type": "property-extension",
103
+ "tlp_2_0": "red",
104
+ }
105
+ },
106
+ )
107
+
108
+ @classmethod
109
+ def levels(cls):
110
+ return dict(
111
+ clear=cls.CLEAR,
112
+ green=cls.GREEN,
113
+ amber=cls.AMBER,
114
+ amber_strict=cls.AMBER_STRICT,
115
+ red=cls.RED,
116
+ )
117
+
118
+ @classmethod
119
+ def values(cls):
120
+ return [
121
+ cls.CLEAR.value,
122
+ cls.GREEN.value,
123
+ cls.AMBER.value,
124
+ cls.AMBER_STRICT.value,
125
+ cls.RED.value,
126
+ ]
127
+
128
+ @classmethod
129
+ def get(cls, level: "str|TLP_LEVEL"):
130
+ if isinstance(level, cls):
131
+ return level
132
+ level = level.lower()
133
+ level = level.replace("+", "_").replace("-", "_")
134
+ if level not in cls.levels():
135
+ raise Exception(f"unsupported tlp level: `{level}`")
136
+ return cls.levels()[level]
137
+
138
+ @property
139
+ def name(self):
140
+ return super().name.lower()
141
+
142
+
143
+ class Statuses(enum.StrEnum):
144
+ stable = enum.auto()
145
+ test = enum.auto()
146
+ experimental = enum.auto()
147
+ deprecated = enum.auto()
148
+ unsupported = enum.auto()
149
+
150
+
151
+ class Level(enum.StrEnum):
152
+ informational = enum.auto()
153
+ low = enum.auto()
154
+ medium = enum.auto()
155
+ high = enum.auto()
156
+ critical = enum.auto()
157
+
158
+
159
class SigmaTag(str):
    """A `str` subclass validated as a SIGMA tag of the form `namespace.value`.

    The two `__get_pydantic_*` hooks integrate the type with pydantic v2 so
    the format is enforced wherever SigmaTag is used as a field type.
    """

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        _source: type[Any],
        _handler,
    ) -> core_schema.CoreSchema:
        # Validate as a plain string first, then apply the tag-format check.
        return core_schema.no_info_after_validator_function(
            cls._validate, core_schema.str_schema()
        )

    @classmethod
    def __get_pydantic_json_schema__(cls, core_schema: core_schema.CoreSchema, handler):
        # Surface the tag pattern in the generated JSON schema for tooling.
        field_schema = handler(core_schema)
        field_schema.update(
            type="string", pattern=TAG_PATTERN.pattern, format="sigma-tag"
        )
        return field_schema

    @classmethod
    def _validate(cls, input_value: str, /) -> str:
        # Reject anything not matching TAG_PATTERN (`namespace.value`).
        if not TAG_PATTERN.match(input_value):
            raise PydanticCustomError(
                "value_error",
                "value is not a valid SIGMA tag: {reason}",
                {
                    "reason": f"Must be in format namespace.value and match pattern {TAG_PATTERN.pattern}"
                },
            )
        return input_value
189
+
190
+
191
class RelatedRule(BaseModel):
    """A SIGMA `related` entry linking this rule to another rule by UUID."""

    # UUID of the related rule.
    id: UUID
    # Relationship kind (values from the SIGMA `related.type` field).
    type: Literal["derived", "obsolete", "merged", "renamed", "similar"]
194
+
195
+
196
class BaseDetection(BaseModel):
    """Common base for SIGMA detection rules produced by this package.

    Holds the core SIGMA fields plus helpers to render a YAML rule, validate
    it against the published SIGMA JSON schema, and derive STIX-related
    metadata (ATT&CK ids, CVE ids, external references, a DataSource).
    """

    title: str
    description: str
    detection: dict
    logsource: dict
    status: Statuses = Statuses.experimental
    falsepositives: list[str]
    tags: list[str]
    level: Level
    # Overrides the rule id when set via the `detection_id` setter.
    _custom_id = None
    # Scratch data, initialized in model_post_init.
    _extra_data: dict
    # Cached SIGMA detection-rule JSON schema. Fetched lazily on first
    # validation: previously this was fetched with requests.get at
    # class-definition time, which made importing the module require
    # network access (and fail entirely when offline).
    sigma_json_schema: ClassVar = None
    SIGMA_SCHEMA_URL: ClassVar[str] = (
        "https://github.com/SigmaHQ/sigma-specification/raw/refs/heads/main/json-schema/sigma-detection-rule-schema.json"
    )

    @classmethod
    def _get_sigma_json_schema(cls):
        """Fetch and cache the SIGMA detection-rule JSON schema (module-wide)."""
        if BaseDetection.sigma_json_schema is None:
            BaseDetection.sigma_json_schema = requests.get(
                cls.SIGMA_SCHEMA_URL
            ).json()
        return BaseDetection.sigma_json_schema

    def model_post_init(self, __context):
        # `tags` may arrive as None from upstream; normalize to a list.
        self.tags = self.tags or []
        self._extra_data = dict()
        return super().model_post_init(__context)

    @property
    def detection_id(self):
        """Rule id: explicit custom id, else the model's `id`, else a fresh UUID."""
        return str(self._custom_id or getattr(self, "id", None) or uuid.uuid4())

    @detection_id.setter
    def detection_id(self, custom_id):
        # Accept either a bare UUID or a STIX id of the form `type--uuid`.
        self._custom_id = custom_id.split("--")[-1]

    @property
    def tlp_level(self):
        """TLP_LEVEL derived from any `tlp.*` tag, or None."""
        return tlp_from_tags(self.tags)

    @tlp_level.setter
    def tlp_level(self, level):
        set_tlp_level_in_tags(self.tags, level)

    def set_labels(self, labels):
        """Append report labels to this rule's tags."""
        self.tags.extend(labels)

    def set_extra_data_from_bundler(self, bundler: "Bundler"):
        # Subclasses (SigmaRuleDetection) provide the real implementation.
        raise NotImplementedError("this class should no longer be in use")

    def make_rule(self, bundler: "Bundler"):
        """Render this detection as a SIGMA YAML rule string.

        Raises jsonschema.ValidationError when the rendered rule does not
        conform to the SIGMA JSON schema.
        """
        self.set_extra_data_from_bundler(bundler)
        # De-duplicate tags while preserving their order.
        self.tags = list(dict.fromkeys(self.tags))

        rule = dict(
            id=self.detection_id,
            # pydantic v2 expects a set (IncEx) for `exclude`.
            **self.model_dump(
                exclude={"indicator_types", "id"}, mode="json", by_alias=True
            ),
        )
        # Drop empty/falsy fields so they do not clutter the YAML output.
        for k, v in list(rule.items()):
            if not v:
                rule.pop(k, None)

        self.validate_rule_with_json_schema(rule)
        # date/modified are injected after validation — presumably because
        # they are date objects rather than JSON types; confirm if the schema
        # should instead validate their ISO forms.
        if getattr(self, "date", 0):
            rule.update(date=self.date)
        if getattr(self, "modified", 0):
            rule.update(modified=self.modified)
        return yaml.dump(rule, sort_keys=False, indent=4)

    def validate_rule_with_json_schema(self, rule):
        """Raise jsonschema.ValidationError if `rule` is not a valid SIGMA rule."""
        jsonschema.validate(
            rule,
            self._get_sigma_json_schema(),
        )

    @property
    def external_references(self):
        """STIX external_references entries for level/status/license, when set."""
        refs = []
        for attr in ["level", "status", "license"]:
            if attr_val := getattr(self, attr, None):
                refs.append(dict(source_name=f"sigma-{attr}", description=attr_val))
        return refs

    @property
    def mitre_attack_ids(self):
        """ATT&CK ids from `attack.*` tags (tactic slugs mapped, techniques upper-cased)."""
        retval = []
        for i, label in enumerate(self.tags):
            label = label.replace("_", "-").lower()
            namespace, _, label_id = label.partition(".")
            if namespace == "attack":
                retval.append(MITRE_TACTIC_MAP.get(label_id, label_id.upper()))
        return retval

    @property
    def cve_ids(self):
        """CVE ids from `cve.*` tags, e.g. `cve.2021.44228` -> `CVE-2021.44228`."""
        retval = []
        for label in self.tags:
            namespace, _, label_id = label.partition(".")
            if namespace == "cve":
                retval.append(namespace.upper() + "-" + label_id)
        return retval

    def make_data_source(self):
        """Build a stix2extensions DataSource mirroring this rule's logsource."""
        return DataSource(
            category=self.logsource.get("category"),
            product=self.logsource.get("product"),
            service=self.logsource.get("service"),
            definition=self.logsource.get("definition"),
        )
299
+
300
+
301
class AIDetection(BaseDetection):
    """A detection as emitted by the AI extractor, before SIGMA validation."""

    indicator_types: list[str] = Field(default_factory=list)

    def to_sigma_rule_detection(self, bundler):
        """Convert this AI output into a validated SigmaRuleDetection.

        Raises ValueError (wrapping the pydantic error and the offending
        payload) when validation fails.
        """
        rule_dict = self.model_dump(exclude=["indicator_types"])
        rule_dict.update(
            date=bundler.report.created.date(),
            modified=bundler.report.modified.date(),
            id=uuid.uuid4(),
        )
        try:
            return SigmaRuleDetection.model_validate(rule_dict)
        except Exception as e:
            raise ValueError(
                dict(message="validate ai output failed", error=e, content=rule_dict)
            )
319
+
320
+
321
class SigmaRuleDetection(BaseDetection):
    """A full SIGMA rule with the complete set of optional metadata fields.

    Extends BaseDetection with id/related handling, date/modified
    normalization, and a single-TLP-tag constraint.
    """

    title: str
    id: Optional[UUID] = None
    related: Optional[list[RelatedRule]] = None
    name: Optional[str] = None
    taxonomy: Optional[str] = None
    status: Optional[Statuses] = None
    description: Optional[str] = None
    license: Optional[str] = None
    author: Optional[str] = None
    references: Optional[List[str]] = Field(default_factory=list)
    date: Optional["dt_date"] = Field(alias="date", default=None)
    modified: Optional["dt_date"] = None
    logsource: dict
    detection: dict
    fields: Optional[List[str]] = None
    falsepositives: Optional[List[str]] = None
    level: Optional[Level] = None
    tags: Optional[List[SigmaTag]] = Field(default_factory=list)
    scope: Optional[List[str]] = None
    # Backing store for the `indicator_types` property (pydantic private attr).
    _indicator_types: list = None

    @property
    def detection_id(self):
        # Overrides BaseDetection.detection_id: here the rule id is the
        # pydantic `id` field rather than a private custom id.
        return str(self.id)

    @property
    def indicator_types(self):
        return self._indicator_types

    @indicator_types.setter
    def indicator_types(self, types):
        self._indicator_types = types

    @detection_id.setter
    def detection_id(self, new_id):
        # Assigning a different id records the old one in `related` with
        # type "renamed" before replacing it.
        if self.id and str(self.id) != str(new_id):
            self.related = self.related or []
            self.related.append(RelatedRule(id=self.id, type="renamed"))
        self.id = new_id

    @field_validator("tags", mode="after")
    @classmethod
    def validate_tlp(cls, tags: list[str]):
        """Reject rules carrying more than one `tlp.*` tag."""
        tlps = []
        for tag in tags:
            if tag.startswith("tlp."):
                tlps.append(tag)
        if len(tlps) > 1:
            raise ValueError(
                f"tag must not contain more than one tag in tlp namespace. Got {tlps}"
            )
        return tags

    @field_validator("modified", mode="after")
    @classmethod
    def validate_modified(cls, modified, info):
        # When `modified` equals `date`, drop it (store None) so the rule
        # does not carry a redundant modification date.
        if info.data.get("date") == modified:
            return None
        return modified

    def set_extra_data_from_bundler(self, bundler: "Bundler"):
        """Copy report-level metadata (date, labels, TLP, author, license,
        references) from the bundler onto this rule. No-op when bundler is falsy."""
        if not bundler:
            return

        if not self.date:
            # Imported here to avoid a circular import with .utils at module load.
            from .utils import as_date

            self.date = as_date(bundler.created)

        self.set_labels(bundler.labels)
        self.tlp_level = bundler.tlp_level.name
        self.author = bundler.report.created_by_ref
        self.license = bundler.license
        self.references = bundler.reference_urls
396
+
397
+
398
class DetectionContainer(BaseModel):
    """A batch of detections plus an overall success flag."""

    success: bool
    # NOTE(review): pydantic resolves Union members in order, so plain
    # BaseDetection may win over the subclasses — confirm this is intended.
    detections: list[Union[BaseDetection, AIDetection, SigmaRuleDetection]]
401
+
402
+
403
class DataContainer(BaseModel):
    """Aggregated output of a txt2detection run."""

    detections: DetectionContainer
    # NOTE(review): annotated as dict/list but defaulting to None — these are
    # effectively optional; confirm whether Optional[...] was intended.
    navigator_layer: dict = Field(default=None)
    observables: list[dict] = Field(default=None)
    # CVE id -> description/name mapping (keys/values populated elsewhere).
    cves: dict[str, str] = Field(default_factory=dict)
    # ATT&CK id -> description/name mapping (keys/values populated elsewhere).
    attacks: dict[str, str] = Field(default_factory=dict)
409
+
410
+
411
def tlp_from_tags(tags: list[SigmaTag]):
    """Return the TLP_LEVEL for the first `tlp.*` tag in `tags`, else None.

    NOTE(review): TLP_LEVEL.get raises for unknown level names, so a tag like
    `tlp.bogus` propagates an Exception rather than returning None — confirm.
    """
    for tag in tags:
        namespace, _, level_name = tag.partition(".")
        if namespace == "tlp":
            resolved = TLP_LEVEL.get(level_name.replace("-", "_"))
            if resolved:
                return resolved
    return None
419
+
420
+
421
+ def set_tlp_level_in_tags(tags: list[SigmaTag], level):
422
+ level = str(level)
423
+ for i, tag in enumerate(tags):
424
+ if tag.startswith("tlp."):
425
+ tags.remove(tag)
426
+ tags.append("tlp." + level.replace("_", "-"))
427
+ return tags
@@ -0,0 +1,161 @@
1
+ import re, validators
2
+ from typing import Any, Dict, List
3
+ from stix2 import parse as parse_stix, parse_observable
4
+
5
# Mapping of key regex patterns to STIX observable types: when a SIGMA
# detection dict key matches one of these, its value is scanned with the
# corresponding value patterns below.
STIX_PATTERNS_KEYS = {
    "ipv4-addr": r"(?i)\b(ip|ipv4)\b",
    "ipv6-addr": r"(?i)\bipv6\b",
    "email-addr": r"(?i)\bemail\b",
    "url": r"(?i)\b(url|uri)\b",
    "directory": r"(?i)\b(directory|path)\b",
    "domain-name": r"(?i)\bdomain\b",
    "hostname": r"(?i)\bhost\b",
    "file.hashes.MD5": r"(?i)\bmd5\b",
    "file.hashes.SHA-1": r"(?i)\bsha1\b",
    "file.hashes.SHA-256": r"(?i)\bsha256\b",
    "file.hashes.SHA-512": r"(?i)\bsha512\b",
    "file.hashes.SSDEEP": r"(?i)\bssdeep\b",
    "mac-addr": r"(?i)\bmac\b",
    "user-account": r"(?i)\buser\b",
    "windows-registry-key": r"(?i)\bregistry\b",
    "x509-certificate": r"(?i)\bx509\b",
}

# Mapping of value regex patterns to STIX observable types. Each entry is a
# list of candidate patterns; matches are then confirmed by filter_out()
# before being accepted as observables.
STIX_PATTERNS_VALUES = {
    "ipv4-addr": [r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.|$)){4}\b"],
    "ipv6-addr": [r"\b(?:[A-F0-9]{1,4}:){7}[A-F0-9]{1,4}\b"],
    "email-addr": [r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"],
    "url": [r"\bhttps?://[^\s/$.?#].[^\x00\s]*\b"],
    "directory": [r"(?:[A-Za-z]:)?(?:\\\\[^\\\\:*?\"<>|\r\n]+)+\\\\?"],
    "domain-name": [r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b"],
    "hostname": [r"\b[a-zA-Z0-9-]{1,63}(\.[a-zA-Z0-9-]{1,63})*\b"],
    "file.hashes.MD5": [r"\b[a-fA-F0-9]{32}\b"],
    "file.hashes.SHA-1": [r"\b[a-fA-F0-9]{40}\b"],
    "file.hashes.SHA-256": [r"\b[a-fA-F0-9]{64}\b"],
    "file.hashes.SHA-512": [r"\b[a-fA-F0-9]{128}\b"],
    "file.hashes.SSDEEP": [r"\b\d{1,}:[A-Za-z0-9/+]{10,}:[A-Za-z0-9/+]{10,}\b"],
    "mac-addr": [r"\b([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})\b"],
    "user-account": [r"\b[A-Za-z0-9._%-]{3,}\\\\?[A-Za-z0-9._%-]{3,}\b"],
    "windows-registry-key": [r"HK\w{0,2}_[A-Z_]+\\.*"],
    "x509-certificate": [r"-----BEGIN CERTIFICATE-----.+?-----END CERTIFICATE-----"],
}
44
+
45
+
46
+ def filter_out(type, value: str):
47
+ match type:
48
+ case "ipv4-addr":
49
+ return validators.ipv4(value)
50
+ case "ipv6-addr":
51
+ return validators.ipv6(value)
52
+ case "email-addr":
53
+ return validators.email(value)
54
+ case "url":
55
+ return validators.url(value)
56
+ case "domain-name":
57
+ return validators.domain(value, consider_tld=True)
58
+ case "file.hashes.MD5":
59
+ return validators.hashes.md5(value)
60
+ case "file.hashes.SHA-1":
61
+ return validators.hashes.sha1(value)
62
+ case "file.hashes.SHA-256":
63
+ return validators.hashes.sha256(value)
64
+ case "file.hashes.SHA-512":
65
+ return validators.hashes.sha512(value)
66
+ case "file.hashes.SSDEEP":
67
+ pass
68
+ case "mac-addr":
69
+ return validators.mac_address(value)
70
+ case "user-account":
71
+ pass
72
+
73
+ case "windows-registry-key":
74
+ print(value)
75
+ ns, _, _ = value.partition("\\")
76
+ return ns in [
77
+ "HKEY_CLASSES_ROOT",
78
+ "HKCR",
79
+ "HKEY_CURRENT_USER",
80
+ "HKCU",
81
+ "HKEY_LOCAL_MACHINE",
82
+ "HKLM",
83
+ "HKEY_USERS",
84
+ "HKU",
85
+ "HKEY_CURRENT_CONFIG",
86
+ "HKCC",
87
+ "HKEY_PERFORMANCE_DATA",
88
+ "HKEY_DYN_DATA",
89
+ ]
90
+ case _:
91
+ return False
92
+ return False
93
+
94
+
95
def find_stix_observables(detection: Any, matches: List[tuple] = None) -> List[tuple]:
    """Recursively scan a SIGMA detection structure for STIX observables.

    Walks dicts (matching keys against STIX_PATTERNS_KEYS, string values
    against STIX_PATTERNS_VALUES, confirmed by filter_out), lists, and bare
    strings. Returns a list of (stix_type, value) tuples — the original
    annotation said List[str] but tuples are what is appended.
    """
    if matches is None:
        matches = []

    if isinstance(detection, dict):
        for key, value in detection.items():
            for stix_type, key_pattern in STIX_PATTERNS_KEYS.items():
                if not re.search(key_pattern, key, re.IGNORECASE):
                    continue
                if not isinstance(value, str):
                    continue
                for pattern in STIX_PATTERNS_VALUES.get(stix_type, []):
                    if re.search(pattern, value, re.IGNORECASE) and filter_out(
                        stix_type, value
                    ):
                        matches.append((stix_type, value))
            # Recurse into each value exactly once. The original recursed both
            # inside the key-pattern loop and again per key, traversing nested
            # structures multiple times and duplicating matches.
            find_stix_observables(value, matches)
    elif isinstance(detection, list):
        for item in detection:
            find_stix_observables(item, matches)
    elif isinstance(detection, str):
        # A bare string is checked against every value pattern.
        for stix_type, value_patterns in STIX_PATTERNS_VALUES.items():
            for pattern in value_patterns:
                if re.search(pattern, detection, re.IGNORECASE):
                    if filter_out(stix_type, detection):
                        matches.append((stix_type, detection))
    return matches
122
+
123
+
124
def to_stix_object(observable_type: str, value):
    """Build a STIX 2.1 observable (SCO) for the given type/value pair.

    Value-based types map directly; `file.hashes.*` types become a `file`
    object with a single hash; registry keys use the `key` property.
    Returns None for unsupported types.
    """
    value_based = {
        "ipv4-addr",
        "ipv6-addr",
        "email-addr",
        "url",
        "domain-name",
        "mac-addr",
    }
    hash_based = {
        "file.hashes.MD5",
        "file.hashes.SHA-1",
        "file.hashes.SHA-256",
        "file.hashes.SHA-512",
        "file.hashes.SSDEEP",
    }
    if observable_type in value_based:
        return parse_observable(
            dict(
                type=observable_type,
                value=value,
                spec_version="2.1",
            )
        )
    if observable_type in hash_based:
        _, _, hash_type = observable_type.rpartition(".")
        return parse_observable(
            dict(type="file", spec_version="2.1", hashes={hash_type: value})
        )
    if observable_type == "windows-registry-key":
        return parse_observable(
            dict(
                type=observable_type,
                spec_version="2.1",
                key=value,
            )
        )
    return None
txt2detection/utils.py ADDED
@@ -0,0 +1,100 @@
1
+ from datetime import date, datetime
2
+ from functools import lru_cache
3
+ from types import SimpleNamespace
4
+ import uuid
5
+ import requests
6
+ from .ai_extractor import ALL_AI_EXTRACTORS, BaseAIExtractor, ModelError
7
+ import logging
8
+
9
+ import enum
10
+ import logging
11
+ import requests
12
+ from stix2 import Identity
13
+
14
+ from .models import UUID_NAMESPACE
15
+
16
+
17
class DetectionLanguage(SimpleNamespace):
    """Attribute bag describing a detection language; adds no behavior."""
    pass
19
+
20
+
21
def parse_model(value: str):
    """Instantiate an AI extractor from a `provider[:model]` string.

    Raises NotImplementedError for unknown providers and ModelError (chained
    to the underlying cause) when the extractor fails to initialize.
    """
    provider_name, sep, model_name = value.partition(":")
    if provider_name not in ALL_AI_EXTRACTORS:
        raise NotImplementedError(
            f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
        )
    extractor_cls = ALL_AI_EXTRACTORS[provider_name]
    try:
        if sep:
            return extractor_cls(model=model_name)
        return extractor_cls()
    except Exception as e:
        raise ModelError(f"Unable to initialize model `{value}`") from e
35
+
36
+
37
def make_identity(name, namespace=None, created_by_ref=None, object_marking_refs=None):
    """Build a deterministic stix2 Identity for `name`.

    The id is a UUIDv5 of `name` under `namespace` (default UUID_NAMESPACE),
    so the same name always yields the same identity id. Timestamps are
    fixed so the object content stays stable across runs.
    """
    # Imported here to avoid a circular import at module load.
    from .bundler import Bundler

    if isinstance(namespace, str):
        namespace = uuid.UUID(namespace)

    identity_namespace = namespace or UUID_NAMESPACE
    markings = object_marking_refs or [
        "marking-definition--94868c89-83c2-464b-929b-a1a8aa3c8487",
        "marking-definition--a4d70b75-6f4a-5d19-9137-da863edd33d7",
    ]
    fixed_timestamp = datetime(2020, 1, 1)
    return Identity(
        id="identity--" + str(uuid.uuid5(identity_namespace, f"{name}")),
        name=name,
        created_by_ref=created_by_ref or Bundler.default_identity.id,
        created=fixed_timestamp,
        modified=fixed_timestamp,
        object_marking_refs=markings,
    )
55
+
56
+
57
def validate_token_count(max_tokens, input, extractor: BaseAIExtractor):
    """Raise when `input` exceeds `max_tokens` per the extractor's tokenizer.

    Logs both the configured limit and the measured count; returns None on
    success.
    """
    logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
    token_count = extractor.count_tokens(input)
    logging.info("TOKEN COUNT FOR %s: %d", extractor.extractor_name, token_count)
    if token_count <= max_tokens:
        return
    raise Exception(
        f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
    )
65
+
66
+
67
@lru_cache(maxsize=5)
def get_licenses(date):
    """Fetch the SPDX license list and return a licenseId -> name mapping.

    `date` is used only as a cache key (callers pass today's ISO date via
    valid_licenses()), so the list is re-fetched at most once per distinct
    day; the HTTP response itself does not depend on it.
    """
    resp = requests.get(
        "https://github.com/spdx/license-list-data/raw/refs/heads/main/json/licenses.json"
    )
    return {l["licenseId"]: l["name"] for l in resp.json()["licenses"]}
73
+
74
+
75
def valid_licenses():
    """Return the SPDX license map, cached per calendar day."""
    return get_licenses(date.today().isoformat())
77
+
78
+
79
def remove_rule_specific_tags(tags):
    """Return `tags` minus entries in the attack/cve/tlp namespaces.

    Those namespaces carry rule-specific metadata handled elsewhere; the
    remaining tags are treated as free-form labels. Order is preserved.
    """
    reserved_namespaces = ("attack", "cve", "tlp")
    return [
        tag for tag in tags if tag.partition(".")[0] not in reserved_namespaces
    ]
87
+
88
@lru_cache()
def load_stix_object_from_url(url):
    """Fetch and cache the JSON object at `url`.

    NOTE(review): the cache is unbounded and non-2xx responses are not
    checked (`raise_for_status`) — confirm callers only pass a small set of
    trusted, stable URLs.
    """
    resp = requests.get(url)
    return resp.json()
92
+
93
+
94
def as_date(d: "date|datetime"):
    """Coerce `d` to a date: a datetime yields its date part, a date passes through."""
    return d.date() if isinstance(d, datetime) else d
98
+
99
+
100
+ STATUSES = ["stable", "test", "experimental", "deprecated", "unsupported"]