txt2stix 1.1.12__py3-none-any.whl → 1.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/__init__.py +0 -1
- txt2stix/ai_extractor/base.py +4 -1
- txt2stix/ai_extractor/utils.py +3 -0
- txt2stix/attack_flow.py +2 -1
- txt2stix/bundler.py +1 -4
- txt2stix/txt2stix.py +277 -81
- txt2stix/utils.py +2 -2
- {txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/METADATA +1 -1
- {txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/RECORD +12 -12
- {txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/WHEEL +1 -1
- {txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/entry_points.txt +0 -0
- {txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/licenses/LICENSE +0 -0
txt2stix/__init__.py
CHANGED
txt2stix/ai_extractor/base.py
CHANGED
txt2stix/ai_extractor/utils.py
CHANGED

@@ -33,6 +33,9 @@ class RelationshipList(BaseModel):
     relationships: list[Relationship] = Field(default_factory=list)
     success: bool

+    def get(self, key, default=None):
+        return getattr(self, key, default)
+
 class DescribesIncident(BaseModel):
     describes_incident: bool = Field(description="does the <document> include malware analysis, APT group reports, data breaches and vulnerabilities?")
     explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
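
The new `get` method mirrors `dict.get` on top of attribute access, so code that receives either a plain dict (for example, relationships reloaded from a saved data file) or a live `RelationshipList` can read both the same way; this pairs with the `Txt2StixData.relationships` type widening to `dict|RelationshipList` later in this diff. A minimal sketch of the idea, using a simplified stand-in model rather than the real `Relationship` class:

```python
# Sketch only: a trimmed stand-in for RelationshipList, showing how a
# getattr-backed .get() lets dicts and models share one read path.
from pydantic import BaseModel, Field

class FakeRelationshipList(BaseModel):
    relationships: list = Field(default_factory=list)
    success: bool = True

    def get(self, key, default=None):
        # mirror dict.get() on top of attribute access
        return getattr(self, key, default)

model = FakeRelationshipList(relationships=["a", "b"])
plain = {"relationships": ["a", "b"]}
for source in (model, plain):
    # identical call regardless of whether source is a model or a dict
    assert source.get("relationships", []) == ["a", "b"]
```
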
txt2stix/attack_flow.py
CHANGED

@@ -213,6 +213,7 @@ def extract_attack_flow_and_navigator(
     ai_create_attack_flow,
     ai_create_attack_navigator_layer,
     ai_settings_relationships,
+    flow=None
 ):
     ex: BaseAIExtractor = ai_settings_relationships
     tactics = get_all_tactics()

@@ -225,7 +226,7 @@ def extract_attack_flow_and_navigator(
     ]
     logging.debug(f"parsed techniques: {json.dumps(logged_techniques, indent=4)}")

-    flow = ex.extract_attack_flow(preprocessed_text, techniques)
+    flow = flow or ex.extract_attack_flow(preprocessed_text, techniques)
     navigator = None
     if ai_create_attack_flow:
         logging.info("creating attack-flow bundle")
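
With `flow=None` in the signature, `flow = flow or ex.extract_attack_flow(...)` only calls the LLM when no precomputed flow was supplied; `processing_phase` (later in this diff) passes `flow=data.attack_flow` so a replayed run reuses the saved flow. A hedged sketch of the pattern with stand-in names:

```python
# Stand-in demonstration of the "reuse-or-compute" pattern added here.
def extract_flow_sketch(text, techniques, extract_fn, flow=None):
    # only invoke the (expensive) extractor when no flow was passed in
    flow = flow or extract_fn(text, techniques)
    return flow

fresh = extract_flow_sketch("report text", [], lambda t, ts: {"items": []})
# on replay the extractor is never called (the 1/0 would raise if it were)
replayed = extract_flow_sketch("report text", [], lambda t, ts: 1 / 0, flow=fresh)
assert replayed is fresh
```
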
txt2stix/bundler.py
CHANGED

@@ -422,10 +422,6 @@ class txt2stixBundler:
     def process_observables(self, extractions, add_standard_relationship=False):
         for ex in extractions:
             try:
-                if ex.get("id", "").startswith(
-                    "ai"
-                ): # so id is distinct across multiple AIExtractors
-                    ex["id"] = f'{ex["id"]}_{self.observables_processed}'
                 ex["id"] = ex.get("id", f"ex_{self.observables_processed}")
                 self.observables_processed += 1
                 self.add_indicator(ex, add_standard_relationship)

@@ -437,6 +433,7 @@ class txt2stixBundler:
                 ex["error"] = str(e)

     def process_relationships(self, observables):
+        print(observables)
         for relationship in observables:
             try:
                 self.add_ai_relationship(relationship)
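
The removed suffixing is no longer needed because ids are now assigned once, globally, in `run_extractors` (see `txt2stix.py` below): every extraction gets `ex-<i>` over the chained extract lists, so ids arrive at the bundler already distinct across multiple AI extractors. A small sketch of that upstream assignment:

```python
# The global id pass run_extractors now performs (extract values illustrative).
import itertools

all_extracts = {
    "pattern": [{"value": "1.1.1.1"}],
    "ai-openai": [{"value": "evil.com"}],
    "ai-gemini": [{"value": "evil.com"}],
}
for i, ex in enumerate(itertools.chain(*all_extracts.values())):
    ex["id"] = "ex-" + str(i)

# ids are unique even though two AI extractors found the same value
assert [e["id"] for e in itertools.chain(*all_extracts.values())] == ["ex-0", "ex-1", "ex-2"]
```
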
txt2stix/txt2stix.py
CHANGED

@@ -21,7 +21,7 @@ from .utils import RELATIONSHIP_TYPES, Txt2StixData, remove_links
 from .common import UUID_NAMESPACE, FatalException

 from .bundler import txt2stixBundler, parse_stix, TLP_LEVEL
-from .import extractions, lookups, pattern
+from . import extractions, lookups, pattern
 from types import SimpleNamespace
 import functools
 from fnmatch import filter
@@ -40,41 +40,51 @@ def newLogger(name: str) -> logging.Logger:
         level=logging.DEBUG, # Set the desired logging level
         format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
         handlers=[stream_handler],
-        datefmt=
+        datefmt="%d-%b-%y %H:%M:%S",
     )

     return logging.root

+
 def setLogFile(logger, file: Path):
     file.parent.mkdir(parents=True, exist_ok=True)
     logger.info(f"Saving log to `{file.absolute()}`")
     handler = logging.FileHandler(file, "w")
-    handler.formatter = logging.Formatter(
+    handler.formatter = logging.Formatter(
+        fmt="%(levelname)s %(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S"
+    )
     handler.setLevel(logging.DEBUG)
     logger.addHandler(handler)
     logger.info("=====================txt2stix======================")


 MODULE_PATH = Path(__file__).parent.parent
-INCLUDES_PATH = MODULE_PATH/"includes"
+INCLUDES_PATH = MODULE_PATH / "includes"
 try:
     from . import includes
+
     INCLUDES_PATH = Path(includes.__file__).parent
 except:
     pass

+
 def split_comma(s: str) -> list[str]:
     return [ss for ss in s.split(",") if ss]

+
 def range_type(min, max):
     def fn(astr):
         value = int(astr)
-        if min<= value <= max:
+        if min <= value <= max:
             return value
         else:
-            raise argparse.ArgumentTypeError(
+            raise argparse.ArgumentTypeError(
+                f"value {value} not in range [{min}-{max}]"
+            )
+
     return fn

+
 def parse_labels(labels: str) -> list[str]:
     labels = labels.split(",")
     for label in labels:
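
`range_type` is an argparse type factory: it returns a converter that parses the string and enforces the bounds, raising `ArgumentTypeError` otherwise. A hedged usage sketch (the `--confidence` flag is illustrative, not necessarily txt2stix's real option name):

```python
# Sketch of wiring the reformatted range_type factory into argparse.
import argparse

def range_type(min, max):
    def fn(astr):
        value = int(astr)
        if min <= value <= max:
            return value
        else:
            raise argparse.ArgumentTypeError(
                f"value {value} not in range [{min}-{max}]"
            )

    return fn

parser = argparse.ArgumentParser()
parser.add_argument("--confidence", type=range_type(0, 100))
print(parser.parse_args(["--confidence", "80"]).confidence)  # -> 80
```
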
@@ -83,39 +93,44 @@ def parse_labels(labels: str) -> list[str]:

     return labels

+
 def parse_extractors_globbed(type, all_extractors, names):
     globbed_names = set()
     for name in names.split(","):
         matches = fnmatch.filter(all_extractors.keys(), name)
         if not matches:
-            raise argparse.ArgumentTypeError(f
+            raise argparse.ArgumentTypeError(f"`{name}` has 0 matches")
         globbed_names.update(matches)
-    filtered_extractors
+    filtered_extractors = {}
    for extractor_name in globbed_names:
        try:
            extractor = all_extractors[extractor_name]
-            extraction_processor
+            extraction_processor = filtered_extractors.get(extractor.type, {})
            if extractor.type in ["lookup"]:
                lookups.load_lookup(extractor)
            if extractor.type == "pattern":
                pattern.load_extractor(extractor)
-            filtered_extractors[extractor.type] =
+            filtered_extractors[extractor.type] = extraction_processor
            extraction_processor[extractor_name] = extractor
        except BaseException as e:
            raise argparse.ArgumentTypeError(f"{type} `{extractor_name}`: {e}")
    return filtered_extractors

+
 def parse_ref(value):
-    m = re.compile(r
+    m = re.compile(r"(.+?)=(.+)").match(value)
     if not m:
         raise argparse.ArgumentTypeError("must be in format key=value")
     return dict(source_name=m.group(1), external_id=m.group(2))

+
 def parse_model(value: str):
-    splits = value.split(
+    splits = value.split(":", 1)
     provider = splits[0]
     if provider not in ALL_AI_EXTRACTORS:
-        raise argparse.ArgumentTypeError(
+        raise argparse.ArgumentTypeError(
+            f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
+        )
     provider = ALL_AI_EXTRACTORS[provider]

     try:
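
`parse_extractors_globbed` accepts shell-style globs for extractor names, expanding each comma-separated pattern with `fnmatch` before loading the matching extractors. For instance (extractor names illustrative):

```python
# How the glob expansion behaves (names are made up for illustration).
import fnmatch

all_extractors = ["pattern_ipv4", "pattern_ipv6", "lookup_mitre_attack"]
print(fnmatch.filter(all_extractors, "pattern_*"))
# -> ['pattern_ipv4', 'pattern_ipv6']
```
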
@@ -125,6 +140,7 @@ def parse_model(value: str):
     except Exception as e:
         raise ModelError(f"Unable to initialize model `{value}`") from e

+
 def parse_bool(value: str):
     value = value.lower()
     return value in ["yes", "y", "true", "1"]
@@ -135,7 +151,12 @@ def parse_args():
     all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)

     parser = argparse.ArgumentParser(description="File Conversion Tool")
-    parser.add_argument(
+    parser.add_argument(
+        "--check_credentials",
+        "--check-credentials",
+        action="store_true",
+        help="Print the validity of the credentials and exit",
+    )
     args, _ = parser.parse_known_args()
     if args.check_credentials:
         statuses = credential_checker.check_statuses(test_llms=True)
@@ -259,7 +280,6 @@ def parse_args():
         help="create attack flow for attack objects in report/bundle",
     )

-
     args = parser.parse_args()
     if not args.input_file.exists():
         raise argparse.ArgumentError(inf_arg, "cannot open file")
@@ -296,6 +316,8 @@ REQUIRED_ENV_VARIABLES = [
     "CTIBUTLER_BASE_URL",
     "VULMATCH_BASE_URL",
 ]
+
+
 def load_env():
     for env in REQUIRED_ENV_VARIABLES:
         if not os.getenv(env):
@@ -304,19 +326,34 @@ def load_env():

 def log_notes(content, type):
     logging.debug(f" ========================= {type} ========================= ")
-    logging.debug(
+    logging.debug(
+        f" ========================= {'+'*len(type)} ========================= "
+    )
     logging.debug(json.dumps(content, sort_keys=True, indent=4))
-    logging.debug(
+    logging.debug(
+        f" ========================= {'-'*len(type)} ========================= "
+    )
+

-def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
-
+def run_extractors(
+    extractors_map, text_content, ai_extractors: list[BaseAIExtractor] = [], **kwargs
+):
+    """Run extraction calls (lookup, pattern, AI) and return a dict of all extracts.

-
+    This function does NOT modify the bundler. Use `process_extracts` to
+    feed the returned extracts into a bundler (or replay saved extracts).
+    """
+    assert ai_extractors or not extractors_map.get(
+        "ai"
+    ), "There should be at least one AI extractor in ai_extractors"
+
+    text_content = "\n" + text_content + "\n"
     all_extracts = dict()
     if extractors_map.get("lookup"):
         try:
-            lookup_extracts = lookups.extract_all(
-
+            lookup_extracts = lookups.extract_all(
+                extractors_map["lookup"].values(), text_content
+            )
             all_extracts["lookup"] = lookup_extracts
         except BaseException as e:
             logging.exception("lookup extraction failed", exc_info=True)
@@ -324,95 +361,239 @@ def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
     if extractors_map.get("pattern"):
         try:
             logging.info("using pattern extractors")
-            pattern_extracts = pattern.extract_all(
-
+            pattern_extracts = pattern.extract_all(
+                extractors_map["pattern"].values(),
+                text_content,
+                ignore_extraction_boundary=kwargs.get(
+                    "ignore_extraction_boundary", False
+                ),
+            )
             all_extracts["pattern"] = pattern_extracts
         except BaseException as e:
             logging.exception("pattern extraction failed", exc_info=True)

     if extractors_map.get("ai"):
         logging.info("using ai extractors")
-
         for extractor in ai_extractors:
             logging.info("running extractor: %s", extractor.extractor_name)
             try:
-                ai_extracts = extractor.extract_objects(
-
+                ai_extracts = extractor.extract_objects(
+                    text_content, extractors_map["ai"].values()
+                )
                 all_extracts[f"ai-{extractor.extractor_name}"] = ai_extracts
             except BaseException as e:
-                logging.exception(
+                logging.exception(
+                    "AI extraction failed for %s",
+                    extractor.extractor_name,
+                    exc_info=True,
+                )
+
+    for i, ex in enumerate(itertools.chain(*all_extracts.values())):
+        ex["id"] = "ex-" + str(i)
+    return all_extracts
+
+
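`run_extractors` now returns one list of extraction dicts per extractor family, keyed `lookup`, `pattern`, and `ai-<extractor_name>`, with the globally unique `ex-<i>` ids assigned at the end. Because nothing here touches the bundler, the whole dict is plain data and safe to persist; a hedged sketch of the shape (field names illustrative):

```python
# Illustrative shape of the dict run_extractors returns; being plain JSON
# data, it can be saved and replayed later via process_extracts.
import json

all_extracts = {
    "lookup": [{"id": "ex-0", "type": "lookup_mitre_attack", "value": "T1059"}],
    "pattern": [{"id": "ex-1", "type": "pattern_ipv4", "value": "8.8.8.8"}],
    "ai-openai": [{"id": "ex-2", "type": "ai_domain", "value": "example.com"}],
}
print(json.dumps(all_extracts, indent=2))
```
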
+def process_extracts(bundler: txt2stixBundler, all_extracts: dict):
+    """Process a previously-created `all_extracts` dict into the given bundler.
+
+    This allows replaying saved extracts without invoking extractors again.
+    """
+    for key, extracts in (all_extracts or {}).items():
+        try:
+            bundler.process_observables(extracts)
+        except BaseException:
+            logging.exception("processing extracts failed for %s", key, exc_info=True)

     log_notes(all_extracts, "Extractions")
-    return all_extracts

-
+
+def extract_relationships(
+    text_content, all_extracts, ai_extractor_session: BaseAIExtractor
+):
     relationships = None
     try:
-
-
-
+        # flatten extracts into a single list
+        flattened = list(itertools.chain(*all_extracts.values()))
+        rel = ai_extractor_session.extract_relationships(
+            text_content, flattened, RELATIONSHIP_TYPES
+        )
+        relationships = rel.model_dump()
         log_notes(relationships, "Relationships")
-        bundler.process_relationships(relationships['relationships'])
     except BaseException as e:
-        logging.exception("Relationship
+        logging.exception("Relationship extraction failed: %s", e)
     return relationships

+
 def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
-    logging.info(
+    logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
     for extractor in extractors:
         token_count = _count_token(extractor, input)
-        logging.info(
-
-
+        logging.info(
+            f"{extractor.extractor_name}: input_file token count = {token_count}"
+        )
+        if token_count > max_tokens:
+            raise FatalException(
+                f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
+            )


 @functools.lru_cache
 def _count_token(extractor: BaseAIExtractor, input: str):
     return extractor.count_tokens(input)

-
-
-
-
-
-
-
-
-
-
-
-
-
+
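`process_extracts` is the replay half of the split: it feeds a saved extraction dict into a bundler family by family, isolating failures, with no extractor or LLM calls. A hedged sketch with a stub standing in for `txt2stixBundler`:

```python
# Replay sketch; StubBundler stands in for txt2stix.bundler.txt2stixBundler.
import logging

class StubBundler:
    def process_observables(self, extracts):
        for ex in extracts:
            print("bundling", ex["id"])

def process_extracts(bundler, all_extracts):
    # mirrors the new helper: one family failing doesn't sink the rest
    for key, extracts in (all_extracts or {}).items():
        try:
            bundler.process_observables(extracts)
        except Exception:
            logging.exception("processing extracts failed for %s", key)

saved = {"pattern": [{"id": "ex-0", "value": "8.8.8.8"}]}
process_extracts(StubBundler(), saved)  # -> bundling ex-0
```
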
+def run_txt2stix(
+    bundler: txt2stixBundler,
+    preprocessed_text: str,
+    extractors_map: dict,
+    ai_content_check_provider=None,
+    ai_create_attack_flow=None,
+    ai_create_attack_navigator_layer=None,
+    input_token_limit=10,
+    ai_settings_extractions=None,
+    ai_settings_relationships=None,
+    relationship_mode="standard",
+    ignore_extraction_boundary=False,
+    ai_extract_if_no_incidence=True,  # continue even if ai_content_check fails
+    txt2stix_data: Txt2StixData = None,
+    **kwargs,
+) -> Txt2StixData:
+    # First, perform extraction-phase (LLM and extractor calls). This does not
+    # modify the provided bundler so the results can be saved and replayed.
+    # skip extraction phase if txt2stix_data is passed
+    txt2stix_data = txt2stix_data or extraction_phase(
+        preprocessed_text,
+        extractors_map,
+        ai_content_check_provider=ai_content_check_provider,
+        input_token_limit=input_token_limit,
+        ai_settings_extractions=ai_settings_extractions,
+        ai_settings_relationships=ai_settings_relationships,
+        relationship_mode=relationship_mode,
+        ignore_extraction_boundary=ignore_extraction_boundary,
+        ai_extract_if_no_incidence=ai_extract_if_no_incidence,
+    )
+
+    # Then, process the extracted data into the bundler (no LLM calls).
+    processing_phase(
+        bundler,
+        preprocessed_text,
+        txt2stix_data,
+        ai_create_attack_flow=ai_create_attack_flow,
+        ai_create_attack_navigator_layer=ai_create_attack_navigator_layer,
+        ai_settings_relationships=ai_settings_relationships,
+        ai_content_check_provider=ai_content_check_provider,
+    )
+    return txt2stix_data
+
+
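`run_txt2stix` is now a thin wrapper over the two phases, and the new `txt2stix_data` argument short-circuits `extraction_phase` entirely, which is what enables replaying a previous run. A self-contained sketch of the control flow (the phase names are real; the bodies are stand-ins):

```python
# Stand-in bodies; only the control flow mirrors the new run_txt2stix.
def extraction_phase(text):
    print("expensive extractor/LLM calls")
    return {"extractions": {"pattern": [{"id": "ex-0", "value": text}]}}

def processing_phase(bundler, data):
    bundler.append(data["extractions"])  # no LLM calls here

def run_txt2stix(bundler, text, txt2stix_data=None):
    # skip the expensive phase entirely when saved data is supplied
    txt2stix_data = txt2stix_data or extraction_phase(text)
    processing_phase(bundler, txt2stix_data)
    return txt2stix_data

first, second = [], []
data = run_txt2stix(first, "8.8.8.8")                # runs both phases
run_txt2stix(second, "8.8.8.8", txt2stix_data=data)  # replay: processing only
```
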
+def extraction_phase(
+    preprocessed_text: str,
+    extractors_map: dict,
+    ai_content_check_provider=None,
+    input_token_limit=10,
+    ai_settings_extractions=None,
+    ai_settings_relationships=None,
+    relationship_mode="standard",
+    ignore_extraction_boundary=False,
+    ai_extract_if_no_incidence=True,
+    **kwargs,
+) -> Txt2StixData:
+    """Perform token validation and run extractors/AI models. Does NOT modify a bundler."""
     should_extract = True
-
-
+    txt2stix_data = Txt2StixData.model_construct()
+    txt2stix_data.extractions = txt2stix_data.attack_flow = (
+        txt2stix_data.relationships
+    ) = None
+
     if ai_content_check_provider:
         logging.info("checking content")
-        model
+        model: BaseAIExtractor = ai_content_check_provider
         validate_token_count(input_token_limit, preprocessed_text, [model])
-
-        should_extract =
+        txt2stix_data.content_check = model.check_content(preprocessed_text)
+        should_extract = txt2stix_data.content_check.describes_incident
         logging.info("=== ai-check-content output ====")
-        logging.info(
-        bundler.report.external_references.append(dict(source_name='txt2stix_describes_incident', description=str(should_extract).lower(), external_id=model.extractor_name))
-        for classification in retval.content_check.incident_classification:
-            bundler.report.labels.append(f'classification.{classification}'.lower())
-        bundler.add_summary(retval.content_check.summary, model.extractor_name)
+        logging.info(txt2stix_data.content_check.model_dump_json())

     if should_extract or ai_extract_if_no_incidence:
         if extractors_map.get("ai"):
-            validate_token_count(
+            validate_token_count(
+                input_token_limit, preprocessed_text, ai_settings_extractions
+            )
         if relationship_mode == "ai":
-            validate_token_count(
+            validate_token_count(
+                input_token_limit, preprocessed_text, [ai_settings_relationships]
+            )
+
+        txt2stix_data.extractions = run_extractors(
+            extractors_map,
+            preprocessed_text,
+            ai_extractors=ai_settings_extractions,
+            ignore_extraction_boundary=ignore_extraction_boundary,
+        )

-
-
-
-
-
-
-
+    if (
+        relationship_mode == "ai"
+        and txt2stix_data.extractions
+        and sum(map(lambda x: len(x), txt2stix_data.extractions.values()))
+    ):
+        txt2stix_data.relationships = extract_relationships(
+            preprocessed_text, txt2stix_data.extractions, ai_settings_relationships
+        )
+    return txt2stix_data
+
+
+def processing_phase(
+    bundler: txt2stixBundler,
+    preprocessed_text: str,
+    data: Txt2StixData,
+    ai_create_attack_flow=False,
+    ai_create_attack_navigator_layer=False,
+    ai_settings_relationships=None,
+    ai_content_check_provider=None,
+):
+    """Process extracted `data` into the given `bundler` without invoking LLMs."""
+    try:
+        if data.content_check:
+            cc = data.content_check
+            provider_name = str(ai_content_check_provider)
+            bundler.report.external_references.append(
+                dict(
+                    source_name="txt2stix_describes_incident",
+                    description=str(cc.describes_incident).lower(),
+                    external_id=provider_name,
+                )
+            )
+            for classification in cc.incident_classification:
+                bundler.report.labels.append(f"classification.{classification}".lower())
+            bundler.add_summary(cc.summary, provider_name)
+    except BaseException:
+        logging.exception("applying content_check to bundler failed", exc_info=True)
+
+    # process extracts into bundler
+    process_extracts(bundler, data.extractions)
+
+    # process relationships into bundler
+    try:
+        if data.relationships:
+            bundler.process_relationships(data.relationships.get("relationships", []))
+    except BaseException:
+        logging.exception("processing relationships failed", exc_info=True)

+    # generate attack flow / navigator layer now that bundler has been populated
+    try:
+        if ai_create_attack_flow or ai_create_attack_navigator_layer:
+            data.attack_flow, data.navigator_layer = (
+                attack_flow.extract_attack_flow_and_navigator(
+                    bundler,
+                    preprocessed_text,
+                    ai_create_attack_flow,
+                    ai_create_attack_navigator_layer,
+                    ai_settings_relationships,
+                    flow=data.attack_flow,
+                )
+            )
+    except BaseException:
+        logging.exception("attack flow / navigator generation failed", exc_info=True)


 def main():
@@ -424,35 +605,50 @@ def main():
         setLogFile(logger, Path(f"logs/logs-{job_id}.log"))
         logger.info(f"Arguments: {json.dumps(sys.argv[1:])}")

-
         input_text = args.input_file.read_text()
-        preprocessed_text = remove_links(
+        preprocessed_text = remove_links(
+            input_text, args.ignore_image_refs, args.ignore_link_refs
+        )
         load_env()

-
-
+        bundler = txt2stixBundler(
+            args.name,
+            args.use_identity,
+            args.tlp_level,
+            input_text,
+            args.confidence,
+            args.all_extractors,
+            args.labels,
+            created=args.created,
+            report_id=args.report_id,
+            external_references=args.external_refs,
+        )
         log_notes(sys.argv, "Config")

         data = run_txt2stix(
-            bundler,
-
+            bundler,
+            preprocessed_text,
+            args.use_extractions,
+            input_token_limit=int(os.environ["INPUT_TOKEN_LIMIT"]),
            **args.__dict__,
        )

        ## write outputs
        out = bundler.to_json()
-        output_dir = Path("./output")/str(bundler.uuid)
+        output_dir = Path("./output") / str(bundler.uuid)
        with contextlib.suppress(BaseException):
            shutil.rmtree(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
-        output_path = output_dir/f"{bundler.bundle.id}.json"
+        output_path = output_dir / f"{bundler.bundle.id}.json"
        output_path.write_text(out)
        logger.info(f"Wrote bundle output to `{output_path}`")
-        data_path = output_dir/f"data--{bundler.uuid}.json"
+        data_path = output_dir / f"data--{bundler.uuid}.json"
        data_path.write_text(data.model_dump_json(indent=4))
        logger.info(f"Wrote data output to `{data_path}`")
        for nav_layer in data.navigator_layer or []:
-            nav_path =
+            nav_path = (
+                output_dir / f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+            )
            nav_path.write_text(json.dumps(nav_layer, indent=4))
            logger.info(f"Wrote navigator output to `{nav_path}`")
    except argparse.ArgumentError as e:
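
`main()` now writes the full `Txt2StixData` beside the bundle as `data--<uuid>.json`, which is the artifact the replay path above consumes. A hedged round-trip sketch with a trimmed stand-in model:

```python
# Round-trip sketch; MiniData stands in for txt2stix.utils.Txt2StixData.
from pathlib import Path
from pydantic import BaseModel, Field

class MiniData(BaseModel):
    extractions: dict = Field(default=None)
    relationships: dict = Field(default_factory=dict)

data = MiniData(extractions={"pattern": [{"id": "ex-0"}]})
path = Path("data--example.json")
path.write_text(data.model_dump_json(indent=4))  # what main() now does

restored = MiniData.model_validate_json(path.read_text())
assert restored == data  # ready for run_txt2stix(..., txt2stix_data=restored)
```
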
txt2stix/utils.py
CHANGED

@@ -8,7 +8,7 @@ import mistune
 from mistune.renderers.markdown import MarkdownRenderer
 from mistune.util import unescape

-from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident
+from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, RelationshipList
 class ImageLinkRemover(MarkdownRenderer):
     def __init__(self, remove_links: bool=False, remove_images: bool=False):
         self.remove_links = remove_links

@@ -49,7 +49,7 @@ class ImageLinkRemover(MarkdownRenderer):
 class Txt2StixData(BaseModel):
     content_check: DescribesIncident = Field(default=None)
     extractions: dict = Field(default=None)
-    relationships:
+    relationships: dict|RelationshipList = Field(default_factory=dict)
     attack_flow: AttackFlowList = Field(default=None)
     navigator_layer: list = Field(default=None)

{txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: txt2stix
-Version: 1.1.12
+Version: 1.1.13
 Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
 Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
 Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
{txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
-txt2stix/__init__.py,sha256=
-txt2stix/attack_flow.py,sha256=
-txt2stix/bundler.py,sha256=
+txt2stix/__init__.py,sha256=kHCnJtzi37ivXx2STT5zT7-cUL16i86o7ywtSd3iXd4,769
+txt2stix/attack_flow.py,sha256=VAsgNKZvPa-llUsGvbv0tYNc2Kif5pNeMoxH88-6CWc,9060
+txt2stix/bundler.py,sha256=GmpWW9ek4iFZdEIyjVSpd9RnmyeNsZJOpnax5Tt0uT0,16748
 txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
 txt2stix/credential_checker.py,sha256=eWDP-jY3-jm8zI0JMoUcyoQZ_JqPNfCIr_HAO8nVYz0,3044
 txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171

@@ -8,17 +8,17 @@ txt2stix/indicator.py,sha256=dyf4wbvVrZRitZpm6t7UusSM98bVW1qc5UkdGpVm3ls,30025
 txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
 txt2stix/retriever.py,sha256=sMNhnEYk3l5W44qZsWaDQtJYoHXA1oYIPM6wDqiUHSg,6642
 txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
-txt2stix/txt2stix.py,sha256=
-txt2stix/utils.py,sha256=
+txt2stix/txt2stix.py,sha256=eUL0pynQXruJRDvqs-LQ-dspDITx5tFDnTPEgCRQApk,23348
+txt2stix/utils.py,sha256=Le0VYx8n8UNpcjqwpx7Avb06qIS9_hId8yP8_PquBUs,3333
 txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
 txt2stix/ai_extractor/anthropic.py,sha256=B5Z3nm2-w5KBhLcVJGkhNF0dn4lUo-fW_DnbOeJKA5Q,481
-txt2stix/ai_extractor/base.py,sha256=
+txt2stix/ai_extractor/base.py,sha256=I_UwX4mOAVa8HrjSkI3KqKKImIBtQ29RdprDOu2NK6A,4235
 txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
 txt2stix/ai_extractor/gemini.py,sha256=rhhYrCa1zZTjadVk2QFhguD8_Yr03gl-D4Yb2nVBMI4,633
 txt2stix/ai_extractor/openai.py,sha256=1RxaLy0TJ4GjNKmcJoi6ZiBrCS_gt5ql9jpeE-SOy8g,642
 txt2stix/ai_extractor/openrouter.py,sha256=hAA6mTOMcpA28XYsOCvuJH7WMJqXCxfqZGJf_VrDsIk,628
 txt2stix/ai_extractor/prompts.py,sha256=NtqtVyPPtShPlVZ5SrFmo-LCkfpANIIi4H9rjqaxqDo,10559
-txt2stix/ai_extractor/utils.py,sha256=
+txt2stix/ai_extractor/utils.py,sha256=7iB2qm-oUSFaYidsNi74EACwLV5skCcecCw3F9eIJx4,4507
 txt2stix/pattern/__init__.py,sha256=K9ofaP2AOikvzb48VSBpJZijckdqufZxSzr_kbRypLY,491
 txt2stix/pattern/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 txt2stix/pattern/extractors/base_extractor.py,sha256=ly80rp-L40g7DbhrGiCvhPWI95-ZFMtAQUEC-fH6Y-o,6130

@@ -114,8 +114,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
 txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
 txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
 txt2stix/includes/tests/test_cases.yaml,sha256=vErA3c5fySeWvJ5yJ8dCTEo3ufRATASAjaF4gj4Az1M,22424
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
+txt2stix-1.1.13.dist-info/METADATA,sha256=4vgSOfXJOiJrJ9-WkodqXtOdtAYgDKjXY7xLEYCvRAg,15032
+txt2stix-1.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+txt2stix-1.1.13.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
+txt2stix-1.1.13.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
+txt2stix-1.1.13.dist-info/RECORD,,

{txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/entry_points.txt
File without changes

{txt2stix-1.1.12.dist-info → txt2stix-1.1.13.dist-info}/licenses/LICENSE
File without changes