txt2stix 1.1.11__py3-none-any.whl → 1.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/__init__.py +0 -1
- txt2stix/ai_extractor/anthropic.py +2 -2
- txt2stix/ai_extractor/base.py +4 -1
- txt2stix/ai_extractor/gemini.py +3 -3
- txt2stix/ai_extractor/utils.py +3 -0
- txt2stix/attack_flow.py +2 -1
- txt2stix/bundler.py +1 -4
- txt2stix/txt2stix.py +277 -80
- txt2stix/utils.py +2 -2
- {txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/METADATA +12 -7
- {txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/RECORD +14 -14
- {txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/WHEEL +1 -1
- {txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/entry_points.txt +0 -0
- {txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/licenses/LICENSE +0 -0
txt2stix/__init__.py
CHANGED
txt2stix/ai_extractor/anthropic.py
CHANGED
@@ -6,7 +6,7 @@ from llama_index.llms.anthropic import Anthropic
 
 
 class AnthropicAIExtractor(BaseAIExtractor, provider="anthropic"):
-    def __init__(self, **kwargs) -> None:
+    def __init__(self, model='claude-sonnet-4-0', **kwargs) -> None:
         kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
-        self.llm = Anthropic(max_tokens=4096, system_prompt=self.system_prompt, **kwargs)
+        self.llm = Anthropic(max_tokens=4096, model=model, system_prompt=self.system_prompt, **kwargs)
         super().__init__()
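Note: the Anthropic extractor now pins a default model (claude-sonnet-4-0) instead of relying on the wrapper's own default, while still accepting an explicit override. A minimal usage sketch (the import path follows this package's layout; the override value is illustrative):

    from txt2stix.ai_extractor.anthropic import AnthropicAIExtractor

    extractor = AnthropicAIExtractor()                         # uses model='claude-sonnet-4-0'
    extractor = AnthropicAIExtractor(model='claude-opus-4-0')  # explicit override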
txt2stix/ai_extractor/base.py
CHANGED
txt2stix/ai_extractor/gemini.py
CHANGED
@@ -1,17 +1,17 @@
 
 import os
 from txt2stix.ai_extractor.base import BaseAIExtractor
-from llama_index.llms.
+from llama_index.llms.google_genai import GoogleGenAI
 
 
 class GeminiAIExtractor(BaseAIExtractor, provider="gemini"):
     def __init__(self, **kwargs) -> None:
         kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
-        self.llm = 
+        self.llm = GoogleGenAI(max_tokens=4096, **kwargs)
         super().__init__()
 
     def count_tokens(self, text):
-        return self.llm.
+        return self.llm._client.models.count_tokens(model=self.llm.model, contents=text).total_tokens
 
     @property
     def extractor_name(self):
txt2stix/ai_extractor/utils.py
CHANGED
@@ -33,6 +33,9 @@ class RelationshipList(BaseModel):
     relationships: list[Relationship] = Field(default_factory=list)
     success: bool
 
+    def get(self, key, default=None):
+        return getattr(self, key, default)
+
 class DescribesIncident(BaseModel):
     describes_incident: bool = Field(description="does the <document> include malware analysis, APT group reports, data breaches and vulnerabilities?")
     explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
txt2stix/attack_flow.py
CHANGED
@@ -213,6 +213,7 @@ def extract_attack_flow_and_navigator(
     ai_create_attack_flow,
     ai_create_attack_navigator_layer,
     ai_settings_relationships,
+    flow=None
 ):
     ex: BaseAIExtractor = ai_settings_relationships
     tactics = get_all_tactics()
@@ -225,7 +226,7 @@ def extract_attack_flow_and_navigator(
     ]
     logging.debug(f"parsed techniques: {json.dumps(logged_techniques, indent=4)}")
 
-    flow = ex.extract_attack_flow(preprocessed_text, techniques)
+    flow = flow or ex.extract_attack_flow(preprocessed_text, techniques)
     navigator = None
     if ai_create_attack_flow:
         logging.info("creating attack-flow bundle")
txt2stix/bundler.py
CHANGED
@@ -422,10 +422,6 @@ class txt2stixBundler:
     def process_observables(self, extractions, add_standard_relationship=False):
         for ex in extractions:
             try:
-                if ex.get("id", "").startswith(
-                    "ai"
-                ):  # so id is distinct across multiple AIExtractors
-                    ex["id"] = f'{ex["id"]}_{self.observables_processed}'
                 ex["id"] = ex.get("id", f"ex_{self.observables_processed}")
                 self.observables_processed += 1
                 self.add_indicator(ex, add_standard_relationship)
@@ -437,6 +433,7 @@ class txt2stixBundler:
                 ex["error"] = str(e)
 
     def process_relationships(self, observables):
+        print(observables)
         for relationship in observables:
             try:
                 self.add_ai_relationship(relationship)
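Note: the removed id-prefixing block is redundant now that ids are assigned centrally; run_extractors (later in this diff) numbers every extract across all extractors before anything reaches the bundler:

    for i, ex in enumerate(itertools.chain(*all_extracts.values())):
        ex["id"] = "ex-" + str(i)   # unique across lookup, pattern and every AI extractor

so process_observables only keeps the ex_<n> fallback for extracts that arrive with no id at all. The print(observables) added to process_relationships appears to be leftover debug output that ships in this release.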
txt2stix/txt2stix.py
CHANGED
@@ -21,7 +21,7 @@ from .utils import RELATIONSHIP_TYPES, Txt2StixData, remove_links
 from .common import UUID_NAMESPACE, FatalException
 
 from .bundler import txt2stixBundler, parse_stix, TLP_LEVEL
-from .import extractions, lookups, pattern
+from . import extractions, lookups, pattern
 from types import SimpleNamespace
 import functools
 from fnmatch import filter
@@ -40,41 +40,51 @@ def newLogger(name: str) -> logging.Logger:
         level=logging.DEBUG,  # Set the desired logging level
         format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
         handlers=[stream_handler],
-        datefmt=
+        datefmt="%d-%b-%y %H:%M:%S",
     )
 
     return logging.root
 
+
 def setLogFile(logger, file: Path):
     file.parent.mkdir(parents=True, exist_ok=True)
     logger.info(f"Saving log to `{file.absolute()}`")
     handler = logging.FileHandler(file, "w")
-    handler.formatter = logging.Formatter(
+    handler.formatter = logging.Formatter(
+        fmt="%(levelname)s %(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S"
+    )
     handler.setLevel(logging.DEBUG)
     logger.addHandler(handler)
     logger.info("=====================txt2stix======================")
 
 
 MODULE_PATH = Path(__file__).parent.parent
-INCLUDES_PATH = MODULE_PATH/"includes"
+INCLUDES_PATH = MODULE_PATH / "includes"
 try:
     from . import includes
+
     INCLUDES_PATH = Path(includes.__file__).parent
 except:
     pass
 
+
 def split_comma(s: str) -> list[str]:
     return [ss for ss in s.split(",") if ss]
 
+
 def range_type(min, max):
     def fn(astr):
         value = int(astr)
-        if min<= value <= max:
+        if min <= value <= max:
             return value
         else:
-            raise argparse.ArgumentTypeError(
+            raise argparse.ArgumentTypeError(
+                f"value {value} not in range [{min}-{max}]"
+            )
+
     return fn
 
+
 def parse_labels(labels: str) -> list[str]:
     labels = labels.split(",")
     for label in labels:
@@ -83,39 +93,44 @@ def parse_labels(labels: str) -> list[str]:
 
     return labels
 
+
 def parse_extractors_globbed(type, all_extractors, names):
     globbed_names = set()
     for name in names.split(","):
         matches = fnmatch.filter(all_extractors.keys(), name)
         if not matches:
-            raise argparse.ArgumentTypeError(f
+            raise argparse.ArgumentTypeError(f"`{name}` has 0 matches")
         globbed_names.update(matches)
-    filtered_extractors
+    filtered_extractors = {}
     for extractor_name in globbed_names:
         try:
             extractor = all_extractors[extractor_name]
-            extraction_processor
+            extraction_processor = filtered_extractors.get(extractor.type, {})
             if extractor.type in ["lookup"]:
                 lookups.load_lookup(extractor)
             if extractor.type == "pattern":
                 pattern.load_extractor(extractor)
-            filtered_extractors[extractor.type] =
+            filtered_extractors[extractor.type] = extraction_processor
             extraction_processor[extractor_name] = extractor
         except BaseException as e:
             raise argparse.ArgumentTypeError(f"{type} `{extractor_name}`: {e}")
     return filtered_extractors
 
+
 def parse_ref(value):
-    m = re.compile(r
+    m = re.compile(r"(.+?)=(.+)").match(value)
     if not m:
         raise argparse.ArgumentTypeError("must be in format key=value")
     return dict(source_name=m.group(1), external_id=m.group(2))
 
+
 def parse_model(value: str):
-    splits = value.split(
+    splits = value.split(":", 1)
     provider = splits[0]
     if provider not in ALL_AI_EXTRACTORS:
-        raise argparse.ArgumentTypeError(
+        raise argparse.ArgumentTypeError(
+            f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
+        )
     provider = ALL_AI_EXTRACTORS[provider]
 
     try:
@@ -125,6 +140,7 @@ def parse_model(value: str):
     except Exception as e:
         raise ModelError(f"Unable to initialize model `{value}`") from e
 
+
 def parse_bool(value: str):
     value = value.lower()
     return value in ["yes", "y", "true", "1"]
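Note: parse_model now splits on the first colon only (maxsplit=1), so model ids that themselves contain colons survive intact; only the provider prefix is validated against ALL_AI_EXTRACTORS. A plain-Python illustration of the split (the OpenRouter-style id is illustrative):

    "openrouter:deepseek/deepseek-chat:free".split(":", 1)
    # -> ['openrouter', 'deepseek/deepseek-chat:free']  (the model id keeps its own colon)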
@@ -135,7 +151,12 @@ def parse_args():
     all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)
 
     parser = argparse.ArgumentParser(description="File Conversion Tool")
-    parser.add_argument(
+    parser.add_argument(
+        "--check_credentials",
+        "--check-credentials",
+        action="store_true",
+        help="Print the validity of the credentials and exit",
+    )
     args, _ = parser.parse_known_args()
     if args.check_credentials:
         statuses = credential_checker.check_statuses(test_llms=True)
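Note: the flag is registered under both --check_credentials and --check-credentials, and because it is read via parse_known_args() before the remaining arguments are defined, an invocation like `txt2stix --check-credentials` (assuming the console script installed from entry_points.txt is named txt2stix) can print credential and LLM status and exit without the otherwise-required input arguments.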
@@ -259,7 +280,6 @@ def parse_args():
         help="create attack flow for attack objects in report/bundle",
     )
 
-
     args = parser.parse_args()
     if not args.input_file.exists():
         raise argparse.ArgumentError(inf_arg, "cannot open file")
@@ -296,6 +316,8 @@ REQUIRED_ENV_VARIABLES = [
     "CTIBUTLER_BASE_URL",
     "VULMATCH_BASE_URL",
 ]
+
+
 def load_env():
     for env in REQUIRED_ENV_VARIABLES:
         if not os.getenv(env):
@@ -304,19 +326,34 @@ def load_env():
 
 def log_notes(content, type):
     logging.debug(f" ========================= {type} ========================= ")
-    logging.debug(
+    logging.debug(
+        f" ========================= {'+'*len(type)} ========================= "
+    )
     logging.debug(json.dumps(content, sort_keys=True, indent=4))
-    logging.debug(
+    logging.debug(
+        f" ========================= {'-'*len(type)} ========================= "
+    )
+
 
-def
-
+def run_extractors(
+    extractors_map, text_content, ai_extractors: list[BaseAIExtractor] = [], **kwargs
+):
+    """Run extraction calls (lookup, pattern, AI) and return a dict of all extracts.
 
-
+    This function does NOT modify the bundler. Use `process_extracts` to
+    feed the returned extracts into a bundler (or replay saved extracts).
+    """
+    assert ai_extractors or not extractors_map.get(
+        "ai"
+    ), "There should be at least one AI extractor in ai_extractors"
+
+    text_content = "\n" + text_content + "\n"
     all_extracts = dict()
     if extractors_map.get("lookup"):
         try:
-            lookup_extracts = lookups.extract_all(
-
+            lookup_extracts = lookups.extract_all(
+                extractors_map["lookup"].values(), text_content
+            )
             all_extracts["lookup"] = lookup_extracts
         except BaseException as e:
             logging.exception("lookup extraction failed", exc_info=True)
@@ -324,94 +361,239 @@ def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
     if extractors_map.get("pattern"):
         try:
             logging.info("using pattern extractors")
-            pattern_extracts = pattern.extract_all(
-
+            pattern_extracts = pattern.extract_all(
+                extractors_map["pattern"].values(),
+                text_content,
+                ignore_extraction_boundary=kwargs.get(
+                    "ignore_extraction_boundary", False
+                ),
+            )
             all_extracts["pattern"] = pattern_extracts
         except BaseException as e:
             logging.exception("pattern extraction failed", exc_info=True)
 
     if extractors_map.get("ai"):
         logging.info("using ai extractors")
-
         for extractor in ai_extractors:
             logging.info("running extractor: %s", extractor.extractor_name)
             try:
-                ai_extracts = extractor.extract_objects(
-
+                ai_extracts = extractor.extract_objects(
+                    text_content, extractors_map["ai"].values()
+                )
                 all_extracts[f"ai-{extractor.extractor_name}"] = ai_extracts
             except BaseException as e:
-                logging.exception(
+                logging.exception(
+                    "AI extraction failed for %s",
+                    extractor.extractor_name,
+                    exc_info=True,
+                )
+
+    for i, ex in enumerate(itertools.chain(*all_extracts.values())):
+        ex["id"] = "ex-" + str(i)
+    return all_extracts
+
+
+def process_extracts(bundler: txt2stixBundler, all_extracts: dict):
+    """Process a previously-created `all_extracts` dict into the given bundler.
+
+    This allows replaying saved extracts without invoking extractors again.
+    """
+    for key, extracts in (all_extracts or {}).items():
+        try:
+            bundler.process_observables(extracts)
+        except BaseException:
+            logging.exception("processing extracts failed for %s", key, exc_info=True)
 
     log_notes(all_extracts, "Extractions")
-    return all_extracts
 
-
+
+def extract_relationships(
+    text_content, all_extracts, ai_extractor_session: BaseAIExtractor
+):
     relationships = None
     try:
-
-
-
+        # flatten extracts into a single list
+        flattened = list(itertools.chain(*all_extracts.values()))
+        rel = ai_extractor_session.extract_relationships(
+            text_content, flattened, RELATIONSHIP_TYPES
+        )
+        relationships = rel.model_dump()
         log_notes(relationships, "Relationships")
-        bundler.process_relationships(relationships['relationships'])
     except BaseException as e:
-        logging.exception("Relationship
+        logging.exception("Relationship extraction failed: %s", e)
     return relationships
 
+
 def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
-    logging.info(
+    logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
     for extractor in extractors:
         token_count = _count_token(extractor, input)
-
-
+        logging.info(
+            f"{extractor.extractor_name}: input_file token count = {token_count}"
+        )
+        if token_count > max_tokens:
+            raise FatalException(
+                f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
+            )
 
 
 @functools.lru_cache
 def _count_token(extractor: BaseAIExtractor, input: str):
     return extractor.count_tokens(input)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+def run_txt2stix(
+    bundler: txt2stixBundler,
+    preprocessed_text: str,
+    extractors_map: dict,
+    ai_content_check_provider=None,
+    ai_create_attack_flow=None,
+    ai_create_attack_navigator_layer=None,
+    input_token_limit=10,
+    ai_settings_extractions=None,
+    ai_settings_relationships=None,
+    relationship_mode="standard",
+    ignore_extraction_boundary=False,
+    ai_extract_if_no_incidence=True,  # continue even if ai_content_check fails
+    txt2stix_data: Txt2StixData = None,
+    **kwargs,
+) -> Txt2StixData:
+    # First, perform extraction-phase (LLM and extractor calls). This does not
+    # modify the provided bundler so the results can be saved and replayed.
+    # skip extraction phase if txt2stix_data is passed
+    txt2stix_data = txt2stix_data or extraction_phase(
+        preprocessed_text,
+        extractors_map,
+        ai_content_check_provider=ai_content_check_provider,
+        input_token_limit=input_token_limit,
+        ai_settings_extractions=ai_settings_extractions,
+        ai_settings_relationships=ai_settings_relationships,
+        relationship_mode=relationship_mode,
+        ignore_extraction_boundary=ignore_extraction_boundary,
+        ai_extract_if_no_incidence=ai_extract_if_no_incidence,
+    )
+
+    # Then, process the extracted data into the bundler (no LLM calls).
+    processing_phase(
+        bundler,
+        preprocessed_text,
+        txt2stix_data,
+        ai_create_attack_flow=ai_create_attack_flow,
+        ai_create_attack_navigator_layer=ai_create_attack_navigator_layer,
+        ai_settings_relationships=ai_settings_relationships,
+        ai_content_check_provider=ai_content_check_provider,
+    )
+    return txt2stix_data
+
+
+def extraction_phase(
+    preprocessed_text: str,
+    extractors_map: dict,
+    ai_content_check_provider=None,
+    input_token_limit=10,
+    ai_settings_extractions=None,
+    ai_settings_relationships=None,
+    relationship_mode="standard",
+    ignore_extraction_boundary=False,
+    ai_extract_if_no_incidence=True,
+    **kwargs,
+) -> Txt2StixData:
+    """Perform token validation and run extractors/AI models. Does NOT modify a bundler."""
     should_extract = True
-
-
+    txt2stix_data = Txt2StixData.model_construct()
+    txt2stix_data.extractions = txt2stix_data.attack_flow = (
+        txt2stix_data.relationships
+    ) = None
+
     if ai_content_check_provider:
         logging.info("checking content")
-        model
+        model: BaseAIExtractor = ai_content_check_provider
         validate_token_count(input_token_limit, preprocessed_text, [model])
-
-        should_extract =
+        txt2stix_data.content_check = model.check_content(preprocessed_text)
+        should_extract = txt2stix_data.content_check.describes_incident
         logging.info("=== ai-check-content output ====")
-        logging.info(
-        bundler.report.external_references.append(dict(source_name='txt2stix_describes_incident', description=str(should_extract).lower(), external_id=model.extractor_name))
-        for classification in retval.content_check.incident_classification:
-            bundler.report.labels.append(f'classification.{classification}'.lower())
-        bundler.add_summary(retval.content_check.summary, model.extractor_name)
+        logging.info(txt2stix_data.content_check.model_dump_json())
 
     if should_extract or ai_extract_if_no_incidence:
         if extractors_map.get("ai"):
-            validate_token_count(
+            validate_token_count(
+                input_token_limit, preprocessed_text, ai_settings_extractions
+            )
         if relationship_mode == "ai":
-            validate_token_count(
+            validate_token_count(
+                input_token_limit, preprocessed_text, [ai_settings_relationships]
+            )
+
+        txt2stix_data.extractions = run_extractors(
+            extractors_map,
+            preprocessed_text,
+            ai_extractors=ai_settings_extractions,
+            ignore_extraction_boundary=ignore_extraction_boundary,
+        )
 
-
-
-
-
-
-
-
+        if (
+            relationship_mode == "ai"
+            and txt2stix_data.extractions
+            and sum(map(lambda x: len(x), txt2stix_data.extractions.values()))
+        ):
+            txt2stix_data.relationships = extract_relationships(
+                preprocessed_text, txt2stix_data.extractions, ai_settings_relationships
+            )
+    return txt2stix_data
+
+
+def processing_phase(
+    bundler: txt2stixBundler,
+    preprocessed_text: str,
+    data: Txt2StixData,
+    ai_create_attack_flow=False,
+    ai_create_attack_navigator_layer=False,
+    ai_settings_relationships=None,
+    ai_content_check_provider=None,
+):
+    """Process extracted `data` into the given `bundler` without invoking LLMs."""
+    try:
+        if data.content_check:
+            cc = data.content_check
+            provider_name = str(ai_content_check_provider)
+            bundler.report.external_references.append(
+                dict(
+                    source_name="txt2stix_describes_incident",
+                    description=str(cc.describes_incident).lower(),
+                    external_id=provider_name,
+                )
+            )
+            for classification in cc.incident_classification:
+                bundler.report.labels.append(f"classification.{classification}".lower())
+            bundler.add_summary(cc.summary, provider_name)
+    except BaseException:
+        logging.exception("applying content_check to bundler failed", exc_info=True)
+
+    # process extracts into bundler
+    process_extracts(bundler, data.extractions)
+
+    # process relationships into bundler
+    try:
+        if data.relationships:
+            bundler.process_relationships(data.relationships.get("relationships", []))
+    except BaseException:
+        logging.exception("processing relationships failed", exc_info=True)
 
+    # generate attack flow / navigator layer now that bundler has been populated
+    try:
+        if ai_create_attack_flow or ai_create_attack_navigator_layer:
+            data.attack_flow, data.navigator_layer = (
+                attack_flow.extract_attack_flow_and_navigator(
+                    bundler,
+                    preprocessed_text,
+                    ai_create_attack_flow,
+                    ai_create_attack_navigator_layer,
+                    ai_settings_relationships,
+                    flow=data.attack_flow,
+                )
+            )
+    except BaseException:
+        logging.exception("attack flow / navigator generation failed", exc_info=True)
 
 
 def main():
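Note: this is the heart of the 1.1.13 refactor. The old extract_all(bundler, ...) is split into an extraction phase (extraction_phase / run_extractors / extract_relationships: all LLM and extractor calls, no bundler mutation) and a processing phase (process_extracts / processing_phase: bundler mutation only, no LLM calls), with run_txt2stix chaining the two and accepting a pre-built txt2stix_data to skip extraction entirely. A hedged sketch of the save-and-replay workflow this enables (bundler, extractors_map, extractor and preprocessed_text assumed set up as in main(); the token limit is illustrative):

    # first run: pay for the LLM calls once, persist the result
    data = extraction_phase(
        preprocessed_text,
        extractors_map,
        ai_settings_extractions=[extractor],
        ai_settings_relationships=extractor,
        relationship_mode="ai",
        input_token_limit=50_000,
    )
    Path("data.json").write_text(data.model_dump_json())

    # later: rebuild a bundle from the saved data, with no LLM calls
    saved = Txt2StixData.model_validate_json(Path("data.json").read_text())
    run_txt2stix(
        bundler,
        preprocessed_text,
        extractors_map,
        txt2stix_data=saved,   # extraction_phase is skipped
        ai_settings_relationships=extractor,
    )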
@@ -423,35 +605,50 @@ def main():
         setLogFile(logger, Path(f"logs/logs-{job_id}.log"))
         logger.info(f"Arguments: {json.dumps(sys.argv[1:])}")
 
-
         input_text = args.input_file.read_text()
-        preprocessed_text = remove_links(
+        preprocessed_text = remove_links(
+            input_text, args.ignore_image_refs, args.ignore_link_refs
+        )
         load_env()
 
-
-
+        bundler = txt2stixBundler(
+            args.name,
+            args.use_identity,
+            args.tlp_level,
+            input_text,
+            args.confidence,
+            args.all_extractors,
+            args.labels,
+            created=args.created,
+            report_id=args.report_id,
+            external_references=args.external_refs,
+        )
         log_notes(sys.argv, "Config")
 
         data = run_txt2stix(
-            bundler,
-
+            bundler,
+            preprocessed_text,
+            args.use_extractions,
+            input_token_limit=int(os.environ["INPUT_TOKEN_LIMIT"]),
            **args.__dict__,
         )
 
         ## write outputs
         out = bundler.to_json()
-        output_dir = Path("./output")/str(bundler.uuid)
+        output_dir = Path("./output") / str(bundler.uuid)
        with contextlib.suppress(BaseException):
            shutil.rmtree(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
-        output_path = output_dir/f"{bundler.bundle.id}.json"
+        output_path = output_dir / f"{bundler.bundle.id}.json"
        output_path.write_text(out)
        logger.info(f"Wrote bundle output to `{output_path}`")
-        data_path = output_dir/f"data--{bundler.uuid}.json"
+        data_path = output_dir / f"data--{bundler.uuid}.json"
        data_path.write_text(data.model_dump_json(indent=4))
        logger.info(f"Wrote data output to `{data_path}`")
        for nav_layer in data.navigator_layer or []:
-            nav_path =
+            nav_path = (
+                output_dir / f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+            )
            nav_path.write_text(json.dumps(nav_layer, indent=4))
            logger.info(f"Wrote navigator output to `{nav_path}`")
    except argparse.ArgumentError as e:
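Note: main() now writes three artifact kinds under output/<report uuid>/: the STIX bundle (<bundle id>.json), the full Txt2StixData dump (data--<uuid>.json, the same shape the txt2stix_data replay parameter accepts), and one navigator-<domain>----<uuid>.json file per ATT&CK Navigator layer.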
txt2stix/utils.py
CHANGED
@@ -8,7 +8,7 @@ import mistune
 from mistune.renderers.markdown import MarkdownRenderer
 from mistune.util import unescape
 
-from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident
+from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, RelationshipList
 class ImageLinkRemover(MarkdownRenderer):
     def __init__(self, remove_links: bool=False, remove_images: bool=False):
         self.remove_links = remove_links
@@ -49,7 +49,7 @@ class ImageLinkRemover(MarkdownRenderer):
 class Txt2StixData(BaseModel):
     content_check: DescribesIncident = Field(default=None)
     extractions: dict = Field(default=None)
-    relationships:
+    relationships: dict|RelationshipList = Field(default_factory=dict)
     attack_flow: AttackFlowList = Field(default=None)
     navigator_layer: list = Field(default=None)
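Note: relationships widens to dict|RelationshipList because a live run stores the model_dump() dict returned by extract_relationships while other callers may hold the RelationshipList model itself; the get() added to RelationshipList earlier in this diff makes both shapes answer the same call. Sketch:

    data = Txt2StixData.model_validate_json('{"relationships": {"relationships": []}}')
    data.relationships.get("relationships", [])  # works on the deserialized dict
    data.relationships = RelationshipList(success=True)
    data.relationships.get("relationships", [])  # and on the model, via getattr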
{txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: txt2stix
-Version: 1.1.
+Version: 1.1.13
 Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
 Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
 Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
@@ -15,8 +15,8 @@ Requires-Python: >=3.9
 Requires-Dist: base58>=2.1.1
 Requires-Dist: beautifulsoup4>=4.12.3
 Requires-Dist: json-repair
-Requires-Dist: llama-index-core>=0.
-Requires-Dist: llama-index-llms-openai>=0.
+Requires-Dist: llama-index-core>=0.14.8
+Requires-Dist: llama-index-llms-openai>=0.6.8
 Requires-Dist: mistune>=3.0.2
 Requires-Dist: pathvalidate>=3.2.0
 Requires-Dist: phonenumbers>=8.13.39
@@ -29,13 +29,18 @@ Requires-Dist: tld>=0.13
 Requires-Dist: tldextract>=5.1.2
 Requires-Dist: validators>=0.28.3
 Provides-Extra: anthropic
-Requires-Dist: llama-index-llms-anthropic>=0.7
+Requires-Dist: llama-index-llms-anthropic>=0.9.7; extra == 'anthropic'
 Provides-Extra: deepseek
-Requires-Dist: llama-index-llms-deepseek>=0.
+Requires-Dist: llama-index-llms-deepseek>=0.2.2; extra == 'deepseek'
 Provides-Extra: gemini
-Requires-Dist: llama-index-llms-
+Requires-Dist: llama-index-llms-google-genai>=0.5.0; extra == 'gemini'
+Provides-Extra: llms
+Requires-Dist: llama-index-llms-anthropic>=0.9.7; extra == 'llms'
+Requires-Dist: llama-index-llms-deepseek>=0.2.2; extra == 'llms'
+Requires-Dist: llama-index-llms-google-genai>=0.5.0; extra == 'llms'
+Requires-Dist: llama-index-llms-openrouter>=0.4.2; extra == 'llms'
 Provides-Extra: openrouter
-Requires-Dist: llama-index-llms-openrouter>=0.
+Requires-Dist: llama-index-llms-openrouter>=0.4.2; extra == 'openrouter'
 Provides-Extra: tests
 Requires-Dist: pytest; extra == 'tests'
 Requires-Dist: pytest-cov; extra == 'tests'
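Note: alongside the per-provider extras, 1.1.13 adds an aggregate llms extra (e.g. `pip install txt2stix[llms]`) that pulls every optional LLM backend at once, and the gemini extra now depends on llama-index-llms-google-genai, matching the GoogleGenAI migration above.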
{txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-txt2stix/__init__.py,sha256=
-txt2stix/attack_flow.py,sha256=
-txt2stix/bundler.py,sha256=
+txt2stix/__init__.py,sha256=kHCnJtzi37ivXx2STT5zT7-cUL16i86o7ywtSd3iXd4,769
+txt2stix/attack_flow.py,sha256=VAsgNKZvPa-llUsGvbv0tYNc2Kif5pNeMoxH88-6CWc,9060
+txt2stix/bundler.py,sha256=GmpWW9ek4iFZdEIyjVSpd9RnmyeNsZJOpnax5Tt0uT0,16748
 txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
 txt2stix/credential_checker.py,sha256=eWDP-jY3-jm8zI0JMoUcyoQZ_JqPNfCIr_HAO8nVYz0,3044
 txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171
@@ -8,17 +8,17 @@ txt2stix/indicator.py,sha256=dyf4wbvVrZRitZpm6t7UusSM98bVW1qc5UkdGpVm3ls,30025
 txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
 txt2stix/retriever.py,sha256=sMNhnEYk3l5W44qZsWaDQtJYoHXA1oYIPM6wDqiUHSg,6642
 txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
-txt2stix/txt2stix.py,sha256=
-txt2stix/utils.py,sha256=
+txt2stix/txt2stix.py,sha256=eUL0pynQXruJRDvqs-LQ-dspDITx5tFDnTPEgCRQApk,23348
+txt2stix/utils.py,sha256=Le0VYx8n8UNpcjqwpx7Avb06qIS9_hId8yP8_PquBUs,3333
 txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
-txt2stix/ai_extractor/anthropic.py,sha256=
-txt2stix/ai_extractor/base.py,sha256=
+txt2stix/ai_extractor/anthropic.py,sha256=B5Z3nm2-w5KBhLcVJGkhNF0dn4lUo-fW_DnbOeJKA5Q,481
+txt2stix/ai_extractor/base.py,sha256=I_UwX4mOAVa8HrjSkI3KqKKImIBtQ29RdprDOu2NK6A,4235
 txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
-txt2stix/ai_extractor/gemini.py,sha256=
+txt2stix/ai_extractor/gemini.py,sha256=rhhYrCa1zZTjadVk2QFhguD8_Yr03gl-D4Yb2nVBMI4,633
 txt2stix/ai_extractor/openai.py,sha256=1RxaLy0TJ4GjNKmcJoi6ZiBrCS_gt5ql9jpeE-SOy8g,642
 txt2stix/ai_extractor/openrouter.py,sha256=hAA6mTOMcpA28XYsOCvuJH7WMJqXCxfqZGJf_VrDsIk,628
 txt2stix/ai_extractor/prompts.py,sha256=NtqtVyPPtShPlVZ5SrFmo-LCkfpANIIi4H9rjqaxqDo,10559
-txt2stix/ai_extractor/utils.py,sha256=
+txt2stix/ai_extractor/utils.py,sha256=7iB2qm-oUSFaYidsNi74EACwLV5skCcecCw3F9eIJx4,4507
 txt2stix/pattern/__init__.py,sha256=K9ofaP2AOikvzb48VSBpJZijckdqufZxSzr_kbRypLY,491
 txt2stix/pattern/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 txt2stix/pattern/extractors/base_extractor.py,sha256=ly80rp-L40g7DbhrGiCvhPWI95-ZFMtAQUEC-fH6Y-o,6130
@@ -114,8 +114,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
 txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
 txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
 txt2stix/includes/tests/test_cases.yaml,sha256=vErA3c5fySeWvJ5yJ8dCTEo3ufRATASAjaF4gj4Az1M,22424
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
-txt2stix-1.1.
+txt2stix-1.1.13.dist-info/METADATA,sha256=4vgSOfXJOiJrJ9-WkodqXtOdtAYgDKjXY7xLEYCvRAg,15032
+txt2stix-1.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+txt2stix-1.1.13.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
+txt2stix-1.1.13.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
+txt2stix-1.1.13.dist-info/RECORD,,

{txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/WHEEL
CHANGED

{txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/entry_points.txt
File without changes

{txt2stix-1.1.11.dist-info → txt2stix-1.1.13.dist-info}/licenses/LICENSE
File without changes