txt2stix 1.1.11__py3-none-any.whl → 1.1.13__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
txt2stix/__init__.py CHANGED
@@ -1,6 +1,5 @@
  from txt2stix import extractions
  from .bundler import txt2stixBundler
- from .txt2stix import extract_all
  from pathlib import Path

  INCLUDES_PATH = None
txt2stix/ai_extractor/anthropic.py CHANGED
@@ -6,7 +6,7 @@ from llama_index.llms.anthropic import Anthropic


  class AnthropicAIExtractor(BaseAIExtractor, provider="anthropic"):
-     def __init__(self, **kwargs) -> None:
+     def __init__(self, model='claude-sonnet-4-0', **kwargs) -> None:
          kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
-         self.llm = Anthropic(max_tokens=4096, system_prompt=self.system_prompt, **kwargs)
+         self.llm = Anthropic(max_tokens=4096, model=model, system_prompt=self.system_prompt, **kwargs)
          super().__init__()
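The Anthropic extractor now pins an explicit default model instead of deferring to the llama-index default. A minimal sketch of the new behavior; the override model id below is illustrative, not taken from the package:

    # Uses the new default, claude-sonnet-4-0
    extractor = AnthropicAIExtractor()
    # Any model id accepted by llama-index's Anthropic wrapper can override it
    extractor = AnthropicAIExtractor(model="claude-3-5-haiku-latest")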
txt2stix/ai_extractor/base.py CHANGED
@@ -104,4 +104,7 @@ class BaseAIExtractor():

      def _check_credential(self):
          self.llm.complete("say 'hi'")
-         return True
+         return True
+
+     def __str__(self):
+         return self.extractor_name
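The added __str__ makes str(extractor) yield its extractor_name; the refactored processing_phase later in this diff relies on that when it records the content-check provider:

    # Sketch: how processing_phase uses the new __str__ (from the diff below)
    provider_name = str(ai_content_check_provider)  # == extractor.extractor_name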
txt2stix/ai_extractor/gemini.py CHANGED
@@ -1,17 +1,17 @@

  import os
  from txt2stix.ai_extractor.base import BaseAIExtractor
- from llama_index.llms.gemini import Gemini
+ from llama_index.llms.google_genai import GoogleGenAI


  class GeminiAIExtractor(BaseAIExtractor, provider="gemini"):
      def __init__(self, **kwargs) -> None:
          kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
-         self.llm = Gemini(max_tokens=4096, **kwargs)
+         self.llm = GoogleGenAI(max_tokens=4096, **kwargs)
          super().__init__()

      def count_tokens(self, text):
-         return self.llm._model.count_tokens(text).total_tokens
+         return self.llm._client.models.count_tokens(model=self.llm.model, contents=text).total_tokens

      @property
      def extractor_name(self):
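The Gemini extractor migrates from the retired llama-index-llms-gemini package to llama-index-llms-google-genai, so token counting now goes through the google-genai client held by the wrapper rather than the old private _model attribute. A hedged sketch of the new call path, using only the attributes shown in the diff:

    # count_tokens is served by the google-genai client; the wrapper's
    # configured model id is passed through explicitly.
    llm = GoogleGenAI(max_tokens=4096)
    total = llm._client.models.count_tokens(
        model=llm.model, contents="some report text"
    ).total_tokens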
txt2stix/ai_extractor/utils.py CHANGED
@@ -33,6 +33,9 @@ class RelationshipList(BaseModel):
      relationships: list[Relationship] = Field(default_factory=list)
      success: bool

+     def get(self, key, default=None):
+         return getattr(self, key, default)
+
  class DescribesIncident(BaseModel):
      describes_incident: bool = Field(description="does the <document> include malware analysis, APT group reports, data breaches and vulnerabilities?")
      explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
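RelationshipList.get adds a dict-style accessor to the Pydantic model, so downstream code can read relationships the same way whether it holds a live RelationshipList or a plain dict replayed from saved JSON:

    # Both shapes now answer the call that processing_phase makes
    RelationshipList(relationships=[], success=True).get("relationships", [])
    {"relationships": [], "success": True}.get("relationships", [])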
txt2stix/attack_flow.py CHANGED
@@ -213,6 +213,7 @@ def extract_attack_flow_and_navigator(
      ai_create_attack_flow,
      ai_create_attack_navigator_layer,
      ai_settings_relationships,
+     flow=None
  ):
      ex: BaseAIExtractor = ai_settings_relationships
      tactics = get_all_tactics()
@@ -225,7 +226,7 @@ def extract_attack_flow_and_navigator(
      ]
      logging.debug(f"parsed techniques: {json.dumps(logged_techniques, indent=4)}")

-     flow = ex.extract_attack_flow(preprocessed_text, techniques)
+     flow = flow or ex.extract_attack_flow(preprocessed_text, techniques)
      navigator = None
      if ai_create_attack_flow:
          logging.info("creating attack-flow bundle")
txt2stix/bundler.py CHANGED
@@ -422,10 +422,6 @@ class txt2stixBundler:
      def process_observables(self, extractions, add_standard_relationship=False):
          for ex in extractions:
              try:
-                 if ex.get("id", "").startswith(
-                     "ai"
-                 ):  # so id is distinct across multiple AIExtractors
-                     ex["id"] = f'{ex["id"]}_{self.observables_processed}'
                  ex["id"] = ex.get("id", f"ex_{self.observables_processed}")
                  self.observables_processed += 1
                  self.add_indicator(ex, add_standard_relationship)
@@ -437,6 +433,7 @@ class txt2stixBundler:
                  ex["error"] = str(e)

      def process_relationships(self, observables):
+         print(observables)
          for relationship in observables:
              try:
                  self.add_ai_relationship(relationship)
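The dropped per-AI-extractor id prefixing is superseded by run_extractors later in this diff, which assigns a globally unique id to every extraction before anything reaches the bundler, keeping ids stable across save/replay:

    # From run_extractors: ids are assigned once, across all extractor types
    for i, ex in enumerate(itertools.chain(*all_extracts.values())):
        ex["id"] = "ex-" + str(i)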
txt2stix/txt2stix.py CHANGED
@@ -21,7 +21,7 @@ from .utils import RELATIONSHIP_TYPES, Txt2StixData, remove_links
  from .common import UUID_NAMESPACE, FatalException

  from .bundler import txt2stixBundler, parse_stix, TLP_LEVEL
- from .import extractions, lookups, pattern
+ from . import extractions, lookups, pattern
  from types import SimpleNamespace
  import functools
  from fnmatch import filter
@@ -40,41 +40,51 @@ def newLogger(name: str) -> logging.Logger:
          level=logging.DEBUG,  # Set the desired logging level
          format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
          handlers=[stream_handler],
-         datefmt='%d-%b-%y %H:%M:%S',
+         datefmt="%d-%b-%y %H:%M:%S",
      )

      return logging.root

+
  def setLogFile(logger, file: Path):
      file.parent.mkdir(parents=True, exist_ok=True)
      logger.info(f"Saving log to `{file.absolute()}`")
      handler = logging.FileHandler(file, "w")
-     handler.formatter = logging.Formatter(fmt='%(levelname)s %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
+     handler.formatter = logging.Formatter(
+         fmt="%(levelname)s %(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S"
+     )
      handler.setLevel(logging.DEBUG)
      logger.addHandler(handler)
      logger.info("=====================txt2stix======================")


  MODULE_PATH = Path(__file__).parent.parent
- INCLUDES_PATH = MODULE_PATH/"includes"
+ INCLUDES_PATH = MODULE_PATH / "includes"
  try:
      from . import includes
+
      INCLUDES_PATH = Path(includes.__file__).parent
  except:
      pass

+
  def split_comma(s: str) -> list[str]:
      return [ss for ss in s.split(",") if ss]

+
  def range_type(min, max):
      def fn(astr):
          value = int(astr)
-         if min<= value <= max:
+         if min <= value <= max:
              return value
          else:
-             raise argparse.ArgumentTypeError(f'value {value} not in range [{min}-{max}]')
+             raise argparse.ArgumentTypeError(
+                 f"value {value} not in range [{min}-{max}]"
+             )
+
      return fn

+
  def parse_labels(labels: str) -> list[str]:
      labels = labels.split(",")
      for label in labels:
@@ -83,39 +93,44 @@ def parse_labels(labels: str) -> list[str]:

      return labels

+
  def parse_extractors_globbed(type, all_extractors, names):
      globbed_names = set()
      for name in names.split(","):
          matches = fnmatch.filter(all_extractors.keys(), name)
          if not matches:
-             raise argparse.ArgumentTypeError(f'`{name}` has 0 matches')
+             raise argparse.ArgumentTypeError(f"`{name}` has 0 matches")
          globbed_names.update(matches)
-     filtered_extractors = {}
+     filtered_extractors = {}
      for extractor_name in globbed_names:
          try:
              extractor = all_extractors[extractor_name]
-             extraction_processor = filtered_extractors.get(extractor.type, {})
+             extraction_processor = filtered_extractors.get(extractor.type, {})
              if extractor.type in ["lookup"]:
                  lookups.load_lookup(extractor)
              if extractor.type == "pattern":
                  pattern.load_extractor(extractor)
-             filtered_extractors[extractor.type] = extraction_processor
+             filtered_extractors[extractor.type] = extraction_processor
              extraction_processor[extractor_name] = extractor
          except BaseException as e:
              raise argparse.ArgumentTypeError(f"{type} `{extractor_name}`: {e}")
      return filtered_extractors

+
  def parse_ref(value):
-     m = re.compile(r'(.+?)=(.+)').match(value)
+     m = re.compile(r"(.+?)=(.+)").match(value)
      if not m:
          raise argparse.ArgumentTypeError("must be in format key=value")
      return dict(source_name=m.group(1), external_id=m.group(2))

+
  def parse_model(value: str):
-     splits = value.split(':', 1)
+     splits = value.split(":", 1)
      provider = splits[0]
      if provider not in ALL_AI_EXTRACTORS:
-         raise argparse.ArgumentTypeError(f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}")
+         raise argparse.ArgumentTypeError(
+             f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
+         )
      provider = ALL_AI_EXTRACTORS[provider]

      try:
@@ -125,6 +140,7 @@ def parse_model(value: str):
      except Exception as e:
          raise ModelError(f"Unable to initialize model `{value}`") from e

+
  def parse_bool(value: str):
      value = value.lower()
      return value in ["yes", "y", "true", "1"]
@@ -135,7 +151,12 @@ def parse_args():
      all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)

      parser = argparse.ArgumentParser(description="File Conversion Tool")
-     parser.add_argument('--check_credentials', "--check-credentials", action="store_true", help="Print the validity of the credentials and exit")
+     parser.add_argument(
+         "--check_credentials",
+         "--check-credentials",
+         action="store_true",
+         help="Print the validity of the credentials and exit",
+     )
      args, _ = parser.parse_known_args()
      if args.check_credentials:
          statuses = credential_checker.check_statuses(test_llms=True)
@@ -259,7 +280,6 @@ def parse_args():
          help="create attack flow for attack objects in report/bundle",
      )

-
      args = parser.parse_args()
      if not args.input_file.exists():
          raise argparse.ArgumentError(inf_arg, "cannot open file")
@@ -296,6 +316,8 @@ REQUIRED_ENV_VARIABLES = [
      "CTIBUTLER_BASE_URL",
      "VULMATCH_BASE_URL",
  ]
+
+
  def load_env():
      for env in REQUIRED_ENV_VARIABLES:
          if not os.getenv(env):
@@ -304,19 +326,34 @@ def load_env():

  def log_notes(content, type):
      logging.debug(f" ========================= {type} ========================= ")
-     logging.debug(f" ========================= {'+'*len(type)} ========================= ")
+     logging.debug(
+         f" ========================= {'+'*len(type)} ========================= "
+     )
      logging.debug(json.dumps(content, sort_keys=True, indent=4))
-     logging.debug(f" ========================= {'-'*len(type)} ========================= ")
+     logging.debug(
+         f" ========================= {'-'*len(type)} ========================= "
+     )
+

- def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extractors: list[BaseAIExtractor]=[], **kwargs):
-     assert ai_extractors or not extractors_map.get("ai"), "There should be at least one AI extractor in ai_extractors"
+ def run_extractors(
+     extractors_map, text_content, ai_extractors: list[BaseAIExtractor] = [], **kwargs
+ ):
+     """Run extraction calls (lookup, pattern, AI) and return a dict of all extracts.

-     text_content = "\n"+text_content+"\n"
+     This function does NOT modify the bundler. Use `process_extracts` to
+     feed the returned extracts into a bundler (or replay saved extracts).
+     """
+     assert ai_extractors or not extractors_map.get(
+         "ai"
+     ), "There should be at least one AI extractor in ai_extractors"
+
+     text_content = "\n" + text_content + "\n"
      all_extracts = dict()
      if extractors_map.get("lookup"):
          try:
-             lookup_extracts = lookups.extract_all(extractors_map["lookup"].values(), text_content)
-             bundler.process_observables(lookup_extracts)
+             lookup_extracts = lookups.extract_all(
+                 extractors_map["lookup"].values(), text_content
+             )
              all_extracts["lookup"] = lookup_extracts
          except BaseException as e:
              logging.exception("lookup extraction failed", exc_info=True)
@@ -324,94 +361,239 @@ def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
      if extractors_map.get("pattern"):
          try:
              logging.info("using pattern extractors")
-             pattern_extracts = pattern.extract_all(extractors_map["pattern"].values(), text_content, ignore_extraction_boundary=kwargs.get('ignore_extraction_boundary', False))
-             bundler.process_observables(pattern_extracts)
+             pattern_extracts = pattern.extract_all(
+                 extractors_map["pattern"].values(),
+                 text_content,
+                 ignore_extraction_boundary=kwargs.get(
+                     "ignore_extraction_boundary", False
+                 ),
+             )
              all_extracts["pattern"] = pattern_extracts
          except BaseException as e:
              logging.exception("pattern extraction failed", exc_info=True)

      if extractors_map.get("ai"):
          logging.info("using ai extractors")
-
          for extractor in ai_extractors:
              logging.info("running extractor: %s", extractor.extractor_name)
              try:
-                 ai_extracts = extractor.extract_objects(text_content, extractors_map["ai"].values())
-                 bundler.process_observables(ai_extracts)
+                 ai_extracts = extractor.extract_objects(
+                     text_content, extractors_map["ai"].values()
+                 )
                  all_extracts[f"ai-{extractor.extractor_name}"] = ai_extracts
              except BaseException as e:
-                 logging.exception("AI extraction failed for %s", extractor.extractor_name, exc_info=True)
+                 logging.exception(
+                     "AI extraction failed for %s",
+                     extractor.extractor_name,
+                     exc_info=True,
+                 )
+
+     for i, ex in enumerate(itertools.chain(*all_extracts.values())):
+         ex["id"] = "ex-" + str(i)
+     return all_extracts
+
+
+ def process_extracts(bundler: txt2stixBundler, all_extracts: dict):
+     """Process a previously-created `all_extracts` dict into the given bundler.
+
+     This allows replaying saved extracts without invoking extractors again.
+     """
+     for key, extracts in (all_extracts or {}).items():
+         try:
+             bundler.process_observables(extracts)
+         except BaseException:
+             logging.exception("processing extracts failed for %s", key, exc_info=True)

      log_notes(all_extracts, "Extractions")
-     return all_extracts

- def extract_relationships_with_ai(bundler: txt2stixBundler, text_content, all_extracts, ai_extractor_session: BaseAIExtractor):
+
+ def extract_relationships(
+     text_content, all_extracts, ai_extractor_session: BaseAIExtractor
+ ):
      relationships = None
      try:
-         all_extracts = list(itertools.chain(*all_extracts.values()))
-         relationships = ai_extractor_session.extract_relationships(text_content, all_extracts, RELATIONSHIP_TYPES)
-         relationships = relationships.model_dump()
+         # flatten extracts into a single list
+         flattened = list(itertools.chain(*all_extracts.values()))
+         rel = ai_extractor_session.extract_relationships(
+             text_content, flattened, RELATIONSHIP_TYPES
+         )
+         relationships = rel.model_dump()
          log_notes(relationships, "Relationships")
-         bundler.process_relationships(relationships['relationships'])
      except BaseException as e:
-         logging.exception("Relationship processing failed: %s", e)
+         logging.exception("Relationship extraction failed: %s", e)
      return relationships

+
  def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
-     logging.info('INPUT_TOKEN_LIMIT = %d', max_tokens)
+     logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
      for extractor in extractors:
          token_count = _count_token(extractor, input)
-         if token_count > max_tokens:
-             raise FatalException(f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})")
+         logging.info(
+             f"{extractor.extractor_name}: input_file token count = {token_count}"
+         )
+         if token_count > max_tokens:
+             raise FatalException(
+                 f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
+             )


  @functools.lru_cache
  def _count_token(extractor: BaseAIExtractor, input: str):
      return extractor.count_tokens(input)

- def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_map: dict,
-         ai_content_check_provider=None,
-         ai_create_attack_flow=None,
-         ai_create_attack_navigator_layer=None,
-         input_token_limit=10,
-         ai_settings_extractions=None,
-         ai_settings_relationships=None,
-         relationship_mode="standard",
-         ignore_extraction_boundary=False,
-         ai_extract_if_no_incidence=True,  # continue even if ai_content_check fails
-
-         **kwargs
- ) -> Txt2StixData:
+
+ def run_txt2stix(
+     bundler: txt2stixBundler,
+     preprocessed_text: str,
+     extractors_map: dict,
+     ai_content_check_provider=None,
+     ai_create_attack_flow=None,
+     ai_create_attack_navigator_layer=None,
+     input_token_limit=10,
+     ai_settings_extractions=None,
+     ai_settings_relationships=None,
+     relationship_mode="standard",
+     ignore_extraction_boundary=False,
+     ai_extract_if_no_incidence=True,  # continue even if ai_content_check fails
+     txt2stix_data: Txt2StixData = None,
+     **kwargs,
+ ) -> Txt2StixData:
+     # First, perform extraction-phase (LLM and extractor calls). This does not
+     # modify the provided bundler so the results can be saved and replayed.
+     # skip extraction phase if txt2stix_data is passed
+     txt2stix_data = txt2stix_data or extraction_phase(
+         preprocessed_text,
+         extractors_map,
+         ai_content_check_provider=ai_content_check_provider,
+         input_token_limit=input_token_limit,
+         ai_settings_extractions=ai_settings_extractions,
+         ai_settings_relationships=ai_settings_relationships,
+         relationship_mode=relationship_mode,
+         ignore_extraction_boundary=ignore_extraction_boundary,
+         ai_extract_if_no_incidence=ai_extract_if_no_incidence,
+     )
+
+     # Then, process the extracted data into the bundler (no LLM calls).
+     processing_phase(
+         bundler,
+         preprocessed_text,
+         txt2stix_data,
+         ai_create_attack_flow=ai_create_attack_flow,
+         ai_create_attack_navigator_layer=ai_create_attack_navigator_layer,
+         ai_settings_relationships=ai_settings_relationships,
+         ai_content_check_provider=ai_content_check_provider,
+     )
+     return txt2stix_data
+
+
+ def extraction_phase(
+     preprocessed_text: str,
+     extractors_map: dict,
+     ai_content_check_provider=None,
+     input_token_limit=10,
+     ai_settings_extractions=None,
+     ai_settings_relationships=None,
+     relationship_mode="standard",
+     ignore_extraction_boundary=False,
+     ai_extract_if_no_incidence=True,
+     **kwargs,
+ ) -> Txt2StixData:
+     """Perform token validation and run extractors/AI models. Does NOT modify a bundler."""
      should_extract = True
-     retval = Txt2StixData.model_construct()
-     retval.extractions = retval.attack_flow = retval.relationships = None
+     txt2stix_data = Txt2StixData.model_construct()
+     txt2stix_data.extractions = txt2stix_data.attack_flow = (
+         txt2stix_data.relationships
+     ) = None
+
      if ai_content_check_provider:
          logging.info("checking content")
-         model : BaseAIExtractor = ai_content_check_provider
+         model: BaseAIExtractor = ai_content_check_provider
          validate_token_count(input_token_limit, preprocessed_text, [model])
-         retval.content_check = model.check_content(preprocessed_text)
-         should_extract = retval.content_check.describes_incident
+         txt2stix_data.content_check = model.check_content(preprocessed_text)
+         should_extract = txt2stix_data.content_check.describes_incident
          logging.info("=== ai-check-content output ====")
-         logging.info(retval.content_check.model_dump_json())
-         bundler.report.external_references.append(dict(source_name='txt2stix_describes_incident', description=str(should_extract).lower(), external_id=model.extractor_name))
-         for classification in retval.content_check.incident_classification:
-             bundler.report.labels.append(f'classification.{classification}'.lower())
-         bundler.add_summary(retval.content_check.summary, model.extractor_name)
+         logging.info(txt2stix_data.content_check.model_dump_json())

      if should_extract or ai_extract_if_no_incidence:
          if extractors_map.get("ai"):
-             validate_token_count(input_token_limit, preprocessed_text, ai_settings_extractions)
+             validate_token_count(
+                 input_token_limit, preprocessed_text, ai_settings_extractions
+             )
          if relationship_mode == "ai":
-             validate_token_count(input_token_limit, preprocessed_text, [ai_settings_relationships])
+             validate_token_count(
+                 input_token_limit, preprocessed_text, [ai_settings_relationships]
+             )
+
+         txt2stix_data.extractions = run_extractors(
+             extractors_map,
+             preprocessed_text,
+             ai_extractors=ai_settings_extractions,
+             ignore_extraction_boundary=ignore_extraction_boundary,
+         )

-         retval.extractions = extract_all(bundler, extractors_map, preprocessed_text, ai_extractors=ai_settings_extractions, ignore_extraction_boundary=ignore_extraction_boundary)
-         if relationship_mode == "ai" and sum(map(lambda x: len(x), retval.extractions.values())):
-             retval.relationships = extract_relationships_with_ai(bundler, preprocessed_text, retval.extractions, ai_settings_relationships)
-
-         if ai_create_attack_flow or ai_create_attack_navigator_layer:
-             retval.attack_flow, retval.navigator_layer = attack_flow.extract_attack_flow_and_navigator(bundler, preprocessed_text, ai_create_attack_flow, ai_create_attack_navigator_layer, ai_settings_relationships)
-     return retval
+         if (
+             relationship_mode == "ai"
+             and txt2stix_data.extractions
+             and sum(map(lambda x: len(x), txt2stix_data.extractions.values()))
+         ):
+             txt2stix_data.relationships = extract_relationships(
+                 preprocessed_text, txt2stix_data.extractions, ai_settings_relationships
+             )
+     return txt2stix_data
+
+
+ def processing_phase(
+     bundler: txt2stixBundler,
+     preprocessed_text: str,
+     data: Txt2StixData,
+     ai_create_attack_flow=False,
+     ai_create_attack_navigator_layer=False,
+     ai_settings_relationships=None,
+     ai_content_check_provider=None,
+ ):
+     """Process extracted `data` into the given `bundler` without invoking LLMs."""
+     try:
+         if data.content_check:
+             cc = data.content_check
+             provider_name = str(ai_content_check_provider)
+             bundler.report.external_references.append(
+                 dict(
+                     source_name="txt2stix_describes_incident",
+                     description=str(cc.describes_incident).lower(),
+                     external_id=provider_name,
+                 )
+             )
+             for classification in cc.incident_classification:
+                 bundler.report.labels.append(f"classification.{classification}".lower())
+             bundler.add_summary(cc.summary, provider_name)
+     except BaseException:
+         logging.exception("applying content_check to bundler failed", exc_info=True)
+
+     # process extracts into bundler
+     process_extracts(bundler, data.extractions)
+
+     # process relationships into bundler
+     try:
+         if data.relationships:
+             bundler.process_relationships(data.relationships.get("relationships", []))
+     except BaseException:
+         logging.exception("processing relationships failed", exc_info=True)

+     # generate attack flow / navigator layer now that bundler has been populated
+     try:
+         if ai_create_attack_flow or ai_create_attack_navigator_layer:
+             data.attack_flow, data.navigator_layer = (
+                 attack_flow.extract_attack_flow_and_navigator(
+                     bundler,
+                     preprocessed_text,
+                     ai_create_attack_flow,
+                     ai_create_attack_navigator_layer,
+                     ai_settings_relationships,
+                     flow=data.attack_flow,
+                 )
+             )
+     except BaseException:
+         logging.exception("attack flow / navigator generation failed", exc_info=True)


  def main():
@@ -423,35 +605,50 @@ def main():
          setLogFile(logger, Path(f"logs/logs-{job_id}.log"))
          logger.info(f"Arguments: {json.dumps(sys.argv[1:])}")

-
          input_text = args.input_file.read_text()
-         preprocessed_text = remove_links(input_text, args.ignore_image_refs, args.ignore_link_refs)
+         preprocessed_text = remove_links(
+             input_text, args.ignore_image_refs, args.ignore_link_refs
+         )
          load_env()

-
-         bundler = txt2stixBundler(args.name, args.use_identity, args.tlp_level, input_text, args.confidence, args.all_extractors, args.labels, created=args.created, report_id=args.report_id, external_references=args.external_refs)
+         bundler = txt2stixBundler(
+             args.name,
+             args.use_identity,
+             args.tlp_level,
+             input_text,
+             args.confidence,
+             args.all_extractors,
+             args.labels,
+             created=args.created,
+             report_id=args.report_id,
+             external_references=args.external_refs,
+         )
          log_notes(sys.argv, "Config")

          data = run_txt2stix(
-             bundler, preprocessed_text, args.use_extractions,
-             input_token_limit=int(os.environ['INPUT_TOKEN_LIMIT']),
+             bundler,
+             preprocessed_text,
+             args.use_extractions,
+             input_token_limit=int(os.environ["INPUT_TOKEN_LIMIT"]),
              **args.__dict__,
          )

          ## write outputs
          out = bundler.to_json()
-         output_dir = Path("./output")/str(bundler.uuid)
+         output_dir = Path("./output") / str(bundler.uuid)
          with contextlib.suppress(BaseException):
              shutil.rmtree(output_dir)
          output_dir.mkdir(exist_ok=True, parents=True)
-         output_path = output_dir/f"{bundler.bundle.id}.json"
+         output_path = output_dir / f"{bundler.bundle.id}.json"
          output_path.write_text(out)
          logger.info(f"Wrote bundle output to `{output_path}`")
-         data_path = output_dir/f"data--{bundler.uuid}.json"
+         data_path = output_dir / f"data--{bundler.uuid}.json"
          data_path.write_text(data.model_dump_json(indent=4))
          logger.info(f"Wrote data output to `{data_path}`")
          for nav_layer in data.navigator_layer or []:
-             nav_path = output_dir/f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+             nav_path = (
+                 output_dir / f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+             )
              nav_path.write_text(json.dumps(nav_layer, indent=4))
              logger.info(f"Wrote navigator output to `{nav_path}`")
      except argparse.ArgumentError as e:
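Taken together, run_txt2stix is now split into extraction_phase (all extractor and LLM calls, no bundler mutation) and processing_phase (bundler mutation only), with the new txt2stix_data parameter short-circuiting the first phase. A hedged sketch of the save/replay workflow this enables; the file name and argument wiring below are illustrative:

    # Run the expensive phase once and persist the result; no bundler is touched.
    data = extraction_phase(
        preprocessed_text,
        extractors_map,
        ai_settings_extractions=ai_extractors,
        ai_settings_relationships=rel_extractor,
        relationship_mode="ai",
    )
    Path("data.json").write_text(data.model_dump_json())

    # Later: rebuild a bundle from the saved data without re-invoking any LLM.
    run_txt2stix(
        bundler,
        preprocessed_text,
        extractors_map,
        txt2stix_data=data,  # skips extraction_phase entirely
        ai_settings_relationships=rel_extractor,
    )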
txt2stix/utils.py CHANGED
@@ -8,7 +8,7 @@ import mistune
  from mistune.renderers.markdown import MarkdownRenderer
  from mistune.util import unescape

- from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident
+ from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, RelationshipList
  class ImageLinkRemover(MarkdownRenderer):
      def __init__(self, remove_links: bool=False, remove_images: bool=False):
          self.remove_links = remove_links
@@ -49,7 +49,7 @@ class ImageLinkRemover(MarkdownRenderer):
  class Txt2StixData(BaseModel):
      content_check: DescribesIncident = Field(default=None)
      extractions: dict = Field(default=None)
-     relationships: list[dict] = Field(default_factory=list)
+     relationships: dict|RelationshipList = Field(default_factory=dict)
      attack_flow: AttackFlowList = Field(default=None)
      navigator_layer: list = Field(default=None)

txt2stix-1.1.11.dist-info/METADATA → txt2stix-1.1.13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: txt2stix
- Version: 1.1.11
+ Version: 1.1.13
  Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
  Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
  Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
@@ -15,8 +15,8 @@ Requires-Python: >=3.9
  Requires-Dist: base58>=2.1.1
  Requires-Dist: beautifulsoup4>=4.12.3
  Requires-Dist: json-repair
- Requires-Dist: llama-index-core>=0.12.42
- Requires-Dist: llama-index-llms-openai>=0.4.5
+ Requires-Dist: llama-index-core>=0.14.8
+ Requires-Dist: llama-index-llms-openai>=0.6.8
  Requires-Dist: mistune>=3.0.2
  Requires-Dist: pathvalidate>=3.2.0
  Requires-Dist: phonenumbers>=8.13.39
@@ -29,13 +29,18 @@ Requires-Dist: tld>=0.13
  Requires-Dist: tldextract>=5.1.2
  Requires-Dist: validators>=0.28.3
  Provides-Extra: anthropic
- Requires-Dist: llama-index-llms-anthropic>=0.7.2; extra == 'anthropic'
+ Requires-Dist: llama-index-llms-anthropic>=0.9.7; extra == 'anthropic'
  Provides-Extra: deepseek
- Requires-Dist: llama-index-llms-deepseek>=0.1.2; extra == 'deepseek'
+ Requires-Dist: llama-index-llms-deepseek>=0.2.2; extra == 'deepseek'
  Provides-Extra: gemini
- Requires-Dist: llama-index-llms-gemini>=0.5.0; extra == 'gemini'
+ Requires-Dist: llama-index-llms-google-genai>=0.5.0; extra == 'gemini'
+ Provides-Extra: llms
+ Requires-Dist: llama-index-llms-anthropic>=0.9.7; extra == 'llms'
+ Requires-Dist: llama-index-llms-deepseek>=0.2.2; extra == 'llms'
+ Requires-Dist: llama-index-llms-google-genai>=0.5.0; extra == 'llms'
+ Requires-Dist: llama-index-llms-openrouter>=0.4.2; extra == 'llms'
  Provides-Extra: openrouter
- Requires-Dist: llama-index-llms-openrouter>=0.3.2; extra == 'openrouter'
+ Requires-Dist: llama-index-llms-openrouter>=0.4.2; extra == 'openrouter'
  Provides-Extra: tests
  Requires-Dist: pytest; extra == 'tests'
  Requires-Dist: pytest-cov; extra == 'tests'
txt2stix-1.1.11.dist-info/RECORD → txt2stix-1.1.13.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
- txt2stix/__init__.py,sha256=Sm_VT913IFuAZ6dJEdVz3baPwC5VYtHySVfBAOUG92w,803
- txt2stix/attack_flow.py,sha256=qlzI7TdYwPOXegx0hTdvVuZ_He2yQVg9eFPOpEt3huE,9038
- txt2stix/bundler.py,sha256=5E6IptaAyHXdMA7JUw8yG5J2hLZ9kqQuDsWCQAC3xlY,16937
+ txt2stix/__init__.py,sha256=kHCnJtzi37ivXx2STT5zT7-cUL16i86o7ywtSd3iXd4,769
+ txt2stix/attack_flow.py,sha256=VAsgNKZvPa-llUsGvbv0tYNc2Kif5pNeMoxH88-6CWc,9060
+ txt2stix/bundler.py,sha256=GmpWW9ek4iFZdEIyjVSpd9RnmyeNsZJOpnax5Tt0uT0,16748
  txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
  txt2stix/credential_checker.py,sha256=eWDP-jY3-jm8zI0JMoUcyoQZ_JqPNfCIr_HAO8nVYz0,3044
  txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171
@@ -8,17 +8,17 @@ txt2stix/indicator.py,sha256=dyf4wbvVrZRitZpm6t7UusSM98bVW1qc5UkdGpVm3ls,30025
  txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
  txt2stix/retriever.py,sha256=sMNhnEYk3l5W44qZsWaDQtJYoHXA1oYIPM6wDqiUHSg,6642
  txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
- txt2stix/txt2stix.py,sha256=l889ysbkZLFoSLak6Hv4IN8sr0HillVp4cbC2WS-UkI,18769
- txt2stix/utils.py,sha256=n6mh4t9ZRJ7iT4Jvp9ai_dfCXjgXNcRtF_zXO7nkpnk,3304
+ txt2stix/txt2stix.py,sha256=eUL0pynQXruJRDvqs-LQ-dspDITx5tFDnTPEgCRQApk,23348
+ txt2stix/utils.py,sha256=Le0VYx8n8UNpcjqwpx7Avb06qIS9_hId8yP8_PquBUs,3333
  txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
- txt2stix/ai_extractor/anthropic.py,sha256=mdz-8CB-BSCEqnK5l35DRZURVPUf508ef2b48XMxmuk,441
- txt2stix/ai_extractor/base.py,sha256=t0SCh24FeDEDzXsrGFada6ux9F6m0ILwXtPSaleDiv8,4172
+ txt2stix/ai_extractor/anthropic.py,sha256=B5Z3nm2-w5KBhLcVJGkhNF0dn4lUo-fW_DnbOeJKA5Q,481
+ txt2stix/ai_extractor/base.py,sha256=I_UwX4mOAVa8HrjSkI3KqKKImIBtQ29RdprDOu2NK6A,4235
  txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
- txt2stix/ai_extractor/gemini.py,sha256=yJC7knYzl-TScyCBd-MTpUf-NT6znC25E7vXxNMqjLU,578
+ txt2stix/ai_extractor/gemini.py,sha256=rhhYrCa1zZTjadVk2QFhguD8_Yr03gl-D4Yb2nVBMI4,633
  txt2stix/ai_extractor/openai.py,sha256=1RxaLy0TJ4GjNKmcJoi6ZiBrCS_gt5ql9jpeE-SOy8g,642
  txt2stix/ai_extractor/openrouter.py,sha256=hAA6mTOMcpA28XYsOCvuJH7WMJqXCxfqZGJf_VrDsIk,628
  txt2stix/ai_extractor/prompts.py,sha256=NtqtVyPPtShPlVZ5SrFmo-LCkfpANIIi4H9rjqaxqDo,10559
- txt2stix/ai_extractor/utils.py,sha256=K3qglBRWYAuRU806-ahbz_PK1qQFfJ7ueWybVxYZYlQ,4425
+ txt2stix/ai_extractor/utils.py,sha256=7iB2qm-oUSFaYidsNi74EACwLV5skCcecCw3F9eIJx4,4507
  txt2stix/pattern/__init__.py,sha256=K9ofaP2AOikvzb48VSBpJZijckdqufZxSzr_kbRypLY,491
  txt2stix/pattern/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  txt2stix/pattern/extractors/base_extractor.py,sha256=ly80rp-L40g7DbhrGiCvhPWI95-ZFMtAQUEC-fH6Y-o,6130
@@ -114,8 +114,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
  txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
  txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
  txt2stix/includes/tests/test_cases.yaml,sha256=vErA3c5fySeWvJ5yJ8dCTEo3ufRATASAjaF4gj4Az1M,22424
- txt2stix-1.1.11.dist-info/METADATA,sha256=WgfNqcRRCIhML4mqK0qhAxSsDh1H-74Xj9X_QbRmsG4,14739
- txt2stix-1.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- txt2stix-1.1.11.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
- txt2stix-1.1.11.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
- txt2stix-1.1.11.dist-info/RECORD,,
+ txt2stix-1.1.13.dist-info/METADATA,sha256=4vgSOfXJOiJrJ9-WkodqXtOdtAYgDKjXY7xLEYCvRAg,15032
+ txt2stix-1.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ txt2stix-1.1.13.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
+ txt2stix-1.1.13.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
+ txt2stix-1.1.13.dist-info/RECORD,,
txt2stix-1.1.11.dist-info/WHEEL → txt2stix-1.1.13.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.27.0
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any