txt2stix 1.1.12__py3-none-any.whl → 1.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
txt2stix/__init__.py CHANGED
@@ -1,6 +1,5 @@
  from txt2stix import extractions
  from .bundler import txt2stixBundler
- from .txt2stix import extract_all
  from pathlib import Path

  INCLUDES_PATH = None
txt2stix/ai_extractor/base.py CHANGED
@@ -104,4 +104,7 @@ class BaseAIExtractor():

      def _check_credential(self):
          self.llm.complete("say 'hi'")
-         return True
+         return True
+
+     def __str__(self):
+         return self.extractor_name
txt2stix/ai_extractor/utils.py CHANGED
@@ -33,6 +33,9 @@ class RelationshipList(BaseModel):
      relationships: list[Relationship] = Field(default_factory=list)
      success: bool

+     def get(self, key, default=None):
+         return getattr(self, key, default)
+
  class DescribesIncident(BaseModel):
      describes_incident: bool = Field(description="does the <document> include malware analysis, APT group reports, data breaches and vulnerabilities?")
      explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
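
Note: `RelationshipList.get` mirrors `dict.get`, so downstream code can read relationships the same way whether it holds a live model instance or a plain dict restored from `model_dump()` (which is how `processing_phase` in txt2stix/txt2stix.py now consumes it). A minimal sketch of the duck-typing this enables:

    rels = RelationshipList(relationships=[], success=True)
    for source in (rels, rels.model_dump()):
        # a RelationshipList and its dict dump now answer .get() uniformly
        print(source.get("relationships", []))
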
txt2stix/attack_flow.py CHANGED
@@ -213,6 +213,7 @@ def extract_attack_flow_and_navigator(
      ai_create_attack_flow,
      ai_create_attack_navigator_layer,
      ai_settings_relationships,
+     flow=None
  ):
      ex: BaseAIExtractor = ai_settings_relationships
      tactics = get_all_tactics()
@@ -225,7 +226,7 @@ def extract_attack_flow_and_navigator(
      ]
      logging.debug(f"parsed techniques: {json.dumps(logged_techniques, indent=4)}")

-     flow = ex.extract_attack_flow(preprocessed_text, techniques)
+     flow = flow or ex.extract_attack_flow(preprocessed_text, techniques)
      navigator = None
      if ai_create_attack_flow:
          logging.info("creating attack-flow bundle")
txt2stix/bundler.py CHANGED
@@ -422,10 +422,6 @@ class txt2stixBundler:
      def process_observables(self, extractions, add_standard_relationship=False):
          for ex in extractions:
              try:
-                 if ex.get("id", "").startswith(
-                     "ai"
-                 ):  # so id is distinct across multiple AIExtractors
-                     ex["id"] = f'{ex["id"]}_{self.observables_processed}'
                  ex["id"] = ex.get("id", f"ex_{self.observables_processed}")
                  self.observables_processed += 1
                  self.add_indicator(ex, add_standard_relationship)
@@ -437,6 +433,7 @@ class txt2stixBundler:
                  ex["error"] = str(e)

      def process_relationships(self, observables):
+         print(observables)
          for relationship in observables:
              try:
                  self.add_ai_relationship(relationship)
txt2stix/indicator.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations
+ from datetime import UTC, datetime
  import os
  import re
+ import uuid
  from stix2.parsing import dict_to_stix2
  from stix2 import HashConstant, File
  from stix2.v21.vocab import HASHING_ALGORITHM
@@ -24,7 +26,7 @@ if TYPE_CHECKING:

  # from schwifty import IBAN

- from .common import MinorException
+ from .common import UUID_NAMESPACE, MinorException

  from .retriever import retrieve_stix_objects

@@ -675,11 +677,19 @@ def _build_observables(
          )
      )

+     _id_part = str(
+         uuid.uuid5(
+             UUID_NAMESPACE,
+             f"txt2stix+{extracted_value}",
+         )
+     )
+
      if stix_mapping == "attack-pattern":
          stix_objects = [
              dict_to_stix2(
                  {
                      "type": "attack-pattern",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -695,6 +705,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "campaign",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -711,6 +722,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "course-of-action",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -727,6 +739,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "infrastructure",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -744,6 +757,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "intrusion-set",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -760,6 +774,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "malware",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -778,6 +793,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "threat-actor",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -795,6 +811,7 @@ def _build_observables(
              dict_to_stix2(
                  {
                      "type": "tool",
+                     # "id": stix_mapping + "--" + _id_part,
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
                      "created": indicator["created"],
@@ -814,8 +831,9 @@ def _build_observables(
                      "type": "identity",
                      "spec_version": "2.1",
                      "created_by_ref": indicator["created_by_ref"],
-                     "created": indicator["created"],
-                     "modified": indicator["modified"],
+                     "created": datetime(2020, 1, 1, tzinfo=UTC),
+                     "modified": datetime(2020, 1, 1, tzinfo=UTC),
+                     "id": "identity--" + _id_part,
                      "name": extracted_value,
                      "identity_class": "unspecified",
                      "object_marking_refs": indicator["object_marking_refs"],
txt2stix/txt2stix.py CHANGED
@@ -21,7 +21,7 @@ from .utils import RELATIONSHIP_TYPES, Txt2StixData, remove_links
  from .common import UUID_NAMESPACE, FatalException

  from .bundler import txt2stixBundler, parse_stix, TLP_LEVEL
- from .import extractions, lookups, pattern
+ from . import extractions, lookups, pattern
  from types import SimpleNamespace
  import functools
  from fnmatch import filter
@@ -40,41 +40,51 @@ def newLogger(name: str) -> logging.Logger:
          level=logging.DEBUG,  # Set the desired logging level
          format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
          handlers=[stream_handler],
-         datefmt='%d-%b-%y %H:%M:%S',
+         datefmt="%d-%b-%y %H:%M:%S",
      )

      return logging.root

+
  def setLogFile(logger, file: Path):
      file.parent.mkdir(parents=True, exist_ok=True)
      logger.info(f"Saving log to `{file.absolute()}`")
      handler = logging.FileHandler(file, "w")
-     handler.formatter = logging.Formatter(fmt='%(levelname)s %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
+     handler.formatter = logging.Formatter(
+         fmt="%(levelname)s %(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S"
+     )
      handler.setLevel(logging.DEBUG)
      logger.addHandler(handler)
      logger.info("=====================txt2stix======================")


  MODULE_PATH = Path(__file__).parent.parent
- INCLUDES_PATH = MODULE_PATH/"includes"
+ INCLUDES_PATH = MODULE_PATH / "includes"
  try:
      from . import includes
+
      INCLUDES_PATH = Path(includes.__file__).parent
  except:
      pass

+
  def split_comma(s: str) -> list[str]:
      return [ss for ss in s.split(",") if ss]

+
  def range_type(min, max):
      def fn(astr):
          value = int(astr)
-         if min<= value <= max:
+         if min <= value <= max:
              return value
          else:
-             raise argparse.ArgumentTypeError(f'value {value} not in range [{min}-{max}]')
+             raise argparse.ArgumentTypeError(
+                 f"value {value} not in range [{min}-{max}]"
+             )
+
      return fn

+
  def parse_labels(labels: str) -> list[str]:
      labels = labels.split(",")
      for label in labels:
@@ -83,39 +93,44 @@ def parse_labels(labels: str) -> list[str]:

      return labels

+
  def parse_extractors_globbed(type, all_extractors, names):
      globbed_names = set()
      for name in names.split(","):
          matches = fnmatch.filter(all_extractors.keys(), name)
          if not matches:
-             raise argparse.ArgumentTypeError(f'`{name}` has 0 matches')
+             raise argparse.ArgumentTypeError(f"`{name}` has 0 matches")
          globbed_names.update(matches)
-     filtered_extractors = {}
+     filtered_extractors = {}
      for extractor_name in globbed_names:
          try:
              extractor = all_extractors[extractor_name]
-             extraction_processor = filtered_extractors.get(extractor.type, {})
+             extraction_processor = filtered_extractors.get(extractor.type, {})
              if extractor.type in ["lookup"]:
                  lookups.load_lookup(extractor)
              if extractor.type == "pattern":
                  pattern.load_extractor(extractor)
-             filtered_extractors[extractor.type] = extraction_processor
+             filtered_extractors[extractor.type] = extraction_processor
              extraction_processor[extractor_name] = extractor
          except BaseException as e:
              raise argparse.ArgumentTypeError(f"{type} `{extractor_name}`: {e}")
      return filtered_extractors

+
  def parse_ref(value):
-     m = re.compile(r'(.+?)=(.+)').match(value)
+     m = re.compile(r"(.+?)=(.+)").match(value)
      if not m:
          raise argparse.ArgumentTypeError("must be in format key=value")
      return dict(source_name=m.group(1), external_id=m.group(2))

+
  def parse_model(value: str):
-     splits = value.split(':', 1)
+     splits = value.split(":", 1)
      provider = splits[0]
      if provider not in ALL_AI_EXTRACTORS:
-         raise argparse.ArgumentTypeError(f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}")
+         raise argparse.ArgumentTypeError(
+             f"invalid AI provider in `{value}`, must be one of {list(ALL_AI_EXTRACTORS)}"
+         )
      provider = ALL_AI_EXTRACTORS[provider]

      try:
@@ -125,6 +140,7 @@ def parse_model(value: str):
      except Exception as e:
          raise ModelError(f"Unable to initialize model `{value}`") from e

+
  def parse_bool(value: str):
      value = value.lower()
      return value in ["yes", "y", "true", "1"]
@@ -135,7 +151,12 @@ def parse_args():
      all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)

      parser = argparse.ArgumentParser(description="File Conversion Tool")
-     parser.add_argument('--check_credentials', "--check-credentials", action="store_true", help="Print the validity of the credentials and exit")
+     parser.add_argument(
+         "--check_credentials",
+         "--check-credentials",
+         action="store_true",
+         help="Print the validity of the credentials and exit",
+     )
      args, _ = parser.parse_known_args()
      if args.check_credentials:
          statuses = credential_checker.check_statuses(test_llms=True)
@@ -259,7 +280,6 @@ def parse_args():
          help="create attack flow for attack objects in report/bundle",
      )

-
      args = parser.parse_args()
      if not args.input_file.exists():
          raise argparse.ArgumentError(inf_arg, "cannot open file")
@@ -296,6 +316,8 @@ REQUIRED_ENV_VARIABLES = [
      "CTIBUTLER_BASE_URL",
      "VULMATCH_BASE_URL",
  ]
+
+
  def load_env():
      for env in REQUIRED_ENV_VARIABLES:
          if not os.getenv(env):
@@ -304,19 +326,34 @@ def load_env():

  def log_notes(content, type):
      logging.debug(f" ========================= {type} ========================= ")
-     logging.debug(f" ========================= {'+'*len(type)} ========================= ")
+     logging.debug(
+         f" ========================= {'+'*len(type)} ========================= "
+     )
      logging.debug(json.dumps(content, sort_keys=True, indent=4))
-     logging.debug(f" ========================= {'-'*len(type)} ========================= ")
+     logging.debug(
+         f" ========================= {'-'*len(type)} ========================= "
+     )
+

- def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extractors: list[BaseAIExtractor]=[], **kwargs):
-     assert ai_extractors or not extractors_map.get("ai"), "There should be at least one AI extractor in ai_extractors"
+ def run_extractors(
+     extractors_map, text_content, ai_extractors: list[BaseAIExtractor] = [], **kwargs
+ ):
+     """Run extraction calls (lookup, pattern, AI) and return a dict of all extracts.

-     text_content = "\n"+text_content+"\n"
+     This function does NOT modify the bundler. Use `process_extracts` to
+     feed the returned extracts into a bundler (or replay saved extracts).
+     """
+     assert ai_extractors or not extractors_map.get(
+         "ai"
+     ), "There should be at least one AI extractor in ai_extractors"
+
+     text_content = "\n" + text_content + "\n"
      all_extracts = dict()
      if extractors_map.get("lookup"):
          try:
-             lookup_extracts = lookups.extract_all(extractors_map["lookup"].values(), text_content)
-             bundler.process_observables(lookup_extracts)
+             lookup_extracts = lookups.extract_all(
+                 extractors_map["lookup"].values(), text_content
+             )
              all_extracts["lookup"] = lookup_extracts
          except BaseException as e:
              logging.exception("lookup extraction failed", exc_info=True)
@@ -324,95 +361,239 @@ def extract_all(bundler: txt2stixBundler, extractors_map, text_content, ai_extra
      if extractors_map.get("pattern"):
          try:
              logging.info("using pattern extractors")
-             pattern_extracts = pattern.extract_all(extractors_map["pattern"].values(), text_content, ignore_extraction_boundary=kwargs.get('ignore_extraction_boundary', False))
-             bundler.process_observables(pattern_extracts)
+             pattern_extracts = pattern.extract_all(
+                 extractors_map["pattern"].values(),
+                 text_content,
+                 ignore_extraction_boundary=kwargs.get(
+                     "ignore_extraction_boundary", False
+                 ),
+             )
              all_extracts["pattern"] = pattern_extracts
          except BaseException as e:
              logging.exception("pattern extraction failed", exc_info=True)

      if extractors_map.get("ai"):
          logging.info("using ai extractors")
-
          for extractor in ai_extractors:
              logging.info("running extractor: %s", extractor.extractor_name)
              try:
-                 ai_extracts = extractor.extract_objects(text_content, extractors_map["ai"].values())
-                 bundler.process_observables(ai_extracts)
+                 ai_extracts = extractor.extract_objects(
+                     text_content, extractors_map["ai"].values()
+                 )
                  all_extracts[f"ai-{extractor.extractor_name}"] = ai_extracts
              except BaseException as e:
-                 logging.exception("AI extraction failed for %s", extractor.extractor_name, exc_info=True)
+                 logging.exception(
+                     "AI extraction failed for %s",
+                     extractor.extractor_name,
+                     exc_info=True,
+                 )
+
+     for i, ex in enumerate(itertools.chain(*all_extracts.values())):
+         ex["id"] = "ex-" + str(i)
+     return all_extracts
+
+
+ def process_extracts(bundler: txt2stixBundler, all_extracts: dict):
+     """Process a previously-created `all_extracts` dict into the given bundler.
+
+     This allows replaying saved extracts without invoking extractors again.
+     """
+     for key, extracts in (all_extracts or {}).items():
+         try:
+             bundler.process_observables(extracts)
+         except BaseException:
+             logging.exception("processing extracts failed for %s", key, exc_info=True)

      log_notes(all_extracts, "Extractions")
-     return all_extracts

- def extract_relationships_with_ai(bundler: txt2stixBundler, text_content, all_extracts, ai_extractor_session: BaseAIExtractor):
+
+ def extract_relationships(
+     text_content, all_extracts, ai_extractor_session: BaseAIExtractor
+ ):
      relationships = None
      try:
-         all_extracts = list(itertools.chain(*all_extracts.values()))
-         relationships = ai_extractor_session.extract_relationships(text_content, all_extracts, RELATIONSHIP_TYPES)
-         relationships = relationships.model_dump()
+         # flatten extracts into a single list
+         flattened = list(itertools.chain(*all_extracts.values()))
+         rel = ai_extractor_session.extract_relationships(
+             text_content, flattened, RELATIONSHIP_TYPES
+         )
+         relationships = rel.model_dump()
          log_notes(relationships, "Relationships")
-         bundler.process_relationships(relationships['relationships'])
      except BaseException as e:
-         logging.exception("Relationship processing failed: %s", e)
+         logging.exception("Relationship extraction failed: %s", e)
      return relationships

+
  def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
-     logging.info('INPUT_TOKEN_LIMIT = %d', max_tokens)
+     logging.info("INPUT_TOKEN_LIMIT = %d", max_tokens)
      for extractor in extractors:
          token_count = _count_token(extractor, input)
-         logging.info(f"{extractor.extractor_name}: input_file token count = {token_count}")
-         if token_count > max_tokens:
-             raise FatalException(f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})")
+         logging.info(
+             f"{extractor.extractor_name}: input_file token count = {token_count}"
+         )
+         if token_count > max_tokens:
+             raise FatalException(
+                 f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})"
+             )


  @functools.lru_cache
  def _count_token(extractor: BaseAIExtractor, input: str):
      return extractor.count_tokens(input)

- def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_map: dict,
-     ai_content_check_provider=None,
-     ai_create_attack_flow=None,
-     ai_create_attack_navigator_layer=None,
-     input_token_limit=10,
-     ai_settings_extractions=None,
-     ai_settings_relationships=None,
-     relationship_mode="standard",
-     ignore_extraction_boundary=False,
-     ai_extract_if_no_incidence=True, # continue even if ai_content_check fails
-
-     **kwargs
- ) -> Txt2StixData:
+
+ def run_txt2stix(
+     bundler: txt2stixBundler,
+     preprocessed_text: str,
+     extractors_map: dict,
+     ai_content_check_provider=None,
+     ai_create_attack_flow=None,
+     ai_create_attack_navigator_layer=None,
+     input_token_limit=10,
+     ai_settings_extractions=None,
+     ai_settings_relationships=None,
+     relationship_mode="standard",
+     ignore_extraction_boundary=False,
+     ai_extract_if_no_incidence=True,  # continue even if ai_content_check fails
+     txt2stix_data: Txt2StixData = None,
+     **kwargs,
+ ) -> Txt2StixData:
+     # First, perform extraction-phase (LLM and extractor calls). This does not
+     # modify the provided bundler so the results can be saved and replayed.
+     # skip extraction phase if txt2stix_data is passed
+     txt2stix_data = txt2stix_data or extraction_phase(
+         preprocessed_text,
+         extractors_map,
+         ai_content_check_provider=ai_content_check_provider,
+         input_token_limit=input_token_limit,
+         ai_settings_extractions=ai_settings_extractions,
+         ai_settings_relationships=ai_settings_relationships,
+         relationship_mode=relationship_mode,
+         ignore_extraction_boundary=ignore_extraction_boundary,
+         ai_extract_if_no_incidence=ai_extract_if_no_incidence,
+     )
+
+     # Then, process the extracted data into the bundler (no LLM calls).
+     processing_phase(
+         bundler,
+         preprocessed_text,
+         txt2stix_data,
+         ai_create_attack_flow=ai_create_attack_flow,
+         ai_create_attack_navigator_layer=ai_create_attack_navigator_layer,
+         ai_settings_relationships=ai_settings_relationships,
+         ai_content_check_provider=ai_content_check_provider,
+     )
+     return txt2stix_data
+
+
+ def extraction_phase(
+     preprocessed_text: str,
+     extractors_map: dict,
+     ai_content_check_provider=None,
+     input_token_limit=10,
+     ai_settings_extractions=None,
+     ai_settings_relationships=None,
+     relationship_mode="standard",
+     ignore_extraction_boundary=False,
+     ai_extract_if_no_incidence=True,
+     **kwargs,
+ ) -> Txt2StixData:
+     """Perform token validation and run extractors/AI models. Does NOT modify a bundler."""
      should_extract = True
-     retval = Txt2StixData.model_construct()
-     retval.extractions = retval.attack_flow = retval.relationships = None
+     txt2stix_data = Txt2StixData.model_construct()
+     txt2stix_data.extractions = txt2stix_data.attack_flow = (
+         txt2stix_data.relationships
+     ) = None
+
      if ai_content_check_provider:
          logging.info("checking content")
-         model : BaseAIExtractor = ai_content_check_provider
+         model: BaseAIExtractor = ai_content_check_provider
          validate_token_count(input_token_limit, preprocessed_text, [model])
-         retval.content_check = model.check_content(preprocessed_text)
-         should_extract = retval.content_check.describes_incident
+         txt2stix_data.content_check = model.check_content(preprocessed_text)
+         should_extract = txt2stix_data.content_check.describes_incident
          logging.info("=== ai-check-content output ====")
-         logging.info(retval.content_check.model_dump_json())
-         bundler.report.external_references.append(dict(source_name='txt2stix_describes_incident', description=str(should_extract).lower(), external_id=model.extractor_name))
-         for classification in retval.content_check.incident_classification:
-             bundler.report.labels.append(f'classification.{classification}'.lower())
-         bundler.add_summary(retval.content_check.summary, model.extractor_name)
+         logging.info(txt2stix_data.content_check.model_dump_json())

      if should_extract or ai_extract_if_no_incidence:
          if extractors_map.get("ai"):
-             validate_token_count(input_token_limit, preprocessed_text, ai_settings_extractions)
+             validate_token_count(
+                 input_token_limit, preprocessed_text, ai_settings_extractions
+             )
          if relationship_mode == "ai":
-             validate_token_count(input_token_limit, preprocessed_text, [ai_settings_relationships])
+             validate_token_count(
+                 input_token_limit, preprocessed_text, [ai_settings_relationships]
+             )
+
+         txt2stix_data.extractions = run_extractors(
+             extractors_map,
+             preprocessed_text,
+             ai_extractors=ai_settings_extractions,
+             ignore_extraction_boundary=ignore_extraction_boundary,
+         )

-         retval.extractions = extract_all(bundler, extractors_map, preprocessed_text, ai_extractors=ai_settings_extractions, ignore_extraction_boundary=ignore_extraction_boundary)
-         if relationship_mode == "ai" and sum(map(lambda x: len(x), retval.extractions.values())):
-             retval.relationships = extract_relationships_with_ai(bundler, preprocessed_text, retval.extractions, ai_settings_relationships)
-
-         if ai_create_attack_flow or ai_create_attack_navigator_layer:
-             retval.attack_flow, retval.navigator_layer = attack_flow.extract_attack_flow_and_navigator(bundler, preprocessed_text, ai_create_attack_flow, ai_create_attack_navigator_layer, ai_settings_relationships)
-     return retval
+         if (
+             relationship_mode == "ai"
+             and txt2stix_data.extractions
+             and sum(map(lambda x: len(x), txt2stix_data.extractions.values()))
+         ):
+             txt2stix_data.relationships = extract_relationships(
+                 preprocessed_text, txt2stix_data.extractions, ai_settings_relationships
+             )
+     return txt2stix_data
+
+
+ def processing_phase(
+     bundler: txt2stixBundler,
+     preprocessed_text: str,
+     data: Txt2StixData,
+     ai_create_attack_flow=False,
+     ai_create_attack_navigator_layer=False,
+     ai_settings_relationships=None,
+     ai_content_check_provider=None,
+ ):
+     """Process extracted `data` into the given `bundler` without invoking LLMs."""
+     try:
+         if data.content_check:
+             cc = data.content_check
+             provider_name = str(ai_content_check_provider)
+             bundler.report.external_references.append(
+                 dict(
+                     source_name="txt2stix_describes_incident",
+                     description=str(cc.describes_incident).lower(),
+                     external_id=provider_name,
+                 )
+             )
+             for classification in cc.incident_classification:
+                 bundler.report.labels.append(f"classification.{classification}".lower())
+             bundler.add_summary(cc.summary, provider_name)
+     except BaseException:
+         logging.exception("applying content_check to bundler failed", exc_info=True)
+
+     # process extracts into bundler
+     process_extracts(bundler, data.extractions)
+
+     # process relationships into bundler
+     try:
+         if data.relationships:
+             bundler.process_relationships(data.relationships.get("relationships", []))
+     except BaseException:
+         logging.exception("processing relationships failed", exc_info=True)

+     # generate attack flow / navigator layer now that bundler has been populated
+     try:
+         if ai_create_attack_flow or ai_create_attack_navigator_layer:
+             data.attack_flow, data.navigator_layer = (
+                 attack_flow.extract_attack_flow_and_navigator(
+                     bundler,
+                     preprocessed_text,
+                     ai_create_attack_flow,
+                     ai_create_attack_navigator_layer,
+                     ai_settings_relationships,
+                     flow=data.attack_flow,
+                 )
+             )
+     except BaseException:
+         logging.exception("attack flow / navigator generation failed", exc_info=True)


  def main():
@@ -424,35 +605,50 @@ def main():
          setLogFile(logger, Path(f"logs/logs-{job_id}.log"))
          logger.info(f"Arguments: {json.dumps(sys.argv[1:])}")

-
          input_text = args.input_file.read_text()
-         preprocessed_text = remove_links(input_text, args.ignore_image_refs, args.ignore_link_refs)
+         preprocessed_text = remove_links(
+             input_text, args.ignore_image_refs, args.ignore_link_refs
+         )
          load_env()

-
-         bundler = txt2stixBundler(args.name, args.use_identity, args.tlp_level, input_text, args.confidence, args.all_extractors, args.labels, created=args.created, report_id=args.report_id, external_references=args.external_refs)
+         bundler = txt2stixBundler(
+             args.name,
+             args.use_identity,
+             args.tlp_level,
+             input_text,
+             args.confidence,
+             args.all_extractors,
+             args.labels,
+             created=args.created,
+             report_id=args.report_id,
+             external_references=args.external_refs,
+         )
          log_notes(sys.argv, "Config")

          data = run_txt2stix(
-             bundler, preprocessed_text, args.use_extractions,
-             input_token_limit=int(os.environ['INPUT_TOKEN_LIMIT']),
+             bundler,
+             preprocessed_text,
+             args.use_extractions,
+             input_token_limit=int(os.environ["INPUT_TOKEN_LIMIT"]),
              **args.__dict__,
          )

          ## write outputs
          out = bundler.to_json()
-         output_dir = Path("./output")/str(bundler.uuid)
+         output_dir = Path("./output") / str(bundler.uuid)
          with contextlib.suppress(BaseException):
              shutil.rmtree(output_dir)
          output_dir.mkdir(exist_ok=True, parents=True)
-         output_path = output_dir/f"{bundler.bundle.id}.json"
+         output_path = output_dir / f"{bundler.bundle.id}.json"
          output_path.write_text(out)
          logger.info(f"Wrote bundle output to `{output_path}`")
-         data_path = output_dir/f"data--{bundler.uuid}.json"
+         data_path = output_dir / f"data--{bundler.uuid}.json"
          data_path.write_text(data.model_dump_json(indent=4))
          logger.info(f"Wrote data output to `{data_path}`")
          for nav_layer in data.navigator_layer or []:
-             nav_path = output_dir/f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+             nav_path = (
+                 output_dir / f"navigator-{nav_layer['domain']}----{bundler.uuid}.json"
+             )
              nav_path.write_text(json.dumps(nav_layer, indent=4))
              logger.info(f"Wrote navigator output to `{nav_path}`")
      except argparse.ArgumentError as e:
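
Note: `run_txt2stix` is now split into a side-effect-free `extraction_phase` (extractor and LLM calls) and a `processing_phase` that populates the bundler, with a new `txt2stix_data` argument to skip extraction entirely. A hedged sketch of the replay workflow this enables, assuming the saved JSON round-trips through pydantic validation (bundler construction and extractor setup elided; names follow the diff above):

    # run the expensive extractor/LLM phase once and persist the result
    data = extraction_phase(preprocessed_text, extractors_map,
                            ai_settings_extractions=ai_settings_extractions)
    Path("data.json").write_text(data.model_dump_json())

    # later: rebuild a bundle with no further extractor or LLM calls
    saved = Txt2StixData.model_validate_json(Path("data.json").read_text())
    run_txt2stix(bundler, preprocessed_text, extractors_map, txt2stix_data=saved)
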
txt2stix/utils.py CHANGED
@@ -8,7 +8,7 @@ import mistune
  from mistune.renderers.markdown import MarkdownRenderer
  from mistune.util import unescape

- from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident
+ from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, RelationshipList
  class ImageLinkRemover(MarkdownRenderer):
      def __init__(self, remove_links: bool=False, remove_images: bool=False):
          self.remove_links = remove_links
@@ -49,7 +49,7 @@ class ImageLinkRemover(MarkdownRenderer):
  class Txt2StixData(BaseModel):
      content_check: DescribesIncident = Field(default=None)
      extractions: dict = Field(default=None)
-     relationships: list[dict] = Field(default_factory=list)
+     relationships: dict|RelationshipList = Field(default_factory=dict)
      attack_flow: AttackFlowList = Field(default=None)
      navigator_layer: list = Field(default=None)

txt2stix-1.1.12.dist-info/METADATA → txt2stix-1.1.14.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: txt2stix
- Version: 1.1.12
+ Version: 1.1.14
  Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
  Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
  Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
txt2stix-1.1.12.dist-info/RECORD → txt2stix-1.1.14.dist-info/RECORD CHANGED
@@ -1,24 +1,24 @@
- txt2stix/__init__.py,sha256=Sm_VT913IFuAZ6dJEdVz3baPwC5VYtHySVfBAOUG92w,803
- txt2stix/attack_flow.py,sha256=qlzI7TdYwPOXegx0hTdvVuZ_He2yQVg9eFPOpEt3huE,9038
- txt2stix/bundler.py,sha256=5E6IptaAyHXdMA7JUw8yG5J2hLZ9kqQuDsWCQAC3xlY,16937
+ txt2stix/__init__.py,sha256=kHCnJtzi37ivXx2STT5zT7-cUL16i86o7ywtSd3iXd4,769
+ txt2stix/attack_flow.py,sha256=VAsgNKZvPa-llUsGvbv0tYNc2Kif5pNeMoxH88-6CWc,9060
+ txt2stix/bundler.py,sha256=GmpWW9ek4iFZdEIyjVSpd9RnmyeNsZJOpnax5Tt0uT0,16748
  txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
  txt2stix/credential_checker.py,sha256=eWDP-jY3-jm8zI0JMoUcyoQZ_JqPNfCIr_HAO8nVYz0,3044
  txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171
- txt2stix/indicator.py,sha256=dyf4wbvVrZRitZpm6t7UusSM98bVW1qc5UkdGpVm3ls,30025
+ txt2stix/indicator.py,sha256=EYh3mmgdq-8_5uQrHX5OkQG1sgiO1jQjRCqJaHqyF8k,30770
  txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
  txt2stix/retriever.py,sha256=sMNhnEYk3l5W44qZsWaDQtJYoHXA1oYIPM6wDqiUHSg,6642
  txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
- txt2stix/txt2stix.py,sha256=CaK2YmkMjBvC8FXZmvkThZfb9_K94sV31Uvns3gPx20,18862
- txt2stix/utils.py,sha256=n6mh4t9ZRJ7iT4Jvp9ai_dfCXjgXNcRtF_zXO7nkpnk,3304
+ txt2stix/txt2stix.py,sha256=eUL0pynQXruJRDvqs-LQ-dspDITx5tFDnTPEgCRQApk,23348
+ txt2stix/utils.py,sha256=Le0VYx8n8UNpcjqwpx7Avb06qIS9_hId8yP8_PquBUs,3333
  txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
  txt2stix/ai_extractor/anthropic.py,sha256=B5Z3nm2-w5KBhLcVJGkhNF0dn4lUo-fW_DnbOeJKA5Q,481
- txt2stix/ai_extractor/base.py,sha256=t0SCh24FeDEDzXsrGFada6ux9F6m0ILwXtPSaleDiv8,4172
+ txt2stix/ai_extractor/base.py,sha256=I_UwX4mOAVa8HrjSkI3KqKKImIBtQ29RdprDOu2NK6A,4235
  txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
  txt2stix/ai_extractor/gemini.py,sha256=rhhYrCa1zZTjadVk2QFhguD8_Yr03gl-D4Yb2nVBMI4,633
  txt2stix/ai_extractor/openai.py,sha256=1RxaLy0TJ4GjNKmcJoi6ZiBrCS_gt5ql9jpeE-SOy8g,642
  txt2stix/ai_extractor/openrouter.py,sha256=hAA6mTOMcpA28XYsOCvuJH7WMJqXCxfqZGJf_VrDsIk,628
  txt2stix/ai_extractor/prompts.py,sha256=NtqtVyPPtShPlVZ5SrFmo-LCkfpANIIi4H9rjqaxqDo,10559
- txt2stix/ai_extractor/utils.py,sha256=K3qglBRWYAuRU806-ahbz_PK1qQFfJ7ueWybVxYZYlQ,4425
+ txt2stix/ai_extractor/utils.py,sha256=7iB2qm-oUSFaYidsNi74EACwLV5skCcecCw3F9eIJx4,4507
  txt2stix/pattern/__init__.py,sha256=K9ofaP2AOikvzb48VSBpJZijckdqufZxSzr_kbRypLY,491
  txt2stix/pattern/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  txt2stix/pattern/extractors/base_extractor.py,sha256=ly80rp-L40g7DbhrGiCvhPWI95-ZFMtAQUEC-fH6Y-o,6130
@@ -114,8 +114,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
  txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
  txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
  txt2stix/includes/tests/test_cases.yaml,sha256=vErA3c5fySeWvJ5yJ8dCTEo3ufRATASAjaF4gj4Az1M,22424
- txt2stix-1.1.12.dist-info/METADATA,sha256=H_-Z_rIZrd0_yLobzdL9Ftthm400x05vLmSThIRDcVQ,15032
- txt2stix-1.1.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- txt2stix-1.1.12.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
- txt2stix-1.1.12.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
- txt2stix-1.1.12.dist-info/RECORD,,
+ txt2stix-1.1.14.dist-info/METADATA,sha256=NaUvm8KFwWFYKiug2PDVWGXRd_W1E75y5j3LiouGW7c,15032
+ txt2stix-1.1.14.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ txt2stix-1.1.14.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
+ txt2stix-1.1.14.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
+ txt2stix-1.1.14.dist-info/RECORD,,
txt2stix-1.1.12.dist-info/WHEEL → txt2stix-1.1.14.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.27.0
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any