txt2stix 1.0.1__py3-none-any.whl → 1.0.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import warnings
2
3
 
3
4
  import dotenv
4
5
 
@@ -12,4 +13,4 @@ for path in ["openai", "anthropic", "gemini", "deepseek", "openrouter"]:
12
13
  try:
13
14
  __import__(__package__ + "." + path)
14
15
  except Exception as e:
15
- logging.warning("%s not supported, please install missing modules", path, exc_info=True)
16
+ pass
txt2stix/extractions.py CHANGED
@@ -31,8 +31,9 @@ class Extractor(NamedDict):
31
31
  self.extraction_key = key
32
32
  self.slug = key
33
33
  test_cases = test_cases or dict()
34
- self.prompt_negative_examples = test_cases.get('test_negative_examples') or []
35
- self.prompt_positive_examples = test_cases.get('test_positive_examples') or []
34
+
35
+ self.prompt_negative_examples = remove_empty(test_cases.get('test_negative_examples') or [])
36
+ self.prompt_positive_examples = remove_empty(test_cases.get('test_positive_examples') or [])
36
37
  if self.file and not Path(self.file).is_absolute() and include_path:
37
38
  self.file = Path(include_path) / self.file
38
39
 
@@ -44,6 +45,9 @@ class Extractor(NamedDict):
44
45
  for line in file.read_text().splitlines():
45
46
  self.lookups.add(line.strip())
46
47
 
48
+ def remove_empty(iterable: list):
49
+ return [it for it in iterable if it]
50
+
47
51
  def parse_extraction_config(include_path: Path):
48
52
  config = {}
49
53
  test_cases = load_test_cases_config(include_path)
txt2stix/txt2stix.py CHANGED
@@ -38,7 +38,7 @@ def newLogger(name: str) -> logging.Logger:
38
38
  level=logging.DEBUG, # Set the desired logging level
39
39
  format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
40
40
  handlers=[stream_handler],
41
- datefmt='%d-%b-%y %H:%M:%S'
41
+ datefmt='%d-%b-%y %H:%M:%S',
42
42
  )
43
43
 
44
44
  return logging.root
@@ -127,31 +127,122 @@ def parse_bool(value: str):
127
127
  value = value.lower()
128
128
  return value in ["yes", "y", "true", "1"]
129
129
 
130
+
130
131
  def parse_args():
131
- EXTRACTORS_PATH = INCLUDES_PATH/"extractions"
132
+ EXTRACTORS_PATH = INCLUDES_PATH / "extractions"
132
133
  all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)
133
-
134
+
134
135
  parser = argparse.ArgumentParser(description="File Conversion Tool")
135
136
 
136
- inf_arg = parser.add_argument("--input_file", "--input-file", required=True, help="The file to be converted. Must be .txt", type=Path)
137
- parser.add_argument("--ai_content_check_provider", required=False, type=parse_model, help="Use an AI model to check wether the content of the file contains threat intelligence. Paticularly useful to weed out vendor marketing.")
138
- parser.add_argument("--always_extract", default=True, type=parse_bool, help="Whether to always extract or not depending on output of ai_content_check_provider. Default, extracts even when content_check returns describes_incident=False")
139
- name_arg = parser.add_argument("--name", required=True, help="Name of the file, max 124 chars", default="stix-out")
140
- parser.add_argument("--created", required=False, default=datetime.now(), help="Allow user to optionally pass --created time in input, which will hardcode the time used in created times")
141
- parser.add_argument("--ai_settings_extractions", required=False, type=parse_model, help="(required if AI extraction enabled): passed in format provider:model e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers.", metavar="provider[:model]", nargs='+')
142
- parser.add_argument("--ai_settings_relationships", required=False, type=parse_model, help="(required if AI relationship enabled): passed in format `provider:model`. Can only pass one model at this time.", metavar="provider[:model]")
137
+ inf_arg = parser.add_argument(
138
+ "--input_file",
139
+ "--input-file",
140
+ required=True,
141
+ help="The file to be converted. Must be .txt",
142
+ type=Path,
143
+ )
144
+ parser.add_argument(
145
+ "--ai_content_check_provider",
146
+ required=False,
147
+ type=parse_model,
148
+ help="Use an AI model to check wether the content of the file contains threat intelligence. Paticularly useful to weed out vendor marketing.",
149
+ )
150
+ parser.add_argument(
151
+ "--ai_extract_if_no_incidence",
152
+ default=True,
153
+ type=parse_bool,
154
+ help="if content check decides the report is not related to cyber security intelligence (e.g. vendor marketing), then you can use this setting to decide wether or not script should proceed. Setting to `false` will stop processing. It is designed to save AI tokens processing unknown content at scale in an automated way.",
155
+ )
156
+ name_arg = parser.add_argument(
157
+ "--name",
158
+ required=True,
159
+ help="Name of the file, max 124 chars",
160
+ default="stix-out",
161
+ )
162
+ parser.add_argument(
163
+ "--created",
164
+ required=False,
165
+ default=datetime.now(),
166
+ help="Allow user to optionally pass --created time in input, which will hardcode the time used in created times",
167
+ )
168
+ parser.add_argument(
169
+ "--ai_settings_extractions",
170
+ required=False,
171
+ type=parse_model,
172
+ help="(required if AI extraction enabled): passed in format provider:model e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers.",
173
+ metavar="provider[:model]",
174
+ nargs="+",
175
+ )
176
+ parser.add_argument(
177
+ "--ai_settings_relationships",
178
+ required=False,
179
+ type=parse_model,
180
+ help="(required if AI relationship enabled): passed in format `provider:model`. Can only pass one model at this time.",
181
+ metavar="provider[:model]",
182
+ )
143
183
  parser.add_argument("--labels", type=parse_labels)
144
- rmode_arg = parser.add_argument("--relationship_mode", choices=["ai", "standard"], required=True)
145
- parser.add_argument("--report_id", type=uuid.UUID, required=False, help="id to use instead of automatically generated `{name}+{created}`", metavar="VALID_UUID")
146
- parser.add_argument("--confidence", type=range_type(0,100), default=None, help="value between 0-100. Default if not passed is null.", metavar="[0-100]")
147
- parser.add_argument("--tlp_level", "--tlp-level", choices=TLP_LEVEL.levels().keys(), default="clear", help="TLP level, default is clear")
148
- extractions_arg = parser.add_argument("--use_extractions", "--use-extractions", default={}, type=functools.partial(parse_extractors_globbed, "extractor", all_extractors), help="Specify extraction types from the default/local extractions .yaml file", metavar="EXTRACTION1,EXTRACTION2")
149
- parser.add_argument("--use_identity", "--use-identity", help="Specify an identity file id (e.g., {\"type\":\"identity\",\"name\":\"demo\",\"identity_class\":\"system\"})", metavar="[stix2 identity json]", type=parse_stix)
150
- parser.add_argument("--external_refs", type=parse_ref, help="pass additional `external_references` entry (or entries) to the report object created. e.g --external_ref author=dogesec link=https://dkjjadhdaj.net", default=[], metavar="{source_name}={external_id}", action="extend", nargs='+')
151
- parser.add_argument('--ignore_image_refs', default=True, type=parse_bool)
152
- parser.add_argument('--ignore_link_refs', default=True, type=parse_bool)
153
- parser.add_argument("--ignore_extraction_boundary", default=False, type=parse_bool, help="default if not passed is `false`, but if set to `true` will ignore boundary capture logic for extractions")
154
- aflow_arg = parser.add_argument('--ai_create_attack_flow', default=False, action='store_true', help="create attack flow for attack objects in report/bundle")
184
+ rmode_arg = parser.add_argument(
185
+ "--relationship_mode", choices=["ai", "standard"], required=True
186
+ )
187
+ parser.add_argument(
188
+ "--report_id",
189
+ type=uuid.UUID,
190
+ required=False,
191
+ help="id to use instead of automatically generated `{name}+{created}`",
192
+ metavar="VALID_UUID",
193
+ )
194
+ parser.add_argument(
195
+ "--confidence",
196
+ type=range_type(0, 100),
197
+ default=None,
198
+ help="value between 0-100. Default if not passed is null.",
199
+ metavar="[0-100]",
200
+ )
201
+ parser.add_argument(
202
+ "--tlp_level",
203
+ "--tlp-level",
204
+ choices=TLP_LEVEL.levels().keys(),
205
+ default="clear",
206
+ help="TLP level, default is clear",
207
+ )
208
+ extractions_arg = parser.add_argument(
209
+ "--use_extractions",
210
+ "--use-extractions",
211
+ default={},
212
+ type=functools.partial(parse_extractors_globbed, "extractor", all_extractors),
213
+ help="Specify extraction types from the default/local extractions .yaml file",
214
+ metavar="EXTRACTION1,EXTRACTION2",
215
+ )
216
+ parser.add_argument(
217
+ "--use_identity",
218
+ "--use-identity",
219
+ help='Specify an identity file id (e.g., {"type":"identity","name":"demo","identity_class":"system"})',
220
+ metavar="[stix2 identity json]",
221
+ type=parse_stix,
222
+ )
223
+ parser.add_argument(
224
+ "--external_refs",
225
+ type=parse_ref,
226
+ help="pass additional `external_references` entry (or entries) to the report object created. e.g --external_ref author=dogesec link=https://dkjjadhdaj.net",
227
+ default=[],
228
+ metavar="{source_name}={external_id}",
229
+ action="extend",
230
+ nargs="+",
231
+ )
232
+ parser.add_argument("--ignore_image_refs", default=True, type=parse_bool)
233
+ parser.add_argument("--ignore_link_refs", default=True, type=parse_bool)
234
+ parser.add_argument(
235
+ "--ignore_extraction_boundary",
236
+ default=False,
237
+ type=parse_bool,
238
+ help="default if not passed is `false`, but if set to `true` will ignore boundary capture logic for extractions",
239
+ )
240
+ aflow_arg = parser.add_argument(
241
+ "--ai_create_attack_flow",
242
+ default=False,
243
+ action="store_true",
244
+ help="create attack flow for attack objects in report/bundle",
245
+ )
155
246
 
156
247
  args = parser.parse_args()
157
248
  if not args.input_file.exists():
@@ -159,18 +250,27 @@ def parse_args():
159
250
  if len(args.name) > 124:
160
251
  raise argparse.ArgumentError(name_arg, "max 124 characters")
161
252
 
162
- if args.relationship_mode == 'ai' and not args.ai_settings_relationships:
163
- raise argparse.ArgumentError(rmode_arg, "relationship_mode is set to AI, --ai_settings_relationships is required")
253
+ if args.relationship_mode == "ai" and not args.ai_settings_relationships:
254
+ raise argparse.ArgumentError(
255
+ rmode_arg,
256
+ "relationship_mode is set to AI, --ai_settings_relationships is required",
257
+ )
164
258
 
165
259
  if args.ai_create_attack_flow and not args.ai_settings_relationships:
166
- raise argparse.ArgumentError(aflow_arg, "--ai_create_attack_flow requires --ai_settings_relationships")
167
- #### process --use-extractions
168
- if args.use_extractions.get('ai') and not args.ai_settings_extractions:
169
- raise argparse.ArgumentError(extractions_arg, "ai based extractors are passed, --ai_settings_extractions is required")
260
+ raise argparse.ArgumentError(
261
+ aflow_arg, "--ai_create_attack_flow requires --ai_settings_relationships"
262
+ )
263
+ #### process --use-extractions
264
+ if args.use_extractions.get("ai") and not args.ai_settings_extractions:
265
+ raise argparse.ArgumentError(
266
+ extractions_arg,
267
+ "ai based extractors are passed, --ai_settings_extractions is required",
268
+ )
170
269
 
171
- args.all_extractors = all_extractors
270
+ args.all_extractors = all_extractors
172
271
  return args
173
272
 
273
+
174
274
  REQUIRED_ENV_VARIABLES = [
175
275
  "INPUT_TOKEN_LIMIT",
176
276
  "CTIBUTLER_BASE_URL",
@@ -243,12 +343,12 @@ def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
243
343
  token_count = _count_token(extractor, input)
244
344
  if token_count > max_tokens:
245
345
  raise FatalException(f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})")
246
-
346
+
247
347
 
248
348
  @functools.lru_cache
249
349
  def _count_token(extractor: BaseAIExtractor, input: str):
250
350
  return extractor.count_tokens(input)
251
-
351
+
252
352
  def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_map: dict,
253
353
  ai_content_check_provider=None,
254
354
  ai_create_attack_flow=None,
@@ -257,7 +357,7 @@ def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_ma
257
357
  ai_settings_relationships=None,
258
358
  relationship_mode="standard",
259
359
  ignore_extraction_boundary=False,
260
- always_extract=False, # continue even if ai_content_check fails
360
+ ai_extract_if_no_incidence=True, # continue even if ai_content_check fails
261
361
 
262
362
  **kwargs
263
363
  ) -> Txt2StixData:
@@ -276,7 +376,7 @@ def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_ma
276
376
  bundler.report.labels.append(f'txt2stix:{classification}'.lower())
277
377
  bundler.add_summary(retval.content_check.summary, model.extractor_name)
278
378
 
279
- if should_extract or always_extract:
379
+ if should_extract or ai_extract_if_no_incidence:
280
380
  if extractors_map.get("ai"):
281
381
  validate_token_count(input_token_limit, preprocessed_text, ai_settings_extractions)
282
382
  if relationship_mode == "ai":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: txt2stix
3
- Version: 1.0.1
3
+ Version: 1.0.1.post3
4
4
  Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
5
5
  Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
6
6
  Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
@@ -15,11 +15,7 @@ Requires-Python: >=3.9
15
15
  Requires-Dist: base58>=2.1.1
16
16
  Requires-Dist: beautifulsoup4>=4.12.3
17
17
  Requires-Dist: llama-index-core>=0.12.42
18
- Requires-Dist: llama-index-llms-anthropic>=0.7.2
19
- Requires-Dist: llama-index-llms-deepseek>=0.1.2
20
- Requires-Dist: llama-index-llms-gemini>=0.5.0
21
18
  Requires-Dist: llama-index-llms-openai>=0.4.5
22
- Requires-Dist: llama-index-llms-openrouter>=0.3.2
23
19
  Requires-Dist: mistune>=3.0.2
24
20
  Requires-Dist: pathvalidate>=3.2.0
25
21
  Requires-Dist: phonenumbers>=8.13.39
@@ -30,10 +26,19 @@ Requires-Dist: stix2extensions
30
26
  Requires-Dist: tld>=0.13
31
27
  Requires-Dist: tldextract>=5.1.2
32
28
  Requires-Dist: validators>=0.28.3
29
+ Provides-Extra: anthropic
30
+ Requires-Dist: llama-index-llms-anthropic>=0.7.2; extra == 'anthropic'
31
+ Provides-Extra: deepseek
32
+ Requires-Dist: llama-index-llms-deepseek>=0.1.2; extra == 'deepseek'
33
+ Provides-Extra: gemini
34
+ Requires-Dist: llama-index-llms-gemini>=0.5.0; extra == 'gemini'
35
+ Provides-Extra: openrouter
36
+ Requires-Dist: llama-index-llms-openrouter>=0.3.2; extra == 'openrouter'
33
37
  Provides-Extra: tests
34
38
  Requires-Dist: pytest; extra == 'tests'
35
39
  Requires-Dist: pytest-cov; extra == 'tests'
36
40
  Requires-Dist: pytest-subtests; extra == 'tests'
41
+ Requires-Dist: python-dateutil; extra == 'tests'
37
42
  Requires-Dist: requests; extra == 'tests'
38
43
  Description-Content-Type: text/markdown
39
44
 
@@ -85,7 +90,13 @@ cd txt2stix
85
90
  python3 -m venv txt2stix-venv
86
91
  source txt2stix-venv/bin/activate
87
92
  # install requirements
88
- pip3 install .
93
+ pip3 install txt2stix
94
+ ```
95
+
96
+ Note: by default txt2stix installs only the OpenAI integration as the AI provider. You can also use Anthropic, Gemini, OpenRouter or DeepSeek, but you need to install these extras manually if you plan to use them, as follows (remove those that don't apply):
97
+
98
+ ```shell
99
+ pip3 install txt2stix[deepseek,gemini,anthropic,openrouter]
89
100
  ```
90
101
 
91
102
  ### Set variables
@@ -113,39 +124,39 @@ The following arguments are available:
113
124
 
114
125
  #### Input settings
115
126
 
116
- * `--input_file` (REQUIRED): the file to be converted. Must be `.txt`
127
+ * `--input_file` (`path/to/file.txt`, required): the file to be converted. Must be `.txt`
117
128
 
118
129
  #### STIX Report generation settings
119
130
 
120
131
 
121
- * `--name` (REQUIRED): name of file, max 72 chars. Will be used in the STIX Report Object created.
122
- * `--report_id` (OPTIONAL): Sometimes it is required to control the id of the `report` object generated. You can therefore pass a valid UUIDv4 in this field to be assigned to the report. e.g. passing `2611965-930e-43db-8b95-30a1e119d7e2` would create a STIX object id `report--2611965-930e-43db-8b95-30a1e119d7e2`. If this argument is not passed, the UUID will be randomly generated.
123
- * `--tlp_level` (OPTIONAL): Options are `clear`, `green`, `amber`, `amber_strict`, `red`. Default if not passed, is `clear`.
124
- * `--confidence` (OPTIONAL): value between 0-100. Default if not passed is null.
132
+ * `--name` (text, required): name of file, max 124 chars. Will be used in the STIX Report Object created.
133
+ * `--report_id` (UUIDv4, default is random UUIDv4): Sometimes it is required to control the id of the `report` object generated. You can therefore pass a valid UUIDv4 in this field to be assigned to the report. e.g. passing `2611965-930e-43db-8b95-30a1e119d7e2` would create a STIX object id `report--2611965-930e-43db-8b95-30a1e119d7e2`. If this argument is not passed, the UUID will be randomly generated.
134
+ * `--tlp_level` (dictionary, default, `clear`): Options are `clear`, `green`, `amber`, `amber_strict`, `red`.
135
+ * `--confidence` (value between 0-100): If not passed, report will be assigned no confidence score value
125
136
  * `--labels` (OPTIONAL): comma separated list of labels. Case-insensitive (will all be converted to lower-case). Allowed `a-z`, `0-9`. e.g. `label1,label2` would create 2 labels.
126
- * `--created` (OPTIONAL): by default all object `created` times will take the time the script was run. If you want to explicitly set these times you can do so using this flag. Pass the value in the format `YYYY-MM-DDTHH:MM:SS.sssZ` e.g. `2020-01-01T00:00:00.000Z`
127
- * `--use_identity` (OPTIONAL): can pass a full STIX 2.1 identity object (make sure to properly escape). Will be validated by the STIX2 library.
137
+ * `--created` (datetime, optional): by default all object `created` times will take the time the script was run. If you want to explicitly set these times you can do so using this flag. Pass the value in the format `YYYY-MM-DDTHH:MM:SS.sssZ` e.g. `2020-01-01T00:00:00.000Z`
138
+ * `--use_identity` (stix identity, optional, default txt2stix identity): can pass a full STIX 2.1 identity object (make sure to properly escape). Will be validated by the STIX2 library.
128
139
  * `--external_refs` (OPTIONAL): txt2stix will automatically populate the `external_references` of the report object it creates for the input. You can use this value to add additional objects to `external_references`. Note, you can only add `source_name` and `external_id` values currently. Pass as `source_name=external_id`. e.g. `--external_refs txt2stix=demo1 source=id` would create the following objects under the `external_references` property: `{"source_name":"txt2stix","external_id":"demo1"},{"source_name":"source","external_id":"id"}`
129
140
 
130
141
  #### Output settings
131
142
 
132
143
  How the extractions are performed
133
144
 
134
- * `--use_extractions` (REQUIRED): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `'pattern_*'` would run all extractions starting with `pattern_` -- make sure to use quotes when using a wildcard)
145
+ * `--use_extractions` (dictionary, required): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `'pattern_*'` would run all extractions starting with `pattern_` -- make sure to use quotes when using a wildcard)
135
146
  * Important: if using any AI extractions (`ai_*`), you must set an AI API key in your `.env` file
136
147
  * Important: if you are using any MITRE ATT&CK, CAPEC, CWE, ATLAS or Location extractions you must set the `CTIBUTLER` settings, and if you are using NVD CPE or CVE extractions you must set the `VULMATCH` settings, in your `.env` file
137
- * `--relationship_mode` (REQUIRED): either.
148
+ * `--relationship_mode` (dictionary, required): either:
138
149
  * `ai`: AI provider must be enabled. extractions performed by either regex or AI for extractions user selected. Rich relationships created from AI provider from extractions.
139
150
  * `standard`: extractions performed by either regex or AI (AI provider must be enabled) for extractions user selected. Basic relationships created from extractions back to master Report object generated.
140
- * `--ignore_extraction_boundary` (OPTIONAL, default `false`, not compatible with AI extractions): in some cases the same string will create multiple extractions depending on extractions set (e.g. `https://www.google.com/file.txt` could create a url, url with file, domain, subdomain, and file). The default behaviour is for txt2stix to take the longest extraction and ignore everything else (e.g. only extract url with file, and ignore url, file, domain, subdomain, and file). If you want to override this behaviour and get all extractions in the output, set this flag to `true`.
141
- * `--ignore_image_refs` (default `true`): images references in documents don't usually need extracting. e.g. `<img src="https://example.com/image.png" alt="something">` you would not want domain or file extractions extracting `example.com` and `image.png`. Hence these are ignored by default (they are removed from text sent to extraction). Note, only the `img src` is ignored, all other values e.g. `alt` are considered. If you want extractions to consider this data, set it to `false`
142
- * `--ignore_link_refs` (default `true`): link references in documents don't usually need extracting e.g. `<a href="https://example.com/link.html" title="something">Bad Actor</a>` you would only want `Bad actor` to be considered for extraction. Hence these part of the link are ignored by default (they are removed from text sent to extraction). Note, only the `a href` is ignored, all other values e.g. `title` are considered. Setting this to `false` will also include everything inside the link tag (e.g. `example.com` would extract as a domain)
151
+ * `--ignore_extraction_boundary` (boolean, default `false`, not compatible with AI extractions): in some cases the same string will create multiple extractions depending on extractions set (e.g. `https://www.google.com/file.txt` could create a url, url with file, domain, subdomain, and file). The default behaviour is for txt2stix to take the longest extraction and ignore everything else (e.g. only extract url with file, and ignore url, file, domain, subdomain, and file). If you want to override this behaviour and get all extractions in the output, set this flag to `true`.
152
+ * `--ignore_image_refs` (boolean, default `true`): image references in documents don't usually need extracting, e.g. for `<img src="https://example.com/image.png" alt="something">` you would not want domain or file extractions extracting `example.com` and `image.png`. Hence these are ignored by default (they are removed from text sent to extraction). Note, only the `img src` is ignored, all other values e.g. `alt` are considered. If you want extractions to consider this data, set it to `false`
153
+ * `--ignore_link_refs` (boolean, default `true`): link references in documents don't usually need extracting, e.g. for `<a href="https://example.com/link.html" title="something">Bad Actor</a>` you would only want `Bad Actor` to be considered for extraction. Hence these parts of the link are ignored by default (they are removed from text sent to extraction). Note, only the `a href` is ignored, all other values e.g. `title` are considered. Setting this to `false` will also include everything inside the link tag (e.g. `example.com` would extract as a domain)
143
154
 
144
155
  #### AI settings
145
156
 
146
157
  If any AI extractions, or AI relationship mode is set, you must set the following accordingly
147
158
 
148
- * `--ai_settings_extractions`:
159
+ * `--ai_settings_extractions` (`provider:model`, required if one or more AI extractions set):
149
160
  * defines the `provider:model` to be used for extractions. You can supply more than one provider. Separate with a space (e.g. `openrouter:openai/gpt-4o` `openrouter:deepseek/deepseek-chat`). If more than one provider is passed, txt2stix will take extractions from all models, de-duplicate them, and then package them in the output. Currently supports:
150
161
  * Provider (env var required `OPENROUTER_API_KEY`): `openrouter:`, providers/models `openai/gpt-4o`, `deepseek/deepseek-chat` ([More here](https://openrouter.ai/models))
151
162
  * Provider (env var required `OPENAI_API_KEY`): `openai:`, models e.g.: `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`, `gpt-4` ([More here](https://platform.openai.com/docs/models))
@@ -153,11 +164,15 @@ If any AI extractions, or AI relationship mode is set, you must set the followin
153
164
  * Provider (env var required `GOOGLE_API_KEY`): `gemini:models/`, models: `gemini-1.5-pro-latest`, `gemini-1.5-flash-latest` ([More here](https://ai.google.dev/gemini-api/docs/models/gemini))
154
165
  * Provider (env var required `DEEPSEEK_API_KEY`): `deepseek:`, models `deepseek-chat` ([More here](https://api-docs.deepseek.com/quick_start/pricing))
155
166
  * See `tests/manual-tests/cases-ai-extraction-type.md` for some examples
156
- * `--ai_settings_relationships`:
167
+ * `--ai_settings_relationships` (`provider:model`, required if AI relationship mode set):
157
168
  * similar to `ai_settings_extractions` but defines the model used to generate relationships. Only one model can be provided. Passed in same format as `ai_settings_extractions`
158
169
  * See `tests/manual-tests/cases-ai-relationships.md` for some examples
159
- * `--ai_content_check_provider`: Passing this flag will get the AI to try and classify the text in the input to 1) determine if it is talking about threat intelligence, and 2) what type of threat intelligence it is talking about. For context, we use this to filter out non-threat intel posts in Obstracts and Stixify. You pass `provider:model` with this flag to determine the AI model you wish to use to perform the check.
160
- * `--ai_create_attack_flow`: passing this flag will also prompt the AI model (the same entered for `--ai_settings_relationships`) to generate an [Attack Flow](https://center-for-threat-informed-defense.github.io/attack-flow/) for the MITRE ATT&CK extractions to define the logical order in which they are being described. You must pass `--ai_settings_relationships` for this to work.
170
+
171
+ #### Other AI related settings
172
+
173
+ * `--ai_content_check_provider` (`provider:model`, required if passed): Passing this flag will get the AI to try and classify the text in the input to 1) determine if it is talking about threat intelligence, and 2) what type of threat intelligence it is talking about. For context, we use this to filter out non-threat intel posts in Obstracts and Stixify. You pass `provider:model` with this flag to determine the AI model you wish to use to perform the check. It will also create a summary of the content passed (and store this into a STIX Note).
174
+ * `--ai_extract_if_no_incidence` (boolean, default `true`): if the content check decides the report is not related to cyber security intelligence (e.g. vendor marketing), you can use this setting to decide whether or not the script should proceed. Setting it to `false` will stop processing. It is designed to save AI tokens when processing unknown content at scale in an automated way.
175
+ * `--ai_create_attack_flow` (boolean): passing this flag will also prompt the AI model (the same entered for `--ai_settings_relationships`) to generate an [Attack Flow](https://center-for-threat-informed-defense.github.io/attack-flow/) for the MITRE ATT&CK extractions to define the logical order in which they are being described. You must pass `--ai_settings_relationships` for this to work.
161
176
 
162
177
  ## Adding new extractions
163
178
 
@@ -2,14 +2,14 @@ txt2stix/__init__.py,sha256=Sm_VT913IFuAZ6dJEdVz3baPwC5VYtHySVfBAOUG92w,803
2
2
  txt2stix/attack_flow.py,sha256=WWlukuQYrGW1SJ1DnhfROYC5Ck4WYqNifgmtiuyDg7E,4177
3
3
  txt2stix/bundler.py,sha256=EVTcVgZyVMwb6XjNQ3Gyj7zm44UErXo9wbVr2JGsjQQ,16797
4
4
  txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
5
- txt2stix/extractions.py,sha256=ExynKWSeuWOC0q6i4SuU1NkeNw7uoOm6xu0YtJRVaiE,2058
5
+ txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171
6
6
  txt2stix/indicator.py,sha256=c6S0xx0K8JM-PT_Qd1PlN_ZlDXdnEwiRS8529iUp3yg,30774
7
7
  txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
8
8
  txt2stix/retriever.py,sha256=zU8L00RSh9N5J0NpAo3CM3IHsuZsNVjJGohRisXcMRs,5167
9
9
  txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
10
- txt2stix/txt2stix.py,sha256=Vt9CUsSEO1bw5SS7vlsVxktFz1nW8M_G4-RN6idOTA0,16444
10
+ txt2stix/txt2stix.py,sha256=RzGmzkIViEHO45GsxStcz5nbE0ynhifVeJpTalFSlZc,17405
11
11
  txt2stix/utils.py,sha256=P66yq-SphsQu2S9At6BfYpavfghXsZqh4h6W13HUEoI,3256
12
- txt2stix/ai_extractor/__init__.py,sha256=RcXh30ZcIA3Fva2bOPH4EtWq6ffWhGE39C_II8ElAx0,417
12
+ txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
13
13
  txt2stix/ai_extractor/anthropic.py,sha256=mdz-8CB-BSCEqnK5l35DRZURVPUf508ef2b48XMxmuk,441
14
14
  txt2stix/ai_extractor/base.py,sha256=MAtnKvWUmWZgnzwDM0i2n-WrRWq69du4KVcapNMIsEg,3523
15
15
  txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
@@ -112,8 +112,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
112
112
  txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
113
113
  txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
114
114
  txt2stix/includes/tests/test_cases.yaml,sha256=QD1FdIunpPkOpsn6wJRqs2vil_hv8OSVaqUp4a96aZg,22247
115
- txt2stix-1.0.1.dist-info/METADATA,sha256=rg6DP9idqFjH6eK2FbVmHxCgk-XnQ9mNTbFg3re-dvE,12916
116
- txt2stix-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
117
- txt2stix-1.0.1.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
118
- txt2stix-1.0.1.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
119
- txt2stix-1.0.1.dist-info/RECORD,,
115
+ txt2stix-1.0.1.post3.dist-info/METADATA,sha256=6rWnpDd2GTg0rjflOVndvCsbEISnV4rxhRj21as6SqI,14289
116
+ txt2stix-1.0.1.post3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
117
+ txt2stix-1.0.1.post3.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
118
+ txt2stix-1.0.1.post3.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
119
+ txt2stix-1.0.1.post3.dist-info/RECORD,,