txt2stix 1.0.1.post1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/ai_extractor/__init__.py +2 -1
- txt2stix/ai_extractor/base.py +17 -13
- txt2stix/ai_extractor/prompts.py +97 -29
- txt2stix/ai_extractor/utils.py +9 -2
- txt2stix/attack_flow.py +120 -13
- txt2stix/bundler.py +2 -0
- txt2stix/extractions.py +6 -2
- txt2stix/includes/lookups/_generate_lookups.py +2 -0
- txt2stix/retriever.py +9 -0
- txt2stix/txt2stix.py +163 -43
- txt2stix/utils.py +1 -0
- {txt2stix-1.0.1.post1.dist-info → txt2stix-1.0.2.dist-info}/METADATA +38 -23
- {txt2stix-1.0.1.post1.dist-info → txt2stix-1.0.2.dist-info}/RECORD +16 -16
- {txt2stix-1.0.1.post1.dist-info → txt2stix-1.0.2.dist-info}/WHEEL +0 -0
- {txt2stix-1.0.1.post1.dist-info → txt2stix-1.0.2.dist-info}/entry_points.txt +0 -0
- {txt2stix-1.0.1.post1.dist-info → txt2stix-1.0.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import warnings
|
2
3
|
|
3
4
|
import dotenv
|
4
5
|
|
@@ -12,4 +13,4 @@ for path in ["openai", "anthropic", "gemini", "deepseek", "openrouter"]:
|
|
12
13
|
try:
|
13
14
|
__import__(__package__ + "." + path)
|
14
15
|
except Exception as e:
|
15
|
-
|
16
|
+
pass
|
txt2stix/ai_extractor/base.py
CHANGED
@@ -14,7 +14,7 @@ from llama_index.core.utils import get_tokenizer
|
|
14
14
|
_ai_extractor_registry: dict[str, 'Type[BaseAIExtractor]'] = {}
|
15
15
|
class BaseAIExtractor():
|
16
16
|
system_prompt = DEFAULT_SYSTEM_PROMPT
|
17
|
-
|
17
|
+
|
18
18
|
extraction_template = DEFAULT_EXTRACTION_TEMPL
|
19
19
|
|
20
20
|
relationship_template = DEFAULT_RELATIONSHIP_TEMPL
|
@@ -28,7 +28,7 @@ class BaseAIExtractor():
|
|
28
28
|
verbose=True,
|
29
29
|
llm=self.llm,
|
30
30
|
)
|
31
|
-
|
31
|
+
|
32
32
|
def _get_relationship_program(self):
|
33
33
|
return LLMTextCompletionProgram.from_defaults(
|
34
34
|
output_parser=ParserWithLogging(RelationshipList),
|
@@ -36,7 +36,7 @@ class BaseAIExtractor():
|
|
36
36
|
verbose=True,
|
37
37
|
llm=self.llm,
|
38
38
|
)
|
39
|
-
|
39
|
+
|
40
40
|
def _get_content_checker_program(self):
|
41
41
|
return LLMTextCompletionProgram.from_defaults(
|
42
42
|
output_parser=ParserWithLogging(DescribesIncident),
|
@@ -44,10 +44,10 @@ class BaseAIExtractor():
|
|
44
44
|
verbose=True,
|
45
45
|
llm=self.llm,
|
46
46
|
)
|
47
|
-
|
47
|
+
|
48
48
|
def check_content(self, text) -> DescribesIncident:
|
49
49
|
return self._get_content_checker_program()(context_str=text)
|
50
|
-
|
50
|
+
|
51
51
|
def _get_attack_flow_program(self):
|
52
52
|
return LLMTextCompletionProgram.from_defaults(
|
53
53
|
output_parser=ParserWithLogging(AttackFlowList),
|
@@ -55,24 +55,29 @@ class BaseAIExtractor():
|
|
55
55
|
verbose=True,
|
56
56
|
llm=self.llm,
|
57
57
|
)
|
58
|
-
|
59
|
-
def extract_attack_flow(self, input_text,
|
60
|
-
|
58
|
+
|
59
|
+
def extract_attack_flow(self, input_text, techniques) -> AttackFlowList:
|
60
|
+
extracted_techniques = []
|
61
|
+
for t in techniques.values():
|
62
|
+
extracted_techniques.append(
|
63
|
+
dict(id=t['id'], name=t['name'], possible_tactics=list(t['possible_tactics'].keys()))
|
64
|
+
)
|
65
|
+
return self._get_attack_flow_program()(document=input_text, extracted_techniques=extracted_techniques)
|
61
66
|
|
62
67
|
def extract_relationships(self, input_text, extractions, relationship_types: list[str]) -> RelationshipList:
|
63
68
|
return self._get_relationship_program()(relationship_types=relationship_types, input_file=input_text, extractions=extractions)
|
64
|
-
|
69
|
+
|
65
70
|
def extract_objects(self, input_text, extractors) -> ExtractionList:
|
66
71
|
extraction_list = self._get_extraction_program()(extractors=get_extractors_str(extractors), input_file=input_text)
|
67
72
|
return extraction_list.model_dump().get('extractions', [])
|
68
|
-
|
73
|
+
|
69
74
|
def __init__(self, *args, **kwargs) -> None:
|
70
75
|
pass
|
71
76
|
|
72
77
|
def count_tokens(self, input_text):
|
73
78
|
logging.info("unsupported model `%s`, estimating using llama-index's default tokenizer", self.extractor_name)
|
74
79
|
return len(get_tokenizer()(input_text))
|
75
|
-
|
80
|
+
|
76
81
|
def __init_subclass__(cls, /, provider, register=True, **kwargs):
|
77
82
|
super().__init_subclass__(**kwargs)
|
78
83
|
if register:
|
@@ -82,7 +87,6 @@ class BaseAIExtractor():
|
|
82
87
|
@property
|
83
88
|
def extractor_name(self):
|
84
89
|
return f"{self.provider}:{self.llm.model}"
|
85
|
-
|
86
90
|
|
87
91
|
def __hash__(self):
|
88
|
-
return hash(self.extractor_name)
|
92
|
+
return hash(self.extractor_name)
|
txt2stix/ai_extractor/prompts.py
CHANGED
@@ -128,43 +128,111 @@ DEFAULT_CONTENT_CHECKER_WITH_SUMMARY_TEMPL = PromptTemplate("""
|
|
128
128
|
</summary>
|
129
129
|
""")
|
130
130
|
|
131
|
+
|
132
|
+
|
131
133
|
ATTACK_FLOW_PROMPT_TEMPL = ChatPromptTemplate([
|
132
|
-
ChatMessage.from_str("""You are a
|
133
|
-
|
134
|
-
|
135
|
-
|
134
|
+
ChatMessage.from_str("""You are a cybersecurity threat intelligence analyst.
|
135
|
+
|
136
|
+
Your task is to analyze structured cybersecurity incident reports (e.g., malware analysis, APTs, data breaches, vulnerabilities) and extract and organize MITRE ATT&CK techniques as part of an attack flow analysis. This analysis helps defenders understand adversary behavior using the MITRE Attack Flow model maintained by the MITRE Center for Threat-Informed Defense.""", MessageRole.SYSTEM),
|
137
|
+
|
138
|
+
ChatMessage.from_str("Hello. Please provide the document for analysis. Only include the full document text in your response.", MessageRole.ASSISTANT),
|
139
|
+
|
136
140
|
ChatMessage.from_str("{document}", MessageRole.USER),
|
137
|
-
ChatMessage.from_str("What are the objects that have been extracted (<extractions>) from the document above?", MessageRole.ASSISTANT),
|
138
|
-
ChatMessage.from_str("{extractions}", MessageRole.USER),
|
139
|
-
ChatMessage.from_str("What are the relationships that have been extracted (<relationships>) between the documents?", MessageRole.USER),
|
140
|
-
ChatMessage.from_str("{relationships}", MessageRole.USER),
|
141
|
-
ChatMessage.from_str("What should I do with all the data that have been provided?", MessageRole.ASSISTANT),
|
142
|
-
ChatMessage.from_str("""Consider all the MITRE ATT&CK Objects extracted from the report and the relationships they have to other objects.
|
143
141
|
|
144
|
-
|
142
|
+
ChatMessage.from_str("What ATT&CK techniques and related metadata were extracted from this document?", MessageRole.ASSISTANT),
|
145
143
|
|
146
|
-
|
144
|
+
ChatMessage.from_str("<extracted_techniques>\n\n{extracted_techniques}\n\n</extracted_techniques>", MessageRole.USER),
|
147
145
|
|
148
|
-
|
146
|
+
ChatMessage.from_str("Let's begin with tactic selection. What should I do with the techniques and possible tactics?", MessageRole.ASSISTANT),
|
149
147
|
|
150
|
-
|
148
|
+
# PART 1: Tactic Selection Phase
|
149
|
+
ChatMessage.from_str("""
|
150
|
+
PART 1: TACTIC SELECTION
|
151
151
|
|
152
|
-
|
152
|
+
For each of the technique in `<extracted_techniques>`, return [technique_id, tactic_name], where
|
153
|
+
- technique id = `technique.id`
|
154
|
+
- tactic_name = choice from `technique.possible_tactics`, where choice is selected based on the **most contextually appropriate** tactic name for each technique based on how it's used in the document.
|
153
155
|
|
154
|
-
|
156
|
+
📌 Output only the tactic assignments in this format:
|
157
|
+
<code>
|
155
158
|
{
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
}
|
159
|
+
"tactic_selection": [
|
160
|
+
["Txxxx", "impact"],
|
161
|
+
["Tyyyy", "discovery"],
|
162
|
+
...
|
163
|
+
]
|
164
|
+
}
|
165
|
+
</code>
|
166
|
+
|
167
|
+
⚠️ Constraints:
|
168
|
+
- Use **only** the `possible_tactics` provided with each technique.
|
169
|
+
- Do **not** invent or infer any technique or tactic name beyond what’s given in <extracted_techniques>.
|
170
|
+
- Ensure **every** technique in `<extracted_techniques>` appears in `tactic_selection`, even if uncertain — choose the best fit.
|
171
|
+
- Technique IDs in `tactic_selection` must match exactly from <extracted_techniques> (e.g., `T1059` must match `T1059` and not `T1059.005`, `T1001.001` must match `T1001.001` and not `T1001`).
|
172
|
+
- Must include every technique in `<extracted_techniques>`
|
173
|
+
""", MessageRole.USER),
|
174
|
+
|
175
|
+
ChatMessage.from_str("Thanks. Now let's continue with the attack flow. How should I proceed?", MessageRole.ASSISTANT),
|
176
|
+
|
177
|
+
# PART 2: Attack Flow Construction Phase
|
178
|
+
ChatMessage.from_str("""
|
179
|
+
PART 2: ATTACK FLOW CONSTRUCTION
|
180
|
+
|
181
|
+
Using the `<extracted_techniques>` and the incident details in the document, construct a sequence of MITRE ATT&CK techniques that represent the adversary’s logical progression through the attack.
|
182
|
+
|
183
|
+
For each technique:
|
184
|
+
- Use the `technique.id` exactly as provided
|
185
|
+
- Assign:
|
186
|
+
- `name`: a short, context-based phrase describing how the technique is used
|
187
|
+
- `description`: a longer explanation of how the technique operates in this specific incident, based only on the document
|
188
|
+
- `position`: the step in the logical or chronological attack sequence (starting at 0)
|
189
|
+
|
190
|
+
⚠️ Constraints:
|
191
|
+
- Use **only** technique IDs provided in `<extracted_techniques>` — do **not** invent or infer new ones
|
192
|
+
- Ensure all included technique IDs exactly match `technique.id` from `<extracted_techniques>` (e.g., `T1059` must match `T1059` and not `T1059.005`, `T1001.001` must match `T1001.001` and not `T1001`).
|
193
|
+
|
194
|
+
📤 Output Format:
|
195
|
+
<code>
|
162
196
|
{
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
197
|
+
"items": [
|
198
|
+
{
|
199
|
+
"position": 0,
|
200
|
+
"attack_technique_id": "Txxxx",
|
201
|
+
"name": "Short contextual name",
|
202
|
+
"description": "Detailed contextual explanation"
|
203
|
+
},
|
204
|
+
...
|
205
|
+
],
|
206
|
+
"success": true
|
168
207
|
}
|
169
|
-
|
170
|
-
|
208
|
+
</code>
|
209
|
+
|
210
|
+
Your goal is to tell the story of how the adversary moved through the attack using the extracted ATT&CK techniques, in the correct sequence, with clear context for defenders.
|
211
|
+
""", MessageRole.USER),
|
212
|
+
# PART 3: Combination phase
|
213
|
+
ChatMessage.from_str("""
|
214
|
+
📤 Final Output Format:
|
215
|
+
<code>
|
216
|
+
{
|
217
|
+
"tactic_selection": [...], // Use your previous output
|
218
|
+
"items": [
|
219
|
+
{
|
220
|
+
"position": 0,
|
221
|
+
"attack_technique_id": "Txxxx",
|
222
|
+
"name": "Short contextual name",
|
223
|
+
"description": "Detailed contextual explanation"
|
224
|
+
},
|
225
|
+
...
|
226
|
+
],
|
227
|
+
"success": true
|
228
|
+
}
|
229
|
+
</code>
|
230
|
+
|
231
|
+
⚠️ Constraints:
|
232
|
+
- All `attack_technique_id` values in `items` must come from `<extracted_techniques>`
|
233
|
+
- The `position` field should reflect the **chronological or logical** execution order of the attack
|
234
|
+
- Do **not** introduce new technique IDs
|
235
|
+
|
236
|
+
✅ Your goal is to build a realistic, document-based attack flow using MITRE ATT&CK technique–tactic pairs.
|
237
|
+
""", MessageRole.USER)
|
238
|
+
])
|
txt2stix/ai_extractor/utils.py
CHANGED
@@ -38,16 +38,23 @@ class DescribesIncident(BaseModel):
|
|
38
38
|
|
39
39
|
class AttackFlowItem(BaseModel):
|
40
40
|
position : int = Field(description="order of object starting at 0")
|
41
|
-
attack_tactic_id : str
|
42
41
|
attack_technique_id : str
|
43
42
|
name: str
|
44
43
|
description: str
|
45
44
|
|
46
45
|
class AttackFlowList(BaseModel):
|
47
|
-
|
46
|
+
tactic_selection: list[tuple[str, str]] = Field(description="attack technique id to attack tactic id mapping using possible_tactics")
|
47
|
+
# additional_tactic_mapping: list[tuple[str, str]] = Field(description="the rest of tactic_mapping")
|
48
48
|
items : list[AttackFlowItem]
|
49
49
|
success: bool = Field(description="determines if there's any valid flow in <extractions>")
|
50
50
|
|
51
|
+
def model_post_init(self, context):
|
52
|
+
return super().model_post_init(context)
|
53
|
+
|
54
|
+
@property
|
55
|
+
def tactic_mapping(self):
|
56
|
+
return dict(self.tactic_selection)
|
57
|
+
|
51
58
|
class ParserWithLogging(PydanticOutputParser):
|
52
59
|
def parse(self, text: str):
|
53
60
|
f = io.StringIO()
|
txt2stix/attack_flow.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
+
import json
|
1
2
|
import logging
|
2
3
|
import uuid
|
3
4
|
from stix2 import Relationship
|
5
|
+
from txt2stix import txt2stixBundler
|
4
6
|
|
7
|
+
from txt2stix.ai_extractor.base import BaseAIExtractor
|
5
8
|
from txt2stix.common import UUID_NAMESPACE
|
6
9
|
from txt2stix.retriever import STIXObjectRetriever
|
7
10
|
from stix2extensions.attack_action import AttackAction, AttackFlow
|
@@ -9,33 +12,27 @@ from stix2extensions._extensions import attack_flow_ExtensionDefinitionSMO
|
|
9
12
|
from .utils import AttackFlowList
|
10
13
|
|
11
14
|
|
12
|
-
def parse_flow(report, flow: AttackFlowList):
|
15
|
+
def parse_flow(report, flow: AttackFlowList, techniques, tactics):
|
13
16
|
logging.info(f"flow.success = {flow.success}")
|
14
17
|
if not flow.success:
|
15
18
|
return []
|
16
|
-
attack_objects = STIXObjectRetriever().get_attack_objects(
|
17
|
-
flow.matrix,
|
18
|
-
[item.attack_tactic_id for item in flow.items]
|
19
|
-
+ [item.attack_technique_id for item in flow.items],
|
20
|
-
)
|
21
|
-
attack_objects = {
|
22
|
-
obj["external_references"][0]["external_id"]: obj for obj in attack_objects
|
23
|
-
}
|
24
19
|
flow_objects = [report, attack_flow_ExtensionDefinitionSMO]
|
25
20
|
last_action = None
|
26
21
|
for i, item in enumerate(flow.items):
|
27
22
|
try:
|
28
|
-
|
29
|
-
|
23
|
+
technique = techniques[item.attack_technique_id]
|
24
|
+
tactic_id = technique['possible_tactics'][flow.tactic_mapping[item.attack_technique_id]]
|
25
|
+
technique_obj = technique["stix_obj"]
|
26
|
+
tactic_obj = tactics[technique["domain"]][tactic_id]
|
30
27
|
action_obj = AttackAction(
|
31
28
|
**{
|
32
29
|
"id": flow_id(
|
33
|
-
report["id"], item.attack_technique_id,
|
30
|
+
report["id"], item.attack_technique_id, tactic_id
|
34
31
|
),
|
35
32
|
"effect_refs": [f"attack-action--{str(uuid.uuid4())}"],
|
36
33
|
"technique_id": item.attack_technique_id,
|
37
34
|
"technique_ref": technique_obj["id"],
|
38
|
-
"tactic_id":
|
35
|
+
"tactic_id": tactic_id,
|
39
36
|
"tactic_ref": tactic_obj["id"],
|
40
37
|
"name": item.name,
|
41
38
|
"description": item.description,
|
@@ -99,3 +96,113 @@ def flow_id(report_id, technique_id, tactic_id):
|
|
99
96
|
f"{report_id}+{technique_id}+{tactic_id}",
|
100
97
|
)
|
101
98
|
)
|
99
|
+
|
100
|
+
|
101
|
+
def get_all_tactics():
|
102
|
+
tactics = {
|
103
|
+
"enterprise-attack": None,
|
104
|
+
"mobile-attack": None,
|
105
|
+
"ics-attack": None,
|
106
|
+
}
|
107
|
+
for k in tactics.keys():
|
108
|
+
matrix = k.replace("attack", "").strip("-")
|
109
|
+
all_tactics = STIXObjectRetriever().get_attack_tactics(matrix)
|
110
|
+
tactics[k] = all_tactics
|
111
|
+
return tactics
|
112
|
+
|
113
|
+
|
114
|
+
def get_techniques_from_extracted_objects(objects: dict, tactics: dict):
|
115
|
+
techniques = {}
|
116
|
+
for obj in objects:
|
117
|
+
if (
|
118
|
+
obj["type"] == "attack-pattern"
|
119
|
+
and obj.get("external_references", [{"source_name": None}])[0][
|
120
|
+
"source_name"
|
121
|
+
]
|
122
|
+
== "mitre-attack"
|
123
|
+
):
|
124
|
+
domain = obj["x_mitre_domains"][0]
|
125
|
+
technique = dict(
|
126
|
+
domain=domain,
|
127
|
+
name=obj["name"],
|
128
|
+
possible_tactics={},
|
129
|
+
id=obj["external_references"][0]["external_id"],
|
130
|
+
platforms=[
|
131
|
+
platform
|
132
|
+
for platform in obj["x_mitre_platforms"]
|
133
|
+
if platform != "None"
|
134
|
+
],
|
135
|
+
stix_obj=obj,
|
136
|
+
)
|
137
|
+
for phase in obj["kill_chain_phases"]:
|
138
|
+
if not set(phase["kill_chain_name"].split("-")).issuperset(
|
139
|
+
["mitre", "attack"]
|
140
|
+
):
|
141
|
+
continue
|
142
|
+
tactic_name = phase["phase_name"]
|
143
|
+
tactic_obj = tactics[domain][tactic_name]
|
144
|
+
tactic_id = tactic_obj["external_references"][0]["external_id"]
|
145
|
+
technique["possible_tactics"][tactic_name] = tactic_id
|
146
|
+
techniques[technique["id"]] = technique
|
147
|
+
return techniques
|
148
|
+
|
149
|
+
|
150
|
+
def create_navigator_layer(report, summary, flow: AttackFlowList, techniques):
|
151
|
+
domains = {}
|
152
|
+
for technique in techniques.values():
|
153
|
+
domain_techniques = domains.setdefault(technique["domain"], [])
|
154
|
+
technique_id = technique["id"]
|
155
|
+
if technique_id not in flow.tactic_mapping:
|
156
|
+
continue
|
157
|
+
domain_techniques.append(
|
158
|
+
dict(techniqueID=technique_id, tactic=flow.tactic_mapping[technique_id])
|
159
|
+
)
|
160
|
+
|
161
|
+
retval = []
|
162
|
+
|
163
|
+
for domain, domain_techniques in domains.items():
|
164
|
+
retval.append(
|
165
|
+
{
|
166
|
+
"version": "4.5",
|
167
|
+
"name": report.name,
|
168
|
+
"domain": domain,
|
169
|
+
"description": summary,
|
170
|
+
"techniques": domain_techniques,
|
171
|
+
"gradient": {
|
172
|
+
"colors": ["#ffffff", "#ff6666"],
|
173
|
+
"minValue": 0,
|
174
|
+
"maxValue": 100,
|
175
|
+
},
|
176
|
+
"legendItems": [],
|
177
|
+
"metadata": [],
|
178
|
+
"layout": {"layout": "side"},
|
179
|
+
}
|
180
|
+
)
|
181
|
+
return retval
|
182
|
+
|
183
|
+
|
184
|
+
def extract_attack_flow_and_navigator(
|
185
|
+
bundler: txt2stixBundler,
|
186
|
+
preprocessed_text,
|
187
|
+
ai_create_attack_flow,
|
188
|
+
ai_create_attack_navigator_layer,
|
189
|
+
ai_settings_relationships,
|
190
|
+
):
|
191
|
+
ex: BaseAIExtractor = ai_settings_relationships
|
192
|
+
tactics = get_all_tactics()
|
193
|
+
techniques = get_techniques_from_extracted_objects(bundler.bundle.objects, tactics)
|
194
|
+
logged_techniques = [
|
195
|
+
{k: v for k, v in t.items() if k != "stix_obj"}
|
196
|
+
for t in techniques.values()
|
197
|
+
]
|
198
|
+
logging.debug(f"parsed techniques: {json.dumps(logged_techniques, indent=4)}")
|
199
|
+
|
200
|
+
flow = ex.extract_attack_flow(preprocessed_text, techniques)
|
201
|
+
navigator = None
|
202
|
+
if ai_create_attack_flow:
|
203
|
+
logging.info("creating attack-flow bundle")
|
204
|
+
bundler.flow_objects = parse_flow(bundler.report, flow, techniques, tactics)
|
205
|
+
|
206
|
+
if ai_create_attack_navigator_layer:
|
207
|
+
navigator = create_navigator_layer(bundler.report, bundler.summary, flow, techniques)
|
208
|
+
return flow, navigator
|
txt2stix/bundler.py
CHANGED
@@ -194,6 +194,7 @@ class txt2stixBundler:
|
|
194
194
|
self.all_extractors = extractors
|
195
195
|
self.identity = identity or self.default_identity
|
196
196
|
self.tlp_level = TLP_LEVEL.get(tlp_level)
|
197
|
+
self.summary = ""
|
197
198
|
if report_id:
|
198
199
|
self.uuid = report_id
|
199
200
|
else:
|
@@ -415,6 +416,7 @@ class txt2stixBundler:
|
|
415
416
|
)
|
416
417
|
|
417
418
|
def add_summary(self, summary, ai_summary_provider):
|
419
|
+
self.summary = summary
|
418
420
|
summary_note_obj = Note(
|
419
421
|
type="note",
|
420
422
|
spec_version="2.1",
|
txt2stix/extractions.py
CHANGED
@@ -31,8 +31,9 @@ class Extractor(NamedDict):
|
|
31
31
|
self.extraction_key = key
|
32
32
|
self.slug = key
|
33
33
|
test_cases = test_cases or dict()
|
34
|
-
|
35
|
-
self.
|
34
|
+
|
35
|
+
self.prompt_negative_examples = remove_empty(test_cases.get('test_negative_examples') or [])
|
36
|
+
self.prompt_positive_examples = remove_empty(test_cases.get('test_positive_examples') or [])
|
36
37
|
if self.file and not Path(self.file).is_absolute() and include_path:
|
37
38
|
self.file = Path(include_path) / self.file
|
38
39
|
|
@@ -44,6 +45,9 @@ class Extractor(NamedDict):
|
|
44
45
|
for line in file.read_text().splitlines():
|
45
46
|
self.lookups.add(line.strip())
|
46
47
|
|
48
|
+
def remove_empty(iterable: list):
|
49
|
+
return [it for it in iterable if it]
|
50
|
+
|
47
51
|
def parse_extraction_config(include_path: Path):
|
48
52
|
config = {}
|
49
53
|
test_cases = load_test_cases_config(include_path)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
## IMPORTANT: if using CTI Butler database locally in arangodb (i.e is not app.ctibutler.com in .env) you need to follow these steps to import the data needed to populate these lookups: https://github.com/muchdogesec/stix2arango/blob/main/utilities/arango_cti_processor/README.md (use `--database ctibutler_database` in the s2a script or change it in this script)
|
2
|
+
|
1
3
|
import os
|
2
4
|
from arango import ArangoClient
|
3
5
|
|
txt2stix/retriever.py
CHANGED
@@ -22,6 +22,15 @@ class STIXObjectRetriever:
|
|
22
22
|
endpoint = urljoin(self.api_root, f"v1/attack-{matrix}/objects/{attack_id}/")
|
23
23
|
return self._retrieve_objects(endpoint)
|
24
24
|
|
25
|
+
def get_attack_tactics(self, matrix):
|
26
|
+
endpoint = urljoin(self.api_root, f"v1/attack-{matrix}/objects/?attack_type=Tactic")
|
27
|
+
tactics = self._retrieve_objects(endpoint)
|
28
|
+
retval = {}
|
29
|
+
for tac in tactics:
|
30
|
+
retval[tac['x_mitre_shortname']] = tac
|
31
|
+
retval[tac['external_references'][0]['external_id']] = tac
|
32
|
+
return retval
|
33
|
+
|
25
34
|
def get_attack_objects(self, matrix, attack_ids):
|
26
35
|
endpoint = urljoin(self.api_root, f"v1/attack-{matrix}/objects/?attack_id={','.join(attack_ids)}")
|
27
36
|
return self._retrieve_objects(endpoint)
|
txt2stix/txt2stix.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
import argparse, dotenv
|
2
|
+
import contextlib
|
3
|
+
import shutil
|
2
4
|
from datetime import datetime
|
3
5
|
import glob
|
4
6
|
import uuid
|
@@ -11,7 +13,7 @@ import sys, os
|
|
11
13
|
from pydantic import BaseModel
|
12
14
|
|
13
15
|
from txt2stix.ai_extractor.utils import DescribesIncident
|
14
|
-
from txt2stix
|
16
|
+
from txt2stix import attack_flow
|
15
17
|
|
16
18
|
|
17
19
|
from .utils import RELATIONSHIP_TYPES, Txt2StixData, remove_links
|
@@ -38,7 +40,7 @@ def newLogger(name: str) -> logging.Logger:
|
|
38
40
|
level=logging.DEBUG, # Set the desired logging level
|
39
41
|
format=f"%(asctime)s [{name}] [%(levelname)s] %(message)s",
|
40
42
|
handlers=[stream_handler],
|
41
|
-
datefmt='%d-%b-%y %H:%M:%S'
|
43
|
+
datefmt='%d-%b-%y %H:%M:%S',
|
42
44
|
)
|
43
45
|
|
44
46
|
return logging.root
|
@@ -127,31 +129,130 @@ def parse_bool(value: str):
|
|
127
129
|
value = value.lower()
|
128
130
|
return value in ["yes", "y", "true", "1"]
|
129
131
|
|
132
|
+
|
130
133
|
def parse_args():
|
131
|
-
EXTRACTORS_PATH = INCLUDES_PATH/"extractions"
|
134
|
+
EXTRACTORS_PATH = INCLUDES_PATH / "extractions"
|
132
135
|
all_extractors = extractions.parse_extraction_config(INCLUDES_PATH)
|
133
|
-
|
136
|
+
|
134
137
|
parser = argparse.ArgumentParser(description="File Conversion Tool")
|
135
138
|
|
136
|
-
inf_arg
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
139
|
+
inf_arg = parser.add_argument(
|
140
|
+
"--input_file",
|
141
|
+
"--input-file",
|
142
|
+
required=True,
|
143
|
+
help="The file to be converted. Must be .txt",
|
144
|
+
type=Path,
|
145
|
+
)
|
146
|
+
parser.add_argument(
|
147
|
+
"--ai_content_check_provider",
|
148
|
+
required=False,
|
149
|
+
type=parse_model,
|
150
|
+
help="Use an AI model to check wether the content of the file contains threat intelligence. Paticularly useful to weed out vendor marketing.",
|
151
|
+
)
|
152
|
+
parser.add_argument(
|
153
|
+
"--ai_extract_if_no_incidence",
|
154
|
+
default=True,
|
155
|
+
type=parse_bool,
|
156
|
+
help="if content check decides the report is not related to cyber security intelligence (e.g. vendor marketing), then you can use this setting to decide wether or not script should proceed. Setting to `false` will stop processing. It is designed to save AI tokens processing unknown content at scale in an automated way.",
|
157
|
+
)
|
158
|
+
name_arg = parser.add_argument(
|
159
|
+
"--name",
|
160
|
+
required=True,
|
161
|
+
help="Name of the file, max 124 chars",
|
162
|
+
default="stix-out",
|
163
|
+
)
|
164
|
+
parser.add_argument(
|
165
|
+
"--created",
|
166
|
+
required=False,
|
167
|
+
default=datetime.now(),
|
168
|
+
help="Allow user to optionally pass --created time in input, which will hardcode the time used in created times",
|
169
|
+
)
|
170
|
+
parser.add_argument(
|
171
|
+
"--ai_settings_extractions",
|
172
|
+
required=False,
|
173
|
+
type=parse_model,
|
174
|
+
help="(required if AI extraction enabled): passed in format provider:model e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers.",
|
175
|
+
metavar="provider[:model]",
|
176
|
+
nargs="+",
|
177
|
+
)
|
178
|
+
parser.add_argument(
|
179
|
+
"--ai_settings_relationships",
|
180
|
+
required=False,
|
181
|
+
type=parse_model,
|
182
|
+
help="(required if AI relationship enabled): passed in format `provider:model`. Can only pass one model at this time.",
|
183
|
+
metavar="provider[:model]",
|
184
|
+
)
|
143
185
|
parser.add_argument("--labels", type=parse_labels)
|
144
|
-
rmode_arg = parser.add_argument(
|
145
|
-
|
146
|
-
|
147
|
-
parser.add_argument(
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
186
|
+
rmode_arg = parser.add_argument(
|
187
|
+
"--relationship_mode", choices=["ai", "standard"], required=True
|
188
|
+
)
|
189
|
+
parser.add_argument(
|
190
|
+
"--report_id",
|
191
|
+
type=uuid.UUID,
|
192
|
+
required=False,
|
193
|
+
help="id to use instead of automatically generated `{name}+{created}`",
|
194
|
+
metavar="VALID_UUID",
|
195
|
+
)
|
196
|
+
parser.add_argument(
|
197
|
+
"--confidence",
|
198
|
+
type=range_type(0, 100),
|
199
|
+
default=None,
|
200
|
+
help="value between 0-100. Default if not passed is null.",
|
201
|
+
metavar="[0-100]",
|
202
|
+
)
|
203
|
+
parser.add_argument(
|
204
|
+
"--tlp_level",
|
205
|
+
"--tlp-level",
|
206
|
+
choices=TLP_LEVEL.levels().keys(),
|
207
|
+
default="clear",
|
208
|
+
help="TLP level, default is clear",
|
209
|
+
)
|
210
|
+
extractions_arg = parser.add_argument(
|
211
|
+
"--use_extractions",
|
212
|
+
"--use-extractions",
|
213
|
+
default={},
|
214
|
+
type=functools.partial(parse_extractors_globbed, "extractor", all_extractors),
|
215
|
+
help="Specify extraction types from the default/local extractions .yaml file",
|
216
|
+
metavar="EXTRACTION1,EXTRACTION2",
|
217
|
+
)
|
218
|
+
parser.add_argument(
|
219
|
+
"--use_identity",
|
220
|
+
"--use-identity",
|
221
|
+
help='Specify an identity file id (e.g., {"type":"identity","name":"demo","identity_class":"system"})',
|
222
|
+
metavar="[stix2 identity json]",
|
223
|
+
type=parse_stix,
|
224
|
+
)
|
225
|
+
parser.add_argument(
|
226
|
+
"--external_refs",
|
227
|
+
type=parse_ref,
|
228
|
+
help="pass additional `external_references` entry (or entries) to the report object created. e.g --external_ref author=dogesec link=https://dkjjadhdaj.net",
|
229
|
+
default=[],
|
230
|
+
metavar="{source_name}={external_id}",
|
231
|
+
action="extend",
|
232
|
+
nargs="+",
|
233
|
+
)
|
234
|
+
parser.add_argument("--ignore_image_refs", default=True, type=parse_bool)
|
235
|
+
parser.add_argument("--ignore_link_refs", default=True, type=parse_bool)
|
236
|
+
parser.add_argument(
|
237
|
+
"--ignore_extraction_boundary",
|
238
|
+
default=False,
|
239
|
+
type=parse_bool,
|
240
|
+
help="default if not passed is `false`, but if set to `true` will ignore boundary capture logic for extractions",
|
241
|
+
)
|
242
|
+
aflow_arg = parser.add_argument(
|
243
|
+
"--ai_create_attack_flow",
|
244
|
+
default=False,
|
245
|
+
action="store_true",
|
246
|
+
help="create attack flow for attack objects in report/bundle",
|
247
|
+
)
|
248
|
+
|
249
|
+
anav_arg = parser.add_argument(
|
250
|
+
"--ai_create_attack_navigator_layer",
|
251
|
+
default=False,
|
252
|
+
action="store_true",
|
253
|
+
help="create attack flow for attack objects in report/bundle",
|
254
|
+
)
|
255
|
+
|
155
256
|
|
156
257
|
args = parser.parse_args()
|
157
258
|
if not args.input_file.exists():
|
@@ -159,18 +260,31 @@ def parse_args():
|
|
159
260
|
if len(args.name) > 124:
|
160
261
|
raise argparse.ArgumentError(name_arg, "max 124 characters")
|
161
262
|
|
162
|
-
if args.relationship_mode ==
|
163
|
-
raise argparse.ArgumentError(
|
263
|
+
if args.relationship_mode == "ai" and not args.ai_settings_relationships:
|
264
|
+
raise argparse.ArgumentError(
|
265
|
+
rmode_arg,
|
266
|
+
"relationship_mode is set to AI, --ai_settings_relationships is required",
|
267
|
+
)
|
164
268
|
|
165
269
|
if args.ai_create_attack_flow and not args.ai_settings_relationships:
|
166
|
-
raise argparse.ArgumentError(
|
167
|
-
|
168
|
-
|
169
|
-
|
270
|
+
raise argparse.ArgumentError(
|
271
|
+
aflow_arg, "--ai_settings_relationships must be set"
|
272
|
+
)
|
273
|
+
if args.ai_create_attack_navigator_layer and not args.ai_settings_relationships:
|
274
|
+
raise argparse.ArgumentError(
|
275
|
+
anav_arg, "--ai_settings_relationships must be set"
|
276
|
+
)
|
277
|
+
#### process --use-extractions
|
278
|
+
if args.use_extractions.get("ai") and not args.ai_settings_extractions:
|
279
|
+
raise argparse.ArgumentError(
|
280
|
+
extractions_arg,
|
281
|
+
"ai based extractors are passed, --ai_settings_extractions is required",
|
282
|
+
)
|
170
283
|
|
171
|
-
args.all_extractors
|
284
|
+
args.all_extractors = all_extractors
|
172
285
|
return args
|
173
286
|
|
287
|
+
|
174
288
|
REQUIRED_ENV_VARIABLES = [
|
175
289
|
"INPUT_TOKEN_LIMIT",
|
176
290
|
"CTIBUTLER_BASE_URL",
|
@@ -243,21 +357,22 @@ def validate_token_count(max_tokens, input, extractors: list[BaseAIExtractor]):
|
|
243
357
|
token_count = _count_token(extractor, input)
|
244
358
|
if token_count > max_tokens:
|
245
359
|
raise FatalException(f"{extractor.extractor_name}: input_file token count ({token_count}) exceeds INPUT_TOKEN_LIMIT ({max_tokens})")
|
246
|
-
|
360
|
+
|
247
361
|
|
248
362
|
@functools.lru_cache
|
249
363
|
def _count_token(extractor: BaseAIExtractor, input: str):
|
250
364
|
return extractor.count_tokens(input)
|
251
|
-
|
365
|
+
|
252
366
|
def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_map: dict,
|
253
367
|
ai_content_check_provider=None,
|
254
368
|
ai_create_attack_flow=None,
|
369
|
+
ai_create_attack_navigator_layer=None,
|
255
370
|
input_token_limit=10,
|
256
371
|
ai_settings_extractions=None,
|
257
372
|
ai_settings_relationships=None,
|
258
373
|
relationship_mode="standard",
|
259
374
|
ignore_extraction_boundary=False,
|
260
|
-
|
375
|
+
ai_extract_if_no_incidence=True, # continue even if ai_content_check fails
|
261
376
|
|
262
377
|
**kwargs
|
263
378
|
) -> Txt2StixData:
|
@@ -276,7 +391,7 @@ def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_ma
|
|
276
391
|
bundler.report.labels.append(f'txt2stix:{classification}'.lower())
|
277
392
|
bundler.add_summary(retval.content_check.summary, model.extractor_name)
|
278
393
|
|
279
|
-
if should_extract or
|
394
|
+
if should_extract or ai_extract_if_no_incidence:
|
280
395
|
if extractors_map.get("ai"):
|
281
396
|
validate_token_count(input_token_limit, preprocessed_text, ai_settings_extractions)
|
282
397
|
if relationship_mode == "ai":
|
@@ -285,15 +400,13 @@ def run_txt2stix(bundler: txt2stixBundler, preprocessed_text: str, extractors_ma
|
|
285
400
|
retval.extractions = extract_all(bundler, extractors_map, preprocessed_text, ai_extractors=ai_settings_extractions, ignore_extraction_boundary=ignore_extraction_boundary)
|
286
401
|
if relationship_mode == "ai" and sum(map(lambda x: len(x), retval.extractions.values())):
|
287
402
|
retval.relationships = extract_relationships_with_ai(bundler, preprocessed_text, retval.extractions, ai_settings_relationships)
|
288
|
-
|
289
|
-
if ai_create_attack_flow:
|
290
|
-
|
291
|
-
ex: BaseAIExtractor = ai_settings_relationships
|
292
|
-
retval.attack_flow = ex.extract_attack_flow(preprocessed_text, retval.extractions, retval.relationships)
|
293
|
-
bundler.flow_objects = parse_flow(bundler.report, retval.attack_flow)
|
294
|
-
|
403
|
+
|
404
|
+
if ai_create_attack_flow or ai_create_attack_navigator_layer:
|
405
|
+
retval.attack_flow, retval.navigator_layer = attack_flow.extract_attack_flow_and_navigator(bundler, preprocessed_text, ai_create_attack_flow, ai_create_attack_navigator_layer, ai_settings_relationships)
|
295
406
|
return retval
|
296
407
|
|
408
|
+
|
409
|
+
|
297
410
|
def main():
|
298
411
|
dotenv.load_dotenv()
|
299
412
|
logger = newLogger("txt2stix")
|
@@ -320,13 +433,20 @@ def main():
|
|
320
433
|
|
321
434
|
## write outputs
|
322
435
|
out = bundler.to_json()
|
323
|
-
|
324
|
-
|
436
|
+
output_dir = Path("./output")/str(job_id)
|
437
|
+
with contextlib.suppress(BaseException):
|
438
|
+
shutil.rmtree(output_dir)
|
439
|
+
output_dir.mkdir(exist_ok=True, parents=True)
|
440
|
+
output_path = output_dir/f"{bundler.bundle.id}.json"
|
325
441
|
output_path.write_text(out)
|
326
442
|
logger.info(f"Wrote bundle output to `{output_path}`")
|
327
|
-
data_path =
|
443
|
+
data_path = output_dir/"data.json"
|
328
444
|
data_path.write_text(data.model_dump_json(indent=4))
|
329
445
|
logger.info(f"Wrote data output to `{data_path}`")
|
446
|
+
for nav_layer in data.navigator_layer or []:
|
447
|
+
nav_path = output_dir/f"navigator-{nav_layer['domain']}.json"
|
448
|
+
nav_path.write_text(json.dumps(nav_layer, indent=4))
|
449
|
+
logger.info(f"Wrote navigator output to `{nav_path}`")
|
330
450
|
except argparse.ArgumentError as e:
|
331
451
|
logger.exception(e, exc_info=True)
|
332
452
|
except:
|
txt2stix/utils.py
CHANGED
@@ -51,6 +51,7 @@ class Txt2StixData(BaseModel):
|
|
51
51
|
extractions: dict = Field(default=None)
|
52
52
|
relationships: list[dict] = Field(default_factory=list)
|
53
53
|
attack_flow: AttackFlowList = Field(default=None)
|
54
|
+
navigator_layer: list = Field(default=None)
|
54
55
|
|
55
56
|
|
56
57
|
def remove_links(input_text: str, remove_images: bool, remove_anchors: bool):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: txt2stix
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.2
|
4
4
|
Summary: txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle.
|
5
5
|
Project-URL: Homepage, https://github.com/muchdogesec/txt2stix
|
6
6
|
Project-URL: Issues, https://github.com/muchdogesec/txt2stix/issues
|
@@ -26,15 +26,19 @@ Requires-Dist: stix2extensions
|
|
26
26
|
Requires-Dist: tld>=0.13
|
27
27
|
Requires-Dist: tldextract>=5.1.2
|
28
28
|
Requires-Dist: validators>=0.28.3
|
29
|
-
Provides-Extra:
|
30
|
-
Requires-Dist: llama-index-llms-anthropic>=0.7.2; extra == '
|
31
|
-
|
32
|
-
Requires-Dist: llama-index-llms-
|
33
|
-
|
29
|
+
Provides-Extra: anthropic
|
30
|
+
Requires-Dist: llama-index-llms-anthropic>=0.7.2; extra == 'anthropic'
|
31
|
+
Provides-Extra: deepseek
|
32
|
+
Requires-Dist: llama-index-llms-deepseek>=0.1.2; extra == 'deepseek'
|
33
|
+
Provides-Extra: gemini
|
34
|
+
Requires-Dist: llama-index-llms-gemini>=0.5.0; extra == 'gemini'
|
35
|
+
Provides-Extra: openrouter
|
36
|
+
Requires-Dist: llama-index-llms-openrouter>=0.3.2; extra == 'openrouter'
|
34
37
|
Provides-Extra: tests
|
35
38
|
Requires-Dist: pytest; extra == 'tests'
|
36
39
|
Requires-Dist: pytest-cov; extra == 'tests'
|
37
40
|
Requires-Dist: pytest-subtests; extra == 'tests'
|
41
|
+
Requires-Dist: python-dateutil; extra == 'tests'
|
38
42
|
Requires-Dist: requests; extra == 'tests'
|
39
43
|
Description-Content-Type: text/markdown
|
40
44
|
|
@@ -86,7 +90,13 @@ cd txt2stix
|
|
86
90
|
python3 -m venv txt2stix-venv
|
87
91
|
source txt2stix-venv/bin/activate
|
88
92
|
# install requirements
|
89
|
-
pip3 install
|
93
|
+
pip3 install txt2stix
|
94
|
+
```
|
95
|
+
|
96
|
+
Note, by default txt2stix will install OpenAI to use as the AI provider. You can also use Anthropic, Gemini, OpenRouter or Deepseek. You need to install these manually if you plan to use them as follows (remove those that don't apply)
|
97
|
+
|
98
|
+
```shell
|
99
|
+
pip3 install txt2stix[deepseek,gemini,anthropic,openrouter]
|
90
100
|
```
|
91
101
|
|
92
102
|
### Set variables
|
@@ -114,39 +124,39 @@ The following arguments are available:
|
|
114
124
|
|
115
125
|
#### Input settings
|
116
126
|
|
117
|
-
* `--input_file` (
|
127
|
+
* `--input_file` (`path/to/file.txt`, required): the file to be converted. Must be `.txt`
|
118
128
|
|
119
129
|
#### STIX Report generation settings
|
120
130
|
|
121
131
|
|
122
|
-
* `--name` (
|
123
|
-
* `--report_id` (
|
124
|
-
* `--tlp_level` (
|
125
|
-
* `--confidence` (
|
132
|
+
* `--name` (text, required): name of file, max 72 chars. Will be used in the STIX Report Object created.
|
133
|
+
* `--report_id` (UUIDv4, default is random UUIDv4): Sometimes it is required to control the id of the `report` object generated. You can therefore pass a valid UUIDv4 in this field to be assigned to the report. e.g. passing `2611965-930e-43db-8b95-30a1e119d7e2` would create a STIX object id `report--2611965-930e-43db-8b95-30a1e119d7e2`. If this argument is not passed, the UUID will be randomly generated.
|
134
|
+
* `--tlp_level` (dictionary, default, `clear`): Options are `clear`, `green`, `amber`, `amber_strict`, `red`.
|
135
|
+
* `--confidence` (value between 0-100): If not passed, report will be assigned no confidence score value
|
126
136
|
* `--labels` (OPTIONAL): comma separated list of labels. Case-insensitive (will all be converted to lower-case). Allowed `a-z`, `0-9`. e.g. `label1,label2` would create 2 labels.
|
127
|
-
* `--created` (
|
128
|
-
* `--use_identity` (
|
137
|
+
* `--created` (datetime, optional): by default all object `created` times will take the time the script was run. If you want to explicitly set these times you can do so using this flag. Pass the value in the format `YYYY-MM-DDTHH:MM:SS.sssZ` e.g. `2020-01-01T00:00:00.000Z`
|
138
|
+
* `--use_identity` (stix identity, optional, default txt2stix identity): can pass a full STIX 2.1 identity object (make sure to properly escape). Will be validated by the STIX2 library.
|
129
139
|
* `--external_refs` (OPTIONAL): txt2stix will automatically populate the `external_references` of the report object it creates for the input. You can use this value to add additional objects to `external_references`. Note, you can only add `source_name` and `external_id` values currently. Pass as `source_name=external_id`. e.g. `--external_refs txt2stix=demo1 source=id` would create the following objects under the `external_references` property: `{"source_name":"txt2stix","external_id":"demo1"},{"source_name":"source","external_id":"id"}`
|
130
140
|
|
131
141
|
#### Output settings
|
132
142
|
|
133
143
|
How the extractions are performed
|
134
144
|
|
135
|
-
* `--use_extractions` (
|
145
|
+
* `--use_extractions` (dictionary, required): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `'pattern_*'` would run all extractions starting with `pattern_` -- make sure to use quotes when using a wildcard)
|
136
146
|
* Important: if using any AI extractions (`ai_*`), you must set an AI API key in your `.env` file
|
137
147
|
* Important: if you are using any MITRE ATT&CK, CAPEC, CWE, ATLAS or Location extractions you must set `CTIBUTLER` or NVD CPE or CVE extractions you must set `VULMATCH` settings in your `.env` file
|
138
|
-
* `--relationship_mode` (
|
148
|
+
* `--relationship_mode` (dictionary, required): either:
|
139
149
|
* `ai`: AI provider must be enabled. extractions performed by either regex or AI for extractions user selected. Rich relationships created from AI provider from extractions.
|
140
150
|
* `standard`: extractions performed by either regex or AI (AI provider must be enabled) for extractions user selected. Basic relationships created from extractions back to master Report object generated.
|
141
|
-
* `--ignore_extraction_boundary` (
|
142
|
-
* `--ignore_image_refs` (default `true`): images references in documents don't usually need extracting. e.g. `<img src="https://example.com/image.png" alt="something">` you would not want domain or file extractions extracting `example.com` and `image.png`. Hence these are ignored by default (they are removed from text sent to extraction). Note, only the `img src` is ignored, all other values e.g. `alt` are considered. If you want extractions to consider this data, set it to `false`
|
143
|
-
* `--ignore_link_refs` (default `true`): link references in documents don't usually need extracting e.g. `<a href="https://example.com/link.html" title="something">Bad Actor</a>` you would only want `Bad actor` to be considered for extraction. Hence these part of the link are ignored by default (they are removed from text sent to extraction). Note, only the `a href` is ignored, all other values e.g. `title` are considered. Setting this to `false` will also include everything inside the link tag (e.g. `example.com` would extract as a domain)
|
151
|
+
* `--ignore_extraction_boundary` (boolean, default `false`, not compatible with AI extractions): in some cases the same string will create multiple extractions depending on extractions set (e.g. `https://www.google.com/file.txt` could create a url, url with file, domain, subdomain, and file). The default behaviour is for txt2stix to take the longest extraction and ignore everything else (e.g. only extract url with file, and ignore url, file, domain, subdomain, and file). If you want to override this behaviour and get all extractions in the output, set this flag to `true`.
|
152
|
+
* `--ignore_image_refs` (boolean, default `true`): images references in documents don't usually need extracting. e.g. `<img src="https://example.com/image.png" alt="something">` you would not want domain or file extractions extracting `example.com` and `image.png`. Hence these are ignored by default (they are removed from text sent to extraction). Note, only the `img src` is ignored, all other values e.g. `alt` are considered. If you want extractions to consider this data, set it to `false`
|
153
|
+
* `--ignore_link_refs` (boolean, default `true`): link references in documents don't usually need extracting e.g. `<a href="https://example.com/link.html" title="something">Bad Actor</a>` you would only want `Bad actor` to be considered for extraction. Hence these part of the link are ignored by default (they are removed from text sent to extraction). Note, only the `a href` is ignored, all other values e.g. `title` are considered. Setting this to `false` will also include everything inside the link tag (e.g. `example.com` would extract as a domain)
|
144
154
|
|
145
155
|
#### AI settings
|
146
156
|
|
147
157
|
If any AI extractions, or AI relationship mode is set, you must set the following accordingly
|
148
158
|
|
149
|
-
* `--ai_settings_extractions
|
159
|
+
* `--ai_settings_extractions` (`model:provider`, required if one or more AI extractions set):
|
150
160
|
* defines the `provider:model` to be used for extractions. You can supply more than one provider. Separate with a space (e.g. `openrouter:openai/gpt-4o` `openrouter:deepseek/deepseek-chat`). If more than one provider is passed, txt2stix will take extractions from all models, de-duplicate them, and then package them in the output. Currently supports:
|
151
161
|
* Provider (env var required `OPENROUTER_API_KEY`): `openrouter:`, providers/models `openai/gpt-4o`, `deepseek/deepseek-chat` ([More here](https://openrouter.ai/models))
|
152
162
|
* Provider (env var required `OPENAI_API_KEY`): `openai:`, models e.g.: `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`, `gpt-4` ([More here](https://platform.openai.com/docs/models))
|
@@ -154,11 +164,16 @@ If any AI extractions, or AI relationship mode is set, you must set the followin
|
|
154
164
|
* Provider (env var required `GOOGLE_API_KEY`): `gemini:models/`, models: `gemini-1.5-pro-latest`, `gemini-1.5-flash-latest` ([More here](https://ai.google.dev/gemini-api/docs/models/gemini))
|
155
165
|
* Provider (env var required `DEEPSEEK_API_KEY`): `deepseek:`, models `deepseek-chat` ([More here](https://api-docs.deepseek.com/quick_start/pricing))
|
156
166
|
* See `tests/manual-tests/cases-ai-extraction-type.md` for some examples
|
157
|
-
* `--ai_settings_relationships
|
167
|
+
* `--ai_settings_relationships` (`model:provider`, required if AI relationship mode set):
|
158
168
|
* similar to `ai_settings_extractions` but defines the model used to generate relationships. Only one model can be provided. Passed in same format as `ai_settings_extractions`
|
159
169
|
* See `tests/manual-tests/cases-ai-relationships.md` for some examples
|
160
|
-
|
161
|
-
|
170
|
+
|
171
|
+
#### Other AI related settings
|
172
|
+
|
173
|
+
* `--ai_content_check_provider` (`model:provider`, required if passed): Passing this flag will get the AI to try and classify the text in the input to 1) determine if it is talking about threat intelligence, and 2) what type of threat intelligence it is talking about. For context, we use this to filter out non-threat intel posts in Obstracts and Stixify. You pass `provider:model` with this flag to determine the AI model you wish to use to perform the check. It will also create a summary of the content passed (and store this into a STIX Note).
|
174
|
+
* `--ai_extract_if_no_incidence` (boolean, default `true`, will only work if `ai_content_check_provider` set): if the content check decides the report is not related to cyber security intelligence (e.g. vendor marketing), then you can use this setting to decide whether or not the script should proceed. Setting to `false` will stop processing. It is designed to save AI tokens processing unknown content at scale in an automated way.
|
175
|
+
* `--ai_create_attack_flow` (boolean): passing this flag will also prompt the AI model (the same entered for `--ai_settings_relationships`, default `false`) to generate an [Attack Flow](https://center-for-threat-informed-defense.github.io/attack-flow/) for the MITRE ATT&CK extractions to define the logical order in which they are being described. You must pass `--ai_settings_relationships` for this to work.
|
176
|
+
* `--ai_create_attack_navigator_layer` (boolean, default `false`): passing this flag will generate [MITRE ATT&CK Navigator layers](https://mitre-attack.github.io/attack-navigator/) for MITRE ATT&CK extractions. For each ATT&CK domain (Enterprise, ICS, Mobile) txt2stix will generate a layer. You must pass `--ai_settings_relationships` for this to work because the AI is tasked with linking extracted Techniques to the correct Tactic. Known issues with `openai:gpt-3.5` (avoid using this model if possible when using ATT&CK Navigator).
|
162
177
|
|
163
178
|
## Adding new extractions
|
164
179
|
|
@@ -1,23 +1,23 @@
|
|
1
1
|
txt2stix/__init__.py,sha256=Sm_VT913IFuAZ6dJEdVz3baPwC5VYtHySVfBAOUG92w,803
|
2
|
-
txt2stix/attack_flow.py,sha256=
|
3
|
-
txt2stix/bundler.py,sha256=
|
2
|
+
txt2stix/attack_flow.py,sha256=DLDaNXB_gxuqdEb_A1VQO_nu69MG23nolTx7-JESrKI,7889
|
3
|
+
txt2stix/bundler.py,sha256=kqUNW9_jktuMyWSkoAa-ydZY-L5gzSSkthb7OdhUiKo,16854
|
4
4
|
txt2stix/common.py,sha256=ISnGNKqJPE1EcfhL-x_4G18mcwt1urmorkW-ru9kV-0,585
|
5
|
-
txt2stix/extractions.py,sha256=
|
5
|
+
txt2stix/extractions.py,sha256=_tlsqYHhfAoV-PJzxRHysrX47uxCsMlSg7PQWxww1u0,2171
|
6
6
|
txt2stix/indicator.py,sha256=c6S0xx0K8JM-PT_Qd1PlN_ZlDXdnEwiRS8529iUp3yg,30774
|
7
7
|
txt2stix/lookups.py,sha256=h42YVtYUkWZm6ZPv2h5hHDHDzDs3yBqrT_T7pj2MDZI,2301
|
8
|
-
txt2stix/retriever.py,sha256=
|
8
|
+
txt2stix/retriever.py,sha256=auKlk6JlRE9en-oiQ5KICMW0IwmU8R558o0K5UmEQZc,5550
|
9
9
|
txt2stix/stix.py,sha256=9nXD9a2dCY4uaatl-mlIA1k3srwQBhGW-tUSho3iYe0,30
|
10
|
-
txt2stix/txt2stix.py,sha256=
|
11
|
-
txt2stix/utils.py,sha256=
|
12
|
-
txt2stix/ai_extractor/__init__.py,sha256=
|
10
|
+
txt2stix/txt2stix.py,sha256=HYXN9dKzakoqdqJ4wSthwGdFIxOm6KTegiQlVmfp0eQ,18169
|
11
|
+
txt2stix/utils.py,sha256=n6mh4t9ZRJ7iT4Jvp9ai_dfCXjgXNcRtF_zXO7nkpnk,3304
|
12
|
+
txt2stix/ai_extractor/__init__.py,sha256=5Tf6Co9THzytBdFEVhD-7vvT05TT3nSpltnAV1sfdoM,349
|
13
13
|
txt2stix/ai_extractor/anthropic.py,sha256=mdz-8CB-BSCEqnK5l35DRZURVPUf508ef2b48XMxmuk,441
|
14
|
-
txt2stix/ai_extractor/base.py,sha256=
|
14
|
+
txt2stix/ai_extractor/base.py,sha256=mHu6xtWu78aDHnb2ePXR0UCBbROS-jH0kPRgQxfIwhI,3685
|
15
15
|
txt2stix/ai_extractor/deepseek.py,sha256=2XehIYbWXG6Odq68nQX4CNtl5GdmBlAmjLP_lG2eEFo,660
|
16
16
|
txt2stix/ai_extractor/gemini.py,sha256=yJC7knYzl-TScyCBd-MTpUf-NT6znC25E7vXxNMqjLU,578
|
17
17
|
txt2stix/ai_extractor/openai.py,sha256=DtllzeVhZw1231hj35vn1U8V2MMzm8wM7mqKLBkxazQ,489
|
18
18
|
txt2stix/ai_extractor/openrouter.py,sha256=hAA6mTOMcpA28XYsOCvuJH7WMJqXCxfqZGJf_VrDsIk,628
|
19
|
-
txt2stix/ai_extractor/prompts.py,sha256=
|
20
|
-
txt2stix/ai_extractor/utils.py,sha256=
|
19
|
+
txt2stix/ai_extractor/prompts.py,sha256=NtqtVyPPtShPlVZ5SrFmo-LCkfpANIIi4H9rjqaxqDo,10559
|
20
|
+
txt2stix/ai_extractor/utils.py,sha256=xPVtp_lI7254MvkXPt9YY_Vter0uiPLKMGcv5poXVKs,4763
|
21
21
|
txt2stix/pattern/__init__.py,sha256=K9ofaP2AOikvzb48VSBpJZijckdqufZxSzr_kbRypLY,491
|
22
22
|
txt2stix/pattern/extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
txt2stix/pattern/extractors/base_extractor.py,sha256=ly80rp-L40g7DbhrGiCvhPWI95-ZFMtAQUEC-fH6Y-o,6130
|
@@ -81,7 +81,7 @@ txt2stix/includes/helpers/stix_relationship_types.txt,sha256=PQytANVSrWepdK_SLEZ
|
|
81
81
|
txt2stix/includes/helpers/tlds.txt,sha256=Va_awj-FQiKgs5ace6C0kC5xxAHIl9yAIBhvT08Q7Q0,9551
|
82
82
|
txt2stix/includes/helpers/windows_registry_key_prefix.txt,sha256=J5gU4FsqmOVYt6cVRgwCG7odYEWk-UPLpuCiDwpzBfg,145
|
83
83
|
txt2stix/includes/lookups/_README.md,sha256=OGkyqCcqAOPI-JLE81zAmyg4sHW5apJNhDFcvHUW1nc,338
|
84
|
-
txt2stix/includes/lookups/_generate_lookups.py,sha256=
|
84
|
+
txt2stix/includes/lookups/_generate_lookups.py,sha256=ex12zxiFnKYHgsXfcDX4OL-KyrjAXSlvzeYVUzUD2lE,9390
|
85
85
|
txt2stix/includes/lookups/attack_pattern.txt,sha256=4ARDLG-cwUqk6_TO_JAY6hNJg6KRbAaIr-Or5nML6io,15
|
86
86
|
txt2stix/includes/lookups/campaign.txt,sha256=N66XO0H3Rx-3Tvo7wwHDouckIT0tGlGVyCDKxDs1KnM,11
|
87
87
|
txt2stix/includes/lookups/country_iso3166_alpha2.txt,sha256=LMM7j50NoBv7BlK64mpmE3Dbef9_tNBUNbuTXOEIvCo,746
|
@@ -112,8 +112,8 @@ txt2stix/includes/lookups/threat_actor.txt,sha256=QfDO9maQuqKBgW_Sdd7VGv1SHZ9Ra-
|
|
112
112
|
txt2stix/includes/lookups/tld.txt,sha256=-MEgJea2NMG_KDsnc4BVvI8eRk5Dm93L-t8SGYx5wMo,8598
|
113
113
|
txt2stix/includes/lookups/tool.txt,sha256=HGKG6JpUE26w6ezzSxOjBkp15UpSaB7N-mZ_NU_3G7A,6
|
114
114
|
txt2stix/includes/tests/test_cases.yaml,sha256=QD1FdIunpPkOpsn6wJRqs2vil_hv8OSVaqUp4a96aZg,22247
|
115
|
-
txt2stix-1.0.
|
116
|
-
txt2stix-1.0.
|
117
|
-
txt2stix-1.0.
|
118
|
-
txt2stix-1.0.
|
119
|
-
txt2stix-1.0.
|
115
|
+
txt2stix-1.0.2.dist-info/METADATA,sha256=-qdpBMRkkfhkAvFlAK8ya9Dj8ZYnXH0rt-NJSH8bqnw,14887
|
116
|
+
txt2stix-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
117
|
+
txt2stix-1.0.2.dist-info/entry_points.txt,sha256=x6QPtt65hWeomw4IpJ_wQUesBl1M4WOLODbhOKyWMFg,55
|
118
|
+
txt2stix-1.0.2.dist-info/licenses/LICENSE,sha256=BK8Ppqlc4pdgnNzIxnxde0taoQ1BgicdyqmBvMiNYgY,11364
|
119
|
+
txt2stix-1.0.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|