txt2detection-1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2detection/__init__.py +2 -0
- txt2detection/__main__.py +343 -0
- txt2detection/ai_extractor/__init__.py +16 -0
- txt2detection/ai_extractor/anthropic.py +12 -0
- txt2detection/ai_extractor/base.py +72 -0
- txt2detection/ai_extractor/deepseek.py +20 -0
- txt2detection/ai_extractor/gemini.py +18 -0
- txt2detection/ai_extractor/openai.py +18 -0
- txt2detection/ai_extractor/openrouter.py +20 -0
- txt2detection/ai_extractor/prompts.py +121 -0
- txt2detection/ai_extractor/utils.py +21 -0
- txt2detection/attack_navigator.py +68 -0
- txt2detection/bundler.py +422 -0
- txt2detection/config/detection_languages.yaml +14 -0
- txt2detection/credential_checker.py +82 -0
- txt2detection/models.py +427 -0
- txt2detection/observables.py +161 -0
- txt2detection/utils.py +100 -0
- txt2detection-1.1.3.dist-info/METADATA +230 -0
- txt2detection-1.1.3.dist-info/RECORD +23 -0
- txt2detection-1.1.3.dist-info/WHEEL +4 -0
- txt2detection-1.1.3.dist-info/entry_points.txt +2 -0
- txt2detection-1.1.3.dist-info/licenses/LICENSE +202 -0

txt2detection/__main__.py
@@ -0,0 +1,343 @@
import argparse
from datetime import UTC, datetime

from dataclasses import dataclass
import io
import json
import os
from pathlib import Path
import logging
import re
import sys
import uuid
from pydantic import ValidationError
from stix2 import Identity
import yaml

from txt2detection import credential_checker
from txt2detection.ai_extractor.base import BaseAIExtractor
from txt2detection.models import (
    TAG_PATTERN,
    DetectionContainer,
    Level,
    SigmaRuleDetection,
)
from txt2detection.utils import validate_token_count


def configureLogging():
    # Configure logging
    stream_handler = logging.StreamHandler()  # Log to stdout and stderr
    stream_handler.setLevel(logging.INFO)
    logging.basicConfig(
        level=logging.DEBUG,  # Set the desired logging level
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[stream_handler],
        datefmt="%d-%b-%y %H:%M:%S",
    )

    return logging.root


configureLogging()


def setLogFile(logger, file: Path):
    file.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving log to `{file.absolute()}`")
    handler = logging.FileHandler(file, "w")
    handler.formatter = logging.Formatter(
        fmt="%(levelname)s %(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S"
    )
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    logger.info("=====================txt2detection======================")


from .bundler import Bundler
import shutil


from .utils import STATUSES, as_date, make_identity, valid_licenses, parse_model


def parse_identity(str):
    return Identity(**json.loads(str))


@dataclass
class Args:
    input_file: str
    input_text: str
    name: str
    tlp_level: str
    labels: list[str]
    created: datetime
    use_identity: Identity
    ai_provider: BaseAIExtractor
    report_id: uuid.UUID
    external_refs: dict[str, str]
    reference_urls: list[str]


def parse_created(value):
    """Convert the created timestamp to a datetime object."""
    try:
        return datetime.strptime(value, "%Y-%m-%dT%H:%M:%S").replace(tzinfo=UTC)
    except ValueError:
        raise argparse.ArgumentTypeError(
            "Invalid date format. Use YYYY-MM-DDTHH:MM:SS."
        )


def parse_ref(value):
    m = re.compile(r"(.+?)=(.+)").match(value)
    if not m:
        raise argparse.ArgumentTypeError("must be in format key=value")
    return dict(source_name=m.group(1), external_id=m.group(2))


def parse_label(label: str):
    if not TAG_PATTERN.match(label):
        raise argparse.ArgumentTypeError(
            "Invalid label format. Must follow sigma tag format {namespace}.{label}"
        )
    namespace, _, _ = label.partition(".")
    if namespace in ["tlp"]:
        raise argparse.ArgumentTypeError(f"Unsupported tag namespace `{namespace}`")
    return label


def parse_args():
    parser = argparse.ArgumentParser(
        description="Convert text file to detection format."
    )
    mode = parser.add_subparsers(
        title="process-mode", dest="mode", description="mode to use"
    )
    file = mode.add_parser("file", help="process a file input using ai")
    text = mode.add_parser("text", help="process a text argument using ai")
    sigma = mode.add_parser("sigma", help="process a sigma file without ai")
    check_credentials = mode.add_parser(
        "check-credentials",
        help="show status of external services with respect to credentials",
    )

    for mode_parser in [file, text, sigma]:
        mode_parser.add_argument(
            "--report_id", type=uuid.UUID, help="report_id to use for generated report"
        )
        mode_parser.add_argument(
            "--name",
            required=True,
            help="Name of file, max 72 chars. Will be used in the STIX Report Object created.",
        )
        mode_parser.add_argument(
            "--tlp_level",
            choices=["clear", "green", "amber", "amber_strict", "red"],
            help="Options are clear, green, amber, amber_strict, red. Default is clear if not passed.",
        )
        mode_parser.add_argument(
            "--labels",
            type=parse_label,
            action="extend",
            nargs="+",
            help="Comma-separated list of labels. Case-insensitive (will be converted to lower-case). Allowed a-z, 0-9.",
        )
        mode_parser.add_argument(
            "--created",
            type=parse_created,
            help="Explicitly set created time in format YYYY-MM-DDTHH:MM:SS.sssZ. Default is current time.",
        )
        mode_parser.add_argument(
            "--use_identity",
            type=parse_identity,
            help="Pass a full STIX 2.1 identity object (properly escaped). Validated by the STIX2 library. Default is SIEM Rules identity.",
        )
        mode_parser.add_argument(
            "--ai_provider",
            required=False,
            type=parse_model,
            help="(required): defines the `provider:model` to be used. Select one option.",
            metavar="provider[:model]",
        )
        mode_parser.add_argument(
            "--external_refs",
            type=parse_ref,
            help="pass additional `external_references` entry (or entries) to the report object created. e.g --external_ref author=dogesec link=https://dkjjadhdaj.net",
            default=[],
            metavar="{source_name}={external_id}",
            action="extend",
            nargs="+",
        )
        mode_parser.add_argument(
            "--reference_urls",
            help="pass additional `external_references` url entry (or entries) to the report object created.",
            default=[],
            metavar="{url}",
            action="extend",
            nargs="+",
        )
        mode_parser.add_argument(
            "--license",
            help="Valid SPDX license for the rule",
            default=None,
            metavar="[LICENSE]",
            choices=valid_licenses(),
        )
        mode_parser.add_argument(
            "--create_attack_navigator_layer",
            help="Create navigator layer",
            action="store_true",
            default=False,
        )

    file.add_argument(
        "--input_file",
        help="The file to be converted. Must be .txt",
        type=lambda x: Path(x).read_text(),
    )
    text.add_argument("--input_text", help="The text to be converted")
    sigma.add_argument(
        "--sigma_file",
        help="The sigma file to be converted. Must be .yml",
        type=lambda x: Path(x).read_text(),
    )
    sigma.add_argument(
        "--status",
        help="If passed, will overwrite any existing `status` recorded in the rule",
        choices=STATUSES,
    )
    sigma.add_argument(
        "--level",
        help="If passed, will overwrite any existing `level` recorded in the rule",
        choices=Level._member_names_,
    )

    args: Args = parser.parse_args()
    if args.mode == "check-credentials":
        statuses = credential_checker.check_statuses(test_llms=True)
        credential_checker.format_statuses(statuses)
        sys.exit(0)

    if args.mode != "sigma":
        assert args.ai_provider, "--ai_provider is required in file or txt mode"

    if args.mode == "file":
        args.input_text = args.input_file

    args.input_text = getattr(args, "input_text", "")
    if not args.report_id:
        args.report_id = Bundler.generate_report_id(
            args.use_identity.id if args.use_identity else None, args.created, args.name
        )

    return args


def run_txt2detection(
    name,
    identity,
    tlp_level,
    input_text: str,
    labels: list[str],
    report_id: str | uuid.UUID,
    ai_provider: BaseAIExtractor,
    create_attack_navigator_layer=False,
    **kwargs,
) -> Bundler:
    if not kwargs.get("sigma_file"):
        validate_token_count(
            int(os.getenv("INPUT_TOKEN_LIMIT", 0)), input_text, ai_provider
        )

    if sigma := kwargs.get("sigma_file"):
        detection = get_sigma_detections(sigma, name=name)
        if not identity and detection.author:
            identity = make_identity(detection.author)
        kwargs.update(
            reference_urls=kwargs.setdefault("reference_urls", [])
            + detection.references
        )
        if not kwargs.get("created"):
            # only consider rule.date and rule.modified if user does not pass --created
            kwargs.update(
                created=detection.date,
                modified=detection.modified,
            )
        kwargs['license'] = kwargs.get('license') or detection.license
        detection.level = kwargs.get("level") or detection.level
        detection.status = kwargs.get("status") or detection.status
        detection.date = as_date(kwargs.get("created"))
        detection.modified = as_date(kwargs.get("modified"))
        detection.references = kwargs["reference_urls"]
        detection.detection_id = str(report_id).removeprefix("report--")
        bundler = Bundler(
            name or detection.title,
            identity,
            tlp_level or detection.tlp_level or "clear",
            detection.description,
            (labels or []) + detection.tags,
            report_id=report_id,
            **kwargs,
        )
        detections = DetectionContainer(success=True, detections=[])
        detections.detections.append(detection)
    else:
        bundler = Bundler(
            name, identity, tlp_level, input_text, labels, report_id=report_id, **kwargs
        )
        detections = ai_provider.get_detections(input_text)
    bundler.bundle_detections(detections)
    if create_attack_navigator_layer:
        bundler.create_attack_navigator()
    return bundler


def get_sigma_detections(sigma: str, name=None) -> SigmaRuleDetection:
    obj = yaml.safe_load(io.StringIO(sigma))
    if not isinstance(obj, dict):
        raise ValueError(
            f"bad sigma input file. expected object/dict, got {type(obj)}."
        )
    if name:
        obj["title"] = name
    return SigmaRuleDetection.model_validate(obj)


def main(args: Args):

    setLogFile(logging.root, Path(f"logs/log-{args.report_id}.log"))
    logging.info(f"starting argument: {json.dumps(sys.argv[1:])}")
    kwargs = args.__dict__
    kwargs["identity"] = args.use_identity
    try:
        bundler = run_txt2detection(**kwargs)
    except (ValidationError, ValueError) as e:
        logging.error(f"Validate sigma file failed: {str(e)}")
        if isinstance(e, ValidationError):
            full_error = e.json(indent=4)
            logging.debug(f"Validate sigma file failed: {full_error}", exc_info=True)
        sys.exit(19)

    output_dir = Path("./output") / str(bundler.bundle.id)
    shutil.rmtree(output_dir, ignore_errors=True)
    rules_dir = output_dir / "rules"
    rules_dir.mkdir(exist_ok=True, parents=True)

    output_path = output_dir / "bundle.json"
    data_path = output_dir / "data.json"
    output_path.write_text(bundler.to_json())
    data_path.write_text(bundler.data.model_dump_json(indent=4))
    for obj in bundler.bundle["objects"]:
        if obj["type"] != "indicator" or obj["pattern_type"] != "sigma":
            continue
        rule_id: str = obj["id"].replace("indicator--", "")
        rule_path = rules_dir / ("rule--" + rule_id + ".yml")
        nav_path = rules_dir / f"attack-enterprise-navigator-layer-rule--{rule_id}.json"
        rule_path.write_text(obj["pattern"])
        if rule_nav := (
            bundler.data.navigator_layer and bundler.data.navigator_layer.get(rule_id)
        ):
            nav_path.write_text(json.dumps(rule_nav, indent=4))
    logging.info(f"Writing bundle output to `{output_path}`")
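
A minimal usage sketch (not part of the package) of calling `run_txt2detection` directly rather than through the CLI. The model string and report text are placeholders, and the assumption that `parse_model("provider:model")` returns a ready-to-use extractor is inferred from how `parse_args` wires `--ai_provider`:

```python
# Hypothetical driver script; assumes OPENAI_API_KEY is set in the environment.
import uuid

from txt2detection.__main__ import run_txt2detection
from txt2detection.utils import parse_model

bundler = run_txt2detection(
    name="certutil download activity",         # placeholder report name
    identity=None,                             # fall back to the default identity
    tlp_level="clear",
    input_text="Attackers used certutil.exe to fetch payloads ...",
    labels=[],
    report_id=uuid.uuid4(),
    ai_provider=parse_model("openai:gpt-4o"),  # same syntax as --ai_provider
)
print(bundler.to_json())                       # the STIX bundle main() writes to disk
```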

txt2detection/ai_extractor/__init__.py
@@ -0,0 +1,16 @@
import logging

import dotenv

from .base import _ai_extractor_registry as ALL_AI_EXTRACTORS

from .base import BaseAIExtractor

class ModelError(Exception):
    pass

for path in ["openai", "anthropic", "gemini", "deepseek", "openrouter"]:
    try:
        __import__(__package__ + "." + path)
    except Exception as e:
        logging.warning("%s not supported, please install missing modules", path, exc_info=True)
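
The loop above imports each provider module purely for its side effect: importing a module triggers `__init_subclass__` registration in `_ai_extractor_registry`, re-exported here as `ALL_AI_EXTRACTORS`. A sketch of resolving a provider by name (model name is a placeholder; assumes the OpenAI dependency is installed and `OPENAI_API_KEY` is set):

```python
# Sketch: look up an extractor class registered at import time.
from txt2detection.ai_extractor import ALL_AI_EXTRACTORS

extractor_cls = ALL_AI_EXTRACTORS["openai"]  # KeyError if the import above failed
extractor = extractor_cls(model="gpt-4o")    # placeholder model name
print(extractor.extractor_name)              # -> "openai:gpt-4o"
```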

txt2detection/ai_extractor/anthropic.py
@@ -0,0 +1,12 @@

import logging
import os
from .base import BaseAIExtractor
from llama_index.llms.anthropic import Anthropic


class AnthropicAIExtractor(BaseAIExtractor, provider="anthropic"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = Anthropic(max_tokens=4096, system_prompt=self.system_prompt, **kwargs)
        super().__init__()
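
Note the pattern shared by every provider below: the default sampling temperature is seeded from the `TEMPERATURE` environment variable, and an explicit keyword argument overrides it. A sketch (model name is a placeholder; assumes `ANTHROPIC_API_KEY` is set):

```python
# Sketch: TEMPERATURE seeds the default; an explicit kwarg wins.
import os

os.environ["TEMPERATURE"] = "0.2"

from txt2detection.ai_extractor.anthropic import AnthropicAIExtractor

warm = AnthropicAIExtractor(model="claude-3-5-sonnet-latest")                   # temperature=0.2
cold = AnthropicAIExtractor(model="claude-3-5-sonnet-latest", temperature=0.0)  # override
```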

txt2detection/ai_extractor/base.py
@@ -0,0 +1,72 @@
import logging
from typing import Type
from llama_index.core.program import LLMTextCompletionProgram

import textwrap
from llama_index.core.llms.llm import LLM

from txt2detection.ai_extractor import prompts

from txt2detection.ai_extractor.utils import ParserWithLogging
from txt2detection.models import DetectionContainer
from llama_index.core.utils import get_tokenizer


_ai_extractor_registry: dict[str, "Type[BaseAIExtractor]"] = {}


class BaseAIExtractor:
    llm: LLM
    system_prompt = textwrap.dedent(
        """
        <persona>

        You are a cyber-security detection engineering tool responsible for analysing intelligence reports provided in text files and writing SIGMA detection rules to detect the content being described in the reports.

        You have a deep understanding of cybersecurity tools like SIEMs and XDRs, as well as threat intelligence concepts.

        IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.

        </persona>
        """
    )

    def get_detections(self, input_text) -> DetectionContainer:
        logging.info("getting detections")

        return LLMTextCompletionProgram.from_defaults(
            output_parser=ParserWithLogging(DetectionContainer),
            prompt=prompts.SIEMRULES_PROMPT,
            verbose=True,
            llm=self.llm,
        )(document=input_text)

    def __init__(self, *args, **kwargs) -> None:
        pass

    def count_tokens(self, input_text):
        logging.info(
            "unsupported model `%s`, estimating using llama-index's default tokenizer",
            self.extractor_name,
        )
        return len(get_tokenizer()(input_text))

    def __init_subclass__(cls, /, provider, register=True, **kwargs):
        super().__init_subclass__(**kwargs)
        if register:
            cls.provider = provider
            _ai_extractor_registry[provider] = cls

    @property
    def extractor_name(self):
        return f"{self.provider}:{self.llm.model}"

    def check_credential(self):
        try:
            return "authorized" if self._check_credential() else "unauthorized"
        except:
            return "unknown"

    def _check_credential(self):
        self.llm.complete("say 'hi'")
        return True
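
The `__init_subclass__` hook is what lets each provider module self-register with a single class statement rather than a central list. A sketch of adding a new provider, using llama-index's `MockLLM` stub purely for illustration:

```python
# Sketch: declaring provider="..." on a subclass adds it to the registry.
from llama_index.core.llms import MockLLM

from txt2detection.ai_extractor.base import BaseAIExtractor, _ai_extractor_registry

class EchoExtractor(BaseAIExtractor, provider="echo"):
    def __init__(self, **kwargs) -> None:
        self.llm = MockLLM()  # stand-in; a real provider constructs its client here
        super().__init__()

assert _ai_extractor_registry["echo"] is EchoExtractor
```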

txt2detection/ai_extractor/deepseek.py
@@ -0,0 +1,20 @@
import logging
import os

from .base import BaseAIExtractor
from llama_index.llms.deepseek import DeepSeek


class DeepseekExtractor(BaseAIExtractor, provider="deepseek"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault("temperature", float(os.environ.get("TEMPERATURE", 0.0)))
        kwargs.setdefault("model", "deepseek-chat")
        self.llm = DeepSeek(system_prompt=self.system_prompt, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        try:
            return len(self.llm._tokenizer.encode(text))
        except Exception as e:
            logging.warning(e)
            return super().count_tokens(text)

txt2detection/ai_extractor/gemini.py
@@ -0,0 +1,18 @@

import os
from .base import BaseAIExtractor
from llama_index.llms.gemini import Gemini


class GeminiAIExtractor(BaseAIExtractor, provider="gemini"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = Gemini(max_tokens=4096, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        return self.llm._model.count_tokens(text).total_tokens

    @property
    def extractor_name(self):
        return f"{self.provider}:{self.llm.model}"

txt2detection/ai_extractor/openai.py
@@ -0,0 +1,18 @@
import logging
import os
from .base import BaseAIExtractor
from llama_index.llms.openai import OpenAI


class OpenAIExtractor(BaseAIExtractor, provider="openai"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault("temperature", float(os.environ.get("TEMPERATURE", 0.0)))
        self.llm = OpenAI(system_prompt=self.system_prompt, **kwargs, max_tokens=4096)
        super().__init__()

    def count_tokens(self, text):
        try:
            return len(self.llm._tokenizer.encode(text))
        except Exception as e:
            logging.warning(e)
            return super().count_tokens(text)
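
Each `count_tokens` override tries the provider's own tokenizer first and falls back to the base class's llama-index estimate on any failure, so the `INPUT_TOKEN_LIMIT` check in `validate_token_count` keeps working even for model names the tokenizer does not recognise. A sketch (model names are placeholders):

```python
# Sketch: exact count when tiktoken knows the model, estimate otherwise.
from txt2detection.ai_extractor.openai import OpenAIExtractor

known = OpenAIExtractor(model="gpt-4o")
print(known.count_tokens("Attackers used certutil.exe ..."))  # provider tokenizer

odd = OpenAIExtractor(model="my-gateway-model")               # tokenizer lookup may fail
print(odd.count_tokens("Attackers used certutil.exe ..."))    # base-class fallback
```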

txt2detection/ai_extractor/openrouter.py
@@ -0,0 +1,20 @@
import logging
import os
from .base import BaseAIExtractor
from llama_index.llms.openrouter import OpenRouter


class OpenRouterExtractor(BaseAIExtractor, provider="openrouter"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault("temperature", float(os.environ.get("TEMPERATURE", 0.0)))
        self.llm = OpenRouter(
            system_prompt=self.system_prompt, max_tokens=4096, **kwargs
        )
        super().__init__()

    def count_tokens(self, text):
        try:
            return len(self.llm._tokenizer.encode(text))
        except Exception as e:
            logging.warning(e)
            return super().count_tokens(text)

txt2detection/ai_extractor/prompts.py
@@ -0,0 +1,121 @@
from llama_index.core import PromptTemplate, ChatPromptTemplate
import textwrap
from llama_index.core.base.llms.types import ChatMessage, MessageRole


SIEMRULES_PROMPT = ChatPromptTemplate(
    [
        ChatMessage.from_str(
            """
**Persona:**

You are an expert in cybersecurity threat detection. Given a structured security report, generate a Sigma rule following the Sigma specification.

## **Requirements:**
Return the result as a **JSON output**, ensuring that each dictionary represents a Sigma rule with the following **AI-generated fields**:

### **Meta Properties (Generated by AI Only)**
- `"title"`: A concise, descriptive title for the rule.
- `"description"`: A summary of the rule, explaining its purpose and detection logic.
- `"tags"`: **Generated by AI**, including:
  - ATT&CK Technique IDs, ATT&CK Sub-technique IDs and CVE IDs relevant to the report.
- `"falsepositives"`: Please describe situations where this detection rule might trigger false positive detections. Put each situation description as a new list item
- `"logsources"`: Valid sigma rule logsource
- `"detection"`: Valid sigma rule detection
- `"indicator_types"`: One or more STIX v2.1 indicator.indicator_types
- `"level"`: select SIGMA level for the rule


### **Indicator Types**
- `"anomalous-activity"`: Unexpected, or unusual activity that may not necessarily be malicious or indicate compromise. This type of activity may include reconnaissance-like behavior such as port scans or version identification, network behavior anomalies, and asset and/or user behavioral anomalies.
- `"anonymization"`: Suspected anonymization tools or infrastructure (proxy, TOR, VPN, etc.).
- `"benign"`: Activity that is not suspicious or malicious in and of itself, but when combined with other activity may indicate suspicious or malicious behavior.
- `"compromised"`: Assets that are suspected to be compromised.
- `"malicious-activity"`: Patterns of suspected malicious objects and/or activity.
- `"attribution"`: Patterns of behavior that indicate attribution to a particular Threat Actor or Campaign.
- `"unknown"`: There is not enough information available to determine the type of indicator.

### **Level**
The level field contains one of five string values. It describes the criticality of a triggered rule. While low and medium level events have an informative character, events with high and critical level should lead to immediate reviews by security analysts.
- informational: Rule is intended for enrichment of events, e.g. by tagging them. No case or alerting should be triggered by such rules because it is expected that a huge amount of events will match these rules.
- low: Notable event but rarely an incident. Low rated events can be relevant in high numbers or combination with others. Immediate reaction shouldn't be necessary, but a regular review is recommended.
- medium: Relevant event that should be reviewed manually on a more frequent basis.
- high: Relevant event that should trigger an internal alert and requires a prompt review.
- critical: Highly relevant event that indicates an incident. Critical events should be reviewed immediately. It is used only for cases in which probability borders certainty.


### **Tags**

Tags are to follow the format <namespace>.<value>.
IMPORTANT: select 0 or more from this section and only include valid and most appropriate tags

#### Namespace: attack

ATT&CK is either a [technique], a [group], a [software] or a [tactic].

* *attack.t1234*: Refers to a [technique](https://attack.mitre.org/wiki/All_Techniques)
* *attack.g1234*: Refers to a [group](https://attack.mitre.org/wiki/Groups)
* *attack.s1234*: Refers to [software](https://attack.mitre.org/wiki/Software)

Tactics:

* attack.initial-access
* attack.execution
* attack.persistence
* attack.privilege-escalation
* attack.defense-evasion
* attack.credential-access
* attack.discovery
* attack.lateral-movement
* attack.collection
* attack.exfiltration
* attack.command-and-control
* attack.impact

#### Namespace: cve
Only include from this section, CVEs that are explicitly mentioned in input document. Do not attempt to make up any random CVE-ID

Use the CVE tag from [MITRE](https://cve.mitre.org) in lower case separated by dots. Example tag: `cve.2021-44228`.

### **Detection:**

You need to generate a structured `detection` object representing a set of **search identifiers** used for detecting patterns in log data. Follow these rules when constructing `detection`:

1. **Overall Structure:**
   - The root object must contain a **required** field called `"condition"`, which is a string that defines the relationship between different search identifiers (e.g., `"selection1 OR selection2"`).
   - Additional properties (besides `"condition"`) represent **search identifiers**, which can be either a **list** or a **map**.

2. **Search Identifier Format:**
   - A **list** (`array`) can contain:
     - **Strings** (e.g., `"error_code_500"`, `"user_login_failed"`).
     - **Integers** (e.g., `404`, `500`).
     - **Objects** where all values are strings.
   - A **map** (`object`) where all values are strings.

3. **Example Detection Data:**
   ```json
   {
     "condition": "selection OR event_selection",
     "selection": ["error_code_500", 404, {"key": "value"}],
     "event_selection": {
       "event_type": "failed_login",
       "source_ip": "192.168.1.1"
     }
   }
   ```

Make sure your response follows this format and adheres to the rules above.


## **Additional Instructions**
- Ensure the `"tags"` field includes relevant ATT&CK and CVE references based on the report content.
- Return a **valid JSON output** without YAML formatting for seamless processing.
"""
        ),
        ChatMessage.from_str(
            "Taking the entire input of my next message, analyze and return appropriate response",
            MessageRole.USER,
        ),
        ChatMessage.from_str("{document}", MessageRole.USER),
    ]
)
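
To see what the extractors actually send, the template's messages can be inspected standalone; a small sketch using the `message_templates` attribute of llama-index's `ChatPromptTemplate`:

```python
# Sketch: print each templated message's role and the start of its content.
from txt2detection.ai_extractor.prompts import SIEMRULES_PROMPT

for message in SIEMRULES_PROMPT.message_templates:
    print(message.role, "|", (message.content or "")[:60].replace("\n", " "))
```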

txt2detection/ai_extractor/utils.py
@@ -0,0 +1,21 @@
import io
import logging
import typing
import json_repair

from llama_index.core.output_parsers import PydanticOutputParser


if typing.TYPE_CHECKING:
    from txt2detection.bundler import Bundler


class ParserWithLogging(PydanticOutputParser):
    def parse(self, text: str):
        f = io.StringIO()
        print("\n" * 5 + "=================start=================", file=f)
        print(text, file=f)
        print("=================close=================" + "\n" * 5, file=f)
        logging.debug(f.getvalue())
        repaired_json = json_repair.repair_json(text)
        return super().parse(repaired_json)
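
`json_repair` is what makes the parser tolerant of the near-JSON that models sometimes emit (trailing commas, unclosed brackets, stray text). A sketch of the behaviour `ParserWithLogging.parse` relies on, with the repaired output shown as an expectation rather than a guarantee:

```python
# Sketch: repair malformed model output before pydantic validation.
import json_repair

broken = '{"success": true, "detections": [{"title": "x",}'
print(json_repair.repair_json(broken))
# expected (approximately): {"success": true, "detections": [{"title": "x"}]}
```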