dogesec-commons 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dogesec_commons/__init__.py +0 -0
- dogesec_commons/asgi.py +16 -0
- dogesec_commons/objects/__init__.py +1 -0
- dogesec_commons/objects/apps.py +10 -0
- dogesec_commons/objects/conf.py +8 -0
- dogesec_commons/objects/db_view_creator.py +164 -0
- dogesec_commons/objects/helpers.py +660 -0
- dogesec_commons/objects/views.py +427 -0
- dogesec_commons/settings.py +161 -0
- dogesec_commons/stixifier/__init__.py +0 -0
- dogesec_commons/stixifier/apps.py +5 -0
- dogesec_commons/stixifier/conf.py +1 -0
- dogesec_commons/stixifier/migrations/0001_initial.py +36 -0
- dogesec_commons/stixifier/migrations/0002_profile_ai_content_check_variable.py +18 -0
- dogesec_commons/stixifier/migrations/0003_rename_ai_content_check_variable_profile_ai_content_check_provider_and_more.py +23 -0
- dogesec_commons/stixifier/migrations/0004_profile_identity_id.py +18 -0
- dogesec_commons/stixifier/migrations/0005_profile_generate_pdf.py +18 -0
- dogesec_commons/stixifier/migrations/__init__.py +0 -0
- dogesec_commons/stixifier/models.py +57 -0
- dogesec_commons/stixifier/serializers.py +192 -0
- dogesec_commons/stixifier/stixifier.py +252 -0
- dogesec_commons/stixifier/summarizer.py +62 -0
- dogesec_commons/stixifier/views.py +193 -0
- dogesec_commons/urls.py +45 -0
- dogesec_commons/utils/__init__.py +3 -0
- dogesec_commons/utils/autoschema.py +88 -0
- dogesec_commons/utils/exceptions.py +28 -0
- dogesec_commons/utils/filters.py +66 -0
- dogesec_commons/utils/ordering.py +47 -0
- dogesec_commons/utils/pagination.py +81 -0
- dogesec_commons/utils/schemas.py +27 -0
- dogesec_commons/utils/serializers.py +47 -0
- dogesec_commons/wsgi.py +16 -0
- dogesec_commons-1.0.2.dist-info/METADATA +57 -0
- dogesec_commons-1.0.2.dist-info/RECORD +37 -0
- dogesec_commons-1.0.2.dist-info/WHEEL +4 -0
- dogesec_commons-1.0.2.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
# Generated by Django 5.1.5 on 2025-05-01 09:48
|
2
|
+
|
3
|
+
from django.db import migrations, models
|
4
|
+
|
5
|
+
|
6
|
+
class Migration(migrations.Migration):
    """Add the nullable ``identity_id`` column to the ``profile`` model."""

    # Must run after the migration that renamed the AI content-check field.
    dependencies = [
        (
            "dogesec_stixifier",
            "0003_rename_ai_content_check_variable_profile_ai_content_check_provider_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="profile",
            name="identity_id",
            field=models.CharField(default=None, max_length=46, null=True),
        ),
    ]
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Generated by Django 5.1.5 on 2025-06-24 12:58
|
2
|
+
|
3
|
+
from django.db import migrations, models
|
4
|
+
|
5
|
+
|
6
|
+
class Migration(migrations.Migration):
    """Add the ``generate_pdf`` boolean column (default ``False``) to ``profile``."""

    # Must run after the migration that introduced ``identity_id``.
    dependencies = [
        ("dogesec_stixifier", "0004_profile_identity_id"),
    ]

    operations = [
        migrations.AddField(
            model_name="profile",
            name="generate_pdf",
            field=models.BooleanField(default=False),
        ),
    ]
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
import txt2stix
|
2
|
+
from django.conf import settings
|
3
|
+
from django.db import models
|
4
|
+
from django.contrib.postgres.fields import ArrayField
|
5
|
+
import uuid
|
6
|
+
from functools import partial
|
7
|
+
import txt2stix.common
|
8
|
+
import txt2stix, txt2stix.extractions
|
9
|
+
from django.core.exceptions import ValidationError
|
10
|
+
|
11
|
+
|
12
|
+
class RelationshipMode(models.TextChoices):
    """Allowed values for ``Profile.relationship_mode``."""

    # (stored value, human readable label)
    AI = ("ai", "AI Relationship")
    STANDARD = ("standard", "Standard Relationship")
|
15
|
+
|
16
|
+
def validate_extractor(types, name):
    """Check that ``name`` is the slug of a configured extractor of an allowed type.

    Args:
        types: container of accepted extractor ``type`` values.
        name: extractor slug to look up.

    Returns:
        ``True`` when a matching extractor exists.

    Raises:
        ValidationError: when no extractor has that slug and an accepted type.
    """
    configured = txt2stix.extractions.parse_extraction_config(
        txt2stix.txt2stix.INCLUDES_PATH
    ).values()
    if any(name == candidate.slug and candidate.type in types for candidate in configured):
        return True
    raise ValidationError(f"{name} does not exist", 400)
|
24
|
+
|
25
|
+
|
26
|
+
class Profile(models.Model):
    """Stored processing profile: which extractors run and how results are handled."""

    # Primary key; filled in by ``save`` when left empty.
    id = models.UUIDField(primary_key=True)
    created = models.DateTimeField(auto_now_add=True)
    name = models.CharField(max_length=250, unique=True)
    identity_id = models.CharField(max_length=46, null=True, default=None)
    extractions = ArrayField(base_field=models.CharField(max_length=256))
    relationship_mode = models.CharField(
        choices=RelationshipMode.choices,
        max_length=20,
        default=RelationshipMode.STANDARD,
    )
    extract_text_from_image = models.BooleanField(default=False)
    defang = models.BooleanField()
    generate_pdf = models.BooleanField(default=False)
    ai_settings_relationships = models.CharField(max_length=256, blank=False, null=True)
    ai_settings_extractions = ArrayField(
        base_field=models.CharField(max_length=256),
        default=list,
    )
    ai_content_check_provider = models.CharField(
        default=None, null=True, blank=False, max_length=256
    )
    ai_summary_provider = models.CharField(max_length=256, blank=False, null=True)
    ai_create_attack_flow = models.BooleanField(default=True)
    ignore_image_refs = models.BooleanField(default=True)
    ignore_link_refs = models.BooleanField(default=True)
    ignore_extraction_boundary = models.BooleanField(default=False)

    # Embedded-relationship handling flags.
    ignore_embedded_relationships_sro = models.BooleanField(default=True)
    ignore_embedded_relationships_smo = models.BooleanField(default=True)
    ignore_embedded_relationships = models.BooleanField(default=False)

    def save(self, *args, **kwargs) -> None:
        """Persist the profile, deriving a deterministic id when none is set.

        The id is a UUIDv5 under ``settings.STIXIFIER_NAMESPACE`` of the profile
        name, or of ``"<name>+<identity_id>"`` when an identity id is present.
        """
        if not self.id:
            seed = f"{self.name}+{self.identity_id}" if self.identity_id else self.name
            self.id = uuid.uuid5(settings.STIXIFIER_NAMESPACE, seed)
        return super().save(*args, **kwargs)
|
@@ -0,0 +1,192 @@
|
|
1
|
+
import argparse
|
2
|
+
import contextlib
|
3
|
+
from functools import partial
|
4
|
+
import uuid
|
5
|
+
from rest_framework import serializers
|
6
|
+
|
7
|
+
from . import conf
|
8
|
+
from .models import Profile
|
9
|
+
from rest_framework import serializers
|
10
|
+
import txt2stix.extractions
|
11
|
+
import txt2stix.txt2stix
|
12
|
+
from urllib.parse import urljoin
|
13
|
+
from django.conf import settings
|
14
|
+
from django.contrib.postgres.fields import ArrayField
|
15
|
+
from rest_framework.validators import ValidationError
|
16
|
+
from dogesec_commons.utils.serializers import CommonErrorSerializer
|
17
|
+
|
18
|
+
from drf_spectacular.utils import OpenApiResponse, OpenApiExample
|
19
|
+
|
20
|
+
from drf_spectacular.utils import OpenApiResponse, OpenApiExample
|
21
|
+
|
22
|
+
from django.db import models
|
23
|
+
|
24
|
+
|
25
|
+
def validate_model(model):
    """Validate an AI model string by asking txt2stix to parse it.

    Args:
        model: model string to check; falsy values (``None``, ``""``) are
            treated as "not configured" and accepted.

    Returns:
        ``None`` for a falsy ``model``, otherwise ``model`` unchanged.

    Raises:
        ValidationError: when ``txt2stix.txt2stix.parse_model`` rejects the value.
    """
    if not model:
        return None
    try:
        txt2stix.txt2stix.parse_model(model)
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate; keep the cause for debugging.
        raise ValidationError(f"invalid model: {model}") from e
    return model
|
33
|
+
|
34
|
+
def validate_extractor(typestr, types, name):
    """Ensure ``name`` is a configured extractor whose type is in ``types``.

    Args:
        typestr: label used in the error message.
        types: container of accepted extractor ``type`` values.
        name: extractor key to look up in the parsed extraction config.

    Raises:
        ValidationError: when ``name`` is unknown or has a type outside ``types``.
    """
    registry = txt2stix.extractions.parse_extraction_config(
        txt2stix.txt2stix.INCLUDES_PATH
    )
    is_valid = name in registry and registry[name].type in types
    if not is_valid:
        raise ValidationError(f"`{name}` is not a valid {typestr}", 400)
|
40
|
+
|
41
|
+
def validate_stix_id(stix_id: str, type: str):
    """Validate that ``stix_id`` has the form ``<type>--<uuid>``.

    Args:
        stix_id: the STIX identifier to check.
        type: the required prefix before ``--`` (e.g. ``"identity"``).

    Returns:
        ``stix_id`` unchanged when it is valid.

    Raises:
        ValidationError: when the prefix differs from ``type`` or the part after
            ``--`` is not a parseable UUID.
    """
    type_part, _, id_part = stix_id.partition('--')
    if type_part != type:
        raise ValidationError(f"Invalid STIX ID for type `{type}`")
    try:
        uuid.UUID(id_part)
    except ValueError:
        # uuid.UUID raises ValueError for a malformed string; catching only that
        # keeps unrelated errors visible instead of hiding them.
        raise ValidationError("Invalid STIX ID") from None
    return stix_id
|
49
|
+
|
50
|
+
|
51
|
+
def uses_ai(slugs):
    """Raise if any of ``slugs`` names an extractor whose type is ``'ai'``.

    Called when no ``ai_settings_extractions`` are configured. Each slug is
    looked up directly in the parsed extraction config, so every slug must exist.

    Raises:
        ValidationError: listing the AI-based extractor slugs found.
    """
    registry = txt2stix.extractions.parse_extraction_config(
        txt2stix.txt2stix.INCLUDES_PATH
    )
    ai_slugs = [slug for slug in slugs if registry[slug].type == 'ai']
    if not ai_slugs:
        return
    raise ValidationError(f'AI based extractors `{ai_slugs}` used when `ai_settings_extractions` is not configured')
|
62
|
+
|
63
|
+
class ProfileSerializer(serializers.ModelSerializer):
    """Serializer for ``Profile`` with per-field and cross-field validation.

    Field validators check model strings, extractor ids and the STIX identity
    id. ``validate`` then enforces that AI-dependent options have the AI
    settings they need.
    """

    id = serializers.UUIDField(read_only=True)
    identity_id = serializers.CharField(
        max_length=46,
        validators=[lambda stix_id: validate_stix_id(stix_id, "identity")],
        allow_null=True,
        required=False,
        help_text="STIX Identity ID (e.g `identity--19686d47-3a50-48a0-8ef0-f3e0f8a4bd99`)"
    )

    ai_settings_relationships = serializers.CharField(
        validators=[validate_model],
        help_text='(required if AI relationship enabled): passed in format `provider:model`. Can only pass one model at this time.',
        allow_null=True,
        required=False,
    )
    ai_settings_extractions = serializers.ListField(
        child=serializers.CharField(max_length=256, validators=[validate_model]),
        help_text='(required if AI extractions enabled) passed in format provider[:model] e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers. model part is optional',
        required=False,
    )
    ai_summary_provider = serializers.CharField(
        validators=[validate_model],
        help_text='you can optionally get an AI model to produce a summary of the blog. You must pass the request in format `provider:model`. model part is optional',
        allow_null=True,
        required=False,
    )
    ai_content_check_provider = serializers.CharField(
        max_length=256, validators=[validate_model],
        allow_null=True,
        required=False,
        help_text='check content before proceeding'
    )
    # NOTE(review): the help text says the default is `false`, but both this
    # field and the model default to True -- confirm which one is intended.
    ai_create_attack_flow = serializers.BooleanField(required=False, help_text="should create attack-flow (default is `false`)", default=True)
    extractions = serializers.ListField(
        min_length=1,
        child=serializers.CharField(max_length=256, validators=[partial(validate_extractor, 'extractor', ["ai", "pattern", "lookup"])]),
        help_text="extraction id(s)",
    )
    defang = serializers.BooleanField(help_text='If the text should be defanged before processing')

    ignore_embedded_relationships = serializers.BooleanField(required=False, help_text="applies to SDO and SCO types (default is `false`)")
    ignore_embedded_relationships_sro = serializers.BooleanField(required=False, help_text="sets wether to ignore embedded refs on `relationship` object types (default is `true`)")
    ignore_embedded_relationships_smo = serializers.BooleanField(required=False, help_text="sets wether to ignore embedded refs on SMO object types (`marking-definition`, `extension-definition`, `language-content`) (default is `true`)")
    generate_pdf = serializers.BooleanField(required=False, help_text="Whether or not to generate pdf file for input, applies to both stixify and obstracts (default is `false`)")

    class Meta:
        model = Profile
        fields = "__all__"

    def validate(self, attrs):
        """Cross-field checks for AI-dependent options.

        Fields missing from ``attrs`` (optional fields that were omitted, or any
        field on a partial update) fall back to the value on ``self.instance``
        when one exists, so a missing key no longer raises ``KeyError``.

        Raises:
            ValidationError: when AI relationship mode or attack-flow creation
                is requested without ``ai_settings_relationships``, or when an
                AI extractor is selected without ``ai_settings_extractions``.
        """

        def effective(field, default=None):
            # Submitted value first, then the stored value, then the default.
            # getattr on a None instance simply yields the default.
            if field in attrs:
                return attrs[field]
            return getattr(self.instance, field, default)

        if not effective('ai_settings_relationships'):
            if effective('relationship_mode') == 'ai':
                raise ValidationError('`ai_settings_relationships` is required when `relationship_mode == "ai"`')
            if effective('ai_create_attack_flow', True):
                raise ValidationError('`ai_settings_relationships` is required when `ai_create_attack_flow == true`')
        if not effective('ai_settings_extractions'):
            uses_ai(effective('extractions', []))
        return super().validate(attrs)
|
123
|
+
|
124
|
+
|
125
|
+
|
126
|
+
# Reusable OpenAPI description of a generic HTTP 400 response.
DEFAULT_400_ERROR = OpenApiResponse(
    response=CommonErrorSerializer,
    description="The server did not understand the request",
    examples=[
        OpenApiExample(
            "http400",
            value={"message": " The server did not understand the request", "code": 400},
        ),
    ],
)
|
136
|
+
|
137
|
+
|
138
|
+
# Reusable OpenAPI description of a generic HTTP 404 response.
DEFAULT_404_ERROR = OpenApiResponse(
    response=CommonErrorSerializer,
    description="Resource not found",
    examples=[
        OpenApiExample(
            "http404",
            value={
                "message": "The server cannot find the resource you requested",
                "code": 404,
            },
        ),
    ],
)
|
151
|
+
|
152
|
+
|
153
|
+
##
|
154
|
+
|
155
|
+
|
156
|
+
|
157
|
+
class Txt2stixExtractorSerializer(serializers.Serializer):
    """Read-only description of a txt2stix extractor, plus helpers to list them."""

    id = serializers.CharField(label='The `id` of the extractor')
    name = serializers.CharField()
    type = serializers.CharField()
    description = serializers.CharField(required=False, allow_null=True)
    notes = serializers.CharField(required=False, allow_null=True)
    file = serializers.CharField(required=False, allow_null=True)
    created = serializers.CharField(required=False, allow_null=True)
    modified = serializers.CharField(required=False, allow_null=True)
    created_by = serializers.CharField(required=False, allow_null=True)
    version = serializers.CharField()
    stix_mapping = serializers.CharField(required=False, allow_null=True)
    dogesec_web = serializers.BooleanField(required=False, allow_null=True)

    @classmethod
    def all_extractors(cls, types):
        """Return ``{slug: cleaned extractor}`` for extractors whose type is in ``types``.

        When an extractor has a ``file``, the entry's ``"file"`` value is replaced
        by a URL built from ``conf.TXT2STIX_INCLUDE_URL`` and the file's path
        relative to the txt2stix includes directory.
        """
        includes_root = txt2stix.txt2stix.INCLUDES_PATH
        by_slug = {}
        for extractor in txt2stix.extractions.parse_extraction_config(includes_root).values():
            if extractor.type not in types:
                continue
            entry = cls.cleanup_extractor(extractor)
            if extractor.file:
                relative_path = str(extractor.file.relative_to(includes_root))
                entry["file"] = urljoin(conf.TXT2STIX_INCLUDE_URL, relative_path)
            by_slug[extractor.slug] = entry
        return by_slug

    @classmethod
    def cleanup_extractor(cls, dct: dict):
        """Project ``dct`` onto this serializer's fields, with ``id`` taken from ``slug``."""
        field_names = cls(data={}).get_fields()
        # "id" starts as the slug; a literal "id" key in ``dct`` still overrides it.
        return {
            "id": dct["slug"],
            **{key: dct[key] for key in field_names if key in dct},
        }
|
192
|
+
|
@@ -0,0 +1,252 @@
|
|
1
|
+
import io
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
from pathlib import Path
|
6
|
+
import shutil
|
7
|
+
import uuid
|
8
|
+
from attr import Factory, dataclass
|
9
|
+
|
10
|
+
from dogesec_commons.stixifier.summarizer import parse_summarizer_model
|
11
|
+
|
12
|
+
from ..objects import db_view_creator
|
13
|
+
from . import models
|
14
|
+
import tempfile
|
15
|
+
from file2txt.converter import get_parser_class
|
16
|
+
from txt2stix import get_include_path
|
17
|
+
from txt2stix.stix import txt2stixBundler
|
18
|
+
from txt2stix.ai_extractor import BaseAIExtractor
|
19
|
+
from stix2arango.stix2arango import Stix2Arango
|
20
|
+
from django.conf import settings
|
21
|
+
from txt2stix.ai_extractor.utils import DescribesIncident
|
22
|
+
|
23
|
+
|
24
|
+
from file2txt.converter import Fanger, get_parser_class
|
25
|
+
from file2txt.parsers.core import BaseParser
|
26
|
+
import txt2stix.utils
|
27
|
+
import txt2stix.txt2stix
|
28
|
+
import txt2stix.extractions
|
29
|
+
|
30
|
+
|
31
|
+
def all_extractors(names, _all=False):
    """Return configured extractors keyed by slug.

    Args:
        names: container of slugs to keep.
        _all: when true, keep every configured extractor regardless of ``names``.
    """
    configured = txt2stix.extractions.parse_extraction_config(get_include_path())
    return {
        extractor.slug: extractor
        for extractor in configured.values()
        if _all or extractor.slug in names
    }
|
40
|
+
|
41
|
+
|
42
|
+
@dataclass
class ReportProperties:
    """Report-level values that ``StixifyProcessor.txt2stix`` passes to the bundler."""

    name: str = None
    identity: dict = None
    tlp_level: str = None
    confidence: int = None
    labels: list[str] = None
    created: str = None
    # Extra keyword arguments forwarded to the bundler. A factory gives each
    # instance its own dict instead of one dict shared across all instances.
    kwargs: dict = Factory(dict)
|
51
|
+
|
52
|
+
|
53
|
+
class StixifyProcessor:
    """Convert an input file to markdown, extract STIX from it and upload the bundle.

    The pipeline (see ``process``) is: ``file2txt`` -> ``txt2stix`` ->
    ``summarize`` -> ``write_bundle`` -> ``upload_to_arango``. All intermediate
    files live in a temporary directory that is removed when the object is
    garbage collected.
    """

    def __init__(
        self,
        file: io.FileIO,
        profile: models.Profile,
        job_id: uuid.UUID,
        post=None,
        file2txt_mode="html",
        report_id=None,
        base_url=None,
        always_extract=False,
    ) -> None:
        """Copy ``file`` into a fresh temp directory and record the run settings.

        Args:
            file: readable binary file object with a ``name`` attribute.
            profile: profile whose options drive every pipeline step.
            job_id: identifier of the job; stored as a string.
            post: accepted for compatibility; not used by this class.
            file2txt_mode: mode handed to file2txt when choosing the parser.
            report_id: report identifier used in file names and notes.
            base_url: base URL handed to the file2txt parser.
            always_extract: forwarded to ``run_txt2stix``.
        """
        self.job_id = str(job_id)
        self.extra_data = dict()
        self.report_id = report_id
        self.profile = profile
        self.collection_name = "stixify"
        self.tmpdir = Path(tempfile.mkdtemp(prefix="stixify-"))
        self.file2txt_mode = file2txt_mode
        self.md_images = []
        self.processed_image_base_url = ""
        self.base_url = base_url
        self.incident: DescribesIncident = None
        self.summary = None

        # Only the base name is kept so the copy always lands inside tmpdir.
        self.filename = self.tmpdir / Path(file.name).name
        self.filename.write_bytes(file.read())

        self.task_name = f"{self.profile.name}/{job_id}/{self.report_id}"
        self.always_extract = always_extract

    def setup(self, /, report_prop: ReportProperties, extra=None):
        """Attach report properties and merge ``extra`` into the ArangoDB extra data.

        ``extra`` defaults to ``None`` (treated as empty) rather than a shared
        mutable ``{}`` default.
        """
        self.extra_data.update(extra or {})
        self.report_prop = report_prop

    def file2txt(self):
        """Convert the input file to markdown and collect its images.

        Sets ``output_md`` and ``md_file``, and appends one in-memory PNG per
        converter image to ``md_images``. The text is defanged first when the
        profile asks for it.
        """
        parser_class = get_parser_class(self.file2txt_mode, self.filename.name)
        converter: BaseParser = parser_class(
            self.filename,
            self.file2txt_mode,
            self.profile.extract_text_from_image,
            settings.GOOGLE_VISION_API_KEY,
            base_url=self.base_url,
        )
        output = converter.convert(
            processed_image_base_url=self.processed_image_base_url
        )
        if self.profile.defang:
            output = Fanger(output).defang()
        for name, img in converter.images.items():
            img_file = io.BytesIO()
            img_file.name = name
            img.save(img_file, format="png")
            self.md_images.append(img_file)

        self.output_md = output
        self.md_file = self.tmpdir / f"post_md_{self.report_id or 'file'}.md"
        self.md_file.write_text(self.output_md)

    def txt2stix(self):
        """Run txt2stix over the markdown and return the populated bundler.

        Also stores the bundler's report id in ``extra_data`` under
        ``_stixify_report_id`` and the content-check result in ``incident``.
        """
        extractors = all_extractors(self.profile.extractions)
        # Group the selected extractors as {type: {slug: extractor}}.
        extractors_map = {}
        for extractor in extractors.values():
            extractors_map.setdefault(extractor.type, {})[extractor.slug] = extractor

        self.bundler = txt2stixBundler(
            self.report_prop.name,
            identity=self.report_prop.identity,
            tlp_level=self.report_prop.tlp_level,
            confidence=self.report_prop.confidence,
            labels=self.report_prop.labels,
            description=self.output_md,
            extractors=extractors,
            report_id=self.report_id,
            created=self.report_prop.created,
            **self.report_prop.kwargs,
        )
        self.extra_data["_stixify_report_id"] = str(self.bundler.report.id)
        input_text = txt2stix.utils.remove_links(
            self.output_md,
            self.profile.ignore_image_refs,
            self.profile.ignore_link_refs,
        )
        ai_extractors = [
            txt2stix.txt2stix.parse_model(model_str)
            for model_str in self.profile.ai_settings_extractions
        ]
        self.txt2stix_data = txt2stix.txt2stix.run_txt2stix(
            self.bundler,
            input_text,
            extractors_map,
            # `x and parse_model(x)` passes the falsy value through when unset.
            ai_content_check_provider=self.profile.ai_content_check_provider
            and txt2stix.txt2stix.parse_model(self.profile.ai_content_check_provider),
            ai_create_attack_flow=self.profile.ai_create_attack_flow,
            input_token_limit=settings.INPUT_TOKEN_LIMIT,
            ai_settings_extractions=ai_extractors,
            ai_settings_relationships=self.profile.ai_settings_relationships
            and txt2stix.txt2stix.parse_model(self.profile.ai_settings_relationships),
            relationship_mode=self.profile.relationship_mode,
            ignore_extraction_boundary=self.profile.ignore_extraction_boundary,
            always_extract=self.always_extract,
        )
        self.incident = self.txt2stix_data.content_check
        return self.bundler

    def summarize(self):
        """Add an AI-generated summary note to the bundle, if a provider is set.

        This step is best effort: a failure is logged and the pipeline carries
        on without a summary.

        Returns:
            The summary, or ``None`` when none was produced.
        """
        provider = self.profile.ai_summary_provider
        if not provider:
            return self.summary

        logging.info(
            f"summarizing report {self.report_id} using `{provider}`"
        )
        try:
            report = self.bundler.report
            summary_extractor = parse_summarizer_model(provider)
            self.summary = summary_extractor.summarize(self.output_md)
            summary_note_obj = {
                "type": "note",
                "spec_version": "2.1",
                # The note reuses the report's id with the type prefix swapped.
                "id": report.id.replace("report", "note"),
                "created": report.created,
                "modified": report.modified,
                "created_by_ref": report.created_by_ref,
                "external_references": [
                    {
                        "source_name": "txt2stix_ai_summary_provider",
                        "external_id": provider,
                    },
                ],
                "abstract": f"AI Summary: {report.name}",
                "content": self.summary,
                "object_refs": [report.id],
                "object_marking_refs": report.object_marking_refs,
                "labels": report.labels,
                "confidence": report.confidence,
            }

            self.bundler.add_ref(summary_note_obj)
            self.bundler.add_ref(
                self.bundler.new_relationship(
                    summary_note_obj["id"],
                    report.id,
                    relationship_type="summary-of",
                    description=f"AI generated summary for {report.name}",
                    external_references=summary_note_obj["external_references"],
                )
            )
        except Exception as e:
            # Exception, not BaseException, so KeyboardInterrupt/SystemExit are
            # not swallowed. Logged with traceback instead of printed.
            logging.exception(f"got err {e}")
        return self.summary

    def process(self) -> str:
        """Run the full pipeline and return the id of the generated report."""
        logging.info(f"running file2txt on {self.task_name}")
        self.file2txt()
        logging.info(f"running txt2stix on {self.task_name}")
        bundler = self.txt2stix()
        self.summarize()
        self.write_bundle(bundler)
        logging.info(f"uploading {self.task_name} to arangodb via stix2arango")
        self.upload_to_arango()
        return bundler.report.id

    def write_bundle(self, bundler: txt2stixBundler):
        """Serialize the bundle as indented JSON into ``bundle`` and ``bundle_file``."""
        bundle = json.loads(bundler.to_json())
        self.bundle = json.dumps(bundle, indent=4)
        self.bundle_file = self.tmpdir / f"bundle_{self.report_id}.json"
        self.bundle_file.write_text(self.bundle)

    def upload_to_arango(self):
        """Upload the written bundle via stix2arango and link its collections to the view."""
        s2a = Stix2Arango(
            file=str(self.bundle_file),
            database=settings.ARANGODB_DATABASE,
            collection=self.collection_name,
            stix2arango_note=f"stixifier-report--{self.report_id}",
            ignore_embedded_relationships=self.profile.ignore_embedded_relationships,
            ignore_embedded_relationships_smo=self.profile.ignore_embedded_relationships_smo,
            ignore_embedded_relationships_sro=self.profile.ignore_embedded_relationships_sro,
            host_url=settings.ARANGODB_HOST_URL,
            username=settings.ARANGODB_USERNAME,
            password=settings.ARANGODB_PASSWORD,
        )
        s2a.arangodb_extra_data.update(self.extra_data)
        for suffix in ("edge_collection", "vertex_collection"):
            db_view_creator.link_one_collection(
                s2a.arango.db,
                settings.ARANGODB_DATABASE_VIEW,
                f"{self.collection_name}_{suffix}",
            )
        s2a.run()

    def __del__(self):
        """Remove the temporary working directory, tolerating partial initialisation."""
        # __init__ may have failed before tmpdir was assigned, and the directory
        # may already be gone; neither case should raise from a finalizer.
        tmpdir = getattr(self, "tmpdir", None)
        if tmpdir is not None:
            shutil.rmtree(tmpdir, ignore_errors=True)
|
@@ -0,0 +1,62 @@
|
|
1
|
+
from typing import Type
|
2
|
+
from llama_index.core import PromptTemplate
|
3
|
+
from txt2stix.ai_extractor import ALL_AI_EXTRACTORS, BaseAIExtractor
|
4
|
+
from llama_index.core.response_synthesizers import SimpleSummarize
|
5
|
+
from rest_framework.validators import ValidationError
|
6
|
+
|
7
|
+
prompt = PromptTemplate("""
|
8
|
+
<persona>
|
9
|
+
|
10
|
+
You are a cyber-security threat intelligence analyst responsible for analysing intelligence. You have a deep understanding of cybersecurity concepts and threat intelligence. You are responsible for simplifying long intelligence reports into concise summaries for other to quickly understand the contents.
|
11
|
+
|
12
|
+
</persona>
|
13
|
+
|
14
|
+
<requirement>
|
15
|
+
|
16
|
+
Using the MARKDOWN of the report provided in <document>, provide an executive summary of it containing no more than one paragraphs.
|
17
|
+
|
18
|
+
IMPORTANT: the output should be structured as markdown text.
|
19
|
+
IMPORTANT: do not put output in code block
|
20
|
+
|
21
|
+
</requirement>
|
22
|
+
|
23
|
+
<accuracy>
|
24
|
+
|
25
|
+
Think about your answer first before you respond.
|
26
|
+
|
27
|
+
</accuracy>
|
28
|
+
|
29
|
+
<document>
|
30
|
+
{context_str}
|
31
|
+
</document>
|
32
|
+
|
33
|
+
""")
|
34
|
+
|
35
|
+
def get_provider(klass: Type[BaseAIExtractor]):
    """Build a summarizer subclass of the given AI extractor class.

    The returned class inherits from ``klass``, overrides ``system_prompt`` and
    adds a ``summarize`` method; its ``__name__`` is ``klass``'s name with
    ``Extractor`` replaced by ``Summarizer``.
    """
    # NOTE(review): `provider=` and `register=` are class keyword arguments,
    # presumably consumed by BaseAIExtractor's subclass hook so this class is
    # not added to ALL_AI_EXTRACTORS -- confirm against txt2stix.
    class SummarizerSession(klass, provider="~"+klass.provider, register=False):
        system_prompt = """
        You are a cyber-security threat intelligence analyst responsible for analysing intelligence.
        You have a deep understanding of cybersecurity concepts and threat intelligence.
        You are responsible for simplifying long intelligence reports into concise summaries for other to quickly understand the contents.
        """
        def summarize(self, text):
            # The whole text goes in as a single chunk with an empty query;
            # the module-level `prompt` supplies the instructions.
            summarizer = SimpleSummarize(llm=self.llm, text_qa_template=prompt)
            return summarizer.get_response('', text_chunks=[text])
    SummarizerSession.__name__ = klass.__name__.replace('Extractor', 'Summarizer')
    return SummarizerSession
|
47
|
+
|
48
|
+
|
49
|
+
def parse_summarizer_model(value: str):
    """Instantiate a summarizer from a ``provider[:model]`` string.

    Args:
        value: provider name, optionally followed by ``:`` and a model name.

    Returns:
        An instance of the summarizer class built by ``get_provider``,
        constructed with ``model=<model>`` when a ``:`` is present.

    Raises:
        ValidationError: when the provider is not in ``ALL_AI_EXTRACTORS`` or
            anything else goes wrong while parsing or constructing it.
    """
    try:
        provider_name, separator, model_name = value.partition(':')
        if provider_name not in ALL_AI_EXTRACTORS:
            raise ValidationError(f"invalid summary provider in `{value}`, must be one of [{list(ALL_AI_EXTRACTORS)}]")
        provider = get_provider(ALL_AI_EXTRACTORS[provider_name])
        if separator:
            # A ':' was present, so pass the model through even if it is empty.
            return provider(model=model_name)
        return provider()
    except ValidationError:
        raise
    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are not turned into validation errors.
        raise ValidationError(f'invalid model: {value}') from e
|