stix2arango 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- stix2arango/__init__.py +0 -0
- stix2arango/__main__.py +27 -0
- stix2arango/config.py +41 -0
- stix2arango/services/__init__.py +1 -0
- stix2arango/services/arangodb_service.py +303 -0
- stix2arango/services/version_annotator.py +76 -0
- stix2arango/stix2arango/__init__.py +1 -0
- stix2arango/stix2arango/bundle_loader.py +126 -0
- stix2arango/stix2arango/stix2arango.py +546 -0
- stix2arango/templates/marking-definition.json +111 -0
- stix2arango/utils.py +144 -0
- stix2arango-0.0.5.dist-info/METADATA +169 -0
- stix2arango-0.0.5.dist-info/RECORD +16 -0
- stix2arango-0.0.5.dist-info/WHEEL +4 -0
- stix2arango-0.0.5.dist-info/entry_points.txt +2 -0
- stix2arango-0.0.5.dist-info/licenses/LICENSE +202 -0
stix2arango/__init__.py
ADDED
File without changes
stix2arango/__main__.py
ADDED
@@ -0,0 +1,27 @@
import argparse
from stix2arango.stix2arango import Stix2Arango


def parse_bool(value: str):
    value = value.lower()
    # anything other than ["yes", "y", "true", "1"] (e.g. "false", "no", "n") is False
    return value in ["yes", "y", "true", "1"]


def parse_arguments():
    parser = argparse.ArgumentParser(description="Import STIX JSON into ArangoDB")
    parser.add_argument("--file", required=True, help="Path to STIX JSON file")
    parser.add_argument("--is_large_file", action="store_true", help="Use large file mode [use this when the bundle is very large; it enables stix2arango to chunk the bundle before loading it into memory]")
    parser.add_argument("--database", required=True, help="ArangoDB database name")
    parser.add_argument("--create_db", default=True, type=parse_bool, help="whether or not to create the database if it does not exist; requires admin permission")
    parser.add_argument("--collection", required=True, help="ArangoDB collection name")
    parser.add_argument("--stix2arango_note", required=False, help="Note for the import", default="")
    parser.add_argument("--ignore_embedded_relationships", required=False, help="Ignore embedded relationships for the import", type=parse_bool, default=False)
    parser.add_argument("--ignore_embedded_relationships_sro", required=False, help="Ignore embedded relationships for imported SROs", type=parse_bool, default=False)
    parser.add_argument("--ignore_embedded_relationships_smo", required=False, help="Ignore embedded relationships for imported SMOs", type=parse_bool, default=False)

    return parser.parse_args()


def main():
    args = parse_arguments()
    stix_obj = Stix2Arango(args.database, args.collection, file=args.file, create_db=args.create_db, stix2arango_note=args.stix2arango_note, ignore_embedded_relationships=args.ignore_embedded_relationships, ignore_embedded_relationships_sro=args.ignore_embedded_relationships_sro, ignore_embedded_relationships_smo=args.ignore_embedded_relationships_smo, is_large_file=args.is_large_file)
    stix_obj.run()
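For reference, the command-line entry point is a thin wrapper around the Stix2Arango class, so the same import can be driven programmatically. A minimal sketch, assuming only the constructor signature used in main() above (the file, database, and collection names are illustrative):

from stix2arango.stix2arango import Stix2Arango

s2a = Stix2Arango(
    "cti",                      # database name (the "_database" suffix is handled internally)
    "enterprise_attack",        # collection name
    file="bundle.json",         # path to the STIX bundle to import
    create_db=True,             # create the database if it does not exist (needs admin)
    stix2arango_note="demo import",
    ignore_embedded_relationships=False,
    ignore_embedded_relationships_sro=False,
    ignore_embedded_relationships_smo=False,
    is_large_file=False,        # set True to chunk very large bundles before loading
)
s2a.run()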
stix2arango/config.py
ADDED
@@ -0,0 +1,41 @@
import os
import logging
from dotenv import load_dotenv
from uuid import UUID

load_dotenv()

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s - %(message)s",  # noqa D100 E501
    datefmt="%Y-%m-%d - %H:%M:%S",
)
ARANGODB_HOST = os.getenv("ARANGODB_HOST")
ARANGODB_PORT = os.getenv("ARANGODB_PORT")
ARANGODB_USERNAME = os.getenv("ARANGODB_USERNAME")
ARANGODB_PASSWORD = os.getenv("ARANGODB_PASSWORD")

# minimal shape check for a STIX bundle: a type/id pair plus an array of objects
json_schema = {
    "type": "object",
    "properties": {
        "type": {"type": "string", "const": "bundle"},
        "id": {"type": "string"},
        "objects": {"type": "array", "items": {"type": "object"}}
    },
    "required": ["type", "id", "objects"]
}
STIX2ARANGO_IDENTITY = "https://github.com/muchdogesec/stix4doge/raw/main/objects/identity/stix2arango.json"  # this is the stix2arango identity
DOGESEC_IDENTITY = "https://github.com/muchdogesec/stix4doge/raw/main/objects/identity/dogesec.json"  # this is the dogesec identity

STIX2ARANGO_MARKING_DEFINITION = "https://raw.githubusercontent.com/muchdogesec/stix4doge/main/objects/marking-definition/stix2arango.json"  # this is the stix2arango marking-definition

IDENTITY_REFS = [
    STIX2ARANGO_IDENTITY,
    DOGESEC_IDENTITY
]
MARKING_DEFINITION_REFS = [
    STIX2ARANGO_MARKING_DEFINITION
]
DEFAULT_OBJECT_URL = MARKING_DEFINITION_REFS + IDENTITY_REFS

namespace = UUID("72e906ce-ca1b-5d73-adcd-9ea9eb66a1b4")
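The json_schema dict above only validates the top-level shape of a bundle, and the namespace UUID is the kind of value used to derive deterministic UUIDv5 identifiers. A hedged illustration of both, using the third-party jsonschema package (an assumption here, not necessarily a declared dependency of this release):

import uuid
from jsonschema import validate  # assumption: jsonschema is installed

from stix2arango import config

bundle = {"type": "bundle", "id": "bundle--" + str(uuid.uuid4()), "objects": []}
validate(bundle, config.json_schema)  # raises ValidationError on a malformed bundle

# deterministic ID derived from the package namespace (the seed string is illustrative)
print(uuid.uuid5(config.namespace, "example-seed"))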
stix2arango/services/__init__.py
ADDED
@@ -0,0 +1 @@
from .arangodb_service import ArangoDBService
stix2arango/services/arangodb_service.py
ADDED
@@ -0,0 +1,303 @@
import contextlib
import logging
import re
from typing import Any

import arango.exceptions
from arango.collection import StandardCollection
from arango import ArangoClient

from datetime import datetime, timezone
from tqdm import tqdm

from stix2arango.services.version_annotator import annotate_versions

from .. import config  # noqa: F401 -- imported for its side effects (loads .env, configures logging)
from .. import utils

module_logger = logging.getLogger("data_ingestion_service")


class ArangoDBService:

    def __init__(
        self,
        db,
        vertex_collections,
        edge_collections,
        relationship=None,
        create_db=False,
        create=False,
        username=None,
        password=None,
        host_url=None,
        **kwargs,
    ):
        self.ARANGO_DB = self.get_db_name(db)
        self.ARANGO_GRAPH = f"{self.ARANGO_DB.split('_database')[0]}_graph"
        self.COLLECTIONS_VERTEX = vertex_collections
        self.COLLECTIONS_EDGE = edge_collections
        self.FORCE_RELATIONSHIP = [relationship] if relationship else None
        self.missing_collection = True

        module_logger.info("Establishing connection...")
        client = ArangoClient(hosts=host_url)
        self._client = client

        if create_db:
            module_logger.info(f"create db `{self.ARANGO_DB}` if it does not exist")
            self.sys_db = client.db("_system", username=username, password=password)

            module_logger.info("_system database - OK")

            if not self.sys_db.has_database(self.ARANGO_DB):
                self.create_database(self.ARANGO_DB)

        self.db = client.db(
            self.ARANGO_DB, username=username, password=password, verify=True
        )

        if self.db.has_graph(self.ARANGO_GRAPH):
            self.cti2stix_graph = self.db.graph(self.ARANGO_GRAPH)
        elif create:
            self.cti2stix_graph = self.db.create_graph(self.ARANGO_GRAPH)

        self.collections: dict[str, StandardCollection] = {}
        for collection in self.COLLECTIONS_VERTEX:
            if create:
                self.create_collection(collection)

            self.collections[collection] = self.db.collection(collection)

        for collection in self.COLLECTIONS_EDGE:

            if create:
                try:
                    self.cti2stix_objects_relationship = (
                        self.cti2stix_graph.create_edge_definition(
                            edge_collection=collection,
                            from_vertex_collections=self.COLLECTIONS_VERTEX,
                            to_vertex_collections=self.COLLECTIONS_VERTEX,
                        )
                    )
                except Exception as e:
                    module_logger.debug(
                        f"create edge collection {collection} failed with {e}"
                    )

            self.cti2stix_objects_relationship = self.cti2stix_graph.edge_collection(
                collection
            )
            self.collections[collection] = self.cti2stix_objects_relationship

        module_logger.info("ArangoDB Connected now!")

    def create_database(self, db_name):
        try:
            self.sys_db.create_database(db_name)
        except arango.exceptions.DatabaseCreateError as e:
            module_logger.debug(f"create database {db_name} failed with {e}")

    def create_collection(self, collection_name):
        try:
            return self.db.create_collection(collection_name)
        except arango.exceptions.CollectionCreateError as e:
            module_logger.warning(
                f"create collection {collection_name} failed with {e}"
            )
            return self.db.collection(collection_name)

    def execute_raw_query(self, query: str, bind_vars=None, **kwargs) -> list:
        try:
            cursor = self.db.aql.execute(query, bind_vars=bind_vars, **kwargs)
            result = [doc for doc in cursor]
            return result
        except arango.exceptions.AQLQueryExecuteError:
            module_logger.error(f"AQL exception in the query: {query}")
            raise

    def insert_several_objects(self, objects: list[dict], collection_name: str) -> tuple[list[str], dict]:
        if not collection_name:
            module_logger.info(f"Object has unknown type: {objects}")
            return [], {}

        for obj in objects:
            now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            obj["_is_latest"] = False
            obj["_record_created"] = obj.get("_record_created", now)
            obj["_record_modified"] = now
            obj["_key"] = obj.get("_key", f'{obj["id"]}+{now}')

            if obj["type"] == "relationship":
                obj.update(
                    _target_type=obj["target_ref"].split("--")[0],
                    _source_type=obj["source_ref"].split("--")[0],
                )
        new_insertions = objects
        # maps "<id>;<_record_md5_hash>" to the _id of the document already holding that record
        existing_objects = {}

        d = self.db.collection(collection_name).insert_many(new_insertions, overwrite_mode="ignore", sync=True)
        for i, ret in enumerate(d):
            obj = objects[i]
            if isinstance(ret, arango.exceptions.DocumentInsertError):
                if ret.error_code == 1210:  # unique constraint violated: record already exists
                    existing_objects[f'{obj["id"]};{obj["_record_md5_hash"]}'] = collection_name + '/' + re.search(r'conflicting key: (.*)', ret.message).group(1)
                else:
                    raise ret
        return [obj["id"] for obj in new_insertions], existing_objects

    def insert_several_objects_chunked(
        self, objects, collection_name, chunk_size=1000, remove_duplicates=True
    ):
        if remove_duplicates:
            original_length = len(objects)
            objects = utils.remove_duplicates(objects)
            logging.info(
                "removed {count} duplicates from imported objects.".format(
                    count=original_length - len(objects)
                )
            )

        # the bar counts objects, so it is advanced by len(chunk) after each batch
        progress_bar = tqdm(
            total=len(objects),
            desc="insert_several_objects_chunked",
        )
        inserted_objects = []
        existing_objects = {}
        for chunk in utils.chunked(objects, chunk_size):
            inserted, existing = self.insert_several_objects(chunk, collection_name)
            inserted_objects.extend(inserted)
            existing_objects.update(existing)
            progress_bar.update(len(chunk))
        return inserted_objects, existing_objects

    def insert_relationships_chunked(
        self,
        relationships: list[dict[str, Any]],
        id_to_key_map: dict[str, str],
        collection_name: str,
        chunk_size=1200,
    ):
        for relationship in relationships:
            source_key = id_to_key_map.get(relationship["source_ref"])
            target_key = id_to_key_map.get(relationship["target_ref"])

            relationship["_stix2arango_ref_err"] = not (target_key and source_key)
            relationship["_from"] = self.fix_edge_ref(source_key or relationship["_from"])
            relationship["_to"] = self.fix_edge_ref(target_key or relationship["_to"])
            relationship["_record_md5_hash"] = relationship.get(
                "_record_md5_hash", utils.generate_md5(relationship)
            )
        return self.insert_several_objects_chunked(
            relationships, collection_name, chunk_size=chunk_size
        )

    @staticmethod
    def fix_edge_ref(_id):
        # ensure the reference has a "collection/key" form
        c, _, _key = _id.partition('/')
        if not c:
            c = "missing_collection"
        return f"{c}/{_key}"

    def update_is_latest_several(self, object_ids, collection_name):
        # returns newly deprecated _ids
        query = """
        FOR doc IN @@collection OPTIONS {indexHint: "s2a_search", forceIndexHint: true}
        FILTER doc.id IN @object_ids
        RETURN [doc.id, doc._key, doc.modified, doc._record_modified, doc._is_latest, doc._id]
        """
        out = self.execute_raw_query(
            query,
            bind_vars={
                "@collection": collection_name,
                "object_ids": object_ids,
            },
        )
        out = [dict(zip(('id', '_key', 'modified', '_record_modified', '_is_latest', '_id'), obj_tuple)) for obj_tuple in out]
        annotated, deprecated = annotate_versions(out)
        self.db.collection(collection_name).update_many(annotated, sync=True, keep_none=False)
        return deprecated

    def update_is_latest_several_chunked(
        self, object_ids, collection_name, edge_collection=None, chunk_size=5000
    ):
        logging.info(f"Updating _is_latest for {len(object_ids)} newly inserted items")
        progress_bar = tqdm(
            total=len(object_ids),
            desc="update_is_latest_several_chunked",
        )
        deprecated_key_ids = []  # contains newly deprecated _ids
        for chunk in utils.chunked(object_ids, chunk_size):
            deprecated_key_ids.extend(
                self.update_is_latest_several(chunk, collection_name)
            )
            progress_bar.update(len(chunk))

        logging.info(
            f"Updating relationship's _is_latest for {len(deprecated_key_ids)} items"
        )
        self.deprecate_relationships(deprecated_key_ids, edge_collection)
        return deprecated_key_ids

    def deprecate_relationships(
        self, deprecated_key_ids: list, edge_collection: str, chunk_size=5000
    ):
        keys = self.get_relationships_to_deprecate(deprecated_key_ids, edge_collection)
        self.db.collection(edge_collection).update_many(
            tuple(dict(_key=_key, _is_latest=False) for _key in keys),
            silent=True,
            raise_on_document_error=True,
        )
        return len(keys)

    def get_relationships_to_deprecate(
        self, deprecated_key_ids: list, edge_collection: str
    ):
        query = """
        FOR doc IN @@collection OPTIONS {indexHint: "s2a_search_edge", forceIndexHint: true}
        FILTER doc._from IN @deprecated_key_ids AND doc._is_latest == TRUE
        RETURN doc._id
        """
        items_to_deprecate_full: set[str] = {*deprecated_key_ids}

        # walk deprecations transitively: edges pointing at newly deprecated
        # documents (or at other deprecated edges) are deprecated in turn
        while deprecated_key_ids:
            deprecated_key_ids = self.execute_raw_query(
                query,
                bind_vars={
                    "@collection": edge_collection,
                    "deprecated_key_ids": deprecated_key_ids,
                },
            )
            items_to_deprecate_full.update(deprecated_key_ids)
        return [_id.split("/", 1)[1] for _id in items_to_deprecate_full]

    @staticmethod
    def get_db_name(name):
        ENDING = "_database"
        if name.endswith(ENDING):
            return name
        return name + ENDING

    @contextlib.contextmanager
    def transactional(self, write=None, exclusive=None, sync=True):
        original_db = self.db
        transactional_db = self.db.begin_transaction(allow_implicit=True, write=write, exclusive=exclusive, sync=sync)
        try:
            self.db = transactional_db
            yield self
            transactional_db.commit_transaction()
        except BaseException:
            # abort on any error (including KeyboardInterrupt), then re-raise
            transactional_db.abort_transaction()
            raise
        finally:
            self.db = original_db
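The transactional() context manager temporarily swaps self.db for a transaction database, committing on a clean exit and aborting on any exception. A usage sketch, assuming a reachable ArangoDB instance; the credentials and collection names are illustrative:

from stix2arango.services import ArangoDBService

service = ArangoDBService(
    "cti",                              # expanded to "cti_database" by get_db_name()
    vertex_collections=["demo_vertex_collection"],
    edge_collections=["demo_edge_collection"],
    create=True,
    create_db=True,
    username="root",
    password="",                        # illustrative credentials
    host_url="http://127.0.0.1:8529",
)

with service.transactional(write=["demo_vertex_collection"]) as txn:
    # inside the block, txn.db is the transaction database; an exception here
    # aborts the transaction instead of committing it
    inserted, existing = txn.insert_several_objects(
        [{"id": "indicator--00000000-0000-4000-8000-000000000000",
          "type": "indicator",
          "_record_md5_hash": "d41d8cd98f00b204e9800998ecf8427e"}],
        "demo_vertex_collection",
    )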
stix2arango/services/version_annotator.py
ADDED
@@ -0,0 +1,76 @@
from collections import defaultdict
from typing import List, Dict


def annotate_versions(objects: List[Dict]):
    grouped = defaultdict(list)

    # Group by 'id'
    for obj in objects:
        grouped[obj['id']].append(obj)

    result: list[dict] = []
    deprecated = []

    for obj_id, items in grouped.items():
        # Separate items with non-None modified
        valid_modified = [item for item in items if item.get('modified') is not None]

        # _is_latest: max(modified), ties broken by max(_record_modified)
        if valid_modified:
            max_modified = max(item.get('modified') for item in valid_modified)
            latest_candidates = [item for item in valid_modified if item.get('modified') == max_modified]
            max_record_modified_latest = max(item['_record_modified'] for item in latest_candidates)
        else:
            max_modified = None
            max_record_modified_latest = max(item['_record_modified'] for item in items)
        # _is_earliest: min(modified), ties broken by max(_record_modified)
        if valid_modified:
            min_modified = min(item.get('modified') for item in valid_modified)
            earliest_candidates = [item for item in valid_modified if item.get('modified') == min_modified]
            max_record_modified_earliest = max(item['_record_modified'] for item in earliest_candidates)
        else:
            min_modified = None
            # no `modified` at all: the earliest version is the first record inserted
            max_record_modified_earliest = min(item['_record_modified'] for item in items)

        # _taxii visible: for each modified value (including None), select highest _record_modified
        taxii_visible_keys = set()
        modified_groups = defaultdict(list)
        for item in items:
            modified_groups[item.get('modified')].append(item)

        for mod_val, group in modified_groups.items():
            max_rec_mod = max(i['_record_modified'] for i in group)
            for item in group:
                if item['_record_modified'] == max_rec_mod:
                    taxii_visible_keys.add(item['_key'])

        for item in items:
            is_latest = (
                item.get('modified') == max_modified
                and item['_record_modified'] == max_record_modified_latest
            )

            is_earliest = (
                item.get('modified') == min_modified
                and item['_record_modified'] == max_record_modified_earliest
            )

            # records that were latest but have now been superseded
            if item.get('_is_latest') and not is_latest:
                deprecated.append(item['_id'])
            item['_is_latest'] = is_latest
            item['_taxii'] = dict(visible=item['_key'] in taxii_visible_keys, first=is_earliest, last=is_latest)
            result.append(item)

    return result, deprecated
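Since annotate_versions() operates purely on dicts, its versioning semantics can be checked in isolation. A runnable example with two records of the same STIX id, where the newer `modified` value wins _is_latest and the previously-latest record is reported as deprecated:

from stix2arango.services.version_annotator import annotate_versions

records = [
    {"id": "indicator--x", "_key": "k1", "_id": "c/k1", "_is_latest": True,
     "modified": "2024-01-01T00:00:00.000Z", "_record_modified": "2024-01-01T00:00:00.000Z"},
    {"id": "indicator--x", "_key": "k2", "_id": "c/k2", "_is_latest": False,
     "modified": "2024-02-01T00:00:00.000Z", "_record_modified": "2024-02-02T00:00:00.000Z"},
]
annotated, deprecated = annotate_versions(records)
assert deprecated == ["c/k1"]                             # k1 was latest, now superseded
assert [r["_is_latest"] for r in annotated] == [False, True]
assert all(r["_taxii"]["visible"] for r in annotated)     # distinct modified values stay TAXII-visible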
stix2arango/stix2arango/__init__.py
ADDED
@@ -0,0 +1 @@
from .stix2arango import Stix2Arango
stix2arango/stix2arango/bundle_loader.py
ADDED
@@ -0,0 +1,126 @@
import contextlib
import logging
import os
from pathlib import Path
import sqlite3
import tempfile
import uuid
import ijson
import json


class BundleLoader:
    def __init__(self, file_path, chunk_size_min=20_000, db_path=""):
        self.file_path = Path(file_path)
        self.chunk_size_min = chunk_size_min
        self.groups = None
        self.bundle_id = "bundle--" + str(uuid.uuid4())

        self.db_path = db_path
        if not self.db_path:
            # keep a reference so the temporary file outlives __init__
            self.temp_path = tempfile.NamedTemporaryFile(prefix='s2a_bundle_loader--', suffix='.sqlite')
            self.db_path = self.temp_path.name
        self._init_db()

    def _init_db(self):
        """Initialize SQLite DB with objects table."""
        self.conn = sqlite3.connect(self.db_path)
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS objects (
                id TEXT PRIMARY KEY,
                type TEXT,
                raw TEXT
            )
        ''')
        self.conn.execute('PRAGMA synchronous = OFF;')
        self.conn.execute('PRAGMA journal_mode = MEMORY;')
        self.conn.execute('PRAGMA temp_store = MEMORY;')
        self.conn.commit()

    def save_to_sqlite(self, objects):
        """Save a batch of STIX objects to the SQLite database."""
        self.inserted = getattr(self, 'inserted', 0)

        try:
            self.conn.executemany(
                "INSERT OR REPLACE INTO objects (id, type, raw) VALUES (?, ?, ?)",
                [(obj['id'], obj['type'], json.dumps(obj)) for obj in objects]
            )
        except sqlite3.IntegrityError as e:
            print(f"Failed to insert {len(objects)} objects: {e}")
        else:
            self.conn.commit()
            self.inserted += len(objects)

    def build_groups(self):
        """
        Iterates the STIX bundle and groups object IDs so that for every
        relationship (object of type "relationship"), its own id and its
        source_ref and target_ref end up in the same group.
        """
        # maps each object ID in the file to the IDs it must be grouped with
        all_ids: dict[str, list[str]] = dict()
        logging.info(f"loading into {self.db_path}")

        with open(self.file_path, 'rb') as f:
            objects = ijson.items(f, 'objects.item', use_float=True)
            to_insert = []
            for obj in objects:
                obj_id = obj.get('id')
                to_insert.append(obj)
                all_ids.setdefault(obj_id, [])
                if obj['type'] == 'relationship' and all(x in obj for x in ['target_ref', 'source_ref']):
                    sr, tr = obj['source_ref'], obj['target_ref']
                    all_ids[obj_id].extend([sr, tr])
                    all_ids.setdefault(sr, []).extend([tr, obj_id])
                    all_ids.setdefault(tr, []).extend([sr, obj_id])
                if len(to_insert) >= self.chunk_size_min:
                    self.save_to_sqlite(to_insert)
                    to_insert.clear()
            if to_insert:
                self.save_to_sqlite(to_insert)

        logging.info(f"loaded {self.inserted} into {self.db_path}")
        handled = set()

        self.groups = []
        group = set()

        def from_ids(all_ids):
            # accumulate neighbourhoods into a group until it reaches chunk_size_min
            for obj_id in all_ids:
                if obj_id in handled:
                    continue
                group_objs = {obj_id, *all_ids[obj_id]}
                handled.update(group_objs)
                new_group = group.union(group_objs)
                if len(new_group) >= self.chunk_size_min:
                    group.clear()
                    self.groups.append(tuple(new_group))
                else:
                    group.update(group_objs)

        from_ids(all_ids)
        if group:
            self.groups.append(tuple(group))
        return self.groups

    def load_objects_by_ids(self, ids):
        """Retrieve a list of STIX objects by their IDs from the SQLite database."""
        placeholders = ','.join(['?'] * len(ids))
        query = f"SELECT raw FROM objects WHERE id IN ({placeholders})"
        cursor = self.conn.execute(query, list(ids))
        return [json.loads(row[0]) for row in cursor.fetchall()]

    def get_objects(self, group):
        return list(self.load_objects_by_ids(group))

    @property
    def chunks(self):
        for group in self.groups or self.build_groups():
            yield self.get_objects(group)

    def __del__(self):
        with contextlib.suppress(Exception):
            os.remove(self.db_path)
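Taken together, BundleLoader lets a very large bundle be streamed through SQLite and consumed in relationship-preserving chunks. A usage sketch (the file name is illustrative; ijson must be installed and the file must be a STIX bundle):

from stix2arango.stix2arango.bundle_loader import BundleLoader

loader = BundleLoader("bundle.json", chunk_size_min=20_000)
for chunk in loader.chunks:          # each chunk is a list of parsed STIX objects
    # a relationship and both of its endpoints always arrive in the same chunk,
    # so each chunk can be imported independently
    print(len(chunk), "objects")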