stix2arango 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of stix2arango might be problematic. Click here for more details.

@@ -0,0 +1,546 @@
1
+ from datetime import datetime
2
+ import os
3
+ import json
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ import pkgutil
8
+ import re
9
+ import time
10
+ import uuid
11
+
12
+ from .bundle_loader import BundleLoader
13
+
14
+ from .. import config
15
+ from tqdm import tqdm
16
+ from ..services.arangodb_service import ArangoDBService
17
+ from jsonschema import validate
18
+ from arango.collection import StandardCollection
19
+
20
+
21
+ from .. import utils
22
+
23
+ module_logger = logging.getLogger("data_ingestion_service")
24
+ SMO_TYPES = ["marking-definition", "extension-definition", "language-content"]
25
+
26
+
27
+ class Stix2Arango:
28
+ EMBEDDED_RELATIONSHIP_RE = re.compile(r"([a-z\-_]+)[_\-]refs{0,1}")
29
+ filename = "bundle.json"
30
+ ARANGODB_URL = f"http://{config.ARANGODB_HOST}:{config.ARANGODB_PORT}"
31
+
32
+ def __init__(
33
+ self,
34
+ database,
35
+ collection,
36
+ file,
37
+ create_collection=True,
38
+ create_db=True,
39
+ stix2arango_note="",
40
+ ignore_embedded_relationships=False,
41
+ ignore_embedded_relationships_sro=True,
42
+ ignore_embedded_relationships_smo=True,
43
+ bundle_id=None,
44
+ username=config.ARANGODB_USERNAME,
45
+ password=config.ARANGODB_PASSWORD,
46
+ host_url=ARANGODB_URL,
47
+ is_large_file=False,
48
+ skip_default_indexes=False,
49
+ create_taxii_views=True,
50
+ **kwargs,
51
+ ):
52
+ """
53
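+ Connect to ArangoDB and prepare the vertex/edge collections derived from `collection`. Functions registered through `add_object_alter_fn` (its `modify_fn` argument) should modify each object in-place; their returned value is discarded.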
54
+ """
55
+
56
+ self.alter_functions = []
57
+
58
+ self.core_collection_vertex, self.core_collection_edge = (
59
+ utils.get_vertex_and_edge_collection_names(collection)
60
+ )
61
+ EDGE_COLLECTIONS = [self.core_collection_edge]
62
+ VERTEX_COLLECTIONS = [self.core_collection_vertex]
63
+
64
+ self.arango = ArangoDBService(
65
+ database,
66
+ VERTEX_COLLECTIONS,
67
+ EDGE_COLLECTIONS,
68
+ create=create_collection,
69
+ create_db=create_db,
70
+ username=username,
71
+ password=password,
72
+ host_url=host_url,
73
+ **kwargs,
74
+ )
75
+
76
+ self.arangodb_extra_data = {}
77
+
78
+ self.file = file
79
+ self.is_large_file = is_large_file
80
+ self.note = stix2arango_note or ""
81
+ self.identity_ref = utils.load_file_from_url(config.STIX2ARANGO_IDENTITY)
82
+ self.default_ref_objects = [
83
+ utils.load_file_from_url(link)
84
+ for link in config.MARKING_DEFINITION_REFS + config.IDENTITY_REFS
85
+ ]
86
+ self.bundle_id = bundle_id
87
+ self.ignore_embedded_relationships = ignore_embedded_relationships
88
+ self.ignore_embedded_relationships_smo = ignore_embedded_relationships_smo
89
+ self.ignore_embedded_relationships_sro = ignore_embedded_relationships_sro
90
+ self.object_key_mapping = {}
91
+ if create_collection:
92
+ self.create_s2a_indexes()
93
+ if not skip_default_indexes:
94
+ self.create_default_indexes()
95
+ if create_taxii_views:
96
+ self.create_taxii_views()
97
+
98
+ if self.file:
99
+ self.filename = Path(self.file).name
100
+
101
def alter_objects(self, objects: list[dict]):
    """Apply the extra ArangoDB fields and all registered alter functions.

    Every object in `objects` is updated in place with
    `self.arangodb_extra_data` and then handed to each callable in
    `self.alter_functions`. The callables' return values are ignored.

    A callable that raises is logged once (with its traceback) and skipped,
    so the remaining callables and objects are still processed.
    """
    for obj in objects:
        obj.update(self.arangodb_extra_data)
        for fn in self.alter_functions:
            try:
                fn(obj)
            except Exception:
                # Best effort: one failing hook must not abort the import.
                logging.warning(
                    f"alter function {fn} failed on {obj}", exc_info=True
                )
110
+
111
def add_object_alter_fn(self, modify_fn):
    """Register `modify_fn` so that `alter_objects` runs it on every object.

    `modify_fn` receives a single object and should modify it in place;
    whatever it returns is discarded.

    Raises:
        ValueError: if `modify_fn` is not callable.
    """
    if callable(modify_fn):
        self.alter_functions.append(modify_fn)
        return
    raise ValueError("Bad modification function passed")
115
+
116
def create_s2a_indexes(self):
    """Add the `s2a_*` persistent indexes to every known collection.

    All collections receive `s2a_search` and the unique
    `s2a_unique_constraint` index; collections whose name ends with
    "_edge_collection" additionally receive `s2a_search_edge`.
    """
    for coll_name, coll in self.arango.collections.items():
        search_spec = {
            "type": "persistent",
            "name": "s2a_search",
            "sparse": False,
            "fields": ["id", "modified", "_is_latest"],
            "inBackground": True,
            "storedValues": ["_record_modified", "_key", "_id"],
        }
        coll.add_index(search_spec)

        unique_spec = {
            "type": "persistent",
            "name": "s2a_unique_constraint",
            "unique": True,
            "fields": ["id", "_record_md5_hash"],
            "inBackground": True,
        }
        coll.add_index(unique_spec)

        if not coll_name.endswith("_edge_collection"):
            continue

        edge_spec = {
            "type": "persistent",
            "name": "s2a_search_edge",
            "sparse": True,
            "fields": ["_from", "_is_latest"],
            "inBackground": True,
            "storedValues": ["_id"],
        }
        coll.add_index(edge_spec)
159
+
160
def create_taxii_views(self):
    """Add the `taxii_search` inverted index to every known collection.

    NOTE(review): despite the method name, nothing here creates a view;
    only the inverted index is added. An earlier revision collected
    "ats__<collection>" names into a local set that was never read or
    returned; that dead accumulator has been removed.
    """
    for collection in self.arango.collections.values():
        collection.add_index(
            dict(
                type="inverted",
                name="taxii_search",
                sparse=True,
                fields=[
                    "_record_created",
                    "modified",
                    "id",
                    "_taxii.visible",
                    "_taxii.last",
                    "_taxii.first",
                    "spec_version",
                    "type",
                ],
                inBackground=True,
                storedValues=["_key", "_created"],
                primarySort={
                    "fields": [{"field": "_record_created", "direction": "asc"}]
                },
            )
        )
186
+
187
def create_default_indexes(self):
    """Add the default persistent lookup indexes to every known collection.

    Each index name is suffixed with the integer timestamp taken for the
    collection being processed. Collections whose name ends with
    "_edge_collection" also receive the three relationship indexes.
    """
    # (name prefix, indexed fields, stored values) for every collection.
    common_specs = (
        (
            "by_stix_id",
            ["id"],
            [
                "modified",
                "created",
                "type",
                "_record_modified",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_id_type",
            ["id", "type"],
            [
                "modified",
                "created",
                "_record_modified",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_version",
            ["modified", "created"],
            [
                "type",
                "_record_modified",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_type",
            ["type"],
            [
                "modified",
                "created",
                "_record_modified",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_insertion_time",
            ["_record_modified", "_record_created"],
            [
                "modified",
                "created",
                "type",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
    )
    # All relationship indexes share the same stored values.
    edge_stored = [
        "modified",
        "created",
        "type",
        "_record_modified",
        "spec_version",
        "_record_md5_hash",
        "id",
    ]
    edge_specs = (
        ("relation_from", ["source_ref", "target_ref", "relationship_type"], edge_stored),
        ("relation_to", ["target_ref", "source_ref", "relationship_type"], edge_stored),
        ("relation_type", ["relationship_type", "target_ref", "source_ref"], edge_stored),
    )

    for name, collection in self.arango.collections.items():
        module_logger.info(
            f"creating indexes for collection {collection.db_name}/{name}"
        )
        stamp = int(datetime.now().timestamp())

        if name.endswith("_edge_collection"):
            specs = common_specs + edge_specs
        else:
            specs = common_specs

        for prefix, fields, stored in specs:
            collection.add_index(
                {
                    "type": "persistent",
                    # Fresh lists per call so no index definition shares state.
                    "fields": list(fields),
                    "storedValues": list(stored),
                    "inBackground": True,
                    "name": f"{prefix}_{stamp}",
                }
            )
324
+
325
def default_objects(self):
    """Return the default reference objects plus the packaged marking definitions.

    The marking definitions are read from the package data file
    "templates/marking-definition.json".

    A new list is built on every call. Appending to
    `self.default_ref_objects` directly made that attribute grow with
    duplicate entries each time this method ran.
    """
    packaged = json.loads(
        pkgutil.get_data("stix2arango", "templates/marking-definition.json")
    )
    return [*self.default_ref_objects, *packaged]
332
+
333
def process_bundle_into_graph(
    self, objects_in, notes=None, is_default_objects=False
):
    """Insert every non-relationship object of `objects_in` as a vertex.

    Objects whose "type" is "relationship" are skipped. Every other object
    is annotated in place with `_stix2arango_note` (only if absent),
    `_record_md5_hash`, the extra ArangoDB data and, unless
    `is_default_objects` is set, `_bundle_id` and `_file_name`.

    Args:
        objects_in: iterable of STIX object dicts.
        notes: note stored on the objects; falls back to `self.note`.
        is_default_objects: when true, bundle id and file name are not set.

    Returns:
        Tuple `(inserted_object_ids, existing_objects, deprecated_key_ids)`
        as produced by the ArangoDB service calls.
    """
    module_logger.info(f"Reading vertex from file {self.file} now")

    objects = []
    for obj in tqdm(objects_in, desc="upload_vertices"):
        if obj.get("type") == "relationship":
            continue
        obj.setdefault("_stix2arango_note", notes or self.note)
        obj["_record_md5_hash"] = utils.generate_md5(obj)
        if not is_default_objects:
            obj["_bundle_id"] = self.bundle_id or ""
            obj["_file_name"] = self.filename or ""
        obj.update(self.arangodb_extra_data)
        objects.append(obj)

    module_logger.info(
        f"Inserting objects into database. Total objects: {len(objects)}"
    )
    with self.arango.transactional(
        exclusive=[self.core_collection_edge, self.core_collection_vertex]
    ):
        inserted_object_ids, existing_objects = (
            self.arango.insert_several_objects_chunked(
                objects, self.core_collection_vertex
            )
        )
        deprecated_key_ids = self.arango.update_is_latest_several_chunked(
            inserted_object_ids,
            self.core_collection_vertex,
            self.core_collection_edge,
        )

    self.update_object_key_mapping(
        self.core_collection_vertex, objects, existing_objects
    )
    return inserted_object_ids, existing_objects, deprecated_key_ids
369
+
370
def update_object_key_mapping(self, collection, objects, existing_objects=None):
    """Record the database key of each object in `self.object_key_mapping`.

    Args:
        collection: collection name used to build "<collection>/<_key>".
        objects: dicts carrying "id" and "_record_md5_hash" (and usually
            "_key").
        existing_objects: optional mapping from "<id>;<_record_md5_hash>"
            to an already-stored database key. `None` means no known
            existing objects.

    When no truthy entry exists in `existing_objects`, the key is built
    from the object's "_key", using "not_imported" if it has none.
    """
    # None instead of a shared mutable `{}` default.
    if existing_objects is None:
        existing_objects = {}

    for obj in objects:
        lookup = f"{obj['id']};{obj['_record_md5_hash']}"
        db_key = existing_objects.get(lookup)
        if not db_key:
            db_key = "{collection}/{_key}".format(
                collection=collection,
                _key=obj.get("_key", "not_imported"),
            )
        self.object_key_mapping[obj["id"]] = db_key
379
+
380
def map_relationships(self, filename, objects_in):
    """Insert the "relationship" objects of `objects_in` as edges.

    Each relationship is annotated in place: `_from`/`_to` default to
    "<vertex collection>/<source_ref|target_ref>", `_bundle_id` and
    `_file_name` are set, `_is_ref` defaults to False and
    `_stix2arango_note` defaults to `self.note`.

    Args:
        filename: value stored as `_file_name` on every relationship.
        objects_in: iterable of STIX object dicts; non-relationships are
            ignored.

    Returns:
        Tuple `(inserted_object_ids, deprecated_key_ids)` as produced by
        the ArangoDB service calls.
    """
    module_logger.info("Mapping Prebuilt Relationship Objects -> ")
    objects = []
    for obj in tqdm(objects_in, desc="upload_edges"):
        if obj.get("type") != "relationship":
            continue

        source_ref = obj.get("source_ref")
        target_ref = obj.get("target_ref")

        obj.setdefault("_from", f"{self.core_collection_vertex}/{source_ref}")
        obj.setdefault("_to", f"{self.core_collection_vertex}/{target_ref}")
        obj["_bundle_id"] = self.bundle_id or ""
        obj["_file_name"] = filename
        obj.setdefault("_is_ref", False)
        obj.setdefault("_stix2arango_note", self.note)
        obj.update(self.arangodb_extra_data)
        objects.append(obj)

    module_logger.info(
        f"Inserting relationship into database. Total objects: {len(objects)}"
    )
    with self.arango.transactional(
        exclusive=[self.core_collection_edge, self.core_collection_vertex]
    ):
        inserted_object_ids, existing_objects = (
            self.arango.insert_relationships_chunked(
                objects, self.object_key_mapping, self.core_collection_edge
            )
        )
        deprecated_key_ids = self.arango.update_is_latest_several_chunked(
            inserted_object_ids,
            self.core_collection_edge,
            self.core_collection_edge,
        )
        self.update_object_key_mapping(
            self.core_collection_edge, objects, existing_objects
        )
    return inserted_object_ids, deprecated_key_ids
423
+
424
def map_embedded_relationships(self, bundle_objects, inserted_object_ids):
    """Create and insert edges for the embedded references of inserted objects.

    Only objects whose "id" is in `inserted_object_ids` are considered.
    SMO types and relationship objects are skipped when the matching
    `ignore_embedded_relationships_smo` / `ignore_embedded_relationships_sro`
    flag is set.

    Args:
        bundle_objects: iterable of STIX object dicts.
        inserted_object_ids: ids of the objects inserted by this import.

    Returns:
        Tuple `(inserted_object_ids, existing_objects)` returned by the
        edge insertion.
    """
    # Callers pass a (possibly long) list; build the lookup set once
    # instead of scanning the list for every object.
    wanted_ids = set(inserted_object_ids)

    objects = []
    for obj in tqdm(bundle_objects, desc="upload_embedded_edges"):
        if obj["id"] not in wanted_ids:
            continue
        if (
            self.ignore_embedded_relationships_smo and obj["type"] in SMO_TYPES
        ) or (
            self.ignore_embedded_relationships_sro and obj["type"] == "relationship"
        ):
            continue

        for ref_type, targets in utils.get_embedded_refs(obj):
            # NOTE(review): presumably appends the generated edge objects
            # to `objects` (passed as insert_statement) -- confirm in utils.
            utils.create_relationship_obj(
                obj=obj,
                source=obj.get("id"),
                targets=targets,
                relationship=ref_type,
                arango_obj=self,
                bundle_id=self.bundle_id or '',
                insert_statement=objects,
                extra_data=self.arangodb_extra_data,
            )

    module_logger.info(
        f"Inserting embedded relationship into database. Total objects: {len(objects)}"
    )

    self.alter_objects(objects)
    with self.arango.transactional(
        exclusive=[self.core_collection_edge, self.core_collection_vertex]
    ):
        inserted_edge_ids, existing_objects = (
            self.arango.insert_relationships_chunked(
                objects, self.object_key_mapping, self.core_collection_edge
            )
        )
        self.arango.update_is_latest_several_chunked(
            inserted_edge_ids, self.core_collection_edge, self.core_collection_edge
        )
    return inserted_edge_ids, existing_objects
464
+
465
def import_default_objects(self):
    """Insert the objects returned by `default_objects()` as default objects.

    They are stored with a fixed note and with `is_default_objects=True`.
    """
    defaults = self.default_objects()
    self.process_bundle_into_graph(
        defaults,
        notes="automatically imported on collection creation",
        is_default_objects=True,
    )
471
+
472
def run(self, data=None):
    """Import a STIX bundle from `data` or, if absent, from `self.file`.

    Args:
        data: an already-loaded bundle dict. When falsy, the bundle is read
            from `self.file`.

    In large-file mode the file is processed chunk by chunk through
    `BundleLoader`, and each chunk is imported as its own bundle dict.
    Otherwise the file is read as UTF-8 JSON, `self.bundle_id` falls back
    to the bundle's "id", and the bundle is validated against
    `config.json_schema` before being imported.

    Raises:
        Exception: if neither `data` nor `self.file` is set, if the file
            cannot be parsed ("Invalid file type"), or if it fails schema
            validation ("Invalid File structure"). The underlying error is
            chained as `__cause__`.
    """
    if not data and not self.file:
        raise Exception("file or data must be passed")

    if not data:
        if self.is_large_file:
            module_logger.info("using large file mode")
            os.makedirs("db_loader_tempfiles", exist_ok=True)
            bundle_loader = BundleLoader(
                self.file, db_path=f"db_loader_tempfiles/mydb_{time.time()}.sqlite"
            )
            for chunk in bundle_loader.chunks:
                self.run_with_bundle(
                    {
                        "type": "bundle",
                        "objects": chunk,
                        "id": bundle_loader.bundle_id,
                    }
                )
            return

        module_logger.info("using regular file mode")
        # JSON interchange is UTF-8; do not depend on the locale default.
        with open(self.file, "r", encoding="utf-8") as input_file:
            file_data = input_file.read()
        try:
            data = json.loads(file_data)
            self.bundle_id = self.bundle_id or data["id"]
        except Exception as e:
            raise Exception("Invalid file type") from e
        try:
            validate(instance=data, schema=config.json_schema)
        except Exception as e:
            raise Exception("Invalid File structure") from e

    self.run_with_bundle(data)
507
+
508
def run_with_bundle(self, bundle_dict):
    """Import one bundle dict: default objects, vertices, edges, embedded refs.

    Steps, in order: run the alter functions over the bundle's objects,
    import the default objects, insert vertices, insert relationship
    edges, optionally create embedded-reference edges, and finally pass
    the deprecated keys reported by both insert steps to
    `deprecate_relationships`.

    Raises:
        Exception: if `bundle_dict["type"]` is not "bundle".
    """
    if bundle_dict.get("type") != "bundle":
        raise Exception("Provided file is not a STIX bundle. Aborted")

    bundle_objects = bundle_dict["objects"]
    self.alter_objects(bundle_objects)

    module_logger.info(
        f"Loading default objects from url and store into {self.core_collection_vertex}"
    )
    self.import_default_objects()

    module_logger.info(
        f"Load objects from file: {self.file} and store into {self.core_collection_vertex}"
    )
    vertex_result = self.process_bundle_into_graph(bundle_objects)
    vertex_ids, _, stale_from_vertices = vertex_result

    module_logger.info("Mapping relationships now -> ")
    edge_result = self.map_relationships(self.filename, bundle_objects)
    edge_ids, stale_from_edges = edge_result

    if not self.ignore_embedded_relationships:
        module_logger.info(
            "Creating new embedded relationships using _refs and _ref"
        )
        self.map_embedded_relationships(bundle_objects, vertex_ids + edge_ids)

    locked = [self.core_collection_edge, self.core_collection_vertex]
    with self.arango.transactional(exclusive=locked):
        for stale_keys in (stale_from_vertices, stale_from_edges):
            self.arango.deprecate_relationships(
                stale_keys, self.core_collection_edge
            )
@@ -0,0 +1,111 @@
1
+ [
2
+ {
3
+ "type": "marking-definition",
4
+ "spec_version": "2.1",
5
+ "id": "marking-definition--613f2e26-407d-48c7-9eca-b8e91df99dc9",
6
+ "created": "2017-01-20T00:00:00.000Z",
7
+ "definition_type": "tlp",
8
+ "name": "TLP:WHITE",
9
+ "definition": {
10
+ "tlp": "white"
11
+ }
12
+ },
13
+ {
14
+ "type": "marking-definition",
15
+ "spec_version": "2.1",
16
+ "id": "marking-definition--34098fce-860f-48ae-8e50-ebd3cc5e41da",
17
+ "created": "2017-01-20T00:00:00.000Z",
18
+ "definition_type": "tlp",
19
+ "name": "TLP:GREEN",
20
+ "definition": {
21
+ "tlp": "green"
22
+ }
23
+ },
24
+ {
25
+ "type": "marking-definition",
26
+ "spec_version": "2.1",
27
+ "id": "marking-definition--f88d31f6-486f-44da-b317-01333bde0b82",
28
+ "created": "2017-01-20T00:00:00.000Z",
29
+ "definition_type": "tlp",
30
+ "name": "TLP:AMBER",
31
+ "definition": {
32
+ "tlp": "amber"
33
+ }
34
+ },
35
+ {
36
+ "type": "marking-definition",
37
+ "spec_version": "2.1",
38
+ "id": "marking-definition--5e57c739-391a-4eb3-b6be-7d15ca92d5ed",
39
+ "created": "2017-01-20T00:00:00.000Z",
40
+ "definition_type": "tlp",
41
+ "name": "TLP:RED",
42
+ "definition": {
43
+ "tlp": "red"
44
+ }
45
+ },
46
+ {
47
+ "type": "marking-definition",
48
+ "spec_version": "2.1",
49
+ "id": "marking-definition--94868c89-83c2-464b-929b-a1a8aa3c8487",
50
+ "created": "2022-10-01T00:00:00.000Z",
51
+ "name": "TLP:CLEAR",
52
+ "extensions": {
53
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
54
+ "extension_type": "property-extension",
55
+ "tlp_2_0" : "clear"
56
+ }
57
+ }
58
+ },
59
+ {
60
+ "type": "marking-definition",
61
+ "spec_version": "2.1",
62
+ "id": "marking-definition--bab4a63c-aed9-4cf5-a766-dfca5abac2bb",
63
+ "created": "2022-10-01T00:00:00.000Z",
64
+ "name": "TLP:GREEN",
65
+ "extensions": {
66
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
67
+ "extension_type": "property-extension",
68
+ "tlp_2_0" : "green"
69
+ }
70
+ }
71
+ },
72
+ {
73
+ "type": "marking-definition",
74
+ "spec_version": "2.1",
75
+ "id": "marking-definition--55d920b0-5e8b-4f79-9ee9-91f868d9b421",
76
+ "created": "2022-10-01T00:00:00.000Z",
77
+ "name": "TLP:AMBER",
78
+ "extensions": {
79
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
80
+ "extension_type": "property-extension",
81
+ "tlp_2_0" : "amber"
82
+ }
83
+ }
84
+ },
85
+ {
86
+ "type": "marking-definition",
87
+ "spec_version": "2.1",
88
+ "id": "marking-definition--939a9414-2ddd-4d32-a0cd-375ea402b003",
89
+ "created": "2022-10-01T00:00:00.000Z",
90
+ "name": "TLP:AMBER+STRICT",
91
+ "extensions": {
92
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
93
+ "extension_type": "property-extension",
94
+ "tlp_2_0" : "amber+strict"
95
+ }
96
+ }
97
+ },
98
+ {
99
+ "type": "marking-definition",
100
+ "spec_version": "2.1",
101
+ "id": "marking-definition--e828b379-4e03-4974-9ac4-e53a884c97c1",
102
+ "created": "2022-10-01T00:00:00.000Z",
103
+ "name": "TLP:RED",
104
+ "extensions": {
105
+ "extension-definition--60a3c5c5-0d10-413e-aab3-9e08dde9e88d": {
106
+ "extension_type": "property-extension",
107
+ "tlp_2_0" : "red"
108
+ }
109
+ }
110
+ }
111
+ ]