stix2arango 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,601 @@
1
+ from datetime import datetime
2
+ import os
3
+ import json
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ import pkgutil
8
+ import re
9
+ import time
10
+ import uuid
11
+
12
+ from .bundle_loader import BundleLoader
13
+
14
+ from .. import config
15
+ from tqdm import tqdm
16
+ from ..services.arangodb_service import ArangoDBService
17
+ from jsonschema import validate
18
+ from arango.collection import StandardCollection
19
+ import arango.exceptions
20
+
21
+
22
+ from .. import utils
23
+
24
# Size threshold in bytes (80 MiB); input files larger than this are read in
# large-file mode.
LARGE_FILE_SIZE = 80 * 1024 * 1024
# Object types matched when embedded relationships of meta objects are skipped.
SMO_TYPES = [
    "marking-definition",
    "extension-definition",
    "language-content",
]
# Logger used by the ingestion code in this module.
module_logger = logging.getLogger("data_ingestion_service")
27
+
28
+
29
class Stix2Arango:
    """Load the objects of a STIX bundle into ArangoDB vertex and edge collections."""

    # Matches property names made of lowercase letters, `-` and `_` that end in
    # `_ref`/`_refs` (or `-ref`/`-refs`); group 1 captures the part before the suffix.
    EMBEDDED_RELATIONSHIP_RE = re.compile(r"([a-z\-_]+)[_\-]refs{0,1}")
    # Fallback file name stamped on imported objects; __init__ replaces it with
    # the base name of the input file when one is given.
    filename = "bundle.json"
    # Default server URL built from the configured host and port.
    ARANGODB_URL = f"http://{config.ARANGODB_HOST}:{config.ARANGODB_PORT}"
33
+
34
def __init__(
    self,
    database,
    collection,
    file,
    create_collection=True,
    create_db=True,
    stix2arango_note="",
    ignore_embedded_relationships=False,
    ignore_embedded_relationships_sro=True,
    ignore_embedded_relationships_smo=True,
    include_embedded_relationships_attributes=None,
    bundle_id=None,
    username=config.ARANGODB_USERNAME,
    password=config.ARANGODB_PASSWORD,
    host_url=ARANGODB_URL,
    is_large_file=False,
    skip_default_indexes=False,
    create_taxii_views=True,
    **kwargs,
):
    """Connect to ArangoDB and set up the state needed to import one bundle.

    Args:
        database: Database name handed to ``ArangoDBService``.
        collection: Base name from which the vertex and edge collection
            names are derived.
        file: Path of the bundle file; may be falsy when the bundle is
            passed to ``run`` directly.
        create_collection: Forwarded to ``ArangoDBService`` as ``create``;
            also gates index creation below.
        create_db: Forwarded to ``ArangoDBService``.
        stix2arango_note: Default ``_stix2arango_note`` for imported objects.
        ignore_embedded_relationships: Stored; read by ``run_with_bundle``.
        ignore_embedded_relationships_sro: Stored; read when mapping
            embedded relationships.
        ignore_embedded_relationships_smo: Stored; read when mapping
            embedded relationships.
        include_embedded_relationships_attributes: Stored; when truthy it
            overrides the ``ignore_*`` switches.
        bundle_id: Bundle id stamped on imported objects.
        username, password, host_url: Connection settings forwarded to
            ``ArangoDBService``.
        is_large_file: Force large-file mode regardless of file size.
        skip_default_indexes: Skip ``create_default_indexes``.
        create_taxii_views: Run ``create_taxii_views``.
        **kwargs: Extra keyword arguments forwarded to ``ArangoDBService``.
    """
    # Plain in-memory state; nothing here talks to the database.
    self.alter_functions = []
    self.arangodb_extra_data = {}
    self.object_key_mapping = {}
    self.file = file
    self._is_large_file = is_large_file
    self.note = stix2arango_note or ""
    self.bundle_id = bundle_id
    self.ignore_embedded_relationships = ignore_embedded_relationships
    self.ignore_embedded_relationships_smo = ignore_embedded_relationships_smo
    self.ignore_embedded_relationships_sro = ignore_embedded_relationships_sro
    self.include_embedded_relationships_attributes = (
        include_embedded_relationships_attributes
    )

    vertex_name, edge_name = utils.get_vertex_and_edge_collection_names(collection)
    self.core_collection_vertex = vertex_name
    self.core_collection_edge = edge_name

    self.arango = ArangoDBService(
        database,
        [self.core_collection_vertex],
        [self.core_collection_edge],
        create=create_collection,
        create_db=create_db,
        username=username,
        password=password,
        host_url=host_url,
        **kwargs,
    )

    # Reference objects are copied so later mutation does not touch whatever
    # the loader returned.
    self.identity_ref = utils.load_file_from_url(config.STIX2ARANGO_IDENTITY).copy()
    self.default_ref_objects = []
    for link in config.MARKING_DEFINITION_REFS + config.IDENTITY_REFS:
        self.default_ref_objects.append(utils.load_file_from_url(link).copy())

    # NOTE(review): the listing this was reconstructed from had lost its
    # indentation; the two optional steps are assumed to be nested under
    # ``create_collection`` - confirm against the packaged file.
    if create_collection:
        self.create_s2a_indexes()
        if not skip_default_indexes:
            self.create_default_indexes()
        if create_taxii_views:
            self.create_taxii_views()

    if self.file:
        self.filename = Path(self.file).name
104
+
105
@property
def is_large_file(self):
    """Whether the input file should be read in large-file mode.

    Returns the forced flag when it is truthy; otherwise compares the size of
    ``self.file`` on disk with ``LARGE_FILE_SIZE``.
    """
    forced = self._is_large_file
    if forced:
        return forced
    return os.path.getsize(self.file) > LARGE_FILE_SIZE
108
+
109
def alter_objects(self, objects: list[dict]):
    """Apply the extra data and every registered alter function to each object.

    Objects are modified in place. A failing alter function is logged (with
    its traceback) and skipped, so one bad callback does not abort the import.

    Args:
        objects: STIX object dicts to modify.
    """
    for obj in objects:
        obj.update(self.arangodb_extra_data)
        for fn in self.alter_functions:
            try:
                fn(obj)
            except Exception:
                # Log once; exc_info carries the traceback.
                logging.warning(
                    "alter function %s failed on %s", fn, obj, exc_info=True
                )
120
+
121
def add_object_alter_fn(self, modify_fn):
    """Register a callback that ``alter_objects`` runs on every object.

    Args:
        modify_fn: Callable taking one object dict. It should modify the
            dict in place; its return value is discarded.

    Raises:
        ValueError: If ``modify_fn`` is not callable.
    """
    if callable(modify_fn):
        self.alter_functions.append(modify_fn)
        return
    raise ValueError("Bad modification function passed")
125
+
126
def create_s2a_indexes(self):
    """Add the ``s2a_*`` persistent indexes to every managed collection.

    Each collection gets ``s2a_search`` and the unique ``s2a_unique_constraint``
    index; collections whose name ends in ``_edge_collection`` additionally get
    ``s2a_search_edge``.
    """
    for coll_name, coll in self.arango.collections.items():
        search_index = {
            "type": "persistent",
            "name": "s2a_search",
            "sparse": False,
            "fields": ["id", "modified", "_is_latest"],
            "inBackground": True,
            "storedValues": ["_record_modified", "_key", "_id"],
        }
        coll.add_index(search_index)

        unique_index = {
            "type": "persistent",
            "name": "s2a_unique_constraint",
            "unique": True,
            "fields": ["id", "_record_md5_hash"],
            "inBackground": True,
        }
        coll.add_index(unique_index)

        if not coll_name.endswith("_edge_collection"):
            continue

        edge_index = {
            "type": "persistent",
            "name": "s2a_search_edge",
            "sparse": True,
            "fields": ["_from", "_is_latest"],
            "inBackground": True,
            "storedValues": ["_id"],
        }
        coll.add_index(edge_index)
169
+
170
def create_analyzer(self, *args, **kwargs):
    """Create an analyzer on the database, tolerating error code 10.

    All arguments are forwarded to ``self.arango.db.create_analyzer``.

    Returns:
        Whatever the driver returns, or ``None`` when the driver raised
        ``AnalyzerCreateError`` with ``error_code == 10`` (presumably the
        analyzer already exists - confirm against the server error table).

    Raises:
        arango.exceptions.AnalyzerCreateError: For any other error code.
    """
    database = self.arango.db
    try:
        created = database.create_analyzer(*args, **kwargs)
    except arango.exceptions.AnalyzerCreateError as err:
        if err.error_code == 10:
            return None
        raise
    return created
176
+
177
def create_taxii_views(self):
    """Create the ``date_transform`` analyzer and the ``taxii_search`` index.

    The AQL analyzer is created once; then every managed collection gets an
    inverted index named ``taxii_search`` that is primary-sorted ascending on
    ``_record_created``. Despite the method name, only the analyzer and
    indexes are created here - no ArangoSearch view is created.
    """
    self.create_analyzer(
        name="date_transform",
        analyzer_type="aql",
        features=[],
        properties={
            "queryString": "RETURN DATE_TIMESTAMP(@param)*1000 + TO_NUMBER(LAST(REGEX_MATCHES(@param, '.+\\\\.(\\\\d{6}).*')))%1000",
            "collapsePositions": False,
            "keepNull": True,
            "batchSize": 1000,
            "memoryLimit": 10485760,
            "returnType": "number",
        },
    )
    for name, collection in self.arango.collections.items():
        logging.info(f'creating taxii index for {name}')
        collection.add_index(
            dict(
                type="inverted",
                name="taxii_search",
                sparse=True,
                fields=[
                    # Indexed through the analyzer created above.
                    dict(name="_record_created", analyzer="date_transform"),
                    "modified",
                    "id",
                    "_taxii.visible",
                    "_taxii.last",
                    "_taxii.first",
                    "spec_version",
                    "type",
                ],
                inBackground=True,
                storedValues=["_key", "_created"],
                primarySort={
                    "fields": [{"field": "_record_created", "direction": "asc"}]
                },
            )
        )
222
+
223
def create_default_indexes(self):
    """Add the default persistent lookup indexes to every managed collection.

    Every collection gets five indexes (by id, id+type, version, type and
    insertion time). Collections whose name ends in ``_edge_collection`` also
    get three relationship indexes. Each index name is suffixed with the
    current Unix timestamp, taken once per collection.
    """
    # (name prefix, indexed fields, stored values) for every collection.
    common_indexes = (
        (
            "by_stix_id",
            ["id"],
            [
                "modified",
                "created",
                "type",
                "_record_modified",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_id_type",
            ["id", "type"],
            [
                "modified",
                "created",
                "_record_modified",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_version",
            ["modified", "created"],
            [
                "type",
                "_record_modified",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_stix_type",
            ["type"],
            [
                "modified",
                "created",
                "_record_modified",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
        (
            "by_insertion_time",
            ["_record_modified", "_record_created"],
            [
                "modified",
                "created",
                "type",
                "id",
                "spec_version",
                "_record_md5_hash",
            ],
        ),
    )
    # The three edge indexes share one stored-values list.
    edge_stored_values = [
        "modified",
        "created",
        "type",
        "_record_modified",
        "spec_version",
        "_record_md5_hash",
        "id",
    ]
    edge_indexes = (
        (
            "relation_from",
            ["source_ref", "target_ref", "relationship_type"],
            edge_stored_values,
        ),
        (
            "relation_to",
            ["target_ref", "source_ref", "relationship_type"],
            edge_stored_values,
        ),
        (
            "relation_type",
            ["relationship_type", "target_ref", "source_ref"],
            edge_stored_values,
        ),
    )

    for name, collection in self.arango.collections.items():
        module_logger.info(
            f"creating indexes for collection {collection.db_name}/{name}"
        )
        # Named ``suffix`` rather than ``time`` so the imported ``time`` module
        # is not shadowed inside this method.
        suffix = int(datetime.now().timestamp())

        specs = common_indexes
        if name.endswith("_edge_collection"):
            specs = common_indexes + edge_indexes

        for prefix, fields, stored_values in specs:
            collection.add_index(
                dict(
                    type="persistent",
                    # Fresh lists per call so the driver never shares state
                    # with the templates above.
                    fields=list(fields),
                    storedValues=list(stored_values),
                    inBackground=True,
                    name=f"{prefix}_{suffix}",
                )
            )
360
+
361
def default_objects(self):
    """Return the default objects imported alongside every bundle.

    The result is the reference objects loaded in ``__init__`` followed by the
    objects from the packaged ``templates/marking-definition.json``.

    Returns:
        A new list on every call. ``self.default_ref_objects`` is left
        untouched, so repeated calls (one per chunk in large-file mode) do
        not keep appending the template objects to it.
    """
    # Copy the list: appending to the attribute itself would grow it by the
    # template objects on every call.
    object_list = list(self.default_ref_objects)
    object_list.extend(
        json.loads(
            pkgutil.get_data("stix2arango", "templates/marking-definition.json")
        )
    )
    return object_list
368
+
369
def process_bundle_into_graph(
    self, objects_in, notes=None, is_default_objects=False
):
    """Insert every non-relationship object into the vertex collection.

    Each object is annotated in place with ``_stix2arango_note`` (only if
    missing), ``_record_md5_hash`` and the extra data. Objects that are not
    default objects are also given ``_bundle_id`` and ``_file_name``.

    Args:
        objects_in: Iterable of STIX object dicts; relationships are skipped.
        notes: Note used instead of ``self.note`` when truthy.
        is_default_objects: When true, ``_bundle_id`` and ``_file_name`` are
            not set.

    Returns:
        Tuple ``(inserted_object_ids, existing_objects, deprecated_key_ids)``
        as reported by the ArangoDB service.
    """
    module_logger.info(f"Reading vertex from file {self.file} now")

    objects = []
    for obj in tqdm(objects_in, desc="upload_vertices"):
        if obj.get("type") == "relationship":
            continue
        obj.setdefault("_stix2arango_note", notes or self.note)
        # The hash is taken before the bundle/file stamps are added.
        obj["_record_md5_hash"] = utils.generate_md5(obj)
        if not is_default_objects:
            obj["_bundle_id"] = self.bundle_id or ""
            obj["_file_name"] = self.filename or ""
        # NOTE(review): the listing this was reconstructed from had lost its
        # indentation; extra data is assumed to apply to default objects too
        # (i.e. this line is outside the ``if`` above) - confirm.
        obj.update(self.arangodb_extra_data)
        objects.append(obj)

    module_logger.info(
        f"Inserting objects into database. Total objects: {len(objects)}"
    )
    with self.arango.transactional(
        exclusive=[self.core_collection_edge, self.core_collection_vertex]
    ):
        inserted_object_ids, existing_objects = (
            self.arango.insert_several_objects_chunked(
                objects, self.core_collection_vertex
            )
        )
        deprecated_key_ids = self.arango.update_is_latest_several_chunked(
            inserted_object_ids,
            self.core_collection_vertex,
            self.core_collection_edge,
        )

    self.update_object_key_mapping(
        self.core_collection_vertex, objects, existing_objects
    )
    return inserted_object_ids, existing_objects, deprecated_key_ids
411
+
412
def update_object_key_mapping(self, collection, objects, existing_objects=None):
    """Record, for each object id, the database document it resolves to.

    Args:
        collection: Collection name used to build ``"<collection>/<_key>"``.
        objects: Object dicts carrying ``id`` and ``_record_md5_hash`` (and
            ``_key`` once imported).
        existing_objects: Optional mapping from ``"<id>;<md5 hash>"`` to the
            handle of a document that was already stored. ``None`` is
            treated as an empty mapping.

    Objects found in ``existing_objects`` map to that stored handle. All
    others map to ``"<collection>/<_key>"``, with ``not_imported`` standing in
    for a missing ``_key``.
    """
    if existing_objects is None:
        existing_objects = {}

    for obj in objects:
        lookup = f"{obj['id']};{obj['_record_md5_hash']}"
        db_key = existing_objects.get(lookup)
        if not db_key:
            db_key = "{collection}/{_key}".format(
                collection=collection,
                _key=obj.get("_key", "not_imported"),
            )
        self.object_key_mapping[obj["id"]] = db_key
421
+
422
def map_relationships(self, filename, objects_in):
    """Insert the bundle's ``relationship`` objects into the edge collection.

    Each relationship is annotated in place: ``_from``/``_to`` default to the
    vertex collection plus the source/target ref, ``_bundle_id`` and
    ``_file_name`` are overwritten, ``_is_ref`` and ``_stix2arango_note`` are
    set only if missing, and the extra data is merged in.

    Args:
        filename: Value stored as ``_file_name`` on every relationship.
        objects_in: Iterable of STIX object dicts; non-relationships are
            skipped.

    Returns:
        Tuple ``(inserted_object_ids, deprecated_key_ids)`` as reported by the
        ArangoDB service.
    """
    module_logger.info("Mapping Prebuilt Relationship Objects -> ")
    objects = []
    obj: dict
    for obj in tqdm(objects_in, desc="upload_edges"):
        if obj.get("type") != "relationship":
            continue

        source_ref = obj.get("source_ref")
        target_ref = obj.get("target_ref")

        obj.setdefault("_from", f"{self.core_collection_vertex}/{source_ref}")
        obj.setdefault("_to", f"{self.core_collection_vertex}/{target_ref}")
        obj["_bundle_id"] = self.bundle_id or ""
        obj["_file_name"] = filename
        obj.setdefault("_is_ref", False)
        obj.setdefault("_stix2arango_note", self.note)
        obj.update(self.arangodb_extra_data)
        objects.append(obj)

    module_logger.info(
        f"Inserting relationship into database. Total objects: {len(objects)}"
    )
    with self.arango.transactional(
        exclusive=[self.core_collection_edge, self.core_collection_vertex]
    ):
        inserted_object_ids, existing_objects = (
            self.arango.insert_relationships_chunked(
                objects, self.object_key_mapping, self.core_collection_edge
            )
        )
        deprecated_key_ids = self.arango.update_is_latest_several_chunked(
            inserted_object_ids,
            self.core_collection_edge,
            self.core_collection_edge,
        )

    # NOTE(review): the listing this was reconstructed from had lost its
    # indentation; the in-memory mapping update is placed after the
    # transaction, matching process_bundle_into_graph - confirm.
    self.update_object_key_mapping(
        self.core_collection_edge, objects, existing_objects
    )
    return inserted_object_ids, deprecated_key_ids
471
+
472
def map_embedded_relationships(self, bundle_objects, inserted_object_ids):
    """Create edges for the embedded ``_ref``/``_refs`` of inserted objects.

    Only objects whose ``id`` is in ``inserted_object_ids`` are considered.
    Unless ``include_embedded_relationships_attributes`` is set, objects of an
    SMO type and ``relationship`` objects are skipped when the matching
    ``ignore_embedded_relationships_*`` switch is on.

    Args:
        bundle_objects: Iterable of STIX object dicts from the bundle.
        inserted_object_ids: Ids of the objects inserted during this run.

    Returns:
        Tuple ``(inserted_ids, existing_objects)`` accumulated over all
        insert chunks.
    """
    # Membership is tested once per bundle object, so build a set up front
    # instead of scanning the list each time.
    candidate_ids = set(inserted_object_ids)
    explicit_attributes = self.include_embedded_relationships_attributes

    objects = []
    for obj in tqdm(bundle_objects, desc="upload_embedded_edges"):
        if obj["id"] not in candidate_ids:
            continue
        # An explicit attribute list overrides the SMO/SRO ignore switches.
        if not explicit_attributes and (
            (self.ignore_embedded_relationships_smo and obj["type"] in SMO_TYPES)
            or (
                self.ignore_embedded_relationships_sro
                and obj["type"] == "relationship"
            )
        ):
            continue

        for ref_type, targets in utils.get_embedded_refs(
            obj, attributes=explicit_attributes
        ):
            # The helper is handed ``objects`` as ``insert_statement`` and is
            # expected to append the new edge objects to it.
            utils.create_relationship_obj(
                obj=obj,
                source=obj.get("id"),
                targets=targets,
                relationship=ref_type,
                arango_obj=self,
                bundle_id=self.bundle_id or "",
                insert_statement=objects,
                extra_data=self.arangodb_extra_data,
            )

    module_logger.info(
        f"Inserting embedded relationship into database. Total objects: {len(objects)}"
    )

    self.alter_objects(objects)
    # Separate name: the original rebound the ``inserted_object_ids``
    # parameter here, hiding the input it had just filtered on.
    inserted_ids = []
    existing_objects = {}
    for chunk in utils.chunked(objects, 20_000):
        with self.arango.transactional(
            exclusive=[self.core_collection_edge, self.core_collection_vertex]
        ):
            inserted, existing = self.arango.insert_relationships_chunked(
                chunk, self.object_key_mapping, self.core_collection_edge
            )
            inserted_ids.extend(inserted)
            existing_objects.update(existing)
    # NOTE(review): the listing this was reconstructed from had lost its
    # indentation; this call is assumed to run once, after all chunks -
    # confirm.
    self.arango.update_is_latest_several_chunked(
        inserted_ids, self.core_collection_edge, self.core_collection_edge
    )
    return inserted_ids, existing_objects
518
+
519
def import_default_objects(self):
    """Insert the default objects into the vertex collection.

    They are imported with a fixed note and flagged as default objects, so
    they receive no ``_bundle_id`` or ``_file_name``.
    """
    defaults = self.default_objects()
    self.process_bundle_into_graph(
        defaults,
        notes="automatically imported on collection creation",
        is_default_objects=True,
    )
525
+
526
def run(self, data=None):
    """Import a bundle, either passed in directly or read from ``self.file``.

    Args:
        data: Bundle dict. When falsy, the bundle is read from ``self.file``:
            in chunks through ``BundleLoader`` in large-file mode, otherwise
            parsed and schema-validated in one go.

    Raises:
        Exception: ``"file or data must be passed"`` when neither is
            available; ``"Invalid file type"`` when the file cannot be parsed
            as JSON or has no ``id``; ``"Invalid File structure"`` when schema
            validation fails. The last two carry the original error as
            ``__cause__``.
    """
    if not data and not self.file:
        raise Exception("file or data must be passed")

    if not data:
        if self.is_large_file:
            module_logger.info("using large file mode")
            os.makedirs("db_loader_tempfiles", exist_ok=True)
            bundle_loader = BundleLoader(
                self.file, db_path=f"db_loader_tempfiles/mydb_{time.time()}.sqlite"
            )
            # NOTE(review): unlike the regular path, self.bundle_id is not
            # filled from the loader's bundle id here - confirm intended.
            for chunk in bundle_loader.chunks:
                self.run_with_bundle(
                    {
                        "type": "bundle",
                        "objects": chunk,
                        "id": bundle_loader.bundle_id,
                    }
                )
            return

        module_logger.info("using regular file mode")
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(self.file, "r", encoding="utf-8") as input_file:
            file_data = input_file.read()
        try:
            data = json.loads(file_data)
            self.bundle_id = self.bundle_id or data["id"]
        except Exception as e:
            raise Exception("Invalid file type") from e
        # NOTE(review): the listing this was reconstructed from had lost its
        # indentation; schema validation is assumed to apply only to bundles
        # read from file, not to ``data`` passed by the caller - confirm.
        try:
            validate(instance=data, schema=config.json_schema)
        except Exception as e:
            raise Exception("Invalid File structure") from e

    self.run_with_bundle(data)
561
+
562
def run_with_bundle(self, bundle_dict):
    """Import one in-memory bundle: vertices, relationships, embedded edges.

    Steps, in order: apply the alter functions, import the default objects,
    insert the non-relationship objects, insert the relationship objects,
    optionally derive embedded relationships, then deprecate the relationships
    reported as deprecated by the two insert steps.

    Args:
        bundle_dict: Dict with ``type == "bundle"`` and an ``objects`` list.

    Raises:
        Exception: If ``bundle_dict`` is not of type ``bundle``.
    """
    if bundle_dict.get("type") != "bundle":
        raise Exception("Provided file is not a STIX bundle. Aborted")

    bundle_objects = bundle_dict["objects"]
    self.alter_objects(bundle_objects)

    module_logger.info(
        f"Loading default objects from url and store into {self.core_collection_vertex}"
    )
    self.import_default_objects()

    module_logger.info(
        f"Load objects from file: {self.file} and store into {self.core_collection_vertex}"
    )
    vertex_ids, _, vertex_deprecations = self.process_bundle_into_graph(
        bundle_objects
    )

    module_logger.info("Mapping relationships now -> ")
    edge_ids, edge_deprecations = self.map_relationships(
        self.filename, bundle_objects
    )

    # An explicit attribute list re-enables embedded relationships even when
    # they are otherwise ignored.
    embedded_wanted = (
        self.include_embedded_relationships_attributes
        or not self.ignore_embedded_relationships
    )
    if embedded_wanted:
        module_logger.info(
            "Creating new embedded relationships using _refs and _ref"
        )
        self.map_embedded_relationships(bundle_objects, vertex_ids + edge_ids)

    locked = [self.core_collection_edge, self.core_collection_vertex]
    with self.arango.transactional(exclusive=locked):
        for deprecated_keys in (vertex_deprecations, edge_deprecations):
            self.arango.deprecate_relationships(
                deprecated_keys, self.core_collection_edge
            )