assemblyline-core 4.4.2.dev7__tar.gz → 4.5.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-core might be problematic.

Files changed (87)
  1. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/PKG-INFO +1 -4
  2. assemblyline-core-4.5.0.dev2/assemblyline_core/VERSION +1 -0
  3. assemblyline-core-4.5.0.dev2/assemblyline_core/badlist_client.py +134 -0
  4. assemblyline-core-4.5.0.dev2/assemblyline_core/replay/client.py +447 -0
  5. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/creator/run.py +13 -1
  6. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/creator/run_worker.py +75 -0
  7. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/loader/run.py +1 -1
  8. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/loader/run_worker.py +7 -3
  9. assemblyline-core-4.5.0.dev2/assemblyline_core/safelist_client.py +136 -0
  10. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core.egg-info/PKG-INFO +1 -4
  11. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_replay.py +105 -18
  12. assemblyline-core-4.4.2.dev7/assemblyline_core/VERSION +0 -1
  13. assemblyline-core-4.4.2.dev7/assemblyline_core/badlist_client.py +0 -47
  14. assemblyline-core-4.4.2.dev7/assemblyline_core/replay/client.py +0 -315
  15. assemblyline-core-4.4.2.dev7/assemblyline_core/safelist_client.py +0 -60
  16. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/LICENCE.md +0 -0
  17. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/README.md +0 -0
  18. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/__init__.py +0 -0
  19. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/alerter/__init__.py +0 -0
  20. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/alerter/processing.py +0 -0
  21. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/alerter/run_alerter.py +0 -0
  22. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/archiver/__init__.py +0 -0
  23. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/archiver/run_archiver.py +0 -0
  24. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/__init__.py +0 -0
  25. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/__main__.py +0 -0
  26. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/client.py +0 -0
  27. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/dispatcher.py +0 -0
  28. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/schedules.py +0 -0
  29. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/dispatching/timeout.py +0 -0
  30. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/expiry/__init__.py +0 -0
  31. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/expiry/run_expiry.py +0 -0
  32. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/ingester/__init__.py +0 -0
  33. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/ingester/__main__.py +0 -0
  34. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/ingester/constants.py +0 -0
  35. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/ingester/ingester.py +0 -0
  36. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/__init__.py +0 -0
  37. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/es_metrics.py +0 -0
  38. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  39. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/helper.py +0 -0
  40. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/metrics_server.py +0 -0
  41. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  42. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  43. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  44. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/plumber/__init__.py +0 -0
  45. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/plumber/run_plumber.py +0 -0
  46. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/__init__.py +0 -0
  47. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/creator/__init__.py +0 -0
  48. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/loader/__init__.py +0 -0
  49. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/replay.py +0 -0
  50. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/__init__.py +0 -0
  51. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/collection.py +0 -0
  52. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  53. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  54. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/controllers/interface.py +0 -0
  55. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +0 -0
  56. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/run_scaler.py +0 -0
  57. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/scaler/scaler_server.py +0 -0
  58. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/server_base.py +0 -0
  59. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/submission_client.py +0 -0
  60. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/tasking_client.py +0 -0
  61. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/updater/__init__.py +0 -0
  62. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/updater/helper.py +0 -0
  63. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/updater/run_updater.py +0 -0
  64. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/__init__.py +0 -0
  65. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/crawler.py +0 -0
  66. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/department_map.py +0 -0
  67. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/safelist.py +0 -0
  68. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/stream_map.py +0 -0
  69. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/vacuum/worker.py +0 -0
  70. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/workflow/__init__.py +0 -0
  71. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/workflow/run_workflow.py +0 -0
  72. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  73. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  74. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core.egg-info/requires.txt +0 -0
  75. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core.egg-info/top_level.txt +0 -0
  76. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/setup.cfg +0 -0
  77. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/setup.py +0 -0
  78. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_alerter.py +0 -0
  79. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_dispatcher.py +0 -0
  80. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_expiry.py +0 -0
  81. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_plumber.py +0 -0
  82. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_scaler.py +0 -0
  83. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_scheduler.py +0 -0
  84. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_simulation.py +0 -0
  85. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_vacuum.py +0 -0
  86. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_worker_ingest.py +0 -0
  87. {assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/test/test_worker_submit.py +0 -0
{assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/PKG-INFO
@@ -1,13 +1,12 @@
  Metadata-Version: 2.1
  Name: assemblyline-core
- Version: 4.4.2.dev7
+ Version: 4.5.0.dev2
  Summary: Assemblyline 4 - Core components
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
  Author: CCCS Assemblyline development team
  Author-email: assemblyline@cyber.gc.ca
  License: MIT
  Keywords: assemblyline automated malware analysis gc canada cse-cst cse cst cyber cccs
- Platform: UNKNOWN
  Classifier: Development Status :: 5 - Production/Stable
  Classifier: Intended Audience :: Developers
  Classifier: Topic :: Software Development :: Libraries
@@ -59,5 +58,3 @@ Make sure the different services get their latest update files.
  ##### Workflow

  Run the different workflows in the system and apply their labels, priority and status.
-
-
assemblyline-core-4.5.0.dev2/assemblyline_core/VERSION
@@ -0,0 +1 @@
+ 4.5.0.dev2
assemblyline-core-4.5.0.dev2/assemblyline_core/badlist_client.py
@@ -0,0 +1,134 @@
+ import hashlib
+ import logging
+
+ from assemblyline.common import forge
+ from assemblyline.common.chunk import chunk
+ from assemblyline.common.isotime import now_as_iso
+ from assemblyline.datastore.helper import AssemblylineDatastore
+
+ CHUNK_SIZE = 1000
+ CLASSIFICATION = forge.get_classification()
+
+
+ class InvalidBadhash(Exception):
+     pass
+
+
+ class BadlistClient:
+     """A helper class to simplify badlisting for privileged services and service-server."""
+
+     def __init__(self, datastore: AssemblylineDatastore = None, config=None):
+         self.log = logging.getLogger('assemblyline.badlist_client')
+         self.config = config or forge.CachedObject(forge.get_config)
+         self.datastore = datastore or forge.get_datastore(self.config)
+
+     # Badlist
+     def exists(self, qhash):
+         return self.datastore.badlist.get_if_exists(qhash, as_obj=False)
+
+     def exists_tags(self, tag_map):
+         lookup_keys = []
+         for tag_type, tag_values in tag_map.items():
+             for tag_value in tag_values:
+                 lookup_keys.append(hashlib.sha256(f"{tag_type}: {tag_value}".encode('utf8')).hexdigest())
+
+         # Elasticsearch's result window can't be more than 10000 rows,
+         # so we will query for matches in chunks
+         results = []
+         for key_chunk in chunk(lookup_keys, CHUNK_SIZE):
+             results += self.datastore.badlist.search("*", fl="*", rows=CHUNK_SIZE,
+                                                      as_obj=False, key_space=key_chunk)['items']
+
+         return results
+
+     def find_similar_tlsh(self, tlsh):
+         return self.datastore.badlist.search(f"hashes.tlsh:{tlsh}", fl="*", as_obj=False)['items']
+
+     def find_similar_ssdeep(self, ssdeep):
+         try:
+             _, long, _ = ssdeep.replace('/', '\\/').split(":")
+             return self.datastore.badlist.search(f"hashes.ssdeep:{long}~", fl="*", as_obj=False)['items']
+         except ValueError:
+             self.log.warning(f'This is not a valid SSDeep hash: {ssdeep}')
+             return []
+
+     @staticmethod
+     def _merge_hashes(new, old):
+         # Account for the possibility of merging with null types
+         if not (new or old):
+             # Both are null
+             raise ValueError("New and old are both null")
+         elif not (new and old):
+             # Only one is null, in which case return the other
+             return new or old
+
+         try:
+             # Check if hash types match
+             if new['type'] != old['type']:
+                 raise InvalidBadhash(f"Bad hash type mismatch: {new['type']} != {old['type']}")
+
+             # Use the new classification but we will recompute it later anyway
+             old['classification'] = new['classification']
+
+             # Update updated time
+             old['updated'] = new.get('updated', now_as_iso())
+
+             # Update hashes
+             old['hashes'].update({k: v for k, v in new['hashes'].items() if v})
+
+             # Merge attributions
+             if not old['attribution']:
+                 old['attribution'] = new.get('attribution', None)
+             elif new.get('attribution', None):
+                 for key in ['actor', 'campaign', 'category', 'exploit', 'implant', 'family', 'network']:
+                     old_value = old['attribution'].get(key, []) or []
+                     new_value = new['attribution'].get(key, []) or []
+                     old['attribution'][key] = list(set(old_value + new_value)) or None
+
+             if old['attribution'] is not None:
+                 old['attribution'] = {key: value for key, value in old['attribution'].items() if value}
+
+             # Update type specific info
+             if old['type'] == 'file':
+                 old.setdefault('file', {})
+                 new_names = new.get('file', {}).pop('name', [])
+                 if 'name' in old['file']:
+                     for name in new_names:
+                         if name not in old['file']['name']:
+                             old['file']['name'].append(name)
+                 elif new_names:
+                     old['file']['name'] = new_names
+                 old['file'].update({k: v for k, v in new.get('file', {}).items() if v})
+             elif old['type'] == 'tag':
+                 old['tag'] = new['tag']
+
+             # Merge sources
+             src_map = {x['name']: x for x in new['sources']}
+             if not src_map:
+                 raise InvalidBadhash("No valid source found")
+
+             old_src_map = {x['name']: x for x in old['sources']}
+             for name, src in src_map.items():
+                 if name not in old_src_map:
+                     old_src_map[name] = src
+                 else:
+                     old_src = old_src_map[name]
+                     if old_src['type'] != src['type']:
+                         raise InvalidBadhash(f"Source {name} has a type conflict: {old_src['type']} != {src['type']}")
+
+                     for reason in src['reason']:
+                         if reason not in old_src['reason']:
+                             old_src['reason'].append(reason)
+                     old_src['classification'] = src.get('classification', old_src['classification'])
+             old['sources'] = list(old_src_map.values())
+
+             # Calculate the new classification
+             for src in old['sources']:
+                 old['classification'] = CLASSIFICATION.max_classification(
+                     old['classification'], src.get('classification', None))
+
+             # Set the expiry
+             old['expiry_ts'] = new.get('expiry_ts', None)
+             return old
+         except Exception as e:
+             raise InvalidBadhash(f"Invalid data provided: {str(e)}")
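
Since badlist_client.py is entirely new in this release, a usage sketch may help readers of the diff. The records below are hand-made and purely illustrative, and the import assumes an installed, configured assemblyline environment; nothing here is taken from the package itself.

# Illustrative sketch only: hand-made records, not data from this release.
# Assumes an installed assemblyline package with a working classification
# engine, since importing badlist_client calls forge.get_classification().
from assemblyline_core.badlist_client import BadlistClient, CLASSIFICATION

cl = CLASSIFICATION.UNRESTRICTED
old = {
    'type': 'file', 'classification': cl, 'attribution': None,
    'hashes': {'sha256': 'a' * 64},
    'file': {'name': ['dropper.exe']},
    'sources': [{'name': 'feed_a', 'type': 'external',
                 'reason': ['known malware'], 'classification': cl}],
}
new = {
    'type': 'file', 'classification': cl,
    'hashes': {'sha256': 'a' * 64, 'md5': 'b' * 32},
    'attribution': {'family': ['XWORM']},
    'file': {'name': ['payload.bin']},
    'sources': [{'name': 'feed_b', 'type': 'external',
                 'reason': ['sandbox detonation'], 'classification': cl}],
}

merged = BadlistClient._merge_hashes(new, old)
# The merge keeps both file names, unions the sources, picks up the
# new md5 and recomputes the classification across all sources.
assert sorted(merged['file']['name']) == ['dropper.exe', 'payload.bin']
assert len(merged['sources']) == 2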
assemblyline-core-4.5.0.dev2/assemblyline_core/replay/client.py
@@ -0,0 +1,447 @@
+ import json
+ import os
+ import time
+
+ from assemblyline.common import forge
+ from assemblyline.common.bundling import create_bundle, import_bundle
+ from assemblyline.odm import Model
+ from assemblyline.remote.datatypes.queues.named import NamedQueue
+ from assemblyline.remote.datatypes.hash import Hash
+ from assemblyline_core.badlist_client import BadlistClient
+ from assemblyline_core.safelist_client import SafelistClient
+
+ EMPTY_WAIT_TIME = int(os.environ.get('EMPTY_WAIT_TIME', '30'))
+ REPLAY_REQUESTED = 'requested'
+ REPLAY_PENDING = 'pending'
+ REPLAY_DONE = 'done'
+
+
+ class ClientBase(object):
+     def __init__(self, log, lookback_time='*',
+                  alert_fqs=None, badlist_fqs=None, safelist_fqs=None, submission_fqs=None, workflow_fqs=None):
+         # Set logger
+         self.log = log
+
+         # Setup timing
+         self.last_alert_time = self.last_submission_time = self.lookback_time = lookback_time
+
+         # Setup filter queries
+         self.pending_fq = f'NOT metadata.replay:{REPLAY_PENDING}'
+         self.done_fq = f'NOT metadata.replay:{REPLAY_DONE}'
+         self.alert_fqs = alert_fqs or []
+         self.badlist_fqs = badlist_fqs or []
+         self.safelist_fqs = safelist_fqs or []
+         self.submission_fqs = submission_fqs or []
+         self.workflow_fqs = workflow_fqs or []
+
+         # Set running flag
+         self.running = True
+
+     def _put_checkpoint(self, *_):
+         raise NotImplementedError()
+
+     def _get_checkpoint(self, *_):
+         raise NotImplementedError()
+
+     def _get_next_object_ids(self, collection, query, filter_queries, fl, sort):
+         raise NotImplementedError()
+
+     def _get_next_alert_ids(self, query, filter_queries):
+         return self._get_next_object_ids("alert", query, filter_queries, "alert_id,reporting_ts", "reporting_ts asc")
+
+     def _get_next_submission_ids(self, query, filter_queries):
+         return self._get_next_object_ids("submission", query, filter_queries, "sid,times.completed",
+                                          "times.completed asc")
+
+     def _set_bulk_object_pending(self, collection, query, filter_queries, max_docs):
+         raise NotImplementedError()
+
+     def _set_bulk_alert_pending(self, query, filter_queries, max_docs):
+         self._set_bulk_object_pending("alert", query, filter_queries, max_docs)
+
+     def _set_bulk_submission_pending(self, query, filter_queries, max_docs):
+         self._set_bulk_object_pending("submission", query, filter_queries, max_docs)
+
+     def _stream_objects(self, collection, query, fl="*", filter_queries=[]):
+         raise NotImplementedError()
+
+     def _stream_alert_ids(self, query):
+         return self._stream_objects("alert", query, "alert_id,reporting_ts")
+
+     def _stream_submission_ids(self, query):
+         return self._stream_objects("submission", query, "sid,times.completed")
+
+     def create_al_bundle(self, id, bundle_path, use_alert=False):
+         raise NotImplementedError()
+
+     def create_alert_bundle(self, alert_id, bundle_path):
+         self.create_al_bundle(alert_id, bundle_path, use_alert=True)
+
+     def create_submission_bundle(self, sid, bundle_path):
+         self.create_al_bundle(sid, bundle_path)
+
+     def load_bundle(self, *_):
+         raise NotImplementedError()
+
+     def load_json(self, *_):
+         raise NotImplementedError()
+
+     def stop(self):
+         self.running = False
+
+     def set_single_object_complete(self, collection, id):
+         raise NotImplementedError()
+
+     def set_single_alert_complete(self, alert_id):
+         self.set_single_object_complete("alert", alert_id)
+
+     def set_single_submission_complete(self, sid):
+         self.set_single_object_complete("submission", sid)
+
+     def setup_alert_input_queue(self, once=False):
+         # Bootstrap recovery of pending replayed alerts
+         for a in self._stream_alert_ids(f"metadata.replay:{REPLAY_PENDING}"):
+             self.log.info(f"Replaying alert: {a['alert_id']}")
+             self.put_alert(a)
+
+         # Create the list of filter queries
+         processing_fqs = self.alert_fqs + [self.pending_fq, self.done_fq]
+
+         # Run
+         while self.running:
+             # Find alerts
+             alert_input_query = f"reporting_ts:{{{self.last_alert_time} TO now]"
+             alerts = self._get_next_alert_ids(alert_input_query, processing_fqs)
+
+             # Set their pending state
+             if alerts['items']:
+                 last_time = alerts['items'][-1]['reporting_ts']
+                 bulk_query = f"reporting_ts:{{{self.last_alert_time} TO {last_time}]"
+                 count = len(alerts['items'])
+                 self._set_bulk_alert_pending(bulk_query, processing_fqs, count)
+                 self.last_alert_time = last_time
+
+             # Queue them
+             for a in alerts['items']:
+                 self.log.info(f"Replaying alert: {a['alert_id']}")
+                 self.put_alert(a)
+
+             # Wait if nothing found
+             if alerts['total'] == 0:
+                 self.last_alert_time = self.lookback_time
+                 for _ in range(EMPTY_WAIT_TIME):
+                     if not self.running:
+                         break
+                     time.sleep(1)
+
+             if once:
+                 break
+
+     def setup_submission_input_queue(self, once=False):
+         # Bootstrap recovery of pending replayed submissions
+         for sub in self._stream_submission_ids(f"metadata.replay:{REPLAY_PENDING}"):
+             self.log.info(f"Replaying submission: {sub['sid']}")
+             self.put_submission(sub)
+
+         # Create the list of filter queries
+         processing_fqs = self.submission_fqs + [self.pending_fq, self.done_fq]
+
+         # Run
+         while self.running:
+             # Find submissions
+             sub_query = f"times.completed:[{self.last_submission_time} TO now]"
+             submissions = self._get_next_submission_ids(sub_query, processing_fqs)
+
+             # Set their pending state
+             if submissions['items']:
+                 last_time = submissions['items'][-1]['times']['completed']
+                 bulk_query = f"times.completed:[{self.last_submission_time} TO {last_time}]"
+                 count = len(submissions['items'])
+                 self._set_bulk_submission_pending(bulk_query, processing_fqs, count)
+                 self.last_submission_time = last_time
+
+             # Queue them
+             for sub in submissions['items']:
+                 self.log.info(f"Replaying submission: {sub['sid']}")
+                 self.put_submission(sub)
+
+             # Wait if nothing found
+             if submissions['total'] == 0:
+                 self.last_submission_time = self.lookback_time
+                 for _ in range(EMPTY_WAIT_TIME):
+                     if not self.running:
+                         break
+                     time.sleep(1)
+
+             if once:
+                 break
+
+     def _setup_checkpoint_based_input_queue(self, collection: str, id_field: str, date_field: str, once=False):
+         # At bootstrap, get the last checkpoint
+         checkpoint = self._get_checkpoint(collection)
+         fqs = getattr(self, f"{collection}_fqs")
+
+         # Run
+         while self.running:
+             # Find objects of the collection that haven't been replayed
+             for obj in self._stream_objects(
+                     collection, f"{date_field}:[{checkpoint} TO now]", fl="*,id", filter_queries=fqs):
+                 self.log.info(f"Replaying {collection}: {obj[id_field]}")
+                 # Submit to the named queue to be tasked to worker(s) for replay
+                 self.put_message(collection, obj)
+                 # Update checkpoint
+                 checkpoint = obj[date_field]
+
+             # Wait if there are no more items to queue at this time
+             if self._query(collection, f"{date_field}:[{checkpoint} TO now]", fqs, rows=0)['total'] == 0:
+                 for _ in range(EMPTY_WAIT_TIME):
+                     if not self.running:
+                         break
+                     time.sleep(1)
+
+             if once:
+                 break
+
+     def setup_workflow_input_queue(self, once=False):
+         self._setup_checkpoint_based_input_queue("workflow", "workflow_id", "last_edit", once)
+
+     def setup_badlist_input_queue(self, once=False):
+         self._setup_checkpoint_based_input_queue("badlist", "id", "updated", once)
+
+     def setup_safelist_input_queue(self, once=False):
+         self._setup_checkpoint_based_input_queue("safelist", "id", "updated", once)
+
+     def _query(self, collection, query, filter_queries=[], rows=None, track_total_hits=False):
+         raise NotImplementedError()
+
+     def query_alerts(self, query="*", track_total_hits=False):
+         return self._query("alert", query, track_total_hits=track_total_hits)
+
+     def get_next_message(self, message_type):
+         raise NotImplementedError()
+
+     def get_next_alert(self):
+         return self.get_next_message("alert")
+
+     def get_next_badlist(self):
+         return self.get_next_message("badlist")
+
+     def get_next_file(self):
+         return self.get_next_message("file")
+
+     def get_next_safelist(self):
+         return self.get_next_message("safelist")
+
+     def get_next_submission(self):
+         return self.get_next_message("submission")
+
+     def get_next_workflow(self):
+         return self.get_next_message("workflow")
+
+     def put_message(self, message_type, message):
+         raise NotImplementedError()
+
+     def put_alert(self, alert):
+         self.put_message("alert", alert)
+
+     def put_badlist(self, badlist):
+         self.put_message("badlist", badlist)
+
+     def put_file(self, path):
+         self.put_message("file", path)
+
+     def put_safelist(self, safelist):
+         self.put_message("safelist", safelist)
+
+     def put_submission(self, submission):
+         self.put_message("submission", submission)
+
+     def put_workflow(self, workflow):
+         self.put_message("workflow", workflow)
+
+
+ class APIClient(ClientBase):
+     def __init__(self, log, host, user, apikey, verify, **kwargs):
+         from assemblyline_client import get_client
+
+         # Setup AL client
+         self.al_client = get_client(host, apikey=(user, apikey), verify=verify)
+
+         super().__init__(log, **kwargs)
+
+     def _put_checkpoint(self, collection, checkpoint):
+         return self.al_client.replay.put_checkpoint(collection, checkpoint)
+
+     def _get_checkpoint(self, collection):
+         return self.al_client.replay.get_checkpoint(collection)
+
+     def _get_next_object_ids(self, collection, query, filter_queries, fl, sort):
+         return getattr(self.al_client.search, collection)(query, fl=fl, sort=sort, rows=100, filters=filter_queries)
+
+     def _set_bulk_object_pending(self, collection, query, filter_queries, max_docs):
+         self.al_client.replay.set_bulk_pending(collection, query, filter_queries, max_docs)
+
+     def _stream_objects(self, collection, query, fl="*", filter_queries=[]):
+         return getattr(self.al_client.search.stream, collection)(query, fl=fl, filters=filter_queries, as_obj=False)
+
+     def create_al_bundle(self, id, bundle_path, use_alert=False):
+         self.al_client.bundle.create(id, output=bundle_path, use_alert=use_alert)
+
+     def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True):
+         self.al_client.bundle.import_bundle(bundle_path,
+                                             min_classification=min_classification,
+                                             rescan_services=rescan_services,
+                                             exist_ok=exist_ok)
+
+     def load_json(self, file_path):
+         from assemblyline_client import ClientError
+
+         # We're assuming all JSON being loaded has an "enabled" field
+         collection = os.path.basename(file_path).split('_', 1)[0]
+         with open(file_path) as fp:
+             data_blob = json.load(fp)
+
+         if isinstance(data_blob, list):
+             for data in data_blob:
+                 id = data.pop("id")
+                 try:
+                     # Let's see if there's an existing document with the same ID in the collection
+                     obj = getattr(self.al_client, collection)(id)
+
+                     if collection == "workflow":
+                         # If there have been any edits by another user, then preserve the enabled state
+                         # Otherwise, the workflow will be synchronized with the origin system
+                         if obj['edited_by'] != data['edited_by']:
+                             data['enabled'] = obj["enabled"]
+
+                         self.al_client.workflow.update(id, data)
+                     elif collection == "badlist":
+                         data['enabled'] = obj["enabled"]
+                         self.al_client.badlist.add_update(data)
+                     elif collection == "safelist":
+                         data['enabled'] = obj["enabled"]
+                         self.al_client.safelist.add_update(data)
+                 except ClientError as e:
+                     if e.status_code == 404:
+                         # The document doesn't exist in the system, therefore create it
+                         if collection == "workflow":
+                             self.al_client.workflow.add(data)
+                         elif collection == "badlist":
+                             self.al_client.badlist.add_update(data)
+                         elif collection == "safelist":
+                             self.al_client.safelist.add_update(data)
+                         return
+                     raise
+
+     def set_single_object_complete(self, collection, id):
+         self.al_client.replay.set_complete(collection, id)
+
+     def _query(self, collection, query, filter_queries=[], rows=None, track_total_hits=False):
+         return getattr(self.al_client.search, collection)(
+             query=query, filters=filter_queries, rows=rows, track_total_hits=track_total_hits
+         )
+
+     def get_next_message(self, message_type):
+         return self.al_client.replay.get_message(message_type)
+
+     def put_message(self, message_type, message):
+         if isinstance(message, Model):
+             message = message.as_primitives()
+         self.al_client.replay.put_message(message_type, message)
+
+
+ class DirectClient(ClientBase):
+     def __init__(self, log, **kwargs):
+         from assemblyline.remote.datatypes import get_client
+
+         # Setup datastore
+         config = forge.get_config()
+         redis = get_client(config.core.redis.nonpersistent.host, config.core.redis.nonpersistent.port, False)
+         # Initialize connection to redis-persistent for checkpointing
+         redis_persist = get_client(config.core.redis.persistent.host,
+                                    config.core.redis.persistent.port, False)
+         self.datastore = forge.get_datastore(config=config)
+         self.queues = {
+             queue_type: NamedQueue(f"replay_{queue_type}", host=redis)
+             for queue_type in ['alert', 'file', 'submission', 'safelist', 'badlist', 'workflow']
+         }
+         self.checkpoint_hash = Hash('replay_checkpoints', redis_persist)
+
+         super().__init__(log, **kwargs)
+
+     def _query(self, collection, query, filter_queries=[], rows=None, track_total_hits=False):
+         return getattr(self.datastore, collection).search(
+             query, filters=filter_queries, rows=rows, track_total_hits=track_total_hits
+         )
+
+     def _put_checkpoint(self, collection, checkpoint):
+         self.checkpoint_hash.set(collection, checkpoint)
+
+     def _get_checkpoint(self, collection) -> str:
+         return self.checkpoint_hash.get(collection) or "*"
+
+     def _get_next_object_ids(self, collection, query, filter_queries, fl, sort):
+         return getattr(self.datastore, collection).search(query, fl=fl, sort=sort, rows=100, filters=filter_queries)
+
+     def _set_bulk_object_pending(self, collection, query, filter_queries, max_docs):
+         ds_collection = getattr(self.datastore, collection)
+         operations = [(ds_collection.UPDATE_SET, 'metadata.replay', REPLAY_PENDING)]
+         ds_collection.update_by_query(query, operations, filters=filter_queries, max_docs=max_docs)
+
+     def _stream_objects(self, collection, query, fl="*", filter_queries=[]):
+         return getattr(self.datastore, collection).stream_search(query, fl=fl, filters=filter_queries, as_obj=False)
+
+     def create_al_bundle(self, id, bundle_path, use_alert=False):
+         temp_bundle_file = create_bundle(id, working_dir=os.path.dirname(bundle_path), use_alert=use_alert)
+         os.rename(temp_bundle_file, bundle_path)
+
+     def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True):
+         import_bundle(bundle_path,
+                       min_classification=min_classification,
+                       rescan_services=rescan_services,
+                       exist_ok=exist_ok)
+
+     def load_json(self, file_path):
+         # We're assuming all JSON being loaded has an "enabled" field
+         collection = os.path.basename(file_path).split('_', 1)[0]
+         with open(file_path) as fp:
+             data_blob = json.load(fp)
+
+         if isinstance(data_blob, list):
+             es_collection = getattr(self.datastore, collection)
+             for data in data_blob:
+                 id = data.pop("id")
+
+                 # Let's see if there's an existing document with the same ID in the collection
+                 obj = es_collection.get_if_exists(id, as_obj=False)
+
+                 if collection == "workflow":
+                     # If there have been any edits by another user, then preserve the enabled state
+                     # Otherwise, the workflow will be synchronized with the origin system
+                     if obj and obj['edited_by'] != data['edited_by']:
+                         data['enabled'] = obj["enabled"]
+                     es_collection.save(id, data)
+                 elif collection == "badlist":
+                     if obj:
+                         # Preserve the system's enabled state of the item
+                         data['enabled'] = obj["enabled"]
+                     es_collection.save(id, BadlistClient._merge_hashes(data, obj))
+                 elif collection == "safelist":
+                     if obj:
+                         # Preserve the system's enabled state of the item
+                         data['enabled'] = obj["enabled"]
+                     es_collection.save(id, SafelistClient._merge_hashes(data, obj))
+             es_collection.commit()
+
+     def set_single_object_complete(self, collection, id):
+         ds_collection = getattr(self.datastore, collection)
+         operations = [(ds_collection.UPDATE_SET, 'metadata.replay', REPLAY_DONE)]
+         ds_collection.update(id, operations)
+
+     def get_next_message(self, message_type):
+         return self.queues[message_type].pop(blocking=True, timeout=30)
+
+     def put_message(self, message_type, message):
+         if isinstance(message, Model):
+             message = message.as_primitives()
+         self.queues[message_type].push(message)
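
To make the two transport flavours concrete, here is a minimal sketch of driving one of the new checkpoint-based queues through DirectClient. It assumes a reachable Assemblyline datastore and Redis per the local forge configuration; the logger name and lookback value are arbitrary choices for the example.

import logging

from assemblyline_core.replay.client import DirectClient

log = logging.getLogger('assemblyline.replay_sketch')  # arbitrary name
client = DirectClient(log, lookback_time='now-1d')

# Creator side: queue every badlist item updated since the stored
# checkpoint, then return instead of looping forever (once=True).
client.setup_badlist_input_queue(once=True)

# Worker side: drain the same named Redis queue; pop() blocks for
# up to 30 seconds, as coded above.
item = client.get_next_badlist()
if item:
    log.info("got badlist item %s", item.get('id'))
client.stop()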
{assemblyline-core-4.4.2.dev7 → assemblyline-core-4.5.0.dev2}/assemblyline_core/replay/creator/run.py
@@ -18,7 +18,10 @@ class ReplayCreator(ReplayBase):
          # Load client
          client_config = dict(lookback_time=self.replay_config.creator.lookback_time,
                               alert_fqs=self.replay_config.creator.alert_input.filter_queries,
-                              submission_fqs=self.replay_config.creator.submission_input.filter_queries)
+                              badlist_fqs=self.replay_config.creator.badlist_input.filter_queries,
+                              safelist_fqs=self.replay_config.creator.safelist_input.filter_queries,
+                              submission_fqs=self.replay_config.creator.submission_input.filter_queries,
+                              workflow_fqs=self.replay_config.creator.workflow_input.filter_queries)

          if self.replay_config.creator.client.type == 'direct':
              self.log.info("Using direct database access client")
@@ -36,9 +39,18 @@ class ReplayCreator(ReplayBase):
          if self.replay_config.creator.alert_input.enabled:
              threads['Load Alerts'] = self.client.setup_alert_input_queue

+         if self.replay_config.creator.badlist_input.enabled:
+             threads['Load Badlist Items'] = self.client.setup_badlist_input_queue
+
+         if self.replay_config.creator.safelist_input.enabled:
+             threads['Load Safelist Items'] = self.client.setup_safelist_input_queue
+
          if self.replay_config.creator.submission_input.enabled:
              threads['Load Submissions'] = self.client.setup_submission_input_queue

+         if self.replay_config.creator.workflow_input.enabled:
+             threads['Load Workflows'] = self.client.setup_workflow_input_queue
+
          if threads:
              self.maintain_threads(threads)
          else:
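
Reading the two hunks together: the creator now builds its client with one filter-query list per input type, then registers one long-running callable per enabled input. The sketch below condenses that wiring outside the service; the config values are hypothetical stand-ins, and maintain_threads() belongs to ReplayBase (replay.py, unchanged in this diff), so it is only referenced here.

import logging

from assemblyline_core.replay.client import DirectClient

log = logging.getLogger('assemblyline.creator_sketch')

# Hypothetical stand-ins for replay_config.creator.*_input.filter_queries
client = DirectClient(
    log,
    lookback_time='now-4h',
    badlist_fqs=[],
    safelist_fqs=[],
    workflow_fqs=[],
)

# One worker callable per enabled input, exactly as the hunk above does;
# the real service hands this dict to ReplayBase.maintain_threads().
threads = {
    'Load Badlist Items': client.setup_badlist_input_queue,
    'Load Safelist Items': client.setup_safelist_input_queue,
    'Load Workflows': client.setup_workflow_input_queue,
}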