assemblyline-core 4.5.0.22__tar.gz → 4.5.0.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of assemblyline-core might be problematic.

Files changed (88)
  1. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.0.24/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/expiry/run_expiry.py +141 -118
  4. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/PKG-INFO +1 -1
  5. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_expiry.py +2 -1
  6. assemblyline-core-4.5.0.22/assemblyline_core/VERSION +0 -1
  7. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/LICENCE.md +0 -0
  8. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/README.md +0 -0
  9. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/__init__.py +0 -0
  10. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/alerter/__init__.py +0 -0
  11. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/alerter/processing.py +0 -0
  12. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/alerter/run_alerter.py +0 -0
  13. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/archiver/__init__.py +0 -0
  14. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/archiver/run_archiver.py +0 -0
  15. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/badlist_client.py +0 -0
  16. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/__init__.py +0 -0
  17. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/__main__.py +0 -0
  18. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/client.py +0 -0
  19. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/dispatcher.py +0 -0
  20. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/schedules.py +0 -0
  21. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/dispatching/timeout.py +0 -0
  22. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/expiry/__init__.py +0 -0
  23. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/ingester/__init__.py +0 -0
  24. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/ingester/__main__.py +0 -0
  25. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/ingester/constants.py +0 -0
  26. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/ingester/ingester.py +0 -0
  27. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/__init__.py +0 -0
  28. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/es_metrics.py +0 -0
  29. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  30. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/helper.py +0 -0
  31. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/metrics_server.py +0 -0
  32. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  33. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  34. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  35. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/plumber/__init__.py +0 -0
  36. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/plumber/run_plumber.py +0 -0
  37. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/__init__.py +0 -0
  38. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/client.py +0 -0
  39. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/creator/__init__.py +0 -0
  40. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/creator/run.py +0 -0
  41. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/creator/run_worker.py +0 -0
  42. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/loader/__init__.py +0 -0
  43. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/loader/run.py +0 -0
  44. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/loader/run_worker.py +0 -0
  45. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/replay/replay.py +0 -0
  46. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/safelist_client.py +0 -0
  47. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/__init__.py +0 -0
  48. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/collection.py +0 -0
  49. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  50. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  51. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/controllers/interface.py +0 -0
  52. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +0 -0
  53. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/run_scaler.py +0 -0
  54. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/scaler/scaler_server.py +0 -0
  55. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/server_base.py +0 -0
  56. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/signature_client.py +0 -0
  57. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/submission_client.py +0 -0
  58. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/tasking_client.py +0 -0
  59. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/updater/__init__.py +0 -0
  60. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/updater/helper.py +0 -0
  61. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/updater/run_updater.py +0 -0
  62. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/__init__.py +0 -0
  63. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/crawler.py +0 -0
  64. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/department_map.py +0 -0
  65. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/safelist.py +0 -0
  66. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/stream_map.py +0 -0
  67. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/vacuum/worker.py +0 -0
  68. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/workflow/__init__.py +0 -0
  69. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/workflow/run_workflow.py +0 -0
  70. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  71. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  72. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/requires.txt +0 -0
  73. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/top_level.txt +0 -0
  74. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/setup.cfg +0 -0
  75. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/setup.py +0 -0
  76. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_alerter.py +0 -0
  77. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_badlist_client.py +0 -0
  78. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_dispatcher.py +0 -0
  79. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_plumber.py +0 -0
  80. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_replay.py +0 -0
  81. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_safelist_client.py +0 -0
  82. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_scaler.py +0 -0
  83. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_scheduler.py +0 -0
  84. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_signature_client.py +0 -0
  85. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_simulation.py +0 -0
  86. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_worker_submit.py +0 -0

{assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: assemblyline-core
-Version: 4.5.0.22
+Version: 4.5.0.24
 Summary: Assemblyline 4 - Core components
 Home-page: https://github.com/CybercentreCanada/assemblyline-core/
 Author: CCCS Assemblyline development team

assemblyline-core-4.5.0.24/assemblyline_core/VERSION (new file)
@@ -0,0 +1 @@
+4.5.0.24

{assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core/expiry/run_expiry.py
@@ -1,16 +1,19 @@
 #!/usr/bin/env python
 from __future__ import annotations
+
 import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Future, as_completed
-from concurrent.futures.process import BrokenProcessPool
+import threading
 import functools
-from typing import Callable, Optional, Union, TYPE_CHECKING
 import elasticapm
 import time

+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Future, as_completed
+from concurrent.futures.process import BrokenProcessPool
 from datemath import dm
+from typing import Callable, Optional, TYPE_CHECKING

-from assemblyline.common.isotime import epoch_to_iso, now_as_iso, iso_to_epoch
+from assemblyline.common.isotime import epoch_to_iso, now_as_iso
+from assemblyline.datastore.collection import Index
 from assemblyline_core.server_base import ServerBase
 from assemblyline_core.dispatching.dispatcher import BAD_SID_HASH
 from assemblyline.common import forge
@@ -150,19 +153,19 @@ class ExpiryManager(ServerBase):
             bulk.add_delete_operation(sha256)

         if len(file_list) > 0:
-            self.log.info(f' Deleted associated files from the '
+            self.log.info(f'[{collection.name}] Deleted associated files from the '
                           f'{"cachestore" if "cache" in collection.name else "filestore"}...')
             collection.bulk(bulk)
             self.counter.increment(f'{collection.name}', increment_by=len(file_list))
-            self.log.info(f" Deleted {len(file_list)} items from the datastore...")
+            self.log.info(f"[{collection.name}] Deleted {len(file_list)} items from the datastore...")
         else:
-            self.log.warning(' Expiry unable to clean up any of the files in filestore.')
+            self.log.warning(f'[{collection.name}] Expiry unable to clean up any of the files in filestore.')

     def _simple_delete(self, collection, delete_query, number_to_delete):
         self.heartbeat()
         collection.delete_by_query(delete_query)
         self.counter.increment(f'{collection.name}', increment_by=number_to_delete)
-        self.log.info(f" Deleted {number_to_delete} items from the datastore...")
+        self.log.info(f"[{collection.name}] Deleted {number_to_delete} items from the datastore...")

     def _cleanup_canceled_submission(self, sid):
         # Allowing us at minimum 5 minutes to cleanup the submission
@@ -171,7 +174,7 @@ class ExpiryManager(ServerBase):
             self.apm_client.begin_transaction("Delete canceled submissions")

         # Cleaning up the submission
-        self.log.info(f"Deleting incomplete submission {sid}...")
+        self.log.info(f"[submission] Deleting incomplete submission {sid}...")
         self.datastore.delete_submission_tree_bulk(sid, self.classification, transport=self.filestore)
         self.redis_bad_sids.remove(sid)

@@ -181,131 +184,151 @@ class ExpiryManager(ServerBase):
         if self.apm_client:
             self.apm_client.end_transaction("canceled_submissions", 'deleted')

-    def run_expiry_once(self, pool: ThreadPoolExecutor):
-        now = now_as_iso()
-        reached_max = False
+    def _process_chunk(self, collection: ESCollection, start, end, final_date, number_to_delete):
+        # We assume that no records are ever inserted such that their expiry_ts is in the past.
+        # We also assume that the `end` dates are also in the past.
+        # As long as these two things are true, the set returned by this query should be consistent.
+        # The one race condition is that a record might be refreshed while the file
+        # blob would be deleted anyway, leaving a file record with no filestore object
+        delete_query = f"expiry_ts:{{{start} TO {end}]"
+
+        # check if we are dealing with an index that needs file cleanup
+        if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
+            # Delete associated files
+            delete_objects: list[str] = []
+            for item in collection.stream_search(delete_query, fl='id', as_obj=False):
+                self.heartbeat()
+                delete_objects.append(item['id'])

-        # Delete canceled submissions
-        # Make sure we're not dedicating more then a quarter of the pool to this operation because it is costly
-        for submission in self.datastore.submission.search(
-                "to_be_deleted:true", fl="sid", rows=max(1, int(self.config.core.expiry.workers / 4)))['items']:
-            if submission.sid not in self.current_submission_cleanup:
-                self.current_submission_cleanup.add(submission.sid)
-                pool.submit(self.log_errors(self._cleanup_canceled_submission), submission.sid)
+            # Filter archived documents if archive filestore is the same as the filestore
+            expire_only = []
+            if self.same_storage and self.config.datastore.archive.enabled and collection.name == 'file':
+                archived_files = self.datastore.file.multiexists(delete_objects, index_type=Index.ARCHIVE)
+                delete_objects = [k for k, v in archived_files.items() if not v]
+                expire_only = [k for k, v in archived_files.items() if v]

-        # Expire data
-        for collection in self.expirable_collections:
-            self.heartbeat()
+            delete_tasks = self.fs_hashmap[collection.name](delete_objects, final_date)

-            # Start of expiry transaction
-            if self.apm_client:
-                self.apm_client.begin_transaction("Delete expired documents")
+            # Proceed with deletion, but only after all the scheduled deletes for this
+            self.log.info(f"[{collection.name}] Scheduled {len(delete_objects)}/{number_to_delete} files to be removed")
+            self._finish_delete(collection, delete_tasks, expire_only)

-            if self.config.core.expiry.batch_delete:
-                final_date = dm(f"{now}||-{self.config.core.expiry.delay}h/d").float_timestamp
-            else:
-                final_date = dm(f"{now}||-{self.config.core.expiry.delay}h").float_timestamp
-            final_date_string = epoch_to_iso(final_date)
+        else:
+            # Proceed with deletion
+            self._simple_delete(collection, delete_query, number_to_delete)

-            # Break down the expiry window into smaller chunks of data
-            unchecked_chunks: list[tuple[float, float]] = [(self._find_expiry_start(collection), final_date)]
-            ready_chunks: dict[tuple[float, float], int] = {}
-            while unchecked_chunks and len(ready_chunks) < self.config.core.expiry.iteration_max_tasks:
-                self.heartbeat()
-                start, end = unchecked_chunks.pop()
-                chunk_size = self._count_expired(collection, start, end)
+    def feed_expiry_jobs(self, collection, start, jobs: list[concurrent.futures.Future],
+                         pool: ThreadPoolExecutor) -> tuple[str, bool]:
+        _process_chunk = self.log_errors(self._process_chunk)
+        number_to_delete = 0
+        self.heartbeat()

-                # Empty chunks are fine
-                if chunk_size == 0:
-                    continue
+        # Start of expiry transaction
+        if self.apm_client:
+            self.apm_client.begin_transaction("Delete expired documents")

-                # We found a small enough chunk to
-                # run on
-                if chunk_size < self.expiry_size:
-                    ready_chunks[(start, end)] = chunk_size
-                    continue
+        final_date = self._get_final_date()

-                # Break this chunk into parts
-                middle = (end + start)/2
-                unchecked_chunks.append((middle, end))
-                unchecked_chunks.append((start, middle))
+        # Break down the expiry window into smaller chunks of data
+        while len(jobs) < self.config.core.expiry.iteration_max_tasks:

-            # If there are still chunks we haven't checked, then we know there is more data
-            if unchecked_chunks:
-                reached_max = True
+            # Get the next chunk
+            end, number_to_delete = self._get_next_chunk(collection, start, final_date)

-            for (start, end), number_to_delete in ready_chunks.items():
-                self.heartbeat()
-                # We assume that no records are ever inserted such that their expiry_ts is in the past.
-                # We also assume that the `end` dates are also in the past.
-                # As long as these two things are true, the set returned by this query should be consistent.
-                # The one race condition is that a record might be refreshed while the file
-                # blob would be deleted anyway, leaving a file record with no filestore object
-                self.log.info(f"Processing collection: {collection.name}")
-                delete_query = f"expiry_ts:[{epoch_to_iso(start) if start > 0 else '*'} TO {epoch_to_iso(end)}}}"
-
-                # check if we are dealing with an index that needs file cleanup
-                if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
-                    # Delete associated files
-                    delete_objects: list[str] = []
-                    for item in collection.stream_search(delete_query, fl='id', as_obj=False):
-                        self.heartbeat()
-                        delete_objects.append(item['id'])
-
-                    # Filter archived documents if archive filestore is the same as the filestore
-                    expire_only = []
-                    if self.same_storage and self.config.datastore.archive.enabled and collection.name == 'file':
-                        archived_files = self.datastore.file.multiexists_in_archive(delete_objects)
-                        delete_objects = [k for k, v in archived_files.items() if not v]
-                        expire_only = [k for k, v in archived_files.items() if v]
-
-                    delete_tasks = self.fs_hashmap[collection.name](delete_objects, final_date_string)
-
-                    # Proceed with deletion, but only after all the scheduled deletes for this
-                    self.log.info(f"Scheduled {len(delete_objects)}/{number_to_delete} "
-                                  f"files to be removed for: {collection.name}")
-                    pool.submit(self.log_errors(self._finish_delete), collection, delete_tasks, expire_only)
-
-                else:
-                    # Proceed with deletion
-                    pool.submit(self.log_errors(self._simple_delete),
-                                collection, delete_query, number_to_delete)
-
-            # End of expiry transaction
-            if self.apm_client:
-                self.apm_client.end_transaction(collection.name, 'deleted')
-
-        return reached_max
-
-    def _find_expiry_start(self, container: ESCollection):
-        """Find earliest expiring item in this container."""
-        rows = container.search(f"expiry_ts: [* TO {epoch_to_iso(time.time())}]",
-                                rows=1, sort='expiry_ts asc', as_obj=False, fl='expiry_ts')
+            # Check if we got anything
+            if number_to_delete == 0:
+                break
+
+            # Process the chunk in the threadpool
+            jobs.append(pool.submit(_process_chunk, collection, start, end, final_date, number_to_delete))
+
+            # Prepare for next chunk
+            start = end
+
+        # End of expiry transaction
+        if self.apm_client:
+            self.apm_client.end_transaction(collection.name, 'deleted')
+
+        return start, number_to_delete < self.expiry_size
+
+    def _get_final_date(self):
+        now = now_as_iso()
+        if self.config.core.expiry.batch_delete:
+            final_date = dm(f"{now}||-{self.config.core.expiry.delay}h/d").float_timestamp
+        else:
+            final_date = dm(f"{now}||-{self.config.core.expiry.delay}h").float_timestamp
+        return epoch_to_iso(final_date)
+
+    def _get_next_chunk(self, collection: ESCollection, start, final_date):
+        """Find date of item at chunk size and the number of items that
+           will be affected in between start date and the date found"""
+        rows = collection.search(f"expiry_ts: {{{start} TO {final_date}]", rows=1,
+                                 offset=self.expiry_size - 1, sort='expiry_ts asc',
+                                 as_obj=False, fl='expiry_ts')
         if rows['items']:
-            return iso_to_epoch(rows['items'][0]['expiry_ts'])
-        return time.time()
-
-    def _count_expired(self, container: ESCollection, start: Union[float, str], end: float) -> int:
-        """Count how many items need to be erased in the given window."""
-        if start == 0:
-            start = '*'
-        if isinstance(start, (float, int)):
-            start = epoch_to_iso(start)
-        query = f'expiry_ts:[{start} TO {epoch_to_iso(end)}}}'
-        return container.search(query, rows=0, as_obj=False, track_total_hits=self.expiry_size)['total']
+            return rows['items'][0]['expiry_ts'], self.expiry_size
+        return final_date, rows['total']

     def try_run(self):
+        pool = ThreadPoolExecutor(self.config.core.expiry.workers)
+        main_threads = []
+
+        # Launch a thread that will expire submissions that have been deleted
+        thread = threading.Thread(target=self.clean_deleted_submissions, args=[pool])
+        thread.start()
+        main_threads.append(thread)
+
+        # Launch threads that expire data from each collection of data
+        for collection in self.expirable_collections:
+            thread = threading.Thread(target=self.run_collection, args=[pool, collection])
+            thread.start()
+            main_threads.append(thread)
+
+        # Wait for all the threads to exit
+        for thread in main_threads:
+            thread.join()
+
+    def clean_deleted_submissions(self, pool):
+        """Delete canceled submissions"""
         while self.running:
-            try:
-                expiry_maxed_out = False
+            # Make sure we're not dedicating more then a quarter of the pool to this operation because it is costly
+            for submission in self.datastore.submission.search(
+                    "to_be_deleted:true", fl="sid", rows=max(1, int(self.config.core.expiry.workers / 4)))['items']:
+                if submission.sid not in self.current_submission_cleanup:
+                    self.current_submission_cleanup.add(submission.sid)
+                    pool.submit(self.log_errors(self._cleanup_canceled_submission), submission.sid)
+            self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)
+
+    def run_collection(self, pool: concurrent.futures.ThreadPoolExecutor, collection):
+        """Feed batches of jobs to delete to the thread pool for the given collection."""
+        start = "*"
+        jobs: list[concurrent.futures.Future] = []

-                with ThreadPoolExecutor(self.config.core.expiry.workers) as pool:
-                    try:
-                        expiry_maxed_out = self.run_expiry_once(pool)
-                    except Exception as e:
-                        self.log.exception(str(e))
+        while self.running:
+            try:
+                try:
+                    # Fill up 'jobs' with tasks that have been sent to the thread pool
+                    # 'jobs' may already have items in it, but 'start' makes sure the new
+                    # task added starts where the last finshed
+                    start, final_job_small = self.feed_expiry_jobs(collection, start, jobs, pool)
+
+                    # Wait until some of our work finishes and there is room in the queue for more work
+                    finished, _jobs = concurrent.futures.wait(jobs, return_when=concurrent.futures.FIRST_COMPLETED)
+                    jobs = list(_jobs)
+                    for job in finished:
+                        job.result()
+
+                    # If we have expired all the data reset the start pointer
+                    if len(jobs) == 0:
+                        start = '*'
+
+                except Exception as e:
+                    self.log.exception(str(e))
+                    continue

-                if not expiry_maxed_out:
+                # IF the most recent job added to the jobs list is short then
+                # all the data is currently queued up to delete and we can sleep
+                if final_job_small:
                     self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)

             except BrokenProcessPool:
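
The run_expiry.py changes above replace the recursive window-splitting of run_expiry_once with a forward walk over the expiry_ts range: _get_next_chunk looks up the timestamp of the document sitting expiry_size - 1 positions past the current start, feed_expiry_jobs submits that chunk to a shared thread pool and advances start, and run_collection keeps each collection's backlog of jobs topped up from its own thread. The sketch below illustrates only that chunking pattern; it is a minimal stand-in, not the Assemblyline API: FakeCollection, delete_chunk and run_collection_once are hypothetical helpers, and an in-memory list plus a lock replaces the Elasticsearch index the real code queries.

# Illustrative sketch only. FakeCollection, delete_chunk and run_collection_once are
# hypothetical stand-ins for the Assemblyline datastore and ExpiryManager methods;
# they exist purely to show the chunk-walking idea, not the real ESCollection API.
import bisect
import threading
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED


class FakeCollection:
    """Sorted expiry timestamps (floats) standing in for an Elasticsearch index."""

    def __init__(self, expiry_timestamps):
        self.docs = sorted(expiry_timestamps)
        self.lock = threading.Lock()  # the real code leans on Elasticsearch for concurrent safety

    def next_chunk(self, start, final_date, chunk_size):
        """Like _get_next_chunk: for the window (start, final_date], return the
        timestamp of the doc at offset chunk_size-1 and the number of rows covered."""
        with self.lock:
            lo = 0 if start is None else bisect.bisect_right(self.docs, start)
            hi = bisect.bisect_right(self.docs, final_date)
            window = self.docs[lo:hi]
        if len(window) >= chunk_size:
            return window[chunk_size - 1], chunk_size
        return final_date, len(window)

    def delete_range(self, start, end):
        """Like delete_by_query over expiry_ts:{start TO end]."""
        with self.lock:
            lo = 0 if start is None else bisect.bisect_right(self.docs, start)
            hi = bisect.bisect_right(self.docs, end)
            deleted = hi - lo
            del self.docs[lo:hi]
        return deleted


def delete_chunk(collection, start, end):
    # Stand-in for _process_chunk / _simple_delete: remove one disjoint slice of the window
    return collection.delete_range(start, end)


def run_collection_once(collection, final_date, chunk_size=3, max_tasks=4, workers=2):
    """Walk the expiry window in fixed-size chunks, in the spirit of feed_expiry_jobs/run_collection."""
    start = None  # plays the role of the '*' open lower bound
    jobs, deleted = [], 0
    with ThreadPoolExecutor(workers) as pool:
        while True:
            # Queue chunks until the per-iteration cap is reached or the window is exhausted
            while len(jobs) < max_tasks:
                end, count = collection.next_chunk(start, final_date, chunk_size)
                if count == 0:
                    break
                jobs.append(pool.submit(delete_chunk, collection, start, end))
                start = end  # the next chunk begins where this one ended
            if not jobs:
                break
            # Wait for some work to finish before queueing more
            finished, pending = wait(jobs, return_when=FIRST_COMPLETED)
            jobs = list(pending)
            for job in finished:
                deleted += job.result()
    return deleted


if __name__ == "__main__":
    collection = FakeCollection([float(i) for i in range(10)])
    print(run_collection_once(collection, final_date=7.5))  # 8: timestamps 0.0 through 7.0

Compared with the old approach, which repeatedly counted and bisected the window through _count_expired, each chunk boundary now costs a single sorted, offset search, and a short final chunk (number_to_delete < expiry_size) is the signal that the backlog is drained and the service can sleep.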

{assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/assemblyline_core.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: assemblyline-core
-Version: 4.5.0.22
+Version: 4.5.0.24
 Summary: Assemblyline 4 - Core components
 Home-page: https://github.com/CybercentreCanada/assemblyline-core/
 Author: CCCS Assemblyline development team

{assemblyline-core-4.5.0.22 → assemblyline-core-4.5.0.24}/test/test_expiry.py
@@ -63,7 +63,8 @@ def test_expire_all(ds_expiry):
     expiry.running = True
     expiry.counter = FakeCounter()
     with concurrent.futures.ThreadPoolExecutor(5) as pool:
-        expiry.run_expiry_once(pool)
+        for collection in expiry.expirable_collections:
+            expiry.feed_expiry_jobs(collection=collection, pool=pool, start='*', jobs=[])

     for k, v in expiry_collections_len.items():
         assert v == expiry.counter.get(k)

assemblyline-core-4.5.0.22/assemblyline_core/VERSION (deleted)
@@ -1 +0,0 @@
-4.5.0.22