assemblyline-core 4.5.1.dev134__tar.gz → 4.5.1.dev136__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-core might be problematic. Click here for more details.

Files changed (88)
  1. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.1.dev136/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/expiry/run_expiry.py +84 -51
  4. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core.egg-info/PKG-INFO +1 -1
  5. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_expiry.py +2 -1
  6. assemblyline-core-4.5.1.dev134/assemblyline_core/VERSION +0 -1
  7. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/LICENCE.md +0 -0
  8. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/README.md +0 -0
  9. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/__init__.py +0 -0
  10. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/alerter/__init__.py +0 -0
  11. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/alerter/processing.py +0 -0
  12. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/alerter/run_alerter.py +0 -0
  13. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/archiver/__init__.py +0 -0
  14. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/archiver/run_archiver.py +0 -0
  15. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/badlist_client.py +0 -0
  16. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/__init__.py +0 -0
  17. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/__main__.py +0 -0
  18. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/client.py +0 -0
  19. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/dispatcher.py +0 -0
  20. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/schedules.py +0 -0
  21. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/dispatching/timeout.py +0 -0
  22. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/expiry/__init__.py +0 -0
  23. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/ingester/__init__.py +0 -0
  24. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/ingester/__main__.py +0 -0
  25. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/ingester/constants.py +0 -0
  26. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/ingester/ingester.py +0 -0
  27. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/__init__.py +0 -0
  28. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/es_metrics.py +0 -0
  29. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  30. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/helper.py +0 -0
  31. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/metrics_server.py +0 -0
  32. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  33. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  34. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  35. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/plumber/__init__.py +0 -0
  36. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/plumber/run_plumber.py +0 -0
  37. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/__init__.py +0 -0
  38. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/client.py +0 -0
  39. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/creator/__init__.py +0 -0
  40. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/creator/run.py +0 -0
  41. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/creator/run_worker.py +0 -0
  42. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/loader/__init__.py +0 -0
  43. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/loader/run.py +0 -0
  44. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/loader/run_worker.py +0 -0
  45. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/replay/replay.py +0 -0
  46. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/safelist_client.py +0 -0
  47. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/__init__.py +0 -0
  48. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/collection.py +0 -0
  49. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  50. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  51. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/controllers/interface.py +0 -0
  52. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +0 -0
  53. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/run_scaler.py +0 -0
  54. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/scaler/scaler_server.py +0 -0
  55. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/server_base.py +0 -0
  56. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/signature_client.py +0 -0
  57. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/submission_client.py +0 -0
  58. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/tasking_client.py +0 -0
  59. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/updater/__init__.py +0 -0
  60. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/updater/helper.py +0 -0
  61. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/updater/run_updater.py +0 -0
  62. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/__init__.py +0 -0
  63. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/crawler.py +0 -0
  64. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/department_map.py +0 -0
  65. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/safelist.py +0 -0
  66. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/stream_map.py +0 -0
  67. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/vacuum/worker.py +0 -0
  68. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/workflow/__init__.py +0 -0
  69. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core/workflow/run_workflow.py +0 -0
  70. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  71. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  72. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core.egg-info/requires.txt +0 -0
  73. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/assemblyline_core.egg-info/top_level.txt +0 -0
  74. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/setup.cfg +0 -0
  75. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/setup.py +0 -0
  76. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_alerter.py +0 -0
  77. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_badlist_client.py +0 -0
  78. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_dispatcher.py +0 -0
  79. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_plumber.py +0 -0
  80. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_replay.py +0 -0
  81. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_safelist_client.py +0 -0
  82. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_scaler.py +0 -0
  83. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_scheduler.py +0 -0
  84. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_signature_client.py +0 -0
  85. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_simulation.py +0 -0
  86. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.1.dev134 → assemblyline-core-4.5.1.dev136}/test/test_worker_submit.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.1.dev134
3
+ Version: 4.5.1.dev136
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -0,0 +1 @@
1
+ 4.5.1.dev136
@@ -2,6 +2,7 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  import concurrent.futures
5
+ import threading
5
6
  import functools
6
7
  import elasticapm
7
8
  import time
@@ -12,6 +13,7 @@ from datemath import dm
12
13
  from typing import Callable, Optional, TYPE_CHECKING
13
14
 
14
15
  from assemblyline.common.isotime import epoch_to_iso, now_as_iso
16
+ from assemblyline.datastore.collection import Index
15
17
  from assemblyline_core.server_base import ServerBase
16
18
  from assemblyline_core.dispatching.dispatcher import BAD_SID_HASH
17
19
  from assemblyline.common import forge
@@ -202,7 +204,7 @@ class ExpiryManager(ServerBase):
202
204
  # Filter archived documents if archive filestore is the same as the filestore
203
205
  expire_only = []
204
206
  if self.same_storage and self.config.datastore.archive.enabled and collection.name == 'file':
205
- archived_files = self.datastore.file.multiexists_in_archive(delete_objects)
207
+ archived_files = self.datastore.file.multiexists(delete_objects, index_type=Index.ARCHIVE)
206
208
  delete_objects = [k for k, v in archived_files.items() if not v]
207
209
  expire_only = [k for k, v in archived_files.items() if v]
208
210
 
@@ -217,56 +219,39 @@ class ExpiryManager(ServerBase):
217
219
  # Proceed with deletion
218
220
  self._simple_delete(collection, delete_query, number_to_delete)
219
221
 
220
- def run_expiry_once(self, pool: ThreadPoolExecutor):
221
- busy_iteration = False
222
+ def feed_expiry_jobs(self, collection, start, jobs: list[concurrent.futures.Future], pool: ThreadPoolExecutor) -> tuple[str, bool]:
223
+ _process_chunk = self.log_errors(self._process_chunk)
224
+ number_to_delete = 0
225
+ self.heartbeat()
226
+
227
+ # Start of expiry transaction
228
+ if self.apm_client:
229
+ self.apm_client.begin_transaction("Delete expired documents")
222
230
 
223
- # Delete canceled submissions
224
- # Make sure we're not dedicating more than a quarter of the pool to this operation because it is costly
225
- for submission in self.datastore.submission.search(
226
- "to_be_deleted:true", fl="sid", rows=max(1, int(self.config.core.expiry.workers / 4)))['items']:
227
- if submission.sid not in self.current_submission_cleanup:
228
- self.current_submission_cleanup.add(submission.sid)
229
- pool.submit(self.log_errors(self._cleanup_canceled_submission), submission.sid)
231
+ final_date = self._get_final_date()
230
232
 
231
- # Expire data
232
- for collection in self.expirable_collections:
233
+ # Break down the expiry window into smaller chunks of data
234
+ while len(jobs) < self.config.core.expiry.iteration_max_tasks:
233
235
  self.heartbeat()
234
236
 
235
- # Start of expiry transaction
236
- if self.apm_client:
237
- self.apm_client.begin_transaction("Delete expired documents")
238
-
239
- final_date = self._get_final_date()
240
-
241
- # Break down the expiry window into smaller chunks of data
242
- start = "*"
243
- iterations = 0
244
- while iterations < self.config.core.expiry.iteration_max_tasks:
245
- self.heartbeat()
246
-
247
- # Get the next chunk
248
- end, number_to_delete = self._get_next_chunk(collection, start, final_date)
249
-
250
- # Check if we got anything
251
- if number_to_delete == 0:
252
- break
237
+ # Get the next chunk
238
+ end, number_to_delete = self._get_next_chunk(collection, start, final_date)
253
239
 
254
- # Tell the outer loop not to sleep between runs
255
- if number_to_delete >= self.expiry_size:
256
- busy_iteration = True
240
+ # Check if we got anything
241
+ if number_to_delete == 0:
242
+ break
257
243
 
258
- # Process the chunk in the threadpool
259
- pool.submit(self.log_errors(self._process_chunk), collection, start, end, final_date, number_to_delete)
244
+ # Process the chunk in the threadpool
245
+ jobs.append(pool.submit(_process_chunk, collection, start, end, final_date, number_to_delete))
260
246
 
261
- # Prepare for next chunk
262
- start = end
263
- iterations += 1
247
+ # Prepare for next chunk
248
+ start = end
264
249
 
265
- # End of expiry transaction
266
- if self.apm_client:
267
- self.apm_client.end_transaction(collection.name, 'deleted')
250
+ # End of expiry transaction
251
+ if self.apm_client:
252
+ self.apm_client.end_transaction(collection.name, 'deleted')
268
253
 
269
- return busy_iteration
254
+ return start, number_to_delete < self.expiry_size
270
255
 
271
256
  def _get_final_date(self):
272
257
  now = now_as_iso()
@@ -287,17 +272,65 @@ class ExpiryManager(ServerBase):
287
272
  return final_date, rows['total']
288
273
 
289
274
  def try_run(self):
290
- while self.running:
291
- try:
292
- busy_iteration = False
275
+ pool = ThreadPoolExecutor(self.config.core.expiry.workers)
276
+ main_threads = []
277
+
278
+ # Launch a thread that will expire submissions that have been deleted
279
+ thread = threading.Thread(target=self.clean_deleted_submissions, args=[pool])
280
+ thread.start()
281
+ main_threads.append(thread)
282
+
283
+ # Launch threads that expire data from each collection of data
284
+ for collection in self.expirable_collections:
285
+ thread = threading.Thread(target=self.run_collection, args=[pool, collection])
286
+ thread.start()
287
+ main_threads.append(thread)
293
288
 
294
- with ThreadPoolExecutor(self.config.core.expiry.workers) as pool:
295
- try:
296
- busy_iteration = self.run_expiry_once(pool)
297
- except Exception as e:
298
- self.log.exception(str(e))
289
+ # Wait for all the threads to exit
290
+ for thread in main_threads:
291
+ thread.join()
299
292
 
300
- if not busy_iteration:
293
+ def clean_deleted_submissions(self, pool):
294
+ """Delete canceled submissions"""
295
+ while self.running:
296
+ # Make sure we're not dedicating more than a quarter of the pool to this operation because it is costly
297
+ for submission in self.datastore.submission.search(
298
+ "to_be_deleted:true", fl="sid", rows=max(1, int(self.config.core.expiry.workers / 4)))['items']:
299
+ if submission.sid not in self.current_submission_cleanup:
300
+ self.current_submission_cleanup.add(submission.sid)
301
+ pool.submit(self.log_errors(self._cleanup_canceled_submission), submission.sid)
302
+ self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)
303
+
304
+ def run_collection(self, pool: concurrent.futures.ThreadPoolExecutor, collection):
305
+ """Feed batches of jobs to delete to the thread pool for the given collection."""
306
+ start = "*"
307
+ jobs: list[concurrent.futures.Future] = []
308
+
309
+ while self.running:
310
+ try:
311
+ try:
312
+ # Fill up 'jobs' with tasks that have been sent to the thread pool
313
+ # 'jobs' may already have items in it, but 'start' makes sure the new
314
+ # task added starts where the last finished
315
+ start, final_job_small = self.feed_expiry_jobs(collection, start, jobs, pool)
316
+
317
+ # Wait until some of our work finishes and there is room in the queue for more work
318
+ finished, _jobs = concurrent.futures.wait(jobs, return_when=concurrent.futures.FIRST_COMPLETED)
319
+ jobs = list(_jobs)
320
+ for job in finished:
321
+ job.result()
322
+
323
+ # If we have expired all the data reset the start pointer
324
+ if len(jobs) == 0:
325
+ start = '*'
326
+
327
+ except Exception as e:
328
+ self.log.exception(str(e))
329
+ continue
330
+
331
+ # If the most recent job added to the jobs list is short then
332
+ # all the data is currently queued up to delete and we can sleep
333
+ if final_job_small:
301
334
  self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)
302
335
 
303
336
  except BrokenProcessPool:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.1.dev134
3
+ Version: 4.5.1.dev136
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -63,7 +63,8 @@ def test_expire_all(ds_expiry):
63
63
  expiry.running = True
64
64
  expiry.counter = FakeCounter()
65
65
  with concurrent.futures.ThreadPoolExecutor(5) as pool:
66
- expiry.run_expiry_once(pool)
66
+ for collection in expiry.expirable_collections:
67
+ expiry.feed_expiry_jobs(collection=collection, pool=pool, start='*', jobs=[])
67
68
 
68
69
  for k, v in expiry_collections_len.items():
69
70
  assert v == expiry.counter.get(k)
@@ -1 +0,0 @@
1
- 4.5.1.dev134