maintenance_tasks 2.13.0 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,567 @@
1
+ # frozen_string_literal: true
2
+
3
+ module MaintenanceTasks
4
+ # Concern that holds the behavior of a maintenance task run. It is
5
+ # included in {Run}.
6
+ #
7
+ # @api private
8
+ module RunConcern
9
+ extend ActiveSupport::Concern
10
+
11
included do
  # Every status a run can be in. Also the source of the +status+ enum
  # declared below. Frozen so the list cannot be mutated at runtime.
  STATUSES = [
    :enqueued, # The task has been enqueued by the user.
    :running, # The task is being performed by a job worker.
    :succeeded, # The task finished without error.
    :cancelling, # The task has been told to cancel but is finishing work.
    :cancelled, # The user explicitly halted the task's execution.
    :interrupted, # The task was interrupted by the job infrastructure.
    :pausing, # The task has been told to pause but is finishing work.
    :paused, # The task was paused in the middle of the run by the user.
    :errored, # The task code produced an unhandled exception.
  ].freeze

  # Statuses matched by the +active+ scope and by #active?.
  ACTIVE_STATUSES = [
    :enqueued,
    :running,
    :paused,
    :pausing,
    :cancelling,
    :interrupted,
  ].freeze
  # Statuses for which #stopping? returns true.
  STOPPING_STATUSES = [
    :pausing,
    :cancelling,
    :cancelled,
  ].freeze
  # Statuses matched by the +completed+ scope and by #completed?.
  COMPLETED_STATUSES = [:succeeded, :errored, :cancelled].freeze

  # Each status maps to its own name as a String (:running => "running").
  enum :status, STATUSES.to_h { |status| [status, status.to_s] }

  after_save :instrument_status_change

  validate :task_name_belongs_to_a_valid_task, on: :create
  validate :csv_attachment_presence, on: :create
  validate :csv_content_type, on: :create
  validate :validate_task_arguments, on: :create

  attr_readonly :task_name

  serialize :backtrace, coder: YAML
  serialize :arguments, coder: JSON
  serialize :metadata, coder: JSON

  scope :active, -> { where(status: ACTIVE_STATUSES) }
  scope :completed, -> { where(status: COMPLETED_STATUSES) }

  # Ensure ActiveStorage is in use before preloading the attachments
  scope :with_attached_csv, -> do
    return unless defined?(ActiveStorage)

    with_attached_csv_file if ActiveStorage::Attachment.table_exists?
  end

  validates_with RunStatusValidator, on: :update

  # Attach the CSV file through the configured Active Storage service when
  # one is set; otherwise fall back to the default service, provided the
  # including class supports attachments at all.
  if MaintenanceTasks.active_storage_service.present?
    has_one_attached :csv_file,
      service: MaintenanceTasks.active_storage_service
  elsif respond_to?(:has_one_attached)
    has_one_attached :csv_file
  end

  # Sets the run status to enqueued, making sure the transition is validated
  # in case it's already enqueued.
  #
  # Rescues and retries status transition if an ActiveRecord::StaleObjectError
  # is encountered.
  def enqueued!
    with_stale_object_retry do
      # Mark status as dirty so the save goes through even when the value
      # is unchanged.
      status_will_change!
      super
    end
  end
end
86
+
87
# Maps a status (keyed as a String) to the name of the Task callback that
# #persist_transition runs once the run has been saved with that status.
# Frozen so the table cannot be mutated at runtime.
CALLBACKS_TRANSITION = {
  cancelled: :cancel,
  interrupted: :interrupt,
  paused: :pause,
  succeeded: :complete,
}.transform_keys(&:to_s).freeze

# Seconds to wait before each successive retry after an
# ActiveRecord::StaleObjectError; the index is the retry attempt.
DELAYS_PER_ATTEMPT = [0.1, 0.2, 0.4, 0.8, 1.6].freeze
# Maximum number of retries: one per configured delay.
MAX_RETRIES = DELAYS_PER_ATTEMPT.size

private_constant :CALLBACKS_TRANSITION, :DELAYS_PER_ATTEMPT, :MAX_RETRIES
98
+
99
# Persists the run, including its pending status transition and every other
# unsaved change, then runs the Task callback mapped to the saved status in
# CALLBACKS_TRANSITION (if there is one).
#
# When saving raises ActiveRecord::StaleObjectError, the method sleeps for
# the delay configured for that attempt, reloads the status and re-derives
# the transition before saving again: a run that was being marked succeeded
# is marked succeeded again, any other run goes through #job_shutdown. Once
# MAX_RETRIES attempts have been used, the error is re-raised.
def persist_transition
  attempts = 0
  begin
    save!
  rescue ActiveRecord::StaleObjectError
    raise unless attempts < MAX_RETRIES

    sleep(DELAYS_PER_ATTEMPT[attempts])
    attempts += 1

    # Remember the intended outcome before reload_status overwrites it.
    was_succeeding = succeeded?
    reload_status
    if was_succeeding
      self.status = :succeeded
    else
      job_shutdown
    end

    retry
  end

  if (hook = CALLBACKS_TRANSITION[status])
    run_task_callbacks(hook)
  end
end
127
+
128
# Adds +number_of_ticks+ to +tick_count+ and +duration+ to +time_running+
# through a counter update issued on the class (passing +touch: true+).
# Neither attribute is changed on this instance; reload the record to see
# the new values.
#
# When locking is enabled, the in-memory locking column is bumped by one and
# its dirty state cleared.
# NOTE(review): presumably this mirrors a lock version increment performed
# by the counter update itself - confirm against ActiveRecord.
#
# @param number_of_ticks [Integer] number of ticks to add to tick_count.
# @param duration [Float] the time in seconds that elapsed since the last
#   increment of ticks.
def persist_progress(number_of_ticks, duration)
  increments = { tick_count: number_of_ticks, time_running: duration, touch: true }
  self.class.update_counters(id, **increments)
  return unless locking_enabled?

  lock_column = self.class.locking_column
  self[lock_column] += 1
  clear_attribute_change(lock_column)
end
149
+
150
# Records a failure on the run: sets the status to errored and stores the
# error class name, message (both truncated to their column limits), the
# cleaned backtrace and the end time. +started_at+ is filled in when it was
# still unset. The update is retried on ActiveRecord::StaleObjectError, and
# the error callback is run afterwards.
#
# @param error [StandardError] the Error being persisted.
def persist_error(error)
  with_stale_object_retry do
    self.started_at = Time.now unless started_at
    failure = {
      status: :errored,
      error_class: truncate(:error_class, error.class.name),
      error_message: truncate(:error_message, error.message),
      backtrace: MaintenanceTasks.backtrace_cleaner.clean(error.backtrace),
      ended_at: Time.now,
    }
    update!(failure)
  end
  run_error_callback
end
166
+
167
# Re-reads the status (and, when locking is enabled, the locking column)
# for this record with an uncached query, assigns the fresh values and
# clears their dirty tracking so the object is not reported as changed.
#
# This gives access to the Run's latest status without reloading the whole
# record.
#
# @return [MaintenanceTasks::Run] the Run record with its updated status.
def reload_status
  columns = [:status]
  columns << self.class.locking_column if locking_enabled?

  fresh_status, fresh_lock_version = self.class.uncached do
    self.class.where(id: id).pluck(*columns).first
  end

  self.status = fresh_status
  self[self.class.locking_column] = fresh_lock_version if fresh_lock_version
  clear_attribute_changes(columns)
  self
end
191
+
192
# Tells whether the Run is stopping, i.e. its status is one of
# STOPPING_STATUSES: pausing, cancelling or cancelled. Cancelled counts as
# stopping because a Run can be cancelled while its job still sits in the
# queue, and that case is handled like a cancelling run.
#
# @return [Boolean] whether the Run is stopping.
def stopping?
  current = status.to_sym
  STOPPING_STATUSES.include?(current)
end
201
+
202
# Tells whether the Run is stopped: either completed (see #completed?) or
# paused.
#
# @return [Boolean] whether the Run is stopped.
def stopped?
  return true if completed?

  paused?
end
209
+
210
# Tells whether the Run has been started, which is the case as soon as a
# started_at timestamp is present.
#
# @return [Boolean] whether the Run was started.
def started?
  !started_at.blank?
end
217
+
218
# Tells whether the Run is completed, i.e. its status is one of
# COMPLETED_STATUSES: succeeded, errored or cancelled.
#
# @return [Boolean] whether the Run is completed.
def completed?
  current = status.to_sym
  COMPLETED_STATUSES.include?(current)
end
225
+
226
# Tells whether the Run is active, i.e. its status is one of
# ACTIVE_STATUSES: enqueued, running, paused, pausing, cancelling or
# interrupted.
#
# @return [Boolean] whether the Run is active.
def active?
  current = status.to_sym
  ACTIVE_STATUSES.include?(current)
end
234
+
235
# Estimates how long the Run still needs, from the number of ticks left and
# the average processing rate so far (tick_count divided by time_running).
# Returns nil when the Run is completed, or when tick_count or tick_total is
# zero (a nil tick_total counts as zero).
#
# @return [ActiveSupport::Duration] the estimated duration left for the Run
#   to finish.
def time_to_completion
  return if completed?
  return if tick_count == 0
  return if tick_total.to_i == 0

  ticks_per_second = tick_count.to_f / time_running
  remaining_ticks = tick_total - tick_count
  (remaining_ticks / ticks_per_second).seconds
end
249
+
250
# Marks a Run as running, unless it is already stopping.
#
# With locking enabled, the transition goes through +running!+ and is
# retried when an ActiveRecord::StaleObjectError is raised.
#
# Without locking, the status is swapped with a conditional +update_all+
# that only matches rows whose status is not a stopping one. When a row was
# updated, the in-memory status is set to running and its dirty state
# cleared; otherwise the status is reloaded.
def running
  if locking_enabled?
    with_stale_object_retry { running! unless stopping? }
  elsif !stopping?
    # Preserve swap-and-replace solution for data races until users
    # run migration to upgrade to optimistic locking solution
    rows_changed = self.class.where(id: id).where.not(status: STOPPING_STATUSES)
      .update_all(status: :running, updated_at: Time.now)
    if rows_changed > 0
      self.status = :running
      clear_attribute_changes([:status])
    else
      reload_status
    end
  end
end
275
+
276
# Starts a Run: saves the current time as started_at and +count+ as
# tick_total (retrying on ActiveRecord::StaleObjectError), then runs the
# Task's :start callbacks.
#
# @param count [Integer] the total iterations to be performed, as
#   specified by the Task.
def start(count)
  with_stale_object_retry { update!(started_at: Time.now, tick_total: count) }

  task.run_callbacks(:start)
end
287
+
288
# Updates the in-memory status of a Run when its job shuts down (nothing is
# saved here):
#
# * cancelling becomes cancelled, and ended_at is set to the current time;
# * pausing becomes paused;
# * cancelled is left untouched;
# * any other status becomes interrupted.
def job_shutdown
  if cancelling?
    self.status = :cancelled
    self.ended_at = Time.now
  elsif pausing?
    self.status = :paused
  elsif !cancelled?
    self.status = :interrupted
  end
end
300
+
301
# Marks the Run as completed in memory: the status becomes succeeded and
# ended_at is set to the current time. Nothing is saved here.
def complete
  self.status = :succeeded
  finished_at = Time.now
  self.ended_at = finished_at
end
307
+
308
# Cancels a Run.
#
# A paused Run moves directly to cancelled, since the Task is not being
# performed. The same happens for a Run that #stuck? reports as stuck, i.e.
# one that has been cancelling or pausing without an update for at least
# MaintenanceTasks.stuck_task_duration. In both cases ended_at is set and
# the change is saved through #persist_transition.
#
# Any other Run is marked as cancelling.
#
# The whole transition is retried when an ActiveRecord::StaleObjectError
# is raised.
def cancel
  with_stale_object_retry do
    next cancelling! unless paused? || stuck?

    self.status = :cancelled
    self.ended_at = Time.now
    persist_transition
  end
end
330
+
331
# Pauses a Run.
#
# A Run that #stuck? reports as stuck (cancelling or pausing without an
# update for at least MaintenanceTasks.stuck_task_duration) is forced
# straight to paused and saved through #persist_transition. This method
# does not modify ended_at.
#
# Any other Run is marked as pausing.
#
# The whole transition is retried when an ActiveRecord::StaleObjectError
# is raised.
def pause
  with_stale_object_retry do
    next pausing! unless stuck?

    self.status = :paused
    persist_transition
  end
end
348
+
349
# Tells whether a Run is stuck: its status is cancelling or pausing, and it
# was last updated at or before MaintenanceTasks.stuck_task_duration ago.
#
# @return [Boolean] whether the Run is stuck.
def stuck?
  return false unless cancelling? || pausing?

  updated_at <= MaintenanceTasks.stuck_task_duration.ago
end
356
+
357
# Validates the task_name attribute by looking the Task up by name. When
# the lookup raises Task::NotFoundError, an error is added on :task_name;
# otherwise the Run is left untouched.
def task_name_belongs_to_a_valid_task
  name = task_name
  Task.named(name)
rescue Task::NotFoundError
  errors.add(:task_name, "must be the name of an existing Task.")
end
365
+
366
# Validates the presence of the :csv_file attachment against the kind of
# Task being run:
#
# * a Task with CSV content must have an attached :csv_file;
# * a Task without CSV content must not have one.
#
# The matching error is added on :csv_file when either rule is broken.
# Nothing is validated when the Task cannot be found; that case is reported
# by the task_name validation instead.
def csv_attachment_presence
  has_csv_content = Task.named(task_name).has_csv_content?

  # csv_file returns nil when Active Storage is unavailable, so use safe
  # navigation instead of calling #attached? on nil: a missing attachment
  # must surface as a validation error, not as a NoMethodError.
  if has_csv_content && !csv_file&.attached?
    errors.add(:csv_file, "must be attached to CSV Task.")
  elsif !has_csv_content && csv_file.present?
    errors.add(:csv_file, "should not be attached to non-CSV Task.")
  end
rescue Task::NotFoundError
  nil
end
380
+
381
# Validates the content type of the :csv_file attachment. When a file is
# present and its content type is anything other than "text/csv", an error
# is added on :csv_file. A Run without a file is not checked here.
def csv_content_type
  return unless csv_file.present?
  return if csv_file.content_type == "text/csv"

  errors.add(:csv_file, "must be a CSV")
rescue Task::NotFoundError
  nil
end
392
+
393
# Validates the arguments the Task will be run with. Argument keys are
# first checked against the Task's attributes (only when arguments are
# present); then, if the Task itself is invalid, its errors are collected
# into a single message added on :arguments. Nothing is added when the Task
# cannot be found.
def validate_task_arguments
  arguments_match_task_attributes if arguments.present?
  return unless task.invalid?

  details = task.errors
    .map { |error| "#{error.attribute.inspect} #{error.message}" }
    .join("; ")
  errors.add(:arguments, "are invalid: #{details}")
rescue Task::NotFoundError
  nil
end
408
+
409
# Returns the attached ActiveStorage CSV file for the run. To stay
# compatible with apps that do not use ActiveStorage, nil is returned when
# ActiveStorage is not defined or when the ActiveStorage::Attachment table
# does not exist.
#
# @return [ActiveStorage::Attached::One] the attached CSV file
def csv_file
  return unless defined?(ActiveStorage) && ActiveStorage::Attachment.table_exists?

  super
end
420
+
421
# Returns a Task instance for this Run, memoized after the first call. The
# Run's metadata and arguments are assigned to the Task. Note that the Task
# instance is not supplied with :csv_content yet if it's a CSV Task. This is
# done in the job, since downloading the CSV file can take some time.
#
# When the arguments contain a key the Task does not know
# (ActiveModel::UnknownAttributeError), the Task built so far is returned
# instead of raising.
#
# @return [Task] a Task instance.
def task
  @task ||= begin
    task = Task.named(task_name).new
    # Assign metadata before the arguments, so the Task returned from the
    # rescue below still carries it when an argument is unknown.
    task.metadata = metadata
    if task.attribute_names.any? && arguments.present?
      task.assign_attributes(arguments)
    end

    task
  rescue ActiveModel::UnknownAttributeError
    task
  end
end
440
+
441
# Returns the run arguments after passing them through the argument filter,
# which masks sensitive values. Returns nil when there are no arguments.
#
# @return [Hash] The masked arguments.
def masked_arguments
  argument_filter.filter(arguments) if arguments.present?
end
449
+
450
# Tells whether the cursor value should be treated as serialized JSON,
# which requires both `MaintenanceTasks.serialize_cursors_as_json` and the
# Run's own `cursor_is_json` attribute to be set.
#
# @return [Boolean]
#   True when the cursor value should be treated as serialized JSON.
def cursor_is_json?
  json_cursors_enabled = MaintenanceTasks.serialize_cursors_as_json
  json_cursors_enabled && cursor_is_json
end
455
+
456
# Selects the cursor encoding for the Run according to
# `MaintenanceTasks.serialize_cursors_as_json`.
#
# The `cursor_is_json` attribute is only written when JSON cursors are
# enabled, so an application that keeps the default (disabled) setting
# keeps working even if the `cursor_is_json` column does not exist.
#
# * When `MaintenanceTasks.serialize_cursors_as_json` is false, this method
#   no-ops.
# * When `MaintenanceTasks.serialize_cursors_as_json` is true, this method
#   will mutate the Run so that `cursor_is_json` is set to true.
def configure_cursor_encoding!
  self.cursor_is_json = true if MaintenanceTasks.serialize_cursors_as_json
end
473
+
474
# Tells whether the run is stale. A run is stale when a staleness threshold
# is configured, the run succeeded, and its ended_at lies further in the
# past than the threshold. In every other case the result is false.
#
# @return [Boolean]
def stale?
  threshold = MaintenanceTasks.task_staleness_threshold
  return false unless threshold.present?
  return false unless succeeded? && ended_at.present?

  ended_at < threshold.ago
end
484
+
485
+ private
486
+
487
# Publishes an ActiveSupport::Notifications event named
# "<status>.maintenance_tasks" describing the run. Nothing is published
# unless the status or the id changed in the last save, nor for the
# running, pausing, cancelling and interrupted statuses. For an errored run
# the payload also carries the error message, class and backtrace.
def instrument_status_change
  return unless status_previously_changed? || id_previously_changed?
  return if running? || pausing? || cancelling? || interrupted?

  payload = {
    run_id: id,
    job_id: job_id,
    task_name: task_name,
    arguments: arguments,
    metadata: metadata,
    time_running: time_running,
    started_at: started_at,
    ended_at: ended_at,
  }

  if errored?
    payload[:error] = {
      message: error_message,
      class: error_class,
      backtrace: backtrace,
    }
  end

  ActiveSupport::Notifications.instrument("#{status}.maintenance_tasks", payload)
end
510
+
511
# Runs the given callback on the Run's Task. Returns nil, without running
# anything, when the Task cannot be found.
#
# @param callback [Symbol] the name of the callback to run on the Task.
def run_task_callbacks(callback)
  current_task = task
  current_task.run_callbacks(callback)
rescue Task::NotFoundError
  nil
end
516
+
517
# Runs the :error callbacks on the Run's Task. Any StandardError raised
# while doing so is swallowed and nil is returned instead.
def run_error_callback
  current_task = task
  current_task.run_callbacks(:error)
rescue
  nil
end
522
+
523
# Checks that every argument key is one of the Task's attribute names. When
# some keys are not, a single "Unknown parameters: ..." error listing them
# is added on :base.
def arguments_match_task_attributes
  unknown_keys = arguments.keys - task.attribute_names
  return unless unknown_keys.any?

  listing = unknown_keys.map(&:to_sym).join(", ")
  errors.add(:base, "Unknown parameters: #{listing}".squish)
end
532
+
533
# Shortens +value+ to the limit of the column backing +attribute_name+.
# The value is returned unchanged when the column has no limit; a nil value
# stays nil.
#
# @param attribute_name [Symbol] the attribute whose column limit applies.
# @param value [String, nil] the value to shorten.
# @return [String, nil] the value, cut to the column limit if there is one.
def truncate(attribute_name, value)
  max_length = self.class.column_for_attribute(attribute_name).limit
  max_length ? value&.first(max_length) : value
end
539
+
540
# Returns the memoized parameter filter used to mask arguments. It combines
# the application's configured filter_parameters with the Task's own
# masked_arguments.
def argument_filter
  @argument_filter ||= begin
    filters = Rails.application.config.filter_parameters + task.masked_arguments
    ActiveSupport::ParameterFilter.new(filters)
  end
end
545
+
546
# Yields to the block, retrying it when an ActiveRecord::StaleObjectError
# is raised. Before each retry the method sleeps for the (jittered) delay
# of that attempt and reloads the run's status. Once +retry_count+ reaches
# MAX_RETRIES the error is re-raised.
#
# @param retry_count [Integer] number of retries already used.
# @return the value returned by the block.
def with_stale_object_retry(retry_count = 0)
  yield
rescue ActiveRecord::StaleObjectError
  raise if retry_count >= MAX_RETRIES

  sleep(stale_object_retry_delay(retry_count))
  retry_count += 1
  reload_status

  retry
end
559
+
560
# Computes how long to sleep before the given retry: the delay configured
# in DELAYS_PER_ATTEMPT for that attempt, shifted by a random amount of up
# to 25% in either direction.
#
# @param retry_count [Integer] index of the retry attempt.
# @return [Float] the delay in seconds.
def stale_object_retry_delay(retry_count)
  base_delay = DELAYS_PER_ATTEMPT[retry_count]
  # Add jitter (±25% randomization) to prevent thundering herd
  spread = base_delay * 0.25
  offset = (rand * 2 - 1) * spread
  base_delay + offset
end
566
+ end
567
+ end