rocketjob 6.0.0.rc3 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
4
- data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
3
+ metadata.gz: e313f192b854d066258a614ceac1131851c8df94c7e08f7cea6681fff6946d69
4
+ data.tar.gz: 10804682bee08715671696db4610ce4f93679398398bc385e93619f5a3aca715
5
5
  SHA512:
6
- metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
7
- data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
6
+ metadata.gz: 158675e5ddec87a8b277708887b037746e3f1573569edd5a8959eabdf5668b144553cbe6164844f2cf69bc603798b3b5b052697506dad8a29e8477afc62cc45f
7
+ data.tar.gz: 680efe5603de3649b7e09340a545d2e1df1e02697d34af9451869310e9f2a87bbd05423686bb7d40d60f104553f7311721870e52ac2171bb1491b3a8decaf439
data/README.md CHANGED
@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
49
49
 
50
50
  It is important to migrate away from these plugins, since they will be removed in a future release.
51
51
 
52
+ #### Scheduled Jobs
53
+
54
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
55
+ so that the scheduled job instance is created immediately after the currently scheduled instance starts.
56
+
57
+ To maintain the old behavior of creating the job when it fails, aborts, or completes, add the following line
58
+ to each of the applicable jobs:
59
+
60
+ ~~~ruby
61
+ self.cron_after_start = false
62
+ ~~~
63
+
64
+ Additionally, scheduled jobs will now prevent a new one from being created when another scheduled instance
65
+ of the same job is already queued, or running with the _same_ `cron_schedule`.
66
+
67
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
68
+ line to each of the applicable jobs:
69
+
70
+ ~~~ruby
71
+ self.cron_singleton = false
72
+ ~~~
73
+
74
+ ##### Singleton
75
+
76
+ Since Scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
77
+
52
78
  #### Upgrading Batch Jobs to Rocket Job v6
53
79
 
54
80
  Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`
@@ -72,34 +72,38 @@ module RocketJob
72
72
  end
73
73
 
74
74
  def input_category(category_name = :main)
75
+ return category_name if category_name.is_a?(Category::Input)
76
+ raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
77
+
75
78
  category_name = category_name.to_sym
76
- category = nil
77
- # .find does not work against this association
78
- input_categories.each { |catg| category = catg if catg.name == category_name }
79
- unless category
80
- # Auto-register main input category if missing
81
- if category_name == :main
82
- category = Category::Input.new
83
- self.input_categories = [category]
84
- else
85
- raise(ArgumentError,
86
- "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
87
- end
79
+ # find does not work against this association
80
+ input_categories.each { |category| return category if category.name == category_name }
81
+
82
+ unless category_name == :main
83
+ raise(
84
+ ArgumentError,
85
+ "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
86
+ )
88
87
  end
88
+
89
+ # Auto-register main input category when not defined
90
+ category = Category::Input.new(job: self)
91
+ self.input_categories << category
89
92
  category
90
93
  end
91
94
 
92
95
  def output_category(category_name = :main)
96
+ return category_name if category_name.is_a?(Category::Output)
97
+ raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
98
+
93
99
  category_name = category_name.to_sym
94
- category = nil
95
100
  # .find does not work against this association
96
- output_categories.each { |catg| category = catg if catg.name == category_name }
97
- unless category
98
- raise(ArgumentError,
99
- "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
100
- end
101
+ output_categories.each { |category| return category if category.name == category_name }
101
102
 
102
- category
103
+ raise(
104
+ ArgumentError,
105
+ "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
106
+ )
103
107
  end
104
108
 
105
109
  # Returns [true|false] whether the named category has already been defined
@@ -214,7 +218,7 @@ module RocketJob
214
218
  category.tabular.render(row)
215
219
  end
216
220
 
217
- # Migrate existing v4 batch jobs to v5.0
221
+ # Migrate existing v5 batch jobs to v6
218
222
  def rocketjob_categories_migrate
219
223
  return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
220
224
 
@@ -14,11 +14,9 @@ module RocketJob
14
14
  # Default: None ( Uses the single default input collection for this job )
15
15
  # Validates: This value must be one of those listed in #input_categories
16
16
  def input(category = :main)
17
- raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
17
+ category = input_category(category)
18
18
 
19
- category = input_category(category) unless category.is_a?(Category::Input)
20
-
21
- (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
19
+ (@inputs ||= {})[category.name] ||= category.data_store(self)
22
20
  end
23
21
 
24
22
  # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
30
28
  # Default: None ( Uses the single default output collection for this job )
31
29
  # Validates: This value must be one of those listed in #output_categories
32
30
  def output(category = :main)
33
- raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
34
-
35
- category = output_category(category) unless category.is_a?(Category::Output)
31
+ category = output_category(category)
36
32
 
37
- (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
33
+ (@outputs ||= {})[category.name] ||= category.data_store(self)
38
34
  end
39
35
 
40
36
  # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
59
55
  # The category or the name of the category to access or download data from
60
56
  # Default: None ( Uses the single default output collection for this job )
61
57
  # Validates: This value must be one of those listed in #input_categories
62
- def lookup_collection(category = :main)
63
- category = input_category(category) unless category.is_a?(Category::Input)
64
-
65
- collection = (@lookup_collections ||= {})[category.name]
66
-
67
- unless collection
68
- collection_name = "rocket_job.inputs.#{id}"
69
- collection_name << ".#{category.name}" unless category.name == :main
70
-
71
- @lookup_collections[category.name] ||=
72
- LookupCollection.new(Sliced::Slice.collection.database, collection_name)
73
- end
74
- end
58
+ # def lookup_collection(category = :main)
59
+ # category = input_category(category) unless category.is_a?(Category::Input)
60
+ #
61
+ # collection = (@lookup_collections ||= {})[category.name]
62
+ #
63
+ # unless collection
64
+ # collection_name = "rocket_job.inputs.#{id}"
65
+ # collection_name << ".#{category.name}" unless category.name == :main
66
+ #
67
+ # @lookup_collections[category.name] ||=
68
+ # LookupCollection.new(Sliced::Slice.collection.database, collection_name)
69
+ # end
70
+ # end
75
71
 
76
72
  # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
77
73
  #
@@ -154,53 +150,7 @@ module RocketJob
154
150
  # * If an io stream is supplied, it is read until it returns nil.
155
151
  # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
156
152
  # * CSV parsing is slow, so it is usually left for the workers to do.
157
- def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
158
- raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
159
-
160
- category = input_category(category) unless category.is_a?(Category::Input)
161
- stream ||= category.file_name
162
- path = nil
163
-
164
- if stream
165
- path = IOStreams.new(stream)
166
- path.file_name = file_name if file_name
167
- category.file_name = path.file_name
168
-
169
- # Auto detect the format based on the upload file name if present.
170
- if category.format == :auto
171
- format = path.format
172
- if format
173
- # Rebuild tabular with the above file name
174
- category.reset_tabular
175
- category.format = format
176
- end
177
- end
178
- end
179
-
180
- # Tabular transformations required for upload?
181
- if category.tabular?
182
- # Remove non-printable characters from tabular input formats
183
- # Cannot change the length of fixed width lines
184
- replace = category.format == :fixed ? " " : ""
185
- path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
186
-
187
- # Extract the header line during the file upload when needed.
188
- on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
189
- end
190
-
191
- count =
192
- if block
193
- input(category).upload(on_first: on_first, &block)
194
- else
195
- input(category).upload(on_first: on_first) do |io|
196
- path.each(stream_mode, **args) { |line| io << line }
197
- end
198
- end
199
-
200
- self.record_count = (record_count || 0) + count
201
- count
202
- end
203
-
153
+ #
204
154
  # Upload results from an Arel into RocketJob::SlicedJob.
205
155
  #
206
156
  # Params
@@ -227,18 +177,13 @@ module RocketJob
227
177
  #
228
178
  # Example: Upload user_name and zip_code
229
179
  # arel = User.where(country_code: 'US')
230
- # job.upload_arel(arel, :user_name, :zip_code)
180
+ # job.upload_arel(arel, columns: [:user_name, :zip_code])
231
181
  #
232
182
  # Notes:
233
183
  # * Only call from one thread at a time against a single instance of this job.
234
184
  # * The record_count for the job is set to the number of records returned by the arel.
235
185
  # * If an exception is raised while uploading data, the input collection is cleared out
236
186
  # so that if a job is retried during an upload failure, data is not duplicated.
237
- def upload_arel(arel, *column_names, category: :main, &block)
238
- count = input(category).upload_arel(arel, *column_names, &block)
239
- self.record_count = (record_count || 0) + count
240
- count
241
- end
242
187
 
243
188
  # Upload the result of a MongoDB query to the input collection for processing
244
189
  # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
266
211
  # criteria = User.where(state: 'FL')
267
212
  # job.record_count = job.upload_mongo_query(criteria)
268
213
  #
269
- # Example: Upload just the supplied column
214
+ # Example: Upload only the specified column(s)
270
215
  # criteria = User.where(state: 'FL')
271
- # job.record_count = job.upload_mongo_query(criteria, :zip_code)
216
+ # job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
272
217
  #
273
218
  # Notes:
274
219
  # * Only call from one thread at a time against a single instance of this job.
275
220
  # * The record_count for the job is set to the number of records returned by the mongo query.
276
221
  # * If an exception is raised while uploading data, the input collection is cleared out
277
222
  # so that if a job is retried during an upload failure, data is not duplicated.
278
- def upload_mongo_query(criteria, *column_names, category: :main, &block)
279
- count = input(category).upload_mongo_query(criteria, *column_names, &block)
280
- self.record_count = (record_count || 0) + count
281
- count
282
- end
283
223
 
284
224
  # Upload sliced range of integer requests as arrays of start and end ids.
285
225
  #
286
- # Returns [Integer] last_id - start_id + 1.
226
+ # Returns [Integer] the number of slices uploaded.
287
227
  #
288
228
  # Uploads one range per slice so that the response can return multiple records
289
229
  # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
302
242
  # * The record_count for the job is set to: last_id - start_id + 1.
303
243
  # * If an exception is raised while uploading data, the input collection is cleared out
304
244
  # so that if a job is retried during an upload failure, data is not duplicated.
305
- def upload_integer_range(start_id, last_id, category: :main)
306
- input(category).upload_integer_range(start_id, last_id)
307
- count = last_id - start_id + 1
308
- self.record_count = (record_count || 0) + count
309
- count
310
- end
311
245
 
312
246
  # Upload sliced range of integer requests as an arrays of start and end ids
313
247
  # starting with the last range first
314
248
  #
315
- # Returns [Integer] last_id - start_id + 1.
249
+ # Returns [Integer] the number of slices uploaded.
316
250
  #
317
251
  # Uploads one range per slice so that the response can return multiple records
318
252
  # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
334
268
  # * The record_count for the job is set to: last_id - start_id + 1.
335
269
  # * If an exception is raised while uploading data, the input collection is cleared out
336
270
  # so that if a job is retried during an upload failure, data is not duplicated.
337
- def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
338
- input(category).upload_integer_range_in_reverse_order(start_id, last_id)
339
- count = last_id - start_id + 1
271
+
272
+ def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
273
+ input_collection = input(category)
274
+
275
+ if block
276
+ raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
277
+ if stream_mode || columns || slice_batch_size || args.size > 0
278
+ raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
279
+ end
280
+
281
+ category = input_category(category)
282
+ category.file_name = file_name if file_name
283
+
284
+ # Extract the header line during the upload when applicable.
285
+ extract_header = category.extract_header_callback(on_first)
286
+
287
+ count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
288
+ self.record_count = (record_count || 0) + count
289
+ return count
290
+ end
291
+
292
+ count =
293
+ case object
294
+ when Range
295
+ if file_name || stream_mode || on_first || args.size > 0
296
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
297
+ end
298
+
299
+ first = object.first
300
+ last = object.last
301
+ if first < last
302
+ input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
303
+ else
304
+ input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
305
+ end
306
+ when Mongoid::Criteria
307
+ if file_name || stream_mode || on_first || args.size > 0
308
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
309
+ end
310
+
311
+ input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
312
+ when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
313
+ if file_name || stream_mode || on_first || args.size > 0
314
+ raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
315
+ end
316
+
317
+ input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
318
+
319
+ else
320
+ raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
321
+
322
+ category = input_category(category)
323
+
324
+ # Extract the header line during the upload when applicable.
325
+ extract_header = category.extract_header_callback(on_first)
326
+ path = category.upload_path(object, original_file_name: file_name)
327
+
328
+ input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
329
+ path.each(stream_mode || :line, **args) { |line| io << line }
330
+ end
331
+
332
+ end
333
+
334
+ self.record_count = (record_count || 0) + count
335
+ count
336
+ end
337
+
338
+ # @deprecated
339
+ def upload_arel(arel, *column_names, category: :main, &block)
340
+ count = input(category).upload_arel(arel, columns: column_names, &block)
340
341
  self.record_count = (record_count || 0) + count
341
342
  count
342
343
  end
343
344
 
344
- # Upload the supplied slices for processing by workers
345
+ # @deprecated
346
+ def upload_mongo_query(criteria, *column_names, category: :main, &block)
347
+ count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
348
+ self.record_count = (record_count || 0) + count
349
+ count
350
+ end
351
+
352
+ # @deprecated
353
+ def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
354
+ count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
355
+ self.record_count = (record_count || 0) + count
356
+ count
357
+ end
358
+
359
+ # @deprecated
360
+ def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
361
+ count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
362
+ self.record_count = (record_count || 0) + count
363
+ count
364
+ end
365
+
366
+ # Upload the supplied slice for processing by workers
345
367
  #
346
368
  # Updates the record_count after adding the records
347
369
  #
@@ -427,50 +449,28 @@ module RocketJob
427
449
  # Store the output file name in the category
428
450
  category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
429
451
 
430
- if output_collection.binary?
431
- raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
432
-
433
- return output_collection.download(&block) if block
452
+ header_line ||= category.render_header
434
453
 
435
- IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
436
- output_collection.download { |record| io << record[:binary] }
437
- end
438
- else
439
- header_line ||= category.render_header
454
+ return output_collection.download(header_line: header_line, &block) if block
440
455
 
441
- return output_collection.download(header_line: header_line, &block) if block
456
+ raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
442
457
 
443
- raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
458
+ if output_collection.slice_class.binary_format
459
+ binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
444
460
 
461
+ # Don't overwrite supplied stream options if any
462
+ stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
463
+ stream.remove_from_pipeline(output_collection.slice_class.binary_format)
464
+ stream.writer(**args) do |io|
465
+ # TODO: Binary formats should return the record count, instead of the slice count.
466
+ output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
467
+ end
468
+ else
445
469
  IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
446
470
  output_collection.download(header_line: header_line) { |record| io << record }
447
471
  end
448
472
  end
449
473
  end
450
-
451
- private
452
-
453
- # Return a lambda to extract the header row from the uploaded file.
454
- def rocket_job_upload_header_lambda(category, on_first)
455
- case category.mode
456
- when :line
457
- lambda do |line|
458
- category.tabular.parse_header(line)
459
- category.cleanse_header!
460
- category.columns = category.tabular.header.columns
461
- # Call chained on_first if present
462
- on_first&.call(line)
463
- end
464
- when :array
465
- lambda do |row|
466
- category.tabular.header.columns = row
467
- category.cleanse_header!
468
- category.columns = category.tabular.header.columns
469
- # Call chained on_first if present
470
- on_first&.call(line)
471
- end
472
- end
473
- end
474
474
  end
475
475
  end
476
476
  end