rocketjob 6.0.0.rc3 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +24 -20
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  11. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  12. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  13. data/lib/rocket_job/jobs/on_demand_batch_job.rb +7 -5
  14. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  15. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  16. data/lib/rocket_job/plugins/cron.rb +60 -20
  17. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  18. data/lib/rocket_job/plugins/restart.rb +3 -110
  19. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  20. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +1 -2
  21. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  22. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  23. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  24. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  25. data/lib/rocket_job/sliced/input.rb +42 -54
  26. data/lib/rocket_job/sliced/slice.rb +7 -3
  27. data/lib/rocket_job/sliced/slices.rb +12 -9
  28. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  29. data/lib/rocket_job/sliced.rb +1 -19
  30. data/lib/rocket_job/version.rb +1 -1
  31. data/lib/rocketjob.rb +2 -2
  32. metadata +8 -10
  33. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  34. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  35. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
4
- data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
3
+ metadata.gz: e313f192b854d066258a614ceac1131851c8df94c7e08f7cea6681fff6946d69
4
+ data.tar.gz: 10804682bee08715671696db4610ce4f93679398398bc385e93619f5a3aca715
5
5
  SHA512:
6
- metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
7
- data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
6
+ metadata.gz: 158675e5ddec87a8b277708887b037746e3f1573569edd5a8959eabdf5668b144553cbe6164844f2cf69bc603798b3b5b052697506dad8a29e8477afc62cc45f
7
+ data.tar.gz: 680efe5603de3649b7e09340a545d2e1df1e02697d34af9451869310e9f2a87bbd05423686bb7d40d60f104553f7311721870e52ac2171bb1491b3a8decaf439
data/README.md CHANGED
@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
49
49
 
50
50
  It is important to migrate away from these plugins, since they will be removed in a future release.
51
51
 
52
+ #### Scheduled Jobs
53
+
54
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
55
+ so that the scheduled job instance is created immediately after the currently scheduled instance starts.
56
+
57
+ To maintain the old behavior, where the next scheduled instance is only created after the current one fails, aborts, or completes, add the following line
58
+ to each of the applicable jobs:
59
+
60
+ ~~~ruby
61
+ self.cron_after_start = false
62
+ ~~~
63
+
64
+ Additionally, scheduled jobs will now prevent a new one from being created when another scheduled instance
65
+ of the same job with the _same_ `cron_schedule` is already queued or running.
66
+
67
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
68
+ line to each of the applicable jobs:
69
+
70
+ ~~~ruby
71
+ self.cron_singleton = false
72
+ ~~~
73
+
74
+ ##### Singleton
75
+
76
+ Since scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
77
+
52
78
  #### Upgrading Batch Jobs to Rocket Job v6
53
79
 
54
80
  Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`
@@ -72,34 +72,38 @@ module RocketJob
72
72
  end
73
73
 
74
74
  def input_category(category_name = :main)
75
+ return category_name if category_name.is_a?(Category::Input)
76
+ raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
77
+
75
78
  category_name = category_name.to_sym
76
- category = nil
77
- # .find does not work against this association
78
- input_categories.each { |catg| category = catg if catg.name == category_name }
79
- unless category
80
- # Auto-register main input category if missing
81
- if category_name == :main
82
- category = Category::Input.new
83
- self.input_categories = [category]
84
- else
85
- raise(ArgumentError,
86
- "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
87
- end
79
+ # find does not work against this association
80
+ input_categories.each { |category| return category if category.name == category_name }
81
+
82
+ unless category_name == :main
83
+ raise(
84
+ ArgumentError,
85
+ "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
86
+ )
88
87
  end
88
+
89
+ # Auto-register main input category when not defined
90
+ category = Category::Input.new(job: self)
91
+ self.input_categories << category
89
92
  category
90
93
  end
91
94
 
92
95
  def output_category(category_name = :main)
96
+ return category_name if category_name.is_a?(Category::Output)
97
+ raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
98
+
93
99
  category_name = category_name.to_sym
94
- category = nil
95
100
  # .find does not work against this association
96
- output_categories.each { |catg| category = catg if catg.name == category_name }
97
- unless category
98
- raise(ArgumentError,
99
- "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
100
- end
101
+ output_categories.each { |category| return category if category.name == category_name }
101
102
 
102
- category
103
+ raise(
104
+ ArgumentError,
105
+ "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
106
+ )
103
107
  end
104
108
 
105
109
  # Returns [true|false] whether the named category has already been defined
@@ -214,7 +218,7 @@ module RocketJob
214
218
  category.tabular.render(row)
215
219
  end
216
220
 
217
- # Migrate existing v4 batch jobs to v5.0
221
+ # Migrate existing v5 batch jobs to v6
218
222
  def rocketjob_categories_migrate
219
223
  return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
220
224
 
@@ -14,11 +14,9 @@ module RocketJob
14
14
  # Default: None ( Uses the single default input collection for this job )
15
15
  # Validates: This value must be one of those listed in #input_categories
16
16
  def input(category = :main)
17
- raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
17
+ category = input_category(category)
18
18
 
19
- category = input_category(category) unless category.is_a?(Category::Input)
20
-
21
- (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
19
+ (@inputs ||= {})[category.name] ||= category.data_store(self)
22
20
  end
23
21
 
24
22
  # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
30
28
  # Default: None ( Uses the single default output collection for this job )
31
29
  # Validates: This value must be one of those listed in #output_categories
32
30
  def output(category = :main)
33
- raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
34
-
35
- category = output_category(category) unless category.is_a?(Category::Output)
31
+ category = output_category(category)
36
32
 
37
- (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
33
+ (@outputs ||= {})[category.name] ||= category.data_store(self)
38
34
  end
39
35
 
40
36
  # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
59
55
  # The category or the name of the category to access or download data from
60
56
  # Default: None ( Uses the single default output collection for this job )
61
57
  # Validates: This value must be one of those listed in #input_categories
62
- def lookup_collection(category = :main)
63
- category = input_category(category) unless category.is_a?(Category::Input)
64
-
65
- collection = (@lookup_collections ||= {})[category.name]
66
-
67
- unless collection
68
- collection_name = "rocket_job.inputs.#{id}"
69
- collection_name << ".#{category.name}" unless category.name == :main
70
-
71
- @lookup_collections[category.name] ||=
72
- LookupCollection.new(Sliced::Slice.collection.database, collection_name)
73
- end
74
- end
58
+ # def lookup_collection(category = :main)
59
+ # category = input_category(category) unless category.is_a?(Category::Input)
60
+ #
61
+ # collection = (@lookup_collections ||= {})[category.name]
62
+ #
63
+ # unless collection
64
+ # collection_name = "rocket_job.inputs.#{id}"
65
+ # collection_name << ".#{category.name}" unless category.name == :main
66
+ #
67
+ # @lookup_collections[category.name] ||=
68
+ # LookupCollection.new(Sliced::Slice.collection.database, collection_name)
69
+ # end
70
+ # end
75
71
 
76
72
  # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
77
73
  #
@@ -154,53 +150,7 @@ module RocketJob
154
150
  # * If an io stream is supplied, it is read until it returns nil.
155
151
  # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
156
152
  # * CSV parsing is slow, so it is usually left for the workers to do.
157
- def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
158
- raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
159
-
160
- category = input_category(category) unless category.is_a?(Category::Input)
161
- stream ||= category.file_name
162
- path = nil
163
-
164
- if stream
165
- path = IOStreams.new(stream)
166
- path.file_name = file_name if file_name
167
- category.file_name = path.file_name
168
-
169
- # Auto detect the format based on the upload file name if present.
170
- if category.format == :auto
171
- format = path.format
172
- if format
173
- # Rebuild tabular with the above file name
174
- category.reset_tabular
175
- category.format = format
176
- end
177
- end
178
- end
179
-
180
- # Tabular transformations required for upload?
181
- if category.tabular?
182
- # Remove non-printable characters from tabular input formats
183
- # Cannot change the length of fixed width lines
184
- replace = category.format == :fixed ? " " : ""
185
- path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
186
-
187
- # Extract the header line during the file upload when needed.
188
- on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
189
- end
190
-
191
- count =
192
- if block
193
- input(category).upload(on_first: on_first, &block)
194
- else
195
- input(category).upload(on_first: on_first) do |io|
196
- path.each(stream_mode, **args) { |line| io << line }
197
- end
198
- end
199
-
200
- self.record_count = (record_count || 0) + count
201
- count
202
- end
203
-
153
+ #
204
154
  # Upload results from an Arel into RocketJob::SlicedJob.
205
155
  #
206
156
  # Params
@@ -227,18 +177,13 @@ module RocketJob
227
177
  #
228
178
  # Example: Upload user_name and zip_code
229
179
  # arel = User.where(country_code: 'US')
230
- # job.upload_arel(arel, :user_name, :zip_code)
180
+ # job.upload_arel(arel, columns: [:user_name, :zip_code])
231
181
  #
232
182
  # Notes:
233
183
  # * Only call from one thread at a time against a single instance of this job.
234
184
  # * The record_count for the job is set to the number of records returned by the arel.
235
185
  # * If an exception is raised while uploading data, the input collection is cleared out
236
186
  # so that if a job is retried during an upload failure, data is not duplicated.
237
- def upload_arel(arel, *column_names, category: :main, &block)
238
- count = input(category).upload_arel(arel, *column_names, &block)
239
- self.record_count = (record_count || 0) + count
240
- count
241
- end
242
187
 
243
188
  # Upload the result of a MongoDB query to the input collection for processing
244
189
  # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
266
211
  # criteria = User.where(state: 'FL')
267
212
  # job.record_count = job.upload_mongo_query(criteria)
268
213
  #
269
- # Example: Upload just the supplied column
214
+ # Example: Upload only the specified column(s)
270
215
  # criteria = User.where(state: 'FL')
271
- # job.record_count = job.upload_mongo_query(criteria, :zip_code)
216
+ # job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
272
217
  #
273
218
  # Notes:
274
219
  # * Only call from one thread at a time against a single instance of this job.
275
220
  # * The record_count for the job is set to the number of records returned by the monqo query.
276
221
  # * If an exception is raised while uploading data, the input collection is cleared out
277
222
  # so that if a job is retried during an upload failure, data is not duplicated.
278
- def upload_mongo_query(criteria, *column_names, category: :main, &block)
279
- count = input(category).upload_mongo_query(criteria, *column_names, &block)
280
- self.record_count = (record_count || 0) + count
281
- count
282
- end
283
223
 
284
224
  # Upload sliced range of integer requests as arrays of start and end ids.
285
225
  #
286
- # Returns [Integer] last_id - start_id + 1.
226
+ # Returns [Integer] the number of slices uploaded.
287
227
  #
288
228
  # Uploads one range per slice so that the response can return multiple records
289
229
  # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
302
242
  # * The record_count for the job is set to: last_id - start_id + 1.
303
243
  # * If an exception is raised while uploading data, the input collection is cleared out
304
244
  # so that if a job is retried during an upload failure, data is not duplicated.
305
- def upload_integer_range(start_id, last_id, category: :main)
306
- input(category).upload_integer_range(start_id, last_id)
307
- count = last_id - start_id + 1
308
- self.record_count = (record_count || 0) + count
309
- count
310
- end
311
245
 
312
246
  # Upload sliced range of integer requests as an arrays of start and end ids
313
247
  # starting with the last range first
314
248
  #
315
- # Returns [Integer] last_id - start_id + 1.
249
+ # Returns [Integer] the number of slices uploaded.
316
250
  #
317
251
  # Uploads one range per slice so that the response can return multiple records
318
252
  # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
334
268
  # * The record_count for the job is set to: last_id - start_id + 1.
335
269
  # * If an exception is raised while uploading data, the input collection is cleared out
336
270
  # so that if a job is retried during an upload failure, data is not duplicated.
337
- def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
338
- input(category).upload_integer_range_in_reverse_order(start_id, last_id)
339
- count = last_id - start_id + 1
271
+
272
+ def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
273
+ input_collection = input(category)
274
+
275
+ if block
276
+ raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
277
+ if stream_mode || columns || slice_batch_size || args.size > 0
278
+ raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
279
+ end
280
+
281
+ category = input_category(category)
282
+ category.file_name = file_name if file_name
283
+
284
+ # Extract the header line during the upload when applicable.
285
+ extract_header = category.extract_header_callback(on_first)
286
+
287
+ count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
288
+ self.record_count = (record_count || 0) + count
289
+ return count
290
+ end
291
+
292
+ count =
293
+ case object
294
+ when Range
295
+ if file_name || stream_mode || on_first || args.size > 0
296
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
297
+ end
298
+
299
+ first = object.first
300
+ last = object.last
301
+ if first < last
302
+ input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
303
+ else
304
+ input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
305
+ end
306
+ when Mongoid::Criteria
307
+ if file_name || stream_mode || on_first || args.size > 0
308
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
309
+ end
310
+
311
+ input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
312
+ when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
313
+ if file_name || stream_mode || on_first || args.size > 0
314
+ raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
315
+ end
316
+
317
+ input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
318
+
319
+ else
320
+ raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
321
+
322
+ category = input_category(category)
323
+
324
+ # Extract the header line during the upload when applicable.
325
+ extract_header = category.extract_header_callback(on_first)
326
+ path = category.upload_path(object, original_file_name: file_name)
327
+
328
+ input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
329
+ path.each(stream_mode || :line, **args) { |line| io << line }
330
+ end
331
+
332
+ end
333
+
334
+ self.record_count = (record_count || 0) + count
335
+ count
336
+ end
337
+
338
+ # @deprecated
339
+ def upload_arel(arel, *column_names, category: :main, &block)
340
+ count = input(category).upload_arel(arel, columns: column_names, &block)
340
341
  self.record_count = (record_count || 0) + count
341
342
  count
342
343
  end
343
344
 
344
- # Upload the supplied slices for processing by workers
345
+ # @deprecated
346
+ def upload_mongo_query(criteria, *column_names, category: :main, &block)
347
+ count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
348
+ self.record_count = (record_count || 0) + count
349
+ count
350
+ end
351
+
352
+ # @deprecated
353
+ def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
354
+ count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
355
+ self.record_count = (record_count || 0) + count
356
+ count
357
+ end
358
+
359
+ # @deprecated
360
+ def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
361
+ count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
362
+ self.record_count = (record_count || 0) + count
363
+ count
364
+ end
365
+
366
+ # Upload the supplied slice for processing by workers
345
367
  #
346
368
  # Updates the record_count after adding the records
347
369
  #
@@ -427,50 +449,28 @@ module RocketJob
427
449
  # Store the output file name in the category
428
450
  category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
429
451
 
430
- if output_collection.binary?
431
- raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
432
-
433
- return output_collection.download(&block) if block
452
+ header_line ||= category.render_header
434
453
 
435
- IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
436
- output_collection.download { |record| io << record[:binary] }
437
- end
438
- else
439
- header_line ||= category.render_header
454
+ return output_collection.download(header_line: header_line, &block) if block
440
455
 
441
- return output_collection.download(header_line: header_line, &block) if block
456
+ raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
442
457
 
443
- raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
458
+ if output_collection.slice_class.binary_format
459
+ binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
444
460
 
461
+ # Don't overwrite supplied stream options if any
462
+ stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
463
+ stream.remove_from_pipeline(output_collection.slice_class.binary_format)
464
+ stream.writer(**args) do |io|
465
+ # TODO: Binary formats should return the record count, instead of the slice count.
466
+ output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
467
+ end
468
+ else
445
469
  IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
446
470
  output_collection.download(header_line: header_line) { |record| io << record }
447
471
  end
448
472
  end
449
473
  end
450
-
451
- private
452
-
453
- # Return a lambda to extract the header row from the uploaded file.
454
- def rocket_job_upload_header_lambda(category, on_first)
455
- case category.mode
456
- when :line
457
- lambda do |line|
458
- category.tabular.parse_header(line)
459
- category.cleanse_header!
460
- category.columns = category.tabular.header.columns
461
- # Call chained on_first if present
462
- on_first&.call(line)
463
- end
464
- when :array
465
- lambda do |row|
466
- category.tabular.header.columns = row
467
- category.cleanse_header!
468
- category.columns = category.tabular.header.columns
469
- # Call chained on_first if present
470
- on_first&.call(line)
471
- end
472
- end
473
- end
474
474
  end
475
475
  end
476
476
  end