rocketjob 6.0.0.rc3 → 6.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +26 -24
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/job_exception.rb +1 -1
  11. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  12. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  13. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  14. data/lib/rocket_job/jobs/on_demand_batch_job.rb +11 -5
  15. data/lib/rocket_job/jobs/on_demand_job.rb +6 -2
  16. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  17. data/lib/rocket_job/plugins/cron.rb +60 -20
  18. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  19. data/lib/rocket_job/plugins/restart.rb +3 -110
  20. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  21. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +10 -5
  22. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  23. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  24. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  25. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  26. data/lib/rocket_job/sliced/input.rb +42 -54
  27. data/lib/rocket_job/sliced/slice.rb +7 -3
  28. data/lib/rocket_job/sliced/slices.rb +12 -9
  29. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  30. data/lib/rocket_job/sliced.rb +1 -19
  31. data/lib/rocket_job/subscribers/secret_config.rb +17 -0
  32. data/lib/rocket_job/supervisor.rb +1 -0
  33. data/lib/rocket_job/version.rb +1 -1
  34. data/lib/rocketjob.rb +4 -3
  35. metadata +11 -12
  36. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  37. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  38. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
4
- data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
3
+ metadata.gz: d37fc69678a20d2ab48a22daccbbff3238a4511d9fc711873c70640abe4b5d81
4
+ data.tar.gz: df5bc46066bf3359c7e75549cd75545f52fde6c4b5e6c0a4dfae0b63705ca3fe
5
5
  SHA512:
6
- metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
7
- data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
6
+ metadata.gz: 18c563035502f1c1b9c0a37fb87f0563544ba83de0db5f7dec0ba663540a1db2a32b686cfc64c8056b99d7a3ce503b508ab206270feb277c42e79ca010debdf7
7
+ data.tar.gz: 71e00ec619377e422934ae0682b4322ffea63b3db307c06ab4a33a2e24fa149f599a9a579ab3b4d699cf5684d296d861b1cbbf2abc802b55002e721e35d4070e
data/README.md CHANGED
@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
49
49
 
50
50
  It is important to migrate away from these plugins, since they will be removed in a future release.
51
51
 
52
+ #### Scheduled Jobs
53
+
54
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
55
+ so that the scheduled job instance is created immediately after the currently scheduled instance starts.
56
+
57
+ To maintain the old behavior of creating the job when it fails, aborts, or completes, add the following line
58
+ to each of the applicable jobs:
59
+
60
+ ~~~ruby
61
+ self.cron_after_start = false
62
+ ~~~
63
+
64
+ Additionally, scheduled jobs will now prevent a new one from being created when another scheduled instance
65
+ of the same job is already queued, or running with the _same_ `cron_schedule`.
66
+
67
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
68
+ line to each of the applicable jobs:
69
+
70
+ ~~~ruby
71
+ self.cron_singleton = false
72
+ ~~~
73
+
74
+ ##### Singleton
75
+
76
+ Since Scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
77
+
52
78
  #### Upgrading Batch Jobs to Rocket Job v6
53
79
 
54
80
  Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`
@@ -72,34 +72,37 @@ module RocketJob
72
72
  end
73
73
 
74
74
  def input_category(category_name = :main)
75
+ return category_name if category_name.is_a?(Category::Input)
76
+ raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
77
+
78
+ # Initialize categories when this method is called before initialization is complete
79
+ rocketjob_categories_assign if input_categories.empty?
80
+
75
81
  category_name = category_name.to_sym
76
- category = nil
77
- # .find does not work against this association
78
- input_categories.each { |catg| category = catg if catg.name == category_name }
79
- unless category
80
- # Auto-register main input category if missing
81
- if category_name == :main
82
- category = Category::Input.new
83
- self.input_categories = [category]
84
- else
85
- raise(ArgumentError,
86
- "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
87
- end
88
- end
89
- category
82
+ # find does not work against this association
83
+ input_categories.each { |category| return category if category.name == category_name }
84
+
85
+ raise(
86
+ ArgumentError,
87
+ "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
88
+ )
90
89
  end
91
90
 
92
91
  def output_category(category_name = :main)
92
+ return category_name if category_name.is_a?(Category::Output)
93
+ raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
94
+
95
+ # Initialize categories when this method is called before initialization is complete
96
+ rocketjob_categories_assign if output_categories.empty? && self.class.defined_output_categories
97
+
93
98
  category_name = category_name.to_sym
94
- category = nil
95
99
  # .find does not work against this association
96
- output_categories.each { |catg| category = catg if catg.name == category_name }
97
- unless category
98
- raise(ArgumentError,
99
- "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
100
- end
100
+ output_categories.each { |category| return category if category.name == category_name }
101
101
 
102
- category
102
+ raise(
103
+ ArgumentError,
104
+ "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
105
+ )
103
106
  end
104
107
 
105
108
  # Returns [true|false] whether the named category has already been defined
@@ -150,7 +153,7 @@ module RocketJob
150
153
  end
151
154
  end
152
155
 
153
- return if !self.class.defined_output_categories || !output_categories.empty?
156
+ return if !output_categories.empty? || !self.class.defined_output_categories
154
157
 
155
158
  # Output categories default to nil if none were set in the class
156
159
  self.output_categories = self.class.defined_output_categories.deep_dup
@@ -160,7 +163,6 @@ module RocketJob
160
163
  def rocketjob_categories_output_render
161
164
  return if @rocket_job_output.nil?
162
165
 
163
- # TODO: ..
164
166
  return unless output_categories
165
167
  return if output_categories.empty?
166
168
 
@@ -214,7 +216,7 @@ module RocketJob
214
216
  category.tabular.render(row)
215
217
  end
216
218
 
217
- # Migrate existing v4 batch jobs to v5.0
219
+ # Migrate existing v5 batch jobs to v6
218
220
  def rocketjob_categories_migrate
219
221
  return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
220
222
 
@@ -14,11 +14,9 @@ module RocketJob
14
14
  # Default: None ( Uses the single default input collection for this job )
15
15
  # Validates: This value must be one of those listed in #input_categories
16
16
  def input(category = :main)
17
- raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
17
+ category = input_category(category)
18
18
 
19
- category = input_category(category) unless category.is_a?(Category::Input)
20
-
21
- (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
19
+ (@inputs ||= {})[category.name] ||= category.data_store(self)
22
20
  end
23
21
 
24
22
  # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
30
28
  # Default: None ( Uses the single default output collection for this job )
31
29
  # Validates: This value must be one of those listed in #output_categories
32
30
  def output(category = :main)
33
- raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
34
-
35
- category = output_category(category) unless category.is_a?(Category::Output)
31
+ category = output_category(category)
36
32
 
37
- (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
33
+ (@outputs ||= {})[category.name] ||= category.data_store(self)
38
34
  end
39
35
 
40
36
  # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
59
55
  # The category or the name of the category to access or download data from
60
56
  # Default: None ( Uses the single default output collection for this job )
61
57
  # Validates: This value must be one of those listed in #input_categories
62
- def lookup_collection(category = :main)
63
- category = input_category(category) unless category.is_a?(Category::Input)
64
-
65
- collection = (@lookup_collections ||= {})[category.name]
66
-
67
- unless collection
68
- collection_name = "rocket_job.inputs.#{id}"
69
- collection_name << ".#{category.name}" unless category.name == :main
70
-
71
- @lookup_collections[category.name] ||=
72
- LookupCollection.new(Sliced::Slice.collection.database, collection_name)
73
- end
74
- end
58
+ # def lookup_collection(category = :main)
59
+ # category = input_category(category) unless category.is_a?(Category::Input)
60
+ #
61
+ # collection = (@lookup_collections ||= {})[category.name]
62
+ #
63
+ # unless collection
64
+ # collection_name = "rocket_job.inputs.#{id}"
65
+ # collection_name << ".#{category.name}" unless category.name == :main
66
+ #
67
+ # @lookup_collections[category.name] ||=
68
+ # LookupCollection.new(Sliced::Slice.collection.database, collection_name)
69
+ # end
70
+ # end
75
71
 
76
72
  # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
77
73
  #
@@ -154,53 +150,7 @@ module RocketJob
154
150
  # * If an io stream is supplied, it is read until it returns nil.
155
151
  # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
156
152
  # * CSV parsing is slow, so it is usually left for the workers to do.
157
- def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
158
- raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
159
-
160
- category = input_category(category) unless category.is_a?(Category::Input)
161
- stream ||= category.file_name
162
- path = nil
163
-
164
- if stream
165
- path = IOStreams.new(stream)
166
- path.file_name = file_name if file_name
167
- category.file_name = path.file_name
168
-
169
- # Auto detect the format based on the upload file name if present.
170
- if category.format == :auto
171
- format = path.format
172
- if format
173
- # Rebuild tabular with the above file name
174
- category.reset_tabular
175
- category.format = format
176
- end
177
- end
178
- end
179
-
180
- # Tabular transformations required for upload?
181
- if category.tabular?
182
- # Remove non-printable characters from tabular input formats
183
- # Cannot change the length of fixed width lines
184
- replace = category.format == :fixed ? " " : ""
185
- path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
186
-
187
- # Extract the header line during the file upload when needed.
188
- on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
189
- end
190
-
191
- count =
192
- if block
193
- input(category).upload(on_first: on_first, &block)
194
- else
195
- input(category).upload(on_first: on_first) do |io|
196
- path.each(stream_mode, **args) { |line| io << line }
197
- end
198
- end
199
-
200
- self.record_count = (record_count || 0) + count
201
- count
202
- end
203
-
153
+ #
204
154
  # Upload results from an Arel into RocketJob::SlicedJob.
205
155
  #
206
156
  # Params
@@ -227,18 +177,13 @@ module RocketJob
227
177
  #
228
178
  # Example: Upload user_name and zip_code
229
179
  # arel = User.where(country_code: 'US')
230
- # job.upload_arel(arel, :user_name, :zip_code)
180
+ # job.upload_arel(arel, columns: [:user_name, :zip_code])
231
181
  #
232
182
  # Notes:
233
183
  # * Only call from one thread at a time against a single instance of this job.
234
184
  # * The record_count for the job is set to the number of records returned by the arel.
235
185
  # * If an exception is raised while uploading data, the input collection is cleared out
236
186
  # so that if a job is retried during an upload failure, data is not duplicated.
237
- def upload_arel(arel, *column_names, category: :main, &block)
238
- count = input(category).upload_arel(arel, *column_names, &block)
239
- self.record_count = (record_count || 0) + count
240
- count
241
- end
242
187
 
243
188
  # Upload the result of a MongoDB query to the input collection for processing
244
189
  # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
266
211
  # criteria = User.where(state: 'FL')
267
212
  # job.record_count = job.upload_mongo_query(criteria)
268
213
  #
269
- # Example: Upload just the supplied column
214
+ # Example: Upload only the specified column(s)
270
215
  # criteria = User.where(state: 'FL')
271
- # job.record_count = job.upload_mongo_query(criteria, :zip_code)
216
+ # job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
272
217
  #
273
218
  # Notes:
274
219
  # * Only call from one thread at a time against a single instance of this job.
275
220
  # * The record_count for the job is set to the number of records returned by the mongo query.
276
221
  # * If an exception is raised while uploading data, the input collection is cleared out
277
222
  # so that if a job is retried during an upload failure, data is not duplicated.
278
- def upload_mongo_query(criteria, *column_names, category: :main, &block)
279
- count = input(category).upload_mongo_query(criteria, *column_names, &block)
280
- self.record_count = (record_count || 0) + count
281
- count
282
- end
283
223
 
284
224
  # Upload sliced range of integer requests as arrays of start and end ids.
285
225
  #
286
- # Returns [Integer] last_id - start_id + 1.
226
+ # Returns [Integer] the number of slices uploaded.
287
227
  #
288
228
  # Uploads one range per slice so that the response can return multiple records
289
229
  # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
302
242
  # * The record_count for the job is set to: last_id - start_id + 1.
303
243
  # * If an exception is raised while uploading data, the input collection is cleared out
304
244
  # so that if a job is retried during an upload failure, data is not duplicated.
305
- def upload_integer_range(start_id, last_id, category: :main)
306
- input(category).upload_integer_range(start_id, last_id)
307
- count = last_id - start_id + 1
308
- self.record_count = (record_count || 0) + count
309
- count
310
- end
311
245
 
312
246
  # Upload sliced range of integer requests as arrays of start and end ids
313
247
  # starting with the last range first
314
248
  #
315
- # Returns [Integer] last_id - start_id + 1.
249
+ # Returns [Integer] the number of slices uploaded.
316
250
  #
317
251
  # Uploads one range per slice so that the response can return multiple records
318
252
  # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
334
268
  # * The record_count for the job is set to: last_id - start_id + 1.
335
269
  # * If an exception is raised while uploading data, the input collection is cleared out
336
270
  # so that if a job is retried during an upload failure, data is not duplicated.
337
- def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
338
- input(category).upload_integer_range_in_reverse_order(start_id, last_id)
339
- count = last_id - start_id + 1
271
+
272
+ def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
273
+ input_collection = input(category)
274
+
275
+ if block
276
+ raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
277
+ if stream_mode || columns || slice_batch_size || args.size > 0
278
+ raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
279
+ end
280
+
281
+ category = input_category(category)
282
+ category.file_name = file_name if file_name
283
+
284
+ # Extract the header line during the upload when applicable.
285
+ extract_header = category.extract_header_callback(on_first)
286
+
287
+ count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
288
+ self.record_count = (record_count || 0) + count
289
+ return count
290
+ end
291
+
292
+ count =
293
+ case object
294
+ when Range
295
+ if file_name || stream_mode || on_first || args.size > 0
296
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
297
+ end
298
+
299
+ first = object.first
300
+ last = object.last
301
+ if first < last
302
+ input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
303
+ else
304
+ input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
305
+ end
306
+ when Mongoid::Criteria
307
+ if file_name || stream_mode || on_first || args.size > 0
308
+ raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
309
+ end
310
+
311
+ input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
312
+ when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
313
+ if file_name || stream_mode || on_first || args.size > 0
314
+ raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
315
+ end
316
+
317
+ input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
318
+
319
+ else
320
+ raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
321
+
322
+ category = input_category(category)
323
+
324
+ # Extract the header line during the upload when applicable.
325
+ extract_header = category.extract_header_callback(on_first)
326
+ path = category.upload_path(object, original_file_name: file_name)
327
+
328
+ input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
329
+ path.each(stream_mode || :line, **args) { |line| io << line }
330
+ end
331
+
332
+ end
333
+
334
+ self.record_count = (record_count || 0) + count
335
+ count
336
+ end
337
+
338
+ # @deprecated
339
+ def upload_arel(arel, *column_names, category: :main, &block)
340
+ count = input(category).upload_arel(arel, columns: column_names, &block)
340
341
  self.record_count = (record_count || 0) + count
341
342
  count
342
343
  end
343
344
 
344
- # Upload the supplied slices for processing by workers
345
+ # @deprecated
346
+ def upload_mongo_query(criteria, *column_names, category: :main, &block)
347
+ count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
348
+ self.record_count = (record_count || 0) + count
349
+ count
350
+ end
351
+
352
+ # @deprecated
353
+ def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
354
+ count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
355
+ self.record_count = (record_count || 0) + count
356
+ count
357
+ end
358
+
359
+ # @deprecated
360
+ def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
361
+ count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
362
+ self.record_count = (record_count || 0) + count
363
+ count
364
+ end
365
+
366
+ # Upload the supplied slice for processing by workers
345
367
  #
346
368
  # Updates the record_count after adding the records
347
369
  #
@@ -427,50 +449,28 @@ module RocketJob
427
449
  # Store the output file name in the category
428
450
  category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
429
451
 
430
- if output_collection.binary?
431
- raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
432
-
433
- return output_collection.download(&block) if block
452
+ header_line ||= category.render_header
434
453
 
435
- IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
436
- output_collection.download { |record| io << record[:binary] }
437
- end
438
- else
439
- header_line ||= category.render_header
454
+ return output_collection.download(header_line: header_line, &block) if block
440
455
 
441
- return output_collection.download(header_line: header_line, &block) if block
456
+ raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
442
457
 
443
- raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
458
+ if output_collection.slice_class.binary_format
459
+ binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
444
460
 
461
+ # Don't overwrite supplied stream options if any
462
+ stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
463
+ stream.remove_from_pipeline(output_collection.slice_class.binary_format)
464
+ stream.writer(**args) do |io|
465
+ # TODO: Binary formats should return the record count, instead of the slice count.
466
+ output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
467
+ end
468
+ else
445
469
  IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
446
470
  output_collection.download(header_line: header_line) { |record| io << record }
447
471
  end
448
472
  end
449
473
  end
450
-
451
- private
452
-
453
- # Return a lambda to extract the header row from the uploaded file.
454
- def rocket_job_upload_header_lambda(category, on_first)
455
- case category.mode
456
- when :line
457
- lambda do |line|
458
- category.tabular.parse_header(line)
459
- category.cleanse_header!
460
- category.columns = category.tabular.header.columns
461
- # Call chained on_first if present
462
- on_first&.call(line)
463
- end
464
- when :array
465
- lambda do |row|
466
- category.tabular.header.columns = row
467
- category.cleanse_header!
468
- category.columns = category.tabular.header.columns
469
- # Call chained on_first if present
470
- on_first&.call(line)
471
- end
472
- end
473
- end
474
474
  end
475
475
  end
476
476
  end
@@ -67,6 +67,8 @@ module RocketJob
67
67
  # Returns [Integer] the number of records processed in the slice
68
68
  #
69
69
  # Note: The slice will be removed from processing when this method completes
70
+ #
71
+ # @deprecated Please open a ticket if you need this behavior.
70
72
  def work_first_slice(&block)
71
73
  raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
72
74
 
@@ -142,19 +144,19 @@ module RocketJob
142
144
  # Perform individual slice without callbacks
143
145
  def rocket_job_perform_slice(slice, &block)
144
146
  slice.processing_record_number ||= 0
145
- records = []
146
147
  append = false
147
148
 
148
- # Skip processed records in this slice if it has no output categpries.
149
- if slice.processing_record_number > 1
150
- records = slice.records[slice.processing_record_number - 1..-1]
151
- append = true
152
- logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
153
- else
154
- # Reprocess all records in this slice.
155
- slice.processing_record_number = 0
156
- records = slice.records
157
- end
149
+ # Skip processed records in this slice if it has no output categories.
150
+ records =
151
+ if slice.processing_record_number.to_i > 1
152
+ append = true
153
+ logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
154
+ slice.records[slice.processing_record_number - 1..-1]
155
+ else
156
+ # Reprocess all records in this slice.
157
+ slice.processing_record_number = 0
158
+ slice.records
159
+ end
158
160
 
159
161
  count = 0
160
162
  RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
@@ -246,7 +248,7 @@ module RocketJob
246
248
  unless new_record?
247
249
  # Fail job iff no other worker has already finished it
248
250
  # Must set write concern to at least 1 since we need the nModified back
249
- result = self.class.with(write: {w: 1}) do |query|
251
+ result = self.class.with(write: {w: 1}) do |query|
250
252
  query.
251
253
  where(id: id, state: :running, sub_state: :processing).
252
254
  update({"$set" => {state: :failed, worker_name: worker_name}})
@@ -11,7 +11,6 @@ module RocketJob
11
11
 
12
12
  # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
13
13
  field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
14
- validates_inclusion_of :serializer, in: %i[none compress encrypt bzip2]
15
14
 
16
15
  # The header columns when the file does not include a header row.
17
16
  # Note:
@@ -49,10 +48,12 @@ module RocketJob
49
48
  Sliced::CompressedSlice
50
49
  when :encrypt
51
50
  Sliced::EncryptedSlice
52
- when :bzip2
51
+ when :bzip2, :bz2
53
52
  Sliced::BZip2OutputSlice
53
+ when :encrypted_bz2
54
+ Sliced::EncryptedBZip2OutputSlice
54
55
  else
55
- raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
56
+ raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
56
57
  end
57
58
  end
58
59
 
@@ -65,14 +66,16 @@ module RocketJob
65
66
  )
66
67
  end
67
68
 
68
- def reset_tabular
69
- @tabular = nil
70
- end
71
-
72
69
  # Returns [true|false] whether this category has the attributes defined for tabular to work.
73
70
  def tabular?
74
71
  format.present?
75
72
  end
73
+
74
+ def build_collection_name(direction, job)
75
+ collection_name = "rocket_job.#{direction}s.#{job.id}"
76
+ collection_name << ".#{name}" unless name == :main
77
+ collection_name
78
+ end
76
79
  end
77
80
  end
78
81
  end
@@ -10,6 +10,7 @@ module RocketJob
10
10
 
11
11
  # Slice size for this input collection
12
12
  field :slice_size, type: Integer, default: 100
13
+ validates_presence_of :slice_size
13
14
 
14
15
  #
15
16
  # The fields below only apply if the field `format` has been set:
@@ -82,7 +83,7 @@ module RocketJob
82
83
  field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
83
84
  validates :header_cleanser, inclusion: %i[default none]
84
85
 
85
- validates_presence_of :slice_size
86
+ validates_inclusion_of :serializer, in: %i[none compress encrypt]
86
87
 
87
88
  # Cleanses the header column names when `cleanse_header` is true
88
89
  def cleanse_header!
@@ -105,6 +106,65 @@ module RocketJob
105
106
  skip_unknown: skip_unknown
106
107
  )
107
108
  end
109
+
110
+ def data_store(job)
111
+ RocketJob::Sliced::Input.new(
112
+ collection_name: build_collection_name(:input, job),
113
+ slice_class: serializer_class,
114
+ slice_size: slice_size
115
+ )
116
+ end
117
+
118
+ # Returns [IOStreams::Path] of file to upload.
119
+ # Auto-detects file format from file name when format is :auto.
120
+ def upload_path(stream = nil, original_file_name: nil)
121
+ unless stream || file_name
122
+ raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
123
+ end
124
+
125
+ path = IOStreams.new(stream || file_name)
126
+ path.file_name = original_file_name if original_file_name
127
+ self.file_name = path.file_name
128
+
129
+ # Auto detect the format based on the upload file name if present.
130
+ if format == :auto
131
+ self.format = path.format || :csv
132
+ # Rebuild tabular with new values.
133
+ @tabular = nil
134
+ end
135
+
136
+ # Remove non-printable characters from tabular input formats.
137
+ if tabular?
138
+ # Cannot change the length of fixed width lines.
139
+ replace = format == :fixed ? " " : ""
140
+ path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
141
+ end
142
+ path
143
+ end
144
+
145
+ # Return a lambda to extract the header row from the uploaded file.
146
+ def extract_header_callback(on_first)
147
+ return on_first unless tabular? && tabular.header?
148
+
149
+ case mode
150
+ when :line
151
+ lambda do |line|
152
+ tabular.parse_header(line)
153
+ cleanse_header!
154
+ self.columns = tabular.header.columns
155
+ # Call chained on_first if present
156
+ on_first&.call(line)
157
+ end
158
+ when :array
159
+ lambda do |row|
160
+ tabular.header.columns = row
161
+ cleanse_header!
162
+ self.columns = category.tabular.header.columns
163
+ # Call chained on_first if present
164
+ on_first&.call(line)
165
+ end
166
+ end
167
+ end
108
168
  end
109
169
  end
110
170
  end