rocketjob 6.0.0.rc3 → 6.0.3

Files changed (38)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/lib/rocket_job/batch/categories.rb +26 -24
  4. data/lib/rocket_job/batch/io.rb +128 -128
  5. data/lib/rocket_job/batch/worker.rb +14 -12
  6. data/lib/rocket_job/category/base.rb +10 -7
  7. data/lib/rocket_job/category/input.rb +61 -1
  8. data/lib/rocket_job/category/output.rb +9 -0
  9. data/lib/rocket_job/dirmon_entry.rb +1 -1
  10. data/lib/rocket_job/job_exception.rb +1 -1
  11. data/lib/rocket_job/jobs/conversion_job.rb +21 -17
  12. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  13. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  14. data/lib/rocket_job/jobs/on_demand_batch_job.rb +11 -5
  15. data/lib/rocket_job/jobs/on_demand_job.rb +6 -2
  16. data/lib/rocket_job/jobs/upload_file_job.rb +4 -0
  17. data/lib/rocket_job/plugins/cron.rb +60 -20
  18. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  19. data/lib/rocket_job/plugins/restart.rb +3 -110
  20. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  21. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +10 -5
  22. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  23. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  24. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  25. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  26. data/lib/rocket_job/sliced/input.rb +42 -54
  27. data/lib/rocket_job/sliced/slice.rb +7 -3
  28. data/lib/rocket_job/sliced/slices.rb +12 -9
  29. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  30. data/lib/rocket_job/sliced.rb +1 -19
  31. data/lib/rocket_job/subscribers/secret_config.rb +17 -0
  32. data/lib/rocket_job/supervisor.rb +1 -0
  33. data/lib/rocket_job/version.rb +1 -1
  34. data/lib/rocketjob.rb +4 -3
  35. metadata +11 -12
  36. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  37. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  38. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6a04a33b0cd03bdf0a7cb948fc87dd6c7d7bb3b392e566a8c15df50b73e27459
-  data.tar.gz: fc62e740a0a92bae8daf1f4ffbe199af1debcb84f8859aed10ea5954dc44c7b6
+  metadata.gz: d37fc69678a20d2ab48a22daccbbff3238a4511d9fc711873c70640abe4b5d81
+  data.tar.gz: df5bc46066bf3359c7e75549cd75545f52fde6c4b5e6c0a4dfae0b63705ca3fe
 SHA512:
-  metadata.gz: 74cac01d253cf21a856e1ca4a5cf63d5e90320303bdf310cf90325c9cca242c4ed1b7a0a1c43ca00764f2f40d29822df6e6bee499c1bff56c9ddaa2401bc3862
-  data.tar.gz: 1bbc47c7d869ef28fd578a7b2575f62957aa2f83f9fc927af1d6fba7866270b15fd21cef30007b78d58847137357c75ccae2d03545560d2ffe0d674fe34c1d0e
+  metadata.gz: 18c563035502f1c1b9c0a37fb87f0563544ba83de0db5f7dec0ba663540a1db2a32b686cfc64c8056b99d7a3ce503b508ab206270feb277c42e79ca010debdf7
+  data.tar.gz: 71e00ec619377e422934ae0682b4322ffea63b3db307c06ab4a33a2e24fa149f599a9a579ab3b4d699cf5684d296d861b1cbbf2abc802b55002e721e35d4070e
data/README.md CHANGED
@@ -49,6 +49,32 @@ require "rocket_job/batch/tabular"
 
 It is important to migrate away from these plugins, since they will be removed in a future release.
 
+#### Scheduled Jobs
+
+For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
+so that the next scheduled job instance is created immediately after the current instance starts.
+
+To maintain the old behavior of only creating the next instance when the current one fails, aborts, or completes,
+add the following line to each of the applicable jobs:
+
+~~~ruby
+self.cron_after_start = false
+~~~
+
+Additionally, scheduled jobs will now prevent a new instance from being created when another scheduled instance
+of the same job is already queued or running with the _same_ `cron_schedule`.
+
+To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
+line to each of the applicable jobs:
+
+~~~ruby
+self.cron_singleton = false
+~~~
+
+##### Singleton
+
+Since scheduled jobs now implement their own singleton logic, remove the singleton plugin from any scheduled jobs.
+
 #### Upgrading Batch Jobs to Rocket Job v6
 
 Rocket Job v6 replaces the array of symbol type for `input_categories` and `output_categories`
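
For orientation, a minimal sketch of a scheduled job under the new v6 defaults. The job class and schedule are hypothetical; `cron_schedule`, `cron_after_start`, and `cron_singleton` are the plugin settings described above:

~~~ruby
# Hypothetical scheduled job illustrating the new cron defaults.
class NightlyReconJob < RocketJob::Job
  include RocketJob::Plugins::Cron

  # Run daily at 02:00 UTC. The next instance is now created as soon as this one starts.
  self.cron_schedule = "0 2 * * *"

  # Uncomment to restore the old behavior of creating the next instance
  # only when this one fails, aborts, or completes:
  # self.cron_after_start = false

  # Uncomment to allow multiple queued/running instances with the same cron_schedule:
  # self.cron_singleton = false

  def perform
    # Job logic goes here.
  end
end
~~~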
data/lib/rocket_job/batch/categories.rb CHANGED
@@ -72,34 +72,37 @@ module RocketJob
       end
 
       def input_category(category_name = :main)
+        return category_name if category_name.is_a?(Category::Input)
+        raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
+
+        # Initialize categories when this method is called before initialization is complete
+        rocketjob_categories_assign if input_categories.empty?
+
         category_name = category_name.to_sym
-        category = nil
-        # .find does not work against this association
-        input_categories.each { |catg| category = catg if catg.name == category_name }
-        unless category
-          # Auto-register main input category if missing
-          if category_name == :main
-            category = Category::Input.new
-            self.input_categories = [category]
-          else
-            raise(ArgumentError,
-                  "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
-          end
-        end
-        category
+        # find does not work against this association
+        input_categories.each { |category| return category if category.name == category_name }
+
+        raise(
+          ArgumentError,
+          "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
+        )
       end
 
       def output_category(category_name = :main)
+        return category_name if category_name.is_a?(Category::Output)
+        raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
+
+        # Initialize categories when this method is called before initialization is complete
+        rocketjob_categories_assign if output_categories.empty? && self.class.defined_output_categories
+
         category_name = category_name.to_sym
-        category = nil
         # .find does not work against this association
-        output_categories.each { |catg| category = catg if catg.name == category_name }
-        unless category
-          raise(ArgumentError,
-                "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
-        end
+        output_categories.each { |category| return category if category.name == category_name }
 
-        category
+        raise(
+          ArgumentError,
+          "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
+        )
       end
 
       # Returns [true|false] whether the named category has already been defined
@@ -150,7 +153,7 @@ module RocketJob
          end
        end
 
-        return if !self.class.defined_output_categories || !output_categories.empty?
+        return if !output_categories.empty? || !self.class.defined_output_categories
 
        # Input categories defaults to nil if none was set in the class
        self.output_categories = self.class.defined_output_categories.deep_dup
@@ -160,7 +163,6 @@ module RocketJob
      def rocketjob_categories_output_render
        return if @rocket_job_output.nil?
 
-        # TODO: ..
        return unless output_categories
        return if output_categories.empty?
 
@@ -214,7 +216,7 @@ module RocketJob
        category.tabular.render(row)
      end
 
-      # Migrate existing v4 batch jobs to v5.0
+      # Migrate existing v5 batch jobs to v6
      def rocketjob_categories_migrate
        return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
 
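A brief usage sketch of the reworked category lookups above; the job class is hypothetical, and the behavior shown follows directly from the diff:

~~~ruby
job = MyBatchJob.new
job.input_category            # => Category::Input registered as :main
job.input_category(:main)     # Same lookup, by name.
cat = job.input_category(:main)
job.input_category(cat)       # Category objects are now passed straight through.
job.input_category(:missing)  # => ArgumentError listing the registered categories.
~~~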
data/lib/rocket_job/batch/io.rb CHANGED
@@ -14,11 +14,9 @@ module RocketJob
      # Default: None ( Uses the single default input collection for this job )
      # Validates: This value must be one of those listed in #input_categories
      def input(category = :main)
-        raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
+        category = input_category(category)
 
-        category = input_category(category) unless category.is_a?(Category::Input)
-
-        (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
+        (@inputs ||= {})[category.name] ||= category.data_store(self)
      end
 
      # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
      # Default: None ( Uses the single default output collection for this job )
      # Validates: This value must be one of those listed in #output_categories
      def output(category = :main)
-        raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
-
-        category = output_category(category) unless category.is_a?(Category::Output)
+        category = output_category(category)
 
-        (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
+        (@outputs ||= {})[category.name] ||= category.data_store(self)
      end
 
      # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
      # The category or the name of the category to access or download data from
      # Default: None ( Uses the single default output collection for this job )
      # Validates: This value must be one of those listed in #input_categories
-      def lookup_collection(category = :main)
-        category = input_category(category) unless category.is_a?(Category::Input)
-
-        collection = (@lookup_collections ||= {})[category.name]
-
-        unless collection
-          collection_name = "rocket_job.inputs.#{id}"
-          collection_name << ".#{category.name}" unless category.name == :main
-
-          @lookup_collections[category.name] ||=
-            LookupCollection.new(Sliced::Slice.collection.database, collection_name)
-        end
-      end
+      # def lookup_collection(category = :main)
+      #   category = input_category(category) unless category.is_a?(Category::Input)
+      #
+      #   collection = (@lookup_collections ||= {})[category.name]
+      #
+      #   unless collection
+      #     collection_name = "rocket_job.inputs.#{id}"
+      #     collection_name << ".#{category.name}" unless category.name == :main
+      #
+      #     @lookup_collections[category.name] ||=
+      #       LookupCollection.new(Sliced::Slice.collection.database, collection_name)
+      #   end
+      # end
 
      # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
      #
@@ -154,53 +150,7 @@ module RocketJob
      # * If an io stream is supplied, it is read until it returns nil.
      # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
      # * CSV parsing is slow, so it is usually left for the workers to do.
-      def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
-        raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
-
-        category = input_category(category) unless category.is_a?(Category::Input)
-        stream ||= category.file_name
-        path = nil
-
-        if stream
-          path = IOStreams.new(stream)
-          path.file_name = file_name if file_name
-          category.file_name = path.file_name
-
-          # Auto detect the format based on the upload file name if present.
-          if category.format == :auto
-            format = path.format
-            if format
-              # Rebuild tabular with the above file name
-              category.reset_tabular
-              category.format = format
-            end
-          end
-        end
-
-        # Tabular transformations required for upload?
-        if category.tabular?
-          # Remove non-printable characters from tabular input formats
-          # Cannot change the length of fixed width lines
-          replace = category.format == :fixed ? " " : ""
-          path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
-
-          # Extract the header line during the file upload when needed.
-          on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
-        end
-
-        count =
-          if block
-            input(category).upload(on_first: on_first, &block)
-          else
-            input(category).upload(on_first: on_first) do |io|
-              path.each(stream_mode, **args) { |line| io << line }
-            end
-          end
-
-        self.record_count = (record_count || 0) + count
-        count
-      end
-
+      #
      # Upload results from an Arel into RocketJob::SlicedJob.
      #
      # Params
@@ -227,18 +177,13 @@ module RocketJob
      #
      # Example: Upload user_name and zip_code
      #   arel = User.where(country_code: 'US')
-      #   job.upload_arel(arel, :user_name, :zip_code)
+      #   job.upload_arel(arel, columns: [:user_name, :zip_code])
      #
      # Notes:
      # * Only call from one thread at a time against a single instance of this job.
      # * The record_count for the job is set to the number of records returned by the arel.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_arel(arel, *column_names, category: :main, &block)
-        count = input(category).upload_arel(arel, *column_names, &block)
-        self.record_count = (record_count || 0) + count
-        count
-      end
 
      # Upload the result of a MongoDB query to the input collection for processing
      # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
      #   criteria = User.where(state: 'FL')
      #   job.record_count = job.upload_mongo_query(criteria)
      #
-      # Example: Upload just the supplied column
+      # Example: Upload only the specified column(s)
      #   criteria = User.where(state: 'FL')
-      #   job.record_count = job.upload_mongo_query(criteria, :zip_code)
+      #   job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
      #
      # Notes:
      # * Only call from one thread at a time against a single instance of this job.
      # * The record_count for the job is set to the number of records returned by the monqo query.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_mongo_query(criteria, *column_names, category: :main, &block)
-        count = input(category).upload_mongo_query(criteria, *column_names, &block)
-        self.record_count = (record_count || 0) + count
-        count
-      end
 
      # Upload sliced range of integer requests as arrays of start and end ids.
      #
-      # Returns [Integer] last_id - start_id + 1.
+      # Returns [Integer] the number of slices uploaded.
      #
      # Uploads one range per slice so that the response can return multiple records
      # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
      # * The record_count for the job is set to: last_id - start_id + 1.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_integer_range(start_id, last_id, category: :main)
-        input(category).upload_integer_range(start_id, last_id)
-        count = last_id - start_id + 1
-        self.record_count = (record_count || 0) + count
-        count
-      end
 
      # Upload sliced range of integer requests as an arrays of start and end ids
      # starting with the last range first
      #
-      # Returns [Integer] last_id - start_id + 1.
+      # Returns [Integer] the number of slices uploaded.
      #
      # Uploads one range per slice so that the response can return multiple records
      # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
      # * The record_count for the job is set to: last_id - start_id + 1.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-      def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
-        input(category).upload_integer_range_in_reverse_order(start_id, last_id)
-        count = last_id - start_id + 1
+
+      def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
+        input_collection = input(category)
+
+        if block
+          raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
+          if stream_mode || columns || slice_batch_size || args.size > 0
+            raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
+          end
+
+          category = input_category(category)
+          category.file_name = file_name if file_name
+
+          # Extract the header line during the upload when applicable.
+          extract_header = category.extract_header_callback(on_first)
+
+          count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
+          self.record_count = (record_count || 0) + count
+          return count
+        end
+
+        count =
+          case object
+          when Range
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
+            end
+
+            first = object.first
+            last  = object.last
+            if first < last
+              input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
+            else
+              input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
+            end
+          when Mongoid::Criteria
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
+            end
+
+            input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+          when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
+            if file_name || stream_mode || on_first || args.size > 0
+              raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
+            end
+
+            input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+
+          else
+            raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
+
+            category = input_category(category)
+
+            # Extract the header line during the upload when applicable.
+            extract_header = category.extract_header_callback(on_first)
+            path = category.upload_path(object, original_file_name: file_name)
+
+            input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
+              path.each(stream_mode || :line, **args) { |line| io << line }
+            end
+
+          end
+
+        self.record_count = (record_count || 0) + count
+        count
+      end
+
+      # @deprecated
+      def upload_arel(arel, *column_names, category: :main, &block)
+        count = input(category).upload_arel(arel, columns: column_names, &block)
        self.record_count = (record_count || 0) + count
        count
      end
 
-      # Upload the supplied slices for processing by workers
+      # @deprecated
+      def upload_mongo_query(criteria, *column_names, category: :main, &block)
+        count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+
+      # @deprecated
+      def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
+        count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+
+      # @deprecated
+      def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
+        count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
+        self.record_count = (record_count || 0) + count
+        count
+      end
+
+      # Upload the supplied slice for processing by workers
      #
      # Updates the record_count after adding the records
      #
@@ -427,50 +449,28 @@ module RocketJob
        # Store the output file name in the category
        category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
 
-        if output_collection.binary?
-          raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
-
-          return output_collection.download(&block) if block
+        header_line ||= category.render_header
 
-          IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
-            output_collection.download { |record| io << record[:binary] }
-          end
-        else
-          header_line ||= category.render_header
+        return output_collection.download(header_line: header_line, &block) if block
 
-          return output_collection.download(header_line: header_line, &block) if block
+        raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
 
-          raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
+        if output_collection.slice_class.binary_format
+          binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
 
+          # Don't overwrite supplied stream options if any
+          stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
+          stream.remove_from_pipeline(output_collection.slice_class.binary_format)
+          stream.writer(**args) do |io|
+            # TODO: Binary formats should return the record count, instead of the slice count.
+            output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
+          end
+        else
          IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
            output_collection.download(header_line: header_line) { |record| io << record }
          end
        end
      end
-
-      private
-
-      # Return a lambda to extract the header row from the uploaded file.
-      def rocket_job_upload_header_lambda(category, on_first)
-        case category.mode
-        when :line
-          lambda do |line|
-            category.tabular.parse_header(line)
-            category.cleanse_header!
-            category.columns = category.tabular.header.columns
-            # Call chained on_first if present
-            on_first&.call(line)
-          end
-        when :array
-          lambda do |row|
-            category.tabular.header.columns = row
-            category.cleanse_header!
-            category.columns = category.tabular.header.columns
-            # Call chained on_first if present
-            on_first&.call(line)
-          end
-        end
-      end
    end
  end
 end
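A usage sketch of the reworked `download` above; file names are hypothetical. Binary slice classes (for example the `:bz2` serializer) now accept a header line and write through the IOStreams pipeline instead of the old `output_collection.binary?` branch:

~~~ruby
job.download("results.csv")        # Header rendered from the output category.
job.download("results.csv.bz2")    # Binary bzip2 slices written straight through.
job.download { |line| puts line }  # Block form, bypassing file output.
~~~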
data/lib/rocket_job/batch/worker.rb CHANGED
@@ -67,6 +67,8 @@ module RocketJob
      # Returns [Integer] the number of records processed in the slice
      #
      # Note: The slice will be removed from processing when this method completes
+      #
+      # @deprecated Please open a ticket if you need this behavior.
      def work_first_slice(&block)
        raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
 
@@ -142,19 +144,19 @@ module RocketJob
      # Perform individual slice without callbacks
      def rocket_job_perform_slice(slice, &block)
        slice.processing_record_number ||= 0
-        records = []
        append = false
 
-        # Skip processed records in this slice if it has no output categpries.
-        if slice.processing_record_number > 1
-          records = slice.records[slice.processing_record_number - 1..-1]
-          append = true
-          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
-        else
-          # Reprocess all records in this slice.
-          slice.processing_record_number = 0
-          records = slice.records
-        end
+        # Skip processed records in this slice if it has no output categories.
+        records =
+          if slice.processing_record_number.to_i > 1
+            append = true
+            logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+            slice.records[slice.processing_record_number - 1..-1]
+          else
+            # Reprocess all records in this slice.
+            slice.processing_record_number = 0
+            slice.records
+          end
 
        count = 0
        RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
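The resume window in the rewritten method above is plain array slicing; a standalone illustration with hypothetical values:

~~~ruby
records                  = %w[a b c d e]
processing_record_number = 3
# Records 1 and 2 already produced output, so resume at the 3rd record:
records[processing_record_number - 1..-1] # => ["c", "d", "e"]
~~~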
@@ -246,7 +248,7 @@ module RocketJob
        unless new_record?
          # Fail job iff no other worker has already finished it
          # Must set write concern to at least 1 since we need the nModified back
-          result = self.class.with(write: {w: 1}) do |query|
+          result = self.class.with(write: {w: 1}) do |query|
            query.
              where(id: id, state: :running, sub_state: :processing).
              update({"$set" => {state: :failed, worker_name: worker_name}})
data/lib/rocket_job/category/base.rb CHANGED
@@ -11,7 +11,6 @@ module RocketJob
 
      # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
      field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
-      validates_inclusion_of :serializer, in: %i[none compress encrypt bzip2]
 
      # The header columns when the file does not include a header row.
      # Note:
@@ -49,10 +48,12 @@ module RocketJob
          Sliced::CompressedSlice
        when :encrypt
          Sliced::EncryptedSlice
-        when :bzip2
+        when :bzip2, :bz2
          Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
        else
-          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
        end
      end
 
@@ -65,14 +66,16 @@ module RocketJob
        )
      end
 
-      def reset_tabular
-        @tabular = nil
-      end
-
      # Returns [true|false] whether this category has the attributes defined for tabular to work.
      def tabular?
        format.present?
      end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
    end
  end
 end
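To summarize the serializer change above, a small sketch; category construction is simplified here, and `:bz2` plus `:encrypted_bz2` are the values added in this release:

~~~ruby
category = RocketJob::Category::Output.new(serializer: :bz2)
category.serializer_class # => RocketJob::Sliced::BZip2OutputSlice

category = RocketJob::Category::Output.new(serializer: :encrypted_bz2)
category.serializer_class # => RocketJob::Sliced::EncryptedBZip2OutputSlice
~~~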
data/lib/rocket_job/category/input.rb CHANGED
@@ -10,6 +10,7 @@ module RocketJob
 
      # Slice size for this input collection
      field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
 
      #
      # The fields below only apply if the field `format` has been set:
@@ -82,7 +83,7 @@ module RocketJob
      field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
      validates :header_cleanser, inclusion: %i[default none]
 
-      validates_presence_of :slice_size
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
 
      # Cleanses the header column names when `cleanse_header` is true
      def cleanse_header!
@@ -105,6 +106,65 @@ module RocketJob
          skip_unknown: skip_unknown
        )
      end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class:     serializer_class,
+          slice_size:      slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path           = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = category.tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        end
+      end
    end
  end
 end
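Finally, a sketch of the new `Category::Input` helpers above in isolation; the job and file name are hypothetical:

~~~ruby
category = job.input_category(:main)

# Build the sliced input store for this job (replaces RocketJob::Sliced.factory):
store = category.data_store(job) # => RocketJob::Sliced::Input

# Resolve the upload stream; with format: :auto the format is detected from the file name:
path = category.upload_path("users.csv.gz")
category.format    # => :csv
category.file_name # => "users.csv.gz"
~~~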