rocketjob 6.0.0.rc1 → 6.0.1

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +25 -18
  4. data/lib/rocket_job/batch/io.rb +130 -130
  5. data/lib/rocket_job/batch/performance.rb +2 -2
  6. data/lib/rocket_job/batch/statistics.rb +2 -2
  7. data/lib/rocket_job/batch/throttle_running_workers.rb +1 -1
  8. data/lib/rocket_job/batch/worker.rb +14 -12
  9. data/lib/rocket_job/batch.rb +0 -1
  10. data/lib/rocket_job/category/base.rb +10 -7
  11. data/lib/rocket_job/category/input.rb +61 -1
  12. data/lib/rocket_job/category/output.rb +9 -0
  13. data/lib/rocket_job/cli.rb +1 -1
  14. data/lib/rocket_job/dirmon_entry.rb +1 -1
  15. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  16. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  17. data/lib/rocket_job/job_exception.rb +1 -1
  18. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  19. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  20. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  21. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  22. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  23. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -97
  24. data/lib/rocket_job/jobs/upload_file_job.rb +6 -3
  25. data/lib/rocket_job/lookup_collection.rb +4 -3
  26. data/lib/rocket_job/plugins/cron.rb +60 -20
  27. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  28. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  29. data/lib/rocket_job/plugins/restart.rb +3 -110
  30. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  31. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +43 -0
  32. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  33. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  34. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  35. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  36. data/lib/rocket_job/sliced/input.rb +42 -54
  37. data/lib/rocket_job/sliced/slice.rb +7 -3
  38. data/lib/rocket_job/sliced/slices.rb +12 -9
  39. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  40. data/lib/rocket_job/sliced/writer/output.rb +0 -1
  41. data/lib/rocket_job/sliced.rb +1 -19
  42. data/lib/rocket_job/throttle_definitions.rb +7 -1
  43. data/lib/rocket_job/version.rb +1 -1
  44. data/lib/rocketjob.rb +4 -5
  45. metadata +12 -12
  46. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  47. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  48. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 2794f5dc5e0ada3ffdc3da9a13fd0cb6c5713f89254d93b69d60677283bc2d64
-   data.tar.gz: 3a208b181aca760b07432348bc2e51443a9da03cc6a143be81765ca2b3c0e37a
+   metadata.gz: 305189df5d57cf64c3d771bc05f940df6be6fd8c322ae9f3f796166fe99e1b75
+   data.tar.gz: a16ed0d77f1d1cb4e0ec4beefba922278a8b05e90ab5ecaffe6a3c347abdfed0
  SHA512:
-   metadata.gz: 44816973f2f63dc300fe41e168ae485cd8b7def5e3bbab173501f6ef2935d3f65707ec4c2f9eb7cb6b43bab2d464b163f3e10216be50babf2dab5e82a7998439
-   data.tar.gz: c0b2d210a3bb3faa49f30eeaf687052ed79e9da802635c6434e374c4f1ccc3538a71a4db9391f41e18a6265106aa7fedeaf920edf4e9d11acf81c9bc632534bd
+   metadata.gz: f2afe61652e3b6b225515b95e7b370f8c4a8853c30cad45d0fc93d49a86571764b480aaca5a5cfb264ea1ea86912b6b680e4e122952383f2425585e3683e8581
+   data.tar.gz: eb1b041fbe425143c5f2b64aa5e799c35bf6885c00020964cf87d87485ba359e19796f7e5c5378baefe9f4ca93fe5de900b78e6669c7ffe83838c87e197ab0e1
data/README.md CHANGED
@@ -17,21 +17,177 @@ Checkout https://rocketjob.io/
  * Questions? Join the chat room on Gitter for [rocketjob support](https://gitter.im/rocketjob/support)
  * [Report bugs](https://github.com/rocketjob/rocketjob/issues)
 
- ## Rocket Job v5
+ ## Rocket Job v6
 
  - Support for Ruby v3 and Rails 6.
- - Multiple output file support through extended `output_categories` capability.
- - File output formats for each category. For example: CSV, PSV, JSON, etc.
- - Support for AWS DocumentDB as the data store.
+ - Major enhancements in Batch job support:
+   - Direct built-in Tabular support for all input and output categories.
+   - Multiple output file support, each with its own settings for:
+     - Compression
+       - GZip, Zip, BZip2 (Chunked for much faster loading into Apache Spark).
+     - Encryption
+       - PGP, Symmetric Encryption.
+     - File format
+       - CSV, PSV, JSON, Fixed Format, xlsx.
+ - Significant error handling improvements, especially around throttle failures
+   that used to result in "hanging" jobs.
+ - Support for AWS DocumentDB in addition to MongoDB as the data store.
  - Removed use of Symbols to meet Symbol deprecation in MongoDB and Mongoid.
 
- The following plugins have been deprecated and will be removed in Rocket Job v5.1
- - RocketJob::Batch::Tabular::Input
- - RocketJob::Batch::Tabular::Output
+ ### Upgrading to Rocket Job v6
+
+ The following plugins have been deprecated and are no longer loaded by default:
+ - `RocketJob::Batch::Tabular::Input`
+ - `RocketJob::Batch::Tabular::Output`
+
+ If your code relies on these plugins and you still want to upgrade to Rocket Job v6,
+ add the following require statement to any jobs that still use them:
+
+ ~~~ruby
+ require "rocket_job/batch/tabular"
+ ~~~
+
+ It is important to migrate away from these plugins, since they will be removed in a future release.
+
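+ For example, a job that still depends on these plugins might look like the following
+ (the job class is illustrative, not part of Rocket Job):
+
+ ~~~ruby
+ require "rocket_job/batch/tabular"
+
+ class LegacyTabularJob < RocketJob::Job
+   include RocketJob::Batch
+   # Deprecated plugins, loaded explicitly via the require above:
+   include RocketJob::Batch::Tabular::Input
+   include RocketJob::Batch::Tabular::Output
+
+   def perform(row)
+     row
+   end
+ end
+ ~~~
+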
+ #### Scheduled Jobs
+
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed
+ so that the next scheduled job instance is created immediately after the current instance starts.
+
+ To maintain the old behavior of creating the next instance only when the current one fails, aborts, or completes,
+ add the following line to each of the applicable jobs:
+
+ ~~~ruby
+ self.cron_after_start = false
+ ~~~
+
+ Additionally, scheduled jobs will now prevent a new instance from being created when another scheduled instance
+ of the same job is already queued or running with the _same_ `cron_schedule`.
+
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
+ line to each of the applicable jobs:
+
+ ~~~ruby
+ self.cron_singleton = false
+ ~~~
+
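+ For example, a scheduled job that keeps both v5 behaviors (the job class and
+ schedule are illustrative):
+
+ ~~~ruby
+ class NightlyReconcileJob < RocketJob::Job
+   include RocketJob::Plugins::Cron
+
+   # Run every day at 2am UTC.
+   self.cron_schedule = "0 2 * * *"
+
+   # v5 behavior: only create the next instance when this one fails, aborts, or completes.
+   self.cron_after_start = false
+
+   # v5 behavior: allow multiple queued or running instances with the same cron schedule.
+   self.cron_singleton = false
+
+   def perform
+     # Work to run on the schedule above.
+   end
+ end
+ ~~~
+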
+ ##### Singleton
+
+ Since scheduled jobs now implement their own singleton logic, remove the `RocketJob::Plugins::Singleton` plugin from any scheduled jobs.
+
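+ For example (illustrative):
+
+ ~~~ruby
+ class NightlyReconcileJob < RocketJob::Job
+   include RocketJob::Plugins::Cron
+   # Remove this line when upgrading; scheduled jobs are now singletons by default:
+   # include RocketJob::Plugins::Singleton
+
+   self.cron_schedule = "0 2 * * *"
+ end
+ ~~~
+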
+ #### Upgrading Batch Jobs to Rocket Job v6
+
+ Rocket Job v6 replaces the array-of-symbols type for `input_categories` and `output_categories`
+ with an array of `RocketJob::Category::Input` and `RocketJob::Category::Output` objects.
+
+ Jobs that added or modified the input or output categories need to be upgraded. For example:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.output_categories = [:main, :errors, :ignored]
+ end
+ ~~~
+
+ Needs to be changed to:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   output_category name: :main
+   output_category name: :errors
+   output_category name: :ignored
+ end
+ ~~~
+
+ ##### slice_size, encrypt, compress
+
+ These fields have been removed from the job itself:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.slice_size = 1_000
+   self.encrypt    = true
+   self.compress   = true
+ end
+ ~~~
+
+ They are now specified on the `input_category` as follows:
+ - `slice_size` just moves under `input_category`.
+ - `encrypt` becomes an option to `serializer`.
+ - `compress` is now the default for all batch jobs, so it is no longer needed.
+
+ If the serializer is set to `encrypt`, the data is automatically compressed as well.
+
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category slice_size: 1_000, serializer: :encrypt
+ end
+ ~~~
+
+ ##### collect_output, collect_nil_output
+
+ The following fields have been moved from the job itself:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.collect_output     = true
+   self.collect_nil_output = true
+ end
+ ~~~
+
+ Into the corresponding `output_category`:
+ - `collect_output` no longer has any meaning. Output is collected anytime an `output_category` is defined.
+ - `collect_nil_output` is now the option `nils` on the `output_category`.
+   It defaults to `false` so that by default any `nil` output from the `perform` method is not collected.
+
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   output_category nils: true
+ end
+ ~~~
+
+ ##### name
+
+ For both `input_category` and `output_category`, when the `name` argument is not supplied
+ it defaults to `:main`.
+
+ For example:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category name: :main, serializer: :encrypt
+   output_category name: :main
+ end
+ ~~~
+
+ Is the same as:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category serializer: :encrypt
+   output_category
+ end
+ ~~~
+
+ ##### Existing and in-flight jobs
+
+ When migrating to Rocket Job v6, it is recommended to load every job and then save it back again as part of the
+ deployment. When a job loads, it automatically converts itself from the old schema to the new v6 schema.
+
+ In-flight jobs should not be affected, but it is important to shut down all running batch
+ servers _before_ starting any new instances.
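+
+ A minimal migration sketch that can be run from a console during the deployment
+ (assumes all job classes are loadable in that environment):
+
+ ~~~ruby
+ # Loading each job converts it to the v6 schema; saving persists the conversion.
+ RocketJob::Job.all.each do |job|
+   job.save!
+ rescue StandardError => e
+   puts "Unable to migrate job #{job.id}: #{e.message}"
+ end
+ ~~~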
 
  ## Rocket Job v4
 
- Rocket Job Pro is now open source and included in Rocket Job.
+ Rocket Job Pro is now fully open source and included in Rocket Job under the Apache License.
 
  The `RocketJob::Batch` plugin now adds batch processing capabilities to break up a single task into many
  concurrent workers processing slices of the entire job at the same time.
data/lib/rocket_job/batch/categories.rb CHANGED
@@ -72,31 +72,38 @@ module RocketJob
        end
 
        def input_category(category_name = :main)
+         return category_name if category_name.is_a?(Category::Input)
+         raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
+
          category_name = category_name.to_sym
-         category = nil
-         # .find does not work against this association
-         input_categories.each { |catg| category = catg if catg.name == category_name }
-         unless category
-           # Auto-register main input category if missing
-           if category_name == :main
-             category = Category::Input.new
-             self.input_categories = [category]
-           else
-             raise(ArgumentError, "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
-           end
+         # find does not work against this association
+         input_categories.each { |category| return category if category.name == category_name }
+
+         unless category_name == :main
+           raise(
+             ArgumentError,
+             "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
+           )
          end
+
+         # Auto-register main input category when not defined
+         category = Category::Input.new(job: self)
+         self.input_categories << category
          category
        end
 
        def output_category(category_name = :main)
+         return category_name if category_name.is_a?(Category::Output)
+         raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
+
          category_name = category_name.to_sym
-         category = nil
          # .find does not work against this association
-         output_categories.each { |catg| category = catg if catg.name == category_name }
-         unless category
-           raise(ArgumentError, "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
-         end
-         category
+         output_categories.each { |category| return category if category.name == category_name }
+
+         raise(
+           ArgumentError,
+           "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
+         )
        end
 
        # Returns [true|false] whether the named category has already been defined
@@ -211,7 +218,7 @@ module RocketJob
          category.tabular.render(row)
        end
 
-       # Migrate existing v4 batch jobs to v5.0
+       # Migrate existing v5 batch jobs to v6
        def rocketjob_categories_migrate
          return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
 
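A sketch of the reworked category lookup behavior implied by the diff above (the job class is illustrative):

~~~ruby
class MyJob < RocketJob::Job
  include RocketJob::Batch

  output_category name: :errors
end

job = MyJob.new

# The :main input category is auto-registered on first access:
job.input_category.name #=> :main

# Passing a Category instance back in returns it unchanged:
job.output_category(job.output_category(:errors)).name #=> :errors

# Unknown output categories now raise an ArgumentError listing the registered names:
job.output_category(:missing) # => ArgumentError
~~~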
data/lib/rocket_job/batch/io.rb CHANGED
@@ -14,11 +14,9 @@ module RocketJob
        # Default: None ( Uses the single default input collection for this job )
        # Validates: This value must be one of those listed in #input_categories
        def input(category = :main)
-         raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
+         category = input_category(category)
 
-         category = input_category(category) unless category.is_a?(Category::Input)
-
-         (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
+         (@inputs ||= {})[category.name] ||= category.data_store(self)
        end
 
        # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
        # Default: None ( Uses the single default output collection for this job )
        # Validates: This value must be one of those listed in #output_categories
        def output(category = :main)
-         raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
-
-         category = output_category(category) unless category.is_a?(Category::Output)
+         category = output_category(category)
 
-         (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
+         (@outputs ||= {})[category.name] ||= category.data_store(self)
        end
 
        # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
        # The category or the name of the category to access or download data from
        # Default: None ( Uses the single default output collection for this job )
        # Validates: This value must be one of those listed in #input_categories
-       def lookup_collection(category = :main)
-         category = input_category(category) unless category.is_a?(Category::Input)
-
-         collection = (@lookup_collections ||= {})[category.name]
-
-         unless collection
-           collection_name = "rocket_job.inputs.#{id}"
-           collection_name << ".#{category.name}" unless category.name == :main
-
-           @lookup_collections[category.name] ||=
-             LookupCollection.new(Sliced::Slice.collection.database, collection_name)
-         end
-       end
+       # def lookup_collection(category = :main)
+       #   category = input_category(category) unless category.is_a?(Category::Input)
+       #
+       #   collection = (@lookup_collections ||= {})[category.name]
+       #
+       #   unless collection
+       #     collection_name = "rocket_job.inputs.#{id}"
+       #     collection_name << ".#{category.name}" unless category.name == :main
+       #
+       #     @lookup_collections[category.name] ||=
+       #       LookupCollection.new(Sliced::Slice.collection.database, collection_name)
+       #   end
+       # end
 
        # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
        #
@@ -154,53 +150,7 @@ module RocketJob
        # * If an io stream is supplied, it is read until it returns nil.
        # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
        # * CSV parsing is slow, so it is usually left for the workers to do.
-       def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
-         raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
-
-         category = input_category(category) unless category.is_a?(Category::Input)
-         stream ||= category.file_name
-         path = nil
-
-         if stream
-           path = IOStreams.new(stream)
-           path.file_name = file_name if file_name
-           category.file_name = path.file_name
-
-           # Auto detect the format based on the upload file name if present.
-           if category.format == :auto
-             format = path.format
-             if format
-               # Rebuild tabular with the above file name
-               category.reset_tabular
-               category.format = format
-             end
-           end
-         end
-
-         # Tabular transformations required for upload?
-         if category.tabular?
-           # Remove non-printable characters from tabular input formats
-           # Cannot change the length of fixed width lines
-           replace = category.format == :fixed ? " " : ""
-           path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
-
-           # Extract the header line during the file upload when needed.
-           on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
-         end
-
-         count =
-           if block
-             input(category).upload(on_first: on_first, &block)
-           else
-             input(category).upload(on_first: on_first) do |io|
-               path.each(stream_mode, **args) { |line| io << line }
-             end
-           end
-
-         self.record_count = (record_count || 0) + count
-         count
-       end
-
+       #
        # Upload results from an Arel into RocketJob::SlicedJob.
        #
        # Params
@@ -227,18 +177,13 @@ module RocketJob
        #
        # Example: Upload user_name and zip_code
        #   arel = User.where(country_code: 'US')
-       #   job.upload_arel(arel, :user_name, :zip_code)
+       #   job.upload_arel(arel, columns: [:user_name, :zip_code])
        #
        # Notes:
        # * Only call from one thread at a time against a single instance of this job.
        # * The record_count for the job is set to the number of records returned by the arel.
        # * If an exception is raised while uploading data, the input collection is cleared out
        #   so that if a job is retried during an upload failure, data is not duplicated.
-       def upload_arel(arel, *column_names, category: :main, &block)
-         count = input(category).upload_arel(arel, *column_names, &block)
-         self.record_count = (record_count || 0) + count
-         count
-       end
 
        # Upload the result of a MongoDB query to the input collection for processing
        # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
        #   criteria = User.where(state: 'FL')
        #   job.record_count = job.upload_mongo_query(criteria)
        #
-       # Example: Upload just the supplied column
+       # Example: Upload only the specified column(s)
        #   criteria = User.where(state: 'FL')
-       #   job.record_count = job.upload_mongo_query(criteria, :zip_code)
+       #   job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
        #
        # Notes:
        # * Only call from one thread at a time against a single instance of this job.
        # * The record_count for the job is set to the number of records returned by the mongo query.
        # * If an exception is raised while uploading data, the input collection is cleared out
        #   so that if a job is retried during an upload failure, data is not duplicated.
-       def upload_mongo_query(criteria, *column_names, category: :main, &block)
-         count = input(category).upload_mongo_query(criteria, *column_names, &block)
-         self.record_count = (record_count || 0) + count
-         count
-       end
 
        # Upload sliced range of integer requests as arrays of start and end ids.
        #
-       # Returns [Integer] last_id - start_id + 1.
+       # Returns [Integer] the number of slices uploaded.
        #
        # Uploads one range per slice so that the response can return multiple records
        # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
        # * The record_count for the job is set to: last_id - start_id + 1.
        # * If an exception is raised while uploading data, the input collection is cleared out
        #   so that if a job is retried during an upload failure, data is not duplicated.
-       def upload_integer_range(start_id, last_id, category: :main)
-         input(category).upload_integer_range(start_id, last_id)
-         count = last_id - start_id + 1
-         self.record_count = (record_count || 0) + count
-         count
-       end
 
        # Upload sliced range of integer requests as arrays of start and end ids
        # starting with the last range first
        #
-       # Returns [Integer] last_id - start_id + 1.
+       # Returns [Integer] the number of slices uploaded.
        #
        # Uploads one range per slice so that the response can return multiple records
        # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
        # * The record_count for the job is set to: last_id - start_id + 1.
        # * If an exception is raised while uploading data, the input collection is cleared out
        #   so that if a job is retried during an upload failure, data is not duplicated.
-       def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
-         input(category).upload_integer_range_in_reverse_order(start_id, last_id)
-         count = last_id - start_id + 1
+
+       def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
+         input_collection = input(category)
+
+         if block
+           raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
+           if stream_mode || columns || slice_batch_size || args.size > 0
+             raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
+           end
+
+           category = input_category(category)
+           category.file_name = file_name if file_name
+
+           # Extract the header line during the upload when applicable.
+           extract_header = category.extract_header_callback(on_first)
+
+           count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
+           self.record_count = (record_count || 0) + count
+           return count
+         end
+
+         count =
+           case object
+           when Range
+             if file_name || stream_mode || on_first || args.size > 0
+               raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
+             end
+
+             first = object.first
+             last  = object.last
+             if first < last
+               input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
+             else
+               input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
+             end
+           when Mongoid::Criteria
+             if file_name || stream_mode || on_first || args.size > 0
+               raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
+             end
+
+             input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+           when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
+             if file_name || stream_mode || on_first || args.size > 0
+               raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
+             end
+
+             input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+           else
+             raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
+
+             category = input_category(category)
+
+             # Extract the header line during the upload when applicable.
+             extract_header = category.extract_header_callback(on_first)
+             path           = category.upload_path(object, original_file_name: file_name)
+
+             input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
+               path.each(stream_mode || :line, **args) { |line| io << line }
+             end
+           end
+
+         self.record_count = (record_count || 0) + count
+         count
+       end
+
+       # @deprecated
+       def upload_arel(arel, *column_names, category: :main, &block)
+         count = input(category).upload_arel(arel, columns: column_names, &block)
          self.record_count = (record_count || 0) + count
         count
        end
 
-       # Upload the supplied slices for processing by workers
+       # @deprecated
+       def upload_mongo_query(criteria, *column_names, category: :main, &block)
+         count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
+         self.record_count = (record_count || 0) + count
+         count
+       end
+
+       # @deprecated
+       def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
+         count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
+         self.record_count = (record_count || 0) + count
+         count
+       end
+
+       # @deprecated
+       def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
+         count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
+         self.record_count = (record_count || 0) + count
+         count
+       end
+
+       # Upload the supplied slice for processing by workers
        #
        # Updates the record_count after adding the records
        #
@@ -421,56 +443,34 @@ module RocketJob
        def download(stream = nil, category: :main, header_line: nil, **args, &block)
          raise "Cannot download incomplete job: #{id}. Currently in state: #{state}-#{sub_state}" if rocket_job_processing?
 
-         category = output_category(category) unless category.is_a?(Category::Output)
-         output_collection = output(category)
+         category          = output_category(category) unless category.is_a?(Category::Output)
+         output_collection = output(category)
 
          # Store the output file name in the category
          category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
 
-         if output_collection.binary?
-           raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
-
-           return output_collection.download(&block) if block
+         header_line ||= category.render_header
 
-           IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
-             output_collection.download { |record| io << record[:binary] }
-           end
-         else
-           header_line ||= category.render_header
+         return output_collection.download(header_line: header_line, &block) if block
 
-           return output_collection.download(header_line: header_line, &block) if block
+         raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
 
-           raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
+         if output_collection.slice_class.binary_format
+           binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
 
+           # Don't overwrite supplied stream options if any
+           stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
+           stream.remove_from_pipeline(output_collection.slice_class.binary_format)
+           stream.writer(**args) do |io|
+             # TODO: Binary formats should return the record count, instead of the slice count.
+             output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
+           end
+         else
           IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
             output_collection.download(header_line: header_line) { |record| io << record }
           end
         end
       end
-
-       private
-
-       # Return a lambda to extract the header row from the uploaded file.
-       def rocket_job_upload_header_lambda(category, on_first)
-         case category.mode
-         when :line
-           lambda do |line|
-             category.tabular.parse_header(line)
-             category.cleanse_header!
-             category.columns = category.tabular.header.columns
-             # Call chained on_first if present
-             on_first&.call(line)
-           end
-         when :array
-           lambda do |row|
-             category.tabular.header.columns = row
-             category.cleanse_header!
-             category.columns = category.tabular.header.columns
-             # Call chained on_first if present
-             on_first&.call(line)
-           end
-         end
-       end
      end
    end
  end
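The new unified `upload` method above dispatches on the type of its first argument, replacing the separate upload helpers (which remain as deprecated wrappers). A usage sketch (the job and data are illustrative):

~~~ruby
job = MyJob.new

# File or IOStreams path: the format is auto-detected from the file name.
job.upload("data/users.csv")

# Range: uploads one start/end id pair per slice (a reversed range uploads in reverse order).
job.upload(1..1_000_000)

# ActiveRecord::Relation (when ActiveRecord is loaded), selected columns only.
job.upload(User.where(country_code: "US"), columns: [:user_name, :zip_code])

# Block: write records directly to the input collection.
job.upload do |io|
  io << "first line"
  io << "second line"
end
~~~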
data/lib/rocket_job/batch/performance.rb CHANGED
@@ -22,7 +22,7 @@ module RocketJob
        count_running_workers
 
        puts "Loading job with #{count} records/lines"
-       job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
+       job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
        job.input_category.slice_size = slice_size
        if encrypt
          job.input_category.serializer = :encrypt
@@ -64,7 +64,7 @@ module RocketJob
 
      # Parse command line options
      def parse(argv)
-       parser = OptionParser.new do |o|
+       parser = OptionParser.new do |o|
          o.on("-c", "--count COUNT", "Count of records to enqueue") do |arg|
            self.count = arg.to_i
          end
data/lib/rocket_job/batch/statistics.rb CHANGED
@@ -49,7 +49,7 @@ module RocketJob
        last = paths.pop
        return unless last
 
-       last_target = paths.inject(in_memory) do |target, sub_key|
+       last_target = paths.inject(in_memory) do |target, sub_key|
          target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
        end
        last_target[last] += increment
@@ -99,7 +99,7 @@ module RocketJob
 
      # Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
      def rocket_job_batch_log_payload
-       h = {
+       h = {
          from:  aasm.from_state,
          to:    aasm.to_state,
          event: aasm.current_event
data/lib/rocket_job/batch/throttle_running_workers.rb CHANGED
@@ -53,7 +53,7 @@ module RocketJob
      # Allows another job with a higher priority to start even though this one is running already
      # @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
      def throttle_running_jobs_base_query
-       query = super
+       query = super
        query[:priority.lte] = priority if throttle_running_workers&.positive?
        query
      end
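A sketch of what this override means in practice (the job class is illustrative; `throttle_running_workers` and `priority` are standard Rocket Job fields):

~~~ruby
class BillingRunJob < RocketJob::Job
  include RocketJob::Batch

  # Limit this job class to 10 concurrent workers.
  self.throttle_running_workers = 10

  # Lower number = higher priority (the Rocket Job default is 50).
  self.priority = 30

  def perform(record)
    # Process one record of the batch.
  end
end

# With the query override above, a queued instance with a higher priority
# (lower number) is allowed to start working even while a lower-priority
# instance of the same job is already running.
~~~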