rocketjob 6.0.0.rc1 → 6.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +25 -18
  4. data/lib/rocket_job/batch/io.rb +130 -130
  5. data/lib/rocket_job/batch/performance.rb +2 -2
  6. data/lib/rocket_job/batch/statistics.rb +2 -2
  7. data/lib/rocket_job/batch/throttle_running_workers.rb +1 -1
  8. data/lib/rocket_job/batch/worker.rb +14 -12
  9. data/lib/rocket_job/batch.rb +0 -1
  10. data/lib/rocket_job/category/base.rb +10 -7
  11. data/lib/rocket_job/category/input.rb +61 -1
  12. data/lib/rocket_job/category/output.rb +9 -0
  13. data/lib/rocket_job/cli.rb +1 -1
  14. data/lib/rocket_job/dirmon_entry.rb +1 -1
  15. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  16. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  17. data/lib/rocket_job/job_exception.rb +1 -1
  18. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  19. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  20. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  21. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  22. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  23. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -97
  24. data/lib/rocket_job/jobs/upload_file_job.rb +6 -3
  25. data/lib/rocket_job/lookup_collection.rb +4 -3
  26. data/lib/rocket_job/plugins/cron.rb +60 -20
  27. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  28. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  29. data/lib/rocket_job/plugins/restart.rb +3 -110
  30. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  31. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +43 -0
  32. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  33. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  34. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  35. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  36. data/lib/rocket_job/sliced/input.rb +42 -54
  37. data/lib/rocket_job/sliced/slice.rb +7 -3
  38. data/lib/rocket_job/sliced/slices.rb +12 -9
  39. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  40. data/lib/rocket_job/sliced/writer/output.rb +0 -1
  41. data/lib/rocket_job/sliced.rb +1 -19
  42. data/lib/rocket_job/throttle_definitions.rb +7 -1
  43. data/lib/rocket_job/version.rb +1 -1
  44. data/lib/rocketjob.rb +4 -5
  45. metadata +12 -12
  46. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  47. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  48. data/lib/rocket_job/batch/tabular.rb +0 -58
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 2794f5dc5e0ada3ffdc3da9a13fd0cb6c5713f89254d93b69d60677283bc2d64
-   data.tar.gz: 3a208b181aca760b07432348bc2e51443a9da03cc6a143be81765ca2b3c0e37a
+   metadata.gz: 305189df5d57cf64c3d771bc05f940df6be6fd8c322ae9f3f796166fe99e1b75
+   data.tar.gz: a16ed0d77f1d1cb4e0ec4beefba922278a8b05e90ab5ecaffe6a3c347abdfed0
  SHA512:
-   metadata.gz: 44816973f2f63dc300fe41e168ae485cd8b7def5e3bbab173501f6ef2935d3f65707ec4c2f9eb7cb6b43bab2d464b163f3e10216be50babf2dab5e82a7998439
-   data.tar.gz: c0b2d210a3bb3faa49f30eeaf687052ed79e9da802635c6434e374c4f1ccc3538a71a4db9391f41e18a6265106aa7fedeaf920edf4e9d11acf81c9bc632534bd
+   metadata.gz: f2afe61652e3b6b225515b95e7b370f8c4a8853c30cad45d0fc93d49a86571764b480aaca5a5cfb264ea1ea86912b6b680e4e122952383f2425585e3683e8581
+   data.tar.gz: eb1b041fbe425143c5f2b64aa5e799c35bf6885c00020964cf87d87485ba359e19796f7e5c5378baefe9f4ca93fe5de900b78e6669c7ffe83838c87e197ab0e1
data/README.md CHANGED
@@ -17,21 +17,177 @@ Checkout https://rocketjob.io/
  * Questions? Join the chat room on Gitter for [rocketjob support](https://gitter.im/rocketjob/support)
  * [Report bugs](https://github.com/rocketjob/rocketjob/issues)
 
- ## Rocket Job v5
+ ## Rocket Job v6
 
  - Support for Ruby v3 and Rails 6.
- - Multiple output file support through extended `output_categories` capability.
- - File output formats for each category. For example: CSV, PSV, JSON, etc.
- - Support for AWS DocumentDB as the data store.
+ - Major enhancements in Batch job support:
+   - Direct built-in Tabular support for all input and output categories.
+   - Multiple output file support, each with its own settings for:
+     - Compression
+       - GZip, Zip, BZip2 (chunked, for much faster loading into Apache Spark).
+     - Encryption
+       - PGP, Symmetric Encryption.
+     - File format
+       - CSV, PSV, JSON, Fixed Format, xlsx.
+ - Significant error handling improvements, especially around throttle failures
+   that used to result in "hanging" jobs.
+ - Support for AWS DocumentDB in addition to MongoDB as the data store.
  - Removed use of Symbols to meet Symbol deprecation in MongoDB and Mongoid.
 
- The following plugins have been deprecated and will be removed in Rocket Job v5.1
- - RocketJob::Batch::Tabular::Input
- - RocketJob::Batch::Tabular::Output
+ ### Upgrading to Rocket Job v6
+
+ The following plugins have been deprecated and are no longer loaded by default:
+ - `RocketJob::Batch::Tabular::Input`
+ - `RocketJob::Batch::Tabular::Output`
+
+ If your code relies on these plugins and you still want to upgrade to Rocket Job v6,
+ add the following require statement to any jobs that still use them:
+
+ ~~~ruby
+ require "rocket_job/batch/tabular"
+ ~~~
+
+ It is important to migrate away from these plugins, since they will be removed in a future release.
+
+ #### Scheduled Jobs
+
+ For any scheduled jobs that include the `RocketJob::Plugins::Cron` plugin, the default behavior has changed:
+ the next scheduled job instance is now created immediately after the currently scheduled instance starts.
+
+ To maintain the old behavior of creating the next instance only when the job fails, aborts, or completes,
+ add the following line to each of the applicable jobs:
+
+ ~~~ruby
+ self.cron_after_start = false
+ ~~~
+
+ Additionally, scheduled jobs will now prevent a new instance from being created when another scheduled
+ instance of the same job is already queued or running with the _same_ `cron_schedule`.
+
+ To maintain the old behavior of allowing multiple instances with the same cron schedule, add the following
+ line to each of the applicable jobs:
+
+ ~~~ruby
+ self.cron_singleton = false
+ ~~~
+
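Putting those two settings together, a minimal sketch of a scheduled job that opts back into the v5 behavior (class name and schedule are hypothetical):

~~~ruby
class NightlyReportJob < RocketJob::Job
  include RocketJob::Plugins::Cron

  self.cron_schedule    = "0 2 * * *" # Hypothetical: 2am UTC daily.
  self.cron_after_start = false       # v5 behavior: create the next instance on fail/abort/complete.
  self.cron_singleton   = false       # v5 behavior: allow duplicates with the same cron_schedule.

  def perform
    # ...
  end
end
~~~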
+ ##### Singleton
+
+ Since scheduled jobs now implement their own singleton logic, remove the Singleton plugin from any scheduled jobs.
+
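Continuing the hypothetical `NightlyReportJob` sketch above, the upgrade is to drop the Singleton include and let the scheduled job's own singleton logic take over:

~~~ruby
class NightlyReportJob < RocketJob::Job
  include RocketJob::Plugins::Cron
  # Remove when upgrading to v6; scheduled jobs now enforce this themselves:
  # include RocketJob::Plugins::Singleton

  self.cron_schedule = "0 2 * * *"
end
~~~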
+ #### Upgrading Batch Jobs to Rocket Job v6
+
+ Rocket Job v6 replaces the array of symbols for `input_categories` and `output_categories`
+ with an array of `RocketJob::Category::Input` and `RocketJob::Category::Output` objects.
+
+ Jobs that added or modified the input or output categories need to be upgraded. For example:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.output_categories = [:main, :errors, :ignored]
+ end
+ ~~~
+
+ Needs to be changed to:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   output_category name: :main
+   output_category name: :errors
+   output_category name: :ignored
+ end
+ ~~~
+
+ ##### slice_size, encrypt, compress
+
+ These fields have been removed from the job itself:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.slice_size = 1_000
+   self.encrypt    = true
+   self.compress   = true
+ end
+ ~~~
+
+ They are now specified on the `input_category` as follows:
+ - `slice_size` just moves under `input_category`.
+ - `encrypt` becomes an option to `serializer`.
+ - `compress` is now the default for all batch jobs, so it is no longer needed.
+
+ If the serializer is set to `encrypt`, the data is automatically compressed as well.
+
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category slice_size: 1_000, serializer: :encrypt
+ end
+ ~~~
+
+ ##### collect_output, collect_nil_output
+
+ The following fields have been moved from the job itself:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   self.collect_output     = true
+   self.collect_nil_output = true
+ end
+ ~~~
+
+ Into the corresponding `output_category`:
+ - `collect_output` no longer has any meaning. Output is collected whenever an `output_category` is defined.
+ - `collect_nil_output` is now the option `nils` on the `output_category`.
+   It defaults to `false`, so that by default any `nil` output from the `perform` method is not collected.
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   output_category nils: true
+ end
+ ~~~
+
+ ##### name
+
+ For both `input_category` and `output_category`, when the `name` argument is not supplied
+ it defaults to `:main`.
+
+ For example:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category name: :main, serializer: :encrypt
+   output_category name: :main
+ end
+ ~~~
+
+ Is the same as:
+ ~~~ruby
+ class MyJob < RocketJob::Job
+   include RocketJob::Batch
+
+   input_category serializer: :encrypt
+   output_category
+ end
+ ~~~
+
+ ##### Existing and in-flight jobs
+
+ When migrating to Rocket Job v6, it is recommended to load every job and then save it back again as part of the
+ deployment. When a job loads, it automatically converts itself from the old schema to the new v6 schema.
+
+ In-flight jobs should not be affected, other than that it is important to shut down all running batch
+ servers _before_ starting any new instances.
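A minimal sketch of that load-and-save pass, assuming it is run from a console or deployment task and that all stored jobs still pass validation:

~~~ruby
# Loading each job converts it from the old schema to the v6 schema;
# saving persists the converted document.
RocketJob::Job.all.each(&:save!)
~~~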
  ## Rocket Job v4
 
- Rocket Job Pro is now open source and included in Rocket Job.
+ Rocket Job Pro is now fully open source and included in Rocket Job under the Apache License.
 
  The `RocketJob::Batch` plugin now adds batch processing capabilities to break up a single task into many
  concurrent workers processing slices of the entire job at the same time.
data/lib/rocket_job/batch/categories.rb CHANGED
@@ -72,31 +72,38 @@ module RocketJob
      end
 
      def input_category(category_name = :main)
+       return category_name if category_name.is_a?(Category::Input)
+       raise(ArgumentError, "Cannot supply Output Category to input category") if category_name.is_a?(Category::Output)
+
        category_name = category_name.to_sym
-       category = nil
-       # .find does not work against this association
-       input_categories.each { |catg| category = catg if catg.name == category_name }
-       unless category
-         # Auto-register main input category if missing
-         if category_name == :main
-           category = Category::Input.new
-           self.input_categories = [category]
-         else
-           raise(ArgumentError, "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}")
-         end
+       # find does not work against this association
+       input_categories.each { |category| return category if category.name == category_name }
+
+       unless category_name == :main
+         raise(
+           ArgumentError,
+           "Unknown Input Category: #{category_name.inspect}. Registered categories: #{input_categories.collect(&:name).join(',')}"
+         )
        end
+
+       # Auto-register main input category when not defined
+       category = Category::Input.new(job: self)
+       self.input_categories << category
        category
      end
 
      def output_category(category_name = :main)
+       return category_name if category_name.is_a?(Category::Output)
+       raise(ArgumentError, "Cannot supply Input Category to output category") if category_name.is_a?(Category::Input)
+
        category_name = category_name.to_sym
-       category = nil
        # .find does not work against this association
-       output_categories.each { |catg| category = catg if catg.name == category_name }
-       unless category
-         raise(ArgumentError, "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}")
-       end
-       category
+       output_categories.each { |category| return category if category.name == category_name }
+
+       raise(
+         ArgumentError,
+         "Unknown Output Category: #{category_name.inspect}. Registered categories: #{output_categories.collect(&:name).join(',')}"
+       )
      end
 
      # Returns [true|false] whether the named category has already been defined
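Given the rewritten lookups above, the `:main` input category is now auto-registered on first access, while unknown names still raise. A quick behavioral sketch (job class hypothetical):

~~~ruby
class MyBatchJob < RocketJob::Job
  include RocketJob::Batch

  output_category name: :errors
end

job = MyBatchJob.new
job.input_category            # Auto-registers and returns the :main input category.
job.output_category(:errors)  # Returns the registered :errors output category.
job.output_category(:other)   # Raises ArgumentError listing the registered categories.
~~~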
@@ -211,7 +218,7 @@ module RocketJob
        category.tabular.render(row)
      end
 
-     # Migrate existing v4 batch jobs to v5.0
+     # Migrate existing v5 batch jobs to v6
      def rocketjob_categories_migrate
        return unless attribute_present?(:input_categories) && self[:input_categories]&.first.is_a?(Symbol)
data/lib/rocket_job/batch/io.rb CHANGED
@@ -14,11 +14,9 @@ module RocketJob
      # Default: None ( Uses the single default input collection for this job )
      # Validates: This value must be one of those listed in #input_categories
      def input(category = :main)
-       raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
+       category = input_category(category)
 
-       category = input_category(category) unless category.is_a?(Category::Input)
-
-       (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
+       (@inputs ||= {})[category.name] ||= category.data_store(self)
      end
 
      # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -30,11 +28,9 @@ module RocketJob
      # Default: None ( Uses the single default output collection for this job )
      # Validates: This value must be one of those listed in #output_categories
      def output(category = :main)
-       raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
-
-       category = output_category(category) unless category.is_a?(Category::Output)
+       category = output_category(category)
 
-       (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
+       (@outputs ||= {})[category.name] ||= category.data_store(self)
      end
 
      # Rapidly upload individual records in batches.
@@ -59,19 +55,19 @@ module RocketJob
      # The category or the name of the category to access or download data from
      # Default: None ( Uses the single default output collection for this job )
      # Validates: This value must be one of those listed in #input_categories
-     def lookup_collection(category = :main)
-       category = input_category(category) unless category.is_a?(Category::Input)
-
-       collection = (@lookup_collections ||= {})[category.name]
-
-       unless collection
-         collection_name = "rocket_job.inputs.#{id}"
-         collection_name << ".#{category.name}" unless category.name == :main
-
-         @lookup_collections[category.name] ||=
-           LookupCollection.new(Sliced::Slice.collection.database, collection_name)
-       end
-     end
+     # def lookup_collection(category = :main)
+     #   category = input_category(category) unless category.is_a?(Category::Input)
+     #
+     #   collection = (@lookup_collections ||= {})[category.name]
+     #
+     #   unless collection
+     #     collection_name = "rocket_job.inputs.#{id}"
+     #     collection_name << ".#{category.name}" unless category.name == :main
+     #
+     #     @lookup_collections[category.name] ||=
+     #       LookupCollection.new(Sliced::Slice.collection.database, collection_name)
+     #   end
+     # end
 
      # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
      #
@@ -154,53 +150,7 @@ module RocketJob
      # * If an io stream is supplied, it is read until it returns nil.
      # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
      # * CSV parsing is slow, so it is usually left for the workers to do.
-     def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
-       raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
-
-       category = input_category(category) unless category.is_a?(Category::Input)
-       stream ||= category.file_name
-       path = nil
-
-       if stream
-         path = IOStreams.new(stream)
-         path.file_name = file_name if file_name
-         category.file_name = path.file_name
-
-         # Auto detect the format based on the upload file name if present.
-         if category.format == :auto
-           format = path.format
-           if format
-             # Rebuild tabular with the above file name
-             category.reset_tabular
-             category.format = format
-           end
-         end
-       end
-
-       # Tabular transformations required for upload?
-       if category.tabular?
-         # Remove non-printable characters from tabular input formats
-         # Cannot change the length of fixed width lines
-         replace = category.format == :fixed ? " " : ""
-         path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
-
-         # Extract the header line during the file upload when needed.
-         on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
-       end
-
-       count =
-         if block
-           input(category).upload(on_first: on_first, &block)
-         else
-           input(category).upload(on_first: on_first) do |io|
-             path.each(stream_mode, **args) { |line| io << line }
-           end
-         end
-
-       self.record_count = (record_count || 0) + count
-       count
-     end
-
+     #
      # Upload results from an Arel into RocketJob::SlicedJob.
      #
      # Params
@@ -227,18 +177,13 @@ module RocketJob
      #
      # Example: Upload user_name and zip_code
      #   arel = User.where(country_code: 'US')
-     #   job.upload_arel(arel, :user_name, :zip_code)
+     #   job.upload_arel(arel, columns: [:user_name, :zip_code])
      #
      # Notes:
      # * Only call from one thread at a time against a single instance of this job.
      # * The record_count for the job is set to the number of records returned by the arel.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-     def upload_arel(arel, *column_names, category: :main, &block)
-       count = input(category).upload_arel(arel, *column_names, &block)
-       self.record_count = (record_count || 0) + count
-       count
-     end
 
      # Upload the result of a MongoDB query to the input collection for processing
      # Useful when an entire MongoDB collection, or part thereof needs to be
@@ -266,24 +211,19 @@ module RocketJob
      #   criteria = User.where(state: 'FL')
      #   job.record_count = job.upload_mongo_query(criteria)
      #
-     # Example: Upload just the supplied column
+     # Example: Upload only the specified column(s)
      #   criteria = User.where(state: 'FL')
-     #   job.record_count = job.upload_mongo_query(criteria, :zip_code)
+     #   job.record_count = job.upload_mongo_query(criteria, columns: [:zip_code])
      #
      # Notes:
      # * Only call from one thread at a time against a single instance of this job.
      # * The record_count for the job is set to the number of records returned by the mongo query.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-     def upload_mongo_query(criteria, *column_names, category: :main, &block)
-       count = input(category).upload_mongo_query(criteria, *column_names, &block)
-       self.record_count = (record_count || 0) + count
-       count
-     end
 
      # Upload sliced range of integer requests as arrays of start and end ids.
      #
-     # Returns [Integer] last_id - start_id + 1.
+     # Returns [Integer] the number of slices uploaded.
      #
      # Uploads one range per slice so that the response can return multiple records
     # for each slice processed
@@ -302,17 +242,11 @@ module RocketJob
      # * The record_count for the job is set to: last_id - start_id + 1.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-     def upload_integer_range(start_id, last_id, category: :main)
-       input(category).upload_integer_range(start_id, last_id)
-       count = last_id - start_id + 1
-       self.record_count = (record_count || 0) + count
-       count
-     end
 
      # Upload sliced range of integer requests as arrays of start and end ids,
      # starting with the last range first
      #
-     # Returns [Integer] last_id - start_id + 1.
+     # Returns [Integer] the number of slices uploaded.
      #
      # Uploads one range per slice so that the response can return multiple records
      # for each slice processed.
@@ -334,14 +268,102 @@ module RocketJob
      # * The record_count for the job is set to: last_id - start_id + 1.
      # * If an exception is raised while uploading data, the input collection is cleared out
      #   so that if a job is retried during an upload failure, data is not duplicated.
-     def upload_integer_range_in_reverse_order(start_id, last_id, category: :main)
-       input(category).upload_integer_range_in_reverse_order(start_id, last_id)
-       count = last_id - start_id + 1
+
+     def upload(object = nil, category: :main, file_name: nil, stream_mode: nil, on_first: nil, columns: nil, slice_batch_size: nil, **args, &block)
+       input_collection = input(category)
+
+       if block
+         raise(ArgumentError, "Cannot supply both an object to upload, and a block.") if object
+         if stream_mode || columns || slice_batch_size || args.size > 0
+           raise(ArgumentError, "Unknown keyword arguments when uploading a block. Only accepts :category, :file_name, or :on_first")
+         end
+
+         category = input_category(category)
+         category.file_name = file_name if file_name
+
+         # Extract the header line during the upload when applicable.
+         extract_header = category.extract_header_callback(on_first)
+
+         count = input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size, &block)
+         self.record_count = (record_count || 0) + count
+         return count
+       end
+
+       count =
+         case object
+         when Range
+           if file_name || stream_mode || on_first || args.size > 0
+             raise(ArgumentError, "Unknown keyword arguments when uploading a Range. Only accepts :category, :columns, or :slice_batch_size")
+           end
+
+           first = object.first
+           last  = object.last
+           if first < last
+             input_collection.upload_integer_range(first, last, slice_batch_size: slice_batch_size || 1_000)
+           else
+             input_collection.upload_integer_range_in_reverse_order(last, first, slice_batch_size: slice_batch_size || 1_000)
+           end
+         when Mongoid::Criteria
+           if file_name || stream_mode || on_first || args.size > 0
+             raise(ArgumentError, "Unknown keyword arguments when uploading a Mongoid::Criteria. Only accepts :category, :columns, or :slice_batch_size")
+           end
+
+           input_collection.upload_mongo_query(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+         when defined?(ActiveRecord::Relation) ? ActiveRecord::Relation : false
+           if file_name || stream_mode || on_first || args.size > 0
+             raise(ArgumentError, "Unknown keyword arguments when uploading an ActiveRecord::Relation. Only accepts :category, :columns, or :slice_batch_size")
+           end
+
+           input_collection.upload_arel(object, columns: columns, slice_batch_size: slice_batch_size, &block)
+         else
+           raise(ArgumentError, "Unknown keyword argument :columns when uploading a file") if columns
+
+           category = input_category(category)
+
+           # Extract the header line during the upload when applicable.
+           extract_header = category.extract_header_callback(on_first)
+           path = category.upload_path(object, original_file_name: file_name)
+
+           input_collection.upload(on_first: extract_header, slice_batch_size: slice_batch_size) do |io|
+             path.each(stream_mode || :line, **args) { |line| io << line }
+           end
+         end
+
+       self.record_count = (record_count || 0) + count
+       count
+     end
+
+     # @deprecated
+     def upload_arel(arel, *column_names, category: :main, &block)
+       count = input(category).upload_arel(arel, columns: column_names, &block)
        self.record_count = (record_count || 0) + count
        count
      end
 
-     # Upload the supplied slices for processing by workers
+     # @deprecated
+     def upload_mongo_query(criteria, *column_names, category: :main, &block)
+       count = input(category).upload_mongo_query(criteria, columns: column_names, &block)
+       self.record_count = (record_count || 0) + count
+       count
+     end
+
+     # @deprecated
+     def upload_integer_range(start_id, last_id, category: :main, slice_batch_size: 1_000)
+       count = input(category).upload_integer_range(start_id, last_id, slice_batch_size: slice_batch_size)
+       self.record_count = (record_count || 0) + count
+       count
+     end
+
+     # @deprecated
+     def upload_integer_range_in_reverse_order(start_id, last_id, category: :main, slice_batch_size: 1_000)
+       count = input(category).upload_integer_range_in_reverse_order(start_id, last_id, slice_batch_size: slice_batch_size)
+       self.record_count = (record_count || 0) + count
+       count
+     end
+
+     # Upload the supplied slice for processing by workers
      #
      # Updates the record_count after adding the records
      #
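With the consolidated `upload` method above, the type of the object selects the upload strategy. A rough usage sketch (job, model, and file names hypothetical):

~~~ruby
job = MyBatchJob.new

# A file name, IO, or IOStreams path/stream is parsed according to the input category.
job.upload("data/users.csv")

# A Range uploads one integer range per slice; a descending range uploads in reverse order.
job.upload(1..100_000)

# A Mongoid::Criteria or ActiveRecord::Relation, optionally restricted to specific columns.
job.upload(User.where(state: "FL"), columns: [:id, :zip_code])
~~~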
@@ -421,56 +443,34 @@ module RocketJob
      def download(stream = nil, category: :main, header_line: nil, **args, &block)
        raise "Cannot download incomplete job: #{id}. Currently in state: #{state}-#{sub_state}" if rocket_job_processing?
 
-       category = output_category(category) unless category.is_a?(Category::Output)
-       output_collection = output(category)
+       category = output_category(category) unless category.is_a?(Category::Output)
+       output_collection = output(category)
 
        # Store the output file name in the category
        category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
 
-       if output_collection.binary?
-         raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
-
-         return output_collection.download(&block) if block
+       header_line ||= category.render_header
 
-         IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
-           output_collection.download { |record| io << record[:binary] }
-         end
-       else
-         header_line ||= category.render_header
+       return output_collection.download(header_line: header_line, &block) if block
 
-         return output_collection.download(header_line: header_line, &block) if block
+       raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
 
-         raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
+       if output_collection.slice_class.binary_format
+         binary_header_line = output_collection.slice_class.to_binary(header_line) if header_line
 
+         # Don't overwrite supplied stream options if any
+         stream = stream&.is_a?(IOStreams::Stream) ? stream.dup : IOStreams.new(category.file_name)
+         stream.remove_from_pipeline(output_collection.slice_class.binary_format)
+         stream.writer(**args) do |io|
+           # TODO: Binary formats should return the record count, instead of the slice count.
+           output_collection.download(header_line: binary_header_line) { |record| io.write(record) }
+         end
+       else
          IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
            output_collection.download(header_line: header_line) { |record| io << record }
          end
        end
      end
-
-     private
-
-     # Return a lambda to extract the header row from the uploaded file.
-     def rocket_job_upload_header_lambda(category, on_first)
-       case category.mode
-       when :line
-         lambda do |line|
-           category.tabular.parse_header(line)
-           category.cleanse_header!
-           category.columns = category.tabular.header.columns
-           # Call chained on_first if present
-           on_first&.call(line)
-         end
-       when :array
-         lambda do |row|
-           category.tabular.header.columns = row
-           category.cleanse_header!
-           category.columns = category.tabular.header.columns
-           # Call chained on_first if present
-           on_first&.call(line)
-         end
-       end
-     end
    end
  end
end
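A short usage sketch of the reworked `download` (job and file names hypothetical). Binary slice formats, such as the chunked BZip2 output slices, now flow through the same method via `slice_class.binary_format`:

~~~ruby
# Write all output from the :main category; IOStreams applies compression
# based on the file name's extension.
job.download("results.csv.gz")

# Or stream each output record through a block instead of writing a file.
job.download { |record| puts record }
~~~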
data/lib/rocket_job/batch/performance.rb CHANGED
@@ -22,7 +22,7 @@ module RocketJob
        count_running_workers
 
        puts "Loading job with #{count} records/lines"
-       job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
+       job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
        job.input_category.slice_size = slice_size
        if encrypt
          job.input_category.serializer = :encrypt
@@ -64,7 +64,7 @@ module RocketJob
 
      # Parse command line options
      def parse(argv)
-       parser = OptionParser.new do |o|
+       parser = OptionParser.new do |o|
          o.on("-c", "--count COUNT", "Count of records to enqueue") do |arg|
            self.count = arg.to_i
          end
data/lib/rocket_job/batch/statistics.rb CHANGED
@@ -49,7 +49,7 @@ module RocketJob
        last = paths.pop
        return unless last
 
-       last_target = paths.inject(in_memory) do |target, sub_key|
+       last_target = paths.inject(in_memory) do |target, sub_key|
          target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
        end
        last_target[last] += increment
@@ -99,7 +99,7 @@ module RocketJob
 
      # Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
      def rocket_job_batch_log_payload
-       h = {
+       h = {
          from: aasm.from_state,
          to: aasm.to_state,
          event: aasm.current_event
data/lib/rocket_job/batch/throttle_running_workers.rb CHANGED
@@ -53,7 +53,7 @@ module RocketJob
      # Allows another job with a higher priority to start even though this one is running already
      # @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
      def throttle_running_jobs_base_query
-       query = super
+       query = super
        query[:priority.lte] = priority if throttle_running_workers&.positive?
        query
      end