rocketjob 5.4.0.beta2 → 6.0.0.rc3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +149 -5
  3. data/bin/rocketjob_batch_perf +1 -1
  4. data/bin/rocketjob_perf +1 -1
  5. data/lib/rocket_job/batch.rb +3 -1
  6. data/lib/rocket_job/batch/categories.rb +341 -0
  7. data/lib/rocket_job/batch/io.rb +128 -60
  8. data/lib/rocket_job/batch/model.rb +20 -68
  9. data/lib/rocket_job/batch/performance.rb +19 -7
  10. data/lib/rocket_job/batch/statistics.rb +34 -12
  11. data/lib/rocket_job/batch/tabular.rb +2 -0
  12. data/lib/rocket_job/batch/tabular/input.rb +8 -6
  13. data/lib/rocket_job/batch/tabular/output.rb +4 -2
  14. data/lib/rocket_job/batch/throttle_running_workers.rb +8 -17
  15. data/lib/rocket_job/batch/worker.rb +27 -24
  16. data/lib/rocket_job/category/base.rb +78 -0
  17. data/lib/rocket_job/category/input.rb +110 -0
  18. data/lib/rocket_job/category/output.rb +25 -0
  19. data/lib/rocket_job/cli.rb +25 -17
  20. data/lib/rocket_job/dirmon_entry.rb +22 -12
  21. data/lib/rocket_job/event.rb +1 -1
  22. data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
  23. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  24. data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
  25. data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
  26. data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
  27. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  28. data/lib/rocket_job/jobs/conversion_job.rb +39 -0
  29. data/lib/rocket_job/jobs/dirmon_job.rb +2 -2
  30. data/lib/rocket_job/jobs/housekeeping_job.rb +7 -7
  31. data/lib/rocket_job/jobs/on_demand_batch_job.rb +17 -6
  32. data/lib/rocket_job/jobs/on_demand_job.rb +1 -2
  33. data/lib/rocket_job/jobs/performance_job.rb +3 -1
  34. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
  35. data/lib/rocket_job/jobs/upload_file_job.rb +44 -8
  36. data/lib/rocket_job/lookup_collection.rb +69 -0
  37. data/lib/rocket_job/plugins/job/model.rb +25 -50
  38. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  39. data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +12 -4
  40. data/lib/rocket_job/plugins/job/worker.rb +2 -7
  41. data/lib/rocket_job/plugins/restart.rb +12 -5
  42. data/lib/rocket_job/plugins/state_machine.rb +2 -1
  43. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +38 -0
  44. data/lib/rocket_job/ractor_worker.rb +42 -0
  45. data/lib/rocket_job/server/model.rb +1 -1
  46. data/lib/rocket_job/sliced.rb +15 -70
  47. data/lib/rocket_job/sliced/bzip2_output_slice.rb +1 -1
  48. data/lib/rocket_job/sliced/input.rb +1 -1
  49. data/lib/rocket_job/sliced/slice.rb +5 -13
  50. data/lib/rocket_job/sliced/slices.rb +14 -2
  51. data/lib/rocket_job/sliced/writer/output.rb +33 -45
  52. data/lib/rocket_job/subscribers/server.rb +1 -1
  53. data/lib/rocket_job/thread_worker.rb +46 -0
  54. data/lib/rocket_job/throttle_definitions.rb +7 -1
  55. data/lib/rocket_job/version.rb +1 -1
  56. data/lib/rocket_job/worker.rb +21 -55
  57. data/lib/rocket_job/worker_pool.rb +5 -7
  58. data/lib/rocketjob.rb +53 -43
  59. metadata +36 -26
  60. data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
  61. data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
@@ -9,32 +9,68 @@ module RocketJob
9
9
  # Returns [RocketJob::Sliced::Input] input collection for holding input slices
10
10
  #
11
11
  # Parameters:
12
- # category [Symbol]
13
- # The name of the category to access or upload data into
12
+ # category [Symbol|RocketJob::Category::Input]
13
+ # The category or the name of the category to access or upload data into
14
14
  # Default: None ( Uses the single default input collection for this job )
15
15
  # Validates: This value must be one of those listed in #input_categories
16
16
  def input(category = :main)
17
- unless input_categories.include?(category) || (category == :main)
18
- raise "Category #{category.inspect}, must be registered in input_categories: #{input_categories.inspect}"
19
- end
17
+ raise(ArgumentError, "Cannot supply Output Category to input category") if category.is_a?(Category::Output)
18
+
19
+ category = input_category(category) unless category.is_a?(Category::Input)
20
20
 
21
- (@inputs ||= {})[category] ||= RocketJob::Sliced.factory(:input, category, self)
21
+ (@inputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:input, category, self)
22
22
  end
23
23
 
24
24
  # Returns [RocketJob::Sliced::Output] output collection for holding output slices
25
25
  # Returns nil if no output is being collected
26
26
  #
27
27
  # Parameters:
28
- # category [Symbol]
29
- # The name of the category to access or download data from
28
+ # category [Symbol|RocketJob::Category::Output]
29
+ # The category or the name of the category to access or download data from
30
30
  # Default: None ( Uses the single default output collection for this job )
31
31
  # Validates: This value must be one of those listed in #output_categories
32
32
  def output(category = :main)
33
- unless output_categories.include?(category) || (category == :main)
34
- raise "Category #{category.inspect}, must be registered in output_categories: #{output_categories.inspect}"
35
- end
33
+ raise(ArgumentError, "Cannot supply Input Category to output category") if category.is_a?(Category::Input)
34
+
35
+ category = output_category(category) unless category.is_a?(Category::Output)
36
+
37
+ (@outputs ||= {})[category.name] ||= RocketJob::Sliced.factory(:output, category, self)
38
+ end
39
+
40
+ # Rapidly upload individual records in batches.
41
+ #
42
+ # Operates directly on a Mongo Collection to avoid the overhead of creating Mongoid objects
43
+ # for each and every row.
44
+ #
45
+ # input_category(:my_lookup).find(id: 123).first
46
+ #
47
+ # Lookup collection.
48
+ #
49
+ # Upload side / secondary lookup tables that can be accessed during job processing.
50
+ #
51
+ # Example:
52
+ # lookup_collection(:my_lookup).upload do |io|
53
+ # io << {id: 123, data: "first record"}
54
+ # io << {id: 124, data: "second record"}
55
+ # end
56
+ #
57
+ # Parameters:
58
+ # category [Symbol|RocketJob::Category::Input]
59
+ # The category or the name of the category to access or download data from
60
+ # Default: None ( Uses the single default input collection for this job )
61
+ # Validates: This value must be one of those listed in #input_categories
62
+ def lookup_collection(category = :main)
63
+ category = input_category(category) unless category.is_a?(Category::Input)
36
64
 
37
- (@outputs ||= {})[category] ||= RocketJob::Sliced.factory(:output, category, self)
65
+ collection = (@lookup_collections ||= {})[category.name]
66
+
67
+ unless collection
68
+ collection_name = "rocket_job.inputs.#{id}"
69
+ collection_name << ".#{category.name}" unless category.name == :main
70
+
71
+ @lookup_collections[category.name] ||=
72
+ LookupCollection.new(Sliced::Slice.collection.database, collection_name)
73
+ end
38
74
  end
39
75
 
40
76
  # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
@@ -65,6 +101,11 @@ module RocketJob
65
101
  # Parses each line from the file into a Hash and uploads each hash for processing by workers.
66
102
  # See IOStreams::Stream#each.
67
103
  #
104
+ # category [Symbol|RocketJob::Category::Input]
105
+ # The category or the name of the category to access or download data from
106
+ # Default: None ( Uses the single default input collection for this job )
107
+ # Validates: This value must be one of those listed in #input_categories
108
+ #
68
109
  # Example:
69
110
  # # Load plain text records from a file
70
111
  # job.upload('hello.csv')
@@ -116,22 +157,46 @@ module RocketJob
116
157
  def upload(stream = nil, file_name: nil, category: :main, stream_mode: :line, on_first: nil, **args, &block)
117
158
  raise(ArgumentError, "Either stream, or a block must be supplied") unless stream || block
118
159
 
119
- stream_mode = stream_mode.to_sym
120
- # Backward compatibility with existing v4 jobs
121
- stream_mode = :array if stream_mode == :row
122
- stream_mode = :hash if stream_mode == :record
160
+ category = input_category(category) unless category.is_a?(Category::Input)
161
+ stream ||= category.file_name
162
+ path = nil
123
163
 
124
- count =
164
+ if stream
165
+ path = IOStreams.new(stream)
166
+ path.file_name = file_name if file_name
167
+ category.file_name = path.file_name
168
+
169
+ # Auto detect the format based on the upload file name if present.
170
+ if category.format == :auto
171
+ format = path.format
172
+ if format
173
+ # Rebuild tabular with the above file name
174
+ category.reset_tabular
175
+ category.format = format
176
+ end
177
+ end
178
+ end
179
+
180
+ # Tabular transformations required for upload?
181
+ if category.tabular?
182
+ # Remove non-printable characters from tabular input formats
183
+ # Cannot change the length of fixed width lines
184
+ replace = category.format == :fixed ? " " : ""
185
+ path&.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
186
+
187
+ # Extract the header line during the file upload when needed.
188
+ on_first = rocket_job_upload_header_lambda(category, on_first) if category.tabular.header?
189
+ end
190
+
191
+ count =
125
192
  if block
126
193
  input(category).upload(on_first: on_first, &block)
127
194
  else
128
- path = IOStreams.new(stream)
129
- path.file_name = file_name if file_name
130
- self.upload_file_name = path.file_name
131
195
  input(category).upload(on_first: on_first) do |io|
132
196
  path.each(stream_mode, **args) { |line| io << line }
133
197
  end
134
198
  end
199
+
135
200
  self.record_count = (record_count || 0) + count
136
201
  count
137
202
  end
@@ -144,6 +209,9 @@ module RocketJob
144
209
  # and uploaded into the job
145
210
  # These columns are automatically added to the select list to reduce overhead
146
211
  #
212
+ # category [Symbol|RocketJob::Category::Input]
213
+ # The category or the name of the category to upload to.
214
+ #
147
215
  # If a Block is supplied it is passed the model returned from the database and should
148
216
  # return the work item to be uploaded into the job.
149
217
  #
@@ -221,7 +289,7 @@ module RocketJob
221
289
  # for each slice processed
222
290
  #
223
291
  # Example
224
- # job.slice_size = 100
292
+ # job.input_category.slice_size = 100
225
293
  # job.upload_integer_range(200, 421)
226
294
  #
227
295
  # # Equivalent to calling:
@@ -253,7 +321,7 @@ module RocketJob
253
321
  # in a database based on the id column
254
322
  #
255
323
  # Example
256
- # job.slice_size = 100
324
+ # job.input_category.slice_size = 100
257
325
  # job.upload_integer_range_in_reverse_order(200, 421)
258
326
  #
259
327
  # # Equivalent to calling:
@@ -285,12 +353,12 @@ module RocketJob
285
353
  # For example the following types are not supported: Date
286
354
  #
287
355
  # Note:
288
- # The caller should honor `:slice_size`, the entire slice is loaded as-is.
356
+ # The caller should implement `:slice_size`, since the entire slice is saved as-is.
289
357
  #
290
358
  # Note:
291
359
  # Not thread-safe. Only call from one thread at a time
292
- def upload_slice(slice)
293
- input.insert(slice)
360
+ def upload_slice(slice, category: :main)
361
+ input(category).insert(slice)
294
362
  count = slice.size
295
363
  self.record_count = (record_count || 0) + count
296
364
  count
@@ -353,54 +421,54 @@ module RocketJob
353
421
  def download(stream = nil, category: :main, header_line: nil, **args, &block)
354
422
  raise "Cannot download incomplete job: #{id}. Currently in state: #{state}-#{sub_state}" if rocket_job_processing?
355
423
 
356
- return output(category).download(header_line: header_line, &block) if block
424
+ category = output_category(category) unless category.is_a?(Category::Output)
425
+ output_collection = output(category)
357
426
 
358
- output_collection = output(category)
427
+ # Store the output file name in the category
428
+ category.file_name = stream if !block && (stream.is_a?(String) || stream.is_a?(IOStreams::Path))
359
429
 
360
430
  if output_collection.binary?
361
- IOStreams.new(stream).stream(:none).writer(**args) do |io|
362
- raise(ArgumenError, "A `header_line` is not supported with binary output collections") if header_line
431
+ raise(ArgumentError, "A `header_line` is not supported with binary output collections") if header_line
432
+
433
+ return output_collection.download(&block) if block
363
434
 
435
+ IOStreams.new(stream || category.file_name).stream(:none).writer(**args) do |io|
364
436
  output_collection.download { |record| io << record[:binary] }
365
437
  end
366
438
  else
367
- IOStreams.new(stream).writer(:line, **args) do |io|
439
+ header_line ||= category.render_header
440
+
441
+ return output_collection.download(header_line: header_line, &block) if block
442
+
443
+ raise(ArgumentError, "Missing mandatory `stream` or `category.file_name`") unless stream || category.file_name
444
+
445
+ IOStreams.new(stream || category.file_name).writer(:line, **args) do |io|
368
446
  output_collection.download(header_line: header_line) { |record| io << record }
369
447
  end
370
448
  end
371
449
  end
372
450
 
373
- # Writes the supplied result, Batch::Result or Batch::Results to the relevant collections.
374
- #
375
- # If a block is supplied, the block is supplied with a writer that should be used to
376
- # accumulate the results.
377
- #
378
- # Examples
379
- #
380
- # job.write_output('hello world')
381
- #
382
- # job.write_output do |writer|
383
- # writer << 'hello world'
384
- # end
385
- #
386
- # job.write_output do |writer|
387
- # result = RocketJob::Batch::Results
388
- # result << RocketJob::Batch::Result.new(:main, 'hello world')
389
- # result << RocketJob::Batch::Result.new(:errors, 'errors')
390
- # writer << result
391
- # end
392
- #
393
- # result = RocketJob::Batch::Results
394
- # result << RocketJob::Batch::Result.new(:main, 'hello world')
395
- # result << RocketJob::Batch::Result.new(:errors, 'errors')
396
- # job.write_output(result)
397
- def write_output(result = nil, input_slice = nil, &block)
398
- if block
399
- RocketJob::Sliced::Writer::Output.collect(self, input_slice, &block)
400
- else
401
- raise(ArgumentError, "result parameter is required when no block is supplied") unless result
451
+ private
402
452
 
403
- RocketJob::Sliced::Writer::Output.collect(self, input_slice) { |writer| writer << result }
453
+ # Return a lambda to extract the header row from the uploaded file.
454
+ def rocket_job_upload_header_lambda(category, on_first)
455
+ case category.mode
456
+ when :line
457
+ lambda do |line|
458
+ category.tabular.parse_header(line)
459
+ category.cleanse_header!
460
+ category.columns = category.tabular.header.columns
461
+ # Call chained on_first if present
462
+ on_first&.call(line)
463
+ end
464
+ when :array
465
+ lambda do |row|
466
+ category.tabular.header.columns = row
467
+ category.cleanse_header!
468
+ category.columns = category.tabular.header.columns
469
+ # Call chained on_first if present
470
+ on_first&.call(row)
471
+ end
404
472
  end
405
473
  end
406
474
  end
@@ -11,46 +11,6 @@ module RocketJob
11
11
  #
12
12
  # The following attributes are set when the job is created
13
13
 
14
- # Number of records to include in each slice that is processed
15
- # Note:
16
- # slice_size is only used by SlicedJob#upload & Sliced::Input#upload
17
- # When slices are supplied directly, their size is not modified to match this number
18
- field :slice_size, type: Integer, default: 100, class_attribute: true, user_editable: true, copy_on_restart: true
19
-
20
- # Whether to retain nil results.
21
- #
22
- # Only applicable if `collect_output` is `true`
23
- # Set to `false` to prevent collecting output from the perform
24
- # method when it returns `nil`.
25
- field :collect_nil_output, type: Boolean, default: true, class_attribute: true
26
-
27
- # Optional Array<Symbol> list of categories that this job can output to
28
- #
29
- # By using categories the output from #perform can be placed in different
30
- # output collections, and therefore different output files
31
- #
32
- # Categories must be declared in advance to avoid a #perform method
33
- # accidentally writing its results to an unknown category
34
- field :output_categories, type: Array, default: [:main], class_attribute: true
35
-
36
- # Optional Array<Symbol> list of categories that this job can load input data into
37
- field :input_categories, type: Array, default: [:main], class_attribute: true
38
-
39
- # The file name of the uploaded file, if any.
40
- # Set by #upload if a file name was supplied, but can also be set explicitly.
41
- # May or may not include the fully qualified path name.
42
- field :upload_file_name, type: String
43
-
44
- # Compress uploaded records.
45
- # The fields are not affected in any way, only the data stored in the
46
- # records and results collections will compressed
47
- field :compress, type: Object, default: false, class_attribute: true
48
-
49
- # Encrypt uploaded records.
50
- # The fields are not affected in any way, only the data stored in the
51
- # records and results collections will be encrypted
52
- field :encrypt, type: Object, default: false, class_attribute: true
53
-
54
14
  #
55
15
  # Values that jobs can also update during processing
56
16
  #
@@ -69,30 +29,7 @@ module RocketJob
69
29
 
70
30
  # Breaks the :running state up into multiple sub-states:
71
31
  # :running -> :before -> :processing -> :after -> :complete
72
- field :sub_state, type: Symbol
73
-
74
- validates_presence_of :slice_size
75
-
76
- validates_each :output_categories, :input_categories do |record, attr, value|
77
- # Under some circumstances ActiveModel is passing in a nil value even though the
78
- # attributes have default values
79
- Array(value).each do |category|
80
- record.errors.add(attr, "must only contain Symbol values") unless category.is_a?(Symbol)
81
- unless category.to_s =~ /\A[a-z_0-9]+\Z/
82
- record.errors.add(attr, "must only consist of lowercase characters, digits, and _")
83
- end
84
- end
85
- end
86
- end
87
-
88
- # Returns [true|false] whether the slices for this job are encrypted
89
- def encrypted?
90
- encrypt == true
91
- end
92
-
93
- # Returns [true|false] whether the slices for this job are compressed
94
- def compressed?
95
- compress == true
32
+ field :sub_state, type: Mongoid::StringifiedSymbol
96
33
  end
97
34
 
98
35
  # Returns [Integer] percent of records completed so far
@@ -102,10 +39,10 @@ module RocketJob
102
39
  return 0 unless record_count.to_i.positive?
103
40
 
104
41
  # Approximate number of input records
105
- input_records = input.count.to_f * slice_size
42
+ input_records = input.count.to_f * input_category.slice_size
106
43
  if input_records > record_count
107
44
  # Sanity check in case slice_size is not being adhered to
108
- 99
45
+ 0
109
46
  else
110
47
  ((1.0 - (input_records.to_f / record_count)) * 100).to_i
111
48
  end
@@ -120,6 +57,10 @@ module RocketJob
120
57
  h["active_slices"] = worker_count
121
58
  h["failed_slices"] = input.failed.count
122
59
  h["queued_slices"] = input.queued.count
60
+ output_categories.each do |category|
61
+ name_str = category.name == :main ? "" : "_#{category.name}"
62
+ h["output_slices#{name_str}"] = output(category).count
63
+ end
123
64
  # Very high level estimated time left
124
65
  if record_count && running? && record_count.positive?
125
66
  percent = percent_complete
@@ -129,10 +70,9 @@ module RocketJob
129
70
  end
130
71
  end
131
72
  elsif completed?
132
- secs = seconds.to_f
73
+ secs = seconds.to_f
133
74
  h["records_per_hour"] = ((record_count.to_f / secs) * 60 * 60).round if record_count&.positive? && (secs > 0.0)
134
75
  end
135
- h["output_slices"] = output.count if collect_output? && !completed?
136
76
  h.merge!(super(time_zone))
137
77
  h.delete("result")
138
78
  # Worker name should be retrieved from the slices when processing
@@ -172,6 +112,18 @@ module RocketJob
172
112
  @worker_count_last = Time.now.to_i
173
113
  @worker_count
174
114
  end
115
+
116
+ # @deprecated
117
+ # For backward compatibility
118
+ def upload_file_name
119
+ input_category.file_name
120
+ end
121
+
122
+ # @deprecated
123
+ # For backward compatibility
124
+ def upload_file_name=(upload_file_name)
125
+ input_category.file_name = upload_file_name
126
+ end
175
127
  end
176
128
  end
177
129
  end
@@ -22,12 +22,15 @@ module RocketJob
22
22
  count_running_workers
23
23
 
24
24
  puts "Loading job with #{count} records/lines"
25
- args = {log_level: :warn, slice_size: slice_size}
26
- if defined?(::RocketJob)
27
- args[:compress] = compress
28
- args[:encrypt] = encrypt
25
+ job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
26
+ job.input_category.slice_size = slice_size
27
+ if encrypt
28
+ job.input_category.serializer = :encrypt
29
+ job.output_category.serializer = :encrypt
30
+ elsif !compress
31
+ job.input_category.serializer = :none
32
+ job.output_category.serializer = :none
29
33
  end
30
- job = RocketJob::Jobs::PerformanceJob.new(args)
31
34
  job.upload do |writer|
32
35
  count.times { |i| writer << i }
33
36
  end
@@ -37,7 +40,15 @@ module RocketJob
37
40
  sleep 3 until job.reload.completed?
38
41
 
39
42
  duration = job.completed_at - job.started_at
40
- {count: count, duration: duration, records_per_second: (count.to_f / duration).round(3), workers: workers, servers: servers, compress: compress, encrypt: encrypt}
43
+ {
44
+ count: count,
45
+ duration: duration,
46
+ records_per_second: (count.to_f / duration).round(3),
47
+ workers: workers,
48
+ servers: servers,
49
+ compress: compress,
50
+ encrypt: encrypt
51
+ }
41
52
  end
42
53
 
43
54
  # Export the Results hash to a CSV file
@@ -60,7 +71,8 @@ module RocketJob
60
71
  o.on("-m", "--mongo MONGO_CONFIG_FILE_NAME", "Location of mongoid.yml config file") do |arg|
61
72
  self.mongo_config = arg
62
73
  end
63
- o.on("-e", "--environment ENVIRONMENT", "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
74
+ o.on("-e", "--environment ENVIRONMENT",
75
+ "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
64
76
  self.environment = arg
65
77
  end
66
78
  o.on("-z", "--compress", "Turn on compression") do