rocketjob 5.4.1 → 6.0.0

This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/README.md +175 -5
  3. data/bin/rocketjob_batch_perf +1 -1
  4. data/bin/rocketjob_perf +1 -1
  5. data/lib/rocket_job/batch/categories.rb +345 -0
  6. data/lib/rocket_job/batch/io.rb +174 -106
  7. data/lib/rocket_job/batch/model.rb +20 -68
  8. data/lib/rocket_job/batch/performance.rb +19 -7
  9. data/lib/rocket_job/batch/statistics.rb +34 -12
  10. data/lib/rocket_job/batch/throttle_running_workers.rb +2 -6
  11. data/lib/rocket_job/batch/worker.rb +31 -26
  12. data/lib/rocket_job/batch.rb +3 -1
  13. data/lib/rocket_job/category/base.rb +81 -0
  14. data/lib/rocket_job/category/input.rb +170 -0
  15. data/lib/rocket_job/category/output.rb +34 -0
  16. data/lib/rocket_job/cli.rb +25 -17
  17. data/lib/rocket_job/dirmon_entry.rb +23 -13
  18. data/lib/rocket_job/event.rb +1 -1
  19. data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
  20. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  21. data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
  22. data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
  23. data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
  24. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  25. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  26. data/lib/rocket_job/jobs/dirmon_job.rb +25 -36
  27. data/lib/rocket_job/jobs/housekeeping_job.rb +11 -12
  28. data/lib/rocket_job/jobs/on_demand_batch_job.rb +24 -11
  29. data/lib/rocket_job/jobs/on_demand_job.rb +3 -4
  30. data/lib/rocket_job/jobs/performance_job.rb +3 -1
  31. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
  32. data/lib/rocket_job/jobs/upload_file_job.rb +48 -8
  33. data/lib/rocket_job/lookup_collection.rb +69 -0
  34. data/lib/rocket_job/plugins/cron.rb +60 -20
  35. data/lib/rocket_job/plugins/job/model.rb +25 -50
  36. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  37. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  38. data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +1 -1
  39. data/lib/rocket_job/plugins/job/worker.rb +2 -7
  40. data/lib/rocket_job/plugins/restart.rb +3 -103
  41. data/lib/rocket_job/plugins/state_machine.rb +4 -3
  42. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +37 -0
  43. data/lib/rocket_job/ractor_worker.rb +42 -0
  44. data/lib/rocket_job/server/model.rb +1 -1
  45. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  46. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  47. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  48. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  49. data/lib/rocket_job/sliced/input.rb +42 -54
  50. data/lib/rocket_job/sliced/slice.rb +12 -16
  51. data/lib/rocket_job/sliced/slices.rb +26 -11
  52. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  53. data/lib/rocket_job/sliced/writer/output.rb +33 -45
  54. data/lib/rocket_job/sliced.rb +1 -74
  55. data/lib/rocket_job/subscribers/server.rb +1 -1
  56. data/lib/rocket_job/thread_worker.rb +46 -0
  57. data/lib/rocket_job/throttle_definitions.rb +7 -1
  58. data/lib/rocket_job/version.rb +1 -1
  59. data/lib/rocket_job/worker.rb +21 -55
  60. data/lib/rocket_job/worker_pool.rb +5 -7
  61. data/lib/rocketjob.rb +53 -43
  62. metadata +36 -28
  63. data/lib/rocket_job/batch/tabular/input.rb +0 -131
  64. data/lib/rocket_job/batch/tabular/output.rb +0 -65
  65. data/lib/rocket_job/batch/tabular.rb +0 -56
  66. data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
  67. data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
--- a/data/lib/rocket_job/batch/performance.rb
+++ b/data/lib/rocket_job/batch/performance.rb
@@ -22,12 +22,15 @@ module RocketJob
       count_running_workers

       puts "Loading job with #{count} records/lines"
-      args = {log_level: :warn, slice_size: slice_size}
-      if defined?(::RocketJob)
-        args[:compress] = compress
-        args[:encrypt] = encrypt
+      job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
+      job.input_category.slice_size = slice_size
+      if encrypt
+        job.input_category.serializer = :encrypt
+        job.output_category.serializer = :encrypt
+      elsif !compress
+        job.input_category.serializer = :none
+        job.output_category.serializer = :none
       end
-      job = RocketJob::Jobs::PerformanceJob.new(args)
       job.upload do |writer|
         count.times { |i| writer << i }
       end
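
The hunk above captures the central API change in 6.0.0: the job-level `compress`, `encrypt`, and `slice_size` settings are replaced by per-category configuration, with a single `serializer` attribute on each input/output category. As a rough sketch of what that looks like in application code, using only the accessors exercised above (the job class and numbers are illustrative, not from this diff):

```ruby
# Hypothetical batch job; configured via categories as in the 6.0 perf harness above.
job = MyBatchJob.new(log_level: :warn)
job.input_category.slice_size  = 250       # replaces the 5.x job-level slice_size
job.input_category.serializer  = :encrypt  # replaces the 5.x encrypt: true flag
job.output_category.serializer = :encrypt  # :none / :compress / :encrypt
job.upload do |writer|
  1_000.times { |i| writer << i }          # one record per line
end
```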
@@ -37,7 +40,15 @@ module RocketJob
       sleep 3 until job.reload.completed?

       duration = job.completed_at - job.started_at
-      {count: count, duration: duration, records_per_second: (count.to_f / duration).round(3), workers: workers, servers: servers, compress: compress, encrypt: encrypt}
+      {
+        count: count,
+        duration: duration,
+        records_per_second: (count.to_f / duration).round(3),
+        workers: workers,
+        servers: servers,
+        compress: compress,
+        encrypt: encrypt
+      }
     end

     # Export the Results hash to a CSV file
@@ -60,7 +71,8 @@ module RocketJob
         o.on("-m", "--mongo MONGO_CONFIG_FILE_NAME", "Location of mongoid.yml config file") do |arg|
           self.mongo_config = arg
         end
-        o.on("-e", "--environment ENVIRONMENT", "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
+        o.on("-e", "--environment ENVIRONMENT",
+             "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
           self.environment = arg
         end
         o.on("-z", "--compress", "Turn on compression") do
--- a/data/lib/rocket_job/batch/statistics.rb
+++ b/data/lib/rocket_job/batch/statistics.rb
@@ -2,7 +2,11 @@ require "active_support/concern"

 module RocketJob
   module Batch
-    # Allow statistics to be gathered while a batch job is running
+    # Allow statistics to be gathered while a batch job is running.
+    #
+    # Notes:
+    # - Statistics for successfully processed records within a slice are saved.
+    # - Statistics gathered during a perform that then results in an exception are discarded.
     module Statistics
       extend ActiveSupport::Concern

@@ -45,34 +49,52 @@ module RocketJob
         last = paths.pop
         return unless last

-        target = paths.inject(in_memory) { |target, key| target.key?(key) ? target[key] : target[key] = Hash.new(0) }
-        target[last] += increment
+        last_target = paths.inject(in_memory) do |target, sub_key|
+          target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
+        end
+        last_target[last] += increment
       end
     end

     included do
       field :statistics, type: Hash, default: -> { Hash.new(0) }

-      around_slice :statistics_capture
+      around_slice :rocket_job_statistics_capture
+      after_perform :rocket_job_statistics_commit
     end

     # Increment a statistic
     def statistics_inc(key, increment = 1)
       return if key.nil? || key == ""

-      # Being called within tests outside of a perform
-      @slice_statistics ||= Stats.new(new_record? ? statistics : nil)
-      key.is_a?(Hash) ? @slice_statistics.inc(key) : @slice_statistics.inc_key(key, increment)
+      (@rocket_job_perform_statistics ||= []) << (key.is_a?(Hash) ? key : [key, increment])
     end

     private

-    # Capture the number of successful and failed tradelines
-    # as well as those with notices and alerts.
-    def statistics_capture
-      @slice_statistics = Stats.new(new_record? ? statistics : nil)
+    def rocket_job_statistics_capture
+      @rocket_job_perform_statistics = nil
+      @rocket_job_slice_statistics = nil
       yield
-      collection.update_one({_id: id}, {"$inc" => @slice_statistics.stats}) unless @slice_statistics.empty?
+    ensure
+      if @rocket_job_slice_statistics && !@rocket_job_slice_statistics.empty?
+        collection.update_one({_id: id}, {"$inc" => @rocket_job_slice_statistics.stats})
+      end
+    end
+
+    def rocket_job_slice_statistics
+      @rocket_job_slice_statistics ||= Stats.new(new_record? ? statistics : nil)
+    end
+
+    # Apply stats gathered during the perform to the slice level stats
+    def rocket_job_statistics_commit
+      return unless @rocket_job_perform_statistics
+
+      @rocket_job_perform_statistics.each do |key|
+        key.is_a?(Hash) ? rocket_job_slice_statistics.inc(key) : rocket_job_slice_statistics.inc_key(*key)
+      end
+
+      @rocket_job_perform_statistics = nil
     end

     # Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
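
For orientation, this rework changes when increments are applied: `statistics_inc` now only buffers `[key, increment]` pairs in `@rocket_job_perform_statistics` during each perform, and the new `after_perform` hook `rocket_job_statistics_commit` folds them into the slice-level `Stats`, which the `around_slice` hook flushes to MongoDB via `$inc`. A perform that raises never reaches the commit, so its increments are discarded, matching the notes added to the module docs. A minimal sketch of the caller-facing side (the job class and key names are illustrative):

```ruby
class ImportJob < RocketJob::Job
  include RocketJob::Batch
  include RocketJob::Batch::Statistics

  def perform(record)
    statistics_inc("rows.processed")                          # buffered during perform
    statistics_inc("rows.blank") if record.to_s.strip.empty?  # committed only if perform succeeds
    record
  end
end
```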
--- a/data/lib/rocket_job/batch/throttle_running_workers.rb
+++ b/data/lib/rocket_job/batch/throttle_running_workers.rb
@@ -37,15 +37,11 @@ module RocketJob
        validates :throttle_running_workers, numericality: {greater_than_or_equal_to: 0}, allow_nil: true

        define_batch_throttle :throttle_running_workers_exceeded?, filter: :throttle_filter_id
-
-       # Deprecated. For backward compatibility.
-       alias_method :throttle_running_slices, :throttle_running_workers
-       alias_method :throttle_running_slices=, :throttle_running_workers=
      end

      private

-     # Returns [Boolean] whether the throttle for this job has been exceeded
+     # Returns [true|false] whether the throttle for this job has been exceeded
      def throttle_running_workers_exceeded?(slice)
        return false unless throttle_running_workers&.positive?

@@ -57,7 +53,7 @@ module RocketJob
      # Allows another job with a higher priority to start even though this one is running already
      # @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
      def throttle_running_jobs_base_query
-       query = super
+       query                = super
        query[:priority.lte] = priority if throttle_running_workers&.positive?
        query
      end
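
Note that the `throttle_running_slices` aliases are removed outright rather than deprecated, so 5.x jobs that set them must be renamed when upgrading. A minimal sketch of the 6.0 spelling (the job class is illustrative):

```ruby
class ExtractJob < RocketJob::Job
  include RocketJob::Batch

  # 5.x: self.throttle_running_slices = 10
  self.throttle_running_workers = 10  # at most 10 workers processing this job's slices at once

  def perform(record)
    record
  end
end
```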
--- a/data/lib/rocket_job/batch/worker.rb
+++ b/data/lib/rocket_job/batch/worker.rb
@@ -23,9 +23,6 @@ module RocketJob
     #
     # Slices are destroyed after their records are successfully processed
     #
-    # Results are stored in the output collection if `collect_output?`
-    # `nil` results from workers are kept if `collect_nil_output`
-    #
     # If an exception was thrown the entire slice of records is marked as failed.
     #
     # Thread-safe, can be called by multiple threads at the same time
@@ -40,7 +37,8 @@ module RocketJob

       SemanticLogger.named_tagged(job: id.to_s) do
         until worker.shutdown?
-          if slice = input.next_slice(worker.name)
+          slice = input.next_slice(worker.name)
+          if slice
             # Grab a slice before checking the throttle to reduce concurrency race condition.
             return true if slice.fail_on_exception!(re_raise_exceptions) { rocket_job_batch_throttled?(slice, worker) }
             next if slice.failed?
@@ -69,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before

@@ -97,8 +97,8 @@ module RocketJob
       servers = []
       case sub_state
       when :before, :after
-        unless server_name && !worker_on_server?(server_name)
-          servers << ActiveWorker.new(worker_name, started_at, self) if running?
+        if running? && (server_name.nil? || worker_on_server?(server_name))
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running
@@ -143,19 +143,23 @@ module RocketJob

     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
-      count = 0
-      RocketJob::Sliced::Writer::Output.collect(self, slice) do |writer|
-        records = slice.records
-
-        # Skip records already processed, if any.
-        # slice.processing_record_number ||= 0
-        # TODO: Must append to existing output slices before this can be enabled.
-        # if !collect_output && (slice.processing_record_number > 1)
-        #   records = records[slice.processing_record_number - 1..-1]
-        # end
-        # Until the changes above have been implemented, reprocess all records in the slice.
-        slice.processing_record_number = 0
+      slice.processing_record_number ||= 0
+      append = false
+
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end

+      count = 0
+      RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
         records.each do |record|
           slice.processing_record_number += 1
           SemanticLogger.named_tagged(record: slice.current_record_number) do
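
The resume path above relies on `processing_record_number` being 1-based: a slice interrupted while processing record N restarts at offset N-1, so record N itself is retried, while `append: true` evidently tells the output writer that output already exists for this slice. The offset arithmetic in plain Ruby, independent of the gem:

```ruby
records = %w[a b c d e]
processing_record_number = 3  # the worker stopped while processing record 3
resumed = records[processing_record_number - 1..-1]
resumed # => ["c", "d", "e"]; records 1 and 2 are skipped, record 3 is retried
```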
@@ -174,8 +178,8 @@ module RocketJob
       return block_given? ? yield(record) : perform(record) if _perform_callbacks.empty?

       # @rocket_job_input and @rocket_job_output can be modified by before/around callbacks
-      @rocket_job_input = record
-      @rocket_job_output = nil
+      @rocket_job_input  = record
+      @rocket_job_output = nil

       run_callbacks(:perform) do
         @rocket_job_output =
@@ -186,9 +190,9 @@ module RocketJob
           end
       end

-      @rocket_job_input = nil
-      result = @rocket_job_output
-      @rocket_job_output = nil
+      @rocket_job_input  = nil
+      result             = @rocket_job_output
+      @rocket_job_output = nil
       result
     end

@@ -244,7 +248,7 @@ module RocketJob
       unless new_record?
         # Fail job iff no other worker has already finished it
         # Must set write concern to at least 1 since we need the nModified back
-        result = self.class.with(write: {w: 1}) do |query|
+        result = self.class.with(write: {w: 1}) do |query|
           query.
             where(id: id, state: :running, sub_state: :processing).
             update({"$set" => {state: :failed, worker_name: worker_name}})
@@ -305,11 +309,12 @@ module RocketJob
     # Run Batch before and after callbacks
     def rocket_job_batch_callbacks(worker)
       # If this is the first worker to pickup this job
-      if sub_state == :before
+      case sub_state
+      when :before
         rocket_job_batch_run_before_callbacks
         # Check for 0 record jobs
         rocket_job_batch_complete?(worker.name) if running?
-      elsif sub_state == :after
+      when :after
         rocket_job_batch_run_after_callbacks
       end
     end
--- a/data/lib/rocket_job/batch.rb
+++ b/data/lib/rocket_job/batch.rb
@@ -7,6 +7,8 @@ require "rocket_job/batch/state_machine"
 require "rocket_job/batch/throttle"
 require "rocket_job/batch/throttle_running_workers"
 require "rocket_job/batch/worker"
+# Ensure after_perform is run first and #upload override is after IO#upload is defined.
+require "rocket_job/batch/categories"

 module RocketJob
   module Batch
@@ -17,6 +19,7 @@ module RocketJob
     include Callbacks
     include Logger
     include Worker
+    include Categories
     include Throttle
     include ThrottleRunningWorkers
     include IO
@@ -27,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
--- /dev/null
+++ b/data/lib/rocket_job/category/base.rb
@@ -0,0 +1,81 @@
+require "active_support/concern"
+
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    module Base
+      extend ActiveSupport::Concern
+
+      included do
+        field :name, type: ::Mongoid::StringifiedSymbol, default: :main
+
+        # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
+        field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
+
+        # The header columns when the file does not include a header row.
+        # Note:
+        #   - All column names must be strings so that they can be serialized into MongoDB.
+        field :columns, type: Array
+
+        # On an input collection `format` specifies the format of the input data so that it can be
+        # transformed into a Hash when passed into the `#perform` method.
+        #
+        # On an output collection `format` specifies the format to transform the output hash into.
+        #
+        # `:auto` uses the `file_name` on this category to determine the format.
+        # `nil` means no transformation is performed on the data returned by the `#perform` method.
+        # Any other format supported by IOStreams, for example: :csv, :hash, :array, :json, :psv, :fixed
+        #
+        # Default: `nil`
+        field :format, type: ::Mongoid::StringifiedSymbol
+        validates_inclusion_of :format, in: [nil, :auto] + IOStreams::Tabular.registered_formats
+
+        # Any specialized format specific options. For example, the :fixed format requires a :layout.
+        field :format_options, type: Hash
+
+        # When `:format` is not supplied the file name can be used to infer the required format.
+        # Optional.
+        # Default: nil
+        field :file_name, type: IOStreams::Path
+      end
+
+      # Return which slice serializer class to use that matches the current options.
+      def serializer_class
+        case serializer
+        when :none
+          Sliced::Slice
+        when :compress
+          Sliced::CompressedSlice
+        when :encrypt
+          Sliced::EncryptedSlice
+        when :bzip2, :bz2
+          Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
+        else
+          raise(ArgumentError, "serializer: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
+        end
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns: columns,
+          format: format == :auto ? nil : format,
+          format_options: format_options&.deep_symbolize_keys,
+          file_name: file_name
+        )
+      end
+
+      # Returns [true|false] whether this category has the attributes defined for tabular to work.
+      def tabular?
+        format.present?
+      end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
+    end
+  end
+end
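
To make the serializer mapping concrete, this is how `serializer_class` resolves, following the case statement above (and assuming `RocketJob::Category::Input` and `Output` both include `Category::Base`, as the file list suggests):

```ruby
RocketJob::Category::Input.new.serializer_class
# => RocketJob::Sliced::CompressedSlice   (default serializer is :compress)

RocketJob::Category::Input.new(serializer: :none).serializer_class
# => RocketJob::Sliced::Slice

RocketJob::Category::Output.new(serializer: :encrypted_bz2).serializer_class
# => RocketJob::Sliced::EncryptedBZip2OutputSlice
```

Note that `Category::Input` (below) validates `serializer` against `%i[none compress encrypt]`, so the bzip2 serializers are effectively output-only.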
--- /dev/null
+++ b/data/lib/rocket_job/category/input.rb
@@ -0,0 +1,170 @@
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    class Input
+      include SemanticLogger::Loggable
+      include Plugins::Document
+      include Category::Base
+
+      embedded_in :job, class_name: "RocketJob::Job", inverse_of: :input_categories
+
+      # Slice size for this input collection
+      field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
+
+      #
+      # The fields below only apply if the field `format` has been set:
+      #
+
+      # List of columns to allow.
+      # Default: nil ( Allow all columns )
+      # Note:
+      #   When supplied, any columns that are rejected will be returned in the cleansed columns
+      #   as nil so that they can be ignored during processing.
+      field :allowed_columns, type: Array
+
+      # List of columns that must be present, otherwise an Exception is raised.
+      field :required_columns, type: Array
+
+      # Whether to skip unknown columns in the uploaded file.
+      # Ignores any column that was not found in the `allowed_columns` list.
+      #
+      # false:
+      #   Raises IOStreams::Tabular::InvalidHeader when a column is supplied that is not in `allowed_columns`.
+      # true:
+      #   Ignores additional columns in a file that are not listed in `allowed_columns`.
+      #   Job processing will skip the additional columns entirely as if they were not supplied at all.
+      #   A warning is logged with the names of the columns that were ignored.
+      #   The `columns` field will list all skipped columns with a nil value so that downstream workers
+      #   know to ignore those columns.
+      #
+      # Notes:
+      # - Only applicable when `allowed_columns` has been set.
+      # - Recommended to leave as `false`, otherwise a misspelled column can result in missed columns.
+      field :skip_unknown, type: ::Mongoid::Boolean, default: false
+      validates_inclusion_of :skip_unknown, in: [true, false]
+
+      # When `#upload` is called with a file_name, it uploads the file using any of the following approaches:
+      # :line
+      #   Uploads the file a line (String) at a time for processing by workers.
+      #   This is the default behavior and is the most performant since it leaves the parsing of each line
+      #   up to the workers themselves.
+      # :array
+      #   Parses each line from the file as an Array and uploads each array for processing by workers.
+      #   Every line in the input file is parsed and converted into an array before uploading.
+      #   This approach ensures that the entire file is valid before starting to process it.
+      #   Ideal for when files may contain invalid lines.
+      #   Not recommended for large files since the CSV or other parsing is performed sequentially during the
+      #   upload process.
+      # :hash
+      #   Parses each line from the file into a Hash and uploads each hash for processing by workers.
+      #   Similar to :array above in that the entire file is parsed before processing is started.
+      #   Slightly less efficient than :array since it stores every record as a hash with both the key and value.
+      #
+      # Recommend using :array when the entire file must be parsed/validated before processing is started, and
+      # upload time is not important.
+      # See IOStreams#each for more details.
+      field :mode, type: ::Mongoid::StringifiedSymbol, default: :line
+      validates_inclusion_of :mode, in: %i[line array hash]
+
+      # When reading tabular input data (e.g. CSV, PSV) the header is automatically cleansed.
+      # This removes issues when the input header varies in case and other small ways. See IOStreams::Tabular.
+      # Currently Supported:
+      # :default
+      #   Each column is cleansed as follows:
+      #   - Leading and trailing whitespace is stripped.
+      #   - All characters converted to lower case.
+      #   - Spaces and '-' are converted to '_'.
+      #   - All characters except for letters, digits, and '_' are stripped.
+      # :none
+      #   Do not cleanse the column names supplied in the header row.
+      #
+      # Note: Submit a ticket if you have other cleansers that you want added.
+      field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
+      validates :header_cleanser, inclusion: %i[default none]
+
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
+
+      # Cleanses the header column names when `header_cleanser` is :default
+      def cleanse_header!
+        return unless header_cleanser == :default
+
+        ignored_columns = tabular.header.cleanse!
+        logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
+
+        self.columns = tabular.header.columns
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns: columns,
+          format: format == :auto ? nil : format,
+          format_options: format_options&.deep_symbolize_keys,
+          file_name: file_name,
+          allowed_columns: allowed_columns,
+          required_columns: required_columns,
+          skip_unknown: skip_unknown
+        )
+      end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class: serializer_class,
+          slice_size: slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of the file to upload.
+      # Auto-detects the file format from the file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with the new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call the chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call the chained on_first if present
+            on_first&.call(row)
+          end
+        end
+      end
+    end
+  end
+end
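
Finally, a sketch of how the `upload_path` auto-detection above behaves, with an illustrative local file (the path and values are hypothetical):

```ruby
category = RocketJob::Category::Input.new(format: :auto)
path     = category.upload_path("data/customers.csv")

category.format    # => :csv, detected from the file name (or via the :csv fallback)
category.file_name # now holds the resolved IOStreams path
# Since the category is now tabular, upload_path also chains an :encode stream that
# strips non-printable characters (replacing them with spaces for :fixed files).
```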