rocketjob 5.4.1 → 6.0.0

Files changed (67)
  1. checksums.yaml +4 -4
  2. data/README.md +175 -5
  3. data/bin/rocketjob_batch_perf +1 -1
  4. data/bin/rocketjob_perf +1 -1
  5. data/lib/rocket_job/batch/categories.rb +345 -0
  6. data/lib/rocket_job/batch/io.rb +174 -106
  7. data/lib/rocket_job/batch/model.rb +20 -68
  8. data/lib/rocket_job/batch/performance.rb +19 -7
  9. data/lib/rocket_job/batch/statistics.rb +34 -12
  10. data/lib/rocket_job/batch/throttle_running_workers.rb +2 -6
  11. data/lib/rocket_job/batch/worker.rb +31 -26
  12. data/lib/rocket_job/batch.rb +3 -1
  13. data/lib/rocket_job/category/base.rb +81 -0
  14. data/lib/rocket_job/category/input.rb +170 -0
  15. data/lib/rocket_job/category/output.rb +34 -0
  16. data/lib/rocket_job/cli.rb +25 -17
  17. data/lib/rocket_job/dirmon_entry.rb +23 -13
  18. data/lib/rocket_job/event.rb +1 -1
  19. data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
  20. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  21. data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
  22. data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
  23. data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
  24. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  25. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  26. data/lib/rocket_job/jobs/dirmon_job.rb +25 -36
  27. data/lib/rocket_job/jobs/housekeeping_job.rb +11 -12
  28. data/lib/rocket_job/jobs/on_demand_batch_job.rb +24 -11
  29. data/lib/rocket_job/jobs/on_demand_job.rb +3 -4
  30. data/lib/rocket_job/jobs/performance_job.rb +3 -1
  31. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
  32. data/lib/rocket_job/jobs/upload_file_job.rb +48 -8
  33. data/lib/rocket_job/lookup_collection.rb +69 -0
  34. data/lib/rocket_job/plugins/cron.rb +60 -20
  35. data/lib/rocket_job/plugins/job/model.rb +25 -50
  36. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  37. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  38. data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +1 -1
  39. data/lib/rocket_job/plugins/job/worker.rb +2 -7
  40. data/lib/rocket_job/plugins/restart.rb +3 -103
  41. data/lib/rocket_job/plugins/state_machine.rb +4 -3
  42. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +37 -0
  43. data/lib/rocket_job/ractor_worker.rb +42 -0
  44. data/lib/rocket_job/server/model.rb +1 -1
  45. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  46. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  47. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  48. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  49. data/lib/rocket_job/sliced/input.rb +42 -54
  50. data/lib/rocket_job/sliced/slice.rb +12 -16
  51. data/lib/rocket_job/sliced/slices.rb +26 -11
  52. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  53. data/lib/rocket_job/sliced/writer/output.rb +33 -45
  54. data/lib/rocket_job/sliced.rb +1 -74
  55. data/lib/rocket_job/subscribers/server.rb +1 -1
  56. data/lib/rocket_job/thread_worker.rb +46 -0
  57. data/lib/rocket_job/throttle_definitions.rb +7 -1
  58. data/lib/rocket_job/version.rb +1 -1
  59. data/lib/rocket_job/worker.rb +21 -55
  60. data/lib/rocket_job/worker_pool.rb +5 -7
  61. data/lib/rocketjob.rb +53 -43
  62. metadata +36 -28
  63. data/lib/rocket_job/batch/tabular/input.rb +0 -131
  64. data/lib/rocket_job/batch/tabular/output.rb +0 -65
  65. data/lib/rocket_job/batch/tabular.rb +0 -56
  66. data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
  67. data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
data/lib/rocket_job/batch/performance.rb

@@ -22,12 +22,15 @@ module RocketJob
       count_running_workers

       puts "Loading job with #{count} records/lines"
-      args = {log_level: :warn, slice_size: slice_size}
-      if defined?(::RocketJob)
-        args[:compress] = compress
-        args[:encrypt]  = encrypt
+      job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
+      job.input_category.slice_size = slice_size
+      if encrypt
+        job.input_category.serializer  = :encrypt
+        job.output_category.serializer = :encrypt
+      elsif !compress
+        job.input_category.serializer  = :none
+        job.output_category.serializer = :none
       end
-      job = RocketJob::Jobs::PerformanceJob.new(args)
       job.upload do |writer|
        count.times { |i| writer << i }
      end
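This hunk shows the headline API change in 6.0.0: the `compress` and `encrypt` booleans on a job are replaced by a `serializer` setting on the new input and output categories. A minimal sketch of the same migration in application code (only the category calls shown in the hunk above are confirmed; the surrounding usage is illustrative):

```ruby
# 5.x: serialization was controlled by job-level attributes.
# job = RocketJob::Jobs::PerformanceJob.new(compress: true, encrypt: false)

# 6.0: slice size and serialization are per-category settings.
job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
job.input_category.slice_size  = 100
job.input_category.serializer  = :encrypt   # or :compress (the default), or :none
job.output_category.serializer = :encrypt
job.save!
```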
@@ -37,7 +40,15 @@ module RocketJob
       sleep 3 until job.reload.completed?

       duration = job.completed_at - job.started_at
-      {count: count, duration: duration, records_per_second: (count.to_f / duration).round(3), workers: workers, servers: servers, compress: compress, encrypt: encrypt}
+      {
+        count:              count,
+        duration:           duration,
+        records_per_second: (count.to_f / duration).round(3),
+        workers:            workers,
+        servers:            servers,
+        compress:           compress,
+        encrypt:            encrypt
+      }
     end

     # Export the Results hash to a CSV file
@@ -60,7 +71,8 @@ module RocketJob
       o.on("-m", "--mongo MONGO_CONFIG_FILE_NAME", "Location of mongoid.yml config file") do |arg|
         self.mongo_config = arg
       end
-      o.on("-e", "--environment ENVIRONMENT", "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
+      o.on("-e", "--environment ENVIRONMENT",
+           "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
         self.environment = arg
       end
       o.on("-z", "--compress", "Turn on compression") do
data/lib/rocket_job/batch/statistics.rb

@@ -2,7 +2,11 @@ require "active_support/concern"

 module RocketJob
   module Batch
-    # Allow statistics to be gathered while a batch job is running
+    # Allow statistics to be gathered while a batch job is running.
+    #
+    # Notes:
+    # - Statistics for successfully processed records within a slice are saved.
+    # - Statistics gathered during a perform that then results in an exception are discarded.
     module Statistics
       extend ActiveSupport::Concern

@@ -45,34 +49,52 @@ module RocketJob
         last = paths.pop
         return unless last

-        target = paths.inject(in_memory) { |target, key| target.key?(key) ? target[key] : target[key] = Hash.new(0) }
-        target[last] += increment
+        last_target = paths.inject(in_memory) do |target, sub_key|
+          target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
+        end
+        last_target[last] += increment
       end
     end

     included do
       field :statistics, type: Hash, default: -> { Hash.new(0) }

-      around_slice :statistics_capture
+      around_slice :rocket_job_statistics_capture
+      after_perform :rocket_job_statistics_commit
     end

     # Increment a statistic
     def statistics_inc(key, increment = 1)
       return if key.nil? || key == ""

-      # Being called within tests outside of a perform
-      @slice_statistics ||= Stats.new(new_record? ? statistics : nil)
-      key.is_a?(Hash) ? @slice_statistics.inc(key) : @slice_statistics.inc_key(key, increment)
+      (@rocket_job_perform_statistics ||= []) << (key.is_a?(Hash) ? key : [key, increment])
     end

     private

-    # Capture the number of successful and failed tradelines
-    # as well as those with notices and alerts.
-    def statistics_capture
-      @slice_statistics = Stats.new(new_record? ? statistics : nil)
+    def rocket_job_statistics_capture
+      @rocket_job_perform_statistics = nil
+      @rocket_job_slice_statistics   = nil
       yield
-      collection.update_one({_id: id}, {"$inc" => @slice_statistics.stats}) unless @slice_statistics.empty?
+    ensure
+      if @rocket_job_slice_statistics && !@rocket_job_slice_statistics.empty?
+        collection.update_one({_id: id}, {"$inc" => @rocket_job_slice_statistics.stats})
+      end
+    end
+
+    def rocket_job_slice_statistics
+      @rocket_job_slice_statistics ||= Stats.new(new_record? ? statistics : nil)
+    end
+
+    # Apply stats gathered during the perform to the slice level stats
+    def rocket_job_statistics_commit
+      return unless @rocket_job_perform_statistics
+
+      @rocket_job_perform_statistics.each do |key|
+        key.is_a?(Hash) ? rocket_job_slice_statistics.inc(key) : rocket_job_slice_statistics.inc_key(*key)
+      end
+
+      @rocket_job_perform_statistics = nil
     end

     # Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
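The statistics refactor changes when increments are persisted: `statistics_inc` now buffers each increment per `perform`, and the new `after_perform` commit folds the buffer into the slice-level stats, so increments from a `perform` that raises are discarded, matching the notes added to the module comment. A hedged sketch of a job using this API (the job class and statistic keys are illustrative):

```ruby
class ParseOrdersJob < RocketJob::Job  # hypothetical example job
  include RocketJob::Batch
  include RocketJob::Batch::Statistics

  def perform(record)
    statistics_inc("orders.parsed")                        # increment a single key by 1
    statistics_inc("orders.lines", 5)                      # explicit increment
    statistics_inc("orders.usa" => 1, "orders.total" => 1) # hash form increments several keys at once
    # If an exception is raised past this point, the increments above are discarded.
  end
end
```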
data/lib/rocket_job/batch/throttle_running_workers.rb

@@ -37,15 +37,11 @@ module RocketJob
       validates :throttle_running_workers, numericality: {greater_than_or_equal_to: 0}, allow_nil: true

       define_batch_throttle :throttle_running_workers_exceeded?, filter: :throttle_filter_id
-
-      # Deprecated. For backward compatibility.
-      alias_method :throttle_running_slices, :throttle_running_workers
-      alias_method :throttle_running_slices=, :throttle_running_workers=
     end

     private

-    # Returns [Boolean] whether the throttle for this job has been exceeded
+    # Returns [true|false] whether the throttle for this job has been exceeded
     def throttle_running_workers_exceeded?(slice)
       return false unless throttle_running_workers&.positive?

@@ -57,7 +53,7 @@ module RocketJob
     # Allows another job with a higher priority to start even though this one is running already
     # @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
     def throttle_running_jobs_base_query
-      query = super
+      query                = super
       query[:priority.lte] = priority if throttle_running_workers&.positive?
       query
     end
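Note that the deprecated `throttle_running_slices` aliases are removed outright, so 5.x code that still uses them must be renamed. The change itself is mechanical (sketch):

```ruby
# 5.x (aliases removed in 6.0.0):
# job.throttle_running_slices = 10

# 6.0: limit the number of workers that may process this job's slices concurrently.
job.throttle_running_workers = 10
```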
data/lib/rocket_job/batch/worker.rb

@@ -23,9 +23,6 @@ module RocketJob
     #
     # Slices are destroyed after their records are successfully processed
     #
-    # Results are stored in the output collection if `collect_output?`
-    # `nil` results from workers are kept if `collect_nil_output`
-    #
     # If an exception was thrown the entire slice of records is marked as failed.
     #
     # Thread-safe, can be called by multiple threads at the same time
@@ -40,7 +37,8 @@ module RocketJob

       SemanticLogger.named_tagged(job: id.to_s) do
         until worker.shutdown?
-          if slice = input.next_slice(worker.name)
+          slice = input.next_slice(worker.name)
+          if slice
             # Grab a slice before checking the throttle to reduce concurrency race condition.
             return true if slice.fail_on_exception!(re_raise_exceptions) { rocket_job_batch_throttled?(slice, worker) }
             next if slice.failed?
@@ -69,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before

@@ -97,8 +97,8 @@ module RocketJob
       servers = []
       case sub_state
       when :before, :after
-        unless server_name && !worker_on_server?(server_name)
-          servers << ActiveWorker.new(worker_name, started_at, self) if running?
+        if running? && (server_name.nil? || worker_on_server?(server_name))
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running
@@ -143,19 +143,23 @@ module RocketJob

     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
-      count = 0
-      RocketJob::Sliced::Writer::Output.collect(self, slice) do |writer|
-        records = slice.records
-
-        # Skip records already processed, if any.
-        # slice.processing_record_number ||= 0
-        # TODO: Must append to existing output slices before this can be enabled.
-        # if !collect_output && (slice.processing_record_number > 1)
-        #   records = records[slice.processing_record_number - 1..-1]
-        # end
-        # Until the changes above have been implemented, reprocess all records in the slice.
-        slice.processing_record_number = 0
+      slice.processing_record_number ||= 0
+      append = false
+
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end

+      count = 0
+      RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
         records.each do |record|
           slice.processing_record_number += 1
           SemanticLogger.named_tagged(record: slice.current_record_number) do
@@ -174,8 +178,8 @@ module RocketJob
       return block_given? ? yield(record) : perform(record) if _perform_callbacks.empty?

       # @rocket_job_input and @rocket_job_output can be modified by before/around callbacks
-      @rocket_job_input = record
-      @rocket_job_output = nil
+      @rocket_job_input  = record
+      @rocket_job_output = nil

       run_callbacks(:perform) do
         @rocket_job_output =
@@ -186,9 +190,9 @@ module RocketJob
         end
       end

-      @rocket_job_input = nil
-      result = @rocket_job_output
-      @rocket_job_output = nil
+      @rocket_job_input  = nil
+      result             = @rocket_job_output
+      @rocket_job_output = nil
       result
     end

@@ -244,7 +248,7 @@ module RocketJob
       unless new_record?
         # Fail job iff no other worker has already finished it
         # Must set write concern to at least 1 since we need the nModified back
-        result = self.class.with(write: {w: 1}) do |query|
+        result = self.class.with(write: {w: 1}) do |query|
           query.
             where(id: id, state: :running, sub_state: :processing).
             update({"$set" => {state: :failed, worker_name: worker_name}})
@@ -305,11 +309,12 @@ module RocketJob
     # Run Batch before and after callbacks
     def rocket_job_batch_callbacks(worker)
       # If this is the first worker to pickup this job
-      if sub_state == :before
+      case sub_state
+      when :before
         rocket_job_batch_run_before_callbacks
         # Check for 0 record jobs
         rocket_job_batch_complete?(worker.name) if running?
-      elsif sub_state == :after
+      when :after
         rocket_job_batch_run_after_callbacks
       end
     end
data/lib/rocket_job/batch.rb

@@ -7,6 +7,8 @@ require "rocket_job/batch/state_machine"
 require "rocket_job/batch/throttle"
 require "rocket_job/batch/throttle_running_workers"
 require "rocket_job/batch/worker"
+# Ensure after_perform is run first and #upload override is after IO#upload is defined.
+require "rocket_job/batch/categories"

 module RocketJob
   module Batch
@@ -17,6 +19,7 @@ module RocketJob
     include Callbacks
     include Logger
     include Worker
+    include Categories
     include Throttle
     include ThrottleRunningWorkers
     include IO
@@ -27,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
data/lib/rocket_job/category/base.rb (new file)

@@ -0,0 +1,81 @@
+require "active_support/concern"
+
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    module Base
+      extend ActiveSupport::Concern
+
+      included do
+        field :name, type: ::Mongoid::StringifiedSymbol, default: :main
+
+        # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
+        field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
+
+        # The header columns when the file does not include a header row.
+        # Note:
+        #   - All column names must be strings so that they can be serialized into MongoDB.
+        field :columns, type: Array
+
+        # On an input collection `format` specifies the format of the input data so that it can be
+        # transformed into a Hash when passed into the `#perform` method.
+        #
+        # On an output collection `format` specifies the format to transform the output hash into.
+        #
+        # `:auto` uses the `file_name` on this category to determine the format.
+        # `nil` means no transformation is performed on the data returned by the `#perform` method.
+        # Any other format supported by IOStreams, for example: :csv, :hash, :array, :json, :psv, :fixed
+        #
+        # Default: `nil`
+        field :format, type: ::Mongoid::StringifiedSymbol
+        validates_inclusion_of :format, in: [nil, :auto] + IOStreams::Tabular.registered_formats
+
+        # Any specialized format specific options. For example, `:fixed` format requires a `:layout`.
+        field :format_options, type: Hash
+
+        # When `:format` is not supplied the file name can be used to infer the required format.
+        # Optional.
+        # Default: nil
+        field :file_name, type: IOStreams::Path
+      end
+
+      # Return which slice serializer class to use that matches the current options.
+      def serializer_class
+        case serializer
+        when :none
+          Sliced::Slice
+        when :compress
+          Sliced::CompressedSlice
+        when :encrypt
+          Sliced::EncryptedSlice
+        when :bzip2, :bz2
+          Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
+        else
+          raise(ArgumentError, "serializer: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
+        end
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns:        columns,
+          format:         format == :auto ? nil : format,
+          format_options: format_options&.deep_symbolize_keys,
+          file_name:      file_name
+        )
+      end
+
+      # Returns [true|false] whether this category has the attributes defined for tabular to work.
+      def tabular?
+        format.present?
+      end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
+    end
+  end
+end
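`Category::Base` collects what were previously job-level flags into per-category settings: each category carries its own `serializer`, tabular `format`, `columns`, and `file_name`, and `serializer_class` maps the `serializer` symbol to a slice class. A short sketch of that mapping (the job instance is illustrative; the return values follow the `case` in `serializer_class` above):

```ruby
job = RocketJob::Jobs::PerformanceJob.new

job.input_category.serializer = :encrypt
job.input_category.serializer_class    # => RocketJob::Sliced::EncryptedSlice

job.output_category.serializer = :bz2  # bzip2 serializers are only valid on output categories
job.output_category.serializer_class   # => RocketJob::Sliced::BZip2OutputSlice

# Collection names are derived per category; ".<name>" is appended for non-:main categories.
job.input_category.build_collection_name(:input, job)
# => "rocket_job.inputs.<job id>"
```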
data/lib/rocket_job/category/input.rb (new file)

@@ -0,0 +1,170 @@
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    class Input
+      include SemanticLogger::Loggable
+      include Plugins::Document
+      include Category::Base
+
+      embedded_in :job, class_name: "RocketJob::Job", inverse_of: :input_categories
+
+      # Slice size for this input collection
+      field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
+
+      #
+      # The fields below only apply if the field `format` has been set:
+      #
+
+      # List of columns to allow.
+      # Default: nil ( Allow all columns )
+      # Note:
+      #   When supplied, any columns that are rejected will be returned in the cleansed columns
+      #   as nil so that they can be ignored during processing.
+      field :allowed_columns, type: Array
+
+      # List of columns that must be present, otherwise an Exception is raised.
+      field :required_columns, type: Array
+
+      # Whether to skip unknown columns in the uploaded file.
+      # Ignores any column that was not found in the `allowed_columns` list.
+      #
+      # false:
+      #   Raises IOStreams::Tabular::InvalidHeader when a column is supplied that is not in `allowed_columns`.
+      # true:
+      #   Ignores additional columns in a file that are not listed in `allowed_columns`.
+      #   Job processing will skip the additional columns entirely as if they were not supplied at all.
+      #   A warning is logged with the names of the columns that were ignored.
+      #   The `columns` field will list all skipped columns with a nil value so that downstream workers
+      #   know to ignore those columns.
+      #
+      # Notes:
+      # - Only applicable when `allowed_columns` has been set.
+      # - Recommended to leave as `false`, otherwise a misspelled column can result in missed columns.
+      field :skip_unknown, type: ::Mongoid::Boolean, default: false
+      validates_inclusion_of :skip_unknown, in: [true, false]
+
+      # When `#upload` is called with a file_name, it uploads the file using any of the following approaches:
+      # :line
+      #   Uploads the file a line (String) at a time for processing by workers.
+      #   This is the default behavior and is the most performant since it leaves the parsing of each line
+      #   up to the workers themselves.
+      # :array
+      #   Parses each line from the file as an Array and uploads each array for processing by workers.
+      #   Every line in the input file is parsed and converted into an array before uploading.
+      #   This approach ensures that the entire file is valid before starting to process it.
+      #   Ideal for when files may contain invalid lines.
+      #   Not recommended for large files since the CSV or other parsing is performed sequentially during the
+      #   upload process.
+      # :hash
+      #   Parses each line from the file into a Hash and uploads each hash for processing by workers.
+      #   Similar to :array above in that the entire file is parsed before processing is started.
+      #   Slightly less efficient than :array since it stores every record as a hash with both the key and value.
+      #
+      # Recommend using :array when the entire file must be parsed/validated before processing is started, and
+      # upload time is not important.
+      # See IOStreams#each for more details.
+      field :mode, type: ::Mongoid::StringifiedSymbol, default: :line
+      validates_inclusion_of :mode, in: %i[line array hash]
+
+      # When reading tabular input data (e.g. CSV, PSV) the header is automatically cleansed.
+      # This removes issues when the input header varies in case and other small ways. See IOStreams::Tabular.
+      # Currently Supported:
+      # :default
+      #   Each column is cleansed as follows:
+      #   - Leading and trailing whitespace is stripped.
+      #   - All characters converted to lower case.
+      #   - Spaces and '-' are converted to '_'.
+      #   - All characters except for letters, digits, and '_' are stripped.
+      # :none
+      #   Do not cleanse the column names supplied in the header row.
+      #
+      # Note: Submit a ticket if you have other cleansers that you want added.
+      field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
+      validates :header_cleanser, inclusion: %i[default none]
+
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
+
+      # Cleanses the header column names when `header_cleanser` is `:default`.
+      def cleanse_header!
+        return unless header_cleanser == :default
+
+        ignored_columns = tabular.header.cleanse!
+        logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
+
+        self.columns = tabular.header.columns
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns:          columns,
+          format:           format == :auto ? nil : format,
+          format_options:   format_options&.deep_symbolize_keys,
+          file_name:        file_name,
+          allowed_columns:  allowed_columns,
+          required_columns: required_columns,
+          skip_unknown:     skip_unknown
+        )
+      end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class:     serializer_class,
+          slice_size:      slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path           = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with the new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(row)
+          end
+        end
+      end
+    end
+  end
+end
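`Category::Input` absorbs the tabular upload behavior that previously lived in `RocketJob::Batch::Tabular::Input` (deleted in this release, see the file list above). A hedged sketch of configuring a CSV upload with the fields defined above (the job class, file name, and column lists are illustrative):

```ruby
job = ImportJob.new  # hypothetical batch job

# Treat the upload as tabular data; :auto infers the actual format from the file name.
job.input_category.format           = :auto
job.input_category.mode             = :line  # default: workers parse each line themselves
job.input_category.allowed_columns  = %w[first_name last_name email]
job.input_category.required_columns = %w[email]
job.input_category.skip_unknown     = true   # ignore extra columns instead of raising

# The header row is cleansed, validated against the column lists above,
# and stored in job.input_category.columns for the workers.
job.upload("import.csv")
job.save!
```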