rocketjob 5.4.0.beta2 → 6.0.0.rc3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +149 -5
- data/bin/rocketjob_batch_perf +1 -1
- data/bin/rocketjob_perf +1 -1
- data/lib/rocket_job/batch.rb +3 -1
- data/lib/rocket_job/batch/categories.rb +341 -0
- data/lib/rocket_job/batch/io.rb +128 -60
- data/lib/rocket_job/batch/model.rb +20 -68
- data/lib/rocket_job/batch/performance.rb +19 -7
- data/lib/rocket_job/batch/statistics.rb +34 -12
- data/lib/rocket_job/batch/tabular.rb +2 -0
- data/lib/rocket_job/batch/tabular/input.rb +8 -6
- data/lib/rocket_job/batch/tabular/output.rb +4 -2
- data/lib/rocket_job/batch/throttle_running_workers.rb +8 -17
- data/lib/rocket_job/batch/worker.rb +27 -24
- data/lib/rocket_job/category/base.rb +78 -0
- data/lib/rocket_job/category/input.rb +110 -0
- data/lib/rocket_job/category/output.rb +25 -0
- data/lib/rocket_job/cli.rb +25 -17
- data/lib/rocket_job/dirmon_entry.rb +22 -12
- data/lib/rocket_job/event.rb +1 -1
- data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
- data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
- data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
- data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
- data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
- data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
- data/lib/rocket_job/jobs/conversion_job.rb +39 -0
- data/lib/rocket_job/jobs/dirmon_job.rb +2 -2
- data/lib/rocket_job/jobs/housekeeping_job.rb +7 -7
- data/lib/rocket_job/jobs/on_demand_batch_job.rb +17 -6
- data/lib/rocket_job/jobs/on_demand_job.rb +1 -2
- data/lib/rocket_job/jobs/performance_job.rb +3 -1
- data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
- data/lib/rocket_job/jobs/upload_file_job.rb +44 -8
- data/lib/rocket_job/lookup_collection.rb +69 -0
- data/lib/rocket_job/plugins/job/model.rb +25 -50
- data/lib/rocket_job/plugins/job/throttle.rb +2 -2
- data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +12 -4
- data/lib/rocket_job/plugins/job/worker.rb +2 -7
- data/lib/rocket_job/plugins/restart.rb +12 -5
- data/lib/rocket_job/plugins/state_machine.rb +2 -1
- data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +38 -0
- data/lib/rocket_job/ractor_worker.rb +42 -0
- data/lib/rocket_job/server/model.rb +1 -1
- data/lib/rocket_job/sliced.rb +15 -70
- data/lib/rocket_job/sliced/bzip2_output_slice.rb +1 -1
- data/lib/rocket_job/sliced/input.rb +1 -1
- data/lib/rocket_job/sliced/slice.rb +5 -13
- data/lib/rocket_job/sliced/slices.rb +14 -2
- data/lib/rocket_job/sliced/writer/output.rb +33 -45
- data/lib/rocket_job/subscribers/server.rb +1 -1
- data/lib/rocket_job/thread_worker.rb +46 -0
- data/lib/rocket_job/throttle_definitions.rb +7 -1
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +21 -55
- data/lib/rocket_job/worker_pool.rb +5 -7
- data/lib/rocketjob.rb +53 -43
- metadata +36 -26
- data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
- data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
@@ -2,7 +2,11 @@ require "active_support/concern"
|
|
2
2
|
|
3
3
|
module RocketJob
|
4
4
|
module Batch
|
5
|
-
# Allow statistics to be gathered while a batch job is running
|
5
|
+
# Allow statistics to be gathered while a batch job is running.
|
6
|
+
#
|
7
|
+
# Notes:
|
8
|
+
# - Statistics for successfully processed records within a slice are saved.
|
9
|
+
# - Statistics gathered during a perform that then results in an exception are discarded.
|
6
10
|
module Statistics
|
7
11
|
extend ActiveSupport::Concern
|
8
12
|
|
@@ -45,34 +49,52 @@ module RocketJob
|
|
45
49
|
last = paths.pop
|
46
50
|
return unless last
|
47
51
|
|
48
|
-
|
49
|
-
|
52
|
+
last_target = paths.inject(in_memory) do |target, sub_key|
|
53
|
+
target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
|
54
|
+
end
|
55
|
+
last_target[last] += increment
|
50
56
|
end
|
51
57
|
end
|
52
58
|
|
53
59
|
included do
|
54
60
|
field :statistics, type: Hash, default: -> { Hash.new(0) }
|
55
61
|
|
56
|
-
around_slice :
|
62
|
+
around_slice :rocket_job_statistics_capture
|
63
|
+
after_perform :rocket_job_statistics_commit
|
57
64
|
end
|
58
65
|
|
59
66
|
# Increment a statistic
|
60
67
|
def statistics_inc(key, increment = 1)
|
61
68
|
return if key.nil? || key == ""
|
62
69
|
|
63
|
-
|
64
|
-
@slice_statistics ||= Stats.new(new_record? ? statistics : nil)
|
65
|
-
key.is_a?(Hash) ? @slice_statistics.inc(key) : @slice_statistics.inc_key(key, increment)
|
70
|
+
(@rocket_job_perform_statistics ||= []) << (key.is_a?(Hash) ? key : [key, increment])
|
66
71
|
end
|
67
72
|
|
68
73
|
private
|
69
74
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
@slice_statistics = Stats.new(new_record? ? statistics : nil)
|
75
|
+
def rocket_job_statistics_capture
|
76
|
+
@rocket_job_perform_statistics = nil
|
77
|
+
@rocket_job_slice_statistics = nil
|
74
78
|
yield
|
75
|
-
|
79
|
+
ensure
|
80
|
+
if @rocket_job_slice_statistics && !@rocket_job_slice_statistics.empty?
|
81
|
+
collection.update_one({_id: id}, {"$inc" => @rocket_job_slice_statistics.stats})
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def rocket_job_slice_statistics
|
86
|
+
@rocket_job_slice_statistics ||= Stats.new(new_record? ? statistics : nil)
|
87
|
+
end
|
88
|
+
|
89
|
+
# Apply stats gathered during the perform to the slice level stats
|
90
|
+
def rocket_job_statistics_commit
|
91
|
+
return unless @rocket_job_perform_statistics
|
92
|
+
|
93
|
+
@rocket_job_perform_statistics.each do |key|
|
94
|
+
key.is_a?(Hash) ? rocket_job_slice_statistics.inc(key) : rocket_job_slice_statistics.inc_key(*key)
|
95
|
+
end
|
96
|
+
|
97
|
+
@rocket_job_perform_statistics = nil
|
76
98
|
end
|
77
99
|
|
78
100
|
# Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
|
@@ -3,15 +3,15 @@ require "active_support/concern"
|
|
3
3
|
module RocketJob
|
4
4
|
module Batch
|
5
5
|
class Tabular
|
6
|
-
#
|
7
|
-
# If multiple input categories are used with different formats, then use IOStreams::Tabular directly
|
8
|
-
# instead of this plugin.
|
6
|
+
# @deprecated
|
9
7
|
module Input
|
10
8
|
extend ActiveSupport::Concern
|
11
9
|
|
12
10
|
included do
|
11
|
+
warn "#{name} is using RocketJob::Batch::Tabular::Input which is deprecated"
|
12
|
+
|
13
13
|
field :tabular_input_header, type: Array, class_attribute: true, user_editable: true
|
14
|
-
field :tabular_input_format, type:
|
14
|
+
field :tabular_input_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true
|
15
15
|
field :tabular_input_options, type: Hash, class_attribute: true
|
16
16
|
|
17
17
|
# tabular_input_mode: [:line | :array | :hash]
|
@@ -22,7 +22,7 @@ module RocketJob
|
|
22
22
|
# :hash
|
23
23
|
# Parses each line from the file into a Hash and uploads each hash for processing by workers.
|
24
24
|
# See IOStreams#each.
|
25
|
-
field :tabular_input_mode, type:
|
25
|
+
field :tabular_input_mode, type: Mongoid::StringifiedSymbol, default: :line, class_attribute: true, user_editable: true, copy_on_restart: true
|
26
26
|
|
27
27
|
validates_inclusion_of :tabular_input_format, in: IOStreams::Tabular.registered_formats
|
28
28
|
validates_inclusion_of :tabular_input_mode, in: %i[line array hash row record]
|
@@ -119,7 +119,9 @@ module RocketJob
|
|
119
119
|
end
|
120
120
|
|
121
121
|
def tabular_input_header_present
|
122
|
-
if tabular_input_header.present? ||
|
122
|
+
if tabular_input_header.present? ||
|
123
|
+
!tabular_input.header? ||
|
124
|
+
(tabular_input_mode == :hash || tabular_input_mode == :record)
|
123
125
|
return
|
124
126
|
end
|
125
127
|
|
@@ -10,8 +10,10 @@ module RocketJob
|
|
10
10
|
extend ActiveSupport::Concern
|
11
11
|
|
12
12
|
included do
|
13
|
+
warn "#{name} is using RocketJob::Batch::Tabular::Output which is deprecated"
|
14
|
+
|
13
15
|
field :tabular_output_header, type: Array, class_attribute: true, user_editable: true, copy_on_restart: true
|
14
|
-
field :tabular_output_format, type:
|
16
|
+
field :tabular_output_format, type: Mongoid::StringifiedSymbol, default: :csv, class_attribute: true, user_editable: true, copy_on_restart: true
|
15
17
|
field :tabular_output_options, type: Hash, class_attribute: true
|
16
18
|
|
17
19
|
validates_inclusion_of :tabular_output_format, in: IOStreams::Tabular.registered_formats
|
@@ -55,7 +57,7 @@ module RocketJob
|
|
55
57
|
|
56
58
|
# Render the output from the perform.
|
57
59
|
def tabular_output_render
|
58
|
-
return unless
|
60
|
+
return unless output_categories.present?
|
59
61
|
|
60
62
|
@rocket_job_output = tabular_output.render(@rocket_job_output)
|
61
63
|
end
|
@@ -37,34 +37,25 @@ module RocketJob
|
|
37
37
|
validates :throttle_running_workers, numericality: {greater_than_or_equal_to: 0}, allow_nil: true
|
38
38
|
|
39
39
|
define_batch_throttle :throttle_running_workers_exceeded?, filter: :throttle_filter_id
|
40
|
-
|
41
|
-
# Deprecated. For backward compatibility.
|
42
|
-
alias_method :throttle_running_slices, :throttle_running_workers
|
43
|
-
alias_method :throttle_running_slices=, :throttle_running_workers=
|
44
40
|
end
|
45
41
|
|
46
42
|
private
|
47
43
|
|
48
|
-
# Returns [
|
44
|
+
# Returns [true|false] whether the throttle for this job has been exceeded
|
49
45
|
def throttle_running_workers_exceeded?(slice)
|
50
|
-
return unless throttle_running_workers&.positive?
|
46
|
+
return false unless throttle_running_workers&.positive?
|
51
47
|
|
52
48
|
input.running.with(read: {mode: :primary}) do |conn|
|
53
49
|
conn.where(:id.ne => slice.id).count >= throttle_running_workers
|
54
50
|
end
|
55
51
|
end
|
56
52
|
|
57
|
-
#
|
58
|
-
#
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
# Cannot use this class since it will include instances of parent job classes.
|
65
|
-
RocketJob::Job.with(read: {mode: :primary}) do |conn|
|
66
|
-
conn.running.where("_type" => self.class.name, :id.ne => id, :priority.lte => priority).count >= throttle_running_jobs
|
67
|
-
end
|
53
|
+
# Allows another job with a higher priority to start even though this one is running already
|
54
|
+
# @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
|
55
|
+
def throttle_running_jobs_base_query
|
56
|
+
query = super
|
57
|
+
query[:priority.lte] = priority if throttle_running_workers&.positive?
|
58
|
+
query
|
68
59
|
end
|
69
60
|
end
|
70
61
|
end
|
@@ -23,9 +23,6 @@ module RocketJob
|
|
23
23
|
#
|
24
24
|
# Slices are destroyed after their records are successfully processed
|
25
25
|
#
|
26
|
-
# Results are stored in the output collection if `collect_output?`
|
27
|
-
# `nil` results from workers are kept if `collect_nil_output`
|
28
|
-
#
|
29
26
|
# If an exception was thrown the entire slice of records is marked as failed.
|
30
27
|
#
|
31
28
|
# Thread-safe, can be called by multiple threads at the same time
|
@@ -40,7 +37,8 @@ module RocketJob
|
|
40
37
|
|
41
38
|
SemanticLogger.named_tagged(job: id.to_s) do
|
42
39
|
until worker.shutdown?
|
43
|
-
|
40
|
+
slice = input.next_slice(worker.name)
|
41
|
+
if slice
|
44
42
|
# Grab a slice before checking the throttle to reduce concurrency race condition.
|
45
43
|
return true if slice.fail_on_exception!(re_raise_exceptions) { rocket_job_batch_throttled?(slice, worker) }
|
46
44
|
next if slice.failed?
|
@@ -97,8 +95,8 @@ module RocketJob
|
|
97
95
|
servers = []
|
98
96
|
case sub_state
|
99
97
|
when :before, :after
|
100
|
-
|
101
|
-
servers << ActiveWorker.new(worker_name, started_at, self)
|
98
|
+
if running? && (server_name.nil? || worker_on_server?(server_name))
|
99
|
+
servers << ActiveWorker.new(worker_name, started_at, self)
|
102
100
|
end
|
103
101
|
when :processing
|
104
102
|
query = input.running
|
@@ -143,19 +141,23 @@ module RocketJob
|
|
143
141
|
|
144
142
|
# Perform individual slice without callbacks
|
145
143
|
def rocket_job_perform_slice(slice, &block)
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
#
|
144
|
+
slice.processing_record_number ||= 0
|
145
|
+
records = []
|
146
|
+
append = false
|
147
|
+
|
148
|
+
# Skip processed records in this slice if it has no output categories.
|
149
|
+
if slice.processing_record_number > 1
|
150
|
+
records = slice.records[slice.processing_record_number - 1..-1]
|
151
|
+
append = true
|
152
|
+
logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
|
153
|
+
else
|
154
|
+
# Reprocess all records in this slice.
|
157
155
|
slice.processing_record_number = 0
|
156
|
+
records = slice.records
|
157
|
+
end
|
158
158
|
|
159
|
+
count = 0
|
160
|
+
RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
|
159
161
|
records.each do |record|
|
160
162
|
slice.processing_record_number += 1
|
161
163
|
SemanticLogger.named_tagged(record: slice.current_record_number) do
|
@@ -174,8 +176,8 @@ module RocketJob
|
|
174
176
|
return block_given? ? yield(record) : perform(record) if _perform_callbacks.empty?
|
175
177
|
|
176
178
|
# @rocket_job_input and @rocket_job_output can be modified by before/around callbacks
|
177
|
-
@rocket_job_input
|
178
|
-
@rocket_job_output
|
179
|
+
@rocket_job_input = record
|
180
|
+
@rocket_job_output = nil
|
179
181
|
|
180
182
|
run_callbacks(:perform) do
|
181
183
|
@rocket_job_output =
|
@@ -186,9 +188,9 @@ module RocketJob
|
|
186
188
|
end
|
187
189
|
end
|
188
190
|
|
189
|
-
@rocket_job_input
|
190
|
-
result
|
191
|
-
@rocket_job_output
|
191
|
+
@rocket_job_input = nil
|
192
|
+
result = @rocket_job_output
|
193
|
+
@rocket_job_output = nil
|
192
194
|
result
|
193
195
|
end
|
194
196
|
|
@@ -305,11 +307,12 @@ module RocketJob
|
|
305
307
|
# Run the Batch before or after callbacks that match the current `sub_state`.
#
# Parameters
#   worker
#     Object responding to `#name`. Its name is passed to `rocket_job_batch_complete?`
#     after the before callbacks have run.
#
# Nothing is run when `sub_state` is neither `:before` nor `:after`.
def rocket_job_batch_callbacks(worker)
  case sub_state
  when :before
    # If this is the first worker to pickup this job
    rocket_job_batch_run_before_callbacks
    # Check for 0 record jobs
    rocket_job_batch_complete?(worker.name) if running?
  when :after
    # `case` matches each `when` value against `sub_state` using `===`, so the symbol
    # itself must be listed. A boolean such as `sub_state == :after` never matches a symbol.
    rocket_job_batch_run_after_callbacks
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "active_support/concern"
|
2
|
+
|
3
|
+
module RocketJob
|
4
|
+
module Category
|
5
|
+
# Define the layout for each category of input or output data
|
6
|
+
module Base
|
7
|
+
extend ActiveSupport::Concern
|
8
|
+
|
9
|
+
included do
|
10
|
+
field :name, type: ::Mongoid::StringifiedSymbol, default: :main
|
11
|
+
|
12
|
+
# Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
|
13
|
+
field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
|
14
|
+
validates_inclusion_of :serializer, in: %i[none compress encrypt bzip2]
|
15
|
+
|
16
|
+
# The header columns when the file does not include a header row.
|
17
|
+
# Note:
|
18
|
+
# - All column names must be strings so that it can be serialized into MongoDB.
|
19
|
+
field :columns, type: Array
|
20
|
+
|
21
|
+
# On an input collection `format` specifies the format of the input data so that it can be
|
22
|
+
# transformed into a Hash when passed into the `#perform` method.
|
23
|
+
#
|
24
|
+
# On an output collection `format` specifies the format to transform the output hash into.
|
25
|
+
#
|
26
|
+
# `:auto` it uses the `file_name` on this category to determine the format.
|
27
|
+
# `nil` no transformation is performed on the data returned by the `#perform` method.
|
28
|
+
# Any other format supported by IOStreams, for example: :csv, :hash, :array, :json, :psv, :fixed
|
29
|
+
#
|
30
|
+
# Default: `nil`
|
31
|
+
field :format, type: ::Mongoid::StringifiedSymbol
|
32
|
+
validates_inclusion_of :format, in: [nil, :auto] + IOStreams::Tabular.registered_formats
|
33
|
+
|
34
|
+
# Any specialized format specific options. For example, `:fixed` format requires a `:layout`.
|
35
|
+
field :format_options, type: Hash
|
36
|
+
|
37
|
+
# When `:format` is not supplied the file name can be used to infer the required format.
|
38
|
+
# Optional.
|
39
|
+
# Default: nil
|
40
|
+
field :file_name, type: IOStreams::Path
|
41
|
+
end
|
42
|
+
|
43
|
+
# Return which slice serializer class to use that matches the current options.
#
# Returns the slice class selected by `serializer`:
#   :none     => Sliced::Slice
#   :compress => Sliced::CompressedSlice
#   :encrypt  => Sliced::EncryptedSlice
#   :bzip2    => Sliced::BZip2OutputSlice
#
# Raises ArgumentError when `serializer` holds any other value.
def serializer_class
  case serializer
  when :none
    Sliced::Slice
  when :compress
    Sliced::CompressedSlice
  when :encrypt
    Sliced::EncryptedSlice
  when :bzip2
    Sliced::BZip2OutputSlice
  else
    # Name the `serializer` attribute in the message so it matches the field that holds the bad value.
    raise(ArgumentError, "serializer: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
  end
end
|
58
|
+
|
59
|
+
def tabular
|
60
|
+
@tabular ||= IOStreams::Tabular.new(
|
61
|
+
columns: columns,
|
62
|
+
format: format == :auto ? nil : format,
|
63
|
+
format_options: format_options&.deep_symbolize_keys,
|
64
|
+
file_name: file_name
|
65
|
+
)
|
66
|
+
end
|
67
|
+
|
68
|
+
def reset_tabular
|
69
|
+
@tabular = nil
|
70
|
+
end
|
71
|
+
|
72
|
+
# Returns [true|false] whether this category has the attributes defined for tabular to work.
|
73
|
+
def tabular?
|
74
|
+
format.present?
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,110 @@
|
|
1
|
+
module RocketJob
|
2
|
+
module Category
|
3
|
+
# Define the layout for each category of input or output data
|
4
|
+
class Input
|
5
|
+
include SemanticLogger::Loggable
|
6
|
+
include Plugins::Document
|
7
|
+
include Category::Base
|
8
|
+
|
9
|
+
embedded_in :job, class_name: "RocketJob::Job", inverse_of: :input_categories
|
10
|
+
|
11
|
+
# Slice size for this input collection
|
12
|
+
field :slice_size, type: Integer, default: 100
|
13
|
+
|
14
|
+
#
|
15
|
+
# The fields below only apply if the field `format` has been set:
|
16
|
+
#
|
17
|
+
|
18
|
+
# List of columns to allow.
|
19
|
+
# Default: nil ( Allow all columns )
|
20
|
+
# Note:
|
21
|
+
# When supplied any columns that are rejected will be returned in the cleansed columns
|
22
|
+
# as nil so that they can be ignored during processing.
|
23
|
+
field :allowed_columns, type: Array
|
24
|
+
|
25
|
+
# List of columns that must be present, otherwise an Exception is raised.
|
26
|
+
field :required_columns, type: Array
|
27
|
+
|
28
|
+
# Whether to skip unknown columns in the uploaded file.
|
29
|
+
# Ignores any column that was not found in the `allowed_columns` list.
|
30
|
+
#
|
31
|
+
# false:
|
32
|
+
# Raises IOStreams::Tabular::InvalidHeader when a column is supplied that is not in `allowed_columns`.
|
33
|
+
# true:
|
34
|
+
# Ignore additional columns in a file that are not listed in `allowed_columns`
|
35
|
+
# Job processing will skip the additional columns entirely as if they were not supplied at all.
|
36
|
+
# A warning is logged with the names of the columns that were ignored.
|
37
|
+
# The `columns` field will list all skipped columns with a nil value so that downstream workers
|
38
|
+
# know to ignore those columns.
|
39
|
+
#
|
40
|
+
# Notes:
|
41
|
+
# - Only applicable when `allowed_columns` has been set.
|
42
|
+
# - Recommended to leave as `false` otherwise a misspelled column can result in missed columns.
|
43
|
+
field :skip_unknown, type: ::Mongoid::Boolean, default: false
|
44
|
+
validates_inclusion_of :skip_unknown, in: [true, false]
|
45
|
+
|
46
|
+
# When `#upload` is called with a file_name, it uploads the file using any of the following approaches:
|
47
|
+
# :line
|
48
|
+
# Uploads the file a line (String) at a time for processing by workers.
|
49
|
+
# This is the default behavior and is the most performant since it leaves the parsing of each line
|
50
|
+
# up to the workers themselves.
|
51
|
+
# :array
|
52
|
+
# Parses each line from the file as an Array and uploads each array for processing by workers.
|
53
|
+
# Every line in the input file is parsed and converted into an array before uploading.
|
54
|
+
# This approach ensures that the entire file is valid before starting to process it.
|
55
|
+
# Ideal for when files may contain invalid lines.
|
56
|
+
# Not recommended for large files since the CSV or other parsing is performed sequentially during the
|
57
|
+
# upload process.
|
58
|
+
# :hash
|
59
|
+
# Parses each line from the file into a Hash and uploads each hash for processing by workers.
|
60
|
+
# Similar to :array above in that the entire file is parsed before processing is started.
|
61
|
+
# Slightly less efficient than :array since it stores every record as a hash with both the key and value.
|
62
|
+
#
|
63
|
+
# Recommend using :array when the entire file must be parsed/validated before processing is started, and
|
64
|
+
# upload time is not important.
|
65
|
+
# See IOStreams#each for more details.
|
66
|
+
field :mode, type: ::Mongoid::StringifiedSymbol, default: :line
|
67
|
+
validates_inclusion_of :mode, in: %i[line array hash]
|
68
|
+
|
69
|
+
# When reading tabular input data (e.g. CSV, PSV) the header is automatically cleansed.
|
70
|
+
# This removes issues when the input header varies in case and other small ways. See IOStreams::Tabular
|
71
|
+
# Currently Supported:
|
72
|
+
# :default
|
73
|
+
# Each column is cleansed as follows:
|
74
|
+
# - Leading and trailing whitespace is stripped.
|
75
|
+
# - All characters converted to lower case.
|
76
|
+
# - Spaces and '-' are converted to '_'.
|
77
|
+
# - All characters except for letters, digits, and '_' are stripped.
|
78
|
+
# :none
|
79
|
+
# Do not cleanse the columns names supplied in the header row.
|
80
|
+
#
|
81
|
+
# Note: Submit a ticket if you have other cleansers that you want added.
|
82
|
+
field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
|
83
|
+
validates :header_cleanser, inclusion: %i[default none]
|
84
|
+
|
85
|
+
validates_presence_of :slice_size
|
86
|
+
|
87
|
+
# Cleanses the header column names when `header_cleanser` is `:default`.
|
88
|
+
def cleanse_header!
|
89
|
+
return unless header_cleanser == :default
|
90
|
+
|
91
|
+
ignored_columns = tabular.header.cleanse!
|
92
|
+
logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
|
93
|
+
|
94
|
+
self.columns = tabular.header.columns
|
95
|
+
end
|
96
|
+
|
97
|
+
def tabular
|
98
|
+
@tabular ||= IOStreams::Tabular.new(
|
99
|
+
columns: columns,
|
100
|
+
format: format == :auto ? nil : format,
|
101
|
+
format_options: format_options&.deep_symbolize_keys,
|
102
|
+
file_name: file_name,
|
103
|
+
allowed_columns: allowed_columns,
|
104
|
+
required_columns: required_columns,
|
105
|
+
skip_unknown: skip_unknown
|
106
|
+
)
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|