rocketjob 5.4.1 → 6.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +175 -5
- data/bin/rocketjob_batch_perf +1 -1
- data/bin/rocketjob_perf +1 -1
- data/lib/rocket_job/batch/categories.rb +345 -0
- data/lib/rocket_job/batch/io.rb +174 -106
- data/lib/rocket_job/batch/model.rb +20 -68
- data/lib/rocket_job/batch/performance.rb +19 -7
- data/lib/rocket_job/batch/statistics.rb +34 -12
- data/lib/rocket_job/batch/throttle_running_workers.rb +2 -6
- data/lib/rocket_job/batch/worker.rb +31 -26
- data/lib/rocket_job/batch.rb +3 -1
- data/lib/rocket_job/category/base.rb +81 -0
- data/lib/rocket_job/category/input.rb +170 -0
- data/lib/rocket_job/category/output.rb +34 -0
- data/lib/rocket_job/cli.rb +25 -17
- data/lib/rocket_job/dirmon_entry.rb +23 -13
- data/lib/rocket_job/event.rb +1 -1
- data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
- data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
- data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
- data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
- data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
- data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
- data/lib/rocket_job/jobs/conversion_job.rb +43 -0
- data/lib/rocket_job/jobs/dirmon_job.rb +25 -36
- data/lib/rocket_job/jobs/housekeeping_job.rb +11 -12
- data/lib/rocket_job/jobs/on_demand_batch_job.rb +24 -11
- data/lib/rocket_job/jobs/on_demand_job.rb +3 -4
- data/lib/rocket_job/jobs/performance_job.rb +3 -1
- data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
- data/lib/rocket_job/jobs/upload_file_job.rb +48 -8
- data/lib/rocket_job/lookup_collection.rb +69 -0
- data/lib/rocket_job/plugins/cron.rb +60 -20
- data/lib/rocket_job/plugins/job/model.rb +25 -50
- data/lib/rocket_job/plugins/job/persistence.rb +36 -0
- data/lib/rocket_job/plugins/job/throttle.rb +2 -2
- data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +1 -1
- data/lib/rocket_job/plugins/job/worker.rb +2 -7
- data/lib/rocket_job/plugins/restart.rb +3 -103
- data/lib/rocket_job/plugins/state_machine.rb +4 -3
- data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +37 -0
- data/lib/rocket_job/ractor_worker.rb +42 -0
- data/lib/rocket_job/server/model.rb +1 -1
- data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
- data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
- data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
- data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
- data/lib/rocket_job/sliced/input.rb +42 -54
- data/lib/rocket_job/sliced/slice.rb +12 -16
- data/lib/rocket_job/sliced/slices.rb +26 -11
- data/lib/rocket_job/sliced/writer/input.rb +46 -18
- data/lib/rocket_job/sliced/writer/output.rb +33 -45
- data/lib/rocket_job/sliced.rb +1 -74
- data/lib/rocket_job/subscribers/server.rb +1 -1
- data/lib/rocket_job/thread_worker.rb +46 -0
- data/lib/rocket_job/throttle_definitions.rb +7 -1
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +21 -55
- data/lib/rocket_job/worker_pool.rb +5 -7
- data/lib/rocketjob.rb +53 -43
- metadata +36 -28
- data/lib/rocket_job/batch/tabular/input.rb +0 -131
- data/lib/rocket_job/batch/tabular/output.rb +0 -65
- data/lib/rocket_job/batch/tabular.rb +0 -56
- data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
- data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
@@ -22,12 +22,15 @@ module RocketJob
|
|
22
22
|
count_running_workers
|
23
23
|
|
24
24
|
puts "Loading job with #{count} records/lines"
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
|
26
|
+
job.input_category.slice_size = slice_size
|
27
|
+
if encrypt
|
28
|
+
job.input_category.serializer = :encrypt
|
29
|
+
job.output_category.serializer = :encrypt
|
30
|
+
elsif !compress
|
31
|
+
job.input_category.serializer = :none
|
32
|
+
job.output_category.serializer = :none
|
29
33
|
end
|
30
|
-
job = RocketJob::Jobs::PerformanceJob.new(args)
|
31
34
|
job.upload do |writer|
|
32
35
|
count.times { |i| writer << i }
|
33
36
|
end
|
@@ -37,7 +40,15 @@ module RocketJob
|
|
37
40
|
sleep 3 until job.reload.completed?
|
38
41
|
|
39
42
|
duration = job.completed_at - job.started_at
|
40
|
-
{
|
43
|
+
{
|
44
|
+
count: count,
|
45
|
+
duration: duration,
|
46
|
+
records_per_second: (count.to_f / duration).round(3),
|
47
|
+
workers: workers,
|
48
|
+
servers: servers,
|
49
|
+
compress: compress,
|
50
|
+
encrypt: encrypt
|
51
|
+
}
|
41
52
|
end
|
42
53
|
|
43
54
|
# Export the Results hash to a CSV file
|
@@ -60,7 +71,8 @@ module RocketJob
|
|
60
71
|
o.on("-m", "--mongo MONGO_CONFIG_FILE_NAME", "Location of mongoid.yml config file") do |arg|
|
61
72
|
self.mongo_config = arg
|
62
73
|
end
|
63
|
-
o.on("-e", "--environment ENVIRONMENT",
|
74
|
+
o.on("-e", "--environment ENVIRONMENT",
|
75
|
+
"The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
|
64
76
|
self.environment = arg
|
65
77
|
end
|
66
78
|
o.on("-z", "--compress", "Turn on compression") do
|
@@ -2,7 +2,11 @@ require "active_support/concern"
|
|
2
2
|
|
3
3
|
module RocketJob
|
4
4
|
module Batch
|
5
|
-
# Allow statistics to be gathered while a batch job is running
|
5
|
+
# Allow statistics to be gathered while a batch job is running.
|
6
|
+
#
|
7
|
+
# Notes:
|
8
|
+
# - Statistics for successfully processed records within a slice are saved.
|
9
|
+
# - Statistics gathered during a perform that then results in an exception are discarded.
|
6
10
|
module Statistics
|
7
11
|
extend ActiveSupport::Concern
|
8
12
|
|
@@ -45,34 +49,52 @@ module RocketJob
|
|
45
49
|
last = paths.pop
|
46
50
|
return unless last
|
47
51
|
|
48
|
-
|
49
|
-
|
52
|
+
last_target = paths.inject(in_memory) do |target, sub_key|
|
53
|
+
target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
|
54
|
+
end
|
55
|
+
last_target[last] += increment
|
50
56
|
end
|
51
57
|
end
|
52
58
|
|
53
59
|
included do
|
54
60
|
field :statistics, type: Hash, default: -> { Hash.new(0) }
|
55
61
|
|
56
|
-
around_slice :
|
62
|
+
around_slice :rocket_job_statistics_capture
|
63
|
+
after_perform :rocket_job_statistics_commit
|
57
64
|
end
|
58
65
|
|
59
66
|
# Increment a statistic
|
60
67
|
def statistics_inc(key, increment = 1)
|
61
68
|
return if key.nil? || key == ""
|
62
69
|
|
63
|
-
|
64
|
-
@slice_statistics ||= Stats.new(new_record? ? statistics : nil)
|
65
|
-
key.is_a?(Hash) ? @slice_statistics.inc(key) : @slice_statistics.inc_key(key, increment)
|
70
|
+
(@rocket_job_perform_statistics ||= []) << (key.is_a?(Hash) ? key : [key, increment])
|
66
71
|
end
|
67
72
|
|
68
73
|
private
|
69
74
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
@slice_statistics = Stats.new(new_record? ? statistics : nil)
|
75
|
+
def rocket_job_statistics_capture
|
76
|
+
@rocket_job_perform_statistics = nil
|
77
|
+
@rocket_job_slice_statistics = nil
|
74
78
|
yield
|
75
|
-
|
79
|
+
ensure
|
80
|
+
if @rocket_job_slice_statistics && !@rocket_job_slice_statistics.empty?
|
81
|
+
collection.update_one({_id: id}, {"$inc" => @rocket_job_slice_statistics.stats})
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def rocket_job_slice_statistics
|
86
|
+
@rocket_job_slice_statistics ||= Stats.new(new_record? ? statistics : nil)
|
87
|
+
end
|
88
|
+
|
89
|
+
# Apply stats gathered during the perform to the slice level stats
|
90
|
+
def rocket_job_statistics_commit
|
91
|
+
return unless @rocket_job_perform_statistics
|
92
|
+
|
93
|
+
@rocket_job_perform_statistics.each do |key|
|
94
|
+
key.is_a?(Hash) ? rocket_job_slice_statistics.inc(key) : rocket_job_slice_statistics.inc_key(*key)
|
95
|
+
end
|
96
|
+
|
97
|
+
@rocket_job_perform_statistics = nil
|
76
98
|
end
|
77
99
|
|
78
100
|
# Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
|
@@ -37,15 +37,11 @@ module RocketJob
|
|
37
37
|
validates :throttle_running_workers, numericality: {greater_than_or_equal_to: 0}, allow_nil: true
|
38
38
|
|
39
39
|
define_batch_throttle :throttle_running_workers_exceeded?, filter: :throttle_filter_id
|
40
|
-
|
41
|
-
# Deprecated. For backward compatibility.
|
42
|
-
alias_method :throttle_running_slices, :throttle_running_workers
|
43
|
-
alias_method :throttle_running_slices=, :throttle_running_workers=
|
44
40
|
end
|
45
41
|
|
46
42
|
private
|
47
43
|
|
48
|
-
# Returns [
|
44
|
+
# Returns [true|false] whether the throttle for this job has been exceeded
|
49
45
|
def throttle_running_workers_exceeded?(slice)
|
50
46
|
return false unless throttle_running_workers&.positive?
|
51
47
|
|
@@ -57,7 +53,7 @@ module RocketJob
|
|
57
53
|
# Allows another job with a higher priority to start even though this one is running already
|
58
54
|
# @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
|
59
55
|
def throttle_running_jobs_base_query
|
60
|
-
query
|
56
|
+
query = super
|
61
57
|
query[:priority.lte] = priority if throttle_running_workers&.positive?
|
62
58
|
query
|
63
59
|
end
|
@@ -23,9 +23,6 @@ module RocketJob
|
|
23
23
|
#
|
24
24
|
# Slices are destroyed after their records are successfully processed
|
25
25
|
#
|
26
|
-
# Results are stored in the output collection if `collect_output?`
|
27
|
-
# `nil` results from workers are kept if `collect_nil_output`
|
28
|
-
#
|
29
26
|
# If an exception was thrown the entire slice of records is marked as failed.
|
30
27
|
#
|
31
28
|
# Thread-safe, can be called by multiple threads at the same time
|
@@ -40,7 +37,8 @@ module RocketJob
|
|
40
37
|
|
41
38
|
SemanticLogger.named_tagged(job: id.to_s) do
|
42
39
|
until worker.shutdown?
|
43
|
-
|
40
|
+
slice = input.next_slice(worker.name)
|
41
|
+
if slice
|
44
42
|
# Grab a slice before checking the throttle to reduce concurrency race condition.
|
45
43
|
return true if slice.fail_on_exception!(re_raise_exceptions) { rocket_job_batch_throttled?(slice, worker) }
|
46
44
|
next if slice.failed?
|
@@ -69,6 +67,8 @@ module RocketJob
|
|
69
67
|
# Returns [Integer] the number of records processed in the slice
|
70
68
|
#
|
71
69
|
# Note: The slice will be removed from processing when this method completes
|
70
|
+
#
|
71
|
+
# @deprecated Please open a ticket if you need this behavior.
|
72
72
|
def work_first_slice(&block)
|
73
73
|
raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
|
74
74
|
|
@@ -97,8 +97,8 @@ module RocketJob
|
|
97
97
|
servers = []
|
98
98
|
case sub_state
|
99
99
|
when :before, :after
|
100
|
-
|
101
|
-
servers << ActiveWorker.new(worker_name, started_at, self)
|
100
|
+
if running? && (server_name.nil? || worker_on_server?(server_name))
|
101
|
+
servers << ActiveWorker.new(worker_name, started_at, self)
|
102
102
|
end
|
103
103
|
when :processing
|
104
104
|
query = input.running
|
@@ -143,19 +143,23 @@ module RocketJob
|
|
143
143
|
|
144
144
|
# Perform individual slice without callbacks
|
145
145
|
def rocket_job_perform_slice(slice, &block)
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
146
|
+
slice.processing_record_number ||= 0
|
147
|
+
append = false
|
148
|
+
|
149
|
+
# Skip processed records in this slice if it has no output categories.
|
150
|
+
records =
|
151
|
+
if slice.processing_record_number.to_i > 1
|
152
|
+
append = true
|
153
|
+
logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
|
154
|
+
slice.records[slice.processing_record_number - 1..-1]
|
155
|
+
else
|
156
|
+
# Reprocess all records in this slice.
|
157
|
+
slice.processing_record_number = 0
|
158
|
+
slice.records
|
159
|
+
end
|
158
160
|
|
161
|
+
count = 0
|
162
|
+
RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
|
159
163
|
records.each do |record|
|
160
164
|
slice.processing_record_number += 1
|
161
165
|
SemanticLogger.named_tagged(record: slice.current_record_number) do
|
@@ -174,8 +178,8 @@ module RocketJob
|
|
174
178
|
return block_given? ? yield(record) : perform(record) if _perform_callbacks.empty?
|
175
179
|
|
176
180
|
# @rocket_job_input and @rocket_job_output can be modified by before/around callbacks
|
177
|
-
@rocket_job_input
|
178
|
-
@rocket_job_output
|
181
|
+
@rocket_job_input = record
|
182
|
+
@rocket_job_output = nil
|
179
183
|
|
180
184
|
run_callbacks(:perform) do
|
181
185
|
@rocket_job_output =
|
@@ -186,9 +190,9 @@ module RocketJob
|
|
186
190
|
end
|
187
191
|
end
|
188
192
|
|
189
|
-
@rocket_job_input
|
190
|
-
result
|
191
|
-
@rocket_job_output
|
193
|
+
@rocket_job_input = nil
|
194
|
+
result = @rocket_job_output
|
195
|
+
@rocket_job_output = nil
|
192
196
|
result
|
193
197
|
end
|
194
198
|
|
@@ -244,7 +248,7 @@ module RocketJob
|
|
244
248
|
unless new_record?
|
245
249
|
# Fail job iff no other worker has already finished it
|
246
250
|
# Must set write concern to at least 1 since we need the nModified back
|
247
|
-
result
|
251
|
+
result = self.class.with(write: {w: 1}) do |query|
|
248
252
|
query.
|
249
253
|
where(id: id, state: :running, sub_state: :processing).
|
250
254
|
update({"$set" => {state: :failed, worker_name: worker_name}})
|
@@ -305,11 +309,12 @@ module RocketJob
|
|
305
309
|
# Run Batch before and after callbacks
#
# Dispatches on the job's current `sub_state`:
#   :before - runs the before-batch callbacks and then, if the job is still running,
#             checks whether the batch is already complete (0 record jobs).
#   :after  - runs the after-batch callbacks.
# Any other sub_state is a no-op.
#
# worker: only `worker.name` is read here.
def rocket_job_batch_callbacks(worker)
  # If this is the first worker to pick up this job
  case sub_state
  when :before
    rocket_job_batch_run_before_callbacks
    # Check for 0 record jobs
    rocket_job_batch_complete?(worker.name) if running?
  when :after
    # Previously `when sub_state == :after`, which tested `true === sub_state` /
    # `false === sub_state` and therefore never matched.
    rocket_job_batch_run_after_callbacks
  end
end
|
data/lib/rocket_job/batch.rb
CHANGED
@@ -7,6 +7,8 @@ require "rocket_job/batch/state_machine"
|
|
7
7
|
require "rocket_job/batch/throttle"
|
8
8
|
require "rocket_job/batch/throttle_running_workers"
|
9
9
|
require "rocket_job/batch/worker"
|
10
|
+
# Ensure after_perform is run first and #upload override is after IO#upload is defined.
|
11
|
+
require "rocket_job/batch/categories"
|
10
12
|
|
11
13
|
module RocketJob
|
12
14
|
module Batch
|
@@ -17,6 +19,7 @@ module RocketJob
|
|
17
19
|
include Callbacks
|
18
20
|
include Logger
|
19
21
|
include Worker
|
22
|
+
include Categories
|
20
23
|
include Throttle
|
21
24
|
include ThrottleRunningWorkers
|
22
25
|
include IO
|
@@ -27,6 +30,5 @@ module RocketJob
|
|
27
30
|
autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
|
28
31
|
autoload :Result, "rocket_job/batch/result"
|
29
32
|
autoload :Results, "rocket_job/batch/results"
|
30
|
-
autoload :Tabular, "rocket_job/batch/tabular"
|
31
33
|
end
|
32
34
|
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require "active_support/concern"
|
2
|
+
|
3
|
+
module RocketJob
|
4
|
+
module Category
|
5
|
+
# Define the layout for each category of input or output data
|
6
|
+
module Base
|
7
|
+
extend ActiveSupport::Concern
|
8
|
+
|
9
|
+
included do
|
10
|
+
field :name, type: ::Mongoid::StringifiedSymbol, default: :main
|
11
|
+
|
12
|
+
# Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
|
13
|
+
field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
|
14
|
+
|
15
|
+
# The header columns when the file does not include a header row.
|
16
|
+
# Note:
|
17
|
+
# - All column names must be strings so that it can be serialized into MongoDB.
|
18
|
+
field :columns, type: Array
|
19
|
+
|
20
|
+
# On an input collection `format` specifies the format of the input data so that it can be
|
21
|
+
# transformed into a Hash when passed into the `#perform` method.
|
22
|
+
#
|
23
|
+
# On an output collection `format` specifies the format to transform the output hash into.
|
24
|
+
#
|
25
|
+
# `:auto` it uses the `file_name` on this category to determine the format.
|
26
|
+
# `nil` no transformation is performed on the data returned by the `#perform` method.
|
27
|
+
# Any other format supported by IOStreams, for example: :csv, :hash, :array, :json, :psv, :fixed
|
28
|
+
#
|
29
|
+
# Default: `nil`
|
30
|
+
field :format, type: ::Mongoid::StringifiedSymbol
|
31
|
+
validates_inclusion_of :format, in: [nil, :auto] + IOStreams::Tabular.registered_formats
|
32
|
+
|
33
|
+
# Any specialized format specific options. For example, `:fixed` format requires a `:layout`.
|
34
|
+
field :format_options, type: Hash
|
35
|
+
|
36
|
+
# When `:format` is not supplied the file name can be used to infer the required format.
|
37
|
+
# Optional.
|
38
|
+
# Default: nil
|
39
|
+
field :file_name, type: IOStreams::Path
|
40
|
+
end
|
41
|
+
|
42
|
+
# Return which slice serializer class to use that matches the current options.
|
43
|
+
def serializer_class
|
44
|
+
case serializer
|
45
|
+
when :none
|
46
|
+
Sliced::Slice
|
47
|
+
when :compress
|
48
|
+
Sliced::CompressedSlice
|
49
|
+
when :encrypt
|
50
|
+
Sliced::EncryptedSlice
|
51
|
+
when :bzip2, :bz2
|
52
|
+
Sliced::BZip2OutputSlice
|
53
|
+
when :encrypted_bz2
|
54
|
+
Sliced::EncryptedBZip2OutputSlice
|
55
|
+
else
|
56
|
+
raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def tabular
|
61
|
+
@tabular ||= IOStreams::Tabular.new(
|
62
|
+
columns: columns,
|
63
|
+
format: format == :auto ? nil : format,
|
64
|
+
format_options: format_options&.deep_symbolize_keys,
|
65
|
+
file_name: file_name
|
66
|
+
)
|
67
|
+
end
|
68
|
+
|
69
|
+
# Returns [true|false] whether this category has the attributes defined for tabular to work.
|
70
|
+
def tabular?
|
71
|
+
format.present?
|
72
|
+
end
|
73
|
+
|
74
|
+
def build_collection_name(direction, job)
|
75
|
+
collection_name = "rocket_job.#{direction}s.#{job.id}"
|
76
|
+
collection_name << ".#{name}" unless name == :main
|
77
|
+
collection_name
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
module RocketJob
|
2
|
+
module Category
|
3
|
+
# Define the layout for each category of input data
|
4
|
+
class Input
|
5
|
+
include SemanticLogger::Loggable
|
6
|
+
include Plugins::Document
|
7
|
+
include Category::Base
|
8
|
+
|
9
|
+
embedded_in :job, class_name: "RocketJob::Job", inverse_of: :input_categories
|
10
|
+
|
11
|
+
# Slice size for this input collection
|
12
|
+
field :slice_size, type: Integer, default: 100
|
13
|
+
validates_presence_of :slice_size
|
14
|
+
|
15
|
+
#
|
16
|
+
# The fields below only apply if the field `format` has been set:
|
17
|
+
#
|
18
|
+
|
19
|
+
# List of columns to allow.
|
20
|
+
# Default: nil ( Allow all columns )
|
21
|
+
# Note:
|
22
|
+
# When supplied any columns that are rejected will be returned in the cleansed columns
|
23
|
+
# as nil so that they can be ignored during processing.
|
24
|
+
field :allowed_columns, type: Array
|
25
|
+
|
26
|
+
# List of columns that must be present, otherwise an Exception is raised.
|
27
|
+
field :required_columns, type: Array
|
28
|
+
|
29
|
+
# Whether to skip unknown columns in the uploaded file.
|
30
|
+
# Ignores any column that was not found in the `allowed_columns` list.
|
31
|
+
#
|
32
|
+
# false:
|
33
|
+
# Raises IOStreams::Tabular::InvalidHeader when a column is supplied that is not in `allowed_columns`.
|
34
|
+
# true:
|
35
|
+
# Ignore additional columns in a file that are not listed in `allowed_columns`
|
36
|
+
# Job processing will skip the additional columns entirely as if they were not supplied at all.
|
37
|
+
# A warning is logged with the names of the columns that were ignored.
|
38
|
+
# The `columns` field will list all skipped columns with a nil value so that downstream workers
|
39
|
+
# know to ignore those columns.
|
40
|
+
#
|
41
|
+
# Notes:
|
42
|
+
# - Only applicable when `allowed_columns` has been set.
|
43
|
+
# - Recommended to leave as `false` otherwise a misspelled column can result in missed columns.
|
44
|
+
field :skip_unknown, type: ::Mongoid::Boolean, default: false
|
45
|
+
validates_inclusion_of :skip_unknown, in: [true, false]
|
46
|
+
|
47
|
+
# When `#upload` is called with a file_name, it uploads the file using any of the following approaches:
|
48
|
+
# :line
|
49
|
+
# Uploads the file a line (String) at a time for processing by workers.
|
50
|
+
# This is the default behavior and is the most performant since it leaves the parsing of each line
|
51
|
+
# up to the workers themselves.
|
52
|
+
# :array
|
53
|
+
# Parses each line from the file as an Array and uploads each array for processing by workers.
|
54
|
+
# Every line in the input file is parsed and converted into an array before uploading.
|
55
|
+
# This approach ensures that the entire file is valid before starting to process it.
|
56
|
+
# Ideal for when files may contain invalid lines.
|
57
|
+
# Not recommended for large files since the CSV or other parsing is performed sequentially during the
|
58
|
+
# upload process.
|
59
|
+
# :hash
|
60
|
+
# Parses each line from the file into a Hash and uploads each hash for processing by workers.
|
61
|
+
# Similar to :array above in that the entire file is parsed before processing is started.
|
62
|
+
# Slightly less efficient than :array since it stores every record as a hash with both the key and value.
|
63
|
+
#
|
64
|
+
# Recommend using :array when the entire file must be parsed/validated before processing is started, and
|
65
|
+
# upload time is not important.
|
66
|
+
# See IOStreams#each for more details.
|
67
|
+
field :mode, type: ::Mongoid::StringifiedSymbol, default: :line
|
68
|
+
validates_inclusion_of :mode, in: %i[line array hash]
|
69
|
+
|
70
|
+
# When reading tabular input data (e.g. CSV, PSV) the header is automatically cleansed.
|
71
|
+
# This removes issues when the input header varies in case and other small ways. See IOStreams::Tabular
|
72
|
+
# Currently Supported:
|
73
|
+
# :default
|
74
|
+
# Each column is cleansed as follows:
|
75
|
+
# - Leading and trailing whitespace is stripped.
|
76
|
+
# - All characters converted to lower case.
|
77
|
+
# - Spaces and '-' are converted to '_'.
|
78
|
+
# - All characters except for letters, digits, and '_' are stripped.
|
79
|
+
# :none
|
80
|
+
# Do not cleanse the column names supplied in the header row.
|
81
|
+
#
|
82
|
+
# Note: Submit a ticket if you have other cleansers that you want added.
|
83
|
+
field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
|
84
|
+
validates :header_cleanser, inclusion: %i[default none]
|
85
|
+
|
86
|
+
validates_inclusion_of :serializer, in: %i[none compress encrypt]
|
87
|
+
|
88
|
+
# Cleanses the header column names when `header_cleanser` is `:default`
|
89
|
+
def cleanse_header!
|
90
|
+
return unless header_cleanser == :default
|
91
|
+
|
92
|
+
ignored_columns = tabular.header.cleanse!
|
93
|
+
logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
|
94
|
+
|
95
|
+
self.columns = tabular.header.columns
|
96
|
+
end
|
97
|
+
|
98
|
+
def tabular
|
99
|
+
@tabular ||= IOStreams::Tabular.new(
|
100
|
+
columns: columns,
|
101
|
+
format: format == :auto ? nil : format,
|
102
|
+
format_options: format_options&.deep_symbolize_keys,
|
103
|
+
file_name: file_name,
|
104
|
+
allowed_columns: allowed_columns,
|
105
|
+
required_columns: required_columns,
|
106
|
+
skip_unknown: skip_unknown
|
107
|
+
)
|
108
|
+
end
|
109
|
+
|
110
|
+
def data_store(job)
|
111
|
+
RocketJob::Sliced::Input.new(
|
112
|
+
collection_name: build_collection_name(:input, job),
|
113
|
+
slice_class: serializer_class,
|
114
|
+
slice_size: slice_size
|
115
|
+
)
|
116
|
+
end
|
117
|
+
|
118
|
+
# Returns [IOStreams::Path] of file to upload.
|
119
|
+
# Auto-detects file format from file name when format is :auto.
|
120
|
+
def upload_path(stream = nil, original_file_name: nil)
|
121
|
+
unless stream || file_name
|
122
|
+
raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
|
123
|
+
end
|
124
|
+
|
125
|
+
path = IOStreams.new(stream || file_name)
|
126
|
+
path.file_name = original_file_name if original_file_name
|
127
|
+
self.file_name = path.file_name
|
128
|
+
|
129
|
+
# Auto detect the format based on the upload file name if present.
|
130
|
+
if format == :auto
|
131
|
+
self.format = path.format || :csv
|
132
|
+
# Rebuild tabular with new values.
|
133
|
+
@tabular = nil
|
134
|
+
end
|
135
|
+
|
136
|
+
# Remove non-printable characters from tabular input formats.
|
137
|
+
if tabular?
|
138
|
+
# Cannot change the length of fixed width lines.
|
139
|
+
replace = format == :fixed ? " " : ""
|
140
|
+
path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
|
141
|
+
end
|
142
|
+
path
|
143
|
+
end
|
144
|
+
|
145
|
+
# Return a lambda to extract the header row from the uploaded file.
#
# on_first: optional callable that is chained after the header has been extracted.
#
# Returns `on_first` unchanged unless both `tabular?` and `tabular.header?` are true.
# For the :line and :array modes it returns a lambda that takes the first line / row,
# loads it into the tabular header, cleanses it, stores the resulting columns on this
# category, and finally calls `on_first` (when supplied) with that same line / row.
#
# NOTE(review): any other mode (e.g. :hash) falls through the `case` and returns nil,
# so a supplied `on_first` is not chained - confirm that this is intended.
def extract_header_callback(on_first)
  return on_first unless tabular? && tabular.header?

  case mode
  when :line
    lambda do |line|
      tabular.parse_header(line)
      cleanse_header!
      self.columns = tabular.header.columns
      # Call chained on_first if present
      on_first&.call(line)
    end
  when :array
    lambda do |row|
      tabular.header.columns = row
      cleanse_header!
      # Previously read `category.tabular` and passed `line` to on_first; neither name
      # exists in this scope, so the first header row raised NameError.
      self.columns = tabular.header.columns
      # Call chained on_first if present
      on_first&.call(row)
    end
  end
end
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|