rocketjob 5.4.1 → 6.0.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +175 -5
- data/bin/rocketjob_batch_perf +1 -1
- data/bin/rocketjob_perf +1 -1
- data/lib/rocket_job/batch/categories.rb +345 -0
- data/lib/rocket_job/batch/io.rb +174 -106
- data/lib/rocket_job/batch/model.rb +20 -68
- data/lib/rocket_job/batch/performance.rb +19 -7
- data/lib/rocket_job/batch/statistics.rb +34 -12
- data/lib/rocket_job/batch/throttle_running_workers.rb +2 -6
- data/lib/rocket_job/batch/worker.rb +31 -26
- data/lib/rocket_job/batch.rb +3 -1
- data/lib/rocket_job/category/base.rb +81 -0
- data/lib/rocket_job/category/input.rb +170 -0
- data/lib/rocket_job/category/output.rb +34 -0
- data/lib/rocket_job/cli.rb +25 -17
- data/lib/rocket_job/dirmon_entry.rb +23 -13
- data/lib/rocket_job/event.rb +1 -1
- data/lib/rocket_job/extensions/iostreams/path.rb +32 -0
- data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
- data/lib/rocket_job/extensions/mongoid/factory.rb +4 -12
- data/lib/rocket_job/extensions/mongoid/stringified_symbol.rb +50 -0
- data/lib/rocket_job/extensions/psych/yaml_tree.rb +8 -0
- data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
- data/lib/rocket_job/jobs/conversion_job.rb +43 -0
- data/lib/rocket_job/jobs/dirmon_job.rb +25 -36
- data/lib/rocket_job/jobs/housekeeping_job.rb +11 -12
- data/lib/rocket_job/jobs/on_demand_batch_job.rb +24 -11
- data/lib/rocket_job/jobs/on_demand_job.rb +3 -4
- data/lib/rocket_job/jobs/performance_job.rb +3 -1
- data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -96
- data/lib/rocket_job/jobs/upload_file_job.rb +48 -8
- data/lib/rocket_job/lookup_collection.rb +69 -0
- data/lib/rocket_job/plugins/cron.rb +60 -20
- data/lib/rocket_job/plugins/job/model.rb +25 -50
- data/lib/rocket_job/plugins/job/persistence.rb +36 -0
- data/lib/rocket_job/plugins/job/throttle.rb +2 -2
- data/lib/rocket_job/plugins/job/throttle_running_jobs.rb +1 -1
- data/lib/rocket_job/plugins/job/worker.rb +2 -7
- data/lib/rocket_job/plugins/restart.rb +3 -103
- data/lib/rocket_job/plugins/state_machine.rb +4 -3
- data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +37 -0
- data/lib/rocket_job/ractor_worker.rb +42 -0
- data/lib/rocket_job/server/model.rb +1 -1
- data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
- data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
- data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
- data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
- data/lib/rocket_job/sliced/input.rb +42 -54
- data/lib/rocket_job/sliced/slice.rb +12 -16
- data/lib/rocket_job/sliced/slices.rb +26 -11
- data/lib/rocket_job/sliced/writer/input.rb +46 -18
- data/lib/rocket_job/sliced/writer/output.rb +33 -45
- data/lib/rocket_job/sliced.rb +1 -74
- data/lib/rocket_job/subscribers/server.rb +1 -1
- data/lib/rocket_job/thread_worker.rb +46 -0
- data/lib/rocket_job/throttle_definitions.rb +7 -1
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +21 -55
- data/lib/rocket_job/worker_pool.rb +5 -7
- data/lib/rocketjob.rb +53 -43
- metadata +36 -28
- data/lib/rocket_job/batch/tabular/input.rb +0 -131
- data/lib/rocket_job/batch/tabular/output.rb +0 -65
- data/lib/rocket_job/batch/tabular.rb +0 -56
- data/lib/rocket_job/extensions/mongoid/remove_warnings.rb +0 -12
- data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +0 -28
data/lib/rocket_job/batch/performance.rb CHANGED

```diff
@@ -22,12 +22,15 @@ module RocketJob
         count_running_workers
 
         puts "Loading job with #{count} records/lines"
-
-
-
-
+        job = RocketJob::Jobs::PerformanceJob.new(log_level: :warn)
+        job.input_category.slice_size = slice_size
+        if encrypt
+          job.input_category.serializer = :encrypt
+          job.output_category.serializer = :encrypt
+        elsif !compress
+          job.input_category.serializer = :none
+          job.output_category.serializer = :none
         end
-        job = RocketJob::Jobs::PerformanceJob.new(args)
         job.upload do |writer|
           count.times { |i| writer << i }
         end
@@ -37,7 +40,15 @@ module RocketJob
         sleep 3 until job.reload.completed?
 
         duration = job.completed_at - job.started_at
-        {
+        {
+          count: count,
+          duration: duration,
+          records_per_second: (count.to_f / duration).round(3),
+          workers: workers,
+          servers: servers,
+          compress: compress,
+          encrypt: encrypt
+        }
       end
 
       # Export the Results hash to a CSV file
@@ -60,7 +71,8 @@ module RocketJob
         o.on("-m", "--mongo MONGO_CONFIG_FILE_NAME", "Location of mongoid.yml config file") do |arg|
           self.mongo_config = arg
         end
-        o.on("-e", "--environment ENVIRONMENT",
+        o.on("-e", "--environment ENVIRONMENT",
+             "The environment to run the app on (Default: RAILS_ENV || RACK_ENV || development)") do |arg|
           self.environment = arg
         end
         o.on("-z", "--compress", "Turn on compression") do
```
data/lib/rocket_job/batch/statistics.rb CHANGED

```diff
@@ -2,7 +2,11 @@ require "active_support/concern"
 
 module RocketJob
   module Batch
-    # Allow statistics to be gathered while a batch job is running
+    # Allow statistics to be gathered while a batch job is running.
+    #
+    # Notes:
+    # - Statistics for successfully processed records within a slice are saved.
+    # - Statistics gathered during a perform that then results in an exception are discarded.
     module Statistics
       extend ActiveSupport::Concern
 
@@ -45,34 +49,52 @@ module RocketJob
           last = paths.pop
           return unless last
 
-
-
+          last_target = paths.inject(in_memory) do |target, sub_key|
+            target.key?(sub_key) ? target[sub_key] : target[sub_key] = Hash.new(0)
+          end
+          last_target[last] += increment
         end
       end
 
       included do
         field :statistics, type: Hash, default: -> { Hash.new(0) }
 
-        around_slice :
+        around_slice :rocket_job_statistics_capture
+        after_perform :rocket_job_statistics_commit
       end
 
       # Increment a statistic
       def statistics_inc(key, increment = 1)
        return if key.nil? || key == ""
 
-
-        @slice_statistics ||= Stats.new(new_record? ? statistics : nil)
-        key.is_a?(Hash) ? @slice_statistics.inc(key) : @slice_statistics.inc_key(key, increment)
+        (@rocket_job_perform_statistics ||= []) << (key.is_a?(Hash) ? key : [key, increment])
      end
 
      private
 
-
-
-
-        @slice_statistics = Stats.new(new_record? ? statistics : nil)
+      def rocket_job_statistics_capture
+        @rocket_job_perform_statistics = nil
+        @rocket_job_slice_statistics = nil
         yield
-
+      ensure
+        if @rocket_job_slice_statistics && !@rocket_job_slice_statistics.empty?
+          collection.update_one({_id: id}, {"$inc" => @rocket_job_slice_statistics.stats})
+        end
+      end
+
+      def rocket_job_slice_statistics
+        @rocket_job_slice_statistics ||= Stats.new(new_record? ? statistics : nil)
+      end
+
+      # Apply stats gathered during the perform to the slice level stats
+      def rocket_job_statistics_commit
+        return unless @rocket_job_perform_statistics
+
+        @rocket_job_perform_statistics.each do |key|
+          key.is_a?(Hash) ? rocket_job_slice_statistics.inc(key) : rocket_job_slice_statistics.inc_key(*key)
+        end
+
+        @rocket_job_perform_statistics = nil
       end
 
       # Overrides RocketJob::Batch::Logger#rocket_job_batch_log_payload
```
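`statistics_inc` remains the public entry point; in 6.0 increments are buffered per `perform` and only committed to the slice-level stats by the `after_perform :rocket_job_statistics_commit` hook above, which is why stats from a perform that raises are discarded. A hypothetical job using it (class name and keys are illustrative only):

```ruby
class InvoiceImportJob < RocketJob::Job
  include RocketJob::Batch
  include RocketJob::Batch::Statistics

  def perform(row)
    # Dotted keys build nested counters in the job's `statistics` hash.
    if row["amount"].to_f.negative?
      statistics_inc("invoices.credits")
    else
      statistics_inc("invoices.debits", 1)
    end
    row
  end
end
```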
data/lib/rocket_job/batch/throttle_running_workers.rb CHANGED

```diff
@@ -37,15 +37,11 @@ module RocketJob
       validates :throttle_running_workers, numericality: {greater_than_or_equal_to: 0}, allow_nil: true
 
       define_batch_throttle :throttle_running_workers_exceeded?, filter: :throttle_filter_id
-
-      # Deprecated. For backward compatibility.
-      alias_method :throttle_running_slices, :throttle_running_workers
-      alias_method :throttle_running_slices=, :throttle_running_workers=
     end
 
     private
 
-    # Returns [
+    # Returns [true|false] whether the throttle for this job has been exceeded
     def throttle_running_workers_exceeded?(slice)
       return false unless throttle_running_workers&.positive?
 
@@ -57,7 +53,7 @@ module RocketJob
     # Allows another job with a higher priority to start even though this one is running already
     # @overrides RocketJob::Plugins::Job::ThrottleRunningJobs#throttle_running_jobs_base_query
     def throttle_running_jobs_base_query
-      query
+      query = super
       query[:priority.lte] = priority if throttle_running_workers&.positive?
       query
     end
```
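Note that the deprecated `throttle_running_slices` aliases are removed in 6.0, so jobs must reference `throttle_running_workers` directly. A hypothetical batch job capping per-job concurrency, assuming the usual class-level setter that Rocket Job fields expose (job class and limit are placeholders):

```ruby
class NightlyExportJob < RocketJob::Job
  include RocketJob::Batch

  # Allow at most 5 workers to process slices of any one instance of this job.
  self.throttle_running_workers = 5

  def perform(record)
    record
  end
end
```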
data/lib/rocket_job/batch/worker.rb CHANGED

```diff
@@ -23,9 +23,6 @@ module RocketJob
     #
     # Slices are destroyed after their records are successfully processed
     #
-    # Results are stored in the output collection if `collect_output?`
-    # `nil` results from workers are kept if `collect_nil_output`
-    #
     # If an exception was thrown the entire slice of records is marked as failed.
     #
     # Thread-safe, can be called by multiple threads at the same time
@@ -40,7 +37,8 @@ module RocketJob
 
       SemanticLogger.named_tagged(job: id.to_s) do
         until worker.shutdown?
-
+          slice = input.next_slice(worker.name)
+          if slice
             # Grab a slice before checking the throttle to reduce concurrency race condition.
             return true if slice.fail_on_exception!(re_raise_exceptions) { rocket_job_batch_throttled?(slice, worker) }
             next if slice.failed?
@@ -69,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
 
@@ -97,8 +97,8 @@ module RocketJob
       servers = []
       case sub_state
       when :before, :after
-
-        servers << ActiveWorker.new(worker_name, started_at, self)
+        if running? && (server_name.nil? || worker_on_server?(server_name))
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running
@@ -143,19 +143,23 @@ module RocketJob
 
     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
-
-
-
-
-
-
-
-
-
-
-
-
+      slice.processing_record_number ||= 0
+      append = false
+
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end
 
+      count = 0
+      RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
         records.each do |record|
           slice.processing_record_number += 1
           SemanticLogger.named_tagged(record: slice.current_record_number) do
@@ -174,8 +178,8 @@ module RocketJob
       return block_given? ? yield(record) : perform(record) if _perform_callbacks.empty?
 
       # @rocket_job_input and @rocket_job_output can be modified by before/around callbacks
-      @rocket_job_input
-      @rocket_job_output
+      @rocket_job_input = record
+      @rocket_job_output = nil
 
       run_callbacks(:perform) do
         @rocket_job_output =
@@ -186,9 +190,9 @@ module RocketJob
         end
       end
 
-      @rocket_job_input
-      result
-      @rocket_job_output
+      @rocket_job_input = nil
+      result = @rocket_job_output
+      @rocket_job_output = nil
       result
     end
 
@@ -244,7 +248,7 @@ module RocketJob
       unless new_record?
         # Fail job iff no other worker has already finished it
         # Must set write concern to at least 1 since we need the nModified back
-        result
+        result = self.class.with(write: {w: 1}) do |query|
           query.
             where(id: id, state: :running, sub_state: :processing).
             update({"$set" => {state: :failed, worker_name: worker_name}})
@@ -305,11 +309,12 @@ module RocketJob
     # Run Batch before and after callbacks
     def rocket_job_batch_callbacks(worker)
       # If this is the first worker to pickup this job
-
+      case sub_state
+      when :before
         rocket_job_batch_run_before_callbacks
         # Check for 0 record jobs
         rocket_job_batch_complete?(worker.name) if running?
-
+      when sub_state == :after
         rocket_job_batch_run_after_callbacks
       end
     end
```
data/lib/rocket_job/batch.rb CHANGED

```diff
@@ -7,6 +7,8 @@ require "rocket_job/batch/state_machine"
 require "rocket_job/batch/throttle"
 require "rocket_job/batch/throttle_running_workers"
 require "rocket_job/batch/worker"
+# Ensure after_perform is run first and #upload override is after IO#upload is defined.
+require "rocket_job/batch/categories"
 
 module RocketJob
   module Batch
@@ -17,6 +19,7 @@ module RocketJob
     include Callbacks
     include Logger
     include Worker
+    include Categories
     include Throttle
     include ThrottleRunningWorkers
     include IO
@@ -27,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
```
data/lib/rocket_job/category/base.rb ADDED

```diff
@@ -0,0 +1,81 @@
+require "active_support/concern"
+
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    module Base
+      extend ActiveSupport::Concern
+
+      included do
+        field :name, type: ::Mongoid::StringifiedSymbol, default: :main
+
+        # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
+        field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
+
+        # The header columns when the file does not include a header row.
+        # Note:
+        # - All column names must be strings so that it can be serialized into MongoDB.
+        field :columns, type: Array
+
+        # On an input collection `format` specifies the format of the input data so that it can be
+        # transformed into a Hash when passed into the `#perform` method.
+        #
+        # On an output collection `format` specifies the format to transform the output hash into.
+        #
+        # `:auto` it uses the `file_name` on this category to determine the format.
+        # `nil` no transformation is performed on the data returned by the `#perform` method.
+        # Any other format supported by IOStreams, for example: csv, :hash, :array, :json, :psv, :fixed
+        #
+        # Default: `nil`
+        field :format, type: ::Mongoid::StringifiedSymbol
+        validates_inclusion_of :format, in: [nil, :auto] + IOStreams::Tabular.registered_formats
+
+        # Any specialized format specific options. For example, `:fixed` format requires a `:layout`.
+        field :format_options, type: Hash
+
+        # When `:format` is not supplied the file name can be used to infer the required format.
+        # Optional.
+        # Default: nil
+        field :file_name, type: IOStreams::Path
+      end
+
+      # Return which slice serializer class to use that matches the current options.
+      def serializer_class
+        case serializer
+        when :none
+          Sliced::Slice
+        when :compress
+          Sliced::CompressedSlice
+        when :encrypt
+          Sliced::EncryptedSlice
+        when :bzip2, :bz2
+          Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
+        else
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
+        end
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns: columns,
+          format: format == :auto ? nil : format,
+          format_options: format_options&.deep_symbolize_keys,
+          file_name: file_name
+        )
+      end
+
+      # Returns [true|false] whether this category has the attributes defined for tabular to work.
+      def tabular?
+        format.present?
+      end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
+    end
+  end
+end
```
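The `serializer` field on a category selects the slice storage class via `serializer_class` above. A hypothetical standalone illustration of that mapping, using the `Input` category defined in the next added file (instantiating a category outside a job is for demonstration only):

```ruby
# serializer value        => slice class used to store slices
#   :none                 => RocketJob::Sliced::Slice
#   :compress (default)   => RocketJob::Sliced::CompressedSlice
#   :encrypt              => RocketJob::Sliced::EncryptedSlice
#   :bzip2 / :bz2         => RocketJob::Sliced::BZip2OutputSlice
#   :encrypted_bz2        => RocketJob::Sliced::EncryptedBZip2OutputSlice
category = RocketJob::Category::Input.new(serializer: :encrypt)
category.serializer_class # => RocketJob::Sliced::EncryptedSlice

category = RocketJob::Category::Input.new # serializer defaults to :compress
category.serializer_class # => RocketJob::Sliced::CompressedSlice
```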
data/lib/rocket_job/category/input.rb ADDED

```diff
@@ -0,0 +1,170 @@
+module RocketJob
+  module Category
+    # Define the layout for each category of input or output data
+    class Input
+      include SemanticLogger::Loggable
+      include Plugins::Document
+      include Category::Base
+
+      embedded_in :job, class_name: "RocketJob::Job", inverse_of: :input_categories
+
+      # Slice size for this input collection
+      field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
+
+      #
+      # The fields below only apply if the field `format` has been set:
+      #
+
+      # List of columns to allow.
+      # Default: nil ( Allow all columns )
+      # Note:
+      #   When supplied any columns that are rejected will be returned in the cleansed columns
+      #   as nil so that they can be ignored during processing.
+      field :allowed_columns, type: Array
+
+      # List of columns that must be present, otherwise an Exception is raised.
+      field :required_columns, type: Array
+
+      # Whether to skip unknown columns in the uploaded file.
+      # Ignores any column that was not found in the `allowed_columns` list.
+      #
+      # false:
+      #   Raises IOStreams::Tabular::InvalidHeader when a column is supplied that is not in `allowed_columns`.
+      # true:
+      #   Ignore additional columns in a file that are not listed in `allowed_columns`
+      #   Job processing will skip the additional columns entirely as if they were not supplied at all.
+      #   A warning is logged with the names of the columns that were ignored.
+      #   The `columns` field will list all skipped columns with a nil value so that downstream workers
+      #   know to ignore those columns.
+      #
+      # Notes:
+      # - Only applicable when `allowed_columns` has been set.
+      # - Recommended to leave as `false` otherwise a misspelled column can result in missed columns.
+      field :skip_unknown, type: ::Mongoid::Boolean, default: false
+      validates_inclusion_of :skip_unknown, in: [true, false]
+
+      # When `#upload` is called with a file_name, it uploads the file using any of the following approaches:
+      # :line
+      #   Uploads the file a line (String) at a time for processing by workers.
+      #   This is the default behavior and is the most performant since it leaves the parsing of each line
+      #   up to the workers themselves.
+      # :array
+      #   Parses each line from the file as an Array and uploads each array for processing by workers.
+      #   Every line in the input file is parsed and converted into an array before uploading.
+      #   This approach ensures that the entire files is valid before starting to process it.
+      #   Ideal for when files may contain invalid lines.
+      #   Not recommended for large files since the CSV or other parsing is performed sequentially during the
+      #   upload process.
+      # :hash
+      #   Parses each line from the file into a Hash and uploads each hash for processing by workers.
+      #   Similar to :array above in that the entire file is parsed before processing is started.
+      #   Slightly less efficient than :array since it stores every record as a hash with both the key and value.
+      #
+      # Recommend using :array when the entire file must be parsed/validated before processing is started, and
+      #   upload time is not important.
+      # See IOStreams#each for more details.
+      field :mode, type: ::Mongoid::StringifiedSymbol, default: :line
+      validates_inclusion_of :mode, in: %i[line array hash]
+
+      # When reading tabular input data (e.g. CSV, PSV) the header is automatically cleansed.
+      # This removes issues when the input header varies in case and other small ways. See IOStreams::Tabular
+      # Currently Supported:
+      #   :default
+      #     Each column is cleansed as follows:
+      #     - Leading and trailing whitespace is stripped.
+      #     - All characters converted to lower case.
+      #     - Spaces and '-' are converted to '_'.
+      #     - All characters except for letters, digits, and '_' are stripped.
+      #   :none
+      #     Do not cleanse the columns names supplied in the header row.
+      #
+      # Note: Submit a ticket if you have other cleansers that you want added.
+      field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
+      validates :header_cleanser, inclusion: %i[default none]
+
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
+
+      # Cleanses the header column names when `cleanse_header` is true
+      def cleanse_header!
+        return unless header_cleanser == :default
+
+        ignored_columns = tabular.header.cleanse!
+        logger.warn("Stripped out invalid columns from custom header", ignored_columns) unless ignored_columns.empty?
+
+        self.columns = tabular.header.columns
+      end
+
+      def tabular
+        @tabular ||= IOStreams::Tabular.new(
+          columns: columns,
+          format: format == :auto ? nil : format,
+          format_options: format_options&.deep_symbolize_keys,
+          file_name: file_name,
+          allowed_columns: allowed_columns,
+          required_columns: required_columns,
+          skip_unknown: skip_unknown
+        )
+      end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class: serializer_class,
+          slice_size: slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = category.tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        end
+      end
+    end
+  end
+end
```
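To tie the pieces together, a hypothetical 6.0-style upload that exercises the input category fields above through the `job.input_category` accessor already seen in the performance script (job class, file name, and column names are placeholders):

```ruby
job = MyImportJob.new
job.input_category.format           = :csv   # or :auto to infer from the file name
job.input_category.slice_size       = 250
job.input_category.allowed_columns  = %w[id name email]
job.input_category.required_columns = %w[id]
job.input_category.skip_unknown     = true
job.input_category.mode             = :line  # :array / :hash parse each line during upload instead

# Upload the file; with a tabular format the header row is parsed and
# cleansed according to header_cleanser before slices are created.
job.upload("users.csv")
job.save!
```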