rocketjob 6.0.0.rc1 → 6.0.1

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +25 -18
  4. data/lib/rocket_job/batch/io.rb +130 -130
  5. data/lib/rocket_job/batch/performance.rb +2 -2
  6. data/lib/rocket_job/batch/statistics.rb +2 -2
  7. data/lib/rocket_job/batch/throttle_running_workers.rb +1 -1
  8. data/lib/rocket_job/batch/worker.rb +14 -12
  9. data/lib/rocket_job/batch.rb +0 -1
  10. data/lib/rocket_job/category/base.rb +10 -7
  11. data/lib/rocket_job/category/input.rb +61 -1
  12. data/lib/rocket_job/category/output.rb +9 -0
  13. data/lib/rocket_job/cli.rb +1 -1
  14. data/lib/rocket_job/dirmon_entry.rb +1 -1
  15. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  16. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  17. data/lib/rocket_job/job_exception.rb +1 -1
  18. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  19. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  20. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  21. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  22. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  23. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -97
  24. data/lib/rocket_job/jobs/upload_file_job.rb +6 -3
  25. data/lib/rocket_job/lookup_collection.rb +4 -3
  26. data/lib/rocket_job/plugins/cron.rb +60 -20
  27. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  28. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  29. data/lib/rocket_job/plugins/restart.rb +3 -110
  30. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  31. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +43 -0
  32. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  33. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  34. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  35. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  36. data/lib/rocket_job/sliced/input.rb +42 -54
  37. data/lib/rocket_job/sliced/slice.rb +7 -3
  38. data/lib/rocket_job/sliced/slices.rb +12 -9
  39. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  40. data/lib/rocket_job/sliced/writer/output.rb +0 -1
  41. data/lib/rocket_job/sliced.rb +1 -19
  42. data/lib/rocket_job/throttle_definitions.rb +7 -1
  43. data/lib/rocket_job/version.rb +1 -1
  44. data/lib/rocketjob.rb +4 -5
  45. metadata +12 -12
  46. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  47. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  48. data/lib/rocket_job/batch/tabular.rb +0 -58
data/lib/rocket_job/batch/worker.rb

@@ -67,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before

@@ -96,7 +98,7 @@ module RocketJob
       case sub_state
       when :before, :after
         if running? && (server_name.nil? || worker_on_server?(server_name))
-          servers << ActiveWorker.new(worker_name, started_at, self) if running?
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running

@@ -142,19 +144,19 @@ module RocketJob
     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
       slice.processing_record_number ||= 0
-      records = []
       append = false

-      # Skip processed records in this slice if it has no output categpries.
-      if slice.processing_record_number > 1
-        records = slice.records[slice.processing_record_number - 1..-1]
-        append = true
-        logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
-      else
-        # Reprocess all records in this slice.
-        slice.processing_record_number = 0
-        records = slice.records
-      end
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end

       count = 0
       RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
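
The rewrite of `rocket_job_perform_slice` folds the two assignment branches into a single `records =` expression and guards against a nil `processing_record_number` with `to_i`. A minimal standalone sketch of the resume semantics, using a plain array and counter in place of the gem's slice object (names here are illustrative, not part of the Rocket Job API):

    # processing_record_number is the 1-based number of the record that was in
    # flight when processing stopped; a value > 1 means earlier records finished.
    def records_to_process(records, processing_record_number)
      if processing_record_number.to_i > 1
        # Resume from the in-flight record, skipping the completed ones.
        records[(processing_record_number - 1)..-1]
      else
        # Nothing completed yet: process the whole slice.
        records
      end
    end

    records_to_process(%w[a b c d], 3)   # => ["c", "d"]
    records_to_process(%w[a b c d], nil) # => ["a", "b", "c", "d"]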
data/lib/rocket_job/batch.rb

@@ -30,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
data/lib/rocket_job/category/base.rb

@@ -11,7 +11,6 @@ module RocketJob

       # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
       field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
-      validates_inclusion_of :serializer, in: [:none, :compress, :encrypt, :bzip2]

       # The header columns when the file does not include a header row.
       # Note:

@@ -49,10 +48,12 @@ module RocketJob
           Sliced::CompressedSlice
         when :encrypt
           Sliced::EncryptedSlice
-        when :bzip2
+        when :bzip2, :bz2
           Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
         else
-          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
         end
       end

@@ -65,14 +66,16 @@ module RocketJob
         )
       end

-      def reset_tabular
-        @tabular = nil
-      end
-
       # Returns [true|false] whether this category has the attributes defined for tabular to work.
       def tabular?
        format.present?
       end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
     end
   end
 end
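
Together these changes let a category derive both its slice class and its backing MongoDB collection. A hedged illustration of what the two helpers return (the `job` instance and its id are assumed, not shown):

    category = RocketJob::Category::Output.new(name: :errors, serializer: :encrypted_bz2)
    category.serializer_class
    # => RocketJob::Sliced::EncryptedBZip2OutputSlice

    # Assuming `job` is a persisted job instance:
    category.build_collection_name(:output, job)
    # => "rocket_job.outputs.<job.id>.errors"  (".errors" suffix only because name != :main)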
data/lib/rocket_job/category/input.rb

@@ -10,6 +10,7 @@ module RocketJob

       # Slice size for this input collection
       field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size

       #
       # The fields below only apply if the field `format` has been set:

@@ -82,7 +83,7 @@ module RocketJob
       field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
       validates :header_cleanser, inclusion: %i[default none]

-      validates_presence_of :slice_size
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]

       # Cleanses the header column names when `cleanse_header` is true
       def cleanse_header!

@@ -105,6 +106,65 @@ module RocketJob
           skip_unknown: skip_unknown
         )
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class: serializer_class,
+          slice_size: slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = category.tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        end
+      end
     end
   end
 end
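
`upload_path` is the piece that makes `format: :auto` work end to end. A hedged sketch of its effect when uploading a gzipped CSV (the file name and `job` are illustrative):

    category = job.input_category        # format defaults to :auto
    category.file_name = "customers.csv.gz"

    path = category.upload_path
    category.format
    # => :csv, detected from the file name by IOStreams (via `path.format` above).
    # Because the category is now tabular, the returned path is also wrapped to
    # strip non-printable characters; :fixed formats replace them with spaces
    # instead, so that line lengths are preserved.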
data/lib/rocket_job/category/output.rb

@@ -13,6 +13,8 @@ module RocketJob
       #   false: do not save nil values to the output categories.
       field :nils, type: ::Mongoid::Boolean, default: false

+      validates_inclusion_of :serializer, in: %i[none compress encrypt bz2 encrypted_bz2 bzip2]
+
       # Renders [String] the header line.
       # Returns [nil] if no header is needed.
       def render_header

@@ -20,6 +22,13 @@ module RocketJob

         tabular.render_header
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Output.new(
+          collection_name: build_collection_name(:output, job),
+          slice_class: serializer_class
+        )
+      end
     end
   end
 end
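
Note the asymmetry with the input category above: inputs now only accept `:none`, `:compress`, or `:encrypt`, while the bzip2 variants remain output-only. A hedged sketch of a batch job using the new `:encrypted_bz2` output serializer (the class name and perform body are illustrative; the category DSL matches the ConversionJob further down):

    class ExportJob < RocketJob::Job
      include RocketJob::Batch

      input_category slice_size: 1_000
      # Output slices are bzip2-compressed first, then encrypted at rest.
      output_category serializer: :encrypted_bz2

      def perform(record)
        record
      end
    end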
data/lib/rocket_job/cli.rb

@@ -233,7 +233,7 @@ module RocketJob

   # Parse command line options placing results in the corresponding instance variables
   def parse(argv)
-    parser = OptionParser.new do |o|
+    parser = OptionParser.new do |o|
       o.on("-n", "--name NAME", "Unique Name of this server (Default: host_name:PID)") do |arg|
         Config.name = arg
       end
data/lib/rocket_job/dirmon_entry.rb

@@ -173,7 +173,7 @@ module RocketJob
     counts
   end

-  # Passes each filename [Pathname] found that matches the pattern into the supplied block
+  # Yields [IOStreams::Path] for each file found that matches the current pattern.
   def each
     SemanticLogger.named_tagged(dirmon_entry: id.to_s) do
       # Case insensitive filename matching
data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb

@@ -4,8 +4,8 @@ module Mongoid
   class Mongo
     def initialize(criteria)
       @criteria = criteria
-      @klass = criteria.klass
-      @cache = criteria.options[:cache]
+      @klass = criteria.klass
+      @cache = criteria.options[:cache]
       # Only line changed is here, get collection name from criteria, not @klass
       # @collection = @klass.collection
       @collection = criteria.collection
data/lib/rocket_job/extensions/rocket_job_adapter.rb

@@ -55,13 +55,13 @@ module ActiveJob
   # - Completed jobs will not appear in completed since the Active Job adapter
   #   uses the default Rocket Job `destroy_on_completion` of `false`.
   class RocketJobAdapter
-    def self.enqueue(active_job) #:nodoc:
+    def self.enqueue(active_job)
       job = RocketJob::Jobs::ActiveJob.create!(active_job_params(active_job))
       active_job.provider_job_id = job.id.to_s if active_job.respond_to?(:provider_job_id=)
       job
     end

-    def self.enqueue_at(active_job, timestamp) #:nodoc:
+    def self.enqueue_at(active_job, timestamp)
       params = active_job_params(active_job)
       params[:run_at] = Time.at(timestamp).utc

data/lib/rocket_job/job_exception.rb

@@ -23,7 +23,7 @@ module RocketJob
       new(
         args.merge(
           class_name: exc.class.name,
-          message: exc.message,
+          message: exc.message.to_s.encode("UTF-8", replace: ""),
           backtrace: exc.backtrace || []
         )
       )
data/lib/rocket_job/jobs/conversion_job.rb

@@ -0,0 +1,43 @@
+# Convert to and from CSV, JSON, xlsx, and PSV files.
+#
+# Example: Convert a CSV file to JSON.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name = "data.csv"
+#   job.output_category.file_name = "data.json"
+#   job.save!
+#
+# Example: Convert a JSON file to PSV and compress it with GZip.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name = "data.json"
+#   job.output_category.file_name = "data.psv.gz"
+#   job.save!
+#
+# Example: Read a zipped CSV file from a remote website and convert it to a GZipped JSON file.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name = "https://example.org/file.zip"
+#   job.output_category.file_name = "data.json.gz"
+#   job.save!
+#
+module RocketJob
+  module Jobs
+    class ConversionJob < RocketJob::Job
+      include RocketJob::Batch
+
+      self.destroy_on_complete = false
+
+      # Detects file extension for its type
+      input_category format: :auto
+      output_category format: :auto
+
+      # Upload the file specified in `input_category.file_name` unless already uploaded.
+      before_batch :upload, unless: :record_count
+
+      # When the job completes it will write the result to `output_category.file_name`.
+      after_batch :cleanup!, :download
+
+      def perform(hash)
+        hash
+      end
+    end
+  end
+end
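
The examples in the file header queue the job for a Rocket Job server to pick up. For a quick one-off conversion, the job can also be run inline; a hedged sketch (assumes a configured MongoDB connection and a local `data.csv`):

    job = RocketJob::Jobs::ConversionJob.new
    job.input_category.file_name = "data.csv"
    job.output_category.file_name = "data.json"
    # Runs the upload, perform, and download phases in the current process,
    # without waiting for a worker.
    job.perform_now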
data/lib/rocket_job/jobs/dirmon_job.rb

@@ -30,59 +30,48 @@ module RocketJob
   #
   # If another DirmonJob instance is already queued or running, then the create
   # above will fail with:
-  #   MongoMapper::DocumentNotValid: Validation failed: State Another instance of this job is already queued or running
+  #   Validation failed: State Another instance of this job is already queued or running
   #
   # Or to start DirmonJob and ignore errors if already running
   #   RocketJob::Jobs::DirmonJob.create
   class DirmonJob < RocketJob::Job
-    # Only allow one DirmonJob instance to be running at a time
-    include RocketJob::Plugins::Singleton
-    # Start a new job when this one completes, fails, or aborts
-    include RocketJob::Plugins::Restart
+    include RocketJob::Plugins::Cron

-    self.priority = 30
-
-    # Number of seconds between directory scans. Default 5 mins
-    field :check_seconds, type: Float, default: 300.0, copy_on_restart: true
+    # Runs every 5 minutes by default
+    self.cron_schedule = "*/5 * * * * UTC"
+    self.description = "Directory Monitor"
+    self.priority = 30

     # Hash[file_name, size]
     field :previous_file_names, type: Hash, default: {}, copy_on_restart: true

-    before_create :set_run_at
-
-    # Iterate over each Dirmon entry looking for new files
-    # If a new file is found, it is not processed immediately, instead
-    # it is passed to the next run of this job along with the file size.
-    # If the file size has not changed, the Job is kicked off.
+    # Checks the directories for new files, starting jobs if files have not changed since the last run.
    def perform
      check_directories
    end

    private

-    # Set a run_at when a new instance of this job is created
-    def set_run_at
-      self.run_at = Time.now + check_seconds
-    end
-
-    # Checks the directories for new files, starting jobs if files have not changed
-    # since the last run
+    # Iterate over each Dirmon Entry looking for new files
+    # If a new file is found, it is not processed immediately, instead
+    # it is passed to the next run of this job along with the file size.
+    # If the file size has not changed, the Job is kicked off.
    def check_directories
      new_file_names = {}
-      DirmonEntry.enabled.each do |entry|
-        entry.each do |iopath|
-          # S3 files are only visible once completely uploaded.
-          unless iopath.partial_files_visible?
-            logger.info("File: #{iopath}. Starting: #{entry.job_class_name}")
-            entry.later(iopath)
+      DirmonEntry.enabled.each do |dirmon_entry|
+        dirmon_entry.each do |path|
+          # Skip file size checking since S3 files are only visible once completely uploaded.
+          unless path.partial_files_visible?
+            logger.info("File: #{path}. Starting: #{dirmon_entry.job_class_name}")
+            dirmon_entry.later(path)
            next
          end

          # BSON Keys cannot contain periods
-          key = iopath.to_s.tr(".", "_")
+          key = path.to_s.tr(".", "_")
          previous_size = previous_file_names[key]
          # Check every few minutes for a file size change before trying to process the file.
-          size = check_file(entry, iopath, previous_size)
+          size = check_file(dirmon_entry, path, previous_size)
          new_file_names[key] = size if size
        end
      end

@@ -91,14 +80,14 @@ module RocketJob

    # Checks if a file should result in starting a job
    # Returns [Integer] file size, or nil if the file started a job
-    def check_file(entry, iopath, previous_size)
-      size = iopath.size
+    def check_file(dirmon_entry, path, previous_size)
+      size = path.size
      if previous_size && (previous_size == size)
-        logger.info("File stabilized: #{iopath}. Starting: #{entry.job_class_name}")
-        entry.later(iopath)
+        logger.info("File stabilized: #{path}. Starting: #{dirmon_entry.job_class_name}")
+        dirmon_entry.later(path)
        nil
      else
-        logger.info("Found file: #{iopath}. File size: #{size}")
+        logger.info("Found file: #{path}. File size: #{size}")
        # Keep for the next run
        size
      end
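
With the move from the Singleton/Restart pair to `Plugins::Cron`, the scan interval is now a cron expression rather than `check_seconds`. A hedged sketch of starting the monitor on a tighter, one-minute schedule:

    # Default is "*/5 * * * * UTC"; cron_schedule can be overridden at create time.
    RocketJob::Jobs::DirmonJob.create!(cron_schedule: "* * * * * UTC")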
data/lib/rocket_job/jobs/housekeeping_job.rb

@@ -27,12 +27,11 @@ module RocketJob
   # )
   class HousekeepingJob < RocketJob::Job
     include RocketJob::Plugins::Cron
-    include RocketJob::Plugins::Singleton

-    self.priority = 25
-    self.description = "Cleans out historical jobs, and zombie servers."
-    # Runs every 15 minutes
-    self.cron_schedule = "*/15 * * * * UTC"
+    # Runs every 15 minutes, on the quarter hour
+    self.cron_schedule = "0,15,30,45 * * * * UTC"
+    self.description = "Cleans out historical jobs, and zombie servers."
+    self.priority = 25

     # Whether to destroy zombie servers automatically
     field :destroy_zombies, type: Mongoid::Boolean, default: true, user_editable: true, copy_on_restart: true
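
`destroy_zombies` stays user-editable, so zombie cleanup can still be disabled per instance. A hedged sketch:

    # Housekeeping with zombie-server destruction turned off; the new default
    # schedule fires at :00, :15, :30, and :45 past each hour.
    RocketJob::Jobs::HousekeepingJob.create!(destroy_zombies: false)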
data/lib/rocket_job/jobs/on_demand_batch_job.rb

@@ -65,27 +65,29 @@ module RocketJob
   module Jobs
     class OnDemandBatchJob < RocketJob::Job
       include RocketJob::Plugins::Cron
+      include RocketJob::Plugins::Retry
       include RocketJob::Batch
       include RocketJob::Batch::Statistics

       self.priority = 90
-      self.description = "Batch Job"
+      self.description = "On Demand Batch Job"
       self.destroy_on_complete = false
+      self.retry_limit = 0

       # Code that is performed against every row / record.
-      field :code, type: String
+      field :code, type: String, user_editable: true, copy_on_restart: true

       # Optional code to execute before the batch is run.
       # Usually to upload data into the job.
-      field :before_code, type: String
+      field :before_code, type: String, user_editable: true, copy_on_restart: true

       # Optional code to execute after the batch is run.
       # Usually to upload data into the job.
-      field :after_code, type: String
+      field :after_code, type: String, user_editable: true, copy_on_restart: true

       # Data that is made available to the job during the perform.
       # Be sure to store key names only as Strings, not Symbols.
-      field :data, type: Hash, default: {}
+      field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true

       validates :code, presence: true
       validate :validate_code

@@ -96,12 +98,14 @@ module RocketJob
       before_batch :run_before_code
       after_batch :run_after_code

-      # Make this job collect its output
-      # :nils [true|false]
-      #   Whether to skip the output from `code` when it is nil
-      #   Default: false
-      def collect_output(nils: false)
-        self.output_categories = [RocketJob::Category::Output.new(nils: nils)]
+      # Shortcut for setting the slice_size
+      def slice_size=(slice_size)
+        input_category.slice_size = slice_size
+      end
+
+      # Add a new output category and collect output for it.
+      def add_output_category(**args)
+        self.output_categories << RocketJob::Category::Output.new(**args)
       end

       private
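
`collect_output` is replaced by `add_output_category`, and slice size gains a direct setter. A hedged sketch of the replacement usage (the `code` string is illustrative; per the field comment above, it runs against every row):

    job = RocketJob::Jobs::OnDemandBatchJob.new(
      description: "Double every record",
      code:        "row * 2"
    )
    job.slice_size = 500                  # forwards to input_category.slice_size
    job.add_output_category(nils: false)  # was: job.collect_output(nils: false)
    job.save!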
data/lib/rocket_job/jobs/on_demand_job.rb

@@ -78,8 +78,8 @@ module RocketJob
   self.retry_limit = 0

   # Be sure to store key names only as Strings, not Symbols
-  field :data, type: Hash, default: {}, copy_on_restart: true
-  field :code, type: String, copy_on_restart: true
+  field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true
+  field :code, type: String, user_editable: true, copy_on_restart: true

   validates :code, presence: true
   validate :validate_code
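
`user_editable: true` recurs throughout this release: it exposes a field for editing from Rocket Job Mission Control, while `copy_on_restart: true` carries the value into the next scheduled instance of a cron job. A hedged sketch of an on-demand job that relies on both (the code body and data are illustrative):

    job = RocketJob::Jobs::OnDemandJob.create!(
      description: "Nightly cleanup",
      code:        %q(logger.info("Cleanup ran for region #{data['region']}")),
      data:        {"region" => "us-east-1"}
    )
    # Operators can now edit `code` and `data` from Mission Control, and the
    # edited values carry over to subsequent runs via copy_on_restart.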