rocketjob 6.0.0.rc1 → 6.0.1

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +164 -8
  3. data/lib/rocket_job/batch/categories.rb +25 -18
  4. data/lib/rocket_job/batch/io.rb +130 -130
  5. data/lib/rocket_job/batch/performance.rb +2 -2
  6. data/lib/rocket_job/batch/statistics.rb +2 -2
  7. data/lib/rocket_job/batch/throttle_running_workers.rb +1 -1
  8. data/lib/rocket_job/batch/worker.rb +14 -12
  9. data/lib/rocket_job/batch.rb +0 -1
  10. data/lib/rocket_job/category/base.rb +10 -7
  11. data/lib/rocket_job/category/input.rb +61 -1
  12. data/lib/rocket_job/category/output.rb +9 -0
  13. data/lib/rocket_job/cli.rb +1 -1
  14. data/lib/rocket_job/dirmon_entry.rb +1 -1
  15. data/lib/rocket_job/extensions/mongoid/contextual/mongo.rb +2 -2
  16. data/lib/rocket_job/extensions/rocket_job_adapter.rb +2 -2
  17. data/lib/rocket_job/job_exception.rb +1 -1
  18. data/lib/rocket_job/jobs/conversion_job.rb +43 -0
  19. data/lib/rocket_job/jobs/dirmon_job.rb +24 -35
  20. data/lib/rocket_job/jobs/housekeeping_job.rb +4 -5
  21. data/lib/rocket_job/jobs/on_demand_batch_job.rb +15 -11
  22. data/lib/rocket_job/jobs/on_demand_job.rb +2 -2
  23. data/lib/rocket_job/jobs/re_encrypt/relational_job.rb +103 -97
  24. data/lib/rocket_job/jobs/upload_file_job.rb +6 -3
  25. data/lib/rocket_job/lookup_collection.rb +4 -3
  26. data/lib/rocket_job/plugins/cron.rb +60 -20
  27. data/lib/rocket_job/plugins/job/persistence.rb +36 -0
  28. data/lib/rocket_job/plugins/job/throttle.rb +2 -2
  29. data/lib/rocket_job/plugins/restart.rb +3 -110
  30. data/lib/rocket_job/plugins/state_machine.rb +2 -2
  31. data/lib/rocket_job/plugins/throttle_dependent_jobs.rb +43 -0
  32. data/lib/rocket_job/sliced/bzip2_output_slice.rb +18 -19
  33. data/lib/rocket_job/sliced/compressed_slice.rb +3 -6
  34. data/lib/rocket_job/sliced/encrypted_bzip2_output_slice.rb +49 -0
  35. data/lib/rocket_job/sliced/encrypted_slice.rb +4 -6
  36. data/lib/rocket_job/sliced/input.rb +42 -54
  37. data/lib/rocket_job/sliced/slice.rb +7 -3
  38. data/lib/rocket_job/sliced/slices.rb +12 -9
  39. data/lib/rocket_job/sliced/writer/input.rb +46 -18
  40. data/lib/rocket_job/sliced/writer/output.rb +0 -1
  41. data/lib/rocket_job/sliced.rb +1 -19
  42. data/lib/rocket_job/throttle_definitions.rb +7 -1
  43. data/lib/rocket_job/version.rb +1 -1
  44. data/lib/rocketjob.rb +4 -5
  45. metadata +12 -12
  46. data/lib/rocket_job/batch/tabular/input.rb +0 -133
  47. data/lib/rocket_job/batch/tabular/output.rb +0 -67
  48. data/lib/rocket_job/batch/tabular.rb +0 -58
@@ -67,6 +67,8 @@ module RocketJob
     # Returns [Integer] the number of records processed in the slice
     #
     # Note: The slice will be removed from processing when this method completes
+    #
+    # @deprecated Please open a ticket if you need this behavior.
     def work_first_slice(&block)
       raise "#work_first_slice can only be called from within before_batch callbacks" unless sub_state == :before
 
@@ -96,7 +98,7 @@ module RocketJob
       case sub_state
       when :before, :after
         if running? && (server_name.nil? || worker_on_server?(server_name))
-          servers << ActiveWorker.new(worker_name, started_at, self) if running?
+          servers << ActiveWorker.new(worker_name, started_at, self)
         end
       when :processing
         query = input.running
@@ -142,19 +144,19 @@ module RocketJob
     # Perform individual slice without callbacks
     def rocket_job_perform_slice(slice, &block)
       slice.processing_record_number ||= 0
-      records = []
       append = false
 
-      # Skip processed records in this slice if it has no output categpries.
-      if slice.processing_record_number > 1
-        records = slice.records[slice.processing_record_number - 1..-1]
-        append = true
-        logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
-      else
-        # Reprocess all records in this slice.
-        slice.processing_record_number = 0
-        records = slice.records
-      end
+      # Skip processed records in this slice if it has no output categories.
+      records =
+        if slice.processing_record_number.to_i > 1
+          append = true
+          logger.info("Resuming previously incomplete slice from record number #{slice.processing_record_number}")
+          slice.records[slice.processing_record_number - 1..-1]
+        else
+          # Reprocess all records in this slice.
+          slice.processing_record_number = 0
+          slice.records
+        end
 
       count = 0
       RocketJob::Sliced::Writer::Output.collect(self, input_slice: slice, append: append) do |writer|
@@ -30,6 +30,5 @@ module RocketJob
     autoload :ThrottleWindows, "rocket_job/batch/throttle_windows"
     autoload :Result, "rocket_job/batch/result"
     autoload :Results, "rocket_job/batch/results"
-    autoload :Tabular, "rocket_job/batch/tabular"
   end
 end
@@ -11,7 +11,6 @@ module RocketJob
 
       # Whether to compress, encrypt, or use the bzip2 serialization for data in this category.
       field :serializer, type: ::Mongoid::StringifiedSymbol, default: :compress
-      validates_inclusion_of :serializer, in: [:none, :compress, :encrypt, :bzip2]
 
       # The header columns when the file does not include a header row.
       # Note:
@@ -49,10 +48,12 @@ module RocketJob
           Sliced::CompressedSlice
         when :encrypt
           Sliced::EncryptedSlice
-        when :bzip2
+        when :bzip2, :bz2
           Sliced::BZip2OutputSlice
+        when :encrypted_bz2
+          Sliced::EncryptedBZip2OutputSlice
         else
-          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, or :bzip2")
+          raise(ArgumentError, "serialize: #{serializer.inspect} must be :none, :compress, :encrypt, :bz2, or :encrypted_bz2")
         end
       end
 
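For context, a sketch of opting into the new serializer from a job class. The `output_category` class-level DSL is assumed from the wider RocketJob 6 API (it appears in the ConversionJob added elsewhere in this release), not from this hunk:

    # Sketch: select the new encrypted bzip2 output serializer on a batch job.
    # Illustrative only; class and perform body are placeholders.
    class ExportJob < RocketJob::Job
      include RocketJob::Batch

      output_category serializer: :encrypted_bz2

      def perform(record)
        record
      end
    end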
@@ -65,14 +66,16 @@ module RocketJob
         )
       end
 
-      def reset_tabular
-        @tabular = nil
-      end
-
       # Returns [true|false] whether this category has the attributes defined for tabular to work.
       def tabular?
         format.present?
       end
+
+      def build_collection_name(direction, job)
+        collection_name = "rocket_job.#{direction}s.#{job.id}"
+        collection_name << ".#{name}" unless name == :main
+        collection_name
+      end
     end
   end
 end
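For reference, a sketch of the names this helper produces (job id shown as a placeholder; no suffix is appended for the default :main category):

    # Illustrative only: collection names built by build_collection_name.
    input_category.build_collection_name(:input, job)
    # => "rocket_job.inputs.<job.id>"           (name == :main)
    errors_category.build_collection_name(:output, job)
    # => "rocket_job.outputs.<job.id>.errors"   (name == :errors)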
@@ -10,6 +10,7 @@ module RocketJob
 
       # Slice size for this input collection
       field :slice_size, type: Integer, default: 100
+      validates_presence_of :slice_size
 
       #
       # The fields below only apply if the field `format` has been set:
@@ -82,7 +83,7 @@ module RocketJob
       field :header_cleanser, type: ::Mongoid::StringifiedSymbol, default: :default
       validates :header_cleanser, inclusion: %i[default none]
 
-      validates_presence_of :slice_size
+      validates_inclusion_of :serializer, in: %i[none compress encrypt]
 
       # Cleanses the header column names when `cleanse_header` is true
       def cleanse_header!
@@ -105,6 +106,65 @@ module RocketJob
           skip_unknown: skip_unknown
         )
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Input.new(
+          collection_name: build_collection_name(:input, job),
+          slice_class:     serializer_class,
+          slice_size:      slice_size
+        )
+      end
+
+      # Returns [IOStreams::Path] of file to upload.
+      # Auto-detects file format from file name when format is :auto.
+      def upload_path(stream = nil, original_file_name: nil)
+        unless stream || file_name
+          raise(ArgumentError, "Either supply a file name to upload, or set input_collection.file_name first")
+        end
+
+        path           = IOStreams.new(stream || file_name)
+        path.file_name = original_file_name if original_file_name
+        self.file_name = path.file_name
+
+        # Auto detect the format based on the upload file name if present.
+        if format == :auto
+          self.format = path.format || :csv
+          # Rebuild tabular with new values.
+          @tabular = nil
+        end
+
+        # Remove non-printable characters from tabular input formats.
+        if tabular?
+          # Cannot change the length of fixed width lines.
+          replace = format == :fixed ? " " : ""
+          path.option_or_stream(:encode, encoding: "UTF-8", cleaner: :printable, replace: replace)
+        end
+        path
+      end
+
+      # Return a lambda to extract the header row from the uploaded file.
+      def extract_header_callback(on_first)
+        return on_first unless tabular? && tabular.header?
+
+        case mode
+        when :line
+          lambda do |line|
+            tabular.parse_header(line)
+            cleanse_header!
+            self.columns = tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        when :array
+          lambda do |row|
+            tabular.header.columns = row
+            cleanse_header!
+            self.columns = category.tabular.header.columns
+            # Call chained on_first if present
+            on_first&.call(line)
+          end
+        end
+      end
     end
   end
 end
@@ -13,6 +13,8 @@ module RocketJob
       # false: do not save nil values to the output categories.
       field :nils, type: ::Mongoid::Boolean, default: false
 
+      validates_inclusion_of :serializer, in: %i[none compress encrypt bz2 encrypted_bz2 bzip2]
+
       # Renders [String] the header line.
       # Returns [nil] if no header is needed.
       def render_header
@@ -20,6 +22,13 @@ module RocketJob
 
         tabular.render_header
       end
+
+      def data_store(job)
+        RocketJob::Sliced::Output.new(
+          collection_name: build_collection_name(:output, job),
+          slice_class:     serializer_class
+        )
+      end
     end
   end
 end
@@ -233,7 +233,7 @@ module RocketJob
 
     # Parse command line options placing results in the corresponding instance variables
     def parse(argv)
-      parser        = OptionParser.new do |o|
+      parser = OptionParser.new do |o|
        o.on("-n", "--name NAME", "Unique Name of this server (Default: host_name:PID)") do |arg|
          Config.name = arg
        end
@@ -173,7 +173,7 @@ module RocketJob
       counts
     end
 
-    # Passes each filename [Pathname] found that matches the pattern into the supplied block
+    # Yields [IOStreams::Path] for each file found that matches the current pattern.
     def each
       SemanticLogger.named_tagged(dirmon_entry: id.to_s) do
         # Case insensitive filename matching
@@ -4,8 +4,8 @@ module Mongoid
   class Mongo
     def initialize(criteria)
       @criteria = criteria
-      @klass = criteria.klass
-      @cache = criteria.options[:cache]
+      @klass    = criteria.klass
+      @cache    = criteria.options[:cache]
       # Only line changed is here, get collection name from criteria, not @klass
       # @collection = @klass.collection
       @collection = criteria.collection
@@ -55,13 +55,13 @@ module ActiveJob
   # - Completed jobs will not appear in completed since the Active Job adapter
   #   uses the default Rocket Job `destroy_on_completion` of `false`.
   class RocketJobAdapter
-    def self.enqueue(active_job) #:nodoc:
+    def self.enqueue(active_job)
       job = RocketJob::Jobs::ActiveJob.create!(active_job_params(active_job))
       active_job.provider_job_id = job.id.to_s if active_job.respond_to?(:provider_job_id=)
       job
     end
 
-    def self.enqueue_at(active_job, timestamp) #:nodoc:
+    def self.enqueue_at(active_job, timestamp)
       params = active_job_params(active_job)
       params[:run_at] = Time.at(timestamp).utc
 
@@ -23,7 +23,7 @@ module RocketJob
       new(
         args.merge(
           class_name: exc.class.name,
-          message: exc.message,
+          message: exc.message.to_s.encode("UTF-8", replace: ""),
           backtrace: exc.backtrace || []
         )
       )
@@ -0,0 +1,43 @@
+# Convert to and from CSV, JSON, xlsx, and PSV files.
+#
+# Example, Convert CSV file to JSON.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "data.csv"
+#   job.output_category.file_name = "data.json"
+#   job.save!
+#
+# Example, Convert JSON file to PSV and compress it with GZip.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "data.json"
+#   job.output_category.file_name = "data.psv.gz"
+#   job.save!
+#
+# Example, Read a CSV file that has been zipped from a remote website and then convert it to a GZipped json file.
+#   job = RocketJob::Jobs::ConversionJob.new
+#   job.input_category.file_name  = "https://example.org/file.zip"
+#   job.output_category.file_name = "data.json.gz"
+#   job.save!
+#
+module RocketJob
+  module Jobs
+    class ConversionJob < RocketJob::Job
+      include RocketJob::Batch
+
+      self.destroy_on_complete = false
+
+      # Detects file extension for its type
+      input_category format: :auto
+      output_category format: :auto
+
+      # Upload the file specified in `input_category.file_name` unless already uploaded.
+      before_batch :upload, unless: :record_count
+
+      # When the job completes it will write the result to `output_category.file_name`.
+      after_batch :cleanup!, :download
+
+      def perform(hash)
+        hash
+      end
+    end
+  end
+end
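A quick way to exercise the new job from a console (a sketch: `perform_now` is assumed from the core RocketJob::Job API for inline execution, not from this diff):

    # Illustrative: convert CSV to JSON without waiting for a worker process.
    job = RocketJob::Jobs::ConversionJob.new
    job.input_category.file_name  = "data.csv"
    job.output_category.file_name = "data.json"
    job.perform_now  # uploads, performs each slice, then downloads the output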
@@ -30,59 +30,48 @@ module RocketJob
   #
   # If another DirmonJob instance is already queued or running, then the create
   # above will fail with:
-  #   MongoMapper::DocumentNotValid: Validation failed: State Another instance of this job is already queued or running
+  #   Validation failed: State Another instance of this job is already queued or running
   #
   # Or to start DirmonJob and ignore errors if already running
   #   RocketJob::Jobs::DirmonJob.create
   class DirmonJob < RocketJob::Job
-    # Only allow one DirmonJob instance to be running at a time
-    include RocketJob::Plugins::Singleton
-    # Start a new job when this one completes, fails, or aborts
-    include RocketJob::Plugins::Restart
+    include RocketJob::Plugins::Cron
 
-    self.priority = 30
-
-    # Number of seconds between directory scans. Default 5 mins
-    field :check_seconds, type: Float, default: 300.0, copy_on_restart: true
+    # Runs every 5 minutes by default
+    self.cron_schedule = "*/5 * * * * UTC"
+    self.description   = "Directory Monitor"
+    self.priority      = 30
 
     # Hash[file_name, size]
     field :previous_file_names, type: Hash, default: {}, copy_on_restart: true
 
-    before_create :set_run_at
-
-    # Iterate over each Dirmon entry looking for new files
-    # If a new file is found, it is not processed immediately, instead
-    # it is passed to the next run of this job along with the file size.
-    # If the file size has not changed, the Job is kicked off.
+    # Checks the directories for new files, starting jobs if files have not changed since the last run.
    def perform
      check_directories
    end
 
    private
 
-    # Set a run_at when a new instance of this job is created
-    def set_run_at
-      self.run_at = Time.now + check_seconds
-    end
-
-    # Checks the directories for new files, starting jobs if files have not changed
-    # since the last run
+    # Iterate over each Dirmon Entry looking for new files
+    # If a new file is found, it is not processed immediately, instead
+    # it is passed to the next run of this job along with the file size.
+    # If the file size has not changed, the Job is kicked off.
    def check_directories
      new_file_names = {}
-      DirmonEntry.enabled.each do |entry|
-        entry.each do |iopath|
-          # S3 files are only visible once completely uploaded.
-          unless iopath.partial_files_visible?
-            logger.info("File: #{iopath}. Starting: #{entry.job_class_name}")
-            entry.later(iopath)
+      DirmonEntry.enabled.each do |dirmon_entry|
+        dirmon_entry.each do |path|
+          # Skip file size checking since S3 files are only visible once completely uploaded.
+          unless path.partial_files_visible?
+            logger.info("File: #{path}. Starting: #{dirmon_entry.job_class_name}")
+            dirmon_entry.later(path)
            next
          end
 
          # BSON Keys cannot contain periods
-          key = iopath.to_s.tr(".", "_")
+          key = path.to_s.tr(".", "_")
          previous_size = previous_file_names[key]
          # Check every few minutes for a file size change before trying to process the file.
-          size = check_file(entry, iopath, previous_size)
+          size = check_file(dirmon_entry, path, previous_size)
          new_file_names[key] = size if size
        end
      end
@@ -91,14 +80,14 @@ module RocketJob
 
    # Checks if a file should result in starting a job
    # Returns [Integer] file size, or nil if the file started a job
-    def check_file(entry, iopath, previous_size)
-      size = iopath.size
+    def check_file(dirmon_entry, path, previous_size)
+      size = path.size
      if previous_size && (previous_size == size)
-        logger.info("File stabilized: #{iopath}. Starting: #{entry.job_class_name}")
-        entry.later(iopath)
+        logger.info("File stabilized: #{path}. Starting: #{dirmon_entry.job_class_name}")
+        dirmon_entry.later(path)
        nil
      else
-        logger.info("Found file: #{iopath}. File size: #{size}")
+        logger.info("Found file: #{path}. File size: #{size}")
        # Keep for the next run
        size
      end
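Since DirmonJob now uses the Cron plugin instead of Restart/Singleton, the scan interval is controlled by `cron_schedule` rather than the removed `check_seconds` field. A sketch (schedule value illustrative):

    # Start the directory monitor, overriding the default 5-minute schedule.
    RocketJob::Jobs::DirmonJob.create!(cron_schedule: "* * * * * UTC")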
@@ -27,12 +27,11 @@ module RocketJob
   #   )
   class HousekeepingJob < RocketJob::Job
     include RocketJob::Plugins::Cron
-    include RocketJob::Plugins::Singleton
 
-    self.priority = 25
-    self.description = "Cleans out historical jobs, and zombie servers."
-    # Runs every 15 minutes
-    self.cron_schedule = "*/15 * * * * UTC"
+    # Runs every 15 minutes on the 15 minute period
+    self.cron_schedule = "0,15,30,45 * * * * UTC"
+    self.description   = "Cleans out historical jobs, and zombie servers."
+    self.priority      = 25
 
     # Whether to destroy zombie servers automatically
     field :destroy_zombies, type: Mongoid::Boolean, default: true, user_editable: true, copy_on_restart: true
@@ -65,27 +65,29 @@ module RocketJob
   module Jobs
     class OnDemandBatchJob < RocketJob::Job
       include RocketJob::Plugins::Cron
+      include RocketJob::Plugins::Retry
       include RocketJob::Batch
       include RocketJob::Batch::Statistics
 
       self.priority            = 90
-      self.description         = "Batch Job"
+      self.description         = "On Demand Batch Job"
       self.destroy_on_complete = false
+      self.retry_limit         = 0
 
       # Code that is performed against every row / record.
-      field :code, type: String
+      field :code, type: String, user_editable: true, copy_on_restart: true
 
       # Optional code to execute before the batch is run.
       # Usually to upload data into the job.
-      field :before_code, type: String
+      field :before_code, type: String, user_editable: true, copy_on_restart: true
 
       # Optional code to execute after the batch is run.
       # Usually to upload data into the job.
-      field :after_code, type: String
+      field :after_code, type: String, user_editable: true, copy_on_restart: true
 
       # Data that is made available to the job during the perform.
       # Be sure to store key names only as Strings, not Symbols.
-      field :data, type: Hash, default: {}
+      field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true
 
       validates :code, presence: true
       validate :validate_code
@@ -96,12 +98,14 @@ module RocketJob
       before_batch :run_before_code
       after_batch :run_after_code
 
-      # Make this job collect its output
-      #   :nils [true|false]
-      #     Whether to skip the output from `code` when it is nil
-      #     Default: false
-      def collect_output(nils: false)
-        self.output_categories = [RocketJob::Category::Output.new(nils: nils)]
+      # Shortcut for setting the slice_size
+      def slice_size=(slice_size)
+        input_category.slice_size = slice_size
+      end
+
+      # Add a new output category and collect output for it.
+      def add_output_category(**args)
+        self.output_categories << RocketJob::Category::Output.new(**args)
       end
 
       private
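A sketch of the new helpers in use. The `code` string and the block form of `upload` are assumptions drawn from the wider OnDemandBatchJob and Batch::IO APIs, not from this hunk:

    # Illustrative: build an on-demand batch job with the helpers added above.
    job = RocketJob::Jobs::OnDemandBatchJob.new(code: "row * 2")
    job.slice_size = 1_000           # delegates to input_category.slice_size
    job.add_output_category(nils: false)
    job.upload do |records|          # assumed Batch::IO block-style upload
      records << 1
      records << 2
    end
    job.save!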
@@ -78,8 +78,8 @@ module RocketJob
       self.retry_limit = 0
 
       # Be sure to store key names only as Strings, not Symbols
-      field :data, type: Hash, default: {}, copy_on_restart: true
-      field :code, type: String, copy_on_restart: true
+      field :data, type: Hash, default: {}, user_editable: true, copy_on_restart: true
+      field :code, type: String, user_editable: true, copy_on_restart: true
 
       validates :code, presence: true
       validate :validate_code