rocketjob 4.2.0 → 4.3.0.beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rocket_job/batch/io.rb +111 -70
- data/lib/rocket_job/batch/tabular/input.rb +1 -1
- data/lib/rocket_job/dirmon_entry.rb +23 -56
- data/lib/rocket_job/jobs/dirmon_job.rb +15 -8
- data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +27 -0
- data/lib/rocket_job/sliced/input.rb +17 -14
- data/lib/rocket_job/sliced/output.rb +10 -85
- data/lib/rocket_job/sliced/writer/input.rb +2 -8
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +3 -0
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8ae22e2ca14255089a3500e9294fc689847bb92525da45ba90426cae38a45378
+  data.tar.gz: fc0f6dd9c042020a01d47f4c2be35df3896db47e3cc07d875a27f057a8e64ab6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cb612469360af546d76ea1d024e80cbdf50f40693533fd3e608927911d62c86da6ad4ba290da0186ce98b3be95b3cfad21ceed3bf22091d6e80cf2adc7b2387d
+  data.tar.gz: 4038eb8af3353d6358f3dc74c50410d1f16cf96fe716c19b29aa8843428e2a8938267b3d130c4023b7234ca45ad5a546d65836ea69775d354889e6cb50121fd6
data/lib/rocket_job/batch/io.rb
CHANGED
@@ -19,7 +19,7 @@ module RocketJob
         collection_name = "rocket_job.inputs.#{id}"
         collection_name << ".#{category}" unless category == :main
 
-        (@inputs ||= {})[category] ||= RocketJob::Sliced::Input.new(
+        (@inputs ||= {})[category] ||= RocketJob::Sliced::Input.new(collection_name: collection_name, slice_size: slice_size)
       end
 
       # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -36,23 +36,18 @@ module RocketJob
         collection_name = "rocket_job.outputs.#{id}"
         collection_name << ".#{category}" unless category == :main
 
-        (@outputs ||= {})[category] ||= RocketJob::Sliced::Output.new(
+        (@outputs ||= {})[category] ||= RocketJob::Sliced::Output.new(collection_name: collection_name, slice_size: slice_size)
       end
 
-      # Upload the supplied
+      # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
       #
       # Returns [Integer] the number of records uploaded.
       #
       # Parameters
-      #
+      #   stream [String | IO | IOStreams::Path | IOStreams::Stream]
       #     Full path and file name to stream into the job,
       #     Or, an IO Stream that responds to: :read
-      #
-      #   streams [Symbol|Array]
-      #     Streams to convert the data whilst it is being read.
-      #     When nil, the file_name extensions will be inspected to determine what
-      #     streams should be applied.
-      #     Default: nil
+      #     Or, an IOStreams path such as IOStreams::Paths::File, or IOStreams::Paths::S3
       #
       #   delimiter[String]
       #     Line / Record delimiter to use to break the stream up into records
@@ -63,9 +58,14 @@ module RocketJob
       #     Searches for the first "\r\n" or "\n" and then uses that as the
       #     delimiter for all subsequent records
       #
-      #
-      #
-      #
+      #   stream_mode: [:line | :row | :record]
+      #     :line
+      #       Uploads the file a line (String) at a time for processing by workers.
+      #     :row
+      #       Parses each line from the file as an Array and uploads each array for processing by workers.
+      #     :record
+      #       Parses each line from the file into a Hash and uploads each hash for processing by workers.
+      #     See IOStreams::Stream#each_line, IOStreams::Stream#each_row, and IOStreams::Stream#each_record.
       #
       #   encoding: [String|Encoding]
       #     Encode returned data with this encoding.
@@ -74,11 +74,15 @@ module RocketJob
       #     'UTF-8': UTF-8 Format
      #     Etc.
      #     Default: 'UTF-8'
+      #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+      #       if not already set in the supplied stream.
       #
       #   encode_replace: [String]
       #     The character to replace with when a character cannot be converted to the target encoding.
       #     nil: Don't replace any invalid characters. Encoding::UndefinedConversionError is raised.
       #     Default: nil
+      #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+      #       if not already set in the supplied stream.
       #
       #   encode_cleaner: [nil|symbol|Proc]
       #     Cleanse data read from the input stream.
@@ -86,42 +90,38 @@ module RocketJob
       #     :printable Cleanse all non-printable characters except \r and \n
       #     Proc/lambda Proc to call after every read to cleanse the data
       #     Default: :printable
-      #
-      #
-      #   :line
-      #     Uploads the file a line (String) at a time for processing by workers.
-      #   :row
-      #     Parses each line from the file as an Array and uploads each array for processing by workers.
-      #   :record
-      #     Parses each line from the file into a Hash and uploads each hash for processing by workers.
-      #   See IOStream#each_line, IOStream#each_row, and IOStream#each_record.
+      #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+      #       if not already set in the supplied stream.
       #
       # Example:
       #   # Load plain text records from a file
-      #   job.
+      #   job.upload('hello.csv')
       #
       # Example:
       #   # Load plain text records from a file, stripping all non-printable characters,
       #   # as well as any characters that cannot be converted to UTF-8
-      #   job.
+      #   job.upload('hello.csv', encode_cleaner: :printable, encode_replace: '')
       #
       # Example: Zip
       #   # Since csv is not known to RocketJob it is ignored
-      #   job.
+      #   job.upload('myfile.csv.zip')
       #
       # Example: Encrypted Zip
-      #   job.
+      #   job.upload('myfile.csv.zip.enc')
       #
       # Example: Explicitly set the streams
-      #
+      #   path = IOStreams.path('myfile.ze').stream(:encode, encoding: 'UTF-8').stream(:zip).stream(:enc)
+      #   job.upload(path)
       #
       # Example: Supply custom options
-      #
+      #   path = IOStreams.path('myfile.csv.enc').option(:enc, compress: false).option(:encode, encoding: 'UTF-8')
+      #   job.upload(path)
       #
-      # Example:
-      #
-      #
-      #
+      # Example: Read from a tempfile and use the original file name to determine which streams to apply
+      #   temp_file = Tempfile.new('my_project')
+      #   temp_file.write(gzip_and_encrypted_data)
+      #   stream = IOStreams.stream(temp_file).file_name('myfile.gz.enc')
+      #   job.upload(stream)
       #
       # Example: Upload by writing records one at a time to the upload stream
       #   job.upload do |writer|
@@ -140,18 +140,22 @@ module RocketJob
       # * If an io stream is supplied, it is read until it returns nil.
       # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
       # * CSV parsing is slow, so it is usually left for the workers to do.
-      def upload(
-
-
-
-
-
-
+      def upload(stream = nil, file_name: nil, category: :main, encoding: 'UTF-8', encode_cleaner: nil, encode_replace: nil, stream_mode: :line, on_first: nil, **args, &block)
+        raise(ArgumentError, 'Either stream, or a block must be supplied') unless stream || block
+
+        count =
+          if block
+            input(category).upload(on_first: on_first, &block)
+          else
+            path = build_path(stream, file_name, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace)
+
+            self.upload_file_name = path.file_name
+            input(category).upload(on_first: on_first) do |io|
+              path.public_send("each_#{stream_mode}".to_sym, **args) { |line| io << line }
+            end
+          end
         self.record_count = (record_count || 0) + count
         count
-      rescue StandardError => exc
-        input(category).delete_all
-        raise(exc)
       end
 
       # Upload results from an Arel into RocketJob::SlicedJob.
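The reworked `upload` above accepts a plain file name, an IO, or an IOStreams path/stream, with `stream_mode` selecting `each_line`, `each_row`, or `each_record`. A minimal usage sketch, assuming a hypothetical batch job class `MyJob` that includes `RocketJob::Batch`, and hypothetical file and S3 paths:

    # Plain file name: the .csv.gz extensions select the streams to apply.
    job = MyJob.new
    job.upload('users.csv.gz')

    # Explicit IOStreams path (for example S3); :row uploads each line as a parsed Array.
    path = IOStreams.path('s3://my-bucket/users.csv')
    job.upload(path, stream_mode: :row)

    # Block form: write records directly into the input collection.
    job.upload do |writer|
      writer << 'first record'
      writer << 'second record'
    end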
@@ -188,9 +192,6 @@ module RocketJob
         count = input(category).upload_arel(arel, *column_names, &block)
         self.record_count = (record_count || 0) + count
         count
-      rescue StandardError => exc
-        input(category).delete_all
-        raise(exc)
       end
 
       # Upload the result of a MongoDB query to the input collection for processing
@@ -232,9 +233,6 @@ module RocketJob
         count = input(category).upload_mongo_query(criteria, *column_names, &block)
         self.record_count = (record_count || 0) + count
         count
-      rescue StandardError => exc
-        input(category).delete_all
-        raise(exc)
       end
 
       # Upload sliced range of integer requests as arrays of start and end ids.
@@ -263,9 +261,6 @@ module RocketJob
         count = last_id - start_id + 1
         self.record_count = (record_count || 0) + count
         count
-      rescue StandardError => exc
-        input(category).delete_all
-        raise(exc)
       end
 
      # Upload sliced range of integer requests as an arrays of start and end ids
@@ -298,9 +293,6 @@ module RocketJob
         count = last_id - start_id + 1
         self.record_count = (record_count || 0) + count
         count
-      rescue StandardError => exc
-        input(category).delete_all
-        raise(exc)
       end
 
       # Upload the supplied slices for processing by workers
@@ -326,24 +318,71 @@ module RocketJob
         count
       end
 
-      # Download the output data into the supplied
+      # Download the output data into the supplied file, io, IOStreams::Path, or IOStreams::Stream.
+      # Returns [Integer] the number of records / lines downloaded.
       #
       # Parameters
-      #
-      #
+      #   stream [String | IO | IOStreams::Path | IOStreams::Stream]
+      #     Full path and file name to stream into the job,
+      #     Or, an IO stream that responds to: :write
+      #     Or, an IOStreams path such as IOStreams::Paths::File, or IOStreams::Paths::S3
+      #
+      # Example: Zip
+      #   # Since csv is not known to RocketJob it is ignored
+      #   job.download('myfile.csv.zip')
+      #
+      # Example: Encrypted Zip
+      #   job.download('myfile.csv.zip.enc')
+      #
+      # Example: Explicitly set the streams
+      #   path = IOStreams.path('myfile.ze').stream(:zip).stream(:enc)
+      #   job.download(path)
       #
-      #
-      #
-      #
-      #
+      # Example: Supply custom options
+      #   path = IOStreams.path('myfile.csv.enc').option(:enc, compress: false)
+      #   job.download(path)
+      #
+      # Example: Supply custom options. Set the file name within the zip file.
+      #   path = IOStreams.path('myfile.csv.zip').option(:zip, zip_file_name: 'myfile.csv')
+      #   job.download(path)
+      #
+      # Example: Download into a tempfile, or stream, using the original file name to determine the streams to apply:
+      #   tempfile = Tempfile.new('my_project')
+      #   stream = IOStreams.stream(tempfile).file_name('myfile.gz.enc')
+      #   job.download(stream)
+      #
+      # Example: Add a header and/or trailer record to the downloaded file:
+      #   IOStreams.path('/tmp/file.txt.gz').writer do |writer|
+      #     writer << "Header\n"
+      #     job.download do |line|
+      #       writer << line + "\n"
+      #     end
+      #     writer << "Trailer\n"
+      #   end
       #
-      #
+      # Example: Add a header and/or trailer record to the downloaded file, letting the line writer add the line breaks:
+      #   IOStreams.path('/tmp/file.txt.gz').line_writer do |writer|
+      #     writer << "Header"
+      #     job.download do |line|
+      #       writer << line
+      #     end
+      #     writer << "Trailer"
+      #   end
       #
-      #
-
+      # Notes:
+      # - The records are returned in '_id' order. Usually this is the order in
+      #   which the records were originally loaded.
+      def download(stream = nil, category: :main, header_line: nil, encoding: 'UTF-8', encode_cleaner: nil, encode_replace: nil, **args, &block)
         raise "Cannot download incomplete job: #{id}. Currently in state: #{state}-#{sub_state}" if rocket_job_processing?
 
-
+        if block
+          output(category).download(header_line: header_line, &block)
+        else
+          path = build_path(stream, nil, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace)
+          path.line_writer(**args) do |io|
+            output(category).download(header_line: header_line) { |record| io << record }
+          end
+        end
       end
 
       # Writes the supplied result, Batch::Result or Batch::Results to the relevant collections.
@@ -381,11 +420,13 @@ module RocketJob
 
       private
 
-      def
-
-
-
-
+      def build_path(stream, file_name, encoding: nil, encode_cleaner: nil, encode_replace: nil)
+        path = IOStreams.new(stream)
+        path.file_name = file_name if file_name
+        if (encoding || encode_cleaner || encode_replace) && !path.setting(:encode)
+          path.option_or_stream(:encode, encoding: encoding, cleaner: encode_cleaner, replace: encode_replace)
+        end
+        path
       end
     end
   end
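With `build_path` in place, `download` mirrors `upload`: pass a file name or IOStreams path and the output collection is streamed through it, or pass a block and write the records yourself. A minimal sketch, assuming a hypothetical completed batch job `job` and hypothetical output file names:

    # File form: gzip is inferred from the extension; encoding defaults to UTF-8.
    job.download('results.csv.gz')

    # Block form: the caller owns the stream, for example to prepend a header line.
    IOStreams.path('results.csv').line_writer do |io|
      io << 'id,name'
      job.download { |line| io << line }
    end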
data/lib/rocket_job/batch/tabular/input.rb
CHANGED
@@ -20,7 +20,7 @@ module RocketJob
       #     Parses each line from the file as an Array and uploads each array for processing by workers.
       #   :record
       #     Parses each line from the file into a Hash and uploads each hash for processing by workers.
-      #   See
+      #   See IOStreams#each_line, IOStreams#each_row, and IOStreams#each_record.
       field :tabular_input_mode, type: Symbol, default: :line, class_attribute: true, user_editable: true, copy_on_restart: true
 
       validates_inclusion_of :tabular_input_format, in: IOStreams::Tabular.registered_formats
data/lib/rocket_job/dirmon_entry.rb
CHANGED
@@ -1,5 +1,4 @@
 require 'concurrent'
-require 'pathname'
 require 'fileutils'
 module RocketJob
   class DirmonEntry
@@ -143,7 +142,7 @@ module RocketJob
     # Raises: Errno::ENOENT: No such file or directory
     def self.add_whitelist_path(path)
       # Confirms that path exists
-      path =
+      path = IOStreams.path(path).realpath.to_s
       whitelist_paths << path
       whitelist_paths.uniq!
       path
@@ -153,7 +152,7 @@ module RocketJob
     # Raises: Errno::ENOENT: No such file or directory
     def self.delete_whitelist_path(path)
       # Confirms that path exists
-      path =
+      path = IOStreams.path(path).realpath.to_s
       whitelist_paths.delete(path)
       whitelist_paths.uniq!
       path
@@ -186,32 +185,23 @@ module RocketJob
     def each
       SemanticLogger.named_tagged(dirmon_entry: id.to_s) do
         # Case insensitive filename matching
-
-
-        pathname = begin
-          pathname.realpath
-        rescue Errno::ENOENT
-          logger.warn("Unable to expand the realpath for #{pathname.inspect}. Skipping file.")
-          next
-        end
-
-        file_name = pathname.to_s
-
+        IOStreams.each_child(pattern) do |path|
+          path = path.realpath
           # Skip archive directories
-          next if
+          next if path.to_s.include?(archive_directory || self.class.default_archive_directory)
 
           # Security check?
-          if whitelist_paths.size.positive? && whitelist_paths.none? { |whitepath|
-            logger.
+          if whitelist_paths.size.positive? && whitelist_paths.none? { |whitepath| path.to_s.start_with?(whitepath) }
+            logger.warn "Skipping file: #{path} since it is not in any of the whitelisted paths: #{whitelist_paths.join(', ')}"
             next
           end
 
           # File must be writable so it can be removed after processing
-
-          logger.
+          if path.respond_to?(:writable?) && !path.writable?
+            logger.warn "Skipping file: #{file_name} since it is not writable by the current user. Must be able to delete/move the file after queueing the job"
            next
          end
-          yield(
+          yield(path)
         end
       end
     end
@@ -239,17 +229,18 @@ module RocketJob
      nil
    end
 
-    # Archives the file
-    def later(
-      job_id
-
+    # Archives the file, then kicks off a file upload job to upload the archived file.
+    def later(iopath)
+      job_id       = BSON::ObjectId.new
+      archive_path = archive_iopath(iopath).join("#{job_id}_#{iopath.basename}")
+      iopath.move_to(archive_path)
 
      job = RocketJob::Jobs::UploadFileJob.create!(
        job_class_name: job_class_name,
        properties: properties,
-        description: "#{name}: #{
-        upload_file_name:
-        original_file_name:
+        description: "#{name}: #{iopath.basename}",
+        upload_file_name: archive_path.to_s,
+        original_file_name: iopath.to_s,
        job_id: job_id
      )
 
@@ -257,8 +248,8 @@ module RocketJob
        message: 'Created RocketJob::Jobs::UploadFileJob',
        payload: {
          dirmon_entry_name: name,
-          upload_file_name:
-          original_file_name:
+          upload_file_name: archive_path.to_s,
+          original_file_name: iopath.to_s,
          job_class_name: job_class_name,
          job_id: job_id.to_s,
          upload_job_id: job.id.to_s
@@ -278,37 +269,13 @@ module RocketJob
    class_attribute :whitelist_paths
    self.whitelist_paths = Concurrent::Array.new
 
-    # Move the file to the archive directory
-    #
-    # The archived file name is prefixed with the job id
-    #
-    # Returns [String] the fully qualified archived file name
-    #
-    # Note:
-    # - Works across partitions when the file and the archive are on different partitions
-    def archive_file(job_id, pathname)
-      target_path = archive_pathname(pathname)
-      target_path.mkpath
-      target_file_name = target_path.join("#{job_id}_#{pathname.basename}")
-      # In case the file is being moved across partitions
-      FileUtils.move(pathname.to_s, target_file_name.to_s)
-      target_file_name.to_s
-    end
-
    # Returns [Pathname] to the archive directory, and creates it if it does not exist.
    #
    # If `archive_directory` is a relative path, it is appended to the `file_pathname`.
    # If `archive_directory` is an absolute path, it is returned as-is.
-    def
-      path =
-      path
-
-      begin
-        path.mkpath unless path.exist?
-      rescue Errno::ENOENT => exc
-        raise(Errno::ENOENT, "DirmonJob failed to create archive directory: #{path}, #{exc.message}")
-      end
-      path.realpath
+    def archive_iopath(iopath)
+      path = IOStreams.path(archive_directory)
+      path.relative? ? iopath.directory.join(archive_directory) : path
    end
 
    # Validates job_class is a Rocket Job
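`archive_iopath` above replaces the old `archive_file`/`archive_pathname` pair and resolves the archive location through IOStreams instead of Pathname/FileUtils. A sketch of the relative/absolute rule described in the comment, using hypothetical paths:

    iopath = IOStreams.path('/data/in/users.csv')

    # A relative archive_directory lands under the watched file's directory:
    IOStreams.path('archive').relative?        # => true
    iopath.directory.join('archive')           # => .../data/in/archive

    # An absolute archive_directory is used as-is:
    IOStreams.path('/data/archive').relative?  # => false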
data/lib/rocket_job/jobs/dirmon_job.rb
CHANGED
@@ -70,11 +70,18 @@ module RocketJob
    def check_directories
      new_file_names = {}
      DirmonEntry.enabled.each do |entry|
-        entry.each do |
+        entry.each do |iopath|
+          # S3 files are only visible once completely uploaded.
+          if iopath.is_a?(IOStreams::Paths::S3)
+            logger.info("S3 File: #{iopath}. Starting: #{entry.job_class_name}")
+            entry.later(iopath)
+            next
+          end
+
          # BSON Keys cannot contain periods
-          key =
+          key = iopath.to_s.tr('.', '_')
          previous_size = previous_file_names[key]
-          size = check_file(entry,
+          size = check_file(entry, iopath, previous_size)
          new_file_names[key] = size if size
        end
      end
@@ -83,14 +90,14 @@ module RocketJob
 
    # Checks if a file should result in starting a job
    # Returns [Integer] file size, or nil if the file started a job
-    def check_file(entry,
-      size =
+    def check_file(entry, iopath, previous_size)
+      size = iopath.size
      if previous_size && (previous_size == size)
-        logger.info("File stabilized: #{
-        entry.later(
+        logger.info("File stabilized: #{iopath}. Starting: #{entry.job_class_name}")
+        entry.later(iopath)
        nil
      else
-        logger.info("Found file: #{
+        logger.info("Found file: #{iopath}. File size: #{size}")
        # Keep for the next run
        size
      end
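`check_file` hands a file to `entry.later` only once its size is unchanged between two consecutive scans, so partially written files are not picked up; S3 paths bypass this check above because S3 objects only become visible once fully uploaded. The rule as a standalone sketch (hypothetical helper, not part of the gem):

    # A file is considered stable once its size stops changing between scans.
    def stable?(previous_size, current_size)
      !previous_size.nil? && previous_size == current_size
    end

    stable?(nil, 1024)   # => false: first sighting, remember the size
    stable?(1024, 2048)  # => false: still being written
    stable?(2048, 2048)  # => true:  unchanged, start the job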
data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb
ADDED
@@ -0,0 +1,27 @@
+# Job to dynamically perform ruby code on demand as a Batch,
+# with input and/or output from CSV/JSON or other format supported by Tabular.
+#
+# Notes:
+# - Need to specify `destroy_on_complete: false` to collect output from this job.
+# - `after_code` can be used to automatically download the output of this job to a file on completion.
+#
+# Example: Iterate over all rows in a table:
+#   code = <<-CODE
+#     if user = User.find(row)
+#       user.cleanse_attributes!
+#       user.save(validate: false)
+#     end
+#   CODE
+#   job = RocketJob::Jobs::OnDemandBatchTabularJob.new(code: code, description: 'cleanse users', destroy_on_complete: false)
+#   job.upload("users.csv")
+#   job.save!
+#
+# On completion export the output:
+#   job.download("output.csv")
+module RocketJob
+  module Jobs
+    class OnDemandBatchTabularJob < OnDemandBatchJob
+      include RocketJob::Batch::Tabular
+    end
+  end
+end
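A hedged sketch of the `after_code` hook mentioned in the file comments, assuming it accepts a Ruby string the same way `code` does, so the output is exported as soon as the job completes:

    code = <<-CODE
      if user = User.find(row)
        user.cleanse_attributes!
        user.save(validate: false)
      end
    CODE
    job = RocketJob::Jobs::OnDemandBatchTabularJob.new(
      code:                code,
      after_code:          'download("output.csv")', # assumption: evaluated on completion
      destroy_on_complete: false
    )
    job.upload('users.csv')
    job.save!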
data/lib/rocket_job/sliced/input.rb
CHANGED
@@ -1,15 +1,13 @@
 module RocketJob
   module Sliced
     class Input < Slices
-      def upload(
-
-
-        block ||= -> (io) do
-          iterator = "each_#{stream_mode}".to_sym
-          IOStreams.public_send(iterator, file_name_or_io, encoding: encoding, **args) { |line| io << line }
-        end
-
+      def upload(on_first: nil, &block)
+        # Create indexes before uploading
+        create_indexes
         Writer::Input.collect(self, on_first: on_first, &block)
+      rescue StandardError => exc
+        drop
+        raise(exc)
       end
 
       def upload_mongo_query(criteria, *column_names, &block)
@@ -36,7 +34,7 @@ module RocketJob
           end
         end
 
-
+        upload do |records|
           # Drop down to the mongo driver level to avoid constructing a Model for each document returned
           criteria.klass.collection.find(criteria.selector, options).each do |document|
             records << block.call(document)
|
|
46
44
|
|
47
45
|
def upload_arel(arel, *column_names, &block)
|
48
46
|
unless block
|
49
|
-
column_names = column_names.collect(&:to_sym)
|
50
|
-
column_names << :id if column_names.size.zero?
|
47
|
+
column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
|
51
48
|
|
52
49
|
block =
|
53
50
|
if column_names.size == 1
|
@@ -61,12 +58,11 @@ module RocketJob
|
|
61
58
|
arel = arel.select(selection)
|
62
59
|
end
|
63
60
|
|
64
|
-
|
65
|
-
arel.find_each { |model| records << block.call(model) }
|
66
|
-
end
|
61
|
+
upload { |records| arel.find_each { |model| records << block.call(model) } }
|
67
62
|
end
|
68
63
|
|
69
64
|
def upload_integer_range(start_id, last_id)
|
65
|
+
# Create indexes before uploading
|
70
66
|
create_indexes
|
71
67
|
count = 0
|
72
68
|
while start_id <= last_id
|
@@ -77,9 +73,13 @@ module RocketJob
           count += 1
         end
         count
+      rescue StandardError => exc
+        drop
+        raise(exc)
       end
 
       def upload_integer_range_in_reverse_order(start_id, last_id)
+        # Create indexes before uploading
         create_indexes
         end_id = last_id
         count = 0
@@ -91,6 +91,9 @@ module RocketJob
           count += 1
         end
         count
+      rescue StandardError => exc
+        drop
+        raise(exc)
       end
 
       # Iterate over each failed record, if any
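Upload error handling has moved down a layer in this release: instead of `Batch::IO` calling `input(category).delete_all` on failure, each upload method on `Sliced::Input` now drops its collection before re-raising. The effect, sketched with a hypothetical job and file name:

    begin
      job.upload('missing_file.csv')
    rescue StandardError
      # The partially uploaded input collection was dropped,
      # so a retry starts from a clean input collection.
    end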
data/lib/rocket_job/sliced/output.rb
CHANGED
@@ -3,93 +3,18 @@ require 'tempfile'
 module RocketJob
   module Sliced
     class Output < Slices
-
-
-      # Returns [Integer] the number of records returned from the collection
-      #
-      # Parameters
-      #   file_name_or_io [String|IO]
-      #     The file_name of the file to write to, or an IO Stream that implements
-      #     #write.
-      #
-      #   options:
-      #     streams [Symbol|Array]
-      #       The formats/streams that be used to convert the data whilst it is
-      #       being written.
-      #       When nil, `file_name_or_io` will be inspected to try and determine what
-      #       streams should be applied.
-      #       Default: nil
-      #
-      #     Any other option that can be supplied to IOStreams::Line::Writer
-      #
-      # Stream types / extensions supported:
-      #   .zip Zip File [ :zip ]
-      #   .gz, .gzip GZip File [ :gzip ]
-      #   .enc File Encrypted using symmetric encryption [ :enc ]
-      #
-      # When a file is encrypted, it may also be compressed:
-      #   .zip.enc [ :zip, :enc ]
-      #   .gz.enc [ :gz, :enc ]
-      #
-      # Example: Zip
-      #   # Since csv is not known to RocketJob it is ignored
-      #   job.output.download('myfile.csv.zip')
-      #
-      # Example: Encrypted Zip
-      #   job.output.download('myfile.csv.zip.enc')
-      #
-      # Example: Explicitly set the streams
-      #   job.output.download('myfile.ze', streams: [:zip, :enc])
-      #
-      # Example: Supply custom options
-      #   job.output.download('myfile.csv.enc', streams: [enc: { compress: true }])
-      #
-      # Example: Supply custom options
-      #   job.output.download('myfile.csv.zip', streams: [ zip: { zip_file_name: 'myfile.csv' } ])
-      #
-      # Example: Extract streams from filename but write to a temp file
-      #   t = Tempfile.new('my_project')
-      #   job.output.download(t.to_path, file_name: 'myfile.gz.enc')
-      #
-      # Example: Add a header and/or trailer record to the downloaded file:
-      #   IOStreams.writer('/tmp/file.txt.gz') do |writer|
-      #     writer << "Header\n"
-      #     job.download do |line|
-      #       writer << line
-      #     end
-      #     writer << "Trailer\n"
-      #   end
-      #
-      # Notes:
-      # - The records are returned in '_id' order. Usually this is the order in
-      #   which the records were originally loaded.
-      def download(file_name_or_io = nil, header_line: nil, **args)
-        raise(ArgumentError, 'Either file_name_or_io, or a block must be supplied') unless file_name_or_io || block_given?
+      def download(header_line: nil)
+        raise(ArgumentError, 'Block is mandatory') unless block_given?
 
-
-
-        if block_given?
-          # Write the header line
-          yield(header_line) if header_line
+        # Write the header line
+        yield(header_line) if header_line
 
-
-
-
-
-
-
-          end
-        else
-          IOStreams.line_writer(file_name_or_io, **args) do |io|
-            # Write the header line
-            io << header_line if header_line
-
-            each do |slice|
-              slice.each do |record|
-                record_count += 1
-                io << record
-              end
-            end
+        # Call the supplied block for every record returned
+        record_count = 0
+        each do |slice|
+          slice.each do |record|
+            record_count += 1
+            yield(record)
           end
         end
         record_count
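`Sliced::Output#download` is now block-only: all file and stream handling lives one level up in `RocketJob::Batch::IO#download`, and the output collection simply yields each record in '_id' order. A minimal sketch with a hypothetical completed job:

    count = job.output.download(header_line: 'id,name') do |record|
      puts record
    end
    # count is the number of records yielded, excluding the header line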
data/lib/rocket_job/sliced/writer/input.rb
CHANGED
@@ -12,16 +12,10 @@ module RocketJob
      #   Block to call on the first line only, instead of storing in the slice.
      #   Useful for extracting the header row
      #   Default: nil
-      def self.collect(input, **args
+      def self.collect(input, **args)
        writer = new(input, **args)
-
-        input.create_indexes if input.respond_to?(:create_indexes)
-        block.call(writer)
+        yield(writer)
        writer.record_count
-      rescue Exception => exc
-        # Drop input collection when upload fails
-        input.drop
-        raise exc
      ensure
        writer&.close
      end
data/lib/rocket_job/version.rb
CHANGED
data/lib/rocket_job/worker.rb
CHANGED
@@ -119,6 +119,9 @@ module RocketJob
 
      SemanticLogger.named_tagged(job: job.id.to_s) do
        processed = true unless job.rocket_job_work(self, false, current_filter)
+
+        # Return the database connections for this thread back to the connection pool
+        ActiveRecord::Base.clear_active_connections! if defined?(ActiveRecord::Base)
      end
    end
    processed
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rocketjob
 version: !ruby/object:Gem::Version
-  version: 4.2.0
+  version: 4.3.0.beta
 platform: ruby
 authors:
 - Reid Morrison
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-
+date: 2019-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: aasm
@@ -44,14 +44,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version:
+        version: 1.0.0.beta
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version:
+        version: 1.0.0.beta
 - !ruby/object:Gem::Dependency
   name: mongoid
   requirement: !ruby/object:Gem::Requirement
@@ -131,6 +131,7 @@ files:
 - lib/rocket_job/jobs/dirmon_job.rb
 - lib/rocket_job/jobs/housekeeping_job.rb
 - lib/rocket_job/jobs/on_demand_batch_job.rb
+- lib/rocket_job/jobs/on_demand_batch_tabular_job.rb
 - lib/rocket_job/jobs/on_demand_job.rb
 - lib/rocket_job/jobs/performance_job.rb
 - lib/rocket_job/jobs/simple_job.rb
@@ -189,11 +190,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '2.3'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "
+  - - ">"
     - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.6
 signing_key:
 specification_version: 4
 summary: Ruby's missing batch system.