rocketjob 4.2.0 → 4.3.0.beta
- checksums.yaml +4 -4
- data/lib/rocket_job/batch/io.rb +111 -70
- data/lib/rocket_job/batch/tabular/input.rb +1 -1
- data/lib/rocket_job/dirmon_entry.rb +23 -56
- data/lib/rocket_job/jobs/dirmon_job.rb +15 -8
- data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb +27 -0
- data/lib/rocket_job/sliced/input.rb +17 -14
- data/lib/rocket_job/sliced/output.rb +10 -85
- data/lib/rocket_job/sliced/writer/input.rb +2 -8
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +3 -0
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8ae22e2ca14255089a3500e9294fc689847bb92525da45ba90426cae38a45378
+  data.tar.gz: fc0f6dd9c042020a01d47f4c2be35df3896db47e3cc07d875a27f057a8e64ab6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cb612469360af546d76ea1d024e80cbdf50f40693533fd3e608927911d62c86da6ad4ba290da0186ce98b3be95b3cfad21ceed3bf22091d6e80cf2adc7b2387d
+  data.tar.gz: 4038eb8af3353d6358f3dc74c50410d1f16cf96fe716c19b29aa8843428e2a8938267b3d130c4023b7234ca45ad5a546d65836ea69775d354889e6cb50121fd6
data/lib/rocket_job/batch/io.rb
CHANGED
@@ -19,7 +19,7 @@ module RocketJob
       collection_name = "rocket_job.inputs.#{id}"
       collection_name << ".#{category}" unless category == :main
 
-      (@inputs ||= {})[category] ||= RocketJob::Sliced::Input.new(
+      (@inputs ||= {})[category] ||= RocketJob::Sliced::Input.new(collection_name: collection_name, slice_size: slice_size)
     end
 
     # Returns [RocketJob::Sliced::Output] output collection for holding output slices
@@ -36,23 +36,18 @@ module RocketJob
       collection_name = "rocket_job.outputs.#{id}"
       collection_name << ".#{category}" unless category == :main
 
-      (@outputs ||= {})[category] ||= RocketJob::Sliced::Output.new(
+      (@outputs ||= {})[category] ||= RocketJob::Sliced::Output.new(collection_name: collection_name, slice_size: slice_size)
     end
 
-    # Upload the supplied
+    # Upload the supplied file, io, IOStreams::Path, or IOStreams::Stream.
     #
     # Returns [Integer] the number of records uploaded.
    #
    # Parameters
-    #
+    #   stream [String | IO | IOStreams::Path | IOStreams::Stream]
    #     Full path and file name to stream into the job,
    #     Or, an IO Stream that responds to: :read
-    #
-    #   streams [Symbol|Array]
-    #     Streams to convert the data whilst it is being read.
-    #     When nil, the file_name extensions will be inspected to determine what
-    #     streams should be applied.
-    #     Default: nil
+    #     Or, an IOStreams path such as IOStreams::Paths::File, or IOStreams::Paths::S3
    #
    #   delimiter[String]
    #     Line / Record delimiter to use to break the stream up into records
@@ -63,9 +58,14 @@ module RocketJob
    #     Searches for the first "\r\n" or "\n" and then uses that as the
    #     delimiter for all subsequent records
    #
-    #
-    #
-    #
+    #   stream_mode: [:line | :row | :record]
+    #     :line
+    #       Uploads the file a line (String) at a time for processing by workers.
+    #     :row
+    #       Parses each line from the file as an Array and uploads each array for processing by workers.
+    #     :record
+    #       Parses each line from the file into a Hash and uploads each hash for processing by workers.
+    #     See IOStreams::Stream#each_line, IOStreams::Stream#each_row, and IOStreams::Stream#each_record.
    #
    #   encoding: [String|Encoding]
    #     Encode returned data with this encoding.
@@ -74,11 +74,15 @@ module RocketJob
    #     'UTF-8': UTF-8 Format
    #     Etc.
    #     Default: 'UTF-8'
+    #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+    #       if not already set in the supplied stream.
    #
    #   encode_replace: [String]
    #     The character to replace with when a character cannot be converted to the target encoding.
    #     nil: Don't replace any invalid characters. Encoding::UndefinedConversionError is raised.
    #     Default: nil
+    #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+    #       if not already set in the supplied stream.
    #
    #   encode_cleaner: [nil|symbol|Proc]
    #     Cleanse data read from the input stream.
@@ -86,42 +90,38 @@ module RocketJob
    #     :printable Cleanse all non-printable characters except \r and \n
    #     Proc/lambda Proc to call after every read to cleanse the data
    #     Default: :printable
-    #
-    #
-    #   :line
-    #     Uploads the file a line (String) at a time for processing by workers.
-    #   :row
-    #     Parses each line from the file as an Array and uploads each array for processing by workers.
-    #   :record
-    #     Parses each line from the file into a Hash and uploads each hash for processing by workers.
-    #   See IOStream#each_line, IOStream#each_row, and IOStream#each_record.
+    #     NOTE: If a IOStreams::Path, or IOStreams::Stream was supplied then the encoding will be set
+    #       if not already set in the supplied stream.
    #
    # Example:
    #   # Load plain text records from a file
-    #   job.
+    #   job.upload('hello.csv')
    #
    # Example:
    #   # Load plain text records from a file, stripping all non-printable characters,
    #   # as well as any characters that cannot be converted to UTF-8
-    #   job.
+    #   job.upload('hello.csv', encode_cleaner: :printable, encode_replace: '')
    #
    # Example: Zip
    #   # Since csv is not known to RocketJob it is ignored
-    #   job.
+    #   job.upload('myfile.csv.zip')
    #
    # Example: Encrypted Zip
-    #   job.
+    #   job.upload('myfile.csv.zip.enc')
    #
    # Example: Explicitly set the streams
-    #
+    #   path = IOStreams.path('myfile.ze').stream(:encode, encoding: 'UTF-8').stream(:zip).stream(:enc)
+    #   job.upload(path)
    #
    # Example: Supply custom options
-    #
+    #   path = IOStreams.path('myfile.csv.enc').option(:enc, compress: false).option(:encode, encoding: 'UTF-8')
+    #   job.upload(path)
    #
-    # Example:
-    #
-    #
-    #
+    # Example: Read from a tempfile and use the original file name to determine which streams to apply
+    #   temp_file = Tempfile.new('my_project')
+    #   temp_file.write(gzip_and_encrypted_data)
+    #   stream = IOStreams.stream(temp_file).file_name('myfile.gz.enc')
+    #   job.upload(stream)
    #
    # Example: Upload by writing records one at a time to the upload stream
    #   job.upload do |writer|
@@ -140,18 +140,22 @@ module RocketJob
    # * If an io stream is supplied, it is read until it returns nil.
    # * Only use this method for UTF-8 data, for binary data use #input_slice or #input_records.
    # * CSV parsing is slow, so it is usually left for the workers to do.
-    def upload(
-
-
-
-
-
-
+    def upload(stream = nil, file_name: nil, category: :main, encoding: 'UTF-8', encode_cleaner: nil, encode_replace: nil, stream_mode: :line, on_first: nil, **args, &block)
+      raise(ArgumentError, 'Either stream, or a block must be supplied') unless stream || block
+
+      count =
+        if block
+          input(category).upload(on_first: on_first, &block)
+        else
+          path = build_path(stream, file_name, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace)
+
+          self.upload_file_name = path.file_name
+          input(category).upload(on_first: on_first) do |io|
+            path.public_send("each_#{stream_mode}".to_sym, **args) { |line| io << line }
+          end
+        end
       self.record_count = (record_count || 0) + count
       count
-    rescue StandardError => exc
-      input(category).delete_all
-      raise(exc)
    end
 
    # Upload results from an Arel into RocketJob::SlicedJob.
@@ -188,9 +192,6 @@ module RocketJob
      count = input(category).upload_arel(arel, *column_names, &block)
      self.record_count = (record_count || 0) + count
      count
-    rescue StandardError => exc
-      input(category).delete_all
-      raise(exc)
    end
 
    # Upload the result of a MongoDB query to the input collection for processing
@@ -232,9 +233,6 @@ module RocketJob
      count = input(category).upload_mongo_query(criteria, *column_names, &block)
      self.record_count = (record_count || 0) + count
      count
-    rescue StandardError => exc
-      input(category).delete_all
-      raise(exc)
    end
 
    # Upload sliced range of integer requests as arrays of start and end ids.
@@ -263,9 +261,6 @@ module RocketJob
      count = last_id - start_id + 1
      self.record_count = (record_count || 0) + count
      count
-    rescue StandardError => exc
-      input(category).delete_all
-      raise(exc)
    end
 
    # Upload sliced range of integer requests as an arrays of start and end ids
@@ -298,9 +293,6 @@ module RocketJob
      count = last_id - start_id + 1
      self.record_count = (record_count || 0) + count
      count
-    rescue StandardError => exc
-      input(category).delete_all
-      raise(exc)
    end
 
    # Upload the supplied slices for processing by workers
@@ -326,24 +318,71 @@ module RocketJob
      count
    end
 
-    # Download the output data into the supplied
+    # Download the output data into the supplied file, io, IOStreams::Path, or IOStreams::Stream.
+    # Returns [Integer] the number of records / lines downloaded.
    #
    # Parameters
-    #
-    #
+    #   stream [String | IO | IOStreams::Path | IOStreams::Stream]
+    #     Full path and file name to stream into the job,
+    #     Or, an IO stream that responds to: :write
+    #     Or, an IOStreams path such as IOStreams::Paths::File, or IOStreams::Paths::S3
+    #
+    # Example: Zip
+    #   # Since csv is not known to RocketJob it is ignored
+    #   job.download('myfile.csv.zip')
+    #
+    # Example: Encrypted Zip
+    #   job.download('myfile.csv.zip.enc')
+    #
+    # Example: Explicitly set the streams
+    #   path = IOStreams.path('myfile.ze').stream(:zip).stream(:enc)
+    #   job.download(path)
    #
-    #
-    #
-    #
-    #
+    # Example: Supply custom options
+    #   path = IOStreams.path('myfile.csv.enc').option(:enc, compress: false)
+    #   job.download(path)
+    #
+    # Example: Supply custom options. Set the file name within the zip file.
+    #   path = IOStreams.path('myfile.csv.zip').option(:zip, zip_file_name: 'myfile.csv')
+    #   job.download(path)
+    #
+    # Example: Download into a tempfile, or stream, using the original file name to determine the streams to apply:
+    #   tempfile = Tempfile.new('my_project')
+    #   stream = IOStreams.stream(tempfile).file_name('myfile.gz.enc')
+    #   job.download(stream)
+    #
+    # Example: Add a header and/or trailer record to the downloaded file:
+    #   IOStreams.path('/tmp/file.txt.gz').writer do |writer|
+    #     writer << "Header\n"
+    #     job.download do |line|
+    #       writer << line + "\n"
+    #     end
+    #     writer << "Trailer\n"
+    #   end
    #
-    #
+    # Example: Add a header and/or trailer record to the downloaded file, letting the line writer add the line breaks:
+    #   IOStreams.path('/tmp/file.txt.gz').line_writer do |writer|
+    #     writer << "Header"
+    #     job.download do |line|
+    #       writer << line
+    #     end
+    #     writer << "Trailer"
+    #   end
    #
-    #
-
+    # Notes:
+    # - The records are returned in '_id' order. Usually this is the order in
+    #   which the records were originally loaded.
+    def download(stream = nil, category: :main, header_line: nil, encoding: 'UTF-8', encode_cleaner: nil, encode_replace: nil, **args, &block)
      raise "Cannot download incomplete job: #{id}. Currently in state: #{state}-#{sub_state}" if rocket_job_processing?
 
-
+      if block
+        output(category).download(header_line: header_line, &block)
+      else
+        path = build_path(stream, nil, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace)
+        path.line_writer(**args) do |io|
+          output(category).download(header_line: header_line) { |record| io << record }
+        end
+      end
    end
 
    # Writes the supplied result, Batch::Result or Batch::Results to the relevant collections.
@@ -381,11 +420,13 @@ module RocketJob
 
    private
 
-    def
-
-
-
-
+    def build_path(stream, file_name, encoding: nil, encode_cleaner: nil, encode_replace: nil)
+      path = IOStreams.new(stream)
+      path.file_name = file_name if file_name
+      if (encoding || encode_cleaner || encode_replace) && !path.setting(:encode)
+        path.option_or_stream(:encode, encoding: encoding, cleaner: encode_cleaner, replace: encode_replace)
+      end
+      path
    end
  end
end
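To make the reworked upload/download API above concrete, here is a minimal sketch; `MyJob`, the file names, and the `collect_output`/`destroy_on_complete` settings are illustrative assumptions, while `upload`, `download`, and `stream_mode:` come straight from the diff:

    require 'rocketjob'

    # Hypothetical batch job, for illustration only.
    class MyJob < RocketJob::Job
      include RocketJob::Batch

      self.collect_output      = true  # gather results into the output collection
      self.destroy_on_complete = false # keep the job so its output can be downloaded

      def perform(line)
        line.upcase
      end
    end

    job = MyJob.new
    job.upload('users.csv.zip')                  # zip stream inferred from the extension
    # job.upload('users.csv', stream_mode: :row) # or parse each line into an Array first
    job.save!

    # Later, once the job has completed:
    job.download('results.csv.gz')               # gzip inferred from the extension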
data/lib/rocket_job/batch/tabular/input.rb
CHANGED
@@ -20,7 +20,7 @@ module RocketJob
      #   Parses each line from the file as an Array and uploads each array for processing by workers.
      # :record
      #   Parses each line from the file into a Hash and uploads each hash for processing by workers.
-      # See
+      # See IOStreams#each_line, IOStreams#each_row, and IOStreams#each_record.
      field :tabular_input_mode, type: Symbol, default: :line, class_attribute: true, user_editable: true, copy_on_restart: true
 
      validates_inclusion_of :tabular_input_format, in: IOStreams::Tabular.registered_formats
data/lib/rocket_job/dirmon_entry.rb
CHANGED
@@ -1,5 +1,4 @@
 require 'concurrent'
-require 'pathname'
 require 'fileutils'
 module RocketJob
   class DirmonEntry
@@ -143,7 +142,7 @@ module RocketJob
    # Raises: Errno::ENOENT: No such file or directory
    def self.add_whitelist_path(path)
      # Confirms that path exists
-      path =
+      path = IOStreams.path(path).realpath.to_s
      whitelist_paths << path
      whitelist_paths.uniq!
      path
@@ -153,7 +152,7 @@ module RocketJob
    # Raises: Errno::ENOENT: No such file or directory
    def self.delete_whitelist_path(path)
      # Confirms that path exists
-      path =
+      path = IOStreams.path(path).realpath.to_s
      whitelist_paths.delete(path)
      whitelist_paths.uniq!
      path
@@ -186,32 +185,23 @@ module RocketJob
    def each
      SemanticLogger.named_tagged(dirmon_entry: id.to_s) do
        # Case insensitive filename matching
-
-
-        pathname = begin
-          pathname.realpath
-        rescue Errno::ENOENT
-          logger.warn("Unable to expand the realpath for #{pathname.inspect}. Skipping file.")
-          next
-        end
-
-        file_name = pathname.to_s
-
+        IOStreams.each_child(pattern) do |path|
+          path = path.realpath
          # Skip archive directories
-          next if
+          next if path.to_s.include?(archive_directory || self.class.default_archive_directory)
 
          # Security check?
-          if whitelist_paths.size.positive? && whitelist_paths.none? { |whitepath|
-            logger.
+          if whitelist_paths.size.positive? && whitelist_paths.none? { |whitepath| path.to_s.start_with?(whitepath) }
+            logger.warn "Skipping file: #{path} since it is not in any of the whitelisted paths: #{whitelist_paths.join(', ')}"
            next
          end
 
          # File must be writable so it can be removed after processing
-
-          logger.
+          if path.respond_to?(:writable?) && !path.writable?
+            logger.warn "Skipping file: #{file_name} since it is not writable by the current user. Must be able to delete/move the file after queueing the job"
            next
          end
-          yield(
+          yield(path)
        end
      end
    end
@@ -239,17 +229,18 @@ module RocketJob
      nil
    end
 
-    # Archives the file
-    def later(
-      job_id
-
+    # Archives the file, then kicks off a file upload job to upload the archived file.
+    def later(iopath)
+      job_id       = BSON::ObjectId.new
+      archive_path = archive_iopath(iopath).join("#{job_id}_#{iopath.basename}")
+      iopath.move_to(archive_path)
 
      job = RocketJob::Jobs::UploadFileJob.create!(
        job_class_name: job_class_name,
        properties: properties,
-        description: "#{name}: #{
-        upload_file_name:
-        original_file_name:
+        description: "#{name}: #{iopath.basename}",
+        upload_file_name: archive_path.to_s,
+        original_file_name: iopath.to_s,
        job_id: job_id
      )
 
@@ -257,8 +248,8 @@ module RocketJob
        message: 'Created RocketJob::Jobs::UploadFileJob',
        payload: {
          dirmon_entry_name: name,
-          upload_file_name:
-          original_file_name:
+          upload_file_name: archive_path.to_s,
+          original_file_name: iopath.to_s,
          job_class_name: job_class_name,
          job_id: job_id.to_s,
          upload_job_id: job.id.to_s
@@ -278,37 +269,13 @@ module RocketJob
    class_attribute :whitelist_paths
    self.whitelist_paths = Concurrent::Array.new
 
-    # Move the file to the archive directory
-    #
-    # The archived file name is prefixed with the job id
-    #
-    # Returns [String] the fully qualified archived file name
-    #
-    # Note:
-    # - Works across partitions when the file and the archive are on different partitions
-    def archive_file(job_id, pathname)
-      target_path = archive_pathname(pathname)
-      target_path.mkpath
-      target_file_name = target_path.join("#{job_id}_#{pathname.basename}")
-      # In case the file is being moved across partitions
-      FileUtils.move(pathname.to_s, target_file_name.to_s)
-      target_file_name.to_s
-    end
-
    # Returns [Pathname] to the archive directory, and creates it if it does not exist.
    #
    # If `archive_directory` is a relative path, it is appended to the `file_pathname`.
    # If `archive_directory` is an absolute path, it is returned as-is.
-    def
-      path =
-      path
-
-      begin
-        path.mkpath unless path.exist?
-      rescue Errno::ENOENT => exc
-        raise(Errno::ENOENT, "DirmonJob failed to create archive directory: #{path}, #{exc.message}")
-      end
-      path.realpath
+    def archive_iopath(iopath)
+      path = IOStreams.path(archive_directory)
+      path.relative? ? iopath.directory.join(archive_directory) : path
    end
 
    # Validates job_class is a Rocket Job
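For context, a sketch of registering a DirmonEntry that the rewritten, IOStreams-based scan above would process; the name, pattern, and job class are hypothetical:

    entry = RocketJob::DirmonEntry.create!(
      name:              'Nightly user import',
      pattern:           'files/inbound/*.csv',
      job_class_name:    'UserImportJob',
      archive_directory: 'archive' # relative, so resolved against each file's directory via archive_iopath
    )
    entry.enable!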
data/lib/rocket_job/jobs/dirmon_job.rb
CHANGED
@@ -70,11 +70,18 @@ module RocketJob
      def check_directories
        new_file_names = {}
        DirmonEntry.enabled.each do |entry|
-          entry.each do |
+          entry.each do |iopath|
+            # S3 files are only visible once completely uploaded.
+            if iopath.is_a?(IOStreams::Paths::S3)
+              logger.info("S3 File: #{iopath}. Starting: #{entry.job_class_name}")
+              entry.later(iopath)
+              next
+            end
+
            # BSON Keys cannot contain periods
-            key =
+            key = iopath.to_s.tr('.', '_')
            previous_size = previous_file_names[key]
-            size = check_file(entry,
+            size = check_file(entry, iopath, previous_size)
            new_file_names[key] = size if size
          end
        end
@@ -83,14 +90,14 @@ module RocketJob
 
      # Checks if a file should result in starting a job
      # Returns [Integer] file size, or nil if the file started a job
-      def check_file(entry,
-        size =
+      def check_file(entry, iopath, previous_size)
+        size = iopath.size
        if previous_size && (previous_size == size)
-          logger.info("File stabilized: #{
-          entry.later(
+          logger.info("File stabilized: #{iopath}. Starting: #{entry.job_class_name}")
+          entry.later(iopath)
          nil
        else
-          logger.info("Found file: #{
+          logger.info("Found file: #{iopath}. File size: #{size}")
          # Keep for the next run
          size
        end
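DirmonJob itself runs as a self-perpetuating singleton: each run re-checks every enabled entry and only queues a file once its size is unchanged between two consecutive runs, while S3 paths are queued immediately as shown above. A sketch of starting it; the `check_seconds` poll interval is a field on the job, and the value here is arbitrary:

    RocketJob::Jobs::DirmonJob.create!(check_seconds: 300)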
data/lib/rocket_job/jobs/on_demand_batch_tabular_job.rb
ADDED
@@ -0,0 +1,27 @@
+# Job to dynamically perform ruby code on demand as a Batch,
+# with input and/or output from CSV/JSON or other format supported by Tabular.
+#
+# Notes:
+# - Need to specify `destroy_on_complete: false` to collect output from this job.
+# - `after_code` can be used to automatically download the output of this job to a file on completion.
+#
+# Example: Iterate over all rows in a table:
+#   code = <<-CODE
+#     if user = User.find(row)
+#       user.cleanse_attributes!
+#       user.save(validate: false)
+#     end
+#   CODE
+#   job = RocketJob::Jobs::OnDemandBatchTabularJob.new(code: code, description: 'cleanse users', destroy_on_complete: false)
+#   job.upload("users.csv")
+#   job.save!
+#
+# On completion export the output:
+#   job.download("output.csv")
+module RocketJob
+  module Jobs
+    class OnDemandBatchTabularJob < OnDemandBatchJob
+      include RocketJob::Batch::Tabular
+    end
+  end
+end
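A sketch of the `after_code` approach mentioned in the header comment above, so the output is exported automatically on completion; the exact code string and file names are hypothetical:

    job = RocketJob::Jobs::OnDemandBatchTabularJob.new(
      code:                code,                     # per-row code string, as in the example above
      description:         'cleanse users',
      destroy_on_complete: false,
      after_code:          "download('output.csv')" # hypothetical; runs once the batch completes
    )
    job.upload('users.csv')
    job.save!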
data/lib/rocket_job/sliced/input.rb
CHANGED
@@ -1,15 +1,13 @@
 module RocketJob
   module Sliced
     class Input < Slices
-      def upload(
-
-
-        block ||= -> (io) do
-          iterator = "each_#{stream_mode}".to_sym
-          IOStreams.public_send(iterator, file_name_or_io, encoding: encoding, **args) { |line| io << line }
-        end
-
+      def upload(on_first: nil, &block)
+        # Create indexes before uploading
+        create_indexes
         Writer::Input.collect(self, on_first: on_first, &block)
+      rescue StandardError => exc
+        drop
+        raise(exc)
       end
 
       def upload_mongo_query(criteria, *column_names, &block)
@@ -36,7 +34,7 @@ module RocketJob
          end
        end
 
-
+        upload do |records|
          # Drop down to the mongo driver level to avoid constructing a Model for each document returned
          criteria.klass.collection.find(criteria.selector, options).each do |document|
            records << block.call(document)
@@ -46,8 +44,7 @@ module RocketJob
 
      def upload_arel(arel, *column_names, &block)
        unless block
-          column_names = column_names.collect(&:to_sym)
-          column_names << :id if column_names.size.zero?
+          column_names = column_names.empty? ? [:id] : column_names.collect(&:to_sym)
 
          block =
            if column_names.size == 1
@@ -61,12 +58,11 @@ module RocketJob
          arel = arel.select(selection)
        end
 
-
-          arel.find_each { |model| records << block.call(model) }
-        end
+        upload { |records| arel.find_each { |model| records << block.call(model) } }
      end
 
      def upload_integer_range(start_id, last_id)
+        # Create indexes before uploading
        create_indexes
        count = 0
        while start_id <= last_id
@@ -77,9 +73,13 @@ module RocketJob
          count += 1
        end
        count
+      rescue StandardError => exc
+        drop
+        raise(exc)
      end
 
      def upload_integer_range_in_reverse_order(start_id, last_id)
+        # Create indexes before uploading
        create_indexes
        end_id = last_id
        count = 0
@@ -91,6 +91,9 @@ module RocketJob
          count += 1
        end
        count
+      rescue StandardError => exc
+        drop
+        raise(exc)
      end
 
      # Iterate over each failed record, if any
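The practical effect of the relocated rescue blocks: a failed upload now drops the partially written input collection inside Sliced::Input itself, rather than relying on each caller to clean up. A sketch (the file name is hypothetical):

    begin
      job.upload('corrupt.csv.gz')
    rescue StandardError
      # Sliced::Input#upload already dropped the input collection,
      # so no partial slices remain and the upload can simply be retried.
      raise
    end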
data/lib/rocket_job/sliced/output.rb
CHANGED
@@ -3,93 +3,18 @@ require 'tempfile'
 module RocketJob
   module Sliced
     class Output < Slices
-
-
-      # Returns [Integer] the number of records returned from the collection
-      #
-      # Parameters
-      #   file_name_or_io [String|IO]
-      #     The file_name of the file to write to, or an IO Stream that implements
-      #     #write.
-      #
-      #   options:
-      #     streams [Symbol|Array]
-      #       The formats/streams that be used to convert the data whilst it is
-      #       being written.
-      #       When nil, `file_name_or_io` will be inspected to try and determine what
-      #       streams should be applied.
-      #       Default: nil
-      #
-      #   Any other option that can be supplied to IOStreams::Line::Writer
-      #
-      #   Stream types / extensions supported:
-      #     .zip       Zip File                                  [ :zip ]
-      #     .gz, .gzip GZip File                                 [ :gzip ]
-      #     .enc       File Encrypted using symmetric encryption [ :enc ]
-      #
-      #   When a file is encrypted, it may also be compressed:
-      #     .zip.enc [ :zip, :enc ]
-      #     .gz.enc  [ :gz, :enc ]
-      #
-      # Example: Zip
-      #   # Since csv is not known to RocketJob it is ignored
-      #   job.output.download('myfile.csv.zip')
-      #
-      # Example: Encrypted Zip
-      #   job.output.download('myfile.csv.zip.enc')
-      #
-      # Example: Explicitly set the streams
-      #   job.output.download('myfile.ze', streams: [:zip, :enc])
-      #
-      # Example: Supply custom options
-      #   job.output.download('myfile.csv.enc', streams: [enc: { compress: true }])
-      #
-      # Example: Supply custom options
-      #   job.output.download('myfile.csv.zip', streams: [ zip: { zip_file_name: 'myfile.csv' } ])
-      #
-      # Example: Extract streams from filename but write to a temp file
-      #   t = Tempfile.new('my_project')
-      #   job.output.download(t.to_path, file_name: 'myfile.gz.enc')
-      #
-      # Example: Add a header and/or trailer record to the downloaded file:
-      #   IOStreams.writer('/tmp/file.txt.gz') do |writer|
-      #     writer << "Header\n"
-      #     job.download do |line|
-      #       writer << line
-      #     end
-      #     writer << "Trailer\n"
-      #   end
-      #
-      # Notes:
-      # - The records are returned in '_id' order. Usually this is the order in
-      #   which the records were originally loaded.
-      def download(file_name_or_io = nil, header_line: nil, **args)
-        raise(ArgumentError, 'Either file_name_or_io, or a block must be supplied') unless file_name_or_io || block_given?
+      def download(header_line: nil)
+        raise(ArgumentError, 'Block is mandatory') unless block_given?
 
-
-
-        if block_given?
-          # Write the header line
-          yield(header_line) if header_line
+        # Write the header line
+        yield(header_line) if header_line
 
-
-
-
-
-
-
-          end
-        else
-          IOStreams.line_writer(file_name_or_io, **args) do |io|
-            # Write the header line
-            io << header_line if header_line
-
-            each do |slice|
-              slice.each do |record|
-                record_count += 1
-                io << record
-              end
-            end
+        # Call the supplied block for every record returned
+        record_count = 0
+        each do |slice|
+          slice.each do |record|
+            record_count += 1
+            yield(record)
          end
        end
        record_count
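With the file and stream handling moved up into Batch::IO#download, Sliced::Output#download is now block-only. A sketch of calling it directly; the header value is hypothetical:

    count = job.output.download(header_line: 'name,email') do |record|
      puts record
    end
    puts "#{count} records downloaded"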
data/lib/rocket_job/sliced/writer/input.rb
CHANGED
@@ -12,16 +12,10 @@
      #   Block to call on the first line only, instead of storing in the slice.
      #   Useful for extracting the header row
      #   Default: nil
-      def self.collect(input, **args
+      def self.collect(input, **args)
        writer = new(input, **args)
-
-        input.create_indexes if input.respond_to?(:create_indexes)
-        block.call(writer)
+        yield(writer)
        writer.record_count
-      rescue Exception => exc
-        # Drop input collection when upload fails
-        input.drop
-        raise exc
      ensure
        writer&.close
      end
data/lib/rocket_job/version.rb
CHANGED
data/lib/rocket_job/worker.rb
CHANGED
@@ -119,6 +119,9 @@ module RocketJob
 
      SemanticLogger.named_tagged(job: job.id.to_s) do
        processed = true unless job.rocket_job_work(self, false, current_filter)
+
+        # Return the database connections for this thread back to the connection pool
+        ActiveRecord::Base.clear_active_connections! if defined?(ActiveRecord::Base)
      end
    end
    processed
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rocketjob
 version: !ruby/object:Gem::Version
-  version: 4.
+  version: 4.3.0.beta
 platform: ruby
 authors:
 - Reid Morrison
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-
+date: 2019-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: aasm
@@ -44,14 +44,14 @@ dependencies:
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 1.0.0.beta
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version:
+        version: 1.0.0.beta
- !ruby/object:Gem::Dependency
  name: mongoid
  requirement: !ruby/object:Gem::Requirement
@@ -131,6 +131,7 @@ files:
 - lib/rocket_job/jobs/dirmon_job.rb
 - lib/rocket_job/jobs/housekeeping_job.rb
 - lib/rocket_job/jobs/on_demand_batch_job.rb
+- lib/rocket_job/jobs/on_demand_batch_tabular_job.rb
 - lib/rocket_job/jobs/on_demand_job.rb
 - lib/rocket_job/jobs/performance_job.rb
 - lib/rocket_job/jobs/simple_job.rb
@@ -189,11 +190,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
      version: '2.3'
 required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
-  - - "
+  - - ">"
    - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.6
 signing_key:
 specification_version: 4
 summary: Ruby's missing batch system.