derivative-rodeo 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/errors.rb +1 -1
- data/lib/derivative_rodeo/generators/base_generator.rb +127 -15
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +2 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +24 -6
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +5 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +18 -6
- data/lib/derivative_rodeo/storage_locations/base_location.rb +7 -4
- data/lib/derivative_rodeo/storage_locations/file_location.rb +3 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +5 -2
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +8 -1
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5ea42b2912b6b1eb6981b8a72a3d39e4d6f466e07ad60381ba4880431214b18
|
4
|
+
data.tar.gz: 4e4ca5cdd6ba61898ba13970fbd95c8836c33492c377eb6f258255c8ebf79e67
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ebaacf09b1459e8bb8527ddd3b45b6e84106d335e91cc506a6b9b31aae6c74931c755dc5e084e9a39a6ddb0ca8f1b0f85ee21e461cac445edd3eea45384b1b5
|
7
|
+
data.tar.gz: e11195b3b7169a1f7e2f5df821c681512af93123fc272d3b1dd35d9c971f15ad58a86b4ab1cabf38b463c77d5531114af7ca36eb9197275e46549788a4ec6c1f
|
@@ -11,11 +11,21 @@ module DerivativeRodeo
|
|
11
11
|
##
|
12
12
|
# The Base Generator defines the interface and common methods.
|
13
13
|
#
|
14
|
+
# Fundamentally, they are about ensuring the files end up at the specified location, based on
|
15
|
+
# the given:
|
16
|
+
#
|
17
|
+
# - {#input_uris}
|
18
|
+
# - {#output_location_template}
|
19
|
+
# - {#preprocessed_location_template}
|
20
|
+
#
|
14
21
|
# In extending a BaseGenerator you:
|
15
22
|
#
|
16
23
|
# - must assign an {.output_extension}
|
17
24
|
# - must implement a {#build_step} method
|
18
25
|
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
26
|
+
#
|
27
|
+
# {#generated_files} is "where the magic happens"
|
28
|
+
# rubocop:disable Metrics/ClassLength
|
19
29
|
class BaseGenerator
|
20
30
|
##
|
21
31
|
# @!group Class Attributes
|
@@ -26,9 +36,27 @@ module DerivativeRodeo
|
|
26
36
|
class_attribute :output_extension
|
27
37
|
# @!endgroup Class Attributes
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
39
|
+
##
|
40
|
+
# @!group Attributes
|
41
|
+
#
|
42
|
+
# The "original" files that we'll be processing (via {#generated_files})
|
43
|
+
# @return [Array<String>]
|
44
|
+
attr_reader :input_uris
|
45
|
+
|
46
|
+
##
|
47
|
+
# The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
|
48
|
+
# @return [String]
|
49
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
50
|
+
attr_reader :output_location_template
|
51
|
+
|
52
|
+
##
|
53
|
+
# The template that defines where we might find existing processed files for the given
|
54
|
+
# {#input_uris} (via {#generated_files})
|
55
|
+
#
|
56
|
+
# @return [String, NilClass]
|
57
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
58
|
+
attr_reader :preprocessed_location_template
|
59
|
+
# @!endgroup Attributes
|
32
60
|
|
33
61
|
##
|
34
62
|
# @param input_uris [Array<String>]
|
@@ -78,14 +106,28 @@ module DerivativeRodeo
|
|
78
106
|
##
|
79
107
|
# @api public
|
80
108
|
#
|
109
|
+
# Based on the {#input_uris} ensure that we have files at the given output location (as
|
110
|
+
# derived from the {#output_location_template}). We ensure that by:
|
111
|
+
#
|
112
|
+
# - Checking if a file already exists at the output location
|
113
|
+
# - Copying a preprocessed file to the output location if a preprocessed file exists
|
114
|
+
# - Generating the file based on the input location
|
115
|
+
#
|
116
|
+
# @note This is the method where the magic happens!
|
117
|
+
#
|
81
118
|
# @return [Array<StorageLocations::BaseLocation>]
|
82
119
|
#
|
83
120
|
# @see #build_step
|
84
121
|
# @see #with_each_requisite_location_and_tmp_file_path
|
122
|
+
# rubocop:disable Metrics/MethodLength
|
85
123
|
def generated_files
|
86
124
|
# TODO: Examples please
|
87
125
|
return @generated_files if defined?(@generated_files)
|
88
126
|
|
127
|
+
logger.info("Starting #{self.class}#generated_files with " \
|
128
|
+
"input_uris: #{input_uris.inspect}, " \
|
129
|
+
"output_location_template: #{output_location_template.inspect}, and " \
|
130
|
+
"preprocessed_location_template: #{preprocessed_location_template.inspect}.")
|
89
131
|
# As much as I would like to use map or returned values; given the implementations it's
|
90
132
|
# better to explicitly require that; reducing downstream implementation headaches.
|
91
133
|
#
|
@@ -97,15 +139,20 @@ module DerivativeRodeo
|
|
97
139
|
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
98
140
|
# "file:///Users/jfriesen/.profile"
|
99
141
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
100
|
-
|
101
|
-
@generated_files << if
|
102
|
-
|
142
|
+
output_location = destination(input_location)
|
143
|
+
@generated_files << if output_location.exist?
|
144
|
+
output_location
|
103
145
|
else
|
104
|
-
|
146
|
+
log_message = "#{self.class}#generated_files :: " \
|
147
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
148
|
+
"Generating output_location file_uri #{output_location.file_uri} via build_step."
|
149
|
+
logger.info(log_message)
|
150
|
+
build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
|
105
151
|
end
|
106
152
|
end
|
107
153
|
@generated_files
|
108
154
|
end
|
155
|
+
# rubocop:enable Metrics/MethodLength
|
109
156
|
|
110
157
|
##
|
111
158
|
# @return [Array<String>]
|
@@ -157,9 +204,13 @@ module DerivativeRodeo
|
|
157
204
|
end
|
158
205
|
|
159
206
|
##
|
160
|
-
# Returns the location
|
161
|
-
# destination might exist or might not. In the case
|
162
|
-
#
|
207
|
+
# Returns the output location for the given :input_location. The file at the location
|
208
|
+
# destination might exist or might not. In the case where we have a
|
209
|
+
# {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
|
210
|
+
# and if it exists there copy it to the target output location.
|
211
|
+
#
|
212
|
+
# In the case of non-existence, then the {#build_step} will create
|
213
|
+
# the file.
|
163
214
|
#
|
164
215
|
# @param input_location [StorageLocations::BaseLocation]
|
165
216
|
#
|
@@ -167,20 +218,80 @@ module DerivativeRodeo
|
|
167
218
|
# {#output_location_template} or {#preprocessed_location_template}.
|
168
219
|
#
|
169
220
|
# @see [StorageLocations::BaseLocation#exist?]
|
221
|
+
# rubocop:disable Metrics/MethodLength
|
222
|
+
# rubocop:disable Metrics/AbcSize
|
170
223
|
def destination(input_location)
|
171
|
-
output_location = input_location.derived_file_from(template: output_location_template)
|
224
|
+
output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
|
225
|
+
|
226
|
+
if output_location.exist?
|
227
|
+
log_message = "#{self.class}#destination :: " \
|
228
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
229
|
+
"Found output_location file_uri #{output_location.file_uri}."
|
230
|
+
logger.info(log_message)
|
172
231
|
|
173
|
-
|
174
|
-
|
232
|
+
return output_location
|
233
|
+
end
|
234
|
+
|
235
|
+
unless preprocessed_location_template
|
236
|
+
log_message = "#{self.class}#destination :: " \
|
237
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
238
|
+
"No preprocessed_location_template provided " \
|
239
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
|
240
|
+
"moving on to generation via #{self.class}#build_step."
|
241
|
+
logger.info(log_message)
|
242
|
+
|
243
|
+
return output_location
|
244
|
+
end
|
245
|
+
|
246
|
+
template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
|
175
247
|
|
176
|
-
preprocessed_location = input_location.derived_file_from(template:
|
248
|
+
preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
|
177
249
|
# We only want the location if it exists
|
178
|
-
|
250
|
+
if preprocessed_location.exist?
|
251
|
+
log_message = "#{self.class}#destination :: " \
|
252
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
253
|
+
"Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
|
254
|
+
logger.info(log_message)
|
255
|
+
|
256
|
+
# Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
|
257
|
+
# will also write some logs.
|
258
|
+
output_location = CopyGenerator.new(
|
259
|
+
input_uris: [preprocessed_location.file_uri],
|
260
|
+
output_location_template: output_location.file_uri
|
261
|
+
).generated_files.first
|
262
|
+
|
263
|
+
return output_location
|
264
|
+
end
|
265
|
+
|
266
|
+
log_message = "#{self.class}#destination :: " \
|
267
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
268
|
+
"No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
|
269
|
+
"nor output_location file_uri #{output_location.file_uri}; " \
|
270
|
+
"moving on to generation via #{self.class}#build_step."
|
271
|
+
logger.info(log_message)
|
179
272
|
|
180
273
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
181
274
|
# that the #build_step knows where to write the file.
|
182
275
|
output_location
|
183
276
|
end
|
277
|
+
# rubocop:enable Metrics/AbcSize
|
278
|
+
# rubocop:enable Metrics/MethodLength
|
279
|
+
|
280
|
+
##
|
281
|
+
# Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
|
282
|
+
# the input location. Most often, however, the given :preprocessed_location_template is
|
283
|
+
# adequate and would be the typical returned value.
|
284
|
+
#
|
285
|
+
# @param input_location [StorageLocations::BaseLocation]
|
286
|
+
# @param preprocessed_location_template [String]
|
287
|
+
#
|
288
|
+
# @return [String]
|
289
|
+
#
|
290
|
+
# rubocop:disable Lint/UnusedMethodArgument
|
291
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
292
|
+
preprocessed_location_template
|
293
|
+
end
|
294
|
+
# rubocop:enable Lint/UnusedMethodArgument
|
184
295
|
|
185
296
|
##
|
186
297
|
# A bit of indirection to create a common interface for running a shell command.
|
@@ -196,6 +307,7 @@ module DerivativeRodeo
|
|
196
307
|
result
|
197
308
|
end
|
198
309
|
end
|
310
|
+
# rubocop:enable Metrics/ClassLength
|
199
311
|
end
|
200
312
|
end
|
201
313
|
|
@@ -5,7 +5,8 @@ module DerivativeRodeo
|
|
5
5
|
##
|
6
6
|
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
7
|
class MonochromeGenerator < BaseGenerator
|
8
|
-
#
|
8
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
|
9
|
+
# magic ".mono" suffix
|
9
10
|
self.output_extension = 'mono.tiff'
|
10
11
|
|
11
12
|
##
|
@@ -66,7 +66,7 @@ module DerivativeRodeo
|
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
|
-
# @note The
|
69
|
+
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
72
|
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
@@ -76,12 +76,14 @@ module DerivativeRodeo
|
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# @api public
|
84
84
|
#
|
85
|
+
# @param splitter [#call]
|
86
|
+
#
|
85
87
|
# Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
|
86
88
|
# the page number.
|
87
89
|
#
|
@@ -98,22 +100,27 @@ module DerivativeRodeo
|
|
98
100
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
101
|
#
|
100
102
|
# rubocop:disable Metrics/MethodLength
|
101
|
-
|
103
|
+
# rubocop:disable Metrics/AbcSize
|
104
|
+
def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
|
102
105
|
input_files.each do |input_location|
|
103
106
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
107
|
existing_locations = existing_page_locations(input_location: input_location)
|
105
108
|
|
106
109
|
if existing_locations.count.positive?
|
107
|
-
existing_locations.
|
110
|
+
logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
|
111
|
+
existing_locations.each_with_index do |location, index|
|
112
|
+
logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
|
108
113
|
yield(location, location.file_path)
|
109
114
|
end
|
110
115
|
else
|
116
|
+
logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
|
111
117
|
# We're going to need to create the files and "cast" them to locations.
|
112
|
-
|
118
|
+
splitter.call(
|
113
119
|
input_tmp_file_path,
|
114
120
|
image_extension: output_extension,
|
115
121
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
116
|
-
).
|
122
|
+
).each_with_index do |image_path, index|
|
123
|
+
logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
|
117
124
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
125
|
yield(image_location, image_path)
|
119
126
|
end
|
@@ -121,7 +128,18 @@ module DerivativeRodeo
|
|
121
128
|
end
|
122
129
|
end
|
123
130
|
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
124
132
|
# rubocop:enable Metrics/MethodLength
|
133
|
+
|
134
|
+
##
|
135
|
+
# We're working with an input location with a filename basename of "123.ARCHIVAL--page-1.tiff"
|
136
|
+
# The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
|
137
|
+
# filename (e.g. "123.ARCHIVAL.pdf")
|
138
|
+
#
|
139
|
+
# And since the template doesn't have a concept of page number, we introduce this kludge.
|
140
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
141
|
+
File.join(File.dirname(preprocessed_location_template), input_location.file_name)
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
end
|
@@ -33,6 +33,11 @@ module DerivativeRodeo
|
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
34
|
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
|
+
rescue => e
|
37
|
+
message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
|
38
|
+
exception = RuntimeError.new(message)
|
39
|
+
exception.set_backtrace(e.backtrace)
|
40
|
+
raise exception
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
@@ -46,11 +46,12 @@ module DerivativeRodeo
|
|
46
46
|
# from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
|
47
47
|
# template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
|
48
48
|
# => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
|
49
|
-
def self.call(from_uri:, template:, adapter: nil, separator: "/")
|
50
|
-
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
|
49
|
+
def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
|
50
|
+
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
# rubocop:disable Metrics/MethodLength
|
54
|
+
def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
|
54
55
|
@from_uri = from_uri
|
55
56
|
@template = template
|
56
57
|
@adapter = adapter
|
@@ -60,12 +61,23 @@ module DerivativeRodeo
|
|
60
61
|
@from_scheme, @path = uri.split("://")
|
61
62
|
@parts = @path.split(separator)
|
62
63
|
@dir_parts = @parts[0..-2]
|
63
|
-
@filename = @parts[-1]
|
64
|
-
@basename = File.basename(@filename, ".*")
|
65
|
-
|
64
|
+
@filename = options[:filename] || @parts[-1]
|
65
|
+
@basename = options[:basename] || File.basename(@filename, ".*")
|
66
|
+
|
67
|
+
##
|
68
|
+
# HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
|
69
|
+
# forward the `.mono` suffix as that makes it hard to find the preprocessed word
|
70
|
+
# coordinates, alto, and plain text. This ensures files derived from the .mono are findable
|
71
|
+
# in IIIF Print.
|
72
|
+
@basename = @basename.sub(/\.mono\z/, '')
|
73
|
+
@extension = options[:extension] || File.extname(@filename)
|
74
|
+
# When a generator specifies "same" we want to use the given file's extension
|
75
|
+
@extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
|
76
|
+
@extension = ".#{@extension}" unless @extension.start_with?(".")
|
66
77
|
|
67
78
|
@template_without_query, @template_query = template.split("?")
|
68
79
|
end
|
80
|
+
# rubocop:enable Metrics/MethodLength
|
69
81
|
|
70
82
|
def call
|
71
83
|
to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
|
@@ -46,6 +46,8 @@ module DerivativeRodeo
|
|
46
46
|
delegate :config, to: DerivativeRodeo
|
47
47
|
end
|
48
48
|
|
49
|
+
delegate :logger, to: DerivativeRodeo
|
50
|
+
|
49
51
|
##
|
50
52
|
# @param location_name [String]
|
51
53
|
#
|
@@ -101,10 +103,10 @@ module DerivativeRodeo
|
|
101
103
|
# @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
|
102
104
|
#
|
103
105
|
# @return [StorageLocations::BaseLocation]
|
104
|
-
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
|
106
|
+
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
|
105
107
|
# HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
|
106
108
|
from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
|
107
|
-
to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
|
109
|
+
to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
|
108
110
|
new(to_uri)
|
109
111
|
end
|
110
112
|
|
@@ -203,9 +205,9 @@ module DerivativeRodeo
|
|
203
205
|
# @return [StorageLocations::BaseLocation]
|
204
206
|
#
|
205
207
|
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
206
|
-
def derived_file_from(template
|
208
|
+
def derived_file_from(template:, **options)
|
207
209
|
klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
|
208
|
-
klass.build(from_uri: file_path, template: template)
|
210
|
+
klass.build(from_uri: file_path, template: template, **options)
|
209
211
|
end
|
210
212
|
|
211
213
|
##
|
@@ -231,6 +233,7 @@ module DerivativeRodeo
|
|
231
233
|
def with_new_extension(extension)
|
232
234
|
return file_path if extension == StorageLocations::SAME
|
233
235
|
|
236
|
+
# NOTE: May need to revisit this
|
234
237
|
"#{file_path.split('.')[0]}.#{extension}"
|
235
238
|
end
|
236
239
|
|
@@ -42,6 +42,9 @@ module DerivativeRodeo
|
|
42
42
|
#
|
43
43
|
# @param tail_regexp [Regexp]
|
44
44
|
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
logger.debug("#{self.class}##{__method__} searching for matching files in " \
|
46
|
+
"file_dir: #{file_dir.inspect} " \
|
47
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
45
48
|
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
49
|
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
50
|
end
|
@@ -73,7 +73,10 @@ module DerivativeRodeo
|
|
73
73
|
def matching_locations_in_file_dir(tail_regexp:)
|
74
74
|
uri = URI.parse(file_uri)
|
75
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
76
|
-
|
76
|
+
logger.debug("#{self.class}##{__method__} searching for matching files for " \
|
77
|
+
"scheme_and_host: #{scheme_and_host.inspect} " \
|
78
|
+
"file_dir: #{file_dir.inspect} " \
|
79
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
77
80
|
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
81
|
if tail_regexp.match(object.key)
|
79
82
|
template = File.join(scheme_and_host, object.key)
|
@@ -120,7 +123,7 @@ module DerivativeRodeo
|
|
120
123
|
def bucket_name
|
121
124
|
@bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
|
122
125
|
rescue StandardError
|
123
|
-
raise Errors::BucketMissingError
|
126
|
+
raise Errors::BucketMissingError.new(file_uri: file_uri)
|
124
127
|
end
|
125
128
|
|
126
129
|
# @see .use_actual_s3_bucket
|
@@ -13,6 +13,8 @@ module DerivativeRodeo
|
|
13
13
|
# Location to download and upload files to Sqs
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
|
+
#
|
17
|
+
# rubocop:disable Metrics/ClassLength
|
16
18
|
class SqsLocation < BaseLocation
|
17
19
|
##
|
18
20
|
# @!group Class Attributes
|
@@ -85,11 +87,14 @@ module DerivativeRodeo
|
|
85
87
|
batch = []
|
86
88
|
Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
|
87
89
|
batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
|
88
|
-
if (i % batch_size).zero?
|
90
|
+
if (i + 1 % batch_size).zero?
|
89
91
|
add_batch(messages: batch)
|
90
92
|
batch = []
|
91
93
|
end
|
92
94
|
end
|
95
|
+
|
96
|
+
# Ensure we're flushing the batched up queue as part of completing the write.
|
97
|
+
add_batch(messages: batch) if batch.present?
|
93
98
|
file_uri
|
94
99
|
end
|
95
100
|
|
@@ -181,6 +186,7 @@ module DerivativeRodeo
|
|
181
186
|
end
|
182
187
|
|
183
188
|
def output_json(uri)
|
189
|
+
# TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
|
184
190
|
key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
|
185
191
|
{ key => [template] }.to_json
|
186
192
|
end
|
@@ -201,5 +207,6 @@ module DerivativeRodeo
|
|
201
207
|
@file_uri_parts
|
202
208
|
end
|
203
209
|
end
|
210
|
+
# rubocop:enable Metrics/ClassLength
|
204
211
|
end
|
205
212
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
337
337
|
- !ruby/object:Gem::Version
|
338
338
|
version: '0'
|
339
339
|
requirements: []
|
340
|
-
rubygems_version: 3.
|
340
|
+
rubygems_version: 3.3.7
|
341
341
|
signing_key:
|
342
342
|
specification_version: 4
|
343
343
|
summary: An ETL Ecosystem for Derivative Processing.
|