derivative-rodeo 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/errors.rb +1 -1
- data/lib/derivative_rodeo/generators/base_generator.rb +127 -15
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +2 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +24 -6
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +5 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +18 -6
- data/lib/derivative_rodeo/storage_locations/base_location.rb +7 -4
- data/lib/derivative_rodeo/storage_locations/file_location.rb +3 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +5 -2
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +8 -1
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5ea42b2912b6b1eb6981b8a72a3d39e4d6f466e07ad60381ba4880431214b18
|
4
|
+
data.tar.gz: 4e4ca5cdd6ba61898ba13970fbd95c8836c33492c377eb6f258255c8ebf79e67
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ebaacf09b1459e8bb8527ddd3b45b6e84106d335e91cc506a6b9b31aae6c74931c755dc5e084e9a39a6ddb0ca8f1b0f85ee21e461cac445edd3eea45384b1b5
|
7
|
+
data.tar.gz: e11195b3b7169a1f7e2f5df821c681512af93123fc272d3b1dd35d9c971f15ad58a86b4ab1cabf38b463c77d5531114af7ca36eb9197275e46549788a4ec6c1f
|
@@ -11,11 +11,21 @@ module DerivativeRodeo
|
|
11
11
|
##
|
12
12
|
# The Base Generator defines the interface and common methods.
|
13
13
|
#
|
14
|
+
# Fundamentally, they are about ensuring the files end up at the specified location, based on
|
15
|
+
# the given:
|
16
|
+
#
|
17
|
+
# - {#input_uris}
|
18
|
+
# - {#output_location_template}
|
19
|
+
# - {#preprocessed_location_template}
|
20
|
+
#
|
14
21
|
# In extending a BaseGenerator you:
|
15
22
|
#
|
16
23
|
# - must assign an {.output_extension}
|
17
24
|
# - must implement a {#build_step} method
|
18
25
|
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
26
|
+
#
|
27
|
+
# {#generated_files} is "where the magic happens"
|
28
|
+
# rubocop:disable Metrics/ClassLength
|
19
29
|
class BaseGenerator
|
20
30
|
##
|
21
31
|
# @!group Class Attributes
|
@@ -26,9 +36,27 @@ module DerivativeRodeo
|
|
26
36
|
class_attribute :output_extension
|
27
37
|
# @!endgroup Class Attributes
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
39
|
+
##
|
40
|
+
# @!group Attributes
|
41
|
+
#
|
42
|
+
# The "original" files that we'll be processing (via {#generated_files})
|
43
|
+
# @return [Array<String>]
|
44
|
+
attr_reader :input_uris
|
45
|
+
|
46
|
+
##
|
47
|
+
# The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
|
48
|
+
# @return [String]
|
49
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
50
|
+
attr_reader :output_location_template
|
51
|
+
|
52
|
+
##
|
53
|
+
# The template that defines where we might find existing processed files for the given
|
54
|
+
# {#input_uris} (via {#generated_files})
|
55
|
+
#
|
56
|
+
# @return [String, NilClass]
|
57
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
58
|
+
attr_reader :preprocessed_location_template
|
59
|
+
# @!endgroup Attributes
|
32
60
|
|
33
61
|
##
|
34
62
|
# @param input_uris [Array<String>]
|
@@ -78,14 +106,28 @@ module DerivativeRodeo
|
|
78
106
|
##
|
79
107
|
# @api public
|
80
108
|
#
|
109
|
+
# Based on the {#input_uris} ensure that we have files at the given output location (as
|
110
|
+
# derived from the {#output_location_template}). We ensure that by:
|
111
|
+
#
|
112
|
+
# - Checking if a file already exists at the output location
|
113
|
+
# - Copying a preprocessed file to the output location if a preprocessed file exists
|
114
|
+
# - Generating the file based on the input location
|
115
|
+
#
|
116
|
+
# @note This is the method where the magic happens!
|
117
|
+
#
|
81
118
|
# @return [Array<StorageLocations::BaseLocation>]
|
82
119
|
#
|
83
120
|
# @see #build_step
|
84
121
|
# @see #with_each_requisite_location_and_tmp_file_path
|
122
|
+
# rubocop:disable Metrics/MethodLength
|
85
123
|
def generated_files
|
86
124
|
# TODO: Examples please
|
87
125
|
return @generated_files if defined?(@generated_files)
|
88
126
|
|
127
|
+
logger.info("Starting #{self.class}#generated_files with " \
|
128
|
+
"input_uris: #{input_uris.inspect}, " \
|
129
|
+
"output_location_template: #{output_location_template.inspect}, and " \
|
130
|
+
"preprocessed_location_template: #{preprocessed_location_template.inspect}.")
|
89
131
|
# As much as I would like to use map or returned values; given the implementations it's
|
90
132
|
# better to explicitly require that; reducing downstream implementation headaches.
|
91
133
|
#
|
@@ -97,15 +139,20 @@ module DerivativeRodeo
|
|
97
139
|
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
98
140
|
# "file:///Users/jfriesen/.profile"
|
99
141
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
100
|
-
|
101
|
-
@generated_files << if
|
102
|
-
|
142
|
+
output_location = destination(input_location)
|
143
|
+
@generated_files << if output_location.exist?
|
144
|
+
output_location
|
103
145
|
else
|
104
|
-
|
146
|
+
log_message = "#{self.class}#generated_files :: " \
|
147
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
148
|
+
"Generating output_location file_uri #{output_location.file_uri} via build_step."
|
149
|
+
logger.info(log_message)
|
150
|
+
build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
|
105
151
|
end
|
106
152
|
end
|
107
153
|
@generated_files
|
108
154
|
end
|
155
|
+
# rubocop:enable Metrics/MethodLength
|
109
156
|
|
110
157
|
##
|
111
158
|
# @return [Array<String>]
|
@@ -157,9 +204,13 @@ module DerivativeRodeo
|
|
157
204
|
end
|
158
205
|
|
159
206
|
##
|
160
|
-
# Returns the location
|
161
|
-
# destination might exist or might not. In the case
|
162
|
-
#
|
207
|
+
# Returns the output location for the given :input_location. The file at the location
|
208
|
+
# destination might exist or might not. In the case where we have a
|
209
|
+
# {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
|
210
|
+
# and if it exists there copy it to the target output location.
|
211
|
+
#
|
212
|
+
# In the case of non-existence, then the {#build_step} will create
|
213
|
+
# the file.
|
163
214
|
#
|
164
215
|
# @param input_location [StorageLocations::BaseLocation]
|
165
216
|
#
|
@@ -167,20 +218,80 @@ module DerivativeRodeo
|
|
167
218
|
# {#output_location_template} or {#preprocessed_location_template}.
|
168
219
|
#
|
169
220
|
# @see [StorageLocations::BaseLocation#exist?]
|
221
|
+
# rubocop:disable Metrics/MethodLength
|
222
|
+
# rubocop:disable Metrics/AbcSize
|
170
223
|
def destination(input_location)
|
171
|
-
output_location = input_location.derived_file_from(template: output_location_template)
|
224
|
+
output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
|
225
|
+
|
226
|
+
if output_location.exist?
|
227
|
+
log_message = "#{self.class}#destination :: " \
|
228
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
229
|
+
"Found output_location file_uri #{output_location.file_uri}."
|
230
|
+
logger.info(log_message)
|
172
231
|
|
173
|
-
|
174
|
-
|
232
|
+
return output_location
|
233
|
+
end
|
234
|
+
|
235
|
+
unless preprocessed_location_template
|
236
|
+
log_message = "#{self.class}#destination :: " \
|
237
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
238
|
+
"No preprocessed_location_template provided " \
|
239
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
|
240
|
+
"moving on to generation via #{self.class}#build_step."
|
241
|
+
logger.info(log_message)
|
242
|
+
|
243
|
+
return output_location
|
244
|
+
end
|
245
|
+
|
246
|
+
template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
|
175
247
|
|
176
|
-
preprocessed_location = input_location.derived_file_from(template:
|
248
|
+
preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
|
177
249
|
# We only want the location if it exists
|
178
|
-
|
250
|
+
if preprocessed_location.exist?
|
251
|
+
log_message = "#{self.class}#destination :: " \
|
252
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
253
|
+
"Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
|
254
|
+
logger.info(log_message)
|
255
|
+
|
256
|
+
# Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
|
257
|
+
# will also write some logs.
|
258
|
+
output_location = CopyGenerator.new(
|
259
|
+
input_uris: [preprocessed_location.file_uri],
|
260
|
+
output_location_template: output_location.file_uri
|
261
|
+
).generated_files.first
|
262
|
+
|
263
|
+
return output_location
|
264
|
+
end
|
265
|
+
|
266
|
+
log_message = "#{self.class}#destination :: " \
|
267
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
268
|
+
"No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
|
269
|
+
"nor output_location file_uri #{output_location.file_uri}; " \
|
270
|
+
"moving on to generation via #{self.class}#build_step."
|
271
|
+
logger.info(log_message)
|
179
272
|
|
180
273
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
181
274
|
# that the #build_step knows where to write the file.
|
182
275
|
output_location
|
183
276
|
end
|
277
|
+
# rubocop:enable Metrics/AbcSize
|
278
|
+
# rubocop:enable Metrics/MethodLength
|
279
|
+
|
280
|
+
##
|
281
|
+
# Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
|
282
|
+
# the input location. Most often, however, the given :preprocessed_location_template is
|
283
|
+
# adequate and would be the typical returned value.
|
284
|
+
#
|
285
|
+
# @param input_location [StorageLocations::BaseLocation]
|
286
|
+
# @param preprocessed_location_template [String]
|
287
|
+
#
|
288
|
+
# @return [String]
|
289
|
+
#
|
290
|
+
# rubocop:disable Lint/UnusedMethodArgument
|
291
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
292
|
+
preprocessed_location_template
|
293
|
+
end
|
294
|
+
# rubocop:enable Lint/UnusedMethodArgument
|
184
295
|
|
185
296
|
##
|
186
297
|
# A bit of indirection to create a common interface for running a shell command.
|
@@ -196,6 +307,7 @@ module DerivativeRodeo
|
|
196
307
|
result
|
197
308
|
end
|
198
309
|
end
|
310
|
+
# rubocop:enable Metrics/ClassLength
|
199
311
|
end
|
200
312
|
end
|
201
313
|
|
@@ -5,7 +5,8 @@ module DerivativeRodeo
|
|
5
5
|
##
|
6
6
|
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
7
|
class MonochromeGenerator < BaseGenerator
|
8
|
-
#
|
8
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
|
9
|
+
# magic ".mono" suffix
|
9
10
|
self.output_extension = 'mono.tiff'
|
10
11
|
|
11
12
|
##
|
@@ -66,7 +66,7 @@ module DerivativeRodeo
|
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
|
-
# @note The
|
69
|
+
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
72
|
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
@@ -76,12 +76,14 @@ module DerivativeRodeo
|
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# @api public
|
84
84
|
#
|
85
|
+
# @param splitter [#call]
|
86
|
+
#
|
85
87
|
# Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
|
86
88
|
# the page number.
|
87
89
|
#
|
@@ -98,22 +100,27 @@ module DerivativeRodeo
|
|
98
100
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
101
|
#
|
100
102
|
# rubocop:disable Metrics/MethodLength
|
101
|
-
|
103
|
+
# rubocop:disable Metrics/AbcSize
|
104
|
+
def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
|
102
105
|
input_files.each do |input_location|
|
103
106
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
107
|
existing_locations = existing_page_locations(input_location: input_location)
|
105
108
|
|
106
109
|
if existing_locations.count.positive?
|
107
|
-
existing_locations.
|
110
|
+
logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
|
111
|
+
existing_locations.each_with_index do |location, index|
|
112
|
+
logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
|
108
113
|
yield(location, location.file_path)
|
109
114
|
end
|
110
115
|
else
|
116
|
+
logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
|
111
117
|
# We're going to need to create the files and "cast" them to locations.
|
112
|
-
|
118
|
+
splitter.call(
|
113
119
|
input_tmp_file_path,
|
114
120
|
image_extension: output_extension,
|
115
121
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
116
|
-
).
|
122
|
+
).each_with_index do |image_path, index|
|
123
|
+
logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
|
117
124
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
125
|
yield(image_location, image_path)
|
119
126
|
end
|
@@ -121,7 +128,18 @@ module DerivativeRodeo
|
|
121
128
|
end
|
122
129
|
end
|
123
130
|
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
124
132
|
# rubocop:enable Metrics/MethodLength
|
133
|
+
|
134
|
+
##
|
135
|
+
# We're working with an input location whose file name is like "123.ARCHIVAL--page-1.tiff".
|
136
|
+
# The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
|
137
|
+
# filename (e.g. "123.ARCHIVAL.pdf")
|
138
|
+
#
|
139
|
+
# And since the template doesn't have a concept of page number, we introduce this kludge.
|
140
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
141
|
+
File.join(File.dirname(preprocessed_location_template), input_location.file_name)
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
end
|
@@ -33,6 +33,11 @@ module DerivativeRodeo
|
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
34
|
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
|
+
rescue => e
|
37
|
+
message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
|
38
|
+
exception = RuntimeError.new(message)
|
39
|
+
exception.set_backtrace(e.backtrace)
|
40
|
+
raise exception
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
@@ -46,11 +46,12 @@ module DerivativeRodeo
|
|
46
46
|
# from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
|
47
47
|
# template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
|
48
48
|
# => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
|
49
|
-
def self.call(from_uri:, template:, adapter: nil, separator: "/")
|
50
|
-
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
|
49
|
+
def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
|
50
|
+
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
# rubocop:disable Metrics/MethodLength
|
54
|
+
def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
|
54
55
|
@from_uri = from_uri
|
55
56
|
@template = template
|
56
57
|
@adapter = adapter
|
@@ -60,12 +61,23 @@ module DerivativeRodeo
|
|
60
61
|
@from_scheme, @path = uri.split("://")
|
61
62
|
@parts = @path.split(separator)
|
62
63
|
@dir_parts = @parts[0..-2]
|
63
|
-
@filename = @parts[-1]
|
64
|
-
@basename = File.basename(@filename, ".*")
|
65
|
-
|
64
|
+
@filename = options[:filename] || @parts[-1]
|
65
|
+
@basename = options[:basename] || File.basename(@filename, ".*")
|
66
|
+
|
67
|
+
##
|
68
|
+
# HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
|
69
|
+
# forward the `.mono` suffix as that makes it hard to find the preprocessed word
|
70
|
+
# coordinates, alto, and plain text. This ensures files derived from the .mono are findable
|
71
|
+
# in IIIF Print.
|
72
|
+
@basename = @basename.sub(/\.mono\z/, '')
|
73
|
+
@extension = options[:extension] || File.extname(@filename)
|
74
|
+
# When a generator specifies "same" we want to use the given file's extension
|
75
|
+
@extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
|
76
|
+
@extension = ".#{@extension}" unless @extension.start_with?(".")
|
66
77
|
|
67
78
|
@template_without_query, @template_query = template.split("?")
|
68
79
|
end
|
80
|
+
# rubocop:enable Metrics/MethodLength
|
69
81
|
|
70
82
|
def call
|
71
83
|
to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
|
@@ -46,6 +46,8 @@ module DerivativeRodeo
|
|
46
46
|
delegate :config, to: DerivativeRodeo
|
47
47
|
end
|
48
48
|
|
49
|
+
delegate :logger, to: DerivativeRodeo
|
50
|
+
|
49
51
|
##
|
50
52
|
# @param location_name [String]
|
51
53
|
#
|
@@ -101,10 +103,10 @@ module DerivativeRodeo
|
|
101
103
|
# @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
|
102
104
|
#
|
103
105
|
# @return [StorageLocations::BaseLocation]
|
104
|
-
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
|
106
|
+
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
|
105
107
|
# HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
|
106
108
|
from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
|
107
|
-
to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
|
109
|
+
to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
|
108
110
|
new(to_uri)
|
109
111
|
end
|
110
112
|
|
@@ -203,9 +205,9 @@ module DerivativeRodeo
|
|
203
205
|
# @return [StorageLocations::BaseLocation]
|
204
206
|
#
|
205
207
|
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
206
|
-
def derived_file_from(template
|
208
|
+
def derived_file_from(template:, **options)
|
207
209
|
klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
|
208
|
-
klass.build(from_uri: file_path, template: template)
|
210
|
+
klass.build(from_uri: file_path, template: template, **options)
|
209
211
|
end
|
210
212
|
|
211
213
|
##
|
@@ -231,6 +233,7 @@ module DerivativeRodeo
|
|
231
233
|
def with_new_extension(extension)
|
232
234
|
return file_path if extension == StorageLocations::SAME
|
233
235
|
|
236
|
+
# NOTE: May need to revisit this
|
234
237
|
"#{file_path.split('.')[0]}.#{extension}"
|
235
238
|
end
|
236
239
|
|
@@ -42,6 +42,9 @@ module DerivativeRodeo
|
|
42
42
|
#
|
43
43
|
# @param tail_regexp [Regexp]
|
44
44
|
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
logger.debug("#{self.class}##{__method__} searching for matching files in " \
|
46
|
+
"file_dir: #{file_dir.inspect} " \
|
47
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
45
48
|
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
49
|
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
50
|
end
|
@@ -73,7 +73,10 @@ module DerivativeRodeo
|
|
73
73
|
def matching_locations_in_file_dir(tail_regexp:)
|
74
74
|
uri = URI.parse(file_uri)
|
75
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
76
|
-
|
76
|
+
logger.debug("#{self.class}##{__method__} searching for matching files for " \
|
77
|
+
"scheme_and_host: #{scheme_and_host.inspect} " \
|
78
|
+
"file_dir: #{file_dir.inspect} " \
|
79
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
77
80
|
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
81
|
if tail_regexp.match(object.key)
|
79
82
|
template = File.join(scheme_and_host, object.key)
|
@@ -120,7 +123,7 @@ module DerivativeRodeo
|
|
120
123
|
def bucket_name
|
121
124
|
@bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
|
122
125
|
rescue StandardError
|
123
|
-
raise Errors::BucketMissingError
|
126
|
+
raise Errors::BucketMissingError.new(file_uri: file_uri)
|
124
127
|
end
|
125
128
|
|
126
129
|
# @see .use_actual_s3_bucket
|
@@ -13,6 +13,8 @@ module DerivativeRodeo
|
|
13
13
|
# Location to download and upload files to Sqs
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
|
+
#
|
17
|
+
# rubocop:disable Metrics/ClassLength
|
16
18
|
class SqsLocation < BaseLocation
|
17
19
|
##
|
18
20
|
# @!group Class Attributes
|
@@ -85,11 +87,14 @@ module DerivativeRodeo
|
|
85
87
|
batch = []
|
86
88
|
Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
|
87
89
|
batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
|
88
|
-
if (i % batch_size).zero?
|
90
|
+
if (i + 1 % batch_size).zero?
|
89
91
|
add_batch(messages: batch)
|
90
92
|
batch = []
|
91
93
|
end
|
92
94
|
end
|
95
|
+
|
96
|
+
# Ensure we're flushing the batched up queue as part of completing the write.
|
97
|
+
add_batch(messages: batch) if batch.present?
|
93
98
|
file_uri
|
94
99
|
end
|
95
100
|
|
@@ -181,6 +186,7 @@ module DerivativeRodeo
|
|
181
186
|
end
|
182
187
|
|
183
188
|
def output_json(uri)
|
189
|
+
# TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
|
184
190
|
key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
|
185
191
|
{ key => [template] }.to_json
|
186
192
|
end
|
@@ -201,5 +207,6 @@ module DerivativeRodeo
|
|
201
207
|
@file_uri_parts
|
202
208
|
end
|
203
209
|
end
|
210
|
+
# rubocop:enable Metrics/ClassLength
|
204
211
|
end
|
205
212
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
337
337
|
- !ruby/object:Gem::Version
|
338
338
|
version: '0'
|
339
339
|
requirements: []
|
340
|
-
rubygems_version: 3.
|
340
|
+
rubygems_version: 3.3.7
|
341
341
|
signing_key:
|
342
342
|
specification_version: 4
|
343
343
|
summary: An ETL Ecosystem for Derivative Processing.
|