derivative-rodeo 0.3.0 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/base_generator.rb +46 -6
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +2 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +17 -15
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +18 -6
- data/lib/derivative_rodeo/storage_locations/base_location.rb +9 -8
- data/lib/derivative_rodeo/storage_locations/file_location.rb +10 -2
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +7 -19
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +8 -1
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
|
4
|
+
data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
|
7
|
+
data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
|
@@ -39,7 +39,6 @@ module DerivativeRodeo
|
|
39
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
40
40
|
# :preprocessed_location_template.
|
41
41
|
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
-
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
43
42
|
@input_uris = Array.wrap(input_uris)
|
44
43
|
@output_location_template = output_location_template
|
45
44
|
@preprocessed_location_template = preprocessed_location_template
|
@@ -83,6 +82,7 @@ module DerivativeRodeo
|
|
83
82
|
#
|
84
83
|
# @see #build_step
|
85
84
|
# @see #with_each_requisite_location_and_tmp_file_path
|
85
|
+
# rubocop:disable Metrics/MethodLength
|
86
86
|
def generated_files
|
87
87
|
# TODO: Examples please
|
88
88
|
return @generated_files if defined?(@generated_files)
|
@@ -102,11 +102,16 @@ module DerivativeRodeo
|
|
102
102
|
@generated_files << if generated_file.exist?
|
103
103
|
generated_file
|
104
104
|
else
|
105
|
+
log_message = "#{self.class}#generated_files :: " \
|
106
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
107
|
+
"Generating output_location file_uri #{generated_file.file_uri} via build_step."
|
108
|
+
logger.info(log_message)
|
105
109
|
build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
|
106
110
|
end
|
107
111
|
end
|
108
112
|
@generated_files
|
109
113
|
end
|
114
|
+
# rubocop:enable Metrics/MethodLength
|
110
115
|
|
111
116
|
##
|
112
117
|
# @return [Array<String>]
|
@@ -168,20 +173,55 @@ module DerivativeRodeo
|
|
168
173
|
# {#output_location_template} or {#preprocessed_location_template}.
|
169
174
|
#
|
170
175
|
# @see [StorageLocations::BaseLocation#exist?]
|
176
|
+
# rubocop:disable Metrics/MethodLength
|
177
|
+
# rubocop:disable Metrics/AbcSize
|
171
178
|
def destination(input_location)
|
172
|
-
output_location = input_location.derived_file_from(template: output_location_template)
|
179
|
+
output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
|
173
180
|
|
174
|
-
|
175
|
-
|
181
|
+
if output_location.exist?
|
182
|
+
log_message = "#{self.class}#destination :: " \
|
183
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
184
|
+
"Found output_location file_uri #{output_location.file_uri}."
|
185
|
+
logger.info(log_message)
|
176
186
|
|
177
|
-
|
187
|
+
return output_location
|
188
|
+
end
|
189
|
+
|
190
|
+
unless preprocessed_location_template
|
191
|
+
log_message = "#{self.class}#destination :: " \
|
192
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
193
|
+
"No preprocessed_location_template provided " \
|
194
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri};" \
|
195
|
+
" moving on to generation via #{self.class}#build_step."
|
196
|
+
logger.info(log_message)
|
197
|
+
|
198
|
+
return output_location
|
199
|
+
end
|
200
|
+
|
201
|
+
preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
|
178
202
|
# We only want the location if it exists
|
179
|
-
|
203
|
+
if preprocessed_location&.exist?
|
204
|
+
log_message = "#{self.class}#destination :: " \
|
205
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
206
|
+
"Found preprocessed_location file_uri #{output_location.file_uri}."
|
207
|
+
logger.info(log_message)
|
208
|
+
|
209
|
+
return preprocessed_location
|
210
|
+
end
|
211
|
+
|
212
|
+
log_message = "#{self.class}#destination :: " \
|
213
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
214
|
+
"No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
|
215
|
+
"nor output_location file_uri #{output_location.file_uri}; " \
|
216
|
+
"moving on to generation via #{self.class}#build_step."
|
217
|
+
logger.info(log_message)
|
180
218
|
|
181
219
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
182
220
|
# that the #build_step knows where to write the file.
|
183
221
|
output_location
|
184
222
|
end
|
223
|
+
# rubocop:enable Metrics/AbcSize
|
224
|
+
# rubocop:enable Metrics/MethodLength
|
185
225
|
|
186
226
|
##
|
187
227
|
# A bit of indirection to create a common interface for running a shell command.
|
@@ -5,7 +5,8 @@ module DerivativeRodeo
|
|
5
5
|
##
|
6
6
|
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
7
|
class MonochromeGenerator < BaseGenerator
|
8
|
-
#
|
8
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
|
9
|
+
# magic ".mono" suffix
|
9
10
|
self.output_extension = 'mono.tiff'
|
10
11
|
|
11
12
|
##
|
@@ -52,7 +52,7 @@ module DerivativeRodeo
|
|
52
52
|
# @see #existing_page_locations
|
53
53
|
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
54
|
def image_file_basename_template(basename:)
|
55
|
-
"#{basename}
|
55
|
+
"#{basename}--page-%d.#{output_extension}"
|
56
56
|
end
|
57
57
|
|
58
58
|
##
|
@@ -62,21 +62,21 @@ module DerivativeRodeo
|
|
62
62
|
# @param input_location [StorageLocations::BaseLocation]
|
63
63
|
#
|
64
64
|
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
-
# with :
|
65
|
+
# with :tail_regexp.
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
69
|
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
|
-
|
72
|
+
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
73
73
|
|
74
|
-
output_locations = input_location.derived_file_from(template: output_location_template).
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
75
75
|
return output_locations if output_locations.count.positive?
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
@@ -101,20 +101,22 @@ module DerivativeRodeo
|
|
101
101
|
def with_each_requisite_location_and_tmp_file_path
|
102
102
|
input_files.each do |input_location|
|
103
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
|
-
|
105
|
-
generated_files = existing_page_locations(input_location: input_location)
|
104
|
+
existing_locations = existing_page_locations(input_location: input_location)
|
106
105
|
|
107
|
-
if
|
108
|
-
|
106
|
+
if existing_locations.count.positive?
|
107
|
+
existing_locations.each do |location|
|
108
|
+
yield(location, location.file_path)
|
109
|
+
end
|
110
|
+
else
|
111
|
+
# We're going to need to create the files and "cast" them to locations.
|
112
|
+
Services::PdfSplitter.call(
|
109
113
|
input_tmp_file_path,
|
110
114
|
image_extension: output_extension,
|
111
115
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
-
)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
117
|
-
yield(image_location, image_path)
|
116
|
+
).each do |image_path|
|
117
|
+
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
|
+
yield(image_location, image_path)
|
119
|
+
end
|
118
120
|
end
|
119
121
|
end
|
120
122
|
end
|
@@ -46,11 +46,12 @@ module DerivativeRodeo
|
|
46
46
|
# from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
|
47
47
|
# template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
|
48
48
|
# => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
|
49
|
-
def self.call(from_uri:, template:, adapter: nil, separator: "/")
|
50
|
-
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
|
49
|
+
def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
|
50
|
+
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
# rubocop:disable Metrics/MethodLength
|
54
|
+
def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
|
54
55
|
@from_uri = from_uri
|
55
56
|
@template = template
|
56
57
|
@adapter = adapter
|
@@ -60,12 +61,23 @@ module DerivativeRodeo
|
|
60
61
|
@from_scheme, @path = uri.split("://")
|
61
62
|
@parts = @path.split(separator)
|
62
63
|
@dir_parts = @parts[0..-2]
|
63
|
-
@filename = @parts[-1]
|
64
|
-
@basename = File.basename(@filename, ".*")
|
65
|
-
|
64
|
+
@filename = options[:filename] || @parts[-1]
|
65
|
+
@basename = options[:basename] || File.basename(@filename, ".*")
|
66
|
+
|
67
|
+
##
|
68
|
+
# HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
|
69
|
+
# forward the `.mono` suffix as that makes it hard to find the preprocessed word
|
70
|
+
# coordinates, alto, and plain text. This ensures files derived from the .mono are findable
|
71
|
+
# in IIIF Print.
|
72
|
+
@basename = @basename.sub(/\.mono\z/, '')
|
73
|
+
@extension = options[:extension] || File.extname(@filename)
|
74
|
+
# When a generator specifies "same" we want to use the given file's extension
|
75
|
+
@extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
|
76
|
+
@extension = ".#{@extension}" unless @extension.start_with?(".")
|
66
77
|
|
67
78
|
@template_without_query, @template_query = template.split("?")
|
68
79
|
end
|
80
|
+
# rubocop:enable Metrics/MethodLength
|
69
81
|
|
70
82
|
def call
|
71
83
|
to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
|
@@ -101,10 +101,10 @@ module DerivativeRodeo
|
|
101
101
|
# @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
|
102
102
|
#
|
103
103
|
# @return [StorageLocations::BaseLocation]
|
104
|
-
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
|
104
|
+
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
|
105
105
|
# HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
|
106
106
|
from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
|
107
|
-
to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
|
107
|
+
to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
|
108
108
|
new(to_uri)
|
109
109
|
end
|
110
110
|
|
@@ -203,25 +203,25 @@ module DerivativeRodeo
|
|
203
203
|
# @return [StorageLocations::BaseLocation]
|
204
204
|
#
|
205
205
|
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
206
|
-
def derived_file_from(template:)
|
206
|
+
def derived_file_from(template:, **options)
|
207
207
|
klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
|
208
|
-
klass.build(from_uri: file_path, template: template)
|
208
|
+
klass.build(from_uri: file_path, template: template, **options)
|
209
209
|
end
|
210
210
|
|
211
211
|
##
|
212
212
|
# When you have a known location and want to check for files that are within that location,
|
213
|
-
# use the {#
|
213
|
+
# use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
|
214
214
|
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
215
|
#
|
216
216
|
# We can use the :file_path as the prefix and the given :tail_regexp as the suffix for a "fully
|
217
217
|
# qualified" Dir.glob type search.
|
218
218
|
#
|
219
|
-
# @param
|
219
|
+
# @param tail_regexp [Regexp]
|
220
220
|
#
|
221
221
|
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
222
|
# array when there are none.
|
223
|
-
def
|
224
|
-
raise NotImplementedError, "#{self.class}#
|
223
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
224
|
+
raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
|
225
225
|
end
|
226
226
|
|
227
227
|
##
|
@@ -231,6 +231,7 @@ module DerivativeRodeo
|
|
231
231
|
def with_new_extension(extension)
|
232
232
|
return file_path if extension == StorageLocations::SAME
|
233
233
|
|
234
|
+
# NOTE: May need to revisit this
|
234
235
|
"#{file_path.split('.')[0]}.#{extension}"
|
235
236
|
end
|
236
237
|
|
@@ -35,8 +35,16 @@ module DerivativeRodeo
|
|
35
35
|
file_uri
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
##
|
39
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
|
40
|
+
#
|
41
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
42
|
+
#
|
43
|
+
# @param tail_regexp [Regexp]
|
44
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
|
+
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
end
|
42
50
|
end
|
@@ -65,31 +65,19 @@ module DerivativeRodeo
|
|
65
65
|
##
|
66
66
|
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
67
|
#
|
68
|
-
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
69
|
-
# use the components of the file_path to fake that behavior.
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
70
69
|
#
|
71
70
|
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# NOTE: Should we be storing our files as such? The pattern we need is
|
76
|
-
# :parent_identifier/:file_set_identifier/files There are probably cases where a work has
|
77
|
-
# more than one PDF (that we intend to split); we don't want to trample on those split files
|
78
|
-
# and miscolate two PDFs.
|
79
|
-
#
|
80
|
-
# file_path = "s3://blah/1234/hello-world/hello-world.pdf
|
81
|
-
globname = File.join(file_dir, tail_glob)
|
82
|
-
regexp = %r{#{File.extname(globname)}$}
|
83
|
-
|
84
|
-
# NOTE: We're making some informed guesses, needing to include the fully qualified template
|
85
|
-
# based on both the key of the item in the bucket as well as the bucket's host.
|
71
|
+
#
|
72
|
+
# @param tail_regexp [Regexp]
|
73
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
86
74
|
uri = URI.parse(file_uri)
|
87
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
88
76
|
|
89
|
-
bucket.objects(prefix:
|
90
|
-
if object.key
|
77
|
+
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
|
+
if tail_regexp.match(object.key)
|
91
79
|
template = File.join(scheme_and_host, object.key)
|
92
|
-
derived_file_from(template: template)
|
80
|
+
accumulator << derived_file_from(template: template)
|
93
81
|
end
|
94
82
|
end
|
95
83
|
end
|
@@ -13,6 +13,8 @@ module DerivativeRodeo
|
|
13
13
|
# Location to download and upload files to Sqs
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
|
+
#
|
17
|
+
# rubocop:disable Metrics/ClassLength
|
16
18
|
class SqsLocation < BaseLocation
|
17
19
|
##
|
18
20
|
# @!group Class Attributes
|
@@ -85,11 +87,14 @@ module DerivativeRodeo
|
|
85
87
|
batch = []
|
86
88
|
Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
|
87
89
|
batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
|
88
|
-
if (i % batch_size).zero?
|
90
|
+
if (i + 1 % batch_size).zero?
|
89
91
|
add_batch(messages: batch)
|
90
92
|
batch = []
|
91
93
|
end
|
92
94
|
end
|
95
|
+
|
96
|
+
# Ensure we're flushing the batched up queue as part of completing the write.
|
97
|
+
add_batch(messages: batch) if batch.present?
|
93
98
|
file_uri
|
94
99
|
end
|
95
100
|
|
@@ -181,6 +186,7 @@ module DerivativeRodeo
|
|
181
186
|
end
|
182
187
|
|
183
188
|
def output_json(uri)
|
189
|
+
# TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
|
184
190
|
key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
|
185
191
|
{ key => [template] }.to_json
|
186
192
|
end
|
@@ -201,5 +207,6 @@ module DerivativeRodeo
|
|
201
207
|
@file_uri_parts
|
202
208
|
end
|
203
209
|
end
|
210
|
+
# rubocop:enable Metrics/ClassLength
|
204
211
|
end
|
205
212
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-07-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|