derivative-rodeo 0.3.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/base_generator.rb +46 -6
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +2 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +17 -15
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +18 -6
- data/lib/derivative_rodeo/storage_locations/base_location.rb +9 -8
- data/lib/derivative_rodeo/storage_locations/file_location.rb +10 -2
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +7 -19
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +8 -1
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
|
4
|
+
data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
|
7
|
+
data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
|
@@ -39,7 +39,6 @@ module DerivativeRodeo
|
|
39
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
40
40
|
# :preprocessed_location_template.
|
41
41
|
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
-
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
43
42
|
@input_uris = Array.wrap(input_uris)
|
44
43
|
@output_location_template = output_location_template
|
45
44
|
@preprocessed_location_template = preprocessed_location_template
|
@@ -83,6 +82,7 @@ module DerivativeRodeo
|
|
83
82
|
#
|
84
83
|
# @see #build_step
|
85
84
|
# @see #with_each_requisite_location_and_tmp_file_path
|
85
|
+
# rubocop:disable Metrics/MethodLength
|
86
86
|
def generated_files
|
87
87
|
# TODO: Examples please
|
88
88
|
return @generated_files if defined?(@generated_files)
|
@@ -102,11 +102,16 @@ module DerivativeRodeo
|
|
102
102
|
@generated_files << if generated_file.exist?
|
103
103
|
generated_file
|
104
104
|
else
|
105
|
+
log_message = "#{self.class}#generated_files :: " \
|
106
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
107
|
+
"Generating output_location file_uri #{generated_file.file_uri} via build_step."
|
108
|
+
logger.info(log_message)
|
105
109
|
build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
|
106
110
|
end
|
107
111
|
end
|
108
112
|
@generated_files
|
109
113
|
end
|
114
|
+
# rubocop:enable Metrics/MethodLength
|
110
115
|
|
111
116
|
##
|
112
117
|
# @return [Array<String>]
|
@@ -168,20 +173,55 @@ module DerivativeRodeo
|
|
168
173
|
# {#output_location_template} or {#preprocessed_location_template}.
|
169
174
|
#
|
170
175
|
# @see [StorageLocations::BaseLocation#exist?]
|
176
|
+
# rubocop:disable Metrics/MethodLength
|
177
|
+
# rubocop:disable Metrics/AbcSize
|
171
178
|
def destination(input_location)
|
172
|
-
output_location = input_location.derived_file_from(template: output_location_template)
|
179
|
+
output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
|
173
180
|
|
174
|
-
|
175
|
-
|
181
|
+
if output_location.exist?
|
182
|
+
log_message = "#{self.class}#destination :: " \
|
183
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
184
|
+
"Found output_location file_uri #{output_location.file_uri}."
|
185
|
+
logger.info(log_message)
|
176
186
|
|
177
|
-
|
187
|
+
return output_location
|
188
|
+
end
|
189
|
+
|
190
|
+
unless preprocessed_location_template
|
191
|
+
log_message = "#{self.class}#destination :: " \
|
192
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
193
|
+
"No preprocessed_location_template provided " \
|
194
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri};" \
|
195
|
+
" moving on to generation via #{self.class}#build_step."
|
196
|
+
logger.info(log_message)
|
197
|
+
|
198
|
+
return output_location
|
199
|
+
end
|
200
|
+
|
201
|
+
preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
|
178
202
|
# We only want the location if it exists
|
179
|
-
|
203
|
+
if preprocessed_location&.exist?
|
204
|
+
log_message = "#{self.class}#destination :: " \
|
205
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
206
|
+
"Found preprocessed_location file_uri #{preprocessed_location.file_uri}." # log the location this branch actually returns
|
207
|
+
logger.info(log_message)
|
208
|
+
|
209
|
+
return preprocessed_location
|
210
|
+
end
|
211
|
+
|
212
|
+
log_message = "#{self.class}#destination :: " \
|
213
|
+
"input_location file_uri #{input_location.file_uri} :: " \
|
214
|
+
"No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
|
215
|
+
"nor output_location file_uri #{output_location.file_uri}; " \
|
216
|
+
"moving on to generation via #{self.class}#build_step."
|
217
|
+
logger.info(log_message)
|
180
218
|
|
181
219
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
182
220
|
# that the #build_step knows where to write the file.
|
183
221
|
output_location
|
184
222
|
end
|
223
|
+
# rubocop:enable Metrics/AbcSize
|
224
|
+
# rubocop:enable Metrics/MethodLength
|
185
225
|
|
186
226
|
##
|
187
227
|
# A bit of indirection to create a common interface for running a shell command.
|
@@ -5,7 +5,8 @@ module DerivativeRodeo
|
|
5
5
|
##
|
6
6
|
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
7
|
class MonochromeGenerator < BaseGenerator
|
8
|
-
#
|
8
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
|
9
|
+
# magic ".mono" suffix
|
9
10
|
self.output_extension = 'mono.tiff'
|
10
11
|
|
11
12
|
##
|
@@ -52,7 +52,7 @@ module DerivativeRodeo
|
|
52
52
|
# @see #existing_page_locations
|
53
53
|
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
54
|
def image_file_basename_template(basename:)
|
55
|
-
"#{basename}
|
55
|
+
"#{basename}--page-%d.#{output_extension}"
|
56
56
|
end
|
57
57
|
|
58
58
|
##
|
@@ -62,21 +62,21 @@ module DerivativeRodeo
|
|
62
62
|
# @param input_location [StorageLocations::BaseLocation]
|
63
63
|
#
|
64
64
|
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
-
# with :
|
65
|
+
# with :tail_regexp.
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
69
|
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
|
-
|
72
|
+
tail_regexp = %r{#{Regexp.escape(input_location.file_basename)}--page-\d+\.#{Regexp.escape(output_extension)}$} # escape literals so filename metacharacters are not read as regexp syntax
|
73
73
|
|
74
|
-
output_locations = input_location.derived_file_from(template: output_location_template).
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
75
75
|
return output_locations if output_locations.count.positive?
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp) # same lookup as for the output template, against the preprocessed location
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
@@ -101,20 +101,22 @@ module DerivativeRodeo
|
|
101
101
|
def with_each_requisite_location_and_tmp_file_path
|
102
102
|
input_files.each do |input_location|
|
103
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
|
-
|
105
|
-
generated_files = existing_page_locations(input_location: input_location)
|
104
|
+
existing_locations = existing_page_locations(input_location: input_location)
|
106
105
|
|
107
|
-
if
|
108
|
-
|
106
|
+
if existing_locations.count.positive?
|
107
|
+
existing_locations.each do |location|
|
108
|
+
yield(location, location.file_path)
|
109
|
+
end
|
110
|
+
else
|
111
|
+
# We're going to need to create the files and "cast" them to locations.
|
112
|
+
Services::PdfSplitter.call(
|
109
113
|
input_tmp_file_path,
|
110
114
|
image_extension: output_extension,
|
111
115
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
-
)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
117
|
-
yield(image_location, image_path)
|
116
|
+
).each do |image_path|
|
117
|
+
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
|
+
yield(image_location, image_path)
|
119
|
+
end
|
118
120
|
end
|
119
121
|
end
|
120
122
|
end
|
@@ -46,11 +46,12 @@ module DerivativeRodeo
|
|
46
46
|
# from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
|
47
47
|
# template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
|
48
48
|
# => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
|
49
|
-
def self.call(from_uri:, template:, adapter: nil, separator: "/")
|
50
|
-
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
|
49
|
+
def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
|
50
|
+
new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
# rubocop:disable Metrics/MethodLength
|
54
|
+
def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
|
54
55
|
@from_uri = from_uri
|
55
56
|
@template = template
|
56
57
|
@adapter = adapter
|
@@ -60,12 +61,23 @@ module DerivativeRodeo
|
|
60
61
|
@from_scheme, @path = uri.split("://")
|
61
62
|
@parts = @path.split(separator)
|
62
63
|
@dir_parts = @parts[0..-2]
|
63
|
-
@filename = @parts[-1]
|
64
|
-
@basename = File.basename(@filename, ".*")
|
65
|
-
|
64
|
+
@filename = options[:filename] || @parts[-1]
|
65
|
+
@basename = options[:basename] || File.basename(@filename, ".*")
|
66
|
+
|
67
|
+
##
|
68
|
+
# HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
|
69
|
+
# forward the `.mono` suffix as that makes it hard to find the preprocessed word
|
70
|
+
# coordinates, alto, and plain text. This ensures files derived from the .mono are findable
|
71
|
+
# in IIIF Print.
|
72
|
+
@basename = @basename.sub(/\.mono\z/, '')
|
73
|
+
@extension = options[:extension] || File.extname(@filename)
|
74
|
+
# When a generator specifies "same" we want to use the given file's extension
|
75
|
+
@extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
|
76
|
+
@extension = ".#{@extension}" unless @extension.start_with?(".")
|
66
77
|
|
67
78
|
@template_without_query, @template_query = template.split("?")
|
68
79
|
end
|
80
|
+
# rubocop:enable Metrics/MethodLength
|
69
81
|
|
70
82
|
def call
|
71
83
|
to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
|
@@ -101,10 +101,10 @@ module DerivativeRodeo
|
|
101
101
|
# @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
|
102
102
|
#
|
103
103
|
# @return [StorageLocations::BaseLocation]
|
104
|
-
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
|
104
|
+
def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
|
105
105
|
# HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
|
106
106
|
from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
|
107
|
-
to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
|
107
|
+
to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
|
108
108
|
new(to_uri)
|
109
109
|
end
|
110
110
|
|
@@ -203,25 +203,25 @@ module DerivativeRodeo
|
|
203
203
|
# @return [StorageLocations::BaseLocation]
|
204
204
|
#
|
205
205
|
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
206
|
-
def derived_file_from(template
|
206
|
+
def derived_file_from(template:, **options)
|
207
207
|
klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
|
208
|
-
klass.build(from_uri: file_path, template: template)
|
208
|
+
klass.build(from_uri: file_path, template: template, **options)
|
209
209
|
end
|
210
210
|
|
211
211
|
##
|
212
212
|
# When you have a known location and want to check for files that are within that location,
|
213
|
-
# use the {#
|
213
|
+
# use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
|
214
214
|
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
215
|
#
|
216
216
|
# We can use the :file_path as the prefix and the given :tail_regexp as the suffix for a "fully
|
217
217
|
# qualified" Dir.glob type search.
|
218
218
|
#
|
219
|
-
# @param
|
219
|
+
# @param tail_regexp [Regexp]
|
220
220
|
#
|
221
221
|
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
222
|
# array when there are none.
|
223
|
-
def
|
224
|
-
raise NotImplementedError, "#{self.class}#
|
223
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
224
|
+
raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
|
225
225
|
end
|
226
226
|
|
227
227
|
##
|
@@ -231,6 +231,7 @@ module DerivativeRodeo
|
|
231
231
|
def with_new_extension(extension)
|
232
232
|
return file_path if extension == StorageLocations::SAME
|
233
233
|
|
234
|
+
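# NOTE: May need to revisit this; splitting on the first "." truncates any file_path that contains a dot before its final extension.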
|
234
235
|
"#{file_path.split('.')[0]}.#{extension}"
|
235
236
|
end
|
236
237
|
|
@@ -35,8 +35,16 @@ module DerivativeRodeo
|
|
35
35
|
file_uri
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
##
|
39
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
|
40
|
+
#
|
41
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
42
|
+
#
|
43
|
+
# @param tail_regexp [Regexp]
|
44
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
|
+
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
end
|
42
50
|
end
|
@@ -65,31 +65,19 @@ module DerivativeRodeo
|
|
65
65
|
##
|
66
66
|
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
67
|
#
|
68
|
-
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
69
|
-
# use the components of the file_path to fake that behavior.
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
70
69
|
#
|
71
70
|
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# NOTE: Should we be storing our files as such? The pattern we need is
|
76
|
-
# :parent_identifier/:file_set_identifier/files There are probably cases where a work has
|
77
|
-
# more than one PDF (that we intend to split); we don't want to trample on those split files
|
78
|
-
# and miscolate two PDFs.
|
79
|
-
#
|
80
|
-
# file_path = "s3://blah/1234/hello-world/hello-world.pdf
|
81
|
-
globname = File.join(file_dir, tail_glob)
|
82
|
-
regexp = %r{#{File.extname(globname)}$}
|
83
|
-
|
84
|
-
# NOTE: We're making some informed guesses, needing to include the fully qualified template
|
85
|
-
# based on both the key of the item in the bucket as well as the bucket's host.
|
71
|
+
#
|
72
|
+
# @param tail_regexp [Regexp]
|
73
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
86
74
|
uri = URI.parse(file_uri)
|
87
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
88
76
|
|
89
|
-
bucket.objects(prefix:
|
90
|
-
if object.key
|
77
|
+
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
|
+
if tail_regexp.match(object.key)
|
91
79
|
template = File.join(scheme_and_host, object.key)
|
92
|
-
derived_file_from(template: template)
|
80
|
+
accumulator << derived_file_from(template: template)
|
93
81
|
end
|
94
82
|
end
|
95
83
|
end
|
@@ -13,6 +13,8 @@ module DerivativeRodeo
|
|
13
13
|
# Location to download and upload files to Sqs
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
|
+
#
|
17
|
+
# rubocop:disable Metrics/ClassLength
|
16
18
|
class SqsLocation < BaseLocation
|
17
19
|
##
|
18
20
|
# @!group Class Attributes
|
@@ -85,11 +87,14 @@ module DerivativeRodeo
|
|
85
87
|
batch = []
|
86
88
|
Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
|
87
89
|
batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
|
88
|
-
if (i % batch_size).zero?
|
90
|
+
if (i + 1 % batch_size).zero?
|
89
91
|
add_batch(messages: batch)
|
90
92
|
batch = []
|
91
93
|
end
|
92
94
|
end
|
95
|
+
|
96
|
+
# Ensure we're flushing the batched up queue as part of completing the write.
|
97
|
+
add_batch(messages: batch) if batch.present?
|
93
98
|
file_uri
|
94
99
|
end
|
95
100
|
|
@@ -181,6 +186,7 @@ module DerivativeRodeo
|
|
181
186
|
end
|
182
187
|
|
183
188
|
def output_json(uri)
|
189
|
+
# TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
|
184
190
|
key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
|
185
191
|
{ key => [template] }.to_json
|
186
192
|
end
|
@@ -201,5 +207,6 @@ module DerivativeRodeo
|
|
201
207
|
@file_uri_parts
|
202
208
|
end
|
203
209
|
end
|
210
|
+
# rubocop:enable Metrics/ClassLength
|
204
211
|
end
|
205
212
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-07-10 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|