derivative-rodeo 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/errors.rb +4 -1
- data/lib/derivative_rodeo/generators/base_generator.rb +88 -17
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +24 -6
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +5 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +2 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +2 -2
- data/lib/derivative_rodeo/storage_locations/file_location.rb +4 -1
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +6 -3
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d2dc233be659f66709737ebdebc8ac6a69802f74bce6ac450e5eabb383ea4e54
|
4
|
+
data.tar.gz: 7a72ce66e69f827374b6f8e72767aca12638c71a9e5b2c63be09bbca08c12640
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de5ff8fc29943f5dc52b49c7741ae4a742793439dcca97ea94259d044cc15870dd19ced0911d9305901757e838a8ecb401eb81744c4f0529eef9607534d5dcb8
|
7
|
+
data.tar.gz: bec393ace49d0e6689a07d335017987fa6e6edf9aae632159b87150dc2d4a0720441d64f6c6ce48ceb7c4498df3b6ada40b010cd0d439cb76cb071bd65c2c19d
|
@@ -35,7 +35,7 @@ module DerivativeRodeo
|
|
35
35
|
##
|
36
36
|
# Raised when AWS bucket does not exist or is not accessible by current permissions
|
37
37
|
class BucketMissingError < Error
|
38
|
-
def initialize
|
38
|
+
def initialize(file_uri:)
|
39
39
|
super("Bucket part missing #{file_uri}")
|
40
40
|
end
|
41
41
|
end
|
@@ -43,6 +43,9 @@ module DerivativeRodeo
|
|
43
43
|
##
|
44
44
|
# Raised when trying to write a tmp file that does not exist
|
45
45
|
class FileMissingError < Error
|
46
|
+
def self.with_info(**info)
|
47
|
+
new(info.inspect)
|
48
|
+
end
|
46
49
|
end
|
47
50
|
|
48
51
|
##
|
@@ -11,11 +11,21 @@ module DerivativeRodeo
|
|
11
11
|
##
|
12
12
|
# The Base Generator defines the interface and common methods.
|
13
13
|
#
|
14
|
+
# Fundamentally, they are about ensuring the files end up at the specified location, based on
|
15
|
+
# the given:
|
16
|
+
#
|
17
|
+
# - {#input_uris}
|
18
|
+
# - {#output_location_template}
|
19
|
+
# - {#preprocessed_location_template}
|
20
|
+
#
|
14
21
|
# In extending a BaseGenerator you:
|
15
22
|
#
|
16
23
|
# - must assign an {.output_extension}
|
17
24
|
# - must implement a {#build_step} method
|
18
25
|
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
26
|
+
#
|
27
|
+
# {#generated_files} is "where the magic happens"
|
28
|
+
# rubocop:disable Metrics/ClassLength
|
19
29
|
class BaseGenerator
|
20
30
|
##
|
21
31
|
# @!group Class Attributes
|
@@ -26,9 +36,27 @@ module DerivativeRodeo
|
|
26
36
|
class_attribute :output_extension
|
27
37
|
# @!endgroup Class Attributes
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
39
|
+
##
|
40
|
+
# @!group Attributes
|
41
|
+
#
|
42
|
+
# The "original" files that we'll be processing (via {#generated_files})
|
43
|
+
# @return [Array<String>]
|
44
|
+
attr_reader :input_uris
|
45
|
+
|
46
|
+
##
|
47
|
+
# The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
|
48
|
+
# @return [String]
|
49
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
50
|
+
attr_reader :output_location_template
|
51
|
+
|
52
|
+
##
|
53
|
+
# The template that defines where we might find existing processed files for the given
|
54
|
+
# {#input_uris} (via {#generated_files})
|
55
|
+
#
|
56
|
+
# @return [String, NilClass]
|
57
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
58
|
+
attr_reader :preprocessed_location_template
|
59
|
+
# @!endgroup Attributes
|
32
60
|
|
33
61
|
##
|
34
62
|
# @param input_uris [Array<String>]
|
@@ -78,6 +106,15 @@ module DerivativeRodeo
|
|
78
106
|
##
|
79
107
|
# @api public
|
80
108
|
#
|
109
|
+
# Based on the {#input_uris} ensure that we have files at the given output location (as
|
110
|
+
# derived from the {#output_location_template}). We ensure that by:
|
111
|
+
#
|
112
|
+
# - Checking if a file already exists at the output location
|
113
|
+
# - Copying a preprocessed file to the output location if a preprocessed file exists
|
114
|
+
# - Generating the file based on the input location
|
115
|
+
#
|
116
|
+
# @note This is the method where the magic happens!
|
117
|
+
#
|
81
118
|
# @return [Array<StorageLocations::BaseLocation>]
|
82
119
|
#
|
83
120
|
# @see #build_step
|
@@ -87,6 +124,10 @@ module DerivativeRodeo
|
|
87
124
|
# TODO: Examples please
|
88
125
|
return @generated_files if defined?(@generated_files)
|
89
126
|
|
127
|
+
logger.info("Starting #{self.class}#generated_files with " \
|
128
|
+
"input_uris: #{input_uris.inspect}, " \
|
129
|
+
"output_location_template: #{output_location_template.inspect}, and " \
|
130
|
+
"preprocessed_location_template: #{preprocessed_location_template.inspect}.")
|
90
131
|
# As much as I would like to use map or returned values; given the implementations it's
|
91
132
|
# better to explicitly require that; reducing downstream implementation headaches.
|
92
133
|
#
|
@@ -98,15 +139,15 @@ module DerivativeRodeo
|
|
98
139
|
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
99
140
|
# "file:///Users/jfriesen/.profile"
|
100
141
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
101
|
-
|
102
|
-
@generated_files << if
|
103
|
-
|
142
|
+
output_location = destination(input_location)
|
143
|
+
@generated_files << if output_location.exist?
|
144
|
+
output_location
|
104
145
|
else
|
105
146
|
log_message = "#{self.class}#generated_files :: " \
|
106
147
|
"input_location file_uri #{input_location.file_uri} :: " \
|
107
|
-
"Generating output_location file_uri #{
|
148
|
+
"Generating output_location file_uri #{output_location.file_uri} via build_step."
|
108
149
|
logger.info(log_message)
|
109
|
-
build_step(input_location: input_location, output_location:
|
150
|
+
build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
|
110
151
|
end
|
111
152
|
end
|
112
153
|
@generated_files
|
@@ -163,9 +204,13 @@ module DerivativeRodeo
|
|
163
204
|
end
|
164
205
|
|
165
206
|
##
|
166
|
-
# Returns the location
|
167
|
-
# destination might exist or might not. In the case
|
168
|
-
#
|
207
|
+
# Returns the output location for the given :input_location. The file at the location
|
208
|
+
# destination might exist or might not. In the case where we have a
|
209
|
+
# {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
|
210
|
+
# and if it exists there copy it to the target output location.
|
211
|
+
#
|
212
|
+
# In the case of non-existence, then the {#build_step} will create
|
213
|
+
# the file.
|
169
214
|
#
|
170
215
|
# @param input_location [StorageLocations::BaseLocation]
|
171
216
|
#
|
@@ -191,22 +236,31 @@ module DerivativeRodeo
|
|
191
236
|
log_message = "#{self.class}#destination :: " \
|
192
237
|
"input_location file_uri #{input_location.file_uri} :: " \
|
193
238
|
"No preprocessed_location_template provided " \
|
194
|
-
"nor does a file exist at output_location file_uri #{output_location.file_uri};" \
|
195
|
-
"
|
239
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
|
240
|
+
"moving on to generation via #{self.class}#build_step."
|
196
241
|
logger.info(log_message)
|
197
242
|
|
198
243
|
return output_location
|
199
244
|
end
|
200
245
|
|
201
|
-
|
246
|
+
template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
|
247
|
+
|
248
|
+
preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
|
202
249
|
# We only want the location if it exists
|
203
|
-
if preprocessed_location
|
250
|
+
if preprocessed_location.exist?
|
204
251
|
log_message = "#{self.class}#destination :: " \
|
205
252
|
"input_location file_uri #{input_location.file_uri} :: " \
|
206
|
-
"Found preprocessed_location file_uri #{
|
253
|
+
"Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
|
207
254
|
logger.info(log_message)
|
208
255
|
|
209
|
-
|
256
|
+
# Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
|
257
|
+
# will also write some logs.
|
258
|
+
output_location = CopyGenerator.new(
|
259
|
+
input_uris: [preprocessed_location.file_uri],
|
260
|
+
output_location_template: output_location.file_uri
|
261
|
+
).generated_files.first
|
262
|
+
|
263
|
+
return output_location
|
210
264
|
end
|
211
265
|
|
212
266
|
log_message = "#{self.class}#destination :: " \
|
@@ -223,6 +277,22 @@ module DerivativeRodeo
|
|
223
277
|
# rubocop:enable Metrics/AbcSize
|
224
278
|
# rubocop:enable Metrics/MethodLength
|
225
279
|
|
280
|
+
##
|
281
|
+
# Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
|
282
|
+
# the input location. Most often, however, the given :preprocessed_location_template is
|
283
|
+
# adequate and would be the typical returned value.
|
284
|
+
#
|
285
|
+
# @param input_location [StorageLocations::BaseLocation]
|
286
|
+
# @param preprocessed_location_template [String]
|
287
|
+
#
|
288
|
+
# @return [String]
|
289
|
+
#
|
290
|
+
# rubocop:disable Lint/UnusedMethodArgument
|
291
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
292
|
+
preprocessed_location_template
|
293
|
+
end
|
294
|
+
# rubocop:enable Lint/UnusedMethodArgument
|
295
|
+
|
226
296
|
##
|
227
297
|
# A bit of indirection to create a common interface for running a shell command.
|
228
298
|
#
|
@@ -237,6 +307,7 @@ module DerivativeRodeo
|
|
237
307
|
result
|
238
308
|
end
|
239
309
|
end
|
310
|
+
# rubocop:enable Metrics/ClassLength
|
240
311
|
end
|
241
312
|
end
|
242
313
|
|
@@ -66,7 +66,7 @@ module DerivativeRodeo
|
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
|
-
# @note The
|
69
|
+
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
72
|
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
@@ -76,12 +76,14 @@ module DerivativeRodeo
|
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# @api public
|
84
84
|
#
|
85
|
+
# @param splitter [#call]
|
86
|
+
#
|
85
87
|
# Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
|
86
88
|
# the page number.
|
87
89
|
#
|
@@ -98,22 +100,27 @@ module DerivativeRodeo
|
|
98
100
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
101
|
#
|
100
102
|
# rubocop:disable Metrics/MethodLength
|
101
|
-
|
103
|
+
# rubocop:disable Metrics/AbcSize
|
104
|
+
def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
|
102
105
|
input_files.each do |input_location|
|
103
106
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
107
|
existing_locations = existing_page_locations(input_location: input_location)
|
105
108
|
|
106
109
|
if existing_locations.count.positive?
|
107
|
-
existing_locations.
|
110
|
+
logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
|
111
|
+
existing_locations.each_with_index do |location, index|
|
112
|
+
logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
|
108
113
|
yield(location, location.file_path)
|
109
114
|
end
|
110
115
|
else
|
116
|
+
logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
|
111
117
|
# We're going to need to create the files and "cast" them to locations.
|
112
|
-
|
118
|
+
splitter.call(
|
113
119
|
input_tmp_file_path,
|
114
120
|
image_extension: output_extension,
|
115
121
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
116
|
-
).
|
122
|
+
).each_with_index do |image_path, index|
|
123
|
+
logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
|
117
124
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
125
|
yield(image_location, image_path)
|
119
126
|
end
|
@@ -121,7 +128,18 @@ module DerivativeRodeo
|
|
121
128
|
end
|
122
129
|
end
|
123
130
|
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
124
132
|
# rubocop:enable Metrics/MethodLength
|
133
|
+
|
134
|
+
##
|
135
|
+
# We're working with an input location with a filename basename of "123.ARCHIVAL--page-1.tiff"
|
136
|
+
# The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
|
137
|
+
# filename (e.g. "123.ARCHIVAL.pdf")
|
138
|
+
#
|
139
|
+
# And since the template doesn't have a concept of page number, we introduce this kludge.
|
140
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
141
|
+
File.join(File.dirname(preprocessed_location_template), input_location.file_name)
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
end
|
@@ -33,6 +33,11 @@ module DerivativeRodeo
|
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
34
|
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
|
+
rescue => e
|
37
|
+
message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
|
38
|
+
exception = RuntimeError.new(message)
|
39
|
+
exception.set_backtrace(e.backtrace)
|
40
|
+
raise exception
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
@@ -23,8 +23,8 @@ module DerivativeRodeo
|
|
23
23
|
delegate :config, to: DerivativeRodeo
|
24
24
|
|
25
25
|
def with_existing_tmp_path(&block)
|
26
|
-
with_tmp_path(lambda { |
|
27
|
-
raise Errors::FileMissingError unless exist
|
26
|
+
with_tmp_path(lambda { |file_path, tmp_file_path, exist|
|
27
|
+
raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
|
28
28
|
|
29
29
|
response = get(file_uri)
|
30
30
|
File.open(tmp_file_path, 'wb') { |fp| fp.write(response.body) }
|
@@ -16,7 +16,7 @@ module DerivativeRodeo
|
|
16
16
|
|
17
17
|
def with_existing_tmp_path(&block)
|
18
18
|
with_tmp_path(lambda { |file_path, tmp_file_path, exist|
|
19
|
-
raise Errors::FileMissingError unless exist
|
19
|
+
raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
|
20
20
|
|
21
21
|
FileUtils.cp(file_path, tmp_file_path)
|
22
22
|
}, &block)
|
@@ -42,6 +42,9 @@ module DerivativeRodeo
|
|
42
42
|
#
|
43
43
|
# @param tail_regexp [Regexp]
|
44
44
|
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
logger.debug("#{self.class}##{__method__} searching for matching files in " \
|
46
|
+
"file_dir: #{file_dir.inspect} " \
|
47
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
45
48
|
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
49
|
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
50
|
end
|
@@ -47,7 +47,7 @@ module DerivativeRodeo
|
|
47
47
|
# @return [String] the path to the tmp file
|
48
48
|
def with_existing_tmp_path(&block)
|
49
49
|
with_tmp_path(lambda { |file_path, tmp_file_path, exist|
|
50
|
-
raise Errors::FileMissingError unless exist
|
50
|
+
raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
|
51
51
|
obj = bucket.object(file_path)
|
52
52
|
obj.download_file(tmp_file_path)
|
53
53
|
}, &block)
|
@@ -73,7 +73,10 @@ module DerivativeRodeo
|
|
73
73
|
def matching_locations_in_file_dir(tail_regexp:)
|
74
74
|
uri = URI.parse(file_uri)
|
75
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
76
|
-
|
76
|
+
logger.debug("#{self.class}##{__method__} searching for matching files for " \
|
77
|
+
"scheme_and_host: #{scheme_and_host.inspect} " \
|
78
|
+
"file_dir: #{file_dir.inspect} " \
|
79
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
77
80
|
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
81
|
if tail_regexp.match(object.key)
|
79
82
|
template = File.join(scheme_and_host, object.key)
|
@@ -120,7 +123,7 @@ module DerivativeRodeo
|
|
120
123
|
def bucket_name
|
121
124
|
@bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
|
122
125
|
rescue StandardError
|
123
|
-
raise Errors::BucketMissingError
|
126
|
+
raise Errors::BucketMissingError.new(file_uri: file_uri)
|
124
127
|
end
|
125
128
|
|
126
129
|
# @see .use_actual_s3_bucket
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-11-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|