derivative-rodeo 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/errors.rb +1 -1
- data/lib/derivative_rodeo/generators/base_generator.rb +88 -17
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +24 -6
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +5 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +2 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +3 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +5 -2
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5ea42b2912b6b1eb6981b8a72a3d39e4d6f466e07ad60381ba4880431214b18
|
4
|
+
data.tar.gz: 4e4ca5cdd6ba61898ba13970fbd95c8836c33492c377eb6f258255c8ebf79e67
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ebaacf09b1459e8bb8527ddd3b45b6e84106d335e91cc506a6b9b31aae6c74931c755dc5e084e9a39a6ddb0ca8f1b0f85ee21e461cac445edd3eea45384b1b5
|
7
|
+
data.tar.gz: e11195b3b7169a1f7e2f5df821c681512af93123fc272d3b1dd35d9c971f15ad58a86b4ab1cabf38b463c77d5531114af7ca36eb9197275e46549788a4ec6c1f
|
@@ -11,11 +11,21 @@ module DerivativeRodeo
|
|
11
11
|
##
|
12
12
|
# The Base Generator defines the interface and common methods.
|
13
13
|
#
|
14
|
+
# Fundamentally, they are about ensuring the files end up at the specified location, based on
|
15
|
+
# the given:
|
16
|
+
#
|
17
|
+
# - {#input_uris}
|
18
|
+
# - {#output_location_template}
|
19
|
+
# - {#preprocessed_location_template}
|
20
|
+
#
|
14
21
|
# In extending a BaseGenerator you:
|
15
22
|
#
|
16
23
|
# - must assign an {.output_extension}
|
17
24
|
# - must implement a {#build_step} method
|
18
25
|
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
26
|
+
#
|
27
|
+
# {#generated_files} is "where the magic happens"
|
28
|
+
# rubocop:disable Metrics/ClassLength
|
19
29
|
class BaseGenerator
|
20
30
|
##
|
21
31
|
# @!group Class Attributes
|
@@ -26,9 +36,27 @@ module DerivativeRodeo
|
|
26
36
|
class_attribute :output_extension
|
27
37
|
# @!endgroup Class Attributes
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
39
|
+
##
|
40
|
+
# @!group Attributes
|
41
|
+
#
|
42
|
+
# The "original" files that we'll be processing (via {#generated_files})
|
43
|
+
# @return [Array<String>]
|
44
|
+
attr_reader :input_uris
|
45
|
+
|
46
|
+
##
|
47
|
+
# The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
|
48
|
+
# @return [String]
|
49
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
50
|
+
attr_reader :output_location_template
|
51
|
+
|
52
|
+
##
|
53
|
+
# The template that defines where we might find existing processed files for the given
|
54
|
+
# {#input_uris} (via {#generated_files})
|
55
|
+
#
|
56
|
+
# @return [String, NilClass]
|
57
|
+
# @see DerivativeRodeo::Services::ConvertUriViaTemplateService
|
58
|
+
attr_reader :preprocessed_location_template
|
59
|
+
# @!endgroup Attributes
|
32
60
|
|
33
61
|
##
|
34
62
|
# @param input_uris [Array<String>]
|
@@ -78,6 +106,15 @@ module DerivativeRodeo
|
|
78
106
|
##
|
79
107
|
# @api public
|
80
108
|
#
|
109
|
+
# Based on the {#input_uris} ensure that we have files at the given output location (as
|
110
|
+
# derived from the {#output_location_template}). We ensure that by:
|
111
|
+
#
|
112
|
+
# - Checking if a file already exists at the output location
|
113
|
+
# - Copying a preprocessed file to the output location if a preprocessed file exists
|
114
|
+
# - Generating the file based on the input location
|
115
|
+
#
|
116
|
+
# @note This is the method where the magic happens!
|
117
|
+
#
|
81
118
|
# @return [Array<StorageLocations::BaseLocation>]
|
82
119
|
#
|
83
120
|
# @see #build_step
|
@@ -87,6 +124,10 @@ module DerivativeRodeo
|
|
87
124
|
# TODO: Examples please
|
88
125
|
return @generated_files if defined?(@generated_files)
|
89
126
|
|
127
|
+
logger.info("Starting #{self.class}#generated_files with " \
|
128
|
+
"input_uris: #{input_uris.inspect}, " \
|
129
|
+
"output_location_template: #{output_location_template.inspect}, and " \
|
130
|
+
"preprocessed_location_template: #{preprocessed_location_template.inspect}.")
|
90
131
|
# As much as I would like to use map or returned values; given the implementations it's
|
91
132
|
# better to explicitly require that; reducing downstream implementation headaches.
|
92
133
|
#
|
@@ -98,15 +139,15 @@ module DerivativeRodeo
|
|
98
139
|
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
99
140
|
# "file:///Users/jfriesen/.profile"
|
100
141
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
101
|
-
|
102
|
-
@generated_files << if
|
103
|
-
|
142
|
+
output_location = destination(input_location)
|
143
|
+
@generated_files << if output_location.exist?
|
144
|
+
output_location
|
104
145
|
else
|
105
146
|
log_message = "#{self.class}#generated_files :: " \
|
106
147
|
"input_location file_uri #{input_location.file_uri} :: " \
|
107
|
-
"Generating output_location file_uri #{
|
148
|
+
"Generating output_location file_uri #{output_location.file_uri} via build_step."
|
108
149
|
logger.info(log_message)
|
109
|
-
build_step(input_location: input_location, output_location:
|
150
|
+
build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
|
110
151
|
end
|
111
152
|
end
|
112
153
|
@generated_files
|
@@ -163,9 +204,13 @@ module DerivativeRodeo
|
|
163
204
|
end
|
164
205
|
|
165
206
|
##
|
166
|
-
# Returns the location
|
167
|
-
# destination might exist or might not. In the case
|
168
|
-
#
|
207
|
+
# Returns the output location for the given :input_location. The file at the location
|
208
|
+
# destination might exist or might not. In the case where we have a
|
209
|
+
# {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
|
210
|
+
# and if it exists there copy it to the target output location.
|
211
|
+
#
|
212
|
+
# In the case of non-existence, then the {#build_step} will create
|
213
|
+
# the file.
|
169
214
|
#
|
170
215
|
# @param input_location [StorageLocations::BaseLocation]
|
171
216
|
#
|
@@ -191,22 +236,31 @@ module DerivativeRodeo
|
|
191
236
|
log_message = "#{self.class}#destination :: " \
|
192
237
|
"input_location file_uri #{input_location.file_uri} :: " \
|
193
238
|
"No preprocessed_location_template provided " \
|
194
|
-
"nor does a file exist at output_location file_uri #{output_location.file_uri};" \
|
195
|
-
"
|
239
|
+
"nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
|
240
|
+
"moving on to generation via #{self.class}#build_step."
|
196
241
|
logger.info(log_message)
|
197
242
|
|
198
243
|
return output_location
|
199
244
|
end
|
200
245
|
|
201
|
-
|
246
|
+
template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
|
247
|
+
|
248
|
+
preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
|
202
249
|
# We only want the location if it exists
|
203
|
-
if preprocessed_location
|
250
|
+
if preprocessed_location.exist?
|
204
251
|
log_message = "#{self.class}#destination :: " \
|
205
252
|
"input_location file_uri #{input_location.file_uri} :: " \
|
206
|
-
"Found preprocessed_location file_uri #{
|
253
|
+
"Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
|
207
254
|
logger.info(log_message)
|
208
255
|
|
209
|
-
|
256
|
+
# Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
|
257
|
+
# will also write some logs.
|
258
|
+
output_location = CopyGenerator.new(
|
259
|
+
input_uris: [preprocessed_location.file_uri],
|
260
|
+
output_location_template: output_location.file_uri
|
261
|
+
).generated_files.first
|
262
|
+
|
263
|
+
return output_location
|
210
264
|
end
|
211
265
|
|
212
266
|
log_message = "#{self.class}#destination :: " \
|
@@ -223,6 +277,22 @@ module DerivativeRodeo
|
|
223
277
|
# rubocop:enable Metrics/AbcSize
|
224
278
|
# rubocop:enable Metrics/MethodLength
|
225
279
|
|
280
|
+
##
|
281
|
+
# Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
|
282
|
+
# the input location. Most often, however, the given :preprocessed_location_template is
|
283
|
+
# adequate and would be the typical returned value.
|
284
|
+
#
|
285
|
+
# @param input_location [StorageLocations::BaseLocation]
|
286
|
+
# @param preprocessed_location_template [String]
|
287
|
+
#
|
288
|
+
# @return [String]
|
289
|
+
#
|
290
|
+
# rubocop:disable Lint/UnusedMethodArgument
|
291
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
292
|
+
preprocessed_location_template
|
293
|
+
end
|
294
|
+
# rubocop:enable Lint/UnusedMethodArgument
|
295
|
+
|
226
296
|
##
|
227
297
|
# A bit of indirection to create a common interface for running a shell command.
|
228
298
|
#
|
@@ -237,6 +307,7 @@ module DerivativeRodeo
|
|
237
307
|
result
|
238
308
|
end
|
239
309
|
end
|
310
|
+
# rubocop:enable Metrics/ClassLength
|
240
311
|
end
|
241
312
|
end
|
242
313
|
|
@@ -66,7 +66,7 @@ module DerivativeRodeo
|
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
|
-
# @note The
|
69
|
+
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
72
|
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
@@ -76,12 +76,14 @@ module DerivativeRodeo
|
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
83
83
|
# @api public
|
84
84
|
#
|
85
|
+
# @param splitter [#call]
|
86
|
+
#
|
85
87
|
# Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
|
86
88
|
# the page number.
|
87
89
|
#
|
@@ -98,22 +100,27 @@ module DerivativeRodeo
|
|
98
100
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
101
|
#
|
100
102
|
# rubocop:disable Metrics/MethodLength
|
101
|
-
|
103
|
+
# rubocop:disable Metrics/AbcSize
|
104
|
+
def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
|
102
105
|
input_files.each do |input_location|
|
103
106
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
107
|
existing_locations = existing_page_locations(input_location: input_location)
|
105
108
|
|
106
109
|
if existing_locations.count.positive?
|
107
|
-
existing_locations.
|
110
|
+
logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
|
111
|
+
existing_locations.each_with_index do |location, index|
|
112
|
+
logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
|
108
113
|
yield(location, location.file_path)
|
109
114
|
end
|
110
115
|
else
|
116
|
+
logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
|
111
117
|
# We're going to need to create the files and "cast" them to locations.
|
112
|
-
|
118
|
+
splitter.call(
|
113
119
|
input_tmp_file_path,
|
114
120
|
image_extension: output_extension,
|
115
121
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
116
|
-
).
|
122
|
+
).each_with_index do |image_path, index|
|
123
|
+
logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
|
117
124
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
125
|
yield(image_location, image_path)
|
119
126
|
end
|
@@ -121,7 +128,18 @@ module DerivativeRodeo
|
|
121
128
|
end
|
122
129
|
end
|
123
130
|
end
|
131
|
+
# rubocop:enable Metrics/AbcSize
|
124
132
|
# rubocop:enable Metrics/MethodLength
|
133
|
+
|
134
|
+
##
|
135
|
+
# We're working with an input location with a filename basename of "123.ARCHIVAL--page-1.tiff"
|
136
|
+
# The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
|
137
|
+
# filename (e.g. "123.ARCHIVAL.pdf")
|
138
|
+
#
|
139
|
+
# And since the template doesn't have a concept of page number, we introduce this kludge.
|
140
|
+
def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
|
141
|
+
File.join(File.dirname(preprocessed_location_template), input_location.file_name)
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
end
|
@@ -33,6 +33,11 @@ module DerivativeRodeo
|
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
34
|
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
|
+
rescue => e
|
37
|
+
message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
|
38
|
+
exception = RuntimeError.new(message)
|
39
|
+
exception.set_backtrace(e.backtrace)
|
40
|
+
raise exception
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
@@ -42,6 +42,9 @@ module DerivativeRodeo
|
|
42
42
|
#
|
43
43
|
# @param tail_regexp [Regexp]
|
44
44
|
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
logger.debug("#{self.class}##{__method__} searching for matching files in " \
|
46
|
+
"file_dir: #{file_dir.inspect} " \
|
47
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
45
48
|
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
49
|
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
50
|
end
|
@@ -73,7 +73,10 @@ module DerivativeRodeo
|
|
73
73
|
def matching_locations_in_file_dir(tail_regexp:)
|
74
74
|
uri = URI.parse(file_uri)
|
75
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
76
|
-
|
76
|
+
logger.debug("#{self.class}##{__method__} searching for matching files for " \
|
77
|
+
"scheme_and_host: #{scheme_and_host.inspect} " \
|
78
|
+
"file_dir: #{file_dir.inspect} " \
|
79
|
+
"with tail_regexp: #{tail_regexp.inspect}.")
|
77
80
|
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
81
|
if tail_regexp.match(object.key)
|
79
82
|
template = File.join(scheme_and_host, object.key)
|
@@ -120,7 +123,7 @@ module DerivativeRodeo
|
|
120
123
|
def bucket_name
|
121
124
|
@bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
|
122
125
|
rescue StandardError
|
123
|
-
raise Errors::BucketMissingError
|
126
|
+
raise Errors::BucketMissingError.new(file_uri: file_uri)
|
124
127
|
end
|
125
128
|
|
126
129
|
# @see .use_actual_s3_bucket
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-07-
|
12
|
+
date: 2023-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|