derivative-rodeo 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +15 -0
- data/README.md +251 -0
- data/Rakefile +42 -0
- data/derivative_rodeo.gemspec +54 -0
- data/lib/derivative/rodeo.rb +3 -0
- data/lib/derivative-rodeo.rb +3 -0
- data/lib/derivative_rodeo/configuration.rb +95 -0
- data/lib/derivative_rodeo/errors.rb +56 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
- data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
- data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
- data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
- data/lib/derivative_rodeo/services/base_service.rb +15 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
- data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
- data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
- data/lib/derivative_rodeo/services/image_service.rb +73 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
- data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
- data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
- data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
- data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
- data/lib/derivative_rodeo/services/url_service.rb +42 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
- data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
- data/lib/derivative_rodeo/technical_metadata.rb +23 -0
- data/lib/derivative_rodeo/version.rb +5 -0
- data/lib/derivative_rodeo.rb +36 -0
- metadata +339 -0
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
##
|
5
|
+
# Generators execute a transformation on files and return new files.
|
6
|
+
#
|
7
|
+
# A new generator should inherit from {BaseGenerator}.
|
8
|
+
#
|
9
|
+
# @see BaseGenerator
|
10
|
+
module Generators
|
11
|
+
##
|
12
|
+
# The Base Generator defines the interface and common methods.
|
13
|
+
#
|
14
|
+
# In extending a BaseGenerator you:
|
15
|
+
#
|
16
|
+
# - must assign an {.output_extension}
|
17
|
+
# - must implement a {#build_step} method
|
18
|
+
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
19
|
+
class BaseGenerator
|
20
|
+
##
|
21
|
+
# @!group Class Attributes
|
22
|
+
# @!attribute [rw]
|
23
|
+
#
|
24
|
+
# @return [String] of the form that starts with a string and may contain periods (though
|
25
|
+
# likely not as the first character).
|
26
|
+
class_attribute :output_extension
|
27
|
+
# @!endgroup Class Attributes
|
28
|
+
|
29
|
+
attr_reader :input_uris,
|
30
|
+
:logger,
|
31
|
+
:output_location_template,
|
32
|
+
:preprocessed_location_template
|
33
|
+
|
34
|
+
##
|
35
|
+
# @param input_uris [Array<String>]
|
36
|
+
# @param output_location_template [String] the template used to transform the given :input_uris
|
37
|
+
# via {Services::ConvertUriViaTemplateService}.
|
38
|
+
# @param preprocessed_location_template [NilClass, String] when `nil` ignore, otherwise attempt
|
39
|
+
# to find preprocessed uris by transforming the :input_uris via
|
40
|
+
# {Services::ConvertUriViaTemplateService} with the given
|
41
|
+
# :preprocessed_location_template.
|
42
|
+
# @param logger [Logger]
|
43
|
+
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
  # Array.wrap coerces the given :input_uris into an Array before it is stored.
  @input_uris = Array.wrap(input_uris)
  @output_location_template = output_location_template
  @preprocessed_location_template = preprocessed_location_template
  @logger = logger

  # Refuse to build an instance that fails #valid_instantiation? (i.e. a subclass that never
  # assigned an output_extension).
  raise Errors::ExtensionMissingError.new(klass: self.class) unless valid_instantiation?
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# @api private
|
56
|
+
#
|
57
|
+
# @return [Boolean]
|
58
|
+
def valid_instantiation?
  # A bare BaseGenerator is always acceptable; instance_of? is an exact class match (unlike
  # is_a?), so subclasses fall through to the output_extension check below.
  return true if instance_of?(DerivativeRodeo::Generators::BaseGenerator)

  # For any subclass the result is the output_extension itself: truthy once one was assigned.
  output_extension
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# @api public
|
66
|
+
#
|
67
|
+
# @param input_location [StorageLocations::BaseLocation] the input source of the generation
|
68
|
+
# @param output_location [StorageLocations::BaseLocation] the output location of the generation
|
69
|
+
# @param input_tmp_file_path [String] the temporary path to the location of the given :input_location to
|
70
|
+
# enable further processing on the file.
|
71
|
+
#
|
72
|
+
# @return [StorageLocations::BaseLocation]
|
73
|
+
# @see #generated_files
|
74
|
+
def build_step(input_location:, output_location:, input_tmp_file_path:)
  # Abstract hook: this implementation only reports which class failed to override it.
  message = "#{self.class}#build_step"
  raise NotImplementedError, message
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# @api public
|
80
|
+
#
|
81
|
+
# @return [Array<StorageLocations::BaseLocation>]
|
82
|
+
#
|
83
|
+
# @see #build_step
|
84
|
+
# @see #with_each_requisite_location_and_tmp_file_path
|
85
|
+
def generated_files
  return @generated_files if defined?(@generated_files)

  # The list is assigned before iterating and appended to explicitly (rather than relying on
  # the return value of the iterator) so that subclasses overriding
  # #with_each_requisite_location_and_tmp_file_path or #build_step need not care about what
  # their block returns.
  @generated_files = []
  with_each_requisite_location_and_tmp_file_path do |location, tmp_path|
    target = destination(location)
    # Reuse a destination that already exists; otherwise build it now.
    produced =
      if target.exist?
        target
      else
        build_step(input_location: location, output_location: target, input_tmp_file_path: tmp_path)
      end
    @generated_files << produced
  end
  @generated_files
end
|
105
|
+
|
106
|
+
##
|
107
|
+
# @return [Array<String>]
|
108
|
+
# @see #generated_files
|
109
|
+
def generated_uris
  # TODO: what do we do about nils? For now a nil entry stays nil in the returned list.
  generated_files.map do |location|
    location.nil? ? nil : location.file_uri
  end
end
|
113
|
+
|
114
|
+
##
|
115
|
+
# @api public
|
116
|
+
#
|
117
|
+
# The files that are required as part of the {#generated_files} (though more precisely the
|
118
|
+
# {#build_step}.)
|
119
|
+
#
|
120
|
+
# This method is responsible for one thing:
|
121
|
+
#
|
122
|
+
# - yielding a {StorageLocations::BaseLocation} and the path (as String) to the files
|
123
|
+
# location in the temporary working space.
|
124
|
+
#
|
125
|
+
# This method allows child classes to modify the file_uris for example, to filter out files
|
126
|
+
# that are not of the correct type or as a means of having "this" generator depend on another
|
127
|
+
# generator. The {Generators::HocrGenerator} requires that the input_location be a monochrome;
|
128
|
+
# so it does conversions of each given input_location. The {Generators::PdfSplitGenerator} uses
|
129
|
+
# this method to take each given PDF and generate one image per page of each given PDF.
|
130
|
+
# Those images are then treated as the requisite locations.
|
131
|
+
#
|
132
|
+
# @yieldparam input_location [StorageLocations::BaseLocation] the from location as represented by
|
133
|
+
# a URI.
|
134
|
+
# @yieldparam tmp_file_path [String] where to find the input_location's file in the processing tmp
|
135
|
+
# space.
|
136
|
+
#
|
137
|
+
# @see Generators::HocrGenerator
|
138
|
+
# @see Generators::PdfSplitGenerator
|
139
|
+
def with_each_requisite_location_and_tmp_file_path
  # Hand every input location to the caller together with the tmp path that location yields.
  input_files.each do |location|
    location.with_existing_tmp_path { |tmp_path| yield(location, tmp_path) }
  end
end
|
146
|
+
|
147
|
+
##
|
148
|
+
# @return [Array<StorageLocations::BaseLocation>]
|
149
|
+
def input_files
  # Built once from #input_uris and memoized, so repeated calls return the same Array.
  @input_files ||= input_uris.map { |uri| DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri) }
end
|
154
|
+
|
155
|
+
##
|
156
|
+
# Returns the location destination for the given :input_file. The file at the location
|
157
|
+
# destination might exist or might not. In the case of non-existence, then the {#build_step}
|
158
|
+
# will create the file.
|
159
|
+
#
|
160
|
+
# @param input_location [StorageLocations::BaseLocation]
|
161
|
+
#
|
162
|
+
# @return [StorageLocations::BaseLocation] the derivative of the given :file based on either the
|
163
|
+
# {#output_location_template} or {#preprocessed_location_template}.
|
164
|
+
#
|
165
|
+
# @see [StorageLocations::BaseLocation#exist?]
|
166
|
+
def destination(input_location)
  output_location = input_location.derived_file_from(template: output_location_template)

  # Nothing else to look up when the output already exists or no preprocessed template was given.
  return output_location if output_location.exist? || !preprocessed_location_template

  candidate = input_location.derived_file_from(template: preprocessed_location_template)
  if candidate&.exist?
    # We only want the preprocessed location when a file actually exists there.
    candidate
  else
    # NOTE: The file does not exist at the output_location; but we pass this information along
    # so that the #build_step knows where to write the file.
    output_location
  end
end
|
180
|
+
|
181
|
+
##
|
182
|
+
# A bit of indirection to create a common interface for running a shell command.
|
183
|
+
#
|
184
|
+
# @param command [String]
|
185
|
+
# @return [String]
|
186
|
+
def run(command)
  logger.debug("* Start command: #{command}")
  # TODO: What kind of error handling do we want?
  output = %x(#{command})
  # Prefix every line of the captured output so it stands out in the log.
  prefixed_output = output.gsub("\n", "\n* ")
  logger.debug("* Result: \n* #{prefixed_output}")
  logger.debug("* End command: #{command}")
  output
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# Require every file beneath this directory (in sorted order), skipping directories and any
# path that mentions base_generator (this file).
Dir.glob(File.join(__dir__, '**/*')).sort.each do |path|
  next if File.directory?(path)
  next if path.match?(/base_generator/)

  require path
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module DerivativeRodeo
|
3
|
+
module Generators
|
4
|
+
##
|
5
|
+
# A helper module for copying files from one location to another.
|
6
|
+
module CopyFileConcern
  ##
  # Build step that places the input's tmp file at the output location.
  #
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String]
  #
  # @return [StorageLocations::BaseLocation]
  def build_step(output_location:, input_tmp_file_path:, **)
    copy(input_tmp_file_path, output_location)
  end

  ##
  # @api private
  #
  # @param from_path [String] path of the file to move.
  # @param output_location [#with_new_tmp_path] yields the path the file is moved to.
  def copy(from_path, output_location)
    # Moving (rather than copying) is fine: the tmp file is not needed after this step.
    output_location.with_new_tmp_path { |target_path| FileUtils.mv(from_path, target_path) }
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'derivative_rodeo/generators/concerns/copy_file_concern'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Generators
|
6
|
+
##
|
7
|
+
# Responsible for moving files from one storage adapter to another.
|
8
|
+
class CopyGenerator < BaseGenerator
  # Supplies #build_step (and the private-API #copy helper).
  include CopyFileConcern

  # NOTE(review): StorageLocations::SAME is defined elsewhere; presumably it means "keep the
  # input's extension" -- confirm against StorageLocations.
  self.output_extension = StorageLocations::SAME
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Responsible for finding or creating a hocr file (or configured :output_suffix) using
|
7
|
+
# tesseract. Will create and store a monochrome derivative if one is not found.
|
8
|
+
#
|
9
|
+
# @see http://tesseract-ocr.github.io
|
10
|
+
#
|
11
|
+
# From `tesseract -h`
|
12
|
+
#
|
13
|
+
# Usage:
|
14
|
+
# tesseract --help | --help-extra | --version
|
15
|
+
# tesseract --list-langs
|
16
|
+
# tesseract imagename outputbase [options...] [configfile...]
|
17
|
+
class HocrGenerator < BaseGenerator
|
18
|
+
##
|
19
|
+
# @!group Class Attributes
|
20
|
+
# @!attribute [rw]
|
21
|
+
# Command environment variables for the tesseract command; default `"OMP_THREAD_LIMIT=1"`.
|
22
|
+
# Should be a space separated string of KEY=value pairs
|
23
|
+
#
|
24
|
+
# @example
|
25
|
+
# # this works for space_stone aws lambda
|
26
|
+
# DerivativeRodeo::Generators::HocrGenerator.command_environment_variables =
|
27
|
+
# 'OMP_THREAD_LIMIT=1 TESSDATA_PREFIX=/opt/share/tessdata LD_LIBRARY_PATH=/opt/lib PATH=/opt/bin:$PATH'
|
28
|
+
class_attribute :command_environment_variables, default: "OMP_THREAD_LIMIT=1"
|
29
|
+
|
30
|
+
##
|
31
|
+
# @!attribute [rw]
|
32
|
+
# Additional options to send to tesseract command; default `nil`.
|
33
|
+
class_attribute :additional_tessearct_options, default: nil
|
34
|
+
|
35
|
+
# @!attribute [rw]
|
36
|
+
# The tesseract command's output base; default `:hocr`.
|
37
|
+
class_attribute :output_suffix, default: :hocr
|
38
|
+
|
39
|
+
self.output_extension = 'hocr'
|
40
|
+
# @!endgroup Class Attributes
|
41
|
+
|
42
|
+
##
|
43
|
+
# Run tesseract on monochrome file and store the resulting output in the configured
|
44
|
+
# {.output_extension} (default 'hocr')
|
45
|
+
#
|
46
|
+
# @param output_location [StorageLocations::BaseLocation]
|
47
|
+
# @param input_tmp_file_path [String]
|
48
|
+
#
|
49
|
+
# @return [StorageLocations::BaseLocation]
|
50
|
+
#
|
51
|
+
# @see #requisite_files
|
52
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # Any other keyword (e.g. :input_location) is accepted and ignored; the OCR work happens in
  # #tesseractify.
  tesseractify(input_tmp_file_path, output_location)
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# @param builder [Class, #generated_files]
|
58
|
+
#
|
59
|
+
# When generating a hocr file from an image, we've found the best results are when we're
|
60
|
+
# processing a monochrome image. As such, this generator will auto-convert a given image to
|
61
|
+
# monochrome.
|
62
|
+
#
|
63
|
+
# @yieldparam file [StorageLocations::BaseLocation]
|
64
|
+
# @yieldparam tmp_path [String]
|
65
|
+
#
|
66
|
+
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
67
|
+
def with_each_requisite_location_and_tmp_file_path(builder: MonochromeGenerator)
  # The builder gets its own template: this generator's extension swapped for the builder's.
  mono_location_template = output_location_template.gsub(self.class.output_extension, builder.output_extension)
  requisite_builder = builder.new(input_uris: input_uris, output_location_template: mono_location_template)

  # Yield each file the builder generated along with where it lives in tmp space.
  requisite_builder.generated_files.each do |location|
    location.with_existing_tmp_path { |tmp_path| yield(location, tmp_path) }
  end
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# @api private
|
79
|
+
#
|
80
|
+
# Call `tesseract` on the monochrome file and store the resulting hocr
|
81
|
+
# in the tmp_path
|
82
|
+
#
|
83
|
+
# @param input_tmp_file_path [String].
|
84
|
+
# @param output_location [StorageLocations::BaseLocation]
|
85
|
+
def tesseractify(input_tmp_file_path, output_location)
  # Ask the output location for a fresh tmp path and have tesseract write there.
  output_location.with_new_tmp_path { |tmp_output_path| run_tesseract(input_tmp_file_path, tmp_output_path) }
end
|
90
|
+
|
91
|
+
##
|
92
|
+
# @param in_path [String] the source of the file
|
93
|
+
# @param out_path [String]
|
94
|
+
def run_tesseract(in_path, out_path)
  # We pull the extension off the output path, because tesseract will add it back. The match
  # is anchored to the END of the path: an earlier ".<extension>" (for example a directory
  # named "scan.hocr.d") must be left untouched, otherwise the output base would be corrupted.
  trailing_extension = /#{Regexp.escape(".#{output_extension}")}\z/
  output_to_path = out_path.sub(trailing_extension, '')

  # Assemble: [ENV=vars ]tesseract <in> <outputbase>[ <options>] <output suffix>
  cmd = +''
  cmd << "#{command_environment_variables} " if command_environment_variables.present?
  cmd << "tesseract #{in_path} #{output_to_path}"
  cmd << " #{additional_tessearct_options}" if additional_tessearct_options.present?
  cmd << " #{output_suffix}"

  # TODO: capture output in case of exceptions; perhaps delegate that to the #run method.
  run(cmd)
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
|
+
class MonochromeGenerator < BaseGenerator
|
8
|
+
# TODO: Can we assume a tiff?
|
9
|
+
self.output_extension = 'mono.tiff'
|
10
|
+
|
11
|
+
##
|
12
|
+
# @param input_location [StorageLocations::BaseLocation]
|
13
|
+
# @param output_location [StorageLocations::BaseLocation]
|
14
|
+
# @return [StorageLocations::BaseLocation]
|
15
|
+
def build_step(input_location:, output_location:, input_tmp_file_path:)
  image = DerivativeRodeo::Services::ImageService.new(input_tmp_file_path)
  # The input is already monochrome: hand it back, no conversion needed.
  return input_location if image.monochrome?

  # Otherwise convert the image and write it to the output location.
  monochromify(output_location, image)
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Convert the above image to a file at the monochrome_path
|
28
|
+
#
|
29
|
+
# @param monochrome_file [StorageLocations::BaseLocation]
|
30
|
+
# @param image [Services::ImageService]
|
31
|
+
# @return [StorageLocations::BaseLocation]
|
32
|
+
def monochromify(monochrome_file, image)
  # Convert the image into the tmp path provided by the monochrome location.
  monochrome_file.with_new_tmp_path { |tmp_path| image.convert(destination: tmp_path, monochrome: true) }
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'derivative_rodeo/generators/concerns/copy_file_concern'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Generators
|
6
|
+
##
|
7
|
+
# This class is responsible for splitting each given PDF (e.g. {#input_files}) into one image
|
8
|
+
# per page (e.g. {#with_each_requisite_location_and_tmp_file_path}). We need to ensure that we
|
9
|
+
# have each of those image files in S3/file storage then enqueue those files for processing.
|
10
|
+
class PdfSplitGenerator < BaseGenerator
|
11
|
+
##
|
12
|
+
# There is a duplication of the splitter name.
|
13
|
+
#
|
14
|
+
# @see #pdf_splitter_name
|
15
|
+
self.output_extension = "tiff"
|
16
|
+
|
17
|
+
include CopyFileConcern
|
18
|
+
|
19
|
+
##
|
20
|
+
# @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
|
21
|
+
#
|
22
|
+
# @return [#call, Services::PdfSplitter::Base]
|
23
|
+
def pdf_splitter(name: pdf_splitter_name)
  # Memoized: the given :name is only consulted the first time a splitter is resolved.
  @pdf_splitter ||= begin
    splitter_lookup = Services::PdfSplitter
    splitter_lookup.for(name)
  end
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# @return [Symbol]
|
29
|
+
#
|
30
|
+
# @see .output_extension
|
31
|
+
def pdf_splitter_name
  # Only the final dot-separated segment names the splitter (e.g. "mono.tiff" => :tiff).
  segments = output_extension.to_s.split(".")
  segments.last.to_sym
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# @api public
|
37
|
+
#
|
38
|
+
# Take the given PDF(s) and split each into one image per page. Remember that the URL should account for
|
39
|
+
# the page number.
|
40
|
+
#
|
41
|
+
# When we have two PDFs (10 pages and 20 pages respectively), we will have 30 requisite files;
|
42
|
+
# the files must have URLs that associate with their respective parent PDFs.
|
43
|
+
#
|
44
|
+
# @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
|
45
|
+
# @yieldparam image_path [String] where to find this file in the tmp space
|
46
|
+
#
|
47
|
+
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
48
|
+
def with_each_requisite_location_and_tmp_file_path
  input_files.each do |pdf_location|
    pdf_location.with_existing_tmp_path do |pdf_tmp_path|
      # Split the PDF next to its tmp copy; the splitter returns the paths of the page images.
      page_image_paths = pdf_splitter.call(
        pdf_tmp_path,
        baseid: pdf_location.file_basename,
        tmpdir: File.dirname(pdf_tmp_path)
      )

      # Each page image is yielded as a file location plus its path on disk.
      page_image_paths.each do |page_image_path|
        yield(StorageLocations::FileLocation.new("file://#{page_image_path}"), page_image_path)
      end
    end
  end
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# This generator is responsible for converting a given binary into a thumbnail. As of
|
7
|
+
# <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
|
8
|
+
class ThumbnailGenerator < BaseGenerator
|
9
|
+
##
|
10
|
+
# We want to mirror the same file "last" extension as described in Hyrax.
|
11
|
+
#
|
12
|
+
# @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
|
13
|
+
self.output_extension = 'thumbnail.jpeg'
|
14
|
+
|
15
|
+
##
|
16
|
+
# @param output_location [StorageLocations::BaseLocation]
|
17
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
18
|
+
#
|
19
|
+
# @return [StorageLocations::BaseLocation]
|
20
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # Write the thumbnail into a fresh tmp path provided by the output location.
  output_location.with_new_tmp_path do |thumbnail_tmp_path|
    thumbnify(
      path_of_file_to_create_thumbnail_from: input_tmp_file_path,
      path_for_thumbnail_output: thumbnail_tmp_path
    )
  end
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Convert the file found at :path_to_input into a thumbnail, writing it to the
|
28
|
+
# :path_for_thumbnail_output
|
29
|
+
#
|
30
|
+
# @param path_of_file_to_create_thumbnail_from [String]
|
31
|
+
# @param path_for_thumbnail_output [String]
|
32
|
+
def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
  # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
  #
  # Routed through BaseGenerator#run (instead of raw backticks) so the command and its output
  # are logged like every other shell command; the command string and the returned output are
  # the same as before.
  run("convert #{path_of_file_to_create_thumbnail_from} -thumbnail '200x150>' -flatten #{path_for_thumbnail_output}")
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Generate the word coordinates (as JSON) from the given input_uris.
|
7
|
+
#
|
8
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
9
|
+
class WordCoordinatesGenerator < BaseGenerator
|
10
|
+
self.output_extension = "coordinates.json"
|
11
|
+
|
12
|
+
##
|
13
|
+
# @param output_location [StorageLocations::BaseLocation]
|
14
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
15
|
+
#
|
16
|
+
# @return [StorageLocations::BaseLocation]
|
17
|
+
#
|
18
|
+
# @see #requisite_files
|
19
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # The coordinates are written to a fresh tmp path provided by the output location.
  output_location.with_new_tmp_path do |coordinates_tmp_path|
    convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_coordinate: coordinates_tmp_path)
  end
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
##
|
28
|
+
# @param path_to_hocr [String]
|
29
|
+
# @param path_to_coordinate [String]
|
30
|
+
# @param service [#call, Services::ExtractWordCoordinatesFromHocrSgmlService]
|
31
|
+
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
  # Read the whole hOCR document first; the service is invoked only once the output file is open.
  hocr_markup = File.read(path_to_hocr)
  File.open(path_to_coordinate, "w+") { |io| io.puts(service.call(hocr_markup)) }
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
# @api private
|
7
|
+
#
|
8
|
+
# Placeholder base class; it defines no behavior of its own.
class BaseService; end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Require every file beneath this directory (in sorted order), skipping directories and any
# path that mentions base_service (this file).
Dir.glob(File.join(__dir__, '**/*')).sort.each do |path|
  next if File.directory?(path)
  next if path.match?(/base_service/)

  require path
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
#
|
7
|
+
# A service to convert an array of :from_uris to :to_uris via a :template.
|
8
|
+
#
|
9
|
+
# @see .call
|
10
|
+
class ConvertUriViaTemplateService
  DIR_PARTS_REPLACEMENT_REGEXP = %r{\{\{\s*dir_parts\[(?<left>\-?\d+)\.\.(?<right>\-?\d+)\]\s*\}\}}.freeze
  FILENAME_REPLACEMENT_REGEXP = %r{\{\{\s*filename\s*\}\}}.freeze
  BASENAME_REPLACEMENT_REGEXP = %r{\{\{\s*basename\s*\}\}}.freeze
  EXTENSION_REPLACEMENT_REGEXP = %r{\{\{\s*extension\s*\}\}}.freeze
  # Optional whitespace on both sides, like the other placeholders, so that both `{{scheme}}`
  # and `{{ scheme }}` are replaced.
  SCHEME_REPLACEMENT_REGEXP = %r{\{\{\s*scheme\s*\}\}}.freeze
  SCHEME_FOR_URI_REGEXP = %r{^(?<from_scheme>[^:]+)://}.freeze
  attr_accessor :from_uri, :template, :adapter, :separator, :uri, :from_scheme, :path, :parts, :dir_parts, :filename, :basename, :extension, :template_without_query, :template_query

  ##
  # Convert the given :from_uri to a different uri based on the given :template.
  #
  # Components of the template:
  #
  # - basename :: the file's basename without extension
  # - extension :: the file's extension with the period
  # - dir_parts :: the directory parts in which the file exists; excludes the scheme
  # - filename :: a convenience that could be represented as `basename.extension`
  # - scheme :: the scheme of the given :adapter when there is one, otherwise the scheme of
  #             the :from_uri
  #
  # Any query string on the :from_uri is dropped; a query string on the :template is kept
  # verbatim (placeholders inside it are not replaced).
  #
  # @param from_uri [String] Of the form "scheme://dir/parts/basename.extension"
  # @param template [String] Another URI that may contain the template values listed above.
  # @param adapter [#scheme, nil] when given, its scheme replaces the `scheme` placeholder.
  # @param separator [String] what separates the directory parts of the path.
  #
  # @return [String]
  #
  # @example
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "file:///path1/A/file.pdf",
  #     template: "file:///dest1/{{dir_parts[-2..-1]}}/{{filename}}")
  #   => "file:///dest1/path1/A/file.pdf"
  #
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "aws:///path2/B/file.pdf",
  #     template: "{{ scheme }}:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
  #   => "aws:///dest1/B/file.pdf"
  def self.call(from_uri:, template:, adapter: nil, separator: "/")
    new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
  end

  ##
  # Break the :from_uri into the pieces that the template placeholders refer to.
  #
  # @param from_uri [String]
  # @param template [String]
  # @param adapter [#scheme, nil]
  # @param separator [String]
  def initialize(from_uri:, template:, adapter: nil, separator: "/")
    @from_uri = from_uri
    @template = template
    @adapter = adapter
    @separator = separator

    @uri, _query = from_uri.split("?")
    # Limit of 2: only the first "://" separates scheme from path; later ones belong to the path.
    @from_scheme, @path = uri.split("://", 2)
    @parts = @path.split(separator)
    @dir_parts = @parts[0..-2]
    @filename = @parts[-1]
    @basename = File.basename(@filename, ".*")
    @extension = File.extname(@filename)

    # Limit of 2: everything after the first "?" is the template's query, even further "?".
    @template_without_query, @template_query = template.split("?", 2)
  end

  ##
  # @return [String] the template with every placeholder replaced and the template's query
  #   string (if any) appended again.
  def call
    to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do
      # The block argument is only the matched text; the named captures live in the match data.
      match = Regexp.last_match
      dir_parts[(match[:left].to_i)..(match[:right].to_i)].join(separator)
    end

    # Block form throughout: a replacement passed as a String would have backslash sequences
    # (e.g. "\\1") interpreted as back-references instead of being inserted literally.
    to_uri = to_uri.gsub(SCHEME_REPLACEMENT_REGEXP) { adapter&.scheme || from_scheme }
    to_uri = to_uri.gsub(EXTENSION_REPLACEMENT_REGEXP) { extension }
    to_uri = to_uri.gsub(BASENAME_REPLACEMENT_REGEXP) { basename }
    to_uri = to_uri.gsub(FILENAME_REPLACEMENT_REGEXP) { filename }
    to_uri = "#{to_uri}?#{template_query}" if template_query
    to_uri
  end
end
|
86
|
+
end
|
87
|
+
end
|