derivative-rodeo 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +15 -0
- data/README.md +251 -0
- data/Rakefile +42 -0
- data/derivative_rodeo.gemspec +54 -0
- data/lib/derivative/rodeo.rb +3 -0
- data/lib/derivative-rodeo.rb +3 -0
- data/lib/derivative_rodeo/configuration.rb +95 -0
- data/lib/derivative_rodeo/errors.rb +56 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
- data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
- data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
- data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
- data/lib/derivative_rodeo/services/base_service.rb +15 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
- data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
- data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
- data/lib/derivative_rodeo/services/image_service.rb +73 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
- data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
- data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
- data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
- data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
- data/lib/derivative_rodeo/services/url_service.rb +42 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
- data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
- data/lib/derivative_rodeo/technical_metadata.rb +23 -0
- data/lib/derivative_rodeo/version.rb +5 -0
- data/lib/derivative_rodeo.rb +36 -0
- metadata +339 -0
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
##
|
5
|
+
# Generators execute a transformation on files and return new files.
|
6
|
+
#
|
7
|
+
# A new generator should inherit from {BaseGenerator}.
|
8
|
+
#
|
9
|
+
# @see BaseGenerator
|
10
|
+
module Generators
|
11
|
+
##
|
12
|
+
# The Base Generator defines the interface and common methods.
|
13
|
+
#
|
14
|
+
# In extending a BaseGenerator you:
|
15
|
+
#
|
16
|
+
# - must assign an {.output_extension}
|
17
|
+
# - must implement a {#build_step} method
|
18
|
+
# - may override {#with_each_requisite_location_and_tmp_file_path}
|
19
|
+
class BaseGenerator
|
20
|
+
##
|
21
|
+
# @!group Class Attributes
|
22
|
+
# @!attribute [rw]
|
23
|
+
#
|
24
|
+
# @return [String] of the form that starts with a string and may contain periods (though
|
25
|
+
# likely not as the first character).
|
26
|
+
class_attribute :output_extension
|
27
|
+
# @!endgroup Class Attributes
|
28
|
+
|
29
|
+
attr_reader :input_uris,
|
30
|
+
:logger,
|
31
|
+
:output_location_template,
|
32
|
+
:preprocessed_location_template
|
33
|
+
|
34
|
+
##
|
35
|
+
# @param input_uris [Array<String>]
|
36
|
+
# @param output_location_template [String] the template used to transform the given :input_uris
|
37
|
+
# via {Services::ConvertUriViaTemplateService}.
|
38
|
+
# @param preprocessed_location_template [NilClass, String] when `nil` ignore, otherwise attempt
|
39
|
+
# to find preprocessed uris by transforming the :input_uris via
|
40
|
+
# {Services::ConvertUriViaTemplateService} with the given
|
41
|
+
# :preprocessed_location_template.
|
42
|
+
# @param logger [Logger]
|
43
|
+
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
  @logger = logger
  @preprocessed_location_template = preprocessed_location_template
  @output_location_template = output_location_template
  # Array.wrap lets callers hand in either one URI or a list of them.
  @input_uris = Array.wrap(input_uris)

  # Refuse to build a generator that fails #valid_instantiation? (i.e. a subclass that never
  # assigned an output_extension).
  raise Errors::ExtensionMissingError.new(klass: self.class) unless valid_instantiation?
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# @api private
|
56
|
+
#
|
57
|
+
# @return [Boolean]
|
58
|
+
def valid_instantiation?
  # The base class itself is exempt from declaring an output_extension. instance_of? (unlike
  # is_a?) is true only for BaseGenerator proper, never for its subclasses.
  return true if instance_of?(DerivativeRodeo::Generators::BaseGenerator)

  # Any subclass is valid only when an output_extension has been assigned; the assigned value
  # itself (truthy or nil) is what gets returned.
  output_extension
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# @api public
|
66
|
+
#
|
67
|
+
# @param input_location [StorageLocations::BaseLocation] the input source of the generation
|
68
|
+
# @param output_location [StorageLocations::BaseLocation] the output location of the generation
|
69
|
+
# @param input_tmp_file_path [String] the temporary path to the location of the given :input_location to
|
70
|
+
# enable further processing on the file.
|
71
|
+
#
|
72
|
+
# @return [StorageLocations::BaseLocation]
|
73
|
+
# @see #generated_files
|
74
|
+
def build_step(input_location:, output_location:, input_tmp_file_path:)
  # Abstract hook: the base class has no transformation of its own, so it always raises,
  # naming the class that failed to provide an implementation.
  missing_method = "#{self.class}#build_step"
  raise NotImplementedError, missing_method
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# @api public
|
80
|
+
#
|
81
|
+
# @return [Array<StorageLocations::BaseLocation>]
|
82
|
+
#
|
83
|
+
# @see #build_step
|
84
|
+
# @see #with_each_requisite_location_and_tmp_file_path
|
85
|
+
def generated_files
  return @generated_files if defined?(@generated_files)

  # Collect into a local and memoize only once every requisite location has been handled.
  # Assigning the instance variable before iterating would cache a partial list whenever
  # #destination or #build_step raised, and every later call would silently return that
  # incomplete result instead of retrying.
  #
  # Results are appended explicitly (rather than relying on map or the block's return value)
  # so that overrides of #with_each_requisite_location_and_tmp_file_path only need to yield.
  files = []
  with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
    generated_file = destination(input_location)
    files << if generated_file.exist?
               # Already present at the destination; no need to build it again.
               generated_file
             else
               build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
             end
  end
  @generated_files = files
end
|
105
|
+
|
106
|
+
##
|
107
|
+
# @return [Array<String>]
|
108
|
+
# @see #generated_files
|
109
|
+
def generated_uris
  # TODO: what do we do about nils?
  # For now a nil entry in #generated_files is passed through as nil.
  generated_files.map do |location|
    location.nil? ? nil : location.file_uri
  end
end
|
113
|
+
|
114
|
+
##
|
115
|
+
# @api public
|
116
|
+
#
|
117
|
+
# The files that are required as part of the {#generated_files} (though more precisely the
|
118
|
+
# {#build_step}.)
|
119
|
+
#
|
120
|
+
# This method is responsible for one thing:
|
121
|
+
#
|
122
|
+
# - yielding a {StorageLocations::BaseLocation} and the path (as String) to the files
|
123
|
+
# location in the temporary working space.
|
124
|
+
#
|
125
|
+
# This method allows child classes to modify the file_uris for example, to filter out files
|
126
|
+
# that are not of the correct type or as a means of having "this" generator depend on another
|
127
|
+
# generator. The {Generators::HocrGenerator} requires that the input_location be a monochrome;
|
128
|
+
# so it does conversions of each given input_location. The {Generators::PdfSplitGenerator} uses
|
129
|
+
# this method to take each given PDF and generate one image per page of each given PDF.
|
130
|
+
# Those images are then treated as the requisite locations.
|
131
|
+
#
|
132
|
+
# @yieldparam input_location [StorageLocations::BaseLocations] the from location as represented by
|
133
|
+
# a URI.
|
134
|
+
# @yieldparam tmp_file_path [String] where to find the input_location's file in the processing tmp
|
135
|
+
# space.
|
136
|
+
#
|
137
|
+
# @see Generators::HocrGenerator
|
138
|
+
# @see Generators::PdfSplitGenerator
|
139
|
+
def with_each_requisite_location_and_tmp_file_path
  # Default behaviour: the requisite files are the input files themselves. Each one is yielded
  # together with the tmp path its location provides.
  input_files.each do |location|
    location.with_existing_tmp_path { |local_path| yield(location, local_path) }
  end
end
|
146
|
+
|
147
|
+
##
|
148
|
+
# @return [Array<StorageLocations::BaseLocation>]
|
149
|
+
def input_files
  return @input_files if @input_files

  # Build one storage location per input URI, once; later calls reuse the same array.
  @input_files = input_uris.map { |uri| DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri) }
end
|
154
|
+
|
155
|
+
##
|
156
|
+
# Returns the location destination for the given :input_file. The file at the location
|
157
|
+
# destination might exist or might not. In the case of non-existence, then the {#build_step}
|
158
|
+
# will create the file.
|
159
|
+
#
|
160
|
+
# @param input_location [StorageLocations::BaseLocation]
|
161
|
+
#
|
162
|
+
# @return [StorageLocations::BaseLocation] the derivative of the given :file based on either the
|
163
|
+
# {#output_location_template} or {#preprocessed_location_template}.
|
164
|
+
#
|
165
|
+
# @see [StorageLocations::BaseLocation#exist?]
|
166
|
+
def destination(input_location)
  primary = input_location.derived_file_from(template: output_location_template)

  # Only when nothing exists at the output location, and a preprocessed template was given,
  # do we look for a previously generated file at the preprocessed location.
  if !primary.exist? && preprocessed_location_template
    fallback = input_location.derived_file_from(template: preprocessed_location_template)
    # We only want the preprocessed location when a file is actually there.
    return fallback if fallback&.exist?
  end

  # NOTE: The file may not exist at this location; it is returned anyway so that #build_step
  # knows where to write the file.
  primary
end
|
180
|
+
|
181
|
+
##
|
182
|
+
# A bit of indirection to create a common interface for running a shell command.
|
183
|
+
#
|
184
|
+
# @param command [String]
|
185
|
+
# @return [String]
|
186
|
+
def run(command)
  logger.debug("* Start command: #{command}")
  # TODO: What kind of error handling do we want?
  # The command is executed by the shell and its standard output captured.
  output = %x(#{command})
  # Prefix every output line with "* " so it lines up with the surrounding log entries.
  indented_output = output.gsub("\n", "\n* ")
  logger.debug("* Result: \n* #{indented_output}")
  logger.debug("* End command: #{command}")
  output
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
# Eagerly load every other Ruby file beneath this directory (generators and their concerns).
#
# Only `*.rb` files are globbed so that a stray non-Ruby file cannot be handed to `require`
# (which would raise a LoadError). The exclusion of this file is matched against the file's
# basename, so a parent directory whose name happens to contain "base_generator" cannot
# suppress loading of everything.
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.directory?(file)
  next if File.basename(file).match?(/base_generator/)

  require file
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module DerivativeRodeo
|
3
|
+
module Generators
|
4
|
+
##
|
5
|
+
# A helper module for copying files from one location to another.
|
6
|
+
module CopyFileConcern
  ##
  # Generator hook: place the file found at :input_tmp_file_path at the given
  # :output_location. Any other keywords (e.g. :input_location) are accepted and ignored.
  #
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String]
  #
  # @return [StorageLocations::BaseLocation]
  def build_step(output_location:, input_tmp_file_path:, **)
    copy(input_tmp_file_path, output_location)
  end

  ##
  # @api private
  #
  # @param from_path [String] path of the file to relocate
  # @param output_location [#with_new_tmp_path] yields the path the file should be moved to
  #
  # @return whatever the output location's #with_new_tmp_path returns
  def copy(from_path, output_location)
    # A move (rather than a copy) is fine here: the tmp file is not needed after this step.
    output_location.with_new_tmp_path { |out_path| FileUtils.mv(from_path, out_path) }
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'derivative_rodeo/generators/concerns/copy_file_concern'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Generators
|
6
|
+
##
|
7
|
+
# Responsible for moving files from one storage adapter to another.
|
8
|
+
class CopyGenerator < BaseGenerator
  # Supplies #build_step, which moves the input's tmp file to the output location.
  include CopyFileConcern

  # NOTE(review): StorageLocations::SAME presumably means "keep the input's extension";
  # confirm against its definition in the storage locations code.
  self.output_extension = StorageLocations::SAME
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Responsible for finding or creating a hocr file (or configured :output_suffix) using
|
7
|
+
# tesseract. Will create and store a monochrome derivative if one is not found.
|
8
|
+
#
|
9
|
+
# @see http://tesseract-ocr.github.io
|
10
|
+
#
|
11
|
+
# From `tesseract -h`
|
12
|
+
#
|
13
|
+
# Usage:
|
14
|
+
# tesseract --help | --help-extra | --version
|
15
|
+
# tesseract --list-langs
|
16
|
+
# tesseract imagename outputbase [options...] [configfile...]
|
17
|
+
class HocrGenerator < BaseGenerator
|
18
|
+
##
|
19
|
+
# @!group Class Attributes
|
20
|
+
# @!attribute [rw]
|
21
|
+
# Command environment variables for the tesseract command; default `"OMP_THREAD_LIMIT=1"`.
|
22
|
+
# Should be a space separated string of KEY=value pairs
|
23
|
+
#
|
24
|
+
# @example
|
25
|
+
# # this works for space_stone aws lambda
|
26
|
+
# DerivativeRodeo::Generators::HocrGenerator.command_environment_variables =
|
27
|
+
# 'OMP_THREAD_LIMIT=1 TESSDATA_PREFIX=/opt/share/tessdata LD_LIBRARY_PATH=/opt/lib PATH=/opt/bin:$PATH'
|
28
|
+
class_attribute :command_environment_variables, default: "OMP_THREAD_LIMIT=1"
|
29
|
+
|
30
|
+
##
|
31
|
+
# @!attribute [rw]
|
32
|
+
# Additional options to send to tesseract command; default `nil`.
|
33
|
+
class_attribute :additional_tessearct_options, default: nil
|
34
|
+
|
35
|
+
# @!attribute [rw]
|
36
|
+
# The tesseract command's output base; default `:hocr`.
|
37
|
+
class_attribute :output_suffix, default: :hocr
|
38
|
+
|
39
|
+
self.output_extension = 'hocr'
|
40
|
+
# @!endgroup Class Attributes
|
41
|
+
|
42
|
+
##
|
43
|
+
# Run tesseract on monochrome file and store the resulting output in the configured
|
44
|
+
# {.output_extension} (default 'hocr')
|
45
|
+
#
|
46
|
+
# @param output_location [StorageLocations::BaseLocation]
|
47
|
+
# @param input_tmp_file_path [String]
|
48
|
+
#
|
49
|
+
# @return [StorageLocations::BaseLocation]
|
50
|
+
#
|
51
|
+
# @see #requisite_files
|
52
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # Other keywords (e.g. :input_location) are accepted and ignored; the tmp file and the
  # destination are all that #tesseractify needs.
  tesseractify(input_tmp_file_path, output_location)
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# @param builder [Class, #generated_files]
|
58
|
+
#
|
59
|
+
# When generating a hocr file from an image, we've found the best results are when we're
|
60
|
+
# processing a monochrome image. As such, this generator will auto-convert a given image to
|
61
|
+
# monochrome.
|
62
|
+
#
|
63
|
+
# @yieldparam file [StorageLocations::BaseLocation]
|
64
|
+
# @yieldparam tmp_path [String]
|
65
|
+
#
|
66
|
+
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
67
|
+
def with_each_requisite_location_and_tmp_file_path(builder: MonochromeGenerator)
  # Derive the builder's template from ours by swapping our extension for the builder's.
  # NOTE: gsub replaces every occurrence of the extension text in the template.
  mono_location_template = output_location_template.gsub(self.class.output_extension, builder.output_extension)
  monochrome_generator = builder.new(input_uris: input_uris, output_location_template: mono_location_template)

  # The builder's generated files (not the raw inputs) are what gets yielded.
  monochrome_generator.generated_files.each do |mono_location|
    mono_location.with_existing_tmp_path { |mono_tmp_path| yield(mono_location, mono_tmp_path) }
  end
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# @api private
|
79
|
+
#
|
80
|
+
# Call `tesseract` on the monochrome file and store the resulting hocr
|
81
|
+
# in the tmp_path
|
82
|
+
#
|
83
|
+
# @param input_tmp_file_path [String].
|
84
|
+
# @param output_location [StorageLocations::BaseLocation]
|
85
|
+
def tesseractify(input_tmp_file_path, output_location)
  # The output location supplies the tmp path that tesseract's result is written to.
  output_location.with_new_tmp_path { |hocr_tmp_path| run_tesseract(input_tmp_file_path, hocr_tmp_path) }
end
|
90
|
+
|
91
|
+
##
|
92
|
+
# @param in_path [String] the source of the file
|
93
|
+
# @param out_path [String]
|
94
|
+
def run_tesseract(in_path, out_path)
  # tesseract appends the suffix to the output base itself, so hand it the output path
  # without the extension. Only the TRAILING ".<output_extension>" is removed; removing the
  # first occurrence instead would corrupt paths that contain the same text earlier
  # (e.g. "/tmp/job.hocr.d/page.hocr").
  output_to_path = out_path.delete_suffix(".#{output_extension}")

  # Assemble: [ENV=vars] tesseract <input> <output base> [extra options] <output suffix>
  command_parts = []
  command_parts << command_environment_variables.to_s if command_environment_variables.present?
  command_parts << "tesseract #{in_path} #{output_to_path}"
  command_parts << additional_tessearct_options.to_s if additional_tessearct_options.present?
  command_parts << output_suffix.to_s

  # TODO: capture output in case of exceptions; perhaps delegate that to the #run method.
  run(command_parts.join(" "))
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Takes images and ensures that we have a monochrome derivative of those images.
|
7
|
+
class MonochromeGenerator < BaseGenerator
|
8
|
+
# TODO: Can we assume a tiff?
|
9
|
+
self.output_extension = 'mono.tiff'
|
10
|
+
|
11
|
+
##
|
12
|
+
# @param input_location [StorageLocations::BaseLocation]
|
13
|
+
# @param output_location [StorageLocations::BaseLocation]
|
14
|
+
# @return [StorageLocations::BaseLocation]
|
15
|
+
def build_step(input_location:, output_location:, input_tmp_file_path:)
  image = DerivativeRodeo::Services::ImageService.new(input_tmp_file_path)

  # The input is already monochrome: hand it back unchanged, no conversion needed.
  return input_location if image.monochrome?

  # Otherwise write a monochrome version of the image to the output location.
  monochromify(output_location, image)
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Convert the above image to a file at the monochrome_path
|
28
|
+
#
|
29
|
+
# @param monochrome_file [StorageLocations::BaseLocation]
|
30
|
+
# @param image [Services::ImageService]
|
31
|
+
# @return [StorageLocations::BaseLocation]
|
32
|
+
def monochromify(monochrome_file, image)
  # The location supplies the tmp path; the image service writes the monochrome version there.
  monochrome_file.with_new_tmp_path { |target_path| image.convert(destination: target_path, monochrome: true) }
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'derivative_rodeo/generators/concerns/copy_file_concern'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Generators
|
6
|
+
##
|
7
|
+
# This class is responsible for splitting each given PDF (e.g. {#input_files}) into one image
|
8
|
+
# per page (e.g. {#with_each_requisite_location_and_tmp_file_path}). We need to ensure that we
|
9
|
+
# have each of those image files in S3/file storage then enqueue those files for processing.
|
10
|
+
class PdfSplitGenerator < BaseGenerator
|
11
|
+
##
|
12
|
+
# There is a duplication of the splitter name.
|
13
|
+
#
|
14
|
+
# @see #pdf_splitter_name
|
15
|
+
self.output_extension = "tiff"
|
16
|
+
|
17
|
+
include CopyFileConcern
|
18
|
+
|
19
|
+
##
|
20
|
+
# @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
|
21
|
+
#
|
22
|
+
# @return [#call, Services::PdfSplitter::Base]
|
23
|
+
def pdf_splitter(name: pdf_splitter_name)
  # NOTE: memoized on first use; a different :name passed on a later call is ignored.
  return @pdf_splitter if @pdf_splitter

  @pdf_splitter = Services::PdfSplitter.for(name)
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# @return [Symbol]
|
29
|
+
#
|
30
|
+
# @see .output_extension
|
31
|
+
def pdf_splitter_name
  # Use the last dot-separated segment of the output extension (e.g. "mono.tiff" => :tiff).
  segments = output_extension.to_s.split(".")
  segments.last.to_sym
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# @api public
|
37
|
+
#
|
38
|
+
# Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
|
39
|
+
# the page number.
|
40
|
+
#
|
41
|
+
# When we have two PDFs (10 pages and 20 pages respectively), we will have 30 requisite files;
|
42
|
+
# the files must have URLs that associate with their respective parent PDFs.
|
43
|
+
#
|
44
|
+
# @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
|
45
|
+
# @yieldparam image_path [String] where to find this file in the tmp space
|
46
|
+
#
|
47
|
+
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
48
|
+
def with_each_requisite_location_and_tmp_file_path
  input_files.each do |pdf_location|
    pdf_location.with_existing_tmp_path do |pdf_tmp_path|
      # Split the PDF next to its tmp copy; the splitter returns the per-page image paths.
      page_image_paths = pdf_splitter.call(pdf_tmp_path, baseid: pdf_location.file_basename, tmpdir: File.dirname(pdf_tmp_path))

      # Each page image is yielded as a local file location along with its path.
      page_image_paths.each do |page_image_path|
        yield(StorageLocations::FileLocation.new("file://#{page_image_path}"), page_image_path)
      end
    end
  end
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# This generator is responsible for converting a given binary into a thumbnail. As of
|
7
|
+
# <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
|
8
|
+
class ThumbnailGenerator < BaseGenerator
|
9
|
+
##
|
10
|
+
# We want to mirror the same file "last" extension as described in Hyrax.
|
11
|
+
#
|
12
|
+
# @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
|
13
|
+
self.output_extension = 'thumbnail.jpeg'
|
14
|
+
|
15
|
+
##
|
16
|
+
# @param output_location [StorageLocations::BaseLocation]
|
17
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
18
|
+
#
|
19
|
+
# @return [StorageLocations::BaseLocation]
|
20
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # The output location supplies the tmp path the thumbnail is written to.
  output_location.with_new_tmp_path do |thumbnail_tmp_path|
    thumbnify(
      path_of_file_to_create_thumbnail_from: input_tmp_file_path,
      path_for_thumbnail_output: thumbnail_tmp_path
    )
  end
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Convert the file found at :path_to_input into a thumbnail, writing it to the
|
28
|
+
# :path_for_thumbnail_output
|
29
|
+
#
|
30
|
+
# @param path_of_file_to_create_thumbnail_from [String]
|
31
|
+
# @param path_for_thumbnail_output [String]
|
32
|
+
def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
  # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
  #
  # Shell out through BaseGenerator#run (rather than raw backticks) so this command is logged
  # like every other generator's shell command. #run still returns the command's output.
  run("convert #{path_of_file_to_create_thumbnail_from} -thumbnail '200x150>' -flatten #{path_for_thumbnail_output}")
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Generators
|
5
|
+
##
|
6
|
+
# Generate the word coordinates (as JSON) from the given input_uris.
|
7
|
+
#
|
8
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
9
|
+
class WordCoordinatesGenerator < BaseGenerator
|
10
|
+
self.output_extension = "coordinates.json"
|
11
|
+
|
12
|
+
##
|
13
|
+
# @param output_location [StorageLocations::BaseLocation]
|
14
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
15
|
+
#
|
16
|
+
# @return [StorageLocations::BaseLocation]
|
17
|
+
#
|
18
|
+
# @see #requisite_files
|
19
|
+
def build_step(output_location:, input_tmp_file_path:, **)
  # The output location supplies the tmp path the coordinates are written to.
  output_location.with_new_tmp_path do |coordinates_tmp_path|
    convert_to_coordinates(
      path_to_hocr: input_tmp_file_path,
      path_to_coordinate: coordinates_tmp_path
    )
  end
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
##
|
28
|
+
# @param path_to_hocr [String]
|
29
|
+
# @param path_to_coordinate [String]
|
30
|
+
# @param service [#call, Services::ExtractWordCoordinatesFromHocrSgmlService]
|
31
|
+
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
  hocr_markup = File.read(path_to_hocr)
  # The destination is opened (and truncated) first; the service's result is then written
  # followed by a newline (via puts).
  File.open(path_to_coordinate, "w+") { |coordinates_file| coordinates_file.puts(service.call(hocr_markup)) }
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
# @api private
|
7
|
+
#
|
8
|
+
# Empty class: it defines no behaviour of its own.
class BaseService; end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Eagerly load every other Ruby file beneath this directory (the services).
#
# Only `*.rb` files are globbed so that a stray non-Ruby file cannot be handed to `require`
# (which would raise a LoadError). The exclusion of this file is matched against the file's
# basename, so a parent directory whose name happens to contain "base_service" cannot
# suppress loading of everything.
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.directory?(file)
  next if File.basename(file).match?(/base_service/)

  require file
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
#
|
7
|
+
# A service to convert an array of :from_uris to :to_uris via a :template.
|
8
|
+
#
|
9
|
+
# @see .call
|
10
|
+
class ConvertUriViaTemplateService
  DIR_PARTS_REPLACEMENT_REGEXP = %r{\{\{\s*dir_parts\[(?<left>\-?\d+)\.\.(?<right>\-?\d+)\]\s*\}\}}.freeze
  FILENAME_REPLACEMENT_REGEXP = %r{\{\{\s*filename\s*\}\}}.freeze
  BASENAME_REPLACEMENT_REGEXP = %r{\{\{\s*basename\s*\}\}}.freeze
  EXTENSION_REPLACEMENT_REGEXP = %r{\{\{\s*extension\s*\}\}}.freeze
  # Whitespace around the token is optional, exactly as for the other placeholders, so both
  # `{{scheme}}` and `{{ scheme }}` are recognised.
  SCHEME_REPLACEMENT_REGEXP = %r{\{\{\s*scheme\s*\}\}}.freeze
  SCHEME_FOR_URI_REGEXP = %r{^(?<from_scheme>[^:]+)://}.freeze
  attr_accessor :from_uri, :template, :adapter, :separator, :uri, :from_scheme, :path, :parts, :dir_parts, :filename, :basename, :extension, :template_without_query, :template_query

  ##
  # Convert the given :from_uri to a different uri based on the given :template.
  #
  # Components of the template:
  #
  # - basename :: the file's basename without extension
  # - extension :: the file's extension with the period
  # - dir_parts :: the directory parts in which the file exists; excludes the scheme
  # - filename :: a convenience that could be represented as `basename.extension`
  # - scheme :: the given adapter's scheme when an adapter is provided, otherwise the scheme
  #             of the :from_uri
  #
  # The specs demonstrate the use cases.
  #
  # @param from_uri [String] Of the form "scheme://dir/parts/basename.extension"
  # @param template [String] Another URI that may contain path_parts or scheme template values.
  # @param adapter [#scheme, nil] when given, its #scheme replaces the `{{ scheme }}` token.
  # @param separator [String] the separator used to split and re-join the directory parts.
  #
  # @return [String]
  #
  # @example
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "file:///path1/A/file.pdf",
  #     template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
  #   => "file:///dest1/A/file.pdf"
  #
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "s3://bucket/A/file.pdf",
  #     template: "{{ scheme }}://dest1/{{dir_parts[-1..-1]}}/{{basename}}.hocr")
  #   => "s3://dest1/A/file.hocr"
  def self.call(from_uri:, template:, adapter: nil, separator: "/")
    new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
  end

  ##
  # Splits the :from_uri into the pieces that the template placeholders refer to.
  #
  # @param from_uri [String]
  # @param template [String]
  # @param adapter [#scheme, nil]
  # @param separator [String]
  def initialize(from_uri:, template:, adapter: nil, separator: "/")
    @from_uri = from_uri
    @template = template
    @adapter = adapter
    @separator = separator

    # Any query string on the source URI is discarded.
    @uri, _query = from_uri.split("?")
    @from_scheme, @path = uri.split("://")
    @parts = @path.split(separator)
    @dir_parts = @parts[0..-2]
    @filename = @parts[-1]
    @basename = File.basename(@filename, ".*")
    @extension = File.extname(@filename)

    # The template's own query string is set aside and re-appended, untouched, in #call.
    @template_without_query, @template_query = template.split("?")
  end

  ##
  # @return [String] the template with every recognised placeholder replaced.
  def call
    to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do
      # Inside the block the current match is available, named captures included.
      bounds = Regexp.last_match
      dir_parts[(bounds[:left].to_i)..(bounds[:right].to_i)].join(separator)
    end

    # Block-form gsub inserts each value verbatim. With a String replacement, backslash
    # sequences inside a file name (e.g. "\0") would be treated as back-references.
    to_uri = to_uri.gsub(SCHEME_REPLACEMENT_REGEXP) { adapter&.scheme || from_scheme }
    to_uri = to_uri.gsub(EXTENSION_REPLACEMENT_REGEXP) { extension }
    to_uri = to_uri.gsub(BASENAME_REPLACEMENT_REGEXP) { basename }
    to_uri = to_uri.gsub(FILENAME_REPLACEMENT_REGEXP) { filename }
    to_uri = "#{to_uri}?#{template_query}" if template_query
    to_uri
  end
end
|
86
|
+
end
|
87
|
+
end
|