derivative-rodeo 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +15 -0
  4. data/README.md +251 -0
  5. data/Rakefile +42 -0
  6. data/derivative_rodeo.gemspec +54 -0
  7. data/lib/derivative/rodeo.rb +3 -0
  8. data/lib/derivative-rodeo.rb +3 -0
  9. data/lib/derivative_rodeo/configuration.rb +95 -0
  10. data/lib/derivative_rodeo/errors.rb +56 -0
  11. data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
  12. data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
  13. data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
  14. data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
  15. data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
  16. data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
  17. data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
  18. data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
  19. data/lib/derivative_rodeo/services/base_service.rb +15 -0
  20. data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
  21. data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
  22. data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
  23. data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
  24. data/lib/derivative_rodeo/services/image_service.rb +73 -0
  25. data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
  26. data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
  27. data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
  28. data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
  29. data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
  30. data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
  31. data/lib/derivative_rodeo/services/url_service.rb +42 -0
  32. data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
  33. data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
  34. data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
  35. data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
  36. data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
  37. data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
  38. data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
  39. data/lib/derivative_rodeo/technical_metadata.rb +23 -0
  40. data/lib/derivative_rodeo/version.rb +5 -0
  41. data/lib/derivative_rodeo.rb +36 -0
  42. metadata +339 -0
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
module DerivativeRodeo
  ##
  # Generators execute a transformation on files and return new files.
  #
  # A new generator should inherit from {BaseGenerator}.
  #
  # @see BaseGenerator
  module Generators
    ##
    # The Base Generator defines the interface and common methods.
    #
    # In extending a BaseGenerator you:
    #
    # - must assign an {.output_extension}
    # - must implement a {#build_step} method
    # - may override {#with_each_requisite_location_and_tmp_file_path}
    class BaseGenerator
      ##
      # @!group Class Attributes
      # @!attribute [rw]
      #
      # @return [String] of the form that starts with a string and may contain periods (though
      #         likely not as the first character).
      class_attribute :output_extension
      # @!endgroup Class Attributes

      attr_reader :input_uris,
                  :logger,
                  :output_location_template,
                  :preprocessed_location_template

      ##
      # @param input_uris [Array<String>]
      # @param output_location_template [String] the template used to transform the given :input_uris
      #        via {Services::ConvertUriViaTemplateService}.
      # @param preprocessed_location_template [NilClass, String] when `nil` ignore, otherwise attempt
      #        to find preprocessed uris by transforming the :input_uris via
      #        {Services::ConvertUriViaTemplateService} with the given
      #        :preprocessed_location_template.
      # @param logger [Logger]
      #
      # @raise [Errors::ExtensionMissingError] when a subclass is instantiated without an
      #        {.output_extension}.
      def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
        @input_uris = Array.wrap(input_uris)
        @output_location_template = output_location_template
        @preprocessed_location_template = preprocessed_location_template
        @logger = logger

        return if valid_instantiation?

        raise Errors::ExtensionMissingError.new(klass: self.class)
      end

      ##
      # @api private
      #
      # @return [Boolean] truthy when this is exactly a BaseGenerator or when an
      #         {.output_extension} has been assigned.
      def valid_instantiation?
        # When we have a BaseGenerator and not one of its children or when we've assigned the
        # output_extension. instance_of? is more specific than is_a?
        instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
      end

      ##
      # @api public
      #
      # @param input_location [StorageLocations::BaseLocation] the input source of the generation
      # @param output_location [StorageLocations::BaseLocation] the output location of the generation
      # @param input_tmp_file_path [String] the temporary path to the location of the given :input_location to
      #        enable further processing on the file.
      #
      # @return [StorageLocations::BaseLocation]
      # @raise [NotImplementedError] always; subclasses must provide the implementation.
      # @see #generated_files
      def build_step(input_location:, output_location:, input_tmp_file_path:)
        raise NotImplementedError, "#{self.class}#build_step"
      end

      ##
      # @api public
      #
      # Find or build the derivative for each requisite location. The result is memoized only
      # once every location has been processed, so a failure part way through does not leave a
      # partial list behind for later calls.
      #
      # @return [Array<StorageLocations::BaseLocation>]
      #
      # @see #build_step
      # @see #with_each_requisite_location_and_tmp_file_path
      def generated_files
        return @generated_files if defined?(@generated_files)

        # As much as I would like to use map or returned values; given the implementations it's
        # better to explicitly collect the results; reducing downstream implementation headaches
        # for overrides of #with_each_requisite_location_and_tmp_file_path or #build_step.
        files = []
        with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
          generated_file = destination(input_location)
          files << if generated_file.exist?
                     generated_file
                   else
                     build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
                   end
        end
        @generated_files = files
      end

      ##
      # @return [Array<String, NilClass>] one entry per generated file; `nil` when a
      #         {#build_step} returned `nil`.
      # @see #generated_files
      def generated_uris
        # TODO: what do we do about nils?
        generated_files.map { |file| file&.file_uri }
      end

      ##
      # @api public
      #
      # The files that are required as part of the {#generated_files} (though more precisely the
      # {#build_step}.)
      #
      # This method is responsible for one thing:
      #
      # - yielding a {StorageLocations::BaseLocation} and the path (as String) to the file's
      #   location in the temporary working space.
      #
      # This method allows child classes to modify the file_uris for example, to filter out files
      # that are not of the correct type or as a means of having "this" generator depend on another
      # generator. The {Generators::HocrGenerator} requires that the input_location be a monochrome;
      # so it does conversions of each given input_location. The {Generators::PdfSplitGenerator} uses
      # this method to take each given PDF and generate one image per page of each given PDF.
      # Those images are then treated as the requisite locations.
      #
      # @yieldparam input_location [StorageLocations::BaseLocation] the from location as represented by
      #             a URI.
      # @yieldparam tmp_file_path [String] where to find the input_location's file in the processing tmp
      #             space.
      #
      # @see Generators::HocrGenerator
      # @see Generators::PdfSplitGenerator
      def with_each_requisite_location_and_tmp_file_path
        input_files.each do |input_location|
          input_location.with_existing_tmp_path do |tmp_file_path|
            yield(input_location, tmp_file_path)
          end
        end
      end

      ##
      # @return [Array<StorageLocations::BaseLocation>] one location per entry in {#input_uris}.
      def input_files
        @input_files ||= input_uris.map do |file_uri|
          DerivativeRodeo::StorageLocations::BaseLocation.from_uri(file_uri)
        end
      end

      ##
      # Returns the location destination for the given :input_location. The file at the location
      # destination might exist or might not. In the case of non-existence, then the {#build_step}
      # will create the file.
      #
      # @param input_location [StorageLocations::BaseLocation]
      #
      # @return [StorageLocations::BaseLocation] the derivative of the given :input_location based
      #         on either the {#output_location_template} or {#preprocessed_location_template}.
      #
      # @see StorageLocations::BaseLocation#exist?
      def destination(input_location)
        output_location = input_location.derived_file_from(template: output_location_template)

        return output_location if output_location.exist?
        return output_location unless preprocessed_location_template

        preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
        # We only want the preprocessed location when there is already a file there.
        return preprocessed_location if preprocessed_location&.exist?

        # NOTE: The file does not exist at the output_location; but we pass this information along so
        # that the #build_step knows where to write the file.
        output_location
      end

      ##
      # A bit of indirection to create a common interface for running a shell command.
      #
      # @param command [String]
      # @return [String] whatever the command wrote to standard output.
      def run(command)
        logger.debug "* Start command: #{command}"
        # TODO: What kind of error handling do we want?
        result = `#{command}`
        logger.debug "* Result: \n*  #{result.gsub("\n", "\n*  ")}"
        logger.debug "* End  command: #{command}"
        result
      end
    end
  end
end
197
+
198
# Load every generator (and concern) that lives beside this file. Only Ruby files are
# required, and this file is skipped by its own name rather than by a pattern applied to the
# whole absolute path (which would also match an install directory containing that text).
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.basename(file, '.rb') == 'base_generator'

  require file
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
require 'fileutils'

module DerivativeRodeo
  module Generators
    ##
    # A helper module for copying files from one location to another.
    module CopyFileConcern
      ##
      # Copy files from one adapter to another.
      #
      # @param output_location [StorageLocations::BaseLocation]
      # @param input_tmp_file_path [String]
      #
      # @return [StorageLocations::BaseLocation] whatever the output location's
      #         `with_new_tmp_path` returns.
      def build_step(output_location:, input_tmp_file_path:, **)
        copy(input_tmp_file_path, output_location)
      end

      ##
      # @api private
      #
      # @param from_path [String] the temporary file to hand over to the output location.
      # @param output_location [#with_new_tmp_path]
      def copy(from_path, output_location)
        output_location.with_new_tmp_path do |out_path|
          # We can move here because we are done with the tmp file after this.
          FileUtils.mv(from_path, out_path)
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+ require 'derivative_rodeo/generators/concerns/copy_file_concern'
3
+
4
module DerivativeRodeo
  module Generators
    ##
    # Moves each input file from one storage adapter to another. The actual work is
    # done by the #build_step mixed in from {CopyFileConcern}.
    class CopyGenerator < BaseGenerator
      include CopyFileConcern

      self.output_extension = StorageLocations::SAME
    end
  end
end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
require 'shellwords'

module DerivativeRodeo
  module Generators
    ##
    # Responsible for finding or creating a hocr file (or configured :output_suffix) using
    # tesseract. Will create and store a monochrome derivative if one is not found.
    #
    # @see http://tesseract-ocr.github.io
    #
    # From `tesseract -h`
    #
    #   Usage:
    #     tesseract --help | --help-extra | --version
    #     tesseract --list-langs
    #     tesseract imagename outputbase [options...] [configfile...]
    class HocrGenerator < BaseGenerator
      ##
      # @!group Class Attributes
      # @!attribute [rw]
      # Command environment variables for the tesseract command; default `"OMP_THREAD_LIMIT=1"`.
      # Should be a space separated string of KEY=value pairs.
      #
      # @example
      #   # this works for space_stone aws lambda
      #   DerivativeRodeo::Generators::HocrGenerator.command_environment_variables =
      #     'OMP_THREAD_LIMIT=1 TESSDATA_PREFIX=/opt/share/tessdata LD_LIBRARY_PATH=/opt/lib PATH=/opt/bin:$PATH'
      class_attribute :command_environment_variables, default: "OMP_THREAD_LIMIT=1"

      ##
      # @!attribute [rw]
      # Additional options to send to tesseract command; default `nil`.
      class_attribute :additional_tessearct_options, default: nil

      # @!attribute [rw]
      # The tesseract command's trailing config file argument; default `:hocr`.
      class_attribute :output_suffix, default: :hocr

      self.output_extension = 'hocr'
      # @!endgroup Class Attributes

      ##
      # Run tesseract on the monochrome file and store the resulting output in the configured
      # {.output_extension} (default 'hocr')
      #
      # @param output_location [StorageLocations::BaseLocation]
      # @param input_tmp_file_path [String]
      #
      # @return [StorageLocations::BaseLocation]
      #
      # @see #with_each_requisite_location_and_tmp_file_path
      def build_step(output_location:, input_tmp_file_path:, **)
        tesseractify(input_tmp_file_path, output_location)
      end

      ##
      # @param builder [Class, #generated_files]
      #
      # When generating a hocr file from an image, we've found the best results are when we're
      # processing a monochrome image. As such, this generator will auto-convert a given image to
      # monochrome.
      #
      # @yieldparam file [StorageLocations::BaseLocation]
      # @yieldparam tmp_path [String]
      #
      # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
      def with_each_requisite_location_and_tmp_file_path(builder: MonochromeGenerator)
        # NOTE: every occurrence of this generator's extension in the template is swapped for the
        # builder's extension, not only the trailing one.
        mono_location_template = output_location_template.gsub(self.class.output_extension, builder.output_extension)
        requisite_files = builder.new(input_uris: input_uris, output_location_template: mono_location_template).generated_files
        requisite_files.each do |input_location|
          input_location.with_existing_tmp_path do |tmp_file_path|
            yield(input_location, tmp_file_path)
          end
        end
      end

      ##
      # @api private
      #
      # Call `tesseract` on the monochrome file and store the resulting hocr
      # in the tmp_path
      #
      # @param input_tmp_file_path [String]
      # @param output_location [StorageLocations::BaseLocation]
      def tesseractify(input_tmp_file_path, output_location)
        output_location.with_new_tmp_path do |out_tmp_path|
          run_tesseract(input_tmp_file_path, out_tmp_path)
        end
      end

      ##
      # Build and run the tesseract command line.
      #
      # @param in_path [String] the source of the file
      # @param out_path [String] where the output should end up, including the extension.
      #
      # @return [String] the output of the command, as returned by {BaseGenerator#run}.
      def run_tesseract(in_path, out_path)
        # We pull the extension off the output path, because tesseract will add it back. Only a
        # trailing extension is removed, so a path with the same text earlier in it stays intact.
        trailing_extension = /#{Regexp.escape(".#{output_extension}")}\z/
        output_to_path = out_path.sub(trailing_extension, '')

        parts = []
        parts << command_environment_variables if command_environment_variables.present?
        # Escape the paths so that spaces or shell metacharacters in them cannot split or alter
        # the command.
        parts << "tesseract #{Shellwords.escape(in_path)} #{Shellwords.escape(output_to_path)}"
        parts << additional_tessearct_options if additional_tessearct_options.present?
        parts << output_suffix

        # TODO: capture output in case of exceptions; perhaps delegate that to the #run method.
        run(parts.join(" "))
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module DerivativeRodeo
  module Generators
    ##
    # Takes images and ensures that we have a monochrome derivative of those images.
    class MonochromeGenerator < BaseGenerator
      # TODO: Can we assume a tiff?
      self.output_extension = 'mono.tiff'

      ##
      # Hand back the input when it is already monochrome; otherwise write a monochrome
      # version to the output location.
      #
      # @param input_location [StorageLocations::BaseLocation]
      # @param output_location [StorageLocations::BaseLocation]
      # @param input_tmp_file_path [String] local path of the input file.
      # @return [StorageLocations::BaseLocation]
      def build_step(input_location:, output_location:, input_tmp_file_path:)
        source_image = DerivativeRodeo::Services::ImageService.new(input_tmp_file_path)

        # Nothing to convert: the input itself serves as the monochrome file.
        return input_location if source_image.monochrome?

        monochromify(output_location, source_image)
      end

      ##
      # Convert the given image, writing the result to the tmp path of :monochrome_file.
      #
      # @param monochrome_file [StorageLocations::BaseLocation]
      # @param image [Services::ImageService]
      # @return [StorageLocations::BaseLocation]
      def monochromify(monochrome_file, image)
        monochrome_file.with_new_tmp_path { |target_path| image.convert(destination: target_path, monochrome: true) }
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+ require 'derivative_rodeo/generators/concerns/copy_file_concern'
3
+
4
module DerivativeRodeo
  module Generators
    ##
    # This class is responsible for splitting each given PDF (e.g. {#input_files}) into one image
    # per page (e.g. {#with_each_requisite_location_and_tmp_file_path}). We need to ensure that we
    # have each of those image files in S3/file storage then enqueue those files for processing.
    class PdfSplitGenerator < BaseGenerator
      ##
      # There is a duplication of the splitter name.
      #
      # @see #pdf_splitter_name
      self.output_extension = "tiff"

      include CopyFileConcern

      ##
      # @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
      #
      # @return [#call, Services::PdfSplitter::Base] memoized per name, so asking for a
      #         different name does not hand back a splitter resolved for an earlier one.
      def pdf_splitter(name: pdf_splitter_name)
        @pdf_splitters ||= {}
        @pdf_splitters[name.to_s] ||= Services::PdfSplitter.for(name)
      end

      ##
      # @return [Symbol] the last period-separated segment of the {.output_extension}.
      #
      # @see .output_extension
      def pdf_splitter_name
        output_extension.to_s.split(".").last.to_sym
      end

      ##
      # @api public
      #
      # Take the given PDF(s) and split each into one image per page. Remember that the URL
      # should account for the page number.
      #
      # When we have two PDFs (10 pages and 20 pages respectively), we will have 30 requisite files;
      # the files must have URLs that associate with their respective parent PDFs.
      #
      # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
      # @yieldparam image_path [String] where to find this file in the tmp space
      #
      # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
      def with_each_requisite_location_and_tmp_file_path
        input_files.each do |input_location|
          input_location.with_existing_tmp_path do |input_tmp_file_path|
            # The page images are written beside the downloaded PDF.
            image_paths = pdf_splitter.call(input_tmp_file_path, baseid: input_location.file_basename, tmpdir: File.dirname(input_tmp_file_path))
            image_paths.each do |image_path|
              image_location = StorageLocations::FileLocation.new("file://#{image_path}")
              yield(image_location, image_path)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
require 'shellwords'

module DerivativeRodeo
  module Generators
    ##
    # This generator is responsible for converting a given binary into a thumbnail. As of
    # <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
    class ThumbnailGenerator < BaseGenerator
      ##
      # We want to mirror the same file "last" extension as described in Hyrax.
      #
      # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
      self.output_extension = 'thumbnail.jpeg'

      ##
      # @param output_location [StorageLocations::BaseLocation]
      # @param input_tmp_file_path [String] the location of the file that we can use for processing.
      #
      # @return [StorageLocations::BaseLocation]
      def build_step(output_location:, input_tmp_file_path:, **)
        output_location.with_new_tmp_path do |out_tmp_path|
          thumbnify(path_of_file_to_create_thumbnail_from: input_tmp_file_path, path_for_thumbnail_output: out_tmp_path)
        end
      end

      ##
      # Convert the file found at :path_of_file_to_create_thumbnail_from into a thumbnail,
      # writing it to the :path_for_thumbnail_output
      #
      # @param path_of_file_to_create_thumbnail_from [String]
      # @param path_for_thumbnail_output [String]
      #
      # @return [String] the output of the `convert` command.
      def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
        # Escape both paths so spaces or shell metacharacters cannot split or alter the command.
        source = Shellwords.escape(path_of_file_to_create_thumbnail_from)
        target = Shellwords.escape(path_for_thumbnail_output)
        # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
        #
        # Go through the shared #run helper so the command is logged like every other one.
        run("convert #{source} -thumbnail '200x150>' -flatten #{target}")
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module DerivativeRodeo
  module Generators
    ##
    # Generate the word coordinates (as JSON) from the given input_uris.
    #
    # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
    class WordCoordinatesGenerator < BaseGenerator
      self.output_extension = "coordinates.json"

      ##
      # Write the word coordinates derived from the input file into the output location.
      #
      # @param output_location [StorageLocations::BaseLocation]
      # @param input_tmp_file_path [String] the location of the file that we can use for processing.
      #
      # @return [StorageLocations::BaseLocation]
      def build_step(output_location:, input_tmp_file_path:, **)
        output_location.with_new_tmp_path do |coordinates_path|
          convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_coordinate: coordinates_path)
        end
      end

      private

      ##
      # Read the hocr file, pass its contents to the service and write what the service
      # returns to :path_to_coordinate.
      #
      # @param path_to_hocr [String]
      # @param path_to_coordinate [String]
      # @param service [#call, Services::ExtractWordCoordinatesFromHocrSgmlService]
      def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
        hocr_markup = File.read(path_to_hocr)
        File.open(path_to_coordinate, "w+") { |io| io.puts service.call(hocr_markup) }
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module DerivativeRodeo
  module Services
    ##
    # @api private
    #
    # An empty placeholder class; it defines no behaviour of its own.
    class BaseService; end
  end
end
12
+
13
# Load every service that lives beside this file. Only Ruby files are required, and this
# file is skipped by its own name rather than by a pattern applied to the whole absolute path
# (which would also match an install directory containing that text).
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.basename(file, '.rb') == 'base_service'

  require file
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module DerivativeRodeo
  module Services
    ##
    #
    # A service to convert a :from_uri into a new URI via a :template.
    #
    # @see .call
    class ConvertUriViaTemplateService
      DIR_PARTS_REPLACEMENT_REGEXP = %r{\{\{\s*dir_parts\[(?<left>\-?\d+)\.\.(?<right>\-?\d+)\]\s*\}\}}.freeze
      FILENAME_REPLACEMENT_REGEXP = %r{\{\{\s*filename\s*\}\}}.freeze
      BASENAME_REPLACEMENT_REGEXP = %r{\{\{\s*basename\s*\}\}}.freeze
      EXTENSION_REPLACEMENT_REGEXP = %r{\{\{\s*extension\s*\}\}}.freeze
      # Optional whitespace on either side, like the other placeholders, so that both
      # `{{scheme}}` and `{{ scheme }}` are recognised.
      SCHEME_REPLACEMENT_REGEXP = %r{\{\{\s*scheme\s*\}\}}.freeze
      SCHEME_FOR_URI_REGEXP = %r{^(?<from_scheme>[^:]+)://}.freeze
      attr_accessor :from_uri, :template, :adapter, :separator, :uri, :from_scheme, :path, :parts, :dir_parts, :filename, :basename, :extension, :template_without_query, :template_query

      ##
      # Convert the given :from_uri to a different uri based on the given :template.
      #
      # Components of the template:
      #
      # - basename :: the file's basename without extension
      # - extension :: the file's extension with the period
      # - dir_parts :: the directory parts in which the file exists; excludes the scheme
      # - filename :: a convenience that could be represented as `basename.extension`
      # - scheme :: the scheme of the given :adapter when there is one, otherwise the scheme
      #             of the :from_uri
      #
      # The specs demonstrate the use cases.
      #
      # @param from_uri [String] Of the form "scheme://dir/parts/basename.extension"
      # @param template [String] Another URI that may contain path_parts or scheme template values.
      # @param adapter [NilClass, #scheme] when given, its scheme replaces `{{ scheme }}`.
      # @param separator [String]
      #
      # @return [String]
      #
      # @example
      #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
      #     from_uri: "file:///path1/A/file.pdf",
      #     template: "file:///dest1/{{dir_parts[-2..-1]}}/{{filename}}")
      #   => "file:///dest1/path1/A/file.pdf"
      #
      #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
      #     from_uri: "aws:///path2/B/file.pdf",
      #     template: "{{ scheme }}:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
      #   => "aws:///dest1/B/file.pdf"
      def self.call(from_uri:, template:, adapter: nil, separator: "/")
        new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
      end

      ##
      # Break the :from_uri and the :template into the pieces that {#call} substitutes.
      #
      # @param from_uri [String]
      # @param template [String]
      # @param adapter [NilClass, #scheme]
      # @param separator [String]
      def initialize(from_uri:, template:, adapter: nil, separator: "/")
        @from_uri = from_uri
        @template = template
        @adapter = adapter
        @separator = separator

        # Any query string on the source uri is discarded.
        @uri, _query = from_uri.split("?")
        @from_scheme, @path = uri.split("://")
        @parts = @path.split(separator)
        @dir_parts = @parts[0..-2]
        @filename = @parts[-1]
        @basename = File.basename(@filename, ".*")
        @extension = File.extname(@filename)

        # The template's query string is set aside and re-attached, untouched, by #call.
        @template_without_query, @template_query = template.split("?")
      end

      ##
      # Substitute every placeholder in the template.
      #
      # The block form of gsub is used throughout so that the inserted values are taken
      # literally; in the string form a backslash sequence such as `\0` inside a filename would
      # be treated as a back-reference.
      #
      # @return [String]
      def call
        to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do
          match = Regexp.last_match
          dir_parts[(match[:left].to_i)..(match[:right].to_i)].join(separator)
        end

        to_uri = to_uri.gsub(SCHEME_REPLACEMENT_REGEXP) { adapter&.scheme || from_scheme }
        to_uri = to_uri.gsub(EXTENSION_REPLACEMENT_REGEXP) { extension }
        to_uri = to_uri.gsub(BASENAME_REPLACEMENT_REGEXP) { basename }
        to_uri = to_uri.gsub(FILENAME_REPLACEMENT_REGEXP) { filename }
        to_uri = "#{to_uri}?#{template_query}" if template_query
        to_uri
      end
    end
  end
end