derivative-rodeo 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +15 -0
  4. data/README.md +251 -0
  5. data/Rakefile +42 -0
  6. data/derivative_rodeo.gemspec +54 -0
  7. data/lib/derivative/rodeo.rb +3 -0
  8. data/lib/derivative-rodeo.rb +3 -0
  9. data/lib/derivative_rodeo/configuration.rb +95 -0
  10. data/lib/derivative_rodeo/errors.rb +56 -0
  11. data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
  12. data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
  13. data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
  14. data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
  15. data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
  16. data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
  17. data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
  18. data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
  19. data/lib/derivative_rodeo/services/base_service.rb +15 -0
  20. data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
  21. data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
  22. data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
  23. data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
  24. data/lib/derivative_rodeo/services/image_service.rb +73 -0
  25. data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
  26. data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
  27. data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
  28. data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
  29. data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
  30. data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
  31. data/lib/derivative_rodeo/services/url_service.rb +42 -0
  32. data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
  33. data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
  34. data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
  35. data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
  36. data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
  37. data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
  38. data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
  39. data/lib/derivative_rodeo/technical_metadata.rb +23 -0
  40. data/lib/derivative_rodeo/version.rb +5 -0
  41. data/lib/derivative_rodeo.rb +36 -0
  42. metadata +339 -0
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ ##
5
+ # Generators execute a transformation on files and return new files.
6
+ #
7
+ # A new generator should inherit from {BaseGenerator}.
8
+ #
9
+ # @see BaseGenerator
10
+ module Generators
11
##
# The Base Generator defines the interface and the methods shared by every generator.
#
# When extending BaseGenerator you:
#
# - must assign an {.output_extension}
# - must implement a {#build_step} method
# - may override {#with_each_requisite_location_and_tmp_file_path}
class BaseGenerator
  ##
  # @!group Class Attributes
  # @!attribute [rw]
  #
  # @return [String] of the form that starts with a string and may contain periods (though
  #         likely not as the first character).
  class_attribute :output_extension
  # @!endgroup Class Attributes

  attr_reader :input_uris, :logger, :output_location_template, :preprocessed_location_template

  ##
  # @param input_uris [Array<String>] one or more URIs; a single value is wrapped in an Array.
  # @param output_location_template [String] the template used to transform the given
  #        :input_uris via {Services::ConvertUriViaTemplateService}.
  # @param preprocessed_location_template [NilClass, String] when `nil` ignore, otherwise
  #        attempt to find preprocessed uris by transforming the :input_uris via
  #        {Services::ConvertUriViaTemplateService} with the given template.
  # @param logger [Logger]
  #
  # @raise [Errors::ExtensionMissingError] when {#valid_instantiation?} is falsy.
  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
    @input_uris = Array.wrap(input_uris)
    @output_location_template = output_location_template
    @preprocessed_location_template = preprocessed_location_template
    @logger = logger

    raise Errors::ExtensionMissingError.new(klass: self.class) unless valid_instantiation?
  end

  ##
  # @api private
  #
  # An instance is valid when it is exactly a BaseGenerator (not one of its children) or when
  # an output_extension has been assigned. `instance_of?` is more specific than `is_a?`.
  #
  # @return [Object] `true` for a bare BaseGenerator, otherwise the (truthy or falsy)
  #         output_extension itself.
  def valid_instantiation?
    instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
  end

  ##
  # @api public
  #
  # @param input_location [StorageLocations::BaseLocation] the input source of the generation
  # @param output_location [StorageLocations::BaseLocation] the output location of the generation
  # @param input_tmp_file_path [String] the temporary path to the location of the given
  #        :input_location to enable further processing on the file.
  #
  # @return [StorageLocations::BaseLocation]
  # @raise [NotImplementedError] always; subclasses provide the implementation.
  # @see #generated_files
  def build_step(input_location:, output_location:, input_tmp_file_path:)
    raise NotImplementedError, "#{self.class}#build_step"
  end

  ##
  # @api public
  #
  # Collects, once per instance, one entry per requisite location: the destination when it
  # already exists, otherwise whatever {#build_step} returns.
  #
  # The explicit accumulator (instead of relying on a mapped return value) keeps subclass
  # implementations of {#with_each_requisite_location_and_tmp_file_path} and {#build_step}
  # free from having to return anything in particular from their iteration.
  #
  # @return [Array<StorageLocations::BaseLocation>]
  #
  # @see #build_step
  # @see #with_each_requisite_location_and_tmp_file_path
  def generated_files
    return @generated_files if defined?(@generated_files)

    @generated_files = []
    with_each_requisite_location_and_tmp_file_path do |requisite_location, requisite_tmp_path|
      target = destination(requisite_location)
      produced =
        if target.exist?
          target
        else
          build_step(input_location: requisite_location, output_location: target, input_tmp_file_path: requisite_tmp_path)
        end
      @generated_files << produced
    end
    @generated_files
  end

  ##
  # @return [Array<String>] the file_uri of each generated file; a `nil` entry stays `nil`.
  # @see #generated_files
  def generated_uris
    # TODO: what do we do about nils?
    generated_files.map { |location| location.nil? ? nil : location.file_uri }
  end

  ##
  # @api public
  #
  # The files that are required as part of the {#generated_files} (though more precisely the
  # {#build_step}.)
  #
  # This method is responsible for one thing:
  #
  # - yielding a {StorageLocations::BaseLocation} and the path (as String) to the file's
  #   location in the temporary working space.
  #
  # Child classes may override this to modify the file_uris; for example to filter out files
  # that are not of the correct type, or to make "this" generator depend on another generator.
  # The {Generators::HocrGenerator} requires that the input_location be monochrome, so it
  # converts each given input_location. The {Generators::PdfSplitGenerator} uses this method
  # to take each given PDF and generate one image per page; those images are then treated as
  # the requisite locations.
  #
  # @yieldparam input_location [StorageLocations::BaseLocation] the from location as
  #             represented by a URI.
  # @yieldparam tmp_file_path [String] where to find the input_location's file in the
  #             processing tmp space.
  #
  # @see Generators::HocrGenerator
  # @see Generators::PdfSplitGenerator
  def with_each_requisite_location_and_tmp_file_path
    input_files.each do |location|
      location.with_existing_tmp_path { |tmp_path| yield(location, tmp_path) }
    end
  end

  ##
  # @return [Array<StorageLocations::BaseLocation>] one location per entry of {#input_uris},
  #         built once and then reused.
  def input_files
    @input_files ||= input_uris.map { |uri| DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri) }
  end

  ##
  # Returns the location destination for the given :input_location. The file at that
  # destination might exist or might not. In the case of non-existence, the {#build_step}
  # will create the file.
  #
  # @param input_location [StorageLocations::BaseLocation]
  #
  # @return [StorageLocations::BaseLocation] the derivative of the given location based on
  #         either the {#output_location_template} or {#preprocessed_location_template}.
  #
  # @see StorageLocations::BaseLocation#exist?
  def destination(input_location)
    output_location = input_location.derived_file_from(template: output_location_template)
    return output_location if output_location.exist? || !preprocessed_location_template

    preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)

    # Prefer an existing preprocessed file. Otherwise hand back the (not yet existing)
    # output location so that #build_step knows where to write the file.
    preprocessed_location&.exist? ? preprocessed_location : output_location
  end

  ##
  # A bit of indirection to create a common interface for running a shell command. The
  # command and its output are written to the logger at debug level.
  #
  # @param command [String]
  # @return [String] what the command wrote to standard output.
  def run(command)
    logger.debug "* Start command: #{command}"
    # TODO: What kind of error handling do we want?
    result = %x(#{command})
    logger.debug "* Result: \n* #{result.gsub("\n", "\n* ")}"
    logger.debug "* End command: #{command}"
    result
  end
end
195
+ end
196
+ end
197
+
198
# Eagerly load every generator (and generator concern) that lives beside this file.
#
# Only Ruby files are considered so that a stray non-Ruby file in the tree is never handed to
# `require`. This file is skipped by its own name (not by a match anywhere in the absolute
# path) because it is already being loaded.
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.directory?(file) || File.basename(file).match?(/base_generator/)

  require file
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+ module DerivativeRodeo
3
+ module Generators
4
##
# A mixin that provides a {#build_step} which relocates the temporary input file to the
# output location rather than transforming it.
module CopyFileConcern
  ##
  # Copy a file from one adapter to another.
  #
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String]
  #
  # @return [StorageLocations::BaseLocation] documented contract; in practice this is
  #         whatever output_location#with_new_tmp_path returns.
  def build_step(output_location:, input_tmp_file_path:, **_unused)
    copy(input_tmp_file_path, output_location)
  end

  ##
  # @api private
  #
  # @param from_path [String] the file to relocate
  # @param output_location [#with_new_tmp_path] yields the path the file is moved to
  def copy(from_path, output_location)
    # Moving (not copying) is fine: we are done with the tmp file after this.
    output_location.with_new_tmp_path { |destination_path| FileUtils.mv(from_path, destination_path) }
  end
end
27
+ end
28
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+ require 'derivative_rodeo/generators/concerns/copy_file_concern'
3
+
4
+ module DerivativeRodeo
5
+ module Generators
6
##
# Responsible for moving files from one storage adapter to another. The actual work is
# provided by {CopyFileConcern#build_step}.
class CopyGenerator < BaseGenerator
  include CopyFileConcern

  # NOTE(review): StorageLocations::SAME presumably means "keep the input's extension";
  # confirm against StorageLocations.
  self.output_extension = StorageLocations::SAME
end
13
+ end
14
+ end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Generators
5
require 'shellwords'

##
# Responsible for finding or creating a hocr file (or configured :output_suffix) using
# tesseract. Will create and store a monochrome derivative if one is not found.
#
# @see http://tesseract-ocr.github.io
#
# From `tesseract -h`
#
#   Usage:
#     tesseract --help | --help-extra | --version
#     tesseract --list-langs
#     tesseract imagename outputbase [options...] [configfile...]
class HocrGenerator < BaseGenerator
  ##
  # @!group Class Attributes
  # @!attribute [rw]
  #   Command environment variables for the tesseract command; default
  #   `"OMP_THREAD_LIMIT=1"`. Should be a space separated string of KEY=value pairs.
  #
  #   @example
  #     # this works for space_stone aws lambda
  #     DerivativeRodeo::Generators::HocrGenerator.command_environment_variables =
  #       'OMP_THREAD_LIMIT=1 TESSDATA_PREFIX=/opt/share/tessdata LD_LIBRARY_PATH=/opt/lib PATH=/opt/bin:$PATH'
  class_attribute :command_environment_variables, default: "OMP_THREAD_LIMIT=1"

  ##
  # @!attribute [rw]
  #   Additional options to send to tesseract command; default `nil`.
  class_attribute :additional_tessearct_options, default: nil

  # @!attribute [rw]
  #   The final argument appended to the tesseract command; default `:hocr`.
  class_attribute :output_suffix, default: :hocr

  self.output_extension = 'hocr'
  # @!endgroup Class Attributes

  ##
  # Run tesseract on the monochrome file and store the resulting output in the configured
  # {.output_extension} (default 'hocr')
  #
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String]
  #
  # @return [StorageLocations::BaseLocation]
  #
  # @see #with_each_requisite_location_and_tmp_file_path
  def build_step(output_location:, input_tmp_file_path:, **)
    tesseractify(input_tmp_file_path, output_location)
  end

  ##
  # @param builder [Class, #generated_files]
  #
  # When generating a hocr file from an image, we've found the best results are when we're
  # processing a monochrome image. As such, this generator will auto-convert a given image to
  # monochrome by running the given :builder over the same input_uris.
  #
  # @yieldparam file [StorageLocations::BaseLocation]
  # @yieldparam tmp_path [String]
  #
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
  def with_each_requisite_location_and_tmp_file_path(builder: MonochromeGenerator)
    mono_location_template = output_location_template.gsub(self.class.output_extension, builder.output_extension)
    requisite_files = builder.new(input_uris: input_uris, output_location_template: mono_location_template).generated_files
    requisite_files.each do |input_location|
      input_location.with_existing_tmp_path do |tmp_file_path|
        yield(input_location, tmp_file_path)
      end
    end
  end

  ##
  # @api private
  #
  # Call `tesseract` on the monochrome file and store the resulting hocr
  # in the tmp_path
  #
  # @param input_tmp_file_path [String]
  # @param output_location [StorageLocations::BaseLocation]
  def tesseractify(input_tmp_file_path, output_location)
    output_location.with_new_tmp_path do |out_tmp_path|
      run_tesseract(input_tmp_file_path, out_tmp_path)
    end
  end

  ##
  # Builds and runs the tesseract command line.
  #
  # @param in_path [String] the source of the file
  # @param out_path [String] where the output should end up, including the
  #        ".<output_extension>" suffix
  # @return [String] the output of the command, as returned by {BaseGenerator#run}
  def run_tesseract(in_path, out_path)
    # tesseract adds the extension back, so pull it off the output path. Only a trailing
    # ".<output_extension>" is removed; an identical fragment earlier in the path (for
    # example in a directory name) is left alone.
    output_to_path = out_path.sub(/#{Regexp.escape(".#{output_extension}")}\z/, '')

    segments = []
    segments << command_environment_variables if command_environment_variables.present?
    # Paths are shell-escaped so that spaces or shell metacharacters in a file name cannot
    # split or alter the command. The configured variables/options are passed through as is.
    segments << "tesseract #{Shellwords.escape(in_path)} #{Shellwords.escape(output_to_path)}"
    segments << additional_tessearct_options if additional_tessearct_options.present?
    segments << output_suffix

    # TODO: capture output in case of exceptions; perhaps delegate that to the #run method.
    run(segments.join(" "))
  end
end
111
+ end
112
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Generators
5
##
# Takes images and ensures that we have a monochrome derivative of those images.
class MonochromeGenerator < BaseGenerator
  # TODO: Can we assume a tiff?
  self.output_extension = 'mono.tiff'

  ##
  # @param input_location [StorageLocations::BaseLocation]
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String] local path of the image to inspect
  # @return [StorageLocations::BaseLocation] the :input_location when its image is already
  #         monochrome, otherwise the result of {#monochromify}.
  def build_step(input_location:, output_location:, input_tmp_file_path:)
    image = DerivativeRodeo::Services::ImageService.new(input_tmp_file_path)

    # Already monochrome: nothing to convert, so the input itself is the derivative.
    return input_location if image.monochrome?

    monochromify(output_location, image)
  end

  ##
  # Convert the given image to a monochrome file at the given location.
  #
  # @param monochrome_file [StorageLocations::BaseLocation]
  # @param image [Services::ImageService]
  # @return [StorageLocations::BaseLocation]
  def monochromify(monochrome_file, image)
    monochrome_file.with_new_tmp_path { |target_path| image.convert(destination: target_path, monochrome: true) }
  end
end
38
+ end
39
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+ require 'derivative_rodeo/generators/concerns/copy_file_concern'
3
+
4
+ module DerivativeRodeo
5
+ module Generators
6
##
# This class is responsible for splitting each given PDF (e.g. {#input_files}) into one image
# per page (e.g. {#with_each_requisite_location_and_tmp_file_path}). We need to ensure that we
# have each of those image files in S3/file storage then enqueue those files for processing.
class PdfSplitGenerator < BaseGenerator
  ##
  # There is a duplication of the splitter name.
  #
  # @see #pdf_splitter_name
  self.output_extension = "tiff"

  include CopyFileConcern

  ##
  # Looks up the splitter for the given name. Splitters are remembered per name, so asking
  # for a different name after the first call returns the splitter for that name rather
  # than the first one that happened to be resolved.
  #
  # @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
  #
  # @return [#call, Services::PdfSplitter::Base]
  def pdf_splitter(name: pdf_splitter_name)
    @pdf_splitters ||= {}
    @pdf_splitters[name] ||= Services::PdfSplitter.for(name)
  end

  ##
  # @return [Symbol] the last period-separated segment of the output_extension.
  #
  # @see .output_extension
  def pdf_splitter_name
    output_extension.to_s.split(".").last.to_sym
  end

  ##
  # @api public
  #
  # Take the given PDF(s) and split them into one image per page. Remember that the URL
  # should account for the page number.
  #
  # When we have two PDFs (10 pages and 20 pages respectively), we will have 30 requisite files;
  # the files must have URLs that associate with their respective parent PDFs.
  #
  # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
  # @yieldparam image_path [String] where to find this file in the tmp space
  #
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
  def with_each_requisite_location_and_tmp_file_path
    input_files.each do |input_location|
      input_location.with_existing_tmp_path do |input_tmp_file_path|
        # The page images are written beside the PDF's temporary file.
        image_paths = pdf_splitter.call(input_tmp_file_path, baseid: input_location.file_basename, tmpdir: File.dirname(input_tmp_file_path))
        image_paths.each do |image_path|
          image_location = StorageLocations::FileLocation.new("file://#{image_path}")
          yield(image_location, image_path)
        end
      end
    end
  end
end
60
+ end
61
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Generators
5
require 'shellwords'

##
# This generator is responsible for converting a given binary into a thumbnail. As of
# <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
class ThumbnailGenerator < BaseGenerator
  ##
  # We want to mirror the same file "last" extension as described in Hyrax.
  #
  # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
  self.output_extension = 'thumbnail.jpeg'

  ##
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
  #
  # @return [StorageLocations::BaseLocation]
  def build_step(output_location:, input_tmp_file_path:, **)
    output_location.with_new_tmp_path do |out_tmp_path|
      thumbnify(path_of_file_to_create_thumbnail_from: input_tmp_file_path, path_for_thumbnail_output: out_tmp_path)
    end
  end

  ##
  # Convert the file found at :path_of_file_to_create_thumbnail_from into a thumbnail,
  # writing it to the :path_for_thumbnail_output.
  #
  # The command goes through {BaseGenerator#run} so that it is logged like every other
  # shell command, and both paths are shell-escaped so that spaces or shell metacharacters
  # in a file name cannot split or alter the command.
  #
  # @param path_of_file_to_create_thumbnail_from [String]
  # @param path_for_thumbnail_output [String]
  # @return [String] the output of the `convert` command
  def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
    source = Shellwords.escape(path_of_file_to_create_thumbnail_from)
    target = Shellwords.escape(path_for_thumbnail_output)
    # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
    run("convert #{source} -thumbnail '200x150>' -flatten #{target}")
  end
end
37
+ end
38
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Generators
5
##
# Generate the word coordinates (as JSON) from the given input_uris.
#
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
class WordCoordinatesGenerator < BaseGenerator
  self.output_extension = "coordinates.json"

  ##
  # @param output_location [StorageLocations::BaseLocation]
  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
  #
  # @return [StorageLocations::BaseLocation]
  def build_step(output_location:, input_tmp_file_path:, **)
    output_location.with_new_tmp_path do |coordinates_path|
      convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_coordinate: coordinates_path)
    end
  end

  private

  ##
  # Reads the hocr file, then writes the service's result for it to :path_to_coordinate.
  #
  # @param path_to_hocr [String]
  # @param path_to_coordinate [String]
  # @param service [#call, Services::ExtractWordCoordinatesFromHocrSgmlService]
  def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
    hocr_markup = File.read(path_to_hocr)
    # The service is invoked only after the output file has been opened.
    File.open(path_to_coordinate, "w+") { |io| io.puts service.call(hocr_markup) }
  end
end
38
+ end
39
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
##
# @api private
#
# A placeholder parent class for services; it defines no behavior of its own.
class BaseService; end
10
+ end
11
+ end
12
+
13
# Eagerly load every service that lives beside this file.
#
# Only Ruby files are considered so that a stray non-Ruby file in the tree is never handed to
# `require`. This file is skipped by its own name (not by a match anywhere in the absolute
# path) because it is already being loaded.
Dir.glob(File.join(__dir__, '**', '*.rb')).sort.each do |file|
  next if File.directory?(file) || File.basename(file).match?(/base_service/)

  require file
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
##
# A service to convert a :from_uri into a new URI via a :template.
#
# @see .call
class ConvertUriViaTemplateService
  DIR_PARTS_REPLACEMENT_REGEXP = %r{\{\{\s*dir_parts\[(?<left>\-?\d+)\.\.(?<right>\-?\d+)\]\s*\}\}}.freeze
  FILENAME_REPLACEMENT_REGEXP = %r{\{\{\s*filename\s*\}\}}.freeze
  BASENAME_REPLACEMENT_REGEXP = %r{\{\{\s*basename\s*\}\}}.freeze
  EXTENSION_REPLACEMENT_REGEXP = %r{\{\{\s*extension\s*\}\}}.freeze
  # Whitespace around the name is optional, exactly like the other placeholders.
  SCHEME_REPLACEMENT_REGEXP = %r{\{\{\s*scheme\s*\}\}}.freeze
  SCHEME_FOR_URI_REGEXP = %r{^(?<from_scheme>[^:]+)://}.freeze
  attr_accessor :from_uri, :template, :adapter, :separator, :uri, :from_scheme, :path, :parts, :dir_parts, :filename, :basename, :extension, :template_without_query, :template_query

  ##
  # Convert the given :from_uri to a different uri based on the given :template.
  #
  # Components of the template (each written as `{{ name }}`, inner whitespace optional):
  #
  # - basename :: the file's basename without extension
  # - extension :: the file's extension with the period
  # - dir_parts :: the directory parts in which the file exists; excludes the scheme. Used
  #                with an inclusive range, e.g. `{{dir_parts[-2..-1]}}`
  # - filename :: a convenience that could be represented as `{{basename}}{{extension}}`
  # - scheme :: the scheme of the given :adapter when there is one, otherwise the scheme of
  #             the :from_uri
  #
  # Any query string on the :from_uri is discarded; a query string on the :template is
  # carried over untouched.
  #
  # @param from_uri [String] Of the form "scheme://dir/parts/basename.extension"
  # @param template [String] Another URI that may contain the template values listed above.
  # @param adapter [NilClass, #scheme] typically a storage location
  # @param separator [String] what separates the parts of the path
  #
  # @return [String]
  #
  # @example
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "file:///path1/A/file.pdf",
  #     template: "file:///dest1/{{dir_parts[-2..-1]}}/{{filename}}")
  #   => "file:///dest1/path1/A/file.pdf"
  #
  #   DerivativeRodeo::Services::ConvertUriViaTemplateService.call(
  #     from_uri: "aws:///path2/B/file.pdf",
  #     template: "{{ scheme }}:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
  #   => "aws:///dest1/B/file.pdf"
  def self.call(from_uri:, template:, adapter: nil, separator: "/")
    new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
  end

  ##
  # Breaks the :from_uri and the :template into the pieces that {#call} works with.
  #
  # @param from_uri [String]
  # @param template [String]
  # @param adapter [NilClass, #scheme]
  # @param separator [String]
  def initialize(from_uri:, template:, adapter: nil, separator: "/")
    @from_uri = from_uri
    @template = template
    @adapter = adapter
    @separator = separator

    # Split on the first delimiter only, so a later "?" or "://" stays part of what follows.
    @uri, _query = from_uri.split("?", 2)
    @from_scheme, @path = uri.split("://", 2)
    @parts = @path.split(separator)
    @dir_parts = @parts[0..-2]
    @filename = @parts[-1]
    @basename = File.basename(@filename, ".*")
    @extension = File.extname(@filename)

    @template_without_query, @template_query = template.split("?", 2)
  end

  ##
  # @return [String] the template with every placeholder replaced.
  def call
    to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do
      match = Regexp.last_match
      # A range that lies completely outside of dir_parts slices to nil; treat that as
      # "no directories".
      Array(dir_parts[(match[:left].to_i)..(match[:right].to_i)]).join(separator)
    end

    # Block replacements insert the values literally. A String replacement would interpret
    # backslash sequences (e.g. "\0") that happen to be part of a file name.
    to_uri = to_uri.gsub(SCHEME_REPLACEMENT_REGEXP) { adapter&.scheme || from_scheme }
    to_uri = to_uri.gsub(EXTENSION_REPLACEMENT_REGEXP) { extension }
    to_uri = to_uri.gsub(BASENAME_REPLACEMENT_REGEXP) { basename }
    to_uri = to_uri.gsub(FILENAME_REPLACEMENT_REGEXP) { filename }

    template_query ? "#{to_uri}?#{template_query}" : to_uri
  end
end
86
+ end
87
+ end