derivative-rodeo 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/alto_generator.rb +42 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +10 -5
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +74 -10
- data/lib/derivative_rodeo/generators/plain_text_generator.rb +42 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +37 -2
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +1 -1
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +120 -5
- data/lib/derivative_rodeo/services/mime_type_service.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +39 -43
- data/lib/derivative_rodeo/services/url_service.rb +1 -1
- data/lib/derivative_rodeo/storage_locations/base_location.rb +18 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +3 -5
- data/lib/derivative_rodeo/storage_locations/file_location.rb +4 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +64 -5
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +27 -9
- data/lib/derivative_rodeo/version.rb +1 -1
- data/lib/derivative_rodeo.rb +4 -0
- data/lib/spec_support/aws_s3_faux_bucket.rb +48 -0
- data/lib/spec_support/aws_sqs_faux_client.rb +36 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
|
4
|
+
data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
|
7
|
+
data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Generators
|
7
|
+
##
|
8
|
+
# Generate the Alto XML from the given input_uris.
|
9
|
+
#
|
10
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
11
|
+
class AltoGenerator < BaseGenerator
|
12
|
+
self.output_extension = "alto.xml"
|
13
|
+
|
14
|
+
class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param output_location [StorageLocations::BaseLocation]
|
18
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
19
|
+
#
|
20
|
+
# @return [StorageLocations::BaseLocation]
|
21
|
+
#
|
22
|
+
# @see #requisite_files
|
23
|
+
def build_step(output_location:, input_tmp_file_path:, **)
|
24
|
+
output_location.with_new_tmp_path do |output_tmp_file_path|
|
25
|
+
convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_alto: output_tmp_file_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
##
|
32
|
+
# @param path_to_hocr [String]
|
33
|
+
# @param path_to_alto [String]
|
34
|
+
def convert_to_coordinates(path_to_hocr:, path_to_alto:)
|
35
|
+
hocr_html = File.read(path_to_hocr)
|
36
|
+
File.open(path_to_alto, "w+") do |file|
|
37
|
+
file.puts service.call(hocr_html).to_alto
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -27,7 +27,6 @@ module DerivativeRodeo
|
|
27
27
|
# @!endgroup Class Attributes
|
28
28
|
|
29
29
|
attr_reader :input_uris,
|
30
|
-
:logger,
|
31
30
|
:output_location_template,
|
32
31
|
:preprocessed_location_template
|
33
32
|
|
@@ -39,23 +38,25 @@ module DerivativeRodeo
|
|
39
38
|
# to find preprocessed uris by transforming the :input_uris via
|
40
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
41
40
|
# :preprocessed_location_template.
|
42
|
-
|
43
|
-
|
41
|
+
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
+
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
44
43
|
@input_uris = Array.wrap(input_uris)
|
45
44
|
@output_location_template = output_location_template
|
46
45
|
@preprocessed_location_template = preprocessed_location_template
|
47
|
-
@logger = logger
|
48
46
|
|
49
47
|
return if valid_instantiation?
|
50
48
|
|
51
49
|
raise Errors::ExtensionMissingError.new(klass: self.class)
|
52
50
|
end
|
53
51
|
|
52
|
+
delegate :logger, to: DerivativeRodeo
|
53
|
+
|
54
54
|
##
|
55
55
|
# @api private
|
56
56
|
#
|
57
57
|
# @return [Boolean]
|
58
58
|
def valid_instantiation?
|
59
|
+
# TODO: Does this even make sense.
|
59
60
|
# When we have a BaseGenerator and not one of its children or when we've assigned the
|
60
61
|
# output_extension. instance_of? is more specific than is_a?
|
61
62
|
instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
|
@@ -83,6 +84,7 @@ module DerivativeRodeo
|
|
83
84
|
# @see #build_step
|
84
85
|
# @see #with_each_requisite_location_and_tmp_file_path
|
85
86
|
def generated_files
|
87
|
+
# TODO: Examples please
|
86
88
|
return @generated_files if defined?(@generated_files)
|
87
89
|
|
88
90
|
# As much as I would like to use map or returned values; given the implementations it's
|
@@ -92,6 +94,9 @@ module DerivativeRodeo
|
|
92
94
|
# helps ease subclass implementations of the #with_each_requisite_location_and_tmp_file_path or
|
93
95
|
# #build_step
|
94
96
|
@generated_files = []
|
97
|
+
|
98
|
+
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
99
|
+
# "file:///Users/jfriesen/.profile"
|
95
100
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
96
101
|
generated_file = destination(input_location)
|
97
102
|
@generated_files << if generated_file.exist?
|
@@ -170,7 +175,7 @@ module DerivativeRodeo
|
|
170
175
|
return output_location unless preprocessed_location_template
|
171
176
|
|
172
177
|
preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
|
173
|
-
# We only want
|
178
|
+
# We only want the location if it exists
|
174
179
|
return preprocessed_location if preprocessed_location&.exist?
|
175
180
|
|
176
181
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
@@ -17,19 +17,66 @@ module DerivativeRodeo
|
|
17
17
|
include CopyFileConcern
|
18
18
|
|
19
19
|
##
|
20
|
-
#
|
20
|
+
# A helper method for downstream implementations to ask if this file is perhaps split from a
|
21
|
+
# PDF.
|
21
22
|
#
|
22
|
-
# @
|
23
|
-
|
24
|
-
|
23
|
+
# @param filename [String]
|
24
|
+
# @param extension [String] the extension (either with or without the leading period); if none
|
25
|
+
# is provided use the extension of the given :filename.
|
26
|
+
# @return [TrueClass] when the file name likely represents a file split from a PDF.
|
27
|
+
# @return [FalseClass] when the file name does not, by convention, represent a file split from
|
28
|
+
# a PDF.
|
29
|
+
#
|
30
|
+
# @see #image_file_basename_template
|
31
|
+
def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
|
32
|
+
extension ||= File.extname(filename)
|
33
|
+
|
34
|
+
# Strip the leading period from the extension.
|
35
|
+
extension = extension[1..-1] if extension.start_with?('.')
|
36
|
+
regexp = %r{--page-\d+\.#{extension}$}
|
37
|
+
!!regexp.match(filename)
|
25
38
|
end
|
26
39
|
|
27
40
|
##
|
28
|
-
# @
|
41
|
+
# @param basename [String] The given PDF file's base name (e.g. "hello.pdf" would have a base name of
|
42
|
+
# "hello").
|
43
|
+
#
|
44
|
+
# @return [String] A template for the filenames of the images produced by Ghostscript.
|
45
|
+
#
|
46
|
+
# @note This must include "%d" in the returning value, as that is how Ghostscript will assign
|
47
|
+
# the page number.
|
48
|
+
#
|
49
|
+
# @note I have extracted this function to make it abundantly clear the expected location
|
50
|
+
# of each split image. Further, there is an interaction between this template and the tail_glob built in #existing_page_locations.
|
29
51
|
#
|
30
|
-
# @see
|
31
|
-
|
32
|
-
|
52
|
+
# @see #existing_page_locations
|
53
|
+
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
|
+
def image_file_basename_template(basename:)
|
55
|
+
"#{basename}/pages/#{basename}--page-%d.#{output_extension}"
|
56
|
+
end
|
57
|
+
|
58
|
+
##
|
59
|
+
# We want to check the output location and pre-processed location for the existence of already
|
60
|
+
# split pages. This method checks both places.
|
61
|
+
#
|
62
|
+
# @param input_location [StorageLocations::BaseLocation]
|
63
|
+
#
|
64
|
+
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
+
# with :tail_glob.
|
66
|
+
#
|
67
|
+
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
|
+
#
|
69
|
+
# @note The tail_glob is in relation to the {#image_file_basename_template}
|
70
|
+
def existing_page_locations(input_location:)
|
71
|
+
# See image_file_basename_template
|
72
|
+
tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
|
73
|
+
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
|
75
|
+
return output_locations if output_locations.count.positive?
|
76
|
+
|
77
|
+
return [] if preprocessed_location_template.blank?
|
78
|
+
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
|
33
80
|
end
|
34
81
|
|
35
82
|
##
|
@@ -44,18 +91,35 @@ module DerivativeRodeo
|
|
44
91
|
# @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
|
45
92
|
# @yieldparam image_path [String] where to find this file in the tmp space
|
46
93
|
#
|
94
|
+
# @note This function makes a concession; namely that if it encounters any
|
95
|
+
# {#existing_page_locations} it will use all of that result as the entire number of pages.
|
96
|
+
# We could make this smarter but at the moment we're deferring on that.
|
97
|
+
#
|
47
98
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
|
+
#
|
100
|
+
# rubocop:disable Metrics/MethodLength
|
48
101
|
def with_each_requisite_location_and_tmp_file_path
|
49
102
|
input_files.each do |input_location|
|
50
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
51
|
-
|
52
|
-
|
104
|
+
## We want a single call for a directory listing of the image_file_basename_template
|
105
|
+
generated_files = existing_page_locations(input_location: input_location)
|
106
|
+
|
107
|
+
if generated_files.count.zero?
|
108
|
+
generated_files = Services::PdfSplitter.call(
|
109
|
+
input_tmp_file_path,
|
110
|
+
image_extension: output_extension,
|
111
|
+
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
+
)
|
113
|
+
end
|
114
|
+
|
115
|
+
generated_files.each do |image_path|
|
53
116
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
54
117
|
yield(image_location, image_path)
|
55
118
|
end
|
56
119
|
end
|
57
120
|
end
|
58
121
|
end
|
122
|
+
# rubocop:enable Metrics/MethodLength
|
59
123
|
end
|
60
124
|
end
|
61
125
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Generators
|
7
|
+
##
|
8
|
+
# Generate the plain text from the given input_uris.
|
9
|
+
#
|
10
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
11
|
+
class PlainTextGenerator < BaseGenerator
|
12
|
+
self.output_extension = "plain_text.txt"
|
13
|
+
|
14
|
+
class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param output_location [StorageLocations::BaseLocation]
|
18
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
19
|
+
#
|
20
|
+
# @return [StorageLocations::BaseLocation]
|
21
|
+
#
|
22
|
+
# @see #requisite_files
|
23
|
+
def build_step(output_location:, input_tmp_file_path:, **)
|
24
|
+
output_location.with_new_tmp_path do |output_tmp_file_path|
|
25
|
+
convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_plain_text: output_tmp_file_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
##
|
32
|
+
# @param path_to_hocr [String]
|
33
|
+
# @param path_to_plain_text [String]
|
34
|
+
def convert_to_coordinates(path_to_hocr:, path_to_plain_text:)
|
35
|
+
hocr_html = File.read(path_to_hocr)
|
36
|
+
File.open(path_to_plain_text, "w+") do |file|
|
37
|
+
file.puts service.call(hocr_html).to_text
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -6,12 +6,33 @@ module DerivativeRodeo
|
|
6
6
|
# This generator is responsible for converting a given binary into a thumbnail. As of
|
7
7
|
# <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
|
8
8
|
class ThumbnailGenerator < BaseGenerator
|
9
|
+
##
|
10
|
+
# @!group Class Attributes
|
11
|
+
|
9
12
|
##
|
10
13
|
# We want to mirror the same file "last" extension as described in Hyrax.
|
11
14
|
#
|
12
15
|
# @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
|
13
16
|
self.output_extension = 'thumbnail.jpeg'
|
14
17
|
|
18
|
+
##
|
19
|
+
# @!attribute dimensions_by_type
|
20
|
+
#
|
21
|
+
# @return [Hash<Symbol,String>] the "types" (as categorized by
|
22
|
+
# Hyrax::FileSetDerivativeService). These aren't mime-types per se but a conceptual
|
23
|
+
# distillation of that.
|
24
|
+
#
|
25
|
+
# @see https://github.com/samvera/hyrax/blob/815e0abaacf9f331a5640c5d6129661d01eadf75/app/services/hyrax/file_set_derivatives_service.rb
|
26
|
+
class_attribute :dimensions_by_type, default: { pdf: "338x493" }
|
27
|
+
|
28
|
+
##
|
29
|
+
# @!attribute dimensions_fallback
|
30
|
+
#
|
31
|
+
# @return [String] when there's no match for {.dimensions_by_type} use this value.
|
32
|
+
class_attribute :dimensions_fallback, default: "200x150>"
|
33
|
+
# @!endgroup Class Attributes
|
34
|
+
##
|
35
|
+
|
15
36
|
##
|
16
37
|
# @param output_location [StorageLocations::BaseLocation]
|
17
38
|
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
@@ -23,6 +44,20 @@ module DerivativeRodeo
|
|
23
44
|
end
|
24
45
|
end
|
25
46
|
|
47
|
+
##
|
48
|
+
# @param filename [String]
|
49
|
+
# @return [String]
|
50
|
+
#
|
51
|
+
# @see .dimensions_by_type
|
52
|
+
# @see .dimensions_fallback
|
53
|
+
def self.dimensions_for(filename:)
|
54
|
+
type = DerivativeRodeo::Services::MimeTypeService.hyrax_type(filename: filename)
|
55
|
+
dimensions_by_type.fetch(type, dimensions_fallback)
|
56
|
+
end
|
57
|
+
|
58
|
+
# Want to expose the dimensions_for as an instance method
|
59
|
+
delegate :dimensions_for, to: :class
|
60
|
+
|
26
61
|
##
|
27
62
|
# Convert the file found at :path_to_input into a thumbnail, writing it to the
|
28
63
|
# :path_for_thumbnail_output
|
@@ -30,8 +65,8 @@ module DerivativeRodeo
|
|
30
65
|
# @param path_of_file_to_create_thumbnail_from [String]
|
31
66
|
# @param path_for_thumbnail_output [String]
|
32
67
|
def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
|
33
|
-
|
34
|
-
`convert #{path_of_file_to_create_thumbnail_from} -thumbnail '
|
68
|
+
dimensions = dimensions_for(filename: path_of_file_to_create_thumbnail_from)
|
69
|
+
`convert #{path_of_file_to_create_thumbnail_from} -thumbnail '#{dimensions}' -flatten #{path_for_thumbnail_output}`
|
35
70
|
end
|
36
71
|
end
|
37
72
|
end
|
@@ -31,7 +31,7 @@ module DerivativeRodeo
|
|
31
31
|
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
|
32
32
|
hocr_html = File.read(path_to_hocr)
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
|
-
file.puts service.call(hocr_html)
|
34
|
+
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
@@ -13,7 +13,7 @@ module DerivativeRodeo
|
|
13
13
|
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
|
14
14
|
# @return [ExtractWordCoordinatesFromHocrSgmlService] an instance that responds to #to_json, #to_text, and #to_alto
|
15
15
|
def self.call(sgml)
|
16
|
-
new(sgml)
|
16
|
+
new(sgml)
|
17
17
|
end
|
18
18
|
|
19
19
|
##
|
@@ -42,6 +42,21 @@ module DerivativeRodeo
|
|
42
42
|
end
|
43
43
|
alias json to_json
|
44
44
|
|
45
|
+
# Output plain text; named #to_text to keep the method names consistent with #to_json.
|
46
|
+
#
|
47
|
+
# @return [String] plain text of OCR'd document
|
48
|
+
def to_text
|
49
|
+
@to_text ||= doc_stream.text
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_alto
|
53
|
+
@to_alto ||= AltoXml.to_alto(
|
54
|
+
words: doc_stream.words,
|
55
|
+
width: doc_stream.width,
|
56
|
+
height: doc_stream.height
|
57
|
+
)
|
58
|
+
end
|
59
|
+
|
45
60
|
private
|
46
61
|
|
47
62
|
def xml?(xml)
|
@@ -121,6 +136,7 @@ module DerivativeRodeo
|
|
121
136
|
# add trailing space to plaintext buffer for between words:
|
122
137
|
@text += ' '
|
123
138
|
@words.push(@current) if word_complete?
|
139
|
+
@current = nil # clear the current word
|
124
140
|
end
|
125
141
|
|
126
142
|
def end_line
|
@@ -156,10 +172,13 @@ module DerivativeRodeo
|
|
156
172
|
# Callback for element end; at this time, flush word coordinate state
|
157
173
|
# for current word, and append line endings to plain text:
|
158
174
|
#
|
159
|
-
# @param
|
160
|
-
def end_element(
|
161
|
-
|
162
|
-
|
175
|
+
# @param name [String] element name.
|
176
|
+
def end_element(name)
|
177
|
+
if name == 'span'
|
178
|
+
end_word if @element_class_name == 'ocrx_word'
|
179
|
+
@text += "\n" if @element_class_name.nil?
|
180
|
+
end
|
181
|
+
@element_class_name = nil
|
163
182
|
end
|
164
183
|
|
165
184
|
# Callback for completion of parsing hOCR, used to normalize generated
|
@@ -213,6 +232,102 @@ module DerivativeRodeo
|
|
213
232
|
JSON.generate(payload)
|
214
233
|
end
|
215
234
|
end
|
235
|
+
|
236
|
+
class AltoXml
|
237
|
+
##
|
238
|
+
# @api public
|
239
|
+
#
|
240
|
+
# @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
|
241
|
+
# @param width [Integer, nil] the width of the "canvas" on which the words appear.
|
242
|
+
# @param height [Integer, nil] the height of the "canvas" on which the words appear.
|
243
|
+
#
|
244
|
+
# @return [String] the ALTO XML representation of the given words and their coordinates.
|
245
|
+
def self.to_alto(words:, width: nil, height: nil)
|
246
|
+
new(words: words, width: width, height: height).to_alto
|
247
|
+
end
|
248
|
+
|
249
|
+
def initialize(words:, width:, height:, scaling: 1.0)
|
250
|
+
@words = words
|
251
|
+
@height = height.to_i
|
252
|
+
@width = width.to_i
|
253
|
+
@scaling = scaling
|
254
|
+
end
|
255
|
+
|
256
|
+
attr_reader :words, :width, :height, :scaling
|
257
|
+
|
258
|
+
# Output ALTO XML of word coordinates
|
259
|
+
#
|
260
|
+
# @return [String] ALTO XML representation of the words and their coordinates
|
261
|
+
def to_alto
|
262
|
+
page = alto_page(width, height) do |xml|
|
263
|
+
words.each do |word|
|
264
|
+
xml.String(
|
265
|
+
CONTENT: word[:word],
|
266
|
+
WIDTH: scale_point(word[:coordinates][2]).to_s,
|
267
|
+
HEIGHT: scale_point(word[:coordinates][3]).to_s,
|
268
|
+
HPOS: scale_point(word[:coordinates][0]).to_s,
|
269
|
+
VPOS: scale_point(word[:coordinates][1]).to_s
|
270
|
+
) { xml.text '' }
|
271
|
+
end
|
272
|
+
end
|
273
|
+
page.to_xml
|
274
|
+
end
|
275
|
+
|
276
|
+
private
|
277
|
+
|
278
|
+
# given block to manage word generation, wrap with page/block/line
|
279
|
+
def alto_page(pixel_width, pixel_height, &block)
|
280
|
+
builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
|
281
|
+
xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
|
282
|
+
xml.Description do
|
283
|
+
xml.MeasurementUnit 'pixel'
|
284
|
+
end
|
285
|
+
alto_layout(xml, pixel_width, pixel_height, &block)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
builder
|
289
|
+
end
|
290
|
+
|
291
|
+
def scale_point(value)
|
292
|
+
# NOTE: presuming non-fractional, even though ALTO 2.1
|
293
|
+
# specifies coordinates are xsd:float, not xsd:int,
|
294
|
+
# simplify to integer value for output:
|
295
|
+
(value * scaling).to_i
|
296
|
+
end
|
297
|
+
|
298
|
+
# return layout for page
|
299
|
+
def alto_layout(xml, pixel_width, pixel_height, &block)
|
300
|
+
xml.Layout do
|
301
|
+
xml.Page(ID: 'ID1',
|
302
|
+
PHYSICAL_IMG_NR: '1',
|
303
|
+
HEIGHT: pixel_height,
|
304
|
+
WIDTH: pixel_width) do
|
305
|
+
xml.PrintSpace(HEIGHT: pixel_height,
|
306
|
+
WIDTH: pixel_width,
|
307
|
+
HPOS: '0',
|
308
|
+
VPOS: '0') do
|
309
|
+
alto_blockline(xml, pixel_width, pixel_height, &block)
|
310
|
+
end
|
311
|
+
end
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# make block line and call word-block
|
316
|
+
def alto_blockline(xml, pixel_width, pixel_height)
|
317
|
+
xml.TextBlock(ID: 'ID1a',
|
318
|
+
HEIGHT: pixel_height,
|
319
|
+
WIDTH: pixel_width,
|
320
|
+
HPOS: '0',
|
321
|
+
VPOS: '0') do
|
322
|
+
xml.TextLine(HEIGHT: pixel_height,
|
323
|
+
WIDTH: pixel_width,
|
324
|
+
HPOS: '0',
|
325
|
+
VPOS: '0') do
|
326
|
+
yield(xml)
|
327
|
+
end
|
328
|
+
end
|
329
|
+
end
|
330
|
+
end
|
216
331
|
end
|
217
332
|
end
|
218
333
|
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'marcel'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Services
|
6
|
+
##
|
7
|
+
# This module provides an interface for determining a mime-type.
|
8
|
+
module MimeTypeService
|
9
|
+
##
|
10
|
+
# Hyrax has its own compression of mime_types into conceptual types (as defined in
|
11
|
+
# Hyrax::FileSetDerivativesService). This provides a somewhat conceptual overlap with that,
|
12
|
+
# while also being more generalized.
|
13
|
+
#
|
14
|
+
# @param filename [String]
|
15
|
+
# @return [Symbol]
|
16
|
+
def self.hyrax_type(filename:)
|
17
|
+
mime = mime_type(filename: filename)
|
18
|
+
media_type, sub_type = mime.split("/")
|
19
|
+
case media_type
|
20
|
+
when "image", "audio", "text", "video"
|
21
|
+
media_type.to_sym
|
22
|
+
when "application" # The wild woolly weird world of all the things.
|
23
|
+
# TODO: Do we need to worry about office documents?
|
24
|
+
sub_type.to_sym
|
25
|
+
else
|
26
|
+
sub_type.to_sym
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Given a local :filename (e.g. downloaded and available on the server this is running),
|
32
|
+
# return the mime_type of the file.
|
33
|
+
#
|
34
|
+
# @param filename [String]
|
35
|
+
# @return [String] (e.g. "application/pdf", "text/plain")
|
36
|
+
def self.mime_type(filename:)
|
37
|
+
##
|
38
|
+
# TODO: Does this attempt to read the whole file? That may create memory constraints. By
|
39
|
+
# using Pathname (instead of File.read), we're letting Marcel do its best mime magic.
|
40
|
+
pathname = Pathname.new(filename)
|
41
|
+
extension = filename.split(".")&.last&.downcase
|
42
|
+
if extension
|
43
|
+
# By including a possible extension, we can help nudge Marcel into making a more accurate guess.
|
44
|
+
# Without extension, we will get a lot of "application/octet-stream" results.
|
45
|
+
::Marcel::MimeType.for(pathname, extension: extension)
|
46
|
+
else
|
47
|
+
::Marcel::MimeType.for(pathname)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -2,17 +2,32 @@
|
|
2
2
|
|
3
3
|
require 'open3'
|
4
4
|
require 'securerandom'
|
5
|
-
require 'tmpdir'
|
6
|
-
|
7
5
|
module DerivativeRodeo
|
8
6
|
module Services
|
7
|
+
##
|
8
|
+
# A service module for splitting PDFs into one image per page.
|
9
|
+
#
|
10
|
+
# @see .call
|
9
11
|
module PdfSplitter
|
10
12
|
##
|
11
|
-
# @
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
# @api public
|
14
|
+
#
|
15
|
+
# Split the file found at the given :path
|
16
|
+
#
|
17
|
+
# @param path [String] the path to the source PDF that we're processing.
|
18
|
+
# @param image_extension [String] used to determine the splitting service we use; there is an
|
19
|
+
# implicit relationship between image_extension and image_file_basename_template
|
20
|
+
# (though filenames do not necessarily reflect mime types)
|
21
|
+
# @param image_file_basename_template [String] use this string to generate the unique filename
|
22
|
+
# for an image "split" from the given PDF. It must include "%d" as part of the
|
23
|
+
# declaration. For example if the template is "hello-world-%d.png" then the first
|
24
|
+
# split page will be "hello-world-1.png".
|
25
|
+
#
|
26
|
+
# @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each}
|
27
|
+
def self.call(path, image_extension:, image_file_basename_template:)
|
28
|
+
klass_name = "#{image_extension.to_s.classify}_page".classify
|
29
|
+
klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
|
30
|
+
klass.new(path, image_file_basename_template: image_file_basename_template)
|
16
31
|
end
|
17
32
|
|
18
33
|
##
|
@@ -31,38 +46,23 @@ module DerivativeRodeo
|
|
31
46
|
|
32
47
|
class_attribute :gsdevice, instance_accessor: false
|
33
48
|
class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
|
34
|
-
##
|
35
|
-
# @api public
|
36
|
-
#
|
37
|
-
# @param path [String] The path the the PDF
|
38
|
-
#
|
39
|
-
# @return [Enumerable, Utilities::PdfSplitter::Base]
|
40
|
-
def self.call(path, baseid: SureRandom.uuid, tmpdir: Dir.mktmpdir)
|
41
|
-
new(path, baseid: baseid, tmpdir: tmpdir)
|
42
|
-
end
|
43
49
|
|
44
|
-
##
|
45
|
-
# @param path [String] the path to the source PDF that we're processing.
|
46
|
-
# @param baseid [String] used for creating a unique identifier
|
47
|
-
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
|
48
|
-
# @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
|
49
|
-
# extract this from the given path, but for testing purposes, you might want to
|
50
|
-
# provide a specific summary.
|
51
|
-
# @param logger [Logger, #error]
|
52
50
|
def initialize(path,
|
53
|
-
|
54
|
-
|
55
|
-
tmpdir: Dir.mktmpdir,
|
56
|
-
pdf_pages_summary: PagesSummary.extract_from(path: path),
|
57
|
-
logger: DerivativeRodeo.config.logger)
|
58
|
-
@baseid = baseid
|
51
|
+
image_file_basename_template:,
|
52
|
+
pdf_pages_summary: PagesSummary.extract_from(path: path))
|
59
53
|
@pdfpath = path
|
60
54
|
@pdf_pages_summary = pdf_pages_summary
|
61
|
-
@
|
62
|
-
|
55
|
+
@ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
|
56
|
+
|
57
|
+
# We need to ensure that this temporary directory exists so we can write the files to it.
|
58
|
+
# Fortunately, because this file space must be "local" tmp dir, we don't need to work
|
59
|
+
# through any of the location antics of {StorageLocations::BaseLocation}.
|
60
|
+
FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
|
63
61
|
end
|
64
62
|
|
65
|
-
attr_reader :
|
63
|
+
attr_reader :ghost_script_output_file_template
|
64
|
+
|
65
|
+
delegate :logger, to: DerivativeRodeo
|
66
66
|
|
67
67
|
# In creating {#each} we get many of the methods of array operation (e.g. #to_a).
|
68
68
|
include Enumerable
|
@@ -80,8 +80,8 @@ module DerivativeRodeo
|
|
80
80
|
!pdf_pages_summary.valid?
|
81
81
|
end
|
82
82
|
|
83
|
-
attr_reader :pdf_pages_summary, :
|
84
|
-
private :pdf_pages_summary, :
|
83
|
+
attr_reader :pdf_pages_summary, :basename, :pdfpath
|
84
|
+
private :pdf_pages_summary, :basename, :pdfpath
|
85
85
|
|
86
86
|
# @api private
|
87
87
|
def gsdevice
|
@@ -99,16 +99,12 @@ module DerivativeRodeo
|
|
99
99
|
@entries = Array.wrap(gsconvert)
|
100
100
|
end
|
101
101
|
|
102
|
-
def output_base
|
103
|
-
@output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
|
104
|
-
end
|
105
|
-
|
106
102
|
def gsconvert
|
107
103
|
# NOTE: you must call gsdevice before compression, as compression is
|
108
104
|
# updated during the gsdevice call.
|
109
105
|
file_names = []
|
110
106
|
|
111
|
-
Open3.popen3(gsconvert_cmd(
|
107
|
+
Open3.popen3(gsconvert_cmd(ghost_script_output_file_template)) do |_stdin, stdout, stderr, _wait_thr|
|
112
108
|
err = stderr.read
|
113
109
|
logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?
|
114
110
|
|
@@ -116,7 +112,7 @@ module DerivativeRodeo
|
|
116
112
|
stdout.read.split("\n").each do |line|
|
117
113
|
next unless line.start_with?('Page ')
|
118
114
|
|
119
|
-
file_names << format(
|
115
|
+
file_names << format(ghost_script_output_file_template, page_number)
|
120
116
|
page_number += 1
|
121
117
|
end
|
122
118
|
end
|
@@ -126,12 +122,12 @@ module DerivativeRodeo
|
|
126
122
|
|
127
123
|
def create_file_name(line:, page_number:); end
|
128
124
|
|
129
|
-
def gsconvert_cmd(
|
125
|
+
def gsconvert_cmd(ghost_script_output_file_template)
|
130
126
|
@gsconvert_cmd ||= begin
|
131
127
|
cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
|
132
128
|
cmd += " -sCompression=#{compression}" if compression?
|
133
129
|
cmd += " -dJPEGQ=#{quality}" if quality?
|
134
|
-
cmd += " -sOutputFile=#{
|
130
|
+
cmd += " -sOutputFile=#{ghost_script_output_file_template} -r#{ppi} -f #{pdfpath}"
|
135
131
|
cmd
|
136
132
|
end
|
137
133
|
end
|
@@ -21,7 +21,7 @@ module DerivativeRodeo
|
|
21
21
|
def self.read(url)
|
22
22
|
HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
|
23
23
|
rescue StandardError => e
|
24
|
-
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
24
|
+
DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
25
25
|
raise e
|
26
26
|
end
|
27
27
|
|
@@ -42,6 +42,8 @@ module DerivativeRodeo
|
|
42
42
|
|
43
43
|
class << self
|
44
44
|
alias scheme location_name
|
45
|
+
|
46
|
+
delegate :config, to: DerivativeRodeo
|
45
47
|
end
|
46
48
|
|
47
49
|
##
|
@@ -206,6 +208,22 @@ module DerivativeRodeo
|
|
206
208
|
klass.build(from_uri: file_path, template: template)
|
207
209
|
end
|
208
210
|
|
211
|
+
##
|
212
|
+
# When you have a known location and want to check for files that are within that location,
|
213
|
+
# use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
|
214
|
+
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
|
+
#
|
216
|
+
# We can use the :file_path as the prefix and the given :tail_glob as the suffix for a "fully
|
217
|
+
# qualified" Dir.glob type search.
|
218
|
+
#
|
219
|
+
# @param tail_glob [String]
|
220
|
+
#
|
221
|
+
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
|
+
# array when there are none.
|
223
|
+
def globbed_tail_locations(tail_glob:)
|
224
|
+
raise NotImplementedError, "#{self.class}#globbed_locations"
|
225
|
+
end
|
226
|
+
|
209
227
|
##
|
210
228
|
# @param extension [String, StorageLocations::SAME]
|
211
229
|
# @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
|
@@ -43,7 +43,7 @@ module DerivativeRodeo
|
|
43
43
|
# @param url [String]
|
44
44
|
#
|
45
45
|
# @return [String]
|
46
|
-
def
|
46
|
+
def get(url)
|
47
47
|
HTTParty.get(url, logger: config.logger)
|
48
48
|
rescue => e
|
49
49
|
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
@@ -51,13 +51,11 @@ module DerivativeRodeo
|
|
51
51
|
end
|
52
52
|
|
53
53
|
##
|
54
|
-
# @param url [String]
|
55
|
-
#
|
56
54
|
# @return [URI] when the URL resolves successfully
|
57
55
|
# @return [FalseClass] when the URL's head request is not successful or we've exhausted our
|
58
56
|
# remaining redirects.
|
59
|
-
def
|
60
|
-
HTTParty.head(
|
57
|
+
def exist?
|
58
|
+
HTTParty.head(file_uri, logger: config.logger)
|
61
59
|
rescue => e
|
62
60
|
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
63
61
|
false
|
@@ -8,7 +8,15 @@ module DerivativeRodeo
|
|
8
8
|
# Location to download and upload files to S3
|
9
9
|
#
|
10
10
|
class S3Location < BaseLocation
|
11
|
-
|
11
|
+
##
|
12
|
+
# @!group Class Attributes
|
13
|
+
# @!attribute use_actual_s3_bucket
|
14
|
+
#
|
15
|
+
# When true, we are going to use a live S3 bucket. When false, we'll use a fake local bucket.
|
16
|
+
class_attribute :use_actual_s3_bucket, default: true
|
17
|
+
# @!endgroup Class Attributes
|
18
|
+
##
|
19
|
+
|
12
20
|
##
|
13
21
|
# Create a new uri of the classes type. Parts argument should have a default in
|
14
22
|
# implementing classes. Must support a number or the symbol :all
|
@@ -24,10 +32,11 @@ module DerivativeRodeo
|
|
24
32
|
end
|
25
33
|
|
26
34
|
##
|
27
|
-
# @param
|
35
|
+
# @param bucket_name [String, NilClass] when given, use this as the bucket; otherwise, default to config.aws_s3_bucket
|
36
|
+
#
|
28
37
|
# @return [String]
|
29
|
-
def self.adapter_prefix(
|
30
|
-
"#{scheme}://#{
|
38
|
+
def self.adapter_prefix(bucket_name: config.aws_s3_bucket)
|
39
|
+
"#{scheme}://#{bucket_name}.s3.#{config.aws_s3_region}.amazonaws.com"
|
31
40
|
end
|
32
41
|
|
33
42
|
##
|
@@ -53,6 +62,38 @@ module DerivativeRodeo
|
|
53
62
|
bucket.objects(prefix: file_path).count.positive?
|
54
63
|
end
|
55
64
|
|
65
|
+
##
|
66
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
|
+
#
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
|
69
|
+
# use the components of the file_path to fake that behavior.
|
70
|
+
#
|
71
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
+
def globbed_tail_locations(tail_glob:)
  # Finds the bucket objects that sit under the directory portion of `file_dir + tail_glob` and
  # whose key ends with the glob's file extension, returning one derived location per match.
  #
  # @param tail_glob [String] e.g. "pages/*.tiff"; only its directory and extension are used.
  # @return [Array] the results of {#derived_file_from} for each matching object; empty when
  #   nothing matches (never contains nil).
  #
  # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
  #
  # NOTE: Should we be storing our files as such?  The pattern we need is
  # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
  # more than one PDF (that we intend to split); we don't want to trample on those split files
  # and miscollate two PDFs.
  #
  # file_path = "s3://blah/1234/hello-world/hello-world.pdf"
  globname = File.join(file_dir, tail_glob)

  # Escape the extension so its leading "." is a literal dot (".tiff" must not match "xtiff"),
  # and anchor with \z so we only match at the very end of the key.
  extension_matcher = /#{Regexp.escape(File.extname(globname))}\z/

  # NOTE: We're making some informed guesses, needing to include the fully qualified template
  # based on both the key of the item in the bucket as well as the bucket's host.
  uri = URI.parse(file_uri)
  scheme_and_host = "#{uri.scheme}://#{uri.host}"

  # S3 can only filter on a prefix; the extension check is applied to each candidate key here.
  # Selecting first (instead of mapping with a bare `if`) keeps nils out of the result.
  candidates = bucket.objects(prefix: File.dirname(globname))
  matches = candidates.select { |object| object.key.match?(extension_matcher) }
  matches.map do |object|
    derived_file_from(template: File.join(scheme_and_host, object.key))
  end
end
|
96
|
+
|
56
97
|
##
|
57
98
|
# @api public
|
58
99
|
# write the tmp file to the file_uri
|
@@ -71,6 +112,9 @@ module DerivativeRodeo
|
|
71
112
|
#
|
72
113
|
# @return [Aws::S3::Resource]
|
73
114
|
def resource
|
115
|
+
# TODO: Are there instantiation considerations when running in Lambda? In tests
|
116
|
+
# initializing a resource is very slow (e.g. 3 seconds or so). Should this be a class
|
117
|
+
# method? Can it be given the SpaceStone constraints?
|
74
118
|
@resource ||= if DerivativeRodeo.config.aws_s3_access_key_id
|
75
119
|
Aws::S3::Resource.new(region: DerivativeRodeo.config.aws_s3_region,
|
76
120
|
credentials: Aws::Credentials.new(
|
@@ -91,13 +135,28 @@ module DerivativeRodeo
|
|
91
135
|
raise Errors::BucketMissingError
|
92
136
|
end
|
93
137
|
|
138
|
+
# @see .use_actual_s3_bucket
|
94
139
|
def bucket
|
95
|
-
@bucket ||=
|
140
|
+
@bucket ||= if use_actual_s3_bucket?
|
141
|
+
resource.bucket(bucket_name)
|
142
|
+
else
|
143
|
+
self.class.faux_bucket
|
144
|
+
end
|
96
145
|
end
|
97
146
|
|
98
147
|
def file_path
  # Memoized: the file URI with everything up to and including the first "/" after the
  # "scheme://" portion removed.
  return @file_path if @file_path

  @file_path = @file_uri.sub(%r{.+://.+?/}, '')
end
|
150
|
+
|
151
|
+
##
|
152
|
+
# A constructed fake bucket that conforms to the narrow S3 interface that we use.
|
153
|
+
#
|
154
|
+
# @see .use_actual_s3_bucket
|
155
|
+
# @return [AwsS3FauxBucket]
|
156
|
+
def self.faux_bucket
  # Memoized on the receiver, so every caller gets the same fake bucket instance.
  return @faux_bucket if @faux_bucket

  # We are not requiring this file; except in the spec context.
  @faux_bucket = AwsS3FauxBucket.new
end
|
101
160
|
end
|
102
161
|
end
|
103
162
|
end
|
@@ -14,9 +14,20 @@ module DerivativeRodeo
|
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
16
|
class SqsLocation < BaseLocation
|
17
|
+
##
|
18
|
+
# @!group Class Attributes
|
19
|
+
#
|
20
|
+
# @!attribute batch_size
|
21
|
+
# @return [Integer]
|
17
22
|
class_attribute :batch_size, default: 10
|
18
23
|
|
19
|
-
|
24
|
+
# @!attribute use_real_sqs
|
25
|
+
# When true, use the real SQS; else when false use a fake one. You probably don't want to
|
26
|
+
# use the fake one in your production. But it's exposed in this manner to ease testing of
|
27
|
+
# downstream dependencies.
|
28
|
+
class_attribute :use_real_sqs, default: true
|
29
|
+
# @!endgroup Class Attributes
|
30
|
+
##
|
20
31
|
|
21
32
|
##
|
22
33
|
# Create a new uri of the classes type. Parts argument should have a default in
|
@@ -82,19 +93,26 @@ module DerivativeRodeo
|
|
82
93
|
file_uri
|
83
94
|
end
|
84
95
|
|
96
|
+
# rubocop:disable Metrics/MethodLength
|
85
97
|
def client
|
86
|
-
@client ||= if
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
98
|
+
@client ||= if use_real_sqs?
|
99
|
+
if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
|
100
|
+
Aws::SQS::Client.new(
|
101
|
+
region: config.aws_sqs_region,
|
102
|
+
credentials: Aws::Credentials.new(
|
103
|
+
config.aws_sqs_access_key_id,
|
104
|
+
config.aws_sqs_secret_access_key
|
105
|
+
)
|
92
106
|
)
|
93
|
-
|
107
|
+
else
|
108
|
+
Aws::SQS::Client.new(region: config.aws_sqs_region)
|
109
|
+
end
|
94
110
|
else
|
95
|
-
|
111
|
+
# We are not requiring this file; except in the spec context.
|
112
|
+
AwsSqsFauxClient.new
|
96
113
|
end
|
97
114
|
end
|
115
|
+
# rubocop:enable Metrics/MethodLength
|
98
116
|
|
99
117
|
def add(message:)
|
100
118
|
client.send_message({
|
data/lib/derivative_rodeo.rb
CHANGED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
##
|
4
|
+
# This class is a very rudimentary implementation of a bucket.  It conforms to the necessary
|
5
|
+
# interface for downloading, uploading, and filtering on prefix.
|
6
|
+
#
|
7
|
+
# It is provided as a lib/spec/support so that downstream implementations can leverage a fake S3
|
8
|
+
# bucket.
|
9
|
+
#
|
10
|
+
# @see [DerivativeRodeo::StorageLocations::S3Location]
|
11
|
+
class AwsS3FauxBucket
  def initialize
    @storage = {}
  end

  # @return [Hash{String => AwsS3FauxBucket::Storage}] the entries of this bucket, by key.
  attr_reader :storage

  ##
  # Fetch the entry for the given key, registering an empty one on first access.
  #
  # @param path [String] the key within the bucket.
  # @return [AwsS3FauxBucket::Storage]
  def object(path)
    # Yup, we've got nested buckets
    @storage[path] ||= Storage.new(key: path)
  end

  ##
  # @param prefix [String]
  # @return [Array<AwsS3FauxBucket::Storage>] every registered entry whose key starts with the
  #   given prefix, in insertion order; an empty array when there are none.
  def objects(prefix:)
    @storage.select { |path, _entry| path.start_with?(prefix) }.values
  end

  ##
  # A single entry in the fake bucket; holds at most one uploaded payload.
  class Storage
    # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
    # key.
    #
    # @param key [String]
    def initialize(key:)
      @key = key
      @storage = {}
    end
    attr_reader :storage, :key

    ##
    # Copy the bytes of the local file at the given path into this entry.
    #
    # @param path [String] local file to read.
    # @return [TrueClass]
    def upload_file(path)
      # Read as binary so image/PDF payloads are kept byte-for-byte.
      @storage[:upload] = File.binread(path)
      true
    end

    ##
    # Write the previously uploaded bytes to the local file at the given path.
    #
    # @param path [String] local file to write.
    # @return [Boolean] true when content was written; false when nothing has been uploaded.
    def download_file(path)
      return false unless @storage.key?(:upload)

      # Write the exact bytes; `puts` would append a trailing newline and alter the payload.
      File.binwrite(path, @storage.fetch(:upload))
      true
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'ostruct'
|
3
|
+
##
|
4
|
+
# This class is a very rudimentary implementation of an SQS client.  It conforms to the necessary
|
5
|
+
# interface for sending messages and reading messages
|
6
|
+
#
|
7
|
+
# @see [DerivativeRodeo::StorageLocations::SqsLocation]
|
8
|
+
class AwsSqsFauxClient
  # Minimal response object for {#get_queue_url}; exposes only #queue_url.
  QueueUrlResponse = Struct.new(:queue_url)

  ##
  # @param queue_url [String, NilClass] the URL reported by {#get_queue_url}; a canned fake URL
  #   is used when none is given.
  def initialize(queue_url: nil)
    @queue_url = queue_url || 'https://sqs.us-west-2.amazonaws.com/5555555555/fake'
    @storage = {}
  end

  # @return [Hash{String => Array}] the queued messages, keyed by queue URL.
  attr_reader :storage

  ##
  # Append a single message body to the named queue.
  #
  # @param arg_hash [Hash] reads :queue_url and :message_body.
  # @return [Array] the queue's messages after the append.
  def send_message(arg_hash)
    queue_for(arg_hash[:queue_url]) << arg_hash[:message_body]
  end

  ##
  # Append every entry to the named queue.
  #
  # @param arg_hash [Hash] reads :queue_url and :entries.
  # @return [Array] the queue's messages after the append.
  def send_message_batch(arg_hash)
    queue_for(arg_hash[:queue_url]).concat(arg_hash[:entries])
  end

  ##
  # Remove and return up to :max_number_of_messages messages from the named queue, most recently
  # added first.
  #
  # @param arg_hash [Hash] reads :queue_url and :max_number_of_messages (1 when omitted).
  # @return [Array] the removed messages; empty when the queue is unknown or drained.
  def receive_message(arg_hash)
    queue = @storage[arg_hash[:queue_url]]
    return [] unless queue

    received = []
    arg_hash.fetch(:max_number_of_messages, 1).times do
      message = queue.pop
      received << message if message
    end
    # Return the collected messages; `times` on its own would return the count.
    received
  end

  ##
  # @return [#queue_url] an object responding to #queue_url with this client's queue URL; any
  #   arguments are ignored.
  def get_queue_url(*)
    QueueUrlResponse.new(@queue_url)
  end

  private

  # The message list for the given queue URL, created on first use.
  def queue_for(queue_url)
    @storage[queue_url] ||= []
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-05
|
12
|
+
date: 2023-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -281,12 +281,14 @@ files:
|
|
281
281
|
- lib/derivative_rodeo.rb
|
282
282
|
- lib/derivative_rodeo/configuration.rb
|
283
283
|
- lib/derivative_rodeo/errors.rb
|
284
|
+
- lib/derivative_rodeo/generators/alto_generator.rb
|
284
285
|
- lib/derivative_rodeo/generators/base_generator.rb
|
285
286
|
- lib/derivative_rodeo/generators/concerns/copy_file_concern.rb
|
286
287
|
- lib/derivative_rodeo/generators/copy_generator.rb
|
287
288
|
- lib/derivative_rodeo/generators/hocr_generator.rb
|
288
289
|
- lib/derivative_rodeo/generators/monochrome_generator.rb
|
289
290
|
- lib/derivative_rodeo/generators/pdf_split_generator.rb
|
291
|
+
- lib/derivative_rodeo/generators/plain_text_generator.rb
|
290
292
|
- lib/derivative_rodeo/generators/thumbnail_generator.rb
|
291
293
|
- lib/derivative_rodeo/generators/word_coordinates_generator.rb
|
292
294
|
- lib/derivative_rodeo/services/base_service.rb
|
@@ -295,6 +297,7 @@ files:
|
|
295
297
|
- lib/derivative_rodeo/services/image_identify_service.rb
|
296
298
|
- lib/derivative_rodeo/services/image_jp2_service.rb
|
297
299
|
- lib/derivative_rodeo/services/image_service.rb
|
300
|
+
- lib/derivative_rodeo/services/mime_type_service.rb
|
298
301
|
- lib/derivative_rodeo/services/pdf_splitter/base.rb
|
299
302
|
- lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb
|
300
303
|
- lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb
|
@@ -311,6 +314,8 @@ files:
|
|
311
314
|
- lib/derivative_rodeo/storage_locations/sqs_location.rb
|
312
315
|
- lib/derivative_rodeo/technical_metadata.rb
|
313
316
|
- lib/derivative_rodeo/version.rb
|
317
|
+
- lib/spec_support/aws_s3_faux_bucket.rb
|
318
|
+
- lib/spec_support/aws_sqs_faux_client.rb
|
314
319
|
homepage: https://github.com/scientist-softserv/derivative_rodeo
|
315
320
|
licenses:
|
316
321
|
- APACHE-2.0
|