derivative-rodeo 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/alto_generator.rb +42 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +10 -5
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +74 -10
- data/lib/derivative_rodeo/generators/plain_text_generator.rb +42 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +37 -2
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +1 -1
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +120 -5
- data/lib/derivative_rodeo/services/mime_type_service.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +39 -43
- data/lib/derivative_rodeo/services/url_service.rb +1 -1
- data/lib/derivative_rodeo/storage_locations/base_location.rb +18 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +3 -5
- data/lib/derivative_rodeo/storage_locations/file_location.rb +4 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +64 -5
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +27 -9
- data/lib/derivative_rodeo/version.rb +1 -1
- data/lib/derivative_rodeo.rb +4 -0
- data/lib/spec_support/aws_s3_faux_bucket.rb +48 -0
- data/lib/spec_support/aws_sqs_faux_client.rb +36 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
|
4
|
+
data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
|
7
|
+
data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Generators
|
7
|
+
##
|
8
|
+
# Generate the Alto XML from the given input_uris.
|
9
|
+
#
|
10
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
11
|
+
class AltoGenerator < BaseGenerator
|
12
|
+
self.output_extension = "alto.xml"
|
13
|
+
|
14
|
+
class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param output_location [StorageLocations::BaseLocation]
|
18
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
19
|
+
#
|
20
|
+
# @return [StorageLocations::BaseLocation]
|
21
|
+
#
|
22
|
+
# @see #requisite_files
|
23
|
+
def build_step(output_location:, input_tmp_file_path:, **)
|
24
|
+
output_location.with_new_tmp_path do |output_tmp_file_path|
|
25
|
+
convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_alto: output_tmp_file_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
##
|
32
|
+
# @param path_to_hocr [String]
|
33
|
+
# @param path_to_alto [String]
|
34
|
+
def convert_to_coordinates(path_to_hocr:, path_to_alto:)
|
35
|
+
hocr_html = File.read(path_to_hocr)
|
36
|
+
File.open(path_to_alto, "w+") do |file|
|
37
|
+
file.puts service.call(hocr_html).to_alto
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -27,7 +27,6 @@ module DerivativeRodeo
|
|
27
27
|
# @!endgroup Class Attributes
|
28
28
|
|
29
29
|
attr_reader :input_uris,
|
30
|
-
:logger,
|
31
30
|
:output_location_template,
|
32
31
|
:preprocessed_location_template
|
33
32
|
|
@@ -39,23 +38,25 @@ module DerivativeRodeo
|
|
39
38
|
# to find preprocessed uris by transforming the :input_uris via
|
40
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
41
40
|
# :preprocessed_location_template.
|
42
|
-
|
43
|
-
|
41
|
+
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
+
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
44
43
|
@input_uris = Array.wrap(input_uris)
|
45
44
|
@output_location_template = output_location_template
|
46
45
|
@preprocessed_location_template = preprocessed_location_template
|
47
|
-
@logger = logger
|
48
46
|
|
49
47
|
return if valid_instantiation?
|
50
48
|
|
51
49
|
raise Errors::ExtensionMissingError.new(klass: self.class)
|
52
50
|
end
|
53
51
|
|
52
|
+
delegate :logger, to: DerivativeRodeo
|
53
|
+
|
54
54
|
##
|
55
55
|
# @api private
|
56
56
|
#
|
57
57
|
# @return [Boolean]
|
58
58
|
def valid_instantiation?
|
59
|
+
# TODO: Does this even make sense.
|
59
60
|
# When we have a BaseGenerator and not one of its children or when we've assigned the
|
60
61
|
# output_extension. instance_of? is more specific than is_a?
|
61
62
|
instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
|
@@ -83,6 +84,7 @@ module DerivativeRodeo
|
|
83
84
|
# @see #build_step
|
84
85
|
# @see #with_each_requisite_location_and_tmp_file_path
|
85
86
|
def generated_files
|
87
|
+
# TODO: Examples please
|
86
88
|
return @generated_files if defined?(@generated_files)
|
87
89
|
|
88
90
|
# As much as I would like to use map or returned values; given the implementations it's
|
@@ -92,6 +94,9 @@ module DerivativeRodeo
|
|
92
94
|
# helps ease subclass implementations of the #with_each_requisite_location_and_tmp_file_path or
|
93
95
|
# #build_step
|
94
96
|
@generated_files = []
|
97
|
+
|
98
|
+
# BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
|
99
|
+
# "file:///Users/jfriesen/.profile"
|
95
100
|
with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
|
96
101
|
generated_file = destination(input_location)
|
97
102
|
@generated_files << if generated_file.exist?
|
@@ -170,7 +175,7 @@ module DerivativeRodeo
|
|
170
175
|
return output_location unless preprocessed_location_template
|
171
176
|
|
172
177
|
preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
|
173
|
-
# We only want
|
178
|
+
# We only want the location if it exists
|
174
179
|
return preprocessed_location if preprocessed_location&.exist?
|
175
180
|
|
176
181
|
# NOTE: The file does not exist at the output_location; but we pass this information along so
|
@@ -17,19 +17,66 @@ module DerivativeRodeo
|
|
17
17
|
include CopyFileConcern
|
18
18
|
|
19
19
|
##
|
20
|
-
#
|
20
|
+
# A helper method for downstream implementations to ask if this file is perhaps split from a
|
21
|
+
# PDF.
|
21
22
|
#
|
22
|
-
# @
|
23
|
-
|
24
|
-
|
23
|
+
# @param filename [String]
|
24
|
+
# @param extension [String] the extension (either with or without the leading period); if none
|
25
|
+
# is provided use the extension of the given :filename.
|
26
|
+
# @return [TrueClass] when the file name likely represents a file split from a PDF.
|
27
|
+
# @return [FalseClass] when the file name does not, by convention, represent a file split from
|
28
|
+
# a PDF.
|
29
|
+
#
|
30
|
+
# @see #image_file_basename_template
|
31
|
+
def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
|
32
|
+
extension ||= File.extname(filename)
|
33
|
+
|
34
|
+
# Strip the leading period from the extension.
|
35
|
+
extension = extension[1..-1] if extension.start_with?('.')
|
36
|
+
regexp = %r{--page-\d+\.#{extension}$}
|
37
|
+
!!regexp.match(filename)
|
25
38
|
end
|
26
39
|
|
27
40
|
##
|
28
|
-
# @
|
41
|
+
# @param basename [String] The given PDF file's base name (e.g. "hello.pdf" would have a base name of
|
42
|
+
# "hello").
|
43
|
+
#
|
44
|
+
# @return [String] A template for the filenames of the images produced by Ghostscript.
|
45
|
+
#
|
46
|
+
# @note This must include "%d" in the returning value, as that is how Ghostscript will assign
|
47
|
+
# the page number.
|
48
|
+
#
|
49
|
+
# @note I have extracted this function to make it abundantly clear the expected location
|
50
|
+
# of each split image. Further, there is an interaction between this template and the tail_glob used in {#existing_page_locations}.
|
29
51
|
#
|
30
|
-
# @see
|
31
|
-
|
32
|
-
|
52
|
+
# @see #existing_page_locations
|
53
|
+
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
|
+
def image_file_basename_template(basename:)
|
55
|
+
"#{basename}/pages/#{basename}--page-%d.#{output_extension}"
|
56
|
+
end
|
57
|
+
|
58
|
+
##
|
59
|
+
# We want to check the output location and pre-processed location for the existence of already
|
60
|
+
# split pages. This method checks both places.
|
61
|
+
#
|
62
|
+
# @param input_location [StorageLocations::BaseLocation]
|
63
|
+
#
|
64
|
+
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
+
# with :tail_glob.
|
66
|
+
#
|
67
|
+
# @note There is a relation between {Generators::BaseGenerator#destination} and this method.
|
68
|
+
#
|
69
|
+
# @note The tail_glob is in relation to the {#image_file_basename_template}
|
70
|
+
def existing_page_locations(input_location:)
|
71
|
+
# See image_file_basename_template
|
72
|
+
tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
|
73
|
+
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
|
75
|
+
return output_locations if output_locations.count.positive?
|
76
|
+
|
77
|
+
return [] if preprocessed_location_template.blank?
|
78
|
+
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
|
33
80
|
end
|
34
81
|
|
35
82
|
##
|
@@ -44,18 +91,35 @@ module DerivativeRodeo
|
|
44
91
|
# @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
|
45
92
|
# @yieldparam image_path [String] where to find this file in the tmp space
|
46
93
|
#
|
94
|
+
# @note This function makes a concession; namely that if it encounters any
|
95
|
+
# {#existing_page_locations} it will use all of that result as the entire number of pages.
|
96
|
+
# We could make this smarter but at the moment we're deferring on that.
|
97
|
+
#
|
47
98
|
# @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
|
99
|
+
#
|
100
|
+
# rubocop:disable Metrics/MethodLength
|
48
101
|
def with_each_requisite_location_and_tmp_file_path
|
49
102
|
input_files.each do |input_location|
|
50
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
51
|
-
|
52
|
-
|
104
|
+
## We want a single call for a directory listing of the image_file_basename_template
|
105
|
+
generated_files = existing_page_locations(input_location: input_location)
|
106
|
+
|
107
|
+
if generated_files.count.zero?
|
108
|
+
generated_files = Services::PdfSplitter.call(
|
109
|
+
input_tmp_file_path,
|
110
|
+
image_extension: output_extension,
|
111
|
+
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
+
)
|
113
|
+
end
|
114
|
+
|
115
|
+
generated_files.each do |image_path|
|
53
116
|
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
54
117
|
yield(image_location, image_path)
|
55
118
|
end
|
56
119
|
end
|
57
120
|
end
|
58
121
|
end
|
122
|
+
# rubocop:enable Metrics/MethodLength
|
59
123
|
end
|
60
124
|
end
|
61
125
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Generators
|
7
|
+
##
|
8
|
+
# Generate the plain text from the given input_uris.
|
9
|
+
#
|
10
|
+
# @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
|
11
|
+
class PlainTextGenerator < BaseGenerator
|
12
|
+
self.output_extension = "plain_text.txt"
|
13
|
+
|
14
|
+
class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param output_location [StorageLocations::BaseLocation]
|
18
|
+
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
19
|
+
#
|
20
|
+
# @return [StorageLocations::BaseLocation]
|
21
|
+
#
|
22
|
+
# @see #requisite_files
|
23
|
+
def build_step(output_location:, input_tmp_file_path:, **)
|
24
|
+
output_location.with_new_tmp_path do |output_tmp_file_path|
|
25
|
+
convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_plain_text: output_tmp_file_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
##
|
32
|
+
# @param path_to_hocr [String]
|
33
|
+
# @param path_to_plain_text [String]
|
34
|
+
def convert_to_coordinates(path_to_hocr:, path_to_plain_text:)
|
35
|
+
hocr_html = File.read(path_to_hocr)
|
36
|
+
File.open(path_to_plain_text, "w+") do |file|
|
37
|
+
file.puts service.call(hocr_html).to_text
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -6,12 +6,33 @@ module DerivativeRodeo
|
|
6
6
|
# This generator is responsible for converting a given binary into a thumbnail. As of
|
7
7
|
# <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
|
8
8
|
class ThumbnailGenerator < BaseGenerator
|
9
|
+
##
|
10
|
+
# @!group Class Attributes
|
11
|
+
|
9
12
|
##
|
10
13
|
# We want to mirror the same file "last" extension as described in Hyrax.
|
11
14
|
#
|
12
15
|
# @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
|
13
16
|
self.output_extension = 'thumbnail.jpeg'
|
14
17
|
|
18
|
+
##
|
19
|
+
# @!attribute dimensions_by_type
|
20
|
+
#
|
21
|
+
# @return [Hash<Symbol,String>] the "types" (as categorized by
|
22
|
+
# Hyrax::FileSetDerivativeService). These aren't mime-types per se but a conceptual
|
23
|
+
# distillation of that.
|
24
|
+
#
|
25
|
+
# @see https://github.com/samvera/hyrax/blob/815e0abaacf9f331a5640c5d6129661d01eadf75/app/services/hyrax/file_set_derivatives_service.rb
|
26
|
+
class_attribute :dimensions_by_type, default: { pdf: "338x493" }
|
27
|
+
|
28
|
+
##
|
29
|
+
# @!attribute dimensions_fallback
|
30
|
+
#
|
31
|
+
# @return [String] when there's no match for {.dimensions_by_type} use this value.
|
32
|
+
class_attribute :dimensions_fallback, default: "200x150>"
|
33
|
+
# @!endgroup Class Attributes
|
34
|
+
##
|
35
|
+
|
15
36
|
##
|
16
37
|
# @param output_location [StorageLocations::BaseLocation]
|
17
38
|
# @param input_tmp_file_path [String] the location of the file that we can use for processing.
|
@@ -23,6 +44,20 @@ module DerivativeRodeo
|
|
23
44
|
end
|
24
45
|
end
|
25
46
|
|
47
|
+
##
|
48
|
+
# @param filename [String]
|
49
|
+
# @return [String]
|
50
|
+
#
|
51
|
+
# @see .dimensions_by_type
|
52
|
+
# @see .dimensions_fallback
|
53
|
+
def self.dimensions_for(filename:)
|
54
|
+
type = DerivativeRodeo::Services::MimeTypeService.hyrax_type(filename: filename)
|
55
|
+
dimensions_by_type.fetch(type, dimensions_fallback)
|
56
|
+
end
|
57
|
+
|
58
|
+
# Want to expose the dimensions_for as an instance method
|
59
|
+
delegate :dimensions_for, to: :class
|
60
|
+
|
26
61
|
##
|
27
62
|
# Convert the file found at :path_to_input into a thumbnail, writing it to the
|
28
63
|
# :path_for_thumbnail_output
|
@@ -30,8 +65,8 @@ module DerivativeRodeo
|
|
30
65
|
# @param path_of_file_to_create_thumbnail_from [String]
|
31
66
|
# @param path_for_thumbnail_output [String]
|
32
67
|
def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
|
33
|
-
|
34
|
-
`convert #{path_of_file_to_create_thumbnail_from} -thumbnail '
|
68
|
+
dimensions = dimensions_for(filename: path_of_file_to_create_thumbnail_from)
|
69
|
+
`convert #{path_of_file_to_create_thumbnail_from} -thumbnail '#{dimensions}' -flatten #{path_for_thumbnail_output}`
|
35
70
|
end
|
36
71
|
end
|
37
72
|
end
|
@@ -31,7 +31,7 @@ module DerivativeRodeo
|
|
31
31
|
def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
|
32
32
|
hocr_html = File.read(path_to_hocr)
|
33
33
|
File.open(path_to_coordinate, "w+") do |file|
|
34
|
-
file.puts service.call(hocr_html)
|
34
|
+
file.puts service.call(hocr_html).to_json
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
@@ -13,7 +13,7 @@ module DerivativeRodeo
|
|
13
13
|
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
|
14
14
|
# @return [String] A JSON document
|
15
15
|
def self.call(sgml)
|
16
|
-
new(sgml)
|
16
|
+
new(sgml)
|
17
17
|
end
|
18
18
|
|
19
19
|
##
|
@@ -42,6 +42,21 @@ module DerivativeRodeo
|
|
42
42
|
end
|
43
43
|
alias json to_json
|
44
44
|
|
45
|
+
# Output plain text; named #to_text to keep the method names consistent with #to_json.
|
46
|
+
#
|
47
|
+
# @return [String] plain text of OCR'd document
|
48
|
+
def to_text
|
49
|
+
@to_text ||= doc_stream.text
|
50
|
+
end
|
51
|
+
|
52
|
+
def to_alto
|
53
|
+
@to_alto ||= AltoXml.to_alto(
|
54
|
+
words: doc_stream.words,
|
55
|
+
width: doc_stream.width,
|
56
|
+
height: doc_stream.height
|
57
|
+
)
|
58
|
+
end
|
59
|
+
|
45
60
|
private
|
46
61
|
|
47
62
|
def xml?(xml)
|
@@ -121,6 +136,7 @@ module DerivativeRodeo
|
|
121
136
|
# add trailing space to plaintext buffer for between words:
|
122
137
|
@text += ' '
|
123
138
|
@words.push(@current) if word_complete?
|
139
|
+
@current = nil # clear the current word
|
124
140
|
end
|
125
141
|
|
126
142
|
def end_line
|
@@ -156,10 +172,13 @@ module DerivativeRodeo
|
|
156
172
|
# Callback for element end; at this time, flush word coordinate state
|
157
173
|
# for current word, and append line endings to plain text:
|
158
174
|
#
|
159
|
-
# @param
|
160
|
-
def end_element(
|
161
|
-
|
162
|
-
|
175
|
+
# @param name [String] element name.
|
176
|
+
def end_element(name)
|
177
|
+
if name == 'span'
|
178
|
+
end_word if @element_class_name == 'ocrx_word'
|
179
|
+
@text += "\n" if @element_class_name.nil?
|
180
|
+
end
|
181
|
+
@element_class_name = nil
|
163
182
|
end
|
164
183
|
|
165
184
|
# Callback for completion of parsing hOCR, used to normalize generated
|
@@ -213,6 +232,102 @@ module DerivativeRodeo
|
|
213
232
|
JSON.generate(payload)
|
214
233
|
end
|
215
234
|
end
|
235
|
+
|
236
|
+
class AltoXml
|
237
|
+
##
|
238
|
+
# @api public
|
239
|
+
#
|
240
|
+
# @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
|
241
|
+
# @param width [Integer, nil] the width of the "canvas" on which the words appear.
|
242
|
+
# @param height [Integer, nil] the height of the "canvas" on which the words appear.
|
243
|
+
#
|
244
|
+
# @return [String] the ALTO XML representation of the given words and their coordinates.
|
245
|
+
def self.to_alto(words:, width: nil, height: nil)
|
246
|
+
new(words: words, width: width, height: height).to_alto
|
247
|
+
end
|
248
|
+
|
249
|
+
def initialize(words:, width:, height:, scaling: 1.0)
|
250
|
+
@words = words
|
251
|
+
@height = height.to_i
|
252
|
+
@width = width.to_i
|
253
|
+
@scaling = scaling
|
254
|
+
end
|
255
|
+
|
256
|
+
attr_reader :words, :width, :height, :scaling
|
257
|
+
|
258
|
+
# Output ALTO XML of word coordinates
|
259
|
+
#
|
260
|
+
# @return [String] ALTO XML representation of the words and their coordinates
|
261
|
+
def to_alto
|
262
|
+
page = alto_page(width, height) do |xml|
|
263
|
+
words.each do |word|
|
264
|
+
xml.String(
|
265
|
+
CONTENT: word[:word],
|
266
|
+
WIDTH: scale_point(word[:coordinates][2]).to_s,
|
267
|
+
HEIGHT: scale_point(word[:coordinates][3]).to_s,
|
268
|
+
HPOS: scale_point(word[:coordinates][0]).to_s,
|
269
|
+
VPOS: scale_point(word[:coordinates][1]).to_s
|
270
|
+
) { xml.text '' }
|
271
|
+
end
|
272
|
+
end
|
273
|
+
page.to_xml
|
274
|
+
end
|
275
|
+
|
276
|
+
private
|
277
|
+
|
278
|
+
# given block to manage word generation, wrap with page/block/line
|
279
|
+
def alto_page(pixel_width, pixel_height, &block)
|
280
|
+
builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
|
281
|
+
xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
|
282
|
+
xml.Description do
|
283
|
+
xml.MeasurementUnit 'pixel'
|
284
|
+
end
|
285
|
+
alto_layout(xml, pixel_width, pixel_height, &block)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
builder
|
289
|
+
end
|
290
|
+
|
291
|
+
def scale_point(value)
|
292
|
+
# NOTE: presuming non-fractional, even though ALTO 2.1
|
293
|
+
# specifies coordinates are xsd:float, not xsd:int,
|
294
|
+
# simplify to integer value for output:
|
295
|
+
(value * scaling).to_i
|
296
|
+
end
|
297
|
+
|
298
|
+
# return layout for page
|
299
|
+
def alto_layout(xml, pixel_width, pixel_height, &block)
|
300
|
+
xml.Layout do
|
301
|
+
xml.Page(ID: 'ID1',
|
302
|
+
PHYSICAL_IMG_NR: '1',
|
303
|
+
HEIGHT: pixel_height,
|
304
|
+
WIDTH: pixel_width) do
|
305
|
+
xml.PrintSpace(HEIGHT: pixel_height,
|
306
|
+
WIDTH: pixel_width,
|
307
|
+
HPOS: '0',
|
308
|
+
VPOS: '0') do
|
309
|
+
alto_blockline(xml, pixel_width, pixel_height, &block)
|
310
|
+
end
|
311
|
+
end
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# make block line and call word-block
|
316
|
+
def alto_blockline(xml, pixel_width, pixel_height)
|
317
|
+
xml.TextBlock(ID: 'ID1a',
|
318
|
+
HEIGHT: pixel_height,
|
319
|
+
WIDTH: pixel_width,
|
320
|
+
HPOS: '0',
|
321
|
+
VPOS: '0') do
|
322
|
+
xml.TextLine(HEIGHT: pixel_height,
|
323
|
+
WIDTH: pixel_width,
|
324
|
+
HPOS: '0',
|
325
|
+
VPOS: '0') do
|
326
|
+
yield(xml)
|
327
|
+
end
|
328
|
+
end
|
329
|
+
end
|
330
|
+
end
|
216
331
|
end
|
217
332
|
end
|
218
333
|
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'marcel'
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Services
|
6
|
+
##
|
7
|
+
# This module provides an interface for determining a mime-type.
|
8
|
+
module MimeTypeService
|
9
|
+
##
|
10
|
+
# Hyrax has its own compression of mime_types into conceptual types (as defined in
|
11
|
+
# Hyrax::FileSetDerivativesService). This provides a somewhat conceptual overlap with that,
|
12
|
+
# while also being more generalized.
|
13
|
+
#
|
14
|
+
# @param filename [String]
|
15
|
+
# @return [Symbol]
|
16
|
+
def self.hyrax_type(filename:)
|
17
|
+
mime = mime_type(filename: filename)
|
18
|
+
media_type, sub_type = mime.split("/")
|
19
|
+
case media_type
|
20
|
+
when "image", "audio", "text", "video"
|
21
|
+
media_type.to_sym
|
22
|
+
when "application" # The wild woolly weird world of all the things.
|
23
|
+
# TODO: Do we need to worry about office documents?
|
24
|
+
sub_type.to_sym
|
25
|
+
else
|
26
|
+
sub_type.to_sym
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Given a local :filename (e.g. downloaded and available on the server this is running),
|
32
|
+
# return the mime_type of the file.
|
33
|
+
#
|
34
|
+
# @param filename [String]
|
35
|
+
# @return [String] (e.g. "application/pdf", "text/plain")
|
36
|
+
def self.mime_type(filename:)
|
37
|
+
##
|
38
|
+
# TODO: Does this attempt to read the whole file? That may create memory constraints. By
|
39
|
+
# using Pathname (instead of File.read), we're letting Marcel do its best mime magic.
|
40
|
+
pathname = Pathname.new(filename)
|
41
|
+
extension = filename.split(".")&.last&.downcase
|
42
|
+
if extension
|
43
|
+
# By including a possible extension, we can help nudge Marcel into making a more specific guess.
|
44
|
+
# Without extension, we will get a lot of "application/octet-stream" results.
|
45
|
+
::Marcel::MimeType.for(pathname, extension: extension)
|
46
|
+
else
|
47
|
+
::Marcel::MimeType.for(pathname)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -2,17 +2,32 @@
|
|
2
2
|
|
3
3
|
require 'open3'
|
4
4
|
require 'securerandom'
|
5
|
-
require 'tmpdir'
|
6
|
-
|
7
5
|
module DerivativeRodeo
|
8
6
|
module Services
|
7
|
+
##
|
8
|
+
# A service module for splitting PDFs into one image per page.
|
9
|
+
#
|
10
|
+
# @see .call
|
9
11
|
module PdfSplitter
|
10
12
|
##
|
11
|
-
# @
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
# @api public
|
14
|
+
#
|
15
|
+
# Split the file found at the given :path
|
16
|
+
#
|
17
|
+
# @param path [String] the path to the source PDF that we're processing.
|
18
|
+
# @param image_extension [String] used to determine the splitting service we use; there is an
|
19
|
+
# implicit relationship between image_extension and image_file_basename_template
|
20
|
+
# (though filenames do not necessarily reflect mime types)
|
21
|
+
# @param image_file_basename_template [String] use this string to generate the unique filename
|
22
|
+
# for an image "split" from the given PDF. It must include "%d" as part of the
|
23
|
+
# declaration. For example if the template is "hello-world-%d.png" then the first
|
24
|
+
# split page will be "hello-world-1.png".
|
25
|
+
#
|
26
|
+
# @return [Enumerable, Services::PdfSplitter::Base, #each] see {Base#each}
|
27
|
+
def self.call(path, image_extension:, image_file_basename_template:)
|
28
|
+
klass_name = "#{image_extension.to_s.classify}_page".classify
|
29
|
+
klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
|
30
|
+
klass.new(path, image_file_basename_template: image_file_basename_template)
|
16
31
|
end
|
17
32
|
|
18
33
|
##
|
@@ -31,38 +46,23 @@ module DerivativeRodeo
|
|
31
46
|
|
32
47
|
class_attribute :gsdevice, instance_accessor: false
|
33
48
|
class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
|
34
|
-
##
|
35
|
-
# @api public
|
36
|
-
#
|
37
|
-
# @param path [String] The path the the PDF
|
38
|
-
#
|
39
|
-
# @return [Enumerable, Utilities::PdfSplitter::Base]
|
40
|
-
def self.call(path, baseid: SureRandom.uuid, tmpdir: Dir.mktmpdir)
|
41
|
-
new(path, baseid: baseid, tmpdir: tmpdir)
|
42
|
-
end
|
43
49
|
|
44
|
-
##
|
45
|
-
# @param path [String] the path to the source PDF that we're processing.
|
46
|
-
# @param baseid [String] used for creating a unique identifier
|
47
|
-
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
|
48
|
-
# @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
|
49
|
-
# extract this from the given path, but for testing purposes, you might want to
|
50
|
-
# provide a specific summary.
|
51
|
-
# @param logger [Logger, #error]
|
52
50
|
def initialize(path,
|
53
|
-
|
54
|
-
|
55
|
-
tmpdir: Dir.mktmpdir,
|
56
|
-
pdf_pages_summary: PagesSummary.extract_from(path: path),
|
57
|
-
logger: DerivativeRodeo.config.logger)
|
58
|
-
@baseid = baseid
|
51
|
+
image_file_basename_template:,
|
52
|
+
pdf_pages_summary: PagesSummary.extract_from(path: path))
|
59
53
|
@pdfpath = path
|
60
54
|
@pdf_pages_summary = pdf_pages_summary
|
61
|
-
@
|
62
|
-
|
55
|
+
@ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
|
56
|
+
|
57
|
+
# We need to ensure that this temporary directory exists so we can write the files to it.
|
58
|
+
# Fortunately, because this file space must be "local" tmp dir, we don't need to work
|
59
|
+
# through any of the location antics of {StorageLocations::BaseLocation}.
|
60
|
+
FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
|
63
61
|
end
|
64
62
|
|
65
|
-
attr_reader :
|
63
|
+
attr_reader :ghost_script_output_file_template
|
64
|
+
|
65
|
+
delegate :logger, to: DerivativeRodeo
|
66
66
|
|
67
67
|
# In creating {#each} we get many of the methods of array operation (e.g. #to_a).
|
68
68
|
include Enumerable
|
@@ -80,8 +80,8 @@ module DerivativeRodeo
|
|
80
80
|
!pdf_pages_summary.valid?
|
81
81
|
end
|
82
82
|
|
83
|
-
attr_reader :pdf_pages_summary, :
|
84
|
-
private :pdf_pages_summary, :
|
83
|
+
attr_reader :pdf_pages_summary, :basename, :pdfpath
|
84
|
+
private :pdf_pages_summary, :basename, :pdfpath
|
85
85
|
|
86
86
|
# @api private
|
87
87
|
def gsdevice
|
@@ -99,16 +99,12 @@ module DerivativeRodeo
|
|
99
99
|
@entries = Array.wrap(gsconvert)
|
100
100
|
end
|
101
101
|
|
102
|
-
def output_base
|
103
|
-
@output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
|
104
|
-
end
|
105
|
-
|
106
102
|
def gsconvert
|
107
103
|
# NOTE: you must call gsdevice before compression, as compression is
|
108
104
|
# updated during the gsdevice call.
|
109
105
|
file_names = []
|
110
106
|
|
111
|
-
Open3.popen3(gsconvert_cmd(
|
107
|
+
Open3.popen3(gsconvert_cmd(ghost_script_output_file_template)) do |_stdin, stdout, stderr, _wait_thr|
|
112
108
|
err = stderr.read
|
113
109
|
logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?
|
114
110
|
|
@@ -116,7 +112,7 @@ module DerivativeRodeo
|
|
116
112
|
stdout.read.split("\n").each do |line|
|
117
113
|
next unless line.start_with?('Page ')
|
118
114
|
|
119
|
-
file_names << format(
|
115
|
+
file_names << format(ghost_script_output_file_template, page_number)
|
120
116
|
page_number += 1
|
121
117
|
end
|
122
118
|
end
|
@@ -126,12 +122,12 @@ module DerivativeRodeo
|
|
126
122
|
|
127
123
|
def create_file_name(line:, page_number:); end
|
128
124
|
|
129
|
-
def gsconvert_cmd(
|
125
|
+
def gsconvert_cmd(ghost_script_output_file_template)
|
130
126
|
@gsconvert_cmd ||= begin
|
131
127
|
cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
|
132
128
|
cmd += " -sCompression=#{compression}" if compression?
|
133
129
|
cmd += " -dJPEGQ=#{quality}" if quality?
|
134
|
-
cmd += " -sOutputFile=#{
|
130
|
+
cmd += " -sOutputFile=#{ghost_script_output_file_template} -r#{ppi} -f #{pdfpath}"
|
135
131
|
cmd
|
136
132
|
end
|
137
133
|
end
|
@@ -21,7 +21,7 @@ module DerivativeRodeo
|
|
21
21
|
def self.read(url)
|
22
22
|
HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
|
23
23
|
rescue StandardError => e
|
24
|
-
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
24
|
+
DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
25
25
|
raise e
|
26
26
|
end
|
27
27
|
|
@@ -42,6 +42,8 @@ module DerivativeRodeo
|
|
42
42
|
|
43
43
|
class << self
|
44
44
|
alias scheme location_name
|
45
|
+
|
46
|
+
delegate :config, to: DerivativeRodeo
|
45
47
|
end
|
46
48
|
|
47
49
|
##
|
@@ -206,6 +208,22 @@ module DerivativeRodeo
|
|
206
208
|
klass.build(from_uri: file_path, template: template)
|
207
209
|
end
|
208
210
|
|
211
|
+
##
|
212
|
+
# When you have a known location and want to check for files that are within that location,
|
213
|
+
# use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
|
214
|
+
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
|
+
#
|
216
|
+
# We can use the :file_path as the prefix and the given :tail_glob as the suffix for a "fully
|
217
|
+
# qualified" Dir.glob type search.
|
218
|
+
#
|
219
|
+
# @param tail_glob [String]
|
220
|
+
#
|
221
|
+
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
|
+
# array when there are none.
|
223
|
+
def globbed_tail_locations(tail_glob:)
|
224
|
+
raise NotImplementedError, "#{self.class}#globbed_locations"
|
225
|
+
end
|
226
|
+
|
209
227
|
##
|
210
228
|
# @param extension [String, StorageLocations::SAME]
|
211
229
|
# @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
|
@@ -43,7 +43,7 @@ module DerivativeRodeo
|
|
43
43
|
# @param url [String]
|
44
44
|
#
|
45
45
|
# @return [String]
|
46
|
-
def
|
46
|
+
def get(url)
|
47
47
|
HTTParty.get(url, logger: config.logger)
|
48
48
|
rescue => e
|
49
49
|
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
@@ -51,13 +51,11 @@ module DerivativeRodeo
|
|
51
51
|
end
|
52
52
|
|
53
53
|
##
|
54
|
-
# @param url [String]
|
55
|
-
#
|
56
54
|
# @return [URI] when the URL resolves successfully
|
57
55
|
# @return [FalseClass] when the URL's head request is not successful or we've exhausted our
|
58
56
|
# remaining redirects.
|
59
|
-
def
|
60
|
-
HTTParty.head(
|
57
|
+
def exist?
|
58
|
+
HTTParty.head(file_uri, logger: config.logger)
|
61
59
|
rescue => e
|
62
60
|
config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
|
63
61
|
false
|
@@ -8,7 +8,15 @@ module DerivativeRodeo
|
|
8
8
|
# Location to download and upload files to S3
|
9
9
|
#
|
10
10
|
class S3Location < BaseLocation
|
11
|
-
|
11
|
+
##
|
12
|
+
# @!group Class Attributes
|
13
|
+
# @!attribute use_actual_s3_bucket
|
14
|
+
#
|
15
|
+
# When true, we are going to use a live S3 bucket. When false, we'll use a fake local bucket.
|
16
|
+
class_attribute :use_actual_s3_bucket, default: true
|
17
|
+
# @!endgroup Class Attributes
|
18
|
+
##
|
19
|
+
|
12
20
|
##
|
13
21
|
# Create a new uri of the classes type. Parts argument should have a default in
|
14
22
|
# implementing classes. Must support a number or the symbol :all
|
@@ -24,10 +32,11 @@ module DerivativeRodeo
|
|
24
32
|
end
|
25
33
|
|
26
34
|
##
|
27
|
-
# @param
|
35
|
+
# @param bucket_name [String, NilClass] when given, use this as the bucket; otherwise, default to config.aws_s3_bucket
|
36
|
+
#
|
28
37
|
# @return [String]
|
29
|
-
def self.adapter_prefix(
|
30
|
-
"#{scheme}://#{
|
38
|
+
def self.adapter_prefix(bucket_name: config.aws_s3_bucket)
  # Assemble the host as "<bucket>.s3.<region>.amazonaws.com", then prepend the scheme.
  host = "#{bucket_name}.s3.#{config.aws_s3_region}.amazonaws.com"
  [scheme, host].join('://')
end
|
32
41
|
|
33
42
|
##
|
@@ -53,6 +62,38 @@ module DerivativeRodeo
|
|
53
62
|
bucket.objects(prefix: file_path).count.positive?
|
54
63
|
end
|
55
64
|
|
65
|
+
##
|
66
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
|
+
#
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
|
69
|
+
# use the components of the file_path to fake that behavior.
|
70
|
+
#
|
71
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
+
def globbed_tail_locations(tail_glob:)
  # Illustration (assumed shapes, confirm against callers): for a file_uri such as
  # "s3://blah/1234/hello-world/hello-world.pdf" and a tail_glob such as "pages/*.tiff", we want
  # the keys found under the "pages" directory that end in ".tiff".
  #
  # NOTE: Should we be storing our files as such?  The pattern we need is
  # :parent_identifier/:file_set_identifier/files  There are probably cases where a work has
  # more than one PDF (that we intend to split); we don't want to trample on those split files
  # and miscollate two PDFs.
  globname = File.join(file_dir, tail_glob)

  # S3 can only search by prefix, so we list by the glob's directory and then filter on the
  # glob's extension.  The extension is escaped so that "." is a literal dot, and the match is
  # anchored to the very end of the key.
  extension_matcher = /#{Regexp.escape(File.extname(globname))}\z/

  # NOTE: We're making some informed guesses, needing to include the fully qualified template
  # based on both the key of the item in the bucket as well as the bucket's host.
  uri = URI.parse(file_uri)
  scheme_and_host = "#{uri.scheme}://#{uri.host}"

  # Filter before mapping so that non-matching keys do not leave nil entries in the result; when
  # nothing matches we return an empty array.
  bucket.objects(prefix: File.dirname(globname))
        .select { |object| object.key.match?(extension_matcher) }
        .map { |object| derived_file_from(template: File.join(scheme_and_host, object.key)) }
end
|
96
|
+
|
56
97
|
##
|
57
98
|
# @api public
|
58
99
|
# write the tmp file to the file_uri
|
@@ -71,6 +112,9 @@ module DerivativeRodeo
|
|
71
112
|
#
|
72
113
|
# @return [Aws::S3::Resource]
|
73
114
|
def resource
|
115
|
+
# TODO: Are there instantiation considerations when running in Lambda? In tests
|
116
|
+
# initializing a resource is very slow (e.g. 3 seconds or so). Should this be a class
|
117
|
+
# method? Can it be given the SpaceStone constraints?
|
74
118
|
@resource ||= if DerivativeRodeo.config.aws_s3_access_key_id
|
75
119
|
Aws::S3::Resource.new(region: DerivativeRodeo.config.aws_s3_region,
|
76
120
|
credentials: Aws::Credentials.new(
|
@@ -91,13 +135,28 @@ module DerivativeRodeo
|
|
91
135
|
raise Errors::BucketMissingError
|
92
136
|
end
|
93
137
|
|
138
|
+
# @see .use_actual_s3_bucket
|
94
139
|
def bucket
|
95
|
-
@bucket ||=
|
140
|
+
@bucket ||= if use_actual_s3_bucket?
|
141
|
+
resource.bucket(bucket_name)
|
142
|
+
else
|
143
|
+
self.class.faux_bucket
|
144
|
+
end
|
96
145
|
end
|
97
146
|
|
98
147
|
def file_path
|
99
148
|
@file_path ||= @file_uri.sub(%r{.+://.+?/}, '')
|
100
149
|
end
|
150
|
+
|
151
|
+
##
|
152
|
+
# A constructed fake bucket that conforms to the narrow S3 interface that we use.
|
153
|
+
#
|
154
|
+
# @see .use_actual_s3_bucket
|
155
|
+
# @return [AwsS3FauxBucket]
|
156
|
+
def self.faux_bucket
  # AwsS3FauxBucket is not required by this file; it is expected to be loaded separately (e.g. in
  # a spec context) before this is called.
  return @faux_bucket if @faux_bucket

  @faux_bucket = AwsS3FauxBucket.new
end
|
101
160
|
end
|
102
161
|
end
|
103
162
|
end
|
@@ -14,9 +14,20 @@ module DerivativeRodeo
|
|
14
14
|
# It uploads a file_uri to the queue, not the contents of that file
|
15
15
|
# reading from the queue is not currently implemented
|
16
16
|
class SqsLocation < BaseLocation
|
17
|
+
##
|
18
|
+
# @!group Class Attributes
|
19
|
+
#
|
20
|
+
# @!attribute batch_size
|
21
|
+
# @return [Integer]
|
17
22
|
class_attribute :batch_size, default: 10
|
18
23
|
|
19
|
-
|
24
|
+
# @!attribute use_real_sqs
|
25
|
+
# When true, use the real SQS; else when false use a fake one. You probably don't want to
|
26
|
+
# use the fake one in your production. But it's exposed in this manner to ease testing of
|
27
|
+
# downstream dependencies.
|
28
|
+
class_attribute :use_real_sqs, default: true
|
29
|
+
# @!endgroup Class Attributes
|
30
|
+
##
|
20
31
|
|
21
32
|
##
|
22
33
|
# Create a new uri of the classes type. Parts argument should have a default in
|
@@ -82,19 +93,26 @@ module DerivativeRodeo
|
|
82
93
|
file_uri
|
83
94
|
end
|
84
95
|
|
96
|
+
# rubocop:disable Metrics/MethodLength
|
85
97
|
def client
|
86
|
-
@client ||= if
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
98
|
+
@client ||= if use_real_sqs?
|
99
|
+
if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
|
100
|
+
Aws::SQS::Client.new(
|
101
|
+
region: config.aws_sqs_region,
|
102
|
+
credentials: Aws::Credentials.new(
|
103
|
+
config.aws_sqs_access_key_id,
|
104
|
+
config.aws_sqs_secret_access_key
|
105
|
+
)
|
92
106
|
)
|
93
|
-
|
107
|
+
else
|
108
|
+
Aws::SQS::Client.new(region: config.aws_sqs_region)
|
109
|
+
end
|
94
110
|
else
|
95
|
-
|
111
|
+
# We are not requiring this file; except in the spec context.
|
112
|
+
AwsSqsFauxClient.new
|
96
113
|
end
|
97
114
|
end
|
115
|
+
# rubocop:enable Metrics/MethodLength
|
98
116
|
|
99
117
|
def add(message:)
|
100
118
|
client.send_message({
|
data/lib/derivative_rodeo.rb
CHANGED
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
##
# This class is a very rudimentary implementation of a bucket.  It conforms to the necessary
# interface for downloading, uploading, and filtering on prefix.
#
# It is provided under lib/spec_support so that downstream implementations can leverage a fake S3
# bucket.
#
# @see DerivativeRodeo::StorageLocations::S3Location
class AwsS3FauxBucket
  def initialize
    @storage = {}
  end
  attr_reader :storage

  ##
  # Fetch the entry for the given path, creating (and remembering) an empty one when we have not
  # seen that path before.
  #
  # @param path [String] the key within the bucket
  # @return [AwsS3FauxBucket::Storage]
  def object(path)
    # Yup, we've got nested buckets
    @storage[path] ||= Storage.new(key: path)
  end

  ##
  # @param prefix [String]
  # @return [Array<AwsS3FauxBucket::Storage>] every known entry whose path starts with the given
  #         prefix; an empty array when there are none.
  def objects(prefix:)
    @storage.select { |path, _entry| path.start_with?(prefix) }.values
  end

  ##
  # A single entry in the faux bucket; it holds the uploaded content in memory.
  class Storage
    # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
    # key.
    def initialize(key:)
      @key = key
      @storage = {}
    end
    attr_reader :storage, :key

    ##
    # Remember the content of the file at the given path as this entry's upload.
    #
    # @param path [String] local path of the file to "upload"
    # @return [String] the content that was read
    def upload_file(path)
      @storage[:upload] = File.read(path)
    end

    ##
    # Write the previously uploaded content to the given local path.
    #
    # @param path [String] local path to write to
    # @return [Boolean] false when nothing has been uploaded to this entry; true once the content
    #         has been written.
    def download_file(path)
      return false unless @storage.key?(:upload)

      # Write the bytes exactly as they were uploaded; `puts` would append a trailing newline and
      # the downloaded file would no longer match what was uploaded.
      File.binwrite(path, @storage.fetch(:upload))
      true
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'ostruct'
|
3
|
+
##
# This class is a very rudimentary implementation of an SQS client.  It conforms to the necessary
# interface for sending messages and reading messages.
#
# @see DerivativeRodeo::StorageLocations::SqsLocation
class AwsSqsFauxClient
  # Minimal stand-in for the response of {#get_queue_url}; it only answers #queue_url.
  QueueUrlResult = Struct.new(:queue_url)

  # @param queue_url [String, NilClass] the url to report from {#get_queue_url}; when nil we use
  #        a fake url.
  def initialize(queue_url: nil)
    @queue_url = queue_url || 'https://sqs.us-west-2.amazonaws.com/5555555555/fake'
    @storage = {}
  end
  attr_reader :storage

  ##
  # Append the :message_body to the in-memory queue for the given :queue_url.
  #
  # @param arg_hash [Hash] with keys :queue_url and :message_body
  # @return [Array] the messages now held for that queue
  def send_message(arg_hash)
    @storage[arg_hash[:queue_url]] ||= []
    @storage[arg_hash[:queue_url]] << arg_hash[:message_body]
  end

  ##
  # Append each of the :entries to the in-memory queue for the given :queue_url.
  #
  # @param arg_hash [Hash] with keys :queue_url and :entries
  # @return [Array] the messages now held for that queue
  def send_message_batch(arg_hash)
    @storage[arg_hash[:queue_url]] ||= []
    @storage[arg_hash[:queue_url]].concat(arg_hash[:entries])
  end

  ##
  # Remove and return up to :max_number_of_messages from the in-memory queue for the given
  # :queue_url.  Messages are taken from the end of the queue (most recently sent first).
  #
  # @param arg_hash [Hash] with keys :queue_url and, optionally, :max_number_of_messages
  #        (defaults to 1)
  # @return [Array] the messages that were removed; an empty array when the queue is unknown or
  #         empty.
  def receive_message(arg_hash)
    queue = @storage[arg_hash[:queue_url]]
    output = []
    arg_hash.fetch(:max_number_of_messages, 1).times do
      value = queue&.pop
      output << value if value
    end
    output
  end

  ##
  # @return [#queue_url] an object that answers with the configured queue url, regardless of the
  #         arguments given.
  def get_queue_url(*)
    QueueUrlResult.new(@queue_url)
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-05
|
12
|
+
date: 2023-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -281,12 +281,14 @@ files:
|
|
281
281
|
- lib/derivative_rodeo.rb
|
282
282
|
- lib/derivative_rodeo/configuration.rb
|
283
283
|
- lib/derivative_rodeo/errors.rb
|
284
|
+
- lib/derivative_rodeo/generators/alto_generator.rb
|
284
285
|
- lib/derivative_rodeo/generators/base_generator.rb
|
285
286
|
- lib/derivative_rodeo/generators/concerns/copy_file_concern.rb
|
286
287
|
- lib/derivative_rodeo/generators/copy_generator.rb
|
287
288
|
- lib/derivative_rodeo/generators/hocr_generator.rb
|
288
289
|
- lib/derivative_rodeo/generators/monochrome_generator.rb
|
289
290
|
- lib/derivative_rodeo/generators/pdf_split_generator.rb
|
291
|
+
- lib/derivative_rodeo/generators/plain_text_generator.rb
|
290
292
|
- lib/derivative_rodeo/generators/thumbnail_generator.rb
|
291
293
|
- lib/derivative_rodeo/generators/word_coordinates_generator.rb
|
292
294
|
- lib/derivative_rodeo/services/base_service.rb
|
@@ -295,6 +297,7 @@ files:
|
|
295
297
|
- lib/derivative_rodeo/services/image_identify_service.rb
|
296
298
|
- lib/derivative_rodeo/services/image_jp2_service.rb
|
297
299
|
- lib/derivative_rodeo/services/image_service.rb
|
300
|
+
- lib/derivative_rodeo/services/mime_type_service.rb
|
298
301
|
- lib/derivative_rodeo/services/pdf_splitter/base.rb
|
299
302
|
- lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb
|
300
303
|
- lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb
|
@@ -311,6 +314,8 @@ files:
|
|
311
314
|
- lib/derivative_rodeo/storage_locations/sqs_location.rb
|
312
315
|
- lib/derivative_rodeo/technical_metadata.rb
|
313
316
|
- lib/derivative_rodeo/version.rb
|
317
|
+
- lib/spec_support/aws_s3_faux_bucket.rb
|
318
|
+
- lib/spec_support/aws_sqs_faux_client.rb
|
314
319
|
homepage: https://github.com/scientist-softserv/derivative_rodeo
|
315
320
|
licenses:
|
316
321
|
- APACHE-2.0
|