derivative-rodeo 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 20e512d7162170875d60f90ff48bba694237ac7d31f38d806d2b87f570536c1c
4
- data.tar.gz: a1311ea39a3994b4d24ffdbdbade62b7fbb15d2c326a3b510607f315fb4dd865
3
+ metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
+ data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
5
5
  SHA512:
6
- metadata.gz: 157a9e276c6cefe739137fbe17e783557d0317dcee531cd353ad86987bda33ad55c2ada8179e254b306a467d01f8f759a6e89fc91b8cbcf6e968cf5a28a9037b
7
- data.tar.gz: 5bd45db467194cf1e8af7f7e1ed625c2b3898d011f20a581a9a55a2ccbb7be56ca8c276752b7bef41e2c1d5efa4737d1291f83211dd69cd18e4f7caeed25fef2
6
+ metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
+ data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
4
+
5
+ module DerivativeRodeo
6
+ module Generators
7
+ ##
8
+ # Generate the Alto XML from the given input_uris.
9
+ #
10
+ # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
11
+ class AltoGenerator < BaseGenerator
12
+ self.output_extension = "alto.xml"
13
+
14
+ class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
15
+
16
+ ##
17
+ # @param output_location [StorageLocations::BaseLocation]
18
+ # @param input_tmp_file_path [String] the location of the file that we can use for processing.
19
+ #
20
+ # @return [StorageLocations::BaseLocation]
21
+ #
22
+ # @see #requisite_files
23
+ def build_step(output_location:, input_tmp_file_path:, **)
24
+ output_location.with_new_tmp_path do |output_tmp_file_path|
25
+ convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_alto: output_tmp_file_path)
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ ##
32
+ # @param path_to_hocr [String]
33
+ # @param path_to_alto [String]
34
+ def convert_to_coordinates(path_to_hocr:, path_to_alto:)
35
+ hocr_html = File.read(path_to_hocr)
36
+ File.open(path_to_alto, "w+") do |file|
37
+ file.puts service.call(hocr_html).to_alto
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -27,7 +27,6 @@ module DerivativeRodeo
27
27
  # @!endgroup Class Attributes
28
28
 
29
29
  attr_reader :input_uris,
30
- :logger,
31
30
  :output_location_template,
32
31
  :preprocessed_location_template
33
32
 
@@ -39,23 +38,25 @@ module DerivativeRodeo
39
38
  # to find preprocessed uris by transforming the :input_uris via
40
39
  # {Services::ConvertUriViaTemplateService} with the given
41
40
  # :preprocessed_location_template.
42
- # @param logger [Logger]
43
- def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
41
+ def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
+ # NOTE: Are we using this preprocessed_location_template? Wondering?
44
43
  @input_uris = Array.wrap(input_uris)
45
44
  @output_location_template = output_location_template
46
45
  @preprocessed_location_template = preprocessed_location_template
47
- @logger = logger
48
46
 
49
47
  return if valid_instantiation?
50
48
 
51
49
  raise Errors::ExtensionMissingError.new(klass: self.class)
52
50
  end
53
51
 
52
+ delegate :logger, to: DerivativeRodeo
53
+
54
54
  ##
55
55
  # @api private
56
56
  #
57
57
  # @return [Boolean]
58
58
  def valid_instantiation?
59
+ # TODO: Does this even make sense.
59
60
  # When we have a BaseGenerator and not one of its children or when we've assigned the
60
61
  # output_extension. instance_of? is more specific than is_a?
61
62
  instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
@@ -83,6 +84,7 @@ module DerivativeRodeo
83
84
  # @see #build_step
84
85
  # @see #with_each_requisite_location_and_tmp_file_path
85
86
  def generated_files
87
+ # TODO: Examples please
86
88
  return @generated_files if defined?(@generated_files)
87
89
 
88
90
  # As much as I would like to use map or returned values; given the implementations it's
@@ -92,6 +94,9 @@ module DerivativeRodeo
92
94
  # helps ease subclass implementations of the #with_each_requisite_location_and_tmp_file_path or
93
95
  # #build_step
94
96
  @generated_files = []
97
+
98
+ # BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
99
+ # "file:///Users/jfriesen/.profile"
95
100
  with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
96
101
  generated_file = destination(input_location)
97
102
  @generated_files << if generated_file.exist?
@@ -170,7 +175,7 @@ module DerivativeRodeo
170
175
  return output_location unless preprocessed_location_template
171
176
 
172
177
  preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
173
- # We only want
178
+ # We only want the location if it exists
174
179
  return preprocessed_location if preprocessed_location&.exist?
175
180
 
176
181
  # NOTE: The file does not exist at the output_location; but we pass this information along so
@@ -17,19 +17,66 @@ module DerivativeRodeo
17
17
  include CopyFileConcern
18
18
 
19
19
  ##
20
- # @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
20
+ # A helper method for downstream implementations to ask if this file is perhaps split from a
21
+ # PDF.
21
22
  #
22
- # @return [#call, Services::PdfSplitter::Base]
23
- def pdf_splitter(name: pdf_splitter_name)
24
- @pdf_splitter ||= Services::PdfSplitter.for(name)
23
+ # @param filename [String]
24
+ # @param extension [String] the extension (either with or without the leading period); if none
25
+ # is provided use the extension of the given :filename.
26
+ # @return [TrueClass] when the file name likely represents a file split from a PDF.
27
+ # @return [FalseClass] when the file name does not, by convention, represent a file split from
28
+ # a PDF.
29
+ #
30
+ # @see #image_file_basename_template
31
+ def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
32
+ extension ||= File.extname(filename)
33
+
34
+ # Strip the leading period from the extension.
35
+ extension = extension[1..-1] if extension.start_with?('.')
36
+ regexp = %r{--page-\d+\.#{extension}$}
37
+ !!regexp.match(filename)
25
38
  end
26
39
 
27
40
  ##
28
- # @return [Symbol]
41
+ # @param basename [String] The given PDF file's base name (e.g. "hello.pdf" would have a base name of
42
+ # "hello").
43
+ #
44
+ # @return [String] A template for the filenames of the images produced by Ghostscript.
45
+ #
46
+ # @note This must include "%d" in the returning value, as that is how Ghostscript will assign
47
+ # the page number.
48
+ #
49
+ # @note I have extracted this function to make it abundantly clear the expected location
50
+ # of each split image. Further, there is an interaction between this template and the tail_glob in {#existing_page_locations}.
29
51
  #
30
- # @see .output_extension
31
- def pdf_splitter_name
32
- output_extension.to_s.split(".").last.to_sym
52
+ # @see #existing_page_locations
53
+ # @see .filename_for_a_derived_page_from_a_pdf?
54
+ def image_file_basename_template(basename:)
55
+ "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
56
+ end
57
+
58
+ ##
59
+ # We want to check the output location and pre-processed location for the existence of already
60
+ # split pages. This method checks both places.
61
+ #
62
+ # @param input_location [StorageLocations::BaseLocation]
63
+ #
64
+ # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
+ # with :tail_glob.
66
+ #
67
+ # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
+ #
69
+ # @note The tail_glob is in relation to the {#image_file_basename_template}
70
+ def existing_page_locations(input_location:)
71
+ # See image_file_basename_template
72
+ tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
73
+
74
+ output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
75
+ return output_locations if output_locations.count.positive?
76
+
77
+ return [] if preprocessed_location_template.blank?
78
+
79
+ input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_locations(tail_glob: tail_glob)
33
80
  end
34
81
 
35
82
  ##
@@ -44,18 +91,35 @@ module DerivativeRodeo
44
91
  # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
45
92
  # @yieldparam image_path [String] where to find this file in the tmp space
46
93
  #
94
+ # @note This function makes a concession; namely that if it encounters any
95
+ # {#existing_page_locations} it will use all of that result as the entire number of pages.
96
+ # We could make this smarter but at the moment we're deferring on that.
97
+ #
47
98
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
99
+ #
100
+ # rubocop:disable Metrics/MethodLength
48
101
  def with_each_requisite_location_and_tmp_file_path
49
102
  input_files.each do |input_location|
50
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
51
- image_paths = pdf_splitter.call(input_tmp_file_path, baseid: input_location.file_basename, tmpdir: File.dirname(input_tmp_file_path))
52
- image_paths.each do |image_path|
104
+ ## We want a single call for a directory listing of the image_file_basename_template
105
+ generated_files = existing_page_locations(input_location: input_location)
106
+
107
+ if generated_files.count.zero?
108
+ generated_files = Services::PdfSplitter.call(
109
+ input_tmp_file_path,
110
+ image_extension: output_extension,
111
+ image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
+ )
113
+ end
114
+
115
+ generated_files.each do |image_path|
53
116
  image_location = StorageLocations::FileLocation.new("file://#{image_path}")
54
117
  yield(image_location, image_path)
55
118
  end
56
119
  end
57
120
  end
58
121
  end
122
+ # rubocop:enable Metrics/MethodLength
59
123
  end
60
124
  end
61
125
  end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
4
+
5
+ module DerivativeRodeo
6
+ module Generators
7
+ ##
8
+ # Generate the plain text from the given input_uris.
9
+ #
10
+ # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
11
+ class PlainTextGenerator < BaseGenerator
12
+ self.output_extension = "plain_text.txt"
13
+
14
+ class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
15
+
16
+ ##
17
+ # @param output_location [StorageLocations::BaseLocation]
18
+ # @param input_tmp_file_path [String] the location of the file that we can use for processing.
19
+ #
20
+ # @return [StorageLocations::BaseLocation]
21
+ #
22
+ # @see #requisite_files
23
+ def build_step(output_location:, input_tmp_file_path:, **)
24
+ output_location.with_new_tmp_path do |output_tmp_file_path|
25
+ convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_plain_text: output_tmp_file_path)
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ ##
32
+ # @param path_to_hocr [String]
33
+ # @param path_to_plain_text [String]
34
+ def convert_to_coordinates(path_to_hocr:, path_to_plain_text:)
35
+ hocr_html = File.read(path_to_hocr)
36
+ File.open(path_to_plain_text, "w+") do |file|
37
+ file.puts service.call(hocr_html).to_text
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -6,12 +6,33 @@ module DerivativeRodeo
6
6
  # This generator is responsible for converting a given binary into a thumbnail. As of
7
7
  # <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
8
8
  class ThumbnailGenerator < BaseGenerator
9
+ ##
10
+ # @!group Class Attributes
11
+
9
12
  ##
10
13
  # We want to mirror the same file "last" extension as described in Hyrax.
11
14
  #
12
15
  # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
13
16
  self.output_extension = 'thumbnail.jpeg'
14
17
 
18
+ ##
19
+ # @!attribute dimensions_by_type
20
+ #
21
+ # @return [Hash<Symbol,String>] the "types" (as categorized by
22
+ # Hyrax::FileSetDerivativeService). These aren't mime-types per se but a conceptual
23
+ # distillation of that.
24
+ #
25
+ # @see https://github.com/samvera/hyrax/blob/815e0abaacf9f331a5640c5d6129661d01eadf75/app/services/hyrax/file_set_derivatives_service.rb
26
+ class_attribute :dimensions_by_type, default: { pdf: "338x493" }
27
+
28
+ ##
29
+ # @!attribute dimensions_fallback
30
+ #
31
+ # @return [String] when there's no match for {.dimensions_by_type} use this value.
32
+ class_attribute :dimensions_fallback, default: "200x150>"
33
+ # @!endgroup Class Attributes
34
+ ##
35
+
15
36
  ##
16
37
  # @param output_location [StorageLocations::BaseLocation]
17
38
  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
@@ -23,6 +44,20 @@ module DerivativeRodeo
23
44
  end
24
45
  end
25
46
 
47
+ ##
48
+ # @param filename [String]
49
+ # @return [String]
50
+ #
51
+ # @see .dimensions_by_type
52
+ # @see .dimensions_fallback
53
+ def self.dimensions_for(filename:)
54
+ type = DerivativeRodeo::Services::MimeTypeService.hyrax_type(filename: filename)
55
+ dimensions_by_type.fetch(type, dimensions_fallback)
56
+ end
57
+
58
+ # Want to expose the dimensions_for as an instance method
59
+ delegate :dimensions_for, to: :class
60
+
26
61
  ##
27
62
  # Convert the file found at :path_to_input into a thumbnail, writing it to the
28
63
  # :path_for_thumbnail_output
@@ -30,8 +65,8 @@ module DerivativeRodeo
30
65
  # @param path_of_file_to_create_thumbnail_from [String]
31
66
  # @param path_for_thumbnail_output [String]
32
67
  def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
33
- # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
34
- `convert #{path_of_file_to_create_thumbnail_from} -thumbnail '200x150>' -flatten #{path_for_thumbnail_output}`
68
+ dimensions = dimensions_for(filename: path_of_file_to_create_thumbnail_from)
69
+ `convert #{path_of_file_to_create_thumbnail_from} -thumbnail '#{dimensions}' -flatten #{path_for_thumbnail_output}`
35
70
  end
36
71
  end
37
72
  end
@@ -31,7 +31,7 @@ module DerivativeRodeo
31
31
  def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
32
32
  hocr_html = File.read(path_to_hocr)
33
33
  File.open(path_to_coordinate, "w+") do |file|
34
- file.puts service.call(hocr_html)
34
+ file.puts service.call(hocr_html).to_json
35
35
  end
36
36
  end
37
37
  end
@@ -13,7 +13,7 @@ module DerivativeRodeo
13
13
  # @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
14
14
  # @return [Object] a new instance built from the given SGML; see #to_json, #to_text, and #to_alto
15
15
  def self.call(sgml)
16
- new(sgml).to_json
16
+ new(sgml)
17
17
  end
18
18
 
19
19
  ##
@@ -42,6 +42,21 @@ module DerivativeRodeo
42
42
  end
43
43
  alias json to_json
44
44
 
45
+ # Output plain text; named #to_text to keep the method calls consistent with #to_json.
46
+ #
47
+ # @return [String] plain text of OCR'd document
48
+ def to_text
49
+ @to_text ||= doc_stream.text
50
+ end
51
+
52
+ def to_alto
53
+ @to_alto ||= AltoXml.to_alto(
54
+ words: doc_stream.words,
55
+ width: doc_stream.width,
56
+ height: doc_stream.height
57
+ )
58
+ end
59
+
45
60
  private
46
61
 
47
62
  def xml?(xml)
@@ -121,6 +136,7 @@ module DerivativeRodeo
121
136
  # add trailing space to plaintext buffer for between words:
122
137
  @text += ' '
123
138
  @words.push(@current) if word_complete?
139
+ @current = nil # clear the current word
124
140
  end
125
141
 
126
142
  def end_line
@@ -156,10 +172,13 @@ module DerivativeRodeo
156
172
  # Callback for element end; at this time, flush word coordinate state
157
173
  # for current word, and append line endings to plain text:
158
174
  #
159
- # @param _name [String] element name.
160
- def end_element(_name)
161
- end_line if @element_class_name == 'ocr_line'
162
- end_word if @element_class_name == 'ocrx_word'
175
+ # @param name [String] element name.
176
+ def end_element(name)
177
+ if name == 'span'
178
+ end_word if @element_class_name == 'ocrx_word'
179
+ @text += "\n" if @element_class_name.nil?
180
+ end
181
+ @element_class_name = nil
163
182
  end
164
183
 
165
184
  # Callback for completion of parsing hOCR, used to normalize generated
@@ -213,6 +232,102 @@ module DerivativeRodeo
213
232
  JSON.generate(payload)
214
233
  end
215
234
  end
235
+
236
+ class AltoXml
237
+ ##
238
+ # @api public
239
+ #
240
+ # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
241
+ # @param width [Integer, nil] the width of the "canvas" on which the words appear.
242
+ # @param height [Integer, nil] the height of the "canvas" on which the words appear.
243
+ #
244
+ # @return [String] the ALTO XML representation of the given words and their coordinates.
245
+ def self.to_alto(words:, width: nil, height: nil)
246
+ new(words: words, width: width, height: height).to_alto
247
+ end
248
+
249
+ def initialize(words:, width:, height:, scaling: 1.0)
250
+ @words = words
251
+ @height = height.to_i
252
+ @width = width.to_i
253
+ @scaling = scaling
254
+ end
255
+
256
+ attr_reader :words, :width, :height, :scaling
257
+
258
+ # Output ALTO XML of word coordinates
259
+ #
260
+ # @return [String] ALTO XML representation of the words and their coordinates
261
+ def to_alto
262
+ page = alto_page(width, height) do |xml|
263
+ words.each do |word|
264
+ xml.String(
265
+ CONTENT: word[:word],
266
+ WIDTH: scale_point(word[:coordinates][2]).to_s,
267
+ HEIGHT: scale_point(word[:coordinates][3]).to_s,
268
+ HPOS: scale_point(word[:coordinates][0]).to_s,
269
+ VPOS: scale_point(word[:coordinates][1]).to_s
270
+ ) { xml.text '' }
271
+ end
272
+ end
273
+ page.to_xml
274
+ end
275
+
276
+ private
277
+
278
+ # given block to manage word generation, wrap with page/block/line
279
+ def alto_page(pixel_width, pixel_height, &block)
280
+ builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
281
+ xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
282
+ xml.Description do
283
+ xml.MeasurementUnit 'pixel'
284
+ end
285
+ alto_layout(xml, pixel_width, pixel_height, &block)
286
+ end
287
+ end
288
+ builder
289
+ end
290
+
291
+ def scale_point(value)
292
+ # NOTE: presuming non-fractional, even though ALTO 2.1
293
+ # specifies coordinates are xsd:float, not xsd:int,
294
+ # simplify to integer value for output:
295
+ (value * scaling).to_i
296
+ end
297
+
298
+ # return layout for page
299
+ def alto_layout(xml, pixel_width, pixel_height, &block)
300
+ xml.Layout do
301
+ xml.Page(ID: 'ID1',
302
+ PHYSICAL_IMG_NR: '1',
303
+ HEIGHT: pixel_height,
304
+ WIDTH: pixel_width) do
305
+ xml.PrintSpace(HEIGHT: pixel_height,
306
+ WIDTH: pixel_width,
307
+ HPOS: '0',
308
+ VPOS: '0') do
309
+ alto_blockline(xml, pixel_width, pixel_height, &block)
310
+ end
311
+ end
312
+ end
313
+ end
314
+
315
+ # make block line and call word-block
316
+ def alto_blockline(xml, pixel_width, pixel_height)
317
+ xml.TextBlock(ID: 'ID1a',
318
+ HEIGHT: pixel_height,
319
+ WIDTH: pixel_width,
320
+ HPOS: '0',
321
+ VPOS: '0') do
322
+ xml.TextLine(HEIGHT: pixel_height,
323
+ WIDTH: pixel_width,
324
+ HPOS: '0',
325
+ VPOS: '0') do
326
+ yield(xml)
327
+ end
328
+ end
329
+ end
330
+ end
216
331
  end
217
332
  end
218
333
  end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+ require 'marcel'
3
+
4
+ module DerivativeRodeo
5
+ module Services
6
+ ##
7
+ # This module provides an interface for determining a mime-type.
8
+ module MimeTypeService
9
+ ##
10
+ # Hyrax has its own compression of mime_types into conceptual types (as defined in
11
+ # Hyrax::FileSetDerivativesService). This provides a somewhat conceptual overlap with that,
12
+ # while also being more generalized.
13
+ #
14
+ # @param filename [String]
15
+ # @return [Symbol]
16
+ def self.hyrax_type(filename:)
17
+ mime = mime_type(filename: filename)
18
+ media_type, sub_type = mime.split("/")
19
+ case media_type
20
+ when "image", "audio", "text", "video"
21
+ media_type.to_sym
22
+ when "application" # The wild woolly weird world of all the things.
23
+ # TODO: Do we need to worry about office documents?
24
+ sub_type.to_sym
25
+ else
26
+ sub_type.to_sym
27
+ end
28
+ end
29
+
30
+ ##
31
+ # Given a local :filename (e.g. downloaded and available on the server this is running),
32
+ # return the mime_type of the file.
33
+ #
34
+ # @param filename [String]
35
+ # @return [String] (e.g. "application/pdf", "text/plain")
36
+ def self.mime_type(filename:)
37
+ ##
38
+ # TODO: Does this attempt to read the whole file? That may create memory constraints. By
39
+ # using Pathname (instead of File.read), we're letting Marcel do its best mime magic.
40
+ pathname = Pathname.new(filename)
41
+ extension = filename.split(".")&.last&.downcase
42
+ if extension
43
+ # By including a possible extension, we can help nudge Marcel into making a more accurate guess.
44
+ # Without extension, we will get a lot of "application/octet-stream" results.
45
+ ::Marcel::MimeType.for(pathname, extension: extension)
46
+ else
47
+ ::Marcel::MimeType.for(pathname)
48
+ end
49
+ end
50
+ end
51
+ end
52
+ end
@@ -2,17 +2,32 @@
2
2
 
3
3
  require 'open3'
4
4
  require 'securerandom'
5
- require 'tmpdir'
6
-
7
5
  module DerivativeRodeo
8
6
  module Services
7
+ ##
8
+ # A service module for splitting PDFs into one image per page.
9
+ #
10
+ # @see .call
9
11
  module PdfSplitter
10
12
  ##
11
- # @param name [String]
12
- # @return [PdfSplitter::Base]
13
- def self.for(name)
14
- klass_name = "#{name.to_s.classify}_page".classify
15
- "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
13
+ # @api public
14
+ #
15
+ # Split the file found at the given :path
16
+ #
17
+ # @param path [String] the path to the source PDF that we're processing.
18
+ # @param image_extension [String] used to determine the splitting service we use; there is an
19
+ # implicit relationship between image_extension and image_file_basename_template
20
+ # (though filenames do not necessarily reflect mime types)
21
+ # @param image_file_basename_template [String] use this string to generate the unique filename
22
+ # for an image "split" from the given PDF. It must include "%d" as part of the
23
+ # declaration. For example if the template is "hello-world-%d.png" then the first
24
+ # split page will be "hello-world-1.png".
25
+ #
26
+ # @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each}
27
+ def self.call(path, image_extension:, image_file_basename_template:)
28
+ klass_name = "#{image_extension.to_s.classify}_page".classify
29
+ klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
30
+ klass.new(path, image_file_basename_template: image_file_basename_template)
16
31
  end
17
32
 
18
33
  ##
@@ -31,38 +46,23 @@ module DerivativeRodeo
31
46
 
32
47
  class_attribute :gsdevice, instance_accessor: false
33
48
  class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
34
- ##
35
- # @api public
36
- #
37
- # @param path [String] The path the the PDF
38
- #
39
- # @return [Enumerable, Utilities::PdfSplitter::Base]
40
- def self.call(path, baseid: SureRandom.uuid, tmpdir: Dir.mktmpdir)
41
- new(path, baseid: baseid, tmpdir: tmpdir)
42
- end
43
49
 
44
- ##
45
- # @param path [String] the path to the source PDF that we're processing.
46
- # @param baseid [String] used for creating a unique identifier
47
- # @param tmpdir [String] place to perform the "work" of splitting the PDF.
48
- # @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
49
- # extract this from the given path, but for testing purposes, you might want to
50
- # provide a specific summary.
51
- # @param logger [Logger, #error]
52
50
  def initialize(path,
53
- baseid: SecureRandom.uuid,
54
- # TODO: Do we need to provide the :tmpdir for the application?
55
- tmpdir: Dir.mktmpdir,
56
- pdf_pages_summary: PagesSummary.extract_from(path: path),
57
- logger: DerivativeRodeo.config.logger)
58
- @baseid = baseid
51
+ image_file_basename_template:,
52
+ pdf_pages_summary: PagesSummary.extract_from(path: path))
59
53
  @pdfpath = path
60
54
  @pdf_pages_summary = pdf_pages_summary
61
- @tmpdir = tmpdir
62
- @logger = logger
55
+ @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
56
+
57
+ # We need to ensure that this temporary directory exists so we can write the files to it.
58
+ # Fortunately, because this file space must be "local" tmp dir, we don't need to work
59
+ # through any of the location antics of {StorageLocations::BaseLocation}.
60
+ FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
63
61
  end
64
62
 
65
- attr_reader :logger
63
+ attr_reader :ghost_script_output_file_template
64
+
65
+ delegate :logger, to: DerivativeRodeo
66
66
 
67
67
  # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
68
68
  include Enumerable
@@ -80,8 +80,8 @@ module DerivativeRodeo
80
80
  !pdf_pages_summary.valid?
81
81
  end
82
82
 
83
- attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
84
- private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
83
+ attr_reader :pdf_pages_summary, :basename, :pdfpath
84
+ private :pdf_pages_summary, :basename, :pdfpath
85
85
 
86
86
  # @api private
87
87
  def gsdevice
@@ -99,16 +99,12 @@ module DerivativeRodeo
99
99
  @entries = Array.wrap(gsconvert)
100
100
  end
101
101
 
102
- def output_base
103
- @output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
104
- end
105
-
106
102
  def gsconvert
107
103
  # NOTE: you must call gsdevice before compression, as compression is
108
104
  # updated during the gsdevice call.
109
105
  file_names = []
110
106
 
111
- Open3.popen3(gsconvert_cmd(output_base)) do |_stdin, stdout, stderr, _wait_thr|
107
+ Open3.popen3(gsconvert_cmd(ghost_script_output_file_template)) do |_stdin, stdout, stderr, _wait_thr|
112
108
  err = stderr.read
113
109
  logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?
114
110
 
@@ -116,7 +112,7 @@ module DerivativeRodeo
116
112
  stdout.read.split("\n").each do |line|
117
113
  next unless line.start_with?('Page ')
118
114
 
119
- file_names << format(output_base, page_number)
115
+ file_names << format(ghost_script_output_file_template, page_number)
120
116
  page_number += 1
121
117
  end
122
118
  end
@@ -126,12 +122,12 @@ module DerivativeRodeo
126
122
 
127
123
  def create_file_name(line:, page_number:); end
128
124
 
129
- def gsconvert_cmd(output_base)
125
+ def gsconvert_cmd(ghost_script_output_file_template)
130
126
  @gsconvert_cmd ||= begin
131
127
  cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
132
128
  cmd += " -sCompression=#{compression}" if compression?
133
129
  cmd += " -dJPEGQ=#{quality}" if quality?
134
- cmd += " -sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
130
+ cmd += " -sOutputFile=#{ghost_script_output_file_template} -r#{ppi} -f #{pdfpath}"
135
131
  cmd
136
132
  end
137
133
  end
@@ -21,7 +21,7 @@ module DerivativeRodeo
21
21
  def self.read(url)
22
22
  HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
23
23
  rescue StandardError => e
24
- config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
24
+ DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
25
25
  raise e
26
26
  end
27
27
 
@@ -42,6 +42,8 @@ module DerivativeRodeo
42
42
 
43
43
  class << self
44
44
  alias scheme location_name
45
+
46
+ delegate :config, to: DerivativeRodeo
45
47
  end
46
48
 
47
49
  ##
@@ -206,6 +208,22 @@ module DerivativeRodeo
206
208
  klass.build(from_uri: file_path, template: template)
207
209
  end
208
210
 
211
+ ##
212
+ # When you have a known location and want to check for files that are within that location,
213
+ # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
214
+ # need to know the path to all of the image files we "split" off of the given PDF.
215
+ #
216
+ # We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully
217
+ # qualified" Dir.glob type search.
218
+ #
219
+ # @param tail_glob [String]
220
+ #
221
+ # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
+ # array when there are none.
223
+ def globbed_tail_locations(tail_glob:)
224
+ raise NotImplementedError, "#{self.class}#globbed_locations"
225
+ end
226
+
209
227
  ##
210
228
  # @param extension [String, StorageLocations::SAME]
211
229
  # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
@@ -43,7 +43,7 @@ module DerivativeRodeo
43
43
  # @param url [String]
44
44
  #
45
45
  # @return [String]
46
- def read(url)
46
+ def get(url)
47
47
  HTTParty.get(url, logger: config.logger)
48
48
  rescue => e
49
49
  config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
@@ -51,13 +51,11 @@ module DerivativeRodeo
51
51
  end
52
52
 
53
53
  ##
54
- # @param url [String]
55
- #
56
54
  # @return [URI] when the URL resolves successfully
57
55
  # @return [FalseClass] when the URL's head request is not successful or we've exhausted our
58
56
  # remaining redirects.
59
- def exists?(url)
60
- HTTParty.head(url, logger: config.logger)
57
+ def exist?
58
+ HTTParty.head(file_uri, logger: config.logger)
61
59
  rescue => e
62
60
  config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
63
61
  false
@@ -34,6 +34,10 @@ module DerivativeRodeo
34
34
  FileUtils.cp_r(tmp_file_path, file_path)
35
35
  file_uri
36
36
  end
37
+
38
+ def globbed_tail_locations(tail_glob:)
39
+ Dir.glob(File.join(file_dir, tail_glob))
40
+ end
37
41
  end
38
42
  end
39
43
  end
@@ -8,7 +8,15 @@ module DerivativeRodeo
8
8
  # Location to download and upload files to S3
9
9
  #
10
10
  class S3Location < BaseLocation
11
- attr_writer :bucket
11
+ ##
12
+ # @!group Class Attributes
13
+ # @!attribute use_actual_s3_bucket
14
+ #
15
+ # When true, we are going to use a live S3 bucket. When false, we'll use a fake local bucket.
16
+ class_attribute :use_actual_s3_bucket, default: true
17
+ # @!endgroup Class Attributes
18
+ ##
19
+
12
20
  ##
13
21
  # Create a new uri of the classes type. Parts argument should have a default in
14
22
  # implementing classes. Must support a number or the symbol :all
@@ -24,10 +32,11 @@ module DerivativeRodeo
24
32
  end
25
33
 
26
34
  ##
27
- # @param config [DerivativeRodeo::Configuration]
35
+ # @param bucket_name [String, NilClass] when given, use this as the bucket; otherwise, default to config.aws_s3_bucket.
36
+ #
28
37
  # @return [String]
29
- def self.adapter_prefix(config: DerivativeRodeo.config)
30
- "#{scheme}://#{config.aws_s3_bucket}.s3.#{config.aws_s3_region}.amazonaws.com"
38
+ def self.adapter_prefix(bucket_name: config.aws_s3_bucket)
39
+ "#{scheme}://#{bucket_name}.s3.#{config.aws_s3_region}.amazonaws.com"
31
40
  end
32
41
 
33
42
  ##
@@ -53,6 +62,38 @@ module DerivativeRodeo
53
62
  bucket.objects(prefix: file_path).count.positive?
54
63
  end
55
64
 
65
+ ##
66
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
+ #
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
+ # use the components of the file_path to fake that behavior.
70
+ #
71
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
72
+ def globbed_tail_locations(tail_glob:)
73
+ # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
+ #
75
+ # NOTE: Should we be storing our files as such? The pattern we need is
76
+ # :parent_identifier/:file_set_identifier/files. There are probably cases where a work has
77
+ # more than one PDF (that we intend to split); we don't want to trample on those split files
78
+ # and miscolate two PDFs.
79
+ #
80
+ # file_path = "s3://blah/1234/hello-world/hello-world.pdf"
81
+ globname = File.join(file_dir, tail_glob)
82
+ regexp = %r{#{File.extname(globname)}$}
83
+
84
+ # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
+ # based on both the key of the item in the bucket as well as the bucket's host.
86
+ uri = URI.parse(file_uri)
87
+ scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
+
89
+ bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
+ if object.key.match(regexp)
91
+ template = File.join(scheme_and_host, object.key)
92
+ derived_file_from(template: template)
93
+ end
94
+ end
95
+ end
96
+
56
97
  ##
57
98
  # @api public
58
99
  # write the tmp file to the file_uri
@@ -71,6 +112,9 @@ module DerivativeRodeo
71
112
  #
72
113
  # @return [Aws::S3::Resource]
73
114
  def resource
115
+ # TODO: Are there instantiation considerations when running in Lambda? In tests
116
+ # initializing a resource is very slow (e.g. 3 seconds or so). Should this be a class
117
+ # method? Can it be given the SpaceStone constraints?
74
118
  @resource ||= if DerivativeRodeo.config.aws_s3_access_key_id
75
119
  Aws::S3::Resource.new(region: DerivativeRodeo.config.aws_s3_region,
76
120
  credentials: Aws::Credentials.new(
@@ -91,13 +135,28 @@ module DerivativeRodeo
91
135
  raise Errors::BucketMissingError
92
136
  end
93
137
 
138
+ # @see .use_actual_s3_bucket
94
139
  def bucket
95
- @bucket ||= resource.bucket(bucket_name)
140
+ @bucket ||= if use_actual_s3_bucket?
141
+ resource.bucket(bucket_name)
142
+ else
143
+ self.class.faux_bucket
144
+ end
96
145
  end
97
146
 
98
147
  def file_path
99
148
  @file_path ||= @file_uri.sub(%r{.+://.+?/}, '')
100
149
  end
150
+
151
+ ##
152
+ # A constructed fake bucket that conforms to the narrow S3 interface that we use.
153
+ #
154
+ # @see .use_actual_s3_bucket
155
+ # @return [AwsS3FauxBucket]
156
+ def self.faux_bucket
157
+ # We are not requiring this file; except in the spec context.
158
+ @faux_bucket ||= AwsS3FauxBucket.new
159
+ end
101
160
  end
102
161
  end
103
162
  end
@@ -14,9 +14,20 @@ module DerivativeRodeo
14
14
  # It uploads a file_uri to the queue, not the contents of that file
15
15
  # reading from the queue is not currently implemented
16
16
  class SqsLocation < BaseLocation
17
+ ##
18
+ # @!group Class Attributes
19
+ #
20
+ # @!attribute batch_size
21
+ # @return [Integer]
17
22
  class_attribute :batch_size, default: 10
18
23
 
19
- attr_writer :client
24
+ # @!attribute use_real_sqs
25
+ # When true, use the real SQS; else when false use a fake one. You probably don't want to
26
+ # use the fake one in your production. But it's exposed in this manner to ease testing of
27
+ # downstream dependencies.
28
+ class_attribute :use_real_sqs, default: true
29
+ # @!endgroup Class Attributes
30
+ ##
20
31
 
21
32
  ##
22
33
  # Create a new uri of the classes type. Parts argument should have a default in
@@ -82,19 +93,26 @@ module DerivativeRodeo
82
93
  file_uri
83
94
  end
84
95
 
96
+ # rubocop:disable Metrics/MethodLength
85
97
  def client
86
- @client ||= if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
87
- Aws::SQS::Client.new(
88
- region: config.aws_sqs_region,
89
- credentials: Aws::Credentials.new(
90
- config.aws_sqs_access_key_id,
91
- config.aws_sqs_secret_access_key
98
+ @client ||= if use_real_sqs?
99
+ if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
100
+ Aws::SQS::Client.new(
101
+ region: config.aws_sqs_region,
102
+ credentials: Aws::Credentials.new(
103
+ config.aws_sqs_access_key_id,
104
+ config.aws_sqs_secret_access_key
105
+ )
92
106
  )
93
- )
107
+ else
108
+ Aws::SQS::Client.new(region: config.aws_sqs_region)
109
+ end
94
110
  else
95
- Aws::SQS::Client.new(region: config.aws_sqs_region)
111
+ # We are not requiring this file; except in the spec context.
112
+ AwsSqsFauxClient.new
96
113
  end
97
114
  end
115
+ # rubocop:enable Metrics/MethodLength
98
116
 
99
117
  def add(message:)
100
118
  client.send_message({
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
@@ -33,4 +33,8 @@ module DerivativeRodeo
33
33
  yield(@config) if block_given?
34
34
  @config
35
35
  end
36
+
37
+ class << self
38
+ delegate :logger, to: :config
39
+ end
36
40
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ ##
4
+ # This class is a very rudimentary implementation of a bucket. It conforms to the necessary
5
+ # interface for downloading, uploading, and filtering on prefix.
6
+ #
7
+ # It is provided in lib/spec_support so that downstream implementations can leverage a fake S3
8
+ # bucket.
9
+ #
10
+ # @see [DerivativeRodeo::StorageLocations::S3Location]
11
+ class AwsS3FauxBucket
12
+ def initialize
13
+ @storage = {}
14
+ end
15
+ attr_reader :storage
16
+ def object(path)
17
+ # Yup, we've got nested buckets
18
+ @storage[path] ||= Storage.new(key: path)
19
+ end
20
+
21
+ def objects(prefix:)
22
+ @storage.each_with_object([]) do |(path, obj), accumulator|
23
+ accumulator << obj if path.start_with?(prefix)
24
+ end
25
+ end
26
+
27
+ class Storage
28
+ # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
29
+ # key.
30
+ def initialize(key:)
31
+ @key = key
32
+ @storage = {}
33
+ end
34
+ attr_reader :storage, :key
35
+
36
+ def upload_file(path)
37
+ @storage[:upload] = File.read(path)
38
+ end
39
+
40
+ def download_file(path)
41
+ return false unless @storage.key?(:upload)
42
+ content = @storage.fetch(:upload)
43
+ File.open(path, 'wb') do |f|
44
+ f.puts(content)
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+ require 'ostruct'
3
+ ##
4
+ # This class is a very rudimentary implementation of an SQS client. It conforms to the necessary
5
+ # interface for sending messages and reading messages
6
+ #
7
+ # @see [DerivativeRodeo::StorageLocations::SqsLocation]
8
+ class AwsSqsFauxClient
9
+ def initialize(queue_url: nil)
10
+ @queue_url = queue_url || 'https://sqs.us-west-2.amazonaws.com/5555555555/fake'
11
+ @storage = {}
12
+ end
13
+ attr_reader :storage
14
+
15
+ def send_message(arg_hash)
16
+ @storage[arg_hash[:queue_url]] ||= []
17
+ @storage[arg_hash[:queue_url]] << arg_hash[:message_body]
18
+ end
19
+
20
+ def send_message_batch(arg_hash)
21
+ @storage[arg_hash[:queue_url]] ||= []
22
+ @storage[arg_hash[:queue_url]] += arg_hash[:entries]
23
+ end
24
+
25
+ def receive_message(arg_hash)
26
+ output = []
27
+ args_hash[:mx_number_of_messages].times do
28
+ value = @storage[arg_hash[:queue_url]]&.pop
29
+ output << value if value
30
+ end
31
+ end
32
+
33
+ def get_queue_url(*)
34
+ OpenStruct.new(queue_url: @queue_url)
35
+ end
36
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-25 00:00:00.000000000 Z
12
+ date: 2023-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -281,12 +281,14 @@ files:
281
281
  - lib/derivative_rodeo.rb
282
282
  - lib/derivative_rodeo/configuration.rb
283
283
  - lib/derivative_rodeo/errors.rb
284
+ - lib/derivative_rodeo/generators/alto_generator.rb
284
285
  - lib/derivative_rodeo/generators/base_generator.rb
285
286
  - lib/derivative_rodeo/generators/concerns/copy_file_concern.rb
286
287
  - lib/derivative_rodeo/generators/copy_generator.rb
287
288
  - lib/derivative_rodeo/generators/hocr_generator.rb
288
289
  - lib/derivative_rodeo/generators/monochrome_generator.rb
289
290
  - lib/derivative_rodeo/generators/pdf_split_generator.rb
291
+ - lib/derivative_rodeo/generators/plain_text_generator.rb
290
292
  - lib/derivative_rodeo/generators/thumbnail_generator.rb
291
293
  - lib/derivative_rodeo/generators/word_coordinates_generator.rb
292
294
  - lib/derivative_rodeo/services/base_service.rb
@@ -295,6 +297,7 @@ files:
295
297
  - lib/derivative_rodeo/services/image_identify_service.rb
296
298
  - lib/derivative_rodeo/services/image_jp2_service.rb
297
299
  - lib/derivative_rodeo/services/image_service.rb
300
+ - lib/derivative_rodeo/services/mime_type_service.rb
298
301
  - lib/derivative_rodeo/services/pdf_splitter/base.rb
299
302
  - lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb
300
303
  - lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb
@@ -311,6 +314,8 @@ files:
311
314
  - lib/derivative_rodeo/storage_locations/sqs_location.rb
312
315
  - lib/derivative_rodeo/technical_metadata.rb
313
316
  - lib/derivative_rodeo/version.rb
317
+ - lib/spec_support/aws_s3_faux_bucket.rb
318
+ - lib/spec_support/aws_sqs_faux_client.rb
314
319
  homepage: https://github.com/scientist-softserv/derivative_rodeo
315
320
  licenses:
316
321
  - APACHE-2.0