derivative-rodeo 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 20e512d7162170875d60f90ff48bba694237ac7d31f38d806d2b87f570536c1c
4
- data.tar.gz: a1311ea39a3994b4d24ffdbdbade62b7fbb15d2c326a3b510607f315fb4dd865
3
+ metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
+ data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
5
5
  SHA512:
6
- metadata.gz: 157a9e276c6cefe739137fbe17e783557d0317dcee531cd353ad86987bda33ad55c2ada8179e254b306a467d01f8f759a6e89fc91b8cbcf6e968cf5a28a9037b
7
- data.tar.gz: 5bd45db467194cf1e8af7f7e1ed625c2b3898d011f20a581a9a55a2ccbb7be56ca8c276752b7bef41e2c1d5efa4737d1291f83211dd69cd18e4f7caeed25fef2
6
+ metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
+ data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
4
+
5
+ module DerivativeRodeo
6
+ module Generators
7
+ ##
8
+ # Generate the Alto XML from the given input_uris.
9
+ #
10
+ # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
11
+ class AltoGenerator < BaseGenerator
12
+ self.output_extension = "alto.xml"
13
+
14
+ class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
15
+
16
+ ##
17
+ # @param output_location [StorageLocations::BaseLocation]
18
+ # @param input_tmp_file_path [String] the location of the file that we can use for processing.
19
+ #
20
+ # @return [StorageLocations::BaseLocation]
21
+ #
22
+ # @see #requisite_files
23
+ def build_step(output_location:, input_tmp_file_path:, **)
24
+ output_location.with_new_tmp_path do |output_tmp_file_path|
25
+ convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_alto: output_tmp_file_path)
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ ##
32
+ # @param path_to_hocr [String]
33
+ # @param path_to_alto [String]
34
+ def convert_to_coordinates(path_to_hocr:, path_to_alto:)
35
+ hocr_html = File.read(path_to_hocr)
36
+ File.open(path_to_alto, "w+") do |file|
37
+ file.puts service.call(hocr_html).to_alto
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -27,7 +27,6 @@ module DerivativeRodeo
27
27
  # @!endgroup Class Attributes
28
28
 
29
29
  attr_reader :input_uris,
30
- :logger,
31
30
  :output_location_template,
32
31
  :preprocessed_location_template
33
32
 
@@ -39,23 +38,25 @@ module DerivativeRodeo
39
38
  # to find preprocessed uris by transforming the :input_uris via
40
39
  # {Services::ConvertUriViaTemplateService} with the given
41
40
  # :preprocessed_location_template.
42
- # @param logger [Logger]
43
- def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil, logger: DerivativeRodeo.config.logger)
41
+ def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
+ # NOTE: Are we using this preprocessed_location_template? Wondering?
44
43
  @input_uris = Array.wrap(input_uris)
45
44
  @output_location_template = output_location_template
46
45
  @preprocessed_location_template = preprocessed_location_template
47
- @logger = logger
48
46
 
49
47
  return if valid_instantiation?
50
48
 
51
49
  raise Errors::ExtensionMissingError.new(klass: self.class)
52
50
  end
53
51
 
52
+ delegate :logger, to: DerivativeRodeo
53
+
54
54
  ##
55
55
  # @api private
56
56
  #
57
57
  # @return [Boolean]
58
58
  def valid_instantiation?
59
+ # TODO: Does this even make sense.
59
60
  # When we have a BaseGenerator and not one of its children or when we've assigned the
60
61
  # output_extension. instance_of? is more specific than is_a?
61
62
  instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
@@ -83,6 +84,7 @@ module DerivativeRodeo
83
84
  # @see #build_step
84
85
  # @see #with_each_requisite_location_and_tmp_file_path
85
86
  def generated_files
87
+ # TODO: Examples please
86
88
  return @generated_files if defined?(@generated_files)
87
89
 
88
90
  # As much as I would like to use map or returned values; given the implementations it's
@@ -92,6 +94,9 @@ module DerivativeRodeo
92
94
  # helps ease subclass implementations of the #with_each_requisite_location_and_tmp_file_path or
93
95
  # #build_step
94
96
  @generated_files = []
97
+
98
+ # BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
99
+ # "file:///Users/jfriesen/.profile"
95
100
  with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
96
101
  generated_file = destination(input_location)
97
102
  @generated_files << if generated_file.exist?
@@ -170,7 +175,7 @@ module DerivativeRodeo
170
175
  return output_location unless preprocessed_location_template
171
176
 
172
177
  preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
173
- # We only want
178
+ # We only want the location if it exists
174
179
  return preprocessed_location if preprocessed_location&.exist?
175
180
 
176
181
  # NOTE: The file does not exist at the output_location; but we pass this information along so
@@ -17,19 +17,66 @@ module DerivativeRodeo
17
17
  include CopyFileConcern
18
18
 
19
19
  ##
20
- # @param name [#to_s] Convert the given name into the resulting {Services::PdfSplitter::Base}.
20
+ # A helper method for downstream implementations to ask if this file is perhaps split from a
21
+ # PDF.
21
22
  #
22
- # @return [#call, Services::PdfSplitter::Base]
23
- def pdf_splitter(name: pdf_splitter_name)
24
- @pdf_splitter ||= Services::PdfSplitter.for(name)
23
+ # @param filename [String]
24
+ # @param extension [String] the extension (either with or without the leading period); if none
25
+ # is provided use the extension of the given :filename.
26
+ # @return [TrueClass] when the file name likely represents a file split from a PDF.
27
+ # @return [FalseClass] when the file name does not, by convention, represent a file split from
28
+ # a PDF.
29
+ #
30
+ # @see #image_file_basename_template
31
+ def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
32
+ extension ||= File.extname(filename)
33
+
34
+ # Strip the leading period from the extension.
35
+ extension = extension[1..-1] if extension.start_with?('.')
36
+ regexp = %r{--page-\d+\.#{extension}$}
37
+ !!regexp.match(filename)
25
38
  end
26
39
 
27
40
  ##
28
- # @return [Symbol]
41
+ # @param basename [String] The given PDF file's base name (e.g. "hello.pdf" would have a base name of
42
+ # "hello").
43
+ #
44
+ # @return [String] A template for the filenames of the images produced by Ghostscript.
45
+ #
46
+ # @note This must include "%d" in the returning value, as that is how Ghostscript will assign
47
+ # the page number.
48
+ #
49
+ # @note I have extracted this function to make it abundantly clear the expected location
50
+ # of each split image. Further, this template interacts with the tail_glob built in {#existing_page_locations}.
29
51
  #
30
- # @see .output_extension
31
- def pdf_splitter_name
32
- output_extension.to_s.split(".").last.to_sym
52
+ # @see #existing_page_locations
53
+ # @see .filename_for_a_derived_page_from_a_pdf?
54
+ def image_file_basename_template(basename:)
55
+ "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
56
+ end
57
+
58
+ ##
59
+ # We want to check the output location and pre-processed location for the existence of already
60
+ # split pages. This method checks both places.
61
+ #
62
+ # @param input_location [StorageLocations::BaseLocation]
63
+ #
64
+ # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
+ # with :tail_glob.
66
+ #
67
+ # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
+ #
69
+ # @note The tail_glob is in relation to the {#image_file_basename_template}
70
+ def existing_page_locations(input_location:)
71
+ # See image_file_basename_template
72
+ tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
73
+
74
+ output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
75
+ return output_locations if output_locations.count.positive?
76
+
77
+ return [] if preprocessed_location_template.blank?
78
+
79
+ input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
33
80
  end
34
81
 
35
82
  ##
@@ -44,18 +91,35 @@ module DerivativeRodeo
44
91
  # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
45
92
  # @yieldparam image_path [String] where to find this file in the tmp space
46
93
  #
94
+ # @note This function makes a concession; namely that if it encounters any
95
+ # {#existing_page_locations} it will use all of that result as the entire number of pages.
96
+ # We could make this smarter but at the moment we're deferring on that.
97
+ #
47
98
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
99
+ #
100
+ # rubocop:disable Metrics/MethodLength
48
101
  def with_each_requisite_location_and_tmp_file_path
49
102
  input_files.each do |input_location|
50
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
51
- image_paths = pdf_splitter.call(input_tmp_file_path, baseid: input_location.file_basename, tmpdir: File.dirname(input_tmp_file_path))
52
- image_paths.each do |image_path|
104
+ ## We want a single call for a directory listing of the image_file_basename_template
105
+ generated_files = existing_page_locations(input_location: input_location)
106
+
107
+ if generated_files.count.zero?
108
+ generated_files = Services::PdfSplitter.call(
109
+ input_tmp_file_path,
110
+ image_extension: output_extension,
111
+ image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
+ )
113
+ end
114
+
115
+ generated_files.each do |image_path|
53
116
  image_location = StorageLocations::FileLocation.new("file://#{image_path}")
54
117
  yield(image_location, image_path)
55
118
  end
56
119
  end
57
120
  end
58
121
  end
122
+ # rubocop:enable Metrics/MethodLength
59
123
  end
60
124
  end
61
125
  end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
4
+
5
+ module DerivativeRodeo
6
+ module Generators
7
+ ##
8
+ # Generate the plain text from the given input_uris.
9
+ #
10
+ # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
11
+ class PlainTextGenerator < BaseGenerator
12
+ self.output_extension = "plain_text.txt"
13
+
14
+ class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
15
+
16
+ ##
17
+ # @param output_location [StorageLocations::BaseLocation]
18
+ # @param input_tmp_file_path [String] the location of the file that we can use for processing.
19
+ #
20
+ # @return [StorageLocations::BaseLocation]
21
+ #
22
+ # @see #requisite_files
23
+ def build_step(output_location:, input_tmp_file_path:, **)
24
+ output_location.with_new_tmp_path do |output_tmp_file_path|
25
+ convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_plain_text: output_tmp_file_path)
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ ##
32
+ # @param path_to_hocr [String]
33
+ # @param path_to_plain_text [String]
34
+ def convert_to_coordinates(path_to_hocr:, path_to_plain_text:)
35
+ hocr_html = File.read(path_to_hocr)
36
+ File.open(path_to_plain_text, "w+") do |file|
37
+ file.puts service.call(hocr_html).to_text
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -6,12 +6,33 @@ module DerivativeRodeo
6
6
  # This generator is responsible for converting a given binary into a thumbnail. As of
7
7
  # <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
8
8
  class ThumbnailGenerator < BaseGenerator
9
+ ##
10
+ # @!group Class Attributes
11
+
9
12
  ##
10
13
  # We want to mirror the same file "last" extension as described in Hyrax.
11
14
  #
12
15
  # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
13
16
  self.output_extension = 'thumbnail.jpeg'
14
17
 
18
+ ##
19
+ # @!attribute dimensions_by_type
20
+ #
21
+ # @return [Hash<Symbol,String>] the "types" (as categorized by
22
+ # Hyrax::FileSetDerivativeService). These aren't mime-types per se but a conceptual
23
+ # distillation of that.
24
+ #
25
+ # @see https://github.com/samvera/hyrax/blob/815e0abaacf9f331a5640c5d6129661d01eadf75/app/services/hyrax/file_set_derivatives_service.rb
26
+ class_attribute :dimensions_by_type, default: { pdf: "338x493" }
27
+
28
+ ##
29
+ # @!attribute dimensions_fallback
30
+ #
31
+ # @return [String] when there's no match for {.dimensions_by_type} use this value.
32
+ class_attribute :dimensions_fallback, default: "200x150>"
33
+ # @!endgroup Class Attributes
34
+ ##
35
+
15
36
  ##
16
37
  # @param output_location [StorageLocations::BaseLocation]
17
38
  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
@@ -23,6 +44,20 @@ module DerivativeRodeo
23
44
  end
24
45
  end
25
46
 
47
+ ##
48
+ # @param filename [String]
49
+ # @return [String]
50
+ #
51
+ # @see .dimensions_by_type
52
+ # @see .dimensions_fallback
53
+ def self.dimensions_for(filename:)
54
+ type = DerivativeRodeo::Services::MimeTypeService.hyrax_type(filename: filename)
55
+ dimensions_by_type.fetch(type, dimensions_fallback)
56
+ end
57
+
58
+ # Want to expose the dimensions_for as an instance method
59
+ delegate :dimensions_for, to: :class
60
+
26
61
  ##
27
62
  # Convert the file found at :path_to_input into a thumbnail, writing it to the
28
63
  # :path_for_thumbnail_output
@@ -30,8 +65,8 @@ module DerivativeRodeo
30
65
  # @param path_of_file_to_create_thumbnail_from [String]
31
66
  # @param path_for_thumbnail_output [String]
32
67
  def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
33
- # @todo the dimensions might not be always 200x150, figure out a way to make it dynamic
34
- `convert #{path_of_file_to_create_thumbnail_from} -thumbnail '200x150>' -flatten #{path_for_thumbnail_output}`
68
+ dimensions = dimensions_for(filename: path_of_file_to_create_thumbnail_from)
69
+ `convert #{path_of_file_to_create_thumbnail_from} -thumbnail '#{dimensions}' -flatten #{path_for_thumbnail_output}`
35
70
  end
36
71
  end
37
72
  end
@@ -31,7 +31,7 @@ module DerivativeRodeo
31
31
  def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
32
32
  hocr_html = File.read(path_to_hocr)
33
33
  File.open(path_to_coordinate, "w+") do |file|
34
- file.puts service.call(hocr_html)
34
+ file.puts service.call(hocr_html).to_json
35
35
  end
36
36
  end
37
37
  end
@@ -13,7 +13,7 @@ module DerivativeRodeo
13
13
  # @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
14
14
  # @return [Object] an instance of this service that responds to #to_json, #to_text, and #to_alto
15
15
  def self.call(sgml)
16
- new(sgml).to_json
16
+ new(sgml)
17
17
  end
18
18
 
19
19
  ##
@@ -42,6 +42,21 @@ module DerivativeRodeo
42
42
  end
43
43
  alias json to_json
44
44
 
45
+ # Output plain text; named #to_text to keep the method names consistent with #to_json.
46
+ #
47
+ # @return [String] plain text of OCR'd document
48
+ def to_text
49
+ @to_text ||= doc_stream.text
50
+ end
51
+
52
+ def to_alto
53
+ @to_alto ||= AltoXml.to_alto(
54
+ words: doc_stream.words,
55
+ width: doc_stream.width,
56
+ height: doc_stream.height
57
+ )
58
+ end
59
+
45
60
  private
46
61
 
47
62
  def xml?(xml)
@@ -121,6 +136,7 @@ module DerivativeRodeo
121
136
  # add trailing space to plaintext buffer for between words:
122
137
  @text += ' '
123
138
  @words.push(@current) if word_complete?
139
+ @current = nil # clear the current word
124
140
  end
125
141
 
126
142
  def end_line
@@ -156,10 +172,13 @@ module DerivativeRodeo
156
172
  # Callback for element end; at this time, flush word coordinate state
157
173
  # for current word, and append line endings to plain text:
158
174
  #
159
- # @param _name [String] element name.
160
- def end_element(_name)
161
- end_line if @element_class_name == 'ocr_line'
162
- end_word if @element_class_name == 'ocrx_word'
175
+ # @param name [String] element name.
176
+ def end_element(name)
177
+ if name == 'span'
178
+ end_word if @element_class_name == 'ocrx_word'
179
+ @text += "\n" if @element_class_name.nil?
180
+ end
181
+ @element_class_name = nil
163
182
  end
164
183
 
165
184
  # Callback for completion of parsing hOCR, used to normalize generated
@@ -213,6 +232,102 @@ module DerivativeRodeo
213
232
  JSON.generate(payload)
214
233
  end
215
234
  end
235
+
236
+ class AltoXml
237
+ ##
238
+ # @api public
239
+ #
240
+ # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
241
+ # @param width [Integer, nil] the width of the "canvas" on which the words appear.
242
+ # @param height [Integer, nil] the height of the "canvas" on which the words appear.
243
+ #
244
+ # @return [String] the ALTO XML representation of the given words and their coordinates.
245
+ def self.to_alto(words:, width: nil, height: nil)
246
+ new(words: words, width: width, height: height).to_alto
247
+ end
248
+
249
+ def initialize(words:, width:, height:, scaling: 1.0)
250
+ @words = words
251
+ @height = height.to_i
252
+ @width = width.to_i
253
+ @scaling = scaling
254
+ end
255
+
256
+ attr_reader :words, :width, :height, :scaling
257
+
258
+ # Output ALTO XML of word coordinates
259
+ #
260
+ # @return [String] ALTO XML representation of the words and their coordinates
261
+ def to_alto
262
+ page = alto_page(width, height) do |xml|
263
+ words.each do |word|
264
+ xml.String(
265
+ CONTENT: word[:word],
266
+ WIDTH: scale_point(word[:coordinates][2]).to_s,
267
+ HEIGHT: scale_point(word[:coordinates][3]).to_s,
268
+ HPOS: scale_point(word[:coordinates][0]).to_s,
269
+ VPOS: scale_point(word[:coordinates][1]).to_s
270
+ ) { xml.text '' }
271
+ end
272
+ end
273
+ page.to_xml
274
+ end
275
+
276
+ private
277
+
278
+ # given block to manage word generation, wrap with page/block/line
279
+ def alto_page(pixel_width, pixel_height, &block)
280
+ builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
281
+ xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
282
+ xml.Description do
283
+ xml.MeasurementUnit 'pixel'
284
+ end
285
+ alto_layout(xml, pixel_width, pixel_height, &block)
286
+ end
287
+ end
288
+ builder
289
+ end
290
+
291
+ def scale_point(value)
292
+ # NOTE: presuming non-fractional, even though ALTO 2.1
293
+ # specifies coordinates are xsd:float, not xsd:int,
294
+ # simplify to integer value for output:
295
+ (value * scaling).to_i
296
+ end
297
+
298
+ # return layout for page
299
+ def alto_layout(xml, pixel_width, pixel_height, &block)
300
+ xml.Layout do
301
+ xml.Page(ID: 'ID1',
302
+ PHYSICAL_IMG_NR: '1',
303
+ HEIGHT: pixel_height,
304
+ WIDTH: pixel_width) do
305
+ xml.PrintSpace(HEIGHT: pixel_height,
306
+ WIDTH: pixel_width,
307
+ HPOS: '0',
308
+ VPOS: '0') do
309
+ alto_blockline(xml, pixel_width, pixel_height, &block)
310
+ end
311
+ end
312
+ end
313
+ end
314
+
315
+ # make block line and call word-block
316
+ def alto_blockline(xml, pixel_width, pixel_height)
317
+ xml.TextBlock(ID: 'ID1a',
318
+ HEIGHT: pixel_height,
319
+ WIDTH: pixel_width,
320
+ HPOS: '0',
321
+ VPOS: '0') do
322
+ xml.TextLine(HEIGHT: pixel_height,
323
+ WIDTH: pixel_width,
324
+ HPOS: '0',
325
+ VPOS: '0') do
326
+ yield(xml)
327
+ end
328
+ end
329
+ end
330
+ end
216
331
  end
217
332
  end
218
333
  end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+ require 'marcel'
3
+
4
+ module DerivativeRodeo
5
+ module Services
6
+ ##
7
+ # This module provides an interface for determining a mime-type.
8
+ module MimeTypeService
9
+ ##
10
+ # Hyrax has its own compression of mime_types into conceptual types (as defined in
11
+ # Hyrax::FileSetDerivativesService). This provides a somewhat conceptual overlap with that,
12
+ # while also being more generalized.
13
+ #
14
+ # @param filename [String]
15
+ # @return [Symbol]
16
+ def self.hyrax_type(filename:)
17
+ mime = mime_type(filename: filename)
18
+ media_type, sub_type = mime.split("/")
19
+ case media_type
20
+ when "image", "audio", "text", "video"
21
+ media_type.to_sym
22
+ when "application" # The wild woolly weird world of all the things.
23
+ # TODO: Do we need to worry about office documents?
24
+ sub_type.to_sym
25
+ else
26
+ sub_type.to_sym
27
+ end
28
+ end
29
+
30
+ ##
31
+ # Given a local :filename (e.g. downloaded and available on the server this is running),
32
+ # return the mime_type of the file.
33
+ #
34
+ # @param filename [String]
35
+ # @return [String] (e.g. "application/pdf", "text/plain")
36
+ def self.mime_type(filename:)
37
+ ##
38
+ # TODO: Does this attempt to read the whole file? That may create memory constraints. By
39
+ # using Pathname (instead of File.read), we're letting Marcel do its best mime magic.
40
+ pathname = Pathname.new(filename)
41
+ extension = filename.split(".")&.last&.downcase
42
+ if extension
43
+ # By including a possible extension, we can help nudge Marcel into making a more accurate guess.
44
+ # Without extension, we will get a lot of "application/octet-stream" results.
45
+ ::Marcel::MimeType.for(pathname, extension: extension)
46
+ else
47
+ ::Marcel::MimeType.for(pathname)
48
+ end
49
+ end
50
+ end
51
+ end
52
+ end
@@ -2,17 +2,32 @@
2
2
 
3
3
  require 'open3'
4
4
  require 'securerandom'
5
- require 'tmpdir'
6
-
7
5
  module DerivativeRodeo
8
6
  module Services
7
+ ##
8
+ # A service module for splitting PDFs into one image per page.
9
+ #
10
+ # @see .call
9
11
  module PdfSplitter
10
12
  ##
11
- # @param name [String]
12
- # @return [PdfSplitter::Base]
13
- def self.for(name)
14
- klass_name = "#{name.to_s.classify}_page".classify
15
- "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
13
+ # @api public
14
+ #
15
+ # Split the file found at the given :path
16
+ #
17
+ # @param path [String] the path to the source PDF that we're processing.
18
+ # @param image_extension [String] used to determine the splitting service we use; there is an
19
+ # implicit relationship between image_extension and image_file_basename_template
20
+ # (though filenames do not necessarily reflect mime types)
21
+ # @param image_file_basename_template [String] use this string to generate the unique filename
22
+ # for an image "split" from the given PDF. It must include "%d" as part of the
23
+ # declaration. For example if the template is "hello-world-%d.png" then the first
24
+ # split page will be "hello-world-1.png".
25
+ #
26
+ # @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each}
27
+ def self.call(path, image_extension:, image_file_basename_template:)
28
+ klass_name = "#{image_extension.to_s.classify}_page".classify
29
+ klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
30
+ klass.new(path, image_file_basename_template: image_file_basename_template)
16
31
  end
17
32
 
18
33
  ##
@@ -31,38 +46,23 @@ module DerivativeRodeo
31
46
 
32
47
  class_attribute :gsdevice, instance_accessor: false
33
48
  class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
34
- ##
35
- # @api public
36
- #
37
- # @param path [String] The path the the PDF
38
- #
39
- # @return [Enumerable, Utilities::PdfSplitter::Base]
40
- def self.call(path, baseid: SureRandom.uuid, tmpdir: Dir.mktmpdir)
41
- new(path, baseid: baseid, tmpdir: tmpdir)
42
- end
43
49
 
44
- ##
45
- # @param path [String] the path to the source PDF that we're processing.
46
- # @param baseid [String] used for creating a unique identifier
47
- # @param tmpdir [String] place to perform the "work" of splitting the PDF.
48
- # @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
49
- # extract this from the given path, but for testing purposes, you might want to
50
- # provide a specific summary.
51
- # @param logger [Logger, #error]
52
50
  def initialize(path,
53
- baseid: SecureRandom.uuid,
54
- # TODO: Do we need to provide the :tmpdir for the application?
55
- tmpdir: Dir.mktmpdir,
56
- pdf_pages_summary: PagesSummary.extract_from(path: path),
57
- logger: DerivativeRodeo.config.logger)
58
- @baseid = baseid
51
+ image_file_basename_template:,
52
+ pdf_pages_summary: PagesSummary.extract_from(path: path))
59
53
  @pdfpath = path
60
54
  @pdf_pages_summary = pdf_pages_summary
61
- @tmpdir = tmpdir
62
- @logger = logger
55
+ @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
56
+
57
+ # We need to ensure that this temporary directory exists so we can write the files to it.
58
+ # Fortunately, because this file space must be "local" tmp dir, we don't need to work
59
+ # through any of the location antics of {StorageLocations::BaseLocation}.
60
+ FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
63
61
  end
64
62
 
65
- attr_reader :logger
63
+ attr_reader :ghost_script_output_file_template
64
+
65
+ delegate :logger, to: DerivativeRodeo
66
66
 
67
67
  # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
68
68
  include Enumerable
@@ -80,8 +80,8 @@ module DerivativeRodeo
80
80
  !pdf_pages_summary.valid?
81
81
  end
82
82
 
83
- attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
84
- private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
83
+ attr_reader :pdf_pages_summary, :basename, :pdfpath
84
+ private :pdf_pages_summary, :basename, :pdfpath
85
85
 
86
86
  # @api private
87
87
  def gsdevice
@@ -99,16 +99,12 @@ module DerivativeRodeo
99
99
  @entries = Array.wrap(gsconvert)
100
100
  end
101
101
 
102
- def output_base
103
- @output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
104
- end
105
-
106
102
  def gsconvert
107
103
  # NOTE: you must call gsdevice before compression, as compression is
108
104
  # updated during the gsdevice call.
109
105
  file_names = []
110
106
 
111
- Open3.popen3(gsconvert_cmd(output_base)) do |_stdin, stdout, stderr, _wait_thr|
107
+ Open3.popen3(gsconvert_cmd(ghost_script_output_file_template)) do |_stdin, stdout, stderr, _wait_thr|
112
108
  err = stderr.read
113
109
  logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?
114
110
 
@@ -116,7 +112,7 @@ module DerivativeRodeo
116
112
  stdout.read.split("\n").each do |line|
117
113
  next unless line.start_with?('Page ')
118
114
 
119
- file_names << format(output_base, page_number)
115
+ file_names << format(ghost_script_output_file_template, page_number)
120
116
  page_number += 1
121
117
  end
122
118
  end
@@ -126,12 +122,12 @@ module DerivativeRodeo
126
122
 
127
123
  def create_file_name(line:, page_number:); end
128
124
 
129
- def gsconvert_cmd(output_base)
125
+ def gsconvert_cmd(ghost_script_output_file_template)
130
126
  @gsconvert_cmd ||= begin
131
127
  cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
132
128
  cmd += " -sCompression=#{compression}" if compression?
133
129
  cmd += " -dJPEGQ=#{quality}" if quality?
134
- cmd += " -sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
130
+ cmd += " -sOutputFile=#{ghost_script_output_file_template} -r#{ppi} -f #{pdfpath}"
135
131
  cmd
136
132
  end
137
133
  end
@@ -21,7 +21,7 @@ module DerivativeRodeo
21
21
  def self.read(url)
22
22
  HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
23
23
  rescue StandardError => e
24
- config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
24
+ DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
25
25
  raise e
26
26
  end
27
27
 
@@ -42,6 +42,8 @@ module DerivativeRodeo
42
42
 
43
43
  class << self
44
44
  alias scheme location_name
45
+
46
+ delegate :config, to: DerivativeRodeo
45
47
  end
46
48
 
47
49
  ##
@@ -206,6 +208,22 @@ module DerivativeRodeo
206
208
  klass.build(from_uri: file_path, template: template)
207
209
  end
208
210
 
211
+ ##
212
+ # When you have a known location and want to check for files that are within that location,
213
+ # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
214
+ # need to know the path to all of the image files we "split" off of the given PDF.
215
+ #
216
+ # We can use the :file_path as the prefix and the given :tail_glob as the suffix for a "fully
217
+ # qualified" Dir.glob type search.
218
+ #
219
+ # @param tail_glob [String]
220
+ #
221
+ # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
+ # array when there are none.
223
+ def globbed_tail_locations(tail_glob:)
224
+ raise NotImplementedError, "#{self.class}#globbed_locations"
225
+ end
226
+
209
227
  ##
210
228
  # @param extension [String, StorageLocations::SAME]
211
229
  # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
@@ -43,7 +43,7 @@ module DerivativeRodeo
43
43
  # @param url [String]
44
44
  #
45
45
  # @return [String]
46
- def read(url)
46
+ def get(url)
47
47
  HTTParty.get(url, logger: config.logger)
48
48
  rescue => e
49
49
  config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
@@ -51,13 +51,11 @@ module DerivativeRodeo
51
51
  end
52
52
 
53
53
  ##
54
- # @param url [String]
55
- #
56
54
  # @return [URI] when the URL resolves successfully
57
55
  # @return [FalseClass] when the URL's head request is not successful or we've exhausted our
58
56
  # remaining redirects.
59
- def exists?(url)
60
- HTTParty.head(url, logger: config.logger)
57
+ def exist?
58
+ HTTParty.head(file_uri, logger: config.logger)
61
59
  rescue => e
62
60
  config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
63
61
  false
@@ -34,6 +34,10 @@ module DerivativeRodeo
34
34
  FileUtils.cp_r(tmp_file_path, file_path)
35
35
  file_uri
36
36
  end
37
+
38
+ def globbed_tail_locations(tail_glob:)
39
+ Dir.glob(File.join(file_dir, tail_glob))
40
+ end
37
41
  end
38
42
  end
39
43
  end
@@ -8,7 +8,15 @@ module DerivativeRodeo
8
8
  # Location to download and upload files to S3
9
9
  #
10
10
  class S3Location < BaseLocation
11
- attr_writer :bucket
11
+ ##
12
+ # @!group Class Attributes
13
+ # @!attribute use_actual_s3_bucket
14
+ #
15
+ # When true, we are going to use a live S3 bucket. When false, we'll use a fake local bucket.
16
+ class_attribute :use_actual_s3_bucket, default: true
17
+ # @!endgroup Class Attributes
18
+ ##
19
+
12
20
  ##
13
21
  # Create a new uri of the classes type. Parts argument should have a default in
14
22
  # implementing classes. Must support a number or the symbol :all
@@ -24,10 +32,11 @@ module DerivativeRodeo
24
32
  end
25
33
 
26
34
  ##
27
- # @param config [DerivativeRodeo::Configuration]
35
+ # @param bucket_name [String, NilClass] when given, use this as the bucket; otherwise, default to the configured aws_s3_bucket.
36
+ #
28
37
  # @return [String]
29
- def self.adapter_prefix(config: DerivativeRodeo.config)
30
- "#{scheme}://#{config.aws_s3_bucket}.s3.#{config.aws_s3_region}.amazonaws.com"
38
+ def self.adapter_prefix(bucket_name: config.aws_s3_bucket)
39
+ "#{scheme}://#{bucket_name}.s3.#{config.aws_s3_region}.amazonaws.com"
31
40
  end
32
41
 
33
42
  ##
@@ -53,6 +62,38 @@ module DerivativeRodeo
53
62
  bucket.objects(prefix: file_path).count.positive?
54
63
  end
55
64
 
65
+ ##
66
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
+ #
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
+ # use the components of the file_path to fake that behavior.
70
+ #
71
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
72
+ def globbed_tail_locations(tail_glob:)
73
+ # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
+ #
75
+ # NOTE: Should we be storing our files as such? The pattern we need is
76
+ # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
77
+ # more than one PDF (that we intend to split); we don't want to trample on those split files
78
+ # and miscollate two PDFs.
79
+ #
80
+ # file_path = "s3://blah/1234/hello-world/hello-world.pdf"
81
+ globname = File.join(file_dir, tail_glob)
82
+ regexp = %r{#{File.extname(globname)}$}
83
+
84
+ # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
+ # based on both the key of the item in the bucket as well as the bucket's host.
86
+ uri = URI.parse(file_uri)
87
+ scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
+
89
+ bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
+ if object.key.match(regexp)
91
+ template = File.join(scheme_and_host, object.key)
92
+ derived_file_from(template: template)
93
+ end
94
+ end
95
+ end
96
+
56
97
  ##
57
98
  # @api public
58
99
  # write the tmp file to the file_uri
@@ -71,6 +112,9 @@ module DerivativeRodeo
71
112
  #
72
113
  # @return [Aws::S3::Resource]
73
114
  def resource
115
+ # TODO: Are there instantiation considerations when running in Lambda? In tests
116
+ # initializing a resource is very slow (e.g. 3 seconds or so). Should this be a class
117
+ # method? Can it be given the SpaceStone constraints?
74
118
  @resource ||= if DerivativeRodeo.config.aws_s3_access_key_id
75
119
  Aws::S3::Resource.new(region: DerivativeRodeo.config.aws_s3_region,
76
120
  credentials: Aws::Credentials.new(
@@ -91,13 +135,28 @@ module DerivativeRodeo
91
135
  raise Errors::BucketMissingError
92
136
  end
93
137
 
138
+ # @see .use_actual_s3_bucket
94
139
  def bucket
95
- @bucket ||= resource.bucket(bucket_name)
140
+ @bucket ||= if use_actual_s3_bucket?
141
+ resource.bucket(bucket_name)
142
+ else
143
+ self.class.faux_bucket
144
+ end
96
145
  end
97
146
 
98
147
  def file_path
99
148
  @file_path ||= @file_uri.sub(%r{.+://.+?/}, '')
100
149
  end
150
+
151
+ ##
152
+ # A constructed fake bucket that conforms to the narrow S3 interface that we use.
153
+ #
154
+ # @see .use_actual_s3_bucket
155
+ # @return [AwsS3FauxBucket]
156
+ def self.faux_bucket
157
+ # We are not requiring this file; except in the spec context.
158
+ @faux_bucket ||= AwsS3FauxBucket.new
159
+ end
101
160
  end
102
161
  end
103
162
  end
@@ -14,9 +14,20 @@ module DerivativeRodeo
14
14
  # It uploads a file_uri to the queue, not the contents of that file
15
15
  # reading from the queue is not currently implemented
16
16
  class SqsLocation < BaseLocation
17
+ ##
18
+ # @!group Class Attributes
19
+ #
20
+ # @!attribute batch_size
21
+ # @return [Integer]
17
22
  class_attribute :batch_size, default: 10
18
23
 
19
- attr_writer :client
24
+ # @!attribute use_real_sqs
25
+ # When true, use the real SQS; else when false use a fake one. You probably don't want to
26
+ # use the fake one in your production. But it's exposed in this manner to ease testing of
27
+ # downstream dependencies.
28
+ class_attribute :use_real_sqs, default: true
29
+ # @!endgroup Class Attributes
30
+ ##
20
31
 
21
32
  ##
22
33
  # Create a new uri of the classes type. Parts argument should have a default in
@@ -82,19 +93,26 @@ module DerivativeRodeo
82
93
  file_uri
83
94
  end
84
95
 
96
+ # rubocop:disable Metrics/MethodLength
85
97
  def client
86
- @client ||= if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
87
- Aws::SQS::Client.new(
88
- region: config.aws_sqs_region,
89
- credentials: Aws::Credentials.new(
90
- config.aws_sqs_access_key_id,
91
- config.aws_sqs_secret_access_key
98
+ @client ||= if use_real_sqs?
99
+ if config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
100
+ Aws::SQS::Client.new(
101
+ region: config.aws_sqs_region,
102
+ credentials: Aws::Credentials.new(
103
+ config.aws_sqs_access_key_id,
104
+ config.aws_sqs_secret_access_key
105
+ )
92
106
  )
93
- )
107
+ else
108
+ Aws::SQS::Client.new(region: config.aws_sqs_region)
109
+ end
94
110
  else
95
- Aws::SQS::Client.new(region: config.aws_sqs_region)
111
+ # We are not requiring this file; except in the spec context.
112
+ AwsSqsFauxClient.new
96
113
  end
97
114
  end
115
+ # rubocop:enable Metrics/MethodLength
98
116
 
99
117
  def add(message:)
100
118
  client.send_message({
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
@@ -33,4 +33,8 @@ module DerivativeRodeo
33
33
  yield(@config) if block_given?
34
34
  @config
35
35
  end
36
+
37
+ class << self
38
+ delegate :logger, to: :config
39
+ end
36
40
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ ##
4
+ # This class is a very rudimentary implementation of a bucket. It conforms to the necessary
5
+ # interface for downloading, uploading, and filtering on prefix.
6
+ #
7
+ # It is provided under lib/spec_support so that downstream implementations can leverage a fake S3
8
+ # bucket.
9
+ #
10
+ # @see [DerivativeRodeo::StorageLocations::S3Location]
11
+ class AwsS3FauxBucket
12
+ def initialize
13
+ @storage = {}
14
+ end
15
+ attr_reader :storage
16
+ def object(path)
17
+ # Yup, we've got nested buckets
18
+ @storage[path] ||= Storage.new(key: path)
19
+ end
20
+
21
+ def objects(prefix:)
22
+ @storage.each_with_object([]) do |(path, obj), accumulator|
23
+ accumulator << obj if path.start_with?(prefix)
24
+ end
25
+ end
26
+
27
+ class Storage
28
+ # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
29
+ # key.
30
+ def initialize(key:)
31
+ @key = key
32
+ @storage = {}
33
+ end
34
+ attr_reader :storage, :key
35
+
36
+ def upload_file(path)
37
+ @storage[:upload] = File.read(path)
38
+ end
39
+
40
+ def download_file(path)
41
+ return false unless @storage.key?(:upload)
42
+ content = @storage.fetch(:upload)
43
+ File.open(path, 'wb') do |f|
44
+ f.puts(content)
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+ require 'ostruct'
3
+ ##
4
+ # This class is a very rudimentary implementation of an SQS client. It conforms to the necessary
5
+ # interface for sending messages and reading messages
6
+ #
7
+ # @see [DerivativeRodeo::StorageLocations::SqsLocation]
8
+ class AwsSqsFauxClient
9
+ def initialize(queue_url: nil)
10
+ @queue_url = queue_url || 'https://sqs.us-west-2.amazonaws.com/5555555555/fake'
11
+ @storage = {}
12
+ end
13
+ attr_reader :storage
14
+
15
+ def send_message(arg_hash)
16
+ @storage[arg_hash[:queue_url]] ||= []
17
+ @storage[arg_hash[:queue_url]] << arg_hash[:message_body]
18
+ end
19
+
20
+ def send_message_batch(arg_hash)
21
+ @storage[arg_hash[:queue_url]] ||= []
22
+ @storage[arg_hash[:queue_url]] += arg_hash[:entries]
23
+ end
24
+
25
+ def receive_message(arg_hash)
26
+ output = []
27
+ args_hash[:mx_number_of_messages].times do
28
+ value = @storage[arg_hash[:queue_url]]&.pop
29
+ output << value if value
30
+ end
31
+ end
32
+
33
+ def get_queue_url(*)
34
+ OpenStruct.new(queue_url: @queue_url)
35
+ end
36
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-25 00:00:00.000000000 Z
12
+ date: 2023-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -281,12 +281,14 @@ files:
281
281
  - lib/derivative_rodeo.rb
282
282
  - lib/derivative_rodeo/configuration.rb
283
283
  - lib/derivative_rodeo/errors.rb
284
+ - lib/derivative_rodeo/generators/alto_generator.rb
284
285
  - lib/derivative_rodeo/generators/base_generator.rb
285
286
  - lib/derivative_rodeo/generators/concerns/copy_file_concern.rb
286
287
  - lib/derivative_rodeo/generators/copy_generator.rb
287
288
  - lib/derivative_rodeo/generators/hocr_generator.rb
288
289
  - lib/derivative_rodeo/generators/monochrome_generator.rb
289
290
  - lib/derivative_rodeo/generators/pdf_split_generator.rb
291
+ - lib/derivative_rodeo/generators/plain_text_generator.rb
290
292
  - lib/derivative_rodeo/generators/thumbnail_generator.rb
291
293
  - lib/derivative_rodeo/generators/word_coordinates_generator.rb
292
294
  - lib/derivative_rodeo/services/base_service.rb
@@ -295,6 +297,7 @@ files:
295
297
  - lib/derivative_rodeo/services/image_identify_service.rb
296
298
  - lib/derivative_rodeo/services/image_jp2_service.rb
297
299
  - lib/derivative_rodeo/services/image_service.rb
300
+ - lib/derivative_rodeo/services/mime_type_service.rb
298
301
  - lib/derivative_rodeo/services/pdf_splitter/base.rb
299
302
  - lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb
300
303
  - lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb
@@ -311,6 +314,8 @@ files:
311
314
  - lib/derivative_rodeo/storage_locations/sqs_location.rb
312
315
  - lib/derivative_rodeo/technical_metadata.rb
313
316
  - lib/derivative_rodeo/version.rb
317
+ - lib/spec_support/aws_s3_faux_bucket.rb
318
+ - lib/spec_support/aws_sqs_faux_client.rb
314
319
  homepage: https://github.com/scientist-softserv/derivative_rodeo
315
320
  licenses:
316
321
  - APACHE-2.0