derivative-rodeo 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/derivative_rodeo/generators/alto_generator.rb +42 -0
 - data/lib/derivative_rodeo/generators/base_generator.rb +10 -5
 - data/lib/derivative_rodeo/generators/pdf_split_generator.rb +74 -10
 - data/lib/derivative_rodeo/generators/plain_text_generator.rb +42 -0
 - data/lib/derivative_rodeo/generators/thumbnail_generator.rb +37 -2
 - data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +1 -1
 - data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +120 -5
 - data/lib/derivative_rodeo/services/mime_type_service.rb +52 -0
 - data/lib/derivative_rodeo/services/pdf_splitter/base.rb +39 -43
 - data/lib/derivative_rodeo/services/url_service.rb +1 -1
 - data/lib/derivative_rodeo/storage_locations/base_location.rb +18 -0
 - data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +3 -5
 - data/lib/derivative_rodeo/storage_locations/file_location.rb +4 -0
 - data/lib/derivative_rodeo/storage_locations/s3_location.rb +64 -5
 - data/lib/derivative_rodeo/storage_locations/sqs_location.rb +27 -9
 - data/lib/derivative_rodeo/version.rb +1 -1
 - data/lib/derivative_rodeo.rb +4 -0
 - data/lib/spec_support/aws_s3_faux_bucket.rb +48 -0
 - data/lib/spec_support/aws_sqs_faux_client.rb +36 -0
 - metadata +7 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
         
     | 
| 
         @@ -0,0 +1,42 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module DerivativeRodeo
         
     | 
| 
      
 6 
     | 
    
         
            +
              module Generators
         
     | 
| 
      
 7 
     | 
    
         
            +
                ##
         
     | 
| 
      
 8 
     | 
    
         
            +
                # Generate the Alto XML from the given input_uris.
         
     | 
| 
      
 9 
     | 
    
         
            +
                #
         
     | 
| 
      
 10 
     | 
    
         
            +
                # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
         
     | 
| 
      
 11 
     | 
    
         
            +
                class AltoGenerator < BaseGenerator
         
     | 
| 
      
 12 
     | 
    
         
            +
                  self.output_extension = "alto.xml"
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # @param output_location [StorageLocations::BaseLocation]
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
         
     | 
| 
      
 19 
     | 
    
         
            +
                  #
         
     | 
| 
      
 20 
     | 
    
         
            +
                  # @return [StorageLocations::BaseLocation]
         
     | 
| 
      
 21 
     | 
    
         
            +
                  #
         
     | 
| 
      
 22 
     | 
    
         
            +
                  # @see #requisite_files
         
     | 
| 
      
 23 
     | 
    
         
            +
                  def build_step(output_location:, input_tmp_file_path:, **)
         
     | 
| 
      
 24 
     | 
    
         
            +
                    output_location.with_new_tmp_path do |output_tmp_file_path|
         
     | 
| 
      
 25 
     | 
    
         
            +
                      convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_alto: output_tmp_file_path)
         
     | 
| 
      
 26 
     | 
    
         
            +
                    end
         
     | 
| 
      
 27 
     | 
    
         
            +
                  end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                  private
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 32 
     | 
    
         
            +
                  # @param path_to_hocr [String]
         
     | 
| 
      
 33 
     | 
    
         
            +
                  # @param path_to_alto [String]
         
     | 
| 
      
 34 
     | 
    
         
            +
                  def convert_to_coordinates(path_to_hocr:, path_to_alto:)
         
     | 
| 
      
 35 
     | 
    
         
            +
                    hocr_html = File.read(path_to_hocr)
         
     | 
| 
      
 36 
     | 
    
         
            +
                    File.open(path_to_alto, "w+") do |file|
         
     | 
| 
      
 37 
     | 
    
         
            +
                      file.puts service.call(hocr_html).to_alto
         
     | 
| 
      
 38 
     | 
    
         
            +
                    end
         
     | 
| 
      
 39 
     | 
    
         
            +
                  end
         
     | 
| 
      
 40 
     | 
    
         
            +
                end
         
     | 
| 
      
 41 
     | 
    
         
            +
              end
         
     | 
| 
      
 42 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -27,7 +27,6 @@ module DerivativeRodeo 
     | 
|
| 
       27 
27 
     | 
    
         
             
                  # @!endgroup Class Attributes
         
     | 
| 
       28 
28 
     | 
    
         | 
| 
       29 
29 
     | 
    
         
             
                  attr_reader :input_uris,
         
     | 
| 
       30 
     | 
    
         
            -
                              :logger,
         
     | 
| 
       31 
30 
     | 
    
         
             
                              :output_location_template,
         
     | 
| 
       32 
31 
     | 
    
         
             
                              :preprocessed_location_template
         
     | 
| 
       33 
32 
     | 
    
         | 
| 
         @@ -39,23 +38,25 @@ module DerivativeRodeo 
     | 
|
| 
       39 
38 
     | 
    
         
             
                  #        to find preprocessed uris by transforming the :input_uris via
         
     | 
| 
       40 
39 
     | 
    
         
             
                  #        {Services::ConvertUriViaTemplateService} with the given
         
     | 
| 
       41 
40 
     | 
    
         
             
                  #        :preprocessed_location_template.
         
     | 
| 
       42 
     | 
    
         
            -
                   
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
      
 41 
     | 
    
         
            +
                  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
         
     | 
| 
      
 42 
     | 
    
         
            +
                    # NOTE: Are we using this preprocessed_location_template?  Wondering?
         
     | 
| 
       44 
43 
     | 
    
         
             
                    @input_uris = Array.wrap(input_uris)
         
     | 
| 
       45 
44 
     | 
    
         
             
                    @output_location_template = output_location_template
         
     | 
| 
       46 
45 
     | 
    
         
             
                    @preprocessed_location_template = preprocessed_location_template
         
     | 
| 
       47 
     | 
    
         
            -
                    @logger = logger
         
     | 
| 
       48 
46 
     | 
    
         | 
| 
       49 
47 
     | 
    
         
             
                    return if valid_instantiation?
         
     | 
| 
       50 
48 
     | 
    
         | 
| 
       51 
49 
     | 
    
         
             
                    raise Errors::ExtensionMissingError.new(klass: self.class)
         
     | 
| 
       52 
50 
     | 
    
         
             
                  end
         
     | 
| 
       53 
51 
     | 
    
         | 
| 
      
 52 
     | 
    
         
            +
                  delegate :logger, to: DerivativeRodeo
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
       54 
54 
     | 
    
         
             
                  ##
         
     | 
| 
       55 
55 
     | 
    
         
             
                  # @api private
         
     | 
| 
       56 
56 
     | 
    
         
             
                  #
         
     | 
| 
       57 
57 
     | 
    
         
             
                  # @return [Boolean]
         
     | 
| 
       58 
58 
     | 
    
         
             
                  def valid_instantiation?
         
     | 
| 
      
 59 
     | 
    
         
            +
                    # TODO: Does this even make sense.
         
     | 
| 
       59 
60 
     | 
    
         
             
                    # When we have a BaseGenerator and not one of its children or when we've assigned the
         
     | 
| 
       60 
61 
     | 
    
         
             
                    # output_extension.  instance_of? is more specific than is_a?
         
     | 
| 
       61 
62 
     | 
    
         
             
                    instance_of?(DerivativeRodeo::Generators::BaseGenerator) || output_extension
         
     | 
| 
         @@ -83,6 +84,7 @@ module DerivativeRodeo 
     | 
|
| 
       83 
84 
     | 
    
         
             
                  # @see #build_step
         
     | 
| 
       84 
85 
     | 
    
         
             
                  # @see #with_each_requisite_location_and_tmp_file_path
         
     | 
| 
       85 
86 
     | 
    
         
             
                  def generated_files
         
     | 
| 
      
 87 
     | 
    
         
            +
                    # TODO: Examples please
         
     | 
| 
       86 
88 
     | 
    
         
             
                    return @generated_files if defined?(@generated_files)
         
     | 
| 
       87 
89 
     | 
    
         | 
| 
       88 
90 
     | 
    
         
             
                    # As much as I would like to use map or returned values; given the implementations it's
         
     | 
| 
         @@ -92,6 +94,9 @@ module DerivativeRodeo 
     | 
|
| 
       92 
94 
     | 
    
         
             
                    # helps ease subclass implementations of the #with_each_requisite_location_and_tmp_file_path or
         
     | 
| 
       93 
95 
     | 
    
         
             
                    # #build_step
         
     | 
| 
       94 
96 
     | 
    
         
             
                    @generated_files = []
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
                    # BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
         
     | 
| 
      
 99 
     | 
    
         
            +
                    # "file:///Users/jfriesen/.profile"
         
     | 
| 
       95 
100 
     | 
    
         
             
                    with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
         
     | 
| 
       96 
101 
     | 
    
         
             
                      generated_file = destination(input_location)
         
     | 
| 
       97 
102 
     | 
    
         
             
                      @generated_files << if generated_file.exist?
         
     | 
| 
         @@ -170,7 +175,7 @@ module DerivativeRodeo 
     | 
|
| 
       170 
175 
     | 
    
         
             
                    return output_location unless preprocessed_location_template
         
     | 
| 
       171 
176 
     | 
    
         | 
| 
       172 
177 
     | 
    
         
             
                    preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
         
     | 
| 
       173 
     | 
    
         
            -
                    # We only want
         
     | 
| 
      
 178 
     | 
    
         
            +
                    # We only want the location if it exists
         
     | 
| 
       174 
179 
     | 
    
         
             
                    return preprocessed_location if preprocessed_location&.exist?
         
     | 
| 
       175 
180 
     | 
    
         | 
| 
       176 
181 
     | 
    
         
             
                    # NOTE: The file does not exist at the output_location; but we pass this information along so
         
     | 
| 
         @@ -17,19 +17,66 @@ module DerivativeRodeo 
     | 
|
| 
       17 
17 
     | 
    
         
             
                  include CopyFileConcern
         
     | 
| 
       18 
18 
     | 
    
         | 
| 
       19 
19 
     | 
    
         
             
                  ##
         
     | 
| 
       20 
     | 
    
         
            -
                  #  
     | 
| 
      
 20 
     | 
    
         
            +
                  # A helper method for downstream implementations to ask if this file is perhaps split from a
         
     | 
| 
      
 21 
     | 
    
         
            +
                  # PDF.
         
     | 
| 
       21 
22 
     | 
    
         
             
                  #
         
     | 
| 
       22 
     | 
    
         
            -
                  # @ 
     | 
| 
       23 
     | 
    
         
            -
                   
     | 
| 
       24 
     | 
    
         
            -
                     
     | 
| 
      
 23 
     | 
    
         
            +
                  # @param filename [String]
         
     | 
| 
      
 24 
     | 
    
         
            +
                  # @param extension [String] the extension (either with or without the leading period); if none
         
     | 
| 
      
 25 
     | 
    
         
            +
                  #        is provided use the extension of the given :filename.
         
     | 
| 
      
 26 
     | 
    
         
            +
                  # @return [TrueClass] when the file name likely represents a file split from a PDF.
         
     | 
| 
      
 27 
     | 
    
         
            +
                  # @return [FalseClass] when the file name does not, by convention, represent a file split from
         
     | 
| 
      
 28 
     | 
    
         
            +
                  #         a PDF.
         
     | 
| 
      
 29 
     | 
    
         
            +
                  #
         
     | 
| 
      
 30 
     | 
    
         
            +
                  # @see #image_file_basename_template
         
     | 
| 
      
 31 
     | 
    
         
            +
                  def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
         
     | 
| 
      
 32 
     | 
    
         
            +
                    extension ||= File.extname(filename)
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                    # Strip the leading period from the extension.
         
     | 
| 
      
 35 
     | 
    
         
            +
                    extension = extension[1..-1] if extension.start_with?('.')
         
     | 
| 
      
 36 
     | 
    
         
            +
                    regexp = %r{--page-\d+\.#{extension}$}
         
     | 
| 
      
 37 
     | 
    
         
            +
                    !!regexp.match(filename)
         
     | 
| 
       25 
38 
     | 
    
         
             
                  end
         
     | 
| 
       26 
39 
     | 
    
         | 
| 
       27 
40 
     | 
    
         
             
                  ##
         
     | 
| 
       28 
     | 
    
         
            -
                  # @ 
     | 
| 
      
 41 
     | 
    
         
            +
                  # @param basename [String] The given PDF file's base name (e.g. "hello.pdf" would have a base name of
         
     | 
| 
      
 42 
     | 
    
         
            +
                  #        "hello").
         
     | 
| 
      
 43 
     | 
    
         
            +
                  #
         
     | 
| 
      
 44 
     | 
    
         
            +
                  # @return [String] A template for the filenames of the images produced by Ghostscript.
         
     | 
| 
      
 45 
     | 
    
         
            +
                  #
         
     | 
| 
      
 46 
     | 
    
         
            +
                  # @note This must include "%d" in the returning value, as that is how Ghostscript will assign
         
     | 
| 
      
 47 
     | 
    
         
            +
                  # the page number.
         
     | 
| 
      
 48 
     | 
    
         
            +
                  #
         
     | 
| 
      
 49 
     | 
    
         
            +
                  # @note I have extracted this function to make it abundantly clear the expected location of
         
     | 
| 
      
 50 
     | 
    
         
            +
                  # each split image.  Further, this template interacts with the methods listed below.
         
     | 
| 
       29 
51 
     | 
    
         
             
                  #
         
     | 
| 
       30 
     | 
    
         
            -
                  # @see  
     | 
| 
       31 
     | 
    
         
            -
                   
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
      
 52 
     | 
    
         
            +
                  # @see #existing_page_locations
         
     | 
| 
      
 53 
     | 
    
         
            +
                  # @see .filename_for_a_derived_page_from_a_pdf?
         
     | 
| 
      
 54 
     | 
    
         
            +
                  def image_file_basename_template(basename:)
         
     | 
| 
      
 55 
     | 
    
         
            +
                    "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
         
     | 
| 
      
 56 
     | 
    
         
            +
                  end
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 59 
     | 
    
         
            +
                  # We want to check the output location and pre-processed location for the existence of already
         
     | 
| 
      
 60 
     | 
    
         
            +
                  # split pages.  This method checks both places.
         
     | 
| 
      
 61 
     | 
    
         
            +
                  #
         
     | 
| 
      
 62 
     | 
    
         
            +
                  # @param input_location [StorageLocations::BaseLocation]
         
     | 
| 
      
 63 
     | 
    
         
            +
                  #
         
     | 
| 
      
 64 
     | 
    
         
            +
                  # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
         
     | 
| 
      
 65 
     | 
    
         
            +
                  #         with :tail_glob.
         
     | 
| 
      
 66 
     | 
    
         
            +
                  #
         
     | 
| 
      
 67 
     | 
    
         
            +
                  # @note There is a relation between {Generators::BaseGenerator#destination} and this method.
         
     | 
| 
      
 68 
     | 
    
         
            +
                  #
         
     | 
| 
      
 69 
     | 
    
         
            +
                  # @note The tail_glob is in relation to the {#image_file_basename_template}
         
     | 
| 
      
 70 
     | 
    
         
            +
                  def existing_page_locations(input_location:)
         
     | 
| 
      
 71 
     | 
    
         
            +
                    # See image_file_basename_template
         
     | 
| 
      
 72 
     | 
    
         
            +
                    tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
         
     | 
| 
      
 73 
     | 
    
         
            +
             
     | 
| 
      
 74 
     | 
    
         
            +
                    output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
         
     | 
| 
      
 75 
     | 
    
         
            +
                    return output_locations if output_locations.count.positive?
         
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
      
 77 
     | 
    
         
            +
                    return [] if preprocessed_location_template.blank?
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
                    input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_locations(tail_glob: tail_glob)
         
     | 
| 
       33 
80 
     | 
    
         
             
                  end
         
     | 
| 
       34 
81 
     | 
    
         | 
| 
       35 
82 
     | 
    
         
             
                  ##
         
     | 
| 
         @@ -44,18 +91,35 @@ module DerivativeRodeo 
     | 
|
| 
       44 
91 
     | 
    
         
             
                  # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
         
     | 
| 
       45 
92 
     | 
    
         
             
                  # @yieldparam image_path [String] where to find this file in the tmp space
         
     | 
| 
       46 
93 
     | 
    
         
             
                  #
         
     | 
| 
      
 94 
     | 
    
         
            +
                  # @note This function makes a concession; namely that if it encounters any
         
     | 
| 
      
 95 
     | 
    
         
            +
                  # {#existing_page_locations} it will use all of that result as the entire number of pages.
         
     | 
| 
      
 96 
     | 
    
         
            +
                  # We could make this smarter but at the moment we're deferring on that.
         
     | 
| 
      
 97 
     | 
    
         
            +
                  #
         
     | 
| 
       47 
98 
     | 
    
         
             
                  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
         
     | 
| 
      
 99 
     | 
    
         
            +
                  #
         
     | 
| 
      
 100 
     | 
    
         
            +
                  # rubocop:disable Metrics/MethodLength
         
     | 
| 
       48 
101 
     | 
    
         
             
                  def with_each_requisite_location_and_tmp_file_path
         
     | 
| 
       49 
102 
     | 
    
         
             
                    input_files.each do |input_location|
         
     | 
| 
       50 
103 
     | 
    
         
             
                      input_location.with_existing_tmp_path do |input_tmp_file_path|
         
     | 
| 
       51 
     | 
    
         
            -
                         
     | 
| 
       52 
     | 
    
         
            -
                         
     | 
| 
      
 104 
     | 
    
         
            +
                        ## We want a single call for a directory listing of the image_file_basename_template
         
     | 
| 
      
 105 
     | 
    
         
            +
                        generated_files = existing_page_locations(input_location: input_location)
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
                        if generated_files.count.zero?
         
     | 
| 
      
 108 
     | 
    
         
            +
                          generated_files = Services::PdfSplitter.call(
         
     | 
| 
      
 109 
     | 
    
         
            +
                            input_tmp_file_path,
         
     | 
| 
      
 110 
     | 
    
         
            +
                            image_extension: output_extension,
         
     | 
| 
      
 111 
     | 
    
         
            +
                            image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
         
     | 
| 
      
 112 
     | 
    
         
            +
                          )
         
     | 
| 
      
 113 
     | 
    
         
            +
                        end
         
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
                        generated_files.each do |image_path|
         
     | 
| 
       53 
116 
     | 
    
         
             
                          image_location = StorageLocations::FileLocation.new("file://#{image_path}")
         
     | 
| 
       54 
117 
     | 
    
         
             
                          yield(image_location, image_path)
         
     | 
| 
       55 
118 
     | 
    
         
             
                        end
         
     | 
| 
       56 
119 
     | 
    
         
             
                      end
         
     | 
| 
       57 
120 
     | 
    
         
             
                    end
         
     | 
| 
       58 
121 
     | 
    
         
             
                  end
         
     | 
| 
      
 122 
     | 
    
         
            +
                  # rubocop:enable Metrics/MethodLength
         
     | 
| 
       59 
123 
     | 
    
         
             
                end
         
     | 
| 
       60 
124 
     | 
    
         
             
              end
         
     | 
| 
       61 
125 
     | 
    
         
             
            end
         
     | 
| 
         @@ -0,0 +1,42 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative '../services/extract_word_coordinates_from_hocr_sgml_service'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module DerivativeRodeo
         
     | 
| 
      
 6 
     | 
    
         
            +
              module Generators
         
     | 
| 
      
 7 
     | 
    
         
            +
                ##
         
     | 
| 
      
 8 
     | 
    
         
            +
                # Generate the plain text from the given input_uris.
         
     | 
| 
      
 9 
     | 
    
         
            +
                #
         
     | 
| 
      
 10 
     | 
    
         
            +
                # @note Assumes that we're receiving a HOCR file (generated via {HocrGenerator}).
         
     | 
| 
      
 11 
     | 
    
         
            +
                class PlainTextGenerator < BaseGenerator
         
     | 
| 
      
 12 
     | 
    
         
            +
                  self.output_extension = "plain_text.txt"
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  class_attribute :service, default: Services::ExtractWordCoordinatesFromHocrSgmlService
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # @param output_location [StorageLocations::BaseLocation]
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
         
     | 
| 
      
 19 
     | 
    
         
            +
                  #
         
     | 
| 
      
 20 
     | 
    
         
            +
                  # @return [StorageLocations::BaseLocation]
         
     | 
| 
      
 21 
     | 
    
         
            +
                  #
         
     | 
| 
      
 22 
     | 
    
         
            +
                  # @see #requisite_files
         
     | 
| 
      
 23 
     | 
    
         
            +
                  def build_step(output_location:, input_tmp_file_path:, **)
         
     | 
| 
      
 24 
     | 
    
         
            +
                    output_location.with_new_tmp_path do |output_tmp_file_path|
         
     | 
| 
      
 25 
     | 
    
         
            +
                      convert_to_coordinates(path_to_hocr: input_tmp_file_path, path_to_plain_text: output_tmp_file_path)
         
     | 
| 
      
 26 
     | 
    
         
            +
                    end
         
     | 
| 
      
 27 
     | 
    
         
            +
                  end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                  private
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 32 
     | 
    
         
            +
                  # @param path_to_hocr [String]
         
     | 
| 
      
 33 
     | 
    
         
            +
                  # @param path_to_plain_text [String]
         
     | 
| 
      
 34 
     | 
    
         
            +
                  def convert_to_coordinates(path_to_hocr:, path_to_plain_text:)
         
     | 
| 
      
 35 
     | 
    
         
            +
                    hocr_html = File.read(path_to_hocr)
         
     | 
| 
      
 36 
     | 
    
         
            +
                    File.open(path_to_plain_text, "w+") do |file|
         
     | 
| 
      
 37 
     | 
    
         
            +
                      file.puts service.call(hocr_html).to_text
         
     | 
| 
      
 38 
     | 
    
         
            +
                    end
         
     | 
| 
      
 39 
     | 
    
         
            +
                  end
         
     | 
| 
      
 40 
     | 
    
         
            +
                end
         
     | 
| 
      
 41 
     | 
    
         
            +
              end
         
     | 
| 
      
 42 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -6,12 +6,33 @@ module DerivativeRodeo 
     | 
|
| 
       6 
6 
     | 
    
         
             
                # This generator is responsible for converting a given binary into a thumbnail.  As of
         
     | 
| 
       7 
7 
     | 
    
         
             
                # <2023-05-22 Mon>, we're needing to generate thumbnails for PDFs and images.
         
     | 
| 
       8 
8 
     | 
    
         
             
                class ThumbnailGenerator < BaseGenerator
         
     | 
| 
      
 9 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 10 
     | 
    
         
            +
                  # @!group Class Attributes
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
       9 
12 
     | 
    
         
             
                  ##
         
     | 
| 
       10 
13 
     | 
    
         
             
                  # We want to mirror the same file "last" extension as described in Hyrax.
         
     | 
| 
       11 
14 
     | 
    
         
             
                  #
         
     | 
| 
       12 
15 
     | 
    
         
             
                  # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb
         
     | 
| 
       13 
16 
     | 
    
         
             
                  self.output_extension = 'thumbnail.jpeg'
         
     | 
| 
       14 
17 
     | 
    
         | 
| 
      
 18 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 19 
     | 
    
         
            +
                  # @!attribute dimensions_by_type
         
     | 
| 
      
 20 
     | 
    
         
            +
                  #
         
     | 
| 
      
 21 
     | 
    
         
            +
                  #   @return [Hash<Symbol,String>] the "types" (as categorized by
         
     | 
| 
      
 22 
     | 
    
         
            +
                  #           Hyrax::FileSetDerivativesService).  These aren't mime-types per se but a conceptual
         
     | 
| 
      
 23 
     | 
    
         
            +
                  #           distillation of that.
         
     | 
| 
      
 24 
     | 
    
         
            +
                  #
         
     | 
| 
      
 25 
     | 
    
         
            +
                  #   @see https://github.com/samvera/hyrax/blob/815e0abaacf9f331a5640c5d6129661d01eadf75/app/services/hyrax/file_set_derivatives_service.rb
         
     | 
| 
      
 26 
     | 
    
         
            +
                  class_attribute :dimensions_by_type, default: { pdf: "338x493" }
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 29 
     | 
    
         
            +
                  # @!attribute dimensions_fallback
         
     | 
| 
      
 30 
     | 
    
         
            +
                  #
         
     | 
| 
      
 31 
     | 
    
         
            +
                  #   @return [String] when there's no match for {.dimensions_by_type} use this value.
         
     | 
| 
      
 32 
     | 
    
         
            +
                  class_attribute :dimensions_fallback, default: "200x150>"
         
     | 
| 
      
 33 
     | 
    
         
            +
                  # @!endgroup Class Attributes
         
     | 
| 
      
 34 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
       15 
36 
     | 
    
         
             
                  ##
         
     | 
| 
       16 
37 
     | 
    
         
             
                  # @param output_location [StorageLocations::BaseLocation]
         
     | 
| 
       17 
38 
     | 
    
         
             
                  # @param input_tmp_file_path [String] the location of the file that we can use for processing.
         
     | 
| 
         @@ -23,6 +44,20 @@ module DerivativeRodeo 
     | 
|
| 
       23 
44 
     | 
    
         
             
                    end
         
     | 
| 
       24 
45 
     | 
    
         
             
                  end
         
     | 
| 
       25 
46 
     | 
    
         | 
| 
      
 47 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 48 
     | 
    
         
            +
                  # @param filename [String]
         
     | 
| 
      
 49 
     | 
    
         
            +
                  # @return [String]
         
     | 
| 
      
 50 
     | 
    
         
            +
                  #
         
     | 
| 
      
 51 
     | 
    
         
            +
                  # @see .dimensions_by_type
         
     | 
| 
      
 52 
     | 
    
         
            +
                  # @see .dimensions_fallback
         
     | 
| 
      
 53 
     | 
    
         
            +
                  def self.dimensions_for(filename:)
         
     | 
| 
      
 54 
     | 
    
         
            +
                    type = DerivativeRodeo::Services::MimeTypeService.hyrax_type(filename: filename)
         
     | 
| 
      
 55 
     | 
    
         
            +
                    dimensions_by_type.fetch(type, dimensions_fallback)
         
     | 
| 
      
 56 
     | 
    
         
            +
                  end
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
                  # Want to expose the dimensions_for as an instance method
         
     | 
| 
      
 59 
     | 
    
         
            +
                  delegate :dimensions_for, to: :class
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
       26 
61 
     | 
    
         
             
                  ##
         
     | 
| 
       27 
62 
     | 
    
         
             
                  # Convert the file found at :path_to_input into a thumbnail, writing it to the
         
     | 
| 
       28 
63 
     | 
    
         
             
                  # :path_for_thumbnail_output
         
     | 
| 
         @@ -30,8 +65,8 @@ module DerivativeRodeo 
     | 
|
| 
       30 
65 
     | 
    
         
             
                  # @param path_of_file_to_create_thumbnail_from [String]
         
     | 
| 
       31 
66 
     | 
    
         
             
                  # @param path_for_thumbnail_output [String]
         
     | 
| 
       32 
67 
     | 
    
         
             
                  def thumbnify(path_of_file_to_create_thumbnail_from:, path_for_thumbnail_output:)
         
     | 
| 
       33 
     | 
    
         
            -
                     
     | 
| 
       34 
     | 
    
         
            -
                    `convert #{path_of_file_to_create_thumbnail_from} -thumbnail ' 
     | 
| 
      
 68 
     | 
    
         
            +
                    dimensions = dimensions_for(filename: path_of_file_to_create_thumbnail_from)
         
     | 
| 
      
 69 
     | 
    
         
            +
                    `convert #{path_of_file_to_create_thumbnail_from} -thumbnail '#{dimensions}' -flatten #{path_for_thumbnail_output}`
         
     | 
| 
       35 
70 
     | 
    
         
             
                  end
         
     | 
| 
       36 
71 
     | 
    
         
             
                end
         
     | 
| 
       37 
72 
     | 
    
         
             
              end
         
     | 
| 
         @@ -31,7 +31,7 @@ module DerivativeRodeo 
     | 
|
| 
       31 
31 
     | 
    
         
             
                  def convert_to_coordinates(path_to_hocr:, path_to_coordinate:, service: Services::ExtractWordCoordinatesFromHocrSgmlService)
         
     | 
| 
       32 
32 
     | 
    
         
             
                    hocr_html = File.read(path_to_hocr)
         
     | 
| 
       33 
33 
     | 
    
         
             
                    File.open(path_to_coordinate, "w+") do |file|
         
     | 
| 
       34 
     | 
    
         
            -
                      file.puts service.call(hocr_html)
         
     | 
| 
      
 34 
     | 
    
         
            +
                      file.puts service.call(hocr_html).to_json
         
     | 
| 
       35 
35 
     | 
    
         
             
                    end
         
     | 
| 
       36 
36 
     | 
    
         
             
                  end
         
     | 
| 
       37 
37 
     | 
    
         
             
                end
         
     | 
| 
         @@ -13,7 +13,7 @@ module DerivativeRodeo 
     | 
|
| 
       13 
13 
     | 
    
         
             
                  # @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
         
     | 
| 
       14 
14 
     | 
    
         
             
                  # @return [ExtractWordCoordinatesFromHocrSgmlService] responds to #to_json, #to_text, #to_alto
         
     | 
| 
       15 
15 
     | 
    
         
             
                  def self.call(sgml)
         
     | 
| 
       16 
     | 
    
         
            -
                    new(sgml) 
     | 
| 
      
 16 
     | 
    
         
            +
                    new(sgml)
         
     | 
| 
       17 
17 
     | 
    
         
             
                  end
         
     | 
| 
       18 
18 
     | 
    
         | 
| 
       19 
19 
     | 
    
         
             
                  ##
         
     | 
| 
         @@ -42,6 +42,21 @@ module DerivativeRodeo 
     | 
|
| 
       42 
42 
     | 
    
         
             
                  end
         
     | 
| 
       43 
43 
     | 
    
         
             
                  alias json to_json
         
     | 
| 
       44 
44 
     | 
    
         | 
| 
      
 45 
     | 
    
         
            +
                  # Output plain text; named #to_text to keep the method naming consistent with #to_json.
         
     | 
| 
      
 46 
     | 
    
         
            +
                  #
         
     | 
| 
      
 47 
     | 
    
         
            +
                  # @return [String] plain text of OCR'd document
         
     | 
| 
      
 48 
     | 
    
         
            +
                  def to_text
         
     | 
| 
      
 49 
     | 
    
         
            +
                    @to_text ||= doc_stream.text
         
     | 
| 
      
 50 
     | 
    
         
            +
                  end
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
                  def to_alto
         
     | 
| 
      
 53 
     | 
    
         
            +
                    @to_alto ||= AltoXml.to_alto(
         
     | 
| 
      
 54 
     | 
    
         
            +
                      words: doc_stream.words,
         
     | 
| 
      
 55 
     | 
    
         
            +
                      width: doc_stream.width,
         
     | 
| 
      
 56 
     | 
    
         
            +
                      height: doc_stream.height
         
     | 
| 
      
 57 
     | 
    
         
            +
                    )
         
     | 
| 
      
 58 
     | 
    
         
            +
                  end
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
       45 
60 
     | 
    
         
             
                  private
         
     | 
| 
       46 
61 
     | 
    
         | 
| 
       47 
62 
     | 
    
         
             
                  def xml?(xml)
         
     | 
| 
         @@ -121,6 +136,7 @@ module DerivativeRodeo 
     | 
|
| 
       121 
136 
     | 
    
         
             
                      # add trailing space to plaintext buffer for between words:
         
     | 
| 
       122 
137 
     | 
    
         
             
                      @text += ' '
         
     | 
| 
       123 
138 
     | 
    
         
             
                      @words.push(@current) if word_complete?
         
     | 
| 
      
 139 
     | 
    
         
            +
                      @current = nil # clear the current word
         
     | 
| 
       124 
140 
     | 
    
         
             
                    end
         
     | 
| 
       125 
141 
     | 
    
         | 
| 
       126 
142 
     | 
    
         
             
                    def end_line
         
     | 
| 
         @@ -156,10 +172,13 @@ module DerivativeRodeo 
     | 
|
| 
       156 
172 
     | 
    
         
             
                    # Callback for element end; at this time, flush word coordinate state
         
     | 
| 
       157 
173 
     | 
    
         
             
                    #   for current word, and append line endings to plain text:
         
     | 
| 
       158 
174 
     | 
    
         
             
                    #
         
     | 
| 
       159 
     | 
    
         
            -
                    # @param  
     | 
| 
       160 
     | 
    
         
            -
                    def end_element( 
     | 
| 
       161 
     | 
    
         
            -
                       
     | 
| 
       162 
     | 
    
         
            -
             
     | 
| 
      
 175 
     | 
    
         
            +
                    # @param name [String] element name.
         
     | 
| 
      
 176 
     | 
    
         
            +
                    def end_element(name)
         
     | 
| 
      
 177 
     | 
    
         
            +
                      if name == 'span'
         
     | 
| 
      
 178 
     | 
    
         
            +
                        end_word if @element_class_name == 'ocrx_word'
         
     | 
| 
      
 179 
     | 
    
         
            +
                        @text += "\n" if @element_class_name.nil?
         
     | 
| 
      
 180 
     | 
    
         
            +
                      end
         
     | 
| 
      
 181 
     | 
    
         
            +
                      @element_class_name = nil
         
     | 
| 
       163 
182 
     | 
    
         
             
                    end
         
     | 
| 
       164 
183 
     | 
    
         | 
| 
       165 
184 
     | 
    
         
             
                    # Callback for completion of parsing hOCR, used to normalize generated
         
     | 
| 
         @@ -213,6 +232,102 @@ module DerivativeRodeo 
     | 
|
| 
       213 
232 
     | 
    
         
             
                      JSON.generate(payload)
         
     | 
| 
       214 
233 
     | 
    
         
             
                    end
         
     | 
| 
       215 
234 
     | 
    
         
             
                  end
         
     | 
| 
      
 235 
     | 
    
         
            +
             
     | 
| 
      
 236 
     | 
    
         
            +
                  class AltoXml
         
     | 
| 
      
 237 
     | 
    
         
            +
                    ##
         
     | 
| 
      
 238 
     | 
    
         
            +
                    # @api public
         
     | 
| 
      
 239 
     | 
    
         
            +
                    #
         
     | 
| 
      
 240 
     | 
    
         
            +
                    # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
         
     | 
| 
      
 241 
     | 
    
         
            +
                    # @param width [Integer, nil] the width of the "canvas" on which the words appear.
         
     | 
| 
      
 242 
     | 
    
         
            +
                    # @param height [Integer, nil] the height of the "canvas" on which the words appear.
         
     | 
| 
      
 243 
     | 
    
         
            +
                    #
         
     | 
| 
      
 244 
     | 
    
         
            +
                    # @return [String] the ALTO XML representation of the given words and their coordinates.
         
     | 
| 
      
 245 
     | 
    
         
            +
                    def self.to_alto(words:, width: nil, height: nil)
         
     | 
| 
      
 246 
     | 
    
         
            +
                      new(words: words, width: width, height: height).to_alto
         
     | 
| 
      
 247 
     | 
    
         
            +
                    end
         
     | 
| 
      
 248 
     | 
    
         
            +
             
     | 
| 
      
 249 
     | 
    
         
            +
                    def initialize(words:, width:, height:, scaling: 1.0)
         
     | 
| 
      
 250 
     | 
    
         
            +
                      @words = words
         
     | 
| 
      
 251 
     | 
    
         
            +
                      @height = height.to_i
         
     | 
| 
      
 252 
     | 
    
         
            +
                      @width = width.to_i
         
     | 
| 
      
 253 
     | 
    
         
            +
                      @scaling = scaling
         
     | 
| 
      
 254 
     | 
    
         
            +
                    end
         
     | 
| 
      
 255 
     | 
    
         
            +
             
     | 
| 
      
 256 
     | 
    
         
            +
                    attr_reader :words, :width, :height, :scaling
         
     | 
| 
      
 257 
     | 
    
         
            +
             
     | 
| 
      
 258 
     | 
    
         
            +
                    # Output ALTO XML of word coordinates
         
     | 
| 
      
 259 
     | 
    
         
            +
                    #
         
     | 
| 
      
 260 
     | 
    
         
            +
                    # @return [String] ALTO XML representation of the words and their coordinates
         
     | 
| 
      
 261 
     | 
    
         
            +
                    def to_alto
         
     | 
| 
      
 262 
     | 
    
         
            +
                      page = alto_page(width, height) do |xml|
         
     | 
| 
      
 263 
     | 
    
         
            +
                        words.each do |word|
         
     | 
| 
      
 264 
     | 
    
         
            +
                          xml.String(
         
     | 
| 
      
 265 
     | 
    
         
            +
                            CONTENT: word[:word],
         
     | 
| 
      
 266 
     | 
    
         
            +
                            WIDTH: scale_point(word[:coordinates][2]).to_s,
         
     | 
| 
      
 267 
     | 
    
         
            +
                            HEIGHT: scale_point(word[:coordinates][3]).to_s,
         
     | 
| 
      
 268 
     | 
    
         
            +
                            HPOS: scale_point(word[:coordinates][0]).to_s,
         
     | 
| 
      
 269 
     | 
    
         
            +
                            VPOS: scale_point(word[:coordinates][1]).to_s
         
     | 
| 
      
 270 
     | 
    
         
            +
                          ) { xml.text '' }
         
     | 
| 
      
 271 
     | 
    
         
            +
                        end
         
     | 
| 
      
 272 
     | 
    
         
            +
                      end
         
     | 
| 
      
 273 
     | 
    
         
            +
                      page.to_xml
         
     | 
| 
      
 274 
     | 
    
         
            +
                    end
         
     | 
| 
      
 275 
     | 
    
         
            +
             
     | 
| 
      
 276 
     | 
    
         
            +
                    private
         
     | 
| 
      
 277 
     | 
    
         
            +
             
     | 
| 
      
 278 
     | 
    
         
            +
                    # given block to manage word generation, wrap with page/block/line
         
     | 
| 
      
 279 
     | 
    
         
            +
                    def alto_page(pixel_width, pixel_height, &block)
         
     | 
| 
      
 280 
     | 
    
         
            +
                      builder = Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
         
     | 
| 
      
 281 
     | 
    
         
            +
                        xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
         
     | 
| 
      
 282 
     | 
    
         
            +
                          xml.Description do
         
     | 
| 
      
 283 
     | 
    
         
            +
                            xml.MeasurementUnit 'pixel'
         
     | 
| 
      
 284 
     | 
    
         
            +
                          end
         
     | 
| 
      
 285 
     | 
    
         
            +
                          alto_layout(xml, pixel_width, pixel_height, &block)
         
     | 
| 
      
 286 
     | 
    
         
            +
                        end
         
     | 
| 
      
 287 
     | 
    
         
            +
                      end
         
     | 
| 
      
 288 
     | 
    
         
            +
                      builder
         
     | 
| 
      
 289 
     | 
    
         
            +
                    end
         
     | 
| 
      
 290 
     | 
    
         
            +
             
     | 
| 
      
 291 
     | 
    
         
            +
                    def scale_point(value)
         
     | 
| 
      
 292 
     | 
    
         
            +
                      # NOTE: presuming non-fractional, even though ALTO 2.1
         
     | 
| 
      
 293 
     | 
    
         
            +
                      #   specifies coordinates are xsd:float, not xsd:int,
         
     | 
| 
      
 294 
     | 
    
         
            +
                      #   simplify to integer value for output:
         
     | 
| 
      
 295 
     | 
    
         
            +
                      (value * scaling).to_i
         
     | 
| 
      
 296 
     | 
    
         
            +
                    end
         
     | 
| 
      
 297 
     | 
    
         
            +
             
     | 
| 
      
 298 
     | 
    
         
            +
                    # return layout for page
         
     | 
| 
      
 299 
     | 
    
         
            +
                    def alto_layout(xml, pixel_width, pixel_height, &block)
         
     | 
| 
      
 300 
     | 
    
         
            +
                      xml.Layout do
         
     | 
| 
      
 301 
     | 
    
         
            +
                        xml.Page(ID: 'ID1',
         
     | 
| 
      
 302 
     | 
    
         
            +
                                 PHYSICAL_IMG_NR: '1',
         
     | 
| 
      
 303 
     | 
    
         
            +
                                 HEIGHT: pixel_height,
         
     | 
| 
      
 304 
     | 
    
         
            +
                                 WIDTH: pixel_width) do
         
     | 
| 
      
 305 
     | 
    
         
            +
                          xml.PrintSpace(HEIGHT: pixel_height,
         
     | 
| 
      
 306 
     | 
    
         
            +
                                         WIDTH: pixel_width,
         
     | 
| 
      
 307 
     | 
    
         
            +
                                         HPOS: '0',
         
     | 
| 
      
 308 
     | 
    
         
            +
                                         VPOS: '0') do
         
     | 
| 
      
 309 
     | 
    
         
            +
                            alto_blockline(xml, pixel_width, pixel_height, &block)
         
     | 
| 
      
 310 
     | 
    
         
            +
                          end
         
     | 
| 
      
 311 
     | 
    
         
            +
                        end
         
     | 
| 
      
 312 
     | 
    
         
            +
                      end
         
     | 
| 
      
 313 
     | 
    
         
            +
                    end
         
     | 
| 
      
 314 
     | 
    
         
            +
             
     | 
| 
      
 315 
     | 
    
         
            +
                    # make block line and call word-block
         
     | 
| 
      
 316 
     | 
    
         
            +
                    def alto_blockline(xml, pixel_width, pixel_height)
         
     | 
| 
      
 317 
     | 
    
         
            +
                      xml.TextBlock(ID: 'ID1a',
         
     | 
| 
      
 318 
     | 
    
         
            +
                                    HEIGHT: pixel_height,
         
     | 
| 
      
 319 
     | 
    
         
            +
                                    WIDTH: pixel_width,
         
     | 
| 
      
 320 
     | 
    
         
            +
                                    HPOS: '0',
         
     | 
| 
      
 321 
     | 
    
         
            +
                                    VPOS: '0') do
         
     | 
| 
      
 322 
     | 
    
         
            +
                        xml.TextLine(HEIGHT: pixel_height,
         
     | 
| 
      
 323 
     | 
    
         
            +
                                     WIDTH: pixel_width,
         
     | 
| 
      
 324 
     | 
    
         
            +
                                     HPOS: '0',
         
     | 
| 
      
 325 
     | 
    
         
            +
                                     VPOS: '0') do
         
     | 
| 
      
 326 
     | 
    
         
            +
                          yield(xml)
         
     | 
| 
      
 327 
     | 
    
         
            +
                        end
         
     | 
| 
      
 328 
     | 
    
         
            +
                      end
         
     | 
| 
      
 329 
     | 
    
         
            +
                    end
         
     | 
| 
      
 330 
     | 
    
         
            +
                  end
         
     | 
| 
       216 
331 
     | 
    
         
             
                end
         
     | 
| 
       217 
332 
     | 
    
         
             
              end
         
     | 
| 
       218 
333 
     | 
    
         
             
            end
         
     | 
| 
         @@ -0,0 +1,52 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'marcel'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            module DerivativeRodeo
         
     | 
| 
      
 5 
     | 
    
         
            +
              module Services
         
     | 
| 
      
 6 
     | 
    
         
            +
                ##
         
     | 
| 
      
 7 
     | 
    
         
            +
                # This module provides an interface for determining a mime-type.
         
     | 
| 
      
 8 
     | 
    
         
            +
                module MimeTypeService
         
     | 
| 
      
 9 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 10 
     | 
    
         
            +
                  # Hyrax has its own compression of mime_types into conceptual types (as defined in
         
     | 
| 
      
 11 
     | 
    
         
            +
                  # Hyrax::FileSetDerivativesService).  This provides a somewhat conceptual overlap with that,
         
     | 
| 
      
 12 
     | 
    
         
            +
                  # while also being more generalized.
         
     | 
| 
      
 13 
     | 
    
         
            +
                  #
         
     | 
| 
      
 14 
     | 
    
         
            +
                  # @param filename [String]
         
     | 
| 
      
 15 
     | 
    
         
            +
                  # @return [Symbol]
         
     | 
| 
      
 16 
     | 
    
         
            +
                  def self.hyrax_type(filename:)
         
     | 
| 
      
 17 
     | 
    
         
            +
                    mime = mime_type(filename: filename)
         
     | 
| 
      
 18 
     | 
    
         
            +
                    media_type, sub_type = mime.split("/")
         
     | 
| 
      
 19 
     | 
    
         
            +
                    case media_type
         
     | 
| 
      
 20 
     | 
    
         
            +
                    when "image", "audio", "text", "video"
         
     | 
| 
      
 21 
     | 
    
         
            +
                      media_type.to_sym
         
     | 
| 
      
 22 
     | 
    
         
            +
                    when "application" # The wild woolly weird world of all the things.
         
     | 
| 
      
 23 
     | 
    
         
            +
                      # TODO: Do we need to worry about office documents?
         
     | 
| 
      
 24 
     | 
    
         
            +
                      sub_type.to_sym
         
     | 
| 
      
 25 
     | 
    
         
            +
                    else
         
     | 
| 
      
 26 
     | 
    
         
            +
                      sub_type.to_sym
         
     | 
| 
      
 27 
     | 
    
         
            +
                    end
         
     | 
| 
      
 28 
     | 
    
         
            +
                  end
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 31 
     | 
    
         
            +
                  # Given a local :filename (e.g. downloaded and available on the server this is running),
         
     | 
| 
      
 32 
     | 
    
         
            +
                  # return the mime_type of the file.
         
     | 
| 
      
 33 
     | 
    
         
            +
                  #
         
     | 
| 
      
 34 
     | 
    
         
            +
                  # @param filename [String]
         
     | 
| 
      
 35 
     | 
    
         
            +
                  # @return [String] (e.g. "application/pdf", "text/plain")
         
     | 
| 
      
 36 
     | 
    
         
            +
                  def self.mime_type(filename:)
         
     | 
| 
      
 37 
     | 
    
         
            +
                    ##
         
     | 
| 
      
 38 
     | 
    
         
            +
                    # TODO: Does this attempt to read the whole file?  That may create memory constraints.  By
         
     | 
| 
      
 39 
     | 
    
         
            +
                    # using Pathname (instead of File.read), we're letting Marcel do its best mime magic.
         
     | 
| 
      
 40 
     | 
    
         
            +
                    pathname = Pathname.new(filename)
         
     | 
| 
      
 41 
     | 
    
         
            +
                    extension = filename.split(".")&.last&.downcase
         
     | 
| 
      
 42 
     | 
    
         
            +
                    if extension
         
     | 
| 
      
 43 
     | 
    
         
            +
                      # By including a possible extension, we can help nudge Marcel into making a more specific guess.
         
     | 
| 
      
 44 
     | 
    
         
            +
                      # Without extension, we will get a lot of "application/octet-stream" results.
         
     | 
| 
      
 45 
     | 
    
         
            +
                      ::Marcel::MimeType.for(pathname, extension: extension)
         
     | 
| 
      
 46 
     | 
    
         
            +
                    else
         
     | 
| 
      
 47 
     | 
    
         
            +
                      ::Marcel::MimeType.for(pathname)
         
     | 
| 
      
 48 
     | 
    
         
            +
                    end
         
     | 
| 
      
 49 
     | 
    
         
            +
                  end
         
     | 
| 
      
 50 
     | 
    
         
            +
                end
         
     | 
| 
      
 51 
     | 
    
         
            +
              end
         
     | 
| 
      
 52 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -2,17 +2,32 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require 'open3'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'securerandom'
         
     | 
| 
       5 
     | 
    
         
            -
            require 'tmpdir'
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
5 
     | 
    
         
             
            module DerivativeRodeo
         
     | 
| 
       8 
6 
     | 
    
         
             
              module Services
         
     | 
| 
      
 7 
     | 
    
         
            +
                ##
         
     | 
| 
      
 8 
     | 
    
         
            +
                # A service module for splitting PDFs into one image per page.
         
     | 
| 
      
 9 
     | 
    
         
            +
                #
         
     | 
| 
      
 10 
     | 
    
         
            +
                # @see .call
         
     | 
| 
       9 
11 
     | 
    
         
             
                module PdfSplitter
         
     | 
| 
       10 
12 
     | 
    
         
             
                  ##
         
     | 
| 
       11 
     | 
    
         
            -
                  # @ 
     | 
| 
       12 
     | 
    
         
            -
                  # 
     | 
| 
       13 
     | 
    
         
            -
                   
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
      
 13 
     | 
    
         
            +
                  # @api public
         
     | 
| 
      
 14 
     | 
    
         
            +
                  #
         
     | 
| 
      
 15 
     | 
    
         
            +
                  # Split the file found at the given :path
         
     | 
| 
      
 16 
     | 
    
         
            +
                  #
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # @param path [String] the path to the source PDF that we're processing.
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # @param image_extension [String] used to determine the splitting service we use; there is an
         
     | 
| 
      
 19 
     | 
    
         
            +
                  #        implicit relationship between image_extension and image_file_basename_template
         
     | 
| 
      
 20 
     | 
    
         
            +
                  #        (though filenames do not necessarily reflect mime types)
         
     | 
| 
      
 21 
     | 
    
         
            +
                  # @param image_file_basename_template [String] use this string to generate the unique filename
         
     | 
| 
      
 22 
     | 
    
         
            +
                  #        for an image "split" from the given PDF.  It must include "%d" as part of the
         
     | 
| 
      
 23 
     | 
    
         
            +
                  #        declaration.  For example if the template is "hello-world-%d.png" then the first
         
     | 
| 
      
 24 
     | 
    
         
            +
                  #        split page will be "hello-world-1.png".
         
     | 
| 
      
 25 
     | 
    
         
            +
                  #
         
     | 
| 
      
 26 
     | 
    
         
            +
                  # @return [Enumerable, Services::PdfSplitter::Base, #each] see {Base#each}
         
     | 
| 
      
 27 
     | 
    
         
            +
                  def self.call(path, image_extension:, image_file_basename_template:)
         
     | 
| 
      
 28 
     | 
    
         
            +
                    klass_name = "#{image_extension.to_s.classify}_page".classify
         
     | 
| 
      
 29 
     | 
    
         
            +
                    klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
         
     | 
| 
      
 30 
     | 
    
         
            +
                    klass.new(path, image_file_basename_template: image_file_basename_template)
         
     | 
| 
       16 
31 
     | 
    
         
             
                  end
         
     | 
| 
       17 
32 
     | 
    
         | 
| 
       18 
33 
     | 
    
         
             
                  ##
         
     | 
| 
         @@ -31,38 +46,23 @@ module DerivativeRodeo 
     | 
|
| 
       31 
46 
     | 
    
         | 
| 
       32 
47 
     | 
    
         
             
                    class_attribute :gsdevice, instance_accessor: false
         
     | 
| 
       33 
48 
     | 
    
         
             
                    class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
         
     | 
| 
       34 
     | 
    
         
            -
                    ##
         
     | 
| 
       35 
     | 
    
         
            -
                    # @api public
         
     | 
| 
       36 
     | 
    
         
            -
                    #
         
     | 
| 
       37 
     | 
    
         
            -
                    # @param path [String] The path the the PDF
         
     | 
| 
       38 
     | 
    
         
            -
                    #
         
     | 
| 
       39 
     | 
    
         
            -
                    # @return [Enumerable, Utilities::PdfSplitter::Base]
         
     | 
| 
       40 
     | 
    
         
            -
                    def self.call(path, baseid: SureRandom.uuid, tmpdir: Dir.mktmpdir)
         
     | 
| 
       41 
     | 
    
         
            -
                      new(path, baseid: baseid, tmpdir: tmpdir)
         
     | 
| 
       42 
     | 
    
         
            -
                    end
         
     | 
| 
       43 
49 
     | 
    
         | 
| 
       44 
     | 
    
         
            -
                    ##
         
     | 
| 
       45 
     | 
    
         
            -
                    # @param path [String] the path to the source PDF that we're processing.
         
     | 
| 
       46 
     | 
    
         
            -
                    # @param baseid [String] used for creating a unique identifier
         
     | 
| 
       47 
     | 
    
         
            -
                    # @param tmpdir [String] place to perform the "work" of splitting the PDF.
         
     | 
| 
       48 
     | 
    
         
            -
                    # @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
         
     | 
| 
       49 
     | 
    
         
            -
                    #        extract this from the given path, but for testing purposes, you might want to
         
     | 
| 
       50 
     | 
    
         
            -
                    #        provide a specific summary.
         
     | 
| 
       51 
     | 
    
         
            -
                    # @param logger [Logger, #error]
         
     | 
| 
       52 
50 
     | 
    
         
             
                    def initialize(path,
         
     | 
| 
       53 
     | 
    
         
            -
                                    
     | 
| 
       54 
     | 
    
         
            -
                                    
     | 
| 
       55 
     | 
    
         
            -
                                   tmpdir: Dir.mktmpdir,
         
     | 
| 
       56 
     | 
    
         
            -
                                   pdf_pages_summary: PagesSummary.extract_from(path: path),
         
     | 
| 
       57 
     | 
    
         
            -
                                   logger: DerivativeRodeo.config.logger)
         
     | 
| 
       58 
     | 
    
         
            -
                      @baseid = baseid
         
     | 
| 
      
 51 
     | 
    
         
            +
                                   image_file_basename_template:,
         
     | 
| 
      
 52 
     | 
    
         
            +
                                   pdf_pages_summary: PagesSummary.extract_from(path: path))
         
     | 
| 
       59 
53 
     | 
    
         
             
                      @pdfpath = path
         
     | 
| 
       60 
54 
     | 
    
         
             
                      @pdf_pages_summary = pdf_pages_summary
         
     | 
| 
       61 
     | 
    
         
            -
                      @ 
     | 
| 
       62 
     | 
    
         
            -
             
     | 
| 
      
 55 
     | 
    
         
            +
                      @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                      # We need to ensure that this temporary directory exists so we can write the files to it.
         
     | 
| 
      
 58 
     | 
    
         
            +
                      # Fortunately, because this file space must be "local" tmp dir, we don't need to work
         
     | 
| 
      
 59 
     | 
    
         
            +
                      # through any of the location antics of {StorageLocations::BaseLocation}.
         
     | 
| 
      
 60 
     | 
    
         
            +
                      FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
         
     | 
| 
       63 
61 
     | 
    
         
             
                    end
         
     | 
| 
       64 
62 
     | 
    
         | 
| 
       65 
     | 
    
         
            -
                    attr_reader : 
     | 
| 
      
 63 
     | 
    
         
            +
                    attr_reader :ghost_script_output_file_template
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                    delegate :logger, to: DerivativeRodeo
         
     | 
| 
       66 
66 
     | 
    
         | 
| 
       67 
67 
     | 
    
         
             
                    # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
         
     | 
| 
       68 
68 
     | 
    
         
             
                    include Enumerable
         
     | 
| 
         @@ -80,8 +80,8 @@ module DerivativeRodeo 
     | 
|
| 
       80 
80 
     | 
    
         
             
                      !pdf_pages_summary.valid?
         
     | 
| 
       81 
81 
     | 
    
         
             
                    end
         
     | 
| 
       82 
82 
     | 
    
         | 
| 
       83 
     | 
    
         
            -
                    attr_reader :pdf_pages_summary, : 
     | 
| 
       84 
     | 
    
         
            -
                    private :pdf_pages_summary, : 
     | 
| 
      
 83 
     | 
    
         
            +
                    attr_reader :pdf_pages_summary, :basename, :pdfpath
         
     | 
| 
      
 84 
     | 
    
         
            +
                    private :pdf_pages_summary, :basename, :pdfpath
         
     | 
| 
       85 
85 
     | 
    
         | 
| 
       86 
86 
     | 
    
         
             
                    # @api private
         
     | 
| 
       87 
87 
     | 
    
         
             
                    def gsdevice
         
     | 
| 
         @@ -99,16 +99,12 @@ module DerivativeRodeo 
     | 
|
| 
       99 
99 
     | 
    
         
             
                      @entries = Array.wrap(gsconvert)
         
     | 
| 
       100 
100 
     | 
    
         
             
                    end
         
     | 
| 
       101 
101 
     | 
    
         | 
| 
       102 
     | 
    
         
            -
                    def output_base
         
     | 
| 
       103 
     | 
    
         
            -
                      @output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
         
     | 
| 
       104 
     | 
    
         
            -
                    end
         
     | 
| 
       105 
     | 
    
         
            -
             
     | 
| 
       106 
102 
     | 
    
         
             
                    def gsconvert
         
     | 
| 
       107 
103 
     | 
    
         
             
                      # NOTE: you must call gsdevice before compression, as compression is
         
     | 
| 
       108 
104 
     | 
    
         
             
                      # updated during the gsdevice call.
         
     | 
| 
       109 
105 
     | 
    
         
             
                      file_names = []
         
     | 
| 
       110 
106 
     | 
    
         | 
| 
       111 
     | 
    
         
            -
                      Open3.popen3(gsconvert_cmd( 
     | 
| 
      
 107 
     | 
    
         
            +
                      Open3.popen3(gsconvert_cmd(ghost_script_output_file_template)) do |_stdin, stdout, stderr, _wait_thr|
         
     | 
| 
       112 
108 
     | 
    
         
             
                        err = stderr.read
         
     | 
| 
       113 
109 
     | 
    
         
             
                        logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?
         
     | 
| 
       114 
110 
     | 
    
         | 
| 
         @@ -116,7 +112,7 @@ module DerivativeRodeo 
     | 
|
| 
       116 
112 
     | 
    
         
             
                        stdout.read.split("\n").each do |line|
         
     | 
| 
       117 
113 
     | 
    
         
             
                          next unless line.start_with?('Page ')
         
     | 
| 
       118 
114 
     | 
    
         | 
| 
       119 
     | 
    
         
            -
                          file_names << format( 
     | 
| 
      
 115 
     | 
    
         
            +
                          file_names << format(ghost_script_output_file_template, page_number)
         
     | 
| 
       120 
116 
     | 
    
         
             
                          page_number += 1
         
     | 
| 
       121 
117 
     | 
    
         
             
                        end
         
     | 
| 
       122 
118 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -126,12 +122,12 @@ module DerivativeRodeo 
     | 
|
| 
       126 
122 
     | 
    
         | 
| 
       127 
123 
     | 
    
         
             
                    def create_file_name(line:, page_number:); end
         
     | 
| 
       128 
124 
     | 
    
         | 
| 
       129 
     | 
    
         
            -
                    def gsconvert_cmd( 
     | 
| 
      
 125 
     | 
    
         
            +
                    def gsconvert_cmd(ghost_script_output_file_template)
         
     | 
| 
       130 
126 
     | 
    
         
             
                      @gsconvert_cmd ||= begin
         
     | 
| 
       131 
127 
     | 
    
         
             
                                           cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
         
     | 
| 
       132 
128 
     | 
    
         
             
                                           cmd += " -sCompression=#{compression}" if compression?
         
     | 
| 
       133 
129 
     | 
    
         
             
                                           cmd += " -dJPEGQ=#{quality}" if quality?
         
     | 
| 
       134 
     | 
    
         
            -
                                           cmd += " -sOutputFile=#{ 
     | 
| 
      
 130 
     | 
    
         
            +
                                           cmd += " -sOutputFile=#{ghost_script_output_file_template} -r#{ppi} -f #{pdfpath}"
         
     | 
| 
       135 
131 
     | 
    
         
             
                                           cmd
         
     | 
| 
       136 
132 
     | 
    
         
             
                                         end
         
     | 
| 
       137 
133 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -21,7 +21,7 @@ module DerivativeRodeo 
     | 
|
| 
       21 
21 
     | 
    
         
             
                  def self.read(url)
         
     | 
| 
       22 
22 
     | 
    
         
             
                    HTTParty.get(url, logger: DerivativeRodeo.config.logger).body
         
     | 
| 
       23 
23 
     | 
    
         
             
                  rescue StandardError => e
         
     | 
| 
       24 
     | 
    
         
            -
                    config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
         
     | 
| 
      
 24 
     | 
    
         
            +
                    DerivativeRodeo.config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
         
     | 
| 
       25 
25 
     | 
    
         
             
                    raise e
         
     | 
| 
       26 
26 
     | 
    
         
             
                  end
         
     | 
| 
       27 
27 
     | 
    
         | 
| 
         @@ -42,6 +42,8 @@ module DerivativeRodeo 
     | 
|
| 
       42 
42 
     | 
    
         | 
| 
       43 
43 
     | 
    
         
             
                  class << self
         
     | 
| 
       44 
44 
     | 
    
         
             
                    alias scheme location_name
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                    delegate :config, to: DerivativeRodeo
         
     | 
| 
       45 
47 
     | 
    
         
             
                  end
         
     | 
| 
       46 
48 
     | 
    
         | 
| 
       47 
49 
     | 
    
         
             
                  ##
         
     | 
| 
         @@ -206,6 +208,22 @@ module DerivativeRodeo 
     | 
|
| 
       206 
208 
     | 
    
         
             
                    klass.build(from_uri: file_path, template: template)
         
     | 
| 
       207 
209 
     | 
    
         
             
                  end
         
     | 
| 
       208 
210 
     | 
    
         | 
| 
      
 211 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 212 
     | 
    
         
            +
                  # When you have a known location and want to check for files that are within that location,
         
     | 
| 
      
 213 
     | 
    
         
            +
                  # use the {#globbed_tail_locations} method.  In the case of {Generators::PdfSplitGenerator} we
         
     | 
| 
      
 214 
     | 
    
         
            +
                  # need to know the path to all of the image files we "split" off of the given PDF.
         
     | 
| 
      
 215 
     | 
    
         
            +
                  #
         
     | 
| 
      
 216 
     | 
    
         
            +
                  # We can use the :file_path as the prefix and the given :tail_glob as the suffix for a "fully
         
     | 
| 
      
 217 
     | 
    
         
            +
                  # qualified" Dir.glob type search.
         
     | 
| 
      
 218 
     | 
    
         
            +
                  #
         
     | 
| 
      
 219 
     | 
    
         
            +
                  # @param tail_glob [String]
         
     | 
| 
      
 220 
     | 
    
         
            +
                  #
         
     | 
| 
      
 221 
     | 
    
         
            +
                  # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
         
     | 
| 
      
 222 
     | 
    
         
            +
                  #         array when there are none.
         
     | 
| 
      
 223 
     | 
    
         
            +
                  def globbed_tail_locations(tail_glob:)
         
     | 
| 
      
 224 
     | 
    
         
            +
                    raise NotImplementedError, "#{self.class}#globbed_locations"
         
     | 
| 
      
 225 
     | 
    
         
            +
                  end
         
     | 
| 
      
 226 
     | 
    
         
            +
             
     | 
| 
       209 
227 
     | 
    
         
             
                  ##
         
     | 
| 
       210 
228 
     | 
    
         
             
                  # @param extension [String, StorageLocations::SAME]
         
     | 
| 
       211 
229 
     | 
    
         
             
                  # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use
         
     | 
| 
         @@ -43,7 +43,7 @@ module DerivativeRodeo 
     | 
|
| 
       43 
43 
     | 
    
         
             
                  # @param url [String]
         
     | 
| 
       44 
44 
     | 
    
         
             
                  #
         
     | 
| 
       45 
45 
     | 
    
         
             
                  # @return [String]
         
     | 
| 
       46 
     | 
    
         
            -
                  def  
     | 
| 
      
 46 
     | 
    
         
            +
                  def get(url)
         
     | 
| 
       47 
47 
     | 
    
         
             
                    HTTParty.get(url, logger: config.logger)
         
     | 
| 
       48 
48 
     | 
    
         
             
                  rescue => e
         
     | 
| 
       49 
49 
     | 
    
         
             
                    config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
         
     | 
| 
         @@ -51,13 +51,11 @@ module DerivativeRodeo 
     | 
|
| 
       51 
51 
     | 
    
         
             
                  end
         
     | 
| 
       52 
52 
     | 
    
         | 
| 
       53 
53 
     | 
    
         
             
                  ##
         
     | 
| 
       54 
     | 
    
         
            -
                  # @param url [String]
         
     | 
| 
       55 
     | 
    
         
            -
                  #
         
     | 
| 
       56 
54 
     | 
    
         
             
                  # @return [URI] when the URL resolves successfully
         
     | 
| 
       57 
55 
     | 
    
         
             
                  # @return [FalseClass] when the URL's head request is not successful or we've exhausted our
         
     | 
| 
       58 
56 
     | 
    
         
             
                  #         remaining redirects.
         
     | 
| 
       59 
     | 
    
         
            -
                  def  
     | 
| 
       60 
     | 
    
         
            -
                    HTTParty.head( 
     | 
| 
      
 57 
     | 
    
         
            +
                  def exist?
         
     | 
| 
      
 58 
     | 
    
         
            +
                    HTTParty.head(file_uri, logger: config.logger)
         
     | 
| 
       61 
59 
     | 
    
         
             
                  rescue => e
         
     | 
| 
       62 
60 
     | 
    
         
             
                    config.logger.error(%(#{e.message}\n#{e.backtrace.join("\n")}))
         
     | 
| 
       63 
61 
     | 
    
         
             
                    false
         
     | 
| 
         @@ -8,7 +8,15 @@ module DerivativeRodeo 
     | 
|
| 
       8 
8 
     | 
    
         
             
                # Location to download and upload files to S3
         
     | 
| 
       9 
9 
     | 
    
         
             
                #
         
     | 
| 
       10 
10 
     | 
    
         
             
                class S3Location < BaseLocation
         
     | 
| 
       11 
     | 
    
         
            -
                   
     | 
| 
      
 11 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 12 
     | 
    
         
            +
                  # @!group Class Attributes
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # @!attribute use_actual_s3_bucket
         
     | 
| 
      
 14 
     | 
    
         
            +
                  #
         
     | 
| 
      
 15 
     | 
    
         
            +
                  # When true, we are going to use a live S3 bucket.  When false, we'll use a fake local bucket.
         
     | 
| 
      
 16 
     | 
    
         
            +
                  class_attribute :use_actual_s3_bucket, default: true
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # @!endgroup Class Attributes
         
     | 
| 
      
 18 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
       12 
20 
     | 
    
         
             
                  ##
         
     | 
| 
       13 
21 
     | 
    
         
             
                  # Create a new uri of the classes type. Parts argument should have a default in
         
     | 
| 
       14 
22 
     | 
    
         
             
                  # implementing classes. Must support a number or the symbol :all
         
     | 
| 
         @@ -24,10 +32,11 @@ module DerivativeRodeo 
     | 
|
| 
       24 
32 
     | 
    
         
             
                  end
         
     | 
| 
       25 
33 
     | 
    
         | 
| 
       26 
34 
     | 
    
         
             
                  ##
         
     | 
| 
       27 
     | 
    
         
            -
                  # @param  
     | 
| 
      
 35 
     | 
    
         
            +
                  # @param bucket_name [String, NilClass] when given, use this as the bucket, otherwise, default to config.aws_s3_bucket
         
     | 
| 
      
 36 
     | 
    
         
            +
                  #
         
     | 
| 
       28 
37 
     | 
    
         
             
                  # @return [String]
         
     | 
| 
       29 
     | 
    
         
            -
                  def self.adapter_prefix( 
     | 
| 
       30 
     | 
    
         
            -
                    "#{scheme}://#{ 
     | 
| 
      
 38 
     | 
    
         
            +
                  def self.adapter_prefix(bucket_name: config.aws_s3_bucket)
         
     | 
| 
      
 39 
     | 
    
         
            +
                    "#{scheme}://#{bucket_name}.s3.#{config.aws_s3_region}.amazonaws.com"
         
     | 
| 
       31 
40 
     | 
    
         
             
                  end
         
     | 
| 
       32 
41 
     | 
    
         | 
| 
       33 
42 
     | 
    
         
             
                  ##
         
     | 
| 
         @@ -53,6 +62,38 @@ module DerivativeRodeo 
     | 
|
| 
       53 
62 
     | 
    
         
             
                    bucket.objects(prefix: file_path).count.positive?
         
     | 
| 
       54 
63 
     | 
    
         
             
                  end
         
     | 
| 
       55 
64 
     | 
    
         | 
| 
      
 65 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 66 
     | 
    
         
            +
                  # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
         
     | 
| 
      
 67 
     | 
    
         
            +
                  #
         
     | 
| 
      
 68 
     | 
    
         
            +
                  # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.  We can
         
     | 
| 
      
 69 
     | 
    
         
            +
                  #       use the components of the file_path to fake that behavior.
         
     | 
| 
      
 70 
     | 
    
         
            +
                  #
         
     | 
| 
      
 71 
     | 
    
         
            +
                  # @see Generators::PdfSplitGenerator#image_file_basename_template
         
     | 
| 
      
 72 
     | 
    
         
            +
                  def globbed_tail_locations(tail_glob:)
                    # Example: with a file_uri of "s3://blah/1234/hello-world/hello-world.pdf" and a
                    # tail_glob of "pages/*.tiff", we are looking for keys shaped like
                    # "1234/hello-world/pages/<something>.tiff".
                    #
                    # NOTE: Should we be storing our files as such?  The pattern we need is
                    # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
                    # more than one PDF (that we intend to split); we don't want to trample on those split files
                    # and miscollate two PDFs.
                    globname = File.join(file_dir, tail_glob)

                    # S3 can only filter on a prefix, so the "glob" is approximated by listing everything
                    # under the glob's directory and keeping keys that end with the glob's extension.  The
                    # extension is escaped so that its leading "." is matched literally (".tiff" must not
                    # match "xtiff"), and \z anchors at the true end of the key rather than at a line end.
                    extension_matcher = /#{Regexp.escape(File.extname(globname))}\z/

                    # NOTE: We're making some informed guesses, needing to include the fully qualified template
                    # based on both the key of the item in the bucket as well as the bucket's host.
                    uri = URI.parse(file_uri)
                    scheme_and_host = "#{uri.scheme}://#{uri.host}"

                    bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
                      # Non-matching keys contribute nothing; returning [] (instead of falling through
                      # with nil) keeps nil entries out of the returned list.
                      next [] unless object.key.match?(extension_matcher)

                      derived_file_from(template: File.join(scheme_and_host, object.key))
                    end
                  end
         
     | 
| 
      
 96 
     | 
    
         
            +
             
     | 
| 
       56 
97 
     | 
    
         
             
                  ##
         
     | 
| 
       57 
98 
     | 
    
         
             
                  # @api public
         
     | 
| 
       58 
99 
     | 
    
         
             
                  # write the tmp file to the file_uri
         
     | 
| 
         @@ -71,6 +112,9 @@ module DerivativeRodeo 
     | 
|
| 
       71 
112 
     | 
    
         
             
                  #
         
     | 
| 
       72 
113 
     | 
    
         
             
                  # @return [Aws::S3::Resource]
         
     | 
| 
       73 
114 
     | 
    
         
             
                  def resource
         
     | 
| 
      
 115 
     | 
    
         
            +
                    # TODO: Are there instantiation considerations when running in Lambda?  In tests
         
     | 
| 
      
 116 
     | 
    
         
            +
                    # initializing a resource is very slow (e.g. 3 seconds or so).  Should this be a class
         
     | 
| 
      
 117 
     | 
    
         
            +
                    # method?  Can it be given the SpaceStone constraints?
         
     | 
| 
       74 
118 
     | 
    
         
             
                    @resource ||= if DerivativeRodeo.config.aws_s3_access_key_id
         
     | 
| 
       75 
119 
     | 
    
         
             
                                    Aws::S3::Resource.new(region: DerivativeRodeo.config.aws_s3_region,
         
     | 
| 
       76 
120 
     | 
    
         
             
                                                          credentials: Aws::Credentials.new(
         
     | 
| 
         @@ -91,13 +135,28 @@ module DerivativeRodeo 
     | 
|
| 
       91 
135 
     | 
    
         
             
                    raise Errors::BucketMissingError
         
     | 
| 
       92 
136 
     | 
    
         
             
                  end
         
     | 
| 
       93 
137 
     | 
    
         | 
| 
      
 138 
     | 
    
         
            +
                  # @see .use_actual_s3_bucket
         
     | 
| 
       94 
139 
     | 
    
         
             
                  def bucket
                    # Memoized: whichever bucket is resolved first is reused for this location.
                    return @bucket if @bucket

                    # The real bucket comes from the AWS resource; otherwise fall back to the
                    # class-level fake bucket.
                    @bucket = use_actual_s3_bucket? ? resource.bucket(bucket_name) : self.class.faux_bucket
                  end
         
     | 
| 
       97 
146 
     | 
    
         | 
| 
       98 
147 
     | 
    
         
             
                  def file_path
                    # Memoized: the path is derived from @file_uri only once.
                    return @file_path if @file_path

                    # Drop everything up to and including the first "/" that follows "://".
                    @file_path = @file_uri.sub(%r{.+://.+?/}, '')
                  end
         
     | 
| 
      
 150 
     | 
    
         
            +
             
     | 
| 
      
 151 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 152 
     | 
    
         
            +
                  # A constructed fake bucket that conforms to the narrow S3 interface that we use.
         
     | 
| 
      
 153 
     | 
    
         
            +
                  #
         
     | 
| 
      
 154 
     | 
    
         
            +
                  # @see .use_actual_s3_bucket
         
     | 
| 
      
 155 
     | 
    
         
            +
                  # @return [AwsS3FauxBucket]
         
     | 
| 
      
 156 
     | 
    
         
            +
                  def self.faux_bucket
                    # One fake bucket is shared at the class level and built on first use.
                    return @faux_bucket if @faux_bucket

                    # We are not requiring this file; except in the spec context.
                    @faux_bucket = AwsS3FauxBucket.new
                  end
         
     | 
| 
       101 
160 
     | 
    
         
             
                end
         
     | 
| 
       102 
161 
     | 
    
         
             
              end
         
     | 
| 
       103 
162 
     | 
    
         
             
            end
         
     | 
| 
         @@ -14,9 +14,20 @@ module DerivativeRodeo 
     | 
|
| 
       14 
14 
     | 
    
         
             
                # It uploads a file_uri to the queue, not the contents of that file
         
     | 
| 
       15 
15 
     | 
    
         
             
                # reading from the queue is not currently implemented
         
     | 
| 
       16 
16 
     | 
    
         
             
                class SqsLocation < BaseLocation
         
     | 
| 
      
 17 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # @!group Class Attributes
         
     | 
| 
      
 19 
     | 
    
         
            +
                  #
         
     | 
| 
      
 20 
     | 
    
         
            +
                  # @!attribute batch_size
         
     | 
| 
      
 21 
     | 
    
         
            +
                  #   @return [Integer]
         
     | 
| 
       17 
22 
     | 
    
         
             
                  class_attribute :batch_size, default: 10
         
     | 
| 
       18 
23 
     | 
    
         | 
| 
       19 
     | 
    
         
            -
                   
     | 
| 
      
 24 
     | 
    
         
            +
                  # @!attribute use_real_sqs
         
     | 
| 
      
 25 
     | 
    
         
            +
                  #   When true, use the real SQS; else when false use a fake one.  You probably don't want to
         
     | 
| 
      
 26 
     | 
    
         
            +
                  #   use the fake one in your production.  But it's exposed in this manner to ease testing of
         
     | 
| 
      
 27 
     | 
    
         
            +
                  #   downstream dependencies.
         
     | 
| 
      
 28 
     | 
    
         
            +
                  class_attribute :use_real_sqs, default: true
         
     | 
| 
      
 29 
     | 
    
         
            +
                  # @!endgroup Class Attributes
         
     | 
| 
      
 30 
     | 
    
         
            +
                  ##
         
     | 
| 
       20 
31 
     | 
    
         | 
| 
       21 
32 
     | 
    
         
             
                  ##
         
     | 
| 
       22 
33 
     | 
    
         
             
                  # Create a new uri of the classes type. Parts argument should have a default in
         
     | 
| 
         @@ -82,19 +93,26 @@ module DerivativeRodeo 
     | 
|
| 
       82 
93 
     | 
    
         
             
                    file_uri
         
     | 
| 
       83 
94 
     | 
    
         
             
                  end
         
     | 
| 
       84 
95 
     | 
    
         | 
| 
      
 96 
     | 
    
         
            +
                  # rubocop:disable Metrics/MethodLength
         
     | 
| 
       85 
97 
     | 
    
         
             
                  def client
                    # Memoized: build the SQS client (real or fake) only once per location.
                    return @client if @client

                    @client =
                      if !use_real_sqs?
                        # We are not requiring this file; except in the spec context.
                        AwsSqsFauxClient.new
                      elsif config.aws_sqs_access_key_id && config.aws_sqs_secret_access_key
                        # Explicit credentials were configured, so hand them to the client.
                        Aws::SQS::Client.new(
                          region: config.aws_sqs_region,
                          credentials: Aws::Credentials.new(
                            config.aws_sqs_access_key_id,
                            config.aws_sqs_secret_access_key
                          )
                        )
                      else
                        # No explicit credentials; only the region is passed.
                        Aws::SQS::Client.new(region: config.aws_sqs_region)
                      end
                  end
         
     | 
| 
      
 115 
     | 
    
         
            +
                  # rubocop:enable Metrics/MethodLength
         
     | 
| 
       98 
116 
     | 
    
         | 
| 
       99 
117 
     | 
    
         
             
                  def add(message:)
         
     | 
| 
       100 
118 
     | 
    
         
             
                    client.send_message({
         
     | 
    
        data/lib/derivative_rodeo.rb
    CHANGED
    
    
| 
         @@ -0,0 +1,48 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            ##
         
     | 
| 
      
 4 
     | 
    
         
            +
            # This class is a very rudimentary implementation of a bucket.  It conforms to the necessary
         
     | 
| 
      
 5 
     | 
    
         
            +
            # interface for downloading, uploading, and filtering on prefix.
         
     | 
| 
      
 6 
     | 
    
         
            +
            #
         
     | 
| 
      
 7 
     | 
    
         
            +
            # It is provided under lib/spec_support so that downstream implementations can leverage a fake S3
         
     | 
| 
      
 8 
     | 
    
         
            +
            # bucket.
         
     | 
| 
      
 9 
     | 
    
         
            +
            #
         
     | 
| 
      
 10 
     | 
    
         
            +
            # @see [DerivativeRodeo::StorageLocations::S3Location]
         
     | 
| 
      
 11 
     | 
    
         
            +
            class AwsS3FauxBucket
              def initialize
                # Maps an object key (String) to its Storage entry.
                @storage = {}
              end
              attr_reader :storage

              ##
              # Find or create the entry for the given key.
              #
              # @param path [String] the object's key within the bucket
              # @return [AwsS3FauxBucket::Storage]
              def object(path)
                # Yup, we've got nested buckets
                @storage[path] ||= Storage.new(key: path)
              end

              ##
              # @param prefix [String] only entries whose key starts with this value are returned
              # @return [Array<AwsS3FauxBucket::Storage>]
              def objects(prefix:)
                @storage.each_with_object([]) do |(path, obj), accumulator|
                  accumulator << obj if path.start_with?(prefix)
                end
              end

              ##
              # A single in-memory bucket entry.
              class Storage
                # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
                # key.
                #
                # @param key [String]
                def initialize(key:)
                  @key = key
                  @storage = {}
                end
                attr_reader :storage, :key

                ##
                # Remember the bytes of the file at the given local path.
                #
                # @param path [String] local file to "upload"
                # @return [String] the stored content
                def upload_file(path)
                  # Read as binary so that derivatives (images, PDFs) are kept byte-for-byte.
                  @storage[:upload] = File.binread(path)
                end

                ##
                # Write the previously uploaded bytes to the given local path.
                #
                # @param path [String] local file to write
                # @return [Boolean] false when nothing has been uploaded to this entry; true once
                #   the content has been written
                def download_file(path)
                  return false unless @storage.key?(:upload)

                  # Write the bytes exactly as stored; IO#puts would append a newline to content that
                  # does not already end with one.
                  File.binwrite(path, @storage.fetch(:upload))
                  true
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'ostruct'
         
     | 
| 
      
 3 
     | 
    
         
            +
            ##
         
     | 
| 
      
 4 
     | 
    
         
            +
            # This class is a very rudimentary implementation of an SQS client.  It conforms to the necessary
         
     | 
| 
      
 5 
     | 
    
         
            +
            # interface for sending messages and reading messages
         
     | 
| 
      
 6 
     | 
    
         
            +
            #
         
     | 
| 
      
 7 
     | 
    
         
            +
            # @see [DerivativeRodeo::StorageLocations::SqsLocation]
         
     | 
| 
      
 8 
     | 
    
         
            +
            class AwsSqsFauxClient
              ##
              # @param queue_url [String, nil] the URL reported by {#get_queue_url}; a fixed fake URL
              #   is used when none is given
              def initialize(queue_url: nil)
                @queue_url = queue_url || 'https://sqs.us-west-2.amazonaws.com/5555555555/fake'
                # Maps a queue URL to the Array of messages sent to it.
                @storage = {}
              end
              attr_reader :storage

              ##
              # @param arg_hash [Hash] reads :queue_url and :message_body
              # @return [Array] the queue's messages after the append
              def send_message(arg_hash)
                @storage[arg_hash[:queue_url]] ||= []
                @storage[arg_hash[:queue_url]] << arg_hash[:message_body]
              end

              ##
              # @param arg_hash [Hash] reads :queue_url and :entries (an Array)
              # @return [Array] the queue's messages after the append
              def send_message_batch(arg_hash)
                @storage[arg_hash[:queue_url]] ||= []
                @storage[arg_hash[:queue_url]] += arg_hash[:entries]
              end

              ##
              # Remove and return up to :max_number_of_messages messages from the queue, most recently
              # sent first.
              #
              # @param arg_hash [Hash] reads :queue_url and :max_number_of_messages (1 when omitted)
              # @return [Array] the removed messages; empty when the queue is unknown or drained
              def receive_message(arg_hash)
                queue = @storage[arg_hash[:queue_url]] || []
                limit = arg_hash[:max_number_of_messages] || 1

                # Popping past the end of the queue yields nil; compact drops those placeholders.
                Array.new(limit) { queue.pop }.compact
              end

              ##
              # @return [OpenStruct] responds to #queue_url with the configured URL
              def get_queue_url(*)
                OpenStruct.new(queue_url: @queue_url)
              end
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: derivative-rodeo
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.3.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Rob Kaufman
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire:
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2023-05 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2023-06-05 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: activesupport
         
     | 
| 
         @@ -281,12 +281,14 @@ files: 
     | 
|
| 
       281 
281 
     | 
    
         
             
            - lib/derivative_rodeo.rb
         
     | 
| 
       282 
282 
     | 
    
         
             
            - lib/derivative_rodeo/configuration.rb
         
     | 
| 
       283 
283 
     | 
    
         
             
            - lib/derivative_rodeo/errors.rb
         
     | 
| 
      
 284 
     | 
    
         
            +
            - lib/derivative_rodeo/generators/alto_generator.rb
         
     | 
| 
       284 
285 
     | 
    
         
             
            - lib/derivative_rodeo/generators/base_generator.rb
         
     | 
| 
       285 
286 
     | 
    
         
             
            - lib/derivative_rodeo/generators/concerns/copy_file_concern.rb
         
     | 
| 
       286 
287 
     | 
    
         
             
            - lib/derivative_rodeo/generators/copy_generator.rb
         
     | 
| 
       287 
288 
     | 
    
         
             
            - lib/derivative_rodeo/generators/hocr_generator.rb
         
     | 
| 
       288 
289 
     | 
    
         
             
            - lib/derivative_rodeo/generators/monochrome_generator.rb
         
     | 
| 
       289 
290 
     | 
    
         
             
            - lib/derivative_rodeo/generators/pdf_split_generator.rb
         
     | 
| 
      
 291 
     | 
    
         
            +
            - lib/derivative_rodeo/generators/plain_text_generator.rb
         
     | 
| 
       290 
292 
     | 
    
         
             
            - lib/derivative_rodeo/generators/thumbnail_generator.rb
         
     | 
| 
       291 
293 
     | 
    
         
             
            - lib/derivative_rodeo/generators/word_coordinates_generator.rb
         
     | 
| 
       292 
294 
     | 
    
         
             
            - lib/derivative_rodeo/services/base_service.rb
         
     | 
| 
         @@ -295,6 +297,7 @@ files: 
     | 
|
| 
       295 
297 
     | 
    
         
             
            - lib/derivative_rodeo/services/image_identify_service.rb
         
     | 
| 
       296 
298 
     | 
    
         
             
            - lib/derivative_rodeo/services/image_jp2_service.rb
         
     | 
| 
       297 
299 
     | 
    
         
             
            - lib/derivative_rodeo/services/image_service.rb
         
     | 
| 
      
 300 
     | 
    
         
            +
            - lib/derivative_rodeo/services/mime_type_service.rb
         
     | 
| 
       298 
301 
     | 
    
         
             
            - lib/derivative_rodeo/services/pdf_splitter/base.rb
         
     | 
| 
       299 
302 
     | 
    
         
             
            - lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb
         
     | 
| 
       300 
303 
     | 
    
         
             
            - lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb
         
     | 
| 
         @@ -311,6 +314,8 @@ files: 
     | 
|
| 
       311 
314 
     | 
    
         
             
            - lib/derivative_rodeo/storage_locations/sqs_location.rb
         
     | 
| 
       312 
315 
     | 
    
         
             
            - lib/derivative_rodeo/technical_metadata.rb
         
     | 
| 
       313 
316 
     | 
    
         
             
            - lib/derivative_rodeo/version.rb
         
     | 
| 
      
 317 
     | 
    
         
            +
            - lib/spec_support/aws_s3_faux_bucket.rb
         
     | 
| 
      
 318 
     | 
    
         
            +
            - lib/spec_support/aws_sqs_faux_client.rb
         
     | 
| 
       314 
319 
     | 
    
         
             
            homepage: https://github.com/scientist-softserv/derivative_rodeo
         
     | 
| 
       315 
320 
     | 
    
         
             
            licenses:
         
     | 
| 
       316 
321 
     | 
    
         
             
            - APACHE-2.0
         
     |