RubyGems - iiif_print - Versions diffs - 1.0.0 → 1.1.0 - Mend

iiif_print 1.0.0 → 1.1.0

Files changed (108) hide show

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE.md +18 -0
data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
data/.github/workflows/build-lint-test-action.yaml +4 -5
data/.gitignore +5 -4
data/.rubocop.yml +1 -0
data/.solargraph.yml +19 -0
data/Gemfile.lock +1025 -0
data/README.md +98 -9
data/Rakefile +6 -0
data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
data/app/helpers/iiif_print_helper.rb +0 -20
data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
data/app/models/concerns/iiif_print/solr/document.rb +14 -0
data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
data/app/models/iiif_print/pending_relationship.rb +3 -0
data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
data/app/views/catalog/_index_header_list_default.html.erb +13 -0
data/app/views/hyrax/base/_representative_media.html.erb +4 -3
data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
data/config/locales/iiif_print.en.yml +4 -0
data/config/routes.rb +3 -0
data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
data/docker-compose.yml +2 -2
data/iiif_print.gemspec +10 -9
data/lib/generators/iiif_print/install_generator.rb +21 -1
data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
data/lib/iiif_print/base_derivative_service.rb +2 -1
data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
data/lib/iiif_print/catalog_search_builder.rb +5 -1
data/lib/iiif_print/configuration.rb +145 -8
data/lib/iiif_print/data/fileset_helper.rb +1 -1
data/lib/iiif_print/data/work_derivatives.rb +3 -3
data/lib/iiif_print/engine.rb +7 -13
data/lib/iiif_print/errors.rb +18 -0
data/lib/iiif_print/homepage_search_builder.rb +17 -0
data/lib/iiif_print/image_tool.rb +12 -8
data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
data/lib/iiif_print/lineage_service.rb +29 -8
data/lib/iiif_print/metadata.rb +67 -48
data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
data/lib/iiif_print/version.rb +1 -1
data/lib/iiif_print.rb +167 -12
data/lib/samvera/derivatives/configuration.rb +83 -0
data/lib/samvera/derivatives/hyrax.rb +129 -0
data/lib/samvera/derivatives.rb +238 -0
data/spec/factories/newspaper_page_solr_document.rb +9 -1
data/spec/fixtures/authorities/licenses.yml +4 -0
data/spec/fixtures/authorities/rights_statements.yml +4 -0
data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
data/spec/iiif_print/configuration_spec.rb +141 -15
data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
data/spec/iiif_print/lineage_service_spec.rb +1 -1
data/spec/iiif_print/metadata_spec.rb +157 -23
data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
data/spec/iiif_print_spec.rb +125 -5
data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
data/spec/samvera/derivatives/configuration_spec.rb +41 -0
data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
data/spec/samvera/derivatives_spec.rb +54 -0
data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
data/tasks/copy_authorities_to_test_app.rake +11 -0
data/tasks/iiif_print_dev.rake +4 -4
metadata +123 -35
data/app/helpers/hyrax/iiif_helper.rb +0 -22
data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6

data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb CHANGED Viewed

@@ -1,20 +1,76 @@
 # frozen_string_literal: true
-# Encapsulates methods used for pdf splitting into child works
 module IiifPrint
   module SplitPdfs
+    ##
+    # Encapsulates methods used for pdf splitting into child works.
+    #
+    # The primary point of entry is {.conditionally_enqueue}.
     class ChildWorkCreationFromPdfService
+      ##
+      # Responsible for conditionally enqueueing the PDF splitting job.  The conditions attempt to
+      # sniff out whether the given file was a PDF.
+      #
+      # @param file_set [FileSet] What is the containing file set for the provided file.
+      # @param file [#path, #id]
+      # @param user [User] Who did the upload?
+      # @param import_url [NilClass, String] Provided when we're dealing with a file provided via a
+      #        URL.
+      # @param work [Hydra::PCDM::Work] An optional parameter that saves us a bit of time in not
+      #        needing to query for the parent of the given :file_set (see {.parent_for})
+      #
+      # @return [Symbol] when we don't enqueue the job
+      # @return [Symbol] :enqueued when we actually enqueue the underlying job.
+      # rubocop:disable Metrics/MethodLength
+      def self.conditionally_enqueue(file_set:, file:, user:, import_url: nil, work: nil)
+        work ||= IiifPrint.parent_for(file_set)
+        return :no_split_for_parent unless iiif_print_split?(work: work)
+        return :no_pdfs_to_split_for_import_url if import_url && !pdfs?(paths: [import_url])
+        file_locations = if import_url
+                           [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
+                         else
+                           pdf_paths(files: [file.try(:id)&.to_s].compact)
+                         end
+        return :no_pdfs_to_split if file_locations.empty?
+        IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: file_locations, user: user)
+        :enqueued
+      end
+      # rubocop:enable Metrics/MethodLength
+      ##
+      # @api private
+      #
+      # Are there any PDF files?
+      # @param [Array > String] paths to PDFs
+      # @return [Boolean]
+      def self.pdfs?(paths:)
+        pdf_paths = pdfs_only_for(paths)
+        return false unless pdf_paths.count.positive?
+        true
+      end
+      ##
+      # @api private
       # Load an array of paths to pdf files
       # @param [Array > Hyrax::Upload file ids]
       # @return [Array > String] file paths to temp directory
       def self.pdf_paths(files:)
+        return [] if files.all?(&:empty?) # assumes an array
         upload_ids = filter_file_ids(files)
         return [] if upload_ids.empty?
         uploads = Hyrax::UploadedFile.find(upload_ids)
         paths = uploads.map(&method(:upload_path))
         pdfs_only_for(paths)
       end
+      ##
+      # @api private
+      #
       # Is child work splitting defined for model?
       # @param [GenericWork, etc] A valid type of hyrax work
       # @return [Boolean]
@@ -24,51 +80,31 @@ module IiifPrint
         false
       end
-      # Are there any PDF files?
-      # @param [Array > String] paths to PDFs
-      # @return [Boolean]
-      def self.pdfs?(paths:)
-        pdf_paths = pdfs_only_for(paths)
-        return false unless pdf_paths.count.positive?
-        true
-      end
-      # Submit the job to split PDF into child works
-      # @param [GenericWork, etc] A valid type of hyrax work
-      # @param [Array<String>] paths to PDF attachments
-      # @param [User] user
-      # @param [Integer] number of pdfs already on existing work's filesets (not yet implemented)
-      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
-        work.iiif_print_config.pdf_splitter_job.perform_later(
-          work,
-          file_locations,
-          user,
-          admin_set_id,
-          count_existing_pdfs(work)
-        )
-      end
+      ##
+      # @api private
       def self.filter_file_ids(input)
         Array.wrap(input).select(&:present?)
       end
+      ##
+      # @api private
+      #
       # Given Hyrax::Upload object, return path to file on local filesystem
       def self.upload_path(upload)
         # so many layers to this onion:
+        # TODO: Write a recursive function to keep calling file until
+        # the file doesn't respond to file then return that file.
         upload.file.file.file
       end
-      # TODO: implement a method to count existing PDFs on a work to support
-      #       adding more PDFs to an existing work.
-      def self.count_existing_pdfs(_work)
-        0
-      end
+      ##
+      # @api private
+      #
       # TODO: Consider other methods to identify a PDF file.
       #       This sub-selection may need to be moved to use mimetype if there
       #       is a need to support paths not ending in .pdf (i.e. remote_urls)
       def self.pdfs_only_for(paths)
-        paths.select { |path| path.end_with?('.pdf', '.PDF') }
+        paths.select { |path| IiifPrint.split_for_path_suffix?(path) }
       end
     end
   end

data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb ADDED Viewed

@@ -0,0 +1,166 @@
+module IiifPrint
+  module SplitPdfs
+    ##
+    # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed
+    # images, or split a PDF if there are no preprocessed images.
+    #
+    # We have already attached the original file to the file_set.  We want to convert that original
+    # file that's attached to an input_uri (e.g. "file://path/to/original-file" as in what we have
+    # written to Fedora as the PDF)
+    #
+    # @see .call
+    class DerivativeRodeoSplitter
+      ##
+      # @param filename [String] the local path to the PDF file
+      # @param file_set [FileSet] file set containing the PDF file to split
+      #
+      # @return [Array<String>] paths to images split from each page of PDF file
+      #
+      # @see IiifPrint::SplitPdfs::BaseSplitter
+      def self.call(filename, file_set:)
+        new(filename, file_set: file_set).split_files
+      end
+      ##
+      # @param filename [String] path to the original file.  Note that we use {#filename} to
+      #        derive {#input_uri}
+      # @param file_set [FileSet] the container for the original file and its derivatives.
+      #
+      # @param output_tmp_dir [String] where we will be writing things.  In using `Dir.mktmpdir`
+      #        we're creating a subdirectory on `Dir.tmpdir`
+      def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
+        @filename = filename
+        @file_set = file_set
+        @input_uri = "file://#{filename}"
+        # We are writing the images to a local location that CarrierWave can upload.  This is a
+        # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but
+        # that's just a convenience.
+        output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')
+        @output_location_template = "file://#{output_template_path}"
+      end
+      attr_reader :filename, :file_set
+      ##
+      # This is where, in "Fedora" we have the original file.  This is not the original file in the
+      # pre-processing location but instead the long-term location of the file in the application
+      # that mounts IIIF Print.
+      #
+      # @return [String]
+      attr_reader :input_uri
+      ##
+      # This is the location where we're going to write the derivatives that will "go into Fedora";
+      # it is a local location, one that IIIF Print's mounting application can directly do
+      # "File.read"
+      #
+      # @return [String]
+      attr_reader :output_location_template
+      ##
+      # Where can we find the file that represents the pre-processing template.  In this case, the
+      # original PDF file.
+      #
+      # The logic handles a case where SpaceStone successfully fetched the file to then perform
+      # processing.
+      #
+      # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
+      # bucket that we then use for IIIF Print.
+      #
+      # @note The preprocessed_location_template should end in `.pdf`.  The
+      #       DerivativeRodeo::BaseGenerator::PdfSplitGenerator#derive_preprocessed_template_from
+      #       will coerce the template into one that represents the split pages.
+      #
+      # @return [String]
+      #
+      # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
+      # rubocop:disable Metrics/MethodLength
+      # rubocop:disable Metrics/AbcSize
+      def preprocessed_location_template
+        return @preprocessed_location_template if defined?(@preprocessed_location_template)
+        derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
+        @preprocessed_location_template =
+          if derivative_rodeo_candidate.blank?
+            message = "#{self.class}##{__method__} could not establish derivative_rodeo_candidate for " \
+                      "#{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} with filename #{filename.inspect}.  " \
+                      "Move along little buddy."
+            Rails.logger.debug(message)
+            nil
+          elsif rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
+            Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}.  High five partner!")
+            derivative_rodeo_candidate
+          elsif file_set.import_url
+            message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist.  " \
+                      "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
+            Rails.logger.warn(message)
+            handle_original_file_not_in_derivative_rodeo
+          else
+            message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
+                      "nor a remote_url for #{file_set.class} ID=#{file_set.id} #to_param=#{file_set&.to_param}.  " \
+                      "Returning `nil' as we have no possible preprocess.  " \
+                      "Maybe the input_uri #{input_uri.inspect} will be adequate."
+            Rails.logger.warn(message)
+            nil
+          end
+      end
+      # rubocop:enable Metrics/AbcSize
+      # rubocop:enable Metrics/MethodLength
+      ##
+      # @api private
+      #
+      # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
+      # ensure that we have something locally.  We copy the {FileSet#import_url} to the {#input_uri}
+      # location.
+      #
+      # @return [String] should be the {#input_uri}
+      # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
+      def handle_original_file_not_in_derivative_rodeo
+        # A quick short-circuit.  Don't attempt to copy.  Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
+        return input_uri if rodeo_conformant_uri_exists?(input_uri)
+        message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist.  " \
+                  "Perhaps there was a problem in SpaceStone downloading the file?  " \
+                  "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists.  " \
+                  "However, we'll almost certainly be generating child pages locally."
+        Rails.logger.info(message)
+        # This ensures that we have a copy of the file_set.import_url at the input_uri location;
+        # we likely have this.
+        DerivativeRodeo::Generators::CopyGenerator.new(
+          input_uris: [file_set.import_url],
+          output_location_template: input_uri
+        ).generated_uris.first
+      end
+      # private :handle_original_file_not_in_derivative_rodeo
+      def rodeo_conformant_uri_exists?(uri)
+        DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
+      end
+      private :rodeo_conformant_uri_exists?
+      ##
+      # @return [Array<String>] the paths to each of the images split off from the PDF.
+      def split_files
+        DerivativeRodeo::Generators::PdfSplitGenerator.new(
+          input_uris: [input_uri],
+          output_location_template: output_location_template,
+          preprocessed_location_template: preprocessed_location_template
+        ).generated_files.map(&:file_path)
+      rescue => e
+        message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
+                  "input_uri: #{input_uri.inspect}, " \
+                  "output_location_template: #{output_location_template.inspect}, and " \
+                  "preprocessed_location_template: #{preprocessed_location_template.inspect}."
+        exception = RuntimeError.new(message)
+        exception.set_backtrace(e.backtrace)
+        raise exception
+      end
+    end
+  end
+end

data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module IiifPrint
+  module SplitPdfs
+    ## Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
+    class DestroyPdfChildWorksService
+      ## @api public
+      # @param file_set [FileSet] What is the containing file set for the provided file.
+      # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
+      def self.conditionally_destroy_spawned_children_of(file_set:, work:)
+        child_model = work.try(:iiif_print_config)&.pdf_split_child_model
+        return unless child_model
+        return unless file_set.class.pdf_mime_types.include?(file_set.mime_type)
+        IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id).find_each(&:destroy)
+        destroy_spawned_children(model: child_model, file_set: file_set, work: work)
+      end
+      private_class_method def self.destroy_spawned_children(model:, file_set:, work:)
+        # look first for children by the file set id they were split from
+        children = model.where(split_from_pdf_id: file_set.id)
+        if children.blank?
+          # find works where file name and work `to_param` are both in the title
+          children = model.where(title: file_set.label).where(title: work.to_param)
+        end
+        return if children.blank?
+        children.each do |rcd|
+          rcd.destroy(eradicate: true)
+        end
+      end
+    end
+  end
+end

data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module IiifPrint
+  module SplitPdfs
+    # @abstract
+    #
+    # The purpose of this class is to split the PDF into constituent jpg files.
+    #
+    # @see #each
+    class PagesToJpgsSplitter < BaseSplitter
+      self.image_extension = 'jpg'
+      self.quality = '50'
+      private
+      def gsdevice
+        'jpeg'
+      end
+    end
+  end
+end

data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb ADDED Viewed

@@ -0,0 +1,26 @@
+module IiifPrint
+  module SplitPdfs
+    # @abstract
+    #
+    # The purpose of this class is to split the PDF into constituent png files.
+    #
+    # @see #each
+    class PagesToPngsSplitter < BaseSplitter
+      self.image_extension = 'png'
+      private
+      def gsdevice
+        color, _channels, bpc = pdfinfo.color
+        device = nil
+        # 1 Bit Grayscale, if applicable:
+        device = 'pngmonod' if color == 'gray' && bpc == 1
+        # 8 Bit Grayscale, if applicable:
+        device = 'pnggray' if color == 'gray' && bpc > 1
+        # otherwise 24 Bit RGB:
+        device = 'png16m' if device.nil?
+        device
+      end
+    end
+  end
+end

data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module IiifPrint
+  module SplitPdfs
+    # The purpose of this class is to split the PDF into constituent TIFF files.
+    #
+    # @see #each
+    class PagesToTiffsSplitter < BaseSplitter
+      self.image_extension = 'tiff'
+      DEFAULT_COMPRESSION = 'lzw'.freeze
+      self.compression = DEFAULT_COMPRESSION
+      private
+      def gsdevice
+        color, channels, bpc = pdfinfo.color
+        device = nil
+        if color == 'gray'
+          # CCITT Group 4 Black and White, if applicable:
+          if bpc == 1
+            device = 'tiffg4'
+            self.compression = 'g4'
+          elsif bpc > 1
+            # 8 Bit Grayscale, if applicable:
+            device = 'tiffgray'
+          end
+        end
+        # otherwise color:
+        device = colordevice(channels, bpc) if device.nil?
+        device
+      end
+      def colordevice(channels, bpc)
+        bits = bpc * channels
+        # will be either 8bpc/16bpc color TIFF,
+        #   with any CMYK source transformed to 8bpc RGB
+        bits = 24 unless [24, 48].include? bits
+        "tiff#{bits}nc"
+      end
+    end
+  end
+end

data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb CHANGED Viewed

@@ -8,78 +8,83 @@ module IiifPrint
     # For dpi extraction, falls back to calculating using MiniMagick,
     #   if necessary.
     class PdfImageExtractionService
-      # class constant column numbers
-      COL_WIDTH = 3
-      COL_HEIGHT = 4
-      COL_COLOR = 5
-      COL_CHANNELS = 6
-      COL_BITS = 7
-      # only poppler 0.25+ has this column in output:
-      COL_XPPI = 12
       def initialize(path)
         @path = path
-        @cmd = format('pdfimages -list %<path>s', path: path)
-        @output = nil
-        @entries = nil
+        process(command: format('pdfimages -list %<path>s 2>/dev/null', path: path))
       end
-      def process
-        # call just once
-        if @output.nil?
-          Open3.popen3(@cmd) do |_stdin, stdout, _stderr, _wait_thr|
-            @output = stdout.read.split("\n")
-          end
-        end
-        @output.slice(2, @output.size - 1)
-      end
+      attr_reader :path, :page_count, :width, :height, :pixels_per_inch
+      alias ppi pixels_per_inch
-      def entries
-        if @entries.nil?
-          @entries = []
-          output = process
-          (0..output.size - 1).each do |i|
-            @entries.push(output[i].gsub(/\s+/m, ' ').strip.split(" "))
-          end
-        end
-        @entries
+      # @return [Array<String, Integer, Integer>]
+      def color
+        [@color_description, @channels, @bits]
       end
-      def selectcolumn(i, &block)
-        result = entries.map { |e| e[i] }
-        return result.map!(&block) if block_given?
-        result
-      end
+      private
-      def width
-        selectcolumn(COL_WIDTH, &:to_i).max
-      end
+      # class constant column numbers
+      COL_WIDTH = 3
+      COL_HEIGHT = 4
+      COL_COLOR_DESC = 5
+      COL_CHANNELS = 6
+      COL_BITS = 7
+      # only poppler 0.25+ has this column in output:
+      COL_XPPI = 12
-      def height
-        selectcolumn(COL_HEIGHT, &:to_i).max
-      end
+      # rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
+      # rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
+      # rubocop:disable Metrics/CyclomaticComplexity
+      # rubocop:disable Metrics/PerceivedComplexity
+      #
+      # The first two lines are tabular header information:
+      #
+      # Example:
+      #
+      #   bash-5.1$ pdfimages -list fmc_color.pdf  | head -5
+      #   page   num  type   width height color comp bpc  enc interp  object ID x-ppi y-ppi size ratio
+      #   --------------------------------------------------------------------------------------------
+      #   1     0 image    2475   413  rgb     3   8  jpeg   no        10  0   300   300 21.8K 0.7%
+      def process(command:)
+        @page_count = 0
+        @color_description = 'gray'
+        @width = 0
+        @height = 0
+        @channels = 0
+        @bits = 0
+        @pixels_per_inch = 0
+        Open3.popen3(command) do |_stdin, stdout, _stderr, _wait_thr|
+          stdout.read.split("\n").each_with_index do |line, index|
+            # Skip the two header lines
+            next if index <= 1
+            @page_count += 1
+            cells = line.gsub(/\s+/m, ' ').strip.split(" ")
-      def color
-        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
-        #   so caller may want all of this information, and in case of
-        #   mixed color spaces across images, this returns maximum
-        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
-        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
-        bits = entries.map { |e| e[COL_BITS].to_i }.max
-        [desc, channels, bits]
-      end
+            @color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
+            @width = cells[COL_WIDTH].to_i if cells[COL_WIDTH].to_i > @width
+            @height = cells[COL_HEIGHT].to_i if cells[COL_HEIGHT].to_i > @height
+            @channels = cells[COL_CHANNELS].to_i if cells[COL_CHANNELS].to_i > @channels
+            @bits = cells[COL_BITS].to_i if cells[COL_BITS].to_i > @bits
-      def ppi
-        if entries[0].size <= 12
-          # poppler < 0.25
-          pdf = MiniMagick::Image.open(@path)
-          width_points = pdf.width
-          width_px = width
-          return (72 * width_px / width_points).to_i
+            # In the case of poppler version < 0.25, we will have no more than 12 columns.  As such,
+            # we need to do some alternative magic to calculate this.
+            if @page_count == 1 && cells.size <= 12
+              pdf = MiniMagick::Image.open(@path)
+              width_points = pdf.width
+              width_px = width
+              @pixels_per_inch = (72 * width_px / width_points).to_i
+            elsif cells[COL_XPPI].to_i > @pixels_per_inch
+              # By the magic of nil#to_i if we don't have more than 12 columns, we've already set
+              # the @pixels_per_inch and this line won't do much of anything.
+              @pixels_per_inch = cells[COL_XPPI].to_i
+            end
+          end
         end
-        # with poppler 0.25+, pdfimages just gives us this:
-        selectcolumn(COL_XPPI, &:to_i).max
       end
+      # rubocop:enable Metrics/AbcSize
+      # rubocop:enable Metrics/MethodLength
+      # rubocop:enable Metrics/CyclomaticComplexity
+      # rubocop:enable Metrics/PerceivedComplexity
     end
   end
 end

data/lib/iiif_print/text_extraction/hocr_reader.rb CHANGED Viewed

@@ -84,6 +84,7 @@ module IiifPrint
           # add trailing space to plaintext buffer for between words:
           @text += ' '
           @words.push(@current) if word_complete?
+          @current = nil # clear the current word
         end
         def end_line
@@ -120,9 +121,12 @@ module IiifPrint
         #   for current word, and append line endings to plain text:
         #
         # @param name [String] element name.
-        def end_element(_name)
-          end_line if @element_class_name == 'ocr_line'
-          end_word if @element_class_name == 'ocrx_word'
+        def end_element(name)
+          if name == 'span'
+            end_word if @element_class_name == 'ocrx_word'
+            @text += "\n" if @element_class_name.nil?
+          end
+          @element_class_name = nil
         end
         # Callback for completion of parsing hOCR, used to normalize generated

data/lib/iiif_print/text_extraction/page_ocr.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module IiifPrint
     class PageOCR
       attr_accessor :html, :path
-      def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
+      def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
         @path = path
         # hOCR html:
         @html = nil
@@ -17,13 +17,14 @@ module IiifPrint
         @source_meta = nil
         @box = nil
         @plain = nil
-        @additional_tessearct_options = additional_tessearct_options
+        @additional_tesseract_options = additional_tesseract_options
       end
       def run_ocr
         outfile = File.join(Dir.mktmpdir, 'output_html')
-        cmd = "tesseract #{path} #{outfile} hocr"
-        cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
+        cmd = "OMP_THREAD_LIMIT=1 tesseract #{path} #{outfile}"
+        cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
+        cmd += " hocr"
         `#{cmd}`
         outfile + '.hocr'
       end

data/lib/iiif_print/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module IiifPrint
-  VERSION = '1.0.0'.freeze
+  VERSION = '1.1.0'.freeze
 end