RubyGems - mindee - Versions diffs - 3.12.0 → 3.14.0 - Mend

mindee 3.12.0 → 3.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

data/lib/mindee/client.rb CHANGED Viewed

@@ -17,13 +17,16 @@ module Mindee
     # Call prediction API on a document and parse the results.
     #
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint] Endpoint of the API
     # Doesn't need to be set in the case of OTS APIs.
     #
     # @param all_words [Boolean] Whether to include the full text for each page.
     #  This performs a full OCR operation on the server and will increase response time.
     #
+    # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
+    #  This performs a full OCR operation on the server and may increase response time.
+    #
     # @param close_file [Boolean] Whether to `close()` the file after parsing it.
     #  Set to false if you need to access the file after this operation.
     #
@@ -45,6 +48,7 @@ module Mindee
       product_class,
       endpoint: nil,
       all_words: false,
+      full_text: false,
       close_file: true,
       page_options: nil,
       cropper: false
@@ -53,20 +57,23 @@ module Mindee
         input_source.process_pdf(page_options)
       end
       endpoint = initialize_endpoint(product_class) if endpoint.nil?
-      prediction, raw_http = endpoint.predict(input_source, all_words, close_file, cropper)
+      prediction, raw_http = endpoint.predict(input_source, all_words, full_text, close_file, cropper)
       Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
     end
     # Enqueue a document for async parsing
     #
+    # @param product_class [Mindee::Inference] class of the product
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
     # Doesn't need to be set in the case of OTS APIs.
     #
     # @param all_words [Boolean] Whether to extract all the words on each page.
     #  This performs a full OCR operation on the server and will increase response time.
     #
+    # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
+    #  This performs a full OCR operation on the server and may increase response time.
+    #
     # @param close_file [Boolean] Whether to `close()` the file after parsing it.
     #  Set to false if you need to access the file after this operation.
     #
@@ -88,6 +95,7 @@ module Mindee
       product_class,
       endpoint: nil,
       all_words: false,
+      full_text: false,
       close_file: true,
       page_options: nil,
       cropper: false
@@ -96,7 +104,7 @@ module Mindee
         input_source.process_pdf(page_options)
       end
       endpoint = initialize_endpoint(product_class) if endpoint.nil?
-      prediction, raw_http = endpoint.predict_async(input_source, all_words, close_file, cropper)
+      prediction, raw_http = endpoint.predict_async(input_source, all_words, full_text, close_file, cropper)
       Mindee::Parsing::Common::ApiResponse.new(product_class,
                                                prediction, raw_http)
     end
@@ -104,7 +112,7 @@ module Mindee
     # Parses a queued document
     #
     # @param job_id [String] Id of the job (queue) to poll from
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
     # Doesn't need to be set in the case of OTS APIs.
     #
@@ -123,11 +131,13 @@ module Mindee
     # Enqueue a document for async parsing and automatically try to retrieve it
     #
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
     #   Doesn't need to be set in the case of OTS APIs.
     # @param all_words [Boolean] Whether to extract all the words on each page.
     #   This performs a full OCR operation on the server and will increase response time.
+    # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
+    #  This performs a full OCR operation on the server and may increase response time.
     # @param close_file [Boolean] Whether to `close()` the file after parsing it.
     #   Set to false if you need to access the file after this operation.
     # @param page_options [Hash, nil] Page cutting/merge options:
@@ -147,6 +157,7 @@ module Mindee
       product_class,
       endpoint: nil,
       all_words: false,
+      full_text: false,
       close_file: true,
       page_options: nil,
       cropper: false,
@@ -159,6 +170,7 @@ module Mindee
         product_class,
         endpoint: endpoint,
         all_words: all_words,
+        full_text: full_text,
         close_file: close_file,
         page_options: page_options,
         cropper: cropper
@@ -184,7 +196,7 @@ module Mindee
     # Load a prediction.
     #
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param local_response [Mindee::Input::LocalResponse]
     # @return [Mindee::Parsing::Common::ApiResponse]
     def load_prediction(product_class, local_response)
@@ -269,7 +281,7 @@ module Mindee
     end
     # Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     #
     # @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
     #  API Builder. Do not set for standard (off the shelf) endpoints.

data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb RENAMED Viewed

@@ -4,7 +4,7 @@ require_relative '../../input/sources'
 module Mindee
   # Image Extraction Module.
-  module ImageExtraction
+  module Extraction
     # Generic class for image extraction.
     class ExtractedImage
       # Id of the page the image was extracted from.

data/lib/mindee/extraction/common/image_extractor.rb ADDED Viewed

@@ -0,0 +1,192 @@
+# frozen_string_literal: true
+require 'mini_magick'
+require 'origami'
+require 'stringio'
+require 'tempfile'
+require_relative '../../input/sources'
+require_relative 'extracted_image'
+module Mindee
+  # Image Extraction Module.
+  module Extraction
+    # Image Extraction wrapper class.
+    class ImageExtractor
+      # Reads an image and converts it into a new PDF document.
+      #
+      # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
+      # @return [Origami::PDF] A PdfDocument handle.
+      def self.attach_image_as_new_file(input_buffer)
+        magick_image = MiniMagick::Image.read(input_buffer)
+        # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
+        # converted.
+        magick_image.format('jpg')
+        original_density = magick_image.resolution
+        # The density has to be reduced before the PDF conversion, otherwise the resulting image shrinks.
+        # NOTE(review): the reason is undocumented; 4.166666 is presumably 300 / 72 — confirm.
+        scale_factor = original_density[0].to_f / 4.166666
+        magick_image.format('pdf', 0, { density: scale_factor.to_s })
+        Origami::PDF.read(StringIO.new(magick_image.to_blob))
+      end
+      # Extracts multiple images from a given local input source.
+      #
+      # @param [Mindee::Input::Source::LocalInputSource] input_source
+      # @param [Integer] page_id ID of the Page to extract from.
+      # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
+      # to extract.
+      # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
+      def self.extract_multiple_images_from_source(input_source, page_id, polygons)
+        new_stream = load_doc(input_source, page_id)
+        new_stream.seek(0)
+        extract_images_from_polygons(input_source, new_stream, page_id, polygons)
+      end
+      # Retrieves a PDF document's page.
+      #
+      # @param [Origami::PDF] pdf_doc Origami PDF handle.
+      # @param [Integer] page_id Page ID.
+      def self.get_page(pdf_doc, page_id)
+        stream = StringIO.new
+        pdf_doc.save(stream)
+        options = {
+          page_indexes: [page_id - 1],
+        }
+        Mindee::PDF::PdfProcessor.parse(stream, options)
+      end
+      # Extracts images from their positions on a file (as polygons).
+      #
+      # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
+      # @param [StringIO] pdf_stream Buffer of the PDF.
+      # @param [Integer] page_id Page ID.
+      # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
+      # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
+      def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
+        extracted_elements = []
+        polygons.each_with_index do |polygon, element_id|
+          polygon = normalize_polygon(polygon)
+          page_content = read_page_content(pdf_stream)
+          min_max_x = Geometry.get_min_max_x([
+                                               polygon.top_left,
+                                               polygon.bottom_right,
+                                               polygon.top_right,
+                                               polygon.bottom_left,
+                                             ])
+          min_max_y = Geometry.get_min_max_y([
+                                               polygon.top_left,
+                                               polygon.bottom_right,
+                                               polygon.top_right,
+                                               polygon.bottom_left,
+                                             ])
+          file_extension = determine_file_extension(input_source)
+          cropped_image = crop_image(page_content, min_max_x, min_max_y)
+          if file_extension == 'pdf'
+            cropped_image.format('jpg')
+          else
+            cropped_image.format(file_extension)
+          end
+          buffer = StringIO.new
+          write_image_to_buffer(cropped_image, buffer)
+          file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
+          extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
+        end
+        extracted_elements
+      end
+      # Retrieves the bounding box of a polygon.
+      #
+      # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
+      def self.normalize_polygon(polygon)
+        if polygon.is_a?(Mindee::Geometry::Polygon)
+          Mindee::Geometry.get_bounding_box(polygon)
+        else
+          polygon
+        end
+      end
+      # Loads a buffer into a MiniMagick Image.
+      #
+      # @param [StringIO] pdf_stream Buffer containg the PDF
+      # @return [MiniMagick::Image] a valid MiniMagick image handle.
+      def self.read_page_content(pdf_stream)
+        pdf_stream.rewind
+        MiniMagick::Image.read(pdf_stream)
+      end
+      # Crops a MiniMagick Image from a the given bounding box.
+      #
+      # @param [MiniMagick::Image] image Input Image.
+      # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
+      # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
+      def self.crop_image(image, min_max_x, min_max_y)
+        width = image[:width].to_i
+        height = image[:height].to_i
+        image.format('jpg')
+        new_width = (min_max_x.max - min_max_x.min) * width
+        new_height = (min_max_y.max - min_max_y.min) * height
+        image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
+        image
+      end
+      # Writes a MiniMagick::Image to a buffer by delegating to the image's own `write` method.
+      #
+      # @param [MiniMagick::Image] image a valid MiniMagick image.
+      # @param [StringIO] buffer Buffer receiving the image data.
+      def self.write_image_to_buffer(image, buffer)
+        image.write(buffer)
+      end
+      # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
+      #
+      # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
+      # @return [String] A valid file extension.
+      def self.determine_file_extension(input_source)
+        if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')
+          'jpg'
+        else
+          File.extname(input_source.filename).strip.downcase[1..]
+        end
+      end
+      # Generates an ExtractedImage.
+      #
+      # @param [StringIO] buffer Buffer containing the image.
+      # @param [String] file_name Name for the file.
+      # @param [Object] page_id ID of the page the file was generated from.
+      # @param [Object] element_id ID of the element of a given page.
+      def self.create_extracted_image(buffer, file_name, page_id, element_id)
+        buffer.rewind
+        ExtractedImage.new(
+          Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
+          page_id,
+          element_id
+        )
+      end
+      # Loads a single_page from an image file or a pdf document.
+      #
+      # @param input_file [LocalInputSource] Local input.
+      # @param [Integer] page_id Page ID.
+      # @return [MiniMagick::Image] A valid PdfDocument handle.
+      def self.load_doc(input_file, page_id)
+        input_file.io_stream.rewind
+        if input_file.pdf?
+          get_page(Origami::PDF.read(input_file.io_stream), page_id)
+        else
+          input_file.io_stream
+        end
+      end
+    end
+  end
+end

data/lib/mindee/{image_extraction → extraction}/common.rb RENAMED Viewed

@@ -1,3 +1,4 @@
 # frozen_string_literal: true
+require_relative 'common/extracted_image'
 require_relative 'common/image_extractor'

data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb ADDED Viewed

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+require_relative '../common/image_extractor'
+module Mindee
+  # Image Extraction Module.
+  module Extraction
+    # Multi-receipts extraction class wrapper.
+    class MultiReceiptsExtractor
+      def self.extract_receipts(input_source, inference)
+        # Extracts individual receipts from multi-receipts documents.
+        #
+        # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
+        # @param inference [Inference] Results of the inference.
+        # @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
+        images = []
+        raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts
+        (0...input_source.count_pdf_pages).each do |page_id|
+          receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box)
+          images.concat(
+            Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1,
+                                                                                   receipt_positions)
+          )
+        end
+        images
+      end
+    end
+  end
+end

data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+module Mindee
+  # Pdf Extraction Module.
+  module Extraction
+    module PdfExtractor
+      # An extracted sub-Pdf.
+      class ExtractedPdf
+        # Byte contents of the pdf, held as an IO-like object (`as_input_source` calls `read` on it).
+        # NOTE(review): `StreamIO` is not a Ruby class; presumably StringIO is meant — confirm.
+        # @return [StreamIO]
+        attr_reader :pdf_bytes
+        # Name of the file.
+        # @return [String]
+        attr_reader :filename
+        # Stores the given PDF content and file name as-is.
+        # @param pdf_bytes [StreamIO] IO-like object holding the PDF (presumably a StringIO — confirm).
+        # @param filename [String] Name of the file.
+        def initialize(pdf_bytes, filename)
+          @pdf_bytes = pdf_bytes
+          @filename = filename
+        end
+        # Retrieves the page count for a given pdf.
+        # @return [Integer]
+        def page_count
+          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
+          current_pdf.pages.size
+        rescue TypeError
+          raise 'Could not retrieve page count from Extracted PDF object.'
+        end
+        # Writes the contents of the current PDF object to a file.
+        # @param output_path [String] Path to write to.
+        def write_to_file(output_path)
+          raise 'Provided path is not a file' if File.directory?(destination)
+          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))
+          if File.extname(output_path).downcase == '.pdf'
+            base_path = File.expand_path('..', output_path)
+            output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path)
+          end
+          File.write(output_path, @pdf_bytes)
+        end
+        # Returns the current PDF object as a usable BytesInputSource, built from whatever `read` returns on the
+        # stored stream and from the stored filename.
+        # NOTE(review): the stream is not rewound before being read — confirm callers leave it at the start.
+        # @return [Mindee::Input::Source::BytesInputSource]
+        def as_input_source
+          Mindee::Input::Source::BytesInputSource.new(@pdf_bytes.read, @filename)
+        end
+      end
+    end
+  end
+end

data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb ADDED Viewed

@@ -0,0 +1,111 @@
+# frozen_string_literal: true
+module Mindee
+  # Pdf Extraction Module.
+  module Extraction
+    # Pdf Extraction namespace.
+    module PdfExtractor
+      # Pdf extraction class.
+      class PdfExtractor
+        # @param local_input [Mindee::Input::Source::LocalInputSource]
+        def initialize(local_input)
+          @filename = local_input.filename
+          if local_input.pdf?
+            @source_pdf = local_input.io_stream
+          else
+            pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
+            io_buffer = StringIO.new
+            pdf_image.save(io_buffer)
+            @source_pdf = io_buffer
+          end
+        end
+        # Retrieves the page count for the Pdf object.
+        # The source PDF is opened through `Mindee::PDF::PdfProcessor.open_pdf` on every call.
+        # @return [Integer]
+        def page_count
+          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
+        end
+        # Creates a new Pdf from pages and save it into a buffer.
+        # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
+        # @return [StreamIO] The buffer containing the new Pdf.
+        def cut_pages(page_indexes)
+          options = {
+            page_indexes: page_indexes,
+          }
+          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
+        end
+        # Extract the sub-documents from the main pdf, based on the given list of page indexes.
+        # @param page_indexes [Array<Array<Integer>>] List of page number to use for merging in the original Pdf.
+        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The buffer containing the new Pdf.
+        def extract_sub_documents(page_indexes)
+          extracted_pdfs = []
+          extension = File.extname(@filename)
+          basename = File.basename(@filename, extension)
+          page_indexes.each do |page_index_list|
+            if page_index_list.empty? || page_index_list.nil?
+              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
+            end
+            page_index_list.each do |page_index|
+              raise "Index #{page_index} is out of range." if (page_index > page_count) || page_index.negative?
+            end
+            formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s
+            field_filename = "#{basename}_#{format('%03d',
+                                                   (page_index_list[0] + 1))}-#{formatted_max_index}#{extension}"
+            extracted_pdf = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list),
+                                                                               field_filename)
+            extracted_pdfs << extracted_pdf
+          end
+          extracted_pdfs
+        end
+        # rubocop:disable Metrics/CyclomaticComplexity
+        # rubocop:disable Metrics/PerceivedComplexity
+        # Extracts invoices as complete PDFs from the document.
+        #
+        # Plain lists of indexes are forwarded as-is to `extract_sub_documents`. Page groups are forwarded through
+        # their `page_indexes`, unless `strict` is set: groups are then re-arranged beforehand according to their
+        # confidence, using 0.5 as the threshold.
+        #
+        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>] Lists of page indexes, or page
+        #   groups. Only the first element is inspected to tell which of the two was given.
+        # @param strict [Boolean] Whether to take the confidence of the page groups into account.
+        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
+        # @raise [RuntimeError] if `page_indexes` is empty.
+        def extract_invoices(page_indexes, strict: false)
+          raise 'No indexes provided.' if page_indexes.empty?
+          # Anything other than page groups is treated as ready-to-use lists of indexes.
+          unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
+            return extract_sub_documents(page_indexes)
+          end
+          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict
+          correct_page_indexes = []
+          current_list = []
+          previous_confidence = nil
+          page_indexes.each_with_index do |page_index, i|
+            confidence = page_index.confidence
+            page_list = page_index.page_indexes
+            if confidence >= 0.5 && previous_confidence.nil?
+              # First group, confident: it becomes the pending list.
+              # NOTE(review): if this is also the only group, nothing appears to be added to the result — confirm.
+              current_list = page_list
+            elsif confidence >= 0.5 && i < page_indexes.length - 1
+              # Confident group that is not the last one: store the pending list and start a new one.
+              correct_page_indexes << current_list
+              current_list = page_list
+            elsif confidence < 0.5 && i == page_indexes.length - 1
+              # Last group, not confident: its pages are appended to the pending list, which is then stored.
+              # NOTE(review): `current_list` may be a group's own `page_indexes` array, in which case `concat`
+              # appears to modify that group in place — confirm this is acceptable.
+              current_list.concat page_list
+              correct_page_indexes << current_list
+            else
+              # Remaining cases: store the pending list, then this group's pages as a list of their own.
+              correct_page_indexes << current_list
+              correct_page_indexes << page_list
+            end
+            previous_confidence = confidence
+          end
+          extract_sub_documents(correct_page_indexes)
+        end
+        # rubocop:enable Metrics/CyclomaticComplexity
+        # rubocop:enable Metrics/PerceivedComplexity
+        private
+        attr_reader :source_pdf, :filename
+      end
+    end
+  end
+end

data/lib/mindee/extraction/pdf_extractor.rb ADDED Viewed

@@ -0,0 +1,4 @@
+# frozen_string_literal: true
+require_relative 'pdf_extractor/pdf_extractor'
+require_relative 'pdf_extractor/extracted_pdf'