mindee 3.14.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '043953eb35ed9f251d12ca8b984edcf9acbeb491689574450dc3682b83e9db37'
4
- data.tar.gz: 5916ad8aeada6713ee5a846d634693fd1ed5e585495238649f9a9c1d2e9fbd00
3
+ metadata.gz: 9d356c6733d8a7d00973b219dbae06199040ca8d4bece4eb3906c8ec873aebf0
4
+ data.tar.gz: ab240a95c8538891aa4a3ef48285903daa06cebaf13f9578eff1a9675258d3bb
5
5
  SHA512:
6
- metadata.gz: 7edd036d9666e1ffe0318b8ec68cdfe5b37b6bdbb90e36ae0c19efa5cc038f109af30d9e6df6838e4237db2f9495a6251f10164db7e0c7dff528720ef4477a2a
7
- data.tar.gz: 7dfbb27c5175bda760ba816be62b1447acf4517d6741506cb458bbe4b2b4c77338eff1d799d19f4b0bdca663323523eb479bc468e80f9ed1943226694ef04143
6
+ metadata.gz: ec71145b9604ba30c77a842a33c89c1ad4ab4c70301c3eed2292bc95803dd112ee99c964289ca88c16ddffcb6a37f63130b30a23326b4359929791d0dcef4214
7
+ data.tar.gz: 0c20c191f6abe4166075a1745860ba500a488294bcb59e2a28e0b61a3bcee07a25be2adfef113d045727eeaf10f935278271a7d259f78d63e580bf8eda3833f3
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # Mindee Ruby API Library Changelog
2
2
 
3
+ ## v3.15.0 - 2024-10-29
4
+ ### Changes
5
+ * :sparkles: add support for image compression
6
+ * :sparkles: add support for PDF compression
7
+ ### Fixes
8
+ * :recycle: refactor pdf & image namespaces
9
+ * :memo: fix rubocop directives unexpectedly appearing in Yard documentation
10
+ * :arrow_up: bump version for mini_magick
11
+
12
+
3
13
  ## v3.14.0 - 2024-10-11
4
14
  ### Changes
5
15
  * :sparkles: add support for Financial Document v1.10
data/lib/mindee/client.rb CHANGED
@@ -128,6 +128,7 @@ module Mindee
128
128
  end
129
129
 
130
130
  # rubocop:disable Metrics/ParameterLists
131
+
131
132
  # Enqueue a document for async parsing and automatically try to retrieve it
132
133
  #
133
134
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
@@ -54,7 +54,6 @@ module Mindee
54
54
  image = MiniMagick::Image.read(@buffer)
55
55
  image.format file_format.downcase
56
56
  image.write resolved_path.to_s
57
- logger.info("File saved successfully to '#{resolved_path}'.")
58
57
  rescue TypeError
59
58
  raise 'Invalid path/filename provided.'
60
59
  rescue StandardError
@@ -11,8 +11,8 @@ module Mindee
11
11
  # Image Extraction Module.
12
12
  module Extraction
13
13
  # Image Extraction wrapper class.
14
- class ImageExtractor
15
- def self.attach_image_as_new_file(input_buffer)
14
+ module ImageExtractor
15
+ def self.attach_image_as_new_file(input_buffer, format: 'jpg')
16
16
  # Attaches an image as a new page in a PdfDocument object.
17
17
  #
18
18
  # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
@@ -21,9 +21,9 @@ module Mindee
21
21
  magick_image = MiniMagick::Image.read(input_buffer)
22
22
  # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
23
23
  # converted.
24
- magick_image.format('jpg')
24
+ magick_image.format(format)
25
25
  original_density = magick_image.resolution
26
- scale_factor = original_density[0].to_f / 4.166666 # No clue why bit the resolution needs to be reduced for
26
+ scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
27
27
  # the pdf otherwise the resulting image shrinks.
28
28
  magick_image.format('pdf', 0, { density: scale_factor.to_s })
29
29
  Origami::PDF.read(StringIO.new(magick_image.to_blob))
@@ -37,27 +37,12 @@ module Mindee
37
37
  # to extract.
38
38
  # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
39
39
  def self.extract_multiple_images_from_source(input_source, page_id, polygons)
40
- new_stream = load_doc(input_source, page_id)
40
+ new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
41
41
  new_stream.seek(0)
42
42
 
43
43
  extract_images_from_polygons(input_source, new_stream, page_id, polygons)
44
44
  end
45
45
 
46
- # Retrieves a PDF document's page.
47
- #
48
- # @param [Origami::PDF] pdf_doc Origami PDF handle.
49
- # @param [Integer] page_id Page ID.
50
- def self.get_page(pdf_doc, page_id)
51
- stream = StringIO.new
52
- pdf_doc.save(stream)
53
-
54
- options = {
55
- page_indexes: [page_id - 1],
56
- }
57
-
58
- Mindee::PDF::PdfProcessor.parse(stream, options)
59
- end
60
-
61
46
  # Extracts images from their positions on a file (as polygons).
62
47
  #
63
48
  # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
@@ -179,10 +164,10 @@ module Mindee
179
164
  # @param input_file [LocalInputSource] Local input.
180
165
  # @param [Integer] page_id Page ID.
181
166
  # @return [MiniMagick::Image] A valid PdfDocument handle.
182
- def self.load_doc(input_file, page_id)
167
+ def self.load_input_source_pdf_page_as_image(input_file, page_id)
183
168
  input_file.io_stream.rewind
184
169
  if input_file.pdf?
185
- get_page(Origami::PDF.read(input_file.io_stream), page_id)
170
+ Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
186
171
  else
187
172
  input_file.io_stream
188
173
  end
@@ -65,6 +65,7 @@ module Mindee
65
65
 
66
66
  # rubocop:disable Metrics/CyclomaticComplexity
67
67
  # rubocop:disable Metrics/PerceivedComplexity
68
+
68
69
  # Extracts invoices as complete PDFs from the document.
69
70
  # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
70
71
  # @param strict [Boolean]
@@ -99,6 +100,7 @@ module Mindee
99
100
  end
100
101
  extract_sub_documents(correct_page_indexes)
101
102
  end
103
+
102
104
  # rubocop:enable Metrics/CyclomaticComplexity
103
105
  # rubocop:enable Metrics/PerceivedComplexity
104
106
 
@@ -271,6 +271,7 @@ module Mindee
271
271
  end
272
272
  candidates
273
273
  end
274
+
274
275
  # rubocop:enable Metrics/CyclomaticComplexity
275
276
  # rubocop:enable Metrics/PerceivedComplexity
276
277
 
@@ -10,9 +10,10 @@ module Mindee
10
10
  # @return [Float]
11
11
  attr_accessor :y
12
12
 
13
+ # rubocop:disable Naming/MethodParameterName
14
+
13
15
  # @param x [Float]
14
16
  # @param y [Float]
15
- # rubocop:disable Naming/MethodParameterName
16
17
  def initialize(x, y)
17
18
  @x = x
18
19
  @y = y
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image processing module.
5
+ module Image
6
+ # Image compressor module to handle image compression.
7
+ module ImageCompressor
8
+ # Resize and/or compress an image. This assumes the ratio was provided beforehand.
9
+ # @param image [MiniMagick::Image, StringIO] Input image.
10
+ # @param quality [Integer, nil] Quality of the final file.
11
+ # @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
12
+ # @param max_height [Integer, nil] Maximum height. If not specified, the vertical ratio will remain the same.
13
+ # @return [StringIO]
14
+ def self.compress_image(image, quality: 85, max_width: nil, max_height: nil)
15
+ processed_image = ImageUtils.to_image(image)
16
+ processed_image.format 'jpg'
17
+ final_width, final_height = ImageUtils.calculate_new_dimensions(
18
+ processed_image,
19
+ max_width: max_width,
20
+ max_height: max_height
21
+ )
22
+ ImageUtils.resize_image(processed_image, final_width, final_height) if final_width || final_height
23
+ ImageUtils.compress_image_quality(processed_image, quality)
24
+
25
+ ImageUtils.image_to_stringio(processed_image)
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image processing module.
5
+ module Image
6
+ # Miscellaneous image operations.
7
+ module ImageUtils
8
+ # Resizes a provided MiniMagick Image with the given width & height, if present.
9
+ # @param image [MiniMagick::Image] MiniMagick image handle.
10
+ # @param width [Integer] Width to comply with.
11
+ # @param height [Integer] Height to comply with.
12
+ def self.resize_image(image, width, height)
13
+ if width && height
14
+ image.resize "#{width}x#{height}"
15
+ elsif width
16
+ image.resize width.to_s
17
+ elsif height
18
+ image.resize "x#{height}"
19
+ end
20
+ end
21
+
22
+ # Compresses the quality of the provided MiniMagick image.
23
+ # @param image [MiniMagick::Image] MiniMagick image handle.
24
+ # @param quality [Integer] Quality to apply to the image. This is independent of a JPG's base quality.
25
+ def self.compress_image_quality(image, quality)
26
+ image.quality quality.to_s
27
+ end
28
+
29
+ # Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
30
+ # @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
31
+ # @return [MiniMagick::Image]
32
+ def self.to_image(image)
33
+ if image.respond_to?(:read) && image.respond_to?(:rewind)
34
+ image.rewind
35
+ MiniMagick::Image.read(image)
36
+ elsif image.is_a?(MiniMagick::Image)
37
+ image
38
+ else
39
+ raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
40
+ end
41
+ end
42
+
43
+ # Converts a MiniMagick image into a StringIO containing its encoded data.
44
+ # @param image [MiniMagick::Image] the input image.
45
+ # @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
46
+ # @return [StringIO]
47
+ def self.image_to_stringio(image, format = 'JPEG')
48
+ image.format format
49
+ blob = image.to_blob
50
+ stringio = StringIO.new(blob)
51
+ stringio.rewind
52
+
53
+ stringio
54
+ end
55
+
56
+ # Computes and returns the new dimensions for a given MiniMagick image, scaled to fit within the
57
+ # provided bounds.
58
+ # @param [MiniMagick::Image] original Input MiniMagick image.
59
+ # @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
60
+ # @param max_height [Integer, nil] Maximum height. If not specified, the vertical ratio will remain the same.
61
+ def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
62
+ raise 'Provided image could not be processed for resizing.' if original.nil?
63
+
64
+ return [original.width, original.height] if max_width.nil? && max_height.nil?
65
+
66
+ width_ratio = max_width ? max_width.to_f / original.width : Float::INFINITY
67
+ height_ratio = max_height ? max_height.to_f / original.height : Float::INFINITY
68
+
69
+ scale_factor = [width_ratio, height_ratio].min
70
+
71
+ new_width = (original.width * scale_factor).to_i
72
+ new_height = (original.height * scale_factor).to_i
73
+
74
+ [new_width, new_height]
75
+ end
76
+
77
+ # Computes the Height & Width from a page's media box. Falls back to the size of the initial image.
78
+ # @param image [MiniMagick::Image] The initial image that will fit into the page.
79
+ # @param media_box [Array<Integer>, nil]
80
+ # @return [Array<Integer>]
81
+ def self.calculate_dimensions_from_media_box(image, media_box)
82
+ if !media_box.nil? && media_box.any?
83
+ [
84
+ media_box[2]&.to_i || image[:width].to_i,
85
+ media_box[3]&.to_i || image[:height].to_i,
86
+ ]
87
+ else
88
+ [image[:width].to_i, image[:height].to_i]
89
+ end
90
+ end
91
+
92
+ # Transforms a PDF into a MagickImage. This is currently used for single-page PDFs.
93
+ # @param pdf_stream [StringIO] Input stream.
94
+ # @param image_quality [Integer] Quality to apply to the image.
95
+ # @return [MiniMagick::Image]
96
+ def self.pdf_to_magick_image(pdf_stream, image_quality)
97
+ compressed_image = MiniMagick::Image.read(pdf_stream.read)
98
+ compressed_image.format('jpg')
99
+ compressed_image.quality image_quality.to_s
100
+ compressed_image
101
+ end
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'image/image_compressor'
4
+ require_relative 'image/image_utils'
@@ -4,6 +4,7 @@ require 'stringio'
4
4
  require 'marcel'
5
5
 
6
6
  require_relative '../pdf'
7
+ require_relative '../image'
7
8
 
8
9
  module Mindee
9
10
  module Input
@@ -126,6 +127,41 @@ module Mindee
126
127
  pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
127
128
  pdf_processor.pages.size
128
129
  end
130
+
131
+ # Compresses the file, according to the provided info.
132
+ # @param [Integer] quality Quality of the output file.
133
+ # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
134
+ # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
135
+ # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
136
+ # This will attempt to re-render PDF text over the rasterized original. If disabled, PDFs with source text are left untouched.
137
+ # WARNING: this operation is strongly discouraged.
138
+ # @param [Boolean] disable_source_text If true, source text found in the PDF is not re-applied to the
139
+ # rasterized output. Needs force_source_text to work.
140
+ def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
141
+ buffer = if pdf?
142
+ Mindee::PDF::PDFCompressor.compress_pdf(
143
+ @io_stream,
144
+ quality: quality,
145
+ force_source_text_compression: force_source_text,
146
+ disable_source_text: disable_source_text
147
+ )
148
+ else
149
+ Mindee::Image::ImageCompressor.compress_image(
150
+ @io_stream,
151
+ quality: quality,
152
+ max_width: max_width,
153
+ max_height: max_height
154
+ )
155
+ end
156
+ @io_stream = buffer
157
+ @io_stream.rewind
158
+ end
159
+
160
+ # Checks whether the file has source text if it is a pdf. False otherwise
161
+ # @return [Boolean] True if the file is a PDF and has source text.
162
+ def source_text?
163
+ Mindee::PDF::PDFTools.source_text?(@io_stream)
164
+ end
129
165
  end
130
166
 
131
167
  # Load a document from a path.
@@ -31,6 +31,8 @@ module Mindee
31
31
 
32
32
  # rubocop:disable Metrics/CyclomaticComplexity
33
33
  # rubocop:disable Metrics/PerceivedComplexity
34
+
35
+ # String representation.
34
36
  # @return [String]
35
37
  def to_s
36
38
  return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
@@ -40,6 +42,7 @@ module Mindee
40
42
 
41
43
  ''
42
44
  end
45
+
43
46
  # rubocop:enable Metrics/CyclomaticComplexity
44
47
  # rubocop:enable Metrics/PerceivedComplexity
45
48
 
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+ PDFReader = PDF
5
+
6
+ module Mindee
7
+ module PDF
8
+ # PDF compressor module to handle PDF compression.
9
+ module PDFCompressor
10
+ # Compresses each page of a provided PDF stream. Skips if force_source_text_compression isn't set and source text is detected.
11
+ # @param quality [Integer] Compression quality (70-100 for most JPG images in the test dataset).
12
+ # @param force_source_text_compression [Boolean] If true, attempts to re-write detected text.
13
+ # @param disable_source_text [Boolean] If true, doesn't re-apply source text to the original PDF.
14
+ def self.compress_pdf(pdf_data, quality: 85, force_source_text_compression: false, disable_source_text: true)
15
+ if PDFTools.source_text?(pdf_data)
16
+ if force_source_text_compression
17
+ if disable_source_text
18
+ puts "\e[33m[WARNING] Re-writing PDF source-text is an EXPERIMENTAL feature.\e[0m"
19
+ else
20
+ puts "\e[33m[WARNING] Source-file contains text, but disable_source_text flag is ignored. " \
21
+ "Resulting file will not contain any embedded text.\e[0m"
22
+ end
23
+ else
24
+ puts "\e[33m[WARNING] Source-text detected in input PDF. Aborting operation.\e[0m"
25
+ return pdf_data
26
+ end
27
+ end
28
+
29
+ pdf_data.rewind
30
+ pdf = Origami::PDF.read(pdf_data)
31
+ pages = process_pdf_pages(pdf, quality)
32
+
33
+ output_pdf = create_output_pdf(pages, disable_source_text, pdf_data)
34
+
35
+ output_stream = StringIO.new
36
+ output_pdf.save(output_stream)
37
+ output_stream
38
+ end
39
+
40
+ # Processes all pages in the PDF.
41
+ # @param pdf [Origami::PDF] The Origami PDF object to process.
42
+ # @param quality [Integer] Compression quality.
43
+ # @return [Array<Origami::Page>] Processed pages.
44
+ def self.process_pdf_pages(pdf, quality)
45
+ pdf.pages.map.with_index do |page, index|
46
+ process_pdf_page(Mindee::PDF::PdfProcessor.get_page(pdf, index), index, quality, page[:MediaBox])
47
+ end
48
+ end
49
+
50
+ # Creates the output PDF with processed pages.
51
+ # @param pages [Array] Processed pages.
52
+ # @param disable_source_text [Boolean] Whether to disable source text.
53
+ # @param pdf_data [StringIO] Original PDF data.
54
+ # @return [Origami::PDF] Output PDF object.
55
+ def self.create_output_pdf(pages, disable_source_text, pdf_data)
56
+ output_pdf = Origami::PDF.new
57
+ # NOTE: Page order and XObject handling require adjustment due to origami adding the last page first.
58
+ pages.rotate!(1) if pages.count >= 2
59
+
60
+ inject_text(pdf_data, pages) unless disable_source_text
61
+
62
+ pages.each { |page| output_pdf.append_page(page) }
63
+
64
+ output_pdf
65
+ end
66
+
67
+ # Extracts text from a source text PDF, and injects it into a newly-created one.
68
+ # @param pdf_data [StringIO] Stream representation of the PDF.
69
+ # @param pages [Array<Origami::Page>] Array of pages containing the rasterized version of the initial pages.
70
+ def self.inject_text(pdf_data, pages)
71
+ reader = PDFReader::Reader.new(pdf_data)
72
+
73
+ reader.pages.each_with_index do |original_page, index|
74
+ break if index >= pages.length
75
+
76
+ receiver = PDFReader::Reader::PageTextReceiver.new
77
+ original_page.walk(receiver)
78
+
79
+ receiver.runs.each do |text_run|
80
+ x = text_run.origin.x
81
+ y = text_run.origin.y
82
+ text = text_run.text
83
+ font_size = text_run.font_size
84
+
85
+ content_stream = Origami::Stream.new
86
+ content_stream.dictionary[:Filter] = :FlateDecode
87
+ content_stream.data = "BT\n/F1 #{font_size} Tf\n#{x} #{y} Td\n(#{text}) Tj\nET\n"
88
+
89
+ pages[index].Contents.data += content_stream.data
90
+ end
91
+ end
92
+ end
93
+
94
+ # Takes in a page stream, rasterizes it into a JPEG image, and applies the result onto a new Origami PDF page.
95
+ # @param page_stream [StringIO] Stream representation of a single page from the initial PDF.
96
+ # @param page_index [Integer] Index of the current page. Technically not needed, but left for debugging purposes.
97
+ # @param image_quality [Integer] Quality to apply to the rasterized page.
98
+ # @param media_box [Array<Integer>, nil] Extracted media box from the page. Can be nil.
99
+ # @return [Origami::Page]
100
+ def self.process_pdf_page(page_stream, page_index, image_quality, media_box)
101
+ new_page = Origami::Page.new
102
+ compressed_image = Mindee::Image::ImageUtils.pdf_to_magick_image(page_stream, image_quality)
103
+ width, height = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(compressed_image, media_box)
104
+
105
+ compressed_xobject = PDF::PDFTools.create_xobject(compressed_image)
106
+ PDF::PDFTools.set_xobject_properties(compressed_xobject, compressed_image)
107
+
108
+ xobject_name = "X#{page_index + 1}"
109
+ PDF::PDFTools.add_content_to_page(new_page, xobject_name, width, height)
110
+ new_page.add_xobject(compressed_xobject, xobject_name)
111
+
112
+ PDF::PDFTools.set_page_dimensions(new_page, width, height)
113
+ new_page
114
+ end
115
+ end
116
+ end
117
+ end
@@ -18,6 +18,7 @@ module Mindee
18
18
 
19
19
  # @param io_stream [StringIO]
20
20
  # @param options [Hash]
21
+ # @return [StringIO]
21
22
  def self.parse(io_stream, options)
22
23
  options = DEFAULT_OPTIONS.merge(options)
23
24
 
@@ -74,6 +75,22 @@ module Mindee
74
75
  io_stream.seek(0)
75
76
  pdf_parser.parse(io_stream)
76
77
  end
78
+
79
+ # Retrieves a PDF document's page.
80
+ #
81
+ # @param [Origami::PDF] pdf_doc Origami PDF handle.
82
+ # @param [Integer] page_id Page ID.
83
+ # @return [StringIO]
84
+ def self.get_page(pdf_doc, page_id)
85
+ stream = StringIO.new
86
+ pdf_doc.save(stream)
87
+
88
+ options = {
89
+ page_indexes: [page_id - 1],
90
+ }
91
+
92
+ parse(stream, options)
93
+ end
77
94
  end
78
95
  end
79
96
  end
@@ -29,6 +29,106 @@ module Mindee
29
29
  io_stream.set_encoding Encoding::BINARY
30
30
  io_stream
31
31
  end
32
+
33
+ # Checks a PDF's stream content for text operators.
34
+ # See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
35
+ # @param [Origami::Stream] stream Stream object from a PDF's page.
36
+ # @return [Boolean] True if a text operator is found in the stream.
37
+ def self.stream_has_text?(stream)
38
+ data = stream.data
39
+ return false if data.nil? || data.empty?
40
+
41
+ text_operators = ['Tc', 'Tw', 'Th', 'TL', 'Tf', 'Tk', 'Tr', 'Tm', 'T*', 'Tj', 'TJ', "'", '"']
42
+ text_operators.any? { |op| data.include?(op) }
43
+ end
44
+
45
+ # Checks whether the file has source text. Returns false if the file isn't a PDF.
46
+ # @param [StringIO] pdf_data
47
+ # @return [Boolean] True if the pdf has source text, false otherwise.
48
+ def self.source_text?(pdf_data)
49
+ begin
50
+ pdf_data.rewind
51
+ pdf = Origami::PDF.read(pdf_data)
52
+
53
+ pdf.each_page do |page|
54
+ next unless page[:Contents]
55
+
56
+ contents = page[:Contents].solve
57
+ contents = [contents] unless contents.is_a?(Origami::Array)
58
+
59
+ contents.each do |stream_ref|
60
+ stream = stream_ref.solve
61
+ return true if stream_has_text?(stream)
62
+ end
63
+ end
64
+
65
+ false
66
+ end
67
+
68
+ false
69
+ rescue Origami::InvalidPDFError
70
+ false
71
+ end
72
+
73
+ def self.create_xobject(image)
74
+ image_io = Mindee::Image::ImageUtils.image_to_stringio(image)
75
+ Origami::Graphics::ImageXObject.from_image_file(image_io, 'jpg')
76
+ end
77
+
78
+ def self.set_xobject_properties(xobject, image)
79
+ xobject.dictionary[:BitsPerComponent] = 8
80
+ xobject.dictionary[:Filter] = determine_filter(image)
81
+ xobject.dictionary[:Width] = image[:width]
82
+ xobject.dictionary[:Height] = image[:height]
83
+ xobject.dictionary[:ColorSpace] = determine_colorspace(image)
84
+ end
85
+
86
+ def self.determine_filter(image)
87
+ filter = image.data['properties']['filter']
88
+ case filter
89
+ when %r{Zip}i then :FlateDecode
90
+ when %r{LZW}i then :LZWDecode
91
+ else :DCTDecode
92
+ end
93
+ end
94
+
95
+ def self.determine_colorspace(image)
96
+ colorspace = image.data['colorspace']
97
+ case colorspace
98
+ when 'CMYK' then :DeviceCMYK
99
+ when 'Gray', 'PseudoClass Gray' then :DeviceGray
100
+ else :DeviceRGB
101
+ end
102
+ end
103
+
104
+ def self.add_content_to_page(page, xobject_name, width, height)
105
+ content = "q\n#{width} 0 0 #{height} 0 0 cm\n/#{xobject_name} Do\nQ\n"
106
+ content_stream = Origami::Stream.new(content)
107
+ page.Contents = content_stream
108
+ end
109
+
110
+ def self.set_page_dimensions(page, width, height)
111
+ page[:MediaBox] = [0, 0, width, height]
112
+ page[:CropBox] = [0, 0, width, height]
113
+ end
114
+
115
+ def self.process_image_xobject(image_data, image_quality, width, height)
116
+ compressed_data = Image::ImageCompressor.compress_image(
117
+ image_data,
118
+ quality: image_quality,
119
+ max_width: width,
120
+ max_height: height
121
+ )
122
+
123
+ new_image = Origami::Graphics::ImageXObject.new
124
+ new_image.data = compressed_data
125
+ new_image.Width = width
126
+ new_image.Height = height
127
+ new_image.ColorSpace = :DeviceRGB
128
+ new_image.BitsPerComponent = 8
129
+
130
+ new_image
131
+ end
32
132
  end
33
133
  end
34
134
  end
data/lib/mindee/pdf.rb CHANGED
@@ -1,3 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'pdf/pdf_processing'
3
+ require_relative 'pdf/pdf_compressor'
4
+ require_relative 'pdf/pdf_processor'
5
+ require_relative 'pdf/pdf_tools'
@@ -3,7 +3,7 @@
3
3
  # Mindee
4
4
  module Mindee
5
5
  # Current version.
6
- VERSION = '3.14.0'
6
+ VERSION = '3.15.0'
7
7
 
8
8
  # Finds and return the current platform.
9
9
  # @return [String]
data/lib/mindee.rb CHANGED
@@ -19,6 +19,16 @@ module Mindee
19
19
  end
20
20
  end
21
21
 
22
+ module Image
23
+ # Miscellaneous image operations.
24
+ module ImageUtils
25
+ end
26
+
27
+ # Image compressor module to handle image compression.
28
+ module ImageCompressor
29
+ end
30
+ end
31
+
22
32
  # Custom extraction module
23
33
  module Extraction
24
34
  end
data/mindee.gemspec CHANGED
@@ -30,8 +30,9 @@ Gem::Specification.new do |spec|
30
30
  spec.required_ruby_version = Gem::Requirement.new('>= 2.6')
31
31
 
32
32
  spec.add_runtime_dependency 'marcel', '~> 1.0.2'
33
- spec.add_runtime_dependency 'mini_magick', '~> 4.13.0'
33
+ spec.add_runtime_dependency 'mini_magick', '>=4', '< 6'
34
34
  spec.add_runtime_dependency 'origamindee', '~> 3.1.0'
35
+ spec.add_runtime_dependency 'pdf-reader', '~> 2.12.0'
35
36
 
36
37
  spec.add_development_dependency 'rake', '~> 12.3.3'
37
38
  spec.add_development_dependency 'rspec', '~> 3.12.0'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mindee
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.14.0
4
+ version: 3.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mindee, SA
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-11 00:00:00.000000000 Z
11
+ date: 2024-10-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: marcel
@@ -28,16 +28,22 @@ dependencies:
28
28
  name: mini_magick
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '4'
34
+ - - "<"
32
35
  - !ruby/object:Gem::Version
33
- version: 4.13.0
36
+ version: '6'
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
- - - "~>"
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '4'
44
+ - - "<"
39
45
  - !ruby/object:Gem::Version
40
- version: 4.13.0
46
+ version: '6'
41
47
  - !ruby/object:Gem::Dependency
42
48
  name: origamindee
43
49
  requirement: !ruby/object:Gem::Requirement
@@ -52,6 +58,20 @@ dependencies:
52
58
  - - "~>"
53
59
  - !ruby/object:Gem::Version
54
60
  version: 3.1.0
61
+ - !ruby/object:Gem::Dependency
62
+ name: pdf-reader
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: 2.12.0
68
+ type: :runtime
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: 2.12.0
55
75
  - !ruby/object:Gem::Dependency
56
76
  name: rake
57
77
  requirement: !ruby/object:Gem::Requirement
@@ -224,6 +244,9 @@ files:
224
244
  - lib/mindee/http/endpoint.rb
225
245
  - lib/mindee/http/error.rb
226
246
  - lib/mindee/http/response_validation.rb
247
+ - lib/mindee/image.rb
248
+ - lib/mindee/image/image_compressor.rb
249
+ - lib/mindee/image/image_utils.rb
227
250
  - lib/mindee/input.rb
228
251
  - lib/mindee/input/local_response.rb
229
252
  - lib/mindee/input/sources.rb
@@ -262,7 +285,8 @@ files:
262
285
  - lib/mindee/parsing/standard/string_field.rb
263
286
  - lib/mindee/parsing/standard/tax_field.rb
264
287
  - lib/mindee/pdf.rb
265
- - lib/mindee/pdf/pdf_processing.rb
288
+ - lib/mindee/pdf/pdf_compressor.rb
289
+ - lib/mindee/pdf/pdf_processor.rb
266
290
  - lib/mindee/pdf/pdf_tools.rb
267
291
  - lib/mindee/product.rb
268
292
  - lib/mindee/product/.rubocop.yml