mindee 3.14.0 → 3.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/lib/mindee/client.rb +1 -0
- data/lib/mindee/extraction/common/extracted_image.rb +0 -1
- data/lib/mindee/extraction/common/image_extractor.rb +7 -22
- data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +2 -0
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +1 -0
- data/lib/mindee/geometry/point.rb +2 -1
- data/lib/mindee/image/image_compressor.rb +29 -0
- data/lib/mindee/image/image_utils.rb +104 -0
- data/lib/mindee/image.rb +4 -0
- data/lib/mindee/input/sources.rb +36 -0
- data/lib/mindee/parsing/standard/position_field.rb +3 -0
- data/lib/mindee/pdf/pdf_compressor.rb +117 -0
- data/lib/mindee/pdf/{pdf_processing.rb → pdf_processor.rb} +17 -0
- data/lib/mindee/pdf/pdf_tools.rb +100 -0
- data/lib/mindee/pdf.rb +3 -1
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +10 -0
- data/mindee.gemspec +2 -1
- metadata +31 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d356c6733d8a7d00973b219dbae06199040ca8d4bece4eb3906c8ec873aebf0
|
4
|
+
data.tar.gz: ab240a95c8538891aa4a3ef48285903daa06cebaf13f9578eff1a9675258d3bb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec71145b9604ba30c77a842a33c89c1ad4ab4c70301c3eed2292bc95803dd112ee99c964289ca88c16ddffcb6a37f63130b30a23326b4359929791d0dcef4214
|
7
|
+
data.tar.gz: 0c20c191f6abe4166075a1745860ba500a488294bcb59e2a28e0b61a3bcee07a25be2adfef113d045727eeaf10f935278271a7d259f78d63e580bf8eda3833f3
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
# Mindee Ruby API Library Changelog
|
2
2
|
|
3
|
+
## v3.15.0 - 2024-10-29
|
4
|
+
### Changes
|
5
|
+
* :sparkles: add support for image compression
|
6
|
+
* :sparkles: add support for PDF compression
|
7
|
+
### Fixes
|
8
|
+
* :recycle: refactor pdf & image namespaces
|
9
|
+
* :memo: fix rubocop directives unexpectedly appearing in Yard documentation
|
10
|
+
* :arrow_up: bump version for mini_magick
|
11
|
+
|
12
|
+
|
3
13
|
## v3.14.0 - 2024-10-11
|
4
14
|
### Changes
|
5
15
|
* :sparkles: add support for Financial Document v1.10
|
data/lib/mindee/client.rb
CHANGED
@@ -128,6 +128,7 @@ module Mindee
|
|
128
128
|
end
|
129
129
|
|
130
130
|
# rubocop:disable Metrics/ParameterLists
|
131
|
+
|
131
132
|
# Enqueue a document for async parsing and automatically try to retrieve it
|
132
133
|
#
|
133
134
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
@@ -54,7 +54,6 @@ module Mindee
|
|
54
54
|
image = MiniMagick::Image.read(@buffer)
|
55
55
|
image.format file_format.downcase
|
56
56
|
image.write resolved_path.to_s
|
57
|
-
logger.info("File saved successfully to '#{resolved_path}'.")
|
58
57
|
rescue TypeError
|
59
58
|
raise 'Invalid path/filename provided.'
|
60
59
|
rescue StandardError
|
@@ -11,8 +11,8 @@ module Mindee
|
|
11
11
|
# Image Extraction Module.
|
12
12
|
module Extraction
|
13
13
|
# Image Extraction wrapper class.
|
14
|
-
|
15
|
-
def self.attach_image_as_new_file(input_buffer)
|
14
|
+
module ImageExtractor
|
15
|
+
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
|
16
16
|
# Attaches an image as a new page in a PdfDocument object.
|
17
17
|
#
|
18
18
|
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
|
@@ -21,9 +21,9 @@ module Mindee
|
|
21
21
|
magick_image = MiniMagick::Image.read(input_buffer)
|
22
22
|
# NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
|
23
23
|
# converted.
|
24
|
-
magick_image.format(
|
24
|
+
magick_image.format(format)
|
25
25
|
original_density = magick_image.resolution
|
26
|
-
scale_factor = original_density[0].to_f / 4.166666 # No clue why
|
26
|
+
scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
|
27
27
|
# the pdf otherwise the resulting image shrinks.
|
28
28
|
magick_image.format('pdf', 0, { density: scale_factor.to_s })
|
29
29
|
Origami::PDF.read(StringIO.new(magick_image.to_blob))
|
@@ -37,27 +37,12 @@ module Mindee
|
|
37
37
|
# to extract.
|
38
38
|
# @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
|
39
39
|
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
|
40
|
-
new_stream =
|
40
|
+
new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
|
41
41
|
new_stream.seek(0)
|
42
42
|
|
43
43
|
extract_images_from_polygons(input_source, new_stream, page_id, polygons)
|
44
44
|
end
|
45
45
|
|
46
|
-
# Retrieves a PDF document's page.
|
47
|
-
#
|
48
|
-
# @param [Origami::PDF] pdf_doc Origami PDF handle.
|
49
|
-
# @param [Integer] page_id Page ID.
|
50
|
-
def self.get_page(pdf_doc, page_id)
|
51
|
-
stream = StringIO.new
|
52
|
-
pdf_doc.save(stream)
|
53
|
-
|
54
|
-
options = {
|
55
|
-
page_indexes: [page_id - 1],
|
56
|
-
}
|
57
|
-
|
58
|
-
Mindee::PDF::PdfProcessor.parse(stream, options)
|
59
|
-
end
|
60
|
-
|
61
46
|
# Extracts images from their positions on a file (as polygons).
|
62
47
|
#
|
63
48
|
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
|
@@ -179,10 +164,10 @@ module Mindee
|
|
179
164
|
# @param input_file [LocalInputSource] Local input.
|
180
165
|
# @param [Integer] page_id Page ID.
|
181
166
|
# @return [MiniMagick::Image] A valid PdfDocument handle.
|
182
|
-
def self.
|
167
|
+
def self.load_input_source_pdf_page_as_image(input_file, page_id)
|
183
168
|
input_file.io_stream.rewind
|
184
169
|
if input_file.pdf?
|
185
|
-
get_page(Origami::PDF.read(input_file.io_stream), page_id)
|
170
|
+
Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
|
186
171
|
else
|
187
172
|
input_file.io_stream
|
188
173
|
end
|
@@ -65,6 +65,7 @@ module Mindee
|
|
65
65
|
|
66
66
|
# rubocop:disable Metrics/CyclomaticComplexity
|
67
67
|
# rubocop:disable Metrics/PerceivedComplexity
|
68
|
+
|
68
69
|
# Extracts invoices as complete PDFs from the document.
|
69
70
|
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
|
70
71
|
# @param strict [Boolean]
|
@@ -99,6 +100,7 @@ module Mindee
|
|
99
100
|
end
|
100
101
|
extract_sub_documents(correct_page_indexes)
|
101
102
|
end
|
103
|
+
|
102
104
|
# rubocop:enable Metrics/CyclomaticComplexity
|
103
105
|
# rubocop:enable Metrics/PerceivedComplexity
|
104
106
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image processing module.
|
5
|
+
module Image
|
6
|
+
# Image compressor module to handle image compression.
|
7
|
+
module ImageCompressor
|
8
|
+
# Resize and/or compress an image. This assumes the ratio was provided beforehand.
|
9
|
+
# @param image [MiniMagick::Image, StringIO] Input image.
|
10
|
+
# @param quality [Integer, nil] Quality of the final file.
|
11
|
+
# @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
|
12
|
+
# @param max_height [Integer, nil] Maximum height. If not specified, the vertical ratio will remain the same.
|
13
|
+
# @return [StringIO]
|
14
|
+
def self.compress_image(image, quality: 85, max_width: nil, max_height: nil)
|
15
|
+
processed_image = ImageUtils.to_image(image)
|
16
|
+
processed_image.format 'jpg'
|
17
|
+
final_width, final_height = ImageUtils.calculate_new_dimensions(
|
18
|
+
processed_image,
|
19
|
+
max_width: max_width,
|
20
|
+
max_height: max_height
|
21
|
+
)
|
22
|
+
ImageUtils.resize_image(processed_image, final_width, final_height) if final_width || final_height
|
23
|
+
ImageUtils.compress_image_quality(processed_image, quality)
|
24
|
+
|
25
|
+
ImageUtils.image_to_stringio(processed_image)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image processing module.
|
5
|
+
module Image
|
6
|
+
# Miscellaneous image operations.
|
7
|
+
module ImageUtils
|
8
|
+
# Resizes a provided MiniMagick Image with the given width & height, if present.
|
9
|
+
# @param image [MiniMagick::Image] MiniMagick image handle.
|
10
|
+
# @param width [Integer] Width to comply with.
|
11
|
+
# @param height [Integer] Height to comply with.
|
12
|
+
def self.resize_image(image, width, height)
|
13
|
+
if width && height
|
14
|
+
image.resize "#{width}x#{height}"
|
15
|
+
elsif width
|
16
|
+
image.resize width.to_s
|
17
|
+
elsif height
|
18
|
+
image.resize "x#{height}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Compresses the quality of the provided MiniMagick image.
|
23
|
+
# @param image [MiniMagick::Image] MiniMagick image handle.
|
24
|
+
# @param quality [Integer] Quality to apply to the image. This is independent of a JPG's base quality.
|
25
|
+
def self.compress_image_quality(image, quality)
|
26
|
+
image.quality quality.to_s
|
27
|
+
end
|
28
|
+
|
29
|
+
# Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
|
30
|
+
# @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
|
31
|
+
# @return [MiniMagick::Image]
|
32
|
+
def self.to_image(image)
|
33
|
+
if image.respond_to?(:read) && image.respond_to?(:rewind)
|
34
|
+
image.rewind
|
35
|
+
MiniMagick::Image.read(image)
|
36
|
+
elsif image.is_a?(MiniMagick::Image)
|
37
|
+
image
|
38
|
+
else
|
39
|
+
raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Converts a MiniMagick image into a StringIO containing the encoded image.
|
44
|
+
# @param image [MiniMagick::Image] the input image.
|
45
|
+
# @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
|
46
|
+
# @return [StringIO]
|
47
|
+
def self.image_to_stringio(image, format = 'JPEG')
|
48
|
+
image.format format
|
49
|
+
blob = image.to_blob
|
50
|
+
stringio = StringIO.new(blob)
|
51
|
+
stringio.rewind
|
52
|
+
|
53
|
+
stringio
|
54
|
+
end
|
55
|
+
|
56
|
+
# Computes the new dimensions for a given MiniMagick image, scaled down to fit within the
|
57
|
+
# provided bounds.
|
58
|
+
# @param [MiniMagick::Image] original Input MiniMagick image.
|
59
|
+
# @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same.
|
60
|
+
# @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same.
|
61
|
+
def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
|
62
|
+
raise 'Provided image could not be processed for resizing.' if original.nil?
|
63
|
+
|
64
|
+
return [original.width, original.height] if max_width.nil? && max_height.nil?
|
65
|
+
|
66
|
+
width_ratio = max_width ? max_width.to_f / original.width : Float::INFINITY
|
67
|
+
height_ratio = max_height ? max_height.to_f / original.height : Float::INFINITY
|
68
|
+
|
69
|
+
scale_factor = [width_ratio, height_ratio].min
|
70
|
+
|
71
|
+
new_width = (original.width * scale_factor).to_i
|
72
|
+
new_height = (original.height * scale_factor).to_i
|
73
|
+
|
74
|
+
[new_width, new_height]
|
75
|
+
end
|
76
|
+
|
77
|
+
# Computes the Height & Width from a page's media box. Falls back to the size of the initial image.
|
78
|
+
# @param image [MiniMagick::Image] The initial image that will fit into the page.
|
79
|
+
# @param media_box [Array<Integer>, nil]
|
80
|
+
# @return [Array<Integer>]
|
81
|
+
def self.calculate_dimensions_from_media_box(image, media_box)
|
82
|
+
if !media_box.nil? && media_box.any?
|
83
|
+
[
|
84
|
+
media_box[2]&.to_i || image[:width].to_i,
|
85
|
+
media_box[3]&.to_i || image[:height].to_i,
|
86
|
+
]
|
87
|
+
else
|
88
|
+
[image[:width].to_i, image[:height].to_i]
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Transforms a PDF into a MagickImage. This is currently used for single-page PDFs.
|
93
|
+
# @param pdf_stream [StringIO] Input stream.
|
94
|
+
# @param image_quality [Integer] Quality to apply to the image.
|
95
|
+
# @return [MiniMagick::Image]
|
96
|
+
def self.pdf_to_magick_image(pdf_stream, image_quality)
|
97
|
+
compressed_image = MiniMagick::Image.read(pdf_stream.read)
|
98
|
+
compressed_image.format('jpg')
|
99
|
+
compressed_image.quality image_quality.to_s
|
100
|
+
compressed_image
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
data/lib/mindee/image.rb
ADDED
data/lib/mindee/input/sources.rb
CHANGED
@@ -4,6 +4,7 @@ require 'stringio'
|
|
4
4
|
require 'marcel'
|
5
5
|
|
6
6
|
require_relative '../pdf'
|
7
|
+
require_relative '../image'
|
7
8
|
|
8
9
|
module Mindee
|
9
10
|
module Input
|
@@ -126,6 +127,41 @@ module Mindee
|
|
126
127
|
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
|
127
128
|
pdf_processor.pages.size
|
128
129
|
end
|
130
|
+
|
131
|
+
# Compresses the file, according to the provided info.
|
132
|
+
# @param [Integer] quality Quality of the output file.
|
133
|
+
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
|
134
|
+
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
|
135
|
+
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
|
136
|
+
# This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is skipped.
|
137
|
+
# WARNING: this operation is strongly discouraged.
|
138
|
+
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
|
139
|
+
# not. Needs force_source_text to work.
|
140
|
+
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
|
141
|
+
buffer = if pdf?
|
142
|
+
Mindee::PDF::PDFCompressor.compress_pdf(
|
143
|
+
@io_stream,
|
144
|
+
quality: quality,
|
145
|
+
force_source_text_compression: force_source_text,
|
146
|
+
disable_source_text: disable_source_text
|
147
|
+
)
|
148
|
+
else
|
149
|
+
Mindee::Image::ImageCompressor.compress_image(
|
150
|
+
@io_stream,
|
151
|
+
quality: quality,
|
152
|
+
max_width: max_width,
|
153
|
+
max_height: max_height
|
154
|
+
)
|
155
|
+
end
|
156
|
+
@io_stream = buffer
|
157
|
+
@io_stream.rewind
|
158
|
+
end
|
159
|
+
|
160
|
+
# Checks whether the file has source text if it is a pdf. False otherwise
|
161
|
+
# @return [Boolean] True if the file is a PDF and has source text.
|
162
|
+
def source_text?
|
163
|
+
Mindee::PDF::PDFTools.source_text?(@io_stream)
|
164
|
+
end
|
129
165
|
end
|
130
166
|
|
131
167
|
# Load a document from a path.
|
@@ -31,6 +31,8 @@ module Mindee
|
|
31
31
|
|
32
32
|
# rubocop:disable Metrics/CyclomaticComplexity
|
33
33
|
# rubocop:disable Metrics/PerceivedComplexity
|
34
|
+
|
35
|
+
# String representation.
|
34
36
|
# @return [String]
|
35
37
|
def to_s
|
36
38
|
return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
|
@@ -40,6 +42,7 @@ module Mindee
|
|
40
42
|
|
41
43
|
''
|
42
44
|
end
|
45
|
+
|
43
46
|
# rubocop:enable Metrics/CyclomaticComplexity
|
44
47
|
# rubocop:enable Metrics/PerceivedComplexity
|
45
48
|
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pdf-reader'
|
4
|
+
PDFReader = PDF
|
5
|
+
|
6
|
+
module Mindee
|
7
|
+
module PDF
|
8
|
+
# PDF compressor module to handle PDF compression.
|
9
|
+
module PDFCompressor
|
10
|
+
# Compresses each page of a provided PDF stream. Skips if force_source_text_compression isn't set and source text is detected.
|
11
|
+
# @param quality [Integer] Compression quality (70-100 for most JPG images in the test dataset).
|
12
|
+
# @param force_source_text_compression [Boolean] If true, attempts to re-write detected text.
|
13
|
+
# @param disable_source_text [Boolean] If true, doesn't re-apply source text to the original PDF.
|
14
|
+
def self.compress_pdf(pdf_data, quality: 85, force_source_text_compression: false, disable_source_text: true)
|
15
|
+
if PDFTools.source_text?(pdf_data)
|
16
|
+
if force_source_text_compression
|
17
|
+
if disable_source_text
|
18
|
+
puts "\e[33m[WARNING] Re-writing PDF source-text is an EXPERIMENTAL feature.\e[0m"
|
19
|
+
else
|
20
|
+
puts "\e[33m[WARNING] Source-file contains text, but disable_source_text flag is ignored. " \
|
21
|
+
"Resulting file will not contain any embedded text.\e[0m"
|
22
|
+
end
|
23
|
+
else
|
24
|
+
puts "\e[33m[WARNING] Source-text detected in input PDF. Aborting operation.\e[0m"
|
25
|
+
return pdf_data
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
pdf_data.rewind
|
30
|
+
pdf = Origami::PDF.read(pdf_data)
|
31
|
+
pages = process_pdf_pages(pdf, quality)
|
32
|
+
|
33
|
+
output_pdf = create_output_pdf(pages, disable_source_text, pdf_data)
|
34
|
+
|
35
|
+
output_stream = StringIO.new
|
36
|
+
output_pdf.save(output_stream)
|
37
|
+
output_stream
|
38
|
+
end
|
39
|
+
|
40
|
+
# Processes all pages in the PDF.
|
41
|
+
# @param pdf [Origami::PDF] The Origami PDF object to process.
|
42
|
+
# @param quality [Integer] Compression quality.
|
43
|
+
# @return [Array<Origami::Page>] Processed pages.
|
44
|
+
def self.process_pdf_pages(pdf, quality)
|
45
|
+
pdf.pages.map.with_index do |page, index|
|
46
|
+
process_pdf_page(Mindee::PDF::PdfProcessor.get_page(pdf, index), index, quality, page[:MediaBox])
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
# Creates the output PDF with processed pages.
|
51
|
+
# @param pages [Array] Processed pages.
|
52
|
+
# @param disable_source_text [Boolean] Whether to disable source text.
|
53
|
+
# @param pdf_data [StringIO] Original PDF data.
|
54
|
+
# @return [Origami::PDF] Output PDF object.
|
55
|
+
def self.create_output_pdf(pages, disable_source_text, pdf_data)
|
56
|
+
output_pdf = Origami::PDF.new
|
57
|
+
# NOTE: Page order and XObject handling require adjustment due to origami adding the last page first.
|
58
|
+
pages.rotate!(1) if pages.count >= 2
|
59
|
+
|
60
|
+
inject_text(pdf_data, pages) unless disable_source_text
|
61
|
+
|
62
|
+
pages.each { |page| output_pdf.append_page(page) }
|
63
|
+
|
64
|
+
output_pdf
|
65
|
+
end
|
66
|
+
|
67
|
+
# Extracts text from a source text PDF, and injects it into a newly-created one.
|
68
|
+
# @param pdf_data [StringIO] Stream representation of the PDF.
|
69
|
+
# @param pages [Array<Origami::Page>] Array of pages containing the rasterized version of the initial pages.
|
70
|
+
def self.inject_text(pdf_data, pages)
|
71
|
+
reader = PDFReader::Reader.new(pdf_data)
|
72
|
+
|
73
|
+
reader.pages.each_with_index do |original_page, index|
|
74
|
+
break if index >= pages.length
|
75
|
+
|
76
|
+
receiver = PDFReader::Reader::PageTextReceiver.new
|
77
|
+
original_page.walk(receiver)
|
78
|
+
|
79
|
+
receiver.runs.each do |text_run|
|
80
|
+
x = text_run.origin.x
|
81
|
+
y = text_run.origin.y
|
82
|
+
text = text_run.text
|
83
|
+
font_size = text_run.font_size
|
84
|
+
|
85
|
+
content_stream = Origami::Stream.new
|
86
|
+
content_stream.dictionary[:Filter] = :FlateDecode
|
87
|
+
content_stream.data = "BT\n/F1 #{font_size} Tf\n#{x} #{y} Td\n(#{text}) Tj\nET\n"
|
88
|
+
|
89
|
+
pages[index].Contents.data += content_stream.data
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# Takes in a page stream, rasterizes it into a JPEG image, and applies the result onto a new Origami PDF page.
|
95
|
+
# @param page_stream [StringIO] Stream representation of a single page from the initial PDF.
|
96
|
+
# @param page_index [Integer] Index of the current page. Technically not needed, but left for debugging purposes.
|
97
|
+
# @param image_quality [Integer] Quality to apply to the rasterized page.
|
98
|
+
# @param media_box [Array<Integer>, nil] Extracted media box from the page. Can be nil.
|
99
|
+
# @return [Origami::Page]
|
100
|
+
def self.process_pdf_page(page_stream, page_index, image_quality, media_box)
|
101
|
+
new_page = Origami::Page.new
|
102
|
+
compressed_image = Mindee::Image::ImageUtils.pdf_to_magick_image(page_stream, image_quality)
|
103
|
+
width, height = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(compressed_image, media_box)
|
104
|
+
|
105
|
+
compressed_xobject = PDF::PDFTools.create_xobject(compressed_image)
|
106
|
+
PDF::PDFTools.set_xobject_properties(compressed_xobject, compressed_image)
|
107
|
+
|
108
|
+
xobject_name = "X#{page_index + 1}"
|
109
|
+
PDF::PDFTools.add_content_to_page(new_page, xobject_name, width, height)
|
110
|
+
new_page.add_xobject(compressed_xobject, xobject_name)
|
111
|
+
|
112
|
+
PDF::PDFTools.set_page_dimensions(new_page, width, height)
|
113
|
+
new_page
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
@@ -18,6 +18,7 @@ module Mindee
|
|
18
18
|
|
19
19
|
# @param io_stream [StringIO]
|
20
20
|
# @param options [Hash]
|
21
|
+
# @return [StringIO]
|
21
22
|
def self.parse(io_stream, options)
|
22
23
|
options = DEFAULT_OPTIONS.merge(options)
|
23
24
|
|
@@ -74,6 +75,22 @@ module Mindee
|
|
74
75
|
io_stream.seek(0)
|
75
76
|
pdf_parser.parse(io_stream)
|
76
77
|
end
|
78
|
+
|
79
|
+
# Retrieves a PDF document's page.
|
80
|
+
#
|
81
|
+
# @param [Origami::PDF] pdf_doc Origami PDF handle.
|
82
|
+
# @param [Integer] page_id Page ID.
|
83
|
+
# @return [StringIO]
|
84
|
+
def self.get_page(pdf_doc, page_id)
|
85
|
+
stream = StringIO.new
|
86
|
+
pdf_doc.save(stream)
|
87
|
+
|
88
|
+
options = {
|
89
|
+
page_indexes: [page_id - 1],
|
90
|
+
}
|
91
|
+
|
92
|
+
parse(stream, options)
|
93
|
+
end
|
77
94
|
end
|
78
95
|
end
|
79
96
|
end
|
data/lib/mindee/pdf/pdf_tools.rb
CHANGED
@@ -29,6 +29,106 @@ module Mindee
|
|
29
29
|
io_stream.set_encoding Encoding::BINARY
|
30
30
|
io_stream
|
31
31
|
end
|
32
|
+
|
33
|
+
# Checks a PDF's stream content for text operators.
|
34
|
+
# See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
|
35
|
+
# @param [StringIO] stream Stream object from a PDF's page.
|
36
|
+
# @return [Boolean] True if a text operator is found in the stream.
|
37
|
+
def self.stream_has_text?(stream)
|
38
|
+
data = stream.data
|
39
|
+
return false if data.nil? || data.empty?
|
40
|
+
|
41
|
+
text_operators = ['Tc', 'Tw', 'Th', 'TL', 'Tf', 'Tk', 'Tr', 'Tm', 'T*', 'Tj', 'TJ', "'", '"']
|
42
|
+
text_operators.any? { |op| data.include?(op) }
|
43
|
+
end
|
44
|
+
|
45
|
+
# Checks whether the file has source_text. Sends false if the file isn't a PDF.
|
46
|
+
# @param [StringIO] pdf_data
|
47
|
+
# @return [Boolean] True if the pdf has source text, false otherwise.
|
48
|
+
def self.source_text?(pdf_data)
|
49
|
+
begin
|
50
|
+
pdf_data.rewind
|
51
|
+
pdf = Origami::PDF.read(pdf_data)
|
52
|
+
|
53
|
+
pdf.each_page do |page|
|
54
|
+
next unless page[:Contents]
|
55
|
+
|
56
|
+
contents = page[:Contents].solve
|
57
|
+
contents = [contents] unless contents.is_a?(Origami::Array)
|
58
|
+
|
59
|
+
contents.each do |stream_ref|
|
60
|
+
stream = stream_ref.solve
|
61
|
+
return true if stream_has_text?(stream)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
false
|
66
|
+
end
|
67
|
+
|
68
|
+
false
|
69
|
+
rescue Origami::InvalidPDFError
|
70
|
+
false
|
71
|
+
end
|
72
|
+
|
73
|
+
def self.create_xobject(image)
|
74
|
+
image_io = Mindee::Image::ImageUtils.image_to_stringio(image)
|
75
|
+
Origami::Graphics::ImageXObject.from_image_file(image_io, 'jpg')
|
76
|
+
end
|
77
|
+
|
78
|
+
def self.set_xobject_properties(xobject, image)
|
79
|
+
xobject.dictionary[:BitsPerComponent] = 8
|
80
|
+
xobject.dictionary[:Filter] = determine_filter(image)
|
81
|
+
xobject.dictionary[:Width] = image[:width]
|
82
|
+
xobject.dictionary[:Height] = image[:height]
|
83
|
+
xobject.dictionary[:ColorSpace] = determine_colorspace(image)
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.determine_filter(image)
|
87
|
+
filter = image.data['properties']['filter']
|
88
|
+
case filter
|
89
|
+
when %r{Zip}i then :FlateDecode
|
90
|
+
when %r{LZW}i then :LZWDecode
|
91
|
+
else :DCTDecode
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def self.determine_colorspace(image)
|
96
|
+
colorspace = image.data['colorspace']
|
97
|
+
case colorspace
|
98
|
+
when 'CMYK' then :DeviceCMYK
|
99
|
+
when 'Gray', 'PseudoClass Gray' then :DeviceGray
|
100
|
+
else :DeviceRGB
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def self.add_content_to_page(page, xobject_name, width, height)
|
105
|
+
content = "q\n#{width} 0 0 #{height} 0 0 cm\n/#{xobject_name} Do\nQ\n"
|
106
|
+
content_stream = Origami::Stream.new(content)
|
107
|
+
page.Contents = content_stream
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.set_page_dimensions(page, width, height)
|
111
|
+
page[:MediaBox] = [0, 0, width, height]
|
112
|
+
page[:CropBox] = [0, 0, width, height]
|
113
|
+
end
|
114
|
+
|
115
|
+
def self.process_image_xobject(image_data, image_quality, width, height)
|
116
|
+
compressed_data = Image::ImageCompressor.compress_image(
|
117
|
+
image_data,
|
118
|
+
quality: image_quality,
|
119
|
+
max_width: width,
|
120
|
+
max_height: height
|
121
|
+
)
|
122
|
+
|
123
|
+
new_image = Origami::Graphics::ImageXObject.new
|
124
|
+
new_image.data = compressed_data
|
125
|
+
new_image.Width = width
|
126
|
+
new_image.Height = height
|
127
|
+
new_image.ColorSpace = :DeviceRGB
|
128
|
+
new_image.BitsPerComponent = 8
|
129
|
+
|
130
|
+
new_image
|
131
|
+
end
|
32
132
|
end
|
33
133
|
end
|
34
134
|
end
|
data/lib/mindee/pdf.rb
CHANGED
data/lib/mindee/version.rb
CHANGED
data/lib/mindee.rb
CHANGED
@@ -19,6 +19,16 @@ module Mindee
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
module Image
|
23
|
+
# Miscellaneous image operations.
|
24
|
+
module ImageUtils
|
25
|
+
end
|
26
|
+
|
27
|
+
# Image compressor module to handle image compression.
|
28
|
+
module ImageCompressor
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
22
32
|
# Custom extraction module
|
23
33
|
module Extraction
|
24
34
|
end
|
data/mindee.gemspec
CHANGED
@@ -30,8 +30,9 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.required_ruby_version = Gem::Requirement.new('>= 2.6')
|
31
31
|
|
32
32
|
spec.add_runtime_dependency 'marcel', '~> 1.0.2'
|
33
|
-
spec.add_runtime_dependency 'mini_magick', '
|
33
|
+
spec.add_runtime_dependency 'mini_magick', '>=4', '< 6'
|
34
34
|
spec.add_runtime_dependency 'origamindee', '~> 3.1.0'
|
35
|
+
spec.add_runtime_dependency 'pdf-reader', '~> 2.12.0'
|
35
36
|
|
36
37
|
spec.add_development_dependency 'rake', '~> 12.3.3'
|
37
38
|
spec.add_development_dependency 'rspec', '~> 3.12.0'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mindee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.14.0
|
4
|
+
version: 3.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mindee, SA
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-11 00:00:00.000000000 Z
|
11
|
+
date: 2024-10-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: marcel
|
@@ -28,16 +28,22 @@ dependencies:
|
|
28
28
|
name: mini_magick
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '4'
|
34
|
+
- - "<"
|
32
35
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
36
|
+
version: '6'
|
34
37
|
type: :runtime
|
35
38
|
prerelease: false
|
36
39
|
version_requirements: !ruby/object:Gem::Requirement
|
37
40
|
requirements:
|
38
|
-
- - "
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '4'
|
44
|
+
- - "<"
|
39
45
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
46
|
+
version: '6'
|
41
47
|
- !ruby/object:Gem::Dependency
|
42
48
|
name: origamindee
|
43
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,6 +58,20 @@ dependencies:
|
|
52
58
|
- - "~>"
|
53
59
|
- !ruby/object:Gem::Version
|
54
60
|
version: 3.1.0
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: pdf-reader
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: 2.12.0
|
68
|
+
type: :runtime
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 2.12.0
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
76
|
name: rake
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -224,6 +244,9 @@ files:
|
|
224
244
|
- lib/mindee/http/endpoint.rb
|
225
245
|
- lib/mindee/http/error.rb
|
226
246
|
- lib/mindee/http/response_validation.rb
|
247
|
+
- lib/mindee/image.rb
|
248
|
+
- lib/mindee/image/image_compressor.rb
|
249
|
+
- lib/mindee/image/image_utils.rb
|
227
250
|
- lib/mindee/input.rb
|
228
251
|
- lib/mindee/input/local_response.rb
|
229
252
|
- lib/mindee/input/sources.rb
|
@@ -262,7 +285,8 @@ files:
|
|
262
285
|
- lib/mindee/parsing/standard/string_field.rb
|
263
286
|
- lib/mindee/parsing/standard/tax_field.rb
|
264
287
|
- lib/mindee/pdf.rb
|
265
|
-
- lib/mindee/pdf/pdf_processing.rb
|
288
|
+
- lib/mindee/pdf/pdf_compressor.rb
|
289
|
+
- lib/mindee/pdf/pdf_processor.rb
|
266
290
|
- lib/mindee/pdf/pdf_tools.rb
|
267
291
|
- lib/mindee/product.rb
|
268
292
|
- lib/mindee/product/.rubocop.yml
|