mindee 3.14.0 → 3.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/docs/business_card_v1.md +169 -0
- data/docs/code_samples/business_card_v1_async.txt +19 -0
- data/docs/code_samples/delivery_notes_v1_async.txt +19 -0
- data/docs/code_samples/expense_receipts_v5_async.txt +19 -0
- data/docs/code_samples/ind_passport_v1_async.txt +19 -0
- data/docs/delivery_notes_v1.md +143 -0
- data/docs/energy_bill_fra_v1.md +2 -2
- data/docs/expense_receipts_v5.md +27 -2
- data/docs/financial_document_v1.md +8 -4
- data/docs/ind_passport_v1.md +281 -0
- data/docs/invoices_v4.md +12 -8
- data/docs/resume_v1.md +17 -16
- data/lib/mindee/client.rb +9 -8
- data/lib/mindee/extraction/common/extracted_image.rb +0 -1
- data/lib/mindee/extraction/common/image_extractor.rb +7 -22
- data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +2 -0
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +1 -0
- data/lib/mindee/geometry/point.rb +2 -1
- data/lib/mindee/image/image_compressor.rb +29 -0
- data/lib/mindee/image/image_utils.rb +104 -0
- data/lib/mindee/image.rb +4 -0
- data/lib/mindee/input/sources.rb +36 -0
- data/lib/mindee/parsing/standard/position_field.rb +3 -0
- data/lib/mindee/pdf/pdf_compressor.rb +117 -0
- data/lib/mindee/pdf/{pdf_processing.rb → pdf_processor.rb} +17 -0
- data/lib/mindee/pdf/pdf_tools.rb +100 -0
- data/lib/mindee/pdf.rb +3 -1
- data/lib/mindee/product/business_card/business_card_v1.rb +39 -0
- data/lib/mindee/product/business_card/business_card_v1_document.rb +85 -0
- data/lib/mindee/product/business_card/business_card_v1_page.rb +32 -0
- data/lib/mindee/product/delivery_note/delivery_note_v1.rb +39 -0
- data/lib/mindee/product/delivery_note/delivery_note_v1_document.rb +61 -0
- data/lib/mindee/product/delivery_note/delivery_note_v1_page.rb +32 -0
- data/lib/mindee/product/financial_document/financial_document_v1_document.rb +1 -1
- data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
- data/lib/mindee/product/ind/indian_passport/indian_passport_v1.rb +41 -0
- data/lib/mindee/product/ind/indian_passport/indian_passport_v1_document.rb +143 -0
- data/lib/mindee/product/ind/indian_passport/indian_passport_v1_page.rb +34 -0
- data/lib/mindee/product/invoice/invoice_v4_document.rb +1 -1
- data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
- data/lib/mindee/product/resume/resume_v1_document.rb +3 -1
- data/lib/mindee/product/resume/resume_v1_page.rb +1 -1
- data/lib/mindee/product/resume/resume_v1_professional_experience.rb +8 -0
- data/lib/mindee/product.rb +10 -7
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +10 -0
- data/mindee.gemspec +2 -1
- metadata +47 -7
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image processing module.
|
5
|
+
module Image
|
6
|
+
# Miscellaneous image operations.
|
7
|
+
module ImageUtils
|
8
|
+
# Calls +resize+ on the given image with a geometry built from whichever bounds are provided.
# Nothing is called when both bounds are missing.
# @param image [MiniMagick::Image] MiniMagick image handle.
# @param width [Integer, nil] Width to comply with.
# @param height [Integer, nil] Height to comply with.
# @return [Object, nil] Whatever +image.resize+ returns, or nil when no bound was given.
def self.resize_image(image, width, height)
  geometry =
    if width && height
      "#{width}x#{height}"
    elsif width
      width.to_s
    elsif height
      "x#{height}"
    end

  image.resize(geometry) if geometry
end
|
21
|
+
|
22
|
+
# Sets the quality of the provided MiniMagick image.
# @param image [MiniMagick::Image] MiniMagick image handle.
# @param quality [Integer] Quality to apply to the image. This is independent of a JPG's base quality.
def self.compress_image_quality(image, quality)
  quality_setting = quality.to_s
  image.quality(quality_setting)
end
|
28
|
+
|
29
|
+
# Normalizes the input into a MiniMagick image.
# Objects responding to both +read+ and +rewind+ are rewound and loaded through MiniMagick;
# an existing MiniMagick image is returned untouched.
# Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
# @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
# @return [MiniMagick::Image]
# @raise [RuntimeError] When the input is neither stream-like nor a MiniMagick image.
def self.to_image(image)
  stream_like = image.respond_to?(:read) && image.respond_to?(:rewind)

  unless stream_like
    return image if image.is_a?(MiniMagick::Image)

    raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
  end

  image.rewind
  MiniMagick::Image.read(image)
end
|
42
|
+
|
43
|
+
# Converts a MiniMagick image into a StringIO holding its encoded bytes.
# The image's format is set to +format+ before the bytes are extracted.
# @param image [MiniMagick::Image] the input image.
# @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
# @return [StringIO] Stream positioned at its start.
def self.image_to_stringio(image, format = 'JPEG')
  image.format(format)
  StringIO.new(image.to_blob).tap(&:rewind)
end
|
55
|
+
|
56
|
+
# Computes the dimensions an image should be scaled to so that it fits within the provided bounds
# while keeping its aspect ratio. Only the dimensions are computed; the image itself is not modified.
# @param [MiniMagick::Image] original Input MiniMagick image (only +width+ and +height+ are read).
# @param max_width [Integer, nil] Maximum width. If not specified, only the height bound applies.
# @param max_height [Integer, nil] Maximum height. If not specified, only the width bound applies.
# @return [Array<Integer>] New width & height, each at least 1 when a bound is given.
# @raise [RuntimeError] When no image is provided.
def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
  raise 'Provided image could not be processed for resizing.' if original.nil?

  source_width = original.width
  source_height = original.height
  return [source_width, source_height] if max_width.nil? && max_height.nil?

  # A missing bound must never be the limiting one, hence the infinite ratio.
  width_ratio = max_width ? max_width.to_f / source_width : Float::INFINITY
  height_ratio = max_height ? max_height.to_f / source_height : Float::INFINITY
  scale_factor = [width_ratio, height_ratio].min

  # Truncation can yield 0 for very elongated images; a 0-pixel side is not a usable size.
  [source_width, source_height].map { |side| [(side * scale_factor).to_i, 1].max }
end
|
76
|
+
|
77
|
+
# Computes the width & height described by a page's media box. Falls back to the size of the
# initial image when the box is missing, empty, or lacks its upper-right corner.
#
# A media box is a rectangle +[lower_x, lower_y, upper_x, upper_y]+, so each extent is the distance
# between the two corners rather than the upper coordinate alone.
# @param image [MiniMagick::Image] The initial image that will fit into the page.
# @param media_box [Array<Integer>, nil]
# @return [Array<Integer>] Width & height.
def self.calculate_dimensions_from_media_box(image, media_box)
  return [image[:width].to_i, image[:height].to_i] if media_box.nil? || media_box.none?

  lower_x = media_box[0]
  lower_y = media_box[1]
  upper_x = media_box[2]
  upper_y = media_box[3]

  # Rounding before truncation absorbs float subtraction noise (e.g. 99.999999 instead of 100).
  width = upper_x.nil? ? image[:width].to_i : (upper_x.to_f - lower_x.to_f).abs.round(3).to_i
  height = upper_y.nil? ? image[:height].to_i : (upper_y.to_f - lower_y.to_f).abs.round(3).to_i

  [width, height]
end
|
91
|
+
|
92
|
+
# Loads the content of a PDF stream as a MiniMagick image, then sets its format to JPG and applies
# the requested quality. This is currently used for single-page PDFs.
# @param pdf_stream [StringIO] Input stream.
# @param image_quality [Integer] Quality to apply to the image.
# @return [MiniMagick::Image]
def self.pdf_to_magick_image(pdf_stream, image_quality)
  MiniMagick::Image.read(pdf_stream.read).tap do |rasterized|
    rasterized.format('jpg')
    rasterized.quality(image_quality.to_s)
  end
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
data/lib/mindee/image.rb
ADDED
data/lib/mindee/input/sources.rb
CHANGED
@@ -4,6 +4,7 @@ require 'stringio'
|
|
4
4
|
require 'marcel'
|
5
5
|
|
6
6
|
require_relative '../pdf'
|
7
|
+
require_relative '../image'
|
7
8
|
|
8
9
|
module Mindee
|
9
10
|
module Input
|
@@ -126,6 +127,41 @@ module Mindee
|
|
126
127
|
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
|
127
128
|
pdf_processor.pages.size
|
128
129
|
end
|
130
|
+
|
131
|
+
# Compresses the file in place: the internal stream is replaced by the compressed output and rewound.
# PDFs go through the PDF compressor, everything else through the image compressor.
# @param [Integer] quality Quality of the output file.
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
#   Forwarded to the PDF compressor as +force_source_text_compression+.
#   WARNING: this operation is strongly discouraged.
# @param [Boolean] disable_source_text If the PDF has source text, whether to leave it out of the
#   compressed file. Needs force_source_text to work.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  @io_stream =
    if pdf?
      Mindee::PDF::PDFCompressor.compress_pdf(
        @io_stream,
        quality: quality,
        force_source_text_compression: force_source_text,
        disable_source_text: disable_source_text
      )
    else
      Mindee::Image::ImageCompressor.compress_image(
        @io_stream,
        quality: quality,
        max_width: max_width,
        max_height: max_height
      )
    end

  @io_stream.rewind
end
|
159
|
+
|
160
|
+
# Tells whether the underlying stream holds source text, as reported by the PDF tools.
# @return [Boolean] True if the file is a PDF and has source text.
def source_text?
  stream = @io_stream
  Mindee::PDF::PDFTools.source_text?(stream)
end
|
129
165
|
end
|
130
166
|
|
131
167
|
# Load a document from a path.
|
@@ -31,6 +31,8 @@ module Mindee
|
|
31
31
|
|
32
32
|
# rubocop:disable Metrics/CyclomaticComplexity
|
33
33
|
# rubocop:disable Metrics/PerceivedComplexity
|
34
|
+
|
35
|
+
# String representation.
|
34
36
|
# @return [String]
|
35
37
|
def to_s
|
36
38
|
return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
|
@@ -40,6 +42,7 @@ module Mindee
|
|
40
42
|
|
41
43
|
''
|
42
44
|
end
|
45
|
+
|
43
46
|
# rubocop:enable Metrics/CyclomaticComplexity
|
44
47
|
# rubocop:enable Metrics/PerceivedComplexity
|
45
48
|
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pdf-reader'
|
4
|
+
PDFReader = PDF
|
5
|
+
|
6
|
+
module Mindee
|
7
|
+
module PDF
|
8
|
+
# Image compressor module to handle PDF compression.
|
9
|
+
module PDFCompressor
|
10
|
+
# Compresses each page of a provided PDF stream by rasterizing it.
# When source text is detected and compression is not forced, the input is returned untouched.
# @param pdf_data [StringIO] Stream holding the PDF to compress.
# @param quality [Integer] Compression quality (70-100 for most JPG images in the test dataset).
# @param force_source_text_compression [Boolean] If true, compresses even when source text is detected.
# @param disable_source_text [Boolean] If true, doesn't re-apply source text to the compressed PDF.
# @return [StringIO] Compressed PDF, or the original +pdf_data+ when the operation is skipped.
def self.compress_pdf(pdf_data, quality: 85, force_source_text_compression: false, disable_source_text: true)
  if PDFTools.source_text?(pdf_data)
    unless force_source_text_compression
      puts "\e[33m[WARNING] Source-text detected in input PDF. Aborting operation.\e[0m"
      return pdf_data
    end

    # Text is only re-injected when disable_source_text is false, so that is the experimental path;
    # otherwise the output is purely rasterized and the user is told the text will be lost.
    if disable_source_text
      puts "\e[33m[WARNING] Source-file contains text, but disable_source_text flag is ignored. " \
           "Resulting file will not contain any embedded text.\e[0m"
    else
      puts "\e[33m[WARNING] Re-writing PDF source-text is an EXPERIMENTAL feature.\e[0m"
    end
  end

  # The source-text check consumed the stream; start again from the beginning.
  pdf_data.rewind
  pdf = Origami::PDF.read(pdf_data)
  pages = process_pdf_pages(pdf, quality)

  output_pdf = create_output_pdf(pages, disable_source_text, pdf_data)

  output_stream = StringIO.new
  output_pdf.save(output_stream)
  output_stream
end
|
39
|
+
|
40
|
+
# Processes all pages in the PDF: each page is fetched as its own stream, then rasterized.
# NOTE(review): the media box comes from the page at +index+ while the stream comes from
# +get_page(pdf, index)+ — confirm both refer to the same page.
# @param pdf [Origami::PDF] The Origami PDF object to process.
# @param quality [Integer] Compression quality.
# @return [Array<Origami::Page>] Processed pages.
def self.process_pdf_pages(pdf, quality)
  pdf.pages.each_with_index.map do |page, index|
    page_stream = Mindee::PDF::PdfProcessor.get_page(pdf, index)
    process_pdf_page(page_stream, index, quality, page[:MediaBox])
  end
end
|
49
|
+
|
50
|
+
# Creates the output PDF from the processed pages.
# The +pages+ array is rotated in place before the pages are appended.
# @param pages [Array] Processed pages.
# @param disable_source_text [Boolean] Whether to disable source text; text is injected only when false.
# @param pdf_data [StringIO] Original PDF data.
# @return [Origami::PDF] Output PDF object.
def self.create_output_pdf(pages, disable_source_text, pdf_data)
  Origami::PDF.new.tap do |document|
    # NOTE: Page order and XObject handling require adjustment due to origami adding the last page first.
    pages.rotate! if pages.count > 1

    inject_text(pdf_data, pages) unless disable_source_text

    pages.each { |page| document.append_page(page) }
  end
end
|
66
|
+
|
67
|
+
# Extracts text from a source text PDF, and appends matching text-drawing operators to the content
# stream of each rasterized page.
# NOTE(review): the operators reference font resource /F1 — confirm the target pages declare it.
# @param pdf_data [StringIO] Stream representation of the PDF.
# @param pages [Array<Origami::Page>] Array of pages containing the rasterized version of the initial pages.
def self.inject_text(pdf_data, pages)
  reader = PDFReader::Reader.new(pdf_data)

  reader.pages.each_with_index do |original_page, index|
    break if index >= pages.length

    receiver = PDFReader::Reader::PageTextReceiver.new
    original_page.walk(receiver)

    text_operators = receiver.runs.map do |text_run|
      # Parentheses and backslashes delimit/escape PDF literal strings and must be escaped,
      # otherwise a run such as "f(x)" would end the string early and corrupt the stream.
      escaped_text = text_run.text.to_s.gsub(/[\\()]/) { |char| "\\#{char}" }
      origin = text_run.origin

      "BT\n/F1 #{text_run.font_size} Tf\n#{origin.x} #{origin.y} Td\n(#{escaped_text}) Tj\nET\n"
    end
    next if text_operators.empty?

    # Single append per page instead of re-assigning the stream data once per text run.
    pages[index].Contents.data += text_operators.join
  end
end
|
93
|
+
|
94
|
+
# Takes in a page stream, rasterizes it into an image, and places the result onto a new Origami PDF page
# sized after the media box (or after the image when no media box is usable).
# @param page_stream [StringIO] Stream representation of a single page from the initial PDF.
# @param page_index [Integer] Index of the current page; used to name the page's image XObject.
# @param image_quality [Integer] Quality to apply to the rasterized page.
# @param media_box [Array<Integer>, nil] Extracted media box from the page. Can be nil.
# @return [Origami::Page]
def self.process_pdf_page(page_stream, page_index, image_quality, media_box)
  image_utils = Mindee::Image::ImageUtils
  rasterized = image_utils.pdf_to_magick_image(page_stream, image_quality)
  page_width, page_height = image_utils.calculate_dimensions_from_media_box(rasterized, media_box)

  image_xobject = PDF::PDFTools.create_xobject(rasterized)
  PDF::PDFTools.set_xobject_properties(image_xobject, rasterized)

  resource_name = "X#{page_index + 1}"

  Origami::Page.new.tap do |new_page|
    PDF::PDFTools.add_content_to_page(new_page, resource_name, page_width, page_height)
    new_page.add_xobject(image_xobject, resource_name)
    PDF::PDFTools.set_page_dimensions(new_page, page_width, page_height)
  end
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
@@ -18,6 +18,7 @@ module Mindee
|
|
18
18
|
|
19
19
|
# @param io_stream [StreamIO]
|
20
20
|
# @param options [Hash]
|
21
|
+
# @return [StringIO]
|
21
22
|
def self.parse(io_stream, options)
|
22
23
|
options = DEFAULT_OPTIONS.merge(options)
|
23
24
|
|
@@ -74,6 +75,22 @@ module Mindee
|
|
74
75
|
io_stream.seek(0)
|
75
76
|
pdf_parser.parse(io_stream)
|
76
77
|
end
|
78
|
+
|
79
|
+
# Retrieves a PDF document's page by saving the document to a stream and running it through +parse+.
#
# NOTE(review): +page_id - 1+ is what gets forwarded as the page index, so a +page_id+ of 0 requests
# index -1 — confirm how +parse+ handles that and whether callers pass 0- or 1-based ids.
# @param [Origami::PDF] pdf_doc Origami PDF handle.
# @param [Integer] page_id Page ID.
# @return [StringIO]
def self.get_page(pdf_doc, page_id)
  serialized = StringIO.new
  pdf_doc.save(serialized)

  parse(serialized, { page_indexes: [page_id - 1] })
end
|
77
94
|
end
|
78
95
|
end
|
79
96
|
end
|
data/lib/mindee/pdf/pdf_tools.rb
CHANGED
@@ -29,6 +29,106 @@ module Mindee
|
|
29
29
|
io_stream.set_encoding Encoding::BINARY
|
30
30
|
io_stream
|
31
31
|
end
|
32
|
+
|
33
|
+
# Checks a PDF stream's content for text operators.
# See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
# NOTE(review): this is a plain substring search, so any stream merely containing one of these
# character sequences (including a lone quote character) is reported as having text — confirm
# that this over-detection is acceptable.
# @param [StringIO] stream Stream object from a PDFs page.
# @return [Boolean] True if a text operator is found in the stream.
def self.stream_has_text?(stream)
  content = stream.data
  return false unless content && !content.empty?

  %w[Tc Tw Th TL Tf Tk Tr Tm T* Tj TJ ' "].any? { |operator| content.include?(operator) }
end
|
44
|
+
|
45
|
+
# Checks whether the file has source text, i.e. whether any page content stream contains a text
# operator. Returns false when Origami rejects the data as an invalid PDF.
# @param [StringIO] pdf_data
# @return [Boolean] True if the pdf has source text, false otherwise.
def self.source_text?(pdf_data)
  pdf_data.rewind
  document = Origami::PDF.read(pdf_data)

  document.each_page do |page|
    next unless page[:Contents]

    # A page's contents may be a single stream or an array of streams; treat both uniformly.
    contents = page[:Contents].solve
    contents = [contents] unless contents.is_a?(Origami::Array)

    return true if contents.any? { |stream_ref| stream_has_text?(stream_ref.solve) }
  end

  false
rescue Origami::InvalidPDFError
  false
end
|
72
|
+
|
73
|
+
# Builds an Origami image XObject from a MiniMagick image.
# The image is first converted to a stream (JPEG by default), then loaded by Origami as a 'jpg' file.
# @param image [MiniMagick::Image] Image to embed.
# @return [Origami::Graphics::ImageXObject]
def self.create_xobject(image)
  jpeg_stream = Mindee::Image::ImageUtils.image_to_stringio(image)
  Origami::Graphics::ImageXObject.from_image_file(jpeg_stream, 'jpg')
end
|
77
|
+
|
78
|
+
# Fills in the dictionary entries of an image XObject from the image it was built from:
# bit depth (always 8), filter, pixel dimensions and colorspace.
# @param xobject [Origami::Graphics::ImageXObject] XObject whose dictionary is updated.
# @param image [MiniMagick::Image] Image the XObject was created from.
def self.set_xobject_properties(xobject, image)
  filter = determine_filter(image)
  pixel_width = image[:width]
  pixel_height = image[:height]

  xobject.dictionary[:BitsPerComponent] = 8
  xobject.dictionary[:Filter] = filter
  xobject.dictionary[:Width] = pixel_width
  xobject.dictionary[:Height] = pixel_height
  xobject.dictionary[:ColorSpace] = determine_colorspace(image)
end
|
85
|
+
|
86
|
+
# Picks the PDF stream filter matching the compression reported in the image's metadata.
# Falls back to :DCTDecode when the metadata has no recognizable filter entry.
# @param image [MiniMagick::Image] Image whose +data+ hash is inspected.
# @return [Symbol] :FlateDecode, :LZWDecode or :DCTDecode.
def self.determine_filter(image)
  # `dig` tolerates a missing 'properties' entry instead of raising NoMethodError on nil.
  filter = image.data.dig('properties', 'filter')

  case filter
  when %r{Zip}i then :FlateDecode
  when %r{LZW}i then :LZWDecode
  else :DCTDecode
  end
end
|
94
|
+
|
95
|
+
# Maps the colorspace reported in the image's metadata to a PDF device colorspace.
# Anything that is not CMYK or gray is treated as RGB.
# @param image [MiniMagick::Image] Image whose +data+ hash is inspected.
# @return [Symbol] :DeviceCMYK, :DeviceGray or :DeviceRGB.
def self.determine_colorspace(image)
  device_colorspaces = {
    'CMYK' => :DeviceCMYK,
    'Gray' => :DeviceGray,
    'PseudoClass Gray' => :DeviceGray,
  }

  device_colorspaces.fetch(image.data['colorspace'], :DeviceRGB)
end
|
103
|
+
|
104
|
+
# Replaces the page's content stream with one that draws the named XObject scaled to
# +width+ x +height+, anchored at the origin.
# @param page [Origami::Page] Page receiving the content stream.
# @param xobject_name [String] Resource name of the XObject to draw.
# @param width [Integer] Width to scale the XObject to.
# @param height [Integer] Height to scale the XObject to.
def self.add_content_to_page(page, xobject_name, width, height)
  drawing_operators = "q\n#{width} 0 0 #{height} 0 0 cm\n/#{xobject_name} Do\nQ\n"
  page.Contents = Origami::Stream.new(drawing_operators)
end
|
109
|
+
|
110
|
+
# Sets both the media box and the crop box of the page to a rectangle of the given size
# anchored at the origin.
# @param page [Origami::Page] Page to resize.
# @param width [Integer] Page width.
# @param height [Integer] Page height.
def self.set_page_dimensions(page, width, height)
  origin = [0, 0]
  page[:MediaBox] = origin + [width, height]
  page[:CropBox] = origin + [width, height]
end
|
114
|
+
|
115
|
+
# Compresses raw image data and wraps the result in a new RGB, 8 bits-per-component image XObject.
# NOTE(review): Width/Height are set to the requested bounds, not to the size of the compressed
# image — confirm the compressor always returns exactly +width+ x +height+.
# @param image_data [StringIO] Image data handed to the image compressor.
# @param image_quality [Integer] Quality forwarded to the image compressor.
# @param width [Integer] Maximum width, also recorded as the XObject's width.
# @param height [Integer] Maximum height, also recorded as the XObject's height.
# @return [Origami::Graphics::ImageXObject]
def self.process_image_xobject(image_data, image_quality, width, height)
  compressed_data = Image::ImageCompressor.compress_image(
    image_data,
    quality: image_quality,
    max_width: width,
    max_height: height
  )

  # The compressor hands back an IO-like object (other callers rewind it), whereas the XObject's
  # data is meant to be the raw bytes: read the whole stream into a String before assigning it.
  if compressed_data.respond_to?(:read)
    compressed_data.rewind if compressed_data.respond_to?(:rewind)
    compressed_data = compressed_data.read
  end

  new_image = Origami::Graphics::ImageXObject.new
  new_image.data = compressed_data
  new_image.Width = width
  new_image.Height = height
  new_image.ColorSpace = :DeviceRGB
  new_image.BitsPerComponent = 8

  new_image
end
|
32
132
|
end
|
33
133
|
end
|
34
134
|
end
|
data/lib/mindee/pdf.rb
CHANGED
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../parsing'
|
4
|
+
require_relative 'business_card_v1_document'
|
5
|
+
require_relative 'business_card_v1_page'
|
6
|
+
|
7
|
+
module Mindee
|
8
|
+
module Product
|
9
|
+
# Business Card module.
|
10
|
+
module BusinessCard
|
11
|
+
# Business Card API version 1 inference prediction.
class BusinessCardV1 < Mindee::Parsing::Common::Inference
  @endpoint_name = 'business_card'
  @endpoint_version = '1'

  # Builds the document-level prediction, then one page object for every page entry
  # carrying a non-empty prediction.
  # @param prediction [Hash]
  def initialize(prediction)
    super
    @prediction = BusinessCardV1Document.new(prediction['prediction'], nil)

    pages_with_prediction = prediction['pages'].select do |page|
      page.key?('prediction') && !page['prediction'].nil? && !page['prediction'].empty?
    end
    @pages = pages_with_prediction.map { |page| BusinessCardV1Page.new(page) }
  end

  class << self
    # Name of the endpoint for this product.
    # @return [String]
    attr_reader :endpoint_name
    # Version for this product.
    # @return [String]
    attr_reader :endpoint_version
  end
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../parsing'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Product
|
7
|
+
module BusinessCard
|
8
|
+
# Business Card API version 1.0 document data.
class BusinessCardV1Document < Mindee::Parsing::Common::Prediction
  include Mindee::Parsing::Standard

  # The address of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :address
  # The company the person works for.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :company
  # The email address of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :email
  # The Fax number of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :fax_number
  # The given name of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :firstname
  # The job title of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :job_title
  # The lastname of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :lastname
  # The mobile number of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :mobile_number
  # The phone number of the person.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :phone_number
  # The social media profiles of the person or company.
  # @return [Array<Mindee::Parsing::Standard::StringField>]
  attr_reader :social_media
  # The website of the person or company.
  # @return [Mindee::Parsing::Standard::StringField]
  attr_reader :website

  # Wraps every entry of the raw prediction into a string field.
  # @param prediction [Hash]
  # @param page_id [Integer, nil]
  def initialize(prediction, page_id)
    super()
    @address = StringField.new(prediction['address'], page_id)
    @company = StringField.new(prediction['company'], page_id)
    @email = StringField.new(prediction['email'], page_id)
    @fax_number = StringField.new(prediction['fax_number'], page_id)
    @firstname = StringField.new(prediction['firstname'], page_id)
    @job_title = StringField.new(prediction['job_title'], page_id)
    @lastname = StringField.new(prediction['lastname'], page_id)
    @mobile_number = StringField.new(prediction['mobile_number'], page_id)
    @phone_number = StringField.new(prediction['phone_number'], page_id)
    @social_media = prediction['social_media'].map { |item| StringField.new(item, page_id) }
    @website = StringField.new(prediction['website'], page_id)
  end

  # Renders one ":Label: value" line per field, with trailing whitespace stripped from each line.
  # @return [String]
  def to_s
    social_media = @social_media.join("\n #{' ' * 14}")
    labelled_values = [
      ['Firstname', @firstname],
      ['Lastname', @lastname],
      ['Job Title', @job_title],
      ['Company', @company],
      ['Email', @email],
      ['Phone Number', @phone_number],
      ['Mobile Number', @mobile_number],
      ['Fax Number', @fax_number],
      ['Address', @address],
      ['Website', @website],
      ['Social Media', social_media],
    ]

    rendered = labelled_values.map { |label, value| "\n:#{label}: #{value}".rstrip }.join
    # Drop the leading line break introduced by the first line.
    rendered[1..].to_s
  end
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../parsing'
|
4
|
+
require_relative 'business_card_v1_document'
|
5
|
+
|
6
|
+
module Mindee
|
7
|
+
module Product
|
8
|
+
module BusinessCard
|
9
|
+
# Business Card API version 1.0 page data.
class BusinessCardV1Page < Mindee::Parsing::Common::Page
  # Builds the page-level prediction from the page's 'prediction' and 'id' entries.
  # @param prediction [Hash]
  def initialize(prediction)
    super(prediction)
    page_prediction = prediction['prediction']
    page_id = prediction['id']
    @prediction = BusinessCardV1PagePrediction.new(page_prediction, page_id)
  end
end
|
20
|
+
|
21
|
+
# Business Card V1 page prediction.
class BusinessCardV1PagePrediction < BusinessCardV1Document
  # Same rendering as the document-level prediction, preceded by a line break.
  # @return [String]
  def to_s
    # String.new keeps the result mutable, as the file uses frozen string literals.
    String.new("\n#{super}")
  end
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../parsing'
|
4
|
+
require_relative 'delivery_note_v1_document'
|
5
|
+
require_relative 'delivery_note_v1_page'
|
6
|
+
|
7
|
+
module Mindee
|
8
|
+
module Product
|
9
|
+
# Delivery note module.
|
10
|
+
module DeliveryNote
|
11
|
+
# Delivery note API version 1 inference prediction.
class DeliveryNoteV1 < Mindee::Parsing::Common::Inference
  @endpoint_name = 'delivery_notes'
  @endpoint_version = '1'

  # Builds the document-level prediction, then one page object for every page entry
  # carrying a non-empty prediction.
  # @param prediction [Hash]
  def initialize(prediction)
    super
    @prediction = DeliveryNoteV1Document.new(prediction['prediction'], nil)

    pages_with_prediction = prediction['pages'].select do |page|
      page.key?('prediction') && !page['prediction'].nil? && !page['prediction'].empty?
    end
    @pages = pages_with_prediction.map { |page| DeliveryNoteV1Page.new(page) }
  end

  class << self
    # Name of the endpoint for this product.
    # @return [String]
    attr_reader :endpoint_name
    # Version for this product.
    # @return [String]
    attr_reader :endpoint_version
  end
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|