mindee 3.12.0 → 3.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +26 -0
- data/README.md +23 -23
- data/Rakefile +5 -0
- data/docs/bank_account_details_v2.md +5 -1
- data/docs/bank_check_v1.md +6 -2
- data/docs/bank_statement_fr_v1.md +3 -0
- data/docs/barcode_reader_v1.md +5 -1
- data/docs/bill_of_lading_v1.md +251 -0
- data/docs/carte_grise_v1.md +5 -1
- data/docs/carte_vitale_v1.md +5 -1
- data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
- data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
- data/docs/code_samples/invoices_v4_async.txt +19 -0
- data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
- data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
- data/docs/cropper_v1.md +6 -2
- data/docs/custom_v1.md +5 -3
- data/docs/energy_bill_fra_v1.md +309 -0
- data/docs/eu_driver_license_v1.md +6 -2
- data/docs/expense_receipts_v5.md +30 -5
- data/docs/financial_document_v1.md +43 -1
- data/docs/generated_v1.md +3 -0
- data/docs/getting_started.md +3 -0
- data/docs/idcard_fr_v2.md +15 -2
- data/docs/international_id_v2.md +13 -1
- data/docs/invoice_splitter_v1.md +16 -13
- data/docs/invoices_v4.md +70 -23
- data/docs/license_plates_v1.md +5 -1
- data/docs/multi_receipts_detector_v1.md +5 -1
- data/docs/nutrition_facts_v1.md +374 -0
- data/docs/passport_v1.md +5 -1
- data/docs/payslip_fra_v2.md +294 -0
- data/docs/proof_of_address_v1.md +5 -1
- data/docs/resume_v1.md +24 -1
- data/docs/us_driver_license_v1.md +6 -2
- data/docs/us_healthcare_cards_v1.md +5 -1
- data/docs/us_mail_v2.md +6 -2
- data/docs/us_w9_v1.md +6 -2
- data/examples/auto_invoice_splitter_extraction.rb +48 -0
- data/examples/auto_multi_receipts_detector_extraction.rb +31 -0
- data/lib/mindee/client.rb +20 -8
- data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb +1 -1
- data/lib/mindee/extraction/common/image_extractor.rb +192 -0
- data/lib/mindee/{image_extraction → extraction}/common.rb +1 -0
- data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +32 -0
- data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
- data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
- data/lib/mindee/extraction/pdf_extractor.rb +4 -0
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
- data/lib/mindee/extraction/tax_extractor.rb +1 -320
- data/lib/mindee/extraction.rb +3 -0
- data/lib/mindee/http/endpoint.rb +18 -6
- data/lib/mindee/parsing/common/api_response.rb +1 -1
- data/lib/mindee/parsing/common/document.rb +31 -1
- data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
- data/lib/mindee/parsing/common/extras/extras.rb +50 -0
- data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
- data/lib/mindee/parsing/common/extras.rb +5 -0
- data/lib/mindee/parsing/common/page.rb +5 -0
- data/lib/mindee/parsing/standard/base_field.rb +1 -0
- data/lib/mindee/parsing/standard/date_field.rb +4 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
- data/lib/mindee/product/financial_document/financial_document_v1_document.rb +11 -1
- data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +15 -1
- data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
- data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
- data/lib/mindee/product/invoice/invoice_v4_document.rb +11 -1
- data/lib/mindee/product/invoice/invoice_v4_line_item.rb +15 -1
- data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
- data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
- data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
- data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
- data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
- data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
- data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
- data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +9 -1
- data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
- data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
- data/lib/mindee/product.rb +5 -1
- data/lib/mindee/version.rb +1 -1
- metadata +71 -9
- data/lib/mindee/image_extraction/common/image_extractor.rb +0 -191
- data/lib/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.rb +0 -26
- data/lib/mindee/image_extraction.rb +0 -4
- /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor.rb +0 -0
- /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
data/lib/mindee/client.rb
CHANGED
@@ -17,13 +17,16 @@ module Mindee
|
|
17
17
|
# Call prediction API on a document and parse the results.
|
18
18
|
#
|
19
19
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
20
|
-
# @param product_class [Mindee::
|
20
|
+
# @param product_class [Mindee::Inference] class of the product
|
21
21
|
# @param endpoint [HTTP::Endpoint] Endpoint of the API
|
22
22
|
# Doesn't need to be set in the case of OTS APIs.
|
23
23
|
#
|
24
24
|
# @param all_words [Boolean] Whether to include the full text for each page.
|
25
25
|
# This performs a full OCR operation on the server and will increase response time.
|
26
26
|
#
|
27
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
28
|
+
# This performs a full OCR operation on the server and may increase response time.
|
29
|
+
#
|
27
30
|
# @param close_file [Boolean] Whether to `close()` the file after parsing it.
|
28
31
|
# Set to false if you need to access the file after this operation.
|
29
32
|
#
|
@@ -45,6 +48,7 @@ module Mindee
|
|
45
48
|
product_class,
|
46
49
|
endpoint: nil,
|
47
50
|
all_words: false,
|
51
|
+
full_text: false,
|
48
52
|
close_file: true,
|
49
53
|
page_options: nil,
|
50
54
|
cropper: false
|
@@ -53,20 +57,23 @@ module Mindee
|
|
53
57
|
input_source.process_pdf(page_options)
|
54
58
|
end
|
55
59
|
endpoint = initialize_endpoint(product_class) if endpoint.nil?
|
56
|
-
prediction, raw_http = endpoint.predict(input_source, all_words, close_file, cropper)
|
60
|
+
prediction, raw_http = endpoint.predict(input_source, all_words, full_text, close_file, cropper)
|
57
61
|
Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
|
58
62
|
end
|
59
63
|
|
60
64
|
# Enqueue a document for async parsing
|
61
65
|
#
|
66
|
+
# @param product_class [Mindee::Inference] class of the product
|
62
67
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
63
|
-
# @param product_class [Mindee::Product] class of the product
|
64
68
|
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
|
65
69
|
# Doesn't need to be set in the case of OTS APIs.
|
66
70
|
#
|
67
71
|
# @param all_words [Boolean] Whether to extract all the words on each page.
|
68
72
|
# This performs a full OCR operation on the server and will increase response time.
|
69
73
|
#
|
74
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
75
|
+
# This performs a full OCR operation on the server and may increase response time.
|
76
|
+
#
|
70
77
|
# @param close_file [Boolean] Whether to `close()` the file after parsing it.
|
71
78
|
# Set to false if you need to access the file after this operation.
|
72
79
|
#
|
@@ -88,6 +95,7 @@ module Mindee
|
|
88
95
|
product_class,
|
89
96
|
endpoint: nil,
|
90
97
|
all_words: false,
|
98
|
+
full_text: false,
|
91
99
|
close_file: true,
|
92
100
|
page_options: nil,
|
93
101
|
cropper: false
|
@@ -96,7 +104,7 @@ module Mindee
|
|
96
104
|
input_source.process_pdf(page_options)
|
97
105
|
end
|
98
106
|
endpoint = initialize_endpoint(product_class) if endpoint.nil?
|
99
|
-
prediction, raw_http = endpoint.predict_async(input_source, all_words, close_file, cropper)
|
107
|
+
prediction, raw_http = endpoint.predict_async(input_source, all_words, full_text, close_file, cropper)
|
100
108
|
Mindee::Parsing::Common::ApiResponse.new(product_class,
|
101
109
|
prediction, raw_http)
|
102
110
|
end
|
@@ -104,7 +112,7 @@ module Mindee
|
|
104
112
|
# Parses a queued document
|
105
113
|
#
|
106
114
|
# @param job_id [String] Id of the job (queue) to poll from
|
107
|
-
# @param product_class [Mindee::
|
115
|
+
# @param product_class [Mindee::Inference] class of the product
|
108
116
|
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
|
109
117
|
# Doesn't need to be set in the case of OTS APIs.
|
110
118
|
#
|
@@ -123,11 +131,13 @@ module Mindee
|
|
123
131
|
# Enqueue a document for async parsing and automatically try to retrieve it
|
124
132
|
#
|
125
133
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
126
|
-
# @param product_class [Mindee::
|
134
|
+
# @param product_class [Mindee::Inference] class of the product
|
127
135
|
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
|
128
136
|
# Doesn't need to be set in the case of OTS APIs.
|
129
137
|
# @param all_words [Boolean] Whether to extract all the words on each page.
|
130
138
|
# This performs a full OCR operation on the server and will increase response time.
|
139
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
140
|
+
# This performs a full OCR operation on the server and may increase response time.
|
131
141
|
# @param close_file [Boolean] Whether to `close()` the file after parsing it.
|
132
142
|
# Set to false if you need to access the file after this operation.
|
133
143
|
# @param page_options [Hash, nil] Page cutting/merge options:
|
@@ -147,6 +157,7 @@ module Mindee
|
|
147
157
|
product_class,
|
148
158
|
endpoint: nil,
|
149
159
|
all_words: false,
|
160
|
+
full_text: false,
|
150
161
|
close_file: true,
|
151
162
|
page_options: nil,
|
152
163
|
cropper: false,
|
@@ -159,6 +170,7 @@ module Mindee
|
|
159
170
|
product_class,
|
160
171
|
endpoint: endpoint,
|
161
172
|
all_words: all_words,
|
173
|
+
full_text: full_text,
|
162
174
|
close_file: close_file,
|
163
175
|
page_options: page_options,
|
164
176
|
cropper: cropper
|
@@ -184,7 +196,7 @@ module Mindee
|
|
184
196
|
|
185
197
|
# Load a prediction.
|
186
198
|
#
|
187
|
-
# @param product_class [Mindee::
|
199
|
+
# @param product_class [Mindee::Inference] class of the product
|
188
200
|
# @param local_response [Mindee::Input::LocalResponse]
|
189
201
|
# @return [Mindee::Parsing::Common::ApiResponse]
|
190
202
|
def load_prediction(product_class, local_response)
|
@@ -269,7 +281,7 @@ module Mindee
|
|
269
281
|
end
|
270
282
|
|
271
283
|
# Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
|
272
|
-
# @param product_class [Mindee::
|
284
|
+
# @param product_class [Mindee::Inference] class of the product
|
273
285
|
#
|
274
286
|
# @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
|
275
287
|
# API Builder. Do not set for standard (off the shelf) endpoints.
|
@@ -0,0 +1,192 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mini_magick'
|
4
|
+
require 'origami'
|
5
|
+
require 'stringio'
|
6
|
+
require 'tempfile'
|
7
|
+
require_relative '../../input/sources'
|
8
|
+
require_relative 'extracted_image'
|
9
|
+
|
10
|
+
module Mindee
|
11
|
+
# Image Extraction Module.
|
12
|
+
module Extraction
|
13
|
+
# Image Extraction wrapper class.
|
14
|
+
class ImageExtractor
  # Attaches an image as a new page in a PdfDocument object.
  #
  # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
  # @return [Origami::PDF] A PdfDocument handle.
  def self.attach_image_as_new_file(input_buffer)
    magick_image = MiniMagick::Image.read(input_buffer)
    # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
    # converted.
    magick_image.format('jpg')
    original_density = magick_image.resolution
    # Empirical: the resolution needs to be reduced for the PDF conversion, otherwise the resulting image shrinks.
    # NOTE(review): 4.166666 looks like 300 / 72 (DPI ratio) -- confirm before changing it.
    scale_factor = original_density[0].to_f / 4.166666
    magick_image.format('pdf', 0, { density: scale_factor.to_s })
    Origami::PDF.read(StringIO.new(magick_image.to_blob))
  end

  # Extracts multiple images from a given local input source.
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source
  # @param [Integer] page_id ID of the Page to extract from.
  # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
  # to extract.
  # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
  def self.extract_multiple_images_from_source(input_source, page_id, polygons)
    new_stream = load_doc(input_source, page_id)
    new_stream.seek(0)

    extract_images_from_polygons(input_source, new_stream, page_id, polygons)
  end

  # Retrieves a PDF document's page.
  #
  # @param [Origami::PDF] pdf_doc Origami PDF handle.
  # @param [Integer] page_id Page ID (1-based: it is converted to a 0-based index for the PDF processor).
  def self.get_page(pdf_doc, page_id)
    stream = StringIO.new
    pdf_doc.save(stream)

    options = {
      page_indexes: [page_id - 1],
    }

    Mindee::PDF::PdfProcessor.parse(stream, options)
  end

  # Extracts images from their positions on a file (as polygons).
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
  # @param [StringIO] pdf_stream Buffer of the PDF.
  # @param [Integer] page_id Page ID.
  # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
  # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
  def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
    # The output format only depends on the source file, so it is resolved once for all polygons.
    # PDFs are already coerced to 'jpg' by determine_file_extension.
    file_extension = determine_file_extension(input_source)

    polygons.each_with_index.map do |polygon, element_id|
      polygon = normalize_polygon(polygon)
      # The page is re-read for every polygon because cropping modifies the image in place.
      page_content = read_page_content(pdf_stream)

      corners = [polygon.top_left, polygon.bottom_right, polygon.top_right, polygon.bottom_left]
      min_max_x = Mindee::Geometry.get_min_max_x(corners)
      min_max_y = Mindee::Geometry.get_min_max_y(corners)

      cropped_image = crop_image(page_content, min_max_x, min_max_y)
      cropped_image.format(file_extension)

      buffer = StringIO.new
      write_image_to_buffer(cropped_image, buffer)
      file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"

      create_extracted_image(buffer, file_name, page_id, element_id)
    end
  end

  # Retrieves the bounding box of a polygon.
  # Anything that is not a Mindee::Geometry::Polygon is returned untouched.
  #
  # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
  def self.normalize_polygon(polygon)
    if polygon.is_a?(Mindee::Geometry::Polygon)
      Mindee::Geometry.get_bounding_box(polygon)
    else
      polygon
    end
  end

  # Loads a buffer into a MiniMagick Image.
  #
  # @param [StringIO] pdf_stream Buffer containing the PDF
  # @return [MiniMagick::Image] a valid MiniMagick image handle.
  def self.read_page_content(pdf_stream)
    pdf_stream.rewind
    MiniMagick::Image.read(pdf_stream)
  end

  # Crops a MiniMagick Image from the given bounding box.
  # The image is converted to jpg and cropped in place; the same handle is returned.
  #
  # @param [MiniMagick::Image] image Input Image.
  # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
  # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
  # @return [MiniMagick::Image] the cropped image.
  def self.crop_image(image, min_max_x, min_max_y)
    width = image[:width].to_i
    height = image[:height].to_i

    image.format('jpg')
    # min/max values are multiplied by the image size, i.e. they are treated as fractions of the page.
    new_width = (min_max_x.max - min_max_x.min) * width
    new_height = (min_max_y.max - min_max_y.min) * height
    image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")

    image
  end

  # Writes a MiniMagick::Image to a buffer.
  #
  # @param [MiniMagick::Image] image a valid MiniMagick image.
  # @param [StringIO] buffer
  def self.write_image_to_buffer(image, buffer)
    image.write(buffer)
  end

  # Retrieves the file extension from the main file to apply it to the extracted images.
  # Note: coerces pdf as jpg, and falls back to jpg when the file name carries no extension.
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
  # @return [String] A valid file extension (lower case, without the leading dot).
  def self.determine_file_extension(input_source)
    filename = input_source.filename
    return 'jpg' if input_source.pdf? || filename.downcase.end_with?('pdf')

    extension = File.extname(filename).strip.downcase[1..]
    if extension.nil? || extension.empty? || extension == 'pdf'
      'jpg'
    else
      extension
    end
  end

  # Generates an ExtractedImage.
  #
  # @param [StringIO] buffer Buffer containing the image.
  # @param [String] file_name Name for the file.
  # @param [Object] page_id ID of the page the file was generated from.
  # @param [Object] element_id ID of the element of a given page.
  def self.create_extracted_image(buffer, file_name, page_id, element_id)
    buffer.rewind
    ExtractedImage.new(
      Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
      page_id,
      element_id
    )
  end

  # Loads a single page from an image file or a pdf document.
  #
  # @param input_file [LocalInputSource] Local input.
  # @param [Integer] page_id Page ID.
  # @return [StringIO, IO] Seekable stream: the requested page for a PDF, the untouched input stream otherwise.
  def self.load_doc(input_file, page_id)
    input_file.io_stream.rewind
    if input_file.pdf?
      get_page(Origami::PDF.read(input_file.io_stream), page_id)
    else
      input_file.io_stream
    end
  end
end
|
191
|
+
end
|
192
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../common/image_extractor'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
# Image Extraction Module.
|
7
|
+
module Extraction
|
8
|
+
# Multi-receipts extraction class wrapper.
|
9
|
+
class MultiReceiptsExtractor
  # Extracts individual receipts from multi-receipts documents.
  #
  # Pages are walked in order; the bounding boxes predicted for each page are handed to the image extractor
  # together with the 1-based page number.
  #
  # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
  # @param inference [Inference] Results of the inference.
  # @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
  def self.extract_receipts(input_source, inference)
    candidates = inference.prediction.receipts
    raise 'No possible receipts candidates found for MultiReceipts extraction.' unless candidates

    page_total = input_source.count_pdf_pages
    (1..page_total).flat_map do |page_number|
      page_prediction = inference.pages[page_number - 1].prediction
      boxes = page_prediction.receipts.map { |receipt| receipt.bounding_box }
      Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_number, boxes)
    end
  end
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Pdf Extraction Module.
|
5
|
+
module Extraction
|
6
|
+
module PdfExtractor
|
7
|
+
# An extracted sub-Pdf.
|
8
|
+
class ExtractedPdf
  # Byte contents of the pdf
  # @return [StringIO]
  attr_reader :pdf_bytes

  # Name of the file.
  # @return [String]
  attr_reader :filename

  # @param pdf_bytes [StringIO] Stream holding the PDF data.
  # @param filename [String]
  def initialize(pdf_bytes, filename)
    @pdf_bytes = pdf_bytes
    @filename = filename
  end

  # Retrieves the page count for a given pdf.
  # @return [Integer]
  # @raise [RuntimeError] if the PDF processor raises a TypeError while opening the contents.
  def page_count
    current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
    current_pdf.pages.size
  rescue TypeError
    raise 'Could not retrieve page count from Extracted PDF object.'
  end

  # Writes the contents of the current PDF object to a file.
  # A '.pdf' extension is appended when the given path doesn't already end with one (case-insensitive).
  #
  # @param output_path [String] Path to write to.
  # @return [Integer] Number of bytes written.
  # @raise [RuntimeError] if the path is a directory, or if its parent directory doesn't exist.
  def write_to_file(output_path)
    raise 'Provided path is not a file' if File.directory?(output_path)
    raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

    output_path = "#{output_path}.pdf" unless File.extname(output_path).downcase == '.pdf'

    # Binary write of the actual bytes: handing the stream object itself to File.write would only
    # write its string representation.
    File.binwrite(output_path, raw_bytes)
  end

  # Returns the current PDF object as a usable BytesInputSource.
  # @return [Mindee::Input::Source::BytesInputSource]
  def as_input_source
    Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
  end

  private

  # Reads the whole PDF content, rewinding first so that previous reads don't truncate the result.
  # @return [String]
  def raw_bytes
    return @pdf_bytes.to_s unless @pdf_bytes.respond_to?(:read)

    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
    @pdf_bytes.read
  end
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Pdf Extraction Module.
|
5
|
+
module Extraction
|
6
|
+
# Pdf extraction namespace (holds the extractor and the extracted sub-Pdf classes).
|
7
|
+
module PdfExtractor
|
8
|
+
# Pdf extraction class.
|
9
|
+
class PdfExtractor
  # Wraps a local input as a PDF stream. Non-PDF inputs are first converted to a PDF by the image extractor.
  #
  # @param local_input [Mindee::Input::Source::LocalInputSource]
  def initialize(local_input)
    @filename = local_input.filename
    if local_input.pdf?
      @source_pdf = local_input.io_stream
    else
      pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
      io_buffer = StringIO.new
      pdf_image.save(io_buffer)

      @source_pdf = io_buffer
    end
  end

  # Retrieves the page count for the Pdf object.
  # @return [Integer]
  def page_count
    Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
  end

  # Creates a new Pdf from pages and save it into a buffer.
  # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
  # @return [StringIO] The buffer containing the new Pdf.
  def cut_pages(page_indexes)
    options = {
      page_indexes: page_indexes,
    }

    Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
  end

  # Extract the sub-documents from the main pdf, based on the given list of page indexes.
  # Each sub-document is named "<basename>_<first page>-<last page><extension>", using 1-based, zero-padded
  # page numbers.
  #
  # @param page_indexes [Array<Array<Integer>>] Lists of zero-based page indexes, one list per sub-document.
  # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The extracted sub-documents.
  # @raise [RuntimeError] if a list is nil or empty, or if an index is negative or not below the page count.
  def extract_sub_documents(page_indexes)
    return [] if page_indexes.empty?

    extension = File.extname(@filename)
    basename = File.basename(@filename, extension)
    # Counting pages parses the PDF, so it is done once rather than once per index.
    total_pages = page_count

    page_indexes.map do |page_index_list|
      # nil must be tested first: calling empty? on nil would raise a NoMethodError.
      if page_index_list.nil? || page_index_list.empty?
        raise "Empty indexes aren't allowed for extraction #{page_index_list}"
      end

      page_index_list.each do |page_index|
        # Indexes are zero-based, so the last valid one is total_pages - 1.
        raise "Index #{page_index} is out of range." if page_index >= total_pages || page_index.negative?
      end

      first_page = format('%03d', page_index_list.first + 1)
      last_page = format('%03d', page_index_list.last + 1)
      field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
      Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), field_filename)
    end
  end

  # rubocop:disable Metrics/CyclomaticComplexity
  # rubocop:disable Metrics/PerceivedComplexity
  # Extracts invoices as complete PDFs from the document.
  #
  # Plain index lists are extracted as-is. Page groups are extracted as-is unless +strict+ is set, in which
  # case groups are regrouped according to their confidence (0.5 threshold) before extraction.
  # NOTE(review): in strict mode a single high-confidence group yields no document, and a low-confidence
  # group that isn't last can cause the current group to be emitted twice -- confirm the intended rules.
  #
  # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
  # @param strict [Boolean]
  # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
  # @raise [RuntimeError] if no indexes are provided.
  def extract_invoices(page_indexes, strict: false)
    raise 'No indexes provided.' if page_indexes.empty?
    unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
      return extract_sub_documents(page_indexes)
    end
    return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

    correct_page_indexes = []
    current_list = []
    previous_confidence = nil
    page_indexes.each_with_index do |page_index, i|
      confidence = page_index.confidence
      page_list = page_index.page_indexes

      if confidence >= 0.5 && previous_confidence.nil?
        # Work on a copy so that merging pages below never alters the caller's page groups.
        current_list = page_list.dup
      elsif confidence >= 0.5 && i < page_indexes.length - 1
        correct_page_indexes << current_list
        current_list = page_list.dup
      elsif confidence < 0.5 && i == page_indexes.length - 1
        current_list.concat page_list
        correct_page_indexes << current_list
      else
        correct_page_indexes << current_list
        correct_page_indexes << page_list
      end
      previous_confidence = confidence
    end
    extract_sub_documents(correct_page_indexes)
  end
  # rubocop:enable Metrics/CyclomaticComplexity
  # rubocop:enable Metrics/PerceivedComplexity

  private

  attr_reader :source_pdf, :filename
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|