mindee 3.11.0 → 3.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/CHANGELOG.md +27 -0
  4. data/README.md +23 -23
  5. data/Rakefile +5 -0
  6. data/bin/mindee.rb +7 -1
  7. data/docs/bank_account_details_v2.md +5 -1
  8. data/docs/bank_check_v1.md +6 -2
  9. data/docs/bank_statement_fr_v1.md +3 -0
  10. data/docs/barcode_reader_v1.md +5 -1
  11. data/docs/bill_of_lading_v1.md +202 -0
  12. data/docs/carte_grise_v1.md +5 -1
  13. data/docs/carte_vitale_v1.md +5 -1
  14. data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
  15. data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
  16. data/docs/code_samples/financial_document_v1_async.txt +19 -0
  17. data/docs/code_samples/invoices_v4_async.txt +19 -0
  18. data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
  19. data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
  20. data/docs/code_samples/us_healthcare_cards_v1_async.txt +19 -0
  21. data/docs/cropper_v1.md +6 -2
  22. data/docs/custom_v1.md +5 -3
  23. data/docs/energy_bill_fra_v1.md +249 -0
  24. data/docs/eu_driver_license_v1.md +6 -2
  25. data/docs/expense_receipts_v5.md +38 -11
  26. data/docs/financial_document_v1.md +80 -23
  27. data/docs/generated_v1.md +3 -0
  28. data/docs/getting_started.md +3 -0
  29. data/docs/idcard_fr_v2.md +15 -2
  30. data/docs/international_id_v2.md +13 -1
  31. data/docs/invoice_splitter_v1.md +16 -13
  32. data/docs/invoices_v4.md +57 -23
  33. data/docs/license_plates_v1.md +5 -1
  34. data/docs/multi_receipts_detector_v1.md +5 -1
  35. data/docs/nutrition_facts_v1.md +295 -0
  36. data/docs/passport_v1.md +5 -1
  37. data/docs/payslip_fra_v2.md +218 -0
  38. data/docs/proof_of_address_v1.md +5 -1
  39. data/docs/resume_v1.md +24 -1
  40. data/docs/us_driver_license_v1.md +6 -2
  41. data/docs/us_healthcare_cards_v1.md +208 -0
  42. data/docs/us_mail_v2.md +5 -1
  43. data/docs/us_w9_v1.md +6 -2
  44. data/examples/auto_invoice_splitter_extraction.rb +43 -0
  45. data/lib/mindee/client.rb +20 -8
  46. data/lib/mindee/extraction/common/extracted_image.rb +73 -0
  47. data/lib/mindee/extraction/common/image_extractor.rb +189 -0
  48. data/lib/mindee/extraction/common.rb +4 -0
  49. data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +26 -0
  50. data/lib/mindee/extraction/multi_receipts_extractor.rb +3 -0
  51. data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
  52. data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
  53. data/lib/mindee/extraction/pdf_extractor.rb +4 -0
  54. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
  55. data/lib/mindee/extraction/tax_extractor.rb +1 -320
  56. data/lib/mindee/extraction.rb +3 -0
  57. data/lib/mindee/http/endpoint.rb +18 -6
  58. data/lib/mindee/input/sources.rb +8 -0
  59. data/lib/mindee/parsing/common/api_response.rb +1 -1
  60. data/lib/mindee/parsing/common/document.rb +31 -1
  61. data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
  62. data/lib/mindee/parsing/common/extras/extras.rb +50 -0
  63. data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
  64. data/lib/mindee/parsing/common/extras.rb +5 -0
  65. data/lib/mindee/parsing/common/page.rb +5 -0
  66. data/lib/mindee/parsing/standard/base_field.rb +1 -0
  67. data/lib/mindee/parsing/standard/company_registration_field.rb +17 -0
  68. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
  69. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
  70. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
  71. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
  72. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
  73. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
  74. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
  75. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
  76. data/lib/mindee/product/financial_document/financial_document_v1_document.rb +3 -1
  77. data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +22 -1
  78. data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
  79. data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
  80. data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
  81. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
  82. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
  83. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
  84. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
  85. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
  86. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
  87. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
  88. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
  89. data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
  90. data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
  91. data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
  92. data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
  93. data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
  94. data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
  95. data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
  96. data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
  97. data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
  98. data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
  99. data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
  100. data/lib/mindee/product/international_id/international_id_v2_document.rb +1 -1
  101. data/lib/mindee/product/international_id/international_id_v2_page.rb +1 -1
  102. data/lib/mindee/product/invoice/invoice_v4_document.rb +3 -1
  103. data/lib/mindee/product/invoice/invoice_v4_line_item.rb +22 -1
  104. data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
  105. data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
  106. data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +1 -1
  107. data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +1 -1
  108. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
  109. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
  110. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
  111. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
  112. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
  113. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
  114. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
  115. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
  116. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
  117. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
  118. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
  119. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
  120. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
  121. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
  122. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
  123. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
  124. data/lib/mindee/product/receipt/receipt_v5_document.rb +1 -1
  125. data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
  126. data/lib/mindee/product/receipt/receipt_v5_page.rb +1 -1
  127. data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
  128. data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
  129. data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
  130. data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
  131. data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
  132. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1.rb +41 -0
  133. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +65 -0
  134. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_document.rb +127 -0
  135. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_page.rb +34 -0
  136. data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
  137. data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
  138. data/lib/mindee/product.rb +6 -1
  139. data/lib/mindee/version.rb +1 -1
  140. data/mindee.gemspec +1 -0
  141. metadata +91 -3
  142. /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
data/lib/mindee/client.rb CHANGED
@@ -17,13 +17,16 @@ module Mindee
17
17
  # Call prediction API on a document and parse the results.
18
18
  #
19
19
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
20
- # @param product_class [Mindee::Product] class of the product
20
+ # @param product_class [Mindee::Inference] class of the product
21
21
  # @param endpoint [HTTP::Endpoint] Endpoint of the API
22
22
  # Doesn't need to be set in the case of OTS APIs.
23
23
  #
24
24
  # @param all_words [Boolean] Whether to include the full text for each page.
25
25
  # This performs a full OCR operation on the server and will increase response time.
26
26
  #
27
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
28
+ # This performs a full OCR operation on the server and may increase response time.
29
+ #
27
30
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
28
31
  # Set to false if you need to access the file after this operation.
29
32
  #
@@ -45,6 +48,7 @@ module Mindee
45
48
  product_class,
46
49
  endpoint: nil,
47
50
  all_words: false,
51
+ full_text: false,
48
52
  close_file: true,
49
53
  page_options: nil,
50
54
  cropper: false
@@ -53,20 +57,23 @@ module Mindee
53
57
  input_source.process_pdf(page_options)
54
58
  end
55
59
  endpoint = initialize_endpoint(product_class) if endpoint.nil?
56
- prediction, raw_http = endpoint.predict(input_source, all_words, close_file, cropper)
60
+ prediction, raw_http = endpoint.predict(input_source, all_words, full_text, close_file, cropper)
57
61
  Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
58
62
  end
59
63
 
60
64
  # Enqueue a document for async parsing
61
65
  #
66
+ # @param product_class [Mindee::Inference] class of the product
62
67
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
63
- # @param product_class [Mindee::Product] class of the product
64
68
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
65
69
  # Doesn't need to be set in the case of OTS APIs.
66
70
  #
67
71
  # @param all_words [Boolean] Whether to extract all the words on each page.
68
72
  # This performs a full OCR operation on the server and will increase response time.
69
73
  #
74
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
75
+ # This performs a full OCR operation on the server and may increase response time.
76
+ #
70
77
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
71
78
  # Set to false if you need to access the file after this operation.
72
79
  #
@@ -88,6 +95,7 @@ module Mindee
88
95
  product_class,
89
96
  endpoint: nil,
90
97
  all_words: false,
98
+ full_text: false,
91
99
  close_file: true,
92
100
  page_options: nil,
93
101
  cropper: false
@@ -96,7 +104,7 @@ module Mindee
96
104
  input_source.process_pdf(page_options)
97
105
  end
98
106
  endpoint = initialize_endpoint(product_class) if endpoint.nil?
99
- prediction, raw_http = endpoint.predict_async(input_source, all_words, close_file, cropper)
107
+ prediction, raw_http = endpoint.predict_async(input_source, all_words, full_text, close_file, cropper)
100
108
  Mindee::Parsing::Common::ApiResponse.new(product_class,
101
109
  prediction, raw_http)
102
110
  end
@@ -104,7 +112,7 @@ module Mindee
104
112
  # Parses a queued document
105
113
  #
106
114
  # @param job_id [String] Id of the job (queue) to poll from
107
- # @param product_class [Mindee::Product] class of the product
115
+ # @param product_class [Mindee::Inference] class of the product
108
116
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
109
117
  # Doesn't need to be set in the case of OTS APIs.
110
118
  #
@@ -123,11 +131,13 @@ module Mindee
123
131
  # Enqueue a document for async parsing and automatically try to retrieve it
124
132
  #
125
133
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
126
- # @param product_class [Mindee::Product] class of the product
134
+ # @param product_class [Mindee::Inference] class of the product
127
135
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
128
136
  # Doesn't need to be set in the case of OTS APIs.
129
137
  # @param all_words [Boolean] Whether to extract all the words on each page.
130
138
  # This performs a full OCR operation on the server and will increase response time.
139
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
140
+ # This performs a full OCR operation on the server and may increase response time.
131
141
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
132
142
  # Set to false if you need to access the file after this operation.
133
143
  # @param page_options [Hash, nil] Page cutting/merge options:
@@ -147,6 +157,7 @@ module Mindee
147
157
  product_class,
148
158
  endpoint: nil,
149
159
  all_words: false,
160
+ full_text: false,
150
161
  close_file: true,
151
162
  page_options: nil,
152
163
  cropper: false,
@@ -159,6 +170,7 @@ module Mindee
159
170
  product_class,
160
171
  endpoint: endpoint,
161
172
  all_words: all_words,
173
+ full_text: full_text,
162
174
  close_file: close_file,
163
175
  page_options: page_options,
164
176
  cropper: cropper
@@ -184,7 +196,7 @@ module Mindee
184
196
 
185
197
  # Load a prediction.
186
198
  #
187
- # @param product_class [Mindee::Product] class of the product
199
+ # @param product_class [Mindee::Inference] class of the product
188
200
  # @param local_response [Mindee::Input::LocalResponse]
189
201
  # @return [Mindee::Parsing::Common::ApiResponse]
190
202
  def load_prediction(product_class, local_response)
@@ -269,7 +281,7 @@ module Mindee
269
281
  end
270
282
 
271
283
  # Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
272
- # @param product_class [Mindee::Product] class of the product
284
+ # @param product_class [Mindee::Inference] class of the product
273
285
  #
274
286
  # @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
275
287
  # API Builder. Do not set for standard (off the shelf) endpoints.
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../input/sources'
4
+
5
module Mindee
  # Image Extraction Module.
  module ImageExtraction
    # An image extracted from a page, held in memory together with the page and element it came from.
    class ExtractedImage
      # Id of the page the image was extracted from.
      attr_reader :page_id

      # Id of the element on a given page (0 when no id was provided).
      attr_reader :element_id

      # Buffer object of the file's content.
      attr_reader :buffer

      # Internal name for the file.
      attr_reader :internal_file_name

      # Copies the content of the input source into a private buffer and builds the internal file name.
      #
      # @param input_source [LocalInputSource] Local source for input. Its `io_stream`, `filename` and `pdf?`
      #   members are read.
      # @param page_id [Integer] ID of the page the element was found on.
      # @param element_id [Integer, nil] ID of the element in a page. Defaults to 0 when nil.
      def initialize(input_source, page_id, element_id)
        @buffer = StringIO.new(input_source.io_stream.read)
        @buffer.rewind
        @page_id = page_id
        # Default the element id first so that a nil id never leaks into the file name.
        @element_id = element_id.nil? ? 0 : element_id
        extension = if input_source.pdf?
                      'jpg'
                    else
                      # `File.extname` keeps the leading dot; drop it to avoid names ending in "..ext".
                      File.extname(input_source.filename).delete('.')
                    end
        @internal_file_name = "#{input_source.filename}_p#{@page_id}_#{@element_id}.#{extension}"
      end

      # Saves the document to a file.
      #
      # @param output_path [String] Path to save the file to. The parent directory must exist, the file itself
      #   does not have to.
      # @param file_format [String, nil] Optional MiniMagick-compatible format for the file. Inferred from file
      #   extension if not provided.
      # @raise [RuntimeError] If an invalid path or filename is provided, or if the file could not be written.
      #   A missing/unknown format is reported through the same generic "Could not save file" error, since the
      #   ArgumentError raised below is caught by the `rescue StandardError` clause.
      def save_to_file(output_path, file_format = nil)
        # `realpath` requires the target to exist already, which made saving a new file impossible.
        # `realdirpath` resolves the path the same way but tolerates a missing last component.
        resolved_path = Pathname.new(output_path).realdirpath
        if file_format.nil?
          inferred_format = resolved_path.extname.delete('.')
          raise ArgumentError, 'Invalid file format.' if inferred_format.empty?

          file_format = inferred_format.upcase
        end
        @buffer.rewind
        image = MiniMagick::Image.read(@buffer)
        image.format file_format.downcase
        image.write resolved_path.to_s
        # NOTE(review): `logger` is not defined in this file; confirm it is provided elsewhere, otherwise the
        # resulting NameError is turned into "Could not save file" after the image was actually written.
        logger.info("File saved successfully to '#{resolved_path}'.")
      rescue TypeError
        raise 'Invalid path/filename provided.'
      rescue StandardError
        raise "Could not save file #{Pathname.new(output_path).basename}."
      end

      # Return the file as a Mindee-compatible bytes input source.
      #
      # @return [Mindee::Input::Source::BytesInputSource] Source built from the buffer and the internal file name.
      def as_source
        @buffer.rewind
        Mindee::Input::Source::BytesInputSource.new(@buffer.read, @internal_file_name)
      end
    end
  end
end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mini_magick'
4
+ require 'origami'
5
+ require 'stringio'
6
+ require 'tempfile'
7
+ require_relative '../../input/sources'
8
+ require_relative 'extracted_image'
9
+
10
+ module Mindee
11
+ # Image Extraction Module.
12
+ module ImageExtraction
13
+ def self.attach_image_as_new_file(input_buffer)
14
+ # Attaches an image as a new page in a PdfDocument object.
15
+ #
16
+ # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
17
+ # @return [Origami::PDF] A PdfDocument handle.
18
+
19
+ magick_image = MiniMagick::Image.read(input_buffer)
20
+ # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
21
+ # converted.
22
+ magick_image.format('jpg')
23
+ original_density = magick_image.resolution
24
+ scale_factor = original_density[0].to_f / 4.166666 # No clue why, but the resolution needs to be reduced for
25
+ # the pdf otherwise the resulting image shrinks.
26
+ magick_image.format('pdf', 0, { density: scale_factor.to_s })
27
+ Origami::PDF.read(StringIO.new(magick_image.to_blob))
28
+ end
29
+
30
+ # Extracts multiple images from a given local input source.
31
+ #
32
+ # @param [Mindee::Input::Source::LocalInputSource] input_source
33
+ # @param [Integer] page_id ID of the Page to extract from.
34
+ # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
35
+ # to extract.
36
+ # @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
37
+ def extract_multiple_images_from_source(input_source, page_id, polygons)
38
+ new_stream = load_doc(input_source, page_id)
39
+ new_stream.seek(0)
40
+
41
+ extract_images_from_polygons(input_source, new_stream, page_id, polygons)
42
+ end
43
+
44
+ # Retrieves a PDF document's page.
45
+ #
46
+ # @param [Origami::PDF] pdf_doc Origami PDF handle.
47
+ # @param [Integer] page_id Page ID.
48
+ def get_page(pdf_doc, page_id)
49
+ stream = StringIO.new
50
+ pdf_doc.save(stream)
51
+
52
+ options = {
53
+ page_indexes: [page_id - 1],
54
+ }
55
+
56
+ Mindee::PDF::PdfProcessor.parse(stream, options)
57
+ end
58
+
59
+ # Extracts images from their positions on a file (as polygons).
60
+ #
61
+ # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
62
+ # @param [StringIO] pdf_stream Buffer of the PDF.
63
+ # @param [Integer] page_id Page ID.
64
+ # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
65
+ # @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
66
+ def extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
67
+ extracted_elements = []
68
+
69
+ polygons.each_with_index do |polygon, element_id|
70
+ polygon = normalize_polygon(polygon)
71
+ page_content = read_page_content(pdf_stream)
72
+
73
+ min_max_x = Geometry.get_min_max_x([
74
+ polygon.top_left,
75
+ polygon.bottom_right,
76
+ polygon.top_right,
77
+ polygon.bottom_left,
78
+ ])
79
+ min_max_y = Geometry.get_min_max_y([
80
+ polygon.top_left,
81
+ polygon.bottom_right,
82
+ polygon.top_right,
83
+ polygon.bottom_left,
84
+ ])
85
+ file_extension = determine_file_extension(input_source)
86
+ cropped_image = crop_image(page_content, min_max_x, min_max_y)
87
+ if file_extension == 'pdf'
88
+ cropped_image.format('jpg')
89
+ else
90
+ cropped_image.format(file_extension)
91
+ end
92
+
93
+ buffer = StringIO.new
94
+ write_image_to_buffer(cropped_image, buffer)
95
+ file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
96
+
97
+ extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
98
+ end
99
+
100
+ extracted_elements
101
+ end
102
+
103
+ # Retrieves the bounding box of a polygon.
104
+ #
105
+ # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
106
+ def normalize_polygon(polygon)
107
+ if polygon.is_a?(Mindee::Geometry::Polygon)
108
+ Mindee::Geometry.get_bounding_box(polygon)
109
+ else
110
+ polygon
111
+ end
112
+ end
113
+
114
+ # Loads a buffer into a MiniMagick Image.
115
+ #
116
+ # @param [StringIO] pdf_stream Buffer containing the PDF
117
+ # @return [MiniMagick::Image] a valid MiniMagick image handle.
118
+ def read_page_content(pdf_stream)
119
+ pdf_stream.rewind
120
+ MiniMagick::Image.read(pdf_stream)
121
+ end
122
+
123
+ # Crops a MiniMagick Image to the given bounding box.
124
+ #
125
+ # @param [MiniMagick::Image] image Input Image.
126
+ # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
127
+ # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
128
+ def crop_image(image, min_max_x, min_max_y)
129
+ width = image[:width].to_i
130
+ height = image[:height].to_i
131
+
132
+ image.format('jpg')
133
+ new_width = (min_max_x.max - min_max_x.min) * width
134
+ new_height = (min_max_y.max - min_max_y.min) * height
135
+ image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
136
+
137
+ image
138
+ end
139
+
140
+ # Writes a MiniMagick::Image to a buffer.
141
+ #
142
+ # @param [MiniMagick::Image] image a valid MiniMagick image.
143
+ # @param [StringIO] buffer
144
+ def write_image_to_buffer(image, buffer)
145
+ image.write(buffer)
146
+ end
147
+
148
+ # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
149
+ #
150
+ # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
151
+ # @return [String] A valid file extension.
152
+ def determine_file_extension(input_source)
153
+ if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')
154
+ 'jpg'
155
+ else
156
+ File.extname(input_source.filename).strip.downcase[1..]
157
+ end
158
+ end
159
+
160
+ # Generates an ExtractedImage.
161
+ #
162
+ # @param [StringIO] buffer Buffer containing the image.
163
+ # @param [String] file_name Name for the file.
164
+ # @param [Object] page_id ID of the page the file was generated from.
165
+ # @param [Object] element_id ID of the element of a given page.
166
+ def create_extracted_image(buffer, file_name, page_id, element_id)
167
+ buffer.rewind
168
+ ExtractedImage.new(
169
+ Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
170
+ page_id,
171
+ element_id
172
+ )
173
+ end
174
+
175
+ # Loads a single_page from an image file or a pdf document.
176
+ #
177
+ # @param input_file [LocalInputSource] Local input.
178
+ # @param [Integer] page_id Page ID.
179
+ # @return [StringIO, IO] Stream of the extracted PDF page, or the input's own stream when it is not a PDF.
180
+ def load_doc(input_file, page_id)
181
+ input_file.io_stream.rewind
182
+ if input_file.pdf?
183
+ get_page(Origami::PDF.read(input_file.io_stream), page_id)
184
+ else
185
+ input_file.io_stream
186
+ end
187
+ end
188
+ end
189
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'common/extracted_image'
4
+ require_relative 'common/image_extractor'
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image Extraction Module.
5
+ module ImageExtraction
6
+ def extract_receipts(input_source, inference)
7
+ # Extracts individual receipts from multi-receipts documents.
8
+ #
9
+ # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
10
+ # @param inference [Inference] Results of the inference.
11
+ # @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
12
+
13
+ images = []
14
+ raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts
15
+
16
+ (0...input_source.count_pdf_pages).each do |page_id|
17
+ receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box)
18
+ images.concat(
19
+ extract_multiple_images_from_source(input_source, page_id + 1, receipt_positions)
20
+ )
21
+ end
22
+
23
+ images
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'multi_receipts_extractor/multi_receipts_extractor'
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Mindee
  # Pdf Extraction Module.
  module Extraction
    module PdfExtractor
      # An extracted sub-Pdf.
      class ExtractedPdf
        # Byte contents of the pdf
        # @return [StringIO]
        attr_reader :pdf_bytes

        # Name of the file.
        # @return [String]
        attr_reader :filename

        # @param pdf_bytes [StringIO] Stream holding the content of the sub-Pdf.
        # @param filename [String] Name given to the sub-Pdf.
        def initialize(pdf_bytes, filename)
          @pdf_bytes = pdf_bytes
          @filename = filename
        end

        # Retrieves the page count for a given pdf.
        # @return [Integer]
        # @raise [RuntimeError] If opening the pdf raises a TypeError.
        def page_count
          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
          current_pdf.pages.size
        rescue TypeError
          raise 'Could not retrieve page count from Extracted PDF object.'
        end

        # Writes the contents of the current PDF object to a file.
        # A '.pdf' extension is appended when the given path does not already carry one.
        #
        # @param output_path [String] Path to write to.
        # @return [Integer] Number of bytes written.
        # @raise [RuntimeError] If the path is a directory or its parent directory does not exist.
        def write_to_file(output_path)
          # The original checked an undefined `destination` local, which raised NameError on every call.
          raise 'Provided path is not a file' if File.directory?(output_path)
          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

          # Only add the extension when it is missing (the previous check was inverted and produced ".pdf.pdf").
          unless File.extname(output_path).downcase == '.pdf'
            base_path = File.expand_path('..', output_path)
            output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path)
          end

          # Write the raw bytes: handing the stream object itself to File.write would store its `to_s`.
          File.binwrite(output_path, raw_bytes)
        end

        # Returns the current PDF object as a usable BytesInputSource.
        # @return [Mindee::Input::Source::BytesInputSource]
        def as_input_source
          Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
        end

        private

        # Full content of the pdf as a String, regardless of the current read position of the stream.
        # @return [String]
        def raw_bytes
          return @pdf_bytes.to_s unless @pdf_bytes.respond_to?(:read)

          @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
          @pdf_bytes.read
        end
      end
    end
  end
end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Pdf Extraction namespace.
    module PdfExtractor
      # Splits a local input into sub-Pdfs made of selected pages.
      class PdfExtractor
        # Keeps the stream of a Pdf input as is; any other input is first converted to a Pdf through
        # `ImageExtraction.attach_image_as_new_file`.
        #
        # @param local_input [Mindee::Input::Source::LocalInputSource]
        def initialize(local_input)
          @filename = local_input.filename
          if local_input.pdf?
            @source_pdf = local_input.io_stream
          else
            pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
            io_buffer = StringIO.new
            pdf_image.save(io_buffer)

            @source_pdf = io_buffer
          end
        end

        # Retrieves the page count for the Pdf object.
        # @return [Integer]
        def page_count
          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
        end

        # Creates a new Pdf from pages and save it into a buffer.
        # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
        # @return [StringIO] The buffer containing the new Pdf, as returned by `PdfProcessor.parse`.
        def cut_pages(page_indexes)
          options = {
            page_indexes: page_indexes,
          }

          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
        end

        # Extract the sub-documents from the main pdf, based on the given list of page indexes.
        # Each sub-document is named "<basename>_<first page>-<last page><extension>", pages being 1-based and
        # zero-padded to three digits.
        #
        # @param page_indexes [Array<Array<Integer>>] Lists of zero-based page indexes, one list per sub-document.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The extracted sub-documents.
        # @raise [RuntimeError] If a list is nil or empty, or if an index is negative or past the last page.
        def extract_sub_documents(page_indexes)
          extension = File.extname(@filename)
          basename = File.basename(@filename, extension)
          total_pages = nil
          page_indexes.map do |page_index_list|
            # nil has to be tested first: calling `empty?` on nil raises NoMethodError.
            if page_index_list.nil? || page_index_list.empty?
              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
            end

            # Parse the source only once rather than once per checked index.
            total_pages ||= page_count
            page_index_list.each do |page_index|
              # Indexes are zero-based, so an index equal to the page count is already out of range.
              raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
            end
            first_page = format('%03d', page_index_list.first + 1)
            last_page = format('%03d', page_index_list.last + 1)
            field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
            Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), field_filename)
          end
        end

        # rubocop:disable Metrics/CyclomaticComplexity
        # rubocop:disable Metrics/PerceivedComplexity
        # Extracts invoices as complete PDFs from the document.
        # Plain index lists are extracted as given. Page groups are extracted as given unless `strict` is set,
        # in which case groups are regrouped according to their confidence (threshold: 0.5).
        #
        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
        # @param strict [Boolean] Whether to regroup page groups according to their confidence.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
        # @raise [RuntimeError] If no indexes are provided.
        def extract_invoices(page_indexes, strict: false)
          raise 'No indexes provided.' if page_indexes.empty?
          unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
            return extract_sub_documents(page_indexes)
          end
          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

          correct_page_indexes = []
          current_list = []
          previous_confidence = nil
          last_position = page_indexes.length - 1
          # NOTE(review): when a low-confidence group sits in the middle of the list, the `else` branch pushes
          # `current_list` without resetting it, so the same list can be emitted again later. Confirm intent.
          page_indexes.each_with_index do |page_index, i|
            confidence = page_index.confidence
            page_list = page_index.page_indexes

            if confidence >= 0.5 && previous_confidence.nil?
              current_list = page_list
            elsif confidence >= 0.5 && i < last_position
              correct_page_indexes << current_list
              current_list = page_list
            elsif confidence < 0.5 && i == last_position
              # Build a new list: `concat` would modify the page group (and thus the prediction) in place.
              correct_page_indexes << (current_list + page_list)
            else
              correct_page_indexes << current_list
              correct_page_indexes << page_list
            end
            previous_confidence = confidence
          end
          extract_sub_documents(correct_page_indexes)
        end
        # rubocop:enable Metrics/CyclomaticComplexity
        # rubocop:enable Metrics/PerceivedComplexity

        private

        attr_reader :source_pdf, :filename
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'pdf_extractor/pdf_extractor'
4
+ require_relative 'pdf_extractor/extracted_pdf'