mindee 3.12.0 → 3.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/CHANGELOG.md +26 -0
  4. data/README.md +23 -23
  5. data/Rakefile +5 -0
  6. data/docs/bank_account_details_v2.md +5 -1
  7. data/docs/bank_check_v1.md +6 -2
  8. data/docs/bank_statement_fr_v1.md +3 -0
  9. data/docs/barcode_reader_v1.md +5 -1
  10. data/docs/bill_of_lading_v1.md +251 -0
  11. data/docs/carte_grise_v1.md +5 -1
  12. data/docs/carte_vitale_v1.md +5 -1
  13. data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
  14. data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
  15. data/docs/code_samples/invoices_v4_async.txt +19 -0
  16. data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
  17. data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
  18. data/docs/cropper_v1.md +6 -2
  19. data/docs/custom_v1.md +5 -3
  20. data/docs/energy_bill_fra_v1.md +309 -0
  21. data/docs/eu_driver_license_v1.md +6 -2
  22. data/docs/expense_receipts_v5.md +30 -5
  23. data/docs/financial_document_v1.md +43 -1
  24. data/docs/generated_v1.md +3 -0
  25. data/docs/getting_started.md +3 -0
  26. data/docs/idcard_fr_v2.md +15 -2
  27. data/docs/international_id_v2.md +13 -1
  28. data/docs/invoice_splitter_v1.md +16 -13
  29. data/docs/invoices_v4.md +70 -23
  30. data/docs/license_plates_v1.md +5 -1
  31. data/docs/multi_receipts_detector_v1.md +5 -1
  32. data/docs/nutrition_facts_v1.md +374 -0
  33. data/docs/passport_v1.md +5 -1
  34. data/docs/payslip_fra_v2.md +294 -0
  35. data/docs/proof_of_address_v1.md +5 -1
  36. data/docs/resume_v1.md +24 -1
  37. data/docs/us_driver_license_v1.md +6 -2
  38. data/docs/us_healthcare_cards_v1.md +5 -1
  39. data/docs/us_mail_v2.md +6 -2
  40. data/docs/us_w9_v1.md +6 -2
  41. data/examples/auto_invoice_splitter_extraction.rb +48 -0
  42. data/examples/auto_multi_receipts_detector_extraction.rb +31 -0
  43. data/lib/mindee/client.rb +20 -8
  44. data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb +1 -1
  45. data/lib/mindee/extraction/common/image_extractor.rb +192 -0
  46. data/lib/mindee/{image_extraction → extraction}/common.rb +1 -0
  47. data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +32 -0
  48. data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
  49. data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
  50. data/lib/mindee/extraction/pdf_extractor.rb +4 -0
  51. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
  52. data/lib/mindee/extraction/tax_extractor.rb +1 -320
  53. data/lib/mindee/extraction.rb +3 -0
  54. data/lib/mindee/http/endpoint.rb +18 -6
  55. data/lib/mindee/parsing/common/api_response.rb +1 -1
  56. data/lib/mindee/parsing/common/document.rb +31 -1
  57. data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
  58. data/lib/mindee/parsing/common/extras/extras.rb +50 -0
  59. data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
  60. data/lib/mindee/parsing/common/extras.rb +5 -0
  61. data/lib/mindee/parsing/common/page.rb +5 -0
  62. data/lib/mindee/parsing/standard/base_field.rb +1 -0
  63. data/lib/mindee/parsing/standard/date_field.rb +4 -0
  64. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
  65. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
  66. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
  67. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
  68. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
  69. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
  70. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
  71. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
  72. data/lib/mindee/product/financial_document/financial_document_v1_document.rb +11 -1
  73. data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +15 -1
  74. data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
  75. data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
  76. data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
  77. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
  78. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
  79. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
  80. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
  81. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
  82. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
  83. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
  84. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
  85. data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
  86. data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
  87. data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
  88. data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
  89. data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
  90. data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
  91. data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
  92. data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
  93. data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
  94. data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
  95. data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
  96. data/lib/mindee/product/invoice/invoice_v4_document.rb +11 -1
  97. data/lib/mindee/product/invoice/invoice_v4_line_item.rb +15 -1
  98. data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
  99. data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
  100. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
  101. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
  102. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
  103. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
  104. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
  105. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
  106. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
  107. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
  108. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
  109. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
  110. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
  111. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
  112. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
  113. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
  114. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
  115. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
  116. data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
  117. data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
  118. data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
  119. data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
  120. data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
  121. data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
  122. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +9 -1
  123. data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
  124. data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
  125. data/lib/mindee/product.rb +5 -1
  126. data/lib/mindee/version.rb +1 -1
  127. metadata +71 -9
  128. data/lib/mindee/image_extraction/common/image_extractor.rb +0 -191
  129. data/lib/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.rb +0 -26
  130. data/lib/mindee/image_extraction.rb +0 -4
  131. /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor.rb +0 -0
  132. /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
data/lib/mindee/client.rb CHANGED
@@ -17,13 +17,16 @@ module Mindee
17
17
  # Call prediction API on a document and parse the results.
18
18
  #
19
19
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
20
- # @param product_class [Mindee::Product] class of the product
20
+ # @param product_class [Mindee::Inference] class of the product
21
21
  # @param endpoint [HTTP::Endpoint] Endpoint of the API
22
22
  # Doesn't need to be set in the case of OTS APIs.
23
23
  #
24
24
  # @param all_words [Boolean] Whether to include the full text for each page.
25
25
  # This performs a full OCR operation on the server and will increase response time.
26
26
  #
27
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
28
+ # This performs a full OCR operation on the server and may increase response time.
29
+ #
27
30
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
28
31
  # Set to false if you need to access the file after this operation.
29
32
  #
@@ -45,6 +48,7 @@ module Mindee
45
48
  product_class,
46
49
  endpoint: nil,
47
50
  all_words: false,
51
+ full_text: false,
48
52
  close_file: true,
49
53
  page_options: nil,
50
54
  cropper: false
@@ -53,20 +57,23 @@ module Mindee
53
57
  input_source.process_pdf(page_options)
54
58
  end
55
59
  endpoint = initialize_endpoint(product_class) if endpoint.nil?
56
- prediction, raw_http = endpoint.predict(input_source, all_words, close_file, cropper)
60
+ prediction, raw_http = endpoint.predict(input_source, all_words, full_text, close_file, cropper)
57
61
  Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
58
62
  end
59
63
 
60
64
  # Enqueue a document for async parsing
61
65
  #
66
+ # @param product_class [Mindee::Inference] class of the product
62
67
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
63
- # @param product_class [Mindee::Product] class of the product
64
68
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
65
69
  # Doesn't need to be set in the case of OTS APIs.
66
70
  #
67
71
  # @param all_words [Boolean] Whether to extract all the words on each page.
68
72
  # This performs a full OCR operation on the server and will increase response time.
69
73
  #
74
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
75
+ # This performs a full OCR operation on the server and may increase response time.
76
+ #
70
77
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
71
78
  # Set to false if you need to access the file after this operation.
72
79
  #
@@ -88,6 +95,7 @@ module Mindee
88
95
  product_class,
89
96
  endpoint: nil,
90
97
  all_words: false,
98
+ full_text: false,
91
99
  close_file: true,
92
100
  page_options: nil,
93
101
  cropper: false
@@ -96,7 +104,7 @@ module Mindee
96
104
  input_source.process_pdf(page_options)
97
105
  end
98
106
  endpoint = initialize_endpoint(product_class) if endpoint.nil?
99
- prediction, raw_http = endpoint.predict_async(input_source, all_words, close_file, cropper)
107
+ prediction, raw_http = endpoint.predict_async(input_source, all_words, full_text, close_file, cropper)
100
108
  Mindee::Parsing::Common::ApiResponse.new(product_class,
101
109
  prediction, raw_http)
102
110
  end
@@ -104,7 +112,7 @@ module Mindee
104
112
  # Parses a queued document
105
113
  #
106
114
  # @param job_id [String] Id of the job (queue) to poll from
107
- # @param product_class [Mindee::Product] class of the product
115
+ # @param product_class [Mindee::Inference] class of the product
108
116
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
109
117
  # Doesn't need to be set in the case of OTS APIs.
110
118
  #
@@ -123,11 +131,13 @@ module Mindee
123
131
  # Enqueue a document for async parsing and automatically try to retrieve it
124
132
  #
125
133
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
126
- # @param product_class [Mindee::Product] class of the product
134
+ # @param product_class [Mindee::Inference] class of the product
127
135
  # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
128
136
  # Doesn't need to be set in the case of OTS APIs.
129
137
  # @param all_words [Boolean] Whether to extract all the words on each page.
130
138
  # This performs a full OCR operation on the server and will increase response time.
139
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
140
+ # This performs a full OCR operation on the server and may increase response time.
131
141
  # @param close_file [Boolean] Whether to `close()` the file after parsing it.
132
142
  # Set to false if you need to access the file after this operation.
133
143
  # @param page_options [Hash, nil] Page cutting/merge options:
@@ -147,6 +157,7 @@ module Mindee
147
157
  product_class,
148
158
  endpoint: nil,
149
159
  all_words: false,
160
+ full_text: false,
150
161
  close_file: true,
151
162
  page_options: nil,
152
163
  cropper: false,
@@ -159,6 +170,7 @@ module Mindee
159
170
  product_class,
160
171
  endpoint: endpoint,
161
172
  all_words: all_words,
173
+ full_text: full_text,
162
174
  close_file: close_file,
163
175
  page_options: page_options,
164
176
  cropper: cropper
@@ -184,7 +196,7 @@ module Mindee
184
196
 
185
197
  # Load a prediction.
186
198
  #
187
- # @param product_class [Mindee::Product] class of the product
199
+ # @param product_class [Mindee::Inference] class of the product
188
200
  # @param local_response [Mindee::Input::LocalResponse]
189
201
  # @return [Mindee::Parsing::Common::ApiResponse]
190
202
  def load_prediction(product_class, local_response)
@@ -269,7 +281,7 @@ module Mindee
269
281
  end
270
282
 
271
283
  # Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
272
- # @param product_class [Mindee::Product] class of the product
284
+ # @param product_class [Mindee::Inference] class of the product
273
285
  #
274
286
  # @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
275
287
  # API Builder. Do not set for standard (off the shelf) endpoints.
@@ -4,7 +4,7 @@ require_relative '../../input/sources'
4
4
 
5
5
  module Mindee
6
6
  # Image Extraction Module.
7
- module ImageExtraction
7
+ module Extraction
8
8
  # Generic class for image extraction.
9
9
  class ExtractedImage
10
10
  # Id of the page the image was extracted from.
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mini_magick'
4
+ require 'origami'
5
+ require 'stringio'
6
+ require 'tempfile'
7
+ require_relative '../../input/sources'
8
+ require_relative 'extracted_image'
9
+
10
+ module Mindee
11
+ # Image Extraction Module.
12
+ module Extraction
13
# Image Extraction wrapper class.
class ImageExtractor
  # Attaches an image as a new page in a PdfDocument object.
  #
  # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
  # @return [Origami::PDF] A PdfDocument handle.
  def self.attach_image_as_new_file(input_buffer)
    magick_image = MiniMagick::Image.read(input_buffer)
    # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
    # converted.
    magick_image.format('jpg')
    original_density = magick_image.resolution
    # No clue why, but the resolution needs to be reduced for the pdf, otherwise the resulting image shrinks.
    scale_factor = original_density[0].to_f / 4.166666
    magick_image.format('pdf', 0, { density: scale_factor.to_s })
    Origami::PDF.read(StringIO.new(magick_image.to_blob))
  end

  # Extracts multiple images from a given local input source.
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source
  # @param [Integer] page_id ID of the Page to extract from.
  # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
  # to extract.
  # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
  def self.extract_multiple_images_from_source(input_source, page_id, polygons)
    new_stream = load_doc(input_source, page_id)
    new_stream.seek(0)

    extract_images_from_polygons(input_source, new_stream, page_id, polygons)
  end

  # Retrieves a PDF document's page.
  #
  # @param [Origami::PDF] pdf_doc Origami PDF handle.
  # @param [Integer] page_id Page ID (1-based: it is converted to a 0-based index before being handed over).
  # @return The result of `Mindee::PDF::PdfProcessor.parse` restricted to that page
  #   (presumably a StringIO, since callers `seek` on it - TODO confirm).
  def self.get_page(pdf_doc, page_id)
    stream = StringIO.new
    pdf_doc.save(stream)

    options = {
      page_indexes: [page_id - 1],
    }

    Mindee::PDF::PdfProcessor.parse(stream, options)
  end

  # Extracts images from their positions on a file (as polygons).
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
  # @param [StringIO] pdf_stream Buffer of the PDF.
  # @param [Integer] page_id Page ID.
  # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
  # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
  def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
    extracted_elements = []

    polygons.each_with_index do |polygon, element_id|
      polygon = normalize_polygon(polygon)
      # crop_image hands back the very handle it was given, so a fresh image is loaded for every polygon.
      page_content = read_page_content(pdf_stream)

      corners = [polygon.top_left, polygon.bottom_right, polygon.top_right, polygon.bottom_left]
      min_max_x = Geometry.get_min_max_x(corners)
      min_max_y = Geometry.get_min_max_y(corners)
      file_extension = determine_file_extension(input_source)
      cropped_image = crop_image(page_content, min_max_x, min_max_y)
      if file_extension == 'pdf'
        cropped_image.format('jpg')
      else
        cropped_image.format(file_extension)
      end

      buffer = StringIO.new
      write_image_to_buffer(cropped_image, buffer)
      file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"

      extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
    end

    extracted_elements
  end

  # Retrieves the bounding box of a polygon.
  #
  # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
  # @return The bounding box when given a Polygon, otherwise the argument itself, untouched.
  def self.normalize_polygon(polygon)
    if polygon.is_a?(Mindee::Geometry::Polygon)
      Mindee::Geometry.get_bounding_box(polygon)
    else
      polygon
    end
  end

  # Loads a buffer into a MiniMagick Image.
  #
  # @param [StringIO] pdf_stream Buffer containing the PDF
  # @return [MiniMagick::Image] a valid MiniMagick image handle.
  def self.read_page_content(pdf_stream)
    pdf_stream.rewind
    MiniMagick::Image.read(pdf_stream)
  end

  # Crops a MiniMagick Image from the given bounding box.
  #
  # The min/max values are multiplied by the image's pixel width and height to build the crop geometry.
  #
  # @param [MiniMagick::Image] image Input Image.
  # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
  # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
  # @return [MiniMagick::Image] the same image handle that was passed in.
  def self.crop_image(image, min_max_x, min_max_y)
    width = image[:width].to_i
    height = image[:height].to_i

    image.format('jpg')
    new_width = (min_max_x.max - min_max_x.min) * width
    new_height = (min_max_y.max - min_max_y.min) * height
    image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")

    image
  end

  # Writes a MiniMagick::Image to a buffer.
  #
  # @param [MiniMagick::Image] image a valid MiniMagick image.
  # @param [StringIO] buffer
  def self.write_image_to_buffer(image, buffer)
    image.write(buffer)
  end

  # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
  #
  # Filenames without a usable extension also resolve to 'jpg' (the format crop_image converts to), instead of
  # yielding nil and a file name ending with a dangling dot.
  #
  # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
  # @return [String] A valid file extension, lowercase and without the leading dot.
  def self.determine_file_extension(input_source)
    return 'jpg' if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')

    extension = File.extname(input_source.filename).strip.downcase.delete_prefix('.')
    extension.empty? ? 'jpg' : extension
  end

  # Generates an ExtractedImage.
  #
  # @param [StringIO] buffer Buffer containing the image.
  # @param [String] file_name Name for the file.
  # @param [Object] page_id ID of the page the file was generated from.
  # @param [Object] element_id ID of the element of a given page.
  # @return [Mindee::Extraction::ExtractedImage]
  def self.create_extracted_image(buffer, file_name, page_id, element_id)
    buffer.rewind
    ExtractedImage.new(
      Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
      page_id,
      element_id
    )
  end

  # Loads a single_page from an image file or a pdf document.
  #
  # @param input_file [LocalInputSource] Local input.
  # @param [Integer] page_id Page ID.
  # @return The input's own io_stream for images, or the stream produced by get_page for PDFs.
  def self.load_doc(input_file, page_id)
    input_file.io_stream.rewind
    if input_file.pdf?
      get_page(Origami::PDF.read(input_file.io_stream), page_id)
    else
      input_file.io_stream
    end
  end
end
191
+ end
192
+ end
@@ -1,3 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'common/extracted_image'
3
4
  require_relative 'common/image_extractor'
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../common/image_extractor'
4
+
5
+ module Mindee
6
+ # Image Extraction Module.
7
+ module Extraction
8
# Multi-receipts extraction class wrapper.
class MultiReceiptsExtractor
  # Extracts individual receipts from multi-receipts documents.
  #
  # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
  # @param inference [Inference] Results of the inference.
  # @return [Array<ExtractedImage>] Individual extracted receipts, gathered page after page.
  # @raise [RuntimeError] when the document-level prediction holds no receipts.
  def self.extract_receipts(input_source, inference)
    unless inference.prediction.receipts
      raise 'No possible receipts candidates found for MultiReceipts extraction.'
    end

    (0...input_source.count_pdf_pages).flat_map do |page_index|
      boxes = inference.pages[page_index].prediction.receipts.map(&:bounding_box)
      # Pages are indexed from 0 here, while the image extractor is given the page number + 1.
      Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_index + 1, boxes)
    end
  end
end
31
+ end
32
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Pdf Extraction Module.
5
+ module Extraction
6
+ module PdfExtractor
7
# An extracted sub-Pdf.
class ExtractedPdf
  # Byte contents of the pdf
  # @return [StringIO]
  attr_reader :pdf_bytes

  # Name of the file.
  # @return [String]
  attr_reader :filename

  # @param pdf_bytes [StringIO]
  # @param filename [String]
  def initialize(pdf_bytes, filename)
    @pdf_bytes = pdf_bytes
    @filename = filename
  end

  # Retrieves the page count for a given pdf.
  # @return [Integer]
  # @raise [RuntimeError] when opening the pdf fails with a TypeError.
  def page_count
    current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
    current_pdf.pages.size
  rescue TypeError
    raise 'Could not retrieve page count from Extracted PDF object.'
  end

  # Writes the contents of the current PDF object to a file.
  #
  # A '.pdf' suffix is appended to the path when it does not already end with one.
  #
  # @param output_path [String] Path to write to.
  # @return [Integer] Number of bytes written.
  # @raise [RuntimeError] when the path is a directory, or when its parent directory doesn't exist.
  def write_to_file(output_path)
    raise 'Provided path is not a file' if File.directory?(output_path)
    raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

    unless File.extname(output_path).downcase == '.pdf'
      base_path = File.expand_path('..', output_path)
      output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path)
    end

    # Binary write of the actual bytes: handing the IO object itself to File.write would only store its #to_s.
    File.binwrite(output_path, raw_bytes)
  end

  # Returns the current PDF object as a usable BytesInputSource.
  # @return [Mindee::Input::Source::BytesInputSource]
  def as_input_source
    Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
  end

  private

  # Full contents of the pdf as a String, regardless of how far the underlying stream was previously read.
  # @return [String]
  def raw_bytes
    return @pdf_bytes.to_s unless @pdf_bytes.respond_to?(:read)

    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
    @pdf_bytes.read
  end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Pdf Extraction Module.
5
+ module Extraction
6
+ # Namespace for the Pdf extraction classes.
7
+ module PdfExtractor
8
+ # Pdf extraction class.
9
+ class PdfExtractor
10
+ # @param local_input [Mindee::Input::Source::LocalInputSource]
11
+ def initialize(local_input)
12
+ @filename = local_input.filename
13
+ if local_input.pdf?
14
+ @source_pdf = local_input.io_stream
15
+ else
16
+ pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
17
+ io_buffer = StringIO.new
18
+ pdf_image.save(io_buffer)
19
+
20
+ @source_pdf = io_buffer
21
+ end
22
+ end
23
+
24
+ # Retrieves the page count for the Pdf object.
25
+ # @return [Integer]
26
+ def page_count
27
+ Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
28
+ end
29
+
30
+ # Creates a new Pdf from pages and save it into a buffer.
31
+ # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
32
+ # @return [StreamIO] The buffer containing the new Pdf.
33
+ def cut_pages(page_indexes)
34
+ options = {
35
+ page_indexes: page_indexes,
36
+ }
37
+
38
+ Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
39
+ end
40
+
41
+ # Extract the sub-documents from the main pdf, based on the given list of page indexes.
42
+ # @param page_indexes [Array<Array<Integer>>] List of page number to use for merging in the original Pdf.
43
+ # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The buffer containing the new Pdf.
44
+ def extract_sub_documents(page_indexes)
45
+ extracted_pdfs = []
46
+ extension = File.extname(@filename)
47
+ basename = File.basename(@filename, extension)
48
+ page_indexes.each do |page_index_list|
49
+ if page_index_list.empty? || page_index_list.nil?
50
+ raise "Empty indexes aren't allowed for extraction #{page_index_list}"
51
+ end
52
+
53
+ page_index_list.each do |page_index|
54
+ raise "Index #{page_index} is out of range." if (page_index > page_count) || page_index.negative?
55
+ end
56
+ formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s
57
+ field_filename = "#{basename}_#{format('%03d',
58
+ (page_index_list[0] + 1))}-#{formatted_max_index}#{extension}"
59
+ extracted_pdf = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list),
60
+ field_filename)
61
+ extracted_pdfs << extracted_pdf
62
+ end
63
+ extracted_pdfs
64
+ end
65
+
66
+ # rubocop:disable Metrics/CyclomaticComplexity
67
+ # rubocop:disable Metrics/PerceivedComplexity
68
+ # Extracts invoices as complete PDFs from the document.
69
+ # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
70
+ # @param strict [Boolean]
71
+ # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
72
+ def extract_invoices(page_indexes, strict: false)
73
+ raise 'No indexes provided.' if page_indexes.empty?
74
+ unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
75
+ return extract_sub_documents(page_indexes)
76
+ end
77
+ return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict
78
+
79
+ correct_page_indexes = []
80
+ current_list = []
81
+ previous_confidence = nil
82
+ page_indexes.each_with_index do |page_index, i|
83
+ confidence = page_index.confidence
84
+ page_list = page_index.page_indexes
85
+
86
+ if confidence >= 0.5 && previous_confidence.nil?
87
+ current_list = page_list
88
+ elsif confidence >= 0.5 && i < page_indexes.length - 1
89
+ correct_page_indexes << current_list
90
+ current_list = page_list
91
+ elsif confidence < 0.5 && i == page_indexes.length - 1
92
+ current_list.concat page_list
93
+ correct_page_indexes << current_list
94
+ else
95
+ correct_page_indexes << current_list
96
+ correct_page_indexes << page_list
97
+ end
98
+ previous_confidence = confidence
99
+ end
100
+ extract_sub_documents(correct_page_indexes)
101
+ end
102
+ # rubocop:enable Metrics/CyclomaticComplexity
103
+ # rubocop:enable Metrics/PerceivedComplexity
104
+
105
+ private
106
+
107
+ attr_reader :source_pdf, :filename
108
+ end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'pdf_extractor/pdf_extractor'
4
+ require_relative 'pdf_extractor/extracted_pdf'