mindee 3.10.0 → 3.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bdc0f8cc57ce32684b3da523ce0a6666a251ff90d7a6d894a7b4601231bed3e4
4
- data.tar.gz: e249e12de6c86cc4a7683392504d5cd914afb941a99b5b00bff3f90fd0f4ae36
3
+ metadata.gz: cdf4542787e1ef24c74bf2104d3eacc5205afff3066ea24b0bea52b7d43d6993
4
+ data.tar.gz: 6d816b54bb34feb7d6c59db0f0a1f90cc34c5b6dd8155be315dfb698c88838d4
5
5
  SHA512:
6
- metadata.gz: 713cf8cb6259b54637865f6799db1f5da7d7f27c436960316f84e2d404fe743d343d2d307feb2023c0fcab580421e5e0004bc70d34735d5800f0530d135c70ea
7
- data.tar.gz: 535dc099fe76a27b3af26bb5cb726756e1b158056027782b607820e2648b92dbc32e65cf453d356c33355c5b94a3df9ffbbd86fabfc18fd5f439b48efa68816c
6
+ metadata.gz: 3ee7621e86789bf3f3ff294d505d71272018f342936d05f74f17a57e7b15916f8cde1218ffe1fa7ff7e7d2331c2982562725994bbf3565290645b870756d113e
7
+ data.tar.gz: 9b48e7088ffcc75136714e10e0fca34975b6b94f0f8a5241fd775fbcde0e8a05a836ca9f8703a7dc1d00c2820717ccc0cc989c8c1ea390716a3119114347e111
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Mindee Ruby API Library Changelog
2
2
 
3
+ ## v3.11.0 - 2024-06-10
4
+ ### Changes
5
+ * :sparkles: add custom tax extraction feature (#76)
6
+
7
+
3
8
  ## v3.10.0 - 2024-05-31
4
9
  ### Changes
5
10
  * :sparkles: add support for us mail v2 (#98)
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ module Extraction
5
+ # Generic extractor class
6
+ class OcrExtractor
7
# Looks for each candidate in a string and returns the position of a match.
# Case & diacritics insensitive.
#
# For every candidate the position of its first occurrence in the text is computed; when several
# candidates match, the greatest of those positions is returned.
# @param text [String] string to search for matches.
# @param str_candidates [Array<String>] array of values to look for
# @return [Integer, nil] match position, or nil when no candidate is found.
def self.match_index(text, str_candidates)
  return nil if str_candidates.empty?

  # The text does not change between candidates: normalize it once.
  normalized_text = remove_accents(text.downcase)
  str_candidates
    .map { |str_candidate| normalized_text.index(remove_accents(str_candidate.downcase)) }
    .compact
    .max
end
21
+
22
# Normalizes text by removing diacritics.
#
# Invalid byte sequences are replaced first (String#scrub), because Unicode normalization raises on
# strings that are not validly encoded.
# @param input_str [String] string to handle.
# @return [String] a new string, decomposed (NFD), stripped of combining marks, then recomposed (NFC).
def self.remove_accents(input_str)
  # Combining mark blocks: U+1DC0-U+1DFF, U+0300-U+036F and U+FE20-U+FE2F.
  diacritics = [*0x1DC0..0x1DFF, *0x0300..0x036F, *0xFE20..0xFE2F].pack('U*')
  input_str
    .scrub
    .unicode_normalize(:nfd)
    .tr(diacritics, '')
    .unicode_normalize(:nfc)
end
33
+
34
# Tells whether a value lies strictly between the two given rate bounds.
# @param value [Integer, Float, nil] The value to check
# @param min_rate_percentage [Integer] Lower bound (excluded).
# @param max_rate_percentage [Integer] Upper bound (excluded).
# @return [Boolean] false when value is nil or not strictly inside the bounds.
def self.valid_percentage?(value, min_rate_percentage, max_rate_percentage)
  !value.nil? && value > min_rate_percentage && value < max_rate_percentage
end
44
+
45
# Parses a percentage from a string, and returns it as a float.
# Returns nil if candidate isn't a valid percentage.
#
# The '%' signs are dropped and a decimal comma is turned into a dot before conversion.
# The string given by the caller is left untouched.
# @param percentage_str [String] String candidate.
# @return [Float, nil]
def self.parse_percentage(percentage_str)
  cleaned_str = percentage_str.scrub.gsub('%', '').strip.gsub(',', '.')
  Float(cleaned_str)
rescue ArgumentError
  nil
end
57
+
58
# Converts a textual amount into a float.
# Spaces are removed and delimiters are standardized before the conversion.
# @param amount_str [String] String candidate.
# @return [Float, nil] nil when the cleaned string is not a valid number.
def self.parse_amount(amount_str)
  without_spaces = amount_str.gsub(' ', '')
  Float(standardize_delimiters(without_spaces))
rescue ArgumentError
  nil
end
69
+
70
+ private
71
+
72
# Rewrites an amount so that '.' is the decimal separator and no thousands separator remains.
# @param str [String] amount without spaces.
# @return [String] the string itself when neither a comma nor a dot decimal is detected.
def self.standardize_delimiters(str)
  # Comma is the decimal mark: dots are thousands separators.
  return str.gsub('.', '').gsub(',', '.') if comma_decimal?(str)
  # Dot is the decimal mark: commas are thousands separators.
  return str.gsub(',', '') if dot_decimal?(str)

  str
end
81
+
82
# Tells whether a comma sits where a decimal separator would be: third character from the end
# (strings longer than 3 characters only) or second character from the end.
# @param str [String]
# @return [Boolean]
def self.comma_decimal?(str)
  return true if str.length > 3 && str[-3] == ','

  str[-2] == ','
end
85
+
86
# Tells whether a dot sits where a decimal separator would be: third character from the end
# (strings longer than 3 characters only) or second character from the end.
# @param str [String]
# @return [Boolean]
def self.dot_decimal?(str)
  return true if str.length > 3 && str[-3] == '.'

  str[-2] == '.'
end
89
+
90
# Removes most common currency symbols from string.
# The string given by the caller is not modified (frozen strings are accepted).
# @param input_string [String] string to remove the symbols from
# @return [String] a new string without the listed symbols
def self.remove_currency_symbols(input_string)
  # Common currency symbols
  currency_symbols = ['$', '€', '£', '¥', '₹', '₽', '฿', '₺', '₴', '₿', '₡', '₮', '₱', '₲', '₪', '₫', '₩', '₵',
                      '₦', '₢', '₤', '₣', '₧', '₯', '₠', '₶', '₸', '₷', '₼', '₾', '₺', '﹩', '₨', '₹', '$', '﹫']

  # Each pass returns a fresh copy, so the argument itself is never mutated.
  currency_symbols.reduce(input_string) { |cleaned, symbol| cleaned.gsub(symbol, '') }
end
105
+
106
+ private_class_method :remove_accents, :match_index, :parse_amount, :parse_percentage, :remove_currency_symbols,
107
+ :valid_percentage?, :comma_decimal?, :dot_decimal?, :standardize_delimiters
108
+ end
109
+ end
110
+ end
@@ -0,0 +1,322 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ocr_extractor'
4
+
5
+ module Mindee
6
+ module Extraction
7
+ # Tax extractor class
8
+ class TaxExtractor < OcrExtractor
9
# Selects the candidate with the highest score among the valid ones.
# A lone candidate is returned as is, without validation. When no valid candidate scores above
# zero, the first candidate is returned.
# @param candidates [Array<Hash>] candidates for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Hash, nil] nil when there is no candidate at all.
def self.pick_best(candidates, tax_names)
  case candidates.size
  when 0 then return nil
  when 1 then return candidates[0]
  end

  best_index = 0
  best_score = 0
  candidates.each_with_index do |candidate, position|
    next unless valid_candidate?(candidate, tax_names)

    score = calculate_score(candidate, position)
    # Strict comparison: on equal scores the earliest candidate is kept.
    next unless score > best_score

    best_score = score
    best_index = position
  end

  candidates[best_index]
end
33
+
34
# Tells whether the code of a candidate is one of the expected tax names.
# The comparison ignores case and diacritics.
# @param candidate [Hash, nil] A candidate for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Boolean] false when there are no names, no candidate, or no code on the candidate.
def self.valid_candidate?(candidate, tax_names)
  return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?

  tax_names.any? do |tax_name|
    remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
  end
end
47
+
48
# [Experimental] computes the score of a valid candidate for a tax.
# Starts from the candidate position (index + 1), then adds 1 for a rate (minus 2 when the rate is
# above 100, minus 1 when it is above 30), 4 for a value and 1 for a base.
# @param candidate [Hash] A candidate for the tax.
# @param index [Integer] Position of the candidate in the list.
# @return [Integer]
def self.calculate_score(candidate, index)
  score = index + 1
  rate = candidate['rate']
  unless rate.nil?
    penalty = 0
    penalty += 2 if rate > 100
    penalty += 1 if rate > 30
    score += 1 - penalty
  end
  score += 4 unless candidate['value'].nil?
  score += 1 unless candidate['base'].nil?
  score
end
62
+
63
# Curates tax values based on simple rules to avoid improbable data.
#
# Builds a fresh hash with the 'code', 'page_id', 'rate', 'base' and 'value' keys. The page id found
# during extraction is carried over, so that it is not lost by the curation step.
# Note: the 'rate' (and possibly 'base'/'value') entries of found_hash are updated in the process.
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] hash with all five keys; every value is nil when found_hash is nil.
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
  reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
  return reconstructed_hash if found_hash.nil?

  # Drop trailing whitespace and dots from the code (e.g. "TVA ... " becomes "TVA").
  reconstructed_hash['code'] = found_hash['code']&.sub(%r{\s*\.*\s*$}, '')
  reconstructed_hash['page_id'] = found_hash['page_id']

  # A rate strictly between 0 and 1 is read as a ratio and converted to a percentage.
  rate = found_hash['rate']
  found_hash['rate'] = rate * 100 if rate && rate < 1 && rate.positive?

  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  found_hash = decimate_rates_if_needed(found_hash)
  found_hash = fix_rate(found_hash)
  reconstructed_hash['rate'] = found_hash['rate']
  set_base_and_value(reconstructed_hash, found_hash)
end
85
+
86
# Exchanges an out-of-bounds rate with the base, or else with the value, when that other amount
# is itself a valid percentage. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] the same hash
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  rate = found_hash['rate']
  return found_hash unless rate && (rate > max_rate_percentage || rate < min_rate_percentage)

  if valid_percentage?(found_hash['base'], min_rate_percentage, max_rate_percentage)
    found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
  elsif valid_percentage?(found_hash['value'], min_rate_percentage, max_rate_percentage)
    found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
  end
  found_hash
end
101
+
102
# Replaces a rate by its absolute value; a nil rate is left alone. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash
def self.fix_rate(found_hash)
  rate = found_hash['rate']
  found_hash['rate'] = rate.abs unless rate.nil?
  found_hash
end
108
+
109
# When the rate is above 100, exchanges it with the base if the base is smaller than the rate,
# or else with the value if the value is smaller than the rate. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash
def self.decimate_rates_if_needed(found_hash)
  rate = found_hash['rate']
  return found_hash unless rate && rate > 100

  base = found_hash['base']
  value = found_hash['value']
  if !base.nil? && rate > base
    found_hash['rate'] = base
    found_hash['base'] = rate
  elsif !value.nil? && rate > value
    found_hash['rate'] = value
    found_hash['value'] = rate
  end
  found_hash
end
122
+
123
# Copies the base and the value into the reconstructed hash.
# When both are present and the base is smaller than the value, the two are exchanged so that the
# base ends up being the larger amount. A missing (nil) base or value is copied as is, without
# any comparison.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the reconstructed hash
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']
  # Only compare when both amounts exist: comparing a number with nil raises.
  base, value = value, base if !base.nil? && !value.nil? && base < value

  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
139
+
140
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
#
# The horizontal extraction is tried first; the vertical one is used when it yields no value.
# The tax_names array given by the caller is not modified.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil] nil when there are no tax names or when
#   ocr_result does not expose the OCR pages.
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  # Both extraction passes read the pages through #mvision_v1: bail out early without it.
  return nil if tax_names.empty? || !ocr_result.respond_to?(:mvision_v1)

  # Work on a sorted copy rather than sorting the caller's array in place.
  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
163
+
164
# Builds a tax field out of the extracted values.
# The 'page_id' entry of the hash, or nil when the key is absent, is given as the page id.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Mindee::Parsing::Standard::TaxField]
def self.create_tax_field(found_hash)
  page_id = found_hash.fetch('page_id', nil)
  Mindee::Parsing::Standard::TaxField.new(found_hash, page_id)
end
173
+
174
# Reads the tax code and the rate from the first two capture groups.
# Group 1 holds the rate and group 2 the code when percent_first is set; the other way around otherwise.
# Missing groups leave the hash untouched.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash] the same hash
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
  rate_group, code_group = percent_first ? [1, 2] : [2, 1]
  code_match = matches[code_group]
  rate_match = matches[rate_group]

  found_hash['code'] = code_match.strip unless code_match.nil?
  found_hash['rate'] = parse_amount(rate_match.gsub('%', '')) unless rate_match.nil?
  found_hash
end
189
+
190
+ # rubocop:disable Metrics/CyclomaticComplexity
191
+ # rubocop:disable Metrics/PerceivedComplexity
192
+
193
# Reads the base and the value of a tax from capture groups 3 and 4.
# A single amount (whichever group holds it) is stored as the value; with two amounts, group 3 is
# the base and group 4 the value. Without any amount the hash is left untouched.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash
def self.extract_basis_and_value(matches, found_hash)
  first_amount = matches[3]
  second_amount = matches[4]

  if first_amount.nil?
    found_hash['value'] = parse_amount(second_amount) unless second_amount.nil?
  elsif second_amount.nil?
    found_hash['value'] = parse_amount(first_amount)
  else
    found_hash['base'] = parse_amount(first_amount)
    found_hash['value'] = parse_amount(second_amount)
  end
  found_hash
end
208
+
209
+ # rubocop:enable Metrics/CyclomaticComplexity
210
+ # rubocop:enable Metrics/PerceivedComplexity
211
+
212
# Builds a tax candidate from a single line of text.
# The candidate always carries the given page id; the code, rate, base and value are added when
# the pattern matches the line.
# @param line [String] Line to be processed.
# @param pattern [Regexp] RegEx pattern to search the line with.
# @param page_id [Integer] ID of the page the line was read from.
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
  found_hash = { 'page_id' => page_id }
  matches = line.match(pattern)
  return found_hash if matches.nil?

  with_rate = extract_percentage_from_tax(matches, found_hash, percent_first)
  extract_basis_and_value(matches, with_rate)
end
230
+
231
+ # rubocop:disable Metrics/CyclomaticComplexity
232
+ # rubocop:disable Metrics/PerceivedComplexity
233
+
234
# Scans every line of every page for a tax name and collects one candidate per reading of the line
# (tax name first, then percentage first).
# The list always starts with an empty placeholder candidate.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
# @param tax_names [Array<String>] Possible tax names candidates.
# @return [Array<Hash>]
def self.extract_horizontal_tax(ocr_result, tax_names)
  candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
  # Groups: 1 = rate, 2 = tax name, 3 and 4 = amounts.
  linear_pattern_percent_first = %r{
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  # Groups: 1 = tax name, 2 = rate, 3 and 4 = amounts.
  linear_pattern_percent_second = %r{
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
    page.all_lines.each do |line|
      # Remove noise characters and currency symbols, then collapse dot leaders and repeated spaces.
      without_noise = line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')
      without_symbols = remove_currency_symbols(without_noise)
      clean_line = without_symbols.gsub(%r{\.{2,}}, ' ').gsub(%r{ +}, ' ').strip

      name_index = match_index(clean_line, tax_names)
      next if name_index.nil?

      if clean_line.match?(linear_pattern_percent_second)
        # Read the line starting at the tax name.
        candidates.append(extract_tax_from_horizontal_line(clean_line[name_index..],
                                                           linear_pattern_percent_second, page_id, false))
      end
      next unless clean_line.match?(linear_pattern_percent_first)

      # With a '%' sign, read the line starting at the percentage; otherwise read it whole.
      percent_line = if clean_line.include?('%')
                       clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..]
                     else
                       clean_line
                     end
      candidates.append(extract_tax_from_horizontal_line(percent_line,
                                                         linear_pattern_percent_first, page_id, true))
    end
  end
  candidates
end
274
+ # rubocop:enable Metrics/CyclomaticComplexity
275
+ # rubocop:enable Metrics/PerceivedComplexity
276
+
277
# Fills the hash with the amounts read from a vertically reconstructed line.
# A single amount becomes the value when the hash has no 'value' key yet; in every other case the
# first two amounts fill in the rate and the value, if these are still nil.
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Words of the reconstructed line.
# @param found_hash [Hash] Hash containing previously found values, if any.
# @return [Hash] the same hash
def self.extract_vertical_tax_values(line, found_hash)
  amounts = []
  line.each do |reconstructed_word|
    amount = parse_amount(reconstructed_word.text)
    amounts.push(amount) unless amount.nil?
  end

  if amounts.length == 1 && !found_hash.key?('value')
    found_hash['value'] = amounts[0]
    return found_hash
  end

  found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
  found_hash['value'] = amounts[1] if found_hash['value'].nil?
  found_hash
end
294
+
295
# Looks for the tax names among the words of each page and reads the amounts located underneath
# every matching word.
# The page id and the code are taken from the first matching word only.
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
# @param tax_names [Array<String>] Array of possible names a tax can have
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @return [Hash]
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
  found_hash = { 'code' => nil, 'page_id' => nil } if found_hash.nil?

  ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
    page.all_words.each do |word|
      next if match_index(word.text, tax_names).nil?

      column = ocr_result.reconstruct_vertically(word.polygon, page_id)
      found_hash['page_id'] = page_id if found_hash['page_id'].nil?
      found_hash['code'] = word.text.strip if found_hash['code'].nil?
      found_hash = extract_vertical_tax_values(column, found_hash)
    end
  end
  found_hash
end
314
+
315
+ private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
316
+ :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
317
+ :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
318
+ :decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
319
+ :swap_rates_if_needed
320
+ end
321
+ end
322
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'extraction/tax_extractor'
@@ -77,5 +77,24 @@ module Mindee
77
77
  coords = points.map(&:x)
78
78
  MinMax.new(coords.min, coords.max)
79
79
  end
80
+
81
# Tells whether a polygon is located underneath another one, with a lateral tolerance.
# The candidate must not start higher than the anchor, and its horizontal extent must stay within
# the anchor's, widened on each side by the corresponding margin (a fraction of the anchor's own
# x coordinate).
# @param candidate [Array<Mindee::Geometry::Point>] Polygon to check
# @param anchor [Array<Mindee::Geometry::Point>] Reference polygon
# @param margin_left [Float] Margin tolerance on the left of the anchor
# @param margin_right [Float] Margin tolerance on the right of the anchor
# @return [Boolean]
def self.below?(candidate, anchor, margin_left, margin_right)
  return false if Geometry.get_min_max_y(candidate).min < Geometry.get_min_max_y(anchor).min

  candidate_x = Geometry.get_min_max_x(candidate)
  anchor_x = Geometry.get_min_max_x(anchor)
  left_limit = anchor_x.min - (anchor_x.min * margin_left)
  right_limit = anchor_x.max + (anchor_x.max * margin_right)

  return false if candidate_x.min < left_limit
  return false if candidate_x.max > right_limit

  true
end
80
99
  end
81
100
  end
@@ -100,7 +100,7 @@ module Mindee
100
100
  attr_reader :job
101
101
  # @return [Mindee::Parsing::Common::ApiRequest]
102
102
  attr_reader :api_request
103
- # @return [Hash]
103
+ # @return [String]
104
104
  attr_reader :raw_http
105
105
 
106
106
  # @param product_class [Class<Mindee::Product>]
@@ -27,6 +27,22 @@ module Mindee
27
27
  end
28
28
  out_str.strip
29
29
  end
30
+
31
# Gathers, into a single line, the words of a page that are located underneath the given coordinates.
# Half of x_margin is used as the left tolerance and twice x_margin as the right tolerance.
# @param coordinates [Array<Mindee::Geometry::Point>] Polygon or bounding box where the reconstruction should
#   start.
# @param page_id [Integer] ID of the page to start at
# @param x_margin [Float] Margin of misalignment for the x coordinate.
# @return [Mindee::Parsing::Common::Ocr::OcrLine]
def reconstruct_vertically(coordinates, page_id, x_margin)
  column = OcrLine.new([])
  @pages[page_id].all_lines.each do |line|
    line.each do |word|
      next unless Geometry.below?(word.polygon, coordinates, x_margin / 2, x_margin * 2)

      column.push(word)
    end
  end
  column
end
30
46
  end
31
47
  end
32
48
  end
@@ -163,6 +163,16 @@ module Mindee
163
163
  def to_s
164
164
  @mvision_v1.to_s
165
165
  end
166
+
167
+ # Constructs a line from a column, located underneath given coordinates, by delegating to the mvision_v1 object
168
+ # @param coordinates [Array<Mindee::Geometry::Point>] Polygon or bounding box where the reconstruction should
169
+ # start
170
+ # @param page_id [Integer] ID of the page to start at
171
+ # @param x_margin [Float] Margin of misalignment for the x coordinate (defaults to 0.05)
172
+ # @return [Mindee::Parsing::Common::Ocr::OcrLine]
173
+ def reconstruct_vertically(coordinates, page_id, x_margin = 0.05)
174
+ @mvision_v1.reconstruct_vertically(coordinates, page_id, x_margin)
175
+ end
166
176
  end
167
177
  end
168
178
  end
@@ -3,7 +3,7 @@
3
3
  # Mindee
4
4
  module Mindee
5
5
  # Current version.
6
- VERSION = '3.10.0'
6
+ VERSION = '3.11.0'
7
7
 
8
8
  # Finds and return the current platform.
9
9
  # @return [String]
data/lib/mindee.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'mindee/client'
4
+ require 'mindee/extraction'
4
5
 
5
6
  module Mindee
6
7
  # Mindee internal http module.
@@ -18,6 +19,10 @@ module Mindee
18
19
  end
19
20
  end
20
21
 
22
+ # Custom extraction module
23
+ module Extraction
24
+ end
25
+
21
26
  # Parsing internals and fields.
22
27
  module Parsing
23
28
  # Common fields and functions.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mindee
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.10.0
4
+ version: 3.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mindee, SA
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-05-31 00:00:00.000000000 Z
11
+ date: 2024-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: marcel
@@ -173,6 +173,9 @@ files:
173
173
  - docs/us_w9_v1.md
174
174
  - lib/mindee.rb
175
175
  - lib/mindee/client.rb
176
+ - lib/mindee/extraction.rb
177
+ - lib/mindee/extraction/ocr_extractor.rb
178
+ - lib/mindee/extraction/tax_extractor.rb
176
179
  - lib/mindee/geometry.rb
177
180
  - lib/mindee/geometry/min_max.rb
178
181
  - lib/mindee/geometry/point.rb