mindee 3.10.0 → 3.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/mindee/extraction/ocr_extractor.rb +110 -0
- data/lib/mindee/extraction/tax_extractor.rb +322 -0
- data/lib/mindee/extraction.rb +3 -0
- data/lib/mindee/geometry/utils.rb +19 -0
- data/lib/mindee/parsing/common/api_response.rb +1 -1
- data/lib/mindee/parsing/common/ocr/mvision_v1.rb +16 -0
- data/lib/mindee/parsing/common/ocr/ocr.rb +10 -0
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +5 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cdf4542787e1ef24c74bf2104d3eacc5205afff3066ea24b0bea52b7d43d6993
|
4
|
+
data.tar.gz: 6d816b54bb34feb7d6c59db0f0a1f90cc34c5b6dd8155be315dfb698c88838d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ee7621e86789bf3f3ff294d505d71272018f342936d05f74f17a57e7b15916f8cde1218ffe1fa7ff7e7d2331c2982562725994bbf3565290645b870756d113e
|
7
|
+
data.tar.gz: 9b48e7088ffcc75136714e10e0fca34975b6b94f0f8a5241fd775fbcde0e8a05a836ca9f8703a7dc1d00c2820717ccc0cc989c8c1ea390716a3119114347e111
|
data/CHANGELOG.md
CHANGED
@@ -0,0 +1,110 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
module Extraction
|
5
|
+
# Generic extractor class
|
6
|
+
class OcrExtractor
|
7
|
+
# Searches a string for any of the given candidates and returns the position of a match.
# Case & diacritics insensitive.
# When several candidates are found, the highest of their positions is returned.
# @param text [String] string to search for matches.
# @param str_candidates [Array<String>] array of values to look for
# @return [Integer, nil] position of the match, or nil when no candidate is found.
def self.match_index(text, str_candidates)
  # The searched text is the same for every candidate: normalize it only once.
  normalized_text = remove_accents(text.downcase)
  str_candidates
    .map { |str_candidate| normalized_text.index(remove_accents(str_candidate.downcase)) }
    .compact
    .max
end

# Normalizes text by removing diacritics.
# @param input_str [String] string to handle.
# @return [String]
def self.remove_accents(input_str)
  # Code points of the combining marks to strip once the string is decomposed.
  diacritics = [*0x1DC0..0x1DFF, *0x0300..0x036F, *0xFE20..0xFE2F].pack('U*')
  input_str
    .unicode_normalize(:nfd) # decompose so that accents become standalone combining marks
    .tr(diacritics, '')      # an empty replacement deletes the listed characters
    .unicode_normalize(:nfc)
    .scrub
end
|
33
|
+
|
34
|
+
# Checks if a given percentage value is within the allowed range
|
35
|
+
# @param value [Integer] The value to check
|
36
|
+
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
37
|
+
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
38
|
+
# @return [Boolean]
|
39
|
+
def self.valid_percentage?(value, min_rate_percentage, max_rate_percentage)
|
40
|
+
return false if value.nil?
|
41
|
+
|
42
|
+
value > min_rate_percentage && value < max_rate_percentage
|
43
|
+
end
|
44
|
+
|
45
|
+
# Parses a percentage from a string, and returns it as a float.
# Returns nil if candidate isn't a valid percentage.
# The candidate itself is never modified, so frozen strings are accepted.
# @param percentage_str [String] String candidate.
# @return [Float, nil]
def self.parse_percentage(percentage_str)
  # Build a cleaned copy: drop the percent sign, trim, and use a dot as decimal separator.
  cleaned_str = percentage_str.scrub.delete('%').strip.tr(',', '.')
  Float(cleaned_str)
rescue ArgumentError
  nil
end
|
57
|
+
|
58
|
+
# Parses an amount from a string, and returns it as a float.
|
59
|
+
# Returns nil if candidate isn't a valid amount.
|
60
|
+
# @param amount_str [String] String candidate.
|
61
|
+
# @return [Float, nil]
|
62
|
+
def self.parse_amount(amount_str)
|
63
|
+
cleaned_str = amount_str.gsub(' ', '')
|
64
|
+
cleaned_str = standardize_delimiters(cleaned_str)
|
65
|
+
Float(cleaned_str)
|
66
|
+
rescue ArgumentError
|
67
|
+
nil
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def self.standardize_delimiters(str)
|
73
|
+
if comma_decimal?(str)
|
74
|
+
str.gsub('.', '').gsub(',', '.')
|
75
|
+
elsif dot_decimal?(str)
|
76
|
+
str.gsub(',', '')
|
77
|
+
else
|
78
|
+
str
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.comma_decimal?(str)
|
83
|
+
(str.length > 3 && str[-3] == ',') || str[-2] == ','
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.dot_decimal?(str)
|
87
|
+
(str.length > 3 && str[-3] == '.') || str[-2] == '.'
|
88
|
+
end
|
89
|
+
|
90
|
+
# Removes most common currency symbols from string
# A new string is returned; the argument is left unmodified, so frozen strings are accepted.
# @param input_string [String] string to remove the symbols from
# @return [String]
def self.remove_currency_symbols(input_string)
  # Common currency symbols, including the small ('﹩') and fullwidth ('＄') dollar variants.
  currency_symbols = ['$', '€', '£', '¥', '₹', '₽', '฿', '₺', '₴', '₿', '₡', '₮', '₱', '₲', '₪', '₫', '₩', '₵',
                      '₦', '₢', '₤', '₣', '₧', '₯', '₠', '₶', '₸', '₷', '₼', '₾', '﹩', '₨', '＄', '﹫']

  # Regexp.union escapes each symbol, so all of them are removed in a single pass.
  input_string.gsub(Regexp.union(currency_symbols), '')
end
|
105
|
+
|
106
|
+
private_class_method :remove_accents, :match_index, :parse_amount, :parse_percentage, :remove_currency_symbols,
|
107
|
+
:valid_percentage?, :comma_decimal?, :dot_decimal?, :standardize_delimiters
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,322 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'ocr_extractor'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Extraction
|
7
|
+
# Tax extractor class
|
8
|
+
class TaxExtractor < OcrExtractor
|
9
|
+
# Extracts the most relevant candidate.
|
10
|
+
# @param candidates [Array<Hash>] list of candidates for the tax.
|
11
|
+
# @param tax_names [Array<String>] list of all possible names the tax can have.
|
12
|
+
# @return [Hash, nil]
|
13
|
+
def self.pick_best(candidates, tax_names)
|
14
|
+
return candidates[0] if candidates.size == 1
|
15
|
+
return nil if candidates.empty?
|
16
|
+
|
17
|
+
picked = 0
|
18
|
+
picked_score = 0
|
19
|
+
|
20
|
+
candidates.each_with_index do |candidate, i|
|
21
|
+
next unless valid_candidate?(candidate, tax_names)
|
22
|
+
|
23
|
+
sum_fields_score = calculate_score(candidate, i)
|
24
|
+
|
25
|
+
if picked_score < sum_fields_score
|
26
|
+
picked_score = sum_fields_score
|
27
|
+
picked = i
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
candidates[picked]
|
32
|
+
end
|
33
|
+
|
34
|
+
# Checks whether a tax code has been properly read. Shouldn't trigger except in case of very specific regex breaks
|
35
|
+
# due to unsupported diacritics.
|
36
|
+
# @param candidate [Hash] A candidate for the tax.
|
37
|
+
# @param tax_names [Array<String>] list of all possible names the tax can have.
|
38
|
+
# @return [Boolean]
|
39
|
+
def self.valid_candidate?(candidate, tax_names)
|
40
|
+
return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?
|
41
|
+
|
42
|
+
tax_names.each do |tax_name|
|
43
|
+
return true if remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
|
44
|
+
end
|
45
|
+
false
|
46
|
+
end
|
47
|
+
|
48
|
+
# [Experimental] computes the score of a valid candidate for a tax.
|
49
|
+
# @param candidate [Hash] A candidate for the tax.
|
50
|
+
# @param index [Integer]
|
51
|
+
def self.calculate_score(candidate, index)
|
52
|
+
score = index + 1
|
53
|
+
unless candidate['rate'].nil?
|
54
|
+
score += 1
|
55
|
+
score -= 2 if candidate['rate'] > 100
|
56
|
+
score -= 1 if candidate['rate'] > 30
|
57
|
+
end
|
58
|
+
score += 4 unless candidate['value'].nil?
|
59
|
+
score += 1 unless candidate['base'].nil?
|
60
|
+
score
|
61
|
+
end
|
62
|
+
|
63
|
+
# Curates tax values based on simple rules to avoid improbable data
|
64
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
65
|
+
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
66
|
+
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
67
|
+
# @return [Hash]
|
68
|
+
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
|
69
|
+
reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
|
70
|
+
return reconstructed_hash if found_hash.nil?
|
71
|
+
|
72
|
+
reconstructed_hash['code'] =
|
73
|
+
found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
|
74
|
+
|
75
|
+
if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
|
76
|
+
found_hash['rate'] =
|
77
|
+
found_hash['rate'] * 100
|
78
|
+
end
|
79
|
+
found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
|
80
|
+
found_hash = decimate_rates_if_needed(found_hash)
|
81
|
+
found_hash = fix_rate(found_hash)
|
82
|
+
reconstructed_hash['rate'] = found_hash['rate']
|
83
|
+
set_base_and_value(reconstructed_hash, found_hash)
|
84
|
+
end
|
85
|
+
|
86
|
+
# Swaps the rate with base or value if rate is out of bounds
|
87
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
88
|
+
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
89
|
+
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
90
|
+
# @return [Hash]
|
91
|
+
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
|
92
|
+
if found_hash['rate'] && (found_hash['rate'] > max_rate_percentage || found_hash['rate'] < min_rate_percentage)
|
93
|
+
if valid_percentage?(found_hash['base'], min_rate_percentage, max_rate_percentage)
|
94
|
+
found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
|
95
|
+
elsif valid_percentage?(found_hash['value'], min_rate_percentage, max_rate_percentage)
|
96
|
+
found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
|
97
|
+
end
|
98
|
+
end
|
99
|
+
found_hash
|
100
|
+
end
|
101
|
+
|
102
|
+
# Rates can't be negative if set.
|
103
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
104
|
+
def self.fix_rate(found_hash)
|
105
|
+
found_hash['rate'] = found_hash['rate'].abs unless found_hash['rate'].nil?
|
106
|
+
found_hash
|
107
|
+
end
|
108
|
+
|
109
|
+
# Swaps a rate above 100 with the base or the value when that field holds a smaller number.
|
110
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
111
|
+
# @return [Hash]
|
112
|
+
def self.decimate_rates_if_needed(found_hash)
|
113
|
+
if found_hash['rate'] && found_hash['rate'] > 100
|
114
|
+
if !found_hash['base'].nil? && found_hash['rate'] > found_hash['base']
|
115
|
+
found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
|
116
|
+
elsif !found_hash['value'].nil? && found_hash['rate'] > found_hash['value']
|
117
|
+
found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
|
118
|
+
end
|
119
|
+
end
|
120
|
+
found_hash
|
121
|
+
end
|
122
|
+
|
123
|
+
# Sets the base and value in the reconstructed hash.
# When both amounts were found and the base is smaller than the value, they are swapped so
# that the larger amount ends up as the base. A missing amount is copied over as nil.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']
  # Only compare when both amounts exist: comparing a number against nil raises.
  base, value = value, base if !base.nil? && !value.nil? && base < value
  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
|
139
|
+
|
140
|
+
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have. Left unmodified.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil]
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  # Nothing can be extracted without OCR data or without names to look for.
  # NOTE(review): the previous guard returned nil when ocr_result *was* of the checked type;
  # a stricter type check can be restored here if only Ocr::Ocr instances must be accepted.
  return nil if ocr_result.nil? || tax_names.nil? || tax_names.empty?

  # Sort a copy so that the caller's array is never reordered (and may be frozen).
  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
|
163
|
+
|
164
|
+
# Creates a tax field from a given hash.
|
165
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
166
|
+
# @return [Mindee::Parsing::Standard::TaxField]
|
167
|
+
def self.create_tax_field(found_hash)
|
168
|
+
Mindee::Parsing::Standard::TaxField.new(
|
169
|
+
found_hash,
|
170
|
+
found_hash.key?('page_id') ? found_hash['page_id'] : nil
|
171
|
+
)
|
172
|
+
end
|
173
|
+
|
174
|
+
# Extracts the rate and code, if found, from matches into the found_hash.
|
175
|
+
# @param matches [MatchData] RegEx matches of the values for taxes
|
176
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
177
|
+
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
|
178
|
+
# @return [Hash]
|
179
|
+
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
|
180
|
+
if percent_first
|
181
|
+
found_hash['code'] = matches[2].strip unless matches[2].nil?
|
182
|
+
found_hash['rate'] = parse_amount(matches[1].gsub('%', '')) unless matches[1].nil?
|
183
|
+
else
|
184
|
+
found_hash['code'] = matches[1].strip unless matches[1].nil?
|
185
|
+
found_hash['rate'] = parse_amount(matches[2].gsub('%', '')) unless matches[2].nil?
|
186
|
+
end
|
187
|
+
found_hash
|
188
|
+
end
|
189
|
+
|
190
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
191
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
192
|
+
|
193
|
+
# Extracts the basis and value of a tax from regex matches, independent of the order.
|
194
|
+
# @param matches [MatchData] RegEx matches of the values for taxes
|
195
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
196
|
+
# @return [Hash]
|
197
|
+
def self.extract_basis_and_value(matches, found_hash)
|
198
|
+
if matches[4].nil? && !matches[3].nil?
|
199
|
+
found_hash['value'] = parse_amount(matches[3]) unless matches[3].nil?
|
200
|
+
elsif matches[3].nil? && !matches[4].nil?
|
201
|
+
found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
|
202
|
+
elsif !matches[3].nil? && !matches[4].nil?
|
203
|
+
found_hash['base'] = parse_amount(matches[3]) unless matches[3].nil?
|
204
|
+
found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
|
205
|
+
end
|
206
|
+
found_hash
|
207
|
+
end
|
208
|
+
|
209
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
210
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
211
|
+
|
212
|
+
# Extracts tax information from a horizontal line.
|
213
|
+
# @param line [String] Line to be processed.
|
214
|
+
# @param pattern [Regexp] RegEx pattern to search the line with.
|
215
|
+
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
|
216
|
+
# @return [Hash]
|
217
|
+
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
|
218
|
+
found_hash = {}
|
219
|
+
|
220
|
+
matches = line.match(pattern)
|
221
|
+
|
222
|
+
# Edge case for when the tax is split-up between two pages, we'll consider that
|
223
|
+
# the answer belongs to the first one.
|
224
|
+
found_hash['page_id'] = page_id unless found_hash.key?('page_id')
|
225
|
+
return found_hash if matches.nil?
|
226
|
+
|
227
|
+
found_hash = extract_percentage_from_tax(matches, found_hash, percent_first)
|
228
|
+
extract_basis_and_value(matches, found_hash)
|
229
|
+
end
|
230
|
+
|
231
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
232
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
233
|
+
|
234
|
+
# Processes a horizontal line for tax extraction. Returns a hash of collected values.
|
235
|
+
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
|
236
|
+
# @param tax_names [Array<String>] Possible tax names candidates.
|
237
|
+
# @return [Array<Hash>]
|
238
|
+
def self.extract_horizontal_tax(ocr_result, tax_names)
|
239
|
+
candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
|
240
|
+
linear_pattern_percent_first = %r{
|
241
|
+
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
242
|
+
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
|
243
|
+
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
|
244
|
+
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
|
245
|
+
}x
|
246
|
+
linear_pattern_percent_second = %r{
|
247
|
+
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
|
248
|
+
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
249
|
+
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
|
250
|
+
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
|
251
|
+
}x
|
252
|
+
ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
|
253
|
+
page.all_lines.each do |line|
|
254
|
+
clean_line = remove_currency_symbols(line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')).gsub(%r{\.{2,}}, ' ')
|
255
|
+
.gsub(%r{ +}, ' ').strip
|
256
|
+
|
257
|
+
next if match_index(clean_line, tax_names).nil?
|
258
|
+
|
259
|
+
unless clean_line.match(linear_pattern_percent_second).nil?
|
260
|
+
candidates.append(extract_tax_from_horizontal_line(clean_line[match_index(clean_line, tax_names)..],
|
261
|
+
linear_pattern_percent_second, page_id, false))
|
262
|
+
end
|
263
|
+
if clean_line.include?('%') && !clean_line.match(linear_pattern_percent_first).nil?
|
264
|
+
candidates.append(extract_tax_from_horizontal_line(clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..],
|
265
|
+
linear_pattern_percent_first, page_id, true))
|
266
|
+
elsif !clean_line.match(linear_pattern_percent_first).nil?
|
267
|
+
candidates.append(extract_tax_from_horizontal_line(clean_line,
|
268
|
+
linear_pattern_percent_first, page_id, true))
|
269
|
+
end
|
270
|
+
end
|
271
|
+
end
|
272
|
+
candidates
|
273
|
+
end
|
274
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
275
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
276
|
+
|
277
|
+
# Processes a vertical reconstructed line for tax extraction. Returns a hash of collected values.
|
278
|
+
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Processed OCR results.
|
279
|
+
# @param found_hash [Hash] Hash containing previously found values, if any.
|
280
|
+
# @return [Hash]
|
281
|
+
def self.extract_vertical_tax_values(line, found_hash)
|
282
|
+
amounts = []
|
283
|
+
line.each do |reconstructed_word|
|
284
|
+
amounts.push(parse_amount(reconstructed_word.text)) unless parse_amount(reconstructed_word.text).nil?
|
285
|
+
end
|
286
|
+
if amounts.length == 1 && !found_hash.key?('value')
|
287
|
+
found_hash['value'] = amounts[0]
|
288
|
+
else
|
289
|
+
found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
|
290
|
+
found_hash['value'] = amounts[1] if found_hash['value'].nil?
|
291
|
+
end
|
292
|
+
found_hash
|
293
|
+
end
|
294
|
+
|
295
|
+
# Extracts tax data from a vertical reconstructed row.
|
296
|
+
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
|
297
|
+
# @param tax_names [Array<String>] Array of possible names a tax can have
|
298
|
+
# @param found_hash [Hash] Hash of currently retrieved values
|
299
|
+
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
|
300
|
+
found_hash = { 'code' => nil, 'page_id' => nil } if found_hash.nil?
|
301
|
+
|
302
|
+
ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
|
303
|
+
page.all_words.each do |word|
|
304
|
+
next if match_index(word.text, tax_names).nil?
|
305
|
+
|
306
|
+
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
|
307
|
+
found_hash['page_id'] = page_id if found_hash['page_id'].nil?
|
308
|
+
found_hash['code'] = word.text.strip if found_hash['code'].nil?
|
309
|
+
found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
|
310
|
+
end
|
311
|
+
end
|
312
|
+
found_hash
|
313
|
+
end
|
314
|
+
|
315
|
+
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
|
316
|
+
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
|
317
|
+
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
|
318
|
+
:decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
|
319
|
+
:swap_rates_if_needed
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
@@ -77,5 +77,24 @@ module Mindee
|
|
77
77
|
coords = points.map(&:x)
|
78
78
|
MinMax.new(coords.min, coords.max)
|
79
79
|
end
|
80
|
+
|
81
|
+
# Checks whether a set of coordinates is below another on the page, with a slight margin for the lateral value.
|
82
|
+
# @param candidate [Array<Mindee::Geometry::Point>] Polygon to check
|
83
|
+
# @param anchor [Array<Mindee::Geometry::Point>] Reference polygon
|
84
|
+
# @param margin_left [Float] Margin tolerance on the left of the anchor
|
85
|
+
# @param margin_right [Float] Margin tolerance on the right of the anchor
|
86
|
+
def self.below?(candidate, anchor, margin_left, margin_right)
|
87
|
+
return false if Geometry.get_min_max_y(candidate).min < Geometry.get_min_max_y(anchor).min
|
88
|
+
if Geometry.get_min_max_x(candidate).min <
|
89
|
+
Geometry.get_min_max_x(anchor).min - (Geometry.get_min_max_x(anchor).min * margin_left)
|
90
|
+
return false
|
91
|
+
end
|
92
|
+
if Geometry.get_min_max_x(candidate).max >
|
93
|
+
Geometry.get_min_max_x(anchor).max + (Geometry.get_min_max_x(anchor).max * margin_right)
|
94
|
+
return false
|
95
|
+
end
|
96
|
+
|
97
|
+
true
|
98
|
+
end
|
80
99
|
end
|
81
100
|
end
|
@@ -27,6 +27,22 @@ module Mindee
|
|
27
27
|
end
|
28
28
|
out_str.strip
|
29
29
|
end
|
30
|
+
|
31
|
+
# Constructs a line from a column, located underneath given coordinates
|
32
|
+
# @param coordinates [Array<Mindee::Geometry::Point>] Polygon or bounding box where the reconstruction should
|
33
|
+
# start.
|
34
|
+
# @param page_id [Integer] ID of the page to start at
|
35
|
+
# @param x_margin [Float] Margin of misalignment for the x coordinate.
|
36
|
+
# @return [Mindee::Parsing::Common::Ocr::OcrLine]
|
37
|
+
def reconstruct_vertically(coordinates, page_id, x_margin)
|
38
|
+
line_arr = OcrLine.new([])
|
39
|
+
@pages[page_id].all_lines.each do |line|
|
40
|
+
line.each do |word|
|
41
|
+
line_arr.push(word) if Geometry.below?(word.polygon, coordinates, x_margin / 2, x_margin * 2)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
line_arr
|
45
|
+
end
|
30
46
|
end
|
31
47
|
end
|
32
48
|
end
|
@@ -163,6 +163,16 @@ module Mindee
|
|
163
163
|
def to_s
|
164
164
|
@mvision_v1.to_s
|
165
165
|
end
|
166
|
+
|
167
|
+
# Constructs a line from a column, located underneath given coordinates
|
168
|
+
# @param coordinates [Array<Mindee::Geometry::Point>] Polygon or bounding box where the reconstruction should
|
169
|
+
# start
|
170
|
+
# @param page_id [Integer] ID of the page to start at
|
171
|
+
# @param x_margin [Float] Margin of misalignment for the x coordinate (default 0.05)
|
172
|
+
# @return [Mindee::Parsing::Common::Ocr::OcrLine]
|
173
|
+
def reconstruct_vertically(coordinates, page_id, x_margin = 0.05)
|
174
|
+
@mvision_v1.reconstruct_vertically(coordinates, page_id, x_margin)
|
175
|
+
end
|
166
176
|
end
|
167
177
|
end
|
168
178
|
end
|
data/lib/mindee/version.rb
CHANGED
data/lib/mindee.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'mindee/client'
|
4
|
+
require 'mindee/extraction'
|
4
5
|
|
5
6
|
module Mindee
|
6
7
|
# Mindee internal http module.
|
@@ -18,6 +19,10 @@ module Mindee
|
|
18
19
|
end
|
19
20
|
end
|
20
21
|
|
22
|
+
# Custom extraction module
|
23
|
+
module Extraction
|
24
|
+
end
|
25
|
+
|
21
26
|
# Parsing internals and fields.
|
22
27
|
module Parsing
|
23
28
|
# Common fields and functions.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mindee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mindee, SA
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: marcel
|
@@ -173,6 +173,9 @@ files:
|
|
173
173
|
- docs/us_w9_v1.md
|
174
174
|
- lib/mindee.rb
|
175
175
|
- lib/mindee/client.rb
|
176
|
+
- lib/mindee/extraction.rb
|
177
|
+
- lib/mindee/extraction/ocr_extractor.rb
|
178
|
+
- lib/mindee/extraction/tax_extractor.rb
|
176
179
|
- lib/mindee/geometry.rb
|
177
180
|
- lib/mindee/geometry/min_max.rb
|
178
181
|
- lib/mindee/geometry/point.rb
|