mindee 3.9.0 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/docs/code_samples/us_mail_v2_async.txt +19 -0
- data/docs/us_mail_v2.md +135 -0
- data/lib/mindee/client.rb +11 -0
- data/lib/mindee/extraction/ocr_extractor.rb +110 -0
- data/lib/mindee/extraction/tax_extractor.rb +322 -0
- data/lib/mindee/extraction.rb +3 -0
- data/lib/mindee/geometry/utils.rb +19 -0
- data/lib/mindee/input/local_response.rb +72 -0
- data/lib/mindee/input/sources.rb +3 -3
- data/lib/mindee/parsing/common/api_response.rb +5 -4
- data/lib/mindee/parsing/common/ocr/mvision_v1.rb +16 -0
- data/lib/mindee/parsing/common/ocr/ocr.rb +10 -0
- data/lib/mindee/parsing/standard/base_field.rb +3 -1
- data/lib/mindee/parsing/standard/boolean_field.rb +20 -0
- data/lib/mindee/parsing/standard/locale_field.rb +4 -4
- data/lib/mindee/parsing/standard.rb +1 -0
- data/lib/mindee/product/us/us_mail/us_mail_v2.rb +41 -0
- data/lib/mindee/product/us/us_mail/us_mail_v2_document.rb +100 -0
- data/lib/mindee/product/us/us_mail/us_mail_v2_page.rb +34 -0
- data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +92 -0
- data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +78 -0
- data/lib/mindee/product.rb +1 -0
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +5 -0
- metadata +14 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cdf4542787e1ef24c74bf2104d3eacc5205afff3066ea24b0bea52b7d43d6993
|
4
|
+
data.tar.gz: 6d816b54bb34feb7d6c59db0f0a1f90cc34c5b6dd8155be315dfb698c88838d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ee7621e86789bf3f3ff294d505d71272018f342936d05f74f17a57e7b15916f8cde1218ffe1fa7ff7e7d2331c2982562725994bbf3565290645b870756d113e
|
7
|
+
data.tar.gz: 9b48e7088ffcc75136714e10e0fca34975b6b94f0f8a5241fd775fbcde0e8a05a836ca9f8703a7dc1d00c2820717ccc0cc989c8c1ea390716a3119114347e111
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
# Mindee Ruby API Library Changelog
|
2
2
|
|
3
|
+
## v3.11.0 - 2024-06-10
|
4
|
+
### Changes
|
5
|
+
* :sparkles: add custom tax extraction feature (#76)
|
6
|
+
|
7
|
+
|
8
|
+
## v3.10.0 - 2024-05-31
|
9
|
+
### Changes
|
10
|
+
* :sparkles: add support for us mail v2 (#98)
|
11
|
+
* :sparkles: add support for boolean fields
|
12
|
+
* :sparkles: add support for webhooks (#97)
|
13
|
+
### Fixes
|
14
|
+
* :recycle: tweak display for LocaleField
|
15
|
+
|
16
|
+
|
3
17
|
## v3.9.0 - 2024-05-16
|
4
18
|
### Changes
|
5
19
|
* :sparkles: update financial document to v1.7 & receipts to v5.2
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'mindee'
|
2
|
+
|
3
|
+
# Init a new client
|
4
|
+
mindee_client = Mindee::Client.new(api_key: 'my-api-key')
|
5
|
+
|
6
|
+
# Load a file from disk
|
7
|
+
input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
8
|
+
|
9
|
+
# Parse the file
|
10
|
+
result = mindee_client.enqueue_and_parse(
|
11
|
+
input_source,
|
12
|
+
Mindee::Product::US::UsMail::UsMailV2
|
13
|
+
)
|
14
|
+
|
15
|
+
# Print a full summary of the parsed data in RST format
|
16
|
+
puts result.document
|
17
|
+
|
18
|
+
# Print the document-level parsed data
|
19
|
+
# puts result.document.inference.prediction
|
data/docs/us_mail_v2.md
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
---
|
2
|
+
title: US US Mail OCR Ruby
|
3
|
+
---
|
4
|
+
The Ruby OCR SDK supports the [US Mail API](https://platform.mindee.com/mindee/us_mail).
|
5
|
+
|
6
|
+
Using the [sample below](https://github.com/mindee/client-lib-test-data/blob/main/products/us_mail/default_sample.jpg), we are going to illustrate how to extract the data that we want using the OCR SDK.
|
7
|
+

|
8
|
+
|
9
|
+
# Quick-Start
|
10
|
+
```rb
|
11
|
+
require 'mindee'
|
12
|
+
|
13
|
+
# Init a new client
|
14
|
+
mindee_client = Mindee::Client.new(api_key: 'my-api-key')
|
15
|
+
|
16
|
+
# Load a file from disk
|
17
|
+
input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
18
|
+
|
19
|
+
# Parse the file
|
20
|
+
result = mindee_client.enqueue_and_parse(
|
21
|
+
input_source,
|
22
|
+
Mindee::Product::US::UsMail::UsMailV2
|
23
|
+
)
|
24
|
+
|
25
|
+
# Print a full summary of the parsed data in RST format
|
26
|
+
puts result.document
|
27
|
+
|
28
|
+
# Print the document-level parsed data
|
29
|
+
# puts result.document.inference.prediction
|
30
|
+
```
|
31
|
+
|
32
|
+
**Output (RST):**
|
33
|
+
```rst
|
34
|
+
:Sender Name: zed
|
35
|
+
:Sender Address:
|
36
|
+
:City: Dallas
|
37
|
+
:Complete Address: 54321 Elm Street, Dallas, Texas ...
|
38
|
+
:Postal Code: 54321
|
39
|
+
:State: TX
|
40
|
+
:Street: 54321 Elm Street
|
41
|
+
:Recipient Names: Jane Doe
|
42
|
+
:Recipient Addresses:
|
43
|
+
+-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+
|
44
|
+
| City | Complete Address | Is Address Change | Postal Code | Private Mailbox Number | State | Street |
|
45
|
+
+=================+=====================================+===================+=============+========================+=======+===========================+
|
46
|
+
| Detroit | 1234 Market Street PMB 4321, Det... | | 12345 | 4321 | MI | 1234 Market Street |
|
47
|
+
+-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+
|
48
|
+
```
|
49
|
+
|
50
|
+
# Field Types
|
51
|
+
## Standard Fields
|
52
|
+
These fields are generic and used in several products.
|
53
|
+
|
54
|
+
### Basic Field
|
55
|
+
Each prediction object contains a set of fields that inherit from the generic `Field` class.
|
56
|
+
A typical `Field` object will have the following attributes:
|
57
|
+
|
58
|
+
* **value** (`String`, `Float`, `Integer`, `Boolean`): corresponds to the field value. Can be `nil` if no value was extracted.
|
59
|
+
* **confidence** (`Float`, `nil`): the confidence score of the field prediction.
|
60
|
+
* **bounding_box** (`Mindee::Geometry::Quadrilateral`, `nil`): contains exactly 4 relative vertices (points) coordinates of a right rectangle containing the field in the document.
|
61
|
+
* **polygon** (`Mindee::Geometry::Polygon`, `nil`): contains the relative vertices coordinates (`Point`) of a polygon containing the field in the image.
|
62
|
+
* **page_id** (`Integer`, `nil`): the ID of the page, is `nil` when at document-level.
|
63
|
+
* **reconstructed** (`Boolean`): indicates whether an object was reconstructed (not extracted as the API gave it).
|
64
|
+
|
65
|
+
|
66
|
+
Aside from the previous attributes, all basic fields have access to a `to_s` method that can be used to print their value as a string.
|
67
|
+
|
68
|
+
### String Field
|
69
|
+
The text field `StringField` only has one constraint: its **value** is a `String` (or `nil`).
|
70
|
+
|
71
|
+
## Specific Fields
|
72
|
+
Fields which are specific to this product; they are not used in any other product.
|
73
|
+
|
74
|
+
### Recipient Addresses Field
|
75
|
+
The addresses of the recipients.
|
76
|
+
|
77
|
+
A `UsMailV2RecipientAddress` implements the following attributes:
|
78
|
+
|
79
|
+
* `city` (String): The city of the recipient's address.
|
80
|
+
* `complete` (String): The complete address of the recipient.
|
81
|
+
* `is_address_change` (Boolean): Indicates if the recipient's address is a change of address.
|
82
|
+
* `postal_code` (String): The postal code of the recipient's address.
|
83
|
+
* `private_mailbox_number` (String): The private mailbox number of the recipient's address.
|
84
|
+
* `state` (String): Second part of the ISO 3166-2 code, consisting of two letters indicating the US State.
|
85
|
+
* `street` (String): The street of the recipient's address.
|
86
|
+
Fields which are specific to this product; they are not used in any other product.
|
87
|
+
|
88
|
+
### Sender Address Field
|
89
|
+
The address of the sender.
|
90
|
+
|
91
|
+
A `UsMailV2SenderAddress` implements the following attributes:
|
92
|
+
|
93
|
+
* `city` (String): The city of the sender's address.
|
94
|
+
* `complete` (String): The complete address of the sender.
|
95
|
+
* `postal_code` (String): The postal code of the sender's address.
|
96
|
+
* `state` (String): Second part of the ISO 3166-2 code, consisting of two letters indicating the US State.
|
97
|
+
* `street` (String): The street of the sender's address.
|
98
|
+
|
99
|
+
# Attributes
|
100
|
+
The following fields are extracted for US Mail V2:
|
101
|
+
|
102
|
+
## Recipient Addresses
|
103
|
+
**recipient_addresses** (Array<[UsMailV2RecipientAddress](#recipient-addresses-field)>): The addresses of the recipients.
|
104
|
+
|
105
|
+
```rb
|
106
|
+
for recipient_addresses_elem in result.document.inference.prediction.recipient_addresses do
|
107
|
+
puts recipient_addresses_elem.value
|
108
|
+
end
|
109
|
+
```
|
110
|
+
|
111
|
+
## Recipient Names
|
112
|
+
**recipient_names** (Array<[StringField](#string-field)>): The names of the recipients.
|
113
|
+
|
114
|
+
```rb
|
115
|
+
for recipient_names_elem in result.document.inference.prediction.recipient_names do
|
116
|
+
puts recipient_names_elem.value
|
117
|
+
end
|
118
|
+
```
|
119
|
+
|
120
|
+
## Sender Address
|
121
|
+
**sender_address** ([UsMailV2SenderAddress](#sender-address-field)): The address of the sender.
|
122
|
+
|
123
|
+
```rb
|
124
|
+
puts result.document.inference.prediction.sender_address.value
|
125
|
+
```
|
126
|
+
|
127
|
+
## Sender Name
|
128
|
+
**sender_name** ([StringField](#string-field)): The name of the sender.
|
129
|
+
|
130
|
+
```rb
|
131
|
+
puts result.document.inference.prediction.sender_name.value
|
132
|
+
```
|
133
|
+
|
134
|
+
# Questions?
|
135
|
+
[Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-2d0ds7dtz-DPAF81ZqTy20chsYpQBW5g)
|
data/lib/mindee/client.rb
CHANGED
@@ -182,6 +182,17 @@ module Mindee
|
|
182
182
|
|
183
183
|
# rubocop:enable Metrics/ParameterLists
|
184
184
|
|
185
|
+
# Builds an API response object from a locally stored response rather than a live API call.
#
# @param product_class [Mindee::Product] class of the product
# @param local_response [Mindee::Input::LocalResponse] local response; only its `as_hash` method is used here
# @return [Mindee::Parsing::Common::ApiResponse]
# @raise [RuntimeError] when a KeyError is raised while reading the payload or building the response
def load_prediction(product_class, local_response)
  # Evaluate the payload once and reuse it for both the parsed hash and the raw JSON string.
  payload = local_response.as_hash
  Mindee::Parsing::Common::ApiResponse.new(product_class, payload, payload.to_json)
rescue KeyError
  raise 'No prediction found in local response.'
end
|
195
|
+
|
185
196
|
# Load a document from an absolute path, as a string.
|
186
197
|
# @param input_path [String] Path of file to open
|
187
198
|
# @param fix_pdf [Boolean] Attempts to fix broken pdf if true
|
@@ -0,0 +1,110 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
module Extraction
|
5
|
+
# Generic extractor class
|
6
|
+
class OcrExtractor
|
7
|
+
# Looks for each candidate inside the text and returns the highest start index among the candidates
# that were found. Comparison is case & diacritics insensitive.
# @param text [String] string to search for matches.
# @param str_candidates [Array<String>] array of values to look for
# @return [Integer, nil] nil when no candidate occurs in the text.
def self.match_index(text, str_candidates)
  positions = str_candidates.map do |str_candidate|
    remove_accents(text.downcase).index(remove_accents(str_candidate.downcase))
  end
  positions.compact.max
end
|
21
|
+
|
22
|
+
# Strips diacritics from a string: the text is decomposed (NFD), combining marks from three
# Unicode ranges are deleted, and the result is recomposed (NFC) and scrubbed.
# @param input_str [String] string to handle.
# @return [String]
def self.remove_accents(input_str)
  combining_marks = [0x1DC0..0x1DFF, 0x0300..0x036F, 0xFE20..0xFE2F].flat_map(&:to_a).pack('U*')
  decomposed = input_str.unicode_normalize(:nfd)
  stripped = decomposed.tr(combining_marks, '')
  stripped.unicode_normalize(:nfc).scrub
end
|
33
|
+
|
34
|
+
# Tells whether a value lies strictly between the two bounds (both bounds are excluded).
# @param value [Integer] The value to check
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Boolean] false when the value is nil.
def self.valid_percentage?(value, min_rate_percentage, max_rate_percentage)
  !value.nil? && value > min_rate_percentage && value < max_rate_percentage
end
|
44
|
+
|
45
|
+
# Parses a percentage from a string, and returns it as a float.
# Percent signs are dropped, surrounding whitespace is trimmed and commas are read as decimal points.
# The given string is never modified, so frozen strings are accepted.
# @param percentage_str [String, nil] String candidate.
# @return [Float, nil] nil if the candidate is nil or isn't a valid percentage.
def self.parse_percentage(percentage_str)
  return nil if percentage_str.nil?

  # Build a cleaned copy instead of mutating the caller's string in place.
  normalized = percentage_str.scrub.delete('%').strip.tr(',', '.')
  Float(normalized)
rescue ArgumentError
  nil
end
|
57
|
+
|
58
|
+
# Converts an amount written as text into a float: spaces are removed and the delimiters are
# standardized before the conversion.
# @param amount_str [String] String candidate.
# @return [Float, nil] nil if the candidate isn't a valid amount.
def self.parse_amount(amount_str)
  Float(standardize_delimiters(amount_str.gsub(' ', '')))
rescue ArgumentError
  nil
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
# Rewrites an amount so that a dot is the decimal separator and thousands separators are dropped.
# The string is returned unchanged when neither decimal style is detected.
# @param str [String] amount candidate.
# @return [String]
def self.standardize_delimiters(str)
  return str.gsub('.', '').gsub(',', '.') if comma_decimal?(str)
  return str.gsub(',', '') if dot_decimal?(str)

  str
end
|
81
|
+
|
82
|
+
# Tells whether a comma sits two or three characters from the end of the string
# (the three-character check only applies to strings longer than three characters).
# @param str [String] amount candidate.
# @return [Boolean]
def self.comma_decimal?(str)
  return true if str.length > 3 && str[-3] == ','

  str[-2] == ','
end
|
85
|
+
|
86
|
+
# Tells whether a dot sits two or three characters from the end of the string
# (the three-character check only applies to strings longer than three characters).
# @param str [String] amount candidate.
# @return [Boolean]
def self.dot_decimal?(str)
  return true if str.length > 3 && str[-3] == '.'

  str[-2] == '.'
end
|
89
|
+
|
90
|
+
# Removes most common currency symbols from string.
# A new string is returned and the argument is left untouched, so frozen strings are accepted.
# @param input_string [String] string to remove the symbols from
# @return [String]
def self.remove_currency_symbols(input_string)
  # Common currency symbols (some entries are repeated, which is harmless once merged into one pattern).
  currency_symbols = ['$', '€', '£', '¥', '₹', '₽', '฿', '₺', '₴', '₿', '₡', '₮', '₱', '₲', '₪', '₫', '₩', '₵',
                      '₦', '₢', '₤', '₣', '₧', '₯', '₠', '₶', '₸', '₷', '₼', '₾', '₺', '﹩', '₨', '₹', '$', '﹫']

  # Regexp.union escapes each symbol, so '$' is matched literally.
  input_string.gsub(Regexp.union(currency_symbols), '')
end
|
105
|
+
|
106
|
+
private_class_method :remove_accents, :match_index, :parse_amount, :parse_percentage, :remove_currency_symbols,
|
107
|
+
:valid_percentage?, :comma_decimal?, :dot_decimal?, :standardize_delimiters
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,322 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'ocr_extractor'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Extraction
|
7
|
+
# Tax extractor class
|
8
|
+
class TaxExtractor < OcrExtractor
|
9
|
+
# Picks the highest-scoring valid candidate.
# A lone candidate is returned as is; when no valid candidate scores above zero, the first one is returned.
# @param candidates [Array<Hash>] candidates for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Hash, nil] nil when there are no candidates.
def self.pick_best(candidates, tax_names)
  return nil if candidates.empty?
  return candidates[0] if candidates.size == 1

  best_index = 0
  best_score = 0
  candidates.each_with_index do |candidate, position|
    next unless valid_candidate?(candidate, tax_names)

    score = calculate_score(candidate, position)
    # Strict comparison: on a tie the earliest candidate is kept.
    next unless score > best_score

    best_score = score
    best_index = position
  end

  candidates[best_index]
end
|
33
|
+
|
34
|
+
# Checks that the candidate's code equals one of the tax names, ignoring case and diacritics.
# Shouldn't fail except in case of very specific regex breaks due to unsupported diacritics.
# @param candidate [Hash] A candidate for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Boolean]
def self.valid_candidate?(candidate, tax_names)
  return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?

  tax_names.any? do |tax_name|
    remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
  end
end
|
47
|
+
|
48
|
+
# [Experimental] computes the score of a valid candidate for a tax.
# The score starts at index + 1; a rate adds 1 (up to 30), 0 (above 30) or -2 (above 100),
# a value adds 4 and a base adds 1.
# @param candidate [Hash] A candidate for the tax.
# @param index [Integer] position of the candidate in its list.
# @return [Integer]
def self.calculate_score(candidate, index)
  rate = candidate['rate']
  rate_points =
    if rate.nil?
      0
    elsif rate > 100
      -2
    elsif rate > 30
      0
    else
      1
    end
  value_points = candidate['value'].nil? ? 0 : 4
  base_points = candidate['base'].nil? ? 0 : 1
  index + 1 + rate_points + value_points + base_points
end
|
62
|
+
|
63
|
+
# Curates tax values based on simple rules to avoid improbable data.
# Trailing dots/whitespace are trimmed from the code, a rate strictly between 0 and 1 is scaled by 100,
# then the rate/base/value adjustments are applied in sequence. The given hash is modified in the process.
# NOTE(review): 'page_id' is never copied from found_hash, so it stays nil in the result - confirm intent.
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] hash with the 'code', 'page_id', 'rate', 'base' and 'value' keys.
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
  curated = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
  return curated if found_hash.nil?

  raw_code = found_hash['code']
  curated['code'] = raw_code.nil? ? raw_code : raw_code.sub(%r{\s*\.*\s*$}, '')

  raw_rate = found_hash['rate']
  found_hash['rate'] = raw_rate * 100 if raw_rate && raw_rate < 1 && raw_rate.positive?

  adjusted = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  adjusted = fix_rate(decimate_rates_if_needed(adjusted))
  curated['rate'] = adjusted['rate']
  set_base_and_value(curated, adjusted)
end
|
85
|
+
|
86
|
+
# When the rate falls outside the allowed bounds, exchanges it with the base - or failing that
# the value - provided that amount is a valid percentage. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash]
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  rate = found_hash['rate']
  return found_hash unless rate && (rate > max_rate_percentage || rate < min_rate_percentage)

  # The base is tried first, then the value.
  swap_key = %w[base value].find do |key|
    valid_percentage?(found_hash[key], min_rate_percentage, max_rate_percentage)
  end
  found_hash['rate'], found_hash[swap_key] = found_hash[swap_key], rate if swap_key
  found_hash
end
|
101
|
+
|
102
|
+
# Rates can't be negative if set: replaces a present rate with its absolute value, in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.fix_rate(found_hash)
  rate = found_hash['rate']
  found_hash['rate'] = rate.abs unless rate.nil?
  found_hash
end
|
108
|
+
|
109
|
+
# When the rate is above 100, exchanges it with the base - or failing that the value - provided
# that amount is present and smaller than the rate. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.decimate_rates_if_needed(found_hash)
  rate = found_hash['rate']
  return found_hash unless rate && rate > 100

  # The base is tried first, then the value.
  swap_key = %w[base value].find { |key| !found_hash[key].nil? && rate > found_hash[key] }
  found_hash['rate'], found_hash[swap_key] = found_hash[swap_key], rate if swap_key
  found_hash
end
|
122
|
+
|
123
|
+
# Copies the base and value into the reconstructed hash.
# When both are present and the base is smaller than the value, the two are exchanged so that
# the larger amount ends up as the base. Missing amounts are copied as nil.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the reconstructed hash.
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']
  # Only compare when both amounts exist: comparing a number with nil raises.
  base, value = value, base if !base.nil? && !value.nil? && base < value

  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
|
139
|
+
|
140
|
+
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
# The horizontal extraction is tried first; the vertical one is used when it yields no value.
# The tax names are sorted into a copy, so the caller's array is left untouched (and may be frozen).
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil]
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  # NOTE(review): this guard bails out when ocr_result IS a Mindee::Parsing::Common::Ocr, while the
  # documented parameter type is Ocr::Ocr - confirm whether a negated check on that class was intended.
  return nil if ocr_result.is_a?(Mindee::Parsing::Common::Ocr) || tax_names.empty?

  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
|
163
|
+
|
164
|
+
# Wraps a hash of tax values into a tax field, passing along its 'page_id' entry
# (nil when the key is absent).
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Mindee::Parsing::Standard::TaxField]
def self.create_tax_field(found_hash)
  page_id = found_hash.fetch('page_id', nil)
  Mindee::Parsing::Standard::TaxField.new(found_hash, page_id)
end
|
173
|
+
|
174
|
+
# Reads the tax code and rate out of capture groups 1 and 2 and stores them in the found_hash.
# Which group holds which depends on whether the percentage came first. Absent groups are skipped.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
  rate_group, code_group = percent_first ? [1, 2] : [2, 1]
  code_text = matches[code_group]
  rate_text = matches[rate_group]

  found_hash['code'] = code_text.strip unless code_text.nil?
  found_hash['rate'] = parse_amount(rate_text.gsub('%', '')) unless rate_text.nil?
  found_hash
end
|
189
|
+
|
190
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
191
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
192
|
+
|
193
|
+
# Reads the basis and value of a tax out of capture groups 3 and 4.
# A single captured amount is stored as the value; when both are captured, group 3 is the base
# and group 4 the value. Nothing is stored when neither group matched.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.extract_basis_and_value(matches, found_hash)
  first_amount = matches[3]
  second_amount = matches[4]
  return found_hash if first_amount.nil? && second_amount.nil?

  if first_amount.nil? || second_amount.nil?
    found_hash['value'] = parse_amount(first_amount.nil? ? second_amount : first_amount)
  else
    found_hash['base'] = parse_amount(first_amount)
    found_hash['value'] = parse_amount(second_amount)
  end
  found_hash
end
|
208
|
+
|
209
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
210
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
211
|
+
|
212
|
+
# Extracts tax information from a horizontal line.
# The returned hash always carries the page id; code, rate, base and value are added when the
# pattern matches the line.
# @param line [String] Line to be processed.
# @param pattern [Regexp] RegEx pattern to search the line with.
# @param page_id [Integer] ID of the page the line was found on.
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
  found_hash = { 'page_id' => page_id }
  matches = line.match(pattern)
  return found_hash if matches.nil?

  with_rate = extract_percentage_from_tax(matches, found_hash, percent_first)
  extract_basis_and_value(matches, with_rate)
end
|
230
|
+
|
231
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
232
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
233
|
+
|
234
|
+
# Scans every OCR line of every page for a tax name and collects the candidates read from those lines.
# Each matching line is parsed with a name-first pattern (from the tax name onwards) and with a
# rate-first pattern (from the percentage onwards when the line contains a '%', whole line otherwise).
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
# @param tax_names [Array<String>] Possible tax names candidates.
# @return [Array<Hash>] the first entry is always an all-nil placeholder.
def self.extract_horizontal_tax(ocr_result, tax_names)
  found = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
  rate_first_pattern = %r{
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  name_first_pattern = %r{
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
    page.all_lines.each do |line|
      # Drop noise characters and currency symbols, turn dot leaders into spaces, collapse spaces.
      without_noise = line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')
      clean_line = remove_currency_symbols(without_noise).gsub(%r{\.{2,}}, ' ').gsub(%r{ +}, ' ').strip

      name_index = match_index(clean_line, tax_names)
      next if name_index.nil?

      if clean_line.match?(name_first_pattern)
        found.append(
          extract_tax_from_horizontal_line(clean_line[name_index..], name_first_pattern, page_id, false)
        )
      end
      next unless clean_line.match?(rate_first_pattern)

      rate_segment = clean_line.include?('%') ? clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..] : clean_line
      found.append(extract_tax_from_horizontal_line(rate_segment, rate_first_pattern, page_id, true))
    end
  end
  found
end
|
274
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
275
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
276
|
+
|
277
|
+
# Collects the amounts found in a vertically reconstructed line and stores them in the found_hash.
# A lone amount becomes the value when the hash has no 'value' key yet; otherwise the first amount
# fills a missing rate and the second amount fills a missing value.
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Processed OCR results.
# @param found_hash [Hash] Hash containing previously found values, if any.
# @return [Hash]
def self.extract_vertical_tax_values(line, found_hash)
  amounts = []
  line.each do |reconstructed_word|
    amount = parse_amount(reconstructed_word.text)
    amounts.push(amount) unless amount.nil?
  end

  if amounts.length == 1 && !found_hash.key?('value')
    found_hash['value'] = amounts[0]
    return found_hash
  end

  found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
  found_hash['value'] = amounts[1] if found_hash['value'].nil?
  found_hash
end
|
294
|
+
|
295
|
+
# Extracts tax data from vertically reconstructed rows: for every word matching a tax name, the
# column below it is reconstructed and its amounts are merged into the found_hash.
# The page id and code of the first matching word are kept when not already set.
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
# @param tax_names [Array<String>] Array of possible names a tax can have
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @return [Hash]
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
  found_hash = { 'code' => nil, 'page_id' => nil } if found_hash.nil?

  ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
    tax_words = page.all_words.reject { |word| match_index(word.text, tax_names).nil? }
    tax_words.each do |word|
      reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
      found_hash['page_id'] = page_id if found_hash['page_id'].nil?
      found_hash['code'] = word.text.strip if found_hash['code'].nil?
      found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
    end
  end
  found_hash
end
|
314
|
+
|
315
|
+
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
|
316
|
+
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
|
317
|
+
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
|
318
|
+
:decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
|
319
|
+
:swap_rates_if_needed
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
@@ -77,5 +77,24 @@ module Mindee
|
|
77
77
|
coords = points.map(&:x)
|
78
78
|
MinMax.new(coords.min, coords.max)
|
79
79
|
end
|
80
|
+
|
81
|
+
# Checks whether a set of coordinates is below another on the page, with a slight margin for the lateral value.
# The candidate must not start above the anchor's top edge, and its horizontal extent must stay within
# the anchor's extent widened on each side by the given fraction of the anchor's own x coordinate.
# @param candidate [Array<Mindee::Geometry::Point>] Polygon to check
# @param anchor [Array<Mindee::Geometry::Point>] Reference polygon
# @param margin_left [Float] Margin tolerance on the left of the anchor
# @param margin_right [Float] Margin tolerance on the right of the anchor
# @return [Boolean]
def self.below?(candidate, anchor, margin_left, margin_right)
  return false if Geometry.get_min_max_y(candidate).min < Geometry.get_min_max_y(anchor).min

  candidate_x = Geometry.get_min_max_x(candidate)
  anchor_x = Geometry.get_min_max_x(anchor)
  left_limit = anchor_x.min - (anchor_x.min * margin_left)
  right_limit = anchor_x.max + (anchor_x.max * margin_right)

  return false if candidate_x.min < left_limit
  return false if candidate_x.max > right_limit

  true
end
|
80
99
|
end
|
81
100
|
end
|