shear 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +13 -0
- data/.github/workflows/publisher.yml +18 -0
- data/.gitignore +56 -0
- data/.hound.yml +4 -0
- data/.rspec +2 -0
- data/.rubocop.yml +491 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +11 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +173 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +45 -0
- data/Rakefile +1 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/shear.rb +13 -0
- data/lib/shear/bounding_box_utils.rb +64 -0
- data/lib/shear/template.rb +187 -0
- data/lib/shear/template_match.rb +36 -0
- data/lib/shear/version.rb +3 -0
- data/lib/shear/word_collection.rb +157 -0
- data/lib/stencils/base_stencil.rb +20 -0
- data/lib/stencils/example_back_stencil.rb +37 -0
- data/lib/stencils/example_front_stencil.rb +44 -0
- data/lib/stencils/stencil_group.rb +40 -0
- data/lib/utils/vision_utils.rb +101 -0
- data/shear.gemspec +37 -0
- metadata +250 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
module Shear
  # Value object bundling the labels, words, error and transform handed to its
  # constructor, plus helpers to read text out of the word collection.
  # (Presumably built by the template matching code, which is not visible here.)
  class TemplateMatch
    attr_reader :labels, :words, :error, :transform

    # @param _labels    object indexable by label (`#[]`) returning a reference point
    # @param _words     object responding to `read` (a WordCollection in this gem)
    # @param _error     fitting error of the match
    # @param _transform transform associated with the match
    def initialize(_labels, _words, _error, _transform)
      @labels = _labels
      @error = _error
      @words = _words
      @transform = _transform
    end

    # Looks up the reference point stored for `_key`.
    def [](_key)
      @labels[_key]
    end

    # Reads the text found between the two given corner points by delegating to
    # the word collection.
    #
    # @param _upper_left_pt  [Array] `[x, y]` corner
    # @param _lower_right_pt [Array] `[x, y]` corner
    # @param line_height     forwarded as-is
    # @param exclusion       forwarded as-is
    # @param delete          forwarded as-is
    # @param min_confidence  forwarded as-is; defaults to 0, the same default the
    #   word collection uses, so existing callers see no change
    # @return whatever the word collection's `read` returns
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      @words.read(
        _upper_left_pt,
        _lower_right_pt,
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    # Same as #read, but both corner points are offsets from the reference
    # point registered under `_label`.
    #
    # @raise [ArgumentError] when no reference point exists for `_label`
    def read_relative(_label, _upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[],
                      delete: false, min_confidence: 0)
      ref_pt = @labels[_label]
      raise ArgumentError, 'invalid label' if ref_pt.nil?

      @words.read(
        offset_point(ref_pt, _upper_left_pt),
        offset_point(ref_pt, _lower_right_pt),
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    private

    # Adds the `[x, y]` offset `delta` to the `[x, y]` point `origin`.
    def offset_point(origin, delta)
      [origin[0] + delta[0], origin[1] + delta[1]]
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
require 'active_support'
|
3
|
+
|
4
|
+
module Shear
  # List of words, each stored as a Hash with the keys :word, :tl_word, :conf,
  # :bounding_box, :original_bounding_box and :deleted.
  class WordCollection
    # Text read from a region together with a confidence value.
    class ReadString
      attr_reader :string, :confidence

      def initialize(_string, _confidence)
        @string = _string
        @confidence = _confidence
      end

      def to_s
        @string
      end
    end

    # Builds a collection from the word layout VisionUtils returns for `_url`.
    # Each layout entry is unpacked as `[text, bounding_box, confidence]`.
    def self.build_from_url(_url, _google_vision_api_key)
      word_layout = VisionUtils.get_word_layout(_url, _google_vision_api_key)
      new.tap do |collection|
        word_layout.each do |wtext, bounding_box, confidence|
          collection.push_word(wtext, bounding_box: bounding_box, confidence: confidence)
        end
      end
    end

    attr_reader :words

    # @param words [Array<Hash>] word hashes; the array is used as-is (not copied)
    def initialize(words = [])
      @words = words
    end

    # Text of the word at `_index`, as it was pushed.
    def word(_index)
      @words[_index][:word]
    end

    # Transliterated, upcased text of the word at `_index`.
    def tl_word(_index)
      @words[_index][:tl_word]
    end

    # First vertex of the (possibly transformed) bounding box.
    def location(_index)
      @words[_index][:bounding_box][0]
    end

    def confidence(_index)
      @words[_index][:conf]
    end

    # First vertex of the bounding box the word was pushed with.
    def original_location(_index)
      @words[_index][:original_bounding_box][0]
    end

    def bounding_box(_index)
      @words[_index][:bounding_box]
    end

    def original_bounding_box(_index)
      @words[_index][:original_bounding_box]
    end

    # Whether the word at `_index` was consumed by a `read(..., delete: true)`.
    def deleted(_index)
      @words[_index][:deleted]
    end

    def count
      @words.count
    end

    # Appends a word.
    #
    # @param _word        [String] recognized text
    # @param bounding_box [Array] list of `[x, y]` vertices
    # @param confidence   [Numeric] defaults to 1.0
    def push_word(_word, bounding_box:, confidence: 1.0)
      @words << {
        word: _word,
        tl_word: I18n.transliterate(_word).upcase,
        conf: confidence,
        bounding_box: bounding_box,
        original_bounding_box: bounding_box.clone,
        deleted: false
      }
    end

    # Indexes of the words whose transliterated, upcased text equals that of
    # `_word` and whose confidence is at least `min_confidence`.
    def search(_word, min_confidence: 0.0)
      word = I18n.transliterate(_word).upcase
      @words.each_index.select do |i|
        @words[i][:tl_word] == word && @words[i][:conf] >= min_confidence
      end
    end

    # Returns an independent copy of the collection.
    #
    # Every word hash and both of its vertex lists are duplicated, so calling
    # #transform! or `read(..., delete: true)` on the copy leaves this
    # collection untouched (sharing the hashes would leak those mutations back).
    def clone
      self.class.new.tap do |coll_clone|
        @words.each do |w|
          coll_clone.push_word_raw(copy_word(w))
        end
      end
    end

    # Recomputes every :bounding_box by multiplying each vertex of
    # :original_bounding_box, extended with a trailing 1.0, by `_matrix` and
    # keeping the first two components of the result.
    #
    # @return [WordCollection] self
    def transform!(_matrix)
      @words.each do |w|
        w[:original_bounding_box].each_with_index do |vertex, index|
          column = Matrix.column_vector(vertex + [1.0])
          w[:bounding_box][index] = (_matrix * column).transpose.to_a.first[0..1]
        end
      end

      self
    end

    # Reads the text of the non-deleted words colliding with the given box.
    #
    # Words are grouped into lines starting from the smallest first-vertex y;
    # each line is ordered by first-vertex x and joined with spaces, and lines
    # are joined with "\n".
    #
    # @param line_height    words whose first-vertex y is less than this far
    #   from the line's topmost word belong to the same line
    # @param exclusion      transliterated words to skip
    # @param delete         mark the words that were read as deleted
    # @param min_confidence minimum confidence for a word to be read
    # @return [ReadString] text plus the lowest confidence among the words read
    #   (1.0 when nothing was read)
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      read_words = select_inside_box(
        _upper_left_pt,
        _lower_right_pt,
        min_confidence,
        exclusion,
        delete
      )
      confidence = read_words.map { |w| w[:conf] }.min || 1.0

      lines = []
      until read_words.empty?
        line_words, read_words = partition_by_line(read_words, line_height)

        lines << line_words.sort_by { |w| w[:bounding_box][0][0] }.map { |w| w[:word] }.join(' ')
      end

      ReadString.new lines.join("\n"), confidence
    end

    protected

    # Appends an already-built word hash without touching it.
    def push_word_raw(_raw)
      @words << _raw
    end

    # Words that are not excluded, not deleted, confident enough and whose
    # bounding box collides with the box spanned by the two points.
    def select_inside_box(_upper_left_pt, _lower_right_pt, _min_confidence, _exclusion, _delete)
      # The query box is the same for every word, so build it once.
      aabb = { min: _upper_left_pt, max: _lower_right_pt }

      inside_box = []
      @words.each do |w|
        next if _exclusion.include? w[:tl_word]
        next unless w[:conf] >= _min_confidence &&
                    BoundingBoxUtils.collides?(aabb, w[:bounding_box]) && !w[:deleted]

        inside_box << w
        w[:deleted] = true if _delete
      end
      inside_box
    end

    # Splits `_words` into [words on the topmost line, remaining words].
    def partition_by_line(_words, _line_height)
      top_y = _words.map { |w| w[:bounding_box][0][1] }.min

      _words.partition do |word|
        word[:bounding_box][0][1] - top_y < _line_height
      end
    end

    private

    # Duplicates a word hash together with both of its vertex lists.
    def copy_word(_raw)
      _raw.merge(
        bounding_box: copy_box(_raw[:bounding_box]),
        original_bounding_box: copy_box(_raw[:original_bounding_box])
      )
    end

    # Duplicates a vertex list; a missing (nil) box is passed through.
    def copy_box(_box)
      _box && _box.map(&:dup)
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Base class for stencils. Subclasses are expected to provide a class-level
# `template` (an object responding to `match`) and an instance-level
# `process_match`; both are called here but defined elsewhere.
class BaseStencil
  # Matches the subclass template against `_word_collection`.
  #
  # @param _word_collection passed straight to `template.match`
  # @param max_error upper bound for the match error; nil disables the check.
  #   Defaults to `default_max_error`.
  # @return [BaseStencil, nil] a processed stencil instance, or nil when the
  #   template did not match or the match error exceeds `max_error`
  def self.match(_word_collection, max_error: default_max_error)
    template_match = template.match(_word_collection)

    return nil if template_match.nil?
    # Plain truthiness check: equivalent to ActiveSupport's `present?` for nil
    # and numeric thresholds, without requiring that core extension to be loaded.
    return nil if max_error && template_match.error > max_error

    new(template_match).tap(&:process_match)
  end

  # Error threshold used when `match` is called without `max_error`.
  # nil means "no threshold"; subclasses may override.
  def self.default_max_error
    nil
  end

  attr_reader :match

  # @param _match the template match this stencil reads its data from
  def initialize(_match)
    @match = _match
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Example stencil reporting the :back face: reads `names` and `surnames` from
# fixed regions of the match.
class ExampleBackStencil < BaseStencil
  attr_reader :names, :surnames

  # Template with four labelled anchor points, built once and memoized on the class.
  def self.template
    @template ||= Shear::Template.build do |t|
      t.set 'NAMES', at: [1.3, 0.9]
      t.set 'DOCUMENT', at: [25.4, 0.8]
      t.set 'SURNAMES', at: [1.6, 8.0]
      t.set 'ID', at: [28.9, 8.1]
    end
  end

  # Match error threshold used when the caller does not supply one.
  def self.default_max_error
    10
  end

  def face
    :back
  end

  # Names of the fields this stencil exposes.
  def fields
    @fields ||= Set.new(%w[names surnames has_sensible_data?])
  end

  # This face always reports false.
  def has_sensible_data?
    false
  end

  # Fills in `names` and `surnames` from their fixed regions of the match.
  def process_match
    names_text = match.read([2.5, 4.4], [6.1, 6.2])
    @names = names_text.to_s

    surnames_text = match.read([2.1, 13.1], [15.0, 15.0])
    @surnames = surnames_text.to_s
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Example stencil reporting the :front face: reads a dotted number from a fixed
# region of the match and also exposes it parsed as an Integer.
class ExampleFrontStencil < BaseStencil
  # Template with four labelled anchor points, built once and memoized on the class.
  def self.template
    @template ||= Shear::Template.build do |t|
      t.set 'DOCUMENT', at: [0.9, 0.3]
      t.set 'ID', at: [30.8, 1.2]
      t.set 'COUNTRY', at: [1.1, 8.1]
      t.set 'NUMBER', at: [0.8, 15.1]
    end
  end

  def face
    :front
  end

  # Names of the fields this stencil exposes.
  def fields
    @fields ||= Set[
      "number",
      "parsed_number",
      "has_sensible_data?"
    ]
  end

  # This face always reports true.
  def has_sensible_data?
    true
  end

  attr_reader :number, :parsed_number

  # Reads the raw number text and derives `parsed_number` from it.
  def process_match
    @number = match.read([23.9, 15.0], [34.5, 16.9]).to_s
    @parsed_number = parse_number(@number)
  end

  private

  # Parses a number written as exactly three dot-separated groups of digits
  # (e.g. "12.345.678") into an Integer with the dots removed.
  #
  # @param _number [String]
  # @return [Integer, nil] nil when the text does not have that exact shape
  def parse_number(_number)
    # A limit of -1 keeps trailing empty fields; without it "1.2.3." (or any
    # number of trailing dots) would still split into three parts and be accepted.
    parts = _number.split('.', -1)
    return nil if parts.length != 3

    return nil unless parts.all? { |part| /\A\d+\z/.match(part) }

    parts.join.to_i
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Groups the stencils obtained for the faces of a document and answers
# field queries across them.
class StencilGroup
  # Matches every stencil class in `_stencil_map` against the words available
  # for its face.
  #
  # @param _stencil_map    face => stencil class (anything responding to `match`)
  # @param _document_words face => words to match against
  # @return [StencilGroup] a face whose words are missing, or whose stencil
  #   returned nil, is stored with a nil stencil
  def self.match(_stencil_map, _document_words)
    stencils = _stencil_map.each_with_object({}) do |(face, stencil), matched|
      matched[face] = _document_words.include?(face) ? stencil.match(_document_words[face]) : nil
    end

    new(stencils)
  end

  attr_reader :stencils

  # @param _stencils [Hash] face => stencil instance, or nil for an unmatched face
  def initialize(_stencils)
    @stencils = _stencils
  end

  # True when `_field` is truthy on at least one matched stencil.
  # Unmatched (nil) faces are ignored.
  #
  # @raise [RuntimeError] when some matched stencil does not declare `_field`
  def get_any?(_field)
    ensure_common_field!(_field)

    matched_stencils.any? { |stencil| stencil.public_send(_field) }
  end

  # True when `_field` is truthy on every matched stencil.
  # Unmatched (nil) faces are ignored, so the result is true when nothing matched.
  #
  # @raise [RuntimeError] when some matched stencil does not declare `_field`
  def get_all?(_field)
    ensure_common_field!(_field)

    matched_stencils.all? { |stencil| stencil.public_send(_field) }
  end

  # Value of `_field` on the first matched stencil that declares it.
  #
  # @raise [RuntimeError] when no matched stencil declares `_field`
  def get_attribute(_field)
    owner = matched_stencils.find { |stencil| stencil.fields.include? _field }
    raise "Unknown field #{_field}" if owner.nil?

    owner.public_send(_field)
  end

  private

  # Stencils that actually matched. `StencilGroup.match` stores nil for faces
  # without a match, and calling `fields` on those would raise NoMethodError.
  def matched_stencils
    @stencils.values.compact
  end

  # Raises unless every matched stencil declares `_field`.
  def ensure_common_field!(_field)
    matched_stencils.each do |stencil|
      raise "#{self.class.name} has no field #{_field}" if !stencil.fields.include? _field
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'google/cloud/vision'
|
2
|
+
require 'faraday_middleware'
|
3
|
+
|
4
|
+
# Helpers to request DOCUMENT_TEXT_DETECTION from the Google Vision REST
# endpoint and flatten the answer into a list of words.
module VisionUtils
  extend self

  # Requests text detection for the image at `_url`.
  #
  # @param _url     [String] image location
  # @param _api_key [String] sent as the `key` query parameter
  # @param timeout  [Numeric] used for both the open and the read timeout
  # @return [Array<Array>] one `[text, bounding_box, confidence]` entry per word
  # @raise [RuntimeError] 'Service error' when the HTTP status is not 200 or 201
  def get_word_layout(_url, _api_key, timeout: 30)
    response = conn(timeout).post(
      "/v1/images:annotate?key=#{_api_key}",
      'requests' => [
        {
          'image' => build_image_source(_url),
          'features' => [
            { 'type' => 'DOCUMENT_TEXT_DETECTION' }
          ]
        }
      ]
    )

    extract_word_layout(handle_response(response))
  end

  private

  # Builds the `image` part of the request: local resources are read and sent
  # inline, anything else is passed by URI.
  def build_image_source(_url)
    uri = URI(_url)
    if uri_is_local(uri)
      # strict_encode64 emits plain RFC 4648 base64; encode64 would insert a
      # newline every 60 characters into the payload.
      { 'content' => Base64.strict_encode64(uri.open(&:read)) }
    else
      # String key, consistent with every other key of the request body.
      { 'source' => { 'imageUri' => _url } }
    end
  end

  # True for any non-http(s) URI and for http(s) URIs whose host is
  # localhost or 127.0.0.1.
  def uri_is_local(_uri)
    return true if _uri.scheme != 'http' && _uri.scheme != 'https'

    _uri.host == 'localhost' || _uri.host == '127.0.0.1'
  end

  # Flattens the first response's fullTextAnnotation into a list of
  # `[text, bounding_box, confidence]` entries, walking pages, TEXT blocks,
  # paragraphs and words. Words without a complete bounding box are skipped.
  #
  # @return [Array<Array>] empty when there is no response or no annotation
  def extract_word_layout(_data)
    first_response = (_data['responses'] || []).first
    image_text = first_response && first_response['fullTextAnnotation']
    return [] if image_text.nil?

    raw_words = image_text['pages'].flat_map do |page|
      text_blocks = page['blocks'].select { |block| block['blockType'] == 'TEXT' }
      text_blocks.flat_map do |block|
        block['paragraphs'].flat_map { |para| para['words'] }
      end
    end

    raw_words.map { |w| extract_word(w) }.compact
  end

  # Converts vertex hashes into `[x, y]` pairs.
  #
  # @return [Array<Array>, nil] nil as soon as any vertex lacks 'x' or 'y'
  def extract_bounding_box(_vertices)
    return nil if _vertices.any? { |vertex| vertex['x'].nil? || vertex['y'].nil? }

    _vertices.map { |vertex| [vertex['x'], vertex['y']] }
  end

  # Builds `[text, bounding_box, confidence]` for one raw word: the text is the
  # concatenation of its symbols and the confidence is the lowest symbol
  # confidence.
  #
  # @return [Array, nil] nil when the bounding box is incomplete
  def extract_word(_raw_word)
    symbols = _raw_word['symbols']
    wtext = symbols.map { |sym| sym['text'] }.join
    confidence = symbols.map { |sym| sym['confidence'].to_f }.min
    bounding_box = extract_bounding_box(_raw_word['boundingBox']['vertices'])
    return nil if bounding_box.nil?

    [wtext, bounding_box, confidence]
  end

  # Returns the response body, raising on any status other than 200/201.
  def handle_response(_response)
    if _response.status != 200 && _response.status != 201
      raise 'Service error'
    end

    _response.body
  end

  # Connection to the Vision endpoint configured with `_timeout`.
  #
  # Connections are cached per timeout value: a single memoized connection
  # would keep the timeout of the very first call and silently ignore the
  # `timeout:` argument of every later one.
  def conn(_timeout)
    @conns ||= {}
    @conns[_timeout] ||= Faraday.new(url: "https://vision.googleapis.com") do |faraday|
      faraday.request :json
      faraday.response :json
      faraday.adapter :patron

      faraday.options.timeout = _timeout
      faraday.options.open_timeout = _timeout
    end
  end
end
|