shear 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,36 @@
1
module Shear
  # Outcome of matching a template against a collection of words.
  #
  # Exposes the matched label reference points (+labels+), the word collection
  # that every read is delegated to (+words+), the match +error+ and the
  # +transform+ handed to the constructor.
  class TemplateMatch
    attr_reader :labels, :words, :error, :transform

    # @param _labels    lookup from label to reference point; points are
    #                   indexed with [0] and [1] by #read_relative
    # @param _words     object responding to +read+ (see WordCollection#read)
    # @param _error     error of the match
    # @param _transform transform associated with the match
    def initialize(_labels, _words, _error, _transform)
      @labels = _labels
      @error = _error
      @words = _words
      @transform = _transform
    end

    # Returns the reference point stored for +_key+ (whatever +labels+ returns
    # for a missing key otherwise).
    def [](_key)
      @labels[_key]
    end

    # Reads the text found between two corner points by delegating to the
    # underlying word collection.
    #
    # @param _upper_left_pt  [Array] first corner of the box, as [x, y]
    # @param _lower_right_pt [Array] opposite corner of the box, as [x, y]
    # @param line_height     forwarded to the collection
    # @param exclusion       forwarded to the collection
    # @param delete          forwarded to the collection
    # @param min_confidence  forwarded to the collection; the default of 0
    #                        matches WordCollection#read's own default
    # @return whatever the collection's +read+ returns
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      @words.read(
        _upper_left_pt,
        _lower_right_pt,
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    # Same as #read, but both corners are given as offsets from the reference
    # point of +_label+.
    #
    # @raise [ArgumentError] when no reference point is stored for +_label+
    def read_relative(_label, _upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[],
                      delete: false, min_confidence: 0)
      ref_pt = @labels[_label]
      raise ArgumentError, 'invalid label' if ref_pt.nil?

      read(
        offset_point(ref_pt, _upper_left_pt),
        offset_point(ref_pt, _lower_right_pt),
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    private

    # Adds a two-dimensional offset to an origin point.
    def offset_point(_origin, _offset)
      [_origin[0] + _offset[0], _origin[1] + _offset[1]]
    end
  end
end
@@ -0,0 +1,3 @@
1
module Shear
  # Version of the shear gem; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,157 @@
1
+ require 'matrix'
2
+ require 'active_support'
3
+
4
module Shear
  # Ordered collection of words, each with a bounding box and a confidence.
  #
  # Every entry is a Hash with the keys +:word+, +:tl_word+ (transliterated,
  # upcased text), +:conf+, +:bounding_box+ (current vertices),
  # +:original_bounding_box+ (vertices as first pushed) and +:deleted+.
  class WordCollection
    # Text produced by WordCollection#read together with its confidence.
    class ReadString
      attr_reader :string, :confidence

      def initialize(_string, _confidence)
        @string = _string
        @confidence = _confidence
      end

      def to_s
        @string
      end
    end

    # Builds a collection from the word layout VisionUtils returns for +_url+.
    def self.build_from_url(_url, _google_vision_api_key)
      word_layout = VisionUtils.get_word_layout(_url, _google_vision_api_key)
      new.tap do |collection|
        word_layout.each do |wtext, bounding_box, confidence|
          collection.push_word(wtext, bounding_box: bounding_box, confidence: confidence)
        end
      end
    end

    attr_reader :words

    # @param words [Array<Hash>] raw word entries (see the class description)
    def initialize(words = [])
      @words = words
    end

    # Original text of the word at +_index+.
    def word(_index)
      @words[_index][:word]
    end

    # Transliterated, upcased text of the word at +_index+.
    def tl_word(_index)
      @words[_index][:tl_word]
    end

    # First vertex of the current bounding box of the word at +_index+.
    def location(_index)
      @words[_index][:bounding_box][0]
    end

    # Confidence of the word at +_index+.
    def confidence(_index)
      @words[_index][:conf]
    end

    # First vertex of the original bounding box of the word at +_index+.
    def original_location(_index)
      @words[_index][:original_bounding_box][0]
    end

    # Current bounding box of the word at +_index+.
    def bounding_box(_index)
      @words[_index][:bounding_box]
    end

    # Bounding box of the word at +_index+ as it was first pushed.
    def original_bounding_box(_index)
      @words[_index][:original_bounding_box]
    end

    # Whether the word at +_index+ has been consumed by a deleting read.
    def deleted(_index)
      @words[_index][:deleted]
    end

    # Number of words in the collection.
    def count
      @words.count
    end

    # Appends a word.
    #
    # @param _word        [String] text of the word
    # @param bounding_box [Array<Array>] list of [x, y] vertices
    # @param confidence   confidence of the word
    def push_word(_word, bounding_box:, confidence: 1.0)
      @words << {
        word: _word,
        tl_word: I18n.transliterate(_word).upcase,
        conf: confidence,
        bounding_box: bounding_box,
        original_bounding_box: bounding_box.clone,
        deleted: false
      }
    end

    # Indexes of the words whose transliterated, upcased text equals that of
    # +_word+ and whose confidence is at least +min_confidence+.
    def search(_word, min_confidence: 0.0)
      word = I18n.transliterate(_word).upcase
      @words.each_index.select do |i|
        @words[i][:tl_word] == word && @words[i][:conf] >= min_confidence
      end
    end

    # Returns an independent copy of the collection.
    #
    # Word entries and their bounding boxes are duplicated, so transforming the
    # copy or marking its words as deleted never leaks into this collection
    # (or into other copies made from it).
    def clone
      self.class.new.tap do |coll_clone|
        @words.each do |w|
          coll_clone.push_word_raw(copy_word(w))
        end
      end
    end

    # Recomputes every current bounding box by applying +_matrix+ to the
    # original vertices extended with a trailing 1.0 coordinate.
    #
    # @return [WordCollection] self
    def transform!(_matrix)
      @words.each do |w|
        w[:original_bounding_box].each_with_index do |vertex, index|
          new_vertex = (_matrix * Matrix.column_vector(vertex + [1.0])).transpose.to_a.first[0..1]
          w[:bounding_box][index] = new_vertex
        end
      end

      self
    end

    # Reads the text of the words colliding with the given box.
    #
    # Words are grouped into lines by the y coordinate of their first vertex:
    # the topmost remaining word opens a line and every word less than
    # +line_height+ below it joins that line. Inside a line, words are ordered
    # by the x coordinate of their first vertex and joined with spaces; lines
    # are joined with newlines.
    #
    # @param _upper_left_pt  passed to the collision check as the box minimum
    # @param _lower_right_pt passed to the collision check as the box maximum
    # @param line_height     vertical tolerance used to group words in a line
    # @param exclusion       transliterated words to skip (checked via include?)
    # @param delete          when true, read words are flagged as deleted and
    #                        are skipped by later reads
    # @param min_confidence  words below this confidence are skipped
    # @return [ReadString] the text plus the lowest confidence among the read
    #   words (1.0 when no word was read)
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      read_words = select_inside_box(
        _upper_left_pt,
        _lower_right_pt,
        min_confidence,
        exclusion,
        delete
      )
      confidence = read_words.map { |w| w[:conf] }.min || 1.0

      lines = []
      until read_words.empty?
        line_words, read_words = partition_by_line(read_words, line_height)

        lines << line_words.sort_by { |w| w[:bounding_box][0][0] }.map { |w| w[:word] }.join(' ')
      end

      ReadString.new lines.join("\n"), confidence
    end

    protected

    # Appends an already built word entry without any processing.
    def push_word_raw(_raw)
      @words << _raw
    end

    # Collects the non-deleted, non-excluded words that reach the confidence
    # threshold and collide with the box, optionally flagging them as deleted.
    def select_inside_box(_upper_left_pt, _lower_right_pt, _min_confidence, _exclusion, _delete)
      # The box does not depend on the word, so it is built once.
      aabb = { min: _upper_left_pt, max: _lower_right_pt }

      inside_box = []
      @words.each do |w|
        next if _exclusion.include? w[:tl_word]

        if w[:conf] >= _min_confidence &&
           BoundingBoxUtils.collides?(aabb, w[:bounding_box]) && !w[:deleted]
          inside_box << w
          w[:deleted] = true if _delete
        end
      end
      inside_box
    end

    # Splits +_words+ into [words of the topmost line, remaining words].
    def partition_by_line(_words, _line_height)
      upper_word = _words.min_by { |w| w[:bounding_box][0][1] }
      top = upper_word[:bounding_box][0][1]

      _words.partition do |word|
        # The topmost word always belongs to its own line; otherwise a zero or
        # negative line height would yield an empty line and #read would never
        # run out of words.
        word.equal?(upper_word) || word[:bounding_box][0][1] - top < _line_height
      end
    end

    private

    # Duplicates a word entry together with its bounding boxes and vertices.
    def copy_word(_raw)
      copy = _raw.dup
      %i[bounding_box original_bounding_box].each do |key|
        copy[key] = copy[key].map(&:dup) if copy[key]
      end
      copy
    end
  end
end
@@ -0,0 +1,20 @@
1
# Base class for stencils: objects that wrap a template match and extract
# fields from it.
#
# Subclasses are expected to provide a class-level +template+ (an object
# responding to +match+) and an instance-level +process_match+; neither is
# defined here.
class BaseStencil
  # Matches the subclass template against +_word_collection+.
  #
  # @param _word_collection words handed to the template's +match+
  # @param max_error maximum accepted match error; nil (or false) disables the
  #   check. Defaults to +default_max_error+.
  # @return [BaseStencil, nil] a processed stencil instance, or nil when the
  #   template did not match or the match error exceeds +max_error+
  def self.match(_word_collection, max_error: default_max_error)
    template_match = template.match(_word_collection)

    return nil if template_match.nil?
    # Plain truthiness is enough for an optional numeric limit and does not
    # rely on ActiveSupport's Object#present?.
    return nil if max_error && template_match.error > max_error

    # Explicit parentheses avoid Ruby's ambiguous "&" argument-prefix warning.
    new(template_match).tap(&:process_match)
  end

  # Error limit used when +match+ is called without +max_error+; nil means
  # "no limit". Subclasses may override it.
  def self.default_max_error
    nil
  end

  attr_reader :match

  # @param _match the template match this stencil reads from
  def initialize(_match)
    @match = _match
  end
end
@@ -0,0 +1,37 @@
1
# Example stencil for the back face of a document: reads the names and
# surnames from fixed boxes of the template match.
class ExampleBackStencil < BaseStencil
  attr_reader :names, :surnames

  class << self
    # Template with the four labels this face is recognized by; built on
    # first use and cached on the class.
    def template
      @template ||= Shear::Template.build do |builder|
        builder.set 'NAMES', at: [1.3, 0.9]
        builder.set 'DOCUMENT', at: [25.4, 0.8]
        builder.set 'SURNAMES', at: [1.6, 8.0]
        builder.set 'ID', at: [28.9, 8.1]
      end
    end

    # Matches with an error above this value are rejected.
    def default_max_error
      10
    end
  end

  # Face of the document this stencil describes.
  def face
    :back
  end

  # Names of the fields this stencil exposes.
  def fields
    @fields ||= Set.new(["names", "surnames", "has_sensible_data?"])
  end

  def has_sensible_data?
    false
  end

  # Fills the fields by reading their boxes from the match.
  def process_match
    @names = match.read([2.5, 4.4], [6.1, 6.2]).to_s
    @surnames = match.read([2.1, 13.1], [15.0, 15.0]).to_s
  end
end
@@ -0,0 +1,44 @@
1
# Example stencil for the front face of a document: reads the document number
# from a fixed box of the template match and parses it into an integer.
class ExampleFrontStencil < BaseStencil
  attr_reader :number, :parsed_number

  class << self
    # Template with the four labels this face is recognized by; built on
    # first use and cached on the class.
    def template
      @template ||= Shear::Template.build do |builder|
        builder.set 'DOCUMENT', at: [0.9, 0.3]
        builder.set 'ID', at: [30.8, 1.2]
        builder.set 'COUNTRY', at: [1.1, 8.1]
        builder.set 'NUMBER', at: [0.8, 15.1]
      end
    end
  end

  # Face of the document this stencil describes.
  def face
    :front
  end

  # Names of the fields this stencil exposes.
  def fields
    @fields ||= Set.new(["number", "parsed_number", "has_sensible_data?"])
  end

  def has_sensible_data?
    true
  end

  # Fills the fields by reading the number box from the match.
  def process_match
    @number = match.read([23.9, 15.0], [34.5, 16.9]).to_s
    @parsed_number = parse_number(@number)
  end

  private

  # Turns a number written as three dot-separated digit groups into an
  # Integer made of the concatenated groups; returns nil for any other shape.
  def parse_number(_number)
    groups = _number.split(".")
    return nil unless groups.length == 3
    return nil unless groups.all? { |group| /\A\d+\z/.match(group) }

    groups.join.to_i
  end
end
@@ -0,0 +1,40 @@
1
# Set of stencils matched against the faces of a document, keyed by face.
#
# A face maps to nil when the document has no words for it or when its stencil
# did not match; field accessors only look at the faces that did match.
class StencilGroup
  # Matches every stencil of +_stencil_map+ against the words of its face.
  #
  # @param _stencil_map    face => object responding to +match+
  # @param _document_words face => words handed to that +match+
  # @return [StencilGroup] group whose +stencils+ maps each face to the match
  #   result, or to nil when +_document_words+ does not include the face
  def self.match(_stencil_map, _document_words)
    stencils = {}
    _stencil_map.each do |face, stencil|
      stencils[face] = _document_words.include?(face) ? stencil.match(_document_words[face]) : nil
    end

    new(stencils)
  end

  attr_reader :stencils

  # @param _stencils [Hash] face => stencil instance or nil
  def initialize(_stencils)
    @stencils = _stencils
  end

  # True when +_field+ is truthy for at least one matched stencil.
  #
  # @raise [RuntimeError] when a matched stencil does not list +_field+
  def get_any?(_field)
    ensure_field!(_field)

    matched_stencils.any? { |stencil| stencil.public_send(_field) }
  end

  # True when +_field+ is truthy for every matched stencil (and therefore also
  # when no stencil matched at all).
  #
  # @raise [RuntimeError] when a matched stencil does not list +_field+
  def get_all?(_field)
    ensure_field!(_field)

    matched_stencils.all? { |stencil| stencil.public_send(_field) }
  end

  # Value of +_field+ taken from the first matched stencil that lists it.
  #
  # @raise [RuntimeError] when no matched stencil lists +_field+
  def get_attribute(_field)
    owner = matched_stencils.find { |stencil| stencil.fields.include?(_field) }
    raise "Unknown field #{_field}" if owner.nil?

    owner.public_send(_field)
  end

  private

  # Stencils that actually matched; StencilGroup.match stores nil for the
  # faces that are missing or unmatched, and those cannot answer any field.
  def matched_stencils
    @stencils.values.compact
  end

  # Raises unless every matched stencil lists +_field+.
  def ensure_field!(_field)
    matched_stencils.each do |stencil|
      raise "#{self.class.name} has no field #{_field}" unless stencil.fields.include?(_field)
    end
  end
end
@@ -0,0 +1,101 @@
1
+ require 'google/cloud/vision'
2
+ require 'faraday_middleware'
3
+
4
# Helpers to run text detection on an image through the Google Cloud Vision
# REST endpoint and turn the answer into a flat list of words.
module VisionUtils
  extend self

  # Requests DOCUMENT_TEXT_DETECTION for the image at +_url+.
  #
  # @param _url     [String] image location; http(s) URLs outside localhost are
  #   sent by reference, anything else is read and sent inline
  # @param _api_key [String] API key, sent in the query string
  # @param timeout  timeout applied to both opening the connection and the
  #   request
  # @return [Array<Array>] one [text, bounding_box, confidence] triple per word
  # @raise [RuntimeError] when the service answers with a status other than
  #   200 or 201
  def get_word_layout(_url, _api_key, timeout: 30)
    response = conn(timeout).post(
      "/v1/images:annotate?key=#{_api_key}",
      'requests' => [
        {
          'image' => build_image_source(_url),
          'features' => [
            { 'type' => 'DOCUMENT_TEXT_DETECTION' }
          ]
        }
      ]
    )

    extract_word_layout(handle_response(response))
  end

  private

  # Builds the "image" part of the request: inline base64 content for local
  # images, a URI reference for everything else.
  def build_image_source(_url)
    uri = URI(_url)
    if uri_is_local(uri)
      { 'content' => Base64.encode64(read_local_image(uri)) }
    else
      { 'source' => { 'imageUri' => _url } }
    end
  end

  # Reads the bytes of a local image.
  #
  # Only URI classes extended by open-uri (HTTP, HTTPS, FTP) respond to a
  # public +open+; plain paths and file: URIs do not, so those are read from
  # the filesystem using the URI path.
  # NOTE(review): the path is used as-is (no percent-decoding) - confirm this
  # suits the paths callers pass.
  def read_local_image(_uri)
    return _uri.open(&:read) if _uri.respond_to?(:open)

    File.binread(_uri.path)
  end

  # An image is local when it is not served over http(s), or when it is served
  # by localhost / 127.0.0.1.
  def uri_is_local(_uri)
    return true unless %w[http https].include?(_uri.scheme)

    %w[localhost 127.0.0.1].include?(_uri.host)
  end

  # Flattens the full-text annotation of the first response into a list of
  # word triples. Returns an empty list when there is no response or no
  # full-text annotation. Only blocks of type TEXT are considered, and words
  # without a complete bounding box are dropped.
  def extract_word_layout(_data)
    first_response = (_data['responses'] || []).first || {}
    image_text = first_response['fullTextAnnotation']
    return [] if image_text.nil?

    text_blocks = Array(image_text['pages'])
                  .flat_map { |page| Array(page['blocks']) }
                  .select { |block| block['blockType'] == 'TEXT' }

    text_blocks
      .flat_map { |block| Array(block['paragraphs']) }
      .flat_map { |para| Array(para['words']) }
      .map { |raw_word| extract_word(raw_word) }
      .compact
  end

  # Converts vertex hashes into [x, y] pairs; returns nil as soon as one vertex
  # lacks a coordinate.
  def extract_bounding_box(_vertices)
    return nil if _vertices.any? { |vertex| vertex['x'].nil? || vertex['y'].nil? }

    _vertices.map { |vertex| [vertex['x'], vertex['y']] }
  end

  # Builds the [text, bounding_box, confidence] triple of a word, or nil when
  # its bounding box is incomplete. The text is the concatenation of the symbol
  # texts and the confidence is the lowest symbol confidence.
  def extract_word(_raw_word)
    bounding_box = extract_bounding_box(_raw_word['boundingBox']['vertices'])
    return nil if bounding_box.nil?

    symbols = _raw_word['symbols']
    wtext = symbols.map { |sym| sym['text'] }.join
    confidence = symbols.map { |sym| sym['confidence'].to_f }.min

    [wtext, bounding_box, confidence]
  end

  # Returns the response body, raising unless the status is 200 or 201.
  def handle_response(_response)
    raise 'Service error' unless [200, 201].include?(_response.status)

    _response.body
  end

  # Connection to the Vision API, cached per timeout so that a call with a
  # different +timeout+ is not served by a connection configured with an
  # earlier value.
  def conn(_timeout)
    @conns ||= {}
    @conns[_timeout] ||= Faraday.new(url: "https://vision.googleapis.com") do |faraday|
      faraday.request :json
      faraday.response :json
      faraday.adapter :patron

      faraday.options.timeout = _timeout
      faraday.options.open_timeout = _timeout
    end
  end
end