shear 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
require 'set'

module Shear
  # Outcome of matching a template against a word collection.
  #
  # Holds the label lookup, the word collection that reads are delegated to,
  # the match error and the transform produced by the match. The label values
  # are indexed with [0] and [1] by #read_relative, so they are presumably
  # [x, y] reference points (confirm against the template matcher).
  class TemplateMatch
    attr_reader :labels, :words, :error, :transform

    # @param _labels    label lookup; must respond to #[]
    # @param _words     object responding to #read (a WordCollection in this gem)
    # @param _error     match error, compared against a maximum by callers
    # @param _transform transform associated with this match
    def initialize(_labels, _words, _error, _transform)
      @labels = _labels
      @words = _words
      @error = _error
      @transform = _transform
    end

    # Looks up the stored value for a label.
    #
    # @return the value stored under _key, or nil when the label is unknown
    def [](_key)
      @labels[_key]
    end

    # Reads the text inside the box delimited by the two given points.
    #
    # All keyword options are forwarded untouched to the underlying word
    # collection. `min_confidence` defaults to 0, the same default the word
    # collection itself uses, so existing callers see no difference.
    #
    # @param _upper_left_pt  [Array] [x, y] of the upper left corner
    # @param _lower_right_pt [Array] [x, y] of the lower right corner
    # @return whatever the word collection's #read returns
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      @words.read(
        _upper_left_pt,
        _lower_right_pt,
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    # Same as #read, but both corners are given as offsets from the point
    # stored for `_label`.
    #
    # @raise [ArgumentError] when no point is stored for `_label`
    def read_relative(_label, _upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[],
                      delete: false, min_confidence: 0)
      ref_pt = @labels[_label]
      raise ArgumentError, 'invalid label' if ref_pt.nil?

      read(
        offset_from(ref_pt, _upper_left_pt),
        offset_from(ref_pt, _lower_right_pt),
        line_height: line_height,
        exclusion: exclusion,
        delete: delete,
        min_confidence: min_confidence
      )
    end

    private

    # Adds a two-component offset to a two-component reference point.
    def offset_from(_ref_pt, _offset)
      [_ref_pt[0] + _offset[0], _ref_pt[1] + _offset[1]]
    end
  end
end
@@ -0,0 +1,3 @@
1
module Shear
  # Released gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,157 @@
1
+ require 'matrix'
2
+ require 'active_support'
3
+
4
module Shear
  # Collection of recognized words with their bounding boxes and confidences.
  #
  # Each word is stored as a Hash with the keys :word, :tl_word (transliterated,
  # upcased text), :conf, :bounding_box, :original_bounding_box and :deleted.
  # Bounding boxes are arrays of [x, y] vertices; the first vertex is the one
  # reported as the word "location".
  class WordCollection
    # Text produced by a read together with the lowest confidence among the
    # words that contributed to it.
    class ReadString
      attr_reader :string, :confidence

      def initialize(_string, _confidence)
        @string = _string
        @confidence = _confidence
      end

      def to_s
        @string
      end
    end

    # Builds a collection from the word layout VisionUtils returns for an image.
    #
    # @param _url [String] image location handed to VisionUtils
    # @param _google_vision_api_key [String] API key handed to VisionUtils
    # @return [WordCollection]
    def self.build_from_url(_url, _google_vision_api_key)
      word_layout = VisionUtils.get_word_layout(_url, _google_vision_api_key)
      new.tap do |collection|
        word_layout.each do |wtext, bounding_box, confidence|
          collection.push_word(wtext, bounding_box: bounding_box, confidence: confidence)
        end
      end
    end

    attr_reader :words

    # @param words [Array<Hash>] raw word hashes; the array is used as given, not copied
    def initialize(words = [])
      @words = words
    end

    def word(_index)
      @words[_index][:word]
    end

    def tl_word(_index)
      @words[_index][:tl_word]
    end

    # First vertex of the (possibly transformed) bounding box.
    def location(_index)
      @words[_index][:bounding_box][0]
    end

    def confidence(_index)
      @words[_index][:conf]
    end

    # First vertex of the bounding box as it was when the word was pushed.
    def original_location(_index)
      @words[_index][:original_bounding_box][0]
    end

    def bounding_box(_index)
      @words[_index][:bounding_box]
    end

    def original_bounding_box(_index)
      @words[_index][:original_bounding_box]
    end

    def deleted(_index)
      @words[_index][:deleted]
    end

    def count
      @words.count
    end

    # Appends a word. The transliterated, upcased form is stored alongside the
    # raw text and is what #search and the read exclusion set compare against.
    def push_word(_word, bounding_box:, confidence: 1.0)
      @words << {
        word: _word,
        tl_word: I18n.transliterate(_word).upcase,
        conf: confidence,
        bounding_box: bounding_box,
        original_bounding_box: bounding_box.clone,
        deleted: false
      }
    end

    # @return [Array<Integer>] indexes of the words whose transliterated text
    #   equals the transliterated `_word` and whose confidence is at least
    #   `min_confidence`
    def search(_word, min_confidence: 0.0)
      word = I18n.transliterate(_word).upcase
      @words.each_index.select do |i|
        @words[i][:tl_word] == word && @words[i][:conf] >= min_confidence
      end
    end

    # Returns an independent copy of the collection.
    #
    # Every word hash and both of its bounding boxes are duplicated, so that
    # #transform! or a deleting #read on the copy leaves this collection
    # untouched (previously the copy shared the very same hashes).
    def clone
      self.class.new.tap do |coll_clone|
        @words.each do |w|
          coll_clone.push_word_raw(duplicate_word(w))
        end
      end
    end

    # Recomputes every bounding box by applying `_matrix` to the original
    # vertices. Each vertex is extended to [x, y, 1.0] before multiplying and
    # only the first two resulting components are kept.
    #
    # @return [WordCollection] self
    def transform!(_matrix)
      @words.each do |w|
        w[:original_bounding_box].each_with_index do |vertex, index|
          new_vertex = (_matrix * Matrix.column_vector(vertex + [1.0])).transpose.to_a.first[0..1]
          w[:bounding_box][index] = new_vertex
        end
      end

      self
    end

    # Reads the words colliding with the given box, grouped into lines.
    #
    # Lines are emitted top to bottom; inside a line the words are ordered by
    # the x coordinate of their first vertex and joined with a space. Lines
    # are joined with a newline.
    #
    # @param line_height    maximum vertical distance from the topmost remaining
    #                       word for a word to belong to the same line
    # @param exclusion      transliterated words to skip
    # @param delete         when true, the words read are flagged as deleted
    # @param min_confidence words below this confidence are ignored
    # @return [ReadString] text plus the minimum confidence of the words read
    #   (1.0 when nothing was read)
    def read(_upper_left_pt, _lower_right_pt, line_height: 2.0, exclusion: Set[], delete: false,
             min_confidence: 0)
      read_words = select_inside_box(
        _upper_left_pt,
        _lower_right_pt,
        min_confidence,
        exclusion,
        delete
      )
      confidence = read_words.map { |w| w[:conf] }.min || 1.0

      lines = []
      until read_words.empty?
        line_words, read_words = partition_by_line(read_words, line_height)

        lines << line_words.sort_by { |w| w[:bounding_box][0][0] }.map { |w| w[:word] }.join(' ')
      end

      ReadString.new lines.join("\n"), confidence
    end

    protected

    def push_word_raw(_raw)
      @words << _raw
    end

    # Copies a raw word hash together with its vertex arrays. Boxes that are
    # absent from the hash are left absent.
    def duplicate_word(_raw)
      _raw.dup.tap do |copy|
        %i[bounding_box original_bounding_box].each do |key|
          copy[key] = copy[key].map(&:dup) if copy[key]
        end
      end
    end

    # Collects the non-deleted, non-excluded words that meet the confidence
    # threshold and collide with the box, flagging them as deleted on request.
    def select_inside_box(_upper_left_pt, _lower_right_pt, _min_confidence, _exclusion, _delete)
      # The box is the same for every word, so it is built once.
      aabb = { "min": _upper_left_pt, "max": _lower_right_pt }

      inside_box = []
      @words.each do |w|
        next if _exclusion.include? w[:tl_word]
        next unless w[:conf] >= _min_confidence &&
                    BoundingBoxUtils.collides?(aabb, w[:bounding_box]) && !w[:deleted]

        inside_box << w
        w[:deleted] = true if _delete
      end
      inside_box
    end

    # Splits the words into those on the same line as the topmost word and the
    # rest. Only the y coordinate of the first vertex is considered.
    #
    # @return [Array(Array, Array)] [words on the line, remaining words]
    def partition_by_line(_words, _line_height)
      top_y = _words.min_by { |w| w[:bounding_box][0][1] }[:bounding_box][0][1]

      _words.partition do |word|
        word[:bounding_box][0][1] - top_y < _line_height
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
# Base class for stencils: objects that pull named fields out of a template
# match. Subclasses supply a class-level `template` and an instance-level
# `process_match`.
class BaseStencil
  attr_reader :match

  # Maximum match error accepted when the caller gives none. nil disables the check.
  def self.default_max_error
    nil
  end

  # Matches the stencil's template against a word collection.
  #
  # @param _word_collection collection handed to `template.match`
  # @param max_error        matches whose error is greater than this are rejected
  # @return a stencil whose `process_match` has already run, or nil when the
  #   template did not match or the match error exceeds `max_error`
  def self.match(_word_collection, max_error: default_max_error)
    found = template.match(_word_collection)
    return if found.nil?
    return if max_error.present? && found.error > max_error

    stencil = new(found)
    stencil.process_match
    stencil
  end

  # @param _match the template match this stencil reads from
  def initialize(_match)
    @match = _match
  end
end
@@ -0,0 +1,37 @@
1
# Example stencil for the back face of a document: reads the names and
# surnames from fixed regions of the matched template.
class ExampleBackStencil < BaseStencil
  attr_reader :names, :surnames

  # Matches with an error above this value are rejected by default.
  def self.default_max_error
    10
  end

  # Template built once per class; each entry sets a label at a point.
  def self.template
    return @template if @template

    @template = Shear::Template.build do |t|
      t.set 'NAMES', at: [1.3, 0.9]
      t.set 'DOCUMENT', at: [25.4, 0.8]
      t.set 'SURNAMES', at: [1.6, 8.0]
      t.set 'ID', at: [28.9, 8.1]
    end
  end

  # Document face this stencil applies to.
  def face
    :back
  end

  # Names of the methods that may be queried as fields on this stencil.
  def fields
    @fields ||= Set.new(%w[names surnames has_sensible_data?])
  end

  def has_sensible_data?
    false
  end

  # Reads each field from its region of the match and stores it as a String.
  def process_match
    names_text = match.read([2.5, 4.4], [6.1, 6.2])
    @names = names_text.to_s

    surnames_text = match.read([2.1, 13.1], [15.0, 15.0])
    @surnames = surnames_text.to_s
  end
end
@@ -0,0 +1,44 @@
1
# Example stencil for the front face of a document: reads the document number
# from a fixed region and also exposes it parsed as an Integer.
class ExampleFrontStencil < BaseStencil
  attr_reader :number, :parsed_number

  # Template built once per class; each entry sets a label at a point.
  def self.template
    return @template if @template

    @template = Shear::Template.build do |t|
      t.set 'DOCUMENT', at: [0.9, 0.3]
      t.set 'ID', at: [30.8, 1.2]
      t.set 'COUNTRY', at: [1.1, 8.1]
      t.set 'NUMBER', at: [0.8, 15.1]
    end
  end

  # Document face this stencil applies to.
  def face
    :front
  end

  # Names of the methods that may be queried as fields on this stencil.
  def fields
    @fields ||= Set.new(%w[number parsed_number has_sensible_data?])
  end

  def has_sensible_data?
    true
  end

  # Reads the number region and derives its parsed form.
  def process_match
    @number = match.read([23.9, 15.0], [34.5, 16.9]).to_s
    @parsed_number = parse_number(@number)
  end

  private

  # Converts a number written as three dot-separated digit groups into an
  # Integer made of the concatenated groups.
  #
  # @return [Integer, nil] nil unless there are exactly three groups and every
  #   group consists solely of digits
  def parse_number(_number)
    groups = _number.split(".")
    return nil unless groups.length == 3
    return nil unless groups.all? { |group| /\A\d+\z/ =~ group }

    groups.join.to_i
  end
end
@@ -0,0 +1,40 @@
1
# Groups the stencils matched for each face of a document and answers
# questions about their fields as a whole.
class StencilGroup
  # Matches every stencil of the map against the words of its face.
  #
  # @param _stencil_map    face => stencil class (anything responding to .match)
  # @param _document_words face => word collection
  # @return [StencilGroup] a face missing from `_document_words`, or a stencil
  #   that did not match, is stored as nil
  def self.match(_stencil_map, _document_words)
    stencils = _stencil_map.each_with_object({}) do |(face, stencil), matched|
      matched[face] = _document_words.include?(face) ? stencil.match(_document_words[face]) : nil
    end

    new(stencils)
  end

  attr_reader :stencils

  # @param _stencils [Hash] face => stencil instance, or nil for unmatched faces
  def initialize(_stencils)
    @stencils = _stencils
  end

  # @return [Boolean] whether `_field` is truthy on at least one matched stencil
  # @raise [RuntimeError] when a matched stencil does not declare `_field`
  def get_any?(_field)
    stencils_declaring!(_field).any? { |stencil| stencil.public_send(_field) }
  end

  # @return [Boolean] whether `_field` is truthy on every matched stencil
  # @raise [RuntimeError] when a matched stencil does not declare `_field`
  def get_all?(_field)
    stencils_declaring!(_field).all? { |stencil| stencil.public_send(_field) }
  end

  # @return the value of `_field` on the first matched stencil that declares it
  # @raise [RuntimeError] when no matched stencil declares `_field`
  def get_attribute(_field)
    owner = matched_stencils.find { |stencil| stencil.fields.include? _field }
    raise "Unknown field #{_field}" if owner.nil?

    owner.public_send(_field)
  end

  private

  # Stencils that actually matched. Unmatched faces are stored as nil and
  # would otherwise make every query fail with NoMethodError.
  def matched_stencils
    @stencils.values.compact
  end

  # Returns the matched stencils after checking that each declares `_field`.
  # The field is only ever sent to stencils that list it in `fields`.
  def stencils_declaring!(_field)
    matched_stencils.each do |stencil|
      raise "#{self.class.name} has no field #{_field}" unless stencil.fields.include? _field
    end
  end
end
@@ -0,0 +1,101 @@
1
+ require 'google/cloud/vision'
2
+ require 'faraday_middleware'
3
+
4
# Helpers for obtaining the word layout of an image through the Google Cloud
# Vision `images:annotate` endpoint.
module VisionUtils
  extend self

  # Requests DOCUMENT_TEXT_DETECTION for an image and returns its words.
  #
  # @param _url     [String] image location; see #build_image_source for how it is sent
  # @param _api_key [String] API key, sent as the `key` query parameter
  # @param timeout  [Numeric] applied as both request and open timeout
  # @return [Array<Array>] one [text, bounding_box, confidence] triple per word
  # @raise [RuntimeError] when the service answers with a status other than 200/201
  def get_word_layout(_url, _api_key, timeout: 30)
    connection = conn(timeout)
    payload = {
      'requests' => [
        {
          'image' => build_image_source(_url),
          'features' => [
            { 'type' => 'DOCUMENT_TEXT_DETECTION' }
          ]
        }
      ]
    }

    response = connection.post("/v1/images:annotate?key=#{_api_key}", payload)
    extract_word_layout(handle_response(response))
  end

  private

  # Local images are embedded as base64 content; anything else is sent as an
  # `imageUri` source.
  def build_image_source(_url)
    uri = URI(_url)
    return { 'source' => { 'imageUri': _url } } unless uri_is_local(uri)

    # NOTE(review): only HTTP(S)/FTP URIs gain a public #open (through
    # open-uri); other schemes are also classified as local but will likely
    # raise here - confirm how local files are expected to be passed in.
    #
    # pack('m0') yields base64 without the line feeds Base64.encode64 inserts
    # every 60 characters, and needs no extra library.
    { 'content' => [uri.open(&:read)].pack('m0') }
  end

  # A URI is local when it is not http/https, or when it points at
  # localhost / 127.0.0.1.
  def uri_is_local(_uri)
    return true if _uri.scheme != 'http' && _uri.scheme != 'https'

    _uri.host == 'localhost' || _uri.host == '127.0.0.1'
  end

  # Flattens the full text annotation of the first response into word triples,
  # keeping only blocks of type TEXT. Words without a complete bounding box are
  # dropped.
  #
  # @return [Array<Array>] empty when there is no response or no text annotation
  def extract_word_layout(_data)
    first_response = (_data['responses'] || []).first
    image_text = first_response && first_response['fullTextAnnotation']
    return [] if image_text.nil?

    text_blocks = image_text['pages']
                  .flat_map { |page| page['blocks'] }
                  .select { |block| block['blockType'] == 'TEXT' }

    text_blocks
      .flat_map { |block| block['paragraphs'] }
      .flat_map { |para| para['words'] }
      .map { |w| extract_word(w) }
      .compact
  end

  # @return [Array<Array>, nil] [x, y] pairs, or nil as soon as any vertex
  #   lacks one of its coordinates
  def extract_bounding_box(_vertices)
    return nil if _vertices.any? { |vertex| vertex['x'].nil? || vertex['y'].nil? }

    _vertices.map { |vertex| [vertex['x'], vertex['y']] }
  end

  # Builds the [text, bounding_box, confidence] triple for a raw word. The text
  # is the concatenation of its symbols and the confidence is the lowest symbol
  # confidence.
  #
  # @return [Array, nil] nil when the bounding box is incomplete
  def extract_word(_raw_word)
    bounding_box = extract_bounding_box(_raw_word['boundingBox']['vertices'])
    return nil if bounding_box.nil?

    symbols = _raw_word['symbols']
    wtext = symbols.map { |sym| sym['text'] }.join
    confidence = symbols.map { |sym| sym['confidence'].to_f }.min

    [wtext, bounding_box, confidence]
  end

  # @return the response body
  # @raise [RuntimeError] unless the status is 200 or 201
  def handle_response(_response)
    raise 'Service error' unless [200, 201].include?(_response.status)

    _response.body
  end

  # Connection to the Vision API, memoized per timeout so that a call asking
  # for a different timeout is not served by a connection configured with the
  # first timeout ever requested.
  def conn(_timeout)
    @conns ||= {}
    @conns[_timeout] ||= Faraday.new(url: "https://vision.googleapis.com") do |faraday|
      faraday.request :json
      faraday.response :json
      faraday.adapter :patron

      faraday.options.timeout = _timeout
      faraday.options.open_timeout = _timeout
    end
  end
end