shear 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ 2.4
@@ -0,0 +1,11 @@
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file.
3
+ This project adheres to [Semantic Versioning](http://semver.org/).
4
+
5
+ ### v0.1.0
6
+
7
+ * Initial release.
8
+
9
+ ### v0.1.1
10
+
11
+ * Fix runtime dependencies.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in shear.gemspec
4
+ gemspec
@@ -0,0 +1,173 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ shear (0.1.1)
5
+ activesupport (>= 4.2)
6
+ faraday (~> 0.17)
7
+ faraday_middleware (~> 0.14)
8
+ google-cloud-vision (~> 1.0)
9
+ patron (~> 0.6)
10
+ rake (>= 10, < 14)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ activesupport (5.2.4.3)
16
+ concurrent-ruby (~> 1.0, >= 1.0.2)
17
+ i18n (>= 0.7, < 2)
18
+ minitest (~> 5.1)
19
+ tzinfo (~> 1.1)
20
+ addressable (2.7.0)
21
+ public_suffix (>= 2.0.2, < 5.0)
22
+ coderay (1.1.3)
23
+ concurrent-ruby (1.1.7)
24
+ coveralls (0.8.23)
25
+ json (>= 1.8, < 3)
26
+ simplecov (~> 0.16.1)
27
+ term-ansicolor (~> 1.3)
28
+ thor (>= 0.19.4, < 2.0)
29
+ tins (~> 1.6)
30
+ crack (0.4.3)
31
+ safe_yaml (~> 1.0.0)
32
+ diff-lcs (1.4.4)
33
+ docile (1.3.2)
34
+ faraday (0.17.3)
35
+ multipart-post (>= 1.2, < 3)
36
+ faraday_middleware (0.14.0)
37
+ faraday (>= 0.7.4, < 1.0)
38
+ ffi (1.13.1)
39
+ formatador (0.2.5)
40
+ gapic-common (0.3.4)
41
+ google-protobuf (~> 3.12, >= 3.12.2)
42
+ googleapis-common-protos (>= 1.3.9, < 2.0)
43
+ googleapis-common-protos-types (>= 1.0.4, < 2.0)
44
+ googleauth (~> 0.9)
45
+ grpc (~> 1.25)
46
+ google-cloud-core (1.5.0)
47
+ google-cloud-env (~> 1.0)
48
+ google-cloud-errors (~> 1.0)
49
+ google-cloud-env (1.3.3)
50
+ faraday (>= 0.17.3, < 2.0)
51
+ google-cloud-errors (1.0.1)
52
+ google-cloud-vision (1.0.0)
53
+ google-cloud-core (~> 1.5)
54
+ google-cloud-vision-v1 (~> 0.0)
55
+ google-cloud-vision-v1p3beta1 (~> 0.0)
56
+ google-cloud-vision-v1 (0.2.5)
57
+ gapic-common (~> 0.3)
58
+ google-cloud-errors (~> 1.0)
59
+ google-cloud-vision-v1p3beta1 (0.2.5)
60
+ gapic-common (~> 0.3)
61
+ google-cloud-errors (~> 1.0)
62
+ google-protobuf (3.12.4)
63
+ googleapis-common-protos (1.3.10)
64
+ google-protobuf (~> 3.11)
65
+ googleapis-common-protos-types (>= 1.0.5, < 2.0)
66
+ grpc (~> 1.27)
67
+ googleapis-common-protos-types (1.0.5)
68
+ google-protobuf (~> 3.11)
69
+ googleauth (0.13.1)
70
+ faraday (>= 0.17.3, < 2.0)
71
+ jwt (>= 1.4, < 3.0)
72
+ memoist (~> 0.16)
73
+ multi_json (~> 1.11)
74
+ os (>= 0.9, < 2.0)
75
+ signet (~> 0.14)
76
+ grpc (1.30.2)
77
+ google-protobuf (~> 3.12)
78
+ googleapis-common-protos-types (~> 1.0)
79
+ guard (2.16.2)
80
+ formatador (>= 0.2.4)
81
+ listen (>= 2.7, < 4.0)
82
+ lumberjack (>= 1.0.12, < 2.0)
83
+ nenv (~> 0.1)
84
+ notiffany (~> 0.0)
85
+ pry (>= 0.9.12)
86
+ shellany (~> 0.0)
87
+ thor (>= 0.18.1)
88
+ guard-compat (1.2.1)
89
+ guard-rspec (4.7.3)
90
+ guard (~> 2.1)
91
+ guard-compat (~> 1.1)
92
+ rspec (>= 2.99.0, < 4.0)
93
+ hashdiff (1.0.1)
94
+ i18n (1.8.5)
95
+ concurrent-ruby (~> 1.0)
96
+ json (2.3.1)
97
+ jwt (2.2.1)
98
+ listen (3.2.1)
99
+ rb-fsevent (~> 0.10, >= 0.10.3)
100
+ rb-inotify (~> 0.9, >= 0.9.10)
101
+ lumberjack (1.2.7)
102
+ memoist (0.16.2)
103
+ method_source (1.0.0)
104
+ minitest (5.14.1)
105
+ multi_json (1.15.0)
106
+ multipart-post (2.1.1)
107
+ nenv (0.3.0)
108
+ notiffany (0.1.3)
109
+ nenv (~> 0.1)
110
+ shellany (~> 0.0)
111
+ os (1.1.1)
112
+ patron (0.13.3)
113
+ pry (0.13.1)
114
+ coderay (~> 1.1)
115
+ method_source (~> 1.0)
116
+ public_suffix (4.0.5)
117
+ rake (13.0.1)
118
+ rb-fsevent (0.10.4)
119
+ rb-inotify (0.10.1)
120
+ ffi (~> 1.0)
121
+ rspec (3.9.0)
122
+ rspec-core (~> 3.9.0)
123
+ rspec-expectations (~> 3.9.0)
124
+ rspec-mocks (~> 3.9.0)
125
+ rspec-core (3.9.2)
126
+ rspec-support (~> 3.9.3)
127
+ rspec-expectations (3.9.2)
128
+ diff-lcs (>= 1.2.0, < 2.0)
129
+ rspec-support (~> 3.9.0)
130
+ rspec-mocks (3.9.1)
131
+ diff-lcs (>= 1.2.0, < 2.0)
132
+ rspec-support (~> 3.9.0)
133
+ rspec-support (3.9.3)
134
+ safe_yaml (1.0.5)
135
+ shellany (0.0.1)
136
+ signet (0.14.0)
137
+ addressable (~> 2.3)
138
+ faraday (>= 0.17.3, < 2.0)
139
+ jwt (>= 1.5, < 3.0)
140
+ multi_json (~> 1.10)
141
+ simplecov (0.16.1)
142
+ docile (~> 1.1)
143
+ json (>= 1.8, < 3)
144
+ simplecov-html (~> 0.10.0)
145
+ simplecov-html (0.10.2)
146
+ sync (0.5.0)
147
+ term-ansicolor (1.7.1)
148
+ tins (~> 1.0)
149
+ thor (1.0.1)
150
+ thread_safe (0.3.6)
151
+ tins (1.25.0)
152
+ sync
153
+ tzinfo (1.2.7)
154
+ thread_safe (~> 0.1)
155
+ webmock (3.8.3)
156
+ addressable (>= 2.3.6)
157
+ crack (>= 0.3.2)
158
+ hashdiff (>= 0.4.0, < 2.0.0)
159
+
160
+ PLATFORMS
161
+ ruby
162
+
163
+ DEPENDENCIES
164
+ bundler (~> 2.1)
165
+ coveralls (~> 0.8)
166
+ guard-rspec (~> 4.7)
167
+ pry (~> 0.13)
168
+ rspec (~> 3.4)
169
+ shear!
170
+ webmock (~> 3.8)
171
+
172
+ BUNDLED WITH
173
+ 2.1.4
@@ -0,0 +1,5 @@
1
# Guard configuration: runs specs through the `rspec` command when watched
# files change.
guard :rspec, cmd: 'rspec' do
  # A changed spec file is selected as-is.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed file under lib/ selects the spec at the same relative path
  # under spec/lib/.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # A changed spec helper selects the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2020 Buda.com SpA
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,45 @@
1
+ # Shear
2
+
3
+ A tool for extracting data from an image of a structured document.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ $ gem install shear
9
+ ```
10
+
11
+ Or add to your Gemfile:
12
+
13
+ ```ruby
14
+ gem "shear"
15
+ ```
16
+
17
+ ```bash
18
+ bundle install
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ You can check the [wiki](https://github.com/budacom/shear/wiki) to learn how to use Shear.
24
+
25
+ ## Demo
26
+
27
+ Check the [official shear-demo](https://shear-demo.herokuapp.com/) for a live example of shear. Also check the demo's code on the [shear-demo repository](https://github.com/budacom/shear-demo).
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create new Pull Request
36
+
37
+ ## Credits
38
+
39
+ This gem was created by [Antonio López](https://github.com/alopez7) and [Buda.com SpA](https://www.buda.com/).
40
+
41
+ Shear is maintained by [Buda.com SpA](https://www.buda.com/).
42
+
43
+ ## License
44
+
45
+ Shear is free software and may be redistributed under the terms specified in the LICENSE file.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and opens an IRB session.
require "bundler/setup"
require "shear"
require "irb"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# A different console can be used instead of IRB. For Pry, add pry to the
# Gemfile and replace the IRB lines with:
#   require "pry"
#   Pry.start

IRB.start
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap: installs the gem's dependencies with Bundler.

# Abort on a failing command, on expansion of an unset variable, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Split words on newlines and tabs only.
IFS=$'\n\t'

# Print each script line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,13 @@
1
+ require 'shear/bounding_box_utils'
2
+ require 'shear/template'
3
+ require 'shear/template_match'
4
+ require "shear/version"
5
+ require 'shear/word_collection'
6
+ require 'utils/vision_utils'
7
+ require 'stencils/base_stencil'
8
+ require 'stencils/example_back_stencil'
9
+ require 'stencils/example_front_stencil'
10
+ require 'stencils/stencil_group'
11
+
12
# Top-level namespace of the gem. It is intentionally empty in this entry
# point: requiring this file loads the library files listed in the `require`
# statements, which reopen the namespace (e.g. shear/template).
+ module Shear
13
+ end
@@ -0,0 +1,64 @@
1
module Shear
  # Geometry helpers for testing whether a polygon overlaps an axis-aligned
  # bounding box (AABB).
  #
  # Conventions used by every method:
  # * a point/vertex is a two-element array `[x, y]`;
  # * an edge is a two-element array of points `[[x1, y1], [x2, y2]]`;
  # * an AABB is a hash with `:min` and `:max` keys holding its minimum and
  #   maximum corner points;
  # * a box is an array of vertices listed in perimeter order.
  module BoundingBoxUtils
    extend self

    # Tells whether a vertex lies inside an AABB. Points on the border count
    # as inside.
    #
    # @param aabb [Hash] AABB with `:min` and `:max` corner points
    # @param vertex [Array] point `[x, y]`
    # @return [Boolean]
    def vertex_in_aabb?(aabb, vertex)
      lower = aabb[:min]
      upper = aabb[:max]

      return false if vertex[0] < lower[0] || vertex[0] > upper[0]
      return false if vertex[1] < lower[1] || vertex[1] > upper[1]

      true
    end

    # Tells whether two line segments intersect (touching end points count).
    #
    # Solves `B + alpha * (A - B) = D + beta * (C - D)` for the segments AB
    # (edge1) and CD (edge2); the segments touch when both parameters fall
    # within 0..1. Parallel segments, including collinear overlapping ones,
    # are reported as not touching.
    #
    # @param edge1 [Array] segment `[[ax, ay], [bx, by]]`
    # @param edge2 [Array] segment `[[cx, cy], [dx, dy]]`
    # @return [Boolean]
    def edges_touch?(edge1, edge2) # rubocop:disable AbcSize, MethodLength
      ax, ay = edge1[0][0].to_f, edge1[0][1].to_f
      bx, by = edge1[1][0].to_f, edge1[1][1].to_f
      cx, cy = edge2[0][0].to_f, edge2[0][1].to_f
      dx, dy = edge2[1][0].to_f, edge2[1][1].to_f

      # A zero determinant means the segments are parallel (or degenerate).
      determinant = (cx - dx) * (ay - by) - (cy - dy) * (ax - bx)
      return false if determinant.zero?

      alpha = ((by - dy) * (cx - dx) - (bx - dx) * (cy - dy)) / -determinant
      beta = ((dy - by) * (ax - bx) - (dx - bx) * (ay - by)) / determinant

      return false if alpha.negative? || alpha > 1
      return false if beta.negative? || beta > 1

      true
    end

    # Tells whether a polygon overlaps an AABB.
    #
    # The polygon collides when one of its vertices is inside the AABB or when
    # one of its edges touches an edge of the AABB. The case of the AABB lying
    # entirely inside the polygon is deliberately ignored because it is highly
    # improbable.
    #
    # The polygon may have any number of vertices; the last vertex is joined
    # back to the first one.
    #
    # @param aabb [Hash] AABB with `:min` and `:max` corner points
    # @param box [Array<Array>] polygon vertices in perimeter order
    # @return [Boolean]
    def collides?(aabb, box)
      return true if box.any? { |vertex| vertex_in_aabb?(aabb, vertex) }

      # The AABB edges do not depend on the polygon edge being tested, so they
      # are built only once.
      borders = aabb_edges(aabb)

      box.each_index.any? do |index|
        # Wrap around using the real vertex count so the closing edge joins
        # the last vertex to the first one for any polygon size.
        box_edge = [box[index], box[(index + 1) % box.length]]
        borders.any? { |border| edges_touch?(box_edge, border) }
      end
    end

    private

    # Builds the four edges of an AABB from its `:min` and `:max` corners.
    def aabb_edges(aabb)
      lower = aabb[:min]
      upper = aabb[:max]

      [
        [lower, [lower[0], upper[1]]],
        [lower, [upper[0], lower[1]]],
        [upper, [upper[0], lower[1]]],
        [upper, [lower[0], upper[1]]]
      ]
    end
  end
end
@@ -0,0 +1,187 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext'
3
+
4
module Shear
  # Describes the expected layout of a document as a list of fixtures and
  # matches it against a collection of detected words.
  #
  # A fixture is stored as `[word, at, label, filter]`:
  # * `word`   - text expected in the document;
  # * `at`     - expected `[x, y]` position of that word in template space;
  # * `label`  - optional key under which the matched position is reported;
  # * `filter` - optional behaviour: `'discard'`, `'unique'`, `'big'` or
  #   `'confidence'` (see #match).
  #
  # Words are compared after `I18n.transliterate(...).upcase`, against the
  # `:tl_word` entry of each word hash.
  class Template
    # Creates a template, yields it for configuration and seals it.
    #
    # @yieldparam template [Template] the template being built
    # @return [Template] the sealed template
    def self.build
      template = new
      yield template
      template.seal
      template
    end

    attr_reader :sealed, :fixtures, :exclusions

    def initialize
      @sealed = false
      @fixtures = []
      @exclusions = {}
    end

    # Registers a fixture.
    #
    # @param _word [String] expected word
    # @param at [Array, nil] expected `[x, y]` position
    # @param label [Object, nil] key used to report the matched position
    # @param filter [String, nil] fixture behaviour
    # @raise [RuntimeError] when the template is already sealed
    def set(_word, at: nil, label: nil, filter: nil)
      raise 'template sealed' if @sealed

      @fixtures << [_word, at, label, filter]
    end

    # Registers a word to exclude for a field. The word is stored
    # transliterated and upcased.
    #
    # @raise [RuntimeError] when the template is already sealed
    def set_exclusion(_field_name, _excluded_word)
      raise 'template sealed' if @sealed

      (@exclusions[_field_name] ||= Set[]) << I18n.transliterate(_excluded_word).upcase
    end

    # Returns the excluded words of a field (an empty set when none).
    def get_exclusions(_field_name)
      @exclusions.fetch(_field_name) { Set[] }
    end

    # Prevents further changes through #set and #set_exclusion.
    def seal
      @sealed = true
    end

    # Matches the template against a word collection.
    #
    # The collection is first filtered: exact duplicates are dropped, and for
    # `'big'` / `'confidence'` fixtures only the best occurrence of each word
    # is kept. The template is then rejected (nil) when a `'discard'` fixture
    # word is present or a `'unique'` fixture word appears more than once.
    #
    # @param _word_collection [WordCollection]
    # @return [TemplateMatch, nil] the lowest-error match, or nil
    # @raise [RuntimeError] when the three words used to compute the
    #   transformation are collinear
    def match(_word_collection)
      filtered_words_collection = filter_words(_word_collection)
      return nil if should_discard_stencil?(filtered_words_collection.words)

      recursive_match(filtered_words_collection, [], 0)
    end

    private

    # Transliterated, upcased words of the fixtures declared with a filter.
    def transliterated_fixture_words(_filter)
      @fixtures
        .select { |fixture| fixture[3] == _filter }
        .map { |fixture| I18n.transliterate(fixture[0]).upcase }
    end

    def should_discard_stencil?(_words)
      should_discard_stencil_by_discard_fixture?(_words) ||
        should_discard_stencil_by_unique_fixture?(_words)
    end

    # True when any word equals a 'discard' fixture word.
    def should_discard_stencil_by_discard_fixture?(_words)
      discard_fixtures_tl = transliterated_fixture_words('discard')
      _words.any? { |w| discard_fixtures_tl.include? w[:tl_word] }
    end

    # True when a 'unique' fixture word appears more than once.
    def should_discard_stencil_by_unique_fixture?(_words)
      unique_fixtures_tl = transliterated_fixture_words('unique')
      unique_fixture_words =
        _words.map { |w| w[:tl_word] }.select { |tl_w| unique_fixtures_tl.include? tl_w }

      unique_fixture_words.uniq.length != unique_fixture_words.length
    end

    def filter_words(_word_collection)
      filtered_words = reject_words_with_duplicate_bounding_box(_word_collection.words)
      filtered_words = select_big_words_with_larger_bounding_box(filtered_words)
      filtered_words = select_words_with_high_confidence(filtered_words)
      WordCollection.new(filtered_words)
    end

    # Drops words repeating both the bounding box and the text of another.
    def reject_words_with_duplicate_bounding_box(_words)
      _words.uniq { |w| [w[:original_bounding_box], w[:tl_word]] }
    end

    # For 'big' fixtures keeps, per word, the occurrence whose first bounding
    # box edge (between its first two points) is the longest.
    def select_big_words_with_larger_bounding_box(_words)
      keep_top_ranked_fixture_words(_words, 'big') do |w|
        distance(w[:original_bounding_box][0], w[:original_bounding_box][1])
      end
    end

    # For 'confidence' fixtures keeps, per word, the occurrence with the
    # highest `:conf` value.
    def select_words_with_high_confidence(_words)
      keep_top_ranked_fixture_words(_words, 'confidence') { |w| w[:conf] }
    end

    # Keeps only the highest-ranked occurrence of each word that belongs to a
    # fixture with the given filter; every other word is left untouched. The
    # rank of a word is the value returned by the block.
    def keep_top_ranked_fixture_words(_words, _filter, &_rank)
      fixture_words_tl = transliterated_fixture_words(_filter)
      candidates = _words.select { |w| fixture_words_tl.include? w[:tl_word] }
      others = _words - candidates
      best = candidates.sort_by(&_rank).reverse.uniq { |w| w[:tl_word] }

      others + best
    end

    # Tries every assignment of found words to fixtures, starting at
    # `_fixture_index`, and returns the match with the lowest error.
    #
    # `_result` accumulates `[word_index, fixture_index]` pairs.
    # NOTE(review): `_words.search` is assumed to return the indexes of the
    # words matching the fixture text - confirm against WordCollection.
    def recursive_match(_words, _result, _fixture_index) # rubocop:disable all
      word_indexes = []
      while word_indexes.empty?
        return calculate_match(_words, _result) if _fixture_index == @fixtures.length

        word, _, _, word_filter = @fixtures[_fixture_index]
        # 'discard' fixtures only veto a template; they never take part in the
        # positional match. Their search hits must be dropped, otherwise they
        # would be paired with the index of the next fixture.
        word_indexes = word_filter == 'discard' ? [] : _words.search(word)
        _fixture_index += 1 if word_indexes.empty?
      end

      word_indexes.inject(nil) do |best_match, word_index|
        candidate = recursive_match(
          _words, _result + [[word_index, _fixture_index]], _fixture_index + 1
        )

        if candidate.nil?
          best_match
        elsif best_match.nil? || candidate.error < best_match.error
          candidate
        else
          best_match
        end
      end
    end

    # Builds a TemplateMatch from `[word_index, fixture_index]` pairs.
    #
    # The first three pairs define the transformation matrix T that solves
    # W * T = F (word locations to fixture locations, in homogeneous
    # coordinates). The remaining pairs are used to measure the error, so at
    # least four pairs are required; nil is returned otherwise.
    def calculate_match(_words, _result) # rubocop:disable AbcSize, MethodLength
      return nil if _result.length <= 3

      anchors = _result.first(3)
      matrix_w = Matrix[*anchors.map { |word_index, _| _words.original_location(word_index) + [1] }]
      matrix_f = Matrix[*anchors.map { |_, fixture_index| @fixtures[fixture_index][1] + [1] }]

      # NOTE(review): this aborts the whole search even though another
      # combination of words could still produce a valid match.
      raise "Found locations are collinear" if matrix_w.singular?

      transform = (matrix_w.inverse * matrix_f).transpose

      # transform collection
      norm_words = _words.clone.transform! transform

      # calculate error mean(distance_from_fixture ^ 2)
      errors = _result[3..-1].map do |word_index, fixture_index|
        distance(norm_words.location(word_index), @fixtures[fixture_index][1])
      end

      TemplateMatch.new(
        load_labeled_points(_result, norm_words), norm_words, mean_sq_error(errors), transform
      )
    end

    # Maps each fixture label to the location of the word matched to it.
    # Fixtures without a label are skipped.
    def load_labeled_points(_result, _words)
      _result.each_with_object({}) do |(word_index, fixture_index), labeled_points|
        fixture_label = @fixtures[fixture_index][2]
        labeled_points[fixture_label] = _words.location(word_index) if fixture_label
      end
    end

    # Euclidean distance between two points.
    def distance(_pt1, _pt2)
      Math.sqrt((_pt2[0].to_d - _pt1[0])**2 + (_pt2[1].to_d - _pt1[1])**2)
    end

    def mean_sq_error(_errors)
      _errors.sum { |e| e * e } / _errors.count
    end
  end
end