sqed 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sqed.rb +11 -5
- data/lib/sqed/boundaries.rb +14 -0
- data/lib/sqed/boundary_finder.rb +65 -4
- data/lib/sqed/boundary_finder/color_line_finder.rb +28 -15
- data/lib/sqed/boundary_finder/cross_finder.rb +3 -3
- data/lib/sqed/boundary_finder/stage_finder.rb +21 -18
- data/lib/sqed/extractor.rb +2 -1
- data/lib/sqed/parser.rb +2 -1
- data/lib/sqed/parser/barcode_parser.rb +1 -1
- data/lib/sqed/parser/ocr_parser.rb +86 -49
- data/lib/sqed/version.rb +1 -1
- data/spec/lib/sqed/boundary_finder/color_line_finder_spec.rb +6 -6
- data/spec/lib/sqed/boundary_finder/cross_finder_spec.rb +2 -1
- data/spec/lib/sqed/boundary_finder_spec.rb +8 -2
- data/spec/lib/sqed_spec.rb +10 -6
- data/sqed.gemspec +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd0958fc2efbed976b77385f511ce9025f26b5b2
|
4
|
+
data.tar.gz: 2f3efab45b172677057170dfdbdee8366372fac0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04c09b12b5212a5b7c6cf356caa6e04ef4c422b5e83f14c652996089b9f30c016102572cc2963c2710d56135d7411e2778303034de9b8a44597d4feee7796b43
|
7
|
+
data.tar.gz: 14a8a7cd8bd19a6c3c00e3105b82b513f119bca6f3febac4751e9d1714e2ff08d4aaa3a5bbd3ead4f349d56078830aba02272bc3231424cea3877064b5b9f5b4
|
data/lib/sqed.rb
CHANGED
@@ -41,7 +41,10 @@ class Sqed
|
|
41
41
|
# a symbol, :red, :green, :blue, describing the boundary color within the stage
|
42
42
|
attr_accessor :boundary_color
|
43
43
|
|
44
|
-
|
44
|
+
# Boolean, whether to do the boundary detection (not stage detection at present) against a thumbnail version of the passed image (faster, less accurate, true by default)
|
45
|
+
attr_accessor :use_thumbnail
|
46
|
+
|
47
|
+
def initialize(image: image, pattern: pattern, has_border: true, boundary_color: :green, use_thumbnail: true)
|
45
48
|
raise 'extraction pattern not defined' if pattern && !SqedConfig::EXTRACTION_PATTERNS.keys.include?(pattern)
|
46
49
|
|
47
50
|
@image = image
|
@@ -51,7 +54,7 @@ class Sqed
|
|
51
54
|
@pattern = pattern
|
52
55
|
@pattern ||= :cross
|
53
56
|
@boundary_color = boundary_color
|
54
|
-
|
57
|
+
@use_thumbnail = use_thumbnail
|
55
58
|
set_stage_boundary if @image
|
56
59
|
end
|
57
60
|
|
@@ -108,6 +111,7 @@ class Sqed
|
|
108
111
|
extractor.result
|
109
112
|
end
|
110
113
|
|
114
|
+
# Debugging purposes
|
111
115
|
def attributes
|
112
116
|
{
|
113
117
|
image: @image,
|
@@ -115,7 +119,8 @@ class Sqed
|
|
115
119
|
stage_boundary: stage_boundary,
|
116
120
|
has_border: @has_border,
|
117
121
|
pattern: @pattern,
|
118
|
-
boundary_color: @boundary_color
|
122
|
+
boundary_color: @boundary_color,
|
123
|
+
use_thumbnail: @use_thumbnail
|
119
124
|
}
|
120
125
|
end
|
121
126
|
|
@@ -138,11 +143,12 @@ class Sqed
|
|
138
143
|
def get_section_boundaries
|
139
144
|
boundary_finder_class = SqedConfig::EXTRACTION_PATTERNS[@pattern][:boundary_finder]
|
140
145
|
|
141
|
-
options = {image: stage_image}
|
146
|
+
options = {image: stage_image, use_thumbnail: use_thumbnail}
|
142
147
|
options.merge!( layout: SqedConfig::EXTRACTION_PATTERNS[@pattern][:layout] ) unless boundary_finder_class.name == 'Sqed::BoundaryFinder::CrossFinder'
|
143
|
-
options.merge!( boundary_color: @boundary_color) if
|
148
|
+
options.merge!( boundary_color: @boundary_color) if boundary_finder_class.name == 'Sqed::BoundaryFinder::ColorLineFinder'
|
144
149
|
|
145
150
|
boundary_finder_class.new(options).boundaries
|
151
|
+
|
146
152
|
end
|
147
153
|
|
148
154
|
end
|
data/lib/sqed/boundaries.rb
CHANGED
@@ -91,4 +91,18 @@ class Sqed::Boundaries
|
|
91
91
|
end
|
92
92
|
true
|
93
93
|
end
|
94
|
+
|
95
|
+
def zoom(width_factor, height_factor)
|
96
|
+
coordinates.keys.each do |i|
|
97
|
+
set(i, [
|
98
|
+
(x_for(i).to_f * width_factor).to_i,
|
99
|
+
(y_for(i).to_f * height_factor).to_i,
|
100
|
+
(width_for(i).to_f * width_factor).to_i,
|
101
|
+
(height_for(i).to_f * height_factor).to_i
|
102
|
+
])
|
103
|
+
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
|
94
108
|
end
|
data/lib/sqed/boundary_finder.rb
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
# return derivative images. Finders operate on cropped images, i.e. only the "stage".
|
3
3
|
#
|
4
4
|
class Sqed::BoundaryFinder
|
5
|
+
|
6
|
+
THUMB_SIZE = 100
|
7
|
+
|
5
8
|
# the passed image
|
6
9
|
attr_reader :img
|
7
10
|
|
@@ -11,10 +14,18 @@ class Sqed::BoundaryFinder
|
|
11
14
|
# A Sqed::Boundaries instance, stores the coordinates of all of the layout sections
|
12
15
|
attr_reader :boundaries
|
13
16
|
|
14
|
-
|
17
|
+
# Whether to compress the original image to a thumbnail when finding boundaries
|
18
|
+
attr_reader :use_thumbnail
|
19
|
+
|
20
|
+
# when we compute using a derived thumbnail we temporarily store the full size image here
|
21
|
+
attr_reader :original_image
|
22
|
+
|
23
|
+
def initialize(image: image, layout: layout, use_thumbnail: true)
|
15
24
|
raise 'No layout provided.' if layout.nil?
|
16
25
|
raise 'No image provided.' if image.nil? || image.class.name != 'Magick::Image'
|
17
26
|
|
27
|
+
@use_thumbnail = use_thumbnail
|
28
|
+
|
18
29
|
@layout = layout
|
19
30
|
@img = image
|
20
31
|
true
|
@@ -25,12 +36,46 @@ class Sqed::BoundaryFinder
|
|
25
36
|
@boundaries ||= Sqed::Boundaries.new(@layout)
|
26
37
|
end
|
27
38
|
|
39
|
+
def longest_thumbnail_axis
|
40
|
+
img.columns > img.rows ? :width : :height
|
41
|
+
end
|
42
|
+
|
43
|
+
def thumbnail_height
|
44
|
+
if longest_thumbnail_axis == :height
|
45
|
+
THUMB_SIZE
|
46
|
+
else
|
47
|
+
(img.rows.to_f * (THUMB_SIZE.to_f / img.columns.to_f)).round.to_i
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def thumbnail_width
|
52
|
+
if longest_thumbnail_axis == :width
|
53
|
+
THUMB_SIZE
|
54
|
+
else
|
55
|
+
(img.columns.to_f * (THUMB_SIZE.to_f / img.rows.to_f)).round.to_i
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# see https://rmagick.github.io/image3.html#thumbnail
|
60
|
+
def thumbnail
|
61
|
+
img.thumbnail(thumbnail_width, thumbnail_height)
|
62
|
+
end
|
63
|
+
|
64
|
+
def width_factor
|
65
|
+
img.columns.to_f / thumbnail_width.to_f
|
66
|
+
end
|
67
|
+
|
68
|
+
def height_factor
|
69
|
+
img.rows.to_f / thumbnail_height.to_f
|
70
|
+
end
|
71
|
+
|
72
|
+
def zoom_boundaries
|
73
|
+
boundaries.zoom(width_factor, height_factor )
|
74
|
+
end
|
28
75
|
|
29
76
|
# return [Integer, nil]
|
30
77
|
# sample more with small images, less with large images
|
31
78
|
# we want to return larger numbers (= faster sampling)
|
32
|
-
#
|
33
|
-
#
|
34
79
|
def self.get_subdivision_size(image_width)
|
35
80
|
case image_width
|
36
81
|
when nil
|
@@ -69,6 +114,7 @@ class Sqed::BoundaryFinder
|
|
69
114
|
# (:rows|:columns), :rows finds vertical borders, :columns finds horizontal borders
|
70
115
|
#
|
71
116
|
def self.color_boundary_finder(image: image, sample_subdivision_size: nil, sample_cutoff_factor: nil, scan: :rows, boundary_color: :green)
|
117
|
+
|
72
118
|
image_width = image.send(scan)
|
73
119
|
sample_subdivision_size = get_subdivision_size(image_width) if sample_subdivision_size.nil?
|
74
120
|
samples_to_take = (image_width / sample_subdivision_size).to_i - 1
|
@@ -105,6 +151,8 @@ class Sqed::BoundaryFinder
|
|
105
151
|
|
106
152
|
if sample_cutoff_factor.nil?
|
107
153
|
cutoff = max_difference(border_hits.values)
|
154
|
+
|
155
|
+
cutoff = border_hits.values.first - 1 if cutoff == 0 # difference of two identical things is 0
|
108
156
|
else
|
109
157
|
cutoff = (samples_to_take * sample_cutoff_factor).to_i
|
110
158
|
end
|
@@ -132,6 +180,7 @@ class Sqed::BoundaryFinder
|
|
132
180
|
# return [Array]
|
133
181
|
# the median position of all (pixel) positions that have a count greater than the cutoff
|
134
182
|
def self.frequency_stats(frequency_hash, sample_cutoff = 0)
|
183
|
+
|
135
184
|
return nil if sample_cutoff.nil? || sample_cutoff < 1
|
136
185
|
hit_ranges = []
|
137
186
|
|
@@ -141,7 +190,18 @@ class Sqed::BoundaryFinder
|
|
141
190
|
end
|
142
191
|
end
|
143
192
|
|
144
|
-
|
193
|
+
case hit_ranges.size
|
194
|
+
when 1
|
195
|
+
c = hit_ranges[0]
|
196
|
+
hit_ranges = [c - 1, c, c + 1]
|
197
|
+
when 2
|
198
|
+
hit_ranges.sort!
|
199
|
+
c1 = hit_ranges[0]
|
200
|
+
c2 = hit_ranges[1]
|
201
|
+
hit_ranges = [c1, c2, c2 + (c2 - c1)]
|
202
|
+
when 0
|
203
|
+
return nil
|
204
|
+
end
|
145
205
|
|
146
206
|
# we have to sort because the keys (positions) we examined came unordered from a hash originally
|
147
207
|
hit_ranges.sort!
|
@@ -174,3 +234,4 @@ class Sqed::BoundaryFinder
|
|
174
234
|
end
|
175
235
|
|
176
236
|
end
|
237
|
+
|
@@ -4,10 +4,15 @@ require 'rmagick'
|
|
4
4
|
#
|
5
5
|
class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
|
6
6
|
|
7
|
-
def initialize(image: image, layout: layout, boundary_color: :green)
|
8
|
-
super(image: image, layout: layout)
|
7
|
+
def initialize(image: image, layout: layout, boundary_color: :green, use_thumbnail: true)
|
8
|
+
super(image: image, layout: layout, use_thumbnail: use_thumbnail)
|
9
9
|
raise 'No layout provided.' if @layout.nil?
|
10
10
|
@boundary_color = boundary_color
|
11
|
+
|
12
|
+
if use_thumbnail
|
13
|
+
@original_image = @img.copy
|
14
|
+
@img = thumbnail
|
15
|
+
end
|
11
16
|
find_bands
|
12
17
|
end
|
13
18
|
|
@@ -25,26 +30,28 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
|
|
25
30
|
when :horizontal_split
|
26
31
|
t = Sqed::BoundaryFinder.color_boundary_finder(image: img, scan: :columns, boundary_color: @boundary_color) # set to detect horizontal division, (green line)
|
27
32
|
return if t.nil?
|
33
|
+
|
28
34
|
boundaries.set(0, [0, 0, img.columns, t[0]]) # upper section of image
|
29
35
|
boundaries.set(1, [0, t[2], img.columns, img.rows - t[2]]) # lower section of image
|
30
36
|
|
31
37
|
when :right_t # only 3 zones expected, with horizontal division in right-side of vertical division
|
32
|
-
vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
|
33
|
-
irt = img.crop(*vertical.for(1), true)
|
34
|
-
right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
|
38
|
+
vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
35
39
|
|
36
|
-
|
40
|
+
irt = img.crop(*vertical.for(1), true)
|
41
|
+
right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
42
|
+
|
43
|
+
boundaries.set(0, vertical.for(0))
|
37
44
|
boundaries.set(1, [ vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ] )
|
38
45
|
boundaries.set(2, [ vertical.x_for(1), right.y_for(1), right.width_for(1), right.height_for(1)] )
|
39
46
|
|
40
47
|
when :vertical_offset_cross # 4 zones expected, with (varying) horizontal division in left- and right- sides of vertical division
|
41
|
-
vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
|
42
|
-
|
48
|
+
vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
|
49
|
+
|
43
50
|
ilt = img.crop(*vertical.for(0), true)
|
44
51
|
irt = img.crop(*vertical.for(1), true)
|
45
52
|
|
46
|
-
left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
|
47
|
-
right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
|
53
|
+
left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries # fails
|
54
|
+
right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries # OK
|
48
55
|
|
49
56
|
boundaries.set(0, [0, 0, left.width_for(0), left.height_for(0) ])
|
50
57
|
boundaries.set(1, [vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ])
|
@@ -53,13 +60,13 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
|
|
53
60
|
|
54
61
|
# No specs for this yet
|
55
62
|
when :horizontal_offset_cross
|
56
|
-
horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
|
63
|
+
horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
57
64
|
|
58
65
|
itop = img.crop(*horizontal.for(0), true)
|
59
66
|
ibottom = img.crop(*horizontal.for(1), true)
|
60
67
|
|
61
|
-
top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
|
62
|
-
bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
|
68
|
+
top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
69
|
+
bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
63
70
|
|
64
71
|
boundaries.set(0, [0, 0, top.width_for(0), top.height_for(0) ])
|
65
72
|
boundaries.set(1, [top.x_for(1), 0, top.width_for(1), top.height_for(1) ])
|
@@ -67,8 +74,8 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
|
|
67
74
|
boundaries.set(3, [0, horizontal.y_for(1), bottom.width_for(0), bottom.height_for(0) ])
|
68
75
|
|
69
76
|
when :cross # 4 zones, with perfectly intersected horizontal and vertical division
|
70
|
-
v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
|
71
|
-
h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
|
77
|
+
v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
|
78
|
+
h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
|
72
79
|
|
73
80
|
return if v.nil? || h.nil?
|
74
81
|
|
@@ -84,6 +91,12 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
|
|
84
91
|
|
85
92
|
boundaries.complete = true if boundaries.populated?
|
86
93
|
|
94
|
+
if use_thumbnail
|
95
|
+
@img = @original_image
|
96
|
+
zoom_boundaries
|
97
|
+
@original_image = nil
|
98
|
+
end
|
99
|
+
|
87
100
|
end
|
88
101
|
|
89
102
|
|
@@ -5,13 +5,13 @@ require 'rmagick'
|
|
5
5
|
class Sqed::BoundaryFinder::CrossFinder < Sqed::BoundaryFinder
|
6
6
|
|
7
7
|
def initialize(image: image)
|
8
|
-
@
|
8
|
+
@img = image
|
9
9
|
find_edges
|
10
10
|
end
|
11
11
|
|
12
12
|
def find_edges
|
13
|
-
width = @
|
14
|
-
height = @
|
13
|
+
width = @img.columns / 2
|
14
|
+
height = @img.rows / 2
|
15
15
|
|
16
16
|
boundaries.coordinates[0] = [0, 0, width, height]
|
17
17
|
boundaries.coordinates[1] = [width, 0, width, height]
|
@@ -3,7 +3,7 @@ require 'rmagick'
|
|
3
3
|
# Some of this code was originally inspired by Emmanuel Oga's gist https://gist.github.com/EmmanuelOga/2476153.
|
4
4
|
#
|
5
5
|
class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
6
|
-
|
6
|
+
|
7
7
|
# The proc containing the border finding algorithm
|
8
8
|
attr_reader :is_border
|
9
9
|
|
@@ -11,23 +11,24 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
11
11
|
|
12
12
|
# How small we accept a cropped picture to be. E.G. if it was 100x100 and
|
13
13
|
# ratio 0.1, min output should be 10x10
|
14
|
-
MIN_CROP_RATIO = 0.1
|
14
|
+
MIN_CROP_RATIO = 0.1
|
15
15
|
|
16
|
-
attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
|
16
|
+
attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
|
17
17
|
|
18
18
|
def initialize(image: image, is_border_proc: nil, min_ratio: MIN_CROP_RATIO)
|
19
|
-
super(image: image, layout: :internal_box)
|
19
|
+
super(image: image, layout: :internal_box)
|
20
20
|
|
21
|
-
@min_ratio =
|
21
|
+
@min_ratio = min_ratio
|
22
22
|
|
23
23
|
# Initial co-ordinates
|
24
24
|
@x0, @y0 = 0, 0
|
25
|
-
@x1, @y1 = img.columns, img.rows
|
25
|
+
@x1, @y1 = img.columns, img.rows
|
26
26
|
@min_width, @min_height = img.columns * @min_ratio, img.rows * @min_ratio # minimum resultant area
|
27
27
|
@columns, @rows = img.columns, img.rows
|
28
28
|
|
29
|
+
|
29
30
|
# We need a border finder proc. Provide one if none was given.
|
30
|
-
@is_border = is_border_proc || self.class.default_border_finder(img)
|
31
|
+
@is_border = is_border_proc || self.class.default_border_finder(img) # if no proc specified, use default below
|
31
32
|
|
32
33
|
@x00 = @x0
|
33
34
|
@y00 = @y0
|
@@ -51,8 +52,9 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
51
52
|
# works for 0.5, >0.137; 0.60, >0.14 0.65, >0.146; 0.70, >0.1875; 0.75, >0.1875; 0.8, >0.237; 0.85, >0.24; 0.90, >0.28; 0.95, >0.25
|
52
53
|
# fails for 0.75, (0.18, 0.17,0.16,0.15); 0.70, 0.18;
|
53
54
|
#
|
54
|
-
|
55
|
-
|
55
|
+
# this sets variables (locally) for find_edges
|
56
|
+
def self.default_border_finder(img, samples = 5, threshold = 0.75, fuzz_factor = 0.40) # working on non-synthetic images 04-dec-2014
|
57
|
+
fuzz = ((Magick::QuantumRange + 1) * fuzz_factor).to_i
|
56
58
|
# Returns true if the edge is a border (border meaning outer region to be cropped)
|
57
59
|
lambda do |edge|
|
58
60
|
border, non_border = 0.0, 0.0 # maybe should be called outer, inner
|
@@ -79,7 +81,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
79
81
|
# handle this exception
|
80
82
|
return unless is_border # return if no process defined or set for @is_border
|
81
83
|
|
82
|
-
u = x1 - 1
|
84
|
+
u = x1 - 1 # rightmost pixel (kind of)
|
83
85
|
# increment from left to right
|
84
86
|
x0.upto(u) do |x|
|
85
87
|
if width_croppable? && is_border[vline(x)] then
|
@@ -89,7 +91,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
89
91
|
end
|
90
92
|
end
|
91
93
|
# decrement from right to left
|
92
|
-
(u).downto(x0) { |x| width_croppable?
|
94
|
+
(u).downto(x0) { |x| width_croppable? && is_border[vline(x)] ? @x1 = x - 1 : break }
|
93
95
|
|
94
96
|
u = y1 - 1
|
95
97
|
0.upto(u) do |y|
|
@@ -104,10 +106,11 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
104
106
|
|
105
107
|
delta_x = 0 #width/50 # 2% of cropped image to make up for trapezoidal distortion
|
106
108
|
delta_y = 0 #height/50 # 2% of cropped image to make up for trapezoidal distortion <- NOT 3%
|
107
|
-
|
109
|
+
|
108
110
|
# TODO: add conditions
|
109
|
-
boundaries.complete = true
|
110
|
-
boundaries.
|
111
|
+
boundaries.complete = true
|
112
|
+
boundaries.set(0, [x0 + delta_x, y0 + delta_y, width - 2*delta_x, height - 2*delta_y])
|
113
|
+
|
111
114
|
end
|
112
115
|
|
113
116
|
def width_croppable?
|
@@ -127,13 +130,13 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
|
|
127
130
|
end
|
128
131
|
|
129
132
|
# actually + 1 (starting at zero?)
|
130
|
-
def width
|
131
|
-
@x1 - @x0
|
133
|
+
def width
|
134
|
+
@x1 - @x0
|
132
135
|
end
|
133
|
-
|
136
|
+
|
134
137
|
# actually + 1 (starting at zero?)
|
135
138
|
def height
|
136
|
-
@y1 - @y0
|
139
|
+
@y1 - @y0
|
137
140
|
end
|
138
141
|
|
139
142
|
end
|
data/lib/sqed/extractor.rb
CHANGED
@@ -42,10 +42,11 @@ class Sqed::Extractor
|
|
42
42
|
if parsers = SqedConfig::SECTION_PARSERS[section_type]
|
43
43
|
|
44
44
|
section_image = r.send("#{section_type}_image")
|
45
|
+
|
45
46
|
updated = r.send(section_type)
|
46
47
|
|
47
48
|
parsers.each do |p|
|
48
|
-
parsed_result = p.new(section_image).text
|
49
|
+
parsed_result = p.new(section_image).text(section_type: section_type)
|
49
50
|
updated.merge!(p::TYPE => parsed_result) if parsed_result
|
50
51
|
end
|
51
52
|
|
data/lib/sqed/parser.rb
CHANGED
@@ -2,7 +2,20 @@
|
|
2
2
|
#
|
3
3
|
# Given a single image return all text in that image.
|
4
4
|
#
|
5
|
-
# For
|
5
|
+
# For reference
|
6
|
+
# http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
|
7
|
+
# https://code.google.com/p/tesseract-ocr/wiki/FAQ
|
8
|
+
# http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
|
9
|
+
#
|
10
|
+
# "There is a minimum text size for reasonable accuracy.
|
11
|
+
# You have to consider resolution as well as point size.
|
12
|
+
# Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi.
|
13
|
+
# A quick check is to count the pixels of the x-height of your characters.
|
14
|
+
# (X-height is the height of the lower case x.)
|
15
|
+
# At 10pt x 300dpi x-heights are typically about 20 pixels, although this
|
16
|
+
# can vary dramatically from font to font.
|
17
|
+
# Below an x-height of 10 pixels, you have very little chance of accurate results,
|
18
|
+
# and below about 8 pixels, most of the text will be "noise removed".
|
6
19
|
#
|
7
20
|
require 'rtesseract'
|
8
21
|
|
@@ -10,60 +23,84 @@ class Sqed::Parser::OcrParser < Sqed::Parser
|
|
10
23
|
|
11
24
|
TYPE = :text
|
12
25
|
|
26
|
+
# Tesseract parameters default/specific to section type,
|
27
|
+
# default is merged into the type
|
28
|
+
SECTION_PARAMS = {
|
29
|
+
default: {
|
30
|
+
psm: 3,
|
31
|
+
# classify_debug_level: 5,
|
32
|
+
# lang: 'eng',
|
33
|
+
# load_system_dawg: 0,
|
34
|
+
# load_unambig_dawg: 0,
|
35
|
+
# load_freq_dawg: 0,
|
36
|
+
# load_fixed_length_dawgs: 0,
|
37
|
+
# load_number_dawg: 0,
|
38
|
+
# load_punc_dawg: 1, ## important
|
39
|
+
# load_unambig_dawg: 1,
|
40
|
+
# chop_enable: 0,
|
41
|
+
# enable_new_segsearch: 1,
|
42
|
+
# tessedit_debug_quality_metrics: 1,
|
43
|
+
# tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
|
44
|
+
# tessedit_write_images: 1,
|
45
|
+
# equationdetect_save_merged_image: 1,
|
46
|
+
# tessedit_dump_pageseg_images: 1,
|
47
|
+
# equationdetect_save_bi_image: 1
|
48
|
+
},
|
49
|
+
annotated_specimen: {
|
50
|
+
edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
|
51
|
+
},
|
52
|
+
identifier: {
|
53
|
+
psm: 1,
|
54
|
+
# tessedit_char_whitelist: '0123456789'
|
55
|
+
# edges_children_count_limit: 4000
|
56
|
+
},
|
57
|
+
curator_metadata: {
|
58
|
+
},
|
59
|
+
labels: {
|
60
|
+
psm: 3, # may need to be 6
|
61
|
+
},
|
62
|
+
deterimination_labels: {
|
63
|
+
psm: 3
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
13
67
|
# the text extracted from the image
|
14
68
|
attr_accessor :text
|
15
69
|
|
16
|
-
#
|
17
|
-
def
|
18
|
-
|
19
|
-
|
20
|
-
# @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed,
|
21
|
-
# get potential border pixel color (based on quadrant?)
|
22
|
-
new_color = img.pixel_color(1, 1)
|
23
|
-
# img = img.scale(2)
|
24
|
-
# img.write('foo0.jpg.jpg')
|
25
|
-
# img = img.enhance
|
26
|
-
# img.write('foo1.jpg')
|
27
|
-
# img = img.quantize(8, Magick::GRAYColorspace)
|
28
|
-
# img.write('foo1.jpg')
|
29
|
-
# img = img.sharpen(1.0, 0.2)
|
30
|
-
# img.write('foo2.jpg')
|
31
|
-
# border_color = img.pixel_color(img.columns - 1, img.rows - 1)
|
32
|
-
# img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
|
33
|
-
# img.write('tmp/foo4.jpg')
|
34
|
-
# img = img.quantize(2, Magick::GRAYColorspace)
|
35
|
-
# #img = img.threshold(0.5)
|
36
|
-
# img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
|
37
|
-
# img = img.equalize #(32, Magick::GRAYColorspace)
|
38
|
-
# img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
|
39
|
-
# #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
|
40
|
-
#
|
41
|
-
# img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
|
42
|
-
|
43
|
-
|
44
|
-
# From https://code.google.com/p/tesseract-ocr/wiki/FAQ
|
45
|
-
# " There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be "noise removed".
|
46
|
-
|
47
|
-
|
48
|
-
# http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
|
49
|
-
# doesn't support outputbase
|
50
|
-
r = RTesseract.new(img, lang: 'eng', psm: 1,
|
51
|
-
load_system_dawg: 0,
|
52
|
-
tessedit_debug_quality_metrics: 1,
|
53
|
-
load_freq_dawg: 1 ,
|
54
|
-
chop_enable: 1,
|
55
|
-
tessedit_write_images: 1,
|
56
|
-
equationdetect_save_merged_image: 1,
|
57
|
-
tessedit_dump_pageseg_images: 1,
|
58
|
-
equationdetect_save_bi_image: 1,
|
59
|
-
load_unambig_dawg: 0,
|
60
|
-
tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3,
|
61
|
-
|
62
|
-
# img = img.white_threshold(245)
|
70
|
+
# future consideration
|
71
|
+
# def enhance_image(img)
|
72
|
+
# get potential border pixel color (based on quadrant?)
|
73
|
+
# new_color = img.pixel_color(1, 1)
|
63
74
|
|
75
|
+
# img = img.scale(2)
|
76
|
+
# img.write('foo0.jpg.jpg')
|
77
|
+
# img = img.enhance
|
78
|
+
# img.write('foo1.jpg')
|
79
|
+
# img = img.quantize(8, Magick::GRAYColorspace)
|
80
|
+
# img.write('foo1.jpg')
|
81
|
+
# img = img.sharpen(1.0, 0.2)
|
82
|
+
# img.write('foo2.jpg')
|
83
|
+
# border_color = img.pixel_color(img.columns - 1, img.rows - 1)
|
84
|
+
# img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
|
85
|
+
# img.write('tmp/foo4.jpg')
|
86
|
+
# img = img.quantize(2, Magick::GRAYColorspace)
|
87
|
+
# #img = img.threshold(0.5)
|
88
|
+
# img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
|
89
|
+
# img = img.equalize #(32, Magick::GRAYColorspace)
|
90
|
+
# img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
|
91
|
+
# #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
|
92
|
+
#
|
93
|
+
# img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
|
94
|
+
# img = img.white_threshold(245)
|
95
|
+
# img
|
96
|
+
# end
|
97
|
+
|
98
|
+
def text(section_type: :default)
|
99
|
+
img = @image
|
100
|
+
params = SECTION_PARAMS[:default].merge(SECTION_PARAMS[section_type])
|
101
|
+
r = RTesseract.new(img, params)
|
64
102
|
@text = r.to_s.strip
|
65
103
|
end
|
66
104
|
|
67
|
-
# Need to provide tuning methods here, i.e. image transformations that facilitate OCR
|
68
105
|
|
69
106
|
end
|
data/lib/sqed/version.rb
CHANGED
@@ -8,25 +8,25 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
|
|
8
8
|
let(:c) {b.boundaries}
|
9
9
|
let(:d) { image.crop(*c.for(0), true) }
|
10
10
|
|
11
|
-
let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t) }
|
11
|
+
let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t, use_thumbnail: false) }
|
12
12
|
let(:f) { e.boundaries }
|
13
|
-
let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross)}
|
13
|
+
let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross, use_thumbnail: false)}
|
14
14
|
let(:h) { g.boundaries }
|
15
|
-
let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split) }
|
15
|
+
let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split, use_thumbnail: false) }
|
16
16
|
let(:hv) { gv.boundaries }
|
17
17
|
|
18
18
|
let(:ah) { ImageHelpers.vertical_offset_cross_red }
|
19
19
|
let(:bh) { Sqed::BoundaryFinder::StageFinder.new(image: ah) }
|
20
20
|
let(:ch) { bh.boundaries }
|
21
21
|
let(:dh) { ah.crop(*ch.for(0), true) }
|
22
|
-
let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red) } # was :horizontal_split
|
22
|
+
let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red, use_thumbnail: false) } # was :horizontal_split
|
23
23
|
let(:hh) { gh.boundaries }
|
24
24
|
|
25
25
|
let(:ibs) { ImageHelpers.black_stage_green_line_specimen }
|
26
26
|
let(:bbs) { Sqed::BoundaryFinder::StageFinder.new(image: ibs) }
|
27
27
|
let(:cbs) { bbs.boundaries }
|
28
28
|
let(:dbs) { ibs.crop(*cbs.for(0), true) }
|
29
|
-
let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross) }
|
29
|
+
let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross, use_thumbnail: false) }
|
30
30
|
let(:hbs) { gbs.boundaries }
|
31
31
|
|
32
32
|
specify 'initial image columns are as expected for :image above' do
|
@@ -167,7 +167,7 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
|
|
167
167
|
context 'thumbnail processing finds reasonable boundaries' do
|
168
168
|
|
169
169
|
let(:thumb) { ImageHelpers.frost_stage_thumb }
|
170
|
-
let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross)}
|
170
|
+
let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross, use_thumbnail: false)}
|
171
171
|
let(:finder_boundaries) { finder.boundaries }
|
172
172
|
|
173
173
|
let(:pct) { 0.08 }
|
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe Sqed::BoundaryFinder do
|
4
4
|
|
5
5
|
specify 'when no image provided, #new raises' do
|
6
|
-
expect { Sqed::BoundaryFinder.new() }.to raise_error
|
6
|
+
expect { Sqed::BoundaryFinder.new() }.to raise_error('No layout provided.')
|
7
7
|
end
|
8
8
|
|
9
9
|
context 'when initiated with an image' do
|
@@ -103,9 +103,15 @@ describe Sqed::BoundaryFinder do
|
|
103
103
|
expect( Sqed::BoundaryFinder.frequency_stats(i, 12)).to eq([3, 4, 5])
|
104
104
|
end
|
105
105
|
|
106
|
+
specify 'returns estimated borders if only one hit greater than samples taken' do
|
107
|
+
expect( Sqed::BoundaryFinder.frequency_stats(i, 15)).to eq([2,3,4])
|
108
|
+
end
|
109
|
+
|
106
110
|
specify 'returns nil if no count is greater than samples taken' do
|
107
|
-
expect( Sqed::BoundaryFinder.frequency_stats(i,
|
111
|
+
expect( Sqed::BoundaryFinder.frequency_stats(i, 20)).to eq(nil)
|
108
112
|
end
|
113
|
+
|
114
|
+
|
109
115
|
end
|
110
116
|
|
111
117
|
context 'offset boundaries from crossy_black_line_specimen image ' do
|
data/spec/lib/sqed_spec.rb
CHANGED
@@ -90,20 +90,23 @@ describe Sqed do
|
|
90
90
|
|
91
91
|
context '#result' do
|
92
92
|
let(:r) { s.result }
|
93
|
+
|
93
94
|
specify 'returns a Sqed::Result' do
|
94
95
|
expect(r.class.name).to eq('Sqed::Result')
|
95
96
|
end
|
96
97
|
|
97
98
|
context 'extracted data' do
|
98
|
-
specify 'for an :identifier section' do
|
99
|
+
specify 'text for an :identifier section' do
|
100
|
+
|
101
|
+
r.identifier_image.write('41.jpg')
|
99
102
|
expect(r.text_for(:identifier)).to match('000041196')
|
100
103
|
end
|
101
104
|
|
102
|
-
specify 'for an annotated_specimen section' do
|
105
|
+
specify 'text for an annotated_specimen section' do
|
103
106
|
expect(r.text_for(:annotated_specimen)).to match('Saucier Creek')
|
104
107
|
end
|
105
108
|
|
106
|
-
specify 'for a curator_metadata section' do
|
109
|
+
specify 'text for a curator_metadata section' do
|
107
110
|
expect(r.text_for(:curator_metadata)).to match('Frost Entomological Museum')
|
108
111
|
end
|
109
112
|
end
|
@@ -113,7 +116,7 @@ describe Sqed do
|
|
113
116
|
context 'all together, with border' do
|
114
117
|
let(:image) { ImageHelpers.greenline_image }
|
115
118
|
let(:pattern) { :right_t }
|
116
|
-
let(:s) { Sqed.new(image: image, pattern: pattern, has_border:
|
119
|
+
let(:s) { Sqed.new(image: image, pattern: pattern, has_border: true) }
|
117
120
|
|
118
121
|
specify '#boundaries returns a Sqed::Boundaries instance' do
|
119
122
|
expect(s.boundaries.class.name).to eq('Sqed::Boundaries')
|
@@ -138,11 +141,12 @@ describe Sqed do
|
|
138
141
|
end
|
139
142
|
|
140
143
|
context 'extracted data' do
|
141
|
-
specify 'for an :identifier section' do
|
144
|
+
specify 'text for an :identifier section' do
|
145
|
+
r.identifier_image.write('85.jpg')
|
142
146
|
expect(r.text_for(:identifier)).to match('000085067')
|
143
147
|
end
|
144
148
|
|
145
|
-
specify 'for a specimen section' do
|
149
|
+
specify 'text for a specimen section' do
|
146
150
|
expect(r.text_for(:annotated_specimen)).to match('Aeshna')
|
147
151
|
end
|
148
152
|
end
|
data/sqed.gemspec
CHANGED
@@ -23,7 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency 'rtesseract', '~> 1.2.6'
|
24
24
|
spec.add_dependency 'zxing_cpp', '~> 0.1.0'
|
25
25
|
|
26
|
-
spec.add_development_dependency 'rspec'
|
26
|
+
spec.add_development_dependency 'rspec', '~> 3.3'
|
27
27
|
spec.add_development_dependency 'bundler', '~> 1.5'
|
28
28
|
spec.add_development_dependency 'did_you_mean', '~> 0.9'
|
29
29
|
spec.add_development_dependency 'byebug'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sqed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Yoder
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-
|
12
|
+
date: 2015-09-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -71,16 +71,16 @@ dependencies:
|
|
71
71
|
name: rspec
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
|
-
- - "
|
74
|
+
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '
|
76
|
+
version: '3.3'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
|
-
- - "
|
81
|
+
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: '
|
83
|
+
version: '3.3'
|
84
84
|
- !ruby/object:Gem::Dependency
|
85
85
|
name: bundler
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|