sqed 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 11687929d5850b325870f2f76a4d628f2707f324
4
- data.tar.gz: f38950c622a96cda352855e494e97ce3bdb8bf8e
3
+ metadata.gz: bd0958fc2efbed976b77385f511ce9025f26b5b2
4
+ data.tar.gz: 2f3efab45b172677057170dfdbdee8366372fac0
5
5
  SHA512:
6
- metadata.gz: 486844c8d1499848dbab9bedfd69155a023eac59d2470914cdec6d80c58a8457db398386ada6f9d4b153e549ece076c471279d3ef768be13333bb767f78d1e9e
7
- data.tar.gz: bcf06a0b5d27545bbac8123a1dd69f84166f37d760dff9a95e33b5e8d73f688ed8d12cd419ec84a76011fd328c4aa1ccaa58ecaed863a94a22fd2d5ab82fc424
6
+ metadata.gz: 04c09b12b5212a5b7c6cf356caa6e04ef4c422b5e83f14c652996089b9f30c016102572cc2963c2710d56135d7411e2778303034de9b8a44597d4feee7796b43
7
+ data.tar.gz: 14a8a7cd8bd19a6c3c00e3105b82b513f119bca6f3febac4751e9d1714e2ff08d4aaa3a5bbd3ead4f349d56078830aba02272bc3231424cea3877064b5b9f5b4
@@ -41,7 +41,10 @@ class Sqed
41
41
  # a symbol, :red, :green, :blue, describing the boundary color within the stage
42
42
  attr_accessor :boundary_color
43
43
 
44
- def initialize(image: image, pattern: pattern, has_border: true, boundary_color: :green)
44
+ # Boolean, whether to do the boundary detection (not stage detection at present) against a thumbnail version of the passed image (faster, less accurate, true by default)
45
+ attr_accessor :use_thumbnail
46
+
47
+ def initialize(image: image, pattern: pattern, has_border: true, boundary_color: :green, use_thumbnail: true)
45
48
  raise 'extraction pattern not defined' if pattern && !SqedConfig::EXTRACTION_PATTERNS.keys.include?(pattern)
46
49
 
47
50
  @image = image
@@ -51,7 +54,7 @@ class Sqed
51
54
  @pattern = pattern
52
55
  @pattern ||= :cross
53
56
  @boundary_color = boundary_color
54
-
57
+ @use_thumbnail = use_thumbnail
55
58
  set_stage_boundary if @image
56
59
  end
57
60
 
@@ -108,6 +111,7 @@ class Sqed
108
111
  extractor.result
109
112
  end
110
113
 
114
+ # Debugging purposes
111
115
  def attributes
112
116
  {
113
117
  image: @image,
@@ -115,7 +119,8 @@ class Sqed
115
119
  stage_boundary: stage_boundary,
116
120
  has_border: @has_border,
117
121
  pattern: @pattern,
118
- boundary_color: @boundary_color
122
+ boundary_color: @boundary_color,
123
+ use_thumbnail: @use_thumbnail
119
124
  }
120
125
  end
121
126
 
@@ -138,11 +143,12 @@ class Sqed
138
143
  def get_section_boundaries
139
144
  boundary_finder_class = SqedConfig::EXTRACTION_PATTERNS[@pattern][:boundary_finder]
140
145
 
141
- options = {image: stage_image}
146
+ options = {image: stage_image, use_thumbnail: use_thumbnail}
142
147
  options.merge!( layout: SqedConfig::EXTRACTION_PATTERNS[@pattern][:layout] ) unless boundary_finder_class.name == 'Sqed::BoundaryFinder::CrossFinder'
143
- options.merge!( boundary_color: @boundary_color) if boundary_finder_class.name == 'Sqed::BoundaryFinder::ColorLineFinder'
148
+ options.merge!( boundary_color: @boundary_color) if boundary_finder_class.name == 'Sqed::BoundaryFinder::ColorLineFinder'
144
149
 
145
150
  boundary_finder_class.new(options).boundaries
151
+
146
152
  end
147
153
 
148
154
  end
@@ -91,4 +91,18 @@ class Sqed::Boundaries
91
91
  end
92
92
  true
93
93
  end
94
+
95
+ def zoom(width_factor, height_factor)
96
+ coordinates.keys.each do |i|
97
+ set(i, [
98
+ (x_for(i).to_f * width_factor).to_i,
99
+ (y_for(i).to_f * height_factor).to_i,
100
+ (width_for(i).to_f * width_factor).to_i,
101
+ (height_for(i).to_f * height_factor).to_i
102
+ ])
103
+
104
+ end
105
+ end
106
+
107
+
94
108
  end
@@ -2,6 +2,9 @@
2
2
  # return derivative images. Finders operate on cropped images, i.e. only the "stage".
3
3
  #
4
4
  class Sqed::BoundaryFinder
5
+
6
+ THUMB_SIZE = 100
7
+
5
8
  # the passed image
6
9
  attr_reader :img
7
10
 
@@ -11,10 +14,18 @@ class Sqed::BoundaryFinder
11
14
  # A Sqed::Boundaries instance, stores the coordinates of all of the layout sections
12
15
  attr_reader :boundaries
13
16
 
14
- def initialize(image: image, layout: layout)
17
+ # Whether to compress the original image to a thumbnail when finding boundaries
18
+ attr_reader :use_thumbnail
19
+
20
+ # when we compute using a derived thumbnail we temporarily store the full size image here
21
+ attr_reader :original_image
22
+
23
+ def initialize(image: image, layout: layout, use_thumbnail: true)
15
24
  raise 'No layout provided.' if layout.nil?
16
25
  raise 'No image provided.' if image.nil? || image.class.name != 'Magick::Image'
17
26
 
27
+ @use_thumbnail = use_thumbnail
28
+
18
29
  @layout = layout
19
30
  @img = image
20
31
  true
@@ -25,12 +36,46 @@ class Sqed::BoundaryFinder
25
36
  @boundaries ||= Sqed::Boundaries.new(@layout)
26
37
  end
27
38
 
39
+ def longest_thumbnail_axis
40
+ img.columns > img.rows ? :width : :height
41
+ end
42
+
43
+ def thumbnail_height
44
+ if longest_thumbnail_axis == :height
45
+ THUMB_SIZE
46
+ else
47
+ (img.rows.to_f * (THUMB_SIZE.to_f / img.columns.to_f)).round.to_i
48
+ end
49
+ end
50
+
51
+ def thumbnail_width
52
+ if longest_thumbnail_axis == :width
53
+ THUMB_SIZE
54
+ else
55
+ (img.columns.to_f * (THUMB_SIZE.to_f / img.rows.to_f)).round.to_i
56
+ end
57
+ end
58
+
59
+ # see https://rmagick.github.io/image3.html#thumbnail
60
+ def thumbnail
61
+ img.thumbnail(thumbnail_width, thumbnail_height)
62
+ end
63
+
64
+ def width_factor
65
+ img.columns.to_f / thumbnail_width.to_f
66
+ end
67
+
68
+ def height_factor
69
+ img.rows.to_f / thumbnail_height.to_f
70
+ end
71
+
72
+ def zoom_boundaries
73
+ boundaries.zoom(width_factor, height_factor )
74
+ end
28
75
 
29
76
  # return [Integer, nil]
30
77
  # sample more with small images, less with large images
31
78
  # we want to return larger numbers (= faster sampling)
32
- #
33
- #
34
79
  def self.get_subdivision_size(image_width)
35
80
  case image_width
36
81
  when nil
@@ -69,6 +114,7 @@ class Sqed::BoundaryFinder
69
114
  # (:rows|:columns), :rows finds vertical borders, :columns finds horizontal borders
70
115
  #
71
116
  def self.color_boundary_finder(image: image, sample_subdivision_size: nil, sample_cutoff_factor: nil, scan: :rows, boundary_color: :green)
117
+
72
118
  image_width = image.send(scan)
73
119
  sample_subdivision_size = get_subdivision_size(image_width) if sample_subdivision_size.nil?
74
120
  samples_to_take = (image_width / sample_subdivision_size).to_i - 1
@@ -105,6 +151,8 @@ class Sqed::BoundaryFinder
105
151
 
106
152
  if sample_cutoff_factor.nil?
107
153
  cutoff = max_difference(border_hits.values)
154
+
155
+ cutoff = border_hits.values.first - 1 if cutoff == 0 # difference of two identical things is 0
108
156
  else
109
157
  cutoff = (samples_to_take * sample_cutoff_factor).to_i
110
158
  end
@@ -132,6 +180,7 @@ class Sqed::BoundaryFinder
132
180
  # return [Array]
133
181
  # the median position of all (pixel) positions that have a count greater than the cutoff
134
182
  def self.frequency_stats(frequency_hash, sample_cutoff = 0)
183
+
135
184
  return nil if sample_cutoff.nil? || sample_cutoff < 1
136
185
  hit_ranges = []
137
186
 
@@ -141,7 +190,18 @@ class Sqed::BoundaryFinder
141
190
  end
142
191
  end
143
192
 
144
- return nil if hit_ranges.size < 3
193
+ case hit_ranges.size
194
+ when 1
195
+ c = hit_ranges[0]
196
+ hit_ranges = [c - 1, c, c + 1]
197
+ when 2
198
+ hit_ranges.sort!
199
+ c1 = hit_ranges[0]
200
+ c2 = hit_ranges[1]
201
+ hit_ranges = [c1, c2, c2 + (c2 - c1)]
202
+ when 0
203
+ return nil
204
+ end
145
205
 
146
206
  # we have to sort because the keys (positions) we examined came unordered from a hash originally
147
207
  hit_ranges.sort!
@@ -174,3 +234,4 @@ class Sqed::BoundaryFinder
174
234
  end
175
235
 
176
236
  end
237
+
@@ -4,10 +4,15 @@ require 'rmagick'
4
4
  #
5
5
  class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
6
6
 
7
- def initialize(image: image, layout: layout, boundary_color: :green)
8
- super(image: image, layout: layout)
7
+ def initialize(image: image, layout: layout, boundary_color: :green, use_thumbnail: true)
8
+ super(image: image, layout: layout, use_thumbnail: use_thumbnail)
9
9
  raise 'No layout provided.' if @layout.nil?
10
10
  @boundary_color = boundary_color
11
+
12
+ if use_thumbnail
13
+ @original_image = @img.copy
14
+ @img = thumbnail
15
+ end
11
16
  find_bands
12
17
  end
13
18
 
@@ -25,26 +30,28 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
25
30
  when :horizontal_split
26
31
  t = Sqed::BoundaryFinder.color_boundary_finder(image: img, scan: :columns, boundary_color: @boundary_color) # set to detect horizontal division, (green line)
27
32
  return if t.nil?
33
+
28
34
  boundaries.set(0, [0, 0, img.columns, t[0]]) # upper section of image
29
35
  boundaries.set(1, [0, t[2], img.columns, img.rows - t[2]]) # lower section of image
30
36
 
31
37
  when :right_t # only 3 zones expected, with horizontal division in right-side of vertical division
32
- vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
33
- irt = img.crop(*vertical.for(1), true)
34
- right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
38
+ vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
35
39
 
36
- boundaries.set(0, vertical.for(0))
40
+ irt = img.crop(*vertical.for(1), true)
41
+ right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
42
+
43
+ boundaries.set(0, vertical.for(0))
37
44
  boundaries.set(1, [ vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ] )
38
45
  boundaries.set(2, [ vertical.x_for(1), right.y_for(1), right.width_for(1), right.height_for(1)] )
39
46
 
40
47
  when :vertical_offset_cross # 4 zones expected, with (varying) horizontal division in left- and right- sides of vertical division
41
- vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
42
-
48
+ vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
49
+
43
50
  ilt = img.crop(*vertical.for(0), true)
44
51
  irt = img.crop(*vertical.for(1), true)
45
52
 
46
- left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
47
- right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
53
+ left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries # fails
54
+ right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries # OK
48
55
 
49
56
  boundaries.set(0, [0, 0, left.width_for(0), left.height_for(0) ])
50
57
  boundaries.set(1, [vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ])
@@ -53,13 +60,13 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
53
60
 
54
61
  # No specs for this yet
55
62
  when :horizontal_offset_cross
56
- horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
63
+ horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
57
64
 
58
65
  itop = img.crop(*horizontal.for(0), true)
59
66
  ibottom = img.crop(*horizontal.for(1), true)
60
67
 
61
- top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
62
- bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
68
+ top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
69
+ bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
63
70
 
64
71
  boundaries.set(0, [0, 0, top.width_for(0), top.height_for(0) ])
65
72
  boundaries.set(1, [top.x_for(1), 0, top.width_for(1), top.height_for(1) ])
@@ -67,8 +74,8 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
67
74
  boundaries.set(3, [0, horizontal.y_for(1), bottom.width_for(0), bottom.height_for(0) ])
68
75
 
69
76
  when :cross # 4 zones, with perfectly intersected horizontal and vertical division
70
- v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
71
- h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
77
+ v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
78
+ h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
72
79
 
73
80
  return if v.nil? || h.nil?
74
81
 
@@ -84,6 +91,12 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
84
91
 
85
92
  boundaries.complete = true if boundaries.populated?
86
93
 
94
+ if use_thumbnail
95
+ @img = @original_image
96
+ zoom_boundaries
97
+ @original_image = nil
98
+ end
99
+
87
100
  end
88
101
 
89
102
 
@@ -5,13 +5,13 @@ require 'rmagick'
5
5
  class Sqed::BoundaryFinder::CrossFinder < Sqed::BoundaryFinder
6
6
 
7
7
  def initialize(image: image)
8
- @image = image
8
+ @img = image
9
9
  find_edges
10
10
  end
11
11
 
12
12
  def find_edges
13
- width = @image.columns / 2
14
- height = @image.rows / 2
13
+ width = @img.columns / 2
14
+ height = @img.rows / 2
15
15
 
16
16
  boundaries.coordinates[0] = [0, 0, width, height]
17
17
  boundaries.coordinates[1] = [width, 0, width, height]
@@ -3,7 +3,7 @@ require 'rmagick'
3
3
  # Some of this code was originally inspired by Emmanuel Oga's gist https://gist.github.com/EmmanuelOga/2476153.
4
4
  #
5
5
  class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
6
-
6
+
7
7
  # The proc containing the border finding algorithim
8
8
  attr_reader :is_border
9
9
 
@@ -11,23 +11,24 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
11
11
 
12
12
  # How small we accept a cropped picture to be. E.G. if it was 100x100 and
13
13
  # ratio 0.1, min output should be 10x10
14
- MIN_CROP_RATIO = 0.1
14
+ MIN_CROP_RATIO = 0.1
15
15
 
16
- attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
16
+ attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
17
17
 
18
18
  def initialize(image: image, is_border_proc: nil, min_ratio: MIN_CROP_RATIO)
19
- super(image: image, layout: :internal_box)
19
+ super(image: image, layout: :internal_box)
20
20
 
21
- @min_ratio = min_ratio
21
+ @min_ratio = min_ratio
22
22
 
23
23
  # Initial co-ordinates
24
24
  @x0, @y0 = 0, 0
25
- @x1, @y1 = img.columns, img.rows
25
+ @x1, @y1 = img.columns, img.rows
26
26
  @min_width, @min_height = img.columns * @min_ratio, img.rows * @min_ratio # minimum resultant area
27
27
  @columns, @rows = img.columns, img.rows
28
28
 
29
+
29
30
  # We need a border finder proc. Provide one if none was given.
30
- @is_border = is_border_proc || self.class.default_border_finder(img) # if no proc specified, use default below
31
+ @is_border = is_border_proc || self.class.default_border_finder(img) # if no proc specified, use default below
31
32
 
32
33
  @x00 = @x0
33
34
  @y00 = @y0
@@ -51,8 +52,9 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
51
52
  # works for 0.5, >0.137; 0.60, >0.14 0.65, >0.146; 0.70, >0.1875; 0.75, >0.1875; 0.8, >0.237; 0.85, >0.24; 0.90, >0.28; 0.95, >0.25
52
53
  # fails for 0.75, (0.18, 0.17,0.16,0.15); 0.70, 0.18;
53
54
  #
54
- def self.default_border_finder(img, samples = 5, threshold = 0.75, fuzz_factor = 0.40) # working on non-synthetic images 04-dec-2014
55
- fuzz = ((Magick::QuantumRange + 1) * fuzz_factor).to_i
55
+ # this sets variables (locally) for find_edges
56
+ def self.default_border_finder(img, samples = 5, threshold = 0.75, fuzz_factor = 0.40) # working on non-synthetic images 04-dec-2014
57
+ fuzz = ((Magick::QuantumRange + 1) * fuzz_factor).to_i
56
58
  # Returns true if the edge is a border (border meaning outer region to be cropped)
57
59
  lambda do |edge|
58
60
  border, non_border = 0.0, 0.0 # maybe should be called outer, inner
@@ -79,7 +81,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
79
81
  # handle this exception
80
82
  return unless is_border # return if no process defined or set for @is_border
81
83
 
82
- u = x1 - 1 # rightmost pixel (kind of)
84
+ u = x1 - 1 # rightmost pixel (kind of)
83
85
  # increment from left to right
84
86
  x0.upto(u) do |x|
85
87
  if width_croppable? && is_border[vline(x)] then
@@ -89,7 +91,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
89
91
  end
90
92
  end
91
93
  # decrement from right to left
92
- (u).downto(x0) { |x| width_croppable? && is_border[vline(x)] ? @x1 = x - 1 : break }
94
+ (u).downto(x0) { |x| width_croppable? && is_border[vline(x)] ? @x1 = x - 1 : break }
93
95
 
94
96
  u = y1 - 1
95
97
  0.upto(u) do |y|
@@ -104,10 +106,11 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
104
106
 
105
107
  delta_x = 0 #width/50 # 2% of cropped image to make up for trapezoidal distortion
106
108
  delta_y = 0 #height/50 # 2% of cropped image to make up for trapezoidal distortion <- NOT 3%
107
-
109
+
108
110
  # TODO: add conditions
109
- boundaries.complete = true
110
- boundaries.coordinates[0] = [x0 + delta_x, y0 + delta_y, width - 2*delta_x, height - 2*delta_y]
111
+ boundaries.complete = true
112
+ boundaries.set(0, [x0 + delta_x, y0 + delta_y, width - 2*delta_x, height - 2*delta_y])
113
+
111
114
  end
112
115
 
113
116
  def width_croppable?
@@ -127,13 +130,13 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
127
130
  end
128
131
 
129
132
  # actually + 1 (starting at zero?)
130
- def width
131
- @x1 - @x0
133
+ def width
134
+ @x1 - @x0
132
135
  end
133
-
136
+
134
137
  # actually + 1 (starting at zero?)
135
138
  def height
136
- @y1 - @y0
139
+ @y1 - @y0
137
140
  end
138
141
 
139
142
  end
@@ -42,10 +42,11 @@ class Sqed::Extractor
42
42
  if parsers = SqedConfig::SECTION_PARSERS[section_type]
43
43
 
44
44
  section_image = r.send("#{section_type}_image")
45
+
45
46
  updated = r.send(section_type)
46
47
 
47
48
  parsers.each do |p|
48
- parsed_result = p.new(section_image).text
49
+ parsed_result = p.new(section_image).text(section_type: section_type)
49
50
  updated.merge!(p::TYPE => parsed_result) if parsed_result
50
51
  end
51
52
 
@@ -10,8 +10,9 @@ class Sqed::Parser
10
10
  raise 'no image provided to parser' if @image && !(@image.class.name == 'Magick::Image')
11
11
  end
12
12
 
13
+ # TODO: is this required?!
13
14
  # must be provided in subclasses
14
- def text
15
+ def text(section_type: :default)
15
16
  nil
16
17
  end
17
18
 
@@ -36,7 +36,7 @@ class Sqed::Parser::BarcodeParser < Sqed::Parser
36
36
  #end
37
37
 
38
38
  # alias to a universal method
39
- def text
39
+ def text(section_type: :default)
40
40
  barcode
41
41
  end
42
42
 
@@ -2,7 +2,20 @@
2
2
  #
3
3
  # Given a single image return all text in that image.
4
4
  #
5
- # For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
5
+ # For reference
6
+ # http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
7
+ # https://code.google.com/p/tesseract-ocr/wiki/FAQ
8
+ # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
9
+ #
10
+ # "There is a minimum text size for reasonable accuracy.
11
+ # You have to consider resolution as well as point size.
12
+ # Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi.
13
+ # A quick check is to count the pixels of the x-height of your characters.
14
+ # (X-height is the height of the lower case x.)
15
+ # At 10pt x 300dpi x-heights are typically about 20 pixels, although this
16
+ # can vary dramatically from font to font.
17
+ # Below an x-height of 10 pixels, you have very little chance of accurate results,
18
+ # and below about 8 pixels, most of the text will be "noise removed".
6
19
  #
7
20
  require 'rtesseract'
8
21
 
@@ -10,60 +23,84 @@ class Sqed::Parser::OcrParser < Sqed::Parser
10
23
 
11
24
  TYPE = :text
12
25
 
26
+ # Tesseract parameters default/specific to section type,
27
+ # default is merged into the type
28
+ SECTION_PARAMS = {
29
+ default: {
30
+ psm: 3,
31
+ # classify_debug_level: 5,
32
+ # lang: 'eng',
33
+ # load_system_dawg: 0,
34
+ # load_unambig_dawg: 0,
35
+ # load_freq_dawg: 0,
36
+ # load_fixed_length_dawgs: 0,
37
+ # load_number_dawg: 0,
38
+ # load_punc_dawg: 1, ## important
39
+ # load_unambig_dawg: 1,
40
+ # chop_enable: 0,
41
+ # enable_new_segsearch: 1,
42
+ # tessedit_debug_quality_metrics: 1,
43
+ # tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
44
+ # tessedit_write_images: 1,
45
+ # equationdetect_save_merged_image: 1,
46
+ # tessedit_dump_pageseg_images: 1,
47
+ # equationdetect_save_bi_image: 1
48
+ },
49
+ annotated_specimen: {
50
+ edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
51
+ },
52
+ identifier: {
53
+ psm: 1,
54
+ # tessedit_char_whitelist: '0123456789'
55
+ # edges_children_count_limit: 4000
56
+ },
57
+ curator_metadata: {
58
+ },
59
+ labels: {
60
+ psm: 3, # may need to be 6
61
+ },
62
+ deterimination_labels: {
63
+ psm: 3
64
+ }
65
+ }
66
+
13
67
  # the text extracted from the image
14
68
  attr_accessor :text
15
69
 
16
- # https://code.google.com/p/tesseract-ocr/wiki/FAQ
17
- def text
18
- img = @image #.white_threshold(245)
19
-
20
- # @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed,
21
- # get potential border pixel color (based on quadrant?)
22
- new_color = img.pixel_color(1, 1)
23
- # img = img.scale(2)
24
- # img.write('foo0.jpg.jpg')
25
- # img = img.enhance
26
- # img.write('foo1.jpg')
27
- # img = img.quantize(8, Magick::GRAYColorspace)
28
- # img.write('foo1.jpg')
29
- # img = img.sharpen(1.0, 0.2)
30
- # img.write('foo2.jpg')
31
- # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
32
- # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
33
- # img.write('tmp/foo4.jpg')
34
- # img = img.quantize(2, Magick::GRAYColorspace)
35
- # #img = img.threshold(0.5)
36
- # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
37
- # img = img.equalize #(32, Magick::GRAYColorspace)
38
- # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
39
- # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
40
- #
41
- # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
42
-
43
-
44
- # From https://code.google.com/p/tesseract-ocr/wiki/FAQ
45
- # " There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be "noise removed".
46
-
47
-
48
- # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
49
- # doesn't supprot outputbase
50
- r = RTesseract.new(img, lang: 'eng', psm: 1,
51
- load_system_dawg: 0,
52
- tessedit_debug_quality_metrics: 1,
53
- load_freq_dawg: 1 ,
54
- chop_enable: 1,
55
- tessedit_write_images: 1,
56
- equationdetect_save_merged_image: 1,
57
- tessedit_dump_pageseg_images: 1,
58
- equationdetect_save_bi_image: 1,
59
- load_unambig_dawg: 0,
60
- tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3,
61
-
62
- # img = img.white_threshold(245)
70
+ # future consideration
71
+ # def enhance_image(img)
72
+ # get potential border pixel color (based on quadrant?)
73
+ # new_color = img.pixel_color(1, 1)
63
74
 
75
+ # img = img.scale(2)
76
+ # img.write('foo0.jpg.jpg')
77
+ # img = img.enhance
78
+ # img.write('foo1.jpg')
79
+ # img = img.quantize(8, Magick::GRAYColorspace)
80
+ # img.write('foo1.jpg')
81
+ # img = img.sharpen(1.0, 0.2)
82
+ # img.write('foo2.jpg')
83
+ # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
84
+ # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
85
+ # img.write('tmp/foo4.jpg')
86
+ # img = img.quantize(2, Magick::GRAYColorspace)
87
+ # #img = img.threshold(0.5)
88
+ # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
89
+ # img = img.equalize #(32, Magick::GRAYColorspace)
90
+ # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
91
+ # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
92
+ #
93
+ # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
94
+ # img = img.white_threshold(245)
95
+ # img
96
+ # end
97
+
98
+ def text(section_type: :default)
99
+ img = @image
100
+ params = SECTION_PARAMS[:default].merge(SECTION_PARAMS[section_type])
101
+ r = RTesseract.new(img, params)
64
102
  @text = r.to_s.strip
65
103
  end
66
104
 
67
- # Need to provide tuning methods here, i.e. image transormations that facilitate OCR
68
105
 
69
106
  end
@@ -1,3 +1,3 @@
1
1
  class Sqed
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -8,25 +8,25 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
8
8
  let(:c) {b.boundaries}
9
9
  let(:d) { image.crop(*c.for(0), true) }
10
10
 
11
- let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t) }
11
+ let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t, use_thumbnail: false) }
12
12
  let(:f) { e.boundaries }
13
- let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross)}
13
+ let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross, use_thumbnail: false)}
14
14
  let(:h) { g.boundaries }
15
- let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split) }
15
+ let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split, use_thumbnail: false) }
16
16
  let(:hv) { gv.boundaries }
17
17
 
18
18
  let(:ah) { ImageHelpers.vertical_offset_cross_red }
19
19
  let(:bh) { Sqed::BoundaryFinder::StageFinder.new(image: ah) }
20
20
  let(:ch) { bh.boundaries }
21
21
  let(:dh) { ah.crop(*ch.for(0), true) }
22
- let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red) } # was :horizontal_split
22
+ let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red, use_thumbnail: false) } # was :horizontal_split
23
23
  let(:hh) { gh.boundaries }
24
24
 
25
25
  let(:ibs) { ImageHelpers.black_stage_green_line_specimen }
26
26
  let(:bbs) { Sqed::BoundaryFinder::StageFinder.new(image: ibs) }
27
27
  let(:cbs) { bbs.boundaries }
28
28
  let(:dbs) { ibs.crop(*cbs.for(0), true) }
29
- let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross) }
29
+ let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross, use_thumbnail: false) }
30
30
  let(:hbs) { gbs.boundaries }
31
31
 
32
32
  specify 'initial image columns are as expected for :image above' do
@@ -167,7 +167,7 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
167
167
  context 'thumbnail processing finds reasonable boundaries' do
168
168
 
169
169
  let(:thumb) { ImageHelpers.frost_stage_thumb }
170
- let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross)}
170
+ let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross, use_thumbnail: false)}
171
171
  let(:finder_boundaries) { finder.boundaries }
172
172
 
173
173
  let(:pct) { 0.08 }
@@ -22,7 +22,8 @@ describe Sqed::BoundaryFinder::CrossFinder do
22
22
  end
23
23
 
24
24
  specify 'the 0th image has height = 300' do
25
- expect(c.height_for(0)).to eq(300)
25
+ pct = 0.02
26
+ expect(c.height_for(0)).to be_within(pct*300).of(300)
26
27
  end
27
28
 
28
29
  end
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Sqed::BoundaryFinder do
4
4
 
5
5
  specify 'when no image provided, #new raises' do
6
- expect { Sqed::BoundaryFinder.new() }.to raise_error
6
+ expect { Sqed::BoundaryFinder.new() }.to raise_error('No layout provided.')
7
7
  end
8
8
 
9
9
  context 'when initiated with an image' do
@@ -103,9 +103,15 @@ describe Sqed::BoundaryFinder do
103
103
  expect( Sqed::BoundaryFinder.frequency_stats(i, 12)).to eq([3, 4, 5])
104
104
  end
105
105
 
106
+ specify 'returns estimated borders if only one hit greater than samples taken' do
107
+ expect( Sqed::BoundaryFinder.frequency_stats(i, 15)).to eq([2,3,4])
108
+ end
109
+
106
110
  specify 'returns nil if no count is greater than samples taken' do
107
- expect( Sqed::BoundaryFinder.frequency_stats(i, 15)).to eq(nil)
111
+ expect( Sqed::BoundaryFinder.frequency_stats(i, 20)).to eq(nil)
108
112
  end
113
+
114
+
109
115
  end
110
116
 
111
117
  context 'offset boundaries from crossy_black_line_specimen image ' do
@@ -90,20 +90,23 @@ describe Sqed do
90
90
 
91
91
  context '#result' do
92
92
  let(:r) { s.result }
93
+
93
94
  specify 'returns a Sqed::Result' do
94
95
  expect(r.class.name).to eq('Sqed::Result')
95
96
  end
96
97
 
97
98
  context 'extracted data' do
98
- specify 'for an :identifier section' do
99
+ specify 'text for an :identifier section' do
100
+
101
+ r.identifier_image.write('41.jpg')
99
102
  expect(r.text_for(:identifier)).to match('000041196')
100
103
  end
101
104
 
102
- specify 'for an annotated_specimen section' do
105
+ specify 'text for an annotated_specimen section' do
103
106
  expect(r.text_for(:annotated_specimen)).to match('Saucier Creek')
104
107
  end
105
108
 
106
- specify 'for a curator_metadata section' do
109
+ specify 'text for a curator_metadata section' do
107
110
  expect(r.text_for(:curator_metadata)).to match('Frost Entomological Museum')
108
111
  end
109
112
  end
@@ -113,7 +116,7 @@ describe Sqed do
113
116
  context 'all together, with border' do
114
117
  let(:image) { ImageHelpers.greenline_image }
115
118
  let(:pattern) { :right_t }
116
- let(:s) { Sqed.new(image: image, pattern: pattern, has_border: false) }
119
+ let(:s) { Sqed.new(image: image, pattern: pattern, has_border: true) }
117
120
 
118
121
  specify '#boundaries returns a Sqed::Boundaries instance' do
119
122
  expect(s.boundaries.class.name).to eq('Sqed::Boundaries')
@@ -138,11 +141,12 @@ describe Sqed do
138
141
  end
139
142
 
140
143
  context 'extracted data' do
141
- specify 'for an :identifier section' do
144
+ specify 'text for an :identifier section' do
145
+ r.identifier_image.write('85.jpg')
142
146
  expect(r.text_for(:identifier)).to match('000085067')
143
147
  end
144
148
 
145
- specify 'for a specimen section' do
149
+ specify 'text for a specimen section' do
146
150
  expect(r.text_for(:annotated_specimen)).to match('Aeshna')
147
151
  end
148
152
  end
@@ -23,7 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency 'rtesseract', '~> 1.2.6'
24
24
  spec.add_dependency 'zxing_cpp', '~> 0.1.0'
25
25
 
26
- spec.add_development_dependency 'rspec'
26
+ spec.add_development_dependency 'rspec', '~> 3.3'
27
27
  spec.add_development_dependency 'bundler', '~> 1.5'
28
28
  spec.add_development_dependency 'did_you_mean', '~> 0.9'
29
29
  spec.add_development_dependency 'byebug'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sqed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Yoder
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-06-11 00:00:00.000000000 Z
12
+ date: 2015-09-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -71,16 +71,16 @@ dependencies:
71
71
  name: rspec
72
72
  requirement: !ruby/object:Gem::Requirement
73
73
  requirements:
74
- - - ">="
74
+ - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0'
76
+ version: '3.3'
77
77
  type: :development
78
78
  prerelease: false
79
79
  version_requirements: !ruby/object:Gem::Requirement
80
80
  requirements:
81
- - - ">="
81
+ - - "~>"
82
82
  - !ruby/object:Gem::Version
83
- version: '0'
83
+ version: '3.3'
84
84
  - !ruby/object:Gem::Dependency
85
85
  name: bundler
86
86
  requirement: !ruby/object:Gem::Requirement