sqed 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 11687929d5850b325870f2f76a4d628f2707f324
4
- data.tar.gz: f38950c622a96cda352855e494e97ce3bdb8bf8e
3
+ metadata.gz: bd0958fc2efbed976b77385f511ce9025f26b5b2
4
+ data.tar.gz: 2f3efab45b172677057170dfdbdee8366372fac0
5
5
  SHA512:
6
- metadata.gz: 486844c8d1499848dbab9bedfd69155a023eac59d2470914cdec6d80c58a8457db398386ada6f9d4b153e549ece076c471279d3ef768be13333bb767f78d1e9e
7
- data.tar.gz: bcf06a0b5d27545bbac8123a1dd69f84166f37d760dff9a95e33b5e8d73f688ed8d12cd419ec84a76011fd328c4aa1ccaa58ecaed863a94a22fd2d5ab82fc424
6
+ metadata.gz: 04c09b12b5212a5b7c6cf356caa6e04ef4c422b5e83f14c652996089b9f30c016102572cc2963c2710d56135d7411e2778303034de9b8a44597d4feee7796b43
7
+ data.tar.gz: 14a8a7cd8bd19a6c3c00e3105b82b513f119bca6f3febac4751e9d1714e2ff08d4aaa3a5bbd3ead4f349d56078830aba02272bc3231424cea3877064b5b9f5b4
@@ -41,7 +41,10 @@ class Sqed
41
41
  # a symbol, :red, :green, :blue, describing the boundary color within the stage
42
42
  attr_accessor :boundary_color
43
43
 
44
- def initialize(image: image, pattern: pattern, has_border: true, boundary_color: :green)
44
+ # Boolean, whether to do the boundary detection (not stage detection at present) against a thumbnail version of the passed image (faster, less accurate, true by default)
45
+ attr_accessor :use_thumbnail
46
+
47
+ def initialize(image: image, pattern: pattern, has_border: true, boundary_color: :green, use_thumbnail: true)
45
48
  raise 'extraction pattern not defined' if pattern && !SqedConfig::EXTRACTION_PATTERNS.keys.include?(pattern)
46
49
 
47
50
  @image = image
@@ -51,7 +54,7 @@ class Sqed
51
54
  @pattern = pattern
52
55
  @pattern ||= :cross
53
56
  @boundary_color = boundary_color
54
-
57
+ @use_thumbnail = use_thumbnail
55
58
  set_stage_boundary if @image
56
59
  end
57
60
 
@@ -108,6 +111,7 @@ class Sqed
108
111
  extractor.result
109
112
  end
110
113
 
114
+ # Debugging purposes
111
115
  def attributes
112
116
  {
113
117
  image: @image,
@@ -115,7 +119,8 @@ class Sqed
115
119
  stage_boundary: stage_boundary,
116
120
  has_border: @has_border,
117
121
  pattern: @pattern,
118
- boundary_color: @boundary_color
122
+ boundary_color: @boundary_color,
123
+ use_thumbnail: @use_thumbnail
119
124
  }
120
125
  end
121
126
 
@@ -138,11 +143,12 @@ class Sqed
138
143
  def get_section_boundaries
139
144
  boundary_finder_class = SqedConfig::EXTRACTION_PATTERNS[@pattern][:boundary_finder]
140
145
 
141
- options = {image: stage_image}
146
+ options = {image: stage_image, use_thumbnail: use_thumbnail}
142
147
  options.merge!( layout: SqedConfig::EXTRACTION_PATTERNS[@pattern][:layout] ) unless boundary_finder_class.name == 'Sqed::BoundaryFinder::CrossFinder'
143
- options.merge!( boundary_color: @boundary_color) if boundary_finder_class.name == 'Sqed::BoundaryFinder::ColorLineFinder'
148
+ options.merge!( boundary_color: @boundary_color) if boundary_finder_class.name == 'Sqed::BoundaryFinder::ColorLineFinder'
144
149
 
145
150
  boundary_finder_class.new(options).boundaries
151
+
146
152
  end
147
153
 
148
154
  end
@@ -91,4 +91,18 @@ class Sqed::Boundaries
91
91
  end
92
92
  true
93
93
  end
94
+
95
+ def zoom(width_factor, height_factor)
96
+ coordinates.keys.each do |i|
97
+ set(i, [
98
+ (x_for(i).to_f * width_factor).to_i,
99
+ (y_for(i).to_f * height_factor).to_i,
100
+ (width_for(i).to_f * width_factor).to_i,
101
+ (height_for(i).to_f * height_factor).to_i
102
+ ])
103
+
104
+ end
105
+ end
106
+
107
+
94
108
  end
@@ -2,6 +2,9 @@
2
2
  # return derivative images. Finders operate on cropped images, i.e. only the "stage".
3
3
  #
4
4
  class Sqed::BoundaryFinder
5
+
6
+ THUMB_SIZE = 100
7
+
5
8
  # the passed image
6
9
  attr_reader :img
7
10
 
@@ -11,10 +14,18 @@ class Sqed::BoundaryFinder
11
14
  # A Sqed::Boundaries instance, stores the coordinates of all of the layout sections
12
15
  attr_reader :boundaries
13
16
 
14
- def initialize(image: image, layout: layout)
17
+ # Whether to compress the original image to a thumbnail when finding boundaries
18
+ attr_reader :use_thumbnail
19
+
20
+ # when we compute using a derived thumbnail we temporarily store the full size image here
21
+ attr_reader :original_image
22
+
23
+ def initialize(image: image, layout: layout, use_thumbnail: true)
15
24
  raise 'No layout provided.' if layout.nil?
16
25
  raise 'No image provided.' if image.nil? || image.class.name != 'Magick::Image'
17
26
 
27
+ @use_thumbnail = use_thumbnail
28
+
18
29
  @layout = layout
19
30
  @img = image
20
31
  true
@@ -25,12 +36,46 @@ class Sqed::BoundaryFinder
25
36
  @boundaries ||= Sqed::Boundaries.new(@layout)
26
37
  end
27
38
 
39
+ def longest_thumbnail_axis
40
+ img.columns > img.rows ? :width : :height
41
+ end
42
+
43
+ def thumbnail_height
44
+ if longest_thumbnail_axis == :height
45
+ THUMB_SIZE
46
+ else
47
+ (img.rows.to_f * (THUMB_SIZE.to_f / img.columns.to_f)).round.to_i
48
+ end
49
+ end
50
+
51
+ def thumbnail_width
52
+ if longest_thumbnail_axis == :width
53
+ THUMB_SIZE
54
+ else
55
+ (img.columns.to_f * (THUMB_SIZE.to_f / img.rows.to_f)).round.to_i
56
+ end
57
+ end
58
+
59
+ # see https://rmagick.github.io/image3.html#thumbnail
60
+ def thumbnail
61
+ img.thumbnail(thumbnail_width, thumbnail_height)
62
+ end
63
+
64
+ def width_factor
65
+ img.columns.to_f / thumbnail_width.to_f
66
+ end
67
+
68
+ def height_factor
69
+ img.rows.to_f / thumbnail_height.to_f
70
+ end
71
+
72
+ def zoom_boundaries
73
+ boundaries.zoom(width_factor, height_factor )
74
+ end
28
75
 
29
76
  # return [Integer, nil]
30
77
  # sample more with small images, less with large images
31
78
  # we want to return larger numbers (= faster sampling)
32
- #
33
- #
34
79
  def self.get_subdivision_size(image_width)
35
80
  case image_width
36
81
  when nil
@@ -69,6 +114,7 @@ class Sqed::BoundaryFinder
69
114
  # (:rows|:columns), :rows finds vertical borders, :columns finds horizontal borders
70
115
  #
71
116
  def self.color_boundary_finder(image: image, sample_subdivision_size: nil, sample_cutoff_factor: nil, scan: :rows, boundary_color: :green)
117
+
72
118
  image_width = image.send(scan)
73
119
  sample_subdivision_size = get_subdivision_size(image_width) if sample_subdivision_size.nil?
74
120
  samples_to_take = (image_width / sample_subdivision_size).to_i - 1
@@ -105,6 +151,8 @@ class Sqed::BoundaryFinder
105
151
 
106
152
  if sample_cutoff_factor.nil?
107
153
  cutoff = max_difference(border_hits.values)
154
+
155
+ cutoff = border_hits.values.first - 1 if cutoff == 0 # difference of two identical things is 0
108
156
  else
109
157
  cutoff = (samples_to_take * sample_cutoff_factor).to_i
110
158
  end
@@ -132,6 +180,7 @@ class Sqed::BoundaryFinder
132
180
  # return [Array]
133
181
  # the median position of all (pixel) positions that have a count greater than the cutoff
134
182
  def self.frequency_stats(frequency_hash, sample_cutoff = 0)
183
+
135
184
  return nil if sample_cutoff.nil? || sample_cutoff < 1
136
185
  hit_ranges = []
137
186
 
@@ -141,7 +190,18 @@ class Sqed::BoundaryFinder
141
190
  end
142
191
  end
143
192
 
144
- return nil if hit_ranges.size < 3
193
+ case hit_ranges.size
194
+ when 1
195
+ c = hit_ranges[0]
196
+ hit_ranges = [c - 1, c, c + 1]
197
+ when 2
198
+ hit_ranges.sort!
199
+ c1 = hit_ranges[0]
200
+ c2 = hit_ranges[1]
201
+ hit_ranges = [c1, c2, c2 + (c2 - c1)]
202
+ when 0
203
+ return nil
204
+ end
145
205
 
146
206
  # we have to sort because the keys (positions) we examined came unordered from a hash originally
147
207
  hit_ranges.sort!
@@ -174,3 +234,4 @@ class Sqed::BoundaryFinder
174
234
  end
175
235
 
176
236
  end
237
+
@@ -4,10 +4,15 @@ require 'rmagick'
4
4
  #
5
5
  class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
6
6
 
7
- def initialize(image: image, layout: layout, boundary_color: :green)
8
- super(image: image, layout: layout)
7
+ def initialize(image: image, layout: layout, boundary_color: :green, use_thumbnail: true)
8
+ super(image: image, layout: layout, use_thumbnail: use_thumbnail)
9
9
  raise 'No layout provided.' if @layout.nil?
10
10
  @boundary_color = boundary_color
11
+
12
+ if use_thumbnail
13
+ @original_image = @img.copy
14
+ @img = thumbnail
15
+ end
11
16
  find_bands
12
17
  end
13
18
 
@@ -25,26 +30,28 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
25
30
  when :horizontal_split
26
31
  t = Sqed::BoundaryFinder.color_boundary_finder(image: img, scan: :columns, boundary_color: @boundary_color) # set to detect horizontal division, (green line)
27
32
  return if t.nil?
33
+
28
34
  boundaries.set(0, [0, 0, img.columns, t[0]]) # upper section of image
29
35
  boundaries.set(1, [0, t[2], img.columns, img.rows - t[2]]) # lower section of image
30
36
 
31
37
  when :right_t # only 3 zones expected, with horizontal division in right-side of vertical division
32
- vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
33
- irt = img.crop(*vertical.for(1), true)
34
- right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
38
+ vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
35
39
 
36
- boundaries.set(0, vertical.for(0))
40
+ irt = img.crop(*vertical.for(1), true)
41
+ right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
42
+
43
+ boundaries.set(0, vertical.for(0))
37
44
  boundaries.set(1, [ vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ] )
38
45
  boundaries.set(2, [ vertical.x_for(1), right.y_for(1), right.width_for(1), right.height_for(1)] )
39
46
 
40
47
  when :vertical_offset_cross # 4 zones expected, with (varying) horizontal division in left- and right- sides of vertical division
41
- vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
42
-
48
+ vertical = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
49
+
43
50
  ilt = img.crop(*vertical.for(0), true)
44
51
  irt = img.crop(*vertical.for(1), true)
45
52
 
46
- left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
47
- right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
53
+ left = self.class.new(image: ilt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries # fails
54
+ right = self.class.new(image: irt, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries # OK
48
55
 
49
56
  boundaries.set(0, [0, 0, left.width_for(0), left.height_for(0) ])
50
57
  boundaries.set(1, [vertical.x_for(1), 0, right.width_for(0), right.height_for(0) ])
@@ -53,13 +60,13 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
53
60
 
54
61
  # No specs for this yet
55
62
  when :horizontal_offset_cross
56
- horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
63
+ horizontal = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
57
64
 
58
65
  itop = img.crop(*horizontal.for(0), true)
59
66
  ibottom = img.crop(*horizontal.for(1), true)
60
67
 
61
- top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
62
- bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
68
+ top = self.class.new(image: ilt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
69
+ bottom = self.class.new(image: irt, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
63
70
 
64
71
  boundaries.set(0, [0, 0, top.width_for(0), top.height_for(0) ])
65
72
  boundaries.set(1, [top.x_for(1), 0, top.width_for(1), top.height_for(1) ])
@@ -67,8 +74,8 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
67
74
  boundaries.set(3, [0, horizontal.y_for(1), bottom.width_for(0), bottom.height_for(0) ])
68
75
 
69
76
  when :cross # 4 zones, with perfectly intersected horizontal and vertical division
70
- v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color ).boundaries
71
- h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color ).boundaries
77
+ v = self.class.new(image: @img, layout: :vertical_split, boundary_color: @boundary_color, use_thumbnail: false ).boundaries
78
+ h = self.class.new(image: @img, layout: :horizontal_split, boundary_color: @boundary_color, use_thumbnail: false).boundaries
72
79
 
73
80
  return if v.nil? || h.nil?
74
81
 
@@ -84,6 +91,12 @@ class Sqed::BoundaryFinder::ColorLineFinder < Sqed::BoundaryFinder
84
91
 
85
92
  boundaries.complete = true if boundaries.populated?
86
93
 
94
+ if use_thumbnail
95
+ @img = @original_image
96
+ zoom_boundaries
97
+ @original_image = nil
98
+ end
99
+
87
100
  end
88
101
 
89
102
 
@@ -5,13 +5,13 @@ require 'rmagick'
5
5
  class Sqed::BoundaryFinder::CrossFinder < Sqed::BoundaryFinder
6
6
 
7
7
  def initialize(image: image)
8
- @image = image
8
+ @img = image
9
9
  find_edges
10
10
  end
11
11
 
12
12
  def find_edges
13
- width = @image.columns / 2
14
- height = @image.rows / 2
13
+ width = @img.columns / 2
14
+ height = @img.rows / 2
15
15
 
16
16
  boundaries.coordinates[0] = [0, 0, width, height]
17
17
  boundaries.coordinates[1] = [width, 0, width, height]
@@ -3,7 +3,7 @@ require 'rmagick'
3
3
  # Some of this code was originally inspired by Emmanuel Oga's gist https://gist.github.com/EmmanuelOga/2476153.
4
4
  #
5
5
  class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
6
-
6
+
7
7
  # The proc containing the border finding algorithm
8
8
  attr_reader :is_border
9
9
 
@@ -11,23 +11,24 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
11
11
 
12
12
  # How small we accept a cropped picture to be. E.G. if it was 100x100 and
13
13
  # ratio 0.1, min output should be 10x10
14
- MIN_CROP_RATIO = 0.1
14
+ MIN_CROP_RATIO = 0.1
15
15
 
16
- attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
16
+ attr_reader :x0, :y0, :x1, :y1, :min_width, :min_height, :rows, :columns
17
17
 
18
18
  def initialize(image: image, is_border_proc: nil, min_ratio: MIN_CROP_RATIO)
19
- super(image: image, layout: :internal_box)
19
+ super(image: image, layout: :internal_box)
20
20
 
21
- @min_ratio = min_ratio
21
+ @min_ratio = min_ratio
22
22
 
23
23
  # Initial co-ordinates
24
24
  @x0, @y0 = 0, 0
25
- @x1, @y1 = img.columns, img.rows
25
+ @x1, @y1 = img.columns, img.rows
26
26
  @min_width, @min_height = img.columns * @min_ratio, img.rows * @min_ratio # minimum resultant area
27
27
  @columns, @rows = img.columns, img.rows
28
28
 
29
+
29
30
  # We need a border finder proc. Provide one if none was given.
30
- @is_border = is_border_proc || self.class.default_border_finder(img) # if no proc specified, use default below
31
+ @is_border = is_border_proc || self.class.default_border_finder(img) # if no proc specified, use default below
31
32
 
32
33
  @x00 = @x0
33
34
  @y00 = @y0
@@ -51,8 +52,9 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
51
52
  # works for 0.5, >0.137; 0.60, >0.14 0.65, >0.146; 0.70, >0.1875; 0.75, >0.1875; 0.8, >0.237; 0.85, >0.24; 0.90, >0.28; 0.95, >0.25
52
53
  # fails for 0.75, (0.18, 0.17,0.16,0.15); 0.70, 0.18;
53
54
  #
54
- def self.default_border_finder(img, samples = 5, threshold = 0.75, fuzz_factor = 0.40) # working on non-synthetic images 04-dec-2014
55
- fuzz = ((Magick::QuantumRange + 1) * fuzz_factor).to_i
55
+ # this sets variables (locally) for find_edges
56
+ def self.default_border_finder(img, samples = 5, threshold = 0.75, fuzz_factor = 0.40) # working on non-synthetic images 04-dec-2014
57
+ fuzz = ((Magick::QuantumRange + 1) * fuzz_factor).to_i
56
58
  # Returns true if the edge is a border (border meaning outer region to be cropped)
57
59
  lambda do |edge|
58
60
  border, non_border = 0.0, 0.0 # maybe should be called outer, inner
@@ -79,7 +81,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
79
81
  # handle this exception
80
82
  return unless is_border # return if no process defined or set for @is_border
81
83
 
82
- u = x1 - 1 # rightmost pixel (kind of)
84
+ u = x1 - 1 # rightmost pixel (kind of)
83
85
  # increment from left to right
84
86
  x0.upto(u) do |x|
85
87
  if width_croppable? && is_border[vline(x)] then
@@ -89,7 +91,7 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
89
91
  end
90
92
  end
91
93
  # decrement from right to left
92
- (u).downto(x0) { |x| width_croppable? && is_border[vline(x)] ? @x1 = x - 1 : break }
94
+ (u).downto(x0) { |x| width_croppable? && is_border[vline(x)] ? @x1 = x - 1 : break }
93
95
 
94
96
  u = y1 - 1
95
97
  0.upto(u) do |y|
@@ -104,10 +106,11 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
104
106
 
105
107
  delta_x = 0 #width/50 # 2% of cropped image to make up for trapezoidal distortion
106
108
  delta_y = 0 #height/50 # 2% of cropped image to make up for trapezoidal distortion <- NOT 3%
107
-
109
+
108
110
  # TODO: add conditions
109
- boundaries.complete = true
110
- boundaries.coordinates[0] = [x0 + delta_x, y0 + delta_y, width - 2*delta_x, height - 2*delta_y]
111
+ boundaries.complete = true
112
+ boundaries.set(0, [x0 + delta_x, y0 + delta_y, width - 2*delta_x, height - 2*delta_y])
113
+
111
114
  end
112
115
 
113
116
  def width_croppable?
@@ -127,13 +130,13 @@ class Sqed::BoundaryFinder::StageFinder < Sqed::BoundaryFinder
127
130
  end
128
131
 
129
132
  # actually + 1 (starting at zero?)
130
- def width
131
- @x1 - @x0
133
+ def width
134
+ @x1 - @x0
132
135
  end
133
-
136
+
134
137
  # actually + 1 (starting at zero?)
135
138
  def height
136
- @y1 - @y0
139
+ @y1 - @y0
137
140
  end
138
141
 
139
142
  end
@@ -42,10 +42,11 @@ class Sqed::Extractor
42
42
  if parsers = SqedConfig::SECTION_PARSERS[section_type]
43
43
 
44
44
  section_image = r.send("#{section_type}_image")
45
+
45
46
  updated = r.send(section_type)
46
47
 
47
48
  parsers.each do |p|
48
- parsed_result = p.new(section_image).text
49
+ parsed_result = p.new(section_image).text(section_type: section_type)
49
50
  updated.merge!(p::TYPE => parsed_result) if parsed_result
50
51
  end
51
52
 
@@ -10,8 +10,9 @@ class Sqed::Parser
10
10
  raise 'no image provided to parser' if @image && !(@image.class.name == 'Magick::Image')
11
11
  end
12
12
 
13
+ # TODO: is this required?!
13
14
  # must be provided in subclasses
14
- def text
15
+ def text(section_type: :default)
15
16
  nil
16
17
  end
17
18
 
@@ -36,7 +36,7 @@ class Sqed::Parser::BarcodeParser < Sqed::Parser
36
36
  #end
37
37
 
38
38
  # alias to a universal method
39
- def text
39
+ def text(section_type: :default)
40
40
  barcode
41
41
  end
42
42
 
@@ -2,7 +2,20 @@
2
2
  #
3
3
  # Given a single image return all text in that image.
4
4
  #
5
- # For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
5
+ # For reference
6
+ # http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
7
+ # https://code.google.com/p/tesseract-ocr/wiki/FAQ
8
+ # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
9
+ #
10
+ # "There is a minimum text size for reasonable accuracy.
11
+ # You have to consider resolution as well as point size.
12
+ # Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi.
13
+ # A quick check is to count the pixels of the x-height of your characters.
14
+ # (X-height is the height of the lower case x.)
15
+ # At 10pt x 300dpi x-heights are typically about 20 pixels, although this
16
+ # can vary dramatically from font to font.
17
+ # Below an x-height of 10 pixels, you have very little chance of accurate results,
18
+ # and below about 8 pixels, most of the text will be "noise removed".
6
19
  #
7
20
  require 'rtesseract'
8
21
 
@@ -10,60 +23,84 @@ class Sqed::Parser::OcrParser < Sqed::Parser
10
23
 
11
24
  TYPE = :text
12
25
 
26
+ # Tesseract parameters default/specific to section type,
27
+ # default is merged into the type
28
+ SECTION_PARAMS = {
29
+ default: {
30
+ psm: 3,
31
+ # classify_debug_level: 5,
32
+ # lang: 'eng',
33
+ # load_system_dawg: 0,
34
+ # load_unambig_dawg: 0,
35
+ # load_freq_dawg: 0,
36
+ # load_fixed_length_dawgs: 0,
37
+ # load_number_dawg: 0,
38
+ # load_punc_dawg: 1, ## important
39
+ # load_unambig_dawg: 1,
40
+ # chop_enable: 0,
41
+ # enable_new_segsearch: 1,
42
+ # tessedit_debug_quality_metrics: 1,
43
+ # tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
44
+ # tessedit_write_images: 1,
45
+ # equationdetect_save_merged_image: 1,
46
+ # tessedit_dump_pageseg_images: 1,
47
+ # equationdetect_save_bi_image: 1
48
+ },
49
+ annotated_specimen: {
50
+ edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
51
+ },
52
+ identifier: {
53
+ psm: 1,
54
+ # tessedit_char_whitelist: '0123456789'
55
+ # edges_children_count_limit: 4000
56
+ },
57
+ curator_metadata: {
58
+ },
59
+ labels: {
60
+ psm: 3, # may need to be 6
61
+ },
62
+ deterimination_labels: {
63
+ psm: 3
64
+ }
65
+ }
66
+
13
67
  # the text extracted from the image
14
68
  attr_accessor :text
15
69
 
16
- # https://code.google.com/p/tesseract-ocr/wiki/FAQ
17
- def text
18
- img = @image #.white_threshold(245)
19
-
20
- # @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed,
21
- # get potential border pixel color (based on quadrant?)
22
- new_color = img.pixel_color(1, 1)
23
- # img = img.scale(2)
24
- # img.write('foo0.jpg.jpg')
25
- # img = img.enhance
26
- # img.write('foo1.jpg')
27
- # img = img.quantize(8, Magick::GRAYColorspace)
28
- # img.write('foo1.jpg')
29
- # img = img.sharpen(1.0, 0.2)
30
- # img.write('foo2.jpg')
31
- # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
32
- # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
33
- # img.write('tmp/foo4.jpg')
34
- # img = img.quantize(2, Magick::GRAYColorspace)
35
- # #img = img.threshold(0.5)
36
- # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
37
- # img = img.equalize #(32, Magick::GRAYColorspace)
38
- # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
39
- # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
40
- #
41
- # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
42
-
43
-
44
- # From https://code.google.com/p/tesseract-ocr/wiki/FAQ
45
- # " There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be "noise removed".
46
-
47
-
48
- # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
49
- # doesn't supprot outputbase
50
- r = RTesseract.new(img, lang: 'eng', psm: 1,
51
- load_system_dawg: 0,
52
- tessedit_debug_quality_metrics: 1,
53
- load_freq_dawg: 1 ,
54
- chop_enable: 1,
55
- tessedit_write_images: 1,
56
- equationdetect_save_merged_image: 1,
57
- tessedit_dump_pageseg_images: 1,
58
- equationdetect_save_bi_image: 1,
59
- load_unambig_dawg: 0,
60
- tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3,
61
-
62
- # img = img.white_threshold(245)
70
+ # future consideration
71
+ # def enhance_image(img)
72
+ # get potential border pixel color (based on quadrant?)
73
+ # new_color = img.pixel_color(1, 1)
63
74
 
75
+ # img = img.scale(2)
76
+ # img.write('foo0.jpg.jpg')
77
+ # img = img.enhance
78
+ # img.write('foo1.jpg')
79
+ # img = img.quantize(8, Magick::GRAYColorspace)
80
+ # img.write('foo1.jpg')
81
+ # img = img.sharpen(1.0, 0.2)
82
+ # img.write('foo2.jpg')
83
+ # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
84
+ # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
85
+ # img.write('tmp/foo4.jpg')
86
+ # img = img.quantize(2, Magick::GRAYColorspace)
87
+ # #img = img.threshold(0.5)
88
+ # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
89
+ # img = img.equalize #(32, Magick::GRAYColorspace)
90
+ # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
91
+ # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
92
+ #
93
+ # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
94
+ # img = img.white_threshold(245)
95
+ # img
96
+ # end
97
+
98
+ def text(section_type: :default)
99
+ img = @image
100
+ params = SECTION_PARAMS[:default].merge(SECTION_PARAMS[section_type])
101
+ r = RTesseract.new(img, params)
64
102
  @text = r.to_s.strip
65
103
  end
66
104
 
67
- # Need to provide tuning methods here, i.e. image transormations that facilitate OCR
68
105
 
69
106
  end
@@ -1,3 +1,3 @@
1
1
  class Sqed
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -8,25 +8,25 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
8
8
  let(:c) {b.boundaries}
9
9
  let(:d) { image.crop(*c.for(0), true) }
10
10
 
11
- let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t) }
11
+ let(:e) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :right_t, use_thumbnail: false) }
12
12
  let(:f) { e.boundaries }
13
- let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross)}
13
+ let(:g) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_offset_cross, use_thumbnail: false)}
14
14
  let(:h) { g.boundaries }
15
- let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split) }
15
+ let(:gv) { Sqed::BoundaryFinder::ColorLineFinder.new(image: d, layout: :vertical_split, use_thumbnail: false) }
16
16
  let(:hv) { gv.boundaries }
17
17
 
18
18
  let(:ah) { ImageHelpers.vertical_offset_cross_red }
19
19
  let(:bh) { Sqed::BoundaryFinder::StageFinder.new(image: ah) }
20
20
  let(:ch) { bh.boundaries }
21
21
  let(:dh) { ah.crop(*ch.for(0), true) }
22
- let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red) } # was :horizontal_split
22
+ let(:gh) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dh, layout: :horizontal_split, boundary_color: :red, use_thumbnail: false) } # was :horizontal_split
23
23
  let(:hh) { gh.boundaries }
24
24
 
25
25
  let(:ibs) { ImageHelpers.black_stage_green_line_specimen }
26
26
  let(:bbs) { Sqed::BoundaryFinder::StageFinder.new(image: ibs) }
27
27
  let(:cbs) { bbs.boundaries }
28
28
  let(:dbs) { ibs.crop(*cbs.for(0), true) }
29
- let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross) }
29
+ let(:gbs) { Sqed::BoundaryFinder::ColorLineFinder.new(image: dbs, layout: :vertical_offset_cross, use_thumbnail: false) }
30
30
  let(:hbs) { gbs.boundaries }
31
31
 
32
32
  specify 'initial image columns are as expected for :image above' do
@@ -167,7 +167,7 @@ describe Sqed::BoundaryFinder::ColorLineFinder do
167
167
  context 'thumbnail processing finds reasonable boundaries' do
168
168
 
169
169
  let(:thumb) { ImageHelpers.frost_stage_thumb }
170
- let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross)}
170
+ let(:finder) { Sqed::BoundaryFinder::ColorLineFinder.new(image: thumb, layout: :cross, use_thumbnail: false)}
171
171
  let(:finder_boundaries) { finder.boundaries }
172
172
 
173
173
  let(:pct) { 0.08 }
@@ -22,7 +22,8 @@ describe Sqed::BoundaryFinder::CrossFinder do
22
22
  end
23
23
 
24
24
  specify 'the 0th image has height = 300' do
25
- expect(c.height_for(0)).to eq(300)
25
+ pct = 0.02
26
+ expect(c.height_for(0)).to be_within(pct*300).of(300)
26
27
  end
27
28
 
28
29
  end
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Sqed::BoundaryFinder do
4
4
 
5
5
  specify 'when no image provided, #new raises' do
6
- expect { Sqed::BoundaryFinder.new() }.to raise_error
6
+ expect { Sqed::BoundaryFinder.new() }.to raise_error('No layout provided.')
7
7
  end
8
8
 
9
9
  context 'when initiated with an image' do
@@ -103,9 +103,15 @@ describe Sqed::BoundaryFinder do
103
103
  expect( Sqed::BoundaryFinder.frequency_stats(i, 12)).to eq([3, 4, 5])
104
104
  end
105
105
 
106
+ specify 'returns estimated borders if only one hit greater than samples taken' do
107
+ expect( Sqed::BoundaryFinder.frequency_stats(i, 15)).to eq([2,3,4])
108
+ end
109
+
106
110
  specify 'returns nil if no count is greater than samples taken' do
107
- expect( Sqed::BoundaryFinder.frequency_stats(i, 15)).to eq(nil)
111
+ expect( Sqed::BoundaryFinder.frequency_stats(i, 20)).to eq(nil)
108
112
  end
113
+
114
+
109
115
  end
110
116
 
111
117
  context 'offset boundaries from crossy_black_line_specimen image ' do
@@ -90,20 +90,23 @@ describe Sqed do
90
90
 
91
91
  context '#result' do
92
92
  let(:r) { s.result }
93
+
93
94
  specify 'returns a Sqed::Result' do
94
95
  expect(r.class.name).to eq('Sqed::Result')
95
96
  end
96
97
 
97
98
  context 'extracted data' do
98
- specify 'for an :identifier section' do
99
+ specify 'text for an :identifier section' do
100
+
101
+ r.identifier_image.write('41.jpg')
99
102
  expect(r.text_for(:identifier)).to match('000041196')
100
103
  end
101
104
 
102
- specify 'for an annotated_specimen section' do
105
+ specify 'text for an annotated_specimen section' do
103
106
  expect(r.text_for(:annotated_specimen)).to match('Saucier Creek')
104
107
  end
105
108
 
106
- specify 'for a curator_metadata section' do
109
+ specify 'text for a curator_metadata section' do
107
110
  expect(r.text_for(:curator_metadata)).to match('Frost Entomological Museum')
108
111
  end
109
112
  end
@@ -113,7 +116,7 @@ describe Sqed do
113
116
  context 'all together, with border' do
114
117
  let(:image) { ImageHelpers.greenline_image }
115
118
  let(:pattern) { :right_t }
116
- let(:s) { Sqed.new(image: image, pattern: pattern, has_border: false) }
119
+ let(:s) { Sqed.new(image: image, pattern: pattern, has_border: true) }
117
120
 
118
121
  specify '#boundaries returns a Sqed::Boundaries instance' do
119
122
  expect(s.boundaries.class.name).to eq('Sqed::Boundaries')
@@ -138,11 +141,12 @@ describe Sqed do
138
141
  end
139
142
 
140
143
  context 'extracted data' do
141
- specify 'for an :identifier section' do
144
+ specify 'text for an :identifier section' do
145
+ r.identifier_image.write('85.jpg')
142
146
  expect(r.text_for(:identifier)).to match('000085067')
143
147
  end
144
148
 
145
- specify 'for a specimen section' do
149
+ specify 'text for a specimen section' do
146
150
  expect(r.text_for(:annotated_specimen)).to match('Aeshna')
147
151
  end
148
152
  end
@@ -23,7 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency 'rtesseract', '~> 1.2.6'
24
24
  spec.add_dependency 'zxing_cpp', '~> 0.1.0'
25
25
 
26
- spec.add_development_dependency 'rspec'
26
+ spec.add_development_dependency 'rspec', '~> 3.3'
27
27
  spec.add_development_dependency 'bundler', '~> 1.5'
28
28
  spec.add_development_dependency 'did_you_mean', '~> 0.9'
29
29
  spec.add_development_dependency 'byebug'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sqed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Yoder
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-06-11 00:00:00.000000000 Z
12
+ date: 2015-09-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -71,16 +71,16 @@ dependencies:
71
71
  name: rspec
72
72
  requirement: !ruby/object:Gem::Requirement
73
73
  requirements:
74
- - - ">="
74
+ - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0'
76
+ version: '3.3'
77
77
  type: :development
78
78
  prerelease: false
79
79
  version_requirements: !ruby/object:Gem::Requirement
80
80
  requirements:
81
- - - ">="
81
+ - - "~>"
82
82
  - !ruby/object:Gem::Version
83
- version: '0'
83
+ version: '3.3'
84
84
  - !ruby/object:Gem::Dependency
85
85
  name: bundler
86
86
  requirement: !ruby/object:Gem::Requirement