tesseract-ocr 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ module Tesseract; class Engine
26
+
27
# Read-only wrapper around a baseline struct handed over by the native
# iterator. One reader method is generated for every member listed in
# C::Iterator::Baseline's layout.
class Baseline
  # struct - the struct instance to expose; it is kept as-is and indexed by
  #          member name each time a reader is called.
  def initialize(struct)
    @internal = struct
  end

  # Define one reader per layout member; the value is looked up in the
  # wrapped struct at call time, not copied at construction.
  C::Iterator::Baseline.layout.members.each do |member|
    define_method(member) { @internal[member] }
  end

  # Short representation built from the x1/y1/x2/y2 readers.
  # NOTE(review): presumably the two end points of the baseline -- confirm
  # against the C struct definition.
  def inspect
    "#<Baseline: #{x1};#{y1} #{x2};#{y2}>"
  end
end
42
+
43
+ end; end
@@ -0,0 +1,54 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ module Tesseract; class Engine
26
+
27
# Read-only wrapper around a bounding-box struct handed over by the native
# iterator. Readers are generated from C::Iterator::BoundingBox's layout;
# this class relies on the left, top, right and bottom members.
class BoundingBox
  # struct - the struct instance to expose; stored as-is and indexed by
  #          member name on every reader call.
  def initialize(struct)
    @internal = struct
  end

  # One reader per layout member, resolved against the wrapped struct at
  # call time.
  C::Iterator::BoundingBox.layout.members.each do |member|
    define_method(member) { @internal[member] }
  end

  # x and y are alternative names for the left and top edges.
  alias_method :x, :left
  alias_method :y, :top

  # Horizontal extent: right edge minus left edge.
  def width
    right - left
  end

  # Vertical extent: bottom edge minus top edge.
  def height
    bottom - top
  end

  # Representation of the form "#<BoundingBox(x, y): WIDTHxHEIGHT>".
  def inspect
    "#<BoundingBox(#{x}, #{y}): #{width}x#{height}>"
  end
end
53
+
54
+ end; end
@@ -0,0 +1,50 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ module Tesseract; class Engine
26
+
27
# Read-only wrapper around a font-attributes struct handed over by the
# native iterator. Readers are generated from C::Iterator::FontAttributes'
# layout; this class relies on id, name, pointsize and the is_* members.
class FontAttributes
  # struct - the struct instance to expose; stored as-is and indexed by
  #          member name on every reader call.
  def initialize(struct)
    @internal = struct
  end

  # One reader per layout member, resolved against the wrapped struct at
  # call time.
  C::Iterator::FontAttributes.layout.members.each do |member|
    define_method(member) { @internal[member] }
  end

  # Predicate-style names (bold?, italic?, ...) for the is_* readers.
  %w(bold italic underlined monospace serif smallcaps).each do |flag|
    alias_method "#{flag}?", "is_#{flag}"
  end

  # Representation listing id, name ('unknown' when the name is nil or
  # false) and point size, followed by one word per predicate that is
  # truthy.
  def inspect
    summary = "#<Font(#{id} #{name || 'unknown'} #{pointsize}pt):"

    summary << ' bold'       if bold?
    summary << ' italic'     if italic?
    summary << ' underlined' if underlined?
    summary << ' monospace'  if monospace?
    summary << ' serif'      if serif?
    summary << ' smallcaps'  if smallcaps?

    summary << '>'
  end
end
49
+
50
+ end; end
@@ -0,0 +1,161 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ require 'tesseract/engine/bounding_box'
26
+ require 'tesseract/engine/baseline'
27
+ require 'tesseract/engine/orientation'
28
+ require 'tesseract/engine/font_attributes'
29
+
30
+ module Tesseract; class Engine
31
+
32
+ class Iterator
33
# View of one recognised item (block, paragraph, line, word or symbol) at
# the position the native iterator currently points to. Every reader is
# preceded by `memoize` (from the 'memoized' gem; presumably it caches the
# result of the method defined right after it -- confirm against that gem).
class Element
  # Picks the class used to represent elements of the given level.
  #
  # level - level name such as "word"; it is capitalized and looked up as a
  #         constant defined directly on Iterator.
  #
  # Returns the matching class (e.g. Block, Word, Symbol), or Element itself
  # when Iterator defines no constant of that name.
  def self.for(level)
    # inherit = false restricts the lookup to constants owned by Iterator,
    # so an unrelated top-level ::Line or ::Paragraph is never returned.
    Iterator.const_get(level.capitalize, false)
  rescue NameError
    # NameError also covers NoMethodError (its subclass), e.g. when level
    # does not respond to #capitalize.
    self
  end

  # level    - level name forwarded to the native iterator calls.
  # iterator - the native iterator the values are read from.
  def initialize(level, iterator)
    @level    = level
    @iterator = iterator
  end

  # Bounding box of the element at @level, wrapped in a BoundingBox.
  memoize
  def bounding_box
    BoundingBox.new(@iterator.bounding_box(@level))
  end

  # Binary image of the element; best-effort: any StandardError raised by
  # the iterator yields nil instead of propagating.
  memoize
  def binary_image
    @iterator.get_binary_image(@level)
  rescue StandardError
    nil
  end

  # Image of the element; best-effort: any StandardError raised by the
  # iterator yields nil instead of propagating.
  memoize
  def image
    @iterator.get_image(@level)
  rescue StandardError
    nil
  end

  # Baseline of the element at @level, wrapped in a Baseline.
  memoize
  def baseline
    Baseline.new(@iterator.baseline(@level))
  end

  # Orientation reported by the iterator (no level is passed), wrapped in
  # an Orientation.
  memoize
  def orientation
    Orientation.new(@iterator.orientation)
  end

  # Text the iterator reports for @level.
  memoize
  def text
    @iterator.get_text(@level)
  end

  # Confidence the iterator reports for @level.
  memoize
  def confidence
    @iterator.confidence(@level)
  end

  alias to_s text

  # Representation of the form "#<Tesseract::Level(confidence): "text">".
  def inspect
    "#<Tesseract::#{@level.capitalize}(#{confidence}): #{text.inspect}>"
  end
end
86
+
87
# Element subclass for block-level items; adds the block type on top of
# the readers inherited from Element.
class Block < Element
  # Block type as reported by the iterator's block_type call.
  memoize
  def type
    @iterator.block_type
  end
end
93
+
94
# Element subclass for word-level items; adds font information and two
# word predicates on top of the readers inherited from Element.
class Word < Element
  # Font attributes of the current word, wrapped in a FontAttributes.
  memoize
  def font_attributes
    FontAttributes.new(@iterator.word_font_attributes)
  end

  # Result of the iterator's word_is_from_dictionary? call.
  memoize
  def from_dictionary?
    @iterator.word_is_from_dictionary?
  end

  # Result of the iterator's word_is_numeric? call.
  memoize
  def numeric?
    @iterator.word_is_numeric?
  end
end
110
+
111
# Element subclass for symbol-level items; adds three symbol predicates on
# top of the readers inherited from Element. Inside Iterator this constant
# shadows Ruby's core ::Symbol.
class Symbol < Element
  # Result of the iterator's symbol_is_superscript? call.
  memoize
  def superscript?
    @iterator.symbol_is_superscript?
  end

  # Result of the iterator's symbol_is_subscript? call.
  memoize
  def subscript?
    @iterator.symbol_is_subscript?
  end

  # Result of the iterator's symbol_is_dropcap? call.
  memoize
  def dropcap?
    @iterator.symbol_is_dropcap?
  end
end
127
+
128
# iterator - the native iterator that the each_<level> and <level>s
#            methods of this class drive.
def initialize(iterator)
  @iterator = iterator
end
131
+
132
# For every level this generates two public methods:
#
#   each_<level> { |element| ... } - rewinds the native iterator and calls
#                                    the block once per position; returns
#                                    an enumerator when no block is given.
#   <level>s                       - array of elements whose memoized
#                                    readers have all been called and whose
#                                    reference to the iterator is cleared.
%w(block paragraph line word symbol).each do |level|
  walker = "each_#{level}"

  define_method walker do |&block|
    return enum_for(walker) unless block

    @iterator.begin

    # The block runs for the starting position before @iterator.next is
    # consulted, so it is always called at least once.
    more = true
    while more
      block.call Element.for(level).new(level, @iterator)
      more = @iterator.next(level)
    end
  end

  define_method "#{level}s" do
    __send__(walker).map do |element|
      # Every element reads from the same iterator, so each memoized reader
      # is called now, while the iterator is still on this element.
      element.methods.each do |name|
        element.__send__(name) if element.respond_to?("__memoized_#{name}")
      end

      # Drop the element's reference to the shared iterator.
      element.instance_variable_set(:@iterator, nil)

      element
    end
  end
end
159
+ end
160
+
161
+ end; end
@@ -0,0 +1,45 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ module Tesseract; class Engine
26
+
27
# Read-only wrapper around an orientation struct handed over by the native
# iterator. Readers are generated from C::Iterator::Orientation's layout;
# this class relies on orientation, writing_direction, textline_order and
# deskew_angle.
class Orientation
  # struct - the struct instance to expose; stored as-is and indexed by
  #          member name on every reader call.
  def initialize(struct)
    @internal = struct
  end

  # One reader per layout member, resolved against the wrapped struct at
  # call time.
  C::Iterator::Orientation.layout.members.each do |member|
    define_method(member) { @internal[member] }
  end

  # direction is an alternative name for the orientation reader.
  alias_method :direction, :orientation

  # Representation listing the four values separated by spaces.
  def inspect
    "#<Orientation: #{orientation} #{writing_direction} #{textline_order} #{deskew_angle}>"
  end
end
44
+
45
+ end; end
@@ -23,16 +23,6 @@
23
23
  #++
24
24
 
25
25
  require 'namedic'
26
+ require 'memoized'
26
27
  require 'iso-639'
27
-
28
- module Kernel
29
- def suppress_stderr
30
- old = IO.pipe.last.reopen($stderr)
31
-
32
- $stderr.reopen(IO.pipe.last)
33
- result = yield
34
- $stderr.reopen(old)
35
-
36
- result
37
- end
38
- end
28
+ require 'io/manage'
@@ -0,0 +1,38 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ module Tesseract
26
+
27
# Pairs an API object with the pointer of a native iterator.
class Iterator
  # api     - the API object this iterator belongs to (stored, not used by
  #           the methods visible here).
  # pointer - the native iterator pointer returned by #to_ffi.
  def initialize(api, pointer)
    @api, @internal = api, pointer
  end

  # Returns the wrapped pointer exactly as it was passed to the constructor.
  def to_ffi
    @internal
  end
end
37
+
38
+ end
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.0.2'
27
+ '0.0.3'
28
28
  end
29
29
  end
@@ -15,6 +15,7 @@ Gem::Specification.new {|s|
15
15
  s.require_paths = ['lib']
16
16
 
17
17
  s.add_dependency 'namedic'
18
+ s.add_dependency 'memoized'
18
19
  s.add_dependency 'iso-639'
19
20
 
20
21
  s.add_dependency 'ffi-extra'
@@ -0,0 +1,31 @@
1
+ #! /usr/bin/env ruby
2
+ require 'tesseract'
3
+ require 'benchmark'
4
+
5
# Times 100 runs each of text_for, words_for and symbols_for on
# 'first.png' with a single shared engine, forcing a GC run after every
# call.
Benchmark.bm do |b|
  engine = Tesseract::Engine.new

  %w(text_for words_for symbols_for).each do |extractor|
    b.report "#{extractor}: " do
      100.times do
        engine.public_send(extractor, 'first.png')

        GC.start
      end
    end
  end
end
+ end