tesseract-ocr 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
module Tesseract
  class Engine
    # Read-only wrapper around a native baseline struct.
    #
    # Every member declared in C::Iterator::Baseline's layout is exposed as a
    # reader of the same name that fetches the value from the wrapped struct.
    class Baseline
      # @param struct the native struct to wrap; it is indexed by member name
      def initialize(struct)
        @internal = struct
      end

      # Generate one reader per struct member.
      C::Iterator::Baseline.layout.members.each do |field|
        define_method(field) { @internal[field] }
      end

      # NOTE(review): relies on the layout declaring x1, y1, x2 and y2 --
      # confirm against C::Iterator::Baseline.
      def inspect
        "#<Baseline: #{x1};#{y1} #{x2};#{y2}>"
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
module Tesseract
  class Engine
    # Read-only wrapper around a native bounding-box struct.
    #
    # Every member declared in C::Iterator::BoundingBox's layout is exposed as
    # a reader of the same name. The helpers below rely on the layout
    # providing left, top, right and bottom
    # (NOTE(review): confirm against C::Iterator::BoundingBox).
    class BoundingBox
      # @param struct the native struct to wrap; it is indexed by member name
      def initialize(struct)
        @internal = struct
      end

      # Generate one reader per struct member.
      C::Iterator::BoundingBox.layout.members.each do |field|
        define_method(field) { @internal[field] }
      end

      # Origin of the box, exposed under coordinate-style names.
      alias_method :x, :left
      alias_method :y, :top

      # @return the horizontal extent (right minus left)
      def width
        right - left
      end

      # @return the vertical extent (bottom minus top)
      def height
        bottom - top
      end

      def inspect
        "#<BoundingBox(#{x}, #{y}): #{width}x#{height}>"
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
module Tesseract
  class Engine
    # Read-only wrapper around a native font-attributes struct.
    #
    # Every member declared in C::Iterator::FontAttributes' layout is exposed
    # as a reader of the same name; each is_<flag> reader additionally gets a
    # <flag>? alias.
    # NOTE(review): the aliases and #inspect rely on the layout declaring id,
    # name, pointsize and the six is_* members -- confirm against
    # C::Iterator::FontAttributes.
    class FontAttributes
      # @param struct the native struct to wrap; it is indexed by member name
      def initialize(struct)
        @internal = struct
      end

      # Generate one reader per struct member.
      C::Iterator::FontAttributes.layout.members.each do |field|
        define_method(field) { @internal[field] }
      end

      # Predicate-style aliases: bold? -> is_bold, italic? -> is_italic, ...
      %w(bold italic underlined monospace serif smallcaps).each do |flag|
        alias_method "#{flag}?", "is_#{flag}"
      end

      # Renders id, name (or 'unknown' when the name is nil/false) and point
      # size, followed by the name of every flag that is set.
      def inspect
        summary = "#<Font(#{id} #{name || 'unknown'} #{pointsize}pt):"

        %w(bold italic underlined monospace serif smallcaps).each do |flag|
          summary << " #{flag}" if __send__("#{flag}?")
        end

        summary << '>'
      end
    end
  end
end
@@ -0,0 +1,161 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
+ require 'tesseract/engine/bounding_box'
26
+ require 'tesseract/engine/baseline'
27
+ require 'tesseract/engine/orientation'
28
+ require 'tesseract/engine/font_attributes'
29
+
30
module Tesseract
  class Engine
    # Enumerates the layout elements (blocks, paragraphs, lines, words and
    # symbols) reachable through a low-level iterator object.
    #
    # For every level two methods are generated:
    #
    # * each_<level> -- yields one element wrapper per position, or returns
    #   an Enumerator when no block is given;
    # * <level>s     -- returns an Array of element wrappers whose memoized
    #   readers have all been evaluated and whose reference to the low-level
    #   iterator has been cleared.
    class Iterator
      # Wrapper for the element the low-level iterator currently points at.
      #
      # Readers are declared with +memoize+ (from the 'memoized' gem, loaded
      # elsewhere), so each value is fetched from the iterator at most once.
      class Element
        # Picks the wrapper class for +level+.
        #
        # Only constants defined directly on Iterator that are Element
        # subclasses are considered, so a level without a dedicated class
        # ("paragraph", "line") can no longer resolve to an unrelated
        # top-level constant such as ::Line. Anything else -- including names
        # that are not valid constant names -- yields the receiver itself.
        #
        # @param level [String, Symbol] level name, e.g. "word"
        # @return [Class] the matching Element subclass, or the receiver
        def self.for(level)
          name = level.to_s.capitalize
          return self unless Iterator.const_defined?(name, false)

          klass = Iterator.const_get(name, false)

          klass.is_a?(Class) && klass <= Element ? klass : self
        rescue NameError
          # Raised for strings that are not valid constant names.
          self
        end

        # @param level    the level name this element was produced for
        # @param iterator the low-level iterator the readers query
        def initialize(level, iterator)
          @level    = level
          @iterator = iterator
        end

        memoize
        def bounding_box
          BoundingBox.new(@iterator.bounding_box(@level))
        end

        # Best effort: any StandardError raised while fetching the image is
        # turned into nil.
        memoize
        def binary_image
          @iterator.get_binary_image(@level)
        rescue StandardError
          nil
        end

        # Best effort: any StandardError raised while fetching the image is
        # turned into nil.
        memoize
        def image
          @iterator.get_image(@level)
        rescue StandardError
          nil
        end

        memoize
        def baseline
          Baseline.new(@iterator.baseline(@level))
        end

        memoize
        def orientation
          Orientation.new(@iterator.orientation)
        end

        memoize
        def text
          @iterator.get_text(@level)
        end

        memoize
        def confidence
          @iterator.confidence(@level)
        end

        alias to_s text

        def inspect
          "#<Tesseract::#{@level.capitalize}(#{confidence}): #{text.inspect}>"
        end
      end

      # Element produced at the "block" level.
      class Block < Element
        memoize
        def type
          @iterator.block_type
        end
      end

      # Element produced at the "word" level.
      class Word < Element
        memoize
        def font_attributes
          FontAttributes.new(@iterator.word_font_attributes)
        end

        memoize
        def from_dictionary?
          @iterator.word_is_from_dictionary?
        end

        memoize
        def numeric?
          @iterator.word_is_numeric?
        end
      end

      # Element produced at the "symbol" level.
      class Symbol < Element
        memoize
        def superscript?
          @iterator.symbol_is_superscript?
        end

        memoize
        def subscript?
          @iterator.symbol_is_subscript?
        end

        memoize
        def dropcap?
          @iterator.symbol_is_dropcap?
        end
      end

      # @param iterator the low-level iterator to drive
      def initialize(iterator)
        @iterator = iterator
      end

      %w(block paragraph line word symbol).each do |level|
        define_method "each_#{level}" do |&block|
          return enum_for("each_#{level}") unless block

          @iterator.begin

          # Post-test loop on purpose: the element at the current position is
          # yielded first, then the iterator is asked to advance; iteration
          # stops when #next returns a falsy value.
          begin
            block.call Element.for(level).new(level, @iterator)
          end while @iterator.next(level)
        end

        define_method "#{level}s" do
          __send__("each_#{level}").map do |element|
            # Evaluate every memoized reader while the iterator still points
            # at this element. Detection relies on the __memoized_<name>
            # companion methods (NOTE(review): presumably defined by the
            # 'memoized' gem -- confirm).
            element.methods.each do |name|
              element.__send__(name) if element.respond_to?("__memoized_#{name}")
            end

            # Drop the reference to the low-level iterator; the returned
            # element only serves the values evaluated above.
            element.instance_variable_set(:@iterator, nil)

            element
          end
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
module Tesseract
  class Engine
    # Read-only wrapper around a native orientation struct.
    #
    # Every member declared in C::Iterator::Orientation's layout is exposed as
    # a reader of the same name.
    # NOTE(review): the alias and #inspect rely on the layout declaring
    # orientation, writing_direction, textline_order and deskew_angle --
    # confirm against C::Iterator::Orientation.
    class Orientation
      # @param struct the native struct to wrap; it is indexed by member name
      def initialize(struct)
        @internal = struct
      end

      # Generate one reader per struct member.
      C::Iterator::Orientation.layout.members.each do |field|
        define_method(field) { @internal[field] }
      end

      # #direction is a second name for #orientation.
      alias_method :direction, :orientation

      def inspect
        "#<Orientation: #{orientation} #{writing_direction} #{textline_order} #{deskew_angle}>"
      end
    end
  end
end
@@ -23,16 +23,6 @@
23
23
  #++
24
24
 
25
25
  require 'namedic'
26
+ require 'memoized'
26
27
  require 'iso-639'
27
-
28
- module Kernel
29
- def suppress_stderr
30
- old = IO.pipe.last.reopen($stderr)
31
-
32
- $stderr.reopen(IO.pipe.last)
33
- result = yield
34
- $stderr.reopen(old)
35
-
36
- result
37
- end
38
- end
28
+ require 'io/manage'
@@ -0,0 +1,38 @@
1
+ #--
2
+ # Copyright 2011 meh. All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without modification, are
5
+ # permitted provided that the following conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above copyright notice, this list of
8
+ # conditions and the following disclaimer.
9
+ #
10
+ # THIS SOFTWARE IS PROVIDED BY meh ''AS IS'' AND ANY EXPRESS OR IMPLIED
11
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12
+ # FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL meh OR
13
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
14
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
15
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
16
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
17
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
18
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19
+ #
20
+ # The views and conclusions contained in the software and documentation are those of the
21
+ # authors and should not be interpreted as representing official policies, either expressed
22
+ # or implied, of meh.
23
+ #++
24
+
25
module Tesseract
  # Holds an API object together with a pointer and hands the pointer back
  # through #to_ffi.
  class Iterator
    # @param api     stored as-is; not read by any method of this class
    #                (NOTE(review): presumably kept so the API object stays
    #                referenced while the pointer is in use -- confirm)
    # @param pointer the value later returned by #to_ffi
    def initialize(api, pointer)
      @api, @internal = api, pointer
    end

    # @return the pointer given to the constructor
    def to_ffi
      @internal
    end
  end
end
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.0.2'
27
+ '0.0.3'
28
28
  end
29
29
  end
@@ -15,6 +15,7 @@ Gem::Specification.new {|s|
15
15
  s.require_paths = ['lib']
16
16
 
17
17
  s.add_dependency 'namedic'
18
+ s.add_dependency 'memoized'
18
19
  s.add_dependency 'iso-639'
19
20
 
20
21
  s.add_dependency 'ffi-extra'
@@ -0,0 +1,31 @@
1
+ #! /usr/bin/env ruby
2
+ require 'tesseract'
3
+ require 'benchmark'
4
+
5
# Times 100 runs of each recognition entry point against 'first.png',
# forcing a garbage-collection pass after every run.
Benchmark.bm do |bench|
  engine = Tesseract::Engine.new

  # Report label paired with the engine method it measures.
  [
    ['text_for: ',    :text_for],
    ['words_for: ',   :words_for],
    ['symbols_for: ', :symbols_for]
  ].each do |label, recognizer|
    bench.report label do
      100.times do
        engine.public_send(recognizer, 'first.png')

        GC.start
      end
    end
  end
end