tesseract-ocr 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. data/README.md +54 -16
  2. data/bin/tesseract-train.rb +63 -0
  3. data/examples/nerdz-captcha-breaker/break.rb +18 -10
  4. data/examples/nerdz-captcha-breaker/captchas/001.png +0 -0
  5. data/examples/nerdz-captcha-breaker/captchas/002.png +0 -0
  6. data/examples/nerdz-captcha-breaker/captchas/003.png +0 -0
  7. data/examples/nerdz-captcha-breaker/captchas/004.png +0 -0
  8. data/examples/nerdz-captcha-breaker/captchas/005.png +0 -0
  9. data/examples/nerdz-captcha-breaker/captchas/006.png +0 -0
  10. data/examples/nerdz-captcha-breaker/captchas/007.png +0 -0
  11. data/examples/nerdz-captcha-breaker/captchas/008.png +0 -0
  12. data/examples/nerdz-captcha-breaker/captchas/009.png +0 -0
  13. data/examples/nerdz-captcha-breaker/captchas/010.png +0 -0
  14. data/examples/nerdz-captcha-breaker/captchas/011.png +0 -0
  15. data/examples/nerdz-captcha-breaker/captchas/012.png +0 -0
  16. data/examples/nerdz-captcha-breaker/captchas/013.png +0 -0
  17. data/examples/nerdz-captcha-breaker/captchas/014.png +0 -0
  18. data/examples/nerdz-captcha-breaker/captchas/015.png +0 -0
  19. data/examples/nerdz-captcha-breaker/captchas/016.png +0 -0
  20. data/examples/nerdz-captcha-breaker/captchas/017.png +0 -0
  21. data/examples/nerdz-captcha-breaker/captchas/018.png +0 -0
  22. data/examples/nerdz-captcha-breaker/captchas/019.png +0 -0
  23. data/examples/nerdz-captcha-breaker/captchas/020.png +0 -0
  24. data/examples/nerdz-captcha-breaker/captchas/021.png +0 -0
  25. data/examples/nerdz-captcha-breaker/captchas/022.png +0 -0
  26. data/examples/nerdz-captcha-breaker/captchas/023.png +0 -0
  27. data/examples/nerdz-captcha-breaker/captchas/024.png +0 -0
  28. data/examples/nerdz-captcha-breaker/captchas/025.png +0 -0
  29. data/examples/nerdz-captcha-breaker/captchas/026.png +0 -0
  30. data/examples/nerdz-captcha-breaker/captchas/027.png +0 -0
  31. data/examples/nerdz-captcha-breaker/captchas/028.png +0 -0
  32. data/examples/nerdz-captcha-breaker/captchas/029.png +0 -0
  33. data/examples/nerdz-captcha-breaker/captchas/030.png +0 -0
  34. data/examples/nerdz-captcha-breaker/captchas/031.png +0 -0
  35. data/examples/nerdz-captcha-breaker/captchas/032.png +0 -0
  36. data/examples/nerdz-captcha-breaker/captchas/033.png +0 -0
  37. data/examples/nerdz-captcha-breaker/captchas/034.png +0 -0
  38. data/examples/nerdz-captcha-breaker/captchas/035.png +0 -0
  39. data/examples/nerdz-captcha-breaker/captchas/036.png +0 -0
  40. data/examples/nerdz-captcha-breaker/captchas/037.png +0 -0
  41. data/examples/nerdz-captcha-breaker/captchas/038.png +0 -0
  42. data/examples/nerdz-captcha-breaker/captchas/039.png +0 -0
  43. data/examples/nerdz-captcha-breaker/captchas/040.png +0 -0
  44. data/examples/nerdz-captcha-breaker/captchas/041.png +0 -0
  45. data/examples/nerdz-captcha-breaker/captchas/042.png +0 -0
  46. data/examples/nerdz-captcha-breaker/captchas/043.png +0 -0
  47. data/examples/nerdz-captcha-breaker/captchas/044.png +0 -0
  48. data/examples/nerdz-captcha-breaker/captchas/045.png +0 -0
  49. data/examples/nerdz-captcha-breaker/captchas/046.png +0 -0
  50. data/examples/nerdz-captcha-breaker/captchas/047.png +0 -0
  51. data/examples/nerdz-captcha-breaker/captchas/048.png +0 -0
  52. data/examples/nerdz-captcha-breaker/captchas/049.png +0 -0
  53. data/examples/nerdz-captcha-breaker/captchas/050.png +0 -0
  54. data/examples/nerdz-captcha-breaker/captchas/051.png +0 -0
  55. data/examples/nerdz-captcha-breaker/captchas/052.png +0 -0
  56. data/examples/nerdz-captcha-breaker/captchas/053.png +0 -0
  57. data/examples/nerdz-captcha-breaker/captchas/054.png +0 -0
  58. data/examples/nerdz-captcha-breaker/captchas/055.png +0 -0
  59. data/examples/nerdz-captcha-breaker/captchas/056.png +0 -0
  60. data/examples/nerdz-captcha-breaker/captchas/057.png +0 -0
  61. data/examples/nerdz-captcha-breaker/captchas/058.png +0 -0
  62. data/examples/nerdz-captcha-breaker/captchas/059.png +0 -0
  63. data/examples/nerdz-captcha-breaker/captchas/060.png +0 -0
  64. data/examples/nerdz-captcha-breaker/captchas/061.png +0 -0
  65. data/examples/nerdz-captcha-breaker/captchas/062.png +0 -0
  66. data/examples/nerdz-captcha-breaker/captchas/063.png +0 -0
  67. data/examples/nerdz-captcha-breaker/captchas/064.png +0 -0
  68. data/examples/nerdz-captcha-breaker/captchas/065.png +0 -0
  69. data/examples/nerdz-captcha-breaker/captchas/066.png +0 -0
  70. data/examples/nerdz-captcha-breaker/captchas/067.png +0 -0
  71. data/examples/nerdz-captcha-breaker/captchas/068.png +0 -0
  72. data/examples/nerdz-captcha-breaker/captchas/069.png +0 -0
  73. data/examples/nerdz-captcha-breaker/captchas/070.png +0 -0
  74. data/examples/nerdz-captcha-breaker/captchas/071.png +0 -0
  75. data/examples/nerdz-captcha-breaker/captchas/072.png +0 -0
  76. data/examples/nerdz-captcha-breaker/captchas/073.png +0 -0
  77. data/examples/nerdz-captcha-breaker/captchas/074.png +0 -0
  78. data/examples/nerdz-captcha-breaker/captchas/075.png +0 -0
  79. data/examples/nerdz-captcha-breaker/captchas/076.png +0 -0
  80. data/examples/nerdz-captcha-breaker/captchas/077.png +0 -0
  81. data/examples/nerdz-captcha-breaker/captchas/078.png +0 -0
  82. data/examples/nerdz-captcha-breaker/captchas/079.png +0 -0
  83. data/examples/nerdz-captcha-breaker/captchas/080.png +0 -0
  84. data/examples/nerdz-captcha-breaker/captchas/081.png +0 -0
  85. data/examples/nerdz-captcha-breaker/captchas/082.png +0 -0
  86. data/examples/nerdz-captcha-breaker/captchas/083.png +0 -0
  87. data/examples/nerdz-captcha-breaker/captchas/084.png +0 -0
  88. data/examples/nerdz-captcha-breaker/captchas/085.png +0 -0
  89. data/examples/nerdz-captcha-breaker/captchas/086.png +0 -0
  90. data/examples/nerdz-captcha-breaker/captchas/087.png +0 -0
  91. data/examples/nerdz-captcha-breaker/captchas/088.png +0 -0
  92. data/examples/nerdz-captcha-breaker/captchas/089.png +0 -0
  93. data/examples/nerdz-captcha-breaker/captchas/090.png +0 -0
  94. data/examples/nerdz-captcha-breaker/captchas/091.png +0 -0
  95. data/examples/nerdz-captcha-breaker/captchas/092.png +0 -0
  96. data/examples/nerdz-captcha-breaker/captchas/093.png +0 -0
  97. data/examples/nerdz-captcha-breaker/captchas/094.png +0 -0
  98. data/examples/nerdz-captcha-breaker/captchas/095.png +0 -0
  99. data/examples/nerdz-captcha-breaker/captchas/096.png +0 -0
  100. data/examples/nerdz-captcha-breaker/captchas/097.png +0 -0
  101. data/examples/nerdz-captcha-breaker/captchas/098.png +0 -0
  102. data/examples/nerdz-captcha-breaker/captchas/099.png +0 -0
  103. data/examples/nerdz-captcha-breaker/captchas/100.png +0 -0
  104. data/examples/nerdz-captcha-breaker/captchas/captchas.txt +100 -0
  105. data/examples/nerdz-captcha-breaker/tessdata/generate.rb +21 -0
  106. data/examples/nerdz-captcha-breaker/tessdata/lol.box +600 -0
  107. data/examples/nerdz-captcha-breaker/tessdata/lol.tif +0 -0
  108. data/examples/nerdz-captcha-breaker/tessdata/lol.traineddata +0 -0
  109. data/examples/nerdz-captcha-breaker/test.rb +11 -0
  110. data/lib/tesseract/engine.rb +19 -2
  111. data/lib/tesseract/extensions.rb +0 -1
  112. data/lib/tesseract/version.rb +1 -1
  113. data/test/jsmj.png +0 -0
  114. data/test/tesseract_spec.rb +9 -0
  115. metadata +122 -14
  116. data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.box +0 -112
  117. data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif +0 -0
data/README.md CHANGED
@@ -12,8 +12,14 @@ because it's still under review for upstream merging.
12
12
 
13
13
  The gem is called `tesseract-ocr`.
14
14
 
15
- Example
16
- -------
15
+ If you're having any problem requiring tesseract or any dependencies, check the permissions of the installed
16
+ gems.
17
+
18
+ Examples
19
+ --------
20
+ The following examples show the functionality provided by tesseract-ocr.
21
+
22
+ ### Basic functionality of tesseract
17
23
 
18
24
  ```ruby
19
25
  require 'tesseract'
@@ -24,26 +30,58 @@ e = Tesseract::Engine.new {|e|
24
30
  }
25
31
 
26
32
  e.text_for('test/first.png').strip # => 'ABC'
27
-
28
- e.words_for('test/second.png') # [
29
- # [ 0] #<Tesseract(93.41653442382812): "|'m">,
30
- # [ 1] #<Tesseract(91.11811828613281): "12">,
31
- # [ 2] #<Tesseract(85.71760559082031): "and">,
32
- # [ 3] #<Tesseract(83.4853515625): "what">,
33
- # [ 4] #<Tesseract(86.71072387695312): "is">,
34
- # [ 5] #<Tesseract(83.2227783203125): "this.">,
35
- # [ 6] #<Tesseract(82.81439208984375): "INSTALL">,
36
- # [ 7] #<Tesseract(86.46566772460938): "GENTOO">,
37
- # [ 8] #<Tesseract(93.19613647460938): "OH">,
38
- # [ 9] #<Tesseract(82.81439208984375): "HAI">,
39
- # [10] #<Tesseract(85.9158935546875): "1234">
40
- # ]
41
33
  ```
42
34
 
43
35
  You can pass to `#text_for` either a path, an IO object, a string containing the image or
44
36
  an object that responds to `#to_blob` (for example Magick::Image), keep in mind that
45
37
  the format has to be supported by leptonica.
46
38
 
39
+ ### Accessing advanced features
40
+
41
+ With advanced features you get access to blocks, paragraphs, lines, words and symbols.
42
+
43
+ There are a lot of ways to access those levels; the methods are the following (replace level
44
+ with one of the accessible features, so `each_level` can be `each_block` or `each_paragraph`
45
+ etc.)
46
+
47
+ The following kind of accessors need a block to be passed and they pass to the block each
48
+ `Element` object. The Element object has various getters to access certain features, I'll
49
+ talk about them later.
50
+
51
+ The methods are:
52
+
53
+ * `each_level`
54
+ * `each_level_for`
55
+ * `each_level_at`
56
+
57
+ The following accessors instead return an `Array` of `Element`s with cached getters, the getters
58
+ are cached because the values accessible in the `Element` are linked to the state of the internal
59
+ API, and that state changes if you access something else.
60
+
61
+ The methods are:
62
+
63
+ * `levels`
64
+ * `levels_for`
65
+ * `levels_at`
66
+
67
+ Again, you can pass to the `*_for` methods anything you can pass to `#text_for`.
68
+
69
+ Each `Element` object has the following getters:
70
+
71
+ * `bounding_box`, this will return the box the element is confined in
72
+ * `binary_image`, this will return the bichromatic image of the element
73
+ * `image`, this will return the image of the element
74
+ * `baseline`, this will return the line where the text is with a pair of coordinates
75
+ * `orientation`, this will return the orientation of the element
76
+ * `text`, this will return the text of the element
77
+ * `confidence`, this will return the confidence of correctness for the element
78
+
79
+ `Block` elements also have `type` accessors that specify the type of the block.
80
+
81
+ `Word` elements also have `font_attributes`, `from_dictionary?` and `numeric?` getters.
82
+
83
+ `Symbol` elements also have `superscript?`, `subscript?` and `dropcap?` getters.
84
+
47
85
  Using the binary
48
86
  ----------------
49
87
  You can also use the shipped executable in the following way:
@@ -0,0 +1,63 @@
1
+ #! /usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'tmpdir'
4
+ require 'fileutils'
5
+ require 'shellwords'
6
+
7
+ options = {}
8
+
9
+ OptionParser.new do |o|
10
+ o.on '-b', '--box FILE', 'the box file to use' do |value|
11
+ options[:box] = File.realpath(value)
12
+ end
13
+
14
+ o.on '-i', '--image FILE', 'the image file to use' do |value|
15
+ options[:image] = File.realpath(value)
16
+ end
17
+
18
+ o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
19
+ options[:output] = File.expand_path(value)
20
+ end
21
+ end.parse!
22
+
23
+ if language = ARGV.shift
24
+ options[:box] = File.realpath("#{language}.box")
25
+ options[:image] = File.realpath("#{language}.tif")
26
+ options[:output] = File.expand_path("#{language}.traineddata")
27
+ else
28
+ language = options[:box][/^(.*?)\./, 1]
29
+ end
30
+
31
+ Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
32
+
33
+ language = language.shellescape
34
+
35
+ %x{
36
+ cp #{options[:box].shellescape} #{language}.box
37
+ cp #{options[:image].shellescape} #{language}#{File.extname(options[:image])}
38
+
39
+ tesseract #{language}#{File.extname(options[:image])} #{language} nobatch box.train.stderr
40
+
41
+ unicharset_extractor #{language}.box
42
+
43
+ echo #{language} 0 0 0 0 0 > font_properties
44
+ mftraining -F font_properties -U unicharset #{language}.tr
45
+ mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
46
+ cntraining #{language}.tr
47
+
48
+ mv Microfeat #{language}.Microfeat
49
+ mv normproto #{language}.normproto
50
+ mv pffmtable #{language}.pffmtable
51
+ mv mfunicharset #{language}.mfunicharset
52
+ mv inttemp #{language}.inttemp
53
+
54
+ combine_tessdata #{language}.
55
+
56
+ mv #{language}.traineddata #{options[:output].shellescape}
57
+ }
58
+
59
+ path = File.realpath(Dir.pwd)
60
+
61
+ Dir.chdir '/'
62
+
63
+ FileUtils.rm_rf path
@@ -2,21 +2,29 @@
2
2
  require 'tesseract'
3
3
  require 'RMagick'
4
4
 
5
+ # this function is used to get points near the current pixel to
6
+ # cleanup the oblique lines mess, horizontal points seem to output
7
+ # better cleanup
5
8
  def near (x, y)
6
9
  [
7
- [x - 1, y - 1],
8
- [x, y - 1],
9
- [x + 1, y - 1],
10
+ # [x - 1, y - 1],
11
+ # [x, y - 1],
12
+ # [x + 1, y - 1],
10
13
  [x - 1, y ],
11
- # FIRE IN THE HOLE
12
14
  [x + 1, y ],
13
- [x - 1, y + 1],
14
- [x, y + 1],
15
- [x + 1, y + 1]
15
+ # [x - 1, y + 1],
16
+ # [x, y + 1],
17
+ # [x + 1, y + 1]
16
18
  ]
17
19
  end
18
20
 
19
- Tesseract::Engine.new.tap {|engine|
21
+ ENV['TESSDATA_PREFIX'] = './'
22
+
23
+ Tesseract::Engine.new {|engine|
24
+ engine.language = :lol
25
+ engine.page_segmentation_mode = 8
26
+ engine.whitelist = [*'a'..'z', *'A'..'Z', *0..9].join
27
+ }.tap {|engine|
20
28
  ARGV.each {|path|
21
29
  image = Magick::Image.read(path).first
22
30
  pixels = Hash.new { |h, k| h[k] = 0 }
@@ -51,8 +59,8 @@ Tesseract::Engine.new.tap {|engine|
51
59
  image.pixel_color x, y, 'black'
52
60
  }
53
61
 
54
- File.open('/tmp/lol.png', ?w) { |f| f.write(image.resize(10).to_blob) }
62
+ image.scale(4).display if ENV['DEBUG']
55
63
 
56
- puts engine.text_for(image.resize 10).strip
64
+ puts "#{path}: #{engine.text_for(image.scale(4)).strip}"
57
65
  }
58
66
  }
@@ -0,0 +1,100 @@
1
+ enKzaV
2
+ CZU6tf
3
+ ZO5mGY
4
+ FNNhZv
5
+ Dwp1Vy
6
+ JYsDAi
7
+ dld510
8
+ yG615j
9
+ WHxAUZ
10
+ k9IhZu
11
+ qWIPSr
12
+ nDSXc5
13
+ 9iTYeZ
14
+ s44iQ9
15
+ VPNXWy
16
+ 80zxvW
17
+ QA7IYj
18
+ D8Ro4U
19
+ OiEg1U
20
+ pJS7Z8
21
+ 6w8eik
22
+ s5igED
23
+ 7bJe8p
24
+ VtYdW3
25
+ jNNdcO
26
+ neLPNV
27
+ KONPnl
28
+ Q8aSXJ
29
+ kIwSqv
30
+ 8LQExn
31
+ RwcDU2
32
+ LMLg5K
33
+ C0YmdD
34
+ mqAvES
35
+ Ai0Wxi
36
+ bopETp
37
+ L3yP5u
38
+ w4rw3b
39
+ oSEUMU
40
+ bftqDK
41
+ mM7cKE
42
+ rYl6x4
43
+ 3hVI8X
44
+ Tm2PPp
45
+ VfmqQ6
46
+ 0EZAgC
47
+ QW6gBS
48
+ UTS137
49
+ YXXTqk
50
+ a6LU3K
51
+ SVzguN
52
+ l9G8Y9
53
+ ZP9TDM
54
+ yj7zmS
55
+ sD0Ub9
56
+ XeWY3A
57
+ w8EKVl
58
+ 3gO266
59
+ yYN3oJ
60
+ NumjLi
61
+ EEzwCz
62
+ 8bUSjW
63
+ GAo6ap
64
+ AXcn6K
65
+ KquWmp
66
+ PM8LYt
67
+ uGS7GO
68
+ mucVzf
69
+ UIZ6cf
70
+ mXRdGq
71
+ lcn8OP
72
+ lKqkw7
73
+ CTHnR1
74
+ ShvmSD
75
+ klCoXI
76
+ 8epftU
77
+ nA277p
78
+ uXfavQ
79
+ EhInXB
80
+ KJLUZf
81
+ nAWswu
82
+ savOX1
83
+ bJtoCK
84
+ cTnmnF
85
+ hoYGbS
86
+ d3J04A
87
+ c18WaN
88
+ aO5VVN
89
+ DvP3vf
90
+ 7cuFOq
91
+ mPaSmA
92
+ k0rNqQ
93
+ tvG40k
94
+ 154xFW
95
+ x9ioCI
96
+ p9rR4J
97
+ DNLPJA
98
+ wGR2O1
99
+ zVr96r
100
+ 856sHR