tesseract-ocr 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. data/README.md +54 -16
  2. data/bin/tesseract-train.rb +63 -0
  3. data/examples/nerdz-captcha-breaker/break.rb +18 -10
  4. data/examples/nerdz-captcha-breaker/captchas/001.png +0 -0
  5. data/examples/nerdz-captcha-breaker/captchas/002.png +0 -0
  6. data/examples/nerdz-captcha-breaker/captchas/003.png +0 -0
  7. data/examples/nerdz-captcha-breaker/captchas/004.png +0 -0
  8. data/examples/nerdz-captcha-breaker/captchas/005.png +0 -0
  9. data/examples/nerdz-captcha-breaker/captchas/006.png +0 -0
  10. data/examples/nerdz-captcha-breaker/captchas/007.png +0 -0
  11. data/examples/nerdz-captcha-breaker/captchas/008.png +0 -0
  12. data/examples/nerdz-captcha-breaker/captchas/009.png +0 -0
  13. data/examples/nerdz-captcha-breaker/captchas/010.png +0 -0
  14. data/examples/nerdz-captcha-breaker/captchas/011.png +0 -0
  15. data/examples/nerdz-captcha-breaker/captchas/012.png +0 -0
  16. data/examples/nerdz-captcha-breaker/captchas/013.png +0 -0
  17. data/examples/nerdz-captcha-breaker/captchas/014.png +0 -0
  18. data/examples/nerdz-captcha-breaker/captchas/015.png +0 -0
  19. data/examples/nerdz-captcha-breaker/captchas/016.png +0 -0
  20. data/examples/nerdz-captcha-breaker/captchas/017.png +0 -0
  21. data/examples/nerdz-captcha-breaker/captchas/018.png +0 -0
  22. data/examples/nerdz-captcha-breaker/captchas/019.png +0 -0
  23. data/examples/nerdz-captcha-breaker/captchas/020.png +0 -0
  24. data/examples/nerdz-captcha-breaker/captchas/021.png +0 -0
  25. data/examples/nerdz-captcha-breaker/captchas/022.png +0 -0
  26. data/examples/nerdz-captcha-breaker/captchas/023.png +0 -0
  27. data/examples/nerdz-captcha-breaker/captchas/024.png +0 -0
  28. data/examples/nerdz-captcha-breaker/captchas/025.png +0 -0
  29. data/examples/nerdz-captcha-breaker/captchas/026.png +0 -0
  30. data/examples/nerdz-captcha-breaker/captchas/027.png +0 -0
  31. data/examples/nerdz-captcha-breaker/captchas/028.png +0 -0
  32. data/examples/nerdz-captcha-breaker/captchas/029.png +0 -0
  33. data/examples/nerdz-captcha-breaker/captchas/030.png +0 -0
  34. data/examples/nerdz-captcha-breaker/captchas/031.png +0 -0
  35. data/examples/nerdz-captcha-breaker/captchas/032.png +0 -0
  36. data/examples/nerdz-captcha-breaker/captchas/033.png +0 -0
  37. data/examples/nerdz-captcha-breaker/captchas/034.png +0 -0
  38. data/examples/nerdz-captcha-breaker/captchas/035.png +0 -0
  39. data/examples/nerdz-captcha-breaker/captchas/036.png +0 -0
  40. data/examples/nerdz-captcha-breaker/captchas/037.png +0 -0
  41. data/examples/nerdz-captcha-breaker/captchas/038.png +0 -0
  42. data/examples/nerdz-captcha-breaker/captchas/039.png +0 -0
  43. data/examples/nerdz-captcha-breaker/captchas/040.png +0 -0
  44. data/examples/nerdz-captcha-breaker/captchas/041.png +0 -0
  45. data/examples/nerdz-captcha-breaker/captchas/042.png +0 -0
  46. data/examples/nerdz-captcha-breaker/captchas/043.png +0 -0
  47. data/examples/nerdz-captcha-breaker/captchas/044.png +0 -0
  48. data/examples/nerdz-captcha-breaker/captchas/045.png +0 -0
  49. data/examples/nerdz-captcha-breaker/captchas/046.png +0 -0
  50. data/examples/nerdz-captcha-breaker/captchas/047.png +0 -0
  51. data/examples/nerdz-captcha-breaker/captchas/048.png +0 -0
  52. data/examples/nerdz-captcha-breaker/captchas/049.png +0 -0
  53. data/examples/nerdz-captcha-breaker/captchas/050.png +0 -0
  54. data/examples/nerdz-captcha-breaker/captchas/051.png +0 -0
  55. data/examples/nerdz-captcha-breaker/captchas/052.png +0 -0
  56. data/examples/nerdz-captcha-breaker/captchas/053.png +0 -0
  57. data/examples/nerdz-captcha-breaker/captchas/054.png +0 -0
  58. data/examples/nerdz-captcha-breaker/captchas/055.png +0 -0
  59. data/examples/nerdz-captcha-breaker/captchas/056.png +0 -0
  60. data/examples/nerdz-captcha-breaker/captchas/057.png +0 -0
  61. data/examples/nerdz-captcha-breaker/captchas/058.png +0 -0
  62. data/examples/nerdz-captcha-breaker/captchas/059.png +0 -0
  63. data/examples/nerdz-captcha-breaker/captchas/060.png +0 -0
  64. data/examples/nerdz-captcha-breaker/captchas/061.png +0 -0
  65. data/examples/nerdz-captcha-breaker/captchas/062.png +0 -0
  66. data/examples/nerdz-captcha-breaker/captchas/063.png +0 -0
  67. data/examples/nerdz-captcha-breaker/captchas/064.png +0 -0
  68. data/examples/nerdz-captcha-breaker/captchas/065.png +0 -0
  69. data/examples/nerdz-captcha-breaker/captchas/066.png +0 -0
  70. data/examples/nerdz-captcha-breaker/captchas/067.png +0 -0
  71. data/examples/nerdz-captcha-breaker/captchas/068.png +0 -0
  72. data/examples/nerdz-captcha-breaker/captchas/069.png +0 -0
  73. data/examples/nerdz-captcha-breaker/captchas/070.png +0 -0
  74. data/examples/nerdz-captcha-breaker/captchas/071.png +0 -0
  75. data/examples/nerdz-captcha-breaker/captchas/072.png +0 -0
  76. data/examples/nerdz-captcha-breaker/captchas/073.png +0 -0
  77. data/examples/nerdz-captcha-breaker/captchas/074.png +0 -0
  78. data/examples/nerdz-captcha-breaker/captchas/075.png +0 -0
  79. data/examples/nerdz-captcha-breaker/captchas/076.png +0 -0
  80. data/examples/nerdz-captcha-breaker/captchas/077.png +0 -0
  81. data/examples/nerdz-captcha-breaker/captchas/078.png +0 -0
  82. data/examples/nerdz-captcha-breaker/captchas/079.png +0 -0
  83. data/examples/nerdz-captcha-breaker/captchas/080.png +0 -0
  84. data/examples/nerdz-captcha-breaker/captchas/081.png +0 -0
  85. data/examples/nerdz-captcha-breaker/captchas/082.png +0 -0
  86. data/examples/nerdz-captcha-breaker/captchas/083.png +0 -0
  87. data/examples/nerdz-captcha-breaker/captchas/084.png +0 -0
  88. data/examples/nerdz-captcha-breaker/captchas/085.png +0 -0
  89. data/examples/nerdz-captcha-breaker/captchas/086.png +0 -0
  90. data/examples/nerdz-captcha-breaker/captchas/087.png +0 -0
  91. data/examples/nerdz-captcha-breaker/captchas/088.png +0 -0
  92. data/examples/nerdz-captcha-breaker/captchas/089.png +0 -0
  93. data/examples/nerdz-captcha-breaker/captchas/090.png +0 -0
  94. data/examples/nerdz-captcha-breaker/captchas/091.png +0 -0
  95. data/examples/nerdz-captcha-breaker/captchas/092.png +0 -0
  96. data/examples/nerdz-captcha-breaker/captchas/093.png +0 -0
  97. data/examples/nerdz-captcha-breaker/captchas/094.png +0 -0
  98. data/examples/nerdz-captcha-breaker/captchas/095.png +0 -0
  99. data/examples/nerdz-captcha-breaker/captchas/096.png +0 -0
  100. data/examples/nerdz-captcha-breaker/captchas/097.png +0 -0
  101. data/examples/nerdz-captcha-breaker/captchas/098.png +0 -0
  102. data/examples/nerdz-captcha-breaker/captchas/099.png +0 -0
  103. data/examples/nerdz-captcha-breaker/captchas/100.png +0 -0
  104. data/examples/nerdz-captcha-breaker/captchas/captchas.txt +100 -0
  105. data/examples/nerdz-captcha-breaker/tessdata/generate.rb +21 -0
  106. data/examples/nerdz-captcha-breaker/tessdata/lol.box +600 -0
  107. data/examples/nerdz-captcha-breaker/tessdata/lol.tif +0 -0
  108. data/examples/nerdz-captcha-breaker/tessdata/lol.traineddata +0 -0
  109. data/examples/nerdz-captcha-breaker/test.rb +11 -0
  110. data/lib/tesseract/engine.rb +19 -2
  111. data/lib/tesseract/extensions.rb +0 -1
  112. data/lib/tesseract/version.rb +1 -1
  113. data/test/jsmj.png +0 -0
  114. data/test/tesseract_spec.rb +9 -0
  115. metadata +122 -14
  116. data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.box +0 -112
  117. data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif +0 -0
data/README.md CHANGED
@@ -12,8 +12,14 @@ because it's still under review for upstream merging.
12
12
 
13
13
  The gem is called `tesseract-ocr`.
14
14
 
15
- Example
16
- -------
15
+ If you're having any problem requiring tesseract or any dependencies, check the permissions of the installed
16
+ gems.
17
+
18
+ Examples
19
+ --------
20
+ Following are some examples that show the functionalities provided by tesseract-ocr.
21
+
22
+ ### Basic functionality of tesseract
17
23
 
18
24
  ```ruby
19
25
  require 'tesseract'
@@ -24,26 +30,58 @@ e = Tesseract::Engine.new {|e|
24
30
  }
25
31
 
26
32
  e.text_for('test/first.png').strip # => 'ABC'
27
-
28
- e.words_for('test/second.png') # [
29
- # [ 0] #<Tesseract(93.41653442382812): "|'m">,
30
- # [ 1] #<Tesseract(91.11811828613281): "12">,
31
- # [ 2] #<Tesseract(85.71760559082031): "and">,
32
- # [ 3] #<Tesseract(83.4853515625): "what">,
33
- # [ 4] #<Tesseract(86.71072387695312): "is">,
34
- # [ 5] #<Tesseract(83.2227783203125): "this.">,
35
- # [ 6] #<Tesseract(82.81439208984375): "INSTALL">,
36
- # [ 7] #<Tesseract(86.46566772460938): "GENTOO">,
37
- # [ 8] #<Tesseract(93.19613647460938): "OH">,
38
- # [ 9] #<Tesseract(82.81439208984375): "HAI">,
39
- # [10] #<Tesseract(85.9158935546875): "1234">
40
- # ]
41
33
  ```
42
34
 
43
35
  You can pass to `#text_for` either a path, an IO object, a string containing the image or
44
36
  an object that responds to `#to_blob` (for example Magick::Image), keep in mind that
45
37
  the format has to be supported by leptonica.
46
38
 
39
+ ### Accessing advanced features
40
+
41
+ With advanced features you get access to blocks, paragraphs, lines, words and symbols.
42
+
43
+ There are a lot of ways to access those levels; the methods are the following (replace level
44
+ with one of the accessible features, so `each_level` can be `each_block` or `each_paragraph`
45
+ etc.)
46
+
47
+ The following kind of accessors need a block to be passed and they pass to the block each
48
+ `Element` object. The Element object has various getters to access certain features, I'll
49
+ talk about them later.
50
+
51
+ The methods are:
52
+
53
+ * `each_level`
54
+ * `each_level_for`
55
+ * `each_level_at`
56
+
57
+ The following accessors instead return an `Array` of `Element`s with cached getters, the getters
58
+ are cached because the values accessible in the `Element` are linked to the state of the internal
59
+ API, and that state changes if you access something else.
60
+
61
+ The methods are:
62
+
63
+ * `levels`
64
+ * `levels_for`
65
+ * `levels_at`
66
+
67
+ Again, to `*_for` methods you can pass what you can pass to `#text_for`.
68
+
69
+ Each `Element` object has the following getters:
70
+
71
+ * `bounding_box`, this will return the box in which the element is confined
72
+ * `binary_image`, this will return the bichromatic image of the element
73
+ * `image`, this will return the image of the element
74
+ * `baseline`, this will return the line where the text is with a pair of coordinates
75
+ * `orientation`, this will return the orientation of the element
76
+ * `text`, this will return the text of the element
77
+ * `confidence`, this will return the confidence of correctness for the element
78
+
79
+ `Block` elements also have `type` accessors that specify the type of the block.
80
+
81
+ `Word` elements also have `font_attributes`, `from_dictionary?` and `numeric?` getters.
82
+
83
+ `Symbol` elements also have `superscript?`, `subscript?` and `dropcap?` getters.
84
+
47
85
  Using the binary
48
86
  ----------------
49
87
  You can also use the shipped executable in the following way:
@@ -0,0 +1,63 @@
1
+ #! /usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'tmpdir'
4
+ require 'fileutils'
5
+ require 'shellwords'
6
+
7
+ options = {}
8
+
9
+ OptionParser.new do |o|
10
+ o.on '-b', '--box FILE', 'the box file to use' do |value|
11
+ options[:box] = File.realpath(value)
12
+ end
13
+
14
+ o.on '-i', '--image FILE', 'the image file to use' do |value|
15
+ options[:image] = File.realpath(value)
16
+ end
17
+
18
+ o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
19
+ options[:output] = File.expand_path(value)
20
+ end
21
+ end.parse!
22
+
23
+ if language = ARGV.shift
24
+ options[:box] = File.realpath("#{language}.box")
25
+ options[:image] = File.realpath("#{language}.tif")
26
+ options[:output] = File.expand_path("#{language}.traineddata")
27
+ else
28
+ language = options[:box][/^(.*?)\./, 1]
29
+ end
30
+
31
+ Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
32
+
33
+ language = language.shellescape
34
+
35
+ %x{
36
+ cp #{options[:box].shellescape} #{language}.box
37
+ cp #{options[:image].shellescape} #{language}#{File.extname(options[:image])}
38
+
39
+ tesseract #{language}#{File.extname(options[:image])} #{language} nobatch box.train.stderr
40
+
41
+ unicharset_extractor #{language}.box
42
+
43
+ echo #{language} 0 0 0 0 0 > font_properties
44
+ mftraining -F font_properties -U unicharset #{language}.tr
45
+ mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
46
+ cntraining #{language}.tr
47
+
48
+ mv Microfeat #{language}.Microfeat
49
+ mv normproto #{language}.normproto
50
+ mv pffmtable #{language}.pffmtable
51
+ mv mfunicharset #{language}.mfunicharset
52
+ mv inttemp #{language}.inttemp
53
+
54
+ combine_tessdata #{language}.
55
+
56
+ mv #{language}.traineddata #{options[:output].shellescape}
57
+ }
58
+
59
+ path = File.realpath(Dir.pwd)
60
+
61
+ Dir.chdir '/'
62
+
63
+ FileUtils.rm_rf path
@@ -2,21 +2,29 @@
2
2
  require 'tesseract'
3
3
  require 'RMagick'
4
4
 
5
+ # this function is used to get points near the current pixel to
6
+ # cleanup the oblique lines mess, horizontal points seem to output
7
+ # better cleanup
5
8
  def near (x, y)
6
9
  [
7
- [x - 1, y - 1],
8
- [x, y - 1],
9
- [x + 1, y - 1],
10
+ # [x - 1, y - 1],
11
+ # [x, y - 1],
12
+ # [x + 1, y - 1],
10
13
  [x - 1, y ],
11
- # FIRE IN THE HOLE
12
14
  [x + 1, y ],
13
- [x - 1, y + 1],
14
- [x, y + 1],
15
- [x + 1, y + 1]
15
+ # [x - 1, y + 1],
16
+ # [x, y + 1],
17
+ # [x + 1, y + 1]
16
18
  ]
17
19
  end
18
20
 
19
- Tesseract::Engine.new.tap {|engine|
21
+ ENV['TESSDATA_PREFIX'] = './'
22
+
23
+ Tesseract::Engine.new {|engine|
24
+ engine.language = :lol
25
+ engine.page_segmentation_mode = 8
26
+ engine.whitelist = [*'a'..'z', *'A'..'Z', *0..9].join
27
+ }.tap {|engine|
20
28
  ARGV.each {|path|
21
29
  image = Magick::Image.read(path).first
22
30
  pixels = Hash.new { |h, k| h[k] = 0 }
@@ -51,8 +59,8 @@ Tesseract::Engine.new.tap {|engine|
51
59
  image.pixel_color x, y, 'black'
52
60
  }
53
61
 
54
- File.open('/tmp/lol.png', ?w) { |f| f.write(image.resize(10).to_blob) }
62
+ image.scale(4).display if ENV['DEBUG']
55
63
 
56
- puts engine.text_for(image.resize 10).strip
64
+ puts "#{path}: #{engine.text_for(image.scale(4)).strip}"
57
65
  }
58
66
  }
@@ -0,0 +1,100 @@
1
+ enKzaV
2
+ CZU6tf
3
+ ZO5mGY
4
+ FNNhZv
5
+ Dwp1Vy
6
+ JYsDAi
7
+ dld510
8
+ yG615j
9
+ WHxAUZ
10
+ k9IhZu
11
+ qWIPSr
12
+ nDSXc5
13
+ 9iTYeZ
14
+ s44iQ9
15
+ VPNXWy
16
+ 80zxvW
17
+ QA7IYj
18
+ D8Ro4U
19
+ OiEg1U
20
+ pJS7Z8
21
+ 6w8eik
22
+ s5igED
23
+ 7bJe8p
24
+ VtYdW3
25
+ jNNdcO
26
+ neLPNV
27
+ KONPnl
28
+ Q8aSXJ
29
+ kIwSqv
30
+ 8LQExn
31
+ RwcDU2
32
+ LMLg5K
33
+ C0YmdD
34
+ mqAvES
35
+ Ai0Wxi
36
+ bopETp
37
+ L3yP5u
38
+ w4rw3b
39
+ oSEUMU
40
+ bftqDK
41
+ mM7cKE
42
+ rYl6x4
43
+ 3hVI8X
44
+ Tm2PPp
45
+ VfmqQ6
46
+ 0EZAgC
47
+ QW6gBS
48
+ UTS137
49
+ YXXTqk
50
+ a6LU3K
51
+ SVzguN
52
+ l9G8Y9
53
+ ZP9TDM
54
+ yj7zmS
55
+ sD0Ub9
56
+ XeWY3A
57
+ w8EKVl
58
+ 3gO266
59
+ yYN3oJ
60
+ NumjLi
61
+ EEzwCz
62
+ 8bUSjW
63
+ GAo6ap
64
+ AXcn6K
65
+ KquWmp
66
+ PM8LYt
67
+ uGS7GO
68
+ mucVzf
69
+ UIZ6cf
70
+ mXRdGq
71
+ lcn8OP
72
+ lKqkw7
73
+ CTHnR1
74
+ ShvmSD
75
+ klCoXI
76
+ 8epftU
77
+ nA277p
78
+ uXfavQ
79
+ EhInXB
80
+ KJLUZf
81
+ nAWswu
82
+ savOX1
83
+ bJtoCK
84
+ cTnmnF
85
+ hoYGbS
86
+ d3J04A
87
+ c18WaN
88
+ aO5VVN
89
+ DvP3vf
90
+ 7cuFOq
91
+ mPaSmA
92
+ k0rNqQ
93
+ tvG40k
94
+ 154xFW
95
+ x9ioCI
96
+ p9rR4J
97
+ DNLPJA
98
+ wGR2O1
99
+ zVr96r
100
+ 856sHR