tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612)
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1105 @@
1
+ /**********************************************************************
2
+ * File: baseapi.cpp
3
+ * Description: Simple API for calling tesseract.
4
+ * Author: Ray Smith
5
+ * Created: Fri Oct 06 15:35:01 PDT 2006
6
+ *
7
+ * (C) Copyright 2006, Google Inc.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "baseapi.h"
21
+
22
+
23
+ // Include automatically generated configuration file if running autoconf.
24
+ #ifdef HAVE_CONFIG_H
25
+ #include "config_auto.h"
26
+ #endif
27
+
28
+ #ifdef HAVE_LIBLEPT
29
+ // Include leptonica library only if autoconf (or makefile etc) tell us to.
30
+ #include "allheaders.h"
31
+ #endif
32
+
33
+ #include "tessedit.h"
34
+ #include "ocrclass.h"
35
+ #include "pageres.h"
36
+ #include "tessvars.h"
37
+ #include "control.h"
38
+ #include "applybox.h"
39
+ #include "pgedit.h"
40
+ #include "varabled.h"
41
+ #include "variables.h"
42
+ #include "output.h"
43
+ #include "globals.h"
44
+ #include "adaptmatch.h"
45
+ #include "edgblob.h"
46
+ #include "tessbox.h"
47
+ #include "tordvars.h"
48
+ #include "imgs.h"
49
+ #include "makerow.h"
50
+ #include "tstruct.h"
51
+ #include "tessout.h"
52
+ #include "tface.h"
53
+ #include "permute.h"
54
+
55
+ BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
56
+ "Take segmentation and labeling from box file");
57
+ BOOL_VAR(tessedit_train_from_boxes, FALSE,
58
+ "Generate training data from boxed chars");
59
+
60
+ // Minimum sensible image size to be worth running tesseract.
61
+ const int kMinRectSize = 10;
62
+
63
+ static STRING input_file = "noname.tif";
64
+
65
+ // Set the value of an internal "variable" (of either old or new types).
66
+ // Supply the name of the variable and the value as a string, just as
67
+ // you would in a config file.
68
+ // Returns false if the name lookup failed.
69
+ bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
70
+ if (set_new_style_variable(variable, value))
71
+ return true;
72
+ return set_old_style_variable(variable, value);
73
+ }
74
+
75
+ void TessBaseAPI::SimpleInit(const char* datapath,
76
+ const char* language,
77
+ bool numeric_mode) {
78
+ InitWithLanguage(datapath, NULL, language, NULL, numeric_mode, 0, NULL);
79
+ }
80
+
81
+ // Start tesseract.
82
+ // The datapath must be the name of the data directory or some other file
83
+ // in which the data directory resides (for instance argv[0].)
84
+ // The configfile is the name of a file in the tessconfigs directory
85
+ // (eg batch) or NULL to run on defaults.
86
+ // Outputbase may also be NULL, and is the basename of various output files.
87
+ // If the output of any of these files is enabled, then a name nmust be given.
88
+ // If numeric_mode is true, only possible digits and roman numbers are
89
+ // returned. Returns 0 if successful. Crashes if not.
90
+ // The argc and argv may be 0 and NULL respectively. They are used for
91
+ // providing config files for debug/display purposes.
92
+ // TODO(rays) get the facts straight. Is it OK to call
93
+ // it more than once? Make it properly check for errors and return them.
94
+ int TessBaseAPI::Init(const char* datapath, const char* outputbase,
95
+ const char* configfile, bool numeric_mode,
96
+ int argc, char* argv[]) {
97
+ return InitWithLanguage(datapath, outputbase, NULL, configfile,
98
+ numeric_mode, argc, argv);
99
+ }
100
+
101
+ // Start tesseract.
102
+ // Similar to Init() except that it is possible to specify the language.
103
+ // Language is the code of the language for which the data will be loaded.
104
+ // (Codes follow ISO 639-3.) If it is NULL, english (eng) will be loaded.
105
+ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
106
+ const char* language, const char* configfile,
107
+ bool numeric_mode, int argc, char* argv[]) {
108
+ int result = init_tesseract(datapath, outputbase, language,
109
+ configfile, argc, argv);
110
+
111
+ bln_numericmode.set_value(numeric_mode);
112
+ return result;
113
+ }
114
+
115
+ // Init the lang model component of Tesseract
116
+ int TessBaseAPI::InitLangMod(const char* datapath, const char* outputbase,
117
+ const char* language, const char* configfile,
118
+ bool numeric_mode, int argc, char* argv[]) {
119
+ return init_tesseract_lm(datapath, outputbase, language,
120
+ configfile, argc, argv);
121
+ }
122
+
123
+ // Set the name of the input file. Needed only for training and
124
+ // loading a UNLV zone file.
125
+ void TessBaseAPI::SetInputName(const char* name) {
126
+ input_file = name;
127
+ }
128
+
129
+ // Recognize a rectangle from an image and return the result as a string.
130
+ // May be called many times for a single Init.
131
+ // Currently has no error checking.
132
+ // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
133
+ // Palette color images will not work properly and must be converted to
134
+ // 24 bit.
135
+ // Binary images of 1 bit per pixel may also be given but they must be
136
+ // byte packed with the MSB of the first byte being the first pixel, and a
137
+ // one pixel is WHITE. For binary images set bytes_per_pixel=0.
138
+ // The recognized text is returned as a char* which (in future will be coded
139
+ // as UTF8 and) must be freed with the delete [] operator.
140
+ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
141
+ int bytes_per_pixel,
142
+ int bytes_per_line,
143
+ int left, int top,
144
+ int width, int height) {
145
+ if (width < kMinRectSize || height < kMinRectSize)
146
+ return NULL; // Nothing worth doing.
147
+
148
+ // Copy/Threshold the image to the tesseract global page_image.
149
+ CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
150
+ left, top, width, height);
151
+
152
+ return RecognizeToString();
153
+ }
154
+
155
+ // As TesseractRect but produces a box file as output.
156
+ char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
157
+ int bytes_per_pixel,
158
+ int bytes_per_line,
159
+ int left, int top,
160
+ int width, int height,
161
+ int imageheight) {
162
+ if (width < kMinRectSize || height < kMinRectSize)
163
+ return NULL; // Nothing worth doing.
164
+
165
+ // Copy/Threshold the image to the tesseract global page_image.
166
+ CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
167
+ left, top, width, height);
168
+
169
+ BLOCK_LIST block_list;
170
+
171
+ FindLines(&block_list);
172
+
173
+ // Now run the main recognition.
174
+ PAGE_RES* page_res = Recognize(&block_list, NULL);
175
+
176
+ return TesseractToBoxText(page_res, left, imageheight - (top + height));
177
+ }
178
+
179
+ char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
180
+ int bytes_per_pixel,
181
+ int bytes_per_line,
182
+ int left, int top,
183
+ int width, int height) {
184
+ if (width < kMinRectSize || height < kMinRectSize)
185
+ return NULL; // Nothing worth doing.
186
+
187
+ // Copy/Threshold the image to the tesseract global page_image.
188
+ CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
189
+ left, top, width, height);
190
+
191
+ BLOCK_LIST block_list;
192
+
193
+ FindLines(&block_list);
194
+
195
+ // Now run the main recognition.
196
+ PAGE_RES* page_res = Recognize(&block_list, NULL);
197
+
198
+ return TesseractToUNLV(page_res);
199
+ }
200
+
201
+ // Call between pages or documents etc to free up memory and forget
202
+ // adaptive data.
203
+ void TessBaseAPI::ClearAdaptiveClassifier() {
204
+ ResetAdaptiveClassifier();
205
+ }
206
+
207
+ // Close down tesseract and free up memory.
208
+ void TessBaseAPI::End() {
209
+ ResetAdaptiveClassifier();
210
+ end_tesseract();
211
+ }
212
+
213
+ // Dump the internal binary image to a PGM file.
214
+ void TessBaseAPI::DumpPGM(const char* filename) {
215
+ IMAGELINE line;
216
+ line.init(page_image.get_xsize());
217
+ FILE *fp = fopen(filename, "w");
218
+ fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", page_image.get_xsize(),
219
+ page_image.get_ysize());
220
+ for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
221
+ page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
222
+ for (int i = 0; i < page_image.get_xsize(); ++i) {
223
+ uinT8 b = line.pixels[i] ? 255 : 0;
224
+ fwrite(&b, 1, 1, fp);
225
+ }
226
+ }
227
+ fclose(fp);
228
+ }
229
+
#ifdef HAVE_LIBLEPT
// Only compiled when Leptonica is available (HAVE_LIBLEPT defined).
// Returns the thresholded global image held by Tesseract, converted by
// page_image.ToPix().
Pix* TessBaseAPI::GetTesseractImage() {
  Pix* thresholded = page_image.ToPix();
  return thresholded;
}
#endif  // HAVE_LIBLEPT
237
+
238
+ // Copy the given image rectangle to Tesseract, with adaptive thresholding
239
+ // if the image is not already binary.
240
+ void TessBaseAPI::CopyImageToTesseract(const unsigned char* imagedata,
241
+ int bytes_per_pixel,
242
+ int bytes_per_line,
243
+ int left, int top,
244
+ int width, int height) {
245
+ if (bytes_per_pixel > 0) {
246
+ // Threshold grey or color.
247
+ int* thresholds = new int[bytes_per_pixel];
248
+ int* hi_values = new int[bytes_per_pixel];
249
+
250
+ // Compute the thresholds.
251
+ OtsuThreshold(imagedata, bytes_per_pixel, bytes_per_line,
252
+ left, top, left + width, top + height,
253
+ thresholds, hi_values);
254
+
255
+ // Threshold the image to the tesseract global page_image.
256
+ ThresholdRect(imagedata, bytes_per_pixel, bytes_per_line,
257
+ left, top, width, height,
258
+ thresholds, hi_values);
259
+ delete [] thresholds;
260
+ delete [] hi_values;
261
+ } else {
262
+ CopyBinaryRect(imagedata, bytes_per_line, left, top, width, height);
263
+ }
264
+ }
265
+
266
+ // Compute the Otsu threshold(s) for the given image rectangle, making one
267
+ // for each channel. Each channel is always one byte per pixel.
268
+ // Returns an array of threshold values and an array of hi_values, such
269
+ // that a pixel value >threshold[channel] is considered foreground if
270
+ // hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates
271
+ // that there is no apparent foreground. At least one hi_value will not be -1.
272
+ // thresholds and hi_values are assumed to be of bytes_per_pixel size.
273
+ void TessBaseAPI::OtsuThreshold(const unsigned char* imagedata,
274
+ int bytes_per_pixel,
275
+ int bytes_per_line,
276
+ int left, int top, int right, int bottom,
277
+ int* thresholds,
278
+ int* hi_values) {
279
+ // Of all channels with no good hi_value, keep the best so we can always
280
+ // produce at least one answer.
281
+ int best_hi_value = 0;
282
+ int best_hi_index = 0;
283
+ bool any_good_hivalue = false;
284
+ double best_hi_dist = 0.0;
285
+
286
+ for (int ch = 0; ch < bytes_per_pixel; ++ch) {
287
+ thresholds[ch] = 0;
288
+ hi_values[ch] = -1;
289
+ // Compute the histogram of the image rectangle.
290
+ int histogram[256];
291
+ HistogramRect(imagedata + ch, bytes_per_pixel, bytes_per_line,
292
+ left, top, right, bottom, histogram);
293
+ int H;
294
+ int best_omega_0;
295
+ int best_t = OtsuStats(histogram, &H, &best_omega_0);
296
+ if (best_omega_0 == 0 || best_omega_0 == H) {
297
+ // This channel is empty.
298
+ continue;
299
+ }
300
+ // To be a convincing foreground we must have a small fraction of H
301
+ // or to be a convincing background we must have a large fraction of H.
302
+ // In between we assume this channel contains no thresholding information.
303
+ int hi_value = best_omega_0 < H * 0.5;
304
+ thresholds[ch] = best_t;
305
+ if (best_omega_0 > H * 0.75) {
306
+ any_good_hivalue = true;
307
+ hi_values[ch] = 0;
308
+ }
309
+ else if (best_omega_0 < H * 0.25) {
310
+ any_good_hivalue = true;
311
+ hi_values[ch] = 1;
312
+ }
313
+ else {
314
+ // In case all channels are like this, keep the best of the bad lot.
315
+ double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
316
+ if (hi_dist > best_hi_dist) {
317
+ best_hi_dist = hi_dist;
318
+ best_hi_value = hi_value;
319
+ best_hi_index = ch;
320
+ }
321
+ }
322
+ }
323
+ if (!any_good_hivalue) {
324
+ // Use the best of the ones that were not good enough.
325
+ hi_values[best_hi_index] = best_hi_value;
326
+ }
327
+ }
328
+
329
+ // Compute the histogram for the given image rectangle, and the given
330
+ // channel. (Channel pointed to by imagedata.) Each channel is always
331
+ // one byte per pixel.
332
+ // Bytes per pixel is used to skip channels not being
333
+ // counted with this call in a multi-channel (pixel-major) image.
334
+ // Histogram is always a 256 element array to count occurrences of
335
+ // each pixel value.
336
+ void TessBaseAPI::HistogramRect(const unsigned char* imagedata,
337
+ int bytes_per_pixel,
338
+ int bytes_per_line,
339
+ int left, int top, int right, int bottom,
340
+ int* histogram) {
341
+ int width = right - left;
342
+ memset(histogram, 0, sizeof(*histogram) * 256);
343
+ const unsigned char* pixels = imagedata +
344
+ top*bytes_per_line +
345
+ left*bytes_per_pixel;
346
+ for (int y = top; y < bottom; ++y) {
347
+ for (int x = 0; x < width; ++x) {
348
+ ++histogram[pixels[x * bytes_per_pixel]];
349
+ }
350
+ pixels += bytes_per_line;
351
+ }
352
+ }
353
+
354
+ // Compute the Otsu threshold(s) for the given histogram.
355
+ // Also returns H = total count in histogram, and
356
+ // omega0 = count of histogram below threshold.
357
+ int TessBaseAPI::OtsuStats(const int* histogram,
358
+ int* H_out,
359
+ int* omega0_out) {
360
+ int H = 0;
361
+ double mu_T = 0.0;
362
+ for (int i = 0; i < 256; ++i) {
363
+ H += histogram[i];
364
+ mu_T += i * histogram[i];
365
+ }
366
+
367
+ // Now maximize sig_sq_B over t.
368
+ // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
369
+ int best_t = -1;
370
+ int omega_0, omega_1;
371
+ int best_omega_0 = 0;
372
+ double best_sig_sq_B = 0.0;
373
+ double mu_0, mu_1, mu_t;
374
+ omega_0 = 0;
375
+ mu_t = 0.0;
376
+ for (int t = 0; t < 255; ++t) {
377
+ omega_0 += histogram[t];
378
+ mu_t += t * static_cast<double>(histogram[t]);
379
+ if (omega_0 == 0)
380
+ continue;
381
+ omega_1 = H - omega_0;
382
+ mu_0 = mu_t / omega_0;
383
+ mu_1 = (mu_T - mu_t) / omega_1;
384
+ double sig_sq_B = mu_1 - mu_0;
385
+ sig_sq_B *= sig_sq_B * omega_0 * omega_1;
386
+ if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
387
+ best_sig_sq_B = sig_sq_B;
388
+ best_t = t;
389
+ best_omega_0 = omega_0;
390
+ }
391
+ }
392
+ if (H_out != NULL) *H_out = H;
393
+ if (omega0_out != NULL) *omega0_out = best_omega_0;
394
+ return best_t;
395
+ }
396
+
397
+ // Threshold the given grey or color image into the tesseract global
398
+ // image ready for recognition. Requires thresholds and hi_value
399
+ // produced by OtsuThreshold above.
400
+ void TessBaseAPI::ThresholdRect(const unsigned char* imagedata,
401
+ int bytes_per_pixel,
402
+ int bytes_per_line,
403
+ int left, int top,
404
+ int width, int height,
405
+ const int* thresholds,
406
+ const int* hi_values) {
407
+ IMAGELINE line;
408
+ page_image.create(width, height, 1);
409
+ line.init(width);
410
+ // For each line in the image, fill the IMAGELINE class and put it into the
411
+ // Tesseract global page_image. Note that Tesseract stores images with the
412
+ // bottom at y=0 and 0 is black, so we need 2 kinds of inversion.
413
+ const unsigned char* data = imagedata + top*bytes_per_line +
414
+ left*bytes_per_pixel;
415
+ for (int y = height - 1 ; y >= 0; --y) {
416
+ const unsigned char* pix = data;
417
+ for (int x = 0; x < width; ++x, pix += bytes_per_pixel) {
418
+ line.pixels[x] = 1;
419
+ for (int ch = 0; ch < bytes_per_pixel; ++ch) {
420
+ if (hi_values[ch] >= 0 &&
421
+ (pix[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
422
+ line.pixels[x] = 0;
423
+ break;
424
+ }
425
+ }
426
+ }
427
+ page_image.put_line(0, y, width, &line, 0);
428
+ data += bytes_per_line;
429
+ }
430
+ }
431
+
432
+ // Cut out the requested rectangle of the binary image to the
433
+ // tesseract global image ready for recognition.
434
+ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
435
+ int bytes_per_line,
436
+ int left, int top,
437
+ int width, int height) {
438
+ // Copy binary image, cutting out the required rectangle.
439
+ IMAGE image;
440
+ image.capture(const_cast<unsigned char*>(imagedata),
441
+ bytes_per_line*8, top + height, 1);
442
+ page_image.create(width, height, 1);
443
+ copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
444
+ }
445
+
446
+ // Low-level function to recognize the current global image to a string.
447
+ char* TessBaseAPI::RecognizeToString() {
448
+ BLOCK_LIST block_list;
449
+
450
+ FindLines(&block_list);
451
+
452
+ // Now run the main recognition.
453
+ PAGE_RES* page_res = Recognize(&block_list, NULL);
454
+
455
+ return TesseractToText(page_res);
456
+ }
457
+
458
+ // Find lines from the image making the BLOCK_LIST.
459
+ void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
460
+ // The following call creates a full-page block and then runs connected
461
+ // component analysis and text line creation.
462
+ pgeditor_read_file(input_file, block_list);
463
+ }
464
+
465
+ // Recognize the tesseract global image and return the result as Tesseract
466
+ // internal structures.
467
+ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
468
+ if (tessedit_resegment_from_boxes)
469
+ apply_boxes(block_list);
470
+
471
+ PAGE_RES* page_res = new PAGE_RES(block_list);
472
+ if (interactive_mode) {
473
+ #ifndef GRAPHICS_DISABLED
474
+ pgeditor_main(block_list); // pgeditor user I/F
475
+ #endif
476
+ } else if (tessedit_train_from_boxes) {
477
+ apply_box_training(block_list);
478
+ } else {
479
+ // Now run the main recognition.
480
+ recog_all_words(page_res, monitor);
481
+ }
482
+ return page_res;
483
+ }
484
+
485
+ // Return the maximum length that the output text string might occupy.
486
+ int TessBaseAPI::TextLength(PAGE_RES* page_res) {
487
+ PAGE_RES_IT page_res_it(page_res);
488
+ int total_length = 2;
489
+ // Iterate over the data structures to extract the recognition result.
490
+ for (page_res_it.restart_page(); page_res_it.word () != NULL;
491
+ page_res_it.forward()) {
492
+ WERD_RES *word = page_res_it.word();
493
+ WERD_CHOICE* choice = word->best_choice;
494
+ if (choice != NULL) {
495
+ total_length += choice->string().length() + 1;
496
+ for (int i = 0; i < word->reject_map.length(); ++i) {
497
+ if (word->reject_map[i].rejected())
498
+ ++total_length;
499
+ }
500
+ }
501
+ }
502
+ return total_length;
503
+ }
504
+
505
+ // Returns an array of all word confidences, terminated by -1.
506
+ int* TessBaseAPI::AllTextConfidences(PAGE_RES* page_res) {
507
+ if (!page_res) return NULL;
508
+ int n_word = 0;
509
+ PAGE_RES_IT res_it(page_res);
510
+ for (res_it.restart_page(); res_it.word () != NULL; res_it.forward())
511
+ n_word++;
512
+
513
+ int* conf = new int[n_word+1];
514
+ n_word = 0;
515
+ for (res_it.restart_page(); res_it.word () != NULL; res_it.forward()) {
516
+ WERD_RES *word = res_it.word();
517
+ WERD_CHOICE* choice = word->best_choice;
518
+ int w_conf = static_cast<int>(100 + 5 * choice->certainty());
519
+ // This is the eq for converting Tesseract confidence to 1..100
520
+ if (w_conf < 0) w_conf = 0;
521
+ if (w_conf > 100) w_conf = 100;
522
+ conf[n_word++] = w_conf;
523
+ }
524
+ conf[n_word] = -1;
525
+ return conf;
526
+ }
527
+
528
+ // Returns the average word confidence for Tesseract page result.
529
+ int TessBaseAPI::TextConf(PAGE_RES* page_res) {
530
+ int* conf = AllTextConfidences(page_res);
531
+ if (!conf) return 0;
532
+ int sum = 0;
533
+ int *pt = conf;
534
+ while (*pt >= 0) sum += *pt++;
535
+ if (pt != conf) sum /= pt - conf;
536
+ delete [] conf;
537
+ return sum;
538
+ }
539
+
540
+ // Make a text string from the internal data structures.
541
+ // The input page_res is deleted.
542
+ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
543
+ if (page_res != NULL) {
544
+ int total_length = TextLength(page_res);
545
+ PAGE_RES_IT page_res_it(page_res);
546
+ char* result = new char[total_length];
547
+ char* ptr = result;
548
+ for (page_res_it.restart_page(); page_res_it.word () != NULL;
549
+ page_res_it.forward()) {
550
+ WERD_RES *word = page_res_it.word();
551
+ WERD_CHOICE* choice = word->best_choice;
552
+ if (choice != NULL) {
553
+ strcpy(ptr, choice->string().string());
554
+ ptr += strlen(ptr);
555
+ if (word->word->flag(W_EOL))
556
+ *ptr++ = '\n';
557
+ else
558
+ *ptr++ = ' ';
559
+ }
560
+ }
561
+ *ptr++ = '\n';
562
+ *ptr = '\0';
563
+ delete page_res;
564
+ return result;
565
+ }
566
+ return NULL;
567
+ }
568
+
569
+ static int ConvertWordToBoxText(WERD_RES *word,
570
+ ROW_RES* row,
571
+ int left,
572
+ int bottom,
573
+ char* word_str) {
574
+ // Copy the output word and denormalize it back to image coords.
575
+ WERD copy_outword;
576
+ copy_outword = *(word->outword);
577
+ copy_outword.baseline_denormalise(&word->denorm);
578
+ PBLOB_IT blob_it;
579
+ blob_it.set_to_list(copy_outword.blob_list());
580
+ int length = copy_outword.blob_list()->length();
581
+ int output_size = 0;
582
+
583
+ if (length > 0) {
584
+ for (int index = 0, offset = 0; index < length;
585
+ offset += word->best_choice->lengths()[index++], blob_it.forward()) {
586
+ PBLOB* blob = blob_it.data();
587
+ TBOX blob_box = blob->bounding_box();
588
+ int box_left = MAX(blob_box.left(), 0);
589
+ int box_right = MIN(blob_box.right(), page_image.get_xsize());
590
+ int box_bottom = MAX(blob_box.bottom(), 0);
591
+ int box_top = MIN(blob_box.top(), page_image.get_ysize());
592
+
593
+ if (word->tess_failed || box_left >= box_right || box_bottom >= box_top) {
594
+ // Bounding boxes can be illegal when tess fails on a word.
595
+ TBOX word_box = word->word->bounding_box(); // Original word is backup.
596
+ if (box_left < word_box.left()) box_left = word_box.left();
597
+ if (box_right > word_box.right()) box_right = word_box.right();
598
+ if (box_bottom < word_box.bottom()) box_bottom = word_box.bottom();
599
+ if (box_top > word_box.top()) box_top = word_box.top();
600
+ tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
601
+ box_left, box_bottom, box_right, box_top);
602
+ }
603
+
604
+ // A single classification unit can be composed of several UTF-8
605
+ // characters. Append each of them to the result.
606
+ for (int sub = 0; sub < word->best_choice->lengths()[index]; ++sub) {
607
+ char ch = word->best_choice->string()[offset + sub];
608
+ // Tesseract uses space for recognition failure. Fix to a reject
609
+ // character, '~' so we don't create illegal box files.
610
+ if (ch == ' ')
611
+ ch = '~';
612
+ word_str[output_size++] = ch;
613
+ }
614
+ sprintf(word_str + output_size, " %d %d %d %d\n",
615
+ box_left + left,box_bottom + bottom,
616
+ box_right + left, box_top + bottom);
617
+ output_size += strlen(word_str + output_size);
618
+ }
619
+ }
620
+ return output_size;
621
+ }
622
+
// Multiplier applied to the text length to size box file output. Assumes
// that each character needs 4 numbers of 5 digits with a space each, plus
// the newline and the original character.
const int kMaxCharsPerChar = 4 * (5 + 1) + 2;
626
+
627
+ // Make a text string from the internal data structures.
628
+ // The input page_res is deleted.
629
+ // The text string takes the form of a box file as needed for training.
630
+ char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
631
+ int left, int bottom) {
632
+ if (page_res != NULL) {
633
+ int total_length = TextLength(page_res) * kMaxCharsPerChar;
634
+ PAGE_RES_IT page_res_it(page_res);
635
+ char* result = new char[total_length];
636
+ char* ptr = result;
637
+ for (page_res_it.restart_page(); page_res_it.word () != NULL;
638
+ page_res_it.forward()) {
639
+ WERD_RES *word = page_res_it.word();
640
+ ptr += ConvertWordToBoxText(word,page_res_it.row(),left, bottom, ptr);
641
+ }
642
+ *ptr = '\0';
643
+ delete page_res;
644
+ return result;
645
+ }
646
+ return NULL;
647
+ }
648
+
// Character written to the output in place of anything unrecognized.
const char kUnrecognized = '~';

// Conversion table for non-latin characters: kUniChs[i] is a unicode char
// from outside the latin set and kLatinChs[i] is the latin char it maps to.
// Both tables are terminated by 0.
// TODO(rays) incorporate this translation into unicharset.
const int kUniChs[] = {
  0x20ac,  // euro sign
  0x201c,  // left double quotation mark
  0x201d,  // right double quotation mark
  0x2018,  // left single quotation mark
  0x2019,  // right single quotation mark
  0x2022,  // bullet
  0x2014,  // em dash
  0
};
// Latin chars corresponding to the unicode chars above.
const int kLatinChs[] = {
  0x00a2,  // cent sign
  0x0022,  // quotation mark
  0x0022,  // quotation mark
  0x0027,  // apostrophe
  0x0027,  // apostrophe
  0x00b7,  // middle dot
  0x002d,  // hyphen-minus
  0
};

// Make a text string from the internal data structures.
// The input page_res is deleted. The text string is converted
// to UNLV-format: Latin-1 with specific reject and suspect codes.
663
+
664
+ char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
665
+ bool tilde_crunch_written = false;
666
+ bool last_char_was_newline = true;
667
+ bool last_char_was_tilde = false;
668
+
669
+ if (page_res != NULL) {
670
+ int total_length = TextLength(page_res);
671
+ PAGE_RES_IT page_res_it(page_res);
672
+ char* result = new char[total_length];
673
+ char* ptr = result;
674
+ for (page_res_it.restart_page(); page_res_it.word () != NULL;
675
+ page_res_it.forward()) {
676
+ WERD_RES *word = page_res_it.word();
677
+ // Process the current word.
678
+ if (word->unlv_crunch_mode != CR_NONE) {
679
+ if (word->unlv_crunch_mode != CR_DELETE &&
680
+ (!tilde_crunch_written ||
681
+ (word->unlv_crunch_mode == CR_KEEP_SPACE &&
682
+ word->word->space () > 0 &&
683
+ !word->word->flag (W_FUZZY_NON) &&
684
+ !word->word->flag (W_FUZZY_SP)))) {
685
+ if (!word->word->flag (W_BOL) &&
686
+ word->word->space () > 0 &&
687
+ !word->word->flag (W_FUZZY_NON) &&
688
+ !word->word->flag (W_FUZZY_SP)) {
689
+ /* Write a space to separate from preceding good text */
690
+ *ptr++ = ' ';
691
+ last_char_was_tilde = false;
692
+ }
693
+ if (!last_char_was_tilde) {
694
+ // Write a reject char.
695
+ last_char_was_tilde = true;
696
+ *ptr++ = kUnrecognized;
697
+ tilde_crunch_written = true;
698
+ last_char_was_newline = false;
699
+ }
700
+ }
701
+ } else {
702
+ // NORMAL PROCESSING of non tilde crunched words.
703
+ tilde_crunch_written = false;
704
+
705
+ if (last_char_was_tilde &&
706
+ word->word->space () == 0 &&
707
+ (word->best_choice->string ()[0] == ' ')) {
708
+ /* Prevent adjacent tilde across words - we know that adjacent tildes within
709
+ words have been removed */
710
+ char* p = (char *) word->best_choice->string().string ();
711
+ strcpy (p, p + 1); //shuffle up
712
+ p = (char *) word->best_choice->lengths().string ();
713
+ strcpy (p, p + 1); //shuffle up
714
+ word->reject_map.remove_pos (0);
715
+ PBLOB_IT blob_it = word->outword->blob_list ();
716
+ delete blob_it.extract (); //get rid of reject blob
717
+ }
718
+
719
+ if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
720
+ ensure_rep_chars_are_consistent(word);
721
+
722
+ set_unlv_suspects(word);
723
+ const char* wordstr = word->best_choice->string().string();
724
+ if (wordstr[0] != 0) {
725
+ if (!last_char_was_newline)
726
+ *ptr++ = ' ';
727
+ else
728
+ last_char_was_newline = false;
729
+ int offset = 0;
730
+ const STRING& lengths = word->best_choice->lengths();
731
+ int length = lengths.length();
732
+ for (int i = 0; i < length; offset += lengths[i++]) {
733
+ if (wordstr[offset] == ' ' ||
734
+ wordstr[offset] == '~' ||
735
+ wordstr[offset] == '|') {
736
+ *ptr++ = kUnrecognized;
737
+ last_char_was_tilde = true;
738
+ } else {
739
+ if (word->reject_map[i].rejected())
740
+ *ptr++ = '^';
741
+ UNICHAR ch(wordstr + offset, lengths[i]);
742
+ int uni_ch = ch.first_uni();
743
+ for (int j = 0; kUniChs[j] != 0; ++j) {
744
+ if (kUniChs[j] == uni_ch) {
745
+ uni_ch = kLatinChs[j];
746
+ break;
747
+ }
748
+ }
749
+ if (uni_ch <= 0xff) {
750
+ *ptr++ = static_cast<char>(uni_ch);
751
+ last_char_was_tilde = false;
752
+ } else {
753
+ *ptr++ = kUnrecognized;
754
+ last_char_was_tilde = true;
755
+ }
756
+ }
757
+ }
758
+ }
759
+ }
760
+ if (word->word->flag(W_EOL) && !last_char_was_newline) {
761
+ /* Add a new line output */
762
+ *ptr++ = '\n';
763
+ tilde_crunch_written = false;
764
+ last_char_was_newline = true;
765
+ last_char_was_tilde = false;
766
+ }
767
+ }
768
+ *ptr++ = '\n';
769
+ *ptr = '\0';
770
+ delete page_res;
771
+ return result;
772
+ }
773
+ return NULL;
774
+ }
775
+ // ____________________________________________________________________________
776
+ // Ocropus add-ons.
777
+
778
+ // Find lines from the image making the BLOCK_LIST.
779
+ BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
780
+ BLOCK_LIST *block_list = new BLOCK_LIST();
781
+ FindLines(block_list);
782
+ return block_list;
783
+ }
784
+
785
+ // Delete a block list.
786
+ // This is to keep BLOCK_LIST pointer opaque
787
+ // and let go of including the other headers.
788
+ void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
789
+ delete block_list;
790
+ }
791
+
792
+
793
+ static ROW *make_tess_ocrrow(float baseline,
794
+ float xheight,
795
+ float descender,
796
+ float ascender) {
797
+ inT32 xstarts[] = {-32000};
798
+ double quad_coeffs[] = {0,0,baseline};
799
+ return new ROW(1,
800
+ xstarts,
801
+ quad_coeffs,
802
+ xheight,
803
+ ascender - (baseline + xheight),
804
+ descender - baseline,
805
+ 0,
806
+ 0);
807
+ }
808
+
809
+ // Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
810
+ static void fill_dummy_row(float baseline, float xheight,
811
+ float descender, float ascender,
812
+ TEXTROW* tessrow) {
813
+ tessrow->baseline.segments = 1;
814
+ tessrow->baseline.xstarts[0] = -32767;
815
+ tessrow->baseline.xstarts[1] = 32767;
816
+ tessrow->baseline.quads[0].a = 0;
817
+ tessrow->baseline.quads[0].b = 0;
818
+ tessrow->baseline.quads[0].c = bln_baseline_offset;
819
+ tessrow->xheight.segments = 1;
820
+ tessrow->xheight.xstarts[0] = -32767;
821
+ tessrow->xheight.xstarts[1] = 32767;
822
+ tessrow->xheight.quads[0].a = 0;
823
+ tessrow->xheight.quads[0].b = 0;
824
+ tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
825
+ tessrow->lineheight = bln_x_height;
826
+ tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
827
+ tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
828
+ }
829
+
830
+
831
+ /// Return a TBLOB * from the whole page_image.
832
+ /// To be freed later with free_blob().
833
+ TBLOB *make_tesseract_blob(float baseline, float xheight, float descender, float ascender) {
834
+ BLOCK *block = new BLOCK ("a character",
835
+ TRUE,
836
+ 0, 0,
837
+ 0, 0,
838
+ page_image.get_xsize(),
839
+ page_image.get_ysize());
840
+
841
+ // Create C_BLOBs from the page
842
+ extract_edges(
843
+ #ifndef GRAPHICS_DISABLED
844
+ NULL,
845
+ #endif
846
+ &page_image, &page_image,
847
+ ICOORD(page_image.get_xsize(), page_image.get_ysize()),
848
+ block);
849
+
850
+ // Create one PBLOB from all C_BLOBs
851
+ C_BLOB_LIST *list = block->blob_list();
852
+ C_BLOB_IT c_blob_it(list);
853
+ PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list
854
+ for (c_blob_it.mark_cycle_pt();
855
+ !c_blob_it.cycled_list();
856
+ c_blob_it.forward()) {
857
+ C_BLOB *c_blob = c_blob_it.data();
858
+ PBLOB c_as_p(c_blob, baseline + xheight);
859
+ merge_blobs(pblob, &c_as_p);
860
+ }
861
+ PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word
862
+ PBLOB_IT pblob_it(pblob_list);
863
+ pblob_it.add_after_then_move(pblob);
864
+
865
+ // Normalize PBLOB
866
+ WERD word(pblob_list, 0, " ");
867
+ ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
868
+ word.baseline_normalise(row);
869
+ delete row;
870
+
871
+ // Create a TBLOB from PBLOB
872
+ return make_tess_blob(pblob, /* flatten: */ TRUE);
873
+ }
874
+
875
+
876
+ // Adapt to recognize the current image as the given character.
877
+ // The image must be preloaded and be just an image of a single character.
878
+ void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
879
+ int length,
880
+ float baseline,
881
+ float xheight,
882
+ float descender,
883
+ float ascender) {
884
+ UNICHAR_ID id = unicharset.unichar_to_id(unichar_repr, length);
885
+ LINE_STATS LineStats;
886
+ TEXTROW row;
887
+ fill_dummy_row(baseline, xheight, descender, ascender, &row);
888
+ GetLineStatsFromRow(&row, &LineStats);
889
+
890
+ TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
891
+ float threshold;
892
+ int best_class = 0;
893
+ float best_rating = -100;
894
+
895
+
896
+ // Classify to get a raw choice.
897
+ LIST result = AdaptiveClassifier(blob, NULL, &row);
898
+ LIST p;
899
+ for (p = result; p != NULL; p = p->next) {
900
+ A_CHOICE *tesschoice = (A_CHOICE *) p->node;
901
+ if (tesschoice->rating > best_rating) {
902
+ best_rating = tesschoice->rating;
903
+ best_class = tesschoice->string[0];
904
+ }
905
+ }
906
+
907
+ FLOAT32 GetBestRatingFor(TBLOB *Blob, LINE_STATS *LineStats, CLASS_ID ClassId);
908
+
909
+ // We have to use char-level adaptation because otherwise
910
+ // someone should do forced alignment somewhere.
911
+ void AdaptToChar(TBLOB *Blob,
912
+ LINE_STATS *LineStats,
913
+ CLASS_ID ClassId,
914
+ FLOAT32 Threshold);
915
+
916
+
917
+ if (id == best_class)
918
+ threshold = GoodAdaptiveMatch;
919
+ else {
920
+ /* the blob was incorrectly classified - find the rating threshold
921
+ needed to create a template which will correct the error with
922
+ some margin. However, don't waste time trying to make
923
+ templates which are too tight. */
924
+ threshold = GetBestRatingFor(blob, &LineStats, id);
925
+ threshold *= .9;
926
+ const float max_threshold = .125;
927
+ const float min_threshold = .02;
928
+
929
+ if (threshold > max_threshold)
930
+ threshold = max_threshold;
931
+
932
+ // I have cuddled the following line to set it out of the strike
933
+ // of the coverage testing tool. I have no idea how to trigger
934
+ // this situation nor I have any necessity to do it. --mezhirov
935
+ if (threshold < min_threshold) threshold = min_threshold;
936
+ }
937
+
938
+ if (blob->outlines)
939
+ AdaptToChar(blob, &LineStats, id, threshold);
940
+ free_blob(blob);
941
+ }
942
+
943
+
944
+ PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
945
+ PAGE_RES *page_res = new PAGE_RES(block_list);
946
+ recog_all_words(page_res, NULL, NULL, 1);
947
+ return page_res;
948
+ }
949
+
950
+ PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
951
+ PAGE_RES* pass1_result) {
952
+ if (!pass1_result)
953
+ pass1_result = new PAGE_RES(block_list);
954
+ recog_all_words(pass1_result, NULL, NULL, 2);
955
+ return pass1_result;
956
+ }
957
+
958
+ struct TESS_CHAR : ELIST_LINK {
959
+ char *unicode_repr;
960
+ int length; // of unicode_repr
961
+ float cost;
962
+ TBOX box;
963
+
964
+ TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
965
+ length = (len == -1 ? strlen(repr) : len);
966
+ unicode_repr = new char[length + 1];
967
+ strncpy(unicode_repr, repr, length);
968
+ }
969
+
970
+ ~TESS_CHAR() {
971
+ delete unicode_repr;
972
+ }
973
+ };
974
+
975
+
976
+ static void add_space(ELIST_ITERATOR *it) {
977
+ TESS_CHAR *t = new TESS_CHAR(0, " ");
978
+ it->add_after_then_move(t);
979
+ }
980
+
981
+
982
// Converts a rating into a cost by adding 100 and clamping at zero.
// (Ratings below -100 have never been observed, but the clamp is cheap.)
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  return shifted < 0 ? 0 : shifted;
}
990
+
991
+
992
+ // Extract the OCR results, costs (penalty points for uncertainty),
993
+ // and the bounding boxes of the characters.
994
+ static void extract_result(ELIST_ITERATOR *out,
995
+ PAGE_RES* page_res) {
996
+ PAGE_RES_IT page_res_it(page_res);
997
+ int word_count = 0;
998
+ while (page_res_it.word() != NULL) {
999
+ WERD_RES *word = page_res_it.word();
1000
+ const char *str = word->best_choice->string().string();
1001
+ const char *len = word->best_choice->lengths().string();
1002
+
1003
+ if (word_count)
1004
+ add_space(out);
1005
+ TBOX bln_rect;
1006
+ PBLOB_LIST *blobs = word->outword->blob_list();
1007
+ PBLOB_IT it(blobs);
1008
+ int n = strlen(len);
1009
+ TBOX** boxes_to_fix = new TBOX*[n];
1010
+ for (int i = 0; i < n; i++) {
1011
+ PBLOB *blob = it.data();
1012
+ TBOX current = blob->bounding_box();
1013
+ bln_rect = bln_rect.bounding_union(current);
1014
+ TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
1015
+ str, *len);
1016
+ tc->box = current;
1017
+ boxes_to_fix[i] = &tc->box;
1018
+
1019
+ out->add_after_then_move(tc);
1020
+ it.forward();
1021
+ str += *len;
1022
+ len++;
1023
+ }
1024
+
1025
+ // Find the word bbox before normalization.
1026
+ // Here we can't use the C_BLOB bboxes directly,
1027
+ // since connected letters are not yet cut.
1028
+ TBOX real_rect = word->word->bounding_box();
1029
+
1030
+ // Denormalize boxes by transforming the bbox of the whole bln word
1031
+ // into the denorm bbox (`real_rect') of the whole word.
1032
+ double x_stretch = double(real_rect.width()) / bln_rect.width();
1033
+ double y_stretch = double(real_rect.height()) / bln_rect.height();
1034
+ for (int j = 0; j < n; j++) {
1035
+ TBOX *box = boxes_to_fix[j];
1036
+ int x0 = int(real_rect.left() +
1037
+ x_stretch * (box->left() - bln_rect.left()) + 0.5);
1038
+ int x1 = int(real_rect.left() +
1039
+ x_stretch * (box->right() - bln_rect.left()) + 0.5);
1040
+ int y0 = int(real_rect.bottom() +
1041
+ y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
1042
+ int y1 = int(real_rect.bottom() +
1043
+ y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
1044
+ *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
1045
+ }
1046
+ delete [] boxes_to_fix;
1047
+
1048
+ page_res_it.forward();
1049
+ word_count++;
1050
+ }
1051
+ }
1052
+
1053
+
1054
+ // Extract the OCR results, costs (penalty points for uncertainty),
1055
+ // and the bounding boxes of the characters.
1056
+ int TessBaseAPI::TesseractExtractResult(char** string,
1057
+ int** lengths,
1058
+ float** costs,
1059
+ int** x0,
1060
+ int** y0,
1061
+ int** x1,
1062
+ int** y1,
1063
+ PAGE_RES* page_res) {
1064
+ ELIST tess_chars;
1065
+ ELIST_ITERATOR tess_chars_it(&tess_chars);
1066
+ extract_result(&tess_chars_it, page_res);
1067
+ tess_chars_it.move_to_first();
1068
+ int n = tess_chars.length();
1069
+ int string_len = 0;
1070
+ *lengths = new int[n];
1071
+ *costs = new float[n];
1072
+ *x0 = new int[n];
1073
+ *y0 = new int[n];
1074
+ *x1 = new int[n];
1075
+ *y1 = new int[n];
1076
+ int i = 0;
1077
+ for (tess_chars_it.mark_cycle_pt();
1078
+ !tess_chars_it.cycled_list();
1079
+ tess_chars_it.forward(), i++) {
1080
+ TESS_CHAR *tc = (TESS_CHAR *) tess_chars_it.data();
1081
+ string_len += (*lengths)[i] = tc->length;
1082
+ (*costs)[i] = tc->cost;
1083
+ (*x0)[i] = tc->box.left();
1084
+ (*y0)[i] = tc->box.bottom();
1085
+ (*x1)[i] = tc->box.right();
1086
+ (*y1)[i] = tc->box.top();
1087
+ }
1088
+ char *p = *string = new char[string_len];
1089
+
1090
+ tess_chars_it.move_to_first();
1091
+ for (tess_chars_it.mark_cycle_pt();
1092
+ !tess_chars_it.cycled_list();
1093
+ tess_chars_it.forward()) {
1094
+ TESS_CHAR *tc = (TESS_CHAR *) tess_chars_it.data();
1095
+ strncpy(p, tc->unicode_repr, tc->length);
1096
+ p += tc->length;
1097
+ }
1098
+ return n;
1099
+ }
1100
+
1101
+ // Check whether a word is valid according to Tesseract's language model
1102
+ // returns 0 if the string is invalid, non-zero if valid
1103
+ int TessBaseAPI::IsValidWord(const char *string) {
1104
+ return valid_word(string);
1105
+ }