tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612):
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,891 @@
1
+ /**********************************************************************
2
+ * File: applybox.cpp (Formerly applybox.c)
3
+ * Description: Re segment rows according to box file data
4
+ * Author: Phil Cheatle
5
+ * Created: Wed Nov 24 09:11:23 GMT 1993
6
+ *
7
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+ /*
20
+ define SECURE_NAMES for code versions which go to UNLV to stop tessedit
21
+ including all the newdiff stuff (which contains lots of text indicating
22
+ what measures we are interested in).
23
+ */
24
+ /* #define SECURE_NAMES done in secnames.h when necessary*/
25
+
26
+ #include "mfcpch.h"
27
+ #include "applybox.h"
28
+ #include <ctype.h>
29
+ #include <string.h>
30
+ #ifdef __UNIX__
31
+ #include <assert.h>
32
+ #include <errno.h>
33
+ #endif
34
+ #include "boxread.h"
35
+ #include "mainblk.h"
36
+ #include "genblob.h"
37
+ #include "fixxht.h"
38
+ #include "control.h"
39
+ #include "tessbox.h"
40
+ #include "globals.h"
41
+ #include "secname.h"
42
+ #include "unichar.h"
43
+ #include "matchdefs.h"
44
+
45
+ #define SECURE_NAMES
46
+ #ifndef SECURE_NAMES
47
+ #include "wordstats.h"
48
+ #endif
49
+
50
+ #define EXTERN  // expands to nothing
51
+ EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");  // gates the word-duplication (rebalance) loop in tidy_up()
52
+ EXTERN INT_VAR (applybox_debug, 5, "Debug level");  // diagnostics in this file are printed when this exceeds 0, 4 or 9
53
+ EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");  // passed to read_next_box() as its page argument
54
+ EXTERN STRING_VAR (applybox_test_exclusions, "",
55
+ "Chars ignored for testing");  // words labelled with one of these chars are skipped by apply_box_testing()
56
+ EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");  // only referenced inside the disabled (#if 0) checks in resegment_box()
57
+
58
+ // The unicharset used during box training
59
+ static UNICHARSET unicharset_boxes;  // cleared and re-seeded with " " at the start of apply_boxes()
60
+
61
+ static void PrintString(const char* str) {
62
+ tprintf("%s:", str);
63
+ int step = 0;
64
+ for (int i = 0; str[i]; i += step) {
65
+ step = UNICHAR::utf8_step(str + i);
66
+ if (step == 0)
67
+ step = 1;
68
+ UNICHAR ch(str + i, step);
69
+ tprintf("[%x]", ch.first_uni());
70
+ }
71
+ tprintf("\n", str);
72
+ }
73
+
74
+ /*************************************************************************
75
+ * The code re-assigns outlines to form words each with ONE labelled blob.
76
+ * Noise is left in UNLABELLED words. The chars on the page are checked crudely
77
+ * for sensible position relative to baseline and xht. Failed boxes are
78
+ * compensated for by duplicating other believable instances of the character.
79
+ *
80
+ * The box file is assumed to contain box definitions, one per line, of the
81
+ * following format:
82
+ * <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
83
+ *
84
+ * The approach taken is to search the WHOLE page for stuff overlapping each box.
85
+ * - This is not too inefficient and is SAFE.
86
+ * - We can detect overlapping blobs as we will be attempting to put a blob
87
+ * from a LABELLED word into the current word.
88
+ * - When all the boxes have been processed we can detect any stuff which is
89
+ * being ignored - it is the unlabelled words left on the page.
90
+ *
91
+ * A box should only overlap one row.
92
+ *
93
+ * A warning is given if the box is on the same row as the previous box, but NOT
94
+ * on the same row as the previous blob.
95
+ *
96
+ * Any OUTLINE which overlaps the box is put into the new word.
97
+ *
98
+ * ascender chars must ascend above xht significantly
99
+ * xht chars must not rise above row xht significantly
100
+ * bl chars must not descend below baseline significantly
101
+ * descender chars must descend below baseline significantly
102
+ *
103
+ * ?? Certain chars are DROPPED - to limit the training data.
104
+ *
105
+ *************************************************************************/
106
+
107
+ void apply_boxes(BLOCK_LIST *block_list //real blocks
108
+ ) {
109
+ inT16 boxfile_lineno = 0;
110
+ inT16 boxfile_charno = 0;
111
+ TBOX box; //boxfile box
112
+ UNICHAR_ID uch_id; //correct ch from boxfile
113
+ ROW *row;
114
+ ROW *prev_row = NULL;
115
+ inT16 prev_box_right = MAX_INT16;
116
+ inT16 block_id;
117
+ inT16 row_id;
118
+ inT16 box_count = 0;
119
+ inT16 box_failures = 0;
120
+ inT16 labels_ok;
121
+ inT16 rows_ok;
122
+ inT16 bad_blobs;
123
+ inT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples
124
+ // inT16 labelled_char_counts[128]; //No. of unique labelled samples
125
+ inT16 i;
126
+ inT16 rebalance_count = 0;
127
+ UNICHAR_ID min_uch_id;
128
+ inT16 min_samples;
129
+ inT16 final_labelled_blob_count;
130
+
131
+ // Clean the unichar set
132
+ unicharset_boxes.clear();
133
+ // Space character needed to represent NIL classification
134
+ unicharset_boxes.unichar_insert(" ");
135
+
136
+ for (i = 0; i < MAX_NUM_CLASSES; i++)
137
+ tgt_char_counts[i] = 0;
138
+
139
+ FILE* box_file;
140
+ STRING filename = imagefile;
141
+ filename += ".box";
142
+ if (!(box_file = fopen (filename.string(), "r"))) {
143
+ CANTOPENFILE.error ("read_next_box", EXIT,
144
+ "Cant open box file %s %d",
145
+ filename.string(), errno);
146
+ }
147
+
148
+ clear_any_old_text(block_list);
149
+ while (read_next_box(applybox_page, box_file, &box, &uch_id)) {
150
+ box_count++;
151
+ tgt_char_counts[uch_id]++;
152
+ row = find_row_of_box (block_list, box, block_id, row_id);
153
+ if (box.left () < prev_box_right) {
154
+ boxfile_lineno++;
155
+ boxfile_charno = 1;
156
+ }
157
+ else
158
+ boxfile_charno++;
159
+
160
+ if (row == NULL) {
161
+ box_failures++;
162
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
163
+ unicharset_boxes.id_to_unichar(uch_id),
164
+ "FAILURE! box overlaps no blobs or blobs in multiple rows");
165
+ }
166
+ else {
167
+ if ((box.left () >= prev_box_right) && (row != prev_row))
168
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
169
+ unicharset_boxes.id_to_unichar(uch_id),
170
+ "WARNING! false row break");
171
+ box_failures += resegment_box (row, box, uch_id, block_id, row_id,
172
+ boxfile_lineno, boxfile_charno);
173
+ prev_row = row;
174
+ }
175
+ prev_box_right = box.right ();
176
+ }
177
+ tidy_up(block_list,
178
+ labels_ok,
179
+ rows_ok,
180
+ bad_blobs,
181
+ tgt_char_counts,
182
+ rebalance_count,
183
+ &min_uch_id,
184
+ min_samples,
185
+ final_labelled_blob_count);
186
+ tprintf ("APPLY_BOXES:\n");
187
+ tprintf (" Boxes read from boxfile: %6d\n", box_count);
188
+ tprintf (" Initially labelled blobs: %6d in %d rows\n",
189
+ labels_ok, rows_ok);
190
+ tprintf (" Box failures detected: %6d\n", box_failures);
191
+ tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
192
+ tprintf (" \"%s\" has fewest samples:%6d\n",
193
+ unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
194
+ tprintf (" Total unlabelled words: %6d\n",
195
+ bad_blobs);
196
+ tprintf (" Final labelled words: %6d\n",
197
+ final_labelled_blob_count);
198
+ }
199
+
200
+
201
+ void clear_any_old_text( //remove correct text
202
+ BLOCK_LIST *block_list //real blocks
203
+ ) {
204
+ BLOCK_IT block_it(block_list);
205
+ ROW_IT row_it;
206
+ WERD_IT word_it;
207
+
208
+ for (block_it.mark_cycle_pt ();
209
+ !block_it.cycled_list (); block_it.forward ()) {
210
+ row_it.set_to_list (block_it.data ()->row_list ());
211
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
212
+ word_it.set_to_list (row_it.data ()->word_list ());
213
+ for (word_it.mark_cycle_pt ();
214
+ !word_it.cycled_list (); word_it.forward ()) {
215
+ word_it.data ()->set_text ("");
216
+ }
217
+ }
218
+ }
219
+ }
220
+
221
+
222
+ BOOL8 read_next_box(int page,
223
+ FILE* box_file, //
224
+ TBOX *box,
225
+ UNICHAR_ID *uch_id) {
226
+ int x_min;
227
+ int y_min;
228
+ int x_max;
229
+ int y_max;
230
+ char uch[kBoxReadBufSize];
231
+
232
+ while (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
233
+ if (!unicharset_boxes.contains_unichar(uch))
234
+ {
235
+ unicharset_boxes.unichar_insert(uch);
236
+ if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
237
+ tprintf("Error: Size of unicharset of boxes is "
238
+ "greater than MAX_NUM_CLASSES (%d)\n",
239
+ MAX_NUM_CLASSES);
240
+ exit(1);
241
+ }
242
+ }
243
+ *uch_id = unicharset_boxes.unichar_to_id(uch);
244
+ *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
245
+ return TRUE; //read a box ok
246
+ }
247
+ return FALSE; //EOF
248
+ }
249
+
250
+
251
+ ROW *find_row_of_box( //
252
+ BLOCK_LIST *block_list, //real blocks
253
+ TBOX box, //from boxfile
254
+ inT16 &block_id,
255
+ inT16 &row_id_to_process) {
256
+ BLOCK_IT block_it(block_list);
257
+ BLOCK *block;
258
+ ROW_IT row_it;
259
+ ROW *row;
260
+ ROW *row_to_process = NULL;
261
+ inT16 row_id;
262
+ WERD_IT word_it;
263
+ WERD *word;
264
+ BOOL8 polyg;
265
+ PBLOB_IT blob_it;
266
+ PBLOB *blob;
267
+ OUTLINE_IT outline_it;
268
+ OUTLINE *outline;
269
+
270
+ /*
271
+ Find row to process - error if box REALLY overlaps more than one row. (I.e
272
+ it overlaps blobs in the row - not just overlaps the bounding box of the
273
+ whole row.)
274
+ */
275
+
276
+ block_id = 0;
277
+ for (block_it.mark_cycle_pt ();
278
+ !block_it.cycled_list (); block_it.forward ()) {
279
+ block_id++;
280
+ row_id = 0;
281
+ block = block_it.data ();
282
+ if (block->bounding_box ().overlap (box)) {
283
+ row_it.set_to_list (block->row_list ());
284
+ for (row_it.mark_cycle_pt ();
285
+ !row_it.cycled_list (); row_it.forward ()) {
286
+ row_id++;
287
+ row = row_it.data ();
288
+ if (row->bounding_box ().overlap (box)) {
289
+ word_it.set_to_list (row->word_list ());
290
+ for (word_it.mark_cycle_pt ();
291
+ !word_it.cycled_list (); word_it.forward ()) {
292
+ word = word_it.data ();
293
+ polyg = word->flag (W_POLYGON);
294
+ if (word->bounding_box ().overlap (box)) {
295
+ blob_it.set_to_list (word->gblob_list ());
296
+ for (blob_it.mark_cycle_pt ();
297
+ !blob_it.cycled_list (); blob_it.forward ()) {
298
+ blob = blob_it.data ();
299
+ if (gblob_bounding_box (blob, polyg).
300
+ overlap (box)) {
301
+ outline_it.
302
+ set_to_list (gblob_out_list
303
+ (blob, polyg));
304
+ for (outline_it.mark_cycle_pt ();
305
+ !outline_it.cycled_list ();
306
+ outline_it.forward ()) {
307
+ outline = outline_it.data ();
308
+ if (goutline_bounding_box
309
+ (outline, polyg).major_overlap (box)) {
310
+ if ((row_to_process == NULL) ||
311
+ (row_to_process == row)) {
312
+ row_to_process = row;
313
+ row_id_to_process = row_id;
314
+ }
315
+ else
316
+ /* RETURN ERROR Box overlaps blobs in more than one row */
317
+ return NULL;
318
+ }
319
+ }
320
+ }
321
+ }
322
+ }
323
+ }
324
+ }
325
+ }
326
+ }
327
+ }
328
+ return row_to_process;
329
+ }
330
+
331
+
332
+ inT16 resegment_box( //
333
+ ROW *row,
334
+ TBOX box,
335
+ UNICHAR_ID uch_id,
336
+ inT16 block_id,
337
+ inT16 row_id,
338
+ inT16 boxfile_lineno,
339
+ inT16 boxfile_charno) {
340
+ WERD_IT word_it;
341
+ WERD *word;
342
+ WERD *new_word = NULL;
343
+ BOOL8 polyg = false;
344
+ PBLOB_IT blob_it;
345
+ PBLOB_IT new_blob_it;
346
+ PBLOB *blob;
347
+ PBLOB *new_blob;
348
+ OUTLINE_IT outline_it;
349
+ OUTLINE_LIST dummy; // Just to initialize new_outline_it.
350
+ OUTLINE_IT new_outline_it = &dummy;
351
+ OUTLINE *outline;
352
+ TBOX new_word_box;
353
+ float word_x_centre;
354
+ float baseline;
355
+ inT16 error_count = 0; //number of chars lost
356
+
357
+ word_it.set_to_list (row->word_list ());
358
+ for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
359
+ word = word_it.data ();
360
+ polyg = word->flag (W_POLYGON);
361
+ if (word->bounding_box ().overlap (box)) {
362
+ blob_it.set_to_list (word->gblob_list ());
363
+ for (blob_it.mark_cycle_pt ();
364
+ !blob_it.cycled_list (); blob_it.forward ()) {
365
+ blob = blob_it.data ();
366
+ if (gblob_bounding_box (blob, polyg).overlap (box)) {
367
+ outline_it.set_to_list (gblob_out_list (blob, polyg));
368
+ for (outline_it.mark_cycle_pt ();
369
+ !outline_it.cycled_list (); outline_it.forward ()) {
370
+ outline = outline_it.data ();
371
+ if (goutline_bounding_box (outline, polyg).
372
+ major_overlap (box)) {
373
+ if (strlen (word->text ()) > 0) {
374
+ if (error_count == 0) {
375
+ error_count = 1;
376
+ if (applybox_debug > 4)
377
+ report_failed_box (boxfile_lineno,
378
+ boxfile_charno,
379
+ box, unicharset_boxes.id_to_unichar(uch_id),
380
+ "FAILURE! box overlaps blob in labelled word");
381
+ }
382
+ if (applybox_debug > 4)
383
+ tprintf
384
+ ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
385
+ block_id, row_id,
386
+ word_it.data ()->text ());
387
+ word_it.data ()->set_text ("");
388
+ //UN label it
389
+ error_count++;
390
+ }
391
+
392
+ if (error_count == 0) {
393
+ if (new_word == NULL) {
394
+ /* Make a new word with a single blob */
395
+ new_word = word->shallow_copy ();
396
+ new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
397
+ if (polyg)
398
+ new_blob = new PBLOB;
399
+ else
400
+ new_blob = (PBLOB *) new C_BLOB;
401
+ new_blob_it.set_to_list (new_word->
402
+ gblob_list ());
403
+ new_blob_it.add_to_end (new_blob);
404
+ new_outline_it.
405
+ set_to_list (gblob_out_list
406
+ (new_blob, polyg));
407
+ }
408
+ new_outline_it.add_to_end (outline_it.
409
+ extract ());
410
+ //move blob
411
+ }
412
+ }
413
+ }
414
+ //no outlines in blob
415
+ if (outline_it.empty ())
416
+ //so delete blob
417
+ delete blob_it.extract ();
418
+ }
419
+ }
420
+ if (blob_it.empty ()) //no blobs in word
421
+ //so delete word
422
+ delete word_it.extract ();
423
+ }
424
+ }
425
+ if (error_count > 0)
426
+ return error_count;
427
+
428
+ if (new_word != NULL) {
429
+ gblob_sort_list (new_word->gblob_list (), polyg);
430
+ word_it.add_to_end (new_word);
431
+ new_word_box = new_word->bounding_box ();
432
+ word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
433
+ baseline = row->base_line (word_x_centre);
434
+
435
+ #if 0
436
+ if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
437
+ if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
438
+ (new_word_box.top () <
439
+ baseline + (1 + applybox_error_band) * row->x_height ())) {
440
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
441
+ unicharset_boxes.id_to_unichar(uch_id),
442
+ "FAILURE! caps-ht char didn't ascend");
443
+ new_word->set_text ("");
444
+ return 1;
445
+ }
446
+ if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
447
+ (new_word_box.top () <
448
+ baseline + (1 - applybox_error_band) * row->x_height ())) {
449
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
450
+ unicharset_boxes.id_to_unichar(uch_id),
451
+ "FAILURE! Odd top char below xht");
452
+ new_word->set_text ("");
453
+ return 1;
454
+ }
455
+ if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
456
+ ((new_word_box.top () >
457
+ baseline + (1 + applybox_error_band) * row->x_height ()) ||
458
+ (new_word_box.top () <
459
+ baseline + (1 - applybox_error_band) * row->x_height ()))) {
460
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
461
+ unicharset_boxes.id_to_unichar(uch_id),
462
+ "FAILURE! x-ht char didn't have top near xht");
463
+ new_word->set_text ("");
464
+ return 1;
465
+ }
466
+ if (STRING (chs_non_ambig_bl).contains
467
+ (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
468
+ ((new_word_box.bottom () <
469
+ baseline - applybox_error_band * row->x_height ()) ||
470
+ (new_word_box.bottom () >
471
+ baseline + applybox_error_band * row->x_height ()))) {
472
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
473
+ unicharset_boxes.id_to_unichar(uch_id),
474
+ "FAILURE! non ambig BL char didnt have bottom near baseline");
475
+ new_word->set_text ("");
476
+ return 1;
477
+ }
478
+ if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
479
+ (new_word_box.bottom () >
480
+ baseline + applybox_error_band * row->x_height ())) {
481
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
482
+ unicharset_boxes.id_to_unichar(uch_id),
483
+ "FAILURE! Odd bottom char above baseline");
484
+ new_word->set_text ("");
485
+ return 1;
486
+ }
487
+ if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
488
+ (new_word_box.bottom () >
489
+ baseline - applybox_error_band * row->x_height ())) {
490
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
491
+ unicharset_boxes.id_to_unichar(uch_id),
492
+ "FAILURE! Descender doesn't descend");
493
+ new_word->set_text ("");
494
+ return 1;
495
+ }
496
+ }
497
+ #endif
498
+ return 0;
499
+ }
500
+ else {
501
+ report_failed_box (boxfile_lineno, boxfile_charno, box,
502
+ unicharset_boxes.id_to_unichar(uch_id),
503
+ "FAILURE! Couldn't find any blobs");
504
+ return 1;
505
+ }
506
+ }
507
+
508
+
509
+ /*************************************************************************
510
+ * tidy_up()
511
+ * - report >1 block
512
+ * - sort the words in each row.
513
+ * - report any rows with no labelled words.
514
+ * - report any remaining unlabelled words
515
+ * - report total labelled words
516
+ *
517
+ *************************************************************************/
518
+ void tidy_up( //
519
+ BLOCK_LIST *block_list, //real blocks
520
+ inT16 &ok_char_count,
521
+ inT16 &ok_row_count,
522
+ inT16 &unlabelled_words,
523
+ inT16 *tgt_char_counts,
524
+ inT16 &rebalance_count,
525
+ UNICHAR_ID *min_uch_id,
526
+ inT16 &min_samples,
527
+ inT16 &final_labelled_blob_count) {
528
+ BLOCK_IT block_it(block_list);
529
+ ROW_IT row_it;
530
+ ROW *row;
531
+ WERD_IT word_it;
532
+ WERD *word;
533
+ WERD *duplicate_word;
534
+ inT16 block_idx = 0;
535
+ inT16 row_idx;
536
+ inT16 all_row_idx = 0;
537
+ BOOL8 row_ok;
538
+ BOOL8 rebalance_needed = FALSE;
539
+ //No. of unique labelled samples
540
+ inT16 labelled_char_counts[MAX_NUM_CLASSES];
541
+ inT16 i;
542
+ UNICHAR_ID uch_id;
543
+ UNICHAR_ID prev_uch_id = -1;
544
+ BOOL8 at_dupe_of_prev_word;
545
+ ROW *prev_row = NULL;
546
+ inT16 left;
547
+ inT16 prev_left = -1;
548
+
549
+ for (i = 0; i < MAX_NUM_CLASSES; i++)
550
+ labelled_char_counts[i] = 0;
551
+
552
+ ok_char_count = 0;
553
+ ok_row_count = 0;
554
+ unlabelled_words = 0;
555
+ if ((applybox_debug > 4) && (block_it.length () != 1))
556
+
557
+ tprintf ("APPLY_BOXES: More than one block??\n");
558
+
559
+ for (block_it.mark_cycle_pt ();
560
+ !block_it.cycled_list (); block_it.forward ()) {
561
+ block_idx++;
562
+ row_idx = 0;
563
+ row_ok = FALSE;
564
+ row_it.set_to_list (block_it.data ()->row_list ());
565
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
566
+ row_idx++;
567
+ all_row_idx++;
568
+ row = row_it.data ();
569
+ word_it.set_to_list (row->word_list ());
570
+ word_it.sort (word_comparator);
571
+ for (word_it.mark_cycle_pt ();
572
+ !word_it.cycled_list (); word_it.forward ()) {
573
+ word = word_it.data ();
574
+ if (strlen (word->text ()) == 0) {
575
+ unlabelled_words++;
576
+ if (applybox_debug > 4) {
577
+ tprintf
578
+ ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
579
+ block_idx, row_idx, all_row_idx);
580
+ }
581
+ }
582
+ else {
583
+ if (word->gblob_list ()->length () != 1)
584
+ tprintf
585
+ ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
586
+ block_idx, row_idx, all_row_idx);
587
+
588
+ ok_char_count++;
589
+ labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
590
+ row_ok = TRUE;
591
+ }
592
+ }
593
+ if ((applybox_debug > 4) && (!row_ok)) {
594
+ tprintf
595
+ ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
596
+ block_idx, row_idx, all_row_idx);
597
+ }
598
+ else
599
+ ok_row_count++;
600
+ }
601
+ }
602
+
603
+ min_samples = 9999;
604
+ for (i = 0; i < unicharset_boxes.size(); i++) {
605
+ if (tgt_char_counts[i] > labelled_char_counts[i]) {
606
+ if (labelled_char_counts[i] <= 1) {
607
+ tprintf
608
+ ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d:\n",
609
+ labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
610
+ PrintString(unicharset_boxes.id_to_unichar(i));
611
+ }
612
+ else {
613
+ rebalance_needed = TRUE;
614
+ if (applybox_debug > 0)
615
+ tprintf
616
+ ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
617
+ unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
618
+ }
619
+ }
620
+ if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
621
+ min_samples = labelled_char_counts[i];
622
+ *min_uch_id = i;
623
+ }
624
+ }
625
+
626
+ while (applybox_rebalance && rebalance_needed) {
627
+ block_it.set_to_list (block_list);
628
+ for (block_it.mark_cycle_pt ();
629
+ !block_it.cycled_list (); block_it.forward ()) {
630
+ row_it.set_to_list (block_it.data ()->row_list ());
631
+ for (row_it.mark_cycle_pt ();
632
+ !row_it.cycled_list (); row_it.forward ()) {
633
+ row = row_it.data ();
634
+ word_it.set_to_list (row->word_list ());
635
+ for (word_it.mark_cycle_pt ();
636
+ !word_it.cycled_list (); word_it.forward ()) {
637
+ word = word_it.data ();
638
+ left = word->bounding_box ().left ();
639
+ if (*word->text () != '\0')
640
+ uch_id = unicharset_boxes.unichar_to_id(word->text ());
641
+ else
642
+ uch_id = -1;
643
+ at_dupe_of_prev_word = ((row == prev_row) &&
644
+ (left = prev_left) &&
645
+ (uch_id == prev_uch_id));
646
+ if ((uch_id != -1) &&
647
+ (labelled_char_counts[uch_id] > 1) &&
648
+ (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
649
+ (!at_dupe_of_prev_word)) {
650
+ /* Duplicate the word to rebalance the labelled samples */
651
+ if (applybox_debug > 9) {
652
+ tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
653
+ word->bounding_box ().print ();
654
+ }
655
+ duplicate_word = new WERD;
656
+ *duplicate_word = *word;
657
+ word_it.add_after_then_move (duplicate_word);
658
+ rebalance_count++;
659
+ labelled_char_counts[uch_id]++;
660
+ }
661
+ prev_row = row;
662
+ prev_left = left;
663
+ prev_uch_id = uch_id;
664
+ }
665
+ }
666
+ }
667
+ rebalance_needed = FALSE;
668
+ for (i = 0; i < unicharset_boxes.size(); i++) {
669
+ if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
670
+ (labelled_char_counts[i] > 1)) {
671
+ rebalance_needed = TRUE;
672
+ break;
673
+ }
674
+ }
675
+ }
676
+
677
+ /* Now final check - count labelled blobs */
678
+ final_labelled_blob_count = 0;
679
+ block_it.set_to_list (block_list);
680
+ for (block_it.mark_cycle_pt ();
681
+ !block_it.cycled_list (); block_it.forward ()) {
682
+ row_it.set_to_list (block_it.data ()->row_list ());
683
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
684
+ row = row_it.data ();
685
+ word_it.set_to_list (row->word_list ());
686
+ word_it.sort (word_comparator);
687
+ for (word_it.mark_cycle_pt ();
688
+ !word_it.cycled_list (); word_it.forward ()) {
689
+ word = word_it.data ();
690
+ if ((strlen (word->text ()) > 0) &&
691
+ (word->gblob_list ()->length () == 1))
692
+ final_labelled_blob_count++;
693
+ }
694
+ }
695
+ }
696
+ }
697
+
698
+
699
+ void report_failed_box(inT16 boxfile_lineno,
700
+ inT16 boxfile_charno,
701
+ TBOX box,
702
+ const char *box_ch,
703
+ const char *err_msg) {
704
+ if (applybox_debug > 4)
705
+ tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
706
+ boxfile_lineno,
707
+ boxfile_charno,
708
+ box_ch,
709
+ box.left (), box.bottom (), box.right (), box.top (), err_msg);
710
+ }
711
+
712
+
713
+ void apply_box_training(BLOCK_LIST *block_list) {
714
+ BLOCK_IT block_it(block_list);
715
+ ROW_IT row_it;
716
+ ROW *row;
717
+ WERD_IT word_it;
718
+ WERD *word;
719
+ WERD *bln_word;
720
+ WERD copy_outword; // copy to denorm
721
+ PBLOB_IT blob_it;
722
+ DENORM denorm;
723
+ inT16 count = 0;
724
+ char unichar[UNICHAR_LEN + 1];
725
+
726
+ unichar[UNICHAR_LEN] = '\0';
727
+ tprintf ("Generating training data\n");
728
+ for (block_it.mark_cycle_pt ();
729
+ !block_it.cycled_list (); block_it.forward ()) {
730
+ row_it.set_to_list (block_it.data ()->row_list ());
731
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
732
+ row = row_it.data ();
733
+ word_it.set_to_list (row->word_list ());
734
+ for (word_it.mark_cycle_pt ();
735
+ !word_it.cycled_list (); word_it.forward ()) {
736
+ word = word_it.data ();
737
+ if ((strlen (word->text ()) > 0) &&
738
+ (word->gblob_list ()->length () == 1)) {
739
+ /* Here is a word with a single unichar label and a single blob so train on it */
740
+ bln_word =
741
+ make_bln_copy (word, row, row->x_height (), &denorm);
742
+ blob_it.set_to_list (bln_word->blob_list ());
743
+ strncpy(unichar, word->text (), UNICHAR_LEN);
744
+ tess_training_tester (blob_it.data (),
745
+ //single blob
746
+ &denorm, TRUE, //correct
747
+ unichar, //correct character
748
+ strlen(unichar), //character length
749
+ NULL);
750
+ copy_outword = *(bln_word);
751
+ copy_outword.baseline_denormalise (&denorm);
752
+ blob_it.set_to_list (copy_outword.blob_list ());
753
+ delete bln_word;
754
+ count++;
755
+ }
756
+ }
757
+ }
758
+ }
759
+ tprintf ("Generated training data for %d blobs\n", count);
760
+ }
761
+
762
+
763
+ void apply_box_testing(BLOCK_LIST *block_list) {
764
+ BLOCK_IT block_it(block_list);
765
+ ROW_IT row_it;
766
+ ROW *row;
767
+ inT16 row_count = 0;
768
+ WERD_IT word_it;
769
+ WERD *word;
770
+ WERD *bln_word;
771
+ inT16 word_count = 0;
772
+ PBLOB_IT blob_it;
773
+ DENORM denorm;
774
+ inT16 count = 0;
775
+ char ch[2];
776
+ WERD *outword; //bln best choice
777
+ //segmentation
778
+ WERD_CHOICE *best_choice; //tess output
779
+ WERD_CHOICE *raw_choice; //top choice permuter
780
+ //detailed results
781
+ BLOB_CHOICE_LIST_CLIST blob_choices;
782
+ inT16 char_count = 0;
783
+ inT16 correct_count = 0;
784
+ inT16 err_count = 0;
785
+ inT16 rej_count = 0;
786
+ #ifndef SECURE_NAMES
787
+ WERDSTATS wordstats; //As from newdiff
788
+ #endif
789
+ char tess_rej_str[3];
790
+ char tess_long_str[3];
791
+
792
+ ch[1] = '\0';
793
+ strcpy (tess_rej_str, "|A");
794
+ strcpy (tess_long_str, "|B");
795
+
796
+ for (block_it.mark_cycle_pt ();
797
+ !block_it.cycled_list (); block_it.forward ()) {
798
+ row_it.set_to_list (block_it.data ()->row_list ());
799
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
800
+ row = row_it.data ();
801
+ row_count++;
802
+ word_count = 0;
803
+ word_it.set_to_list (row->word_list ());
804
+ for (word_it.mark_cycle_pt ();
805
+ !word_it.cycled_list (); word_it.forward ()) {
806
+ word = word_it.data ();
807
+ word_count++;
808
+ if ((strlen (word->text ()) == 1) &&
809
+ !STRING (applybox_test_exclusions).contains (*word->text ())
810
+ && (word->gblob_list ()->length () == 1)) {
811
+ /* Here is a word with a single char label and a single blob so test it */
812
+ bln_word =
813
+ make_bln_copy (word, row, row->x_height (), &denorm);
814
+ blob_it.set_to_list (bln_word->blob_list ());
815
+ ch[0] = *word->text ();
816
+ char_count++;
817
+ best_choice = tess_segment_pass1 (bln_word,
818
+ &denorm,
819
+ tess_default_matcher,
820
+ raw_choice,
821
+ &blob_choices, outword);
822
+
823
+ /*
824
+ Test for TESS screw up on word. Recog_word has already ensured that the
825
+ choice list, outword blob lists and best_choice string are the same
826
+ length. A TESS screw up is indicated by a blank filled or 0 length string.
827
+ */
828
+ if ((best_choice->lengths ().length () == 0) ||
829
+ (strspn (best_choice->string ().string (), " ") ==
830
+ best_choice->string ().length ())) {
831
+ rej_count++;
832
+ tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
833
+ row_count, word_count, ch);
834
+ #ifndef SECURE_NAMES
835
+ wordstats.word (tess_rej_str, 2, ch, 1);
836
+ #endif
837
+ }
838
+ else {
839
+ if ((best_choice->lengths ().length () !=
840
+ outword->blob_list ()->length ()) ||
841
+ (best_choice->lengths ().length () !=
842
+ blob_choices.length ())) {
843
+ tprintf
844
+ ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
845
+ best_choice->string ().string (),
846
+ best_choice->lengths ().length (),
847
+ outword->blob_list ()->length (),
848
+ blob_choices.length ());
849
+ }
850
+ ASSERT_HOST (best_choice->lengths ().length () ==
851
+ outword->blob_list ()->length ());
852
+ ASSERT_HOST (best_choice->lengths ().length () ==
853
+ blob_choices.length ());
854
+ fix_quotes (best_choice,
855
+ //turn to double
856
+ outword, &blob_choices);
857
+ if (strcmp (best_choice->string ().string (), ch) != 0) {
858
+ err_count++;
859
+ tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
860
+ row_count, word_count, ch,
861
+ best_choice->string ().string ());
862
+ }
863
+ else
864
+ correct_count++;
865
+ #ifndef SECURE_NAMES
866
+ if (best_choice->string ().length () > 2)
867
+ wordstats.word (tess_long_str, 2, ch, 1);
868
+ else
869
+ wordstats.word ((char *) best_choice->string ().
870
+ string (),
871
+ best_choice->string ().length (), ch,
872
+ 1);
873
+ #endif
874
+ }
875
+ delete bln_word;
876
+ delete outword;
877
+ delete best_choice;
878
+ delete raw_choice;
879
+ blob_choices.deep_clear ();
880
+ count++;
881
+ }
882
+ }
883
+ }
884
+ }
885
+ #ifndef SECURE_NAMES
886
+ wordstats.print (1, 100.0);
887
+ wordstats.conf_matrix ();
888
+ tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
889
+ char_count, correct_count, rej_count, err_count);
890
+ #endif
891
+ }