tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1481 @@
1
+ /******************************************************************
2
+ * File: docqual.cpp (Formerly docqual.c)
3
+ * Description: Document Quality Metrics
4
+ * Author: Phil Cheatle
5
+ * Created: Mon May 9 11:27:28 BST 1994
6
+ *
7
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include <ctype.h>
22
+ #include "docqual.h"
23
+ #include "tstruct.h"
24
+ #include "tfacep.h"
25
+ #include "reject.h"
26
+ #include "tessvars.h"
27
+ #include "genblob.h"
28
+ #include "secname.h"
29
+ #include "globals.h"
30
+
31
+ #define EXTERN
32
+
33
+ EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
34
+ EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
35
+ "Non standard number of outlines");
36
+ EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
37
+ "Allow outline errs in unrejection?");
38
+ EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
39
+ "Reduce rejection on good docs");
40
+ EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
41
+ EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
42
+ "%rej allowed before rej whole doc");
43
+ EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
44
+ "%rej allowed before rej whole block");
45
+ EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
46
+ "%rej allowed before rej whole row");
47
+ EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
48
+ "%of row rejects in whole word rejects which prevents whole row rejection");
49
+ EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
50
+ "Only rej partially rejected words in block rejection");
51
+ EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
52
+ "Only rej partially rejected words in row rejection");
53
+ EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
54
+ "Use word segmentation quality metric");
55
+ EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
56
+ "Use word segmentation quality metric");
57
+ EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
58
+ "Only preserve wds longer than this");
59
+ EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
60
+ "Apply row rejection to good docs");
61
+ EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
62
+ "rej good doc wd if more than this fraction rejected");
63
+ EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
64
+ "Reject all bad quality wds");
65
+ EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
66
+ EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
67
+ "Output data to debug file");
68
+ EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
69
+ EXTERN double_VAR (quality_rowrej_pc, 1.1,
70
+ "good_quality_doc gte good char limit");
71
+
72
+ EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
73
+ "Mark v.bad words for tilde crunch");
74
+ EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
75
+ EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
76
+ "Take out ~^ early?");
77
+
78
+ EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
79
+ EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
80
+ EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
81
+ "crunch garbage cert lt this");
82
+ EXTERN double_VAR (crunch_poor_garbage_rate, 60,
83
+ "crunch garbage rating lt this");
84
+
85
+ EXTERN double_VAR (crunch_pot_poor_rate, 40,
86
+ "POTENTIAL crunch rating lt this");
87
+ EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
88
+ "POTENTIAL crunch cert lt this");
89
+ EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
90
+
91
+ EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
92
+ EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
93
+ EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
94
+ EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
95
+ EXTERN double_VAR (crunch_del_min_width, 3.0,
96
+ "Del if word width lt xht x this");
97
+ EXTERN double_VAR (crunch_del_high_word, 1.5,
98
+ "Del if word gt xht x this above bl");
99
+ EXTERN double_VAR (crunch_del_low_word, 0.5,
100
+ "Del if word gt xht x this below bl");
101
+ EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
102
+
103
+ EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
104
+ EXTERN INT_VAR (crunch_pot_indicators, 1,
105
+ "How many potential indicators needed");
106
+
107
+ EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
108
+ "Dont touch sensible strings");
109
+ EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
110
+ EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
111
+ "Dont pot crunch sensible strings");
112
+ EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
113
+ EXTERN INT_VAR (crunch_leave_lc_strings, 4,
114
+ "Dont crunch words with long lower case strings");
115
+ EXTERN INT_VAR (crunch_leave_uc_strings, 4,
116
+ "Dont crunch words with long lower case strings");
117
+ EXTERN INT_VAR (crunch_long_repetitions, 3,
118
+ "Crunch words with long repetitions");
119
+
120
+ EXTERN INT_VAR (crunch_debug, 0, "As it says");
121
+
122
+ /*************************************************************************
123
+ * word_blob_quality()
124
+ * How many blobs in the outword are identical to those of the inword?
125
+ * ASSUME blobs in both initial word and outword are in ascending order of
126
+ * left hand blob edge.
127
+ *************************************************************************/
128
+ inT16 word_blob_quality( //Blob seg changes
129
+ WERD_RES *word,
130
+ ROW *row) {
131
+ WERD *bln_word; //BL norm init word
132
+ TWERD *tessword; //tess format
133
+ WERD *init_word; //BL norm init word
134
+ PBLOB_IT outword_it;
135
+ PBLOB_IT initial_it;
136
+ inT16 i;
137
+ inT16 init_blobs_left;
138
+ inT16 match_count = 0;
139
+ BOOL8 matched;
140
+ TBOX out_box;
141
+ PBLOB *test_blob;
142
+ DENORM denorm;
143
+ float bln_xht;
144
+
145
+ if (word->word->gblob_list ()->empty ())
146
+ return 0;
147
+ //xht used for blnorm
148
+ bln_xht = bln_x_height / word->denorm.scale ();
149
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
150
+ /*
151
+ NOTE: Need to convert to tess format and back again to ensure that the
152
+ same float -> int rounding of coords is done to source wd as out wd before
153
+ comparison
154
+ */
155
+ // if (!bln_word->flag(W_POLYGON))
156
+ // tprintf( "NON POLYGON BLN WERD\n");
157
+ tessword = make_tess_word (bln_word, NULL);
158
+ //convert word
159
+ init_word = make_ed_word (tessword, bln_word);
160
+ // if (!init_word->flag(W_POLYGON))
161
+ // tprintf( "NON POLYGON INIT WERD\n");
162
+ // tprintf( "SOURCE BLOBS-AFTER TESS:\n");
163
+ // print_boxes( init_word );
164
+ // tprintf( "OUTPUT BLOBS:\n");
165
+ // print_boxes( word->outword );
166
+
167
+ initial_it.set_to_list (init_word->blob_list ());
168
+ init_blobs_left = initial_it.length ();
169
+ outword_it.set_to_list (word->outword->blob_list ());
170
+ delete bln_word;
171
+ delete_word(tessword); //get rid of it
172
+
173
+ for (outword_it.mark_cycle_pt ();
174
+ !outword_it.cycled_list (); outword_it.forward ()) {
175
+ out_box = outword_it.data ()->bounding_box ();
176
+
177
+ /* Skip any initial blobs LEFT of current outword blob */
178
+ while (!initial_it.at_last () &&
179
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
180
+ initial_it.forward ();
181
+ init_blobs_left--;
182
+ }
183
+
184
+ /* See if current outword blob matches any initial blob with the same left
185
+ coord. (Normally only one but possibly more - in unknown order) */
186
+
187
+ i = 0;
188
+ matched = FALSE;
189
+ do {
190
+ test_blob = initial_it.data_relative (i++);
191
+ matched = crude_match_blobs (test_blob, outword_it.data ());
192
+ if (matched)
193
+ match_count++;
194
+ }
195
+ while (!matched &&
196
+ (init_blobs_left - i > 0) &&
197
+ (i < 129) &&
198
+ !initial_it.at_last () &&
199
+ test_blob->bounding_box ().left () == out_box.left ());
200
+ }
201
+ delete init_word;
202
+ return match_count;
203
+ }
204
+
205
+
206
+ /*************************************************************************
207
+ * crude_match_blobs()
208
+ * Check bounding boxes are the same and the number of outlines are the same.
209
+ *************************************************************************/
210
+ BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
211
+ TBOX box1 = blob1->bounding_box ();
212
+ TBOX box2 = blob2->bounding_box ();
213
+
214
+ if (box1.contains (box2) &&
215
+ box2.contains (box1) &&
216
+ (blob1->out_list ()->length () == blob1->out_list ()->length ()))
217
+ return TRUE;
218
+ else
219
+ return FALSE;
220
+ }
221
+
222
+
223
+ inT16 word_outline_errs( //Outline count errs
224
+ WERD_RES *word) {
225
+ PBLOB_IT outword_it;
226
+ inT16 i = 0;
227
+ inT16 err_count = 0;
228
+
229
+ outword_it.set_to_list (word->outword->blob_list ());
230
+
231
+ for (outword_it.mark_cycle_pt ();
232
+ !outword_it.cycled_list (); outword_it.forward ()) {
233
+ err_count += count_outline_errs (word->best_choice->string ()[i],
234
+ outword_it.data ()->out_list ()->
235
+ length ());
236
+ i++;
237
+ }
238
+ return err_count;
239
+ }
240
+
241
+
242
+ /*************************************************************************
243
+ * word_char_quality()
244
+ * Combination of blob quality and outline quality - how many good chars are
245
+ * there? - I.e chars which pass the blob AND outline tests.
246
+ *************************************************************************/
247
+ void word_char_quality( //Blob seg changes
248
+ WERD_RES *word,
249
+ ROW *row,
250
+ inT16 *match_count,
251
+ inT16 *accepted_match_count) {
252
+ WERD *bln_word; //BL norm init word
253
+ TWERD *tessword; //tess format
254
+ WERD *init_word; //BL norm init word
255
+ PBLOB_IT outword_it;
256
+ PBLOB_IT initial_it;
257
+ inT16 i;
258
+ inT16 init_blobs_left;
259
+ BOOL8 matched;
260
+ TBOX out_box;
261
+ PBLOB *test_blob;
262
+ DENORM denorm;
263
+ float bln_xht;
264
+ inT16 j = 0;
265
+
266
+ *match_count = 0;
267
+ *accepted_match_count = 0;
268
+ if (word->word->gblob_list ()->empty ())
269
+ return;
270
+
271
+ //xht used for blnorm
272
+ bln_xht = bln_x_height / word->denorm.scale ();
273
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
274
+ /*
275
+ NOTE: Need to convert to tess format and back again to ensure that the
276
+ same float -> int rounding of coords is done to source wd as out wd before
277
+ comparison
278
+ */
279
+ tessword = make_tess_word (bln_word, NULL);
280
+ //convert word
281
+ init_word = make_ed_word (tessword, bln_word);
282
+ delete bln_word;
283
+ delete_word(tessword); //get rid of it
284
+ // tprintf( "SOURCE BLOBS-AFTER TESS:\n");
285
+ // print_boxes( init_word );
286
+ // tprintf( "OUTPUT BLOBS:\n");
287
+ // print_boxes( word->outword );
288
+
289
+ initial_it.set_to_list (init_word->blob_list ());
290
+ init_blobs_left = initial_it.length ();
291
+ outword_it.set_to_list (word->outword->blob_list ());
292
+
293
+ for (outword_it.mark_cycle_pt ();
294
+ !outword_it.cycled_list (); outword_it.forward ()) {
295
+ out_box = outword_it.data ()->bounding_box ();
296
+
297
+ /* Skip any initial blobs LEFT of current outword blob */
298
+ while (!initial_it.at_last () &&
299
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
300
+ initial_it.forward ();
301
+ init_blobs_left--;
302
+ }
303
+
304
+ /* See if current outword blob matches any initial blob with the same left
305
+ coord. (Normally only one but possibly more - in unknown order) */
306
+
307
+ i = 0;
308
+ matched = FALSE;
309
+ do {
310
+ test_blob = initial_it.data_relative (i++);
311
+ matched = crude_match_blobs (test_blob, outword_it.data ());
312
+ if (matched &&
313
+ (count_outline_errs (word->best_choice->string ()[j],
314
+ outword_it.data ()->out_list ()->length ())
315
+ == 0)) {
316
+ (*match_count)++;
317
+ if (word->reject_map[j].accepted ())
318
+ (*accepted_match_count)++;
319
+ }
320
+ }
321
+ while (!matched &&
322
+ (init_blobs_left - i > 0) &&
323
+ (i < 129) &&
324
+ !initial_it.at_last () &&
325
+ test_blob->bounding_box ().left () == out_box.left ());
326
+ j++;
327
+ }
328
+ delete init_word;
329
+ }
330
+
331
+
332
+ /*************************************************************************
333
+ * unrej_good_chs()
334
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
335
+ *************************************************************************/
336
+ void unrej_good_chs(WERD_RES *word, ROW *row) {
337
+ WERD *bln_word; //BL norm init word
338
+ TWERD *tessword; //tess format
339
+ WERD *init_word; //BL norm init word
340
+ PBLOB_IT outword_it;
341
+ PBLOB_IT initial_it;
342
+ inT16 i;
343
+ inT16 init_blobs_left;
344
+ BOOL8 matched;
345
+ TBOX out_box;
346
+ PBLOB *test_blob;
347
+ DENORM denorm;
348
+ float bln_xht;
349
+ inT16 j = 0;
350
+
351
+ if (word->word->gblob_list ()->empty ())
352
+ return;
353
+
354
+ //xht used for blnorm
355
+ bln_xht = bln_x_height / word->denorm.scale ();
356
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
357
+ /*
358
+ NOTE: Need to convert to tess format and back again to ensure that the
359
+ same float -> int rounding of coords is done to source wd as out wd before
360
+ comparison
361
+ */
362
+ tessword = make_tess_word (bln_word, NULL);
363
+ //convert word
364
+ init_word = make_ed_word (tessword, bln_word);
365
+ delete bln_word;
366
+ delete_word(tessword); //get rid of it
367
+
368
+ initial_it.set_to_list (init_word->blob_list ());
369
+ init_blobs_left = initial_it.length ();
370
+ outword_it.set_to_list (word->outword->blob_list ());
371
+
372
+ for (outword_it.mark_cycle_pt ();
373
+ !outword_it.cycled_list (); outword_it.forward ()) {
374
+ out_box = outword_it.data ()->bounding_box ();
375
+
376
+ /* Skip any initial blobs LEFT of current outword blob */
377
+ while (!initial_it.at_last () &&
378
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
379
+ initial_it.forward ();
380
+ init_blobs_left--;
381
+ }
382
+
383
+ /* See if current outword blob matches any initial blob with the same left
384
+ coord. (Normally only one but possibly more - in unknown order) */
385
+
386
+ i = 0;
387
+ matched = FALSE;
388
+ do {
389
+ test_blob = initial_it.data_relative (i++);
390
+ matched = crude_match_blobs (test_blob, outword_it.data ());
391
+ if (matched &&
392
+ (word->reject_map[j].accept_if_good_quality ()) &&
393
+ (docqual_excuse_outline_errs ||
394
+ (count_outline_errs (word->best_choice->string ()[j],
395
+ outword_it.data ()->out_list ()->
396
+ length ()) == 0)))
397
+ word->reject_map[j].setrej_quality_accept ();
398
+ }
399
+ while (!matched &&
400
+ (init_blobs_left - i > 0) &&
401
+ (i < 129) &&
402
+ !initial_it.at_last () &&
403
+ test_blob->bounding_box ().left () == out_box.left ());
404
+ j++;
405
+ }
406
+ delete init_word;
407
+ }
408
+
409
+
410
+ void print_boxes(WERD *word) {
411
+ PBLOB_IT it;
412
+ TBOX box;
413
+
414
+ it.set_to_list (word->blob_list ());
415
+ for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
416
+ box = it.data ()->bounding_box ();
417
+ box.print ();
418
+ }
419
+ }
420
+
421
+
422
+ inT16 count_outline_errs(char c, inT16 outline_count) {
423
+ int expected_outline_count;
424
+
425
+ if (STRING (outlines_odd).contains (c))
426
+ return 0; //Dont use this char
427
+ else if (STRING (outlines_2).contains (c))
428
+ expected_outline_count = 2;
429
+ else
430
+ expected_outline_count = 1;
431
+ return abs (outline_count - expected_outline_count);
432
+ }
433
+
434
+
435
+ void quality_based_rejection(PAGE_RES_IT &page_res_it,
436
+ BOOL8 good_quality_doc) {
437
+ if ((tessedit_good_quality_unrej && good_quality_doc))
438
+ unrej_good_quality_words(page_res_it);
439
+ doc_and_block_rejection(page_res_it, good_quality_doc);
440
+
441
+ page_res_it.restart_page ();
442
+ while (page_res_it.word () != NULL) {
443
+ insert_rej_cblobs (page_res_it.word ());
444
+ page_res_it.forward ();
445
+ }
446
+
447
+ if (unlv_tilde_crunching) {
448
+ tilde_crunch(page_res_it);
449
+ tilde_delete(page_res_it);
450
+ }
451
+ }
452
+
453
+
454
+ /*************************************************************************
455
+ * unrej_good_quality_words()
456
+ * Accept potential rejects in words which pass the following checks:
457
+ * - Contains a potential reject
458
+ * - Word looks like a sensible alpha word.
459
+ * - Word segmentation is the same as the original image
460
+ * - All characters have the expected number of outlines
461
+ * NOTE - the rejection counts are recalculated after unrejection
462
+ * - CANT do it in a single pass without a bit of fiddling
463
+ * - keep it simple but inefficient
464
+ *************************************************************************/
465
+ void unrej_good_quality_words( //unreject potential
466
+ PAGE_RES_IT &page_res_it) {
467
+ WERD_RES *word;
468
+ ROW_RES *current_row;
469
+ BLOCK_RES *current_block;
470
+ int i;
471
+
472
+ page_res_it.restart_page ();
473
+ while (page_res_it.word () != NULL) {
474
+ check_debug_pt (page_res_it.word (), 100);
475
+ if (bland_unrej) {
476
+ word = page_res_it.word ();
477
+ for (i = 0; i < word->reject_map.length (); i++) {
478
+ if (word->reject_map[i].accept_if_good_quality ())
479
+ word->reject_map[i].setrej_quality_accept ();
480
+ }
481
+ page_res_it.forward ();
482
+ }
483
+ else if ((page_res_it.row ()->char_count > 0) &&
484
+ ((page_res_it.row ()->rej_count /
485
+ (float) page_res_it.row ()->char_count) <=
486
+ quality_rowrej_pc)) {
487
+ word = page_res_it.word ();
488
+ if (word->reject_map.quality_recoverable_rejects () &&
489
+ (tessedit_unrej_any_wd ||
490
+ acceptable_word_string (word->best_choice->string ().string (),
491
+ word->best_choice->lengths().string())
492
+ != AC_UNACCEPTABLE)) {
493
+ unrej_good_chs (word, page_res_it.row ()->row);
494
+ }
495
+ page_res_it.forward ();
496
+ }
497
+ else {
498
+ /* Skip to end of dodgy row */
499
+ current_row = page_res_it.row ();
500
+ while ((page_res_it.word () != NULL) &&
501
+ (page_res_it.row () == current_row))
502
+ page_res_it.forward ();
503
+ }
504
+ check_debug_pt (page_res_it.word (), 110);
505
+ }
506
+ page_res_it.restart_page ();
507
+ page_res_it.page_res->char_count = 0;
508
+ page_res_it.page_res->rej_count = 0;
509
+ current_block = NULL;
510
+ current_row = NULL;
511
+ while (page_res_it.word () != NULL) {
512
+ if (current_block != page_res_it.block ()) {
513
+ current_block = page_res_it.block ();
514
+ current_block->char_count = 0;
515
+ current_block->rej_count = 0;
516
+ }
517
+ if (current_row != page_res_it.row ()) {
518
+ current_row = page_res_it.row ();
519
+ current_row->char_count = 0;
520
+ current_row->rej_count = 0;
521
+ current_row->whole_word_rej_count = 0;
522
+ }
523
+ page_res_it.rej_stat_word ();
524
+ page_res_it.forward ();
525
+ }
526
+ }
527
+
528
+
529
+ /*************************************************************************
530
+ * doc_and_block_rejection()
531
+ *
532
+ * If the page has too many rejects - reject all of it.
533
+ * If any block has too many rejects - reject all words in the block
534
+ *************************************************************************/
535
+
536
+ void doc_and_block_rejection( //reject big chunks
537
+ PAGE_RES_IT &page_res_it,
538
+ BOOL8 good_quality_doc) {
539
+ inT16 block_no = 0;
540
+ inT16 row_no = 0;
541
+ BLOCK_RES *current_block;
542
+ ROW_RES *current_row;
543
+
544
+ BOOL8 rej_word;
545
+ BOOL8 prev_word_rejected;
546
+ inT16 char_quality;
547
+ inT16 accepted_char_quality;
548
+
549
+ if ((page_res_it.page_res->rej_count * 100.0 /
550
+ page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
551
+ reject_whole_page(page_res_it);
552
+ #ifndef SECURE_NAMES
553
+ if (tessedit_debug_doc_rejection) {
554
+ tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
555
+ page_res_it.page_res->char_count,
556
+ page_res_it.page_res->rej_count);
557
+ }
558
+ #endif
559
+ }
560
+ else {
561
+ #ifndef SECURE_NAMES
562
+ if (tessedit_debug_doc_rejection)
563
+ tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
564
+ page_res_it.page_res->char_count,
565
+ page_res_it.page_res->rej_count);
566
+ #endif
567
+
568
+ /* Walk blocks testing for block rejection */
569
+
570
+ page_res_it.restart_page ();
571
+ while (page_res_it.word () != NULL) {
572
+ current_block = page_res_it.block ();
573
+ if (current_block->block->text_region () != NULL)
574
+ block_no = current_block->block->text_region ()->id_no ();
575
+ else
576
+ block_no = -1;
577
+ if ((page_res_it.block ()->char_count > 0) &&
578
+ ((page_res_it.block ()->rej_count * 100.0 /
579
+ page_res_it.block ()->char_count) >
580
+ tessedit_reject_block_percent)) {
581
+ #ifndef SECURE_NAMES
582
+ if (tessedit_debug_block_rejection)
583
+ tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
584
+ block_no,
585
+ page_res_it.block ()->char_count,
586
+ page_res_it.block ()->rej_count);
587
+ #endif
588
+ prev_word_rejected = FALSE;
589
+ while ((page_res_it.word () != NULL) &&
590
+ (page_res_it.block () == current_block)) {
591
+ if (tessedit_preserve_blk_rej_perfect_wds) {
592
+ rej_word =
593
+ (page_res_it.word ()->reject_map.reject_count () > 0)
594
+ || (page_res_it.word ()->reject_map.length () <
595
+ tessedit_preserve_min_wd_len);
596
+ if (rej_word && tessedit_dont_blkrej_good_wds
597
+ && !(page_res_it.word ()->reject_map.length () <
598
+ tessedit_preserve_min_wd_len)
599
+ &&
600
+ (acceptable_word_string
601
+ (page_res_it.word ()->best_choice->string ().
602
+ string (),
603
+ page_res_it.word ()->best_choice->lengths ().
604
+ string ()) != AC_UNACCEPTABLE)) {
605
+ word_char_quality (page_res_it.word (),
606
+ page_res_it.row ()->row,
607
+ &char_quality,
608
+ &accepted_char_quality);
609
+ rej_word = char_quality !=
610
+ page_res_it.word ()->reject_map.length ();
611
+ }
612
+ }
613
+ else
614
+ rej_word = TRUE;
615
+ if (rej_word) {
616
+ /*
617
+ Reject spacing if both current and prev words are rejected.
618
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
619
+ more space errors.
620
+ */
621
+ if (tessedit_use_reject_spaces &&
622
+ prev_word_rejected &&
623
+ (page_res_it.prev_row () == page_res_it.row ()) &&
624
+ (page_res_it.word ()->word->space () == 1))
625
+ page_res_it.word ()->reject_spaces = TRUE;
626
+ page_res_it.word ()->reject_map.rej_word_block_rej ();
627
+ }
628
+ prev_word_rejected = rej_word;
629
+ page_res_it.forward ();
630
+ }
631
+ }
632
+ else {
633
+ #ifndef SECURE_NAMES
634
+ if (tessedit_debug_block_rejection)
635
+ tprintf
636
+ ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
637
+ block_no, page_res_it.block ()->char_count,
638
+ page_res_it.block ()->rej_count);
639
+ #endif
640
+
641
+ /* Walk rows in block testing for row rejection */
642
+ row_no = 0;
643
+ while ((page_res_it.word () != NULL) &&
644
+ (page_res_it.block () == current_block)) {
645
+ current_row = page_res_it.row ();
646
+ row_no++;
647
+ /* Reject whole row if:
648
+ fraction of chars on row which are rejected exceed a limit AND
649
+ fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
650
+ */
651
+ if ((page_res_it.row ()->char_count > 0) &&
652
+ ((page_res_it.row ()->rej_count * 100.0 /
653
+ page_res_it.row ()->char_count) >
654
+ tessedit_reject_row_percent) &&
655
+ ((page_res_it.row ()->whole_word_rej_count * 100.0 /
656
+ page_res_it.row ()->rej_count) <
657
+ tessedit_whole_wd_rej_row_percent)) {
658
+ #ifndef SECURE_NAMES
659
+ if (tessedit_debug_block_rejection)
660
+ tprintf
661
+ ("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
662
+ row_no, page_res_it.row ()->char_count,
663
+ page_res_it.row ()->rej_count);
664
+ #endif
665
+ prev_word_rejected = FALSE;
666
+ while ((page_res_it.word () != NULL) &&
667
+ (page_res_it.row () == current_row)) {
668
+ /* Preserve words on good docs unless they are mostly rejected*/
669
+ if (!tessedit_row_rej_good_docs && good_quality_doc) {
670
+ rej_word =
671
+ page_res_it.word ()->reject_map.
672
+ reject_count () /
673
+ (float) page_res_it.word ()->reject_map.
674
+ length () > tessedit_good_doc_still_rowrej_wd;
675
+ }
676
+
677
+ /* Preserve perfect words anyway */
678
+ else if (tessedit_preserve_row_rej_perfect_wds) {
679
+ rej_word =
680
+ (page_res_it.word ()->reject_map.
681
+ reject_count () > 0)
682
+ || (page_res_it.word ()->reject_map.
683
+ length () < tessedit_preserve_min_wd_len);
684
+ if (rej_word && tessedit_dont_rowrej_good_wds
685
+ && !(page_res_it.word ()->reject_map.
686
+ length () <
687
+ tessedit_preserve_min_wd_len)
688
+ &&
689
+ (acceptable_word_string
690
+ (page_res_it.word ()->best_choice->
691
+ string ().string (),
692
+ page_res_it.word ()->best_choice->
693
+ lengths ().string ()) != AC_UNACCEPTABLE)) {
694
+ word_char_quality (page_res_it.word (),
695
+ page_res_it.row ()->row,
696
+ &char_quality,
697
+ &accepted_char_quality);
698
+ rej_word = char_quality !=
699
+ page_res_it.word ()->reject_map.length ();
700
+ }
701
+ }
702
+ else
703
+ rej_word = TRUE;
704
+ if (rej_word) {
705
+ /*
706
+ Reject spacing if both current and prev words are rejected.
707
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
708
+ more space errors.
709
+ */
710
+ if (tessedit_use_reject_spaces &&
711
+ prev_word_rejected &&
712
+ (page_res_it.prev_row () ==
713
+ page_res_it.row ())
714
+ && (page_res_it.word ()->word->space () ==
715
+ 1))
716
+ page_res_it.word ()->reject_spaces = TRUE;
717
+ page_res_it.word ()->reject_map.
718
+ rej_word_row_rej();
719
+ }
720
+ prev_word_rejected = rej_word;
721
+ page_res_it.forward ();
722
+ }
723
+ }
724
+ else {
725
+ #ifndef SECURE_NAMES
726
+ if (tessedit_debug_block_rejection)
727
+ tprintf
728
+ ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
729
+ row_no, page_res_it.row ()->char_count,
730
+ page_res_it.row ()->rej_count);
731
+ #endif
732
+ while ((page_res_it.word () != NULL) &&
733
+ (page_res_it.row () == current_row))
734
+ page_res_it.forward ();
735
+ }
736
+ }
737
+ }
738
+ }
739
+ }
740
+ }
741
+
742
+
743
+ /*************************************************************************
744
+ * reject_whole_page()
745
+ * Dont believe any of it - set the reject map to 00..00 in all words
746
+ *
747
+ *************************************************************************/
748
+
749
+ void reject_whole_page(PAGE_RES_IT &page_res_it) {
750
+ page_res_it.restart_page ();
751
+ while (page_res_it.word () != NULL) {
752
+ page_res_it.word ()->reject_map.rej_word_doc_rej ();
753
+ page_res_it.forward ();
754
+ }
755
+ //whole page is rejected
756
+ page_res_it.page_res->rejected = TRUE;
757
+ }
758
+
759
+
760
+ void tilde_crunch(PAGE_RES_IT &page_res_it) {
761
+ WERD_RES *word;
762
+ GARBAGE_LEVEL garbage_level;
763
+ PAGE_RES_IT copy_it;
764
+ BOOL8 prev_potential_marked = FALSE;
765
+ BOOL8 found_terrible_word = FALSE;
766
+ int dict_type;
767
+ BOOL8 ok_dict_word;
768
+
769
+ page_res_it.restart_page ();
770
+ while (page_res_it.word () != NULL) {
771
+ word = page_res_it.word ();
772
+
773
+ if (crunch_early_convert_bad_unlv_chs)
774
+ convert_bad_unlv_chs(word);
775
+
776
+ if (crunch_early_merge_tess_fails)
777
+ merge_tess_fails(word);
778
+
779
+ if (word->reject_map.accept_count () != 0) {
780
+ found_terrible_word = FALSE;
781
+ //Forget earlier potential crunches
782
+ prev_potential_marked = FALSE;
783
+ }
784
+ else {
785
+ dict_type = dict_word (word->best_choice->string ().string ());
786
+ ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
787
+ garbage_level = garbage_word (word, ok_dict_word);
788
+
789
+ if ((garbage_level != G_NEVER_CRUNCH) &&
790
+ (terrible_word_crunch (word, garbage_level))) {
791
+ if (crunch_debug > 0) {
792
+ tprintf ("T CRUNCHING: \"%s\"\n",
793
+ word->best_choice->string ().string ());
794
+ }
795
+ word->unlv_crunch_mode = CR_KEEP_SPACE;
796
+ if (prev_potential_marked) {
797
+ while (copy_it.word () != word) {
798
+ if (crunch_debug > 0) {
799
+ tprintf ("P1 CRUNCHING: \"%s\"\n",
800
+ copy_it.word ()->best_choice->string ().
801
+ string ());
802
+ }
803
+ copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
804
+ copy_it.forward ();
805
+ }
806
+ prev_potential_marked = FALSE;
807
+ }
808
+ found_terrible_word = TRUE;
809
+ }
810
+ else if ((garbage_level != G_NEVER_CRUNCH) &&
811
+ (potential_word_crunch (word,
812
+ garbage_level, ok_dict_word))) {
813
+ if (found_terrible_word) {
814
+ if (crunch_debug > 0) {
815
+ tprintf ("P2 CRUNCHING: \"%s\"\n",
816
+ word->best_choice->string ().string ());
817
+ }
818
+ word->unlv_crunch_mode = CR_KEEP_SPACE;
819
+ }
820
+ else if (!prev_potential_marked) {
821
+ copy_it = page_res_it;
822
+ prev_potential_marked = TRUE;
823
+ if (crunch_debug > 1) {
824
+ tprintf ("P3 CRUNCHING: \"%s\"\n",
825
+ word->best_choice->string ().string ());
826
+ }
827
+ }
828
+ }
829
+ else {
830
+ found_terrible_word = FALSE;
831
+ //Forget earlier potential crunches
832
+ prev_potential_marked = FALSE;
833
+ if (crunch_debug > 2) {
834
+ tprintf ("NO CRUNCH: \"%s\"\n",
835
+ word->best_choice->string ().string ());
836
+ }
837
+ }
838
+ }
839
+ page_res_it.forward ();
840
+ }
841
+ }
842
+
843
+
844
+ BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
845
+ float rating_per_ch;
846
+ int adjusted_len;
847
+ int crunch_mode = 0;
848
+
849
+ if ((word->best_choice->string ().length () == 0) ||
850
+ (strspn (word->best_choice->string ().string (), " ") ==
851
+ word->best_choice->string ().length ()))
852
+ crunch_mode = 1;
853
+ else {
854
+ adjusted_len = word->reject_map.length ();
855
+ if (adjusted_len > crunch_rating_max)
856
+ adjusted_len = crunch_rating_max;
857
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
858
+
859
+ if (rating_per_ch > crunch_terrible_rating)
860
+ crunch_mode = 2;
861
+ else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
862
+ crunch_mode = 3;
863
+ else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
864
+ (garbage_level != G_OK))
865
+ crunch_mode = 4;
866
+ else if ((rating_per_ch > crunch_poor_garbage_rate) &&
867
+ (garbage_level != G_OK))
868
+ crunch_mode = 5;
869
+ }
870
+ if (crunch_mode > 0) {
871
+ if (crunch_debug > 2) {
872
+ tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
873
+ crunch_mode, word->best_choice->string ().string ());
874
+ }
875
+ return TRUE;
876
+ }
877
+ else
878
+ return FALSE;
879
+ }
880
+
881
+
882
+ BOOL8 potential_word_crunch(WERD_RES *word,
883
+ GARBAGE_LEVEL garbage_level,
884
+ BOOL8 ok_dict_word) {
885
+ float rating_per_ch;
886
+ int adjusted_len;
887
+ const char *str = word->best_choice->string ().string ();
888
+ const char *lengths = word->best_choice->lengths ().string ();
889
+ BOOL8 word_crunchable;
890
+ int poor_indicator_count = 0;
891
+
892
+ word_crunchable =
893
+ !crunch_leave_accept_strings ||
894
+ (word->reject_map.length () < 3) ||
895
+ ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) &&
896
+ !ok_dict_word);
897
+
898
+ adjusted_len = word->reject_map.length ();
899
+ if (adjusted_len > 10)
900
+ adjusted_len = 10;
901
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
902
+
903
+ if (rating_per_ch > crunch_pot_poor_rate) {
904
+ if (crunch_debug > 2) {
905
+ tprintf ("Potential poor rating on \"%s\"\n",
906
+ word->best_choice->string ().string ());
907
+ }
908
+ poor_indicator_count++;
909
+ }
910
+
911
+ if (word_crunchable &&
912
+ (word->best_choice->certainty () < crunch_pot_poor_cert)) {
913
+ if (crunch_debug > 2) {
914
+ tprintf ("Potential poor cert on \"%s\"\n",
915
+ word->best_choice->string ().string ());
916
+ }
917
+ poor_indicator_count++;
918
+ }
919
+
920
+ if (garbage_level != G_OK) {
921
+ if (crunch_debug > 2) {
922
+ tprintf ("Potential garbage on \"%s\"\n",
923
+ word->best_choice->string ().string ());
924
+ }
925
+ poor_indicator_count++;
926
+ }
927
+ return (poor_indicator_count >= crunch_pot_indicators);
928
+ }
929
+
930
+
931
+ void tilde_delete(PAGE_RES_IT &page_res_it) {
932
+ WERD_RES *word;
933
+ PAGE_RES_IT copy_it;
934
+ BOOL8 deleting_from_bol = FALSE;
935
+ BOOL8 marked_delete_point = FALSE;
936
+ inT16 debug_delete_mode;
937
+ CRUNCH_MODE delete_mode;
938
+ inT16 x_debug_delete_mode;
939
+ CRUNCH_MODE x_delete_mode;
940
+
941
+ page_res_it.restart_page ();
942
+ while (page_res_it.word () != NULL) {
943
+ word = page_res_it.word ();
944
+
945
+ delete_mode = word_deletable (word, debug_delete_mode);
946
+ if (delete_mode != CR_NONE) {
947
+ if (word->word->flag (W_BOL) || deleting_from_bol) {
948
+ if (crunch_debug > 0) {
949
+ tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
950
+ debug_delete_mode,
951
+ word->best_choice->string ().string ());
952
+ }
953
+ word->unlv_crunch_mode = delete_mode;
954
+ deleting_from_bol = TRUE;
955
+ }
956
+ else if (word->word->flag (W_EOL)) {
957
+ if (marked_delete_point) {
958
+ while (copy_it.word () != word) {
959
+ x_delete_mode = word_deletable (copy_it.word (),
960
+ x_debug_delete_mode);
961
+ if (crunch_debug > 0) {
962
+ tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
963
+ x_debug_delete_mode,
964
+ copy_it.word ()->best_choice->string ().
965
+ string ());
966
+ }
967
+ copy_it.word ()->unlv_crunch_mode = x_delete_mode;
968
+ copy_it.forward ();
969
+ }
970
+ }
971
+ if (crunch_debug > 0) {
972
+ tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
973
+ debug_delete_mode,
974
+ word->best_choice->string ().string ());
975
+ }
976
+ word->unlv_crunch_mode = delete_mode;
977
+ deleting_from_bol = FALSE;
978
+ marked_delete_point = FALSE;
979
+ }
980
+ else {
981
+ if (!marked_delete_point) {
982
+ copy_it = page_res_it;
983
+ marked_delete_point = TRUE;
984
+ }
985
+ }
986
+ }
987
+ else {
988
+ deleting_from_bol = FALSE;
989
+ //Forget earlier potential crunches
990
+ marked_delete_point = FALSE;
991
+ }
992
+ /*
993
+ The following step has been left till now as the tess fails are used to
994
+ determine if the word is deletable.
995
+ */
996
+ if (!crunch_early_merge_tess_fails)
997
+ merge_tess_fails(word);
998
+ page_res_it.forward ();
999
+ }
1000
+ }
1001
+
1002
+
1003
+ void convert_bad_unlv_chs( //word to do
1004
+ WERD_RES *word_res) {
1005
+ char *ptr; //string ptr
1006
+ int i;
1007
+ int offset;
1008
+
1009
+ ptr = (char *) word_res->best_choice->string ().string ();
1010
+ for (i = 0, offset = 0; i < word_res->reject_map.length ();
1011
+ offset += word_res->best_choice->lengths ()[i++]) {
1012
+ if (word_res->best_choice->lengths ()[i] == 1 &&
1013
+ ptr[offset] == '~') {
1014
+ ptr[offset] = '-';
1015
+ if (word_res->reject_map[i].accepted ())
1016
+ word_res->reject_map[i].setrej_unlv_rej ();
1017
+ }
1018
+ if (word_res->best_choice->lengths ()[i] == 1 &&
1019
+ ptr[offset] == '^') {
1020
+ ptr[offset] = ' ';
1021
+ if (word_res->reject_map[i].accepted ())
1022
+ word_res->reject_map[i].setrej_unlv_rej ();
1023
+ }
1024
+ }
1025
+ }
1026
+
1027
+
1028
+ /**********************************************************************
1029
+ * merge_tess_fails
1030
+ *
1031
+ * Change pairs of tess failures to a single one
1032
+ **********************************************************************/
1033
+
1034
+ void merge_tess_fails( //word to do
1035
+ WERD_RES *word_res) {
1036
+ char *ptr; //string ptr
1037
+ char *ptr_lengths; //lengths ptr
1038
+ PBLOB_IT blob_it; //blobs
1039
+ int i = 0;
1040
+ int len;
1041
+
1042
+ len = strlen (word_res->best_choice->lengths ().string ());
1043
+ ASSERT_HOST (word_res->reject_map.length () == len);
1044
+ ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1045
+
1046
+ ptr = (char *) word_res->best_choice->string ().string ();
1047
+ ptr_lengths = (char *) word_res->best_choice->lengths ().string ();
1048
+ blob_it = word_res->outword->blob_list ();
1049
+ while (*ptr != '\0') {
1050
+ if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
1051
+ strcpy (ptr + 1, ptr + 2); //shuffle up
1052
+ strcpy (ptr_lengths + 1, ptr_lengths + 2); //shuffle up
1053
+ word_res->reject_map.remove_pos (i);
1054
+ merge_blobs (blob_it.data_relative (1), blob_it.data ());
1055
+ delete blob_it.extract (); //get rid of spare
1056
+ }
1057
+ else {
1058
+ i++;
1059
+ ptr += *(ptr_lengths++);
1060
+ }
1061
+ blob_it.forward ();
1062
+ }
1063
+ len = strlen (word_res->best_choice->lengths ().string ());
1064
+ ASSERT_HOST (word_res->reject_map.length () == len);
1065
+ ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1066
+ }
1067
+
1068
+
1069
+ GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
1070
+ enum STATES
1071
+ {
1072
+ JUNK,
1073
+ FIRST_UPPER,
1074
+ FIRST_LOWER,
1075
+ FIRST_NUM,
1076
+ SUBSEQUENT_UPPER,
1077
+ SUBSEQUENT_LOWER,
1078
+ SUBSEQUENT_NUM
1079
+ };
1080
+ const char *str = word->best_choice->string ().string ();
1081
+ const char *lengths = word->best_choice->lengths ().string ();
1082
+ STATES state = JUNK;
1083
+ int len = 0;
1084
+ int isolated_digits = 0;
1085
+ int isolated_alphas = 0;
1086
+ int bad_char_count = 0;
1087
+ int tess_rejs = 0;
1088
+ int dodgy_chars = 0;
1089
+ int ok_chars;
1090
+ UNICHAR_ID last_char = -1;
1091
+ int alpha_repetition_count = 0;
1092
+ int longest_alpha_repetition_count = 0;
1093
+ int longest_lower_run_len = 0;
1094
+ int lower_string_count = 0;
1095
+ int longest_upper_run_len = 0;
1096
+ int upper_string_count = 0;
1097
+ int total_alpha_count = 0;
1098
+ int total_digit_count = 0;
1099
+
1100
+ for (; *str != '\0'; str += *(lengths++)) {
1101
+ len++;
1102
+ if (unicharset.get_isupper (str, *lengths)) {
1103
+ total_alpha_count++;
1104
+ switch (state) {
1105
+ case SUBSEQUENT_UPPER:
1106
+ case FIRST_UPPER:
1107
+ state = SUBSEQUENT_UPPER;
1108
+ upper_string_count++;
1109
+ if (longest_upper_run_len < upper_string_count)
1110
+ longest_upper_run_len = upper_string_count;
1111
+ if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1112
+ alpha_repetition_count++;
1113
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
1114
+ longest_alpha_repetition_count = alpha_repetition_count;
1115
+ }
1116
+ }
1117
+ else {
1118
+ last_char = unicharset.unichar_to_id(str, *lengths);
1119
+ alpha_repetition_count = 1;
1120
+ }
1121
+ break;
1122
+ case FIRST_NUM:
1123
+ isolated_digits++;
1124
+ default:
1125
+ state = FIRST_UPPER;
1126
+ last_char = unicharset.unichar_to_id(str, *lengths);
1127
+ alpha_repetition_count = 1;
1128
+ upper_string_count = 1;
1129
+ break;
1130
+ }
1131
+ }
1132
+ else if (unicharset.get_islower (str, *lengths)) {
1133
+ total_alpha_count++;
1134
+ switch (state) {
1135
+ case SUBSEQUENT_LOWER:
1136
+ case FIRST_LOWER:
1137
+ state = SUBSEQUENT_LOWER;
1138
+ lower_string_count++;
1139
+ if (longest_lower_run_len < lower_string_count)
1140
+ longest_lower_run_len = lower_string_count;
1141
+ if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1142
+ alpha_repetition_count++;
1143
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
1144
+ longest_alpha_repetition_count = alpha_repetition_count;
1145
+ }
1146
+ }
1147
+ else {
1148
+ last_char = unicharset.unichar_to_id(str, *lengths);
1149
+ alpha_repetition_count = 1;
1150
+ }
1151
+ break;
1152
+ case FIRST_NUM:
1153
+ isolated_digits++;
1154
+ default:
1155
+ state = FIRST_LOWER;
1156
+ last_char = unicharset.unichar_to_id(str, *lengths);
1157
+ alpha_repetition_count = 1;
1158
+ lower_string_count = 1;
1159
+ break;
1160
+ }
1161
+ }
1162
+ else if (unicharset.get_isdigit (str, *lengths)) {
1163
+ total_digit_count++;
1164
+ switch (state) {
1165
+ case FIRST_NUM:
1166
+ state = SUBSEQUENT_NUM;
1167
+ case SUBSEQUENT_NUM:
1168
+ break;
1169
+ case FIRST_UPPER:
1170
+ case FIRST_LOWER:
1171
+ isolated_alphas++;
1172
+ default:
1173
+ state = FIRST_NUM;
1174
+ break;
1175
+ }
1176
+ }
1177
+ else {
1178
+ if (*lengths == 1 && *str == ' ')
1179
+ tess_rejs++;
1180
+ else
1181
+ bad_char_count++;
1182
+ switch (state) {
1183
+ case FIRST_NUM:
1184
+ isolated_digits++;
1185
+ break;
1186
+ case FIRST_UPPER:
1187
+ case FIRST_LOWER:
1188
+ isolated_alphas++;
1189
+ default:
1190
+ break;
1191
+ }
1192
+ state = JUNK;
1193
+ }
1194
+ }
1195
+
1196
+ switch (state) {
1197
+ case FIRST_NUM:
1198
+ isolated_digits++;
1199
+ break;
1200
+ case FIRST_UPPER:
1201
+ case FIRST_LOWER:
1202
+ isolated_alphas++;
1203
+ default:
1204
+ break;
1205
+ }
1206
+
1207
+ if (crunch_include_numerals) {
1208
+ total_alpha_count += total_digit_count - isolated_digits;
1209
+ }
1210
+
1211
+ if (crunch_leave_ok_strings &&
1212
+ (len >= 4) &&
1213
+ (2 * (total_alpha_count - isolated_alphas) > len) &&
1214
+ (longest_alpha_repetition_count < crunch_long_repetitions)) {
1215
+ if ((crunch_accept_ok &&
1216
+ (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
1217
+ (longest_lower_run_len > crunch_leave_lc_strings) ||
1218
+ (longest_upper_run_len > crunch_leave_uc_strings))
1219
+ return G_NEVER_CRUNCH;
1220
+ }
1221
+ if ((word->reject_map.length () > 1) &&
1222
+ (strpbrk (str, " ") == NULL) &&
1223
+ ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1224
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1225
+ (word->best_choice->permuter () == USER_DAWG_PERM) ||
1226
+ (word->best_choice->permuter () == NUMBER_PERM) ||
1227
+ (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word))
1228
+ return G_OK;
1229
+
1230
+ ok_chars = len - bad_char_count - isolated_digits -
1231
+ isolated_alphas - tess_rejs;
1232
+
1233
+ if (crunch_debug > 3) {
1234
+ tprintf ("garbage_word: \"%s\"\n",
1235
+ word->best_choice->string ().string ());
1236
+ tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
1237
+ len,
1238
+ bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
1239
+ }
1240
+ if ((bad_char_count == 0) &&
1241
+ (tess_rejs == 0) &&
1242
+ ((len > isolated_digits + isolated_alphas) || (len <= 2)))
1243
+ return G_OK;
1244
+
1245
+ if ((tess_rejs > ok_chars) ||
1246
+ ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
1247
+ return G_TERRIBLE;
1248
+
1249
+ if (len > 4) {
1250
+ dodgy_chars = 2 * tess_rejs + bad_char_count +
1251
+ isolated_digits + isolated_alphas;
1252
+ if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
1253
+ return G_DODGY;
1254
+ else
1255
+ return G_OK;
1256
+ }
1257
+ else {
1258
+ dodgy_chars = 2 * tess_rejs + bad_char_count;
1259
+ if (((len == 4) && (dodgy_chars > 2)) ||
1260
+ ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
1261
+ return G_DODGY;
1262
+ else
1263
+ return G_OK;
1264
+ }
1265
+ }
1266
+
1267
+
1268
+ /*************************************************************************
1269
+ * word_deletable()
1270
+ * DELETE WERDS AT ENDS OF ROWS IF
1271
+ * Word is crunched &&
1272
+ * ( string length = 0 OR
1273
+ * > 50% of chars are "|" (before merging) OR
1274
+ * certainty < -10 OR
1275
+ * rating /char > 60 OR
1276
+ * TOP of word is more than 0.5 xht BELOW baseline OR
1277
+ * BOTTOM of word is more than 0.5 xht ABOVE xht OR
1278
+ * length of word < 3xht OR
1279
+ * height of word < 0.7 xht OR
1280
+ * height of word > 3.0 xht OR
1281
+ * >75% of the outline BBs have longest dimension < 0.5xht
1282
+ *************************************************************************/
1283
+
1284
+ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
1285
+ int word_len = word->reject_map.length ();
1286
+ float rating_per_ch;
1287
+ TBOX box; //BB of word
1288
+
1289
+ if (word->unlv_crunch_mode == CR_NONE) {
1290
+ delete_mode = 0;
1291
+ return CR_NONE;
1292
+ }
1293
+
1294
+ if (word_len == 0) {
1295
+ delete_mode = 1;
1296
+ return CR_DELETE;
1297
+ }
1298
+
1299
+ box = word->outword->bounding_box ();
1300
+ if (box.height () < crunch_del_min_ht * bln_x_height) {
1301
+ delete_mode = 4;
1302
+ return CR_DELETE;
1303
+ }
1304
+
1305
+ if (noise_outlines (word->outword)) {
1306
+ delete_mode = 5;
1307
+ return CR_DELETE;
1308
+ }
1309
+
1310
+ if ((failure_count (word) * 1.5) > word_len) {
1311
+ delete_mode = 2;
1312
+ return CR_LOOSE_SPACE;
1313
+ }
1314
+
1315
+ if (word->best_choice->certainty () < crunch_del_cert) {
1316
+ delete_mode = 7;
1317
+ return CR_LOOSE_SPACE;
1318
+ }
1319
+
1320
+ rating_per_ch = word->best_choice->rating () / word_len;
1321
+
1322
+ if (rating_per_ch > crunch_del_rating) {
1323
+ delete_mode = 8;
1324
+ return CR_LOOSE_SPACE;
1325
+ }
1326
+
1327
+ if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
1328
+ delete_mode = 9;
1329
+ return CR_LOOSE_SPACE;
1330
+ }
1331
+
1332
+ if (box.bottom () >
1333
+ bln_baseline_offset + crunch_del_high_word * bln_x_height) {
1334
+ delete_mode = 10;
1335
+ return CR_LOOSE_SPACE;
1336
+ }
1337
+
1338
+ if (box.height () > crunch_del_max_ht * bln_x_height) {
1339
+ delete_mode = 11;
1340
+ return CR_LOOSE_SPACE;
1341
+ }
1342
+
1343
+ if (box.width () < crunch_del_min_width * bln_x_height) {
1344
+ delete_mode = 3;
1345
+ return CR_LOOSE_SPACE;
1346
+ }
1347
+
1348
+ delete_mode = 0;
1349
+ return CR_NONE;
1350
+ }
1351
+
1352
+
1353
+ inT16 failure_count(WERD_RES *word) {
1354
+ char *str = (char *) word->best_choice->string ().string ();
1355
+ int tess_rejs = 0;
1356
+
1357
+ for (; *str != '\0'; str++) {
1358
+ if (*str == ' ')
1359
+ tess_rejs++;
1360
+ }
1361
+ return tess_rejs;
1362
+ }
1363
+
1364
+
1365
+ BOOL8 noise_outlines(WERD *word) {
1366
+ PBLOB_IT blob_it;
1367
+ OUTLINE_IT outline_it;
1368
+ TBOX box; //BB of outline
1369
+ inT16 outline_count = 0;
1370
+ inT16 small_outline_count = 0;
1371
+ inT16 max_dimension;
1372
+ float small_limit = bln_x_height * crunch_small_outlines_size;
1373
+
1374
+ blob_it.set_to_list (word->blob_list ());
1375
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1376
+ outline_it.set_to_list (blob_it.data ()->out_list ());
1377
+ for (outline_it.mark_cycle_pt ();
1378
+ !outline_it.cycled_list (); outline_it.forward ()) {
1379
+ outline_count++;
1380
+ box = outline_it.data ()->bounding_box ();
1381
+ if (box.height () > box.width ())
1382
+ max_dimension = box.height ();
1383
+ else
1384
+ max_dimension = box.width ();
1385
+ if (max_dimension < small_limit)
1386
+ small_outline_count++;
1387
+ }
1388
+ }
1389
+ return (small_outline_count >= outline_count);
1390
+ }
1391
+
1392
+
1393
+ /*************************************************************************
1394
+ * insert_rej_cblobs()
1395
+ * Put rejected word blobs back into the outword.
1396
+ * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
1397
+ * OF ELEMENTS.
1398
+ *************************************************************************/
1399
+ void insert_rej_cblobs( //word to do
1400
+ WERD_RES *word) {
1401
+ PBLOB_IT blob_it; //blob iterator
1402
+ PBLOB_IT rej_blob_it;
1403
+ const STRING *word_str;
1404
+ const STRING *word_lengths;
1405
+ int old_len;
1406
+ int rej_len;
1407
+ char new_str[512 * UNICHAR_LEN];
1408
+ char new_lengths[512];
1409
+ REJMAP new_map;
1410
+ int i = 0; //new_str index
1411
+ int j = 0; //old_str index
1412
+ int i_offset = 0; //new_str offset
1413
+ int j_offset = 0; //old_str offset
1414
+ int new_len;
1415
+
1416
+ gblob_sort_list (word->outword->rej_blob_list (), TRUE);
1417
+ rej_blob_it.set_to_list (word->outword->rej_blob_list ());
1418
+ if (rej_blob_it.empty ())
1419
+ return;
1420
+ rej_len = rej_blob_it.length ();
1421
+ blob_it.set_to_list (word->outword->blob_list ());
1422
+ word_str = &(word->best_choice->string ());
1423
+ word_lengths = &(word->best_choice->lengths ());
1424
+ old_len = word->best_choice->lengths().length ();
1425
+ ASSERT_HOST (word->reject_map.length () == old_len);
1426
+ ASSERT_HOST (blob_it.length () == old_len);
1427
+ if ((old_len + rej_len) > 511)
1428
+ return; //Word is garbage anyway prevent abort
1429
+ new_map.initialise (old_len + rej_len);
1430
+
1431
+ while (!rej_blob_it.empty ()) {
1432
+ if ((j >= old_len) ||
1433
+ (rej_blob_it.data ()->bounding_box ().left () <=
1434
+ blob_it.data ()->bounding_box ().left ())) {
1435
+ /* Insert reject blob */
1436
+ if (j >= old_len)
1437
+ blob_it.add_to_end (rej_blob_it.extract ());
1438
+ else
1439
+ blob_it.add_before_stay_put (rej_blob_it.extract ());
1440
+ if (!rej_blob_it.empty ())
1441
+ rej_blob_it.forward ();
1442
+ new_str[i_offset] = ' ';
1443
+ new_lengths[i] = 1;
1444
+ new_map[i].setrej_rej_cblob ();
1445
+ i_offset += new_lengths[i++];
1446
+ }
1447
+ else {
1448
+ strncpy(new_str + i_offset, &(*word_str)[j_offset],
1449
+ (*word_lengths)[j]);
1450
+ new_lengths[i] = (*word_lengths)[j];
1451
+ new_map[i] = word->reject_map[j];
1452
+ i_offset += new_lengths[i++];
1453
+ j_offset += (*word_lengths)[j++];
1454
+ blob_it.forward ();
1455
+ }
1456
+ }
1457
+ /* Add any extra normal blobs to strings */
1458
+ while (j < word_lengths->length ()) {
1459
+ strncpy(new_str + i_offset, &(*word_str)[j_offset],
1460
+ (*word_lengths)[j]);
1461
+ new_lengths[i] = (*word_lengths)[j];
1462
+ new_map[i] = word->reject_map[j];
1463
+ i_offset += new_lengths[i++];
1464
+ j_offset += (*word_lengths)[j++];
1465
+ }
1466
+ new_str[i_offset] = '\0';
1467
+ new_lengths[i] = 0;
1468
+ /*
1469
+ tprintf(
1470
+ "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
1471
+ old_len, i, new_str, new_map );
1472
+ */
1473
+ ASSERT_HOST (i == blob_it.length ());
1474
+ ASSERT_HOST (i == old_len + rej_len);
1475
+ word->reject_map = new_map;
1476
+ *((STRING *) word_str) = new_str;
1477
+ *((STRING *) word_lengths) = new_lengths;
1478
+ new_len = word->best_choice->lengths ().length ();
1479
+ ASSERT_HOST (word->reject_map.length () == new_len);
1480
+ ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
1481
+ }