tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612)
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1481 @@
1
+ /******************************************************************
2
+ * File: docqual.cpp (Formerly docqual.c)
3
+ * Description: Document Quality Metrics
4
+ * Author: Phil Cheatle
5
+ * Created: Mon May 9 11:27:28 BST 1994
6
+ *
7
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include <ctype.h>
22
+ #include "docqual.h"
23
+ #include "tstruct.h"
24
+ #include "tfacep.h"
25
+ #include "reject.h"
26
+ #include "tessvars.h"
27
+ #include "genblob.h"
28
+ #include "secname.h"
29
+ #include "globals.h"
30
+
31
+ #define EXTERN
32
+
33
+ EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
34
+ EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
35
+ "Non standard number of outlines");
36
+ EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
37
+ "Allow outline errs in unrejection?");
38
+ EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
39
+ "Reduce rejection on good docs");
40
+ EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
41
+ EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
42
+ "%rej allowed before rej whole doc");
43
+ EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
44
+ "%rej allowed before rej whole block");
45
+ EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
46
+ "%rej allowed before rej whole row");
47
+ EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
48
+ "%of row rejects in whole word rejects which prevents whole row rejection");
49
+ EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
50
+ "Only rej partially rejected words in block rejection");
51
+ EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
52
+ "Only rej partially rejected words in row rejection");
53
+ EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
54
+ "Use word segmentation quality metric");
55
+ EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
56
+ "Use word segmentation quality metric");
57
+ EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
58
+ "Only preserve wds longer than this");
59
+ EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
60
+ "Apply row rejection to good docs");
61
+ EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
62
+ "rej good doc wd if more than this fraction rejected");
63
+ EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
64
+ "Reject all bad quality wds");
65
+ EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
66
+ EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
67
+ "Output data to debug file");
68
+ EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
69
+ EXTERN double_VAR (quality_rowrej_pc, 1.1,
70
+ "good_quality_doc gte good char limit");
71
+
72
+ EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
73
+ "Mark v.bad words for tilde crunch");
74
+ EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
75
+ EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
76
+ "Take out ~^ early?");
77
+
78
+ EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
79
+ EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
80
+ EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
81
+ "crunch garbage cert lt this");
82
+ EXTERN double_VAR (crunch_poor_garbage_rate, 60,
83
+ "crunch garbage rating lt this");
84
+
85
+ EXTERN double_VAR (crunch_pot_poor_rate, 40,
86
+ "POTENTIAL crunch rating lt this");
87
+ EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
88
+ "POTENTIAL crunch cert lt this");
89
+ EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
90
+
91
+ EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
92
+ EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
93
+ EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
94
+ EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
95
+ EXTERN double_VAR (crunch_del_min_width, 3.0,
96
+ "Del if word width lt xht x this");
97
+ EXTERN double_VAR (crunch_del_high_word, 1.5,
98
+ "Del if word gt xht x this above bl");
99
+ EXTERN double_VAR (crunch_del_low_word, 0.5,
100
+ "Del if word gt xht x this below bl");
101
+ EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
102
+
103
+ EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
104
+ EXTERN INT_VAR (crunch_pot_indicators, 1,
105
+ "How many potential indicators needed");
106
+
107
+ EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
108
+ "Dont touch sensible strings");
109
+ EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
110
+ EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
111
+ "Dont pot crunch sensible strings");
112
+ EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
113
+ EXTERN INT_VAR (crunch_leave_lc_strings, 4,
114
+ "Dont crunch words with long lower case strings");
115
+ EXTERN INT_VAR (crunch_leave_uc_strings, 4,
116
+ "Dont crunch words with long lower case strings");
117
+ EXTERN INT_VAR (crunch_long_repetitions, 3,
118
+ "Crunch words with long repetitions");
119
+
120
+ EXTERN INT_VAR (crunch_debug, 0, "As it says");
121
+
122
+ /*************************************************************************
123
+ * word_blob_quality()
124
+ * How many blobs in the outword are identical to those of the inword?
125
+ * ASSUME blobs in both initial word and outword are in ascending order of
126
+ * left hand blob edge.
127
+ *************************************************************************/
128
+ inT16 word_blob_quality( //Blob seg changes
129
+ WERD_RES *word,
130
+ ROW *row) {
131
+ WERD *bln_word; //BL norm init word
132
+ TWERD *tessword; //tess format
133
+ WERD *init_word; //BL norm init word
134
+ PBLOB_IT outword_it;
135
+ PBLOB_IT initial_it;
136
+ inT16 i;
137
+ inT16 init_blobs_left;
138
+ inT16 match_count = 0;
139
+ BOOL8 matched;
140
+ TBOX out_box;
141
+ PBLOB *test_blob;
142
+ DENORM denorm;
143
+ float bln_xht;
144
+
145
+ if (word->word->gblob_list ()->empty ())
146
+ return 0;
147
+ //xht used for blnorm
148
+ bln_xht = bln_x_height / word->denorm.scale ();
149
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
150
+ /*
151
+ NOTE: Need to convert to tess format and back again to ensure that the
152
+ same float -> int rounding of coords is done to source wd as out wd before
153
+ comparison
154
+ */
155
+ // if (!bln_word->flag(W_POLYGON))
156
+ // tprintf( "NON POLYGON BLN WERD\n");
157
+ tessword = make_tess_word (bln_word, NULL);
158
+ //convert word
159
+ init_word = make_ed_word (tessword, bln_word);
160
+ // if (!init_word->flag(W_POLYGON))
161
+ // tprintf( "NON POLYGON INIT WERD\n");
162
+ // tprintf( "SOURCE BLOBS-AFTER TESS:\n");
163
+ // print_boxes( init_word );
164
+ // tprintf( "OUTPUT BLOBS:\n");
165
+ // print_boxes( word->outword );
166
+
167
+ initial_it.set_to_list (init_word->blob_list ());
168
+ init_blobs_left = initial_it.length ();
169
+ outword_it.set_to_list (word->outword->blob_list ());
170
+ delete bln_word;
171
+ delete_word(tessword); //get rid of it
172
+
173
+ for (outword_it.mark_cycle_pt ();
174
+ !outword_it.cycled_list (); outword_it.forward ()) {
175
+ out_box = outword_it.data ()->bounding_box ();
176
+
177
+ /* Skip any initial blobs LEFT of current outword blob */
178
+ while (!initial_it.at_last () &&
179
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
180
+ initial_it.forward ();
181
+ init_blobs_left--;
182
+ }
183
+
184
+ /* See if current outword blob matches any initial blob with the same left
185
+ coord. (Normally only one but possibly more - in unknown order) */
186
+
187
+ i = 0;
188
+ matched = FALSE;
189
+ do {
190
+ test_blob = initial_it.data_relative (i++);
191
+ matched = crude_match_blobs (test_blob, outword_it.data ());
192
+ if (matched)
193
+ match_count++;
194
+ }
195
+ while (!matched &&
196
+ (init_blobs_left - i > 0) &&
197
+ (i < 129) &&
198
+ !initial_it.at_last () &&
199
+ test_blob->bounding_box ().left () == out_box.left ());
200
+ }
201
+ delete init_word;
202
+ return match_count;
203
+ }
204
+
205
+
206
+ /*************************************************************************
207
+ * crude_match_blobs()
208
+ * Check bounding boxes are the same and the number of outlines are the same.
209
+ *************************************************************************/
210
+ BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
211
+ TBOX box1 = blob1->bounding_box ();
212
+ TBOX box2 = blob2->bounding_box ();
213
+
214
+ if (box1.contains (box2) &&
215
+ box2.contains (box1) &&
216
+ (blob1->out_list ()->length () == blob1->out_list ()->length ()))
217
+ return TRUE;
218
+ else
219
+ return FALSE;
220
+ }
221
+
222
+
223
+ inT16 word_outline_errs( //Outline count errs
224
+ WERD_RES *word) {
225
+ PBLOB_IT outword_it;
226
+ inT16 i = 0;
227
+ inT16 err_count = 0;
228
+
229
+ outword_it.set_to_list (word->outword->blob_list ());
230
+
231
+ for (outword_it.mark_cycle_pt ();
232
+ !outword_it.cycled_list (); outword_it.forward ()) {
233
+ err_count += count_outline_errs (word->best_choice->string ()[i],
234
+ outword_it.data ()->out_list ()->
235
+ length ());
236
+ i++;
237
+ }
238
+ return err_count;
239
+ }
240
+
241
+
242
+ /*************************************************************************
243
+ * word_char_quality()
244
+ * Combination of blob quality and outline quality - how many good chars are
245
+ * there? - I.e chars which pass the blob AND outline tests.
246
+ *************************************************************************/
247
+ void word_char_quality( //Blob seg changes
248
+ WERD_RES *word,
249
+ ROW *row,
250
+ inT16 *match_count,
251
+ inT16 *accepted_match_count) {
252
+ WERD *bln_word; //BL norm init word
253
+ TWERD *tessword; //tess format
254
+ WERD *init_word; //BL norm init word
255
+ PBLOB_IT outword_it;
256
+ PBLOB_IT initial_it;
257
+ inT16 i;
258
+ inT16 init_blobs_left;
259
+ BOOL8 matched;
260
+ TBOX out_box;
261
+ PBLOB *test_blob;
262
+ DENORM denorm;
263
+ float bln_xht;
264
+ inT16 j = 0;
265
+
266
+ *match_count = 0;
267
+ *accepted_match_count = 0;
268
+ if (word->word->gblob_list ()->empty ())
269
+ return;
270
+
271
+ //xht used for blnorm
272
+ bln_xht = bln_x_height / word->denorm.scale ();
273
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
274
+ /*
275
+ NOTE: Need to convert to tess format and back again to ensure that the
276
+ same float -> int rounding of coords is done to source wd as out wd before
277
+ comparison
278
+ */
279
+ tessword = make_tess_word (bln_word, NULL);
280
+ //convert word
281
+ init_word = make_ed_word (tessword, bln_word);
282
+ delete bln_word;
283
+ delete_word(tessword); //get rid of it
284
+ // tprintf( "SOURCE BLOBS-AFTER TESS:\n");
285
+ // print_boxes( init_word );
286
+ // tprintf( "OUTPUT BLOBS:\n");
287
+ // print_boxes( word->outword );
288
+
289
+ initial_it.set_to_list (init_word->blob_list ());
290
+ init_blobs_left = initial_it.length ();
291
+ outword_it.set_to_list (word->outword->blob_list ());
292
+
293
+ for (outword_it.mark_cycle_pt ();
294
+ !outword_it.cycled_list (); outword_it.forward ()) {
295
+ out_box = outword_it.data ()->bounding_box ();
296
+
297
+ /* Skip any initial blobs LEFT of current outword blob */
298
+ while (!initial_it.at_last () &&
299
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
300
+ initial_it.forward ();
301
+ init_blobs_left--;
302
+ }
303
+
304
+ /* See if current outword blob matches any initial blob with the same left
305
+ coord. (Normally only one but possibly more - in unknown order) */
306
+
307
+ i = 0;
308
+ matched = FALSE;
309
+ do {
310
+ test_blob = initial_it.data_relative (i++);
311
+ matched = crude_match_blobs (test_blob, outword_it.data ());
312
+ if (matched &&
313
+ (count_outline_errs (word->best_choice->string ()[j],
314
+ outword_it.data ()->out_list ()->length ())
315
+ == 0)) {
316
+ (*match_count)++;
317
+ if (word->reject_map[j].accepted ())
318
+ (*accepted_match_count)++;
319
+ }
320
+ }
321
+ while (!matched &&
322
+ (init_blobs_left - i > 0) &&
323
+ (i < 129) &&
324
+ !initial_it.at_last () &&
325
+ test_blob->bounding_box ().left () == out_box.left ());
326
+ j++;
327
+ }
328
+ delete init_word;
329
+ }
330
+
331
+
332
+ /*************************************************************************
333
+ * unrej_good_chs()
334
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
335
+ *************************************************************************/
336
+ void unrej_good_chs(WERD_RES *word, ROW *row) {
337
+ WERD *bln_word; //BL norm init word
338
+ TWERD *tessword; //tess format
339
+ WERD *init_word; //BL norm init word
340
+ PBLOB_IT outword_it;
341
+ PBLOB_IT initial_it;
342
+ inT16 i;
343
+ inT16 init_blobs_left;
344
+ BOOL8 matched;
345
+ TBOX out_box;
346
+ PBLOB *test_blob;
347
+ DENORM denorm;
348
+ float bln_xht;
349
+ inT16 j = 0;
350
+
351
+ if (word->word->gblob_list ()->empty ())
352
+ return;
353
+
354
+ //xht used for blnorm
355
+ bln_xht = bln_x_height / word->denorm.scale ();
356
+ bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
357
+ /*
358
+ NOTE: Need to convert to tess format and back again to ensure that the
359
+ same float -> int rounding of coords is done to source wd as out wd before
360
+ comparison
361
+ */
362
+ tessword = make_tess_word (bln_word, NULL);
363
+ //convert word
364
+ init_word = make_ed_word (tessword, bln_word);
365
+ delete bln_word;
366
+ delete_word(tessword); //get rid of it
367
+
368
+ initial_it.set_to_list (init_word->blob_list ());
369
+ init_blobs_left = initial_it.length ();
370
+ outword_it.set_to_list (word->outword->blob_list ());
371
+
372
+ for (outword_it.mark_cycle_pt ();
373
+ !outword_it.cycled_list (); outword_it.forward ()) {
374
+ out_box = outword_it.data ()->bounding_box ();
375
+
376
+ /* Skip any initial blobs LEFT of current outword blob */
377
+ while (!initial_it.at_last () &&
378
+ (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
379
+ initial_it.forward ();
380
+ init_blobs_left--;
381
+ }
382
+
383
+ /* See if current outword blob matches any initial blob with the same left
384
+ coord. (Normally only one but possibly more - in unknown order) */
385
+
386
+ i = 0;
387
+ matched = FALSE;
388
+ do {
389
+ test_blob = initial_it.data_relative (i++);
390
+ matched = crude_match_blobs (test_blob, outword_it.data ());
391
+ if (matched &&
392
+ (word->reject_map[j].accept_if_good_quality ()) &&
393
+ (docqual_excuse_outline_errs ||
394
+ (count_outline_errs (word->best_choice->string ()[j],
395
+ outword_it.data ()->out_list ()->
396
+ length ()) == 0)))
397
+ word->reject_map[j].setrej_quality_accept ();
398
+ }
399
+ while (!matched &&
400
+ (init_blobs_left - i > 0) &&
401
+ (i < 129) &&
402
+ !initial_it.at_last () &&
403
+ test_blob->bounding_box ().left () == out_box.left ());
404
+ j++;
405
+ }
406
+ delete init_word;
407
+ }
408
+
409
+
410
+ void print_boxes(WERD *word) {
411
+ PBLOB_IT it;
412
+ TBOX box;
413
+
414
+ it.set_to_list (word->blob_list ());
415
+ for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
416
+ box = it.data ()->bounding_box ();
417
+ box.print ();
418
+ }
419
+ }
420
+
421
+
422
+ inT16 count_outline_errs(char c, inT16 outline_count) {
423
+ int expected_outline_count;
424
+
425
+ if (STRING (outlines_odd).contains (c))
426
+ return 0; //Dont use this char
427
+ else if (STRING (outlines_2).contains (c))
428
+ expected_outline_count = 2;
429
+ else
430
+ expected_outline_count = 1;
431
+ return abs (outline_count - expected_outline_count);
432
+ }
433
+
434
+
435
+ void quality_based_rejection(PAGE_RES_IT &page_res_it,
436
+ BOOL8 good_quality_doc) {
437
+ if ((tessedit_good_quality_unrej && good_quality_doc))
438
+ unrej_good_quality_words(page_res_it);
439
+ doc_and_block_rejection(page_res_it, good_quality_doc);
440
+
441
+ page_res_it.restart_page ();
442
+ while (page_res_it.word () != NULL) {
443
+ insert_rej_cblobs (page_res_it.word ());
444
+ page_res_it.forward ();
445
+ }
446
+
447
+ if (unlv_tilde_crunching) {
448
+ tilde_crunch(page_res_it);
449
+ tilde_delete(page_res_it);
450
+ }
451
+ }
452
+
453
+
454
+ /*************************************************************************
455
+ * unrej_good_quality_words()
456
+ * Accept potential rejects in words which pass the following checks:
457
+ * - Contains a potential reject
458
+ * - Word looks like a sensible alpha word.
459
+ * - Word segmentation is the same as the original image
460
+ * - All characters have the expected number of outlines
461
+ * NOTE - the rejection counts are recalculated after unrejection
462
+ * - CANT do it in a single pass without a bit of fiddling
463
+ * - keep it simple but inefficient
464
+ *************************************************************************/
465
+ void unrej_good_quality_words( //unreject potential
466
+ PAGE_RES_IT &page_res_it) {
467
+ WERD_RES *word;
468
+ ROW_RES *current_row;
469
+ BLOCK_RES *current_block;
470
+ int i;
471
+
472
+ page_res_it.restart_page ();
473
+ while (page_res_it.word () != NULL) {
474
+ check_debug_pt (page_res_it.word (), 100);
475
+ if (bland_unrej) {
476
+ word = page_res_it.word ();
477
+ for (i = 0; i < word->reject_map.length (); i++) {
478
+ if (word->reject_map[i].accept_if_good_quality ())
479
+ word->reject_map[i].setrej_quality_accept ();
480
+ }
481
+ page_res_it.forward ();
482
+ }
483
+ else if ((page_res_it.row ()->char_count > 0) &&
484
+ ((page_res_it.row ()->rej_count /
485
+ (float) page_res_it.row ()->char_count) <=
486
+ quality_rowrej_pc)) {
487
+ word = page_res_it.word ();
488
+ if (word->reject_map.quality_recoverable_rejects () &&
489
+ (tessedit_unrej_any_wd ||
490
+ acceptable_word_string (word->best_choice->string ().string (),
491
+ word->best_choice->lengths().string())
492
+ != AC_UNACCEPTABLE)) {
493
+ unrej_good_chs (word, page_res_it.row ()->row);
494
+ }
495
+ page_res_it.forward ();
496
+ }
497
+ else {
498
+ /* Skip to end of dodgy row */
499
+ current_row = page_res_it.row ();
500
+ while ((page_res_it.word () != NULL) &&
501
+ (page_res_it.row () == current_row))
502
+ page_res_it.forward ();
503
+ }
504
+ check_debug_pt (page_res_it.word (), 110);
505
+ }
506
+ page_res_it.restart_page ();
507
+ page_res_it.page_res->char_count = 0;
508
+ page_res_it.page_res->rej_count = 0;
509
+ current_block = NULL;
510
+ current_row = NULL;
511
+ while (page_res_it.word () != NULL) {
512
+ if (current_block != page_res_it.block ()) {
513
+ current_block = page_res_it.block ();
514
+ current_block->char_count = 0;
515
+ current_block->rej_count = 0;
516
+ }
517
+ if (current_row != page_res_it.row ()) {
518
+ current_row = page_res_it.row ();
519
+ current_row->char_count = 0;
520
+ current_row->rej_count = 0;
521
+ current_row->whole_word_rej_count = 0;
522
+ }
523
+ page_res_it.rej_stat_word ();
524
+ page_res_it.forward ();
525
+ }
526
+ }
527
+
528
+
529
+ /*************************************************************************
530
+ * doc_and_block_rejection()
531
+ *
532
+ * If the page has too many rejects - reject all of it.
533
+ * If any block has too many rejects - reject all words in the block
534
+ *************************************************************************/
535
+
536
+ void doc_and_block_rejection( //reject big chunks
537
+ PAGE_RES_IT &page_res_it,
538
+ BOOL8 good_quality_doc) {
539
+ inT16 block_no = 0;
540
+ inT16 row_no = 0;
541
+ BLOCK_RES *current_block;
542
+ ROW_RES *current_row;
543
+
544
+ BOOL8 rej_word;
545
+ BOOL8 prev_word_rejected;
546
+ inT16 char_quality;
547
+ inT16 accepted_char_quality;
548
+
549
+ if ((page_res_it.page_res->rej_count * 100.0 /
550
+ page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
551
+ reject_whole_page(page_res_it);
552
+ #ifndef SECURE_NAMES
553
+ if (tessedit_debug_doc_rejection) {
554
+ tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
555
+ page_res_it.page_res->char_count,
556
+ page_res_it.page_res->rej_count);
557
+ }
558
+ #endif
559
+ }
560
+ else {
561
+ #ifndef SECURE_NAMES
562
+ if (tessedit_debug_doc_rejection)
563
+ tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
564
+ page_res_it.page_res->char_count,
565
+ page_res_it.page_res->rej_count);
566
+ #endif
567
+
568
+ /* Walk blocks testing for block rejection */
569
+
570
+ page_res_it.restart_page ();
571
+ while (page_res_it.word () != NULL) {
572
+ current_block = page_res_it.block ();
573
+ if (current_block->block->text_region () != NULL)
574
+ block_no = current_block->block->text_region ()->id_no ();
575
+ else
576
+ block_no = -1;
577
+ if ((page_res_it.block ()->char_count > 0) &&
578
+ ((page_res_it.block ()->rej_count * 100.0 /
579
+ page_res_it.block ()->char_count) >
580
+ tessedit_reject_block_percent)) {
581
+ #ifndef SECURE_NAMES
582
+ if (tessedit_debug_block_rejection)
583
+ tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
584
+ block_no,
585
+ page_res_it.block ()->char_count,
586
+ page_res_it.block ()->rej_count);
587
+ #endif
588
+ prev_word_rejected = FALSE;
589
+ while ((page_res_it.word () != NULL) &&
590
+ (page_res_it.block () == current_block)) {
591
+ if (tessedit_preserve_blk_rej_perfect_wds) {
592
+ rej_word =
593
+ (page_res_it.word ()->reject_map.reject_count () > 0)
594
+ || (page_res_it.word ()->reject_map.length () <
595
+ tessedit_preserve_min_wd_len);
596
+ if (rej_word && tessedit_dont_blkrej_good_wds
597
+ && !(page_res_it.word ()->reject_map.length () <
598
+ tessedit_preserve_min_wd_len)
599
+ &&
600
+ (acceptable_word_string
601
+ (page_res_it.word ()->best_choice->string ().
602
+ string (),
603
+ page_res_it.word ()->best_choice->lengths ().
604
+ string ()) != AC_UNACCEPTABLE)) {
605
+ word_char_quality (page_res_it.word (),
606
+ page_res_it.row ()->row,
607
+ &char_quality,
608
+ &accepted_char_quality);
609
+ rej_word = char_quality !=
610
+ page_res_it.word ()->reject_map.length ();
611
+ }
612
+ }
613
+ else
614
+ rej_word = TRUE;
615
+ if (rej_word) {
616
+ /*
617
+ Reject spacing if both current and prev words are rejected.
618
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
619
+ more space errors.
620
+ */
621
+ if (tessedit_use_reject_spaces &&
622
+ prev_word_rejected &&
623
+ (page_res_it.prev_row () == page_res_it.row ()) &&
624
+ (page_res_it.word ()->word->space () == 1))
625
+ page_res_it.word ()->reject_spaces = TRUE;
626
+ page_res_it.word ()->reject_map.rej_word_block_rej ();
627
+ }
628
+ prev_word_rejected = rej_word;
629
+ page_res_it.forward ();
630
+ }
631
+ }
632
+ else {
633
+ #ifndef SECURE_NAMES
634
+ if (tessedit_debug_block_rejection)
635
+ tprintf
636
+ ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
637
+ block_no, page_res_it.block ()->char_count,
638
+ page_res_it.block ()->rej_count);
639
+ #endif
640
+
641
+ /* Walk rows in block testing for row rejection */
642
+ row_no = 0;
643
+ while ((page_res_it.word () != NULL) &&
644
+ (page_res_it.block () == current_block)) {
645
+ current_row = page_res_it.row ();
646
+ row_no++;
647
+ /* Reject whole row if:
648
+ fraction of chars on row which are rejected exceed a limit AND
649
+ fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
650
+ */
651
+ if ((page_res_it.row ()->char_count > 0) &&
652
+ ((page_res_it.row ()->rej_count * 100.0 /
653
+ page_res_it.row ()->char_count) >
654
+ tessedit_reject_row_percent) &&
655
+ ((page_res_it.row ()->whole_word_rej_count * 100.0 /
656
+ page_res_it.row ()->rej_count) <
657
+ tessedit_whole_wd_rej_row_percent)) {
658
+ #ifndef SECURE_NAMES
659
+ if (tessedit_debug_block_rejection)
660
+ tprintf
661
+ ("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
662
+ row_no, page_res_it.row ()->char_count,
663
+ page_res_it.row ()->rej_count);
664
+ #endif
665
+ prev_word_rejected = FALSE;
666
+ while ((page_res_it.word () != NULL) &&
667
+ (page_res_it.row () == current_row)) {
668
+ /* Preserve words on good docs unless they are mostly rejected*/
669
+ if (!tessedit_row_rej_good_docs && good_quality_doc) {
670
+ rej_word =
671
+ page_res_it.word ()->reject_map.
672
+ reject_count () /
673
+ (float) page_res_it.word ()->reject_map.
674
+ length () > tessedit_good_doc_still_rowrej_wd;
675
+ }
676
+
677
+ /* Preserve perfect words anyway */
678
+ else if (tessedit_preserve_row_rej_perfect_wds) {
679
+ rej_word =
680
+ (page_res_it.word ()->reject_map.
681
+ reject_count () > 0)
682
+ || (page_res_it.word ()->reject_map.
683
+ length () < tessedit_preserve_min_wd_len);
684
+ if (rej_word && tessedit_dont_rowrej_good_wds
685
+ && !(page_res_it.word ()->reject_map.
686
+ length () <
687
+ tessedit_preserve_min_wd_len)
688
+ &&
689
+ (acceptable_word_string
690
+ (page_res_it.word ()->best_choice->
691
+ string ().string (),
692
+ page_res_it.word ()->best_choice->
693
+ lengths ().string ()) != AC_UNACCEPTABLE)) {
694
+ word_char_quality (page_res_it.word (),
695
+ page_res_it.row ()->row,
696
+ &char_quality,
697
+ &accepted_char_quality);
698
+ rej_word = char_quality !=
699
+ page_res_it.word ()->reject_map.length ();
700
+ }
701
+ }
702
+ else
703
+ rej_word = TRUE;
704
+ if (rej_word) {
705
+ /*
706
+ Reject spacing if both current and prev words are rejected.
707
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
708
+ more space errors.
709
+ */
710
+ if (tessedit_use_reject_spaces &&
711
+ prev_word_rejected &&
712
+ (page_res_it.prev_row () ==
713
+ page_res_it.row ())
714
+ && (page_res_it.word ()->word->space () ==
715
+ 1))
716
+ page_res_it.word ()->reject_spaces = TRUE;
717
+ page_res_it.word ()->reject_map.
718
+ rej_word_row_rej();
719
+ }
720
+ prev_word_rejected = rej_word;
721
+ page_res_it.forward ();
722
+ }
723
+ }
724
+ else {
725
+ #ifndef SECURE_NAMES
726
+ if (tessedit_debug_block_rejection)
727
+ tprintf
728
+ ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
729
+ row_no, page_res_it.row ()->char_count,
730
+ page_res_it.row ()->rej_count);
731
+ #endif
732
+ while ((page_res_it.word () != NULL) &&
733
+ (page_res_it.row () == current_row))
734
+ page_res_it.forward ();
735
+ }
736
+ }
737
+ }
738
+ }
739
+ }
740
+ }
741
+
742
+
743
+ /*************************************************************************
744
+ * reject_whole_page()
745
+ * Dont believe any of it - set the reject map to 00..00 in all words
746
+ *
747
+ *************************************************************************/
748
+
749
+ void reject_whole_page(PAGE_RES_IT &page_res_it) {
750
+ page_res_it.restart_page ();
751
+ while (page_res_it.word () != NULL) {
752
+ page_res_it.word ()->reject_map.rej_word_doc_rej ();
753
+ page_res_it.forward ();
754
+ }
755
+ //whole page is rejected
756
+ page_res_it.page_res->rejected = TRUE;
757
+ }
758
+
759
+
760
+ void tilde_crunch(PAGE_RES_IT &page_res_it) {
761
+ WERD_RES *word;
762
+ GARBAGE_LEVEL garbage_level;
763
+ PAGE_RES_IT copy_it;
764
+ BOOL8 prev_potential_marked = FALSE;
765
+ BOOL8 found_terrible_word = FALSE;
766
+ int dict_type;
767
+ BOOL8 ok_dict_word;
768
+
769
+ page_res_it.restart_page ();
770
+ while (page_res_it.word () != NULL) {
771
+ word = page_res_it.word ();
772
+
773
+ if (crunch_early_convert_bad_unlv_chs)
774
+ convert_bad_unlv_chs(word);
775
+
776
+ if (crunch_early_merge_tess_fails)
777
+ merge_tess_fails(word);
778
+
779
+ if (word->reject_map.accept_count () != 0) {
780
+ found_terrible_word = FALSE;
781
+ //Forget earlier potential crunches
782
+ prev_potential_marked = FALSE;
783
+ }
784
+ else {
785
+ dict_type = dict_word (word->best_choice->string ().string ());
786
+ ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
787
+ garbage_level = garbage_word (word, ok_dict_word);
788
+
789
+ if ((garbage_level != G_NEVER_CRUNCH) &&
790
+ (terrible_word_crunch (word, garbage_level))) {
791
+ if (crunch_debug > 0) {
792
+ tprintf ("T CRUNCHING: \"%s\"\n",
793
+ word->best_choice->string ().string ());
794
+ }
795
+ word->unlv_crunch_mode = CR_KEEP_SPACE;
796
+ if (prev_potential_marked) {
797
+ while (copy_it.word () != word) {
798
+ if (crunch_debug > 0) {
799
+ tprintf ("P1 CRUNCHING: \"%s\"\n",
800
+ copy_it.word ()->best_choice->string ().
801
+ string ());
802
+ }
803
+ copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
804
+ copy_it.forward ();
805
+ }
806
+ prev_potential_marked = FALSE;
807
+ }
808
+ found_terrible_word = TRUE;
809
+ }
810
+ else if ((garbage_level != G_NEVER_CRUNCH) &&
811
+ (potential_word_crunch (word,
812
+ garbage_level, ok_dict_word))) {
813
+ if (found_terrible_word) {
814
+ if (crunch_debug > 0) {
815
+ tprintf ("P2 CRUNCHING: \"%s\"\n",
816
+ word->best_choice->string ().string ());
817
+ }
818
+ word->unlv_crunch_mode = CR_KEEP_SPACE;
819
+ }
820
+ else if (!prev_potential_marked) {
821
+ copy_it = page_res_it;
822
+ prev_potential_marked = TRUE;
823
+ if (crunch_debug > 1) {
824
+ tprintf ("P3 CRUNCHING: \"%s\"\n",
825
+ word->best_choice->string ().string ());
826
+ }
827
+ }
828
+ }
829
+ else {
830
+ found_terrible_word = FALSE;
831
+ //Forget earlier potential crunches
832
+ prev_potential_marked = FALSE;
833
+ if (crunch_debug > 2) {
834
+ tprintf ("NO CRUNCH: \"%s\"\n",
835
+ word->best_choice->string ().string ());
836
+ }
837
+ }
838
+ }
839
+ page_res_it.forward ();
840
+ }
841
+ }
842
+
843
+
844
+ BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
845
+ float rating_per_ch;
846
+ int adjusted_len;
847
+ int crunch_mode = 0;
848
+
849
+ if ((word->best_choice->string ().length () == 0) ||
850
+ (strspn (word->best_choice->string ().string (), " ") ==
851
+ word->best_choice->string ().length ()))
852
+ crunch_mode = 1;
853
+ else {
854
+ adjusted_len = word->reject_map.length ();
855
+ if (adjusted_len > crunch_rating_max)
856
+ adjusted_len = crunch_rating_max;
857
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
858
+
859
+ if (rating_per_ch > crunch_terrible_rating)
860
+ crunch_mode = 2;
861
+ else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
862
+ crunch_mode = 3;
863
+ else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
864
+ (garbage_level != G_OK))
865
+ crunch_mode = 4;
866
+ else if ((rating_per_ch > crunch_poor_garbage_rate) &&
867
+ (garbage_level != G_OK))
868
+ crunch_mode = 5;
869
+ }
870
+ if (crunch_mode > 0) {
871
+ if (crunch_debug > 2) {
872
+ tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
873
+ crunch_mode, word->best_choice->string ().string ());
874
+ }
875
+ return TRUE;
876
+ }
877
+ else
878
+ return FALSE;
879
+ }
880
+
881
+
882
+ BOOL8 potential_word_crunch(WERD_RES *word,
883
+ GARBAGE_LEVEL garbage_level,
884
+ BOOL8 ok_dict_word) {
885
+ float rating_per_ch;
886
+ int adjusted_len;
887
+ const char *str = word->best_choice->string ().string ();
888
+ const char *lengths = word->best_choice->lengths ().string ();
889
+ BOOL8 word_crunchable;
890
+ int poor_indicator_count = 0;
891
+
892
+ word_crunchable =
893
+ !crunch_leave_accept_strings ||
894
+ (word->reject_map.length () < 3) ||
895
+ ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) &&
896
+ !ok_dict_word);
897
+
898
+ adjusted_len = word->reject_map.length ();
899
+ if (adjusted_len > 10)
900
+ adjusted_len = 10;
901
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
902
+
903
+ if (rating_per_ch > crunch_pot_poor_rate) {
904
+ if (crunch_debug > 2) {
905
+ tprintf ("Potential poor rating on \"%s\"\n",
906
+ word->best_choice->string ().string ());
907
+ }
908
+ poor_indicator_count++;
909
+ }
910
+
911
+ if (word_crunchable &&
912
+ (word->best_choice->certainty () < crunch_pot_poor_cert)) {
913
+ if (crunch_debug > 2) {
914
+ tprintf ("Potential poor cert on \"%s\"\n",
915
+ word->best_choice->string ().string ());
916
+ }
917
+ poor_indicator_count++;
918
+ }
919
+
920
+ if (garbage_level != G_OK) {
921
+ if (crunch_debug > 2) {
922
+ tprintf ("Potential garbage on \"%s\"\n",
923
+ word->best_choice->string ().string ());
924
+ }
925
+ poor_indicator_count++;
926
+ }
927
+ return (poor_indicator_count >= crunch_pot_indicators);
928
+ }
929
+
930
+
931
+ void tilde_delete(PAGE_RES_IT &page_res_it) {
932
+ WERD_RES *word;
933
+ PAGE_RES_IT copy_it;
934
+ BOOL8 deleting_from_bol = FALSE;
935
+ BOOL8 marked_delete_point = FALSE;
936
+ inT16 debug_delete_mode;
937
+ CRUNCH_MODE delete_mode;
938
+ inT16 x_debug_delete_mode;
939
+ CRUNCH_MODE x_delete_mode;
940
+
941
+ page_res_it.restart_page ();
942
+ while (page_res_it.word () != NULL) {
943
+ word = page_res_it.word ();
944
+
945
+ delete_mode = word_deletable (word, debug_delete_mode);
946
+ if (delete_mode != CR_NONE) {
947
+ if (word->word->flag (W_BOL) || deleting_from_bol) {
948
+ if (crunch_debug > 0) {
949
+ tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
950
+ debug_delete_mode,
951
+ word->best_choice->string ().string ());
952
+ }
953
+ word->unlv_crunch_mode = delete_mode;
954
+ deleting_from_bol = TRUE;
955
+ }
956
+ else if (word->word->flag (W_EOL)) {
957
+ if (marked_delete_point) {
958
+ while (copy_it.word () != word) {
959
+ x_delete_mode = word_deletable (copy_it.word (),
960
+ x_debug_delete_mode);
961
+ if (crunch_debug > 0) {
962
+ tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
963
+ x_debug_delete_mode,
964
+ copy_it.word ()->best_choice->string ().
965
+ string ());
966
+ }
967
+ copy_it.word ()->unlv_crunch_mode = x_delete_mode;
968
+ copy_it.forward ();
969
+ }
970
+ }
971
+ if (crunch_debug > 0) {
972
+ tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
973
+ debug_delete_mode,
974
+ word->best_choice->string ().string ());
975
+ }
976
+ word->unlv_crunch_mode = delete_mode;
977
+ deleting_from_bol = FALSE;
978
+ marked_delete_point = FALSE;
979
+ }
980
+ else {
981
+ if (!marked_delete_point) {
982
+ copy_it = page_res_it;
983
+ marked_delete_point = TRUE;
984
+ }
985
+ }
986
+ }
987
+ else {
988
+ deleting_from_bol = FALSE;
989
+ //Forget earlier potential crunches
990
+ marked_delete_point = FALSE;
991
+ }
992
+ /*
993
+ The following step has been left till now as the tess fails are used to
994
+ determine if the word is deletable.
995
+ */
996
+ if (!crunch_early_merge_tess_fails)
997
+ merge_tess_fails(word);
998
+ page_res_it.forward ();
999
+ }
1000
+ }
1001
+
1002
+
1003
+ void convert_bad_unlv_chs( //word to do
1004
+ WERD_RES *word_res) {
1005
+ char *ptr; //string ptr
1006
+ int i;
1007
+ int offset;
1008
+
1009
+ ptr = (char *) word_res->best_choice->string ().string ();
1010
+ for (i = 0, offset = 0; i < word_res->reject_map.length ();
1011
+ offset += word_res->best_choice->lengths ()[i++]) {
1012
+ if (word_res->best_choice->lengths ()[i] == 1 &&
1013
+ ptr[offset] == '~') {
1014
+ ptr[offset] = '-';
1015
+ if (word_res->reject_map[i].accepted ())
1016
+ word_res->reject_map[i].setrej_unlv_rej ();
1017
+ }
1018
+ if (word_res->best_choice->lengths ()[i] == 1 &&
1019
+ ptr[offset] == '^') {
1020
+ ptr[offset] = ' ';
1021
+ if (word_res->reject_map[i].accepted ())
1022
+ word_res->reject_map[i].setrej_unlv_rej ();
1023
+ }
1024
+ }
1025
+ }
1026
+
1027
+
1028
+ /**********************************************************************
1029
+ * merge_tess_fails
1030
+ *
1031
+ * Change pairs of tess failures to a single one
1032
+ **********************************************************************/
1033
+
1034
+ void merge_tess_fails( //word to do
1035
+ WERD_RES *word_res) {
1036
+ char *ptr; //string ptr
1037
+ char *ptr_lengths; //lengths ptr
1038
+ PBLOB_IT blob_it; //blobs
1039
+ int i = 0;
1040
+ int len;
1041
+
1042
+ len = strlen (word_res->best_choice->lengths ().string ());
1043
+ ASSERT_HOST (word_res->reject_map.length () == len);
1044
+ ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1045
+
1046
+ ptr = (char *) word_res->best_choice->string ().string ();
1047
+ ptr_lengths = (char *) word_res->best_choice->lengths ().string ();
1048
+ blob_it = word_res->outword->blob_list ();
1049
+ while (*ptr != '\0') {
1050
+ if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
1051
+ strcpy (ptr + 1, ptr + 2); //shuffle up
1052
+ strcpy (ptr_lengths + 1, ptr_lengths + 2); //shuffle up
1053
+ word_res->reject_map.remove_pos (i);
1054
+ merge_blobs (blob_it.data_relative (1), blob_it.data ());
1055
+ delete blob_it.extract (); //get rid of spare
1056
+ }
1057
+ else {
1058
+ i++;
1059
+ ptr += *(ptr_lengths++);
1060
+ }
1061
+ blob_it.forward ();
1062
+ }
1063
+ len = strlen (word_res->best_choice->lengths ().string ());
1064
+ ASSERT_HOST (word_res->reject_map.length () == len);
1065
+ ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
1066
+ }
1067
+
1068
+
1069
+ GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
1070
+ enum STATES
1071
+ {
1072
+ JUNK,
1073
+ FIRST_UPPER,
1074
+ FIRST_LOWER,
1075
+ FIRST_NUM,
1076
+ SUBSEQUENT_UPPER,
1077
+ SUBSEQUENT_LOWER,
1078
+ SUBSEQUENT_NUM
1079
+ };
1080
+ const char *str = word->best_choice->string ().string ();
1081
+ const char *lengths = word->best_choice->lengths ().string ();
1082
+ STATES state = JUNK;
1083
+ int len = 0;
1084
+ int isolated_digits = 0;
1085
+ int isolated_alphas = 0;
1086
+ int bad_char_count = 0;
1087
+ int tess_rejs = 0;
1088
+ int dodgy_chars = 0;
1089
+ int ok_chars;
1090
+ UNICHAR_ID last_char = -1;
1091
+ int alpha_repetition_count = 0;
1092
+ int longest_alpha_repetition_count = 0;
1093
+ int longest_lower_run_len = 0;
1094
+ int lower_string_count = 0;
1095
+ int longest_upper_run_len = 0;
1096
+ int upper_string_count = 0;
1097
+ int total_alpha_count = 0;
1098
+ int total_digit_count = 0;
1099
+
1100
+ for (; *str != '\0'; str += *(lengths++)) {
1101
+ len++;
1102
+ if (unicharset.get_isupper (str, *lengths)) {
1103
+ total_alpha_count++;
1104
+ switch (state) {
1105
+ case SUBSEQUENT_UPPER:
1106
+ case FIRST_UPPER:
1107
+ state = SUBSEQUENT_UPPER;
1108
+ upper_string_count++;
1109
+ if (longest_upper_run_len < upper_string_count)
1110
+ longest_upper_run_len = upper_string_count;
1111
+ if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1112
+ alpha_repetition_count++;
1113
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
1114
+ longest_alpha_repetition_count = alpha_repetition_count;
1115
+ }
1116
+ }
1117
+ else {
1118
+ last_char = unicharset.unichar_to_id(str, *lengths);
1119
+ alpha_repetition_count = 1;
1120
+ }
1121
+ break;
1122
+ case FIRST_NUM:
1123
+ isolated_digits++;
1124
+ default:
1125
+ state = FIRST_UPPER;
1126
+ last_char = unicharset.unichar_to_id(str, *lengths);
1127
+ alpha_repetition_count = 1;
1128
+ upper_string_count = 1;
1129
+ break;
1130
+ }
1131
+ }
1132
+ else if (unicharset.get_islower (str, *lengths)) {
1133
+ total_alpha_count++;
1134
+ switch (state) {
1135
+ case SUBSEQUENT_LOWER:
1136
+ case FIRST_LOWER:
1137
+ state = SUBSEQUENT_LOWER;
1138
+ lower_string_count++;
1139
+ if (longest_lower_run_len < lower_string_count)
1140
+ longest_lower_run_len = lower_string_count;
1141
+ if (last_char == unicharset.unichar_to_id(str, *lengths)) {
1142
+ alpha_repetition_count++;
1143
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
1144
+ longest_alpha_repetition_count = alpha_repetition_count;
1145
+ }
1146
+ }
1147
+ else {
1148
+ last_char = unicharset.unichar_to_id(str, *lengths);
1149
+ alpha_repetition_count = 1;
1150
+ }
1151
+ break;
1152
+ case FIRST_NUM:
1153
+ isolated_digits++;
1154
+ default:
1155
+ state = FIRST_LOWER;
1156
+ last_char = unicharset.unichar_to_id(str, *lengths);
1157
+ alpha_repetition_count = 1;
1158
+ lower_string_count = 1;
1159
+ break;
1160
+ }
1161
+ }
1162
+ else if (unicharset.get_isdigit (str, *lengths)) {
1163
+ total_digit_count++;
1164
+ switch (state) {
1165
+ case FIRST_NUM:
1166
+ state = SUBSEQUENT_NUM;
1167
+ case SUBSEQUENT_NUM:
1168
+ break;
1169
+ case FIRST_UPPER:
1170
+ case FIRST_LOWER:
1171
+ isolated_alphas++;
1172
+ default:
1173
+ state = FIRST_NUM;
1174
+ break;
1175
+ }
1176
+ }
1177
+ else {
1178
+ if (*lengths == 1 && *str == ' ')
1179
+ tess_rejs++;
1180
+ else
1181
+ bad_char_count++;
1182
+ switch (state) {
1183
+ case FIRST_NUM:
1184
+ isolated_digits++;
1185
+ break;
1186
+ case FIRST_UPPER:
1187
+ case FIRST_LOWER:
1188
+ isolated_alphas++;
1189
+ default:
1190
+ break;
1191
+ }
1192
+ state = JUNK;
1193
+ }
1194
+ }
1195
+
1196
+ switch (state) {
1197
+ case FIRST_NUM:
1198
+ isolated_digits++;
1199
+ break;
1200
+ case FIRST_UPPER:
1201
+ case FIRST_LOWER:
1202
+ isolated_alphas++;
1203
+ default:
1204
+ break;
1205
+ }
1206
+
1207
+ if (crunch_include_numerals) {
1208
+ total_alpha_count += total_digit_count - isolated_digits;
1209
+ }
1210
+
1211
+ if (crunch_leave_ok_strings &&
1212
+ (len >= 4) &&
1213
+ (2 * (total_alpha_count - isolated_alphas) > len) &&
1214
+ (longest_alpha_repetition_count < crunch_long_repetitions)) {
1215
+ if ((crunch_accept_ok &&
1216
+ (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
1217
+ (longest_lower_run_len > crunch_leave_lc_strings) ||
1218
+ (longest_upper_run_len > crunch_leave_uc_strings))
1219
+ return G_NEVER_CRUNCH;
1220
+ }
1221
+ if ((word->reject_map.length () > 1) &&
1222
+ (strpbrk (str, " ") == NULL) &&
1223
+ ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1224
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1225
+ (word->best_choice->permuter () == USER_DAWG_PERM) ||
1226
+ (word->best_choice->permuter () == NUMBER_PERM) ||
1227
+ (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word))
1228
+ return G_OK;
1229
+
1230
+ ok_chars = len - bad_char_count - isolated_digits -
1231
+ isolated_alphas - tess_rejs;
1232
+
1233
+ if (crunch_debug > 3) {
1234
+ tprintf ("garbage_word: \"%s\"\n",
1235
+ word->best_choice->string ().string ());
1236
+ tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
1237
+ len,
1238
+ bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
1239
+ }
1240
+ if ((bad_char_count == 0) &&
1241
+ (tess_rejs == 0) &&
1242
+ ((len > isolated_digits + isolated_alphas) || (len <= 2)))
1243
+ return G_OK;
1244
+
1245
+ if ((tess_rejs > ok_chars) ||
1246
+ ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
1247
+ return G_TERRIBLE;
1248
+
1249
+ if (len > 4) {
1250
+ dodgy_chars = 2 * tess_rejs + bad_char_count +
1251
+ isolated_digits + isolated_alphas;
1252
+ if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
1253
+ return G_DODGY;
1254
+ else
1255
+ return G_OK;
1256
+ }
1257
+ else {
1258
+ dodgy_chars = 2 * tess_rejs + bad_char_count;
1259
+ if (((len == 4) && (dodgy_chars > 2)) ||
1260
+ ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
1261
+ return G_DODGY;
1262
+ else
1263
+ return G_OK;
1264
+ }
1265
+ }
1266
+
1267
+
1268
+ /*************************************************************************
1269
+ * word_deletable()
1270
+ * DELETE WERDS AT ENDS OF ROWS IF
1271
+ * Word is crunched &&
1272
+ * ( string length = 0 OR
1273
+ * > 50% of chars are "|" (before merging) OR
1274
+ * certainty < -10 OR
1275
+ * rating /char > 60 OR
1276
+ * TOP of word is more than 0.5 xht BELOW baseline OR
1277
+ * BOTTOM of word is more than 0.5 xht ABOVE xht OR
1278
+ * length of word < 3xht OR
1279
+ * height of word < 0.7 xht OR
1280
+ * height of word > 3.0 xht OR
1281
+ * >75% of the outline BBs have longest dimension < 0.5xht
1282
+ *************************************************************************/
1283
+
1284
+ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
1285
+ int word_len = word->reject_map.length ();
1286
+ float rating_per_ch;
1287
+ TBOX box; //BB of word
1288
+
1289
+ if (word->unlv_crunch_mode == CR_NONE) {
1290
+ delete_mode = 0;
1291
+ return CR_NONE;
1292
+ }
1293
+
1294
+ if (word_len == 0) {
1295
+ delete_mode = 1;
1296
+ return CR_DELETE;
1297
+ }
1298
+
1299
+ box = word->outword->bounding_box ();
1300
+ if (box.height () < crunch_del_min_ht * bln_x_height) {
1301
+ delete_mode = 4;
1302
+ return CR_DELETE;
1303
+ }
1304
+
1305
+ if (noise_outlines (word->outword)) {
1306
+ delete_mode = 5;
1307
+ return CR_DELETE;
1308
+ }
1309
+
1310
+ if ((failure_count (word) * 1.5) > word_len) {
1311
+ delete_mode = 2;
1312
+ return CR_LOOSE_SPACE;
1313
+ }
1314
+
1315
+ if (word->best_choice->certainty () < crunch_del_cert) {
1316
+ delete_mode = 7;
1317
+ return CR_LOOSE_SPACE;
1318
+ }
1319
+
1320
+ rating_per_ch = word->best_choice->rating () / word_len;
1321
+
1322
+ if (rating_per_ch > crunch_del_rating) {
1323
+ delete_mode = 8;
1324
+ return CR_LOOSE_SPACE;
1325
+ }
1326
+
1327
+ if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
1328
+ delete_mode = 9;
1329
+ return CR_LOOSE_SPACE;
1330
+ }
1331
+
1332
+ if (box.bottom () >
1333
+ bln_baseline_offset + crunch_del_high_word * bln_x_height) {
1334
+ delete_mode = 10;
1335
+ return CR_LOOSE_SPACE;
1336
+ }
1337
+
1338
+ if (box.height () > crunch_del_max_ht * bln_x_height) {
1339
+ delete_mode = 11;
1340
+ return CR_LOOSE_SPACE;
1341
+ }
1342
+
1343
+ if (box.width () < crunch_del_min_width * bln_x_height) {
1344
+ delete_mode = 3;
1345
+ return CR_LOOSE_SPACE;
1346
+ }
1347
+
1348
+ delete_mode = 0;
1349
+ return CR_NONE;
1350
+ }
1351
+
1352
+
1353
+ inT16 failure_count(WERD_RES *word) {
1354
+ char *str = (char *) word->best_choice->string ().string ();
1355
+ int tess_rejs = 0;
1356
+
1357
+ for (; *str != '\0'; str++) {
1358
+ if (*str == ' ')
1359
+ tess_rejs++;
1360
+ }
1361
+ return tess_rejs;
1362
+ }
1363
+
1364
+
1365
+ BOOL8 noise_outlines(WERD *word) {
1366
+ PBLOB_IT blob_it;
1367
+ OUTLINE_IT outline_it;
1368
+ TBOX box; //BB of outline
1369
+ inT16 outline_count = 0;
1370
+ inT16 small_outline_count = 0;
1371
+ inT16 max_dimension;
1372
+ float small_limit = bln_x_height * crunch_small_outlines_size;
1373
+
1374
+ blob_it.set_to_list (word->blob_list ());
1375
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1376
+ outline_it.set_to_list (blob_it.data ()->out_list ());
1377
+ for (outline_it.mark_cycle_pt ();
1378
+ !outline_it.cycled_list (); outline_it.forward ()) {
1379
+ outline_count++;
1380
+ box = outline_it.data ()->bounding_box ();
1381
+ if (box.height () > box.width ())
1382
+ max_dimension = box.height ();
1383
+ else
1384
+ max_dimension = box.width ();
1385
+ if (max_dimension < small_limit)
1386
+ small_outline_count++;
1387
+ }
1388
+ }
1389
+ return (small_outline_count >= outline_count);
1390
+ }
1391
+
1392
+
1393
+ /*************************************************************************
1394
+ * insert_rej_cblobs()
1395
+ * Put rejected word blobs back into the outword.
1396
+ * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
1397
+ * OF ELEMENTS.
1398
+ *************************************************************************/
1399
+ void insert_rej_cblobs( //word to do
1400
+ WERD_RES *word) {
1401
+ PBLOB_IT blob_it; //blob iterator
1402
+ PBLOB_IT rej_blob_it;
1403
+ const STRING *word_str;
1404
+ const STRING *word_lengths;
1405
+ int old_len;
1406
+ int rej_len;
1407
+ char new_str[512 * UNICHAR_LEN];
1408
+ char new_lengths[512];
1409
+ REJMAP new_map;
1410
+ int i = 0; //new_str index
1411
+ int j = 0; //old_str index
1412
+ int i_offset = 0; //new_str offset
1413
+ int j_offset = 0; //old_str offset
1414
+ int new_len;
1415
+
1416
+ gblob_sort_list (word->outword->rej_blob_list (), TRUE);
1417
+ rej_blob_it.set_to_list (word->outword->rej_blob_list ());
1418
+ if (rej_blob_it.empty ())
1419
+ return;
1420
+ rej_len = rej_blob_it.length ();
1421
+ blob_it.set_to_list (word->outword->blob_list ());
1422
+ word_str = &(word->best_choice->string ());
1423
+ word_lengths = &(word->best_choice->lengths ());
1424
+ old_len = word->best_choice->lengths().length ();
1425
+ ASSERT_HOST (word->reject_map.length () == old_len);
1426
+ ASSERT_HOST (blob_it.length () == old_len);
1427
+ if ((old_len + rej_len) > 511)
1428
+ return; //Word is garbage anyway prevent abort
1429
+ new_map.initialise (old_len + rej_len);
1430
+
1431
+ while (!rej_blob_it.empty ()) {
1432
+ if ((j >= old_len) ||
1433
+ (rej_blob_it.data ()->bounding_box ().left () <=
1434
+ blob_it.data ()->bounding_box ().left ())) {
1435
+ /* Insert reject blob */
1436
+ if (j >= old_len)
1437
+ blob_it.add_to_end (rej_blob_it.extract ());
1438
+ else
1439
+ blob_it.add_before_stay_put (rej_blob_it.extract ());
1440
+ if (!rej_blob_it.empty ())
1441
+ rej_blob_it.forward ();
1442
+ new_str[i_offset] = ' ';
1443
+ new_lengths[i] = 1;
1444
+ new_map[i].setrej_rej_cblob ();
1445
+ i_offset += new_lengths[i++];
1446
+ }
1447
+ else {
1448
+ strncpy(new_str + i_offset, &(*word_str)[j_offset],
1449
+ (*word_lengths)[j]);
1450
+ new_lengths[i] = (*word_lengths)[j];
1451
+ new_map[i] = word->reject_map[j];
1452
+ i_offset += new_lengths[i++];
1453
+ j_offset += (*word_lengths)[j++];
1454
+ blob_it.forward ();
1455
+ }
1456
+ }
1457
+ /* Add any extra normal blobs to strings */
1458
+ while (j < word_lengths->length ()) {
1459
+ strncpy(new_str + i_offset, &(*word_str)[j_offset],
1460
+ (*word_lengths)[j]);
1461
+ new_lengths[i] = (*word_lengths)[j];
1462
+ new_map[i] = word->reject_map[j];
1463
+ i_offset += new_lengths[i++];
1464
+ j_offset += (*word_lengths)[j++];
1465
+ }
1466
+ new_str[i_offset] = '\0';
1467
+ new_lengths[i] = 0;
1468
+ /*
1469
+ tprintf(
1470
+ "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
1471
+ old_len, i, new_str, new_map );
1472
+ */
1473
+ ASSERT_HOST (i == blob_it.length ());
1474
+ ASSERT_HOST (i == old_len + rej_len);
1475
+ word->reject_map = new_map;
1476
+ *((STRING *) word_str) = new_str;
1477
+ *((STRING *) word_lengths) = new_lengths;
1478
+ new_len = word->best_choice->lengths ().length ();
1479
+ ASSERT_HOST (word->reject_map.length () == new_len);
1480
+ ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
1481
+ }