tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1775 @@
1
+ /**********************************************************************
2
+ * File: reject.cpp (Formerly reject.c)
3
+ * Description: Rejection functions used in tessedit
4
+ * Author: Phil Cheatle
5
+ * Created: Wed Sep 23 16:50:21 BST 1992
6
+ *
7
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include "tessvars.h"
22
+ #ifdef __UNIX__
23
+ #include <assert.h>
24
+ #include <errno.h>
25
+ #endif
26
+ #include "scanutils.h"
27
+ #include <ctype.h>
28
+ #include <string.h>
29
+ //#include "tessbox.h"
30
+ #include "memry.h"
31
+ #include "reject.h"
32
+ #include "tfacep.h"
33
+ #include "mainblk.h"
34
+ #include "charcut.h"
35
+ #include "imgs.h"
36
+ #include "scaleimg.h"
37
+ #include "control.h"
38
+ #include "docqual.h"
39
+ #include "secname.h"
40
+ #include "globals.h"
41
+
42
+ /* #define SECURE_NAMES done in secnames.h when necessary */
43
+
44
+ //extern "C" {
45
+ #include "callnet.h"
46
+ //}
47
+
48
+ #include "notdll.h"
49
+
50
+ CLISTIZEH (STRING) CLISTIZE (STRING)
51
+ #define EXTERN
52
+ EXTERN
53
+ INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
54
+ EXTERN
55
+ INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
56
+ EXTERN
57
+ BOOL_VAR (tessedit_use_nn, FALSE, "");
58
+ EXTERN
59
+ BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
60
+ EXTERN
61
+ BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
62
+ EXTERN
63
+ BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
64
+ EXTERN
65
+ double_VAR (tessedit_lower_flip_hyphen, 1.5,
66
+ "Aspect ratio dot/hyphen test");
67
+ EXTERN
68
+ double_VAR (tessedit_upper_flip_hyphen, 1.8,
69
+ "Aspect ratio dot/hyphen test");
70
+
71
+ EXTERN
72
+ BOOL_VAR (rej_trust_doc_dawg, FALSE,
73
+ "Use DOC dawg in 11l conf. detector");
74
+ EXTERN
75
+ BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
76
+ EXTERN
77
+ BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
78
+
79
+ EXTERN
80
+ BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
81
+ EXTERN
82
+ BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
83
+ EXTERN
84
+ BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
85
+ EXTERN
86
+ BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
87
+ EXTERN
88
+ BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
89
+ EXTERN
90
+ BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
91
+ EXTERN
92
+ BOOL_VAR (nn_conf_double_check_dict, TRUE,
93
+ "Double check for confusions");
94
+ EXTERN
95
+ BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
96
+ EXTERN
97
+ BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
98
+ EXTERN
99
+ BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
100
+ EXTERN
101
+ BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
102
+ EXTERN
103
+ BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
104
+ EXTERN
105
+ BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
106
+ EXTERN
107
+ BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
108
+ "Require stronger NN match");
109
+ EXTERN
110
+ double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
111
+ EXTERN
112
+ INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
113
+ EXTERN
114
+ INT_VAR (nn_conf_initial_i_level, 3,
115
+ "NN accept initial Ii match level ");
116
+
117
+ EXTERN
118
+ BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
119
+ EXTERN
120
+ BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
121
+ EXTERN
122
+ BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
123
+ EXTERN
124
+ BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
125
+ EXTERN
126
+ BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
127
+ EXTERN
128
+ BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
129
+ EXTERN
130
+ BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
131
+ EXTERN
132
+ BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
133
+
134
+ EXTERN
135
+ double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
136
+ "if >this fract");
137
+ EXTERN
138
+ INT_VAR (rej_mostly_reject_mode, 1,
139
+ "0-never, 1-afterNN, 2-after new xht");
140
+ EXTERN
141
+ double_VAR (tessed_fullstop_aspect_ratio, 1.2,
142
+ "if >this fract then reject");
143
+
144
+ EXTERN
145
+ INT_VAR (net_image_width, 40, "NN input image width");
146
+ EXTERN
147
+ INT_VAR (net_image_height, 36, "NN input image height");
148
+ EXTERN
149
+ INT_VAR (net_image_x_height, 22, "NN input image x_height");
150
+ EXTERN
151
+ INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");
152
+
153
+ /*
154
+ Net input is assumed to have (net_image_width * net_image_height) input
155
+ units of image pixels, followed by 0, 1, or N units representing the
156
+ baseline position. 0 implies no baseline information. 1 implies a floating
157
+ point value. N implies a "gauge" of N units. For any char an initial set
158
+ of these are ON, the remainder OFF to indicate the "level" of the
159
+ baseline.
160
+
161
+ HOWEVER!!! NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A
162
+ DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules
163
+ generated by aspirin with HARD CODED CONSTANTS
164
+ */
165
+
166
+ EXTERN
167
+ INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
168
+
169
+ EXTERN
170
+ double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
171
+ EXTERN
172
+ double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
173
+
174
+ /* NOTE - ctoh doesn't handle "=" properly, hence \075 */
175
+ EXTERN
176
+ STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
177
+ "Allow NN to unrej");
178
+ EXTERN
179
+ STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
180
+ "Allow NN to unrej");
181
+ EXTERN
182
+ STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
183
+ EXTERN
184
+ STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
185
+ EXTERN
186
+ STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
187
+ EXTERN
188
+ STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
189
+ "Unreliable chars");
190
+ EXTERN
191
+ STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
192
+ "Unreliable chars");
193
+
194
+ EXTERN
195
+ INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
196
+
197
+ /*************************************************************************
198
+ * set_done()
199
+ *
200
+ * Set the done flag based on the word acceptability criteria
201
+ *************************************************************************/
202
+
203
+ void set_done( //set done flag
204
+ WERD_RES *word,
205
+ inT16 pass) {
206
+ /*
207
+ 0: Original heuristic used in Tesseract and Ray's prototype Resaljet
208
+ */
209
+ if (tessedit_ok_mode == 0) {
210
+ /* NOTE - done even if word contains some or all spaces !!! */
211
+ word->done = word->tess_accepted;
212
+ }
213
+ /*
214
+ 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
215
+ */
216
+ else if (tessedit_ok_mode == 1) {
217
+ word->done = word->tess_accepted &&
218
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
219
+
220
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
221
+ word->done = FALSE;
222
+ }
223
+ /*
224
+ 2: as 1 + only accept dict words or numerics in pass 1
225
+ */
226
+ else if (tessedit_ok_mode == 2) {
227
+ word->done = word->tess_accepted &&
228
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
229
+
230
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
231
+ word->done = FALSE;
232
+
233
+ if (word->done &&
234
+ (pass == 1) &&
235
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
236
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
237
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
238
+ (word->best_choice->permuter () != NUMBER_PERM)) {
239
+ #ifndef SECURE_NAMES
240
+ if (tessedit_rejection_debug)
241
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
242
+ word->best_choice->string ().string ());
243
+ #endif
244
+ word->done = FALSE;
245
+ }
246
+ }
247
+ /*
248
+ 3: as 2 + only accept dict words or numerics in pass 2 as well
249
+ */
250
+ else if (tessedit_ok_mode == 3) {
251
+ word->done = word->tess_accepted &&
252
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
253
+
254
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
255
+ word->done = FALSE;
256
+
257
+ if (word->done &&
258
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
259
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
260
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
261
+ (word->best_choice->permuter () != NUMBER_PERM)) {
262
+ #ifndef SECURE_NAMES
263
+ if (tessedit_rejection_debug)
264
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
265
+ word->best_choice->string ().string ());
266
+ #endif
267
+ word->done = FALSE;
268
+ }
269
+ }
270
+ /*
271
+ 4: as 2 + reject dict ambigs in pass 1
272
+ */
273
+ else if (tessedit_ok_mode == 4) {
274
+ word->done = word->tess_accepted &&
275
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
276
+
277
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
278
+ word->done = FALSE;
279
+
280
+ if (word->done &&
281
+ (pass == 1) &&
282
+ (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
283
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
284
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
285
+ (word->best_choice->permuter () != NUMBER_PERM)) ||
286
+ (test_ambig_word (word)))) {
287
+ #ifndef SECURE_NAMES
288
+ if (tessedit_rejection_debug)
289
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
290
+ word->best_choice->string ().string ());
291
+ #endif
292
+ word->done = FALSE;
293
+ }
294
+ }
295
+ /*
296
+ 5: as 3 + reject dict ambigs in both passes
297
+ */
298
+ else if (tessedit_ok_mode == 5) {
299
+ word->done = word->tess_accepted &&
300
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
301
+
302
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
303
+ word->done = FALSE;
304
+
305
+ if (word->done &&
306
+ (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
307
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
308
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
309
+ (word->best_choice->permuter () != NUMBER_PERM)) ||
310
+ (test_ambig_word (word)))) {
311
+ #ifndef SECURE_NAMES
312
+ if (tessedit_rejection_debug)
313
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
314
+ word->best_choice->string ().string ());
315
+ #endif
316
+ word->done = FALSE;
317
+ }
318
+ }
319
+
320
+ else {
321
+ tprintf ("BAD tessedit_ok_mode\n");
322
+ err_exit();
323
+ }
324
+ }
325
+
326
+
327
+ /*************************************************************************
328
+ * make_reject_map()
329
+ *
330
+ * Sets the done flag to indicate whether the resylt is acceptable.
331
+ *
332
+ * Sets a reject map for the word.
333
+ *************************************************************************/
334
+
335
+ void make_reject_map( //make rej map for wd //detailed results
336
+ WERD_RES *word,
337
+ BLOB_CHOICE_LIST_CLIST *blob_choices,
338
+ ROW *row,
339
+ inT16 pass //1st or 2nd?
340
+ ) {
341
+ int i;
342
+ int offset;
343
+
344
+ flip_0O(word);
345
+ check_debug_pt (word, -1); //For trap only
346
+ set_done(word, pass); //Set acceptance
347
+ word->reject_map.initialise (word->best_choice->lengths ().length ());
348
+ reject_blanks(word);
349
+ /*
350
+ 0: Rays original heuristic - the baseline
351
+ */
352
+ if (tessedit_reject_mode == 0) {
353
+ if (!word->done)
354
+ reject_poor_matches(word, blob_choices);
355
+ }
356
+ /*
357
+ 5: Reject I/1/l from words where there is no strong contextual confirmation;
358
+ the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
359
+ and the whole of any words which are very small
360
+ */
361
+ else if (tessedit_reject_mode == 5) {
362
+ if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
363
+ word->reject_map.rej_word_small_xht ();
364
+ else {
365
+ one_ell_conflict(word, TRUE);
366
+ /*
367
+ Originally the code here just used the done flag. Now I have duplicated
368
+ and unpacked the conditions for setting the done flag so that each
369
+ mechanism can be turned on or off independently. This works WITHOUT
370
+ affecting the done flag setting.
371
+ */
372
+ if (rej_use_tess_accepted && !word->tess_accepted)
373
+ word->reject_map.rej_word_not_tess_accepted ();
374
+
375
+ if (rej_use_tess_blanks &&
376
+ (strchr (word->best_choice->string ().string (), ' ') != NULL))
377
+ word->reject_map.rej_word_contains_blanks ();
378
+
379
+ if (rej_use_good_perm) {
380
+ if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
381
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
382
+ (word->best_choice->permuter () == USER_DAWG_PERM)) &&
383
+ (!rej_use_sensible_wd ||
384
+ (acceptable_word_string
385
+ (word->best_choice->string ().string (),
386
+ word->best_choice->lengths ().string ()) !=
387
+ AC_UNACCEPTABLE))) {
388
+ //PASSED TEST
389
+ }
390
+ else if (word->best_choice->permuter () == NUMBER_PERM) {
391
+ if (rej_alphas_in_number_perm) {
392
+ for (i = 0, offset = 0;
393
+ word->best_choice->string ()[offset] != '\0';
394
+ offset += word->best_choice->lengths()[i++]) {
395
+ if (word->reject_map[i].accepted () &&
396
+ unicharset.get_isalpha (word->best_choice->string ().string()
397
+ + offset,
398
+ word->best_choice->lengths()[i]))
399
+ word->reject_map[i].setrej_bad_permuter ();
400
+ //rej alpha
401
+ }
402
+ }
403
+ }
404
+ else {
405
+ word->reject_map.rej_word_bad_permuter ();
406
+ }
407
+ }
408
+
409
+ /* Ambig word rejection was here once !!*/
410
+
411
+ }
412
+ }
413
+ else {
414
+ tprintf ("BAD tessedit_reject_mode\n");
415
+ err_exit();
416
+ }
417
+
418
+ if (tessedit_image_border > -1)
419
+ reject_edge_blobs(word);
420
+
421
+ check_debug_pt (word, 10);
422
+ if (tessedit_rejection_debug) {
423
+ tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
424
+ tprintf ("Certainty: %f Rating: %f\n",
425
+ word->best_choice->certainty (), word->best_choice->rating ());
426
+ tprintf ("Dict word: %d\n",
427
+ dict_word (word->best_choice->string ().string ()));
428
+ }
429
+
430
+ /* Un-reject any rejected characters if NN permits */
431
+
432
+ if (tessedit_use_nn && (pass == 2) &&
433
+ word->reject_map.recoverable_rejects ())
434
+ nn_recover_rejects(word, row);
435
+ flip_hyphens(word);
436
+ check_debug_pt (word, 20);
437
+ }
438
+
439
+
440
+ void reject_blanks(WERD_RES *word) {
441
+ inT16 i;
442
+ inT16 offset;
443
+
444
+ for (i = 0, offset = 0; word->best_choice->string ()[offset] != '\0';
445
+ offset += word->best_choice->lengths ()[i], i += 1) {
446
+ if (word->best_choice->string ()[offset] == ' ')
447
+ //rej unrecognised blobs
448
+ word->reject_map[i].setrej_tess_failure ();
449
+ }
450
+ }
451
+
452
+
453
+ void reject_I_1_L(WERD_RES *word) {
454
+ inT16 i;
455
+ inT16 offset;
456
+
457
+ for (i = 0, offset = 0; word->best_choice->string ()[offset] != '\0';
458
+ offset += word->best_choice->lengths ()[i], i += 1) {
459
+ if (STRING (conflict_set_I_l_1).
460
+ contains (word->best_choice->string ()[offset])) {
461
+ //rej 1Il conflict
462
+ word->reject_map[i].setrej_1Il_conflict ();
463
+ }
464
+ }
465
+ }
466
+
467
+
468
+ void reject_poor_matches( //detailed results
469
+ WERD_RES *word,
470
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
471
+ float threshold;
472
+ inT16 i = 0;
473
+ inT16 offset = 0;
474
+ //super iterator
475
+ BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
476
+ BLOB_CHOICE_IT choice_it; //real iterator
477
+
478
+ #ifndef SECURE_NAMES
479
+ if (strlen (word->best_choice->lengths ().string ()) != list_it.length ()) {
480
+ tprintf
481
+ ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
482
+ word->best_choice->string ().string (),
483
+ strlen (word->best_choice->lengths ().string ()), list_it.length (),
484
+ word->outword->blob_list ()->length ());
485
+ }
486
+ #endif
487
+ ASSERT_HOST (strlen (word->best_choice->lengths ().string ()) ==
488
+ list_it.length ());
489
+ ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
490
+ threshold = compute_reject_threshold (blob_choices);
491
+
492
+ for (list_it.mark_cycle_pt ();
493
+ !list_it.cycled_list (); list_it.forward (), i++,
494
+ offset += word->best_choice->lengths ()[i]) {
495
+ /* NB - only compares the threshold against the TOP choice char in the
496
+ choices list for a blob !! - the selected one may be below the threshold */
497
+ choice_it.set_to_list (list_it.data ());
498
+ if ((word->best_choice->string ()[offset] == ' ') ||
499
+ (choice_it.length () == 0))
500
+ //rej unrecognised blobs
501
+ word->reject_map[i].setrej_tess_failure ();
502
+ else if (choice_it.data ()->certainty () < threshold)
503
+ //rej poor score blob
504
+ word->reject_map[i].setrej_poor_match ();
505
+ }
506
+ }
507
+
508
+
509
+ /**********************************************************************
510
+ * compute_reject_threshold
511
+ *
512
+ * Set a rejection threshold for this word.
513
+ * Initially this is a trivial function which looks for the largest
514
+ * gap in the certainty value.
515
+ **********************************************************************/
516
+
517
+ float compute_reject_threshold( //compute threshold //detailed results
518
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
519
+ inT16 index; //to ratings
520
+ inT16 blob_count; //no of blobs in word
521
+ inT16 ok_blob_count = 0; //non TESS rej blobs in word
522
+ float *ratings; //array of confidences
523
+ float threshold; //rejection threshold
524
+ float bestgap; //biggest gap
525
+ float gapstart; //bottom of gap
526
+ //super iterator
527
+ BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
528
+ BLOB_CHOICE_IT choice_it; //real iterator
529
+
530
+ blob_count = blob_choices->length ();
531
+ ratings = (float *) alloc_mem (blob_count * sizeof (float));
532
+ for (list_it.mark_cycle_pt (), index = 0;
533
+ !list_it.cycled_list (); list_it.forward (), index++) {
534
+ choice_it.set_to_list (list_it.data ());
535
+ if (choice_it.length () > 0) {
536
+ ratings[ok_blob_count] = choice_it.data ()->certainty ();
537
+ //get in an array
538
+ // tprintf("Rating[%d]=%c %g %g\n",
539
+ // index,choice_it.data()->char_class(),
540
+ // choice_it.data()->rating(),choice_it.data()->certainty());
541
+ ok_blob_count++;
542
+ }
543
+ }
544
+ ASSERT_HOST (index == blob_count);
545
+ qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
546
+ //sort them
547
+ bestgap = 0;
548
+ gapstart = ratings[0] - 1; //all reject if none better
549
+ if (ok_blob_count >= 3) {
550
+ for (index = 0; index < ok_blob_count - 1; index++) {
551
+ if (ratings[index + 1] - ratings[index] > bestgap) {
552
+ bestgap = ratings[index + 1] - ratings[index];
553
+ //find biggest
554
+ gapstart = ratings[index];
555
+ }
556
+ }
557
+ }
558
+ threshold = gapstart + bestgap / 2;
559
+ // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
560
+ // ratings[0],ratings[index],bestgap,threshold);
561
+
562
+ free_mem(ratings);
563
+ return threshold;
564
+ }
565
+
566
+
567
/**********************************************************************
 * sort_floats
 *
 * qsort comparison function for floats, giving ascending order.
 * Returns 1, -1 or 0 according to the sign of (*arg1 - *arg2).
 **********************************************************************/

int sort_floats(  //qsort function
                const void *arg1,  //ptrs to floats
                const void *arg2) {
  const float delta = *((const float *) arg1) - *((const float *) arg2);

  /* Sign of the difference: (1 - 0), (0 - 1) or (0 - 0). */
  return (delta > 0) - (delta < 0);
}
586
+
587
+
588
+ /*************************************************************************
589
+ * reject_edge_blobs()
590
+ *
591
+ * If the word is perilously close to the edge of the image, reject those blobs
592
+ * in the word which are too close to the edge as they could be clipped.
593
+ *************************************************************************/
594
+
595
+ void reject_edge_blobs(WERD_RES *word) {
596
+ TBOX word_box = word->word->bounding_box ();
597
+ TBOX blob_box;
598
+ PBLOB_IT blob_it = word->outword->blob_list ();
599
+ //blobs
600
+ int blobindex = 0;
601
+ float centre;
602
+
603
+ if ((word_box.left () < tessedit_image_border) ||
604
+ (word_box.bottom () < tessedit_image_border) ||
605
+ (word_box.right () + tessedit_image_border >
606
+ page_image.get_xsize () - 1) ||
607
+ (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
608
+ ASSERT_HOST (word->reject_map.length () == blob_it.length ());
609
+ for (blobindex = 0, blob_it.mark_cycle_pt ();
610
+ !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
611
+ blob_box = blob_it.data ()->bounding_box ();
612
+ centre = (blob_box.left () + blob_box.right ()) / 2.0;
613
+ if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
614
+ (word->denorm.y (blob_box.bottom (), centre) <
615
+ tessedit_image_border) ||
616
+ (word->denorm.x (blob_box.right ()) + tessedit_image_border >
617
+ page_image.get_xsize () - 1) ||
618
+ (word->denorm.y (blob_box.top (), centre)
619
+ + tessedit_image_border > page_image.get_ysize () - 1)) {
620
+ word->reject_map[blobindex].setrej_edge_char ();
621
+ //close to edge
622
+ }
623
+ }
624
+ }
625
+ }
626
+
627
+
628
+ /**********************************************************************
629
+ * one_ell_conflict()
630
+ *
631
+ * Identify words where there is a potential I/l/1 error.
632
+ * - A bundle of contextual heuristics!
633
+ **********************************************************************/
634
+
635
+ BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
636
+ const char *word;
637
+ const char *lengths;
638
+ inT16 word_len; //its length
639
+ inT16 first_alphanum_index_;
640
+ inT16 first_alphanum_offset_;
641
+ inT16 i;
642
+ inT16 offset;
643
+ BOOL8 non_conflict_set_char; //non conf set a/n?
644
+ BOOL8 conflict = FALSE;
645
+ BOOL8 allow_1s;
646
+ ACCEPTABLE_WERD_TYPE word_type;
647
+ BOOL8 dict_perm_type;
648
+ BOOL8 dict_word_ok;
649
+ int dict_word_type;
650
+
651
+ word = word_res->best_choice->string ().string ();
652
+ lengths = word_res->best_choice->lengths().string();
653
+ word_len = strlen (lengths);
654
+ /*
655
+ If there are no occurrences of the conflict set characters then the word
656
+ is OK.
657
+ */
658
+ if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
659
+ return FALSE;
660
+
661
+ /*
662
+ There is a conflict if there are NO other (confirmed) alphanumerics apart
663
+ from those in the conflict set.
664
+ */
665
+
666
+ for (i = 0, offset = 0, non_conflict_set_char = FALSE;
667
+ (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
668
+ non_conflict_set_char =
669
+ (unicharset.get_isalpha(word + offset, lengths[i]) ||
670
+ unicharset.get_isdigit(word + offset, lengths[i])) &&
671
+ !STRING (conflict_set_I_l_1).contains (word[offset]);
672
+ if (!non_conflict_set_char) {
673
+ if (update_map)
674
+ reject_I_1_L(word_res);
675
+ return TRUE;
676
+ }
677
+
678
+ /*
679
+ If the word is accepted by a dawg permuter, and the first alpha character
680
+ is "I" or "l", check to see if the alternative is also a dawg word. If it
681
+ is, then there is a potential error otherwise the word is ok.
682
+ */
683
+
684
+ dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
685
+ (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
686
+ (rej_trust_doc_dawg &&
687
+ (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
688
+ (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
689
+ dict_word_type = dict_word (word);
690
+ dict_word_ok = (dict_word_type > 0) &&
691
+ (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
692
+
693
+ if ((rej_1Il_use_dict_word && dict_word_ok) ||
694
+ (rej_1Il_trust_permuter_type && dict_perm_type) ||
695
+ (dict_perm_type && dict_word_ok)) {
696
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
697
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
698
+ if (lengths[first_alphanum_index_] == 1 &&
699
+ word[first_alphanum_offset_] == 'I') {
700
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
701
+ if (safe_dict_word (word) > 0) {
702
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
703
+ if (update_map)
704
+ word_res->reject_map[first_alphanum_index_].
705
+ setrej_1Il_conflict();
706
+ return TRUE;
707
+ }
708
+ else {
709
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
710
+ return FALSE;
711
+ }
712
+ }
713
+
714
+ if (lengths[first_alphanum_index_] == 1 &&
715
+ word[first_alphanum_offset_] == 'l') {
716
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
717
+ if (safe_dict_word (word) > 0) {
718
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
719
+ if (update_map)
720
+ word_res->reject_map[first_alphanum_index_].
721
+ setrej_1Il_conflict();
722
+ return TRUE;
723
+ }
724
+ else {
725
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
726
+ return FALSE;
727
+ }
728
+ }
729
+ return FALSE;
730
+ }
731
+
732
+ /*
733
+ NEW 1Il code. The old code relied on permuter types too much. In fact,
734
+ tess will use TOP_CHOICE permute for good things like "palette".
735
+ In this code the string is examined independently to see if it looks like
736
+ a well formed word.
737
+ */
738
+
739
+ /*
740
+ REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
741
+ dictionary word.
742
+ */
743
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
744
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
745
+ if (lengths[first_alphanum_index_] == 1 &&
746
+ word[first_alphanum_offset_] == 'l') {
747
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
748
+ if (safe_dict_word (word) > 0)
749
+ return FALSE;
750
+ else
751
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
752
+ }
753
+ else if (lengths[first_alphanum_index_] == 1 &&
754
+ word[first_alphanum_offset_] == 'I') {
755
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
756
+ if (safe_dict_word (word) > 0)
757
+ return FALSE;
758
+ else
759
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
760
+ }
761
+ /*
762
+ For strings containing digits:
763
+ If there are no alphas OR the numeric permuter liked the word,
764
+ reject any non 1 conflict chs
765
+ Else reject all conflict chs
766
+ */
767
+ if (word_contains_non_1_digit (word, lengths)) {
768
+ allow_1s = (alpha_count (word, lengths) == 0) ||
769
+ (word_res->best_choice->permuter () == NUMBER_PERM);
770
+
771
+ inT16 offset;
772
+ conflict = FALSE;
773
+ for (i = 0, offset = 0; word[offset] != '\0';
774
+ offset += word_res->best_choice->lengths ()[i++]) {
775
+ if ((!allow_1s || (word[offset] != '1')) &&
776
+ STRING (conflict_set_I_l_1).contains (word[offset])) {
777
+ if (update_map)
778
+ word_res->reject_map[i].setrej_1Il_conflict ();
779
+ conflict = TRUE;
780
+ }
781
+ }
782
+ return conflict;
783
+ }
784
+ /*
785
+ For anything else. See if it conforms to an acceptable word type. If so,
786
+ treat accordingly.
787
+ */
788
+ word_type = acceptable_word_string (word, lengths);
789
+ if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
790
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
791
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
792
+ if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
793
+ if (update_map)
794
+ word_res->reject_map[first_alphanum_index_].
795
+ setrej_1Il_conflict ();
796
+ return TRUE;
797
+ }
798
+ else
799
+ return FALSE;
800
+ }
801
+ else if (word_type == AC_UPPER_CASE) {
802
+ return FALSE;
803
+ }
804
+ else {
805
+ if (update_map)
806
+ reject_I_1_L(word_res);
807
+ return TRUE;
808
+ }
809
+ }
810
+
811
+
812
+ inT16 first_alphanum_index(const char *word,
813
+ const char *word_lengths) {
814
+ inT16 i;
815
+ inT16 offset;
816
+
817
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
818
+ if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
819
+ unicharset.get_isdigit(word + offset, word_lengths[i]))
820
+ return i;
821
+ }
822
+ return -1;
823
+ }
824
+
825
+ inT16 first_alphanum_offset(const char *word,
826
+ const char *word_lengths) {
827
+ inT16 i;
828
+ inT16 offset;
829
+
830
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
831
+ if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
832
+ unicharset.get_isdigit(word + offset, word_lengths[i]))
833
+ return offset;
834
+ }
835
+ return -1;
836
+ }
837
+
838
+ inT16 alpha_count(const char *word,
839
+ const char *word_lengths) {
840
+ inT16 i;
841
+ inT16 offset;
842
+ inT16 count = 0;
843
+
844
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
845
+ if (unicharset.get_isalpha (word + offset, word_lengths[i]))
846
+ count++;
847
+ }
848
+ return count;
849
+ }
850
+
851
+
852
+ BOOL8 word_contains_non_1_digit(const char *word,
853
+ const char *word_lengths) {
854
+ inT16 i;
855
+ inT16 offset;
856
+
857
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
858
+ if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
859
+ (word_lengths[i] != 1 || word[offset] != '1'))
860
+ return TRUE;
861
+ }
862
+ return FALSE;
863
+ }
864
+
865
+
866
+ BOOL8 test_ambig_word( //test for ambiguity
867
+ WERD_RES *word) {
868
+ BOOL8 ambig = FALSE;
869
+
870
+ if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
871
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
872
+ (word->best_choice->permuter () == USER_DAWG_PERM)) {
873
+ ambig = !NoDangerousAmbig(word->best_choice->string().string(),
874
+ word->best_choice->lengths().string(),
875
+ NULL);
876
+ }
877
+ return ambig;
878
+ }
879
+
880
+
881
+ /*************************************************************************
882
+ * ambig_word()
883
+ *
884
+ * This is a recursive routine which tests the dictionary for all combinations
885
+ * of conflict set alternatives for characters in a given word.
886
+ *************************************************************************/
887
+
888
+ BOOL8 ambig_word( //original word
889
+ const char *start_word,
890
+ char *temp_word, //alterable copy
891
+ inT16 test_char_pos //idx to char to alter
892
+ ) {
893
+ const char *ambigs; //Ambiguities for char
894
+
895
+ if (*(temp_word + test_char_pos) == '\0') {
896
+ if (safe_dict_word (temp_word)) {
897
+ if (strcmp (start_word, temp_word) == 0)
898
+ return FALSE;
899
+ else
900
+ return TRUE;
901
+ }
902
+ else
903
+ return FALSE;
904
+ }
905
+ else {
906
+ ambigs = char_ambiguities (*(temp_word + test_char_pos));
907
+ if (ambigs == NULL)
908
+ return ambig_word (start_word, temp_word, test_char_pos + 1);
909
+ else {
910
+ while (*ambigs != '\0') {
911
+ *(temp_word + test_char_pos) = *ambigs++;
912
+ //test next ambiguity
913
+ if (ambig_word (start_word, temp_word, test_char_pos + 1))
914
+ return TRUE;
915
+ }
916
+ return FALSE;
917
+ }
918
+ }
919
+ }
920
+
921
+
922
+ /*************************************************************************
923
+ * char_ambiguities()
924
+ *
925
+ * Return a pointer to a string containing the full conflict set of characters
926
+ * which includes the specified character, if there is one. If the specified
927
+ * character is not a member of a conflict set, return NULL.
928
+ * (NOTE that a character is assumed to be a member of only ONE conflict set.)
929
+ *************************************************************************/
930
+
931
+ const char *char_ambiguities(char c) {
932
+ static STRING_CLIST conflict_sets;
933
+ static BOOL8 read_conflict_sets = FALSE;
934
+ STRING_C_IT cs_it(&conflict_sets);
935
+ const char *cs;
936
+ STRING cs_file_name;
937
+ FILE *cs_file;
938
+ char buff[1024];
939
+
940
+ if (!read_conflict_sets) {
941
+ cs_file_name = datadir + "confsets";
942
+ if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
943
+ CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
944
+ cs_file_name.string (), errno);
945
+ }
946
+ while (fscanf (cs_file, "%s", buff) == 1) {
947
+ cs_it.add_after_then_move (new STRING (buff));
948
+ }
949
+ read_conflict_sets = TRUE;
950
+ cs_it.move_to_first ();
951
+ if (tessedit_rejection_debug) {
952
+ for (cs_it.mark_cycle_pt ();
953
+ !cs_it.cycled_list (); cs_it.forward ()) {
954
+ tprintf ("\"%s\"\n", cs_it.data ()->string ());
955
+ }
956
+ }
957
+ }
958
+
959
+ cs_it.move_to_first ();
960
+ for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
961
+ cs = cs_it.data ()->string ();
962
+ if (strchr (cs, c) != NULL)
963
+ return cs;
964
+ }
965
+ return NULL;
966
+ }
967
+
968
+ #ifndef EMBEDDED
969
+ void test_ambigs(const char *word) {
970
+ char orig_word[80];
971
+ char temp_word[80];
972
+
973
+ if (strlen (word) > 80)
974
+ tprintf ("Ridiculously long word \"%s\"\n", word);
975
+ else {
976
+ strcpy(orig_word, word);
977
+ while (strlen (orig_word) > 0) {
978
+ strcpy(temp_word, orig_word);
979
+
980
+ #ifndef SECURE_NAMES
981
+ if (ambig_word (orig_word, temp_word, 0))
982
+ tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);
983
+ else
984
+ tprintf ("NO Ambiguities for \"%s\"\n", orig_word);
985
+ tprintf ("Next Word > ");
986
+ #endif
987
+ scanf ("%s", orig_word);
988
+ }
989
+ }
990
+ }
991
+ #endif
992
+
993
+ /*************************************************************************
994
+ * nn_recover_rejects()
995
+ * Generate the nn_reject_map - a copy of the current reject map, but dont
996
+ * reject previously rejected chars if the NN matcher agrees with the best
997
+ * choice.
998
+ *************************************************************************/
999
+
1000
+ void nn_recover_rejects(WERD_RES *word, ROW *row) {
1001
+ //copy for debug
1002
+ REJMAP old_map = word->reject_map;
1003
+ /*
1004
+ NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
1005
+ MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE
1006
+ REJECT CHARACTERS (Though initial use is when words are total rejects
1007
+ anyway).
1008
+ */
1009
+
1010
+ set_global_subsubloc_code(SUBSUBLOC_NN);
1011
+ nn_match_word(word, row);
1012
+
1013
+ if (no_unrej_1Il)
1014
+ dont_allow_1Il(word);
1015
+ if (no_unrej_dubious_chars)
1016
+ dont_allow_dubious_chars(word);
1017
+
1018
+ if (rej_mostly_reject_mode == 1)
1019
+ reject_mostly_rejects(word);
1020
+ /*
1021
+ IF there are no unrejected alphanumerics AND
1022
+ The word is not an acceptable single non alphanum char word AND
1023
+ The word is not an acceptable repeated non alphanum char word
1024
+ THEN Reject whole word
1025
+ */
1026
+ if (no_unrej_no_alphanum_wds &&
1027
+ (count_alphanums (word) < 1) &&
1028
+ !((word->best_choice->lengths ().length () == 1) &&
1029
+ STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->
1030
+ string ()[0]))
1031
+ && !repeated_nonalphanum_wd (word, row))
1032
+
1033
+ word->reject_map.rej_word_no_alphanums ();
1034
+
1035
+ #ifndef SECURE_NAMES
1036
+
1037
+ if (nn_debug) {
1038
+ tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());
1039
+ old_map.print (stdout);
1040
+ tprintf ("->");
1041
+ word->reject_map.print (stdout);
1042
+ tprintf ("\n");
1043
+ }
1044
+ #endif
1045
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
1046
+ }
1047
+
1048
+
1049
+ void nn_match_word( //Match a word
1050
+ WERD_RES *word,
1051
+ ROW *row) {
1052
+ PIXROW_LIST *pixrow_list;
1053
+ PIXROW_IT pixrow_it;
1054
+ IMAGELINE *imlines; //lines of the image
1055
+ TBOX pix_box; //box of imlines extent
1056
+ #ifndef GRAPHICS_DISABLED
1057
+ ScrollView* win = NULL;
1058
+ #endif
1059
+ IMAGE clip_image;
1060
+ IMAGE scaled_image;
1061
+ float baseline_pos;
1062
+ inT16 net_image_size;
1063
+ inT16 clip_image_size;
1064
+ WERD copy_outword; // copy to denorm
1065
+ inT16 i;
1066
+
1067
+ const char *word_string;
1068
+ const char *word_string_lengths;
1069
+ BOOL8 word_in_dict; //Tess wd in dict
1070
+ BOOL8 checked_dict_word; //Tess wd definitely in dict
1071
+ BOOL8 sensible_word; //OK char string
1072
+ BOOL8 centre; //Not at word end chs
1073
+ BOOL8 good_quality_word;
1074
+ inT16 char_quality;
1075
+ inT16 accepted_char_quality;
1076
+
1077
+ inT16 conf_level; //0:REJECT
1078
+ //1:DODGY ACCEPT
1079
+ //2:DICT ACCEPT
1080
+ //3:CLEAR ACCEPT
1081
+ inT16 first_alphanum_index_;
1082
+ inT16 first_alphanum_offset_;
1083
+
1084
+ word_string = word->best_choice->string ().string ();
1085
+ word_string_lengths = word->best_choice->lengths ().string ();
1086
+ first_alphanum_index_ = first_alphanum_index (word_string,
1087
+ word_string_lengths);
1088
+ first_alphanum_offset_ = first_alphanum_offset (word_string,
1089
+ word_string_lengths);
1090
+ word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1091
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1092
+ (word->best_choice->permuter () == USER_DAWG_PERM));
1093
+ checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);
1094
+ sensible_word = acceptable_word_string (word_string, word_string_lengths) !=
1095
+ AC_UNACCEPTABLE;
1096
+
1097
+ word_char_quality(word, row, &char_quality, &accepted_char_quality);
1098
+ good_quality_word = word->best_choice->lengths ().length () == char_quality;
1099
+
1100
+ #ifndef SECURE_NAMES
1101
+ if (nn_reject_debug) {
1102
+ tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n",
1103
+ word_in_dict ? 'T' : 'F',
1104
+ checked_dict_word ? 'T' : 'F',
1105
+ sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
1106
+ }
1107
+ #endif
1108
+
1109
+ if (word->best_choice->lengths ().length () !=
1110
+ word->outword->blob_list ()->length ()) {
1111
+ #ifndef SECURE_NAMES
1112
+ tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n",
1113
+ word->best_choice->string ().string (),
1114
+ word->outword->blob_list ()->length ());
1115
+ #endif
1116
+ err_exit();
1117
+ }
1118
+
1119
+ copy_outword = *(word->outword);
1120
+ copy_outword.baseline_denormalise (&word->denorm);
1121
+ /*
1122
+ For each character, generate and match a new image, containing JUST the
1123
+ character we have clipped, centered in the image, on a white background.
1124
+ Note that we MUST have a square image so that we can scale it uniformly in
1125
+ x and y. We base the size on x_height as this can be found fairly reliably.
1126
+ */
1127
+ net_image_size = (net_image_width > net_image_height) ?
1128
+ net_image_width : net_image_height;
1129
+ clip_image_size = (inT16) floor (0.5 +
1130
+ net_image_size * word->x_height /
1131
+ net_image_x_height);
1132
+ if ((clip_image_size <= 1) || (net_image_size <= 1)) {
1133
+ return;
1134
+ }
1135
+
1136
+ /*
1137
+ Get the image of the word and the pix positions of each char
1138
+ */
1139
+ char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
1140
+ #ifndef GRAPHICS_DISABLED
1141
+ if (show_char_clipping) {
1142
+ win = display_clip_image (&copy_outword, page_image,
1143
+ pixrow_list, pix_box);
1144
+ }
1145
+ #endif
1146
+ pixrow_it.set_to_list (pixrow_list);
1147
+ pixrow_it.move_to_first ();
1148
+ for (pixrow_it.mark_cycle_pt (), i = 0;
1149
+ !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
1150
+ if (pixrow_it.data ()->
1151
+ bad_box (page_image.get_xsize (), page_image.get_ysize ()))
1152
+ continue;
1153
+ clip_image.create (clip_image_size, clip_image_size, 1);
1154
+ //make bin imge
1155
+ if (!copy_outword.flag (W_INVERSE))
1156
+ invert_image(&clip_image); //white background for black on white
1157
+ pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
1158
+ clip_image, baseline_pos);
1159
+ if (copy_outword.flag (W_INVERSE))
1160
+ invert_image(&clip_image); //invert white on black for scaling &NN
1161
+ scaled_image.create (net_image_size, net_image_size, 1);
1162
+ scale_image(clip_image, scaled_image);
1163
+ baseline_pos *= net_image_size / clip_image_size;
1164
+ //scale with im
1165
+ centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
1166
+
1167
+ conf_level = nn_match_char (scaled_image, baseline_pos,
1168
+ word_in_dict, checked_dict_word,
1169
+ sensible_word, centre,
1170
+ good_quality_word, word_string[i]);
1171
+ if (word->reject_map[i].recoverable ()) {
1172
+ if ((i == first_alphanum_index_) &&
1173
+ word_string_lengths[first_alphanum_index_] == 1 &&
1174
+ ((word_string[first_alphanum_offset_] == 'I') ||
1175
+ (word_string[first_alphanum_offset_] == 'i'))) {
1176
+ if (conf_level >= nn_conf_initial_i_level)
1177
+ word->reject_map[i].setrej_nn_accept ();
1178
+ //un-reject char
1179
+ }
1180
+ else if (conf_level > 0)
1181
+ //un-reject char
1182
+ word->reject_map[i].setrej_nn_accept ();
1183
+ }
1184
+ #ifndef GRAPHICS_DISABLED
1185
+ if (show_char_clipping)
1186
+ display_images(clip_image, scaled_image);
1187
+ #endif
1188
+ clip_image.destroy();
1189
+ scaled_image.destroy();
1190
+ }
1191
+
1192
+ delete[]imlines; // Free array of imlines
1193
+ delete pixrow_list;
1194
+
1195
+ #ifndef GRAPHICS_DISABLED
1196
+ if (show_char_clipping) {
1197
+ // destroy_window(win);
1198
+ // win->Destroy();
1199
+ delete win;
1200
+ }
1201
+ #endif
1202
+ }
1203
+
1204
+
1205
+ /*************************************************************************
1206
+ * nn_match_char()
1207
+ * Call Neural Net matcher to match a single character, given a scaled,
1208
+ * square image
1209
+ *************************************************************************/
1210
+
1211
+ inT16 nn_match_char( //of character
1212
+ IMAGE &scaled_image,
1213
+ float baseline_pos, //rel to scaled_image
1214
+ BOOL8 dict_word, //part of dict wd?
1215
+ BOOL8 checked_dict_word, //part of dict wd?
1216
+ BOOL8 sensible_word, //part acceptable str?
1217
+ BOOL8 centre, //not at word ends?
1218
+ BOOL8 good_quality_word, //initial segmentation
1219
+ char tess_ch //confirm this?
1220
+ ) {
1221
+ inT16 conf_level; //0..2
1222
+ inT32 row;
1223
+ inT32 col;
1224
+ inT32 y_size = scaled_image.get_ysize ();
1225
+ inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
1226
+ inT32 end_y = start_y - net_image_height + 1;
1227
+ IMAGELINE imline;
1228
+ float *input_vector;
1229
+ float *input_vec_ptr;
1230
+ char top;
1231
+ float top_score;
1232
+ char next;
1233
+ float next_score;
1234
+ inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
1235
+ inT16 j;
1236
+
1237
+ input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
1238
+ input_vec_ptr = input_vector;
1239
+
1240
+ invert_image(&scaled_image); //cos nns work better
1241
+ for (row = start_y; row >= end_y; row--) {
1242
+ scaled_image.fast_get_line (0, row, net_image_width, &imline);
1243
+ for (col = 0; col < net_image_width; col++)
1244
+ *input_vec_ptr++ = imline.pixels[col];
1245
+ }
1246
+ /*
1247
+ The bit map presented to the net may be shorter than the image, so shift
1248
+ the coord to be relative to the bitmap portion.
1249
+ */
1250
+ baseline_pos -= (y_size - net_image_height) / 2.0;
1251
+ /*
1252
+ Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
1253
+ This is represented to the net as a set of bl_nodes, an initial proportion
1254
+ of which are set to 1.0, indicating the level of the baseline. The
1255
+ remainder are 0.0
1256
+ */
1257
+
1258
+ if (baseline_pos < 0)
1259
+ baseline_pos = 0;
1260
+ else if (baseline_pos >= net_image_height)
1261
+ baseline_pos = net_image_height + 1;
1262
+ else
1263
+ baseline_pos = baseline_pos + 1;
1264
+ baseline_pos = baseline_pos / (net_image_height + 1);
1265
+
1266
+ if (net_bl_nodes > 0) {
1267
+ baseline_pos *= 1.7; //Use a wider range
1268
+ if (net_bl_nodes > 1) {
1269
+ /* Multi-node baseline representation */
1270
+ for (j = 0; j < net_bl_nodes; j++) {
1271
+ if (baseline_pos > ((float) j / net_bl_nodes))
1272
+ *input_vec_ptr++ = 1.0;
1273
+ else
1274
+ *input_vec_ptr++ = 0.0;
1275
+ }
1276
+ }
1277
+ else {
1278
+ /* Single node baseline */
1279
+ *input_vec_ptr++ = baseline_pos;
1280
+ }
1281
+ }
1282
+
1283
+ callnet(input_vector, &top, &top_score, &next, &next_score);
1284
+ conf_level = evaluate_net_match (top, top_score, next, next_score,
1285
+ tess_ch, dict_word, checked_dict_word,
1286
+ sensible_word, centre, good_quality_word);
1287
+ #ifndef SECURE_NAMES
1288
+ if (nn_reject_debug) {
1289
+ tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n",
1290
+ top, top_score, next, next_score, tess_ch, conf_level);
1291
+ }
1292
+ #endif
1293
+ free_mem(input_vector);
1294
+ return conf_level;
1295
+ }
1296
+
1297
+
1298
+ inT16 evaluate_net_match(char top,
1299
+ float top_score,
1300
+ char next,
1301
+ float next_score,
1302
+ char tess_ch,
1303
+ BOOL8 dict_word,
1304
+ BOOL8 checked_dict_word,
1305
+ BOOL8 sensible_word,
1306
+ BOOL8 centre,
1307
+ BOOL8 good_quality_word) {
1308
+ inT16 accept_level; //0 Very clearly matched
1309
+ //1 Clearly top
1310
+ //2 Top but poor match
1311
+ //3 Next & poor top match
1312
+ //4 Next but good top match
1313
+ //5 No chance
1314
+ BOOL8 good_top_choice;
1315
+ BOOL8 excellent_top_choice;
1316
+ BOOL8 confusion_match = FALSE;
1317
+ BOOL8 dodgy_char = !isalnum (tess_ch);
1318
+
1319
+ good_top_choice = (top_score > nn_reject_threshold) &&
1320
+ (nn_reject_head_and_shoulders * top_score > next_score);
1321
+
1322
+ excellent_top_choice = good_top_choice &&
1323
+ (top_score > nn_dodgy_char_threshold);
1324
+
1325
+ if (top == tess_ch) {
1326
+ if (excellent_top_choice)
1327
+ accept_level = 0;
1328
+ else if (good_top_choice)
1329
+ accept_level = 1; //Top correct and well matched
1330
+ else
1331
+ accept_level = 2; //Top correct but poor match
1332
+ }
1333
+ else if ((nn_conf_1Il &&
1334
+ STRING (conflict_set_I_l_1).contains (tess_ch) &&
1335
+ STRING (conflict_set_I_l_1).contains (top)) ||
1336
+ (nn_conf_hyphen &&
1337
+ STRING (conflict_set_hyphen).contains (tess_ch) &&
1338
+ STRING (conflict_set_hyphen).contains (top)) ||
1339
+ (nn_conf_Ss &&
1340
+ STRING (conflict_set_S_s).contains (tess_ch) &&
1341
+ STRING (conflict_set_S_s).contains (top))) {
1342
+ confusion_match = TRUE;
1343
+ if (good_top_choice)
1344
+ accept_level = 1; //Good top confusion
1345
+ else
1346
+ accept_level = 2; //Poor top confusion
1347
+ }
1348
+ else if ((nn_conf_1Il &&
1349
+ STRING (conflict_set_I_l_1).contains (tess_ch) &&
1350
+ STRING (conflict_set_I_l_1).contains (next)) ||
1351
+ (nn_conf_hyphen &&
1352
+ STRING (conflict_set_hyphen).contains (tess_ch) &&
1353
+ STRING (conflict_set_hyphen).contains (next)) ||
1354
+ (nn_conf_Ss &&
1355
+ STRING (conflict_set_S_s).contains (tess_ch) &&
1356
+ STRING (conflict_set_S_s).contains (next))) {
1357
+ confusion_match = TRUE;
1358
+ if (!good_top_choice)
1359
+ accept_level = 3; //Next confusion and top match dodgy
1360
+ else
1361
+ accept_level = 4; //Next confusion and good top match
1362
+ }
1363
+ else if (next == tess_ch) {
1364
+ if (!good_top_choice)
1365
+ accept_level = 3; //Next match and top match dodgy
1366
+ else
1367
+ accept_level = 4; //Next match and good top match
1368
+ }
1369
+ else
1370
+ accept_level = 5;
1371
+
1372
+ /* Could allow some match flexibility here sS$ etc */
1373
+
1374
+ /* Now set confirmation level according to how much we can believe the tess
1375
+ char. */
1376
+
1377
+ if ((accept_level == 0) && !confusion_match)
1378
+ return 3;
1379
+
1380
+ if ((accept_level <= 1) &&
1381
+ (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
1382
+ return 3;
1383
+
1384
+ if ((accept_level == 2) &&
1385
+ !confusion_match && !dodgy_char &&
1386
+ good_quality_word &&
1387
+ dict_word &&
1388
+ (checked_dict_word || !nn_double_check_dict) && sensible_word)
1389
+ return 2;
1390
+
1391
+ if (confusion_match &&
1392
+ (accept_level <= nn_conf_accept_level) &&
1393
+ (good_quality_word ||
1394
+ (!nn_conf_test_good_qual &&
1395
+ !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
1396
+ (dict_word || !nn_conf_test_dict) &&
1397
+ (checked_dict_word || !nn_conf_double_check_dict) &&
1398
+ (sensible_word || !nn_conf_test_sensible))
1399
+ return 1;
1400
+
1401
+ if (!confusion_match &&
1402
+ nn_lax &&
1403
+ (accept_level == 3) &&
1404
+ (good_quality_word || !nn_conf_test_good_qual) &&
1405
+ (dict_word || !nn_conf_test_dict) &&
1406
+ (sensible_word || !nn_conf_test_sensible))
1407
+ return 1;
1408
+ else
1409
+ return 0;
1410
+ }
1411
+
1412
+
1413
+ /*************************************************************************
1414
+ * dont_allow_dubious_chars()
1415
+ * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong
1416
+ * if adjacent to a reject.
1417
+ *************************************************************************/
1418
+ void dont_allow_dubious_chars(WERD_RES *word) {
1419
+ int i = 0;
1420
+ int offset = 0;
1421
+ int rej_pos;
1422
+ int word_len = word->reject_map.length ();
1423
+
1424
+ while (i < word_len) {
1425
+ /* Find next reject */
1426
+
1427
+ while ((i < word_len) && (word->reject_map[i].accepted ()))
1428
+ {
1429
+ offset += word->best_choice->lengths()[i];
1430
+ i++;
1431
+ }
1432
+
1433
+ if (i < word_len) {
1434
+ rej_pos = i;
1435
+
1436
+ /* Reject dubious chars to the left */
1437
+ i--;
1438
+ offset -= word->best_choice->lengths()[i];
1439
+ while ((i >= 0) &&
1440
+ STRING (dubious_chars_left_of_reject).contains (word->
1441
+ best_choice->
1442
+ string ()
1443
+ [offset])) {
1444
+ word->reject_map[i--].setrej_dubious ();
1445
+ offset -= word->best_choice->lengths()[i];
1446
+ }
1447
+
1448
+ /* Skip adjacent rejects */
1449
+
1450
+ for (i = rej_pos;
1451
+ (i < word_len) && (word->reject_map[i].rejected ());
1452
+ offset += word->best_choice->lengths()[i++]);
1453
+
1454
+ /* Reject dubious chars to the right */
1455
+
1456
+ while ((i < word_len) &&
1457
+ STRING (dubious_chars_right_of_reject).contains (word->
1458
+ best_choice->
1459
+ string ()
1460
+ [offset])) {
1461
+ offset += word->best_choice->lengths()[i];
1462
+ word->reject_map[i++].setrej_dubious ();
1463
+ }
1464
+ }
1465
+ }
1466
+ }
1467
+
1468
+
1469
+ /*************************************************************************
1470
+ * dont_allow_1Il()
1471
+ * Dont unreject LONE accepted 1Il conflict set chars
1472
+ *************************************************************************/
1473
+ void dont_allow_1Il(WERD_RES *word) {
1474
+ int i = 0;
1475
+ int offset;
1476
+ int word_len = word->reject_map.length ();
1477
+ const char *s = word->best_choice->string ().string ();
1478
+ const char *lengths = word->best_choice->lengths ().string ();
1479
+ BOOL8 accepted_1Il = FALSE;
1480
+
1481
+ for (i = 0, offset = 0; i < word_len;
1482
+ offset += word->best_choice->lengths()[i++]) {
1483
+ if (word->reject_map[i].accepted ()) {
1484
+ if (STRING (conflict_set_I_l_1).contains (s[offset]))
1485
+ accepted_1Il = TRUE;
1486
+ else {
1487
+ if (unicharset.get_isalpha (s + offset, lengths[i]) ||
1488
+ unicharset.get_isdigit (s + offset, lengths[i]))
1489
+ return; // >=1 non 1Il ch accepted
1490
+ }
1491
+ }
1492
+ }
1493
+ if (!accepted_1Il)
1494
+ return; //Nothing to worry about
1495
+
1496
+ for (i = 0, offset = 0; i < word_len;
1497
+ offset += word->best_choice->lengths()[i++]) {
1498
+ if (STRING (conflict_set_I_l_1).contains (s[offset]) &&
1499
+ word->reject_map[i].accepted ())
1500
+ word->reject_map[i].setrej_postNN_1Il ();
1501
+ }
1502
+ }
1503
+
1504
+
1505
+ inT16 count_alphanums( //how many alphanums
1506
+ WERD_RES *word) {
1507
+ int count = 0;
1508
+ int i;
1509
+ int offset;
1510
+
1511
+ for (i = 0, offset = 0; i < word->reject_map.length ();
1512
+ offset += word->best_choice->lengths()[i++]) {
1513
+ if ((word->reject_map[i].accepted ()) &&
1514
+ (unicharset.get_isalpha (word->best_choice->string ().string() + offset,
1515
+ word->best_choice->lengths ()[i]) ||
1516
+ unicharset.get_isdigit (word->best_choice->string ().string() + offset,
1517
+ word->best_choice->lengths ()[i])))
1518
+ count++;
1519
+ }
1520
+ return count;
1521
+ }
1522
+
1523
+
1524
+ void reject_mostly_rejects( //rej all if most rejectd
1525
+ WERD_RES *word) {
1526
+ /* Reject the whole of the word if the fraction of rejects exceeds a limit */
1527
+
1528
+ if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
1529
+ rej_whole_of_mostly_reject_word_fract)
1530
+ word->reject_map.rej_word_mostly_rej ();
1531
+ }
1532
+
1533
+
1534
+ BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
1535
+ inT16 char_quality;
1536
+ inT16 accepted_char_quality;
1537
+
1538
+ if (word->best_choice->lengths ().length () <= 1)
1539
+ return FALSE;
1540
+
1541
+ if (!STRING (ok_repeated_ch_non_alphanum_wds).
1542
+ contains (word->best_choice->string ()[0]))
1543
+ return FALSE;
1544
+
1545
+ if (!repeated_ch_string (word->best_choice->string ().string (),
1546
+ word->best_choice->lengths ().string ()))
1547
+ return FALSE;
1548
+
1549
+ word_char_quality(word, row, &char_quality, &accepted_char_quality);
1550
+
1551
+ if ((word->best_choice->lengths ().length () == char_quality) &&
1552
+ (char_quality == accepted_char_quality))
1553
+ return TRUE;
1554
+ else
1555
+ return FALSE;
1556
+ }
1557
+
1558
+
1559
+ BOOL8 repeated_ch_string(const char *rep_ch_str,
1560
+ const char *lengths) {
1561
+ UNICHAR_ID c;
1562
+
1563
+ if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
1564
+ return FALSE;
1565
+ }
1566
+
1567
+ c = unicharset.unichar_to_id(rep_ch_str, *lengths);
1568
+ rep_ch_str += *(lengths++);
1569
+ while (*rep_ch_str != '\0' &&
1570
+ unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {
1571
+ rep_ch_str++;
1572
+ }
1573
+ if (*rep_ch_str == '\0')
1574
+ return TRUE;
1575
+ return FALSE;
1576
+ }
1577
+
1578
+
1579
+ inT16 safe_dict_word(const char *s) {
1580
+ int dict_word_type;
1581
+
1582
+ dict_word_type = dict_word (s);
1583
+ if (dict_word_type == DOC_DAWG_PERM)
1584
+ return 0;
1585
+ else
1586
+ return dict_word_type;
1587
+ }
1588
+
1589
+
1590
+ void flip_hyphens(WERD_RES *word) {
1591
+ char *str = (char *) word->best_choice->string ().string ();
1592
+ int i = 0;
1593
+ int offset = 0;
1594
+ PBLOB_IT outword_it;
1595
+ int prev_right = -9999;
1596
+ int next_left;
1597
+ TBOX out_box;
1598
+ float aspect_ratio;
1599
+
1600
+ if (tessedit_lower_flip_hyphen <= 1)
1601
+ return;
1602
+
1603
+ outword_it.set_to_list (word->outword->blob_list ());
1604
+
1605
+ for (outword_it.mark_cycle_pt ();
1606
+ !outword_it.cycled_list (); outword_it.forward (),
1607
+ offset += word->best_choice->lengths()[i++]) {
1608
+ out_box = outword_it.data ()->bounding_box ();
1609
+ if (outword_it.at_last ())
1610
+ next_left = 9999;
1611
+ else
1612
+ next_left = outword_it.data_relative (1)->bounding_box ().left ();
1613
+ /*
1614
+ Dont touch small or touching blobs - it is too dangerous
1615
+ */
1616
+ if ((out_box.width () > 8 * word->denorm.scale ()) &&
1617
+ (out_box.left () > prev_right) && (out_box.right () < next_left)) {
1618
+ aspect_ratio = out_box.width () / (float) out_box.height ();
1619
+ if (str[offset] == '.') {
1620
+ if (aspect_ratio >= tessedit_upper_flip_hyphen) {
1621
+ /* Certain HYPHEN */
1622
+ str[offset] = '-';
1623
+ if (word->reject_map[i].rejected ())
1624
+ word->reject_map[i].setrej_hyphen_accept ();
1625
+ }
1626
+ if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
1627
+ word->reject_map[i].accepted ())
1628
+ //Suspected HYPHEN
1629
+ word->reject_map[i].setrej_hyphen ();
1630
+ }
1631
+ else if (str[offset] == '-') {
1632
+ if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
1633
+ (word->reject_map[i].rejected ()))
1634
+ word->reject_map[i].setrej_hyphen_accept ();
1635
+ //Certain HYPHEN
1636
+
1637
+ if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
1638
+ (word->reject_map[i].accepted ()))
1639
+ //Suspected HYPHEN
1640
+ word->reject_map[i].setrej_hyphen ();
1641
+ }
1642
+ }
1643
+ prev_right = out_box.right ();
1644
+ }
1645
+ }
1646
+
1647
+
1648
+ void flip_0O(WERD_RES *word) {
1649
+ char *str = (char *) word->best_choice->string ().string ();
1650
+ char *lengths = (char *) word->best_choice->lengths ().string ();
1651
+ int i;
1652
+ int offset;
1653
+ PBLOB_IT outword_it;
1654
+ TBOX out_box;
1655
+
1656
+ if (!tessedit_flip_0O)
1657
+ return;
1658
+
1659
+ outword_it.set_to_list (word->outword->blob_list ());
1660
+
1661
+ for (i = 0, offset = 0, outword_it.mark_cycle_pt ();
1662
+ !outword_it.cycled_list (); offset += lengths[i++], outword_it.forward ()) {
1663
+ if (unicharset.get_isupper (str + offset, lengths[i]) ||
1664
+ unicharset.get_isdigit (str + offset, lengths[i])) {
1665
+ out_box = outword_it.data ()->bounding_box ();
1666
+ if ((out_box.top () < bln_baseline_offset + bln_x_height) ||
1667
+ (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))
1668
+ return; //Beware words with sub/superscripts
1669
+ }
1670
+ }
1671
+
1672
+ for (i = 1, offset = lengths[0]; str[offset] != '\0';
1673
+ offset += lengths[i++], outword_it.forward ()) {
1674
+ if (lengths[i] == 1 &&
1675
+ ((str[offset] == '0') || (str[offset] == 'O'))) {
1676
+ /* A0A */
1677
+ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1678
+ lengths[i + 1] > 0 &&
1679
+ non_O_upper (str + offset + lengths[i], lengths[i + 1])) {
1680
+ str[offset] = 'O';
1681
+ }
1682
+ /* A00A */
1683
+ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1684
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1685
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1686
+ lengths[i + 2] > 0 &&
1687
+ non_O_upper (str + offset + lengths[i] + lengths[i + 1],
1688
+ lengths[i + 2])) {
1689
+ str[offset] = 'O';
1690
+ str[offset + lengths[i]] = 'O';
1691
+ offset += lengths[i++];
1692
+ }
1693
+ /* AA0<non digit or end of word> */
1694
+ if ((i > 1) &&
1695
+ non_O_upper (str + offset - lengths[i - 1] - lengths[i - 2],
1696
+ lengths[i - 2]) &&
1697
+ non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1698
+ lengths[i + 1] > 0 &&
1699
+ !unicharset.get_isdigit (str + offset + lengths[i], lengths[i + 1]) &&
1700
+ (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'l') &&
1701
+ (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'I')) {
1702
+ str[offset] = 'O';
1703
+ }
1704
+ /* 9O9 */
1705
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1706
+ lengths[i + 1] > 0 &&
1707
+ non_0_digit (str + offset + lengths[i], lengths[i + 1])) {
1708
+ str[offset] = '0';
1709
+ }
1710
+ /* 9OOO */
1711
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1712
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1713
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1714
+ ((lengths[i + 2] == 1 &&
1715
+ str[offset + lengths[i] + lengths[i + 1]] == '0') ||
1716
+ (lengths[i + 2] == 1 &&
1717
+ str[offset + lengths[i] + lengths[i + 1]] == 'O'))) {
1718
+ str[offset] = '0';
1719
+ str[offset + lengths[i]] = '0';
1720
+ str[offset + lengths[i] + lengths[i + 1]] = '0';
1721
+ offset += lengths[i++];
1722
+ offset += lengths[i++];
1723
+ }
1724
+ /* 9OO<non upper> */
1725
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1726
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1727
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1728
+ lengths[i + 2] > 0 &&
1729
+ !unicharset.get_isupper (str + offset + lengths[i] + lengths[i + 1],
1730
+ lengths[i + 2])) {
1731
+ str[offset] = '0';
1732
+ str[offset + lengths[i]] = '0';
1733
+ offset += lengths[i++];
1734
+ }
1735
+ /* 9O<non upper> */
1736
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1737
+ lengths[i + 1] > 0 &&
1738
+ !unicharset.get_isupper (str + offset + lengths[i], lengths[i + 1])) {
1739
+ str[offset] = '0';
1740
+ }
1741
+ /* 9[.,]OOO.. */
1742
+ if ((i > 1) &&
1743
+ ((lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == '.') ||
1744
+ (lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == ',')) &&
1745
+ (unicharset.get_isdigit (str + offset -
1746
+ lengths[i - 1] - lengths[i - 2],
1747
+ lengths[i - 2]) ||
1748
+ (lengths[i - 2] == 1 &&
1749
+ str[offset - lengths[i - 1] - lengths[i - 2]] == 'O'))) {
1750
+ if (lengths[i - 2] == 1 &&
1751
+ str[offset - lengths[i - 1] - lengths[i - 2]] == 'O')
1752
+ str[offset - lengths[i - 1] - lengths[i - 2]] = '0';
1753
+ while (lengths[i] == 1 &&
1754
+ ((str[offset] == 'O') || (str[offset] == '0'))) {
1755
+ str[offset] = '0';
1756
+ offset += lengths[i++];
1757
+ }
1758
+ i--;
1759
+ offset -= lengths[i];
1760
+ }
1761
+ }
1762
+ }
1763
+ }
1764
+
1765
+
1766
+ BOOL8 non_O_upper(const char* str, int length) {
1767
+ return unicharset.get_isupper (str, length) &&
1768
+ (!unicharset.eq(unicharset.unichar_to_id(str, length), "O"));
1769
+ }
1770
+
1771
+
1772
+ BOOL8 non_0_digit(const char* str, int length) {
1773
+ return unicharset.get_isdigit (str, length) &&
1774
+ (!unicharset.eq(unicharset.unichar_to_id(str, length), "0"));
1775
+ }