tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1775 @@
1
+ /**********************************************************************
2
+ * File: reject.cpp (Formerly reject.c)
3
+ * Description: Rejection functions used in tessedit
4
+ * Author: Phil Cheatle
5
+ * Created: Wed Sep 23 16:50:21 BST 1992
6
+ *
7
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include "tessvars.h"
22
+ #ifdef __UNIX__
23
+ #include <assert.h>
24
+ #include <errno.h>
25
+ #endif
26
+ #include "scanutils.h"
27
+ #include <ctype.h>
28
+ #include <string.h>
29
+ //#include "tessbox.h"
30
+ #include "memry.h"
31
+ #include "reject.h"
32
+ #include "tfacep.h"
33
+ #include "mainblk.h"
34
+ #include "charcut.h"
35
+ #include "imgs.h"
36
+ #include "scaleimg.h"
37
+ #include "control.h"
38
+ #include "docqual.h"
39
+ #include "secname.h"
40
+ #include "globals.h"
41
+
42
+ /* #define SECURE_NAMES done in secnames.h when necessary */
43
+
44
+ //extern "C" {
45
+ #include "callnet.h"
46
+ //}
47
+
48
+ #include "notdll.h"
49
+
50
+ CLISTIZEH (STRING) CLISTIZE (STRING)
51
+ #define EXTERN
52
+ EXTERN
53
+ INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
54
+ EXTERN
55
+ INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
56
+ EXTERN
57
+ BOOL_VAR (tessedit_use_nn, FALSE, "");
58
+ EXTERN
59
+ BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
60
+ EXTERN
61
+ BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
62
+ EXTERN
63
+ BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
64
+ EXTERN
65
+ double_VAR (tessedit_lower_flip_hyphen, 1.5,
66
+ "Aspect ratio dot/hyphen test");
67
+ EXTERN
68
+ double_VAR (tessedit_upper_flip_hyphen, 1.8,
69
+ "Aspect ratio dot/hyphen test");
70
+
71
+ EXTERN
72
+ BOOL_VAR (rej_trust_doc_dawg, FALSE,
73
+ "Use DOC dawg in 11l conf. detector");
74
+ EXTERN
75
+ BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
76
+ EXTERN
77
+ BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
78
+
79
+ EXTERN
80
+ BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
81
+ EXTERN
82
+ BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
83
+ EXTERN
84
+ BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
85
+ EXTERN
86
+ BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
87
+ EXTERN
88
+ BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
89
+ EXTERN
90
+ BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
91
+ EXTERN
92
+ BOOL_VAR (nn_conf_double_check_dict, TRUE,
93
+ "Double check for confusions");
94
+ EXTERN
95
+ BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
96
+ EXTERN
97
+ BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
98
+ EXTERN
99
+ BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
100
+ EXTERN
101
+ BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
102
+ EXTERN
103
+ BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
104
+ EXTERN
105
+ BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
106
+ EXTERN
107
+ BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
108
+ "Require stronger NN match");
109
+ EXTERN
110
+ double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
111
+ EXTERN
112
+ INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
113
+ EXTERN
114
+ INT_VAR (nn_conf_initial_i_level, 3,
115
+ "NN accept initial Ii match level ");
116
+
117
+ EXTERN
118
+ BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
119
+ EXTERN
120
+ BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
121
+ EXTERN
122
+ BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
123
+ EXTERN
124
+ BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
125
+ EXTERN
126
+ BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
127
+ EXTERN
128
+ BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
129
+ EXTERN
130
+ BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
131
+ EXTERN
132
+ BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
133
+
134
+ EXTERN
135
+ double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
136
+ "if >this fract");
137
+ EXTERN
138
+ INT_VAR (rej_mostly_reject_mode, 1,
139
+ "0-never, 1-afterNN, 2-after new xht");
140
+ EXTERN
141
+ double_VAR (tessed_fullstop_aspect_ratio, 1.2,
142
+ "if >this fract then reject");
143
+
144
+ EXTERN
145
+ INT_VAR (net_image_width, 40, "NN input image width");
146
+ EXTERN
147
+ INT_VAR (net_image_height, 36, "NN input image height");
148
+ EXTERN
149
+ INT_VAR (net_image_x_height, 22, "NN input image x_height");
150
+ EXTERN
151
+ INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");
152
+
153
+ /*
154
+ Net input is assumed to have (net_image_width * net_image_height) input
155
+ units of image pixels, followed by 0, 1, or N units representing the
156
+ baseline position. 0 implies no baseline information. 1 implies a floating
157
+ point value. N implies a "gauge" of N units. For any char an initial set
158
+ of these are ON, the remainder OFF to indicate the "level" of the
159
+ baseline.
160
+
161
+ HOWEVER!!! NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A
162
+ DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules
163
+ generated by aspirin with HARD CODED CONSTANTS
164
+ */
165
+
166
+ EXTERN
167
+ INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
168
+
169
+ EXTERN
170
+ double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
171
+ EXTERN
172
+ double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
173
+
174
+ /* NOTE - ctoh doesn't handle "=" properly, hence \075 */
175
+ EXTERN
176
+ STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
177
+ "Allow NN to unrej");
178
+ EXTERN
179
+ STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
180
+ "Allow NN to unrej");
181
+ EXTERN
182
+ STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
183
+ EXTERN
184
+ STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
185
+ EXTERN
186
+ STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
187
+ EXTERN
188
+ STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
189
+ "Unreliable chars");
190
+ EXTERN
191
+ STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
192
+ "Unreliable chars");
193
+
194
+ EXTERN
195
+ INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
196
+
197
+ /*************************************************************************
198
+ * set_done()
199
+ *
200
+ * Set the done flag based on the word acceptability criteria
201
+ *************************************************************************/
202
+
203
+ void set_done( //set done flag
204
+ WERD_RES *word,
205
+ inT16 pass) {
206
+ /*
207
+ 0: Original heuristic used in Tesseract and Ray's prototype Resaljet
208
+ */
209
+ if (tessedit_ok_mode == 0) {
210
+ /* NOTE - done even if word contains some or all spaces !!! */
211
+ word->done = word->tess_accepted;
212
+ }
213
+ /*
214
+ 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
215
+ */
216
+ else if (tessedit_ok_mode == 1) {
217
+ word->done = word->tess_accepted &&
218
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
219
+
220
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
221
+ word->done = FALSE;
222
+ }
223
+ /*
224
+ 2: as 1 + only accept dict words or numerics in pass 1
225
+ */
226
+ else if (tessedit_ok_mode == 2) {
227
+ word->done = word->tess_accepted &&
228
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
229
+
230
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
231
+ word->done = FALSE;
232
+
233
+ if (word->done &&
234
+ (pass == 1) &&
235
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
236
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
237
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
238
+ (word->best_choice->permuter () != NUMBER_PERM)) {
239
+ #ifndef SECURE_NAMES
240
+ if (tessedit_rejection_debug)
241
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
242
+ word->best_choice->string ().string ());
243
+ #endif
244
+ word->done = FALSE;
245
+ }
246
+ }
247
+ /*
248
+ 3: as 2 + only accept dict words or numerics in pass 2 as well
249
+ */
250
+ else if (tessedit_ok_mode == 3) {
251
+ word->done = word->tess_accepted &&
252
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
253
+
254
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
255
+ word->done = FALSE;
256
+
257
+ if (word->done &&
258
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
259
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
260
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
261
+ (word->best_choice->permuter () != NUMBER_PERM)) {
262
+ #ifndef SECURE_NAMES
263
+ if (tessedit_rejection_debug)
264
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
265
+ word->best_choice->string ().string ());
266
+ #endif
267
+ word->done = FALSE;
268
+ }
269
+ }
270
+ /*
271
+ 4: as 2 + reject dict ambigs in pass 1
272
+ */
273
+ else if (tessedit_ok_mode == 4) {
274
+ word->done = word->tess_accepted &&
275
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
276
+
277
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
278
+ word->done = FALSE;
279
+
280
+ if (word->done &&
281
+ (pass == 1) &&
282
+ (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
283
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
284
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
285
+ (word->best_choice->permuter () != NUMBER_PERM)) ||
286
+ (test_ambig_word (word)))) {
287
+ #ifndef SECURE_NAMES
288
+ if (tessedit_rejection_debug)
289
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
290
+ word->best_choice->string ().string ());
291
+ #endif
292
+ word->done = FALSE;
293
+ }
294
+ }
295
+ /*
296
+ 5: as 3 + reject dict ambigs in both passes
297
+ */
298
+ else if (tessedit_ok_mode == 5) {
299
+ word->done = word->tess_accepted &&
300
+ (strchr (word->best_choice->string ().string (), ' ') == NULL);
301
+
302
+ if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
303
+ word->done = FALSE;
304
+
305
+ if (word->done &&
306
+ (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
307
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
308
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
309
+ (word->best_choice->permuter () != NUMBER_PERM)) ||
310
+ (test_ambig_word (word)))) {
311
+ #ifndef SECURE_NAMES
312
+ if (tessedit_rejection_debug)
313
+ tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
314
+ word->best_choice->string ().string ());
315
+ #endif
316
+ word->done = FALSE;
317
+ }
318
+ }
319
+
320
+ else {
321
+ tprintf ("BAD tessedit_ok_mode\n");
322
+ err_exit();
323
+ }
324
+ }
325
+
326
+
327
+ /*************************************************************************
328
+ * make_reject_map()
329
+ *
330
+ * Sets the done flag to indicate whether the resylt is acceptable.
331
+ *
332
+ * Sets a reject map for the word.
333
+ *************************************************************************/
334
+
335
+ void make_reject_map( //make rej map for wd //detailed results
336
+ WERD_RES *word,
337
+ BLOB_CHOICE_LIST_CLIST *blob_choices,
338
+ ROW *row,
339
+ inT16 pass //1st or 2nd?
340
+ ) {
341
+ int i;
342
+ int offset;
343
+
344
+ flip_0O(word);
345
+ check_debug_pt (word, -1); //For trap only
346
+ set_done(word, pass); //Set acceptance
347
+ word->reject_map.initialise (word->best_choice->lengths ().length ());
348
+ reject_blanks(word);
349
+ /*
350
+ 0: Rays original heuristic - the baseline
351
+ */
352
+ if (tessedit_reject_mode == 0) {
353
+ if (!word->done)
354
+ reject_poor_matches(word, blob_choices);
355
+ }
356
+ /*
357
+ 5: Reject I/1/l from words where there is no strong contextual confirmation;
358
+ the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
359
+ and the whole of any words which are very small
360
+ */
361
+ else if (tessedit_reject_mode == 5) {
362
+ if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
363
+ word->reject_map.rej_word_small_xht ();
364
+ else {
365
+ one_ell_conflict(word, TRUE);
366
+ /*
367
+ Originally the code here just used the done flag. Now I have duplicated
368
+ and unpacked the conditions for setting the done flag so that each
369
+ mechanism can be turned on or off independently. This works WITHOUT
370
+ affecting the done flag setting.
371
+ */
372
+ if (rej_use_tess_accepted && !word->tess_accepted)
373
+ word->reject_map.rej_word_not_tess_accepted ();
374
+
375
+ if (rej_use_tess_blanks &&
376
+ (strchr (word->best_choice->string ().string (), ' ') != NULL))
377
+ word->reject_map.rej_word_contains_blanks ();
378
+
379
+ if (rej_use_good_perm) {
380
+ if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
381
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
382
+ (word->best_choice->permuter () == USER_DAWG_PERM)) &&
383
+ (!rej_use_sensible_wd ||
384
+ (acceptable_word_string
385
+ (word->best_choice->string ().string (),
386
+ word->best_choice->lengths ().string ()) !=
387
+ AC_UNACCEPTABLE))) {
388
+ //PASSED TEST
389
+ }
390
+ else if (word->best_choice->permuter () == NUMBER_PERM) {
391
+ if (rej_alphas_in_number_perm) {
392
+ for (i = 0, offset = 0;
393
+ word->best_choice->string ()[offset] != '\0';
394
+ offset += word->best_choice->lengths()[i++]) {
395
+ if (word->reject_map[i].accepted () &&
396
+ unicharset.get_isalpha (word->best_choice->string ().string()
397
+ + offset,
398
+ word->best_choice->lengths()[i]))
399
+ word->reject_map[i].setrej_bad_permuter ();
400
+ //rej alpha
401
+ }
402
+ }
403
+ }
404
+ else {
405
+ word->reject_map.rej_word_bad_permuter ();
406
+ }
407
+ }
408
+
409
+ /* Ambig word rejection was here once !!*/
410
+
411
+ }
412
+ }
413
+ else {
414
+ tprintf ("BAD tessedit_reject_mode\n");
415
+ err_exit();
416
+ }
417
+
418
+ if (tessedit_image_border > -1)
419
+ reject_edge_blobs(word);
420
+
421
+ check_debug_pt (word, 10);
422
+ if (tessedit_rejection_debug) {
423
+ tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
424
+ tprintf ("Certainty: %f Rating: %f\n",
425
+ word->best_choice->certainty (), word->best_choice->rating ());
426
+ tprintf ("Dict word: %d\n",
427
+ dict_word (word->best_choice->string ().string ()));
428
+ }
429
+
430
+ /* Un-reject any rejected characters if NN permits */
431
+
432
+ if (tessedit_use_nn && (pass == 2) &&
433
+ word->reject_map.recoverable_rejects ())
434
+ nn_recover_rejects(word, row);
435
+ flip_hyphens(word);
436
+ check_debug_pt (word, 20);
437
+ }
438
+
439
+
440
+ void reject_blanks(WERD_RES *word) {
441
+ inT16 i;
442
+ inT16 offset;
443
+
444
+ for (i = 0, offset = 0; word->best_choice->string ()[offset] != '\0';
445
+ offset += word->best_choice->lengths ()[i], i += 1) {
446
+ if (word->best_choice->string ()[offset] == ' ')
447
+ //rej unrecognised blobs
448
+ word->reject_map[i].setrej_tess_failure ();
449
+ }
450
+ }
451
+
452
+
453
+ void reject_I_1_L(WERD_RES *word) {
454
+ inT16 i;
455
+ inT16 offset;
456
+
457
+ for (i = 0, offset = 0; word->best_choice->string ()[offset] != '\0';
458
+ offset += word->best_choice->lengths ()[i], i += 1) {
459
+ if (STRING (conflict_set_I_l_1).
460
+ contains (word->best_choice->string ()[offset])) {
461
+ //rej 1Il conflict
462
+ word->reject_map[i].setrej_1Il_conflict ();
463
+ }
464
+ }
465
+ }
466
+
467
+
468
+ void reject_poor_matches( //detailed results
469
+ WERD_RES *word,
470
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
471
+ float threshold;
472
+ inT16 i = 0;
473
+ inT16 offset = 0;
474
+ //super iterator
475
+ BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
476
+ BLOB_CHOICE_IT choice_it; //real iterator
477
+
478
+ #ifndef SECURE_NAMES
479
+ if (strlen (word->best_choice->lengths ().string ()) != list_it.length ()) {
480
+ tprintf
481
+ ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
482
+ word->best_choice->string ().string (),
483
+ strlen (word->best_choice->lengths ().string ()), list_it.length (),
484
+ word->outword->blob_list ()->length ());
485
+ }
486
+ #endif
487
+ ASSERT_HOST (strlen (word->best_choice->lengths ().string ()) ==
488
+ list_it.length ());
489
+ ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
490
+ threshold = compute_reject_threshold (blob_choices);
491
+
492
+ for (list_it.mark_cycle_pt ();
493
+ !list_it.cycled_list (); list_it.forward (), i++,
494
+ offset += word->best_choice->lengths ()[i]) {
495
+ /* NB - only compares the threshold against the TOP choice char in the
496
+ choices list for a blob !! - the selected one may be below the threshold */
497
+ choice_it.set_to_list (list_it.data ());
498
+ if ((word->best_choice->string ()[offset] == ' ') ||
499
+ (choice_it.length () == 0))
500
+ //rej unrecognised blobs
501
+ word->reject_map[i].setrej_tess_failure ();
502
+ else if (choice_it.data ()->certainty () < threshold)
503
+ //rej poor score blob
504
+ word->reject_map[i].setrej_poor_match ();
505
+ }
506
+ }
507
+
508
+
509
+ /**********************************************************************
510
+ * compute_reject_threshold
511
+ *
512
+ * Set a rejection threshold for this word.
513
+ * Initially this is a trivial function which looks for the largest
514
+ * gap in the certainty value.
515
+ **********************************************************************/
516
+
517
+ float compute_reject_threshold( //compute threshold //detailed results
518
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
519
+ inT16 index; //to ratings
520
+ inT16 blob_count; //no of blobs in word
521
+ inT16 ok_blob_count = 0; //non TESS rej blobs in word
522
+ float *ratings; //array of confidences
523
+ float threshold; //rejection threshold
524
+ float bestgap; //biggest gap
525
+ float gapstart; //bottom of gap
526
+ //super iterator
527
+ BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
528
+ BLOB_CHOICE_IT choice_it; //real iterator
529
+
530
+ blob_count = blob_choices->length ();
531
+ ratings = (float *) alloc_mem (blob_count * sizeof (float));
532
+ for (list_it.mark_cycle_pt (), index = 0;
533
+ !list_it.cycled_list (); list_it.forward (), index++) {
534
+ choice_it.set_to_list (list_it.data ());
535
+ if (choice_it.length () > 0) {
536
+ ratings[ok_blob_count] = choice_it.data ()->certainty ();
537
+ //get in an array
538
+ // tprintf("Rating[%d]=%c %g %g\n",
539
+ // index,choice_it.data()->char_class(),
540
+ // choice_it.data()->rating(),choice_it.data()->certainty());
541
+ ok_blob_count++;
542
+ }
543
+ }
544
+ ASSERT_HOST (index == blob_count);
545
+ qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
546
+ //sort them
547
+ bestgap = 0;
548
+ gapstart = ratings[0] - 1; //all reject if none better
549
+ if (ok_blob_count >= 3) {
550
+ for (index = 0; index < ok_blob_count - 1; index++) {
551
+ if (ratings[index + 1] - ratings[index] > bestgap) {
552
+ bestgap = ratings[index + 1] - ratings[index];
553
+ //find biggest
554
+ gapstart = ratings[index];
555
+ }
556
+ }
557
+ }
558
+ threshold = gapstart + bestgap / 2;
559
+ // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
560
+ // ratings[0],ratings[index],bestgap,threshold);
561
+
562
+ free_mem(ratings);
563
+ return threshold;
564
+ }
565
+
566
+
567
/**********************************************************************
 * sort_floats
 *
 * qsort comparison function for an array of floats (ascending order).
 *
 * arg1, arg2: pointers to the two floats being compared.
 * returns:    1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0
 *             (which includes the case where either value is a NaN).
 **********************************************************************/

int sort_floats(const void *arg1, const void *arg2) {
  // Keep the const qualifier: qsort hands out read-only element pointers.
  const float lhs = *(const float *) arg1;
  const float rhs = *(const float *) arg2;

  // Compare directly rather than via a difference, so the result never
  // depends on how the subtraction rounds or overflows.
  if (lhs > rhs)
    return 1;
  if (lhs < rhs)
    return -1;
  return 0;
}
586
+
587
+
588
+ /*************************************************************************
589
+ * reject_edge_blobs()
590
+ *
591
+ * If the word is perilously close to the edge of the image, reject those blobs
592
+ * in the word which are too close to the edge as they could be clipped.
593
+ *************************************************************************/
594
+
595
+ void reject_edge_blobs(WERD_RES *word) {
596
+ TBOX word_box = word->word->bounding_box ();
597
+ TBOX blob_box;
598
+ PBLOB_IT blob_it = word->outword->blob_list ();
599
+ //blobs
600
+ int blobindex = 0;
601
+ float centre;
602
+
603
+ if ((word_box.left () < tessedit_image_border) ||
604
+ (word_box.bottom () < tessedit_image_border) ||
605
+ (word_box.right () + tessedit_image_border >
606
+ page_image.get_xsize () - 1) ||
607
+ (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
608
+ ASSERT_HOST (word->reject_map.length () == blob_it.length ());
609
+ for (blobindex = 0, blob_it.mark_cycle_pt ();
610
+ !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
611
+ blob_box = blob_it.data ()->bounding_box ();
612
+ centre = (blob_box.left () + blob_box.right ()) / 2.0;
613
+ if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
614
+ (word->denorm.y (blob_box.bottom (), centre) <
615
+ tessedit_image_border) ||
616
+ (word->denorm.x (blob_box.right ()) + tessedit_image_border >
617
+ page_image.get_xsize () - 1) ||
618
+ (word->denorm.y (blob_box.top (), centre)
619
+ + tessedit_image_border > page_image.get_ysize () - 1)) {
620
+ word->reject_map[blobindex].setrej_edge_char ();
621
+ //close to edge
622
+ }
623
+ }
624
+ }
625
+ }
626
+
627
+
628
+ /**********************************************************************
629
+ * one_ell_conflict()
630
+ *
631
+ * Identify words where there is a potential I/l/1 error.
632
+ * - A bundle of contextual heuristics!
633
+ **********************************************************************/
634
+
635
+ BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
636
+ const char *word;
637
+ const char *lengths;
638
+ inT16 word_len; //its length
639
+ inT16 first_alphanum_index_;
640
+ inT16 first_alphanum_offset_;
641
+ inT16 i;
642
+ inT16 offset;
643
+ BOOL8 non_conflict_set_char; //non conf set a/n?
644
+ BOOL8 conflict = FALSE;
645
+ BOOL8 allow_1s;
646
+ ACCEPTABLE_WERD_TYPE word_type;
647
+ BOOL8 dict_perm_type;
648
+ BOOL8 dict_word_ok;
649
+ int dict_word_type;
650
+
651
+ word = word_res->best_choice->string ().string ();
652
+ lengths = word_res->best_choice->lengths().string();
653
+ word_len = strlen (lengths);
654
+ /*
655
+ If there are no occurrences of the conflict set characters then the word
656
+ is OK.
657
+ */
658
+ if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
659
+ return FALSE;
660
+
661
+ /*
662
+ There is a conflict if there are NO other (confirmed) alphanumerics apart
663
+ from those in the conflict set.
664
+ */
665
+
666
+ for (i = 0, offset = 0, non_conflict_set_char = FALSE;
667
+ (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
668
+ non_conflict_set_char =
669
+ (unicharset.get_isalpha(word + offset, lengths[i]) ||
670
+ unicharset.get_isdigit(word + offset, lengths[i])) &&
671
+ !STRING (conflict_set_I_l_1).contains (word[offset]);
672
+ if (!non_conflict_set_char) {
673
+ if (update_map)
674
+ reject_I_1_L(word_res);
675
+ return TRUE;
676
+ }
677
+
678
+ /*
679
+ If the word is accepted by a dawg permuter, and the first alpha character
680
+ is "I" or "l", check to see if the alternative is also a dawg word. If it
681
+ is, then there is a potential error otherwise the word is ok.
682
+ */
683
+
684
+ dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
685
+ (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
686
+ (rej_trust_doc_dawg &&
687
+ (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
688
+ (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
689
+ dict_word_type = dict_word (word);
690
+ dict_word_ok = (dict_word_type > 0) &&
691
+ (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
692
+
693
+ if ((rej_1Il_use_dict_word && dict_word_ok) ||
694
+ (rej_1Il_trust_permuter_type && dict_perm_type) ||
695
+ (dict_perm_type && dict_word_ok)) {
696
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
697
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
698
+ if (lengths[first_alphanum_index_] == 1 &&
699
+ word[first_alphanum_offset_] == 'I') {
700
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
701
+ if (safe_dict_word (word) > 0) {
702
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
703
+ if (update_map)
704
+ word_res->reject_map[first_alphanum_index_].
705
+ setrej_1Il_conflict();
706
+ return TRUE;
707
+ }
708
+ else {
709
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
710
+ return FALSE;
711
+ }
712
+ }
713
+
714
+ if (lengths[first_alphanum_index_] == 1 &&
715
+ word[first_alphanum_offset_] == 'l') {
716
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
717
+ if (safe_dict_word (word) > 0) {
718
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
719
+ if (update_map)
720
+ word_res->reject_map[first_alphanum_index_].
721
+ setrej_1Il_conflict();
722
+ return TRUE;
723
+ }
724
+ else {
725
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
726
+ return FALSE;
727
+ }
728
+ }
729
+ return FALSE;
730
+ }
731
+
732
+ /*
733
+ NEW 1Il code. The old code relied on permuter types too much. In fact,
734
+ tess will use TOP_CHOICE permute for good things like "palette".
735
+ In this code the string is examined independently to see if it looks like
736
+ a well formed word.
737
+ */
738
+
739
+ /*
740
+ REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
741
+ dictionary word.
742
+ */
743
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
744
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
745
+ if (lengths[first_alphanum_index_] == 1 &&
746
+ word[first_alphanum_offset_] == 'l') {
747
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
748
+ if (safe_dict_word (word) > 0)
749
+ return FALSE;
750
+ else
751
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
752
+ }
753
+ else if (lengths[first_alphanum_index_] == 1 &&
754
+ word[first_alphanum_offset_] == 'I') {
755
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'l';
756
+ if (safe_dict_word (word) > 0)
757
+ return FALSE;
758
+ else
759
+ word_res->best_choice->string ()[first_alphanum_offset_] = 'I';
760
+ }
761
+ /*
762
+ For strings containing digits:
763
+ If there are no alphas OR the numeric permuter liked the word,
764
+ reject any non 1 conflict chs
765
+ Else reject all conflict chs
766
+ */
767
+ if (word_contains_non_1_digit (word, lengths)) {
768
+ allow_1s = (alpha_count (word, lengths) == 0) ||
769
+ (word_res->best_choice->permuter () == NUMBER_PERM);
770
+
771
+ inT16 offset;
772
+ conflict = FALSE;
773
+ for (i = 0, offset = 0; word[offset] != '\0';
774
+ offset += word_res->best_choice->lengths ()[i++]) {
775
+ if ((!allow_1s || (word[offset] != '1')) &&
776
+ STRING (conflict_set_I_l_1).contains (word[offset])) {
777
+ if (update_map)
778
+ word_res->reject_map[i].setrej_1Il_conflict ();
779
+ conflict = TRUE;
780
+ }
781
+ }
782
+ return conflict;
783
+ }
784
+ /*
785
+ For anything else. See if it conforms to an acceptable word type. If so,
786
+ treat accordingly.
787
+ */
788
+ word_type = acceptable_word_string (word, lengths);
789
+ if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
790
+ first_alphanum_index_ = first_alphanum_index (word, lengths);
791
+ first_alphanum_offset_ = first_alphanum_offset (word, lengths);
792
+ if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
793
+ if (update_map)
794
+ word_res->reject_map[first_alphanum_index_].
795
+ setrej_1Il_conflict ();
796
+ return TRUE;
797
+ }
798
+ else
799
+ return FALSE;
800
+ }
801
+ else if (word_type == AC_UPPER_CASE) {
802
+ return FALSE;
803
+ }
804
+ else {
805
+ if (update_map)
806
+ reject_I_1_L(word_res);
807
+ return TRUE;
808
+ }
809
+ }
810
+
811
+
812
+ inT16 first_alphanum_index(const char *word,
813
+ const char *word_lengths) {
814
+ inT16 i;
815
+ inT16 offset;
816
+
817
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
818
+ if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
819
+ unicharset.get_isdigit(word + offset, word_lengths[i]))
820
+ return i;
821
+ }
822
+ return -1;
823
+ }
824
+
825
+ inT16 first_alphanum_offset(const char *word,
826
+ const char *word_lengths) {
827
+ inT16 i;
828
+ inT16 offset;
829
+
830
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
831
+ if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
832
+ unicharset.get_isdigit(word + offset, word_lengths[i]))
833
+ return offset;
834
+ }
835
+ return -1;
836
+ }
837
+
838
+ inT16 alpha_count(const char *word,
839
+ const char *word_lengths) {
840
+ inT16 i;
841
+ inT16 offset;
842
+ inT16 count = 0;
843
+
844
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
845
+ if (unicharset.get_isalpha (word + offset, word_lengths[i]))
846
+ count++;
847
+ }
848
+ return count;
849
+ }
850
+
851
+
852
+ BOOL8 word_contains_non_1_digit(const char *word,
853
+ const char *word_lengths) {
854
+ inT16 i;
855
+ inT16 offset;
856
+
857
+ for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
858
+ if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
859
+ (word_lengths[i] != 1 || word[offset] != '1'))
860
+ return TRUE;
861
+ }
862
+ return FALSE;
863
+ }
864
+
865
+
866
+ BOOL8 test_ambig_word( //test for ambiguity
867
+ WERD_RES *word) {
868
+ BOOL8 ambig = FALSE;
869
+
870
+ if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
871
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
872
+ (word->best_choice->permuter () == USER_DAWG_PERM)) {
873
+ ambig = !NoDangerousAmbig(word->best_choice->string().string(),
874
+ word->best_choice->lengths().string(),
875
+ NULL);
876
+ }
877
+ return ambig;
878
+ }
879
+
880
+
881
+ /*************************************************************************
882
+ * ambig_word()
883
+ *
884
+ * This is a recursive routine which tests the dictionary for all combinations
885
+ * of conflict set alternatives for characters in a given word.
886
+ *************************************************************************/
887
+
888
+ BOOL8 ambig_word( //original word
889
+ const char *start_word,
890
+ char *temp_word, //alterable copy
891
+ inT16 test_char_pos //idx to char to alter
892
+ ) {
893
+ const char *ambigs; //Ambiguities for char
894
+
895
+ if (*(temp_word + test_char_pos) == '\0') {
896
+ if (safe_dict_word (temp_word)) {
897
+ if (strcmp (start_word, temp_word) == 0)
898
+ return FALSE;
899
+ else
900
+ return TRUE;
901
+ }
902
+ else
903
+ return FALSE;
904
+ }
905
+ else {
906
+ ambigs = char_ambiguities (*(temp_word + test_char_pos));
907
+ if (ambigs == NULL)
908
+ return ambig_word (start_word, temp_word, test_char_pos + 1);
909
+ else {
910
+ while (*ambigs != '\0') {
911
+ *(temp_word + test_char_pos) = *ambigs++;
912
+ //test next ambiguity
913
+ if (ambig_word (start_word, temp_word, test_char_pos + 1))
914
+ return TRUE;
915
+ }
916
+ return FALSE;
917
+ }
918
+ }
919
+ }
920
+
921
+
922
+ /*************************************************************************
923
+ * char_ambiguities()
924
+ *
925
+ * Return a pointer to a string containing the full conflict set of characters
926
+ * which includes the specified character, if there is one. If the specified
927
+ * character is not a member of a conflict set, return NULL.
928
+ * (NOTE that a character is assumed to be a member of only ONE conflict set.)
929
+ *************************************************************************/
930
+
931
+ const char *char_ambiguities(char c) {
932
+ static STRING_CLIST conflict_sets;
933
+ static BOOL8 read_conflict_sets = FALSE;
934
+ STRING_C_IT cs_it(&conflict_sets);
935
+ const char *cs;
936
+ STRING cs_file_name;
937
+ FILE *cs_file;
938
+ char buff[1024];
939
+
940
+ if (!read_conflict_sets) {
941
+ cs_file_name = datadir + "confsets";
942
+ if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
943
+ CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
944
+ cs_file_name.string (), errno);
945
+ }
946
+ while (fscanf (cs_file, "%s", buff) == 1) {
947
+ cs_it.add_after_then_move (new STRING (buff));
948
+ }
949
+ read_conflict_sets = TRUE;
950
+ cs_it.move_to_first ();
951
+ if (tessedit_rejection_debug) {
952
+ for (cs_it.mark_cycle_pt ();
953
+ !cs_it.cycled_list (); cs_it.forward ()) {
954
+ tprintf ("\"%s\"\n", cs_it.data ()->string ());
955
+ }
956
+ }
957
+ }
958
+
959
+ cs_it.move_to_first ();
960
+ for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
961
+ cs = cs_it.data ()->string ();
962
+ if (strchr (cs, c) != NULL)
963
+ return cs;
964
+ }
965
+ return NULL;
966
+ }
967
+
968
+ #ifndef EMBEDDED
969
+ void test_ambigs(const char *word) {
970
+ char orig_word[80];
971
+ char temp_word[80];
972
+
973
+ if (strlen (word) > 80)
974
+ tprintf ("Ridiculously long word \"%s\"\n", word);
975
+ else {
976
+ strcpy(orig_word, word);
977
+ while (strlen (orig_word) > 0) {
978
+ strcpy(temp_word, orig_word);
979
+
980
+ #ifndef SECURE_NAMES
981
+ if (ambig_word (orig_word, temp_word, 0))
982
+ tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);
983
+ else
984
+ tprintf ("NO Ambiguities for \"%s\"\n", orig_word);
985
+ tprintf ("Next Word > ");
986
+ #endif
987
+ scanf ("%s", orig_word);
988
+ }
989
+ }
990
+ }
991
+ #endif
992
+
993
+ /*************************************************************************
994
+ * nn_recover_rejects()
995
+ * Generate the nn_reject_map - a copy of the current reject map, but dont
996
+ * reject previously rejected chars if the NN matcher agrees with the best
997
+ * choice.
998
+ *************************************************************************/
999
+
1000
+ void nn_recover_rejects(WERD_RES *word, ROW *row) {
1001
+ //copy for debug
1002
+ REJMAP old_map = word->reject_map;
1003
+ /*
1004
+ NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
1005
+ MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE
1006
+ REJECT CHARACTERS (Though initial use is when words are total rejects
1007
+ anyway).
1008
+ */
1009
+
1010
+ set_global_subsubloc_code(SUBSUBLOC_NN);
1011
+ nn_match_word(word, row);
1012
+
1013
+ if (no_unrej_1Il)
1014
+ dont_allow_1Il(word);
1015
+ if (no_unrej_dubious_chars)
1016
+ dont_allow_dubious_chars(word);
1017
+
1018
+ if (rej_mostly_reject_mode == 1)
1019
+ reject_mostly_rejects(word);
1020
+ /*
1021
+ IF there are no unrejected alphanumerics AND
1022
+ The word is not an acceptable single non alphanum char word AND
1023
+ The word is not an acceptable repeated non alphanum char word
1024
+ THEN Reject whole word
1025
+ */
1026
+ if (no_unrej_no_alphanum_wds &&
1027
+ (count_alphanums (word) < 1) &&
1028
+ !((word->best_choice->lengths ().length () == 1) &&
1029
+ STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->
1030
+ string ()[0]))
1031
+ && !repeated_nonalphanum_wd (word, row))
1032
+
1033
+ word->reject_map.rej_word_no_alphanums ();
1034
+
1035
+ #ifndef SECURE_NAMES
1036
+
1037
+ if (nn_debug) {
1038
+ tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());
1039
+ old_map.print (stdout);
1040
+ tprintf ("->");
1041
+ word->reject_map.print (stdout);
1042
+ tprintf ("\n");
1043
+ }
1044
+ #endif
1045
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
1046
+ }
1047
+
1048
+
1049
+ void nn_match_word( //Match a word
1050
+ WERD_RES *word,
1051
+ ROW *row) {
1052
+ PIXROW_LIST *pixrow_list;
1053
+ PIXROW_IT pixrow_it;
1054
+ IMAGELINE *imlines; //lines of the image
1055
+ TBOX pix_box; //box of imlines extent
1056
+ #ifndef GRAPHICS_DISABLED
1057
+ ScrollView* win = NULL;
1058
+ #endif
1059
+ IMAGE clip_image;
1060
+ IMAGE scaled_image;
1061
+ float baseline_pos;
1062
+ inT16 net_image_size;
1063
+ inT16 clip_image_size;
1064
+ WERD copy_outword; // copy to denorm
1065
+ inT16 i;
1066
+
1067
+ const char *word_string;
1068
+ const char *word_string_lengths;
1069
+ BOOL8 word_in_dict; //Tess wd in dict
1070
+ BOOL8 checked_dict_word; //Tess wd definitely in dict
1071
+ BOOL8 sensible_word; //OK char string
1072
+ BOOL8 centre; //Not at word end chs
1073
+ BOOL8 good_quality_word;
1074
+ inT16 char_quality;
1075
+ inT16 accepted_char_quality;
1076
+
1077
+ inT16 conf_level; //0:REJECT
1078
+ //1:DODGY ACCEPT
1079
+ //2:DICT ACCEPT
1080
+ //3:CLEAR ACCEPT
1081
+ inT16 first_alphanum_index_;
1082
+ inT16 first_alphanum_offset_;
1083
+
1084
+ word_string = word->best_choice->string ().string ();
1085
+ word_string_lengths = word->best_choice->lengths ().string ();
1086
+ first_alphanum_index_ = first_alphanum_index (word_string,
1087
+ word_string_lengths);
1088
+ first_alphanum_offset_ = first_alphanum_offset (word_string,
1089
+ word_string_lengths);
1090
+ word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
1091
+ (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
1092
+ (word->best_choice->permuter () == USER_DAWG_PERM));
1093
+ checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);
1094
+ sensible_word = acceptable_word_string (word_string, word_string_lengths) !=
1095
+ AC_UNACCEPTABLE;
1096
+
1097
+ word_char_quality(word, row, &char_quality, &accepted_char_quality);
1098
+ good_quality_word = word->best_choice->lengths ().length () == char_quality;
1099
+
1100
+ #ifndef SECURE_NAMES
1101
+ if (nn_reject_debug) {
1102
+ tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n",
1103
+ word_in_dict ? 'T' : 'F',
1104
+ checked_dict_word ? 'T' : 'F',
1105
+ sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
1106
+ }
1107
+ #endif
1108
+
1109
+ if (word->best_choice->lengths ().length () !=
1110
+ word->outword->blob_list ()->length ()) {
1111
+ #ifndef SECURE_NAMES
1112
+ tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n",
1113
+ word->best_choice->string ().string (),
1114
+ word->outword->blob_list ()->length ());
1115
+ #endif
1116
+ err_exit();
1117
+ }
1118
+
1119
+ copy_outword = *(word->outword);
1120
+ copy_outword.baseline_denormalise (&word->denorm);
1121
+ /*
1122
+ For each character, generate and match a new image, containing JUST the
1123
+ character we have clipped, centered in the image, on a white background.
1124
+ Note that we MUST have a square image so that we can scale it uniformly in
1125
+ x and y. We base the size on x_height as this can be found fairly reliably.
1126
+ */
1127
+ net_image_size = (net_image_width > net_image_height) ?
1128
+ net_image_width : net_image_height;
1129
+ clip_image_size = (inT16) floor (0.5 +
1130
+ net_image_size * word->x_height /
1131
+ net_image_x_height);
1132
+ if ((clip_image_size <= 1) || (net_image_size <= 1)) {
1133
+ return;
1134
+ }
1135
+
1136
+ /*
1137
+ Get the image of the word and the pix positions of each char
1138
+ */
1139
+ char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
1140
+ #ifndef GRAPHICS_DISABLED
1141
+ if (show_char_clipping) {
1142
+ win = display_clip_image (&copy_outword, page_image,
1143
+ pixrow_list, pix_box);
1144
+ }
1145
+ #endif
1146
+ pixrow_it.set_to_list (pixrow_list);
1147
+ pixrow_it.move_to_first ();
1148
+ for (pixrow_it.mark_cycle_pt (), i = 0;
1149
+ !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
1150
+ if (pixrow_it.data ()->
1151
+ bad_box (page_image.get_xsize (), page_image.get_ysize ()))
1152
+ continue;
1153
+ clip_image.create (clip_image_size, clip_image_size, 1);
1154
+ //make bin imge
1155
+ if (!copy_outword.flag (W_INVERSE))
1156
+ invert_image(&clip_image); //white background for black on white
1157
+ pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
1158
+ clip_image, baseline_pos);
1159
+ if (copy_outword.flag (W_INVERSE))
1160
+ invert_image(&clip_image); //invert white on black for scaling &NN
1161
+ scaled_image.create (net_image_size, net_image_size, 1);
1162
+ scale_image(clip_image, scaled_image);
1163
+ baseline_pos *= net_image_size / clip_image_size;
1164
+ //scale with im
1165
+ centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
1166
+
1167
+ conf_level = nn_match_char (scaled_image, baseline_pos,
1168
+ word_in_dict, checked_dict_word,
1169
+ sensible_word, centre,
1170
+ good_quality_word, word_string[i]);
1171
+ if (word->reject_map[i].recoverable ()) {
1172
+ if ((i == first_alphanum_index_) &&
1173
+ word_string_lengths[first_alphanum_index_] == 1 &&
1174
+ ((word_string[first_alphanum_offset_] == 'I') ||
1175
+ (word_string[first_alphanum_offset_] == 'i'))) {
1176
+ if (conf_level >= nn_conf_initial_i_level)
1177
+ word->reject_map[i].setrej_nn_accept ();
1178
+ //un-reject char
1179
+ }
1180
+ else if (conf_level > 0)
1181
+ //un-reject char
1182
+ word->reject_map[i].setrej_nn_accept ();
1183
+ }
1184
+ #ifndef GRAPHICS_DISABLED
1185
+ if (show_char_clipping)
1186
+ display_images(clip_image, scaled_image);
1187
+ #endif
1188
+ clip_image.destroy();
1189
+ scaled_image.destroy();
1190
+ }
1191
+
1192
+ delete[]imlines; // Free array of imlines
1193
+ delete pixrow_list;
1194
+
1195
+ #ifndef GRAPHICS_DISABLED
1196
+ if (show_char_clipping) {
1197
+ // destroy_window(win);
1198
+ // win->Destroy();
1199
+ delete win;
1200
+ }
1201
+ #endif
1202
+ }
1203
+
1204
+
1205
+ /*************************************************************************
1206
+ * nn_match_char()
1207
+ * Call Neural Net matcher to match a single character, given a scaled,
1208
+ * square image
1209
+ *************************************************************************/
1210
+
1211
+ inT16 nn_match_char( //of character
1212
+ IMAGE &scaled_image,
1213
+ float baseline_pos, //rel to scaled_image
1214
+ BOOL8 dict_word, //part of dict wd?
1215
+ BOOL8 checked_dict_word, //part of dict wd?
1216
+ BOOL8 sensible_word, //part acceptable str?
1217
+ BOOL8 centre, //not at word ends?
1218
+ BOOL8 good_quality_word, //initial segmentation
1219
+ char tess_ch //confirm this?
1220
+ ) {
1221
+ inT16 conf_level; //0..2
1222
+ inT32 row;
1223
+ inT32 col;
1224
+ inT32 y_size = scaled_image.get_ysize ();
1225
+ inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
1226
+ inT32 end_y = start_y - net_image_height + 1;
1227
+ IMAGELINE imline;
1228
+ float *input_vector;
1229
+ float *input_vec_ptr;
1230
+ char top;
1231
+ float top_score;
1232
+ char next;
1233
+ float next_score;
1234
+ inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
1235
+ inT16 j;
1236
+
1237
+ input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
1238
+ input_vec_ptr = input_vector;
1239
+
1240
+ invert_image(&scaled_image); //cos nns work better
1241
+ for (row = start_y; row >= end_y; row--) {
1242
+ scaled_image.fast_get_line (0, row, net_image_width, &imline);
1243
+ for (col = 0; col < net_image_width; col++)
1244
+ *input_vec_ptr++ = imline.pixels[col];
1245
+ }
1246
+ /*
1247
+ The bit map presented to the net may be shorter than the image, so shift
1248
+ the coord to be relative to the bitmap portion.
1249
+ */
1250
+ baseline_pos -= (y_size - net_image_height) / 2.0;
1251
+ /*
1252
+ Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
1253
+ This is represented to the net as a set of bl_nodes, an initial proportion
1254
+ of which are set to 1.0, indicating the level of the baseline. The
1255
+ remainder are 0.0
1256
+ */
1257
+
1258
+ if (baseline_pos < 0)
1259
+ baseline_pos = 0;
1260
+ else if (baseline_pos >= net_image_height)
1261
+ baseline_pos = net_image_height + 1;
1262
+ else
1263
+ baseline_pos = baseline_pos + 1;
1264
+ baseline_pos = baseline_pos / (net_image_height + 1);
1265
+
1266
+ if (net_bl_nodes > 0) {
1267
+ baseline_pos *= 1.7; //Use a wider range
1268
+ if (net_bl_nodes > 1) {
1269
+ /* Multi-node baseline representation */
1270
+ for (j = 0; j < net_bl_nodes; j++) {
1271
+ if (baseline_pos > ((float) j / net_bl_nodes))
1272
+ *input_vec_ptr++ = 1.0;
1273
+ else
1274
+ *input_vec_ptr++ = 0.0;
1275
+ }
1276
+ }
1277
+ else {
1278
+ /* Single node baseline */
1279
+ *input_vec_ptr++ = baseline_pos;
1280
+ }
1281
+ }
1282
+
1283
+ callnet(input_vector, &top, &top_score, &next, &next_score);
1284
+ conf_level = evaluate_net_match (top, top_score, next, next_score,
1285
+ tess_ch, dict_word, checked_dict_word,
1286
+ sensible_word, centre, good_quality_word);
1287
+ #ifndef SECURE_NAMES
1288
+ if (nn_reject_debug) {
1289
+ tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n",
1290
+ top, top_score, next, next_score, tess_ch, conf_level);
1291
+ }
1292
+ #endif
1293
+ free_mem(input_vector);
1294
+ return conf_level;
1295
+ }
1296
+
1297
+
1298
+ inT16 evaluate_net_match(char top,
1299
+ float top_score,
1300
+ char next,
1301
+ float next_score,
1302
+ char tess_ch,
1303
+ BOOL8 dict_word,
1304
+ BOOL8 checked_dict_word,
1305
+ BOOL8 sensible_word,
1306
+ BOOL8 centre,
1307
+ BOOL8 good_quality_word) {
1308
+ inT16 accept_level; //0 Very clearly matched
1309
+ //1 Clearly top
1310
+ //2 Top but poor match
1311
+ //3 Next & poor top match
1312
+ //4 Next but good top match
1313
+ //5 No chance
1314
+ BOOL8 good_top_choice;
1315
+ BOOL8 excellent_top_choice;
1316
+ BOOL8 confusion_match = FALSE;
1317
+ BOOL8 dodgy_char = !isalnum (tess_ch);
1318
+
1319
+ good_top_choice = (top_score > nn_reject_threshold) &&
1320
+ (nn_reject_head_and_shoulders * top_score > next_score);
1321
+
1322
+ excellent_top_choice = good_top_choice &&
1323
+ (top_score > nn_dodgy_char_threshold);
1324
+
1325
+ if (top == tess_ch) {
1326
+ if (excellent_top_choice)
1327
+ accept_level = 0;
1328
+ else if (good_top_choice)
1329
+ accept_level = 1; //Top correct and well matched
1330
+ else
1331
+ accept_level = 2; //Top correct but poor match
1332
+ }
1333
+ else if ((nn_conf_1Il &&
1334
+ STRING (conflict_set_I_l_1).contains (tess_ch) &&
1335
+ STRING (conflict_set_I_l_1).contains (top)) ||
1336
+ (nn_conf_hyphen &&
1337
+ STRING (conflict_set_hyphen).contains (tess_ch) &&
1338
+ STRING (conflict_set_hyphen).contains (top)) ||
1339
+ (nn_conf_Ss &&
1340
+ STRING (conflict_set_S_s).contains (tess_ch) &&
1341
+ STRING (conflict_set_S_s).contains (top))) {
1342
+ confusion_match = TRUE;
1343
+ if (good_top_choice)
1344
+ accept_level = 1; //Good top confusion
1345
+ else
1346
+ accept_level = 2; //Poor top confusion
1347
+ }
1348
+ else if ((nn_conf_1Il &&
1349
+ STRING (conflict_set_I_l_1).contains (tess_ch) &&
1350
+ STRING (conflict_set_I_l_1).contains (next)) ||
1351
+ (nn_conf_hyphen &&
1352
+ STRING (conflict_set_hyphen).contains (tess_ch) &&
1353
+ STRING (conflict_set_hyphen).contains (next)) ||
1354
+ (nn_conf_Ss &&
1355
+ STRING (conflict_set_S_s).contains (tess_ch) &&
1356
+ STRING (conflict_set_S_s).contains (next))) {
1357
+ confusion_match = TRUE;
1358
+ if (!good_top_choice)
1359
+ accept_level = 3; //Next confusion and top match dodgy
1360
+ else
1361
+ accept_level = 4; //Next confusion and good top match
1362
+ }
1363
+ else if (next == tess_ch) {
1364
+ if (!good_top_choice)
1365
+ accept_level = 3; //Next match and top match dodgy
1366
+ else
1367
+ accept_level = 4; //Next match and good top match
1368
+ }
1369
+ else
1370
+ accept_level = 5;
1371
+
1372
+ /* Could allow some match flexibility here sS$ etc */
1373
+
1374
+ /* Now set confirmation level according to how much we can believe the tess
1375
+ char. */
1376
+
1377
+ if ((accept_level == 0) && !confusion_match)
1378
+ return 3;
1379
+
1380
+ if ((accept_level <= 1) &&
1381
+ (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
1382
+ return 3;
1383
+
1384
+ if ((accept_level == 2) &&
1385
+ !confusion_match && !dodgy_char &&
1386
+ good_quality_word &&
1387
+ dict_word &&
1388
+ (checked_dict_word || !nn_double_check_dict) && sensible_word)
1389
+ return 2;
1390
+
1391
+ if (confusion_match &&
1392
+ (accept_level <= nn_conf_accept_level) &&
1393
+ (good_quality_word ||
1394
+ (!nn_conf_test_good_qual &&
1395
+ !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
1396
+ (dict_word || !nn_conf_test_dict) &&
1397
+ (checked_dict_word || !nn_conf_double_check_dict) &&
1398
+ (sensible_word || !nn_conf_test_sensible))
1399
+ return 1;
1400
+
1401
+ if (!confusion_match &&
1402
+ nn_lax &&
1403
+ (accept_level == 3) &&
1404
+ (good_quality_word || !nn_conf_test_good_qual) &&
1405
+ (dict_word || !nn_conf_test_dict) &&
1406
+ (sensible_word || !nn_conf_test_sensible))
1407
+ return 1;
1408
+ else
1409
+ return 0;
1410
+ }
1411
+
1412
+
1413
+ /*************************************************************************
1414
+ * dont_allow_dubious_chars()
1415
+ * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong
1416
+ * if adjacent to a reject.
1417
+ *************************************************************************/
1418
+ void dont_allow_dubious_chars(WERD_RES *word) {
1419
+ int i = 0;
1420
+ int offset = 0;
1421
+ int rej_pos;
1422
+ int word_len = word->reject_map.length ();
1423
+
1424
+ while (i < word_len) {
1425
+ /* Find next reject */
1426
+
1427
+ while ((i < word_len) && (word->reject_map[i].accepted ()))
1428
+ {
1429
+ offset += word->best_choice->lengths()[i];
1430
+ i++;
1431
+ }
1432
+
1433
+ if (i < word_len) {
1434
+ rej_pos = i;
1435
+
1436
+ /* Reject dubious chars to the left */
1437
+ i--;
1438
+ offset -= word->best_choice->lengths()[i];
1439
+ while ((i >= 0) &&
1440
+ STRING (dubious_chars_left_of_reject).contains (word->
1441
+ best_choice->
1442
+ string ()
1443
+ [offset])) {
1444
+ word->reject_map[i--].setrej_dubious ();
1445
+ offset -= word->best_choice->lengths()[i];
1446
+ }
1447
+
1448
+ /* Skip adjacent rejects */
1449
+
1450
+ for (i = rej_pos;
1451
+ (i < word_len) && (word->reject_map[i].rejected ());
1452
+ offset += word->best_choice->lengths()[i++]);
1453
+
1454
+ /* Reject dubious chars to the right */
1455
+
1456
+ while ((i < word_len) &&
1457
+ STRING (dubious_chars_right_of_reject).contains (word->
1458
+ best_choice->
1459
+ string ()
1460
+ [offset])) {
1461
+ offset += word->best_choice->lengths()[i];
1462
+ word->reject_map[i++].setrej_dubious ();
1463
+ }
1464
+ }
1465
+ }
1466
+ }
1467
+
1468
+
1469
+ /*************************************************************************
1470
+ * dont_allow_1Il()
1471
+ * Dont unreject LONE accepted 1Il conflict set chars
1472
+ *************************************************************************/
1473
+ void dont_allow_1Il(WERD_RES *word) {
1474
+ int i = 0;
1475
+ int offset;
1476
+ int word_len = word->reject_map.length ();
1477
+ const char *s = word->best_choice->string ().string ();
1478
+ const char *lengths = word->best_choice->lengths ().string ();
1479
+ BOOL8 accepted_1Il = FALSE;
1480
+
1481
+ for (i = 0, offset = 0; i < word_len;
1482
+ offset += word->best_choice->lengths()[i++]) {
1483
+ if (word->reject_map[i].accepted ()) {
1484
+ if (STRING (conflict_set_I_l_1).contains (s[offset]))
1485
+ accepted_1Il = TRUE;
1486
+ else {
1487
+ if (unicharset.get_isalpha (s + offset, lengths[i]) ||
1488
+ unicharset.get_isdigit (s + offset, lengths[i]))
1489
+ return; // >=1 non 1Il ch accepted
1490
+ }
1491
+ }
1492
+ }
1493
+ if (!accepted_1Il)
1494
+ return; //Nothing to worry about
1495
+
1496
+ for (i = 0, offset = 0; i < word_len;
1497
+ offset += word->best_choice->lengths()[i++]) {
1498
+ if (STRING (conflict_set_I_l_1).contains (s[offset]) &&
1499
+ word->reject_map[i].accepted ())
1500
+ word->reject_map[i].setrej_postNN_1Il ();
1501
+ }
1502
+ }
1503
+
1504
+
1505
+ inT16 count_alphanums( //how many alphanums
1506
+ WERD_RES *word) {
1507
+ int count = 0;
1508
+ int i;
1509
+ int offset;
1510
+
1511
+ for (i = 0, offset = 0; i < word->reject_map.length ();
1512
+ offset += word->best_choice->lengths()[i++]) {
1513
+ if ((word->reject_map[i].accepted ()) &&
1514
+ (unicharset.get_isalpha (word->best_choice->string ().string() + offset,
1515
+ word->best_choice->lengths ()[i]) ||
1516
+ unicharset.get_isdigit (word->best_choice->string ().string() + offset,
1517
+ word->best_choice->lengths ()[i])))
1518
+ count++;
1519
+ }
1520
+ return count;
1521
+ }
1522
+
1523
+
1524
+ void reject_mostly_rejects( //rej all if most rejectd
1525
+ WERD_RES *word) {
1526
+ /* Reject the whole of the word if the fraction of rejects exceeds a limit */
1527
+
1528
+ if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
1529
+ rej_whole_of_mostly_reject_word_fract)
1530
+ word->reject_map.rej_word_mostly_rej ();
1531
+ }
1532
+
1533
+
1534
+ BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
1535
+ inT16 char_quality;
1536
+ inT16 accepted_char_quality;
1537
+
1538
+ if (word->best_choice->lengths ().length () <= 1)
1539
+ return FALSE;
1540
+
1541
+ if (!STRING (ok_repeated_ch_non_alphanum_wds).
1542
+ contains (word->best_choice->string ()[0]))
1543
+ return FALSE;
1544
+
1545
+ if (!repeated_ch_string (word->best_choice->string ().string (),
1546
+ word->best_choice->lengths ().string ()))
1547
+ return FALSE;
1548
+
1549
+ word_char_quality(word, row, &char_quality, &accepted_char_quality);
1550
+
1551
+ if ((word->best_choice->lengths ().length () == char_quality) &&
1552
+ (char_quality == accepted_char_quality))
1553
+ return TRUE;
1554
+ else
1555
+ return FALSE;
1556
+ }
1557
+
1558
+
1559
+ BOOL8 repeated_ch_string(const char *rep_ch_str,
1560
+ const char *lengths) {
1561
+ UNICHAR_ID c;
1562
+
1563
+ if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
1564
+ return FALSE;
1565
+ }
1566
+
1567
+ c = unicharset.unichar_to_id(rep_ch_str, *lengths);
1568
+ rep_ch_str += *(lengths++);
1569
+ while (*rep_ch_str != '\0' &&
1570
+ unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {
1571
+ rep_ch_str++;
1572
+ }
1573
+ if (*rep_ch_str == '\0')
1574
+ return TRUE;
1575
+ return FALSE;
1576
+ }
1577
+
1578
+
1579
+ inT16 safe_dict_word(const char *s) {
1580
+ int dict_word_type;
1581
+
1582
+ dict_word_type = dict_word (s);
1583
+ if (dict_word_type == DOC_DAWG_PERM)
1584
+ return 0;
1585
+ else
1586
+ return dict_word_type;
1587
+ }
1588
+
1589
+
1590
+ void flip_hyphens(WERD_RES *word) {
1591
+ char *str = (char *) word->best_choice->string ().string ();
1592
+ int i = 0;
1593
+ int offset = 0;
1594
+ PBLOB_IT outword_it;
1595
+ int prev_right = -9999;
1596
+ int next_left;
1597
+ TBOX out_box;
1598
+ float aspect_ratio;
1599
+
1600
+ if (tessedit_lower_flip_hyphen <= 1)
1601
+ return;
1602
+
1603
+ outword_it.set_to_list (word->outword->blob_list ());
1604
+
1605
+ for (outword_it.mark_cycle_pt ();
1606
+ !outword_it.cycled_list (); outword_it.forward (),
1607
+ offset += word->best_choice->lengths()[i++]) {
1608
+ out_box = outword_it.data ()->bounding_box ();
1609
+ if (outword_it.at_last ())
1610
+ next_left = 9999;
1611
+ else
1612
+ next_left = outword_it.data_relative (1)->bounding_box ().left ();
1613
+ /*
1614
+ Dont touch small or touching blobs - it is too dangerous
1615
+ */
1616
+ if ((out_box.width () > 8 * word->denorm.scale ()) &&
1617
+ (out_box.left () > prev_right) && (out_box.right () < next_left)) {
1618
+ aspect_ratio = out_box.width () / (float) out_box.height ();
1619
+ if (str[offset] == '.') {
1620
+ if (aspect_ratio >= tessedit_upper_flip_hyphen) {
1621
+ /* Certain HYPHEN */
1622
+ str[offset] = '-';
1623
+ if (word->reject_map[i].rejected ())
1624
+ word->reject_map[i].setrej_hyphen_accept ();
1625
+ }
1626
+ if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
1627
+ word->reject_map[i].accepted ())
1628
+ //Suspected HYPHEN
1629
+ word->reject_map[i].setrej_hyphen ();
1630
+ }
1631
+ else if (str[offset] == '-') {
1632
+ if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
1633
+ (word->reject_map[i].rejected ()))
1634
+ word->reject_map[i].setrej_hyphen_accept ();
1635
+ //Certain HYPHEN
1636
+
1637
+ if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
1638
+ (word->reject_map[i].accepted ()))
1639
+ //Suspected HYPHEN
1640
+ word->reject_map[i].setrej_hyphen ();
1641
+ }
1642
+ }
1643
+ prev_right = out_box.right ();
1644
+ }
1645
+ }
1646
+
1647
+
1648
+ void flip_0O(WERD_RES *word) {  // Rewrites ambiguous '0' / 'O' characters of word's best-choice text in place, judged from the neighbouring characters.
1649
+ char *str = (char *) word->best_choice->string ().string ();  // text being edited; cast to a writable pointer because characters are overwritten below
1650
+ char *lengths = (char *) word->best_choice->lengths ().string ();  // lengths[i] is used as the byte count of character i in str
1651
+ int i;  // character index (into lengths)
1652
+ int offset;  // byte offset in str of character i
1653
+ PBLOB_IT outword_it;  // iterator over the blobs of word->outword
1654
+ TBOX out_box;  // bounding box of the blob under the iterator
1655
+
1656
+ if (!tessedit_flip_0O)  // the whole pass is skipped unless this flag is set
1657
+ return;
1658
+
1659
+ outword_it.set_to_list (word->outword->blob_list ());
1660
+
1661
+ for (i = 0, offset = 0, outword_it.mark_cycle_pt ();  // Pass 1: step through blobs and characters together (one blob per character is assumed here)
1662
+ !outword_it.cycled_list (); offset += lengths[i++], outword_it.forward ()) {
1663
+ if (unicharset.get_isupper (str + offset, lengths[i]) ||  // only upper-case letters and digits get the box check
1664
+ unicharset.get_isdigit (str + offset, lengths[i])) {
1665
+ out_box = outword_it.data ()->bounding_box ();
1666
+ if ((out_box.top () < bln_baseline_offset + bln_x_height) ||  // box top is below bln_baseline_offset + bln_x_height, or ...
1667
+ (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))  // ... box bottom is above bln_baseline_offset + bln_x_height / 4
1668
+ return; //Beware words with sub/superscripts: the whole word is left untouched
1669
+ }
1670
+ }
1671
+
1672
+ for (i = 1, offset = lengths[0]; str[offset] != '\0';  // Pass 2: starts at the second character because every rule reads the previous one
1673
+ offset += lengths[i++], outword_it.forward ()) {  // the iterator is still advanced, but no blob is read in this pass
1674
+ if (lengths[i] == 1 &&  // candidate: a single-byte character that is ...
1675
+ ((str[offset] == '0') || (str[offset] == 'O'))) {  // ... either '0' or 'O'
1676
+ /* A0A: a non-O upper-case letter on each side -> 'O' */
1677
+ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1678
+ lengths[i + 1] > 0 &&  // a following character exists
1679
+ non_O_upper (str + offset + lengths[i], lengths[i + 1])) {
1680
+ str[offset] = 'O';
1681
+ }
1682
+ /* A00A: two 0/O in a row enclosed by non-O upper-case letters -> "OO" */
1683
+ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1684
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1685
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1686
+ lengths[i + 2] > 0 &&  // a character exists after the pair
1687
+ non_O_upper (str + offset + lengths[i] + lengths[i + 1],
1688
+ lengths[i + 2])) {
1689
+ str[offset] = 'O';
1690
+ str[offset + lengths[i]] = 'O';
1691
+ offset += lengths[i++];  // move onto the second character of the pair; the rules below now test that position
1692
+ }
1693
+ /* AA0<non digit or end of word>: two non-O upper-case letters before, and the next character is not a digit, 'l' or 'I' -> 'O' */
1694
+ if ((i > 1) &&  // needs two preceding characters
1695
+ non_O_upper (str + offset - lengths[i - 1] - lengths[i - 2],
1696
+ lengths[i - 2]) &&
1697
+ non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&
1698
+ lengths[i + 1] > 0 &&  // NOTE(review): this requires a following character, so the rule does not appear to fire at end of word despite the label above - confirm intent
1699
+ !unicharset.get_isdigit (str + offset + lengths[i], lengths[i + 1]) &&
1700
+ (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'l') &&
1701
+ (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'I')) {
1702
+ str[offset] = 'O';
1703
+ }
1704
+ /* 9O9: a digit other than 0 on each side -> '0' */
1705
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1706
+ lengths[i + 1] > 0 &&
1707
+ non_0_digit (str + offset + lengths[i], lengths[i + 1])) {
1708
+ str[offset] = '0';
1709
+ }
1710
+ /* 9OOO: a digit other than 0 followed by three 0/O characters -> "000" */
1711
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1712
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1713
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1714
+ ((lengths[i + 2] == 1 &&
1715
+ str[offset + lengths[i] + lengths[i + 1]] == '0') ||
1716
+ (lengths[i + 2] == 1 &&
1717
+ str[offset + lengths[i] + lengths[i + 1]] == 'O'))) {
1718
+ str[offset] = '0';
1719
+ str[offset + lengths[i]] = '0';
1720
+ str[offset + lengths[i] + lengths[i + 1]] = '0';
1721
+ offset += lengths[i++];  // advance two characters ...
1722
+ offset += lengths[i++];  // ... so the current position is the last of the three
1723
+ }
1724
+ /* 9OO<non upper>: a digit other than 0, then two 0/O, then a character that is not upper case -> "00" */
1725
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1726
+ ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||
1727
+ (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&
1728
+ lengths[i + 2] > 0 &&
1729
+ !unicharset.get_isupper (str + offset + lengths[i] + lengths[i + 1],
1730
+ lengths[i + 2])) {
1731
+ str[offset] = '0';
1732
+ str[offset + lengths[i]] = '0';
1733
+ offset += lengths[i++];  // move onto the second character of the pair
1734
+ }
1735
+ /* 9O<non upper>: a digit other than 0 before, a character that is not upper case after -> '0' */
1736
+ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&
1737
+ lengths[i + 1] > 0 &&
1738
+ !unicharset.get_isupper (str + offset + lengths[i], lengths[i + 1])) {
1739
+ str[offset] = '0';
1740
+ }
1741
+ /* 9[.,]OOO.. : after a '.' or ',' that itself follows a digit or an 'O', the whole run of 0/O becomes zeros */
1742
+ if ((i > 1) &&  // needs two preceding characters
1743
+ ((lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == '.') ||
1744
+ (lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == ',')) &&
1745
+ (unicharset.get_isdigit (str + offset -
1746
+ lengths[i - 1] - lengths[i - 2],
1747
+ lengths[i - 2]) ||
1748
+ (lengths[i - 2] == 1 &&
1749
+ str[offset - lengths[i - 1] - lengths[i - 2]] == 'O'))) {
1750
+ if (lengths[i - 2] == 1 &&  // an 'O' in front of the separator is turned into '0' as well
1751
+ str[offset - lengths[i - 1] - lengths[i - 2]] == 'O')
1752
+ str[offset - lengths[i - 1] - lengths[i - 2]] = '0';
1753
+ while (lengths[i] == 1 &&  // zero every consecutive single-byte 0/O from the current position onwards
1754
+ ((str[offset] == 'O') || (str[offset] == '0'))) {
1755
+ str[offset] = '0';
1756
+ offset += lengths[i++];
1757
+ }
1758
+ i--;  // step back one character so that the for-loop increment resumes just past the run
1759
+ offset -= lengths[i];
1760
+ }
1761
+ }
1762
+ }
1763
+ }
1764
+
1765
+
1766
+ BOOL8 non_O_upper(const char* str, int length) {  // Non-zero when the length-byte character at str is upper case (per unicharset) and is not "O".
1767
+ return unicharset.get_isupper (str, length) &&  // && short-circuits: the id lookup below only runs for upper-case input
1768
+ (!unicharset.eq(unicharset.unichar_to_id(str, length), "O"));  // presumably eq() tests whether that id is the unichar "O" - confirm against UNICHARSET
1769
+ }
1770
+
1771
+
1772
+ BOOL8 non_0_digit(const char* str, int length) {  // Non-zero when the length-byte character at str is a digit (per unicharset) and is not "0".
1773
+ return unicharset.get_isdigit (str, length) &&  // && short-circuits: the id lookup below only runs for digit input
1774
+ (!unicharset.eq(unicharset.unichar_to_id(str, length), "0"));  // presumably eq() tests whether that id is the unichar "0" - confirm against UNICHARSET
1775
+ }