tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1842 @@
1
+ /******************************************************************
2
+ * File: control.cpp (Formerly control.c)
3
+ * Description: Module-independent matcher controller.
4
+ * Author: Ray Smith
5
+ * Created: Thu Apr 23 11:09:58 BST 1992
6
+ * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7
+ *
8
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
9
+ ** Licensed under the Apache License, Version 2.0 (the "License");
10
+ ** you may not use this file except in compliance with the License.
11
+ ** You may obtain a copy of the License at
12
+ ** http://www.apache.org/licenses/LICENSE-2.0
13
+ ** Unless required by applicable law or agreed to in writing, software
14
+ ** distributed under the License is distributed on an "AS IS" BASIS,
15
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ ** See the License for the specific language governing permissions and
17
+ ** limitations under the License.
18
+ *
19
+ **********************************************************************/
20
+
21
+ #include "mfcpch.h"
22
+ #include "mainblk.h"
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #ifdef __UNIX__
26
+ #include <assert.h>
27
+ #include <unistd.h>
28
+ #include <errno.h>
29
+ #endif
30
+ #include <ctype.h>
31
+ #include "ocrclass.h"
32
+ #include "werdit.h"
33
+ #include "drawfx.h"
34
+ #include "tfacep.h"
35
+ #include "tessbox.h"
36
+ #include "tessvars.h"
37
+ //#include "fxtop.h"
38
+ #include "pgedit.h"
39
+ #include "reject.h"
40
+ #include "adaptions.h"
41
+ #include "charcut.h"
42
+ #include "fixxht.h"
43
+ #include "fixspace.h"
44
+ #include "genblob.h"
45
+ #include "docqual.h"
46
+ #include "control.h"
47
+ #include "secname.h"
48
+ #include "output.h"
49
+ #include "callcpp.h"
50
+ #include "notdll.h"
51
+ #include "tordvars.h"
52
+ #include "adaptmatch.h"
53
+ #include "globals.h"
54
+
55
+ #define MIN_FONT_ROW_COUNT 8
56
+ #define MAX_XHEIGHT_DIFF 3
57
+
58
+ #define EXTERN
59
+ //extern "C" {
60
+ //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
61
+
62
+ //extern FILE* matcher_fp;
63
+ //extern FILE* correct_fp;
64
+ //};
65
+ BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
66
+ EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
67
+ EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
68
+ EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
69
+ EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
70
+ EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
71
+ EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
72
+ EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
73
+ EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
74
+ "Try to improve fuzzy spaces");
75
+ EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
76
+ "Dont bother with word plausibility");
77
+ EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
78
+
79
+ EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
80
+ EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
81
+ "Reject suspect fullstops");
82
+ EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
83
+ EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
84
+ "Do our own adaption - ems only");
85
+ EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
86
+ "Add words to the document dictionary");
87
+ EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
88
+ EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
89
+ EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
90
+ "Apply xht fix up even if done");
91
+ EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
92
+ "Apply xht fix up even in no rejects");
93
+ EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
94
+ EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
95
+ EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
96
+ EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
97
+ "Block and Row stats");
98
+ EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
99
+ EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
100
+ EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
101
+
102
+ EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
103
+ EXTERN
104
+ STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
105
+ EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
106
+ "2nd Trailing punctuation");
107
+
108
+ EXTERN double_VAR (quality_rej_pc, 0.08,
109
+ "good_quality_doc lte rejection limit");
110
+ EXTERN double_VAR (quality_blob_pc, 0.0,
111
+ "good_quality_doc gte good blobs limit");
112
+ EXTERN double_VAR (quality_outline_pc, 1.0,
113
+ "good_quality_doc lte outline error limit");
114
+ EXTERN double_VAR (quality_char_pc, 0.95,
115
+ "good_quality_doc gte good char limit");
116
+ EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
117
+ "alphas in a good word");
118
+
119
+ EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
120
+ "Use reject map to control Tesseract adaption");
121
+ EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
122
+ "Adaptation decision algorithm for tess");
123
+ EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
124
+ "Adaptation decision algorithm for ems matrix matcher");
125
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
126
+ "Adapt using clusterer after pass 1");
127
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
128
+ "Adapt using clusterer after pass 1");
129
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
130
+ "Adapt using clusterer after pass 1");
131
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
132
+ "Adapt using clusterer before Tess adaping during pass 1");
133
+ EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
134
+ "Adaptation decision algorithm for matrix matcher");
135
+ EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
136
+ "Generate and print debug information for adaption");
137
+ EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
138
+ "Do minimal rejection on pass 1 output");
139
+ EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
140
+ "Test adaption criteria");
141
+ EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
142
+ "Adapt to all docs over time");
143
+ EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
144
+ EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
145
+ "Adaptation decision algorithm for tess");
146
+ BOOL_VAR (save_best_choices, FALSE, "Save the results of the recognition step"
147
+ " (blob_choices) within the corresponding WERD_CHOICE");
148
+
149
+ EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
150
+ EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
151
+ EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
152
+
153
+ extern int MatcherDebugLevel;
154
+ extern int display_ratings;
155
+ extern int number_debug;
156
+ extern int adjust_debug;
157
+ FILE *choice_file = NULL; //Choice file ptr
158
+
159
+ CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
160
+ /* DEBUGGING */
161
+ inT16 blob_count(WERD *w) {
162
+ return w->blob_list ()->length ();
163
+ }
164
+
165
+
166
+ /**********************************************************************
167
+ * recog_pseudo_word
168
+ *
169
+ * Make a word from the selected blobs and run Tess on them.
170
+ **********************************************************************/
171
+
172
+ void recog_pseudo_word( //recognize blobs
173
+ BLOCK_LIST *block_list, //blocks to check
174
+ TBOX &selection_box) {
175
+ WERD *word;
176
+ ROW *pseudo_row; //row of word
177
+ BLOCK *pseudo_block; //block of word
178
+
179
+ word = make_pseudo_word (block_list, selection_box,
180
+ pseudo_block, pseudo_row);
181
+ if (word != NULL) {
182
+ recog_interactive(pseudo_block, pseudo_row, word);
183
+ delete word;
184
+ }
185
+ }
186
+
187
+
188
+ /**********************************************************************
189
+ * recog_interactive
190
+ *
191
+ * Recognize a single word in interactive mode.
192
+ **********************************************************************/
193
+
194
+ BOOL8 recog_interactive( //recognize blobs
195
+ BLOCK *, //block
196
+ ROW *row, //row of word
197
+ WERD *word //word to recognize
198
+ ) {
199
+ WERD_RES word_res(word);
200
+ inT16 char_qual;
201
+ inT16 good_char_qual;
202
+
203
+ classify_word_pass2(&word_res, row);
204
+ #ifndef SECURE_NAMES
205
+ if (tessedit_debug_quality_metrics) {
206
+ word_char_quality(&word_res, row, &char_qual, &good_char_qual);
207
+ tprintf
208
+ ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
209
+ word_res.reject_map.length (), word_blob_quality (&word_res, row),
210
+ word_outline_errs (&word_res), char_qual, good_char_qual);
211
+ }
212
+ #endif
213
+ return TRUE;
214
+ }
215
+
216
+
217
+ /**********************************************************************
218
+ * recog_all_words()
219
+ *
220
+ * Walk the current block list applying the specified word processor function
221
+ * to all words
222
+ **********************************************************************/
223
+
224
+ void recog_all_words( //process words
225
+ PAGE_RES *page_res, //page structure
226
+ volatile ETEXT_DESC *monitor, //progress monitor
227
+ TBOX *target_word_box,//specifies just to extract a retangle
228
+ inT16 dopasses //0 - all, 1 just pass 1, 2 passes 2 and higher
229
+ ) {
230
+ //reset page iterator
231
+ static PAGE_RES_IT page_res_it;
232
+ inT16 chars_in_word;
233
+ inT16 rejects_in_word;
234
+ static CHAR_SAMPLES_LIST em_clusters;
235
+ static CHAR_SAMPLE_LIST ems_waiting;
236
+ static CHAR_SAMPLES_LIST char_clusters;
237
+ static CHAR_SAMPLE_LIST chars_waiting;
238
+ inT16 blob_quality = 0;
239
+ inT16 outline_errs = 0;
240
+ static inT16 doc_blob_quality = 0;
241
+ static inT16 doc_outline_errs = 0;
242
+ static inT16 doc_char_quality = 0;
243
+ inT16 all_char_quality;
244
+ inT16 accepted_all_char_quality;
245
+ static inT16 good_char_count = 0;
246
+ static inT16 doc_good_char_quality = 0;
247
+ int i;
248
+
249
+
250
+ inT32 tess_adapt_mode = 0;
251
+ static inT32 word_count; //count of words in doc
252
+ inT32 word_index; //current word
253
+ static int dict_words;
254
+
255
+ if (tessedit_minimal_rej_pass1) {
256
+ tessedit_test_adaption.set_value (TRUE);
257
+ tessedit_minimal_rejection.set_value (TRUE);
258
+ }
259
+
260
+ if (tessedit_cluster_adapt_before_pass1) {
261
+ tess_adapt_mode = tessedit_tess_adaption_mode;
262
+ tessedit_tess_adaption_mode.set_value (0);
263
+ tessedit_tess_adapt_to_rejmap.set_value (TRUE);
264
+ }
265
+
266
+
267
+ if (dopasses==0 || dopasses==1)
268
+ {
269
+ page_res_it.page_res=page_res;
270
+ page_res_it.restart_page();
271
+
272
+ /* Pass 1 */
273
+ word_count = 0;
274
+ if (monitor != NULL) {
275
+ monitor->ocr_alive = TRUE;
276
+ while (page_res_it.word () != NULL) {
277
+ word_count++;
278
+ page_res_it.forward ();
279
+ }
280
+ page_res_it.restart_page ();
281
+ }
282
+ else
283
+ word_count = 1;
284
+
285
+ word_index = 0;
286
+
287
+ em_clusters.clear();
288
+ ems_waiting.clear();
289
+ char_clusters.clear();
290
+ chars_waiting.clear();
291
+ dict_words = 0;
292
+ doc_blob_quality = 0;
293
+ doc_outline_errs = 0;
294
+ doc_char_quality = 0;
295
+ good_char_count = 0;
296
+ doc_good_char_quality = 0;
297
+
298
+ while (page_res_it.word () != NULL) {
299
+ set_global_loc_code(LOC_PASS1);
300
+ word_index++;
301
+ if (monitor != NULL) {
302
+ monitor->ocr_alive = TRUE;
303
+ monitor->progress = 30 + 50 * word_index / word_count;
304
+ if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
305
+ (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
306
+ dict_words)))
307
+ return;
308
+ }
309
+ classify_word_pass1 (page_res_it.word (),
310
+ page_res_it.row ()->row, FALSE, NULL, NULL);
311
+
312
+ if (tessedit_test_adaption && !tessedit_minimal_rejection) {
313
+ if (!word_adaptable (page_res_it.word (),
314
+ tessedit_test_adaption_mode))
315
+ page_res_it.word ()->reject_map.rej_word_tess_failure ();
316
+ //FAKE PERM REJ
317
+ else {
318
+ const STRING* wordstr = &(page_res_it.word ()->best_choice->string ());
319
+ /* Override rejection mechanisms for this word */
320
+ const char* text = wordstr->string ();
321
+ for (i = 0; text[i] != '\0'; i++) {
322
+ if ((text[i] != ' ')
323
+ && page_res_it.word ()->reject_map[i].rejected ())
324
+ page_res_it.word ()->reject_map[i].
325
+ setrej_minimal_rej_accept();
326
+ }
327
+ }
328
+ }
329
+
330
+ if ((tessedit_cluster_adapt_after_pass1
331
+ || tessedit_cluster_adapt_after_pass3
332
+ || tessedit_cluster_adapt_before_pass1)
333
+ && tessedit_cluster_adaption_mode != 0) {
334
+ collect_characters_for_adaption (page_res_it.word (),
335
+ &char_clusters, &chars_waiting);
336
+ }
337
+ // Count dict words.
338
+ if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
339
+ ++dict_words;
340
+ page_res_it.forward ();
341
+ }
342
+
343
+ if (tessedit_cluster_adapt_before_pass1)
344
+ tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
345
+
346
+ page_res_it.restart_page ();
347
+ while ((tessedit_cluster_adapt_after_pass1
348
+ || tessedit_cluster_adapt_before_pass1)
349
+ && page_res_it.word () != NULL) {
350
+ if (monitor != NULL)
351
+ monitor->ocr_alive = TRUE;
352
+ if (tessedit_cluster_adapt_after_pass1)
353
+ adapt_to_good_samples (page_res_it.word (),
354
+ &char_clusters, &chars_waiting);
355
+ else
356
+ classify_word_pass1 (page_res_it.word (),
357
+ page_res_it.row ()->row,
358
+ TRUE, &char_clusters, &chars_waiting);
359
+
360
+ page_res_it.forward ();
361
+ }
362
+
363
+ //
364
+
365
+
366
+ }
367
+
368
+ if (dopasses==1) return;
369
+
370
+ /* Pass 2 */
371
+ page_res_it.restart_page ();
372
+ word_index = 0;
373
+ while (!tessedit_test_adaption && page_res_it.word () != NULL) {
374
+ set_global_loc_code(LOC_PASS2);
375
+ word_index++;
376
+ if (monitor != NULL) {
377
+ monitor->ocr_alive = TRUE;
378
+ monitor->progress = 80 + 10 * word_index / word_count;
379
+ if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
380
+ (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
381
+ dict_words)))
382
+ return;
383
+ }
384
+ //changed by jetsoft
385
+ //specific to its needs to extract one word when need
386
+
387
+ if (target_word_box)
388
+ {
389
+
390
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
391
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
392
+ if (!target_word_box->contains(center_pt))
393
+ {
394
+ page_res_it.forward ();
395
+ continue;
396
+ }
397
+
398
+ }
399
+ //end jetsoft
400
+
401
+ classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row);
402
+
403
+ if (tessedit_em_adaption_mode > 0)
404
+ collect_ems_for_adaption (page_res_it.word (),
405
+ &em_clusters, &ems_waiting);
406
+
407
+ if (tessedit_cluster_adapt_after_pass2
408
+ && tessedit_cluster_adaption_mode != 0)
409
+ collect_characters_for_adaption (page_res_it.word (),
410
+ &char_clusters, &chars_waiting);
411
+ page_res_it.forward ();
412
+ }
413
+
414
+ /* Another pass */
415
+ set_global_loc_code(LOC_FUZZY_SPACE);
416
+
417
+ if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
418
+ && !tessedit_word_for_word)
419
+ fix_fuzzy_spaces(monitor, word_count, page_res);
420
+
421
+ if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
422
+ // Initially ems only
423
+ print_em_stats(&em_clusters, &ems_waiting);
424
+
425
+ /* Pass 3 - used for checking confusion sets */
426
+ page_res_it.restart_page ();
427
+ word_index = 0;
428
+ while (!tessedit_test_adaption && page_res_it.word () != NULL) {
429
+ set_global_loc_code(LOC_MM_ADAPT);
430
+ word_index++;
431
+ if (monitor != NULL) {
432
+ monitor->ocr_alive = TRUE;
433
+ monitor->progress = 95 + 5 * word_index / word_count;
434
+ }
435
+ check_debug_pt (page_res_it.word (), 70);
436
+ /* Use good matches to sort out confusions */
437
+
438
+
439
+ //changed by jetsoft
440
+ //specific to its needs to extract one word when need
441
+
442
+ if (target_word_box)
443
+ {
444
+
445
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
446
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
447
+ if (!target_word_box->contains(center_pt))
448
+ {
449
+ page_res_it.forward ();
450
+ continue;
451
+ }
452
+
453
+ }
454
+ // end jetsoft
455
+
456
+ if (tessedit_em_adaption_mode != 0)
457
+ adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
458
+
459
+ if (tessedit_cluster_adapt_after_pass2
460
+ && tessedit_cluster_adaption_mode != 0)
461
+ adapt_to_good_samples (page_res_it.word (),
462
+ &char_clusters, &chars_waiting);
463
+
464
+ if (tessedit_reject_fullstops
465
+ && strchr (page_res_it.word ()->best_choice->string ().string (),
466
+ '.') != NULL)
467
+ reject_all_fullstops (page_res_it.word ());
468
+ else if (tessedit_reject_suspect_fullstops
469
+ && strchr (page_res_it.word ()->best_choice->string ().
470
+ string (), '.') != NULL)
471
+ reject_suspect_fullstops (page_res_it.word ());
472
+
473
+ page_res_it.rej_stat_word ();
474
+ chars_in_word = page_res_it.word ()->reject_map.length ();
475
+ rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
476
+
477
+ blob_quality = word_blob_quality (page_res_it.word (),
478
+ page_res_it.row ()->row);
479
+ doc_blob_quality += blob_quality;
480
+ outline_errs = word_outline_errs (page_res_it.word ());
481
+ doc_outline_errs += outline_errs;
482
+ word_char_quality (page_res_it.word (),
483
+ page_res_it.row ()->row,
484
+ &all_char_quality, &accepted_all_char_quality);
485
+ doc_char_quality += all_char_quality;
486
+ uinT8 permuter_type = page_res_it.word ()->best_choice->permuter ();
487
+ if ((permuter_type == SYSTEM_DAWG_PERM) ||
488
+ (permuter_type == FREQ_DAWG_PERM) ||
489
+ (permuter_type == USER_DAWG_PERM)) {
490
+ good_char_count += chars_in_word - rejects_in_word;
491
+ doc_good_char_quality += accepted_all_char_quality;
492
+ }
493
+ check_debug_pt (page_res_it.word (), 80);
494
+ if (tessedit_reject_bad_qual_wds &&
495
+ (blob_quality == 0) && (outline_errs >= chars_in_word))
496
+ page_res_it.word ()->reject_map.rej_word_bad_quality ();
497
+ check_debug_pt (page_res_it.word (), 90);
498
+ page_res_it.forward ();
499
+ }
500
+
501
+ page_res_it.restart_page ();
502
+ while (!tessedit_test_adaption
503
+ && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
504
+ if (monitor != NULL)
505
+ monitor->ocr_alive = TRUE;
506
+
507
+ //changed by jetsoft
508
+ //specific to its needs to extract one word when need
509
+
510
+ if (target_word_box)
511
+ {
512
+
513
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
514
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
515
+ if (!target_word_box->contains(center_pt))
516
+ {
517
+ page_res_it.forward ();
518
+ continue;
519
+ }
520
+
521
+ }
522
+
523
+ //end jetsoft
524
+ if (tessedit_cluster_adaption_mode != 0)
525
+ adapt_to_good_samples (page_res_it.word (),
526
+ &char_clusters, &chars_waiting);
527
+ page_res_it.forward ();
528
+ }
529
+
530
+ #ifndef SECURE_NAMES
531
+ if (tessedit_debug_quality_metrics) {
532
+ tprintf
533
+ ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
534
+ page_res->char_count, page_res->rej_count,
535
+ page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
536
+ doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
537
+ doc_outline_errs / (float) page_res->char_count, doc_char_quality,
538
+ doc_char_quality / (float) page_res->char_count,
539
+ doc_good_char_quality,
540
+ good_char_count >
541
+ 0 ? doc_good_char_quality / (float) good_char_count : 0.0);
542
+ }
543
+ #endif
544
+ BOOL8 good_quality_doc =
545
+ (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
546
+ &&
547
+ (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
548
+ (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
549
+ (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
550
+
551
+ /* Do whole document or whole block rejection pass*/
552
+
553
+ if (!tessedit_test_adaption) {
554
+ set_global_loc_code(LOC_DOC_BLK_REJ);
555
+ quality_based_rejection(page_res_it, good_quality_doc);
556
+ }
557
+ font_recognition_pass(page_res_it);
558
+
559
+ /* Write results pass */
560
+ set_global_loc_code(LOC_WRITE_RESULTS);
561
+ // This is now redundant, but retained commented so show how to obtain
562
+ // bounding boxes and style information.
563
+
564
+ ////changed by jetsoft
565
+ //needed for dll to output memory structure
566
+ if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
567
+ output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
568
+ // end jetsoft
569
+
570
+ }
571
+
572
+
573
+ /**********************************************************************
574
+ * classify_word_pass1
575
+ *
576
+ * Baseline normalize the word and pass it to Tess.
577
+ **********************************************************************/
578
+
579
+ void classify_word_pass1( //recog one word
580
+ WERD_RES *word, //word to do
581
+ ROW *row,
582
+ BOOL8 cluster_adapt,
583
+ CHAR_SAMPLES_LIST *char_clusters,
584
+ CHAR_SAMPLE_LIST *chars_waiting) {
585
+ WERD *bln_word; //baseline norm copy
586
+ //detailed results
587
+ BLOB_CHOICE_LIST_CLIST local_blob_choices;
588
+ BLOB_CHOICE_LIST_CLIST *blob_choices;
589
+ BOOL8 adapt_ok;
590
+ const char *rejmap;
591
+ inT16 index;
592
+ STRING mapstr = "";
593
+ char *match_string;
594
+ char word_string[1024];
595
+
596
+ if (save_best_choices)
597
+ blob_choices = new BLOB_CHOICE_LIST_CLIST();
598
+ else
599
+ blob_choices = &local_blob_choices;
600
+
601
+ if (matcher_fp != NULL) {
602
+ fgets (word_string, 1023, correct_fp);
603
+ if ((match_string = strchr (word_string, '\r')) != NULL)
604
+ *match_string = '\0';
605
+ if ((match_string = strchr (word_string, '\n')) != NULL)
606
+ *match_string = '\0';
607
+ if (word_string[0] != '\0') {
608
+ word->word->set_text (word_string);
609
+ word_answer = (char *) word->word->text ();
610
+ }
611
+ else
612
+ word_answer = NULL;
613
+ }
614
+
615
+ check_debug_pt (word, 0);
616
+ matcher_pass = 0;
617
+ bln_word = make_bln_copy (word->word, row, word->x_height, &word->denorm);
618
+
619
+ word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
620
+ tess_default_matcher,
621
+ word->raw_choice, blob_choices,
622
+ word->outword);
623
+ /*
624
+ Test for TESS screw up on word. Recog_word has already ensured that the
625
+ choice list, outword blob lists and best_choice string are the same
626
+ length. A TESS screw up is indicated by a blank filled or 0 length string.
627
+ */
628
+ if ((word->best_choice->lengths ().length () == 0) ||
629
+ (strspn (word->best_choice->string ().string (), " ") ==
630
+ word->best_choice->string ().length ())) {
631
+ word->done = FALSE; //Try again on pass2 - adaption may help
632
+ word->tess_failed = TRUE;
633
+ word->reject_map.initialise (word->best_choice->lengths ().length ());
634
+ word->reject_map.rej_word_tess_failure ();
635
+ }
636
+ else {
637
+ word->tess_failed = FALSE;
638
+ if ((word->best_choice->lengths ().length () !=
639
+ word->outword->blob_list ()->length ()) ||
640
+ (word->best_choice->lengths ().length () != blob_choices->length ())) {
641
+ tprintf
642
+ ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
643
+ word->best_choice->string ().string (),
644
+ word->best_choice->lengths ().length (),
645
+ word->outword->blob_list ()->length (), blob_choices->length ());
646
+ }
647
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
648
+ word->outword->blob_list ()->length ());
649
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
650
+ blob_choices->length ());
651
+
652
+ /*
653
+ The adaption step used to be here. It has been moved to after
654
+ make_reject_map so that we know whether the word will be accepted in the
655
+ first pass or not. This move will PREVENT adaption to words containing
656
+ double quotes because the word will not be identical to what tess thinks
657
+ its best choice is. (See CurrentBestChoiceIs in
658
+ danj/microfeatures/stopper.c which is used by AdaptableWord in
659
+ danj/microfeatures/adaptmatch.c)
660
+ */
661
+
662
+ if (word->word->flag (W_REP_CHAR)) {
663
+ fix_rep_char(word);
664
+ }
665
+ else {
666
+ fix_quotes (word->best_choice,
667
+ //turn to double
668
+ word->outword, blob_choices);
669
+ if (tessedit_fix_hyphens)
670
+ //turn 2 to 1
671
+ fix_hyphens (word->best_choice, word->outword, blob_choices);
672
+ record_certainty (word->best_choice->certainty (), 1);
673
+ //accounting
674
+
675
+ word->tess_accepted = tess_acceptable_word (word->best_choice,
676
+ word->raw_choice);
677
+
678
+ word->tess_would_adapt = tess_adaptable_word (word->outword,
679
+ word->best_choice,
680
+ word->raw_choice);
681
+ // Also sets word->done flag
682
+ make_reject_map (word, blob_choices, row, 1);
683
+
684
+ adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
685
+
686
+ if (cluster_adapt)
687
+ adapt_to_good_samples(word, char_clusters, chars_waiting);
688
+
689
+ if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
690
+ if (!tessedit_tess_adapt_to_rejmap)
691
+ rejmap = NULL;
692
+ else {
693
+ ASSERT_HOST (word->reject_map.length () ==
694
+ word->best_choice->lengths ().length ());
695
+
696
+ for (index = 0; index < word->reject_map.length (); index++) {
697
+ if (adapt_ok || word->reject_map[index].accepted ())
698
+ mapstr += '1';
699
+ else
700
+ mapstr += '0';
701
+ }
702
+ rejmap = mapstr.string ();
703
+ }
704
+
705
+ //adapt to it
706
+ tess_adapter (word->outword, &word->denorm,
707
+ *word->best_choice,
708
+ *word->raw_choice, rejmap);
709
+ }
710
+
711
+ if (tessedit_enable_doc_dict)
712
+ tess_add_doc_word (word->best_choice);
713
+ set_word_fonts(word, blob_choices);
714
+ }
715
+ }
716
+ #if 0
717
+ if (tessedit_print_text) {
718
+ write_cooked_text (bln_word, word->best_choice->string (),
719
+ word->done, FALSE, stdout);
720
+ }
721
+ #endif
722
+ delete bln_word;
723
+
724
+ // Save best choices in the WERD_CHOICE if needed
725
+ if (blob_choices != &local_blob_choices)
726
+ word->best_choice->set_blob_choices(blob_choices);
727
+ else
728
+ blob_choices->deep_clear();
729
+ }
730
+
731
+
732
+ /**********************************************************************
733
+ * classify_word_pass2
734
+ *
735
+ * Control what to do with the word in pass 2
736
+ **********************************************************************/
737
+
738
+ void classify_word_pass2( //word to do
739
+ WERD_RES *word,
740
+ ROW *row) {
741
+ BOOL8 done_this_pass = FALSE;
742
+ WERD_RES new_x_ht_word (word->word);
743
+ float new_x_ht = 0.0;
744
+ inT16 old_xht_reject_count;
745
+ inT16 new_xht_reject_count;
746
+ inT16 old_xht_accept_count;
747
+ inT16 new_xht_accept_count;
748
+ BOOL8 accept_new_x_ht = FALSE;
749
+ inT16 old_chs_in_wd;
750
+ inT16 new_chs_in_wd;
751
+ inT16 old_word_quality;
752
+ inT16 new_word_quality;
753
+ inT16 dummy;
754
+
755
+ set_global_subloc_code(SUBLOC_NORM);
756
+ check_debug_pt (word, 30);
757
+ if (!word->done ||
758
+ tessedit_training_tess ||
759
+ tessedit_training_wiseowl || tessedit_dump_choices) {
760
+ word->caps_height = 0.0;
761
+ if (word->x_height == 0.0f)
762
+ word->x_height = row->x_height();
763
+ if (word->outword != NULL) {
764
+ delete word->outword; //get rid of junk
765
+ delete word->best_choice;
766
+ delete word->raw_choice;
767
+ }
768
+ match_word_pass2 (word, row, word->x_height);
769
+ done_this_pass = TRUE;
770
+ check_debug_pt (word, 40);
771
+ }
772
+
773
+ if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
774
+ set_global_subloc_code(SUBLOC_FIX_XHT);
775
+ if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
776
+ (tessedit_xht_fiddles_on_no_rej_wds ||
777
+ (word->reject_map.reject_count () > 0))) {
778
+ if ((x_ht_check_word_occ >= 2) && word_occ_first)
779
+ check_block_occ(word);
780
+
781
+ if (tessedit_redo_xheight)
782
+ re_estimate_x_ht(word, &new_x_ht);
783
+
784
+ if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
785
+ ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
786
+ check_block_occ(word);
787
+ }
788
+ if (new_x_ht > 0) {
789
+ old_chs_in_wd = word->reject_map.length ();
790
+
791
+ /* Re-estimated x_ht error suggests a rematch is worthwhile. */
792
+ new_x_ht_word.x_height = new_x_ht;
793
+ new_x_ht_word.caps_height = 0.0;
794
+ match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
795
+ if (!new_x_ht_word.tess_failed) {
796
+ if ((x_ht_check_word_occ >= 1) && word_occ_first)
797
+ check_block_occ(&new_x_ht_word);
798
+
799
+ re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
800
+
801
+ if ((x_ht_check_word_occ >= 1) && !word_occ_first)
802
+ check_block_occ(&new_x_ht_word);
803
+
804
+ old_xht_reject_count = word->reject_map.reject_count ();
805
+ old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
806
+ new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
807
+ new_chs_in_wd = new_x_ht_word.reject_map.length ();
808
+ new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
809
+ accept_new_x_ht =
810
+ ((new_xht_accept_count > old_xht_accept_count) ||
811
+ ((new_xht_accept_count == old_xht_accept_count) &&
812
+ (new_xht_accept_count > 0))) &&
813
+ (!new_x_ht_word.guessed_x_ht ||
814
+ !new_x_ht_word.guessed_caps_ht);
815
+
816
+ if (accept_new_x_ht && x_ht_quality_check) {
817
+ word_char_quality(word, row, &old_word_quality, &dummy);
818
+ word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
819
+ if (old_word_quality > new_word_quality)
820
+ accept_new_x_ht = FALSE;
821
+ }
822
+
823
+ if (accept_new_x_ht && (x_ht_stringency > 0)) {
824
+ accept_new_x_ht =
825
+ (count_alphanums (&new_x_ht_word) > x_ht_stringency);
826
+ if (!accept_new_x_ht && rej_use_xht) {
827
+ if (debug_x_ht_level >= 1)
828
+ tprintf
829
+ ("Failed stringency test so reject original word\n");
830
+ word->reject_map.rej_word_xht_fixup ();
831
+ }
832
+ }
833
+
834
+ #ifndef SECURE_NAMES
835
+ if (debug_x_ht_level >= 1) {
836
+ tprintf ("New XHT Match:: %s ",
837
+ word->best_choice->string ().string ());
838
+ word->reject_map.print (debug_fp);
839
+ tprintf (" -> %s ",
840
+ new_x_ht_word.best_choice->string ().string ());
841
+ new_x_ht_word.reject_map.print (debug_fp);
842
+ tprintf (" %s->%s %s %s\n",
843
+ word->guessed_x_ht ? "GUESS" : "CERT",
844
+ new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
845
+ new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
846
+ accept_new_x_ht ? "ACCEPTED" : "");
847
+ }
848
+ #endif
849
+ }
850
+ if (accept_new_x_ht) {
851
+ /*
852
+ The new x_ht is deemed superior so put the final results in the real word
853
+ and destroy the old results
854
+ */
855
+ delete word->outword; //get rid of junk
856
+ word->outword = new_x_ht_word.outword;
857
+ word->denorm = new_x_ht_word.denorm;
858
+ delete word->best_choice;
859
+ word->best_choice = new_x_ht_word.best_choice;
860
+ delete word->raw_choice;
861
+ word->raw_choice = new_x_ht_word.raw_choice;
862
+ word->reject_map = new_x_ht_word.reject_map;
863
+ word->done = new_x_ht_word.done;
864
+ done_this_pass = TRUE;
865
+ }
866
+ else {
867
+ /*
868
+ The new x_ht is no better, so destroy the copy word and put any uncertain
869
+ x or cap ht estimate back to default. (I.e. dont blame me if its bad!)
870
+ Conditionally, use any ammended block occ chars.
871
+ */
872
+ //get rid of junk
873
+ delete new_x_ht_word.outword;
874
+ delete new_x_ht_word.best_choice;
875
+ delete new_x_ht_word.raw_choice;
876
+ }
877
+ //to keep new destructor happy
878
+ new_x_ht_word.outword = NULL;
879
+ //to keep new destructor happy
880
+ new_x_ht_word.best_choice = NULL;
881
+ //to keep new destructor happy
882
+ new_x_ht_word.raw_choice = NULL;
883
+
884
+ if (rej_mostly_reject_mode == 2) {
885
+ reject_mostly_rejects(word);
886
+ tprintf ("Rejecting mostly rejects on %s ",
887
+ word->best_choice->string ().string ());
888
+ }
889
+ }
890
+
891
+ set_global_subloc_code(SUBLOC_NORM);
892
+
893
+ if (done_this_pass && !word->done && tessedit_save_stats)
894
+ SaveBadWord (word->best_choice->string ().string (),
895
+ word->best_choice->certainty ());
896
+ record_certainty (word->best_choice->certainty (), 2);
897
+ //accounting
898
+ }
899
+ #ifndef GRAPHICS_DISABLED
900
+ if (tessedit_draw_outwords) {
901
+ if (fx_win == NULL)
902
+ create_fx_win();
903
+ clear_fx_win();
904
+ word->outword->plot (fx_win);
905
+ TBOX wbox = word->outword->bounding_box();
906
+ fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
907
+ wbox.right(), wbox.bottom());
908
+ //make_picture_current(fx_win);
909
+ ScrollView::Update();
910
+ }
911
+ #endif
912
+
913
+ set_global_subloc_code(SUBLOC_NORM);
914
+ #if 0
915
+ if (tessedit_print_text) {
916
+ write_cooked_text (word->outword, word->best_choice->string (),
917
+ word->done, done_this_pass, stdout);
918
+ }
919
+ #endif
920
+ check_debug_pt (word, 50);
921
+ }
922
+
923
+
924
+ /**********************************************************************
925
+ * match_word_pass2
926
+ *
927
+ * Baseline normalize the word and pass it to Tess.
928
+ **********************************************************************/
929
+
930
+ void match_word_pass2( //recog one word
931
+ WERD_RES *word, //word to do
932
+ ROW *row,
933
+ float x_height) {
934
+ WERD *bln_word; //baseline norm copy
935
+ //detailed results
936
+ BLOB_CHOICE_LIST_CLIST local_blob_choices;
937
+ BLOB_CHOICE_LIST_CLIST *blob_choices;
938
+
939
+ if (save_best_choices)
940
+ blob_choices = new BLOB_CHOICE_LIST_CLIST();
941
+ else
942
+ blob_choices = &local_blob_choices;
943
+
944
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
945
+ if (matcher_fp != NULL) {
946
+ word_answer = (char *) word->word->text ();
947
+ if (word_answer != NULL && word_answer[0] == '\0')
948
+ word_answer = NULL;
949
+ }
950
+ matcher_pass = 0;
951
+ bln_word = make_bln_copy (word->word, row, x_height, &word->denorm);
952
+ set_global_subsubloc_code(SUBSUBLOC_TESS);
953
+ if (tessedit_training_tess)
954
+ word->best_choice = correct_segment_pass2 (bln_word,
955
+ &word->denorm,
956
+ tess_default_matcher,
957
+ tess_training_tester,
958
+ word->raw_choice,
959
+ blob_choices, word->outword);
960
+ else if (tessedit_dump_choices)
961
+ word->best_choice = test_segment_pass2 (bln_word,
962
+ &word->denorm,
963
+ tess_default_matcher,
964
+ choice_dump_tester,
965
+ word->raw_choice,
966
+ blob_choices, word->outword);
967
+ // else if (tessedit_training_wiseowl)
968
+ // best_choice=correct_segment_pass2( word, &denorm,
969
+ // tess_default_matcher,wo_learn,
970
+ // raw_choice,blob_choices,outword);
971
+ // else if (tessedit_matcher_is_wiseowl)
972
+ // best_choice=tess_segment_pass2( word, &denorm, wo_classify,
973
+ // raw_choice, blob_choices, outword);
974
+ else {
975
+ word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
976
+ tess_default_matcher,
977
+ word->raw_choice, blob_choices,
978
+ word->outword);
979
+ }
980
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
981
+ /*
982
+ Test for TESS screw up on word. Recog_word has already ensured that the
983
+ choice list, outword blob lists and best_choice string are the same
984
+ length. A TESS screw up is indicated by a blank filled or 0 length string.
985
+ */
986
+ if ((word->best_choice->string ().length () == 0) ||
987
+ (strspn (word->best_choice->string ().string (), " ") ==
988
+ word->best_choice->string ().length ())) {
989
+ word->tess_failed = TRUE;
990
+ word->reject_map.initialise (word->best_choice->string ().length ());
991
+ word->reject_map.rej_word_tess_failure ();
992
+ // tprintf("Empty word produced\n");
993
+ }
994
+ else {
995
+ if ((word->best_choice->lengths ().length () !=
996
+ word->outword->blob_list ()->length ()) ||
997
+ (word->best_choice->lengths ().length () != blob_choices->length ())) {
998
+ tprintf
999
+ ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1000
+ word->best_choice->string ().string (),
1001
+ word->best_choice->lengths ().length (),
1002
+ word->outword->blob_list ()->length (), blob_choices->length ());
1003
+ }
1004
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1005
+ word->outword->blob_list ()->length ());
1006
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1007
+ blob_choices->length ());
1008
+
1009
+ word->tess_failed = FALSE;
1010
+ if (word->word->flag (W_REP_CHAR)) {
1011
+ fix_rep_char(word);
1012
+ }
1013
+ else {
1014
+ fix_quotes (word->best_choice,
1015
+ word->outword, blob_choices);
1016
+ if (tessedit_fix_hyphens)
1017
+ fix_hyphens (word->best_choice,
1018
+ word->outword, blob_choices);
1019
+ /* Dont trust fix_quotes! - though I think I've fixed the bug */
1020
+ if ((word->best_choice->lengths ().length () !=
1021
+ word->outword->blob_list ()->length ()) ||
1022
+ (word->best_choice->lengths ().length () !=
1023
+ blob_choices->length ())) {
1024
+ #ifndef SECURE_NAMES
1025
+ tprintf
1026
+ ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1027
+ word->best_choice->string ().string (),
1028
+ word->best_choice->lengths ().length (),
1029
+ word->outword->blob_list ()->length (),
1030
+ blob_choices->length ());
1031
+ #endif
1032
+
1033
+ }
1034
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1035
+ word->outword->blob_list ()->length ());
1036
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1037
+ blob_choices->length ());
1038
+
1039
+ word->tess_accepted = tess_acceptable_word (word->best_choice,
1040
+ word->raw_choice);
1041
+
1042
+ make_reject_map (word, blob_choices, row, 2);
1043
+ }
1044
+ }
1045
+
1046
+ // Save best choices in the WERD_CHOICE if needed
1047
+ if (blob_choices != &local_blob_choices)
1048
+ word->best_choice->set_blob_choices(blob_choices);
1049
+ else
1050
+ blob_choices->deep_clear();
1051
+
1052
+ delete bln_word;
1053
+ assert (word->raw_choice != NULL);
1054
+ }
1055
+
1056
+
1057
+ /*************************************************************************
1058
+ * fix_rep_char()
1059
+ * The word is a repeated char. Find the repeated char character. Make a reject
1060
+ * string which rejects any char other than the voted char. Set the word to done
1061
+ * to stop rematching it.
1062
+ *
1063
+ *************************************************************************/
1064
+ void fix_rep_char( //Repeated char word
1065
+ WERD_RES *word //word to do
1066
+ ) {
1067
+ struct REP_CH
1068
+ {
1069
+ char ch[UNICHAR_LEN + 1];
1070
+ int count;
1071
+ };
1072
+
1073
+ REP_CH *rep_ch; //array of char counts
1074
+ int word_len;
1075
+ int rep_ch_count = 0; //how many unique chs
1076
+ const char *word_str; //the repeated chs
1077
+ int i, j;
1078
+ int offset;
1079
+ int total = 0;
1080
+ int max = 0;
1081
+ char *maxch = NULL; //Most common char
1082
+
1083
+ word_str = word->best_choice->string ().string ();
1084
+ word_len = word->best_choice->lengths ().length ();;
1085
+ rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
1086
+ for (i = 0, offset = 0; i < word_len;
1087
+ offset += word->best_choice->lengths()[i++]) {
1088
+ for (j = 0; j < rep_ch_count &&
1089
+ strncmp(rep_ch[j].ch, word_str + offset,
1090
+ word->best_choice->lengths()[i]) != 0; j++);
1091
+ if (j < rep_ch_count)
1092
+ rep_ch[j].count++;
1093
+ else {
1094
+ strncpy(rep_ch[rep_ch_count].ch, word_str + offset,
1095
+ word->best_choice->lengths()[i]);
1096
+ rep_ch[rep_ch_count].ch[word->best_choice->lengths()[i]] = '\0';
1097
+ rep_ch[rep_ch_count].count = 1;
1098
+ rep_ch_count++;
1099
+ }
1100
+ }
1101
+
1102
+ for (j = 0; j < rep_ch_count; j++) {
1103
+ total += rep_ch[j].count;
1104
+ if ((rep_ch[j].count > max) && (*rep_ch[j].ch != ' ')) {
1105
+ max = rep_ch[j].count;
1106
+ maxch = rep_ch[j].ch;
1107
+ }
1108
+ }
1109
+ // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
1110
+ // word_str, word_len, total, maxch );
1111
+ free_mem(rep_ch);
1112
+
1113
+ word->reject_map.initialise (word_len);
1114
+ for (i = 0, offset = 0; i < word_len;
1115
+ offset += word->best_choice->lengths()[i++]) {
1116
+ if (strncmp(word_str + offset, maxch,
1117
+ word->best_choice->lengths()[i]) != 0)
1118
+ //rej unrecognised blobs
1119
+ word->reject_map[i].setrej_bad_repetition ();
1120
+ }
1121
+ word->done = TRUE;
1122
+ }
1123
+
1124
+ // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1125
+ // training data.
1126
+
1127
// Utility function for fix_quotes.
// Returns non-zero if the character starting at signed_str, whose UTF-8
// encoding is `length` bytes long, is a single quote: ASCII ' or `, or one
// of the curved quotes U+2018 / U+2019 (E2 80 98 / E2 80 99).
static int is_simple_quote(const char* signed_str, int length) {
  const unsigned char* bytes = (const unsigned char*) signed_str;

  if (length == 1) {
    //standard 1 byte quotes
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    //utf8 3 byte curved quotes share their first two bytes
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
1142
+
1143
+ /**********************************************************************
1144
+ * fix_quotes
1145
+ *
1146
+ * Change pairs of quotes to double quotes.
1147
+ **********************************************************************/
1148
+ void fix_quotes( //make double quotes
1149
+ WERD_CHOICE *choice, //choice to fix
1150
+ WERD *word, //word to do //char choices
1151
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1152
+ char *str = (char *) choice->string().string();//string ptr
1153
+ int i;
1154
+ int offset;
1155
+ //blobs
1156
+ PBLOB_IT blob_it = word->blob_list ();
1157
+ //choices
1158
+ BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
1159
+ BLOB_CHOICE_IT it1; //first choices
1160
+ BLOB_CHOICE_IT it2; //second choices
1161
+
1162
+ for (i = 0, offset = 0; str[offset] != '\0';
1163
+ offset += choice->lengths()[i++],
1164
+ blob_it.forward (), choice_it.forward ()) {
1165
+ if (str[offset + choice->lengths()[i]] != '\0' &&
1166
+ is_simple_quote(str + offset, choice->lengths()[i]) &&
1167
+ is_simple_quote(str + offset + choice->lengths()[i],
1168
+ choice->lengths()[i + 1]) &&
1169
+ unicharset.contains_unichar("\"")) {
1170
+ str[offset] = '"'; //turn to double
1171
+ strcpy (str + offset + 1,
1172
+ str + offset + choice->lengths()[i] +
1173
+ choice->lengths()[i + 1]); //shuffle up
1174
+ choice->lengths()[i] = 1;
1175
+ strcpy ((char*) choice->lengths().string() + i + 1,
1176
+ choice->lengths().string() + i + 2);
1177
+ merge_blobs (blob_it.data (), blob_it.data_relative (1));
1178
+ blob_it.forward ();
1179
+ delete blob_it.extract (); //get rid of spare
1180
+
1181
+ it1.set_to_list (choice_it.data ());
1182
+ it2.set_to_list (choice_it.data_relative (1));
1183
+ if (it1.data ()->certainty () < it2.data ()->certainty ()) {
1184
+ choice_it.forward ();
1185
+ //get rid of spare
1186
+ delete choice_it.extract ();
1187
+ }
1188
+ else {
1189
+ //get rid of spare
1190
+ delete choice_it.extract ();
1191
+ choice_it.forward ();
1192
+ }
1193
+ }
1194
+ }
1195
+ }
1196
+
1197
+
1198
+ /**********************************************************************
1199
+ * fix_hyphens
1200
+ *
1201
+ * Change pairs of hyphens to a single hyphen if the bounding boxes touch
1202
+ * Typically a long dash which has been segmented.
1203
+ **********************************************************************/
1204
+ void fix_hyphens( //crunch double hyphens
1205
+ WERD_CHOICE *choice, //choice to fix
1206
+ WERD *word, //word to do //char choices
1207
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1208
+ char *str = (char *) choice->string().string();//string ptr
1209
+ int i;
1210
+ int offset;
1211
+ //blobs
1212
+ PBLOB_IT blob_it = word->blob_list ();
1213
+ //choices
1214
+ BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
1215
+ BLOB_CHOICE_IT it1; //first choices
1216
+ BLOB_CHOICE_IT it2; //second choices
1217
+
1218
+ for (i = 0, offset = 0; str[offset] != '\0';
1219
+ offset += choice->lengths()[i++],
1220
+ blob_it.forward (), choice_it.forward ()) {
1221
+ if ((str[offset] == '-' || str[offset] == '~') &&
1222
+ (str[offset + choice->lengths()[i]] == '-' ||
1223
+ str[offset + choice->lengths()[i]] == '~') &&
1224
+ (blob_it.data ()->bounding_box ().right () >=
1225
+ blob_it.data_relative (1)->bounding_box ().left ())) {
1226
+ str[offset] = '-'; //turn to single hyphen
1227
+ strcpy (str + offset + choice->lengths()[i],
1228
+ str + offset + choice->lengths()[i] +
1229
+ choice->lengths()[i + 1]); //shuffle up
1230
+ strcpy ((char*) choice->lengths().string() + i + 1,
1231
+ choice->lengths().string() + i + 2);
1232
+ merge_blobs (blob_it.data (), blob_it.data_relative (1));
1233
+ blob_it.forward ();
1234
+ delete blob_it.extract (); //get rid of spare
1235
+
1236
+ it1.set_to_list (choice_it.data ());
1237
+ it2.set_to_list (choice_it.data_relative (1));
1238
+ if (it1.data ()->certainty () < it2.data ()->certainty ()) {
1239
+ choice_it.forward ();
1240
+ //get rid of spare
1241
+ delete choice_it.extract ();
1242
+ }
1243
+ else {
1244
+ //get rid of spare
1245
+ delete choice_it.extract ();
1246
+ choice_it.forward ();
1247
+ }
1248
+ }
1249
+ }
1250
+ }
1251
+
1252
+
1253
+ /**********************************************************************
1254
+ * merge_blobs
1255
+ *
1256
+ * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted.
1257
+ **********************************************************************/
1258
+
1259
+ void merge_blobs( //combine 2 blobs
1260
+ PBLOB *blob1, //dest blob
1261
+ PBLOB *blob2 //source blob
1262
+ ) {
1263
+ OUTLINE_IT outline_it = blob1->out_list ();
1264
+ //iterator
1265
+
1266
+ outline_it.move_to_last (); //go to end
1267
+ //do it
1268
+ outline_it.add_list_after (blob2->out_list ());
1269
+ }
1270
+
1271
+
1272
+ /**********************************************************************
1273
+ * choice_dump_tester
1274
+ *
1275
+ * Matcher tester function which generates .chc file entries.
1276
+ * Called via test_segment_pass2 for every blob tested by tess in a word.
1277
+ * (But only for words for which a correct segmentation could be found.)
1278
+ **********************************************************************/
1279
+
1280
+ void choice_dump_tester( //dump chars in word
1281
+ PBLOB *, //blob
1282
+ DENORM *, //de-normaliser
1283
+ BOOL8 correct, //ly segmented
1284
+ char *text, //correct text
1285
+ inT32 count, //chars in text
1286
+ BLOB_CHOICE_LIST *ratings //list of results
1287
+ ) {
1288
+ STRING choice_file_name;
1289
+ BLOB_CHOICE *blob_choice;
1290
+ BLOB_CHOICE_IT it;
1291
+ char source_chars[20];
1292
+ char correct_char[3];
1293
+
1294
+ if (choice_file == NULL) {
1295
+ choice_file_name = imagebasename + ".chc";
1296
+ if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
1297
+ CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
1298
+ choice_file_name.string (), errno);
1299
+ }
1300
+ }
1301
+
1302
+ if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
1303
+ strcpy (source_chars, "$$");
1304
+ strcpy (correct_char, "$$");
1305
+ }
1306
+ else {
1307
+ strncpy(source_chars, text, count);
1308
+ source_chars[count] = '\0';
1309
+ if (correct) {
1310
+ correct_char[0] = text[0];
1311
+ correct_char[1] = '\0';
1312
+ }
1313
+ else {
1314
+ strcpy (correct_char, "$$");
1315
+ }
1316
+ }
1317
+ fprintf (choice_file, "%s\t%s", source_chars, correct_char);
1318
+
1319
+ it.set_to_list (ratings);
1320
+ for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
1321
+ blob_choice = it.data ();
1322
+ fprintf (choice_file, "\t%s\t%f\t%f",
1323
+ blob_choice->unichar (),
1324
+ blob_choice->rating (), blob_choice->certainty ());
1325
+ }
1326
+ fprintf (choice_file, "\n");
1327
+ }
1328
+
1329
+
1330
+ /*************************************************************************
1331
+ * make_bln_copy()
1332
+ *
1333
+ * Generate a baseline normalised copy of the source word. The copy is done so
1334
+ * that whatever format the original word is in, a polygonal bln version is
1335
+ * generated as output.
1336
+ *************************************************************************/
1337
+
1338
+ WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
1339
+ WERD *result;
1340
+
1341
+ // if (wordit_linearc && !src_word->flag(W_POLYGON))
1342
+ // {
1343
+ // larc_word = src_word->larc_copy( row->x_height() );
1344
+ // result = larc_word->poly_copy( row->x_height() );
1345
+ // delete larc_word;
1346
+ // }
1347
+ // else
1348
+ result = src_word->poly_copy (row->x_height ());
1349
+
1350
+ // if (tessedit_draw_words)
1351
+ // {
1352
+ // if ( la_win == NO_WINDOW )
1353
+ // create_la_win();
1354
+ // result->plot( la_win );
1355
+ // }
1356
+ result->baseline_normalise_x (row, x_height, denorm);
1357
+ return result;
1358
+ }
1359
+
1360
+
1361
+ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
1362
+ const char *lengths) {
1363
+ int i = 0;
1364
+ int offset = 0;
1365
+ int leading_punct_count;
1366
+ int upper_count = 0;
1367
+ int hyphen_pos = -1;
1368
+ ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1369
+
1370
+ if (strlen (lengths) > 20)
1371
+ return word_type;
1372
+
1373
+ /* Single Leading punctuation char*/
1374
+
1375
+ if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
1376
+ offset += lengths[i++];
1377
+ leading_punct_count = i;
1378
+
1379
+ /* Initial cap */
1380
+ while ((s[offset] != '\0') &&
1381
+ unicharset.get_isupper(s + offset, lengths[i])) {
1382
+ offset += lengths[i++];
1383
+ upper_count++;
1384
+ }
1385
+ if (upper_count > 1)
1386
+ word_type = AC_UPPER_CASE;
1387
+ else {
1388
+ /* Lower case word, possibly with an initial cap */
1389
+ while ((s[offset] != '\0') &&
1390
+ unicharset.get_islower (s + offset, lengths[i])) {
1391
+ offset += lengths[i++];
1392
+ }
1393
+ if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1394
+ goto not_a_word;
1395
+ /*
1396
+ Allow a single hyphen in a lower case word
1397
+ - dont trust upper case - I've seen several cases of "H" -> "I-I"
1398
+ */
1399
+ if (lengths[i] == 1 && s[offset] == '-') {
1400
+ hyphen_pos = i;
1401
+ offset += lengths[i++];
1402
+ if (s[offset] != '\0') {
1403
+ while ((s[offset] != '\0') &&
1404
+ unicharset.get_islower(s + offset, lengths[i])) {
1405
+ offset += lengths[i++];
1406
+ }
1407
+ if (i < hyphen_pos + 3)
1408
+ goto not_a_word;
1409
+ }
1410
+ }
1411
+ else {
1412
+ /* Allow "'s" in NON hyphenated lower case words */
1413
+ if (lengths[i] == 1 && (s[offset] == '\'') &&
1414
+ lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1415
+ offset += lengths[i++];
1416
+ offset += lengths[i++];
1417
+ }
1418
+ }
1419
+ if (upper_count > 0)
1420
+ word_type = AC_INITIAL_CAP;
1421
+ else
1422
+ word_type = AC_LOWER_CASE;
1423
+ }
1424
+
1425
+ /* Up to two different, constrained trailing punctuation chars */
1426
+ if (lengths[i] == 1 && (s[offset] != '\0') &&
1427
+ (STRING (chs_trailing_punct1).contains (s[offset])))
1428
+ offset += lengths[i++];
1429
+ if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
1430
+ (s[offset - lengths[i - 1]] != s[offset]) &&
1431
+ (STRING (chs_trailing_punct2).contains (s[offset])))
1432
+ offset += lengths[i++];
1433
+
1434
+ if (s[offset] != '\0')
1435
+ word_type = AC_UNACCEPTABLE;
1436
+
1437
+ not_a_word:
1438
+
1439
+ if (word_type == AC_UNACCEPTABLE) {
1440
+ /* Look for abbreviation string */
1441
+ i = 0;
1442
+ offset = 0;
1443
+ if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
1444
+ word_type = AC_UC_ABBREV;
1445
+ while ((s[offset] != '\0') &&
1446
+ unicharset.get_isupper(s + offset, lengths[i]) &&
1447
+ (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1448
+ offset += lengths[i++];
1449
+ offset += lengths[i++];
1450
+ }
1451
+ }
1452
+ else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
1453
+ word_type = AC_LC_ABBREV;
1454
+ while ((s[offset] != '\0') &&
1455
+ unicharset.get_islower(s + offset, lengths[i]) &&
1456
+ (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1457
+ offset += lengths[i++];
1458
+ offset += lengths[i++];
1459
+ }
1460
+ }
1461
+ if (s[offset] != '\0')
1462
+ word_type = AC_UNACCEPTABLE;
1463
+ }
1464
+
1465
+ return word_type;
1466
+ }
1467
+
1468
+
1469
+ /* DEBUGGING ROUTINE */
1470
+
1471
+ BOOL8 check_debug_pt(WERD_RES *word, int location) {
1472
+ BOOL8 show_map_detail = FALSE;
1473
+ inT16 i;
1474
+
1475
+ #ifndef SECURE_NAMES
1476
+ if (!test_pt)
1477
+ return FALSE;
1478
+
1479
+ tessedit_rejection_debug.set_value (FALSE);
1480
+ debug_x_ht_level.set_value (0);
1481
+ tessedit_cluster_debug.set_value (FALSE);
1482
+ nn_debug.set_value (FALSE);
1483
+ nn_reject_debug.set_value (FALSE);
1484
+
1485
+ if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1486
+ if (location < 0)
1487
+ return TRUE; //For breakpoint use
1488
+ tessedit_rejection_debug.set_value (TRUE);
1489
+ debug_x_ht_level.set_value (20);
1490
+ tessedit_cluster_debug.set_value (TRUE);
1491
+ nn_debug.set_value (TRUE);
1492
+ nn_reject_debug.set_value (TRUE);
1493
+ tprintf ("\n\nTESTWD::");
1494
+ switch (location) {
1495
+ case 0:
1496
+ tprintf ("classify_word_pass1 start\n");
1497
+ word->word->print (debug_fp);
1498
+ break;
1499
+ case 10:
1500
+ tprintf ("make_reject_map: initial map");
1501
+ break;
1502
+ case 20:
1503
+ tprintf ("make_reject_map: after NN");
1504
+ break;
1505
+ case 30:
1506
+ tprintf ("classify_word_pass2 - START");
1507
+ break;
1508
+ case 40:
1509
+ tprintf ("classify_word_pass2 - Pre Xht");
1510
+ break;
1511
+ case 50:
1512
+ tprintf ("classify_word_pass2 - END");
1513
+ show_map_detail = TRUE;
1514
+ break;
1515
+ case 60:
1516
+ tprintf ("fixspace");
1517
+ break;
1518
+ case 70:
1519
+ tprintf ("MM pass START");
1520
+ break;
1521
+ case 80:
1522
+ tprintf ("MM pass END");
1523
+ break;
1524
+ case 90:
1525
+ tprintf ("After Poor quality rejection");
1526
+ break;
1527
+ case 100:
1528
+ tprintf ("unrej_good_quality_words - START");
1529
+ break;
1530
+ case 110:
1531
+ tprintf ("unrej_good_quality_words - END");
1532
+ break;
1533
+ case 120:
1534
+ tprintf ("Write results pass");
1535
+ show_map_detail = TRUE;
1536
+ break;
1537
+ }
1538
+ tprintf (" \"%s\" ", word->best_choice->string ().string ());
1539
+ word->reject_map.print (debug_fp);
1540
+ tprintf ("\n");
1541
+ if (show_map_detail) {
1542
+ tprintf ("\"%s\"\n", word->best_choice->string ().string ());
1543
+ for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
1544
+ tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
1545
+ word->reject_map[i].full_print (debug_fp);
1546
+ }
1547
+ }
1548
+
1549
+ tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1550
+ tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1551
+ return TRUE;
1552
+ }
1553
+ else
1554
+ #endif
1555
+ return FALSE;
1556
+ }
1557
+
1558
+
1559
+ /**********************************************************************
1560
+ * set_word_fonts
1561
+ *
1562
+ * Get the fonts for the word.
1563
+ **********************************************************************/
1564
+
1565
+ void set_word_fonts( //good chars in word
1566
+ WERD_RES *word, //word to adapt to //detailed results
1567
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1568
+ inT32 index; //char index
1569
+ inT32 offset; //char offset
1570
+ char choice_char[UNICHAR_LEN + 1]; //char from word
1571
+ inT8 config; //font of char
1572
+ //character iterator
1573
+ BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1574
+ BLOB_CHOICE_IT choice_it; //choice iterator
1575
+ STATS fonts (0, 32); //font counters
1576
+ static inT8 italic_table[32] = {
1577
+ 1, -1, 1, -1,
1578
+ 1, -1, 1, -1,
1579
+ 1, -1, 1, -1,
1580
+ 1, -1, 1, -1,
1581
+ 1, -1, 1, -1,
1582
+ 1, -1, 1, -1,
1583
+ 1, -1, 1, -1,
1584
+ 1, -1, 1, -1
1585
+ };
1586
+ static inT8 bold_table[32] = {
1587
+ 1, 1, -1, -1,
1588
+ 1, 1, -1, -1,
1589
+ 1, 1, -1, -1,
1590
+ 1, 1, -1, -1,
1591
+ 1, 1, -1, -1,
1592
+ 1, 1, -1, -1,
1593
+ 1, 1, -1, -1,
1594
+ 1, 1, -1, -1
1595
+ };
1596
+ static inT8 font_table[32] = {
1597
+ 2, 2, 2, 2,
1598
+ -1, -1, -1, -1,
1599
+ 0, 0, 0, 0,
1600
+ 1, 1, 1, 1,
1601
+ 3, 3, 3, 3,
1602
+ 4, 4, 4, 4,
1603
+ 5, 5, 5, 5,
1604
+ 2, 2, 2, 2
1605
+ };
1606
+
1607
+ word->italic = 0;
1608
+ word->bold = 0;
1609
+ for (char_it.mark_cycle_pt (), index = 0, offset = 0;
1610
+ !char_it.cycled_list (); char_it.forward (),
1611
+ offset += word->best_choice->lengths()[index++]) {
1612
+ strncpy(choice_char, word->best_choice->string ().string() + offset,
1613
+ word->best_choice->lengths()[index]);
1614
+ choice_char[word->best_choice->lengths()[index]] = '\0';
1615
+ choice_it.set_to_list (char_it.data ());
1616
+ for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
1617
+ choice_it.forward ()) {
1618
+ if (strcmp(choice_it.data ()->unichar (), choice_char) == 0) {
1619
+ config = choice_it.data ()->config ();
1620
+ if (tessedit_debug_fonts)
1621
+ tprintf ("%s(%d=%d%c%c)",
1622
+ choice_char, config, (config & 31) >> 2,
1623
+ config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
1624
+ if (config != -1) {
1625
+ config &= 31;
1626
+ word->italic += italic_table[config];
1627
+ word->bold += bold_table[config];
1628
+ if (font_table[config] != -1)
1629
+ fonts.add (font_table[config], 1);
1630
+ }
1631
+ break;
1632
+ }
1633
+ }
1634
+ }
1635
+ find_modal_font (&fonts, &word->font1, &word->font1_count);
1636
+ find_modal_font (&fonts, &word->font2, &word->font2_count);
1637
+ if (tessedit_debug_fonts)
1638
+ tprintf ("\n");
1639
+ /* if (word->font1_count>0)
1640
+ {
1641
+ for (char_it.mark_cycle_pt(),index=0;
1642
+ !char_it.cycled_list();char_it.forward(),index++)
1643
+ {
1644
+ choice_char=word->best_choice->string()[index];
1645
+ choice_it.set_to_list(char_it.data());
1646
+ for (choice_it.mark_cycle_pt();!choice_it.cycled_list();choice_it.forward())
1647
+ {
1648
+ if (choice_it.data()->char_class()==choice_char)
1649
+ {
1650
+ config=choice_it.data()->config();
1651
+ if (config!=-1 && font_table[config&31]==word->font1)
1652
+ {
1653
+ word->italic+=italic_table[config];
1654
+ word->bold+=bold_table[config];
1655
+ }
1656
+ break;
1657
+ }
1658
+ }
1659
+ }
1660
+ }*/
1661
+ }
1662
+
1663
+
1664
+ /**********************************************************************
1665
+ * font_recognition_pass
1666
+ *
1667
+ * Smooth the fonts for the document.
1668
+ **********************************************************************/
1669
+
1670
+ void font_recognition_pass( //good chars in word
1671
+ PAGE_RES_IT &page_res_it) {
1672
+ inT32 length; //of word
1673
+ inT32 count; //of a feature
1674
+ inT8 doc_font; //modal font
1675
+ inT8 doc_font_count; //modal font
1676
+ inT32 doc_italic; //total italics
1677
+ inT32 doc_bold; //total bolds
1678
+ ROW_RES *row = NULL; //current row
1679
+ WERD_RES *word; //current word
1680
+ STATS fonts (0, 32); //font counters
1681
+ STATS doc_fonts (0, 32); //font counters
1682
+
1683
+ doc_italic = 0;
1684
+ doc_bold = 0;
1685
+ page_res_it.restart_page ();
1686
+ while (page_res_it.word () != NULL) {
1687
+ if (row != page_res_it.row ()) {
1688
+ if (row != NULL) {
1689
+ find_modal_font (&fonts, &row->font1, &row->font1_count);
1690
+ find_modal_font (&fonts, &row->font2, &row->font2_count);
1691
+ }
1692
+ row = page_res_it.row (); //current row
1693
+ fonts.clear (); //clear counters
1694
+ row->italic = 0;
1695
+ row->bold = 0;
1696
+ }
1697
+ word = page_res_it.word ();
1698
+ row->italic += word->italic;
1699
+ row->bold += word->bold;
1700
+ fonts.add (word->font1, word->font1_count);
1701
+ fonts.add (word->font2, word->font2_count);
1702
+ doc_italic += word->italic;
1703
+ doc_bold += word->bold;
1704
+ doc_fonts.add (word->font1, word->font1_count);
1705
+ doc_fonts.add (word->font2, word->font2_count);
1706
+ page_res_it.forward ();
1707
+ }
1708
+ if (row != NULL) {
1709
+ find_modal_font (&fonts, &row->font1, &row->font1_count);
1710
+ find_modal_font (&fonts, &row->font2, &row->font2_count);
1711
+ }
1712
+ find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1713
+ /*
1714
+ row=NULL;
1715
+ page_res_it.restart_page();
1716
+ while (page_res_it.word() != NULL)
1717
+ {
1718
+ if (row!=page_res_it.row())
1719
+ {
1720
+ row2=row;
1721
+ row=page_res_it.row();
1722
+ if (row->font1_count<MIN_FONT_ROW_COUNT)
1723
+ {
1724
+ fonts.clear();
1725
+ italic=0;
1726
+ bold=0;
1727
+ add_in_one_row(row,&fonts,&italic,&bold);
1728
+ if (row2!=NULL)
1729
+ {
1730
+ hdiff=row->row->x_height()-row2->row->x_height();
1731
+ if (hdiff<0)
1732
+ hdiff=-hdiff;
1733
+ if (hdiff<MAX_XHEIGHT_DIFF)
1734
+ add_in_one_row(row2,&fonts,&italic,&bold);
1735
+ }
1736
+ do
1737
+ page_res_it.forward();
1738
+ while (page_res_it.row()==row);
1739
+ row2=page_res_it.row();
1740
+ if (row2!=NULL)
1741
+ {
1742
+ hdiff=row->row->x_height()-row2->row->x_height();
1743
+ if (hdiff<0)
1744
+ hdiff=-hdiff;
1745
+ if (hdiff<MAX_XHEIGHT_DIFF)
1746
+ add_in_one_row(row2,&fonts,&italic,&bold);
1747
+ }
1748
+ row->italic=italic;
1749
+ row->bold=bold;
1750
+ find_modal_font(&fonts,&row->font1,&row->font1_count);
1751
+ find_modal_font(&fonts,&row->font2,&row->font2_count);
1752
+ }
1753
+ else
1754
+ page_res_it.forward();
1755
+ }
1756
+ else
1757
+ page_res_it.forward();
1758
+ }*/
1759
+
1760
+ page_res_it.restart_page ();
1761
+ while (page_res_it.word () != NULL) {
1762
+ row = page_res_it.row (); //current row
1763
+ word = page_res_it.word ();
1764
+ length = word->best_choice->string ().length ();
1765
+
1766
+ count = word->italic;
1767
+ if (count < 0)
1768
+ count = -count;
1769
+ if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1770
+ word->italic = doc_italic > 0 ? 1 : -1;
1771
+
1772
+ count = word->bold;
1773
+ if (count < 0)
1774
+ count = -count;
1775
+ if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1776
+ word->bold = doc_bold > 0 ? 1 : -1;
1777
+
1778
+ count = word->font1_count;
1779
+ if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1780
+ word->font1 = doc_font;
1781
+ word->font1_count = doc_font_count;
1782
+ }
1783
+
1784
+ page_res_it.forward ();
1785
+ }
1786
+ }
1787
+
1788
+
1789
+ /**********************************************************************
1790
+ * add_in_one_row
1791
+ *
1792
+ * Add into the stats for one row.
1793
+ **********************************************************************/
1794
+
1795
+ void add_in_one_row( //good chars in word
1796
+ ROW_RES *row, //current row
1797
+ STATS *fonts, //font stats
1798
+ inT8 *italic, //output count
1799
+ inT8 *bold //output count
1800
+ ) {
1801
+ WERD_RES *word; //current word
1802
+ WERD_RES_IT word_it = &row->word_res_list;
1803
+
1804
+ for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
1805
+ word = word_it.data ();
1806
+ *italic += word->italic;
1807
+ *bold += word->bold;
1808
+ if (word->font1_count > 0)
1809
+ fonts->add (word->font1, word->font1_count);
1810
+ if (word->font2_count > 0)
1811
+ fonts->add (word->font2, word->font2_count);
1812
+
1813
+ }
1814
+ }
1815
+
1816
+
1817
+ /**********************************************************************
1818
+ * find_modal_font
1819
+ *
1820
+ * Find the modal font and remove from the stats.
1821
+ **********************************************************************/
1822
+
1823
+ void find_modal_font( //good chars in word
1824
+ STATS *fonts, //font stats
1825
+ inT8 *font_out, //output font
1826
+ inT8 *font_count //output count
1827
+ ) {
1828
+ inT8 font; //font index
1829
+ inT32 count; //pile couat
1830
+
1831
+ if (fonts->get_total () > 0) {
1832
+ font = (inT8) fonts->mode ();
1833
+ *font_out = font;
1834
+ count = fonts->pile_count (font);
1835
+ *font_count = count < MAX_INT8 ? count : MAX_INT8;
1836
+ fonts->add (font, -*font_count);
1837
+ }
1838
+ else {
1839
+ *font_out = -1;
1840
+ *font_count = 0;
1841
+ }
1842
+ }