tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1842 @@
1
+ /******************************************************************
2
+ * File: control.cpp (Formerly control.c)
3
+ * Description: Module-independent matcher controller.
4
+ * Author: Ray Smith
5
+ * Created: Thu Apr 23 11:09:58 BST 1992
6
+ * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7
+ *
8
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
9
+ ** Licensed under the Apache License, Version 2.0 (the "License");
10
+ ** you may not use this file except in compliance with the License.
11
+ ** You may obtain a copy of the License at
12
+ ** http://www.apache.org/licenses/LICENSE-2.0
13
+ ** Unless required by applicable law or agreed to in writing, software
14
+ ** distributed under the License is distributed on an "AS IS" BASIS,
15
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ ** See the License for the specific language governing permissions and
17
+ ** limitations under the License.
18
+ *
19
+ **********************************************************************/
20
+
21
+ #include "mfcpch.h"
22
+ #include "mainblk.h"
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #ifdef __UNIX__
26
+ #include <assert.h>
27
+ #include <unistd.h>
28
+ #include <errno.h>
29
+ #endif
30
+ #include <ctype.h>
31
+ #include "ocrclass.h"
32
+ #include "werdit.h"
33
+ #include "drawfx.h"
34
+ #include "tfacep.h"
35
+ #include "tessbox.h"
36
+ #include "tessvars.h"
37
+ //#include "fxtop.h"
38
+ #include "pgedit.h"
39
+ #include "reject.h"
40
+ #include "adaptions.h"
41
+ #include "charcut.h"
42
+ #include "fixxht.h"
43
+ #include "fixspace.h"
44
+ #include "genblob.h"
45
+ #include "docqual.h"
46
+ #include "control.h"
47
+ #include "secname.h"
48
+ #include "output.h"
49
+ #include "callcpp.h"
50
+ #include "notdll.h"
51
+ #include "tordvars.h"
52
+ #include "adaptmatch.h"
53
+ #include "globals.h"
54
+
55
+ #define MIN_FONT_ROW_COUNT 8
56
+ #define MAX_XHEIGHT_DIFF 3
57
+
58
+ #define EXTERN
59
+ //extern "C" {
60
+ //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
61
+
62
+ //extern FILE* matcher_fp;
63
+ //extern FILE* correct_fp;
64
+ //};
65
+ BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
66
+ EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
67
+ EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
68
+ EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
69
+ EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
70
+ EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
71
+ EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
72
+ EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
73
+ EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
74
+ "Try to improve fuzzy spaces");
75
+ EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
76
+ "Dont bother with word plausibility");
77
+ EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
78
+
79
+ EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
80
+ EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
81
+ "Reject suspect fullstops");
82
+ EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
83
+ EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
84
+ "Do our own adaption - ems only");
85
+ EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
86
+ "Add words to the document dictionary");
87
+ EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
88
+ EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
89
+ EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
90
+ "Apply xht fix up even if done");
91
+ EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
92
+ "Apply xht fix up even in no rejects");
93
+ EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
94
+ EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
95
+ EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
96
+ EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
97
+ "Block and Row stats");
98
+ EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
99
+ EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
100
+ EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
101
+
102
+ EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
103
+ EXTERN
104
+ STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
105
+ EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
106
+ "2nd Trailing punctuation");
107
+
108
+ EXTERN double_VAR (quality_rej_pc, 0.08,
109
+ "good_quality_doc lte rejection limit");
110
+ EXTERN double_VAR (quality_blob_pc, 0.0,
111
+ "good_quality_doc gte good blobs limit");
112
+ EXTERN double_VAR (quality_outline_pc, 1.0,
113
+ "good_quality_doc lte outline error limit");
114
+ EXTERN double_VAR (quality_char_pc, 0.95,
115
+ "good_quality_doc gte good char limit");
116
+ EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
117
+ "alphas in a good word");
118
+
119
+ EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
120
+ "Use reject map to control Tesseract adaption");
121
+ EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
122
+ "Adaptation decision algorithm for tess");
123
+ EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
124
+ "Adaptation decision algorithm for ems matrix matcher");
125
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
126
+ "Adapt using clusterer after pass 1");
127
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
128
+ "Adapt using clusterer after pass 1");
129
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
130
+ "Adapt using clusterer after pass 1");
131
+ EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
132
+ "Adapt using clusterer before Tess adaping during pass 1");
133
+ EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
134
+ "Adaptation decision algorithm for matrix matcher");
135
+ EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
136
+ "Generate and print debug information for adaption");
137
+ EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
138
+ "Do minimal rejection on pass 1 output");
139
+ EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
140
+ "Test adaption criteria");
141
+ EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
142
+ "Adapt to all docs over time");
143
+ EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
144
+ EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
145
+ "Adaptation decision algorithm for tess");
146
+ BOOL_VAR (save_best_choices, FALSE, "Save the results of the recognition step"
147
+ " (blob_choices) within the corresponding WERD_CHOICE");
148
+
149
+ EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
150
+ EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
151
+ EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
152
+
153
+ extern int MatcherDebugLevel;
154
+ extern int display_ratings;
155
+ extern int number_debug;
156
+ extern int adjust_debug;
157
+ FILE *choice_file = NULL; //Choice file ptr
158
+
159
+ CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
160
+ /* DEBUGGING */
161
+ inT16 blob_count(WERD *w) {
162
+ return w->blob_list ()->length ();
163
+ }
164
+
165
+
166
+ /**********************************************************************
167
+ * recog_pseudo_word
168
+ *
169
+ * Make a word from the selected blobs and run Tess on them.
170
+ **********************************************************************/
171
+
172
+ void recog_pseudo_word( //recognize blobs
173
+ BLOCK_LIST *block_list, //blocks to check
174
+ TBOX &selection_box) {
175
+ WERD *word;
176
+ ROW *pseudo_row; //row of word
177
+ BLOCK *pseudo_block; //block of word
178
+
179
+ word = make_pseudo_word (block_list, selection_box,
180
+ pseudo_block, pseudo_row);
181
+ if (word != NULL) {
182
+ recog_interactive(pseudo_block, pseudo_row, word);
183
+ delete word;
184
+ }
185
+ }
186
+
187
+
188
+ /**********************************************************************
189
+ * recog_interactive
190
+ *
191
+ * Recognize a single word in interactive mode.
192
+ **********************************************************************/
193
+
194
+ BOOL8 recog_interactive( //recognize blobs
195
+ BLOCK *, //block
196
+ ROW *row, //row of word
197
+ WERD *word //word to recognize
198
+ ) {
199
+ WERD_RES word_res(word);
200
+ inT16 char_qual;
201
+ inT16 good_char_qual;
202
+
203
+ classify_word_pass2(&word_res, row);
204
+ #ifndef SECURE_NAMES
205
+ if (tessedit_debug_quality_metrics) {
206
+ word_char_quality(&word_res, row, &char_qual, &good_char_qual);
207
+ tprintf
208
+ ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
209
+ word_res.reject_map.length (), word_blob_quality (&word_res, row),
210
+ word_outline_errs (&word_res), char_qual, good_char_qual);
211
+ }
212
+ #endif
213
+ return TRUE;
214
+ }
215
+
216
+
217
+ /**********************************************************************
218
+ * recog_all_words()
219
+ *
220
+ * Walk the current block list applying the specified word processor function
221
+ * to all words
222
+ **********************************************************************/
223
+
224
+ void recog_all_words( //process words
225
+ PAGE_RES *page_res, //page structure
226
+ volatile ETEXT_DESC *monitor, //progress monitor
227
+ TBOX *target_word_box,//specifies just to extract a retangle
228
+ inT16 dopasses //0 - all, 1 just pass 1, 2 passes 2 and higher
229
+ ) {
230
+ //reset page iterator
231
+ static PAGE_RES_IT page_res_it;
232
+ inT16 chars_in_word;
233
+ inT16 rejects_in_word;
234
+ static CHAR_SAMPLES_LIST em_clusters;
235
+ static CHAR_SAMPLE_LIST ems_waiting;
236
+ static CHAR_SAMPLES_LIST char_clusters;
237
+ static CHAR_SAMPLE_LIST chars_waiting;
238
+ inT16 blob_quality = 0;
239
+ inT16 outline_errs = 0;
240
+ static inT16 doc_blob_quality = 0;
241
+ static inT16 doc_outline_errs = 0;
242
+ static inT16 doc_char_quality = 0;
243
+ inT16 all_char_quality;
244
+ inT16 accepted_all_char_quality;
245
+ static inT16 good_char_count = 0;
246
+ static inT16 doc_good_char_quality = 0;
247
+ int i;
248
+
249
+
250
+ inT32 tess_adapt_mode = 0;
251
+ static inT32 word_count; //count of words in doc
252
+ inT32 word_index; //current word
253
+ static int dict_words;
254
+
255
+ if (tessedit_minimal_rej_pass1) {
256
+ tessedit_test_adaption.set_value (TRUE);
257
+ tessedit_minimal_rejection.set_value (TRUE);
258
+ }
259
+
260
+ if (tessedit_cluster_adapt_before_pass1) {
261
+ tess_adapt_mode = tessedit_tess_adaption_mode;
262
+ tessedit_tess_adaption_mode.set_value (0);
263
+ tessedit_tess_adapt_to_rejmap.set_value (TRUE);
264
+ }
265
+
266
+
267
+ if (dopasses==0 || dopasses==1)
268
+ {
269
+ page_res_it.page_res=page_res;
270
+ page_res_it.restart_page();
271
+
272
+ /* Pass 1 */
273
+ word_count = 0;
274
+ if (monitor != NULL) {
275
+ monitor->ocr_alive = TRUE;
276
+ while (page_res_it.word () != NULL) {
277
+ word_count++;
278
+ page_res_it.forward ();
279
+ }
280
+ page_res_it.restart_page ();
281
+ }
282
+ else
283
+ word_count = 1;
284
+
285
+ word_index = 0;
286
+
287
+ em_clusters.clear();
288
+ ems_waiting.clear();
289
+ char_clusters.clear();
290
+ chars_waiting.clear();
291
+ dict_words = 0;
292
+ doc_blob_quality = 0;
293
+ doc_outline_errs = 0;
294
+ doc_char_quality = 0;
295
+ good_char_count = 0;
296
+ doc_good_char_quality = 0;
297
+
298
+ while (page_res_it.word () != NULL) {
299
+ set_global_loc_code(LOC_PASS1);
300
+ word_index++;
301
+ if (monitor != NULL) {
302
+ monitor->ocr_alive = TRUE;
303
+ monitor->progress = 30 + 50 * word_index / word_count;
304
+ if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
305
+ (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
306
+ dict_words)))
307
+ return;
308
+ }
309
+ classify_word_pass1 (page_res_it.word (),
310
+ page_res_it.row ()->row, FALSE, NULL, NULL);
311
+
312
+ if (tessedit_test_adaption && !tessedit_minimal_rejection) {
313
+ if (!word_adaptable (page_res_it.word (),
314
+ tessedit_test_adaption_mode))
315
+ page_res_it.word ()->reject_map.rej_word_tess_failure ();
316
+ //FAKE PERM REJ
317
+ else {
318
+ const STRING* wordstr = &(page_res_it.word ()->best_choice->string ());
319
+ /* Override rejection mechanisms for this word */
320
+ const char* text = wordstr->string ();
321
+ for (i = 0; text[i] != '\0'; i++) {
322
+ if ((text[i] != ' ')
323
+ && page_res_it.word ()->reject_map[i].rejected ())
324
+ page_res_it.word ()->reject_map[i].
325
+ setrej_minimal_rej_accept();
326
+ }
327
+ }
328
+ }
329
+
330
+ if ((tessedit_cluster_adapt_after_pass1
331
+ || tessedit_cluster_adapt_after_pass3
332
+ || tessedit_cluster_adapt_before_pass1)
333
+ && tessedit_cluster_adaption_mode != 0) {
334
+ collect_characters_for_adaption (page_res_it.word (),
335
+ &char_clusters, &chars_waiting);
336
+ }
337
+ // Count dict words.
338
+ if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
339
+ ++dict_words;
340
+ page_res_it.forward ();
341
+ }
342
+
343
+ if (tessedit_cluster_adapt_before_pass1)
344
+ tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
345
+
346
+ page_res_it.restart_page ();
347
+ while ((tessedit_cluster_adapt_after_pass1
348
+ || tessedit_cluster_adapt_before_pass1)
349
+ && page_res_it.word () != NULL) {
350
+ if (monitor != NULL)
351
+ monitor->ocr_alive = TRUE;
352
+ if (tessedit_cluster_adapt_after_pass1)
353
+ adapt_to_good_samples (page_res_it.word (),
354
+ &char_clusters, &chars_waiting);
355
+ else
356
+ classify_word_pass1 (page_res_it.word (),
357
+ page_res_it.row ()->row,
358
+ TRUE, &char_clusters, &chars_waiting);
359
+
360
+ page_res_it.forward ();
361
+ }
362
+
363
+ //
364
+
365
+
366
+ }
367
+
368
+ if (dopasses==1) return;
369
+
370
+ /* Pass 2 */
371
+ page_res_it.restart_page ();
372
+ word_index = 0;
373
+ while (!tessedit_test_adaption && page_res_it.word () != NULL) {
374
+ set_global_loc_code(LOC_PASS2);
375
+ word_index++;
376
+ if (monitor != NULL) {
377
+ monitor->ocr_alive = TRUE;
378
+ monitor->progress = 80 + 10 * word_index / word_count;
379
+ if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
380
+ (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
381
+ dict_words)))
382
+ return;
383
+ }
384
+ //changed by jetsoft
385
+ //specific to its needs to extract one word when need
386
+
387
+ if (target_word_box)
388
+ {
389
+
390
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
391
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
392
+ if (!target_word_box->contains(center_pt))
393
+ {
394
+ page_res_it.forward ();
395
+ continue;
396
+ }
397
+
398
+ }
399
+ //end jetsoft
400
+
401
+ classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row);
402
+
403
+ if (tessedit_em_adaption_mode > 0)
404
+ collect_ems_for_adaption (page_res_it.word (),
405
+ &em_clusters, &ems_waiting);
406
+
407
+ if (tessedit_cluster_adapt_after_pass2
408
+ && tessedit_cluster_adaption_mode != 0)
409
+ collect_characters_for_adaption (page_res_it.word (),
410
+ &char_clusters, &chars_waiting);
411
+ page_res_it.forward ();
412
+ }
413
+
414
+ /* Another pass */
415
+ set_global_loc_code(LOC_FUZZY_SPACE);
416
+
417
+ if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
418
+ && !tessedit_word_for_word)
419
+ fix_fuzzy_spaces(monitor, word_count, page_res);
420
+
421
+ if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
422
+ // Initially ems only
423
+ print_em_stats(&em_clusters, &ems_waiting);
424
+
425
+ /* Pass 3 - used for checking confusion sets */
426
+ page_res_it.restart_page ();
427
+ word_index = 0;
428
+ while (!tessedit_test_adaption && page_res_it.word () != NULL) {
429
+ set_global_loc_code(LOC_MM_ADAPT);
430
+ word_index++;
431
+ if (monitor != NULL) {
432
+ monitor->ocr_alive = TRUE;
433
+ monitor->progress = 95 + 5 * word_index / word_count;
434
+ }
435
+ check_debug_pt (page_res_it.word (), 70);
436
+ /* Use good matches to sort out confusions */
437
+
438
+
439
+ //changed by jetsoft
440
+ //specific to its needs to extract one word when need
441
+
442
+ if (target_word_box)
443
+ {
444
+
445
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
446
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
447
+ if (!target_word_box->contains(center_pt))
448
+ {
449
+ page_res_it.forward ();
450
+ continue;
451
+ }
452
+
453
+ }
454
+ // end jetsoft
455
+
456
+ if (tessedit_em_adaption_mode != 0)
457
+ adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
458
+
459
+ if (tessedit_cluster_adapt_after_pass2
460
+ && tessedit_cluster_adaption_mode != 0)
461
+ adapt_to_good_samples (page_res_it.word (),
462
+ &char_clusters, &chars_waiting);
463
+
464
+ if (tessedit_reject_fullstops
465
+ && strchr (page_res_it.word ()->best_choice->string ().string (),
466
+ '.') != NULL)
467
+ reject_all_fullstops (page_res_it.word ());
468
+ else if (tessedit_reject_suspect_fullstops
469
+ && strchr (page_res_it.word ()->best_choice->string ().
470
+ string (), '.') != NULL)
471
+ reject_suspect_fullstops (page_res_it.word ());
472
+
473
+ page_res_it.rej_stat_word ();
474
+ chars_in_word = page_res_it.word ()->reject_map.length ();
475
+ rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
476
+
477
+ blob_quality = word_blob_quality (page_res_it.word (),
478
+ page_res_it.row ()->row);
479
+ doc_blob_quality += blob_quality;
480
+ outline_errs = word_outline_errs (page_res_it.word ());
481
+ doc_outline_errs += outline_errs;
482
+ word_char_quality (page_res_it.word (),
483
+ page_res_it.row ()->row,
484
+ &all_char_quality, &accepted_all_char_quality);
485
+ doc_char_quality += all_char_quality;
486
+ uinT8 permuter_type = page_res_it.word ()->best_choice->permuter ();
487
+ if ((permuter_type == SYSTEM_DAWG_PERM) ||
488
+ (permuter_type == FREQ_DAWG_PERM) ||
489
+ (permuter_type == USER_DAWG_PERM)) {
490
+ good_char_count += chars_in_word - rejects_in_word;
491
+ doc_good_char_quality += accepted_all_char_quality;
492
+ }
493
+ check_debug_pt (page_res_it.word (), 80);
494
+ if (tessedit_reject_bad_qual_wds &&
495
+ (blob_quality == 0) && (outline_errs >= chars_in_word))
496
+ page_res_it.word ()->reject_map.rej_word_bad_quality ();
497
+ check_debug_pt (page_res_it.word (), 90);
498
+ page_res_it.forward ();
499
+ }
500
+
501
+ page_res_it.restart_page ();
502
+ while (!tessedit_test_adaption
503
+ && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
504
+ if (monitor != NULL)
505
+ monitor->ocr_alive = TRUE;
506
+
507
+ //changed by jetsoft
508
+ //specific to its needs to extract one word when need
509
+
510
+ if (target_word_box)
511
+ {
512
+
513
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
514
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
515
+ if (!target_word_box->contains(center_pt))
516
+ {
517
+ page_res_it.forward ();
518
+ continue;
519
+ }
520
+
521
+ }
522
+
523
+ //end jetsoft
524
+ if (tessedit_cluster_adaption_mode != 0)
525
+ adapt_to_good_samples (page_res_it.word (),
526
+ &char_clusters, &chars_waiting);
527
+ page_res_it.forward ();
528
+ }
529
+
530
+ #ifndef SECURE_NAMES
531
+ if (tessedit_debug_quality_metrics) {
532
+ tprintf
533
+ ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
534
+ page_res->char_count, page_res->rej_count,
535
+ page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
536
+ doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
537
+ doc_outline_errs / (float) page_res->char_count, doc_char_quality,
538
+ doc_char_quality / (float) page_res->char_count,
539
+ doc_good_char_quality,
540
+ good_char_count >
541
+ 0 ? doc_good_char_quality / (float) good_char_count : 0.0);
542
+ }
543
+ #endif
544
+ BOOL8 good_quality_doc =
545
+ (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
546
+ &&
547
+ (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
548
+ (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
549
+ (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
550
+
551
+ /* Do whole document or whole block rejection pass*/
552
+
553
+ if (!tessedit_test_adaption) {
554
+ set_global_loc_code(LOC_DOC_BLK_REJ);
555
+ quality_based_rejection(page_res_it, good_quality_doc);
556
+ }
557
+ font_recognition_pass(page_res_it);
558
+
559
+ /* Write results pass */
560
+ set_global_loc_code(LOC_WRITE_RESULTS);
561
+ // This is now redundant, but retained commented so show how to obtain
562
+ // bounding boxes and style information.
563
+
564
+ ////changed by jetsoft
565
+ //needed for dll to output memory structure
566
+ if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
567
+ output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
568
+ // end jetsoft
569
+
570
+ }
571
+
572
+
573
+ /**********************************************************************
574
+ * classify_word_pass1
575
+ *
576
+ * Baseline normalize the word and pass it to Tess.
577
+ **********************************************************************/
578
+
579
+ void classify_word_pass1( //recog one word
580
+ WERD_RES *word, //word to do
581
+ ROW *row,
582
+ BOOL8 cluster_adapt,
583
+ CHAR_SAMPLES_LIST *char_clusters,
584
+ CHAR_SAMPLE_LIST *chars_waiting) {
585
+ WERD *bln_word; //baseline norm copy
586
+ //detailed results
587
+ BLOB_CHOICE_LIST_CLIST local_blob_choices;
588
+ BLOB_CHOICE_LIST_CLIST *blob_choices;
589
+ BOOL8 adapt_ok;
590
+ const char *rejmap;
591
+ inT16 index;
592
+ STRING mapstr = "";
593
+ char *match_string;
594
+ char word_string[1024];
595
+
596
+ if (save_best_choices)
597
+ blob_choices = new BLOB_CHOICE_LIST_CLIST();
598
+ else
599
+ blob_choices = &local_blob_choices;
600
+
601
+ if (matcher_fp != NULL) {
602
+ fgets (word_string, 1023, correct_fp);
603
+ if ((match_string = strchr (word_string, '\r')) != NULL)
604
+ *match_string = '\0';
605
+ if ((match_string = strchr (word_string, '\n')) != NULL)
606
+ *match_string = '\0';
607
+ if (word_string[0] != '\0') {
608
+ word->word->set_text (word_string);
609
+ word_answer = (char *) word->word->text ();
610
+ }
611
+ else
612
+ word_answer = NULL;
613
+ }
614
+
615
+ check_debug_pt (word, 0);
616
+ matcher_pass = 0;
617
+ bln_word = make_bln_copy (word->word, row, word->x_height, &word->denorm);
618
+
619
+ word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
620
+ tess_default_matcher,
621
+ word->raw_choice, blob_choices,
622
+ word->outword);
623
+ /*
624
+ Test for TESS screw up on word. Recog_word has already ensured that the
625
+ choice list, outword blob lists and best_choice string are the same
626
+ length. A TESS screw up is indicated by a blank filled or 0 length string.
627
+ */
628
+ if ((word->best_choice->lengths ().length () == 0) ||
629
+ (strspn (word->best_choice->string ().string (), " ") ==
630
+ word->best_choice->string ().length ())) {
631
+ word->done = FALSE; //Try again on pass2 - adaption may help
632
+ word->tess_failed = TRUE;
633
+ word->reject_map.initialise (word->best_choice->lengths ().length ());
634
+ word->reject_map.rej_word_tess_failure ();
635
+ }
636
+ else {
637
+ word->tess_failed = FALSE;
638
+ if ((word->best_choice->lengths ().length () !=
639
+ word->outword->blob_list ()->length ()) ||
640
+ (word->best_choice->lengths ().length () != blob_choices->length ())) {
641
+ tprintf
642
+ ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
643
+ word->best_choice->string ().string (),
644
+ word->best_choice->lengths ().length (),
645
+ word->outword->blob_list ()->length (), blob_choices->length ());
646
+ }
647
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
648
+ word->outword->blob_list ()->length ());
649
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
650
+ blob_choices->length ());
651
+
652
+ /*
653
+ The adaption step used to be here. It has been moved to after
654
+ make_reject_map so that we know whether the word will be accepted in the
655
+ first pass or not. This move will PREVENT adaption to words containing
656
+ double quotes because the word will not be identical to what tess thinks
657
+ its best choice is. (See CurrentBestChoiceIs in
658
+ danj/microfeatures/stopper.c which is used by AdaptableWord in
659
+ danj/microfeatures/adaptmatch.c)
660
+ */
661
+
662
+ if (word->word->flag (W_REP_CHAR)) {
663
+ fix_rep_char(word);
664
+ }
665
+ else {
666
+ fix_quotes (word->best_choice,
667
+ //turn to double
668
+ word->outword, blob_choices);
669
+ if (tessedit_fix_hyphens)
670
+ //turn 2 to 1
671
+ fix_hyphens (word->best_choice, word->outword, blob_choices);
672
+ record_certainty (word->best_choice->certainty (), 1);
673
+ //accounting
674
+
675
+ word->tess_accepted = tess_acceptable_word (word->best_choice,
676
+ word->raw_choice);
677
+
678
+ word->tess_would_adapt = tess_adaptable_word (word->outword,
679
+ word->best_choice,
680
+ word->raw_choice);
681
+ // Also sets word->done flag
682
+ make_reject_map (word, blob_choices, row, 1);
683
+
684
+ adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
685
+
686
+ if (cluster_adapt)
687
+ adapt_to_good_samples(word, char_clusters, chars_waiting);
688
+
689
+ if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
690
+ if (!tessedit_tess_adapt_to_rejmap)
691
+ rejmap = NULL;
692
+ else {
693
+ ASSERT_HOST (word->reject_map.length () ==
694
+ word->best_choice->lengths ().length ());
695
+
696
+ for (index = 0; index < word->reject_map.length (); index++) {
697
+ if (adapt_ok || word->reject_map[index].accepted ())
698
+ mapstr += '1';
699
+ else
700
+ mapstr += '0';
701
+ }
702
+ rejmap = mapstr.string ();
703
+ }
704
+
705
+ //adapt to it
706
+ tess_adapter (word->outword, &word->denorm,
707
+ *word->best_choice,
708
+ *word->raw_choice, rejmap);
709
+ }
710
+
711
+ if (tessedit_enable_doc_dict)
712
+ tess_add_doc_word (word->best_choice);
713
+ set_word_fonts(word, blob_choices);
714
+ }
715
+ }
716
+ #if 0
717
+ if (tessedit_print_text) {
718
+ write_cooked_text (bln_word, word->best_choice->string (),
719
+ word->done, FALSE, stdout);
720
+ }
721
+ #endif
722
+ delete bln_word;
723
+
724
+ // Save best choices in the WERD_CHOICE if needed
725
+ if (blob_choices != &local_blob_choices)
726
+ word->best_choice->set_blob_choices(blob_choices);
727
+ else
728
+ blob_choices->deep_clear();
729
+ }
730
+
731
+
732
+ /**********************************************************************
733
+ * classify_word_pass2
734
+ *
735
+ * Control what to do with the word in pass 2
736
+ **********************************************************************/
737
+
738
+ void classify_word_pass2( //word to do
739
+ WERD_RES *word,
740
+ ROW *row) {
741
+ BOOL8 done_this_pass = FALSE;
742
+ WERD_RES new_x_ht_word (word->word);
743
+ float new_x_ht = 0.0;
744
+ inT16 old_xht_reject_count;
745
+ inT16 new_xht_reject_count;
746
+ inT16 old_xht_accept_count;
747
+ inT16 new_xht_accept_count;
748
+ BOOL8 accept_new_x_ht = FALSE;
749
+ inT16 old_chs_in_wd;
750
+ inT16 new_chs_in_wd;
751
+ inT16 old_word_quality;
752
+ inT16 new_word_quality;
753
+ inT16 dummy;
754
+
755
+ set_global_subloc_code(SUBLOC_NORM);
756
+ check_debug_pt (word, 30);
757
+ if (!word->done ||
758
+ tessedit_training_tess ||
759
+ tessedit_training_wiseowl || tessedit_dump_choices) {
760
+ word->caps_height = 0.0;
761
+ if (word->x_height == 0.0f)
762
+ word->x_height = row->x_height();
763
+ if (word->outword != NULL) {
764
+ delete word->outword; //get rid of junk
765
+ delete word->best_choice;
766
+ delete word->raw_choice;
767
+ }
768
+ match_word_pass2 (word, row, word->x_height);
769
+ done_this_pass = TRUE;
770
+ check_debug_pt (word, 40);
771
+ }
772
+
773
+ if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
774
+ set_global_subloc_code(SUBLOC_FIX_XHT);
775
+ if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
776
+ (tessedit_xht_fiddles_on_no_rej_wds ||
777
+ (word->reject_map.reject_count () > 0))) {
778
+ if ((x_ht_check_word_occ >= 2) && word_occ_first)
779
+ check_block_occ(word);
780
+
781
+ if (tessedit_redo_xheight)
782
+ re_estimate_x_ht(word, &new_x_ht);
783
+
784
+ if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
785
+ ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
786
+ check_block_occ(word);
787
+ }
788
+ if (new_x_ht > 0) {
789
+ old_chs_in_wd = word->reject_map.length ();
790
+
791
+ /* Re-estimated x_ht error suggests a rematch is worthwhile. */
792
+ new_x_ht_word.x_height = new_x_ht;
793
+ new_x_ht_word.caps_height = 0.0;
794
+ match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
795
+ if (!new_x_ht_word.tess_failed) {
796
+ if ((x_ht_check_word_occ >= 1) && word_occ_first)
797
+ check_block_occ(&new_x_ht_word);
798
+
799
+ re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
800
+
801
+ if ((x_ht_check_word_occ >= 1) && !word_occ_first)
802
+ check_block_occ(&new_x_ht_word);
803
+
804
+ old_xht_reject_count = word->reject_map.reject_count ();
805
+ old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
806
+ new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
807
+ new_chs_in_wd = new_x_ht_word.reject_map.length ();
808
+ new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
809
+ accept_new_x_ht =
810
+ ((new_xht_accept_count > old_xht_accept_count) ||
811
+ ((new_xht_accept_count == old_xht_accept_count) &&
812
+ (new_xht_accept_count > 0))) &&
813
+ (!new_x_ht_word.guessed_x_ht ||
814
+ !new_x_ht_word.guessed_caps_ht);
815
+
816
+ if (accept_new_x_ht && x_ht_quality_check) {
817
+ word_char_quality(word, row, &old_word_quality, &dummy);
818
+ word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
819
+ if (old_word_quality > new_word_quality)
820
+ accept_new_x_ht = FALSE;
821
+ }
822
+
823
+ if (accept_new_x_ht && (x_ht_stringency > 0)) {
824
+ accept_new_x_ht =
825
+ (count_alphanums (&new_x_ht_word) > x_ht_stringency);
826
+ if (!accept_new_x_ht && rej_use_xht) {
827
+ if (debug_x_ht_level >= 1)
828
+ tprintf
829
+ ("Failed stringency test so reject original word\n");
830
+ word->reject_map.rej_word_xht_fixup ();
831
+ }
832
+ }
833
+
834
+ #ifndef SECURE_NAMES
835
+ if (debug_x_ht_level >= 1) {
836
+ tprintf ("New XHT Match:: %s ",
837
+ word->best_choice->string ().string ());
838
+ word->reject_map.print (debug_fp);
839
+ tprintf (" -> %s ",
840
+ new_x_ht_word.best_choice->string ().string ());
841
+ new_x_ht_word.reject_map.print (debug_fp);
842
+ tprintf (" %s->%s %s %s\n",
843
+ word->guessed_x_ht ? "GUESS" : "CERT",
844
+ new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
845
+ new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
846
+ accept_new_x_ht ? "ACCEPTED" : "");
847
+ }
848
+ #endif
849
+ }
850
+ if (accept_new_x_ht) {
851
+ /*
852
+ The new x_ht is deemed superior so put the final results in the real word
853
+ and destroy the old results
854
+ */
855
+ delete word->outword; //get rid of junk
856
+ word->outword = new_x_ht_word.outword;
857
+ word->denorm = new_x_ht_word.denorm;
858
+ delete word->best_choice;
859
+ word->best_choice = new_x_ht_word.best_choice;
860
+ delete word->raw_choice;
861
+ word->raw_choice = new_x_ht_word.raw_choice;
862
+ word->reject_map = new_x_ht_word.reject_map;
863
+ word->done = new_x_ht_word.done;
864
+ done_this_pass = TRUE;
865
+ }
866
+ else {
867
+ /*
868
+ The new x_ht is no better, so destroy the copy word and put any uncertain
869
+ x or cap ht estimate back to default. (I.e. dont blame me if its bad!)
870
+ Conditionally, use any ammended block occ chars.
871
+ */
872
+ //get rid of junk
873
+ delete new_x_ht_word.outword;
874
+ delete new_x_ht_word.best_choice;
875
+ delete new_x_ht_word.raw_choice;
876
+ }
877
+ //to keep new destructor happy
878
+ new_x_ht_word.outword = NULL;
879
+ //to keep new destructor happy
880
+ new_x_ht_word.best_choice = NULL;
881
+ //to keep new destructor happy
882
+ new_x_ht_word.raw_choice = NULL;
883
+
884
+ if (rej_mostly_reject_mode == 2) {
885
+ reject_mostly_rejects(word);
886
+ tprintf ("Rejecting mostly rejects on %s ",
887
+ word->best_choice->string ().string ());
888
+ }
889
+ }
890
+
891
+ set_global_subloc_code(SUBLOC_NORM);
892
+
893
+ if (done_this_pass && !word->done && tessedit_save_stats)
894
+ SaveBadWord (word->best_choice->string ().string (),
895
+ word->best_choice->certainty ());
896
+ record_certainty (word->best_choice->certainty (), 2);
897
+ //accounting
898
+ }
899
+ #ifndef GRAPHICS_DISABLED
900
+ if (tessedit_draw_outwords) {
901
+ if (fx_win == NULL)
902
+ create_fx_win();
903
+ clear_fx_win();
904
+ word->outword->plot (fx_win);
905
+ TBOX wbox = word->outword->bounding_box();
906
+ fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
907
+ wbox.right(), wbox.bottom());
908
+ //make_picture_current(fx_win);
909
+ ScrollView::Update();
910
+ }
911
+ #endif
912
+
913
+ set_global_subloc_code(SUBLOC_NORM);
914
+ #if 0
915
+ if (tessedit_print_text) {
916
+ write_cooked_text (word->outword, word->best_choice->string (),
917
+ word->done, done_this_pass, stdout);
918
+ }
919
+ #endif
920
+ check_debug_pt (word, 50);
921
+ }
922
+
923
+
924
+ /**********************************************************************
925
+ * match_word_pass2
926
+ *
927
+ * Baseline normalize the word and pass it to Tess.
928
+ **********************************************************************/
929
+
930
+ void match_word_pass2( //recog one word
931
+ WERD_RES *word, //word to do
932
+ ROW *row,
933
+ float x_height) {
934
+ WERD *bln_word; //baseline norm copy
935
+ //detailed results
936
+ BLOB_CHOICE_LIST_CLIST local_blob_choices;
937
+ BLOB_CHOICE_LIST_CLIST *blob_choices;
938
+
939
+ if (save_best_choices)
940
+ blob_choices = new BLOB_CHOICE_LIST_CLIST();
941
+ else
942
+ blob_choices = &local_blob_choices;
943
+
944
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
945
+ if (matcher_fp != NULL) {
946
+ word_answer = (char *) word->word->text ();
947
+ if (word_answer != NULL && word_answer[0] == '\0')
948
+ word_answer = NULL;
949
+ }
950
+ matcher_pass = 0;
951
+ bln_word = make_bln_copy (word->word, row, x_height, &word->denorm);
952
+ set_global_subsubloc_code(SUBSUBLOC_TESS);
953
+ if (tessedit_training_tess)
954
+ word->best_choice = correct_segment_pass2 (bln_word,
955
+ &word->denorm,
956
+ tess_default_matcher,
957
+ tess_training_tester,
958
+ word->raw_choice,
959
+ blob_choices, word->outword);
960
+ else if (tessedit_dump_choices)
961
+ word->best_choice = test_segment_pass2 (bln_word,
962
+ &word->denorm,
963
+ tess_default_matcher,
964
+ choice_dump_tester,
965
+ word->raw_choice,
966
+ blob_choices, word->outword);
967
+ // else if (tessedit_training_wiseowl)
968
+ // best_choice=correct_segment_pass2( word, &denorm,
969
+ // tess_default_matcher,wo_learn,
970
+ // raw_choice,blob_choices,outword);
971
+ // else if (tessedit_matcher_is_wiseowl)
972
+ // best_choice=tess_segment_pass2( word, &denorm, wo_classify,
973
+ // raw_choice, blob_choices, outword);
974
+ else {
975
+ word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
976
+ tess_default_matcher,
977
+ word->raw_choice, blob_choices,
978
+ word->outword);
979
+ }
980
+ set_global_subsubloc_code(SUBSUBLOC_OTHER);
981
+ /*
982
+ Test for TESS screw up on word. Recog_word has already ensured that the
983
+ choice list, outword blob lists and best_choice string are the same
984
+ length. A TESS screw up is indicated by a blank filled or 0 length string.
985
+ */
986
+ if ((word->best_choice->string ().length () == 0) ||
987
+ (strspn (word->best_choice->string ().string (), " ") ==
988
+ word->best_choice->string ().length ())) {
989
+ word->tess_failed = TRUE;
990
+ word->reject_map.initialise (word->best_choice->string ().length ());
991
+ word->reject_map.rej_word_tess_failure ();
992
+ // tprintf("Empty word produced\n");
993
+ }
994
+ else {
995
+ if ((word->best_choice->lengths ().length () !=
996
+ word->outword->blob_list ()->length ()) ||
997
+ (word->best_choice->lengths ().length () != blob_choices->length ())) {
998
+ tprintf
999
+ ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1000
+ word->best_choice->string ().string (),
1001
+ word->best_choice->lengths ().length (),
1002
+ word->outword->blob_list ()->length (), blob_choices->length ());
1003
+ }
1004
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1005
+ word->outword->blob_list ()->length ());
1006
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1007
+ blob_choices->length ());
1008
+
1009
+ word->tess_failed = FALSE;
1010
+ if (word->word->flag (W_REP_CHAR)) {
1011
+ fix_rep_char(word);
1012
+ }
1013
+ else {
1014
+ fix_quotes (word->best_choice,
1015
+ word->outword, blob_choices);
1016
+ if (tessedit_fix_hyphens)
1017
+ fix_hyphens (word->best_choice,
1018
+ word->outword, blob_choices);
1019
+ /* Dont trust fix_quotes! - though I think I've fixed the bug */
1020
+ if ((word->best_choice->lengths ().length () !=
1021
+ word->outword->blob_list ()->length ()) ||
1022
+ (word->best_choice->lengths ().length () !=
1023
+ blob_choices->length ())) {
1024
+ #ifndef SECURE_NAMES
1025
+ tprintf
1026
+ ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
1027
+ word->best_choice->string ().string (),
1028
+ word->best_choice->lengths ().length (),
1029
+ word->outword->blob_list ()->length (),
1030
+ blob_choices->length ());
1031
+ #endif
1032
+
1033
+ }
1034
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1035
+ word->outword->blob_list ()->length ());
1036
+ ASSERT_HOST (word->best_choice->lengths ().length () ==
1037
+ blob_choices->length ());
1038
+
1039
+ word->tess_accepted = tess_acceptable_word (word->best_choice,
1040
+ word->raw_choice);
1041
+
1042
+ make_reject_map (word, blob_choices, row, 2);
1043
+ }
1044
+ }
1045
+
1046
+ // Save best choices in the WERD_CHOICE if needed
1047
+ if (blob_choices != &local_blob_choices)
1048
+ word->best_choice->set_blob_choices(blob_choices);
1049
+ else
1050
+ blob_choices->deep_clear();
1051
+
1052
+ delete bln_word;
1053
+ assert (word->raw_choice != NULL);
1054
+ }
1055
+
1056
+
1057
+ /*************************************************************************
1058
+ * fix_rep_char()
1059
+ * The word is a repeated char. Find the repeated char character. Make a reject
1060
+ * string which rejects any char other than the voted char. Set the word to done
1061
+ * to stop rematching it.
1062
+ *
1063
+ *************************************************************************/
1064
+ void fix_rep_char( //Repeated char word
1065
+ WERD_RES *word //word to do
1066
+ ) {
1067
+ struct REP_CH
1068
+ {
1069
+ char ch[UNICHAR_LEN + 1];
1070
+ int count;
1071
+ };
1072
+
1073
+ REP_CH *rep_ch; //array of char counts
1074
+ int word_len;
1075
+ int rep_ch_count = 0; //how many unique chs
1076
+ const char *word_str; //the repeated chs
1077
+ int i, j;
1078
+ int offset;
1079
+ int total = 0;
1080
+ int max = 0;
1081
+ char *maxch = NULL; //Most common char
1082
+
1083
+ word_str = word->best_choice->string ().string ();
1084
+ word_len = word->best_choice->lengths ().length ();;
1085
+ rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
1086
+ for (i = 0, offset = 0; i < word_len;
1087
+ offset += word->best_choice->lengths()[i++]) {
1088
+ for (j = 0; j < rep_ch_count &&
1089
+ strncmp(rep_ch[j].ch, word_str + offset,
1090
+ word->best_choice->lengths()[i]) != 0; j++);
1091
+ if (j < rep_ch_count)
1092
+ rep_ch[j].count++;
1093
+ else {
1094
+ strncpy(rep_ch[rep_ch_count].ch, word_str + offset,
1095
+ word->best_choice->lengths()[i]);
1096
+ rep_ch[rep_ch_count].ch[word->best_choice->lengths()[i]] = '\0';
1097
+ rep_ch[rep_ch_count].count = 1;
1098
+ rep_ch_count++;
1099
+ }
1100
+ }
1101
+
1102
+ for (j = 0; j < rep_ch_count; j++) {
1103
+ total += rep_ch[j].count;
1104
+ if ((rep_ch[j].count > max) && (*rep_ch[j].ch != ' ')) {
1105
+ max = rep_ch[j].count;
1106
+ maxch = rep_ch[j].ch;
1107
+ }
1108
+ }
1109
+ // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
1110
+ // word_str, word_len, total, maxch );
1111
+ free_mem(rep_ch);
1112
+
1113
+ word->reject_map.initialise (word_len);
1114
+ for (i = 0, offset = 0; i < word_len;
1115
+ offset += word->best_choice->lengths()[i++]) {
1116
+ if (strncmp(word_str + offset, maxch,
1117
+ word->best_choice->lengths()[i]) != 0)
1118
+ //rej unrecognised blobs
1119
+ word->reject_map[i].setrej_bad_repetition ();
1120
+ }
1121
+ word->done = TRUE;
1122
+ }
1123
+
1124
+ // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1125
+ // training data.
1126
+
1127
// Helper for fix_quotes.
// Returns non-zero when the character starting at signed_str, whose UTF-8
// encoding is `length` bytes long, is a single quote: ASCII ' or `, or the
// three-byte sequences E2 80 98 / E2 80 99.  No byte is read for any other
// length.
static int is_simple_quote(const char* signed_str, int length) {
  const unsigned char* bytes =
    reinterpret_cast<const unsigned char*>(signed_str);

  if (length == 1) {
    //standard 1 byte quotes
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    //utf8 3 byte curved quotes differ only in the final byte
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
1142
+
1143
+ /**********************************************************************
1144
+ * fix_quotes
1145
+ *
1146
+ * Change pairs of quotes to double quotes.
1147
+ **********************************************************************/
1148
+ void fix_quotes( //make double quotes
1149
+ WERD_CHOICE *choice, //choice to fix
1150
+ WERD *word, //word to do //char choices
1151
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1152
+ char *str = (char *) choice->string().string();//string ptr
1153
+ int i;
1154
+ int offset;
1155
+ //blobs
1156
+ PBLOB_IT blob_it = word->blob_list ();
1157
+ //choices
1158
+ BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
1159
+ BLOB_CHOICE_IT it1; //first choices
1160
+ BLOB_CHOICE_IT it2; //second choices
1161
+
1162
+ for (i = 0, offset = 0; str[offset] != '\0';
1163
+ offset += choice->lengths()[i++],
1164
+ blob_it.forward (), choice_it.forward ()) {
1165
+ if (str[offset + choice->lengths()[i]] != '\0' &&
1166
+ is_simple_quote(str + offset, choice->lengths()[i]) &&
1167
+ is_simple_quote(str + offset + choice->lengths()[i],
1168
+ choice->lengths()[i + 1]) &&
1169
+ unicharset.contains_unichar("\"")) {
1170
+ str[offset] = '"'; //turn to double
1171
+ strcpy (str + offset + 1,
1172
+ str + offset + choice->lengths()[i] +
1173
+ choice->lengths()[i + 1]); //shuffle up
1174
+ choice->lengths()[i] = 1;
1175
+ strcpy ((char*) choice->lengths().string() + i + 1,
1176
+ choice->lengths().string() + i + 2);
1177
+ merge_blobs (blob_it.data (), blob_it.data_relative (1));
1178
+ blob_it.forward ();
1179
+ delete blob_it.extract (); //get rid of spare
1180
+
1181
+ it1.set_to_list (choice_it.data ());
1182
+ it2.set_to_list (choice_it.data_relative (1));
1183
+ if (it1.data ()->certainty () < it2.data ()->certainty ()) {
1184
+ choice_it.forward ();
1185
+ //get rid of spare
1186
+ delete choice_it.extract ();
1187
+ }
1188
+ else {
1189
+ //get rid of spare
1190
+ delete choice_it.extract ();
1191
+ choice_it.forward ();
1192
+ }
1193
+ }
1194
+ }
1195
+ }
1196
+
1197
+
1198
+ /**********************************************************************
1199
+ * fix_hyphens
1200
+ *
1201
+ * Change pairs of hyphens to a single hyphen if the bounding boxes touch
1202
+ * Typically a long dash which has been segmented.
1203
+ **********************************************************************/
1204
+ void fix_hyphens( //crunch double hyphens
1205
+ WERD_CHOICE *choice, //choice to fix
1206
+ WERD *word, //word to do //char choices
1207
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1208
+ char *str = (char *) choice->string().string();//string ptr
1209
+ int i;
1210
+ int offset;
1211
+ //blobs
1212
+ PBLOB_IT blob_it = word->blob_list ();
1213
+ //choices
1214
+ BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
1215
+ BLOB_CHOICE_IT it1; //first choices
1216
+ BLOB_CHOICE_IT it2; //second choices
1217
+
1218
+ for (i = 0, offset = 0; str[offset] != '\0';
1219
+ offset += choice->lengths()[i++],
1220
+ blob_it.forward (), choice_it.forward ()) {
1221
+ if ((str[offset] == '-' || str[offset] == '~') &&
1222
+ (str[offset + choice->lengths()[i]] == '-' ||
1223
+ str[offset + choice->lengths()[i]] == '~') &&
1224
+ (blob_it.data ()->bounding_box ().right () >=
1225
+ blob_it.data_relative (1)->bounding_box ().left ())) {
1226
+ str[offset] = '-'; //turn to single hyphen
1227
+ strcpy (str + offset + choice->lengths()[i],
1228
+ str + offset + choice->lengths()[i] +
1229
+ choice->lengths()[i + 1]); //shuffle up
1230
+ strcpy ((char*) choice->lengths().string() + i + 1,
1231
+ choice->lengths().string() + i + 2);
1232
+ merge_blobs (blob_it.data (), blob_it.data_relative (1));
1233
+ blob_it.forward ();
1234
+ delete blob_it.extract (); //get rid of spare
1235
+
1236
+ it1.set_to_list (choice_it.data ());
1237
+ it2.set_to_list (choice_it.data_relative (1));
1238
+ if (it1.data ()->certainty () < it2.data ()->certainty ()) {
1239
+ choice_it.forward ();
1240
+ //get rid of spare
1241
+ delete choice_it.extract ();
1242
+ }
1243
+ else {
1244
+ //get rid of spare
1245
+ delete choice_it.extract ();
1246
+ choice_it.forward ();
1247
+ }
1248
+ }
1249
+ }
1250
+ }
1251
+
1252
+
1253
+ /**********************************************************************
1254
+ * merge_blobs
1255
+ *
1256
+ * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted.
1257
+ **********************************************************************/
1258
+
1259
+ void merge_blobs( //combine 2 blobs
1260
+ PBLOB *blob1, //dest blob
1261
+ PBLOB *blob2 //source blob
1262
+ ) {
1263
+ OUTLINE_IT outline_it = blob1->out_list ();
1264
+ //iterator
1265
+
1266
+ outline_it.move_to_last (); //go to end
1267
+ //do it
1268
+ outline_it.add_list_after (blob2->out_list ());
1269
+ }
1270
+
1271
+
1272
+ /**********************************************************************
1273
+ * choice_dump_tester
1274
+ *
1275
+ * Matcher tester function which generates .chc file entries.
1276
+ * Called via test_segment_pass2 for every blob tested by tess in a word.
1277
+ * (But only for words for which a correct segmentation could be found.)
1278
+ **********************************************************************/
1279
+
1280
+ void choice_dump_tester( //dump chars in word
1281
+ PBLOB *, //blob
1282
+ DENORM *, //de-normaliser
1283
+ BOOL8 correct, //ly segmented
1284
+ char *text, //correct text
1285
+ inT32 count, //chars in text
1286
+ BLOB_CHOICE_LIST *ratings //list of results
1287
+ ) {
1288
+ STRING choice_file_name;
1289
+ BLOB_CHOICE *blob_choice;
1290
+ BLOB_CHOICE_IT it;
1291
+ char source_chars[20];
1292
+ char correct_char[3];
1293
+
1294
+ if (choice_file == NULL) {
1295
+ choice_file_name = imagebasename + ".chc";
1296
+ if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
1297
+ CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
1298
+ choice_file_name.string (), errno);
1299
+ }
1300
+ }
1301
+
1302
+ if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
1303
+ strcpy (source_chars, "$$");
1304
+ strcpy (correct_char, "$$");
1305
+ }
1306
+ else {
1307
+ strncpy(source_chars, text, count);
1308
+ source_chars[count] = '\0';
1309
+ if (correct) {
1310
+ correct_char[0] = text[0];
1311
+ correct_char[1] = '\0';
1312
+ }
1313
+ else {
1314
+ strcpy (correct_char, "$$");
1315
+ }
1316
+ }
1317
+ fprintf (choice_file, "%s\t%s", source_chars, correct_char);
1318
+
1319
+ it.set_to_list (ratings);
1320
+ for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
1321
+ blob_choice = it.data ();
1322
+ fprintf (choice_file, "\t%s\t%f\t%f",
1323
+ blob_choice->unichar (),
1324
+ blob_choice->rating (), blob_choice->certainty ());
1325
+ }
1326
+ fprintf (choice_file, "\n");
1327
+ }
1328
+
1329
+
1330
+ /*************************************************************************
1331
+ * make_bln_copy()
1332
+ *
1333
+ * Generate a baseline normalised copy of the source word. The copy is done so
1334
+ * that whatever format the original word is in, a polygonal bln version is
1335
+ * generated as output.
1336
+ *************************************************************************/
1337
+
1338
+ WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
1339
+ WERD *result;
1340
+
1341
+ // if (wordit_linearc && !src_word->flag(W_POLYGON))
1342
+ // {
1343
+ // larc_word = src_word->larc_copy( row->x_height() );
1344
+ // result = larc_word->poly_copy( row->x_height() );
1345
+ // delete larc_word;
1346
+ // }
1347
+ // else
1348
+ result = src_word->poly_copy (row->x_height ());
1349
+
1350
+ // if (tessedit_draw_words)
1351
+ // {
1352
+ // if ( la_win == NO_WINDOW )
1353
+ // create_la_win();
1354
+ // result->plot( la_win );
1355
+ // }
1356
+ result->baseline_normalise_x (row, x_height, denorm);
1357
+ return result;
1358
+ }
1359
+
1360
+
1361
+ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
1362
+ const char *lengths) {
1363
+ int i = 0;
1364
+ int offset = 0;
1365
+ int leading_punct_count;
1366
+ int upper_count = 0;
1367
+ int hyphen_pos = -1;
1368
+ ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1369
+
1370
+ if (strlen (lengths) > 20)
1371
+ return word_type;
1372
+
1373
+ /* Single Leading punctuation char*/
1374
+
1375
+ if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
1376
+ offset += lengths[i++];
1377
+ leading_punct_count = i;
1378
+
1379
+ /* Initial cap */
1380
+ while ((s[offset] != '\0') &&
1381
+ unicharset.get_isupper(s + offset, lengths[i])) {
1382
+ offset += lengths[i++];
1383
+ upper_count++;
1384
+ }
1385
+ if (upper_count > 1)
1386
+ word_type = AC_UPPER_CASE;
1387
+ else {
1388
+ /* Lower case word, possibly with an initial cap */
1389
+ while ((s[offset] != '\0') &&
1390
+ unicharset.get_islower (s + offset, lengths[i])) {
1391
+ offset += lengths[i++];
1392
+ }
1393
+ if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1394
+ goto not_a_word;
1395
+ /*
1396
+ Allow a single hyphen in a lower case word
1397
+ - dont trust upper case - I've seen several cases of "H" -> "I-I"
1398
+ */
1399
+ if (lengths[i] == 1 && s[offset] == '-') {
1400
+ hyphen_pos = i;
1401
+ offset += lengths[i++];
1402
+ if (s[offset] != '\0') {
1403
+ while ((s[offset] != '\0') &&
1404
+ unicharset.get_islower(s + offset, lengths[i])) {
1405
+ offset += lengths[i++];
1406
+ }
1407
+ if (i < hyphen_pos + 3)
1408
+ goto not_a_word;
1409
+ }
1410
+ }
1411
+ else {
1412
+ /* Allow "'s" in NON hyphenated lower case words */
1413
+ if (lengths[i] == 1 && (s[offset] == '\'') &&
1414
+ lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1415
+ offset += lengths[i++];
1416
+ offset += lengths[i++];
1417
+ }
1418
+ }
1419
+ if (upper_count > 0)
1420
+ word_type = AC_INITIAL_CAP;
1421
+ else
1422
+ word_type = AC_LOWER_CASE;
1423
+ }
1424
+
1425
+ /* Up to two different, constrained trailing punctuation chars */
1426
+ if (lengths[i] == 1 && (s[offset] != '\0') &&
1427
+ (STRING (chs_trailing_punct1).contains (s[offset])))
1428
+ offset += lengths[i++];
1429
+ if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
1430
+ (s[offset - lengths[i - 1]] != s[offset]) &&
1431
+ (STRING (chs_trailing_punct2).contains (s[offset])))
1432
+ offset += lengths[i++];
1433
+
1434
+ if (s[offset] != '\0')
1435
+ word_type = AC_UNACCEPTABLE;
1436
+
1437
+ not_a_word:
1438
+
1439
+ if (word_type == AC_UNACCEPTABLE) {
1440
+ /* Look for abbreviation string */
1441
+ i = 0;
1442
+ offset = 0;
1443
+ if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
1444
+ word_type = AC_UC_ABBREV;
1445
+ while ((s[offset] != '\0') &&
1446
+ unicharset.get_isupper(s + offset, lengths[i]) &&
1447
+ (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1448
+ offset += lengths[i++];
1449
+ offset += lengths[i++];
1450
+ }
1451
+ }
1452
+ else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
1453
+ word_type = AC_LC_ABBREV;
1454
+ while ((s[offset] != '\0') &&
1455
+ unicharset.get_islower(s + offset, lengths[i]) &&
1456
+ (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
1457
+ offset += lengths[i++];
1458
+ offset += lengths[i++];
1459
+ }
1460
+ }
1461
+ if (s[offset] != '\0')
1462
+ word_type = AC_UNACCEPTABLE;
1463
+ }
1464
+
1465
+ return word_type;
1466
+ }
1467
+
1468
+
1469
+ /* DEBUGGING ROUTINE */
1470
+
1471
+ BOOL8 check_debug_pt(WERD_RES *word, int location) {
1472
+ BOOL8 show_map_detail = FALSE;
1473
+ inT16 i;
1474
+
1475
+ #ifndef SECURE_NAMES
1476
+ if (!test_pt)
1477
+ return FALSE;
1478
+
1479
+ tessedit_rejection_debug.set_value (FALSE);
1480
+ debug_x_ht_level.set_value (0);
1481
+ tessedit_cluster_debug.set_value (FALSE);
1482
+ nn_debug.set_value (FALSE);
1483
+ nn_reject_debug.set_value (FALSE);
1484
+
1485
+ if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1486
+ if (location < 0)
1487
+ return TRUE; //For breakpoint use
1488
+ tessedit_rejection_debug.set_value (TRUE);
1489
+ debug_x_ht_level.set_value (20);
1490
+ tessedit_cluster_debug.set_value (TRUE);
1491
+ nn_debug.set_value (TRUE);
1492
+ nn_reject_debug.set_value (TRUE);
1493
+ tprintf ("\n\nTESTWD::");
1494
+ switch (location) {
1495
+ case 0:
1496
+ tprintf ("classify_word_pass1 start\n");
1497
+ word->word->print (debug_fp);
1498
+ break;
1499
+ case 10:
1500
+ tprintf ("make_reject_map: initial map");
1501
+ break;
1502
+ case 20:
1503
+ tprintf ("make_reject_map: after NN");
1504
+ break;
1505
+ case 30:
1506
+ tprintf ("classify_word_pass2 - START");
1507
+ break;
1508
+ case 40:
1509
+ tprintf ("classify_word_pass2 - Pre Xht");
1510
+ break;
1511
+ case 50:
1512
+ tprintf ("classify_word_pass2 - END");
1513
+ show_map_detail = TRUE;
1514
+ break;
1515
+ case 60:
1516
+ tprintf ("fixspace");
1517
+ break;
1518
+ case 70:
1519
+ tprintf ("MM pass START");
1520
+ break;
1521
+ case 80:
1522
+ tprintf ("MM pass END");
1523
+ break;
1524
+ case 90:
1525
+ tprintf ("After Poor quality rejection");
1526
+ break;
1527
+ case 100:
1528
+ tprintf ("unrej_good_quality_words - START");
1529
+ break;
1530
+ case 110:
1531
+ tprintf ("unrej_good_quality_words - END");
1532
+ break;
1533
+ case 120:
1534
+ tprintf ("Write results pass");
1535
+ show_map_detail = TRUE;
1536
+ break;
1537
+ }
1538
+ tprintf (" \"%s\" ", word->best_choice->string ().string ());
1539
+ word->reject_map.print (debug_fp);
1540
+ tprintf ("\n");
1541
+ if (show_map_detail) {
1542
+ tprintf ("\"%s\"\n", word->best_choice->string ().string ());
1543
+ for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
1544
+ tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
1545
+ word->reject_map[i].full_print (debug_fp);
1546
+ }
1547
+ }
1548
+
1549
+ tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1550
+ tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1551
+ return TRUE;
1552
+ }
1553
+ else
1554
+ #endif
1555
+ return FALSE;
1556
+ }
1557
+
1558
+
1559
+ /**********************************************************************
1560
+ * set_word_fonts
1561
+ *
1562
+ * Get the fonts for the word.
1563
+ **********************************************************************/
1564
+
1565
+ void set_word_fonts( //good chars in word
1566
+ WERD_RES *word, //word to adapt to //detailed results
1567
+ BLOB_CHOICE_LIST_CLIST *blob_choices) {
1568
+ inT32 index; //char index
1569
+ inT32 offset; //char offset
1570
+ char choice_char[UNICHAR_LEN + 1]; //char from word
1571
+ inT8 config; //font of char
1572
+ //character iterator
1573
+ BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1574
+ BLOB_CHOICE_IT choice_it; //choice iterator
1575
+ STATS fonts (0, 32); //font counters
1576
+ static inT8 italic_table[32] = {
1577
+ 1, -1, 1, -1,
1578
+ 1, -1, 1, -1,
1579
+ 1, -1, 1, -1,
1580
+ 1, -1, 1, -1,
1581
+ 1, -1, 1, -1,
1582
+ 1, -1, 1, -1,
1583
+ 1, -1, 1, -1,
1584
+ 1, -1, 1, -1
1585
+ };
1586
+ static inT8 bold_table[32] = {
1587
+ 1, 1, -1, -1,
1588
+ 1, 1, -1, -1,
1589
+ 1, 1, -1, -1,
1590
+ 1, 1, -1, -1,
1591
+ 1, 1, -1, -1,
1592
+ 1, 1, -1, -1,
1593
+ 1, 1, -1, -1,
1594
+ 1, 1, -1, -1
1595
+ };
1596
+ static inT8 font_table[32] = {
1597
+ 2, 2, 2, 2,
1598
+ -1, -1, -1, -1,
1599
+ 0, 0, 0, 0,
1600
+ 1, 1, 1, 1,
1601
+ 3, 3, 3, 3,
1602
+ 4, 4, 4, 4,
1603
+ 5, 5, 5, 5,
1604
+ 2, 2, 2, 2
1605
+ };
1606
+
1607
+ word->italic = 0;
1608
+ word->bold = 0;
1609
+ for (char_it.mark_cycle_pt (), index = 0, offset = 0;
1610
+ !char_it.cycled_list (); char_it.forward (),
1611
+ offset += word->best_choice->lengths()[index++]) {
1612
+ strncpy(choice_char, word->best_choice->string ().string() + offset,
1613
+ word->best_choice->lengths()[index]);
1614
+ choice_char[word->best_choice->lengths()[index]] = '\0';
1615
+ choice_it.set_to_list (char_it.data ());
1616
+ for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
1617
+ choice_it.forward ()) {
1618
+ if (strcmp(choice_it.data ()->unichar (), choice_char) == 0) {
1619
+ config = choice_it.data ()->config ();
1620
+ if (tessedit_debug_fonts)
1621
+ tprintf ("%s(%d=%d%c%c)",
1622
+ choice_char, config, (config & 31) >> 2,
1623
+ config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
1624
+ if (config != -1) {
1625
+ config &= 31;
1626
+ word->italic += italic_table[config];
1627
+ word->bold += bold_table[config];
1628
+ if (font_table[config] != -1)
1629
+ fonts.add (font_table[config], 1);
1630
+ }
1631
+ break;
1632
+ }
1633
+ }
1634
+ }
1635
+ find_modal_font (&fonts, &word->font1, &word->font1_count);
1636
+ find_modal_font (&fonts, &word->font2, &word->font2_count);
1637
+ if (tessedit_debug_fonts)
1638
+ tprintf ("\n");
1639
+ /* if (word->font1_count>0)
1640
+ {
1641
+ for (char_it.mark_cycle_pt(),index=0;
1642
+ !char_it.cycled_list();char_it.forward(),index++)
1643
+ {
1644
+ choice_char=word->best_choice->string()[index];
1645
+ choice_it.set_to_list(char_it.data());
1646
+ for (choice_it.mark_cycle_pt();!choice_it.cycled_list();choice_it.forward())
1647
+ {
1648
+ if (choice_it.data()->char_class()==choice_char)
1649
+ {
1650
+ config=choice_it.data()->config();
1651
+ if (config!=-1 && font_table[config&31]==word->font1)
1652
+ {
1653
+ word->italic+=italic_table[config];
1654
+ word->bold+=bold_table[config];
1655
+ }
1656
+ break;
1657
+ }
1658
+ }
1659
+ }
1660
+ }*/
1661
+ }
1662
+
1663
+
1664
+ /**********************************************************************
1665
+ * font_recognition_pass
1666
+ *
1667
+ * Smooth the fonts for the document.
1668
+ **********************************************************************/
1669
+
1670
+ void font_recognition_pass( //good chars in word
1671
+ PAGE_RES_IT &page_res_it) {
1672
+ inT32 length; //of word
1673
+ inT32 count; //of a feature
1674
+ inT8 doc_font; //modal font
1675
+ inT8 doc_font_count; //modal font
1676
+ inT32 doc_italic; //total italics
1677
+ inT32 doc_bold; //total bolds
1678
+ ROW_RES *row = NULL; //current row
1679
+ WERD_RES *word; //current word
1680
+ STATS fonts (0, 32); //font counters
1681
+ STATS doc_fonts (0, 32); //font counters
1682
+
1683
+ doc_italic = 0;
1684
+ doc_bold = 0;
1685
+ page_res_it.restart_page ();
1686
+ while (page_res_it.word () != NULL) {
1687
+ if (row != page_res_it.row ()) {
1688
+ if (row != NULL) {
1689
+ find_modal_font (&fonts, &row->font1, &row->font1_count);
1690
+ find_modal_font (&fonts, &row->font2, &row->font2_count);
1691
+ }
1692
+ row = page_res_it.row (); //current row
1693
+ fonts.clear (); //clear counters
1694
+ row->italic = 0;
1695
+ row->bold = 0;
1696
+ }
1697
+ word = page_res_it.word ();
1698
+ row->italic += word->italic;
1699
+ row->bold += word->bold;
1700
+ fonts.add (word->font1, word->font1_count);
1701
+ fonts.add (word->font2, word->font2_count);
1702
+ doc_italic += word->italic;
1703
+ doc_bold += word->bold;
1704
+ doc_fonts.add (word->font1, word->font1_count);
1705
+ doc_fonts.add (word->font2, word->font2_count);
1706
+ page_res_it.forward ();
1707
+ }
1708
+ if (row != NULL) {
1709
+ find_modal_font (&fonts, &row->font1, &row->font1_count);
1710
+ find_modal_font (&fonts, &row->font2, &row->font2_count);
1711
+ }
1712
+ find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1713
+ /*
1714
+ row=NULL;
1715
+ page_res_it.restart_page();
1716
+ while (page_res_it.word() != NULL)
1717
+ {
1718
+ if (row!=page_res_it.row())
1719
+ {
1720
+ row2=row;
1721
+ row=page_res_it.row();
1722
+ if (row->font1_count<MIN_FONT_ROW_COUNT)
1723
+ {
1724
+ fonts.clear();
1725
+ italic=0;
1726
+ bold=0;
1727
+ add_in_one_row(row,&fonts,&italic,&bold);
1728
+ if (row2!=NULL)
1729
+ {
1730
+ hdiff=row->row->x_height()-row2->row->x_height();
1731
+ if (hdiff<0)
1732
+ hdiff=-hdiff;
1733
+ if (hdiff<MAX_XHEIGHT_DIFF)
1734
+ add_in_one_row(row2,&fonts,&italic,&bold);
1735
+ }
1736
+ do
1737
+ page_res_it.forward();
1738
+ while (page_res_it.row()==row);
1739
+ row2=page_res_it.row();
1740
+ if (row2!=NULL)
1741
+ {
1742
+ hdiff=row->row->x_height()-row2->row->x_height();
1743
+ if (hdiff<0)
1744
+ hdiff=-hdiff;
1745
+ if (hdiff<MAX_XHEIGHT_DIFF)
1746
+ add_in_one_row(row2,&fonts,&italic,&bold);
1747
+ }
1748
+ row->italic=italic;
1749
+ row->bold=bold;
1750
+ find_modal_font(&fonts,&row->font1,&row->font1_count);
1751
+ find_modal_font(&fonts,&row->font2,&row->font2_count);
1752
+ }
1753
+ else
1754
+ page_res_it.forward();
1755
+ }
1756
+ else
1757
+ page_res_it.forward();
1758
+ }*/
1759
+
1760
+ page_res_it.restart_page ();
1761
+ while (page_res_it.word () != NULL) {
1762
+ row = page_res_it.row (); //current row
1763
+ word = page_res_it.word ();
1764
+ length = word->best_choice->string ().length ();
1765
+
1766
+ count = word->italic;
1767
+ if (count < 0)
1768
+ count = -count;
1769
+ if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1770
+ word->italic = doc_italic > 0 ? 1 : -1;
1771
+
1772
+ count = word->bold;
1773
+ if (count < 0)
1774
+ count = -count;
1775
+ if (!(count == length || (length > 3 && count >= length * 3 / 4)))
1776
+ word->bold = doc_bold > 0 ? 1 : -1;
1777
+
1778
+ count = word->font1_count;
1779
+ if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1780
+ word->font1 = doc_font;
1781
+ word->font1_count = doc_font_count;
1782
+ }
1783
+
1784
+ page_res_it.forward ();
1785
+ }
1786
+ }
1787
+
1788
+
1789
+ /**********************************************************************
1790
+ * add_in_one_row
1791
+ *
1792
+ * Add into the stats for one row.
1793
+ **********************************************************************/
1794
+
1795
+ void add_in_one_row( //good chars in word
1796
+ ROW_RES *row, //current row
1797
+ STATS *fonts, //font stats
1798
+ inT8 *italic, //output count
1799
+ inT8 *bold //output count
1800
+ ) {
1801
+ WERD_RES *word; //current word
1802
+ WERD_RES_IT word_it = &row->word_res_list;
1803
+
1804
+ for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
1805
+ word = word_it.data ();
1806
+ *italic += word->italic;
1807
+ *bold += word->bold;
1808
+ if (word->font1_count > 0)
1809
+ fonts->add (word->font1, word->font1_count);
1810
+ if (word->font2_count > 0)
1811
+ fonts->add (word->font2, word->font2_count);
1812
+
1813
+ }
1814
+ }
1815
+
1816
+
1817
+ /**********************************************************************
1818
+ * find_modal_font
1819
+ *
1820
+ * Find the modal font and remove from the stats.
1821
+ **********************************************************************/
1822
+
1823
+ void find_modal_font( //good chars in word
1824
+ STATS *fonts, //font stats
1825
+ inT8 *font_out, //output font
1826
+ inT8 *font_count //output count
1827
+ ) {
1828
+ inT8 font; //font index
1829
+ inT32 count; //pile couat
1830
+
1831
+ if (fonts->get_total () > 0) {
1832
+ font = (inT8) fonts->mode ();
1833
+ *font_out = font;
1834
+ count = fonts->pile_count (font);
1835
+ *font_count = count < MAX_INT8 ? count : MAX_INT8;
1836
+ fonts->add (font, -*font_count);
1837
+ }
1838
+ else {
1839
+ *font_out = -1;
1840
+ *font_count = 0;
1841
+ }
1842
+ }