tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1273 @@
1
+ /******************************************************************
2
+ * File: output.cpp (Formerly output.c)
3
+ * Description: Output pass
4
+ * Author: Phil Cheatle
5
+ * Created: Thu Aug 4 10:56:08 BST 1994
6
+ *
7
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include "ocrshell.h"
22
+ #include <string.h>
23
+ #include <ctype.h>
24
+ #ifdef __UNIX__
25
+ #include <assert.h>
26
+ #include <unistd.h>
27
+ #include <errno.h>
28
+ #endif
29
+ #include "mainblk.h"
30
+ #include "tfacep.h"
31
+ #include "tessvars.h"
32
+ #include "control.h"
33
+ #include "secname.h"
34
+ #include "reject.h"
35
+ #include "docqual.h"
36
+ #include "output.h"
37
+ #include "bestfirst.h"
38
+ #include "globals.h"
39
+
40
+ #define EXTERN //defined empty here, so the EXTERN prefix adds nothing to the declarations below
41
+
42
+ #define EPAPER_EXT ".ep"
43
+ #define PAGE_YSIZE 3508 //NOTE(review): presumably A4 page height in pixels at 300dpi - confirm
44
+ #define CTRL_INSET '\024' //dc4=text inset
45
+ #define CTRL_FONT '\016' //so=font change
46
+ #define CTRL_DEFAULT '\017' //si=default font
47
+ #define CTRL_SHIFT '\022' //dc2=x shift
48
+ #define CTRL_TAB '\011' //tab
49
+ #define CTRL_NEWLINE '\012' //newline
50
+ #define CTRL_HARDLINE '\015' //cr
51
+ int NO_BLOCK = 0; //don't output block information
52
+ inT16 XOFFSET = 0; //the image can be a part of bigger picture and we want to have the original coordinates
53
+ inT16 YOFFSET = 0; //y counterpart of XOFFSET
54
+ //Run-time variables declared through the *_VAR / *_EVAR macros; the arguments look like (name, default, description) - confirm against the macro definitions
55
+ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
56
+ "Write block separators in output");
57
+ EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
58
+ "Write raw stuff to name.raw");
59
+ EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
60
+ EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
61
+ "Return ratings in IPEOCRAPI data");
62
+ EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
63
+ "Write .txt to .etx map file");
64
+ EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
65
+ "Write repetition char code");
66
+ EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
67
+ EXTERN STRING_EVAR (unrecognised_char, "|",
68
+ "Output char for unidentified blobs");
69
+ EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
70
+ EXTERN INT_VAR (suspect_space_level, 100,
71
+ "Min suspect level for rejecting spaces");
72
+ EXTERN INT_VAR (suspect_short_words, 2,
73
+ "Dont Suspect dict wds longer than this");
74
+ EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
75
+ "UNLV keep 1Il chars rejected");
76
+ EXTERN double_VAR (suspect_rating_per_ch, 999.9,
77
+ "Dont touch bad rating limit");
78
+ EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");
79
+
80
+ EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
81
+ "Only reject tess failures");
82
+ EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
83
+ EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
84
+ "Make output have exactly one word per WERD");
85
+ EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
86
+ "Dont reject ANYTHING AT ALL");
87
+ EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
88
+ "Force all rep chars the same");
89
+ //Output streams; both start out NULL
90
+ FILE *txt_mapfile = NULL; //reject map, opened as ".map" by output_pass
91
+ FILE *unlv_file = NULL; //NOTE(review): was labelled "reject map" (copy-paste?); presumably the .unlv output file - confirm
92
+
93
+ /**********************************************************************
94
+ * pixels_to_pts
95
+ *
96
+ * Convert an integer number of pixels to the nearest integer
97
+ * number of points.
98
+ **********************************************************************/
99
+
100
+ inT32 pixels_to_pts( //convert coords
101
+ inT32 pixels,
102
+ inT32 pix_res //resolution
103
+ ) {
104
+ float pts; //converted value
105
+
106
+ pts = pixels * 72.0 / pix_res;
107
+ return (inT32) (pts + 0.5); //round it
108
+ }
109
+
110
+ void output_pass( //Tess output pass //send to api
111
+ PAGE_RES_IT &page_res_it,
112
+ BOOL8 write_to_shm,
113
+ TBOX *target_word_box) {
114
+ BLOCK_RES *block_of_last_word;
115
+ inT16 block_id;
116
+ BOOL8 force_eol; //During output
117
+ BLOCK *nextblock; //block of next word
118
+ WERD *nextword; //next word
119
+
120
+ if (tessedit_write_txt_map)
121
+ txt_mapfile = open_outfile (".map");
122
+
123
+ page_res_it.restart_page ();
124
+ block_of_last_word = NULL;
125
+ while (page_res_it.word () != NULL) {
126
+ check_debug_pt (page_res_it.word (), 120);
127
+
128
+ if (target_word_box)
129
+ {
130
+
131
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
132
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
133
+ if (!target_word_box->contains(center_pt))
134
+ {
135
+ page_res_it.forward ();
136
+ continue;
137
+ }
138
+
139
+ }
140
+ if (tessedit_write_block_separators &&
141
+ block_of_last_word != page_res_it.block ()) {
142
+ block_of_last_word = page_res_it.block ();
143
+ if (block_of_last_word->block->text_region () == NULL) {
144
+ if (block_of_last_word->block->poly_block () == NULL)
145
+ block_id = 1;
146
+ else
147
+ block_id =
148
+ ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
149
+ id_no();
150
+ }
151
+ else
152
+ block_id = block_of_last_word->block->text_region ()->id_no ();
153
+ if (!NO_BLOCK)
154
+ fprintf (textfile, "|^~tr%d\n", block_id);
155
+ fprintf (txt_mapfile, "|^~tr%d\n", block_id);
156
+ }
157
+
158
+ force_eol = (tessedit_write_block_separators &&
159
+ (page_res_it.block () != page_res_it.next_block ())) ||
160
+ (page_res_it.next_word () == NULL);
161
+
162
+ if (page_res_it.next_word () != NULL)
163
+ nextword = page_res_it.next_word ()->word;
164
+ else
165
+ nextword = NULL;
166
+ if (page_res_it.next_block () != NULL)
167
+ nextblock = page_res_it.next_block ()->block;
168
+ else
169
+ nextblock = NULL;
170
+ //regardless of tilde crunching
171
+ write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,
172
+ write_to_shm);
173
+ page_res_it.forward ();
174
+ }
175
+ if (write_to_shm)
176
+ ocr_send_text(FALSE);
177
+ if (tessedit_write_block_separators) {
178
+ if (!NO_BLOCK)
179
+ fprintf (textfile, "|^~tr\n");
180
+ fprintf (txt_mapfile, "|^~tr\n");
181
+ }
182
+ if (tessedit_write_txt_map) {
183
+ fprintf (txt_mapfile, "\n"); //because txt gets one
184
+ #ifdef __UNIX__
185
+ fsync (fileno (txt_mapfile));
186
+ #endif
187
+ fclose(txt_mapfile);
188
+ }
189
+ }
190
+
191
+ /*************************************************************************
192
+ * write_results()
193
+ *
194
+ * All recognition and rejection has now been done. Generate the following:
195
+ * .txt file - giving the final best choices with NO highlighting
196
+ * .raw file - giving the tesseract top choice output for each word
197
+ * .map file - showing how the .txt file has been rejected in the .ep file
198
+ * epchoice list - a list of one element per word, containing the text for the
199
+ * epaper. Reject strings are inserted.
200
+ * inset list - a list of bounding boxes of reject insets - indexed by the
201
+ * reject strings in the epchoice text.
202
+ *************************************************************************/
203
+
204
+ void write_results( //output a word
205
+ PAGE_RES_IT &page_res_it, //full info
206
+ char newline_type, //type of newline
207
+ BOOL8 force_eol, //override tilde crunch?
208
+ BOOL8 write_to_shm //send to api
209
+ ) {
210
+ //word to do
211
+ WERD_RES *word = page_res_it.word ();
212
+ // WERD_CHOICE *ep_choice; //ep format
213
+ STRING repetition_code;
214
+ const STRING *wordstr;
215
+ STRING wordstr_lengths;
216
+ const char *text;
217
+ int i;
218
+ char unrecognised = STRING (unrecognised_char)[0];
219
+ char ep_chars[32]; //Only for unlv_tilde_crunch
220
+ int ep_chars_index = 0;
221
+ char txt_chs[32]; //Only for unlv_tilde_crunch
222
+ char map_chs[32]; //Only for unlv_tilde_crunch
223
+ int txt_index = 0;
224
+ static BOOL8 tilde_crunch_written = FALSE;
225
+ static BOOL8 last_char_was_newline = TRUE;
226
+ static BOOL8 last_char_was_tilde = FALSE;
227
+ static BOOL8 empty_block = TRUE;
228
+ BOOL8 need_reject = FALSE;
229
+ char *ptr; //string ptr
230
+ PBLOB_IT blob_it; //blobs
231
+
232
+ /* if (word->best_choice->string().length() == 0)
233
+ {
234
+ tprintf("No output: to output\n");
235
+ }
236
+ else if (word->best_choice->string()[0]==' ')
237
+ {
238
+ tprintf("spaceword to output\n");
239
+ }
240
+ else if (word->best_choice->string()[0]=='\0')
241
+ {
242
+ tprintf("null to output\n");
243
+ }*/
244
+ if (word->unlv_crunch_mode != CR_NONE
245
+ && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
246
+ if ((word->unlv_crunch_mode != CR_DELETE) &&
247
+ (!tilde_crunch_written ||
248
+ ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
249
+ (word->word->space () > 0) &&
250
+ !word->word->flag (W_FUZZY_NON) &&
251
+ !word->word->flag (W_FUZZY_SP)))) {
252
+ if (!word->word->flag (W_BOL) &&
253
+ (word->word->space () > 0) &&
254
+ !word->word->flag (W_FUZZY_NON) &&
255
+ !word->word->flag (W_FUZZY_SP)) {
256
+ /* Write a space to separate from preceeding good text */
257
+ txt_chs[txt_index] = ' ';
258
+ map_chs[txt_index++] = '1';
259
+ ep_chars[ep_chars_index++] = ' ';
260
+ last_char_was_tilde = FALSE;
261
+ }
262
+ need_reject = TRUE;
263
+ }
264
+ if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
265
+ /* Write a reject char - mark as rejected unless zero_rejection mode */
266
+ last_char_was_tilde = TRUE;
267
+ txt_chs[txt_index] = unrecognised;
268
+ if (tessedit_zero_rejection || (suspect_level == 0)) {
269
+ map_chs[txt_index++] = '1';
270
+ ep_chars[ep_chars_index++] = unrecognised;
271
+ }
272
+ else {
273
+ map_chs[txt_index++] = '0';
274
+ /*
275
+ The ep_choice string is a faked reject to allow newdiff to sync the .etx
276
+ with the .txt and .map files.
277
+ */
278
+ ep_chars[ep_chars_index++] = CTRL_INSET;
279
+ //escape code
280
+ //dummy reject
281
+ ep_chars[ep_chars_index++] = 1;
282
+ //dummy reject
283
+ ep_chars[ep_chars_index++] = 1;
284
+ //type
285
+ ep_chars[ep_chars_index++] = 2;
286
+ //dummy reject
287
+ ep_chars[ep_chars_index++] = 1;
288
+ //dummy reject
289
+ ep_chars[ep_chars_index++] = 1;
290
+ }
291
+ tilde_crunch_written = TRUE;
292
+ last_char_was_newline = FALSE;
293
+ empty_block = FALSE;
294
+ }
295
+
296
+ if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
297
+ /* Add a new line output */
298
+ txt_chs[txt_index] = '\n';
299
+ map_chs[txt_index++] = '\n';
300
+ //end line
301
+ ep_chars[ep_chars_index++] = newline_type;
302
+
303
+ //Cos of the real newline
304
+ tilde_crunch_written = FALSE;
305
+ last_char_was_newline = TRUE;
306
+ last_char_was_tilde = FALSE;
307
+ }
308
+ txt_chs[txt_index] = '\0';
309
+ map_chs[txt_index] = '\0';
310
+ //xiaofan
311
+ if (tessedit_write_output && !NO_BLOCK)
312
+ fprintf (textfile, "%s", txt_chs);
313
+
314
+ if (tessedit_write_txt_map)
315
+ fprintf (txt_mapfile, "%s", map_chs);
316
+
317
+ //terminate string
318
+ ep_chars[ep_chars_index] = '\0';
319
+ word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
320
+
321
+ if (force_eol)
322
+ empty_block = TRUE;
323
+ return;
324
+ }
325
+
326
+ /* NORMAL PROCESSING of non tilde crunched words */
327
+
328
+ tilde_crunch_written = FALSE;
329
+ if (newline_type)
330
+ last_char_was_newline = TRUE;
331
+ else
332
+ last_char_was_newline = FALSE;
333
+ empty_block = force_eol; //About to write a real word
334
+
335
+ if (unlv_tilde_crunching &&
336
+ last_char_was_tilde &&
337
+ (word->word->space () == 0) &&
338
+ !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
339
+ (word->best_choice->string ()[0] == ' ')) {
340
+ /* Prevent adjacent tilde across words - we know that adjacent tildes within
341
+ words have been removed */
342
+ ptr = (char *) word->best_choice->string ().string ();
343
+ strcpy (ptr, ptr + 1); //shuffle up
344
+ ptr = (char *) word->best_choice->lengths ().string ();
345
+ strcpy (ptr, ptr + 1); //shuffle up
346
+ word->reject_map.remove_pos (0);
347
+ blob_it = word->outword->blob_list ();
348
+ delete blob_it.extract (); //get rid of reject blob
349
+ }
350
+ if (newline_type ||
351
+ (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
352
+ last_char_was_tilde = FALSE;
353
+ else {
354
+ if (word->reject_map.length () > 0) {
355
+ for (i = 0, ptr = (char *) word->best_choice->string().string();
356
+ i < word->reject_map.length () - 1; ++i)
357
+ ptr += word->best_choice->lengths()[i];
358
+ if (*ptr == ' ')
359
+ last_char_was_tilde = TRUE;
360
+ else
361
+ last_char_was_tilde = FALSE;
362
+ }
363
+ else if (word->word->space () > 0)
364
+ last_char_was_tilde = FALSE;
365
+ /* else it is unchanged as there are no output chars */
366
+ }
367
+
368
+ ptr = (char *) word->best_choice->lengths ().string ();
369
+ ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
370
+
371
+ if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
372
+ ensure_rep_chars_are_consistent(word);
373
+
374
+ set_unlv_suspects(word);
375
+ check_debug_pt (word, 120);
376
+ if (tessedit_rejection_debug) {
377
+ tprintf ("Dict word: \"%s\": %d\n",
378
+ word->best_choice->string ().string (),
379
+ dict_word (word->best_choice->string ().string ()));
380
+ }
381
+
382
+ #if 0
383
+ if (tessedit_write_unlv) {
384
+ write_unlv_text(word);
385
+ }
386
+ #endif
387
+
388
+ if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
389
+ repetition_code = "|^~R";
390
+ wordstr_lengths = "\001\001\001\001";
391
+ repetition_code += unicharset.id_to_unichar(get_rep_char (word));
392
+ wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
393
+ wordstr = &repetition_code;
394
+ }
395
+ else {
396
+ wordstr = &(word->best_choice->string ());
397
+ wordstr_lengths = word->best_choice->lengths ();
398
+ if (tessedit_zero_rejection) {
399
+ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
400
+ text = wordstr->string ();
401
+ for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
402
+ if (word->reject_map[i].rejected ())
403
+ word->reject_map[i].setrej_minimal_rej_accept ();
404
+ }
405
+ }
406
+ if (tessedit_minimal_rejection) {
407
+ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
408
+ text = wordstr->string ();
409
+ for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
410
+ if ((*text != ' ') && word->reject_map[i].rejected ())
411
+ word->reject_map[i].setrej_minimal_rej_accept ();
412
+ }
413
+ }
414
+ }
415
+
416
+ if (write_to_shm)
417
+ write_shm_text (word, page_res_it.block ()->block,
418
+ page_res_it.row (), *wordstr, wordstr_lengths);
419
+
420
+ #if 0
421
+ if (tessedit_write_output)
422
+ write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
423
+
424
+ if (tessedit_write_raw_output)
425
+ write_cooked_text (word->word, word->raw_choice->string (),
426
+ TRUE, FALSE, rawfile);
427
+
428
+ if (tessedit_write_txt_map)
429
+ write_map(txt_mapfile, word);
430
+
431
+ ep_choice = make_epaper_choice (word, newline_type);
432
+ word->ep_choice = ep_choice;
433
+ #endif
434
+
435
+ character_count += word->best_choice->lengths ().length ();
436
+ word_count++;
437
+ }
438
+
439
+ /**********************************************************************
440
+ * make_epaper_choice
441
+ *
442
+ * Construct the epaper text string for a word, using the reject map to
443
+ * determine whether each blob should be rejected.
444
+ **********************************************************************/
445
+
446
+ #if 0 //make_epaper_choice is compiled out; its only call visible in this file (in write_results) is also under #if 0
447
+ WERD_CHOICE *make_epaper_choice( //convert one word
448
+ WERD_RES *word, //word to do
449
+ char newline_type //type of newline
450
+ ) {
451
+ inT16 index = 0; //to string
452
+ inT16 blobindex; //to word
453
+ inT16 prevright = 0; //right of previous blob
454
+ inT16 nextleft; //left of next blob
455
+ PBLOB *blob;
456
+ TBOX inset_box; //bounding box
457
+ PBLOB_IT blob_it; //blob iterator
458
+ char word_string[MAX_PATH]; //converted string; NOTE(review): the writes below are not bounds-checked against MAX_PATH
459
+ BOOL8 force_total_reject;
460
+ char unrecognised = STRING (unrecognised_char)[0];
461
+ //Iterate over the blobs of the output word
462
+ blob_it.set_to_list (word->outword->blob_list ());
463
+ //The reject map must have one entry per byte of the best choice string
464
+ ASSERT_HOST (word->reject_map.length () ==
465
+ word->best_choice->string ().length ());
466
+ /*
467
+ tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n",
468
+ word->best_choice->string().string(),
469
+ word->best_choice->string().length(),
470
+ blob_it.length(),
471
+ blob_count( word->outword ) );
472
+ */
473
+ //An empty best choice string forces every blob into one reject; otherwise blobs and characters must match one to one
474
+ if (word->best_choice->string ().length () == 0)
475
+ force_total_reject = TRUE;
476
+ else {
477
+ force_total_reject = FALSE;
478
+ ASSERT_HOST (blob_it.length () ==
479
+ word->best_choice->string ().length ());
480
+ }
481
+ if (!blob_it.empty ()) {
482
+ for (index = 0; index < word->word->space (); index++)
483
+ word_string[index] = ' '; //leading blanks
484
+ }
485
+ /* Why does this generate leading blanks regardless of whether the
486
+ word_choice string is empty, when write_cooked_text only generates leading
487
+ blanks when the string is NOT empty???. */
488
+
489
+ if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { //repetition word: emit "|^~R" followed by the repeated character
490
+ strcpy (word_string + index, "|^~R");
491
+ index += 4;
492
+ strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
493
+ index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
494
+ }
495
+ else {
496
+ if (!blob_it.empty ())
497
+ prevright = blob_it.data ()->bounding_box ().left ();
498
+ //actually first left
499
+ for (blobindex = 0, blob_it.mark_cycle_pt ();
500
+ !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
501
+ blob = blob_it.data ();
502
+ if (word->reject_map[blobindex].accepted ()) { //accepted blob: copy its character
503
+ if (word->best_choice->string ()[blobindex] == ' ')
504
+ //but not rejected!!
505
+ word_string[index++] = unrecognised;
506
+ else
507
+ word_string[index++] =
508
+ word->best_choice->string ()[blobindex];
509
+ }
510
+ else { // start reject
511
+ inset_box = blob->bounding_box ();
512
+ /* Extend reject box to include rejected neighbours */
513
+ while (!blob_it.at_last () &&
514
+ (force_total_reject ||
515
+ (word->reject_map[blobindex + 1].rejected ()))) {
516
+ blobindex++;
517
+ blob = blob_it.forward ();
518
+ //get total box
519
+ inset_box += blob->bounding_box ();
520
+ }
521
+ if (blob_it.at_last ())
522
+ nextleft = inset_box.right ();
523
+ else
524
+ nextleft = blob_it.data_relative (1)->bounding_box ().left ();
525
+
526
+ // tprintf("Making reject from (%d,%d)->(%d,%d)\n",
527
+ // inset_box.left(),inset_box.bottom(),
528
+ // inset_box.right(),inset_box.top());
529
+
530
+ index += make_reject (&inset_box, prevright, nextleft,
531
+ &word->denorm, &word_string[index]);
532
+ }
533
+ prevright = blob->bounding_box ().right ();
534
+ }
535
+ }
536
+ if (newline_type)
537
+ //end line
538
+ word_string[index++] = newline_type;
539
+ word_string[index] = '\0'; //terminate string
540
+ if (strlen (word_string) != index) {
541
+ tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
542
+ word_string, index, strlen (word_string));
543
+ }
544
+ //don't pass any zeros
545
+ ASSERT_HOST (strlen (word_string) == index);
546
+ return new WERD_CHOICE (word_string, 0, 0, NO_PERM); //NOTE(review): 4 arguments here, but the live call in write_results passes 5 (string, NULL, 0, 0, NO_PERM) - confirm before re-enabling
547
+ }
548
+ #endif
549
+
550
+ /**********************************************************************
551
+ * make_reject
552
+ *
553
+ * Add the escape code to the string for the reject.
554
+ **********************************************************************/
555
+
556
+ inT16
557
+ make_reject ( //make reject code
558
+ TBOX * inset_box, //bounding box
559
+ inT16 prevright, //previous char
560
+ inT16 nextleft, //next char
561
+ DENORM * denorm, //de-normalizer
562
+ char word_string[] //output string
563
+ ) {
564
+ inT16 index; //to string
565
+ inT16 xpos; //start of inset
566
+ inT16 ypos;
567
+ inT16 width; //size of inset
568
+ inT16 height;
569
+ inT16 left_offset; //shift form prev char
570
+ inT16 right_offset; //shift to next char
571
+ inT16 baseline_offset; //shift from baseline
572
+ inT16 inset_index = 0; //number of inset
573
+ inT16 min_chars; //min width estimate
574
+ inT16 max_chars; //max width estimate
575
+ float x_centre; //centre of box
576
+
577
+ index = 0;
578
+ x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
579
+ left_offset =
580
+ (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
581
+ right_offset =
582
+ (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
583
+ xpos = (inT16) floor (denorm->x (inset_box->left ()));
584
+ width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
585
+ ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
586
+ height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
587
+ baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
588
+ //escape code
589
+ word_string[index++] = CTRL_INSET;
590
+ min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
591
+ max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
592
+ /*
593
+ Ensure min_chars and max_chars are in the range 0..254. This ensures that
594
+ we can add 1 to them to avoid putting \0 in a string, and still not exceed
595
+ the max value in a byte.
596
+ */
597
+ if (min_chars < 0)
598
+ min_chars = 0;
599
+ if (min_chars > 254)
600
+ min_chars = 254;
601
+ if (max_chars < min_chars)
602
+ max_chars = min_chars;
603
+ if (max_chars > 254)
604
+ max_chars = 254;
605
+ //min chars
606
+ word_string[index++] = min_chars + 1;
607
+ //max chars
608
+ word_string[index++] = max_chars + 1;
609
+ word_string[index++] = 2; //type?
610
+ //store index
611
+ word_string[index++] = inset_index / 255 + 1;
612
+ word_string[index++] = inset_index % 255 + 1;
613
+ return index; //size of string
614
+ }
615
+
616
+
617
+ /**********************************************************************
618
+ * determine_newline_type
619
+ *
620
+ * Find whether we have a wrapping or hard newline.
621
+ * Return FALSE if not at end of line.
622
+ **********************************************************************/
623
+
624
+ char determine_newline_type( //test line ends
625
+ WERD *word, //word to do
626
+ BLOCK *block, //current block
627
+ WERD *next_word, //next word
628
+ BLOCK *next_block //block of next word
629
+ ) {
630
+ inT16 end_gap; //to right edge
631
+ inT16 width; //of next word
632
+ TBOX word_box; //bounding
633
+ TBOX next_box; //next word
634
+ TBOX block_box; //block bounding
635
+
636
+ if (!word->flag (W_EOL))
637
+ return FALSE; //not end of line
638
+ if (next_word == NULL || next_block == NULL || block != next_block)
639
+ return CTRL_NEWLINE;
640
+ if (next_word->space () > 0)
641
+ return CTRL_HARDLINE; //it is tabbed
642
+ word_box = word->bounding_box ();
643
+ next_box = next_word->bounding_box ();
644
+ block_box = block->bounding_box ();
645
+ //gap to eol
646
+ end_gap = block_box.right () - word_box.right ();
647
+ end_gap -= (inT32) block->space ();
648
+ width = next_box.right () - next_box.left ();
649
+ // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
650
+ // block_box.right(),word_box.right(),end_gap,
651
+ // next_box.right(),next_box.left(),width,
652
+ // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
653
+ return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
654
+ }
655
+
656
+
657
+ /**********************************************************************
658
+ * write_cooked_text
659
+ *
660
+ * Write the cooked text (with bold for pass2 and underline for reject)
661
+ * to the given file.
662
+ **********************************************************************/
663
+
664
+ #if 0 //write_cooked_text is compiled out; the calls to it visible in this file (in write_results) are also under #if 0
665
+ void write_cooked_text( //write output
666
+ WERD *word, //word to do
667
+ const STRING &text, //text to write
668
+ BOOL8 acceptable, //good stuff
669
+ BOOL8 pass2, //done on pass2
670
+ FILE *fp //file to write
671
+ ) {
672
+ inT16 index; //blank counter
673
+ int status;
674
+ static int newaline = 1; //set after a newline has been written, cleared when coordinates are written
675
+ static int havespace = 0; //set when a leading blank has been written for the current word
676
+ char buff[512]; //copy of text with ' ' replaced by the unrecognised char; NOTE(review): the copy loop is not bounds-checked
677
+ const char *wordstr = text.string ();
678
+ int i = 0;
679
+ char unrecognised = STRING (unrecognised_char)[0];
680
+ static int old_segs = 0; //num_popped at the previous stdout report
681
+ TBOX mybox;
682
+ for (i = 0; wordstr[i] != '\0'; i++) {
683
+ if (wordstr[i] == ' ')
684
+ buff[i] = unrecognised;
685
+ else
686
+ buff[i] = wordstr[i];
687
+ }
688
+ buff[i] = '\0';
689
+ //When the target is stdout, only report through tprintf and return
690
+ if (fp == stdout) {
691
+ tprintf ("Cooked=%s, %d segs, acceptable=%d",
692
+ buff, num_popped - old_segs, acceptable);
693
+ old_segs = num_popped;
694
+ return;
695
+ }
696
+ //Non-empty text: leading blanks, optional bold/underline markers, then the word itself
697
+ if (text.length () > 0) {
698
+ for (index = 0; index < word->space (); index++) {
699
+ status = fprintf (fp, " ");
700
+ havespace = 1;
701
+ if (status < 0)
702
+ WRITEFAILED.error ("write_cooked_text", EXIT,
703
+ "Space Errno: %d", errno);
704
+ }
705
+ if (pass2) {
706
+ status = fprintf (fp, BOLD_ON);
707
+ if (status < 0)
708
+ WRITEFAILED.error ("write_cooked_text", EXIT,
709
+ "Bold Errno: %d", errno);
710
+ }
711
+ if (!acceptable) {
712
+ status = fprintf (fp, UNDERLINE_ON);
713
+ if (status < 0)
714
+ WRITEFAILED.error ("write_cooked_text", EXIT,
715
+ "Underline Errno: %d", errno);
716
+ }
717
+ //With NO_BLOCK set, the word's box is written before its text, offset by XOFFSET/YOFFSET and with y measured from page_image.get_ysize ()
718
+ //xiaofan
719
+ if (NO_BLOCK && word && strlen (buff)) { //NOTE(review): word is tested here but was already dereferenced above (word->space ())
720
+ mybox = word->bounding_box ();
721
+ if (newaline || !havespace) {
722
+ fprintf (fp, " ");
723
+ newaline = 0;
724
+ }
725
+ fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
726
+ XOFFSET + mybox.left (),
727
+ YOFFSET + page_image.get_ysize () - mybox.top (),
728
+ XOFFSET + mybox.right (),
729
+ YOFFSET + page_image.get_ysize () - mybox.bottom ());
730
+ havespace = 0;
731
+ }
732
+
733
+ status = fprintf (fp, "%s", buff);
734
+ if (status < 0)
735
+ WRITEFAILED.error ("write_cooked_text", EXIT,
736
+ "Word Errno: %d", errno);
737
+ if (pass2) {
738
+ status = fprintf (fp, BOLD_OFF);
739
+ if (status < 0)
740
+ WRITEFAILED.error ("write_cooked_text", EXIT,
741
+ "Bold off Errno: %d", errno);
742
+ }
743
+ if (!acceptable) {
744
+ status = fprintf (fp, UNDERLINE_OFF);
745
+ if (status < 0)
746
+ WRITEFAILED.error ("write_cooked_text", EXIT,
747
+ "Underline off Errno: %d", errno);
748
+ }
749
+ }
750
+ if (word->flag (W_EOL)) { //end of line: write a newline and remember it for the next call
751
+ status = fprintf (fp, "\n");
752
+ newaline = 1;
753
+ if (status < 0)
754
+ WRITEFAILED.error ("write_cooked_text", EXIT,
755
+ "Newline Errno: %d", errno);
756
+ }
757
+ status = fflush (fp);
758
+ if (status != 0)
759
+ WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
760
+ }
761
+ #endif
762
+
763
+
764
+ /**********************************************************************
765
+ * write_shm_text
766
+ *
767
+ * Write the cooked text to the shared memory for the api.
768
+ **********************************************************************/
769
+
770
+ void write_shm_text( //write output
771
+ WERD_RES *word, //word to do
772
+ BLOCK *block, //block it is from
773
+ ROW_RES *row, //row it is from
774
+ const STRING &text, //text to write
775
+ const STRING &text_lengths
776
+ ) {
777
+ inT32 index; //char counter
778
+ inT32 index2; //char counter
779
+ inT32 length; //chars in word
780
+ inT32 ptsize; //font size
781
+ inT8 blanks; //blanks in word
782
+ uinT8 enhancement; //bold etc
783
+ uinT8 font; //font index
784
+ char unrecognised = STRING (unrecognised_char)[0];
785
+ PBLOB *blob;
786
+ TBOX blob_box; //bounding box
787
+ PBLOB_IT blob_it; //blob iterator
788
+ WERD copy_outword; // copy to denorm
789
+ uinT32 rating; //of char
790
+ BOOL8 lineend; //end of line
791
+ int offset;
792
+ int offset2;
793
+
794
+ //point size
795
+ ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
796
+ if (word->word->flag (W_BOL) && ocr_char_space () < 128
797
+ && ocr_send_text (TRUE) != OKAY)
798
+ return; //release failed
799
+ copy_outword = *(word->outword);
800
+ copy_outword.baseline_denormalise (&word->denorm);
801
+ blob_it.set_to_list (copy_outword.blob_list ());
802
+ length = text_lengths.length ();
803
+
804
+ if (length > 0) {
805
+ blanks = word->word->space ();
806
+ if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
807
+ blanks = 1;
808
+ for (index = 0, offset = 0; index < length;
809
+ offset += text_lengths[index++], blob_it.forward ()) {
810
+ blob = blob_it.data ();
811
+ blob_box = blob->bounding_box ();
812
+
813
+ enhancement = 0;
814
+ if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
815
+ enhancement |= EUC_ITALIC;
816
+ if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
817
+ enhancement |= EUC_BOLD;
818
+ if (tessedit_write_ratings)
819
+ rating = (uinT32) (-word->best_choice->certainty () / 0.035);
820
+ else if (tessedit_zero_rejection)
821
+ rating = text[offset] == ' ' ? 100 : 0;
822
+ else
823
+ rating = word->reject_map[index].accepted ()? 0 : 100;
824
+ if (rating > 255)
825
+ rating = 255;
826
+ if (word->font1_count > 2)
827
+ font = word->font1;
828
+ else if (row->font1_count > 8)
829
+ font = row->font1;
830
+ else
831
+ //font index
832
+ font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
833
+
834
+ lineend = word->word->flag (W_EOL) && index == length - 1;
835
+ if (word->word->flag (W_EOL) && tessedit_zero_rejection
836
+ && index < length - 1 && text[index + text_lengths[index]] == ' ') {
837
+ for (index2 = index + 1, offset2 = offset + text_lengths[index];
838
+ index2 < length && text[offset2] == ' ';
839
+ offset2 += text_lengths[index2++]);
840
+ if (index2 == length)
841
+ lineend = TRUE;
842
+ }
843
+
844
+ if (!tessedit_zero_rejection || text[offset] != ' '
845
+ || tessedit_word_for_word) {
846
+ //confidence
847
+ if (text[offset] == ' ') {
848
+ ocr_append_char (unrecognised,
849
+ blob_box.left (), blob_box.right (),
850
+ page_image.get_ysize () - 1 - blob_box.top (),
851
+ page_image.get_ysize () - 1 - blob_box.bottom (),
852
+ font, (uinT8) rating,
853
+ ptsize, //point size
854
+ blanks, enhancement, //enhancement
855
+ OCR_CDIR_LEFT_RIGHT,
856
+ OCR_LDIR_DOWN_RIGHT,
857
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
858
+ } else {
859
+ for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
860
+ ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
861
+ blob_box.left (), blob_box.right (),
862
+ page_image.get_ysize () - 1 - blob_box.top (),
863
+ page_image.get_ysize () - 1 - blob_box.bottom (),
864
+ font, (uinT8) rating,
865
+ ptsize, //point size
866
+ blanks, enhancement, //enhancement
867
+ OCR_CDIR_LEFT_RIGHT,
868
+ OCR_LDIR_DOWN_RIGHT,
869
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
870
+ }
871
+ blanks = 0;
872
+ }
873
+
874
+ }
875
+ }
876
+ else if (tessedit_word_for_word) {
877
+ blanks = word->word->space ();
878
+ if (blanks == 0 && !word->word->flag (W_BOL))
879
+ blanks = 1;
880
+ blob_box = word->word->bounding_box ();
881
+
882
+ enhancement = 0;
883
+ if (word->italic > 0)
884
+ enhancement |= EUC_ITALIC;
885
+ if (word->bold > 0)
886
+ enhancement |= EUC_BOLD;
887
+ rating = 100;
888
+ if (word->font1_count > 2)
889
+ font = word->font1;
890
+ else if (row->font1_count > 8)
891
+ font = row->font1;
892
+ else
893
+ //font index
894
+ font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
895
+
896
+ lineend = word->word->flag (W_EOL);
897
+
898
+ //font index
899
+ ocr_append_char (unrecognised,
900
+ blob_box.left (), blob_box.right (),
901
+ page_image.get_ysize () - 1 - blob_box.top (),
902
+ page_image.get_ysize () - 1 - blob_box.bottom (),
903
+ font,
904
+ rating, //confidence
905
+ ptsize, //point size
906
+ blanks, enhancement, //enhancement
907
+ OCR_CDIR_LEFT_RIGHT,
908
+ OCR_LDIR_DOWN_RIGHT,
909
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
910
+ }
911
+ }
912
+
913
+
914
/**********************************************************************
 * write_map
 *
 * Write a map file of 0's and 1's which associates characters from the .txt
 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
 * is kept. Note that there may be reject regions in the .etx file WITHOUT
 * .txt chars being rejected. The map file should be the same length, and
 * have the same number of lines, as the .txt file.
 *
 * The parameterised input is because I thought I might be able to generate
 * multiple map files in a single run. However, it didn't work because
 * newdiff needs etx files!
 *
 * NOTE: this function is currently compiled out (#if 0).
 **********************************************************************/

#if 0
void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
  STRING word_map = "";        //one '0'/'1' per character of the word
  inT16 pos;
  int result;

  if (word->best_choice->string ().length () > 0) {
    // One map entry for each blank that precedes the word.
    for (pos = 0; pos < word->word->space (); pos++) {
      const bool space_rejected = word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection;

      if (space_rejected) {
        /* Write rejected spaces to .map file ONLY. Newdiff converts these
           back to accepted spaces AFTER generating basic space stats but
           BEFORE using .etx */
        result = fprintf (mapfile, "0");
      } else {
        result = fprintf (mapfile, "1");
      }
      if (result < 0) {
        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
      }
    }

    if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
      // Repeated-char words always map to five kept characters.
      for (pos = 0; pos < 5; pos++) {
        word_map += '1';
      }
    } else {
      ASSERT_HOST (word->reject_map.length () ==
                   word->best_choice->string ().length ());

      for (pos = 0; pos < word->reject_map.length (); pos++) {
        word_map += word->reject_map[pos].accepted () ? '1' : '0';
      }
    }

    result = fprintf (mapfile, "%s", word_map.string ());
    if (result < 0) {
      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
    }
  }

  if (word->word->flag (W_EOL)) {
    result = fprintf (mapfile, "\n");
    if (result < 0) {
      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
    }
  }

  result = fflush (mapfile);
  if (result != 0) {
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
  }
}
#endif
979
+
980
+
981
+ /*************************************************************************
982
+ * open_file()
983
+ *************************************************************************/
984
+
985
+ FILE *open_outfile( //open .map & .unlv file
986
+ const char *extension) {
987
+ STRING file_name;
988
+ FILE *outfile;
989
+
990
+ file_name = imagebasename + extension;
991
+ if (!(outfile = fopen (file_name.string (), "w"))) {
992
+ CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
993
+ file_name.string (), errno);
994
+ }
995
+ return outfile;
996
+ }
997
+
998
+
999
#if 0
/*************************************************************************
 * write_unlv_text()
 *
 * Write the best choice of a word to unlv_file. Blanks, '~', '^' and '|'
 * in the word are written as the unrecognised character; any other
 * character that is rejected in the reject map is preceded by a '^'
 * suspect marker. The word is preceded by one space string per leading
 * blank ("^ " when the spaces are suspect) and followed by a newline if
 * the word ends a line.
 *
 * The output is built in a STRING rather than a fixed char[512], which
 * could be overrun because up to two bytes are produced per input byte.
 *
 * NOTE: this function is currently compiled out (#if 0).
 *************************************************************************/
void write_unlv_text(WERD_RES *word) {
  const char *wordstr = word->best_choice->string ().string ();
  const char unrecognised = STRING (unrecognised_char)[0];
  STRING out_str = "";           //string to output, grows as needed
  const char *space_str;
  int status;
  int i;

  /* DONT need to do anything special for repeated char words - at this stage
     the repetition char has been identified and any other chars have been
     rejected.
  */

  for (i = 0; wordstr[i] != '\0'; i++) {
    if ((wordstr[i] == ' ') ||
        (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|')) {
      out_str += unrecognised;
    } else {
      if (word->reject_map[i].rejected ())
        out_str += '^';          //Add suspect marker
      out_str += wordstr[i];
    }
  }

  if (wordstr[0] != '\0') {
    if (word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection)
      space_str = "^ ";          //Suspect space
    else
      space_str = " ";           //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
                           "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", out_str.string ());
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
                         "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
1059
+
1060
+
1061
+ /*************************************************************************
1062
+ * get_rep_char()
1063
+ * Return the first accepted character from the repetition string. This is the
1064
+ * character which is repeated - as determined earlier by fix_rep_char()
1065
+ *************************************************************************/
1066
+ UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
1067
+ int i;
1068
+ int offset;
1069
+
1070
+ for (i = 0, offset = 0;
1071
+ ((i < word->reject_map.length ()) &&
1072
+ (word->reject_map[i].rejected ()));
1073
+ offset += word->best_choice->lengths()[i++]);
1074
+ if (i < word->reject_map.length ())
1075
+ return unicharset.unichar_to_id(word->best_choice->string().string()
1076
+ + offset,
1077
+ word->best_choice->lengths()[i]);
1078
+ else
1079
+ return unicharset.unichar_to_id(unrecognised_char.string());
1080
+ }
1081
+
1082
+ void ensure_rep_chars_are_consistent(WERD_RES *word) {
1083
+ #if 0
1084
+ char rep_char = get_rep_char (word);
1085
+ char *ptr;
1086
+
1087
+ ptr = (char *) word->best_choice->string ().string ();
1088
+ for (; *ptr != '\0'; ptr++) {
1089
+ if (*ptr != rep_char)
1090
+ *ptr = rep_char;
1091
+ }
1092
+ #endif
1093
+
1094
+ #if 0
1095
+ UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
1096
+ int i;
1097
+ char *ptr;
1098
+ STRING consistent_string;
1099
+ STRING consistent_string_lengths;
1100
+
1101
+ ptr = (char *) word->best_choice->string ().string ();
1102
+ for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
1103
+ consistent_string += unicharset.id_to_unichar(rep_char);
1104
+ consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
1105
+ }
1106
+ word->best_choice->string() = consistent_string;
1107
+ word->best_choice->lengths() = consistent_string_lengths;
1108
+ #endif
1109
+ }
1110
+
1111
+ /*************************************************************************
1112
+ * SUSPECT LEVELS
1113
+ *
1114
+ * 0 - dont reject ANYTHING
1115
+ * 1,2 - partial rejection
1116
+ * 3 - BEST
1117
+ *
1118
+ * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
1119
+ * tessedit_minimal_rejection.
1120
+ *************************************************************************/
1121
+
1122
+ void set_unlv_suspects(WERD_RES *word) {
1123
+ int len = word->reject_map.length ();
1124
+ int i;
1125
+ int offset;
1126
+ const char *ptr;
1127
+ const char *lengths = word->best_choice->lengths ().string ();
1128
+ float rating_per_ch;
1129
+
1130
+ ptr = word->best_choice->string ().string ();
1131
+
1132
+ if (suspect_level == 0) {
1133
+ for (i = 0; i < len; i++) {
1134
+ if (word->reject_map[i].rejected ())
1135
+ word->reject_map[i].setrej_minimal_rej_accept ();
1136
+ }
1137
+ return;
1138
+ }
1139
+
1140
+ if (suspect_level >= 3)
1141
+ return; //Use defaults
1142
+
1143
+ /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
1144
+
1145
+ if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
1146
+ suspect_short_words)) {
1147
+ /* Unreject alphas in dictionary words */
1148
+ for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
1149
+ if (word->reject_map[i].rejected () &&
1150
+ unicharset.get_isalpha (ptr + offset, lengths[i]))
1151
+ word->reject_map[i].setrej_minimal_rej_accept ();
1152
+ }
1153
+ }
1154
+
1155
+ rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
1156
+
1157
+ if (rating_per_ch >= suspect_rating_per_ch)
1158
+ return; //Dont touch bad ratings
1159
+
1160
+ if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
1161
+ /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
1162
+ for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
1163
+ if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
1164
+ word->reject_map[i].setrej_minimal_rej_accept ();
1165
+ }
1166
+ }
1167
+
1168
+ for (i = 0; i < len; i++) {
1169
+ if (word->reject_map[i].rejected ()) {
1170
+ if (word->reject_map[i].flag (R_DOC_REJ))
1171
+ word->reject_map[i].setrej_minimal_rej_accept ();
1172
+ if (word->reject_map[i].flag (R_BLOCK_REJ))
1173
+ word->reject_map[i].setrej_minimal_rej_accept ();
1174
+ if (word->reject_map[i].flag (R_ROW_REJ))
1175
+ word->reject_map[i].setrej_minimal_rej_accept ();
1176
+ }
1177
+ }
1178
+
1179
+ if (suspect_level == 2)
1180
+ return;
1181
+
1182
+ if (!suspect_constrain_1Il ||
1183
+ (word->reject_map.length () <= suspect_short_words)) {
1184
+ for (i = 0; i < len; i++) {
1185
+ if (word->reject_map[i].rejected ()) {
1186
+ if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
1187
+ word->reject_map[i].flag (R_POSTNN_1IL)))
1188
+ word->reject_map[i].setrej_minimal_rej_accept ();
1189
+
1190
+ if (!suspect_constrain_1Il &&
1191
+ word->reject_map[i].flag (R_MM_REJECT))
1192
+ word->reject_map[i].setrej_minimal_rej_accept ();
1193
+ }
1194
+ }
1195
+ }
1196
+
1197
+ if ((acceptable_word_string (word->best_choice->string ().string (),
1198
+ word->best_choice->lengths ().string ())
1199
+ != AC_UNACCEPTABLE) ||
1200
+ acceptable_number_string (word->best_choice->string ().string (),
1201
+ word->best_choice->lengths ().string ())) {
1202
+ if (word->reject_map.length () > suspect_short_words) {
1203
+ for (i = 0; i < len; i++) {
1204
+ if (word->reject_map[i].rejected () &&
1205
+ (!word->reject_map[i].perm_rejected () ||
1206
+ word->reject_map[i].flag (R_1IL_CONFLICT) ||
1207
+ word->reject_map[i].flag (R_POSTNN_1IL) ||
1208
+ word->reject_map[i].flag (R_MM_REJECT))) {
1209
+ word->reject_map[i].setrej_minimal_rej_accept ();
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ }
1215
+
1216
+
1217
+ inT16 count_alphas( //how many alphas
1218
+ const char *s,
1219
+ const char *lengths) {
1220
+ int count = 0;
1221
+
1222
+ for (; *s != '\0'; s += *(lengths++)) {
1223
+ if (unicharset.get_isalpha(s, *lengths))
1224
+ count++;
1225
+ }
1226
+ return count;
1227
+ }
1228
+
1229
+
1230
+ inT16 count_alphanums( //how many alphanums
1231
+ const char *s,
1232
+ const char *lengths) {
1233
+ int count = 0;
1234
+
1235
+ for (; *s != '\0'; s += *(lengths++)) {
1236
+ if (unicharset.get_isalpha(s, *lengths) ||
1237
+ unicharset.get_isdigit(s, *lengths))
1238
+ count++;
1239
+ }
1240
+ return count;
1241
+ }
1242
+
1243
+
1244
+ BOOL8 acceptable_number_string(const char *s,
1245
+ const char *lengths) {
1246
+ BOOL8 prev_digit = FALSE;
1247
+
1248
+ if (*lengths == 1 && *s == '(')
1249
+ s++;
1250
+
1251
+ if (*lengths == 1 &&
1252
+ ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
1253
+ s++;
1254
+
1255
+ for (; *s != '\0'; s += *(lengths++)) {
1256
+ if (unicharset.get_isdigit (s, *lengths))
1257
+ prev_digit = TRUE;
1258
+ else if (prev_digit &&
1259
+ (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
1260
+ prev_digit = FALSE;
1261
+ else if (prev_digit && *lengths == 1 &&
1262
+ (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
1263
+ return TRUE;
1264
+ else if (prev_digit &&
1265
+ *lengths == 1 && (*s == '%') &&
1266
+ (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
1267
+ (*(s + *lengths + *(lengths + 1)) == '\0'))
1268
+ return TRUE;
1269
+ else
1270
+ return FALSE;
1271
+ }
1272
+ return TRUE;
1273
+ }