tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1273 @@
1
+ /******************************************************************
2
+ * File: output.cpp (Formerly output.c)
3
+ * Description: Output pass
4
+ * Author: Phil Cheatle
5
+ * Created: Thu Aug 4 10:56:08 BST 1994
6
+ *
7
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #include "ocrshell.h"
22
+ #include <string.h>
23
+ #include <ctype.h>
24
+ #ifdef __UNIX__
25
+ #include <assert.h>
26
+ #include <unistd.h>
27
+ #include <errno.h>
28
+ #endif
29
+ #include "mainblk.h"
30
+ #include "tfacep.h"
31
+ #include "tessvars.h"
32
+ #include "control.h"
33
+ #include "secname.h"
34
+ #include "reject.h"
35
+ #include "docqual.h"
36
+ #include "output.h"
37
+ #include "bestfirst.h"
38
+ #include "globals.h"
39
+
40
+ #define EXTERN //expands to nothing, so each EXTERN-prefixed line below carries no extern keyword
41
+
42
+ #define EPAPER_EXT ".ep" //extension of the .ep (epaper) file
43
+ #define PAGE_YSIZE 3508 //NOTE(review): not referenced in the code visible here; presumably a default page height - confirm
44
+ #define CTRL_INSET '\024' //dc4=text inset
45
+ #define CTRL_FONT '\016' //so=font change
46
+ #define CTRL_DEFAULT '\017' //si=default font
47
+ #define CTRL_SHIFT '\022' //dc2=x shift
48
+ #define CTRL_TAB '\011' //tab
49
+ #define CTRL_NEWLINE '\012' //newline; one of the two line-end codes returned by determine_newline_type
50
+ #define CTRL_HARDLINE '\015' //cr; the other line-end code returned by determine_newline_type
51
+ int NO_BLOCK = 0; //don't output block information
52
+ inT16 XOFFSET = 0; //the image can be a part of bigger picture and we want to have the original coordinates
53
+ inT16 YOFFSET = 0; //y counterpart of XOFFSET
54
+
55
+ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE, //output_pass emits "|^~tr" markers when set
56
+ "Write block separators in output");
57
+ EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
58
+ "Write raw stuff to name.raw");
59
+ EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
60
+ EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
61
+ "Return ratings in IPEOCRAPI data");
62
+ EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE, //output_pass opens and closes txt_mapfile when set
63
+ "Write .txt to .etx map file");
64
+ EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
65
+ "Write repetition char code");
66
+ EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
67
+ EXTERN STRING_EVAR (unrecognised_char, "|", //only its first char is used, as the reject character
68
+ "Output char for unidentified blobs");
69
+ EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
70
+ EXTERN INT_VAR (suspect_space_level, 100,
71
+ "Min suspect level for rejecting spaces");
72
+ EXTERN INT_VAR (suspect_short_words, 2,
73
+ "Dont Suspect dict wds longer than this");
74
+ EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
75
+ "UNLV keep 1Il chars rejected");
76
+ EXTERN double_VAR (suspect_rating_per_ch, 999.9,
77
+ "Dont touch bad rating limit");
78
+ EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");
79
+
80
+ EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
81
+ "Only reject tess failures");
82
+ EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
83
+ EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
84
+ "Make output have exactly one word per WERD");
85
+ EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
86
+ "Dont reject ANYTHING AT ALL");
87
+ EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
88
+ "Force all rep chars the same");
89
+
90
+ FILE *txt_mapfile = NULL; //reject map (.map), opened by output_pass
91
+ FILE *unlv_file = NULL; //NOTE(review): old comment was copied from txt_mapfile; presumably the .unlv output stream - confirm
92
+
93
+ /**********************************************************************
94
+ * pixels_to_pts
95
+ *
96
+ * Convert an integer number of pixels to the nearest integer
97
+ * number of points.
98
+ **********************************************************************/
99
+
100
+ inT32 pixels_to_pts( //convert coords
101
+ inT32 pixels,
102
+ inT32 pix_res //resolution
103
+ ) {
104
+ float pts; //converted value
105
+
106
+ pts = pixels * 72.0 / pix_res;
107
+ return (inT32) (pts + 0.5); //round it
108
+ }
109
+
110
+ void output_pass( //Tess output pass //send to api
111
+ PAGE_RES_IT &page_res_it,
112
+ BOOL8 write_to_shm,
113
+ TBOX *target_word_box) {
114
+ BLOCK_RES *block_of_last_word;
115
+ inT16 block_id;
116
+ BOOL8 force_eol; //During output
117
+ BLOCK *nextblock; //block of next word
118
+ WERD *nextword; //next word
119
+
120
+ if (tessedit_write_txt_map)
121
+ txt_mapfile = open_outfile (".map");
122
+
123
+ page_res_it.restart_page ();
124
+ block_of_last_word = NULL;
125
+ while (page_res_it.word () != NULL) {
126
+ check_debug_pt (page_res_it.word (), 120);
127
+
128
+ if (target_word_box)
129
+ {
130
+
131
+ TBOX current_word_box=page_res_it.word ()->word->bounding_box();
132
+ FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
133
+ if (!target_word_box->contains(center_pt))
134
+ {
135
+ page_res_it.forward ();
136
+ continue;
137
+ }
138
+
139
+ }
140
+ if (tessedit_write_block_separators &&
141
+ block_of_last_word != page_res_it.block ()) {
142
+ block_of_last_word = page_res_it.block ();
143
+ if (block_of_last_word->block->text_region () == NULL) {
144
+ if (block_of_last_word->block->poly_block () == NULL)
145
+ block_id = 1;
146
+ else
147
+ block_id =
148
+ ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
149
+ id_no();
150
+ }
151
+ else
152
+ block_id = block_of_last_word->block->text_region ()->id_no ();
153
+ if (!NO_BLOCK)
154
+ fprintf (textfile, "|^~tr%d\n", block_id);
155
+ fprintf (txt_mapfile, "|^~tr%d\n", block_id);
156
+ }
157
+
158
+ force_eol = (tessedit_write_block_separators &&
159
+ (page_res_it.block () != page_res_it.next_block ())) ||
160
+ (page_res_it.next_word () == NULL);
161
+
162
+ if (page_res_it.next_word () != NULL)
163
+ nextword = page_res_it.next_word ()->word;
164
+ else
165
+ nextword = NULL;
166
+ if (page_res_it.next_block () != NULL)
167
+ nextblock = page_res_it.next_block ()->block;
168
+ else
169
+ nextblock = NULL;
170
+ //regardless of tilde crunching
171
+ write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,
172
+ write_to_shm);
173
+ page_res_it.forward ();
174
+ }
175
+ if (write_to_shm)
176
+ ocr_send_text(FALSE);
177
+ if (tessedit_write_block_separators) {
178
+ if (!NO_BLOCK)
179
+ fprintf (textfile, "|^~tr\n");
180
+ fprintf (txt_mapfile, "|^~tr\n");
181
+ }
182
+ if (tessedit_write_txt_map) {
183
+ fprintf (txt_mapfile, "\n"); //because txt gets one
184
+ #ifdef __UNIX__
185
+ fsync (fileno (txt_mapfile));
186
+ #endif
187
+ fclose(txt_mapfile);
188
+ }
189
+ }
190
+
191
+ /*************************************************************************
192
+ * write_results()
193
+ *
194
+ * All recognition and rejection has now been done. Generate the following:
195
+ * .txt file - giving the final best choices with NO highlighting
196
+ * .raw file - giving the tesseract top choice output for each word
197
+ * .map file - showing how the .txt file has been rejected in the .ep file
198
+ * epchoice list - a list of one element per word, containing the text for the
199
+ * epaper. Reject strings are inserted.
200
+ * inset list - a list of bounding boxes of reject insets - indexed by the
201
+ * reject strings in the epchoice text.
202
+ *************************************************************************/
203
+
204
+ void write_results( //output a word
205
+ PAGE_RES_IT &page_res_it, //full info
206
+ char newline_type, //type of newline
207
+ BOOL8 force_eol, //override tilde crunch?
208
+ BOOL8 write_to_shm //send to api
209
+ ) {
210
+ //word to do
211
+ WERD_RES *word = page_res_it.word ();
212
+ // WERD_CHOICE *ep_choice; //ep format
213
+ STRING repetition_code;
214
+ const STRING *wordstr;
215
+ STRING wordstr_lengths;
216
+ const char *text;
217
+ int i;
218
+ char unrecognised = STRING (unrecognised_char)[0];
219
+ char ep_chars[32]; //Only for unlv_tilde_crunch
220
+ int ep_chars_index = 0;
221
+ char txt_chs[32]; //Only for unlv_tilde_crunch
222
+ char map_chs[32]; //Only for unlv_tilde_crunch
223
+ int txt_index = 0;
224
+ static BOOL8 tilde_crunch_written = FALSE;
225
+ static BOOL8 last_char_was_newline = TRUE;
226
+ static BOOL8 last_char_was_tilde = FALSE;
227
+ static BOOL8 empty_block = TRUE;
228
+ BOOL8 need_reject = FALSE;
229
+ char *ptr; //string ptr
230
+ PBLOB_IT blob_it; //blobs
231
+
232
+ /* if (word->best_choice->string().length() == 0)
233
+ {
234
+ tprintf("No output: to output\n");
235
+ }
236
+ else if (word->best_choice->string()[0]==' ')
237
+ {
238
+ tprintf("spaceword to output\n");
239
+ }
240
+ else if (word->best_choice->string()[0]=='\0')
241
+ {
242
+ tprintf("null to output\n");
243
+ }*/
244
+ if (word->unlv_crunch_mode != CR_NONE
245
+ && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
246
+ if ((word->unlv_crunch_mode != CR_DELETE) &&
247
+ (!tilde_crunch_written ||
248
+ ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
249
+ (word->word->space () > 0) &&
250
+ !word->word->flag (W_FUZZY_NON) &&
251
+ !word->word->flag (W_FUZZY_SP)))) {
252
+ if (!word->word->flag (W_BOL) &&
253
+ (word->word->space () > 0) &&
254
+ !word->word->flag (W_FUZZY_NON) &&
255
+ !word->word->flag (W_FUZZY_SP)) {
256
+ /* Write a space to separate from preceeding good text */
257
+ txt_chs[txt_index] = ' ';
258
+ map_chs[txt_index++] = '1';
259
+ ep_chars[ep_chars_index++] = ' ';
260
+ last_char_was_tilde = FALSE;
261
+ }
262
+ need_reject = TRUE;
263
+ }
264
+ if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
265
+ /* Write a reject char - mark as rejected unless zero_rejection mode */
266
+ last_char_was_tilde = TRUE;
267
+ txt_chs[txt_index] = unrecognised;
268
+ if (tessedit_zero_rejection || (suspect_level == 0)) {
269
+ map_chs[txt_index++] = '1';
270
+ ep_chars[ep_chars_index++] = unrecognised;
271
+ }
272
+ else {
273
+ map_chs[txt_index++] = '0';
274
+ /*
275
+ The ep_choice string is a faked reject to allow newdiff to sync the .etx
276
+ with the .txt and .map files.
277
+ */
278
+ ep_chars[ep_chars_index++] = CTRL_INSET;
279
+ //escape code
280
+ //dummy reject
281
+ ep_chars[ep_chars_index++] = 1;
282
+ //dummy reject
283
+ ep_chars[ep_chars_index++] = 1;
284
+ //type
285
+ ep_chars[ep_chars_index++] = 2;
286
+ //dummy reject
287
+ ep_chars[ep_chars_index++] = 1;
288
+ //dummy reject
289
+ ep_chars[ep_chars_index++] = 1;
290
+ }
291
+ tilde_crunch_written = TRUE;
292
+ last_char_was_newline = FALSE;
293
+ empty_block = FALSE;
294
+ }
295
+
296
+ if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
297
+ /* Add a new line output */
298
+ txt_chs[txt_index] = '\n';
299
+ map_chs[txt_index++] = '\n';
300
+ //end line
301
+ ep_chars[ep_chars_index++] = newline_type;
302
+
303
+ //Cos of the real newline
304
+ tilde_crunch_written = FALSE;
305
+ last_char_was_newline = TRUE;
306
+ last_char_was_tilde = FALSE;
307
+ }
308
+ txt_chs[txt_index] = '\0';
309
+ map_chs[txt_index] = '\0';
310
+ //xiaofan
311
+ if (tessedit_write_output && !NO_BLOCK)
312
+ fprintf (textfile, "%s", txt_chs);
313
+
314
+ if (tessedit_write_txt_map)
315
+ fprintf (txt_mapfile, "%s", map_chs);
316
+
317
+ //terminate string
318
+ ep_chars[ep_chars_index] = '\0';
319
+ word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
320
+
321
+ if (force_eol)
322
+ empty_block = TRUE;
323
+ return;
324
+ }
325
+
326
+ /* NORMAL PROCESSING of non tilde crunched words */
327
+
328
+ tilde_crunch_written = FALSE;
329
+ if (newline_type)
330
+ last_char_was_newline = TRUE;
331
+ else
332
+ last_char_was_newline = FALSE;
333
+ empty_block = force_eol; //About to write a real word
334
+
335
+ if (unlv_tilde_crunching &&
336
+ last_char_was_tilde &&
337
+ (word->word->space () == 0) &&
338
+ !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
339
+ (word->best_choice->string ()[0] == ' ')) {
340
+ /* Prevent adjacent tilde across words - we know that adjacent tildes within
341
+ words have been removed */
342
+ ptr = (char *) word->best_choice->string ().string ();
343
+ strcpy (ptr, ptr + 1); //shuffle up
344
+ ptr = (char *) word->best_choice->lengths ().string ();
345
+ strcpy (ptr, ptr + 1); //shuffle up
346
+ word->reject_map.remove_pos (0);
347
+ blob_it = word->outword->blob_list ();
348
+ delete blob_it.extract (); //get rid of reject blob
349
+ }
350
+ if (newline_type ||
351
+ (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
352
+ last_char_was_tilde = FALSE;
353
+ else {
354
+ if (word->reject_map.length () > 0) {
355
+ for (i = 0, ptr = (char *) word->best_choice->string().string();
356
+ i < word->reject_map.length () - 1; ++i)
357
+ ptr += word->best_choice->lengths()[i];
358
+ if (*ptr == ' ')
359
+ last_char_was_tilde = TRUE;
360
+ else
361
+ last_char_was_tilde = FALSE;
362
+ }
363
+ else if (word->word->space () > 0)
364
+ last_char_was_tilde = FALSE;
365
+ /* else it is unchanged as there are no output chars */
366
+ }
367
+
368
+ ptr = (char *) word->best_choice->lengths ().string ();
369
+ ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
370
+
371
+ if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
372
+ ensure_rep_chars_are_consistent(word);
373
+
374
+ set_unlv_suspects(word);
375
+ check_debug_pt (word, 120);
376
+ if (tessedit_rejection_debug) {
377
+ tprintf ("Dict word: \"%s\": %d\n",
378
+ word->best_choice->string ().string (),
379
+ dict_word (word->best_choice->string ().string ()));
380
+ }
381
+
382
+ #if 0
383
+ if (tessedit_write_unlv) {
384
+ write_unlv_text(word);
385
+ }
386
+ #endif
387
+
388
+ if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
389
+ repetition_code = "|^~R";
390
+ wordstr_lengths = "\001\001\001\001";
391
+ repetition_code += unicharset.id_to_unichar(get_rep_char (word));
392
+ wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
393
+ wordstr = &repetition_code;
394
+ }
395
+ else {
396
+ wordstr = &(word->best_choice->string ());
397
+ wordstr_lengths = word->best_choice->lengths ();
398
+ if (tessedit_zero_rejection) {
399
+ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
400
+ text = wordstr->string ();
401
+ for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
402
+ if (word->reject_map[i].rejected ())
403
+ word->reject_map[i].setrej_minimal_rej_accept ();
404
+ }
405
+ }
406
+ if (tessedit_minimal_rejection) {
407
+ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
408
+ text = wordstr->string ();
409
+ for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
410
+ if ((*text != ' ') && word->reject_map[i].rejected ())
411
+ word->reject_map[i].setrej_minimal_rej_accept ();
412
+ }
413
+ }
414
+ }
415
+
416
+ if (write_to_shm)
417
+ write_shm_text (word, page_res_it.block ()->block,
418
+ page_res_it.row (), *wordstr, wordstr_lengths);
419
+
420
+ #if 0
421
+ if (tessedit_write_output)
422
+ write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
423
+
424
+ if (tessedit_write_raw_output)
425
+ write_cooked_text (word->word, word->raw_choice->string (),
426
+ TRUE, FALSE, rawfile);
427
+
428
+ if (tessedit_write_txt_map)
429
+ write_map(txt_mapfile, word);
430
+
431
+ ep_choice = make_epaper_choice (word, newline_type);
432
+ word->ep_choice = ep_choice;
433
+ #endif
434
+
435
+ character_count += word->best_choice->lengths ().length ();
436
+ word_count++;
437
+ }
438
+
439
+ /**********************************************************************
440
+ * make_epaper_choice
441
+ *
442
+ * Construct the epaper text string for a word, using the reject map to
443
+ * determine whether each blob should be rejected.
+ *
+ * Accepted blobs contribute their character (or the unrecognised char
+ * when the character is a space); each run of rejected blobs is replaced
+ * by one make_reject escape sequence covering the run's bounding box.
+ * A non-zero newline_type is appended as the last character.
+ *
+ * NOTE: the whole definition is wrapped in #if 0 and is not compiled;
+ * its only visible call site (in write_results) is disabled as well.
444
+ **********************************************************************/
445
+
446
+ #if 0
447
+ WERD_CHOICE *make_epaper_choice( //convert one word
448
+ WERD_RES *word, //word to do
449
+ char newline_type //type of newline
450
+ ) {
451
+ inT16 index = 0; //to string
452
+ inT16 blobindex; //to word
453
+ inT16 prevright = 0; //right of previous blob
454
+ inT16 nextleft; //left of next blob
455
+ PBLOB *blob;
456
+ TBOX inset_box; //bounding box
457
+ PBLOB_IT blob_it; //blob iterator
458
+ char word_string[MAX_PATH]; //converted string
459
+ BOOL8 force_total_reject;
460
+ char unrecognised = STRING (unrecognised_char)[0];
461
+
462
+ blob_it.set_to_list (word->outword->blob_list ());
463
+
464
+ ASSERT_HOST (word->reject_map.length () ==
465
+ word->best_choice->string ().length ());
466
+ /*
467
+ tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n",
468
+ word->best_choice->string().string(),
469
+ word->best_choice->string().length(),
470
+ blob_it.length(),
471
+ blob_count( word->outword ) );
472
+ */
473
+
474
+ if (word->best_choice->string ().length () == 0)
475
+ force_total_reject = TRUE;
476
+ else {
477
+ force_total_reject = FALSE;
478
+ ASSERT_HOST (blob_it.length () ==
479
+ word->best_choice->string ().length ());
480
+ }
481
+ if (!blob_it.empty ()) {
482
+ for (index = 0; index < word->word->space (); index++)
483
+ word_string[index] = ' '; //leading blanks; NOTE(review): no bound check against MAX_PATH
484
+ }
485
+ /* Why does this generate leading blanks regardless of whether the
486
+ word_choice string is empty, when write_cooked_text only generates leading
487
+ blanks when the string is NOT empty???. */
488
+
489
+ if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
490
+ strcpy (word_string + index, "|^~R");
491
+ index += 4;
492
+ strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
493
+ index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
494
+ }
495
+ else {
496
+ if (!blob_it.empty ())
497
+ prevright = blob_it.data ()->bounding_box ().left ();
498
+ //actually first left
499
+ for (blobindex = 0, blob_it.mark_cycle_pt ();
500
+ !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
501
+ blob = blob_it.data ();
502
+ if (word->reject_map[blobindex].accepted ()) {
503
+ if (word->best_choice->string ()[blobindex] == ' ')
504
+ //but not rejected!!
505
+ word_string[index++] = unrecognised;
506
+ else
507
+ word_string[index++] =
508
+ word->best_choice->string ()[blobindex];
509
+ }
510
+ else { // start reject
511
+ inset_box = blob->bounding_box ();
512
+ /* Extend reject box to include rejected neighbours */
513
+ while (!blob_it.at_last () &&
514
+ (force_total_reject ||
515
+ (word->reject_map[blobindex + 1].rejected ()))) {
516
+ blobindex++;
517
+ blob = blob_it.forward ();
518
+ //get total box
519
+ inset_box += blob->bounding_box ();
520
+ }
521
+ if (blob_it.at_last ())
522
+ nextleft = inset_box.right ();
523
+ else
524
+ nextleft = blob_it.data_relative (1)->bounding_box ().left ();
525
+
526
+ // tprintf("Making reject from (%d,%d)->(%d,%d)\n",
527
+ // inset_box.left(),inset_box.bottom(),
528
+ // inset_box.right(),inset_box.top());
529
+
530
+ index += make_reject (&inset_box, prevright, nextleft,
531
+ &word->denorm, &word_string[index]);
532
+ }
533
+ prevright = blob->bounding_box ().right ();
534
+ }
535
+ }
536
+ if (newline_type)
537
+ //end line
538
+ word_string[index++] = newline_type;
539
+ word_string[index] = '\0'; //terminate string
540
+ if (strlen (word_string) != index) {
541
+ tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
542
+ word_string, index, strlen (word_string));
543
+ }
544
+ //don't pass any zeros
545
+ ASSERT_HOST (strlen (word_string) == index);
546
+ return new WERD_CHOICE (word_string, 0, 0, NO_PERM); //NOTE(review): write_results passes five arguments to this constructor - confirm the signature before re-enabling
547
+ }
548
+ #endif
549
+
550
+ /**********************************************************************
551
+ * make_reject
552
+ *
553
+ * Add the escape code to the string for the reject.
554
+ **********************************************************************/
555
+
556
+ inT16
557
+ make_reject ( //make reject code
558
+ TBOX * inset_box, //bounding box
559
+ inT16 prevright, //previous char
560
+ inT16 nextleft, //next char
561
+ DENORM * denorm, //de-normalizer
562
+ char word_string[] //output string
563
+ ) {
564
+ inT16 index; //to string
565
+ inT16 xpos; //start of inset
566
+ inT16 ypos;
567
+ inT16 width; //size of inset
568
+ inT16 height;
569
+ inT16 left_offset; //shift form prev char
570
+ inT16 right_offset; //shift to next char
571
+ inT16 baseline_offset; //shift from baseline
572
+ inT16 inset_index = 0; //number of inset
573
+ inT16 min_chars; //min width estimate
574
+ inT16 max_chars; //max width estimate
575
+ float x_centre; //centre of box
576
+
577
+ index = 0;
578
+ x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
579
+ left_offset =
580
+ (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
581
+ right_offset =
582
+ (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
583
+ xpos = (inT16) floor (denorm->x (inset_box->left ()));
584
+ width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
585
+ ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
586
+ height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
587
+ baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
588
+ //escape code
589
+ word_string[index++] = CTRL_INSET;
590
+ min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
591
+ max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
592
+ /*
593
+ Ensure min_chars and max_chars are in the range 0..254. This ensures that
594
+ we can add 1 to them to avoid putting \0 in a string, and still not exceed
595
+ the max value in a byte.
596
+ */
597
+ if (min_chars < 0)
598
+ min_chars = 0;
599
+ if (min_chars > 254)
600
+ min_chars = 254;
601
+ if (max_chars < min_chars)
602
+ max_chars = min_chars;
603
+ if (max_chars > 254)
604
+ max_chars = 254;
605
+ //min chars
606
+ word_string[index++] = min_chars + 1;
607
+ //max chars
608
+ word_string[index++] = max_chars + 1;
609
+ word_string[index++] = 2; //type?
610
+ //store index
611
+ word_string[index++] = inset_index / 255 + 1;
612
+ word_string[index++] = inset_index % 255 + 1;
613
+ return index; //size of string
614
+ }
615
+
616
+
617
+ /**********************************************************************
618
+ * determine_newline_type
619
+ *
620
+ * Find whether we have a wrapping or hard newline.
621
+ * Return FALSE if not at end of line.
622
+ **********************************************************************/
623
+
624
+ char determine_newline_type( //test line ends
625
+ WERD *word, //word to do
626
+ BLOCK *block, //current block
627
+ WERD *next_word, //next word
628
+ BLOCK *next_block //block of next word
629
+ ) {
630
+ inT16 end_gap; //to right edge
631
+ inT16 width; //of next word
632
+ TBOX word_box; //bounding
633
+ TBOX next_box; //next word
634
+ TBOX block_box; //block bounding
635
+
636
+ if (!word->flag (W_EOL))
637
+ return FALSE; //not end of line
638
+ if (next_word == NULL || next_block == NULL || block != next_block)
639
+ return CTRL_NEWLINE;
640
+ if (next_word->space () > 0)
641
+ return CTRL_HARDLINE; //it is tabbed
642
+ word_box = word->bounding_box ();
643
+ next_box = next_word->bounding_box ();
644
+ block_box = block->bounding_box ();
645
+ //gap to eol
646
+ end_gap = block_box.right () - word_box.right ();
647
+ end_gap -= (inT32) block->space ();
648
+ width = next_box.right () - next_box.left ();
649
+ // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
650
+ // block_box.right(),word_box.right(),end_gap,
651
+ // next_box.right(),next_box.left(),width,
652
+ // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
653
+ return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
654
+ }
655
+
656
+
657
+ /**********************************************************************
658
+ * write_cooked_text
659
+ *
660
+ * Write the cooked text (with bold for pass2 and underline for reject)
661
+ * to the given file.
+ *
+ * Spaces in the text are replaced by the unrecognised char. When fp is
+ * stdout only a tprintf debug line is produced. Otherwise the word's
+ * leading blanks, optional BOLD/UNDERLINE markers, optional bounding box
+ * (when NO_BLOCK is set) and the text are written, followed by a newline
+ * if the word is flagged W_EOL, and fp is flushed. Any failed write
+ * raises WRITEFAILED with EXIT.
+ *
+ * NOTE: the whole definition is wrapped in #if 0 and is not compiled;
+ * its visible call sites (in write_results) are disabled as well.
662
+ **********************************************************************/
663
+
664
+ #if 0
665
+ void write_cooked_text( //write output
666
+ WERD *word, //word to do
667
+ const STRING &text, //text to write
668
+ BOOL8 acceptable, //good stuff
669
+ BOOL8 pass2, //done on pass2
670
+ FILE *fp //file to write
671
+ ) {
672
+ inT16 index; //blank counter
673
+ int status;
674
+ static int newaline = 1; //1 right after a newline has been written
675
+ static int havespace = 0; //1 after a blank was written, 0 after a bounding box
676
+ char buff[512]; //NOTE(review): filled from text below with no length check
677
+ const char *wordstr = text.string ();
678
+ int i = 0;
679
+ char unrecognised = STRING (unrecognised_char)[0];
680
+ static int old_segs = 0; //num_popped as of the previous stdout call
681
+ TBOX mybox;
682
+ for (i = 0; wordstr[i] != '\0'; i++) {
683
+ if (wordstr[i] == ' ')
684
+ buff[i] = unrecognised;
685
+ else
686
+ buff[i] = wordstr[i];
687
+ }
688
+ buff[i] = '\0';
689
+
690
+ if (fp == stdout) { //debug path: report through tprintf, write nothing to fp
691
+ tprintf ("Cooked=%s, %d segs, acceptable=%d",
692
+ buff, num_popped - old_segs, acceptable);
693
+ old_segs = num_popped;
694
+ return;
695
+ }
696
+
697
+ if (text.length () > 0) {
698
+ for (index = 0; index < word->space (); index++) {
699
+ status = fprintf (fp, " ");
700
+ havespace = 1;
701
+ if (status < 0)
702
+ WRITEFAILED.error ("write_cooked_text", EXIT,
703
+ "Space Errno: %d", errno);
704
+ }
705
+ if (pass2) {
706
+ status = fprintf (fp, BOLD_ON);
707
+ if (status < 0)
708
+ WRITEFAILED.error ("write_cooked_text", EXIT,
709
+ "Bold Errno: %d", errno);
710
+ }
711
+ if (!acceptable) {
712
+ status = fprintf (fp, UNDERLINE_ON);
713
+ if (status < 0)
714
+ WRITEFAILED.error ("write_cooked_text", EXIT,
715
+ "Underline Errno: %d", errno);
716
+ }
717
+
718
+ //xiaofan
719
+ if (NO_BLOCK && word && strlen (buff)) { //word was already dereferenced above, so its NULL test is moot
720
+ mybox = word->bounding_box ();
721
+ if (newaline || !havespace) {
722
+ fprintf (fp, " ");
723
+ newaline = 0;
724
+ }
725
+ fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
726
+ XOFFSET + mybox.left (),
727
+ YOFFSET + page_image.get_ysize () - mybox.top (),
728
+ XOFFSET + mybox.right (),
729
+ YOFFSET + page_image.get_ysize () - mybox.bottom ());
730
+ havespace = 0;
731
+ }
732
+
733
+ status = fprintf (fp, "%s", buff);
734
+ if (status < 0)
735
+ WRITEFAILED.error ("write_cooked_text", EXIT,
736
+ "Word Errno: %d", errno);
737
+ if (pass2) {
738
+ status = fprintf (fp, BOLD_OFF);
739
+ if (status < 0)
740
+ WRITEFAILED.error ("write_cooked_text", EXIT,
741
+ "Bold off Errno: %d", errno);
742
+ }
743
+ if (!acceptable) {
744
+ status = fprintf (fp, UNDERLINE_OFF);
745
+ if (status < 0)
746
+ WRITEFAILED.error ("write_cooked_text", EXIT,
747
+ "Underline off Errno: %d", errno);
748
+ }
749
+ }
750
+ if (word->flag (W_EOL)) {
751
+ status = fprintf (fp, "\n");
752
+ newaline = 1;
753
+ if (status < 0)
754
+ WRITEFAILED.error ("write_cooked_text", EXIT,
755
+ "Newline Errno: %d", errno);
756
+ }
757
+ status = fflush (fp);
758
+ if (status != 0)
759
+ WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
760
+ }
761
+ #endif
762
+
763
+
764
+ /**********************************************************************
765
+ * write_shm_text
766
+ *
767
+ * Write the cooked text to the shared memory for the api.
768
+ **********************************************************************/
769
+
770
+ void write_shm_text( //write output
771
+ WERD_RES *word, //word to do
772
+ BLOCK *block, //block it is from
773
+ ROW_RES *row, //row it is from
774
+ const STRING &text, //text to write
775
+ const STRING &text_lengths
776
+ ) {
777
+ inT32 index; //char counter
778
+ inT32 index2; //char counter
779
+ inT32 length; //chars in word
780
+ inT32 ptsize; //font size
781
+ inT8 blanks; //blanks in word
782
+ uinT8 enhancement; //bold etc
783
+ uinT8 font; //font index
784
+ char unrecognised = STRING (unrecognised_char)[0];
785
+ PBLOB *blob;
786
+ TBOX blob_box; //bounding box
787
+ PBLOB_IT blob_it; //blob iterator
788
+ WERD copy_outword; // copy to denorm
789
+ uinT32 rating; //of char
790
+ BOOL8 lineend; //end of line
791
+ int offset;
792
+ int offset2;
793
+
794
+ //point size
795
+ ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
796
+ if (word->word->flag (W_BOL) && ocr_char_space () < 128
797
+ && ocr_send_text (TRUE) != OKAY)
798
+ return; //release failed
799
+ copy_outword = *(word->outword);
800
+ copy_outword.baseline_denormalise (&word->denorm);
801
+ blob_it.set_to_list (copy_outword.blob_list ());
802
+ length = text_lengths.length ();
803
+
804
+ if (length > 0) {
805
+ blanks = word->word->space ();
806
+ if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
807
+ blanks = 1;
808
+ for (index = 0, offset = 0; index < length;
809
+ offset += text_lengths[index++], blob_it.forward ()) {
810
+ blob = blob_it.data ();
811
+ blob_box = blob->bounding_box ();
812
+
813
+ enhancement = 0;
814
+ if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
815
+ enhancement |= EUC_ITALIC;
816
+ if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
817
+ enhancement |= EUC_BOLD;
818
+ if (tessedit_write_ratings)
819
+ rating = (uinT32) (-word->best_choice->certainty () / 0.035);
820
+ else if (tessedit_zero_rejection)
821
+ rating = text[offset] == ' ' ? 100 : 0;
822
+ else
823
+ rating = word->reject_map[index].accepted ()? 0 : 100;
824
+ if (rating > 255)
825
+ rating = 255;
826
+ if (word->font1_count > 2)
827
+ font = word->font1;
828
+ else if (row->font1_count > 8)
829
+ font = row->font1;
830
+ else
831
+ //font index
832
+ font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
833
+
834
+ lineend = word->word->flag (W_EOL) && index == length - 1;
835
+ if (word->word->flag (W_EOL) && tessedit_zero_rejection
836
+ && index < length - 1 && text[index + text_lengths[index]] == ' ') {
837
+ for (index2 = index + 1, offset2 = offset + text_lengths[index];
838
+ index2 < length && text[offset2] == ' ';
839
+ offset2 += text_lengths[index2++]);
840
+ if (index2 == length)
841
+ lineend = TRUE;
842
+ }
843
+
844
+ if (!tessedit_zero_rejection || text[offset] != ' '
845
+ || tessedit_word_for_word) {
846
+ //confidence
847
+ if (text[offset] == ' ') {
848
+ ocr_append_char (unrecognised,
849
+ blob_box.left (), blob_box.right (),
850
+ page_image.get_ysize () - 1 - blob_box.top (),
851
+ page_image.get_ysize () - 1 - blob_box.bottom (),
852
+ font, (uinT8) rating,
853
+ ptsize, //point size
854
+ blanks, enhancement, //enhancement
855
+ OCR_CDIR_LEFT_RIGHT,
856
+ OCR_LDIR_DOWN_RIGHT,
857
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
858
+ } else {
859
+ for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
860
+ ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
861
+ blob_box.left (), blob_box.right (),
862
+ page_image.get_ysize () - 1 - blob_box.top (),
863
+ page_image.get_ysize () - 1 - blob_box.bottom (),
864
+ font, (uinT8) rating,
865
+ ptsize, //point size
866
+ blanks, enhancement, //enhancement
867
+ OCR_CDIR_LEFT_RIGHT,
868
+ OCR_LDIR_DOWN_RIGHT,
869
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
870
+ }
871
+ blanks = 0;
872
+ }
873
+
874
+ }
875
+ }
876
+ else if (tessedit_word_for_word) {
877
+ blanks = word->word->space ();
878
+ if (blanks == 0 && !word->word->flag (W_BOL))
879
+ blanks = 1;
880
+ blob_box = word->word->bounding_box ();
881
+
882
+ enhancement = 0;
883
+ if (word->italic > 0)
884
+ enhancement |= EUC_ITALIC;
885
+ if (word->bold > 0)
886
+ enhancement |= EUC_BOLD;
887
+ rating = 100;
888
+ if (word->font1_count > 2)
889
+ font = word->font1;
890
+ else if (row->font1_count > 8)
891
+ font = row->font1;
892
+ else
893
+ //font index
894
+ font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
895
+
896
+ lineend = word->word->flag (W_EOL);
897
+
898
+ //font index
899
+ ocr_append_char (unrecognised,
900
+ blob_box.left (), blob_box.right (),
901
+ page_image.get_ysize () - 1 - blob_box.top (),
902
+ page_image.get_ysize () - 1 - blob_box.bottom (),
903
+ font,
904
+ rating, //confidence
905
+ ptsize, //point size
906
+ blanks, enhancement, //enhancement
907
+ OCR_CDIR_LEFT_RIGHT,
908
+ OCR_LDIR_DOWN_RIGHT,
909
+ lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
910
+ }
911
+ }
912
+
913
+
914
/**********************************************************************
 * write_map
 *
 * Write a map file of 0's and 1's which associates characters from the .txt
 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
 * is kept.  Note that there may be reject regions in the .etx file WITHOUT
 * .txt chars being rejected.  The map file should be the same length, and
 * the same number of lines as the .txt file
 *
 * The parameterised input is because I thought I might be able to generate
 * multiple map files in a single run.  However, it didn't work because
 * newdiff needs etx files!
 *
 * NOTE: this function is currently compiled out by the #if 0 below.
 **********************************************************************/

#if 0
void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
  inT16 pos;                   //space / char counter
  int rc;                      //result of the last stdio call
  STRING flags = "";           //one '0' or '1' per character of the word

  if (word->best_choice->string ().length () > 0) {
    /* One flag per blank that precedes the word. */
    for (pos = 0; pos < word->word->space (); pos++) {
      /* Write rejected spaces to .map file ONLY. Newdiff converts these back
         to accepted spaces AFTER generating basic space stats but BEFORE
         using .etx */
      rc = fprintf (mapfile,
                    (word->reject_spaces &&
                     (suspect_level >= suspect_space_level) &&
                     !tessedit_minimal_rejection &&
                     !tessedit_zero_rejection) ? "0" : "1");
      if (rc < 0)
        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
    }

    if (!(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
      ASSERT_HOST (word->reject_map.length () ==
                   word->best_choice->string ().length ());

      /* One flag per character, taken from the reject map. */
      for (pos = 0; pos < word->reject_map.length (); pos++)
        flags += word->reject_map[pos].accepted () ? '1' : '0';
    }
    else {
      /* Repeated-char words written as rep codes get five kept flags. */
      for (pos = 0; pos < 5; pos++)
        flags += '1';
    }

    rc = fprintf (mapfile, "%s", flags.string ());
    if (rc < 0)
      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
  }

  if (word->word->flag (W_EOL)) {
    rc = fprintf (mapfile, "\n");
    if (rc < 0)
      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
  }

  rc = fflush (mapfile);
  if (rc != 0)
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif
979
+
980
+
981
+ /*************************************************************************
982
+ * open_file()
983
+ *************************************************************************/
984
+
985
+ FILE *open_outfile( //open .map & .unlv file
986
+ const char *extension) {
987
+ STRING file_name;
988
+ FILE *outfile;
989
+
990
+ file_name = imagebasename + extension;
991
+ if (!(outfile = fopen (file_name.string (), "w"))) {
992
+ CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
993
+ file_name.string (), errno);
994
+ }
995
+ return outfile;
996
+ }
997
+
998
+
999
#if 0
/*************************************************************************
 * write_unlv_text()
 *
 * Write one word to unlv_file: its leading spaces (prefixed with '^' when
 * they are suspect), then its text with ' ', '~', '^' and '|' replaced by
 * the unrecognised char and a '^' suspect marker before each rejected
 * char, then a newline if the word ends a line.
 *
 * NOTE: this function is currently compiled out by the enclosing #if 0.
 * NOTE(review): reject_map is indexed with the byte index into the word
 * string; confirm that is intended for multi-byte characters.
 *************************************************************************/
void write_unlv_text(WERD_RES *word) {
  const char *wordstr;

  char buff[512];                //string to output
  int i = 0;
  int j = 0;
  char unrecognised = STRING (unrecognised_char)[0];
  int status;
  char space_str[3];

  wordstr = word->best_choice->string ().string ();

  /* DONT need to do anything special for repeated char words - at this stage
     the repetition char has been identified and any other chars have been
     rejected.
   */

  for (; wordstr[i] != '\0'; i++) {
    /* Each input byte can produce up to two output bytes (marker + char).
       Stop while there is still room for both plus the terminator, so an
       over-long word is truncated instead of overrunning buff. */
    if (j + 2 >= (int) sizeof (buff))
      break;
    if ((wordstr[i] == ' ') ||
      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
      buff[j++] = unrecognised;
    else {
      if (word->reject_map[i].rejected ())
        buff[j++] = '^';         //Add suspect marker
      buff[j++] = wordstr[i];
    }
  }
  buff[j] = '\0';

  if (strlen (wordstr) > 0) {
    if (word->reject_spaces &&
      (suspect_level >= suspect_space_level) &&
      !tessedit_minimal_rejection && !tessedit_zero_rejection)
      strcpy (space_str, "^ ");  //Suspect space
    else
      strcpy (space_str, " ");   //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
                           "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", buff);
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
                         "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
1059
+
1060
+
1061
+ /*************************************************************************
1062
+ * get_rep_char()
1063
+ * Return the first accepted character from the repetition string. This is the
1064
+ * character which is repeated - as determined earlier by fix_rep_char()
1065
+ *************************************************************************/
1066
+ UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
1067
+ int i;
1068
+ int offset;
1069
+
1070
+ for (i = 0, offset = 0;
1071
+ ((i < word->reject_map.length ()) &&
1072
+ (word->reject_map[i].rejected ()));
1073
+ offset += word->best_choice->lengths()[i++]);
1074
+ if (i < word->reject_map.length ())
1075
+ return unicharset.unichar_to_id(word->best_choice->string().string()
1076
+ + offset,
1077
+ word->best_choice->lengths()[i]);
1078
+ else
1079
+ return unicharset.unichar_to_id(unrecognised_char.string());
1080
+ }
1081
+
1082
+ void ensure_rep_chars_are_consistent(WERD_RES *word) {
1083
+ #if 0
1084
+ char rep_char = get_rep_char (word);
1085
+ char *ptr;
1086
+
1087
+ ptr = (char *) word->best_choice->string ().string ();
1088
+ for (; *ptr != '\0'; ptr++) {
1089
+ if (*ptr != rep_char)
1090
+ *ptr = rep_char;
1091
+ }
1092
+ #endif
1093
+
1094
+ #if 0
1095
+ UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
1096
+ int i;
1097
+ char *ptr;
1098
+ STRING consistent_string;
1099
+ STRING consistent_string_lengths;
1100
+
1101
+ ptr = (char *) word->best_choice->string ().string ();
1102
+ for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
1103
+ consistent_string += unicharset.id_to_unichar(rep_char);
1104
+ consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
1105
+ }
1106
+ word->best_choice->string() = consistent_string;
1107
+ word->best_choice->lengths() = consistent_string_lengths;
1108
+ #endif
1109
+ }
1110
+
1111
+ /*************************************************************************
1112
+ * SUSPECT LEVELS
1113
+ *
1114
+ * 0 - dont reject ANYTHING
1115
+ * 1,2 - partial rejection
1116
+ * 3 - BEST
1117
+ *
1118
+ * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
1119
+ * tessedit_minimal_rejection.
1120
+ *************************************************************************/
1121
+
1122
+ void set_unlv_suspects(WERD_RES *word) {
1123
+ int len = word->reject_map.length ();
1124
+ int i;
1125
+ int offset;
1126
+ const char *ptr;
1127
+ const char *lengths = word->best_choice->lengths ().string ();
1128
+ float rating_per_ch;
1129
+
1130
+ ptr = word->best_choice->string ().string ();
1131
+
1132
+ if (suspect_level == 0) {
1133
+ for (i = 0; i < len; i++) {
1134
+ if (word->reject_map[i].rejected ())
1135
+ word->reject_map[i].setrej_minimal_rej_accept ();
1136
+ }
1137
+ return;
1138
+ }
1139
+
1140
+ if (suspect_level >= 3)
1141
+ return; //Use defaults
1142
+
1143
+ /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
1144
+
1145
+ if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
1146
+ suspect_short_words)) {
1147
+ /* Unreject alphas in dictionary words */
1148
+ for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
1149
+ if (word->reject_map[i].rejected () &&
1150
+ unicharset.get_isalpha (ptr + offset, lengths[i]))
1151
+ word->reject_map[i].setrej_minimal_rej_accept ();
1152
+ }
1153
+ }
1154
+
1155
+ rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
1156
+
1157
+ if (rating_per_ch >= suspect_rating_per_ch)
1158
+ return; //Dont touch bad ratings
1159
+
1160
+ if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
1161
+ /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
1162
+ for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
1163
+ if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
1164
+ word->reject_map[i].setrej_minimal_rej_accept ();
1165
+ }
1166
+ }
1167
+
1168
+ for (i = 0; i < len; i++) {
1169
+ if (word->reject_map[i].rejected ()) {
1170
+ if (word->reject_map[i].flag (R_DOC_REJ))
1171
+ word->reject_map[i].setrej_minimal_rej_accept ();
1172
+ if (word->reject_map[i].flag (R_BLOCK_REJ))
1173
+ word->reject_map[i].setrej_minimal_rej_accept ();
1174
+ if (word->reject_map[i].flag (R_ROW_REJ))
1175
+ word->reject_map[i].setrej_minimal_rej_accept ();
1176
+ }
1177
+ }
1178
+
1179
+ if (suspect_level == 2)
1180
+ return;
1181
+
1182
+ if (!suspect_constrain_1Il ||
1183
+ (word->reject_map.length () <= suspect_short_words)) {
1184
+ for (i = 0; i < len; i++) {
1185
+ if (word->reject_map[i].rejected ()) {
1186
+ if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
1187
+ word->reject_map[i].flag (R_POSTNN_1IL)))
1188
+ word->reject_map[i].setrej_minimal_rej_accept ();
1189
+
1190
+ if (!suspect_constrain_1Il &&
1191
+ word->reject_map[i].flag (R_MM_REJECT))
1192
+ word->reject_map[i].setrej_minimal_rej_accept ();
1193
+ }
1194
+ }
1195
+ }
1196
+
1197
+ if ((acceptable_word_string (word->best_choice->string ().string (),
1198
+ word->best_choice->lengths ().string ())
1199
+ != AC_UNACCEPTABLE) ||
1200
+ acceptable_number_string (word->best_choice->string ().string (),
1201
+ word->best_choice->lengths ().string ())) {
1202
+ if (word->reject_map.length () > suspect_short_words) {
1203
+ for (i = 0; i < len; i++) {
1204
+ if (word->reject_map[i].rejected () &&
1205
+ (!word->reject_map[i].perm_rejected () ||
1206
+ word->reject_map[i].flag (R_1IL_CONFLICT) ||
1207
+ word->reject_map[i].flag (R_POSTNN_1IL) ||
1208
+ word->reject_map[i].flag (R_MM_REJECT))) {
1209
+ word->reject_map[i].setrej_minimal_rej_accept ();
1210
+ }
1211
+ }
1212
+ }
1213
+ }
1214
+ }
1215
+
1216
+
1217
+ inT16 count_alphas( //how many alphas
1218
+ const char *s,
1219
+ const char *lengths) {
1220
+ int count = 0;
1221
+
1222
+ for (; *s != '\0'; s += *(lengths++)) {
1223
+ if (unicharset.get_isalpha(s, *lengths))
1224
+ count++;
1225
+ }
1226
+ return count;
1227
+ }
1228
+
1229
+
1230
+ inT16 count_alphanums( //how many alphanums
1231
+ const char *s,
1232
+ const char *lengths) {
1233
+ int count = 0;
1234
+
1235
+ for (; *s != '\0'; s += *(lengths++)) {
1236
+ if (unicharset.get_isalpha(s, *lengths) ||
1237
+ unicharset.get_isdigit(s, *lengths))
1238
+ count++;
1239
+ }
1240
+ return count;
1241
+ }
1242
+
1243
+
1244
+ BOOL8 acceptable_number_string(const char *s,
1245
+ const char *lengths) {
1246
+ BOOL8 prev_digit = FALSE;
1247
+
1248
+ if (*lengths == 1 && *s == '(')
1249
+ s++;
1250
+
1251
+ if (*lengths == 1 &&
1252
+ ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
1253
+ s++;
1254
+
1255
+ for (; *s != '\0'; s += *(lengths++)) {
1256
+ if (unicharset.get_isdigit (s, *lengths))
1257
+ prev_digit = TRUE;
1258
+ else if (prev_digit &&
1259
+ (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
1260
+ prev_digit = FALSE;
1261
+ else if (prev_digit && *lengths == 1 &&
1262
+ (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
1263
+ return TRUE;
1264
+ else if (prev_digit &&
1265
+ *lengths == 1 && (*s == '%') &&
1266
+ (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
1267
+ (*(s + *lengths + *(lengths + 1)) == '\0'))
1268
+ return TRUE;
1269
+ else
1270
+ return FALSE;
1271
+ }
1272
+ return TRUE;
1273
+ }