tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612):
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,1458 @@
1
+ /******************************************************************************
2
+ ** Filename: stopper.c
3
+ ** Purpose: Stopping criteria for word classifier.
4
+ ** Author: Dan Johnson
5
+ ** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
6
+ **
7
+ ** (c) Copyright Hewlett-Packard Company, 1988.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ ******************************************************************************/
18
+ /**----------------------------------------------------------------------------
19
+ Include Files and Type Defines
20
+ ----------------------------------------------------------------------------**/
21
+ #include "stopper.h"
22
+ #include "emalloc.h"
23
+ #include "matchdefs.h"
24
+ #include "debug.h"
25
+ #include "callcpp.h"
26
+ #include "permute.h"
27
+ #include "context.h"
28
+ #include "permnum.h"
29
+ #include "danerror.h"
30
+ #include "const.h"
31
+ #include "freelist.h"
32
+ #include "efio.h"
33
+ #include "globals.h"
34
+ #include "scanutils.h"
35
+ #include "unichar.h"
36
+
37
+ #include <stdio.h>
38
+ #include <string.h>
39
+ #include <ctype.h>
40
+ #include <math.h>
41
+ #ifdef __UNIX__
42
+ #include <assert.h>
43
+ #endif
44
+
45
+ /* these are kludges - add appropriate .h file later */
46
+ extern float CertaintyScale; /* from subfeat.h */
47
+
48
+ #define MAX_WERD_SIZE 100
49
+ #define MAX_AMBIG_SIZE 3
50
+ #define DANGEROUS_AMBIGS "DangAmbigs"
51
+
52
+ typedef LIST AMBIG_TABLE;
53
+
54
+ typedef struct
55
+ {
56
+ UNICHAR_ID Class;
57
+ uinT16 NumChunks;
58
+ float Certainty;
59
+ }
60
+
61
+
62
+ CHAR_CHOICE;
63
+
64
+ typedef struct
65
+ {
66
+ float Rating;
67
+ float Certainty;
68
+ FLOAT32 AdjustFactor;
69
+ int Length;
70
+ CHAR_CHOICE Blob[1];
71
+ } VIABLE_CHOICE_STRUCT;
72
+ typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
73
+
74
+ typedef struct
75
+ {
76
+ VIABLE_CHOICE Choice;
77
+ float ChunkCertainty[MAX_NUM_CHUNKS];
78
+ UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
79
+ }
80
+
81
+
82
+ EXPANDED_CHOICE;
83
+
84
+ typedef struct
85
+ {
86
+ char ambig[2 * (UNICHAR_LEN * MAX_AMBIG_SIZE) + 2];
87
+ char lengths[2 * (MAX_AMBIG_SIZE) + 2];
88
+ } AMBIG_SPEC;
89
+
90
+ /**----------------------------------------------------------------------------
91
+ Macros
92
+ ----------------------------------------------------------------------------**/
93
/* Accessors for the fields of the first (best) entry of a list of
   VIABLE_CHOICEs. */
#define BestCertainty(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
#define BestRating(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Rating)
#define BestFactor(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)

/* Certainty threshold derived from the difference between two adjust
   factors: (F2 - F1) scaled by AmbigThresholdGain, less
   AmbigThresholdOffset. */
#define AmbigThreshold(F1,F2) \
  (((F2) - (F1)) * AmbigThresholdGain - AmbigThresholdOffset)
99
+
100
+ /*---------------------------------------------------------------------------
101
+ Private Function Prototoypes
102
+ ----------------------------------------------------------------------------*/
103
+ void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
104
+
105
+ int AmbigsFound(char *Word,
106
+ char *CurrentChar,
107
+ const char *Tail,
108
+ const char *Tail_lengths,
109
+ LIST Ambigs,
110
+ DANGERR *fixpt);
111
+
112
+ int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice);
113
+
114
+ int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
115
+ void *arg2); //VIABLE_CHOICE Choice2);
116
+
117
+ void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
118
+
119
+ AMBIG_TABLE *FillAmbigTable();
120
+
121
+ int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
122
+ void *item2); //EXPANDED_CHOICE *BestChoice);
123
+
124
+ int LengthOfShortestAlphaRun(register char *Word, const char *Word_lengths);
125
+
126
+ VIABLE_CHOICE NewViableChoice (A_CHOICE * Choice,
127
+ FLOAT32 AdjustFactor, float Certainties[]);
128
+
129
+ void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
130
+
131
+ void ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
132
+ A_CHOICE * NewChoice,
133
+ FLOAT32 AdjustFactor, float Certainties[]);
134
+
135
+ int StringSameAs(const char *String,
136
+ const char *String_lengths,
137
+ VIABLE_CHOICE ViableChoice);
138
+
139
+ int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice);
140
+
141
+ /**----------------------------------------------------------------------------
142
+ Global Data Definitions and Declarations
143
+ ----------------------------------------------------------------------------**/
144
+ /* Name of file containing potentially dangerous ambiguities */
145
+ static const char *DangerousAmbigs = DANGEROUS_AMBIGS;
146
+
147
+ /* Word for which stopper debug information should be printed to stdout */
148
+ static char *WordToDebug = NULL;
149
+ static char *WordToDebug_lengths = NULL;
150
+
151
+ /* flag used to disable accumulation of word choices during compound word
152
+ permutation */
153
+ BOOL8 KeepWordChoices = TRUE;
154
+
155
+ /* additional certainty padding allowed before a word is rejected */
156
+ static FLOAT32 RejectOffset = 0.0;
157
+
158
+ /* structures to keep track of viable word choices */
159
+ static VIABLE_CHOICE BestRawChoice = NULL;
160
+ static LIST BestChoices = NIL;
161
+ static PIECES_STATE CurrentSegmentation;
162
+
163
+ make_float_var (NonDictCertainty, -2.50, MakeNonDictCertainty,
164
+ 17, 2, SetNonDictCertainty,
165
+ "Certainty threshold for non-dict words");
166
+
167
+ make_float_var (RejectCertaintyOffset, 1.0, MakeRejectCertaintyOffset,
168
+ 17, 3, SetRejectCertaintyOffset, "Reject certainty offset");
169
+
170
+ make_int_var (SmallWordSize, 2, MakeSmallWordSize,
171
+ 17, 4, SetSmallWordSize,
172
+ "Size of dict word to be treated as non-dict word");
173
+
174
+ make_float_var (CertaintyPerChar, -0.50, MakeCertaintyPerChar,
175
+ 17, 5, SetCertaintyPerChar,
176
+ "Certainty to add for each dict char above SmallWordSize");
177
+
178
+ make_float_var (CertaintyVariation, 3.0, MakeCertaintyVariation,
179
+ 17, 6, SetCertaintyVariation,
180
+ "Max certaintly variation allowed in a word (in sigma)");
181
+
182
+ make_int_var (StopperDebugLevel, 0, MakeStopperDebugLevel,
183
+ 17, 7, SetStopperDebugLevel, "Stopper debug level");
184
+
185
+ make_float_var (AmbigThresholdGain, 8.0, MakeAmbigThresholdGain,
186
+ 17, 8, SetAmbigThresholdGain,
187
+ "Gain factor for ambiguity threshold");
188
+
189
+ make_float_var (AmbigThresholdOffset, 1.5, MakeAmbigThresholdOffset,
190
+ 17, 9, SetAmbigThresholdOffset,
191
+ "Certainty offset for ambiguity threshold");
192
+
193
+ extern int first_pass;
194
+ INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
195
+
196
+ /**----------------------------------------------------------------------------
197
+ Public Code
198
+ ----------------------------------------------------------------------------**/
199
+ /*---------------------------------------------------------------------------*/
200
+ int AcceptableChoice(CHOICES_LIST Choices,
201
+ A_CHOICE *BestChoice,
202
+ A_CHOICE *RawChoice,
203
+ DANGERR *fixpt) {
204
+ /*
205
+ ** Parameters:
206
+ ** Choices choices for current segmentation
207
+ ** BestChoice best choice for current segmentation
208
+ ** RawChoice best raw choice for current segmentation
209
+ ** Globals:
210
+ ** NonDictCertainty certainty for a non-dict word
211
+ ** SmallWordSize size of word to be treated as non-word
212
+ ** CertaintyPerChar certainty to add for each dict char
213
+ ** Operation: Return TRUE if the results from this segmentation are
214
+ ** good enough to stop. Otherwise return FALSE.
215
+ ** Return: TRUE or FALSE.
216
+ ** Exceptions: none
217
+ ** History: Mon Apr 29 14:57:32 1991, DSJ, Created.
218
+ */
219
+ float CertaintyThreshold = NonDictCertainty;
220
+ int WordSize;
221
+
222
+ if (fixpt != NULL)
223
+ fixpt->index = -1;
224
+ if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
225
+ return (FALSE);
226
+
227
+ if (StopperDebugLevel >= 1)
228
+ cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n",
229
+ class_string (BestChoice),
230
+ (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
231
+ (case_ok (class_string (BestChoice),
232
+ class_lengths (BestChoice)) ? 'y' : 'n'),
233
+ ((punctuation_ok (class_string (BestChoice),
234
+ class_lengths (BestChoice)) !=
235
+ -1) ? 'y' : 'n'));
236
+
237
+ if (valid_word (class_string (BestChoice)) &&
238
+ case_ok (class_string (BestChoice), class_lengths (BestChoice)) &&
239
+ punctuation_ok (class_string (BestChoice),
240
+ class_lengths (BestChoice)) != -1) {
241
+ WordSize = LengthOfShortestAlphaRun (class_string (BestChoice),
242
+ class_lengths (BestChoice));
243
+ WordSize -= SmallWordSize;
244
+ if (WordSize < 0)
245
+ WordSize = 0;
246
+ CertaintyThreshold += WordSize * CertaintyPerChar;
247
+ }
248
+ else if (stopper_numbers_on && valid_number (class_string (BestChoice),
249
+ class_lengths (BestChoice))) {
250
+ CertaintyThreshold += stopper_numbers_on * CertaintyPerChar;
251
+ }
252
+
253
+ if (StopperDebugLevel >= 1)
254
+ cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
255
+ class_certainty (BestChoice), CertaintyThreshold);
256
+
257
+ if (NoDangerousAmbig (class_string (BestChoice),
258
+ class_lengths (BestChoice), fixpt)
259
+ && class_certainty (BestChoice) > CertaintyThreshold &&
260
+ UniformCertainties (Choices, BestChoice))
261
+ return (TRUE);
262
+ else
263
+ return (FALSE);
264
+
265
+ } /* AcceptableChoice */
266
+
267
+
268
+ /*---------------------------------------------------------------------------*/
269
+ int AcceptableResult(A_CHOICE *BestChoice, A_CHOICE *RawChoice) {
270
+ /*
271
+ ** Parameters:
272
+ ** BestChoice best choice for current word
273
+ ** RawChoice best raw choice for current word
274
+ ** Globals:
275
+ ** NonDictCertainty certainty for a non-dict word
276
+ ** SmallWordSize size of word to be treated as non-word
277
+ ** CertaintyPerChar certainty to add for each dict char
278
+ ** BestChoices list of all good choices found
279
+ ** RejectOffset allowed offset before a word is rejected
280
+ ** Operation: Return FALSE if the best choice for the current word
281
+ ** is questionable and should be tried again on the second
282
+ ** pass or should be flagged to the user.
283
+ ** Return: TRUE or FALSE.
284
+ ** Exceptions: none
285
+ ** History: Thu May 9 14:05:05 1991, DSJ, Created.
286
+ */
287
+ float CertaintyThreshold = NonDictCertainty - RejectOffset;
288
+ int WordSize;
289
+
290
+ if (StopperDebugLevel >= 1)
291
+ cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
292
+ class_string (BestChoice),
293
+ (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
294
+ (case_ok (class_string (BestChoice),
295
+ class_lengths (BestChoice)) ? 'y' : 'n'),
296
+ ((punctuation_ok (class_string (BestChoice),
297
+ class_lengths (BestChoice)) != -1) ? 'y' : 'n'),
298
+ ((rest (BestChoices) != NIL) ? 'n' : 'y'));
299
+
300
+ if ((BestChoice == NULL) ||
301
+ (class_string (BestChoice) == NULL) || CurrentWordAmbig ())
302
+ return (FALSE);
303
+
304
+ if (valid_word (class_string (BestChoice)) &&
305
+ case_ok (class_string (BestChoice), class_lengths (BestChoice)) &&
306
+ punctuation_ok (class_string (BestChoice),
307
+ class_lengths (BestChoice)) != -1) {
308
+ WordSize = LengthOfShortestAlphaRun (class_string (BestChoice),
309
+ class_lengths (BestChoice));
310
+ WordSize -= SmallWordSize;
311
+ if (WordSize < 0)
312
+ WordSize = 0;
313
+ CertaintyThreshold += WordSize * CertaintyPerChar;
314
+ }
315
+
316
+ if (StopperDebugLevel >= 1)
317
+ cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
318
+ class_certainty (BestChoice), CertaintyThreshold);
319
+
320
+ if (class_certainty (BestChoice) > CertaintyThreshold) {
321
+ if (StopperDebugLevel >= 1)
322
+ cprintf ("ACCEPTED\n");
323
+ return (TRUE);
324
+ }
325
+ else {
326
+ if (StopperDebugLevel >= 1)
327
+ cprintf ("REJECTED\n");
328
+ return (FALSE);
329
+ }
330
+ } /* AcceptableResult */
331
+
332
+
333
+ /*---------------------------------------------------------------------------*/
334
+ int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
335
+ /*
336
+ ** Parameters:
337
+ ** Threshold minimum adjust factor for alternative choices
338
+ ** Globals:
339
+ ** BestChoices alternative choices for current word
340
+ ** Operation: This routine returns TRUE if there are no alternative
341
+ ** choices for the current word OR if all alternatives have
342
+ ** an adjust factor worse than Threshold.
343
+ ** Return: TRUE or FALSE.
344
+ ** Exceptions: none
345
+ ** History: Mon Jun 3 09:36:31 1991, DSJ, Created.
346
+ */
347
+ LIST Alternatives;
348
+ VIABLE_CHOICE Choice;
349
+
350
+ Alternatives = rest (BestChoices);
351
+ iterate(Alternatives) {
352
+ Choice = (VIABLE_CHOICE) first_node (Alternatives);
353
+ if (Choice->AdjustFactor <= Threshold)
354
+ return (FALSE);
355
+ }
356
+
357
+ return (TRUE);
358
+
359
+ } /* AlternativeChoicesWorseThan */
360
+
361
+
362
+ /*---------------------------------------------------------------------------*/
363
+ int CurrentBestChoiceIs(const char *Word, const char *Word_lengths) {
364
+ /*
365
+ ** Parameters:
366
+ ** Word string to compare to current best choice
367
+ ** Word_lengths lengths of unichars in Word
368
+ ** Globals:
369
+ ** BestChoices set of best choices for current word
370
+ ** Operation: Returns TRUE if Word is the same as the current best
371
+ ** choice, FALSE otherwise.
372
+ ** Return: TRUE or FALSE
373
+ ** Exceptions: none
374
+ ** History: Thu May 30 14:44:22 1991, DSJ, Created.
375
+ */
376
+ return (BestChoices != NIL &&
377
+ StringSameAs (Word, Word_lengths,
378
+ (VIABLE_CHOICE) first_node (BestChoices)));
379
+
380
+ } /* CurrentBestChoiceIs */
381
+
382
+
383
+ /*---------------------------------------------------------------------------*/
384
+ FLOAT32 CurrentBestChoiceAdjustFactor() {
385
+ /*
386
+ ** Parameters: none
387
+ ** Globals:
388
+ ** BestChoices set of best choices for current word
389
+ ** Operation: Return the adjustment factor for the best choice for
390
+ ** the current word.
391
+ ** Return: Adjust factor for current best choice.
392
+ ** Exceptions: none
393
+ ** History: Thu May 30 14:48:24 1991, DSJ, Created.
394
+ */
395
+ VIABLE_CHOICE BestChoice;
396
+
397
+ if (BestChoices == NIL)
398
+ return (MAX_FLOAT32);
399
+
400
+ BestChoice = (VIABLE_CHOICE) first_node (BestChoices);
401
+ return (BestChoice->AdjustFactor);
402
+
403
+ } /* CurrentBestChoiceAdjustFactor */
404
+
405
+
406
+ /*---------------------------------------------------------------------------*/
407
+ int CurrentWordAmbig() {
408
+ /*
409
+ ** Parameters: none
410
+ ** Globals:
411
+ ** BestChoices set of best choices for current word
412
+ ** Operation: This routine returns TRUE if there are multiple good
413
+ ** choices for the current word and FALSE otherwise.
414
+ ** Return: TRUE or FALSE
415
+ ** Exceptions: none
416
+ ** History: Wed May 22 15:38:38 1991, DSJ, Created.
417
+ */
418
+ return (rest (BestChoices) != NIL);
419
+
420
+ } /* CurrentWordAmbig */
421
+
422
+
423
+ /*---------------------------------------------------------------------------*/
424
+ void DebugWordChoices() {
425
+ /*
426
+ ** Parameters: none
427
+ ** Globals:
428
+ ** BestRawChoice
429
+ ** BestChoices
430
+ ** Operation: Print the current choices for this word to stdout.
431
+ ** Return: none
432
+ ** Exceptions: none
433
+ ** History: Wed May 15 13:52:08 1991, DSJ, Created.
434
+ */
435
+ LIST Choices;
436
+ int i;
437
+ char LabelString[80];
438
+
439
+ if (StopperDebugLevel >= 1 ||
440
+ (WordToDebug && BestChoices &&
441
+ StringSameAs (WordToDebug, WordToDebug_lengths,
442
+ (VIABLE_CHOICE) first_node (BestChoices)))) {
443
+ if (BestRawChoice)
444
+ PrintViableChoice (stderr, "\nBest Raw Choice: ", BestRawChoice);
445
+
446
+ i = 1;
447
+ Choices = BestChoices;
448
+ if (Choices)
449
+ cprintf ("\nBest Cooked Choices:\n");
450
+ iterate(Choices) {
451
+ sprintf (LabelString, "Cooked Choice #%d: ", i);
452
+ PrintViableChoice (stderr, LabelString,
453
+ (VIABLE_CHOICE) first_node (Choices));
454
+ i++;
455
+ }
456
+ }
457
+ } /* DebugWordChoices */
458
+
459
+
460
+ /*---------------------------------------------------------------------------*/
461
+ void FilterWordChoices() {
462
+ /*
463
+ ** Parameters: none
464
+ ** Globals:
465
+ ** BestChoices set of choices for current word
466
+ ** Operation: This routine removes from BestChoices all choices which
467
+ ** are not within a reasonable range of the best choice.
468
+ ** Return: none
469
+ ** Exceptions: none
470
+ ** History: Wed May 15 13:08:24 1991, DSJ, Created.
471
+ */
472
+ EXPANDED_CHOICE BestChoice;
473
+
474
+ if (BestChoices == NIL || second_node (BestChoices) == NIL)
475
+ return;
476
+
477
+ /* compute certainties and class for each chunk in best choice */
478
+ ExpandChoice ((VIABLE_CHOICE_STRUCT *) first_node (BestChoices), &BestChoice);
479
+
480
+ set_rest (BestChoices, delete_d (rest (BestChoices),
481
+ &BestChoice, FreeBadChoice));
482
+
483
+ } /* FilterWordChoices */
484
+
485
+
486
+ /*---------------------------------------------------------------------------*/
487
+ void
488
+ FindClassifierErrors (FLOAT32 MinRating,
489
+ FLOAT32 MaxRating,
490
+ FLOAT32 RatingMargin, FLOAT32 Thresholds[]) {
491
+ /*
492
+ ** Parameters:
493
+ ** MinRating limits how tight to make a template
494
+ ** MaxRating limits how loose to make a template
495
+ ** RatingMargin amount of margin to put in template
496
+ ** Thresholds[] place to put error thresholds
497
+ ** Globals: none
498
+ ** Operation: This routine compares the best choice for the current
499
+ ** word to the best raw choice to determine which characters
500
+ ** were classified incorrectly by the classifier. It then
501
+ ** places a separate threshold into Thresholds for each
502
+ ** character in the word. If the classifier was correct,
503
+ ** MaxRating is placed into Thresholds. If the
504
+ ** classifier was incorrect, the avg. match rating (error
505
+ ** percentage) of the classifier's incorrect choice minus
506
+ ** some margin is
507
+ ** placed into thresholds. This can then be used by the
508
+ ** caller to try to create a new template for the desired
509
+ ** class that will classify the character with a rating better
510
+ ** than the threshold value. The match rating placed into
511
+ ** Thresholds is never allowed to be below MinRating in order
512
+ ** to prevent trying to make overly tight templates.
513
+ ** Return: none (results are placed in Thresholds)
514
+ ** Exceptions: none
515
+ ** History: Fri May 31 16:02:57 1991, DSJ, Created.
516
+ */
517
+ EXPANDED_CHOICE BestRaw;
518
+ VIABLE_CHOICE Choice;
519
+ int i, j, Chunk;
520
+ FLOAT32 AvgRating;
521
+ int NumErrorChunks;
522
+
523
+ assert (BestChoices != NIL);
524
+ assert (BestRawChoice != NULL);
525
+
526
+ ExpandChoice(BestRawChoice, &BestRaw);
527
+ Choice = (VIABLE_CHOICE) first_node (BestChoices);
528
+
529
+ for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
530
+ AvgRating = 0.0;
531
+ NumErrorChunks = 0;
532
+
533
+ for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
534
+ if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
535
+ AvgRating += BestRaw.ChunkCertainty[Chunk];
536
+ NumErrorChunks++;
537
+ }
538
+
539
+ if (NumErrorChunks > 0) {
540
+ AvgRating /= NumErrorChunks;
541
+ *Thresholds = (AvgRating / -CertaintyScale) * (1.0 - RatingMargin);
542
+ }
543
+ else
544
+ *Thresholds = MaxRating;
545
+
546
+ if (*Thresholds > MaxRating)
547
+ *Thresholds = MaxRating;
548
+ if (*Thresholds < MinRating)
549
+ *Thresholds = MinRating;
550
+ }
551
+ } /* FindClassifierErrors */
552
+
553
+
554
+ /*---------------------------------------------------------------------------*/
555
+ void InitStopperVars() {
556
+ /*
557
+ ** Parameters: none
558
+ ** Globals: none
559
+ ** Operation: Initializes the control variables used by the stopper.
560
+ ** Return: none
561
+ ** Exceptions: none
562
+ ** History: Thu May 9 10:06:04 1991, DSJ, Created.
563
+ */
564
+ VALUE dummy;
565
+
566
+ string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
567
+ string_variable (WordToDebug, "WordToDebug", "");
568
+ string_variable (WordToDebug_lengths, "WordToDebug_lengths", "");
569
+
570
+ MakeNonDictCertainty();
571
+ MakeRejectCertaintyOffset();
572
+ MakeSmallWordSize();
573
+ MakeCertaintyPerChar();
574
+ MakeCertaintyVariation();
575
+ MakeStopperDebugLevel();
576
+ MakeAmbigThresholdGain();
577
+ MakeAmbigThresholdOffset();
578
+ } /* InitStopperVars */
579
+
580
+
581
+ /*---------------------------------------------------------------------------*/
582
+ void InitChoiceAccum() {
583
+ /*
584
+ ** Parameters: none
585
+ ** Globals: none
586
+ ** Operation: This routine initializes the data structures used to
587
+ ** keep track the good word choices found for a word.
588
+ ** Return: none
589
+ ** Exceptions: none
590
+ ** History: Fri May 17 07:59:00 1991, DSJ, Created.
591
+ */
592
+ BLOB_WIDTH *BlobWidth, *End;
593
+
594
+ if (BestRawChoice)
595
+ memfree(BestRawChoice);
596
+
597
+ if (BestChoices)
598
+ destroy_nodes(BestChoices, memfree);
599
+
600
+ BestRawChoice = NULL;
601
+ BestChoices = NIL;
602
+ EnableChoiceAccum();
603
+
604
+ for (BlobWidth = CurrentSegmentation,
605
+ End = CurrentSegmentation + MAX_NUM_CHUNKS;
606
+ BlobWidth < End; *BlobWidth++ = 1);
607
+
608
+ } /* InitChoiceAccum */
609
+
610
+
611
+ /*---------------------------------------------------------------------------*/
612
+ void
613
+ LogNewRawChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
614
+ /*
615
+ ** Parameters:
616
+ ** Choice new raw choice for current word
617
+ ** AdjustFactor adjustment factor which was applied to choice
618
+ ** Certainties certainties for each char in new choice
619
+ ** Globals:
620
+ ** BestRawChoice best raw choice so far for current word
621
+ ** Operation: This routine compares Choice to the best raw (non-dict)
622
+ ** choice so far and replaces it if the new choice is better.
623
+ ** Return: none
624
+ ** Exceptions: none
625
+ ** History: Wed May 15 09:57:19 1991, DSJ, Created.
626
+ */
627
+ if (!KeepWordChoices)
628
+ return;
629
+
630
+ if (!BestRawChoice)
631
+ BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
632
+ else if (class_probability (Choice) < BestRawChoice->Rating) {
633
+ if (ChoiceSameAs (Choice, BestRawChoice))
634
+ ReplaceDuplicateChoice(BestRawChoice, Choice, AdjustFactor, Certainties);
635
+ else {
636
+ memfree(BestRawChoice);
637
+ BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
638
+ }
639
+ }
640
+ } /* LogNewRawChoice */
641
+
642
+
643
+ /*---------------------------------------------------------------------------*/
644
+ void LogNewSegmentation(PIECES_STATE BlobWidth) {
645
+ /*
646
+ ** Parameters:
647
+ ** BlobWidth[] number of chunks in each blob in segmentation
648
+ ** Globals:
649
+ ** CurrentSegmentation blob widths for current segmentation
650
+ ** Operation: This routine updates the blob widths in CurrentSegmentation
651
+ ** to be the same as provided in BlobWidth.
652
+ ** Return: none
653
+ ** Exceptions: none
654
+ ** History: Mon May 20 11:52:26 1991, DSJ, Created.
655
+ */
656
+ BLOB_WIDTH *Segmentation;
657
+
658
+ for (Segmentation = CurrentSegmentation; *BlobWidth != 0;
659
+ BlobWidth++, Segmentation++)
660
+ *Segmentation = *BlobWidth;
661
+ *Segmentation = 0;
662
+
663
+ } /* LogNewSegmentation */
664
+
665
+
666
+ /*---------------------------------------------------------------------------*/
667
+ void LogNewSplit(int Blob) {
668
+ /*
669
+ ** Parameters:
670
+ ** Blob index of blob that was split
671
+ ** Globals:
672
+ ** BestRawChoice current best raw choice
673
+ ** BestChoices list of best choices found so far
674
+ ** Operation: This routine adds 1 chunk to the specified blob for each
675
+ ** choice in BestChoices and for the BestRawChoice.
676
+ ** Return: none
677
+ ** Exceptions: none
678
+ ** History: Mon May 20 11:38:56 1991, DSJ, Created.
679
+ */
680
+ LIST Choices;
681
+
682
+ if (BestRawChoice) {
683
+ AddNewChunk(BestRawChoice, Blob);
684
+ }
685
+
686
+ Choices = BestChoices;
687
+ iterate(Choices) {
688
+ AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
689
+ }
690
+
691
+ } /* LogNewSplit */
692
+
693
+
694
+ /*---------------------------------------------------------------------------*/
695
+ void
696
+ LogNewWordChoice (A_CHOICE * Choice,
697
+ FLOAT32 AdjustFactor, float Certainties[]) {
698
+ /*
699
+ ** Parameters:
700
+ ** Choice new choice for current word
701
+ ** AdjustFactor adjustment factor which was applied to choice
702
+ ** Certainties certainties for each char in new choice
703
+ ** Globals:
704
+ ** BestChoices best choices so far for current word
705
+ ** Operation: This routine adds Choice to BestChoices if the
706
+ ** adjusted certainty for Choice is within a reasonable range
707
+ ** of the best choice in BestChoices. The BestChoices
708
+ ** list is kept in sorted order by rating. Duplicates are
709
+ ** removed.
710
+ ** Return: none
711
+ ** Exceptions: none
712
+ ** History: Wed May 15 09:57:19 1991, DSJ, Created.
713
+ */
714
+ VIABLE_CHOICE NewChoice;
715
+ LIST Choices;
716
+ FLOAT32 Threshold;
717
+
718
+ if (!KeepWordChoices)
719
+ return;
720
+
721
+ /* throw out obviously bad choices to save some work */
722
+ if (BestChoices != NIL) {
723
+ Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
724
+ if (Threshold > -AmbigThresholdOffset)
725
+ Threshold = -AmbigThresholdOffset;
726
+ if (class_certainty (Choice) - BestCertainty (BestChoices) < Threshold)
727
+ return;
728
+ }
729
+
730
+ /* see if a choice with the same text string has already been found */
731
+ NewChoice = NULL;
732
+ Choices = BestChoices;
733
+ iterate(Choices) {
734
+ if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices))) {
735
+ if (class_probability (Choice) < BestRating (Choices))
736
+ NewChoice = (VIABLE_CHOICE) first_node (Choices);
737
+ else
738
+ return;
739
+ }
740
+ }
741
+
742
+ if (NewChoice) {
743
+ ReplaceDuplicateChoice(NewChoice, Choice, AdjustFactor, Certainties);
744
+ BestChoices = delete_d (BestChoices, NewChoice, is_same_node);
745
+ }
746
+ else {
747
+ NewChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
748
+ }
749
+
750
+ BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
751
+ if (StopperDebugLevel >= 2)
752
+ PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
753
+ if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
754
+ Choices =
755
+ (LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
756
+ destroy_nodes (rest (Choices), Efree);
757
+ set_rest(Choices, NIL);
758
+ }
759
+
760
+ } /* LogNewWordChoice */
761
+
762
+
763
+ /*---------------------------------------------------------------------------*/
764
+ static AMBIG_TABLE *AmbigFor = NULL;
765
+
766
+ int NoDangerousAmbig(const char *Word,
767
+ const char *Word_lengths,
768
+ DANGERR *fixpt) {
769
+ /*
770
+ ** Parameters:
771
+ ** Word word to check for dangerous ambiguities
772
+ ** Word_lengths lengths of unichars in Word
773
+ ** Globals: none
774
+ ** Operation: This word checks each letter in word against a list
775
+ ** of potentially ambiguous characters. If a match is found
776
+ ** that letter is replaced with its ambiguity and tested in
777
+ ** the dictionary. If the ambiguous word is found in the
778
+ ** dictionary, FALSE is returned. Otherwise, the search
779
+ ** continues for other ambiguities. If no ambiguities that
780
+ ** match in the dictionary are found, TRUE is returned.
781
+ ** Return: TRUE if Word contains no dangerous ambiguities.
782
+ ** Exceptions: none
783
+ ** History: Mon May 6 16:28:56 1991, DSJ, Created.
784
+ */
785
+
786
+ char NewWord[MAX_WERD_SIZE * UNICHAR_LEN + 1];
787
+ char *NextNewChar;
788
+ int bad_index = 0;
789
+
790
+ if (!AmbigFor)
791
+ AmbigFor = FillAmbigTable ();
792
+
793
+ NextNewChar = NewWord;
794
+ while (*Word)
795
+ if (AmbigsFound (NewWord, NextNewChar,
796
+ Word + *Word_lengths, Word_lengths + 1,
797
+ AmbigFor[unicharset.unichar_to_id(Word, *Word_lengths)],
798
+ fixpt)) {
799
+ if (fixpt != NULL)
800
+ fixpt->index = bad_index;
801
+ return (FALSE);
802
+ }
803
+ else {
804
+ strncpy(NextNewChar, Word, *Word_lengths);
805
+ NextNewChar += *Word_lengths;
806
+ Word += *Word_lengths;
807
+ Word_lengths++;
808
+ bad_index++;
809
+ }
810
+
811
+ return (TRUE);
812
+
813
+ } /* NoDangerousAmbig */
814
+
815
+ void EndDangerousAmbigs() {
816
+ if (AmbigFor != NULL) {
817
+ for (int i = 0; i <= MAX_CLASS_ID; ++i) {
818
+ destroy_nodes(AmbigFor[i], Efree);
819
+ }
820
+ Efree(AmbigFor);
821
+ AmbigFor = NULL;
822
+ }
823
+ }
824
+
825
+ /*---------------------------------------------------------------------------*/
826
+ void SettupStopperPass1() {
827
+ /*
828
+ ** Parameters: none
829
+ ** Globals:
830
+ ** RejectOffset offset allowed before word is rejected
831
+ ** Operation: This routine performs any settup of stopper variables
832
+ ** that is needed in preparation for the first pass.
833
+ ** Return: none
834
+ ** Exceptions: none
835
+ ** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
836
+ */
837
+ RejectOffset = 0.0;
838
+ } /* SettupStopperPass1 */
839
+
840
+
841
+ /*---------------------------------------------------------------------------*/
842
+ void SettupStopperPass2() {
843
+ /*
844
+ ** Parameters: none
845
+ ** Globals:
846
+ ** RejectOffset offset allowed before word is rejected
847
+ ** Operation: This routine performs any settup of stopper variables
848
+ ** that is needed in preparation for the second pass.
849
+ ** Return: none
850
+ ** Exceptions: none
851
+ ** History: Mon Jun 3 12:32:00 1991, DSJ, Created.
852
+ */
853
+ RejectOffset = RejectCertaintyOffset;
854
+ } /* SettupStopperPass2 */
855
+
856
+
857
+ /**----------------------------------------------------------------------------
858
+ Private Code
859
+ ----------------------------------------------------------------------------**/
860
+ /*---------------------------------------------------------------------------*/
861
+ void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
862
+ /*
863
+ ** Parameters:
864
+ ** Choice choice to add a new chunk to
865
+ ** Blob index of blob being split
866
+ ** Globals: none
867
+ ** Operation: This routine increments the chunk count of the character
868
+ ** in Choice which corresponds to Blob.
869
+ ** Return: none
870
+ ** Exceptions: none
871
+ ** History: Mon May 20 11:43:27 1991, DSJ, Created.
872
+ */
873
+ int i, LastChunk;
874
+
875
+ for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
876
+ LastChunk += Choice->Blob[i].NumChunks;
877
+ if (Blob < LastChunk) {
878
+ (Choice->Blob[i].NumChunks)++;
879
+ return;
880
+ }
881
+ }
882
+ mem_tidy (1);
883
+ cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
884
+ Choice->Length, LastChunk, Blob);
885
+ assert(FALSE); /* this should never get executed */
886
+
887
+ } /* AddNewChunk */
888
+
889
+
890
+ /*---------------------------------------------------------------------------*/
891
+ int AmbigsFound(char *Word,
892
+ char *CurrentChar,
893
+ const char *Tail,
894
+ const char *Tail_lengths,
895
+ LIST Ambigs,
896
+ DANGERR *fixpt) {
897
+ /*
898
+ ** Parameters:
899
+ ** Word word being tested for ambiguities
900
+ ** CurrentChar position in Word to put ambig replacement
901
+ ** Tail end of word to place after ambiguity
902
+ ** Tail_lengths lengths of the unichars in Tail
903
+ ** Ambigs list of ambiguities to test at this position
904
+ ** Globals: none
905
+ ** Operation: For each ambiguity in Ambigs, see if the remainder of
906
+ ** the test string matches the start of Tail. If it does,
907
+ ** construct a word consisting of the contents of Word up to,
908
+ ** but not including, CurrentChar followed by the replacement
909
+ ** string for the ambiguity followed by the unmatched
910
+ ** contents of Tail. Then test this word to see if it
911
+ ** is a dictionary word. If it is return TRUE. If none of
912
+ ** the ambiguities result in a dictionary word, return FALSE.
913
+ ** Return: TRUE if the Word is ambiguous at the specified position
914
+ ** Exceptions: none
915
+ ** History: Thu May 9 10:10:28 1991, DSJ, Created.
916
+ */
917
+ AMBIG_SPEC *AmbigSpec;
918
+ char *ambig;
919
+ char *ambig_lengths;
920
+ const char *UnmatchedTail;
921
+ const char *UnmatchedTail_lengths;
922
+ int Matches;
923
+ int bad_length;
924
+
925
+ iterate(Ambigs) {
926
+ AmbigSpec = (AMBIG_SPEC *) first_node (Ambigs);
927
+ ambig = AmbigSpec->ambig;
928
+ ambig_lengths = AmbigSpec->lengths;
929
+ bad_length = 1;
930
+ UnmatchedTail = Tail;
931
+ UnmatchedTail_lengths = Tail_lengths;
932
+ Matches = TRUE;
933
+
934
+ while (*ambig != ' ' && Matches)
935
+ if (*UnmatchedTail_lengths == *ambig_lengths &&
936
+ strncmp(ambig, UnmatchedTail, *ambig_lengths) == 0) {
937
+ ambig += *(ambig_lengths++);
938
+ UnmatchedTail += *(UnmatchedTail_lengths++);
939
+ bad_length++;
940
+ }
941
+ else
942
+ Matches = FALSE;
943
+
944
+ if (Matches) {
945
+ ambig += *(ambig_lengths++); /* skip over the space */
946
+ /* insert replacement string */
947
+ strcpy(CurrentChar, ambig);
948
+ /* add tail */
949
+ strcat(Word, UnmatchedTail);
950
+ if (valid_word (Word)) {
951
+ if (StopperDebugLevel >= 1)
952
+ cprintf ("Stopper: Possible ambiguous word = %s\n", Word);
953
+ if (fixpt != NULL) {
954
+ fixpt->good_length = strlen (ambig_lengths);
955
+ fixpt->bad_length = bad_length;
956
+ }
957
+ return (TRUE);
958
+ }
959
+ }
960
+ }
961
+ return (FALSE);
962
+
963
+ } /* AmbigsFound */
964
+
965
+
966
+ /*---------------------------------------------------------------------------*/
967
+ int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice) {
968
+ /*
969
+ ** Parameters:
970
+ ** Choice choice to compare to ViableChoice
971
+ ** ViableChoice viable choice to compare to Choice
972
+ ** Globals: none
973
+ ** Operation: This routine compares the corresponding strings of
974
+ ** Choice and ViableChoice and returns TRUE if they are the
975
+ ** same, FALSE otherwise.
976
+ ** Return: TRUE or FALSE.
977
+ ** Exceptions: none
978
+ ** History: Fri May 17 08:48:04 1991, DSJ, Created.
979
+ */
980
+ return (StringSameAs (class_string (Choice), class_lengths (Choice),
981
+ ViableChoice));
982
+
983
+ } /* ChoiceSameAs */
984
+
985
+
986
+ /*---------------------------------------------------------------------------*/
987
+ int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
988
+ void *arg2) { //VIABLE_CHOICE Choice2)
989
+ /*
990
+ ** Parameters:
991
+ ** Choice1, Choice2 choices to compare ratings for
992
+ ** Globals: none
993
+ ** Operation: Return -1 if the rating for Choice1 is less than the
994
+ ** rating for Choice2, otherwise return (1).
995
+ ** Return: -1 or 1
996
+ ** Exceptions: none
997
+ ** History: Wed May 15 13:02:37 1991, DSJ, Created.
998
+ */
999
+ float R1, R2;
1000
+ VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
1001
+ VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
1002
+
1003
+ R1 = Choice1->Rating;
1004
+ R2 = Choice2->Rating;
1005
+
1006
+ if (R1 < R2)
1007
+ return (-1);
1008
+ else
1009
+ return (1);
1010
+
1011
+ } /* CmpChoiceRatings */
1012
+
1013
+
1014
+ /*---------------------------------------------------------------------------*/
1015
+ void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
1016
+ /*
1017
+ ** Parameters:
1018
+ ** Choice choice to be expanded
1019
+ ** ExpandedChoice place to put resulting expanded choice
1020
+ ** Globals: none
1021
+ ** Operation: This routine expands Choice and places the results
1022
+ ** in ExpandedChoice. The primary function of expansion
1023
+ ** is to create an two arrays, one which holds the corresponding
1024
+ ** certainty for each chunk in Choice, and one which holds
1025
+ ** the class for each chunk.
1026
+ ** Return: none (results are placed in ExpandedChoice)
1027
+ ** Exceptions: none
1028
+ ** History: Fri May 31 15:21:57 1991, DSJ, Created.
1029
+ */
1030
+ int i, j, Chunk;
1031
+
1032
+ ExpandedChoice->Choice = Choice;
1033
+ for (i = 0, Chunk = 0; i < Choice->Length; i++)
1034
+ for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
1035
+ ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
1036
+ ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
1037
+ }
1038
+ } /* ExpandChoice */
1039
+
1040
+
1041
+ /*---------------------------------------------------------------------------*/
1042
+ AMBIG_TABLE *FillAmbigTable() {
1043
+ /*
1044
+ ** Parameters: none
1045
+ ** Globals:
1046
+ ** DangerousAmbigs filename of dangerous ambig info
1047
+ ** Operation: This routine allocates a new ambiguity table and fills
1048
+ ** it in from the file specified by DangerousAmbigs. An
1049
+ ** ambiguity table is an array of lists. The array is indexed
1050
+ ** by a class id. Therefore, each entry in the table provides
1051
+ ** a list of potential ambiguities which can start with the
1052
+ ** corresponding character. Each potential ambiguity is
1053
+ ** described by a string which contains the remainder of the
1054
+ ** test string followed by a space followed by the replacement
1055
+ ** string. For example the ambiguity "rn -> m", would be
1056
+ ** located in the table at index 'r'. The string corresponding
1057
+ ** to this ambiguity would be "n m".
1058
+ ** Return: Pointer to new ambiguity table.
1059
+ ** Exceptions: none
1060
+ ** History: Thu May 9 09:20:57 1991, DSJ, Created.
1061
+ */
1062
+ FILE *AmbigFile;
1063
+ AMBIG_TABLE *NewTable;
1064
+ int i;
1065
+ int AmbigPartSize;
1066
+ char buffer[256 * UNICHAR_LEN];
1067
+ char TestString[256 * UNICHAR_LEN];
1068
+ char TestString_lengths[256];
1069
+ char ReplacementString[256 * UNICHAR_LEN];
1070
+ char ReplacementString_lengths[256];
1071
+ STRING name;
1072
+ char lengths[2];
1073
+ AMBIG_SPEC *AmbigSpec;
1074
+ UNICHAR_ID unichar_id;
1075
+
1076
+ lengths[1] = 0;
1077
+
1078
+ name = language_data_path_prefix;
1079
+ name += DangerousAmbigs;
1080
+ AmbigFile = Efopen (name.string(), "r");
1081
+ NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));
1082
+
1083
+ for (i = 0; i <= MAX_CLASS_ID; i++)
1084
+ NewTable[i] = NIL;
1085
+
1086
+ while (fscanf (AmbigFile, "%d", &AmbigPartSize) == 1) {
1087
+ TestString[0] = '\0';
1088
+ TestString_lengths[0] = 0;
1089
+ ReplacementString[0] = '\0';
1090
+ ReplacementString_lengths[0] = 0;
1091
+ bool illegal_char = false;
1092
+ for (i = 0; i < AmbigPartSize; ++i) {
1093
+ fscanf (AmbigFile, "%s", buffer);
1094
+ strcat(TestString, buffer);
1095
+ lengths[0] = strlen(buffer);
1096
+ strcat(TestString_lengths, lengths);
1097
+ if (!unicharset.contains_unichar(buffer))
1098
+ illegal_char = true;
1099
+ }
1100
+ fscanf (AmbigFile, "%d", &AmbigPartSize);
1101
+ for (i = 0; i < AmbigPartSize; ++i) {
1102
+ fscanf (AmbigFile, "%s", buffer);
1103
+ strcat(ReplacementString, buffer);
1104
+ lengths[0] = strlen(buffer);
1105
+ strcat(ReplacementString_lengths, lengths);
1106
+ if (!unicharset.contains_unichar(buffer))
1107
+ illegal_char = true;
1108
+ }
1109
+
1110
+ if (strlen (TestString_lengths) > MAX_AMBIG_SIZE ||
1111
+ strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE)
1112
+ DoError (0, "Illegal ambiguity specification!");
1113
+ if (illegal_char) {
1114
+ continue;
1115
+ }
1116
+
1117
+ AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));
1118
+
1119
+ strcpy(AmbigSpec->ambig, TestString + TestString_lengths[0]);
1120
+ strcat(AmbigSpec->ambig, " ");
1121
+ strcat(AmbigSpec->ambig, ReplacementString);
1122
+
1123
+ strcpy(AmbigSpec->lengths, TestString_lengths + 1);
1124
+ lengths[0] = 1;
1125
+ strcat(AmbigSpec->lengths, lengths);
1126
+ strcat(AmbigSpec->lengths, ReplacementString_lengths);
1127
+ unichar_id = unicharset.unichar_to_id(TestString, TestString_lengths[0]);
1128
+ NewTable[unichar_id] = push_last (NewTable[unichar_id], AmbigSpec);
1129
+ }
1130
+
1131
+ fclose(AmbigFile);
1132
+ return (NewTable);
1133
+
1134
+ } /* FillAmbigTable */
1135
+
1136
+
1137
+ /*---------------------------------------------------------------------------*/
1138
+ int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
1139
+ void *item2) { //EXPANDED_CHOICE *BestChoice)
1140
+ /*
1141
+ ** Parameters:
1142
+ ** Choice choice to be tested
1143
+ ** BestChoice best choice found
1144
+ ** Globals:
1145
+ ** AmbigThresholdGain
1146
+ ** AmbigThresholdOffset
1147
+ ** Operation: If the certainty of any chunk in Choice is not ambiguous
1148
+ ** with the corresponding chunk in the best choice, free
1149
+ ** Choice and return TRUE. Otherwise, return FALSE.
1150
+ ** Return: TRUE or FALSE.
1151
+ ** Exceptions: none
1152
+ ** History: Wed May 15 13:20:26 1991, DSJ, Created.
1153
+ */
1154
+ int i, j, Chunk;
1155
+ FLOAT32 Threshold;
1156
+ VIABLE_CHOICE Choice;
1157
+ EXPANDED_CHOICE *BestChoice;
1158
+
1159
+ Choice = (VIABLE_CHOICE) item1;
1160
+ BestChoice = (EXPANDED_CHOICE *) item2;
1161
+
1162
+ Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
1163
+ Choice->AdjustFactor);
1164
+
1165
+ for (i = 0, Chunk = 0; i < Choice->Length; i++)
1166
+ for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
1167
+ if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
1168
+ Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
1169
+ Threshold) {
1170
+ memfree(Choice);
1171
+ return (TRUE);
1172
+ }
1173
+
1174
+ return (FALSE);
1175
+
1176
+ } /* FreeBadChoice */
1177
+
1178
+
1179
+ /*---------------------------------------------------------------------------*/
1180
+ int LengthOfShortestAlphaRun(register char *Word, const char *Word_lengths) {
1181
+ /*
1182
+ ** Parameters:
1183
+ ** Word word to be tested
1184
+ ** Word_lengths lengths of the unichars in Word
1185
+ ** Globals: none
1186
+ ** Operation: Return the length of the shortest alpha run in Word.
1187
+ ** Return: Return the length of the shortest alpha run in Word.
1188
+ ** Exceptions: none
1189
+ ** History: Tue May 14 07:50:45 1991, DSJ, Created.
1190
+ */
1191
+ register int Shortest = MAX_INT32;
1192
+ register int Length;
1193
+
1194
+ for (; *Word; Word += *(Word_lengths++))
1195
+ if (unicharset.get_isalpha(Word, *Word_lengths)) {
1196
+ for (Length = 1, Word += *(Word_lengths++);
1197
+ *Word && unicharset.get_isalpha(Word, *Word_lengths);
1198
+ Word += *(Word_lengths++), Length++);
1199
+ if (Length < Shortest)
1200
+ Shortest = Length;
1201
+
1202
+ if (*Word == 0)
1203
+ break;
1204
+ }
1205
+ if (Shortest == MAX_INT32)
1206
+ Shortest = 0;
1207
+
1208
+ return (Shortest);
1209
+
1210
+ } /* LengthOfShortestAlphaRun */
1211
+
1212
+
1213
+ /*---------------------------------------------------------------------------*/
1214
+ VIABLE_CHOICE
1215
+ NewViableChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
1216
+ /*
1217
+ ** Parameters:
1218
+ ** Choice choice to be converted to a viable choice
1219
+ ** AdjustFactor factor used to adjust ratings for Choice
1220
+ ** Certainties certainty for each character in Choice
1221
+ ** Globals:
1222
+ ** CurrentSegmentation segmentation corresponding to Choice
1223
+ ** Operation: Allocate a new viable choice data structure, copy
1224
+ ** Choice, Certainties, and CurrentSegmentation into it,
1225
+ ** and return a pointer to it.
1226
+ ** Return: Ptr to new viable choice.
1227
+ ** Exceptions: none
1228
+ ** History: Thu May 16 15:28:29 1991, DSJ, Created.
1229
+ */
1230
+ VIABLE_CHOICE NewChoice;
1231
+ int Length;
1232
+ char *Word;
1233
+ char *Word_lengths;
1234
+ CHAR_CHOICE *NewChar;
1235
+ BLOB_WIDTH *BlobWidth;
1236
+
1237
+ Length = strlen (class_lengths (Choice));
1238
+ assert (Length <= MAX_NUM_CHUNKS && Length > 0);
1239
+
1240
+ NewChoice = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) +
1241
+ (Length - 1) * sizeof (CHAR_CHOICE));
1242
+
1243
+ NewChoice->Rating = class_probability (Choice);
1244
+ NewChoice->Certainty = class_certainty (Choice);
1245
+ NewChoice->AdjustFactor = AdjustFactor;
1246
+ NewChoice->Length = Length;
1247
+ for (Word = class_string (Choice),
1248
+ Word_lengths = class_lengths (Choice),
1249
+ NewChar = &(NewChoice->Blob[0]),
1250
+ BlobWidth = CurrentSegmentation;
1251
+ *Word;
1252
+ Word += *(Word_lengths++), NewChar++, Certainties++, BlobWidth++) {
1253
+ NewChar->Class = unicharset.unichar_to_id(Word, *Word_lengths);
1254
+ NewChar->NumChunks = *BlobWidth;
1255
+ NewChar->Certainty = *Certainties;
1256
+ }
1257
+
1258
+ return (NewChoice);
1259
+
1260
+ } /* NewViableChoice */
1261
+
1262
+
1263
+ /*---------------------------------------------------------------------------*/
1264
+ void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
1265
+ /*
1266
+ ** Parameters:
1267
+ ** File open text file to print Choice to
1268
+ ** Label text label to be printed with Choice
1269
+ ** Choice choice to be printed
1270
+ ** Globals: none
1271
+ ** Operation: This routine dumps a text representation of the
1272
+ ** specified Choice to File.
1273
+ ** Return: none
1274
+ ** Exceptions: none
1275
+ ** History: Mon May 20 11:16:44 1991, DSJ, Created.
1276
+ */
1277
+ int i, j;
1278
+
1279
+ fprintf (File, "%s", Label);
1280
+
1281
+ fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f) ",
1282
+ Choice->Rating, Choice->Certainty, Choice->AdjustFactor);
1283
+
1284
+ for (i = 0; i < Choice->Length; i++)
1285
+ fprintf (File, "%s", unicharset.id_to_unichar(Choice->Blob[i].Class));
1286
+ fprintf (File, "\n");
1287
+
1288
+ for (i = 0; i < Choice->Length; i++) {
1289
+ fprintf (File, " %s", unicharset.id_to_unichar(Choice->Blob[i].Class));
1290
+ for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
1291
+ fprintf (File, " ");
1292
+ }
1293
+ fprintf (File, "\n");
1294
+
1295
+ for (i = 0; i < Choice->Length; i++) {
1296
+ for (j = 0; j < Choice->Blob[i].NumChunks; j++)
1297
+ fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0));
1298
+ }
1299
+ fprintf (File, "\n");
1300
+
1301
+ } /* PrintViableChoice */
1302
+
1303
+
1304
+ /*---------------------------------------------------------------------------*/
1305
+ void
1306
+ ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
1307
+ A_CHOICE * NewChoice,
1308
+ FLOAT32 AdjustFactor, float Certainties[]) {
1309
+ /*
1310
+ ** Parameters:
1311
+ ** OldChoice existing viable choice to be replaced
1312
+ ** NewChoice choice to replace OldChoice with
1313
+ ** AdjustFactor factor used to adjust ratings for OldChoice
1314
+ ** Certainties certainty for each character in OldChoice
1315
+ ** Globals:
1316
+ ** CurrentSegmentation segmentation for NewChoice
1317
+ ** Operation: This routine is used whenever a better segmentation (or
1318
+ ** contextual interpretation) is found for a word which already
1319
+ ** exists. The OldChoice is updated with the relevant
1320
+ ** information from the new choice. The text string itself
1321
+ ** does not need to be copied since, by definition, has not
1322
+ ** changed.
1323
+ ** Return: none
1324
+ ** Exceptions: none
1325
+ ** History: Fri May 17 13:35:58 1991, DSJ, Created.
1326
+ */
1327
+ char *Word;
1328
+ char *Word_lengths;
1329
+ CHAR_CHOICE *NewChar;
1330
+ BLOB_WIDTH *BlobWidth;
1331
+
1332
+ OldChoice->Rating = class_probability (NewChoice);
1333
+ OldChoice->Certainty = class_certainty (NewChoice);
1334
+ OldChoice->AdjustFactor = AdjustFactor;
1335
+
1336
+ for (Word = class_string (NewChoice),
1337
+ Word_lengths = class_lengths (NewChoice),
1338
+ NewChar = &(OldChoice->Blob[0]),
1339
+ BlobWidth = CurrentSegmentation;
1340
+ *Word;
1341
+ Word += *(Word_lengths++), NewChar++, Certainties++, BlobWidth++) {
1342
+ NewChar->NumChunks = *BlobWidth;
1343
+ NewChar->Certainty = *Certainties;
1344
+ }
1345
+ } /* ReplaceDuplicateChoice */
1346
+
1347
+
1348
+ /*---------------------------------------------------------------------------*/
1349
+ int StringSameAs(const char *String,
1350
+ const char *String_lengths,
1351
+ VIABLE_CHOICE ViableChoice) {
1352
+ /*
1353
+ ** Parameters:
1354
+ ** String string to compare to ViableChoice
1355
+ ** String_lengths lengths of unichars in String
1356
+ ** ViableChoice viable choice to compare to String
1357
+ ** Globals: none
1358
+ ** Operation: This routine compares String to ViableChoice and
1359
+ ** returns TRUE if they are the same, FALSE otherwise.
1360
+ ** Return: TRUE or FALSE.
1361
+ ** Exceptions: none
1362
+ ** History: Fri May 17 08:48:04 1991, DSJ, Created.
1363
+ */
1364
+ CHAR_CHOICE *Char;
1365
+ int i;
1366
+ int current_unichar_length;
1367
+
1368
+ for (Char = &(ViableChoice->Blob[0]), i = 0;
1369
+ i < ViableChoice->Length;
1370
+ String += *(String_lengths++), Char++, i++) {
1371
+ current_unichar_length = strlen(unicharset.id_to_unichar(Char->Class));
1372
+ if (current_unichar_length != *String_lengths ||
1373
+ strncmp(String, unicharset.id_to_unichar(Char->Class),
1374
+ current_unichar_length) != 0)
1375
+ return (FALSE);
1376
+ }
1377
+
1378
+ if (*String == 0)
1379
+ return (TRUE);
1380
+ else
1381
+ return (FALSE);
1382
+
1383
+ } /* StringSameAs */
1384
+
1385
+
1386
+ /*---------------------------------------------------------------------------*/
1387
+ int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice) {
1388
+ /*
1389
+ ** Parameters:
1390
+ ** Choices choices for current segmentation
1391
+ ** BestChoice best choice for current segmentation
1392
+ ** Globals:
1393
+ ** CertaintyVariation max allowed certainty variation
1394
+ ** Operation: This routine returns TRUE if the certainty of the
1395
+ ** BestChoice word is within a reasonable range of the average
1396
+ ** certainties for the best choices for each character in
1397
+ ** the segmentation. This test is used to catch words in which
1398
+ ** one character is much worse than the other characters in
1399
+ ** the word (i.e. FALSE will be returned in that case).
1400
+ ** The algorithm computes the mean and std deviation of the
1401
+ ** certainties in the word with the worst certainty thrown out.
1402
+ ** Return: TRUE or FALSE.
1403
+ ** Exceptions: none
1404
+ ** History: Tue May 14 08:23:21 1991, DSJ, Created.
1405
+ */
1406
+ int i;
1407
+ CHOICES CharChoices;
1408
+ float Certainty;
1409
+ float WorstCertainty = MAX_FLOAT32;
1410
+ float CertaintyThreshold;
1411
+ FLOAT64 TotalCertainty;
1412
+ FLOAT64 TotalCertaintySquared;
1413
+ FLOAT64 Variance;
1414
+ FLOAT32 Mean, StdDev;
1415
+ int WordLength;
1416
+
1417
+ WordLength = array_count (Choices);
1418
+ if (WordLength < 3)
1419
+ return (TRUE);
1420
+
1421
+ TotalCertainty = TotalCertaintySquared = 0.0;
1422
+ for_each_choice(Choices, i) {
1423
+ CharChoices = (CHOICES) array_index (Choices, i);
1424
+ Certainty = best_certainty (CharChoices);
1425
+ TotalCertainty += Certainty;
1426
+ TotalCertaintySquared += Certainty * Certainty;
1427
+ if (Certainty < WorstCertainty)
1428
+ WorstCertainty = Certainty;
1429
+ }
1430
+
1431
+ /* subtract off worst certainty from statistics */
1432
+ WordLength--;
1433
+ TotalCertainty -= WorstCertainty;
1434
+ TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1435
+
1436
+ Mean = TotalCertainty / WordLength;
1437
+ Variance = ((WordLength * TotalCertaintySquared -
1438
+ TotalCertainty * TotalCertainty) /
1439
+ (WordLength * (WordLength - 1)));
1440
+ if (Variance < 0.0)
1441
+ Variance = 0.0;
1442
+ StdDev = sqrt (Variance);
1443
+
1444
+ CertaintyThreshold = Mean - CertaintyVariation * StdDev;
1445
+ if (CertaintyThreshold > NonDictCertainty)
1446
+ CertaintyThreshold = NonDictCertainty;
1447
+
1448
+ if (class_certainty (BestChoice) < CertaintyThreshold) {
1449
+ if (StopperDebugLevel >= 1)
1450
+ cprintf
1451
+ ("Stopper: Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n",
1452
+ class_certainty (BestChoice), Mean, StdDev, CertaintyThreshold);
1453
+ return (FALSE);
1454
+ }
1455
+ else
1456
+ return (TRUE);
1457
+
1458
+ } /* UniformCertainties */