tesseract_bin 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (612) hide show
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,2628 @@
1
+ /**********************************************************************
2
+ * File: makerow.cpp (Formerly makerows.c)
3
+ * Description: Code to arrange blobs into rows of text.
4
+ * Author: Ray Smith
5
+ * Created: Mon Sep 21 14:34:48 BST 1992
6
+ *
7
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #ifdef __UNIX__
22
+ #include <assert.h>
23
+ #endif
24
+ #include "stderr.h"
25
+ #include "blobbox.h"
26
+ #include "lmedsq.h"
27
+ #include "statistc.h"
28
+ #include "drawtord.h"
29
+ #include "blkocc.h"
30
+ #include "sortflts.h"
31
+ #include "oldbasel.h"
32
+ #include "tordmain.h"
33
+ #include "underlin.h"
34
+ #include "makerow.h"
35
+ #include "tprintf.h"
36
+
37
+ #define EXTERN
38
+
39
+ EXTERN BOOL_VAR (textord_heavy_nr, FALSE, "Vigorously remove noise");
40
+ EXTERN BOOL_VAR (textord_show_initial_rows, FALSE,
41
+ "Display row accumulation");
42
+ EXTERN BOOL_VAR (textord_show_parallel_rows, FALSE,
43
+ "Display page correlated rows");
44
+ EXTERN BOOL_VAR (textord_show_expanded_rows, FALSE,
45
+ "Display rows after expanding");
46
+ EXTERN BOOL_VAR (textord_show_final_rows, FALSE,
47
+ "Display rows after final fittin");
48
+ EXTERN BOOL_VAR (textord_show_final_blobs, FALSE,
49
+ "Display blob bounds after pre-ass");
50
+ EXTERN BOOL_VAR (textord_test_landscape, FALSE, "Tests refer to land/port");
51
+ EXTERN BOOL_VAR (textord_parallel_baselines, TRUE,
52
+ "Force parallel baselines");
53
+ EXTERN BOOL_VAR (textord_straight_baselines, FALSE,
54
+ "Force straight baselines");
55
+ EXTERN BOOL_VAR (textord_quadratic_baselines, FALSE, "Use quadratic splines");
56
+ EXTERN BOOL_VAR (textord_old_baselines, TRUE, "Use old baseline algorithm");
57
+ EXTERN BOOL_VAR (textord_old_xheight, TRUE, "Use old xheight algorithm");
58
+ EXTERN BOOL_VAR (textord_fix_xheight_bug, TRUE, "Use spline baseline");
59
+ EXTERN BOOL_VAR (textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
60
+ EXTERN BOOL_VAR (textord_row_xheights, FALSE, "Use row height policy");
61
+ EXTERN BOOL_VAR (textord_block_xheights, TRUE, "Use block height policy");
62
+ EXTERN BOOL_VAR (textord_xheight_tweak, FALSE, "New min condition on height");
63
+ EXTERN BOOL_VAR (textord_cblob_blockocc, TRUE,
64
+ "Use new projection for underlines");
65
+ EXTERN BOOL_VAR (textord_debug_xheights, FALSE, "Test xheight algorithms");
66
+ EXTERN BOOL_VAR (textord_biased_skewcalc, TRUE,
67
+ "Bias skew estimates with line length");
68
+ EXTERN BOOL_VAR (textord_interpolating_skew, TRUE, "Interpolate across gaps");
69
+ EXTERN INT_VAR (textord_skewsmooth_offset, 2, "For smooth factor");
70
+ EXTERN INT_VAR (textord_skewsmooth_offset2, 1, "For smooth factor");
71
+ EXTERN INT_VAR (textord_test_x, -1, "coord of test pt");
72
+ EXTERN INT_VAR (textord_test_y, -1, "coord of test pt");
73
+ EXTERN INT_VAR (textord_min_blobs_in_row, 4,
74
+ "Min blobs before gradient counted");
75
+ EXTERN INT_VAR (textord_spline_minblobs, 8,
76
+ "Min blobs in each spline segment");
77
+ EXTERN INT_VAR (textord_spline_medianwin, 6,
78
+ "Size of window for spline segmentation");
79
+ EXTERN INT_VAR (textord_min_xheight, 10, "Min credible pixel xheight");
80
+ EXTERN double_VAR (textord_spline_shift_fraction, 0.02,
81
+ "Fraction of line spacing for quad");
82
+ EXTERN double_VAR (textord_spline_outlier_fraction, 0.1,
83
+ "Fraction of line spacing for outlier");
84
+ EXTERN double_VAR (textord_skew_ile, 0.5, "Ile of gradients for page skew");
85
+ EXTERN double_VAR (textord_skew_lag, 0.01,
86
+ "Lag for skew on row accumulation");
87
+ EXTERN double_VAR (textord_linespace_iqrlimit, 0.2,
88
+ "Max iqr/median for linespace");
89
+ EXTERN double_VAR (textord_width_limit, 8, "Max width of blobs to make rows");
90
+ EXTERN double_VAR (textord_chop_width, 1.5, "Max width before chopping");
91
+ EXTERN double_VAR (textord_expansion_factor, 1.0,
92
+ "Factor to expand rows by in expand_rows");
93
+ EXTERN double_VAR (textord_overlap_x, 0.5,
94
+ "Fraction of linespace for good overlap");
95
+ EXTERN double_VAR (textord_merge_desc, 0.25,
96
+ "Fraction of linespace for desc drop");
97
+ EXTERN double_VAR (textord_merge_x, 0.5,
98
+ "Fraction of linespace for x height");
99
+ EXTERN double_VAR (textord_merge_asc, 0.25,
100
+ "Fraction of linespace for asc height");
101
+ EXTERN double_VAR (textord_minxh, 0.25,
102
+ "fraction of linesize for min xheight");
103
+ EXTERN double_VAR (textord_min_linesize, 1.25,
104
+ "* blob height for initial linesize");
105
+ EXTERN double_VAR (textord_excess_blobsize, 1.3,
106
+ "New row made if blob makes row this big");
107
+ EXTERN double_VAR (textord_occupancy_threshold, 0.4,
108
+ "Fraction of neighbourhood");
109
+ EXTERN double_VAR (textord_underline_width, 2.0,
110
+ "Multiple of line_size for underline");
111
+ EXTERN double_VAR (textord_xheight_mode_fraction, 0.4,
112
+ "Min pile height to make xheight");
113
+ EXTERN double_VAR (textord_ascheight_mode_fraction, 0.15,
114
+ "Min pile height to make ascheight");
115
+ EXTERN double_VAR (textord_ascx_ratio_min, 1.2, "Min cap/xheight");
116
+ EXTERN double_VAR (textord_ascx_ratio_max, 1.7, "Max cap/xheight");
117
+ EXTERN double_VAR (textord_descx_ratio_min, 0.15, "Min desc/xheight");
118
+ EXTERN double_VAR (textord_descx_ratio_max, 0.6, "Max desc/xheight");
119
+ EXTERN double_VAR (textord_xheight_error_margin, 0.1, "Accepted variation");
120
+
121
+ #define MAX_HEIGHT_MODES 12
122
+
123
+ /**********************************************************************
124
+ * make_rows
125
+ *
126
+ * Arrange the blobs into rows.
127
+ **********************************************************************/
128
+
129
+ float make_rows( //make rows
130
+ ICOORD page_tr, //top right
131
+ BLOCK_LIST *blocks, //block list
132
+ TO_BLOCK_LIST *land_blocks, //rotated for landscape
133
+ TO_BLOCK_LIST *port_blocks //output list
134
+ ) {
135
+ float port_m; //global skew
136
+ float port_err; //global noise
137
+ // float land_m; //global skew
138
+ // float land_err; //global noise
139
+ TO_BLOCK_IT block_it; //iterator
140
+
141
+ //don't do landscape for now
142
+ // block_it.set_to_list(land_blocks);
143
+ // for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
144
+ // make_initial_textrows(page_tr,block_it.data(),FCOORD(0,-1),
145
+ // (BOOL8)textord_test_landscape);
146
+ block_it.set_to_list (port_blocks);
147
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
148
+ block_it.forward ())
149
+ make_initial_textrows (page_tr, block_it.data (), FCOORD (1.0f, 0.0f),
150
+ !(BOOL8) textord_test_landscape);
151
+ //compute globally
152
+ compute_page_skew(port_blocks, port_m, port_err);
153
+ // compute_page_skew(land_blocks,land_m,land_err); //compute globally
154
+ // tprintf("Portrait skew gradient=%g, error=%g.\n",
155
+ // port_m,port_err);
156
+ // tprintf("Landscape skew gradient=%g, error=%g.\n",
157
+ // land_m,land_err);
158
+ block_it.set_to_list (port_blocks);
159
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
160
+ block_it.forward ()) {
161
+ cleanup_rows (page_tr, block_it.data (), port_m, FCOORD (1.0f, 0.0f),
162
+ block_it.data ()->block->bounding_box ().left (),
163
+ !(BOOL8) textord_test_landscape);
164
+ }
165
+ block_it.set_to_list (land_blocks);
166
+ // for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
167
+ // {
168
+ // cleanup_rows(page_tr,block_it.data(),land_m,FCOORD(0,-1),
169
+ // -block_it.data()->block->bounding_box().top(),
170
+ // (BOOL8)textord_test_landscape);
171
+ // }
172
+ return port_m; //global skew
173
+ }
174
+
175
+
176
+ /**********************************************************************
177
+ * make_initial_textrows
178
+ *
179
+ * Arrange the good blobs into rows of text.
180
+ **********************************************************************/
181
+
182
+ void make_initial_textrows( //find lines
183
+ ICOORD page_tr,
184
+ TO_BLOCK *block, //block to do
185
+ FCOORD rotation, //for drawing
186
+ BOOL8 testing_on //correct orientation
187
+ ) {
188
+ TO_ROW_IT row_it = block->get_rows ();
189
+
190
+ #ifndef GRAPHICS_DISABLED
191
+ ScrollView::Color colour; //of row
192
+
193
+ if (textord_show_initial_rows && testing_on) {
194
+ if (to_win == NULL)
195
+ create_to_win(page_tr);
196
+ }
197
+ #endif
198
+ //guess skew
199
+ assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
200
+ row_it.move_to_first ();
201
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
202
+ fit_lms_line (row_it.data ());
203
+ #ifndef GRAPHICS_DISABLED
204
+ if (textord_show_initial_rows && testing_on) {
205
+ colour = ScrollView::RED;
206
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
207
+ plot_to_row (row_it.data (), colour, rotation);
208
+ colour = (ScrollView::Color) (colour + 1);
209
+ if (colour > ScrollView::MAGENTA)
210
+ colour = ScrollView::RED;
211
+ }
212
+ }
213
+ #endif
214
+ }
215
+
216
+
217
+ /**********************************************************************
218
+ * fit_lms_line
219
+ *
220
+ * Fit an LMS line to a row.
221
+ **********************************************************************/
222
+
223
+ void fit_lms_line( //sort function
224
+ TO_ROW *row //row to fit
225
+ ) {
226
+ float m, c; //fitted line
227
+ TBOX box; //blob box
228
+ LMS lms (row->blob_list ()->length ());
229
+ //blobs
230
+ BLOBNBOX_IT blob_it = row->blob_list ();
231
+
232
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
233
+ box = blob_it.data ()->bounding_box ();
234
+ lms.add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
235
+ }
236
+ lms.fit (m, c);
237
+ row->set_line (m, c, lms.error ());
238
+ }
239
+
240
+
241
+ /**********************************************************************
242
+ * compute_page_skew
243
+ *
244
+ * Compute the skew over a full page by selecting the textord_skew_ile
245
+ * quantile of the row gradients. The page error is the same quantile of the row errors.
246
+ **********************************************************************/
247
+
248
+ void compute_page_skew( //get average gradient
249
+ TO_BLOCK_LIST *blocks, //list of blocks
250
+ float &page_m, //average gradient
251
+ float &page_err //average error
252
+ ) {
253
+ inT32 row_count; //total rows
254
+ inT32 blob_count; //total_blobs
255
+ inT32 row_err; //integer error
256
+ float *gradients; //of rows
257
+ float *errors; //of rows
258
+ inT32 row_index; //of total
259
+ TO_ROW *row; //current row
260
+ TO_BLOCK_IT block_it = blocks; //iterator
261
+ TO_ROW_IT row_it;
262
+
263
+ row_count = 0;
264
+ blob_count = 0;
265
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
266
+ block_it.forward ()) {
267
+ row_count += block_it.data ()->get_rows ()->length ();
268
+ //count up rows
269
+ row_it.set_to_list (block_it.data ()->get_rows ());
270
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
271
+ blob_count += row_it.data ()->blob_list ()->length ();
272
+ }
273
+ if (row_count == 0) {
274
+ page_m = 0.0f;
275
+ page_err = 0.0f;
276
+ return;
277
+ }
278
+ gradients = (float *) alloc_mem (blob_count * sizeof (float));
279
+ //get mem
280
+ errors = (float *) alloc_mem (blob_count * sizeof (float));
281
+ if (gradients == NULL || errors == NULL)
282
+ MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
283
+
284
+ row_index = 0;
285
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
286
+ block_it.forward ()) {
287
+ row_it.set_to_list (block_it.data ()->get_rows ());
288
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
289
+ row = row_it.data ();
290
+ blob_count = row->blob_list ()->length ();
291
+ row_err = (inT32) ceil (row->line_error ());
292
+ if (row_err <= 0)
293
+ row_err = 1;
294
+ if (textord_biased_skewcalc) {
295
+ blob_count /= row_err;
296
+ for (blob_count /= row_err; blob_count > 0; blob_count--) {
297
+ gradients[row_index] = row->line_m ();
298
+ errors[row_index] = row->line_error ();
299
+ row_index++;
300
+ }
301
+ }
302
+ else if (blob_count >= textord_min_blobs_in_row) {
303
+ //get gradient
304
+ gradients[row_index] = row->line_m ();
305
+ errors[row_index] = row->line_error ();
306
+ row_index++;
307
+ }
308
+ }
309
+ }
310
+ if (row_index == 0) {
311
+ //desperate
312
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
313
+ block_it.forward ()) {
314
+ row_it.set_to_list (block_it.data ()->get_rows ());
315
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
316
+ row_it.forward ()) {
317
+ row = row_it.data ();
318
+ gradients[row_index] = row->line_m ();
319
+ errors[row_index] = row->line_error ();
320
+ row_index++;
321
+ }
322
+ }
323
+ }
324
+ row_count = row_index;
325
+ row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
326
+ gradients, row_count);
327
+ page_m = gradients[row_index];
328
+ row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
329
+ errors, row_count);
330
+ page_err = errors[row_index];
331
+ free_mem(gradients);
332
+ free_mem(errors);
333
+ }
334
+
335
// Blobs shorter than this fraction of the estimated xheight are noise candidates.
const double kNoiseSize = 0.5;
// Minimum blob height, in pixels, to be counted when estimating the xheight.
const int kMinSize = 8;
337
+
338
+ // Return true if the dot looks like it is part of the i.
339
+ // Doesn't work for any other diacritical.
340
+ static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
341
+ const TBOX& ibox = i->bounding_box();
342
+ const TBOX& dotbox = dot->bounding_box();
343
+
344
+ // Must overlap horizontally by enough and be high enough.
345
+ int overlap = MIN(dotbox.right(), ibox.right()) -
346
+ MAX(dotbox.left(), ibox.left());
347
+ if (ibox.height() <= 2 * dotbox.height() ||
348
+ (overlap * 2 < ibox.width() && overlap < dotbox.width()))
349
+ return false;
350
+
351
+ // If the i is tall and thin then it is good.
352
+ if (ibox.height() > ibox.width() * 2)
353
+ return true; // The i or ! must be tall and thin.
354
+
355
+ // It might still be tall and thin, but it might be joined to something.
356
+ // So search the outline for a piece of large height close to the edges
357
+ // of the dot.
358
+ const double kHeightFraction = 0.6;
359
+ double target_height = MIN(dotbox.bottom(), ibox.top());
360
+ target_height -= row->line_m()*dotbox.left() + row->line_c();
361
+ target_height *= kHeightFraction;
362
+ int left_min = dotbox.left() - dotbox.width();
363
+ int middle = (dotbox.left() + dotbox.right())/2;
364
+ int right_max = dotbox.right() + dotbox.width();
365
+ int left_miny = 0;
366
+ int left_maxy = 0;
367
+ int right_miny = 0;
368
+ int right_maxy = 0;
369
+ bool found_left = false;
370
+ bool found_right = false;
371
+ bool in_left = false;
372
+ bool in_right = false;
373
+ C_BLOB* blob = i->cblob();
374
+ C_OUTLINE_IT o_it = blob->out_list();
375
+ for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
376
+ C_OUTLINE* outline = o_it.data();
377
+ int length = outline->pathlength();
378
+ ICOORD pos = outline->start_pos();
379
+ for (int step = 0; step < length; pos += outline->step(step++)) {
380
+ int x = pos.x();
381
+ int y = pos.y();
382
+ if (x >= left_min && x < middle && !found_left) {
383
+ // We are in the left part so find min and max y.
384
+ if (in_left) {
385
+ if (y > left_maxy) left_maxy = y;
386
+ if (y < left_miny) left_miny = y;
387
+ } else {
388
+ left_maxy = left_miny = y;
389
+ in_left = true;
390
+ }
391
+ } else if (in_left) {
392
+ // We just left the left so look for size.
393
+ if (left_maxy - left_miny > target_height) {
394
+ if (found_right)
395
+ return true;
396
+ found_left = true;
397
+ }
398
+ in_left = false;
399
+ }
400
+ if (x <= right_max && x > middle && !found_right) {
401
+ // We are in the right part so find min and max y.
402
+ if (in_right) {
403
+ if (y > right_maxy) right_maxy = y;
404
+ if (y < right_miny) right_miny = y;
405
+ } else {
406
+ right_maxy = right_miny = y;
407
+ in_right = true;
408
+ }
409
+ } else if (in_right) {
410
+ // We just left the right so look for size.
411
+ if (right_maxy - right_miny > target_height) {
412
+ if (found_left)
413
+ return true;
414
+ found_right = true;
415
+ }
416
+ in_right = false;
417
+ }
418
+ }
419
+ }
420
+ return false;
421
+ }
422
+
423
+ static void vigorous_noise_removal(TO_BLOCK* block) {
424
+ TO_ROW_IT row_it = block->get_rows ();
425
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
426
+ TO_ROW* row = row_it.data();
427
+ BLOBNBOX_IT b_it = row->blob_list();
428
+ // Estimate the xheight on the row.
429
+ int max_height = 0;
430
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
431
+ BLOBNBOX* blob = b_it.data();
432
+ if (blob->bounding_box().height() > max_height)
433
+ max_height = blob->bounding_box().height();
434
+ }
435
+ STATS hstats(0, max_height + 1);
436
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
437
+ BLOBNBOX* blob = b_it.data();
438
+ int height = blob->bounding_box().height();
439
+ if (height >= kMinSize)
440
+ hstats.add(blob->bounding_box().height(), 1);
441
+ }
442
+ float xheight = hstats.median();
443
+ // Delete small objects.
444
+ BLOBNBOX* prev = NULL;
445
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
446
+ BLOBNBOX* blob = b_it.data();
447
+ const TBOX& box = blob->bounding_box();
448
+ if (box.height() < kNoiseSize * xheight) {
449
+ // Small so delete unless it looks like an i dot.
450
+ if (prev != NULL) {
451
+ if (dot_of_i(blob, prev, row))
452
+ continue; // Looks OK.
453
+ }
454
+ if (!b_it.at_last()) {
455
+ BLOBNBOX* next = b_it.data_relative(1);
456
+ if (dot_of_i(blob, next, row))
457
+ continue; // Looks OK.
458
+ }
459
+ // It might be noise so get rid of it.
460
+ if (blob->blob() != NULL)
461
+ delete blob->blob();
462
+ if (blob->cblob() != NULL)
463
+ delete blob->cblob();
464
+ delete b_it.extract();
465
+ } else {
466
+ prev = blob;
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ /**********************************************************************
473
+ * cleanup_rows
474
+ *
475
+ * Remove overlapping rows and fit all the blobs to what's left.
476
+ **********************************************************************/
477
+
478
+ void cleanup_rows( //find lines
479
+ ICOORD page_tr, //top right
480
+ TO_BLOCK *block, //block to do
481
+ float gradient, //gradient to fit
482
+ FCOORD rotation, //for drawing
483
+ inT32 block_edge, //edge of block
484
+ BOOL8 testing_on //correct orientation
485
+ ) {
486
+ //iterators
487
+ BLOBNBOX_IT blob_it = &block->blobs;
488
+ TO_ROW_IT row_it = block->get_rows ();
489
+
490
+ #ifndef GRAPHICS_DISABLED
491
+ if (textord_show_parallel_rows && testing_on) {
492
+ if (to_win == NULL)
493
+ create_to_win(page_tr);
494
+ }
495
+ #endif
496
+ //get row coords
497
+ fit_parallel_rows(block,
498
+ gradient,
499
+ rotation,
500
+ block_edge,
501
+ textord_show_parallel_rows &&testing_on);
502
+ delete_non_dropout_rows(block,
503
+ gradient,
504
+ rotation,
505
+ block_edge,
506
+ textord_show_parallel_rows &&testing_on);
507
+ expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
508
+ blob_it.set_to_list (&block->blobs);
509
+ row_it.set_to_list (block->get_rows ());
510
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
511
+ blob_it.add_list_after (row_it.data ()->blob_list ());
512
+ //give blobs back
513
+ assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
514
+ //now new rows must be genuine
515
+ blob_it.set_to_list (&block->blobs);
516
+ blob_it.add_list_after (&block->large_blobs);
517
+ assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
518
+ //safe to use big ones now
519
+ blob_it.set_to_list (&block->blobs);
520
+ //throw all blobs in
521
+ blob_it.add_list_after (&block->noise_blobs);
522
+ blob_it.add_list_after (&block->small_blobs);
523
+ assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
524
+ //no rows for noise
525
+ row_it.set_to_list (block->get_rows ());
526
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
527
+ row_it.data ()->blob_list ()->sort (blob_x_order);
528
+ fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
529
+ if (textord_heavy_nr) {
530
+ vigorous_noise_removal(block);
531
+ }
532
+ separate_underlines(block, gradient, rotation, testing_on);
533
+ pre_associate_blobs(page_tr, block, rotation, testing_on);
534
+
535
+ #ifndef GRAPHICS_DISABLED
536
+ if (textord_show_final_rows && testing_on) {
537
+ if (to_win == NULL)
538
+ create_to_win(page_tr);
539
+ }
540
+ #endif
541
+
542
+ fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
543
+ // textord_show_final_rows && testing_on);
544
+ make_spline_rows(block,
545
+ gradient,
546
+ rotation,
547
+ block_edge,
548
+ textord_show_final_rows &&testing_on);
549
+ if (!textord_old_xheight || !textord_old_baselines)
550
+ compute_block_xheight(block, gradient);
551
+ if (textord_restore_underlines)
552
+ //fix underlines
553
+ restore_underlined_blobs(block);
554
+ #ifndef GRAPHICS_DISABLED
555
+ if (textord_show_final_rows && testing_on) {
556
+ plot_blob_list (to_win, &block->blobs,
557
+ ScrollView::MAGENTA, ScrollView::WHITE);
558
+ //show discarded blobs
559
+ plot_blob_list (to_win, &block->underlines,
560
+ ScrollView::YELLOW, ScrollView::CORAL);
561
+ }
562
+ if (textord_show_final_rows && testing_on && block->blobs.length () > 0)
563
+ tprintf ("%d blobs discarded as noise\n", block->blobs.length ());
564
+ if (textord_show_final_rows && testing_on) {
565
+ draw_meanlines(block, gradient, block_edge, ScrollView::WHITE, rotation);
566
+ }
567
+ #endif
568
+ }
569
+
570
+
571
+ /**********************************************************************
572
+ * delete_non_dropout_rows
573
+ *
574
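+ * Delete rows that are too far from a dropout in the line occupation, or
+ * that have a better neighbour, and return all row blobs to the block.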
575
+ **********************************************************************/
576
+
577
+ void delete_non_dropout_rows( //find lines
578
+ TO_BLOCK *block, //block to do
579
+ float gradient, //global skew
580
+ FCOORD rotation, //deskew vector
581
+ inT32 block_edge, //left edge
582
+ BOOL8 testing_on //correct orientation
583
+ ) {
584
+ TBOX block_box; //deskewed block
585
+ inT32 *deltas; //change in occupation
586
+ inT32 *occupation; //of pixel coords
587
+ inT32 max_y; //in block
588
+ inT32 min_y;
589
+ inT32 line_index; //of scan line
590
+ inT32 line_count; //no of scan lines
591
+ inT32 distance; //to drop-out
592
+ inT32 xleft; //of block
593
+ inT32 ybottom; //of block
594
+ TO_ROW *row; //current row
595
+ TO_ROW_IT row_it = block->get_rows ();
596
+ BLOBNBOX_IT blob_it = &block->blobs;
597
+
598
+ if (row_it.length () == 0)
599
+ return; //empty block
600
+ block_box = deskew_block_coords (block, gradient);
601
+ xleft = block->block->bounding_box ().left ();
602
+ ybottom = block->block->bounding_box ().bottom ();
603
+ min_y = block_box.bottom () - 1;
604
+ max_y = block_box.top () + 1;
605
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
606
+ line_index = (inT32) floor (row_it.data ()->intercept ());
607
+ if (line_index <= min_y)
608
+ min_y = line_index - 1;
609
+ if (line_index >= max_y)
610
+ max_y = line_index + 1;
611
+ }
612
+ line_count = max_y - min_y + 1;
613
+ if (line_count <= 0)
614
+ return; //empty block
615
+ deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
616
+ occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
617
+ if (deltas == NULL || occupation == NULL)
618
+ MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
619
+
620
+ compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
621
+ compute_occupation_threshold ((inT32)
622
+ ceil (block->line_spacing *
623
+ (textord_merge_desc +
624
+ textord_merge_asc)),
625
+ (inT32) ceil (block->line_spacing *
626
+ (textord_merge_x +
627
+ textord_merge_asc)),
628
+ max_y - min_y + 1, occupation, deltas);
629
+ #ifndef GRAPHICS_DISABLED
630
+ if (testing_on) {
631
+ draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
632
+ }
633
+ #endif
634
+ compute_dropout_distances(occupation, deltas, line_count);
635
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
636
+ row = row_it.data ();
637
+ line_index = (inT32) floor (row->intercept ());
638
+ distance = deltas[line_index - min_y];
639
+ if (find_best_dropout_row (row, distance, block->line_spacing / 2,
640
+ line_index, &row_it, testing_on)) {
641
+ #ifndef GRAPHICS_DISABLED
642
+ if (testing_on)
643
+ plot_parallel_row(row, gradient, block_edge,
644
+ ScrollView::WHITE, rotation);
645
+ #endif
646
+ blob_it.add_list_after (row_it.data ()->blob_list ());
647
+ delete row_it.extract (); //too far away
648
+ }
649
+ }
650
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
651
+ blob_it.add_list_after (row_it.data ()->blob_list ());
652
+ }
653
+
654
+ free_mem(deltas);
655
+ free_mem(occupation);
656
+ }
657
+
658
+
659
+ /**********************************************************************
660
+ * find_best_dropout_row
661
+ *
662
+ * Delete this row if it has a neighbour with better dropout characteristics.
663
+ * TRUE is returned if the row should be deleted.
664
+ **********************************************************************/
665
+
666
+ BOOL8 find_best_dropout_row( //find neighbours
667
+ TO_ROW *row, //row to test
668
+ inT32 distance, //dropout dist
669
+ float dist_limit, //threshold distance
670
+ inT32 line_index, //index of row
671
+ TO_ROW_IT *row_it, //current position
672
+ BOOL8 testing_on //correct orientation
673
+ ) {
674
+ inT32 next_index; //of neigbouring row
675
+ inT32 row_offset; //from current row
676
+ inT32 abs_dist; //absolute distance
677
+ inT8 row_inc; //increment to row_index
678
+ TO_ROW *next_row; //nextious row
679
+
680
+ if (testing_on)
681
+ tprintf ("Row at %g(%g), dropout dist=%d,",
682
+ row->intercept (), row->parallel_c (), distance);
683
+ if (distance < 0) {
684
+ row_inc = 1;
685
+ abs_dist = -distance;
686
+ }
687
+ else {
688
+ row_inc = -1;
689
+ abs_dist = distance;
690
+ }
691
+ if (abs_dist > dist_limit) {
692
+ if (testing_on) {
693
+ tprintf (" too far - deleting\n");
694
+ }
695
+ return TRUE;
696
+ }
697
+ if ((distance < 0 && !row_it->at_last ())
698
+ || (distance >= 0 && !row_it->at_first ())) {
699
+ row_offset = row_inc;
700
+ do {
701
+ next_row = row_it->data_relative (row_offset);
702
+ next_index = (inT32) floor (next_row->intercept ());
703
+ if ((distance < 0
704
+ && next_index < line_index
705
+ && next_index > line_index + distance + distance)
706
+ || (distance >= 0
707
+ && next_index > line_index
708
+ && next_index < line_index + distance + distance)) {
709
+ if (testing_on) {
710
+ tprintf (" nearer neighbour (%d) at %g\n",
711
+ line_index + distance - next_index,
712
+ next_row->intercept ());
713
+ }
714
+ return TRUE; //other is nearer
715
+ }
716
+ else if (next_index == line_index
717
+ || next_index == line_index + distance + distance) {
718
+ if (row->believability () <= next_row->believability ()) {
719
+ if (testing_on) {
720
+ tprintf (" equal but more believable at %g (%g/%g)\n",
721
+ next_row->intercept (),
722
+ row->believability (),
723
+ next_row->believability ());
724
+ }
725
+ return TRUE; //other is more believable
726
+ }
727
+ }
728
+ row_offset += row_inc;
729
+ }
730
+ while ((next_index == line_index
731
+ || next_index == line_index + distance + distance)
732
+ && row_offset < row_it->length ());
733
+ if (testing_on)
734
+ tprintf (" keeping\n");
735
+ }
736
+ return FALSE;
737
+ }
738
+
739
+
740
+ /**********************************************************************
741
+ * deskew_block_coords
742
+ *
743
+ * Compute the bounding box of all the blobs in the block
744
+ * if they were deskewed without actually doing it.
745
+ **********************************************************************/
746
+
747
+ TBOX deskew_block_coords( //block box
748
+ TO_BLOCK *block, //block to do
749
+ float gradient //global skew
750
+ ) {
751
+ TBOX result; //block bounds
752
+ TBOX blob_box; //of block
753
+ FCOORD rotation; //deskew vector
754
+ float length; //of gradient vector
755
+ TO_ROW_IT row_it = block->get_rows ();
756
+ TO_ROW *row; //current row
757
+ BLOBNBOX *blob; //current blob
758
+ BLOBNBOX_IT blob_it; //iterator
759
+
760
+ length = sqrt (gradient * gradient + 1);
761
+ rotation = FCOORD (1 / length, -gradient / length);
762
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
763
+ row = row_it.data ();
764
+ blob_it.set_to_list (row->blob_list ());
765
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
766
+ blob_it.forward ()) {
767
+ blob = blob_it.data ();
768
+ blob_box = blob->bounding_box ();
769
+ blob_box.rotate (rotation);//de-skew it
770
+ result += blob_box;
771
+ }
772
+ }
773
+ return result;
774
+ }
775
+
776
+
777
+ /**********************************************************************
778
+ * compute_line_occupation
779
+ *
780
+ * Compute the pixel projection back on the y axis given the global
781
+ * skew. Also compute the 1st derivative.
782
+ **********************************************************************/
783
+
784
+ void compute_line_occupation( //project blobs
785
+ TO_BLOCK *block, //block to do
786
+ float gradient, //global skew
787
+ inT32 min_y, //min coord in block
788
+ inT32 max_y, //in block
789
+ inT32 *occupation, //output projection
790
+ inT32 *deltas //derivative
791
+ ) {
792
+ inT32 line_count; //maxy-miny+1
793
+ inT32 line_index; //of scan line
794
+ int index; //array index for daft compilers
795
+ float top, bottom; //coords of blob
796
+ inT32 width; //of blob
797
+ TO_ROW *row; //current row
798
+ TO_ROW_IT row_it = block->get_rows ();
799
+ BLOBNBOX *blob; //current blob
800
+ BLOBNBOX_IT blob_it; //iterator
801
+ float length; //of skew vector
802
+ TBOX blob_box; //bounding box
803
+ FCOORD rotation; //inverse of skew
804
+
805
+ line_count = max_y - min_y + 1;
806
+ length = sqrt (gradient * gradient + 1);
807
+ rotation = FCOORD (1 / length, -gradient / length);
808
+ for (line_index = 0; line_index < line_count; line_index++)
809
+ deltas[line_index] = 0;
810
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
811
+ row = row_it.data ();
812
+ blob_it.set_to_list (row->blob_list ());
813
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
814
+ blob_it.forward ()) {
815
+ blob = blob_it.data ();
816
+ blob_box = blob->bounding_box ();
817
+ blob_box.rotate (rotation);//de-skew it
818
+ top = blob_box.top ();
819
+ bottom = blob_box.bottom ();
820
+ width =
821
+ (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
822
+ if ((inT32) floor (bottom) < min_y
823
+ || (inT32) floor (bottom) - min_y >= line_count)
824
+ fprintf (stderr,
825
+ "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
826
+ INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
827
+ //count transitions
828
+ index = (inT32) floor (bottom) - min_y;
829
+ deltas[index] += width;
830
+ if ((inT32) floor (top) < min_y
831
+ || (inT32) floor (top) - min_y >= line_count)
832
+ fprintf (stderr,
833
+ "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
834
+ INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
835
+ index = (inT32) floor (top) - min_y;
836
+ deltas[index] -= width;
837
+ }
838
+ }
839
+ occupation[0] = deltas[0];
840
+ for (line_index = 1; line_index < line_count; line_index++)
841
+ occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
842
+ }
843
+
844
+
845
+ /**********************************************************************
846
+ * compute_occupation_threshold
847
+ *
848
+ * Compute thresholds for textline or not for the occupation array.
849
+ **********************************************************************/
850
+
851
+ void compute_occupation_threshold( //project blobs
852
+ inT32 low_window, //below result point
853
+ inT32 high_window, //above result point
854
+ inT32 line_count, //array sizes
855
+ inT32 *occupation, //input projection
856
+ inT32 *thresholds //output thresholds
857
+ ) {
858
+ inT32 line_index; //of thresholds line
859
+ inT32 low_index; //in occupation
860
+ inT32 high_index; //in occupation
861
+ inT32 sum; //current average
862
+ inT32 divisor; //to get thresholds
863
+ inT32 min_index; //of min occ
864
+ inT32 min_occ; //min in locality
865
+ inT32 test_index; //for finding min
866
+
867
+ divisor =
868
+ (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
869
+ if (low_window + high_window < line_count) {
870
+ for (sum = 0, high_index = 0; high_index < low_window; high_index++)
871
+ sum += occupation[high_index];
872
+ for (low_index = 0; low_index < high_window; low_index++, high_index++)
873
+ sum += occupation[high_index];
874
+ min_occ = occupation[0];
875
+ min_index = 0;
876
+ for (test_index = 1; test_index < high_index; test_index++) {
877
+ if (occupation[test_index] <= min_occ) {
878
+ min_occ = occupation[test_index];
879
+ min_index = test_index; //find min in region
880
+ }
881
+ }
882
+ for (line_index = 0; line_index < low_window; line_index++)
883
+ thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
884
+ //same out to end
885
+ for (low_index = 0; high_index < line_count; low_index++, high_index++) {
886
+ sum -= occupation[low_index];
887
+ sum += occupation[high_index];
888
+ if (occupation[high_index] <= min_occ) {
889
+ //find min in region
890
+ min_occ = occupation[high_index];
891
+ min_index = high_index;
892
+ }
893
+ //lost min from region
894
+ if (min_index <= low_index) {
895
+ min_occ = occupation[low_index + 1];
896
+ min_index = low_index + 1;
897
+ for (test_index = low_index + 2; test_index <= high_index;
898
+ test_index++) {
899
+ if (occupation[test_index] <= min_occ) {
900
+ min_occ = occupation[test_index];
901
+ //find min in region
902
+ min_index = test_index;
903
+ }
904
+ }
905
+ }
906
+ thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
907
+ }
908
+ }
909
+ else {
910
+ min_occ = occupation[0];
911
+ min_index = 0;
912
+ for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
913
+ if (occupation[low_index] < min_occ) {
914
+ min_occ = occupation[low_index];
915
+ min_index = low_index;
916
+ }
917
+ sum += occupation[low_index];
918
+ }
919
+ line_index = 0;
920
+ }
921
+ for (; line_index < line_count; line_index++)
922
+ thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
923
+ //same out to end
924
+ }
925
+
926
+
927
+ /**********************************************************************
928
+ * compute_dropout_distances
929
+ *
930
+ * Compute the distance from each coordinate to the nearest dropout.
931
+ **********************************************************************/
932
+
933
+ void compute_dropout_distances( //project blobs
934
+ inT32 *occupation, //input projection
935
+ inT32 *thresholds, //output thresholds
936
+ inT32 line_count //array sizes
937
+ ) {
938
+ inT32 line_index; //of thresholds line
939
+ inT32 distance; //from prev dropout
940
+ inT32 next_dist; //to next dropout
941
+ inT32 back_index; //for back filling
942
+ inT32 prev_threshold; //before overwrite
943
+
944
+ distance = -line_count;
945
+ line_index = 0;
946
+ do {
947
+ do {
948
+ distance--;
949
+ prev_threshold = thresholds[line_index];
950
+ //distance from prev
951
+ thresholds[line_index] = distance;
952
+ line_index++;
953
+ }
954
+ while (line_index < line_count
955
+ && (occupation[line_index] < thresholds[line_index]
956
+ || occupation[line_index - 1] >= prev_threshold));
957
+ if (line_index < line_count) {
958
+ back_index = line_index - 1;
959
+ next_dist = 1;
960
+ while (next_dist < -distance && back_index >= 0) {
961
+ thresholds[back_index] = next_dist;
962
+ back_index--;
963
+ next_dist++;
964
+ distance++;
965
+ }
966
+ distance = 1;
967
+ }
968
+ }
969
+ while (line_index < line_count);
970
+ }
971
+
972
+
973
+ /**********************************************************************
974
+ * expand_rows
975
+ *
976
+ * Expand each row to the least of its allowed size and touching its
977
+ * neighbours. If the expansion would entirely swallow a neighbouring row
978
+ * then do so.
979
+ **********************************************************************/
980
+
981
+ void expand_rows( //find lines
982
+ ICOORD page_tr, //top right
983
+ TO_BLOCK *block, //block to do
984
+ float gradient, //gradient to fit
985
+ FCOORD rotation, //for drawing
986
+ inT32 block_edge, //edge of block
987
+ BOOL8 testing_on //correct orientation
988
+ ) {
989
+ BOOL8 swallowed_row; //eaten a neighbour
990
+ float y_max, y_min; //new row limits
991
+ float y_bottom, y_top; //allowed limits
992
+ TO_ROW *test_row; //next row
993
+ TO_ROW *row; //current row
994
+ //iterators
995
+ BLOBNBOX_IT blob_it = &block->blobs;
996
+ TO_ROW_IT row_it = block->get_rows ();
997
+
998
+ #ifndef GRAPHICS_DISABLED
999
+ if (textord_show_expanded_rows && testing_on) {
1000
+ if (to_win == NULL)
1001
+ create_to_win(page_tr);
1002
+ }
1003
+ #endif
1004
+
1005
+ adjust_row_limits(block); //shift min,max.
1006
+ if (textord_new_initial_xheight) {
1007
+ if (block->get_rows ()->length () == 0)
1008
+ return;
1009
+ compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1010
+ }
1011
+ assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
1012
+ //get real membership
1013
+ if (block->get_rows ()->length () == 0)
1014
+ return;
1015
+ fit_parallel_rows(block,
1016
+ gradient,
1017
+ rotation,
1018
+ block_edge,
1019
+ textord_show_expanded_rows &&testing_on);
1020
+ if (!textord_new_initial_xheight)
1021
+ compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1022
+ row_it.move_to_last ();
1023
+ do {
1024
+ row = row_it.data ();
1025
+ y_max = row->max_y (); //get current limits
1026
+ y_min = row->min_y ();
1027
+ y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
1028
+ textord_merge_desc;
1029
+ y_top = row->intercept () + block->line_size * textord_expansion_factor *
1030
+ (textord_merge_x + textord_merge_asc);
1031
+ if (y_min > y_bottom) { //expansion allowed
1032
+ if (textord_show_expanded_rows && testing_on)
1033
+ tprintf("Expanding bottom of row at %f from %f to %f\n",
1034
+ row->intercept(), y_min, y_bottom);
1035
+ //expandable
1036
+ swallowed_row = TRUE;
1037
+ while (swallowed_row && !row_it.at_last ()) {
1038
+ swallowed_row = FALSE;
1039
+ //get next one
1040
+ test_row = row_it.data_relative (1);
1041
+ //overlaps space
1042
+ if (test_row->max_y () > y_bottom) {
1043
+ if (test_row->min_y () > y_bottom) {
1044
+ if (textord_show_expanded_rows && testing_on)
1045
+ tprintf("Eating row below at %f\n", test_row->intercept());
1046
+ row_it.forward ();
1047
+ #ifndef GRAPHICS_DISABLED
1048
+ if (textord_show_expanded_rows && testing_on)
1049
+ plot_parallel_row(test_row,
1050
+ gradient,
1051
+ block_edge,
1052
+ ScrollView::WHITE,
1053
+ rotation);
1054
+ #endif
1055
+ blob_it.set_to_list (row->blob_list ());
1056
+ blob_it.add_list_after (test_row->blob_list ());
1057
+ //swallow complete row
1058
+ delete row_it.extract ();
1059
+ row_it.backward ();
1060
+ swallowed_row = TRUE;
1061
+ }
1062
+ else if (test_row->max_y () < y_min) {
1063
+ //shorter limit
1064
+ y_bottom = test_row->max_y ();
1065
+ if (textord_show_expanded_rows && testing_on)
1066
+ tprintf("Truncating limit to %f due to touching row at %f\n",
1067
+ y_bottom, test_row->intercept());
1068
+ }
1069
+ else {
1070
+ y_bottom = y_min; //can't expand it
1071
+ if (textord_show_expanded_rows && testing_on)
1072
+ tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1073
+ y_bottom, test_row->intercept());
1074
+ }
1075
+ }
1076
+ }
1077
+ y_min = y_bottom; //expand it
1078
+ }
1079
+ if (y_max < y_top) { //expansion allowed
1080
+ if (textord_show_expanded_rows && testing_on)
1081
+ tprintf("Expanding top of row at %f from %f to %f\n",
1082
+ row->intercept(), y_max, y_top);
1083
+ swallowed_row = TRUE;
1084
+ while (swallowed_row && !row_it.at_first ()) {
1085
+ swallowed_row = FALSE;
1086
+ //get one above
1087
+ test_row = row_it.data_relative (-1);
1088
+ if (test_row->min_y () < y_top) {
1089
+ if (test_row->max_y () < y_top) {
1090
+ if (textord_show_expanded_rows && testing_on)
1091
+ tprintf("Eating row above at %f\n", test_row->intercept());
1092
+ row_it.backward ();
1093
+ blob_it.set_to_list (row->blob_list ());
1094
+ #ifndef GRAPHICS_DISABLED
1095
+ if (textord_show_expanded_rows && testing_on)
1096
+ plot_parallel_row(test_row,
1097
+ gradient,
1098
+ block_edge,
1099
+ ScrollView::WHITE,
1100
+ rotation);
1101
+ #endif
1102
+ blob_it.add_list_after (test_row->blob_list ());
1103
+ //swallow complete row
1104
+ delete row_it.extract ();
1105
+ row_it.forward ();
1106
+ swallowed_row = TRUE;
1107
+ }
1108
+ else if (test_row->min_y () < y_max) {
1109
+ //shorter limit
1110
+ y_top = test_row->min_y ();
1111
+ if (textord_show_expanded_rows && testing_on)
1112
+ tprintf("Truncating limit to %f due to touching row at %f\n",
1113
+ y_top, test_row->intercept());
1114
+ }
1115
+ else {
1116
+ y_top = y_max; //can't expand it
1117
+ if (textord_show_expanded_rows && testing_on)
1118
+ tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1119
+ y_top, test_row->intercept());
1120
+ }
1121
+ }
1122
+ }
1123
+ y_max = y_top;
1124
+ }
1125
+ //new limits
1126
+ row->set_limits (y_min, y_max);
1127
+ row_it.backward ();
1128
+ }
1129
+ while (!row_it.at_last ());
1130
+ }
1131
+
1132
+
1133
+ /**********************************************************************
1134
+ * adjust_row_limits
1135
+ *
1136
+ * Change the limits of rows to suit the default fractions.
1137
+ **********************************************************************/
1138
+
1139
+ void adjust_row_limits( //tidy limits
1140
+ TO_BLOCK *block //block to do
1141
+ ) {
1142
+ TO_ROW *row; //current row
1143
+ float size; //size of row
1144
+ float ymax; //top of row
1145
+ float ymin; //bottom of row
1146
+ TO_ROW_IT row_it = block->get_rows ();
1147
+
1148
+ if (textord_show_expanded_rows)
1149
+ tprintf("Adjusting row limits for block(%d,%d)\n",
1150
+ block->block->bounding_box().left(),
1151
+ block->block->bounding_box().top());
1152
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1153
+ row = row_it.data ();
1154
+ size = row->max_y () - row->min_y ();
1155
+ if (textord_show_expanded_rows)
1156
+ tprintf("Row at %f has min %f, max %f, size %f\n",
1157
+ row->intercept(), row->min_y(), row->max_y(), size);
1158
+ size /= textord_merge_x + textord_merge_asc + textord_merge_desc;
1159
+ ymax = size * (textord_merge_x + textord_merge_asc);
1160
+ ymin = -size * textord_merge_desc;
1161
+ row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
1162
+ row->merged = FALSE;
1163
+ }
1164
+ }
1165
+
1166
+
1167
+ /**********************************************************************
1168
+ * compute_row_stats
1169
+ *
1170
+ * Compute the linespacing and offset.
1171
+ **********************************************************************/
1172
+
1173
+ void compute_row_stats( //find lines
1174
+ TO_BLOCK *block, //block to do
1175
+ BOOL8 testing_on //correct orientation
1176
+ ) {
1177
+ inT32 row_index; //of median
1178
+ TO_ROW *row; //current row
1179
+ TO_ROW *prev_row; //previous row
1180
+ float iqr; //inter quartile range
1181
+ TO_ROW_IT row_it = block->get_rows ();
1182
+ //number of rows
1183
+ inT16 rowcount = row_it.length ();
1184
+ TO_ROW **rows; //for choose nth
1185
+
1186
+ rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
1187
+ if (rows == NULL)
1188
+ MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
1189
+ rowcount = 0;
1190
+ prev_row = NULL;
1191
+ row_it.move_to_last (); //start at bottom
1192
+ do {
1193
+ row = row_it.data ();
1194
+ if (prev_row != NULL) {
1195
+ rows[rowcount++] = prev_row;
1196
+ prev_row->spacing = row->intercept () - prev_row->intercept ();
1197
+ if (testing_on)
1198
+ tprintf ("Row at %g yields spacing of %g\n",
1199
+ row->intercept (), prev_row->spacing);
1200
+ }
1201
+ prev_row = row;
1202
+ row_it.backward ();
1203
+ }
1204
+ while (!row_it.at_last ());
1205
+ block->key_row = prev_row;
1206
+ block->baseline_offset =
1207
+ fmod (prev_row->parallel_c (), block->line_spacing);
1208
+ if (testing_on)
1209
+ tprintf ("Blob based spacing=(%g,%g), offset=%g",
1210
+ block->line_size, block->line_spacing, block->baseline_offset);
1211
+ if (rowcount > 0) {
1212
+ row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
1213
+ sizeof (TO_ROW *), row_spacing_order);
1214
+ iqr = rows[row_index]->spacing;
1215
+ row_index = choose_nth_item (rowcount / 4, rows, rowcount,
1216
+ sizeof (TO_ROW *), row_spacing_order);
1217
+ iqr -= rows[row_index]->spacing;
1218
+ row_index = choose_nth_item (rowcount / 2, rows, rowcount,
1219
+ sizeof (TO_ROW *), row_spacing_order);
1220
+ block->key_row = rows[row_index];
1221
+ if (testing_on)
1222
+ tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
1223
+ if (rowcount > 2
1224
+ && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
1225
+ if (!textord_new_initial_xheight) {
1226
+ if (rows[row_index]->spacing < block->line_spacing
1227
+ && rows[row_index]->spacing > block->line_size)
1228
+ //within range
1229
+ block->line_size = rows[row_index]->spacing;
1230
+ //spacing=size
1231
+ else if (rows[row_index]->spacing > block->line_spacing)
1232
+ block->line_size = block->line_spacing;
1233
+ //too big so use max
1234
+ }
1235
+ else {
1236
+ if (rows[row_index]->spacing < block->line_spacing)
1237
+ block->line_size = rows[row_index]->spacing;
1238
+ else
1239
+ block->line_size = block->line_spacing;
1240
+ //too big so use max
1241
+ }
1242
+ if (block->line_size < textord_min_xheight)
1243
+ block->line_size = (float) textord_min_xheight;
1244
+ block->line_spacing = rows[row_index]->spacing;
1245
+ block->max_blob_size =
1246
+ block->line_spacing * textord_excess_blobsize;
1247
+ }
1248
+ block->baseline_offset = fmod (rows[row_index]->intercept (),
1249
+ block->line_spacing);
1250
+ }
1251
+ if (testing_on)
1252
+ tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
1253
+ block->line_size, block->line_spacing, block->baseline_offset);
1254
+ free_mem(rows);
1255
+ }
1256
+
1257
+
1258
+ /**********************************************************************
1259
+ * compute_block_xheight
1260
+ *
1261
+ * Compute the xheight of the individual rows, then correlate them
1262
+ * and interpret ascenderless lines, correcting xheights.
1263
+ **********************************************************************/
1264
+
1265
+ void compute_block_xheight( //find lines
1266
+ TO_BLOCK *block, //block to do
1267
+ float gradient //global skew
1268
+ ) {
1269
+ TO_ROW *row; //current row
1270
+ int xh_count, desc_count; //no of samples
1271
+ float block_median; //median blob size
1272
+ int asc_count, cap_count;
1273
+ inT32 min_size, max_size; //limits on xheight
1274
+ inT32 evidence; //no of samples on row
1275
+ float xh_sum, desc_sum; //for averages
1276
+ float asc_sum, cap_sum;
1277
+ TO_ROW_IT row_it = block->get_rows ();
1278
+ STATS row_heights; //block evidence
1279
+
1280
+ if (row_it.empty ())
1281
+ return; //no rows
1282
+ block_median = median_block_xheight (block, gradient);
1283
+ block_median *= 2;
1284
+ if (block_median < block->line_size)
1285
+ block_median = block->line_size;
1286
+ // tprintf("Block median=%g, linesize=%g\n",
1287
+ // block_median,block->line_size);
1288
+ max_size = (inT32) ceil (block_median);
1289
+ min_size = (inT32) floor (block_median * textord_minxh);
1290
+ row_heights.set_range (min_size, max_size + 1);
1291
+ xh_count = desc_count = asc_count = cap_count = 0;
1292
+ xh_sum = desc_sum = asc_sum = cap_sum = 0.0f;
1293
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1294
+ row = row_it.data ();
1295
+ evidence = compute_row_xheight (row, min_size, max_size, gradient);
1296
+ if (row->xheight > 0 && row->ascrise > 0) {
1297
+ row_heights.add ((inT32) row->xheight, evidence);
1298
+ xh_count += evidence;
1299
+ asc_sum += row->ascrise;
1300
+ asc_count++;
1301
+ }
1302
+ else if (row->xheight > 0) {
1303
+ cap_sum += row->xheight; //assume just caps
1304
+ cap_count++;
1305
+ }
1306
+ if (row->descdrop != 0) {
1307
+ desc_sum += row->descdrop;
1308
+ desc_count++;
1309
+ }
1310
+ }
1311
+ if (xh_count > 0) {
1312
+ //median
1313
+ xh_sum = row_heights.ile (0.5);
1314
+ asc_sum /= asc_count;
1315
+ }
1316
+ else if (cap_count > 0) {
1317
+ cap_sum /= cap_count; //must assume caps
1318
+ xh_sum =
1319
+ cap_sum * textord_merge_x / (textord_merge_x + textord_merge_asc);
1320
+ asc_sum =
1321
+ cap_sum * textord_merge_asc / (textord_merge_x + textord_merge_asc);
1322
+ }
1323
+ else {
1324
+ //default sizes
1325
+ xh_sum = block_median * textord_merge_x;
1326
+ asc_sum = block_median * textord_merge_asc;
1327
+ }
1328
+ if (desc_count > 0) {
1329
+ desc_sum /= desc_count;
1330
+ }
1331
+ else {
1332
+ desc_sum = xh_sum * textord_merge_desc / textord_merge_x;
1333
+ }
1334
+ // tprintf("Block average x height=%g, count=%d, asc=%g/%d, desc=%g/%d,cap=%g/%d\n",
1335
+ // xh_sum,xh_count,asc_sum,asc_count,desc_sum,desc_count,
1336
+ // cap_sum,cap_count);
1337
+ if (xh_sum < textord_min_xheight)
1338
+ xh_sum = (float) textord_min_xheight;
1339
+ block->xheight = xh_sum;
1340
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1341
+ correct_row_xheight (row_it.data (), xh_sum, asc_sum, desc_sum);
1342
+ }
1343
+ }
1344
+
1345
+
1346
+ /**********************************************************************
1347
+ * median_block_xheight
1348
+ *
1349
+ * Compute the linespacing and offset.
1350
+ **********************************************************************/
1351
+
1352
+ float median_block_xheight( //find lines
1353
+ TO_BLOCK *block, //block to do
1354
+ float gradient //global skew
1355
+ ) {
1356
+ TO_ROW *row; //current row
1357
+ float result; //output size
1358
+ float xcentre; //centre of blob
1359
+ TO_ROW_IT row_it = block->get_rows ();
1360
+ BLOBNBOX_IT blob_it;
1361
+ BLOBNBOX *blob; //current blob
1362
+ float *heights; //for choose nth
1363
+ inT32 blob_count; //blobs in block
1364
+ inT32 blob_index; //current blob
1365
+
1366
+ blob_count = 0;
1367
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
1368
+ blob_count += row_it.data ()->blob_list ()->length ();
1369
+ heights = (float *) alloc_mem (blob_count * sizeof (float));
1370
+ if (heights == NULL)
1371
+ MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
1372
+
1373
+ blob_index = 0;
1374
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1375
+ row = row_it.data ();
1376
+ blob_it.set_to_list (row->blob_list ());
1377
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1378
+ blob_it.forward ()) {
1379
+ blob = blob_it.data ();
1380
+ if (!blob->joined_to_prev ()) {
1381
+ xcentre =
1382
+ (blob->bounding_box ().left () +
1383
+ blob->bounding_box ().right ()) / 2.0f;
1384
+ heights[blob_index] =
1385
+ blob->bounding_box ().top () - gradient * xcentre -
1386
+ row->parallel_c ();
1387
+ if (heights[blob_index] > 0)
1388
+ blob_index++;
1389
+ }
1390
+ }
1391
+ }
1392
+ ASSERT_HOST (blob_index > 0); //dont expect 0
1393
+ blob_count = blob_index;
1394
+ blob_index = choose_nth_item (blob_count / 2, heights, blob_count);
1395
+ result = heights[blob_index];
1396
+ free_mem(heights);
1397
+ return result;
1398
+ }
1399
+
1400
+
1401
+ /**********************************************************************
1402
+ * compute_row_xheight
1403
+ *
1404
+ * Estimate the xheight of this row.
1405
+ * Compute the ascender rise and descender drop at the same time.
1406
+ **********************************************************************/
1407
+
1408
+ inT32 compute_row_xheight( //find lines
1409
+ TO_ROW *row, //row to do
1410
+ inT32 min_height, //min xheight
1411
+ inT32 max_height, //max xheight
1412
+ float gradient //global skew
1413
+ ) {
1414
+ BOOL8 in_best_pile; //control of mode size
1415
+ inT32 prev_size; //previous size
1416
+ float xcentre; //centre of blob
1417
+ float height; //height of blob
1418
+ BLOBNBOX_IT blob_it = row->blob_list ();
1419
+ BLOBNBOX *blob; //current blob
1420
+ inT32 blob_count; //blobs in block
1421
+ inT32 x; //xheight index
1422
+ inT32 asc; //ascender index
1423
+ inT32 blob_index; //current blob
1424
+ inT32 mode_count; //no of modes
1425
+ inT32 best_count; //count of best x so far
1426
+ float ratio; //size ratio
1427
+ inT32 modes[MAX_HEIGHT_MODES]; //biggest piles
1428
+ STATS heights (min_height, max_height + 1);
1429
+
1430
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1431
+ blob = blob_it.data ();
1432
+ if (!blob->joined_to_prev ()) {
1433
+ xcentre =
1434
+ (blob->bounding_box ().left () +
1435
+ blob->bounding_box ().right ()) / 2.0f;
1436
+ height = blob->bounding_box ().top ();
1437
+ if (textord_fix_xheight_bug)
1438
+ height -= row->baseline.y (xcentre);
1439
+ else
1440
+ height -= gradient * xcentre + row->parallel_c ();
1441
+ if (height >= min_height && height <= max_height
1442
+ && (!textord_xheight_tweak || height > textord_min_xheight))
1443
+ heights.add ((inT32) floor (height + 0.5), 1);
1444
+ }
1445
+ }
1446
+ blob_index = heights.mode (); //find mode
1447
+ //get count of mode
1448
+ blob_count = heights.pile_count (blob_index);
1449
+ if (textord_debug_xheights)
1450
+ tprintf ("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d,%d\n",
1451
+ min_height, max_height, blob_index, blob_count,
1452
+ heights.get_total (), row->blob_list ()->length ());
1453
+ row->ascrise = 0.0f;
1454
+ row->xheight = 0.0f;
1455
+ row->descdrop = 0.0f; //undefined;
1456
+ in_best_pile = FALSE;
1457
+ prev_size = -MAX_INT32;
1458
+ best_count = 0;
1459
+ if (blob_count > 0) {
1460
+ //get biggest ones
1461
+ mode_count = compute_height_modes (&heights, min_height, max_height, modes, MAX_HEIGHT_MODES);
1462
+ for (x = 0; x < mode_count - 1; x++) {
1463
+ if (modes[x] != prev_size + 1)
1464
+ in_best_pile = FALSE; //had empty height
1465
+ if (heights.pile_count (modes[x])
1466
+ >= blob_count * textord_xheight_mode_fraction
1467
+ && (in_best_pile || heights.pile_count (modes[x]) > best_count)) {
1468
+ for (asc = x + 1; asc < mode_count; asc++) {
1469
+ ratio = (float) modes[asc] / modes[x];
1470
+ if (textord_ascx_ratio_min < ratio
1471
+ && ratio < textord_ascx_ratio_max
1472
+ && heights.pile_count (modes[asc])
1473
+ >= blob_count * textord_ascheight_mode_fraction) {
1474
+ if (heights.pile_count (modes[x]) > best_count) {
1475
+ in_best_pile = TRUE;
1476
+ best_count = heights.pile_count (modes[x]);
1477
+ }
1478
+ // tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
1479
+ // modes[x],modes[asc]-modes[x],
1480
+ // heights.pile_count(modes[x]),
1481
+ // ratio);
1482
+ prev_size = modes[x];
1483
+ row->xheight = (float) modes[x];
1484
+ row->ascrise = (float) (modes[asc] - modes[x]);
1485
+ }
1486
+ }
1487
+ }
1488
+ }
1489
+ if (row->xheight == 0) {
1490
+ //single mode
1491
+ row->xheight = (float) blob_index;
1492
+ row->ascrise = 0.0f;
1493
+ if (textord_debug_xheights)
1494
+ tprintf ("Single mode xheight set to %g\n", row->xheight);
1495
+ }
1496
+ else if (textord_debug_xheights)
1497
+ tprintf ("Multi-mode xheight set to %g, asc=%g\n",
1498
+ row->xheight, row->ascrise);
1499
+ row->descdrop = (float) compute_row_descdrop (row, gradient);
1500
+ //find descenders
1501
+ }
1502
+ return best_count;
1503
+ }
1504
+
1505
+
1506
+ /**********************************************************************
1507
+ * compute_row_descdrop
1508
+ *
1509
+ * Estimate the descdrop of this row.
1510
+ **********************************************************************/
1511
+
1512
+ inT32 compute_row_descdrop( //find lines
1513
+ TO_ROW *row, //row to do
1514
+ float gradient //global skew
1515
+ ) {
1516
+ inT32 min_height = (inT32) floor (row->xheight * textord_descx_ratio_min);
1517
+ inT32 max_height = (inT32) floor (row->xheight * textord_descx_ratio_max);
1518
+ float xcentre; //centre of blob
1519
+ float height; //height of blob
1520
+ BLOBNBOX_IT blob_it = row->blob_list ();
1521
+ BLOBNBOX *blob; //current blob
1522
+ inT32 blob_count; //blobs in block
1523
+ inT32 blob_index; //current blob
1524
+ STATS heights (min_height, max_height + 1);
1525
+
1526
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1527
+ blob = blob_it.data ();
1528
+ if (!blob->joined_to_prev ()) {
1529
+ xcentre =
1530
+ (blob->bounding_box ().left () +
1531
+ blob->bounding_box ().right ()) / 2.0f;
1532
+ height =
1533
+ gradient * xcentre + row->parallel_c () -
1534
+ blob->bounding_box ().bottom ();
1535
+ if (height >= min_height && height <= max_height)
1536
+ heights.add ((inT32) floor (height + 0.5), 1);
1537
+ }
1538
+ }
1539
+ blob_index = heights.mode (); //find mode
1540
+ //get count of mode
1541
+ blob_count = heights.pile_count (blob_index);
1542
+ return blob_count > 0 ? -blob_index : 0;
1543
+ }
1544
+
1545
+
1546
+ /**********************************************************************
1547
+ * compute_height_modes
1548
+ *
1549
+ * Find the top maxmodes values in the input array and put their
1550
+ * indices in the output in the order in which they occurred.
1551
+ **********************************************************************/
1552
+
1553
+ inT32 compute_height_modes( //find lines
1554
+ STATS *heights, //stats to search
1555
+ inT32 min_height, //bottom of range
1556
+ inT32 max_height, //top of range
1557
+ inT32 *modes, //output array
1558
+ inT32 maxmodes //size of modes
1559
+ ) {
1560
+ inT32 pile_count; //no in source pile
1561
+ inT32 src_count; //no of source entries
1562
+ inT32 src_index; //current entry
1563
+ inT32 least_count; //height of smalllest
1564
+ inT32 least_index; //index of least
1565
+ inT32 dest_count; //index in modes
1566
+
1567
+ src_count = max_height + 1 - min_height;
1568
+ dest_count = 0;
1569
+ least_count = MAX_INT32;
1570
+ least_index = -1;
1571
+ for (src_index = 0; src_index < src_count; src_index++) {
1572
+ pile_count = heights->pile_count (min_height + src_index);
1573
+ if (pile_count > 0) {
1574
+ if (dest_count < maxmodes) {
1575
+ if (pile_count < least_count) {
1576
+ //find smallest in array
1577
+ least_count = pile_count;
1578
+ least_index = dest_count;
1579
+ }
1580
+ modes[dest_count++] = min_height + src_index;
1581
+ }
1582
+ else if (pile_count >= least_count) {
1583
+ while (least_index < maxmodes - 1) {
1584
+ modes[least_index] = modes[least_index + 1];
1585
+ //shuffle up
1586
+ least_index++;
1587
+ }
1588
+ //new one on end
1589
+ modes[maxmodes - 1] = min_height + src_index;
1590
+ if (pile_count == least_count) {
1591
+ //new smallest
1592
+ least_index = maxmodes - 1;
1593
+ }
1594
+ else {
1595
+ least_count = heights->pile_count (modes[0]);
1596
+ least_index = 0;
1597
+ for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1598
+ pile_count = heights->pile_count (modes[dest_count]);
1599
+ if (pile_count < least_count) {
1600
+ //find smallest
1601
+ least_count = pile_count;
1602
+ least_index = dest_count;
1603
+ }
1604
+ }
1605
+ }
1606
+ }
1607
+ }
1608
+ }
1609
+ return dest_count;
1610
+ }
1611
+
1612
+
1613
+ /**********************************************************************
1614
+ * correct_row_xheight
1615
+ *
1616
+ * Adjust the xheight etc of this row if not within reasonable limits
1617
+ * of the average for the block.
1618
+ **********************************************************************/
1619
+
1620
+ void correct_row_xheight( //fix bad values
1621
+ TO_ROW *row, //row to fix
1622
+ float xheight, //average values
1623
+ float ascrise,
1624
+ float descdrop) {
1625
+ if (textord_row_xheights) {
1626
+ if (row->xheight <= 0)
1627
+ row->xheight = xheight;
1628
+ if (row->ascrise < row->xheight * (textord_ascx_ratio_min - 1)) {
1629
+ if (row->xheight >= xheight * (1 - textord_xheight_error_margin)
1630
+ && row->xheight <= xheight * (1 + textord_xheight_error_margin)) {
1631
+ row->all_caps = FALSE;
1632
+ row->ascrise = ascrise;
1633
+ }
1634
+ else if (row->xheight >=
1635
+ (xheight + ascrise) * (1 - textord_xheight_error_margin)
1636
+ && row->xheight <=
1637
+ (xheight + ascrise) * (1 + textord_xheight_error_margin)) {
1638
+ row->all_caps = TRUE;
1639
+ //it was caps
1640
+ row->ascrise = row->xheight - xheight;
1641
+ row->xheight = xheight;
1642
+ }
1643
+ else {
1644
+ row->all_caps = TRUE;
1645
+ row->ascrise = row->xheight * ascrise / (xheight + ascrise);
1646
+ row->xheight -= row->ascrise;
1647
+ }
1648
+ }
1649
+ else
1650
+ row->all_caps = FALSE;
1651
+ row->ascrise = ascrise;
1652
+ if (row->descdrop >= -row->xheight * (textord_ascx_ratio_min - 1))
1653
+ row->descdrop = descdrop;
1654
+ }
1655
+ else {
1656
+ if (row->xheight < xheight * (1 - textord_xheight_error_margin)
1657
+ || row->xheight > xheight * (1 + textord_xheight_error_margin))
1658
+ row->xheight = xheight; //set to average
1659
+ row->all_caps = row->ascrise <= 0;
1660
+ if (row->ascrise < ascrise * (1 - textord_xheight_error_margin)
1661
+ || row->ascrise > ascrise * (1 + textord_xheight_error_margin))
1662
+ row->ascrise = ascrise; //set to average
1663
+ if (row->descdrop < descdrop * (1 - textord_xheight_error_margin)
1664
+ || row->descdrop > descdrop * (1 + textord_xheight_error_margin))
1665
+ row->descdrop = descdrop; //set to average
1666
+ }
1667
+ }
1668
+
1669
+
1670
+ /**********************************************************************
1671
+ * separate_underlines
1672
+ *
1673
+ * Test wide objects for being potential underlines. If they are then
1674
+ * put them in a separate list in the block.
1675
+ **********************************************************************/
1676
+
1677
+ void separate_underlines( //make rough chars
1678
+ TO_BLOCK *block, //block to do
1679
+ float gradient, //skew angle
1680
+ FCOORD rotation, //inverse landscape
1681
+ BOOL8 testing_on //correct orientation
1682
+ ) {
1683
+ BLOBNBOX *blob; //current blob
1684
+ PBLOB *poly_blob; //rotated blob
1685
+ C_BLOB *rotated_blob; //rotated blob
1686
+ TO_ROW *row; //current row
1687
+ float length; //of g_vec
1688
+ TBOX blob_box;
1689
+ FCOORD blob_rotation; //inverse of rotation
1690
+ FCOORD g_vec; //skew rotation
1691
+ BLOBNBOX_IT blob_it; //iterator
1692
+ //iterator
1693
+ BLOBNBOX_IT under_it = &block->underlines;
1694
+ TO_ROW_IT row_it = block->get_rows ();
1695
+
1696
+ //length of vector
1697
+ length = sqrt (1 + gradient * gradient);
1698
+ g_vec = FCOORD (1 / length, -gradient / length);
1699
+ blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1700
+ blob_rotation.rotate (g_vec); //unoding everything
1701
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1702
+ row = row_it.data ();
1703
+ //get blobs
1704
+ blob_it.set_to_list (row->blob_list ());
1705
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1706
+ blob_it.forward ()) {
1707
+ blob = blob_it.data ();
1708
+ blob_box = blob->bounding_box ();
1709
+ if (blob_box.width () > block->line_size * textord_underline_width) {
1710
+ if (textord_cblob_blockocc && blob->cblob () != NULL) {
1711
+ rotated_blob = crotate_cblob (blob->cblob (),
1712
+ blob_rotation);
1713
+ if (test_underline (testing_on && textord_show_final_rows,
1714
+ rotated_blob, (inT16) row->intercept (),
1715
+ (inT16) (block->line_size *
1716
+ (textord_merge_x +
1717
+ textord_merge_asc / 2.0f)))) {
1718
+ under_it.add_after_then_move (blob_it.extract ());
1719
+ if (testing_on && textord_show_final_rows) {
1720
+ tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
1721
+ rotated_blob->bounding_box ().left (),
1722
+ rotated_blob->bounding_box ().bottom (),
1723
+ rotated_blob->bounding_box ().right (),
1724
+ rotated_blob->bounding_box ().top ());
1725
+ tprintf ("(Was (%d,%d)->(%d,%d))\n",
1726
+ blob_box.left (), blob_box.bottom (),
1727
+ blob_box.right (), blob_box.top ());
1728
+ }
1729
+ }
1730
+ delete rotated_blob;
1731
+ }
1732
+ else {
1733
+ if (blob->blob () != NULL) {
1734
+ // if (testing_on && textord_show_final_rows)
1735
+ // tprintf("Rotating by (%g,%g)\n",
1736
+ // blob_rotation.x(),blob_rotation.y());
1737
+ poly_blob = rotate_blob (blob->blob (), blob_rotation);
1738
+ }
1739
+ else
1740
+ poly_blob = rotate_cblob (blob->cblob (),
1741
+ block->line_size,
1742
+ blob_rotation);
1743
+ if (test_underline
1744
+ (testing_on
1745
+ && textord_show_final_rows, poly_blob,
1746
+ row->intercept (),
1747
+ block->line_size * (textord_merge_x +
1748
+ textord_merge_asc / 2))) {
1749
+ if (testing_on && textord_show_final_rows) {
1750
+ tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
1751
+ poly_blob->bounding_box ().left (),
1752
+ poly_blob->bounding_box ().bottom (),
1753
+ poly_blob->bounding_box ().right (),
1754
+ poly_blob->bounding_box ().top ());
1755
+ tprintf ("(Was (%d,%d)->(%d,%d))\n",
1756
+ blob_box.left (), blob_box.bottom (),
1757
+ blob_box.right (), blob_box.top ());
1758
+ }
1759
+ under_it.add_after_then_move (blob_it.extract ());
1760
+ }
1761
+ delete poly_blob;
1762
+ }
1763
+ }
1764
+ }
1765
+ }
1766
+ }
1767
+
1768
+
1769
+ /**********************************************************************
1770
+ * pre_associate_blobs
1771
+ *
1772
+ * Associate overlapping blobs and fake chop wide blobs.
1773
+ **********************************************************************/
1774
+
1775
+ void pre_associate_blobs( //make rough chars
1776
+ ICOORD page_tr, //top right
1777
+ TO_BLOCK *block, //block to do
1778
+ FCOORD rotation, //inverse landscape
1779
+ BOOL8 testing_on //correct orientation
1780
+ ) {
1781
+ #ifndef GRAPHICS_DISABLED
1782
+ ScrollView::Color colour; //of boxes
1783
+ #endif
1784
+ inT16 overlap; //of adjacent boxes
1785
+ BLOBNBOX *blob; //current blob
1786
+ BLOBNBOX *nextblob; //next in list
1787
+ TBOX blob_box;
1788
+ TBOX next_box; //next blob
1789
+ FCOORD blob_rotation; //inverse of rotation
1790
+ BLOBNBOX_IT blob_it; //iterator
1791
+ BLOBNBOX_IT start_it; //iterator
1792
+ TO_ROW_IT row_it = block->get_rows ();
1793
+
1794
+ #ifndef GRAPHICS_DISABLED
1795
+ colour = ScrollView::RED;
1796
+ #endif
1797
+
1798
+ blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1799
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1800
+ //get blobs
1801
+ blob_it.set_to_list (row_it.data ()->blob_list ());
1802
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1803
+ blob_it.forward ()) {
1804
+ blob = blob_it.data ();
1805
+ blob_box = blob->bounding_box ();
1806
+ start_it = blob_it; //save start point
1807
+ // if (testing_on && textord_show_final_blobs)
1808
+ // {
1809
+ // tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
1810
+ // blob_box.left(),blob_box.bottom(),
1811
+ // blob_box.right(),blob_box.top(),
1812
+ // (void*)blob,blob_it.length());
1813
+ // }
1814
+ do {
1815
+ if (!blob_it.at_last ()) {
1816
+ nextblob = blob_it.data_relative (1);
1817
+ next_box = nextblob->bounding_box ();
1818
+ overlap = next_box.width ();
1819
+ if (blob_box.left () > next_box.left ())
1820
+ overlap -= blob_box.left () - next_box.left ();
1821
+ if (blob_box.right () < next_box.right ())
1822
+ overlap -= next_box.right () - blob_box.right ();
1823
+ if (overlap >= next_box.width () / 2
1824
+ || overlap >= blob_box.width () / 2) {
1825
+ //merge new blob
1826
+ blob->merge (nextblob);
1827
+ //get bigger box
1828
+ blob_box = blob->bounding_box ();
1829
+ blob_it.forward ();
1830
+ }
1831
+ else
1832
+ overlap = -1; //no overlap
1833
+ }
1834
+ else
1835
+ overlap = -1; //no overlap
1836
+ }
1837
+ while (overlap >= 0);
1838
+ blob->chop (&start_it, &blob_it,
1839
+ blob_rotation,
1840
+ block->line_size * textord_merge_x *
1841
+ textord_chop_width);
1842
+ //attempt chop
1843
+ }
1844
+ #ifndef GRAPHICS_DISABLED
1845
+ if (testing_on && textord_show_final_blobs) {
1846
+ if (to_win == NULL)
1847
+ create_to_win(page_tr);
1848
+ to_win->Pen(colour);
1849
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1850
+ blob_it.forward ()) {
1851
+ blob = blob_it.data ();
1852
+ blob_box = blob->bounding_box ();
1853
+ blob_box.rotate (rotation);
1854
+ if (!blob->joined_to_prev ()) {
1855
+ to_win->Rectangle (blob_box.left (), blob_box.bottom (),
1856
+ blob_box.right (), blob_box.top ());
1857
+ }
1858
+ }
1859
+ colour = (ScrollView::Color) (colour + 1);
1860
+ if (colour > ScrollView::MAGENTA)
1861
+ colour = ScrollView::RED;
1862
+ }
1863
+ #endif
1864
+ }
1865
+ }
1866
+
1867
+
1868
+ /**********************************************************************
1869
+ * fit_parallel_rows
1870
+ *
1871
+ * Re-fit the rows in the block to the given gradient.
1872
+ **********************************************************************/
1873
+
1874
+ void fit_parallel_rows( //find lines
1875
+ TO_BLOCK *block, //block to do
1876
+ float gradient, //gradient to fit
1877
+ FCOORD rotation, //for drawing
1878
+ inT32 block_edge, //edge of block
1879
+ BOOL8 testing_on //correct orientation
1880
+ ) {
1881
+ #ifndef GRAPHICS_DISABLED
1882
+ ScrollView::Color colour; //of row
1883
+ #endif
1884
+ TO_ROW_IT row_it = block->get_rows ();
1885
+
1886
+ row_it.move_to_first ();
1887
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1888
+ if (row_it.data ()->blob_list ()->empty ())
1889
+ delete row_it.extract (); //nothing in it
1890
+ else
1891
+ fit_parallel_lms (gradient, row_it.data ());
1892
+ }
1893
+ #ifndef GRAPHICS_DISABLED
1894
+ if (testing_on) {
1895
+ colour = ScrollView::RED;
1896
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1897
+ plot_parallel_row (row_it.data (), gradient,
1898
+ block_edge, colour, rotation);
1899
+ colour = (ScrollView::Color) (colour + 1);
1900
+ if (colour > ScrollView::MAGENTA)
1901
+ colour = ScrollView::RED;
1902
+ }
1903
+ }
1904
+ #endif
1905
+ row_it.sort (row_y_order); //may have gone out of order
1906
+ }
1907
+
1908
+
1909
+ /**********************************************************************
1910
+ * fit_parallel_lms
1911
+ *
1912
+ * Fit an LMS line to a row.
1913
+ * Make the fit parallel to the given gradient and set the
1914
+ * row accordingly.
1915
+ **********************************************************************/
1916
+
1917
+ void fit_parallel_lms( //sort function
1918
+ float gradient, //forced gradient
1919
+ TO_ROW *row //row to fit
1920
+ ) {
1921
+ float c; //fitted line
1922
+ int blobcount; //no of blobs
1923
+ TBOX box; //blob box
1924
+ LMS lms (row->blob_list ()->length ());
1925
+ //blobs
1926
+ BLOBNBOX_IT blob_it = row->blob_list ();
1927
+
1928
+ blobcount = 0;
1929
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1930
+ if (!blob_it.data ()->joined_to_prev ()) {
1931
+ box = blob_it.data ()->bounding_box ();
1932
+ lms.
1933
+ add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
1934
+ blobcount++;
1935
+ }
1936
+ }
1937
+ lms.constrained_fit (gradient, c);
1938
+ row->set_parallel_line (gradient, c, lms.error ());
1939
+ if (textord_straight_baselines && blobcount > lms_line_trials) {
1940
+ lms.fit (gradient, c);
1941
+ }
1942
+ //set the other too
1943
+ row->set_line (gradient, c, lms.error ());
1944
+ }
1945
+
1946
+
1947
+ /**********************************************************************
1948
+ * make_spline_rows
1949
+ *
1950
+ * Re-fit the rows in the block to the given gradient.
1951
+ **********************************************************************/
1952
+
1953
+ void make_spline_rows( //find lines
1954
+ TO_BLOCK *block, //block to do
1955
+ float gradient, //gradient to fit
1956
+ FCOORD rotation, //for drawing
1957
+ inT32 block_edge, //edge of block
1958
+ BOOL8 testing_on //correct orientation
1959
+ ) {
1960
+ #ifndef GRAPHICS_DISABLED
1961
+ ScrollView::Color colour; //of row
1962
+ #endif
1963
+ TO_ROW_IT row_it = block->get_rows ();
1964
+
1965
+ row_it.move_to_first ();
1966
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1967
+ if (row_it.data ()->blob_list ()->empty ())
1968
+ delete row_it.extract (); //nothing in it
1969
+ else
1970
+ make_baseline_spline (row_it.data (), block);
1971
+ }
1972
+ if (textord_old_baselines) {
1973
+ #ifndef GRAPHICS_DISABLED
1974
+ if (testing_on) {
1975
+ colour = ScrollView::RED;
1976
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
1977
+ row_it.forward ()) {
1978
+ row_it.data ()->baseline.plot (to_win, colour);
1979
+ colour = (ScrollView::Color) (colour + 1);
1980
+ if (colour > ScrollView::MAGENTA)
1981
+ colour = ScrollView::RED;
1982
+ }
1983
+ }
1984
+ #endif
1985
+ make_old_baselines(block, testing_on);
1986
+ }
1987
+ #ifndef GRAPHICS_DISABLED
1988
+ if (testing_on) {
1989
+ colour = ScrollView::RED;
1990
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1991
+ row_it.data ()->baseline.plot (to_win, colour);
1992
+ colour = (ScrollView::Color) (colour + 1);
1993
+ if (colour > ScrollView::MAGENTA)
1994
+ colour = ScrollView::RED;
1995
+ }
1996
+ }
1997
+ #endif
1998
+ }
1999
+
2000
+
2001
+ /**********************************************************************
2002
+ * make_baseline_spline
2003
+ *
2004
+ * Fit an LMS line to a row.
2005
+ * Make the fit parallel to the given gradient and set the
2006
+ * row accordingly.
2007
+ **********************************************************************/
2008
+
2009
+ void make_baseline_spline( //sort function
2010
+ TO_ROW *row, //row to fit
2011
+ TO_BLOCK *block //block it came from
2012
+ ) {
2013
+ float b, c; //fitted curve
2014
+ float middle; //x middle of blob
2015
+ TBOX box; //blob box
2016
+ LMS lms (row->blob_list ()->length ());
2017
+ //blobs
2018
+ BLOBNBOX_IT blob_it = row->blob_list ();
2019
+ inT32 *xstarts; //spline boundaries
2020
+ double *coeffs; //quadratic coeffs
2021
+ inT32 segments; //no of segments
2022
+ inT32 segment; //current segment
2023
+
2024
+ xstarts =
2025
+ (inT32 *) alloc_mem ((row->blob_list ()->length () + 1) * sizeof (inT32));
2026
+ if (segment_baseline (row, block, segments, xstarts)
2027
+ && !textord_straight_baselines && !textord_parallel_baselines) {
2028
+ if (textord_quadratic_baselines) {
2029
+ coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2030
+ for (segment = 0; segment < segments; segment++) {
2031
+ lms.clear ();
2032
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
2033
+ blob_it.forward ()) {
2034
+ if (!blob_it.data ()->joined_to_prev ()) {
2035
+ box = blob_it.data ()->bounding_box ();
2036
+ middle = (box.left () + box.right ()) / 2.0;
2037
+ if (middle >= xstarts[segment]
2038
+ && middle < xstarts[segment + 1]) {
2039
+ lms.add (FCOORD (middle, box.bottom ()));
2040
+ }
2041
+ }
2042
+ }
2043
+ if (textord_quadratic_baselines)
2044
+ lms.fit_quadratic (block->line_size *
2045
+ textord_spline_outlier_fraction,
2046
+ coeffs[segment * 3], b, c);
2047
+ else {
2048
+ lms.fit (b, c);
2049
+ coeffs[segment * 3] = 0;
2050
+ }
2051
+ coeffs[segment * 3 + 1] = b;
2052
+ coeffs[segment * 3 + 2] = c;
2053
+ }
2054
+ }
2055
+ else
2056
+ coeffs = linear_spline_baseline (row, block, segments, xstarts);
2057
+ }
2058
+ else {
2059
+ xstarts[1] = xstarts[segments];
2060
+ segments = 1;
2061
+ coeffs = (double *) alloc_mem (3 * sizeof (double));
2062
+ coeffs[0] = 0;
2063
+ coeffs[1] = row->line_m ();
2064
+ coeffs[2] = row->line_c ();
2065
+ }
2066
+ row->baseline = QSPLINE (segments, xstarts, coeffs);
2067
+ free_mem(coeffs);
2068
+ free_mem(xstarts);
2069
+ }
2070
+
2071
+
2072
+ /**********************************************************************
2073
+ * segment_baseline
2074
+ *
2075
+ * Divide the baseline up into segments which require a different
2076
+ * quadratic fitted to them.
2077
+ * Return TRUE if enough blobs were far enough away to need a quadratic.
2078
+ **********************************************************************/
2079
+
2080
+ BOOL8
2081
+ segment_baseline ( //split baseline
2082
+ TO_ROW * row, //row to fit
2083
+ TO_BLOCK * block, //block it came from
2084
+ inT32 & segments, //no fo segments
2085
+ inT32 xstarts[] //coords of segments
2086
+ ) {
2087
+ BOOL8 needs_curve; //needs curved line
2088
+ int blobcount; //no of blobs
2089
+ int blobindex; //current blob
2090
+ int last_state; //above, on , below
2091
+ int state; //of current blob
2092
+ float yshift; //from baseline
2093
+ TBOX box; //blob box
2094
+ TBOX new_box; //new_it box
2095
+ float middle; //xcentre of blob
2096
+ //blobs
2097
+ BLOBNBOX_IT blob_it = row->blob_list ();
2098
+ BLOBNBOX_IT new_it = blob_it; //front end
2099
+ SORTED_FLOATS yshifts; //shifts from baseline
2100
+
2101
+ needs_curve = FALSE;
2102
+ box = box_next_pre_chopped (&blob_it);
2103
+ xstarts[0] = box.left ();
2104
+ segments = 1;
2105
+ blobcount = row->blob_list ()->length ();
2106
+ if (textord_oldbl_debug)
2107
+ tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
2108
+ blobcount, box.left (), box.bottom ());
2109
+ if (blobcount <= textord_spline_medianwin
2110
+ || blobcount < textord_spline_minblobs) {
2111
+ blob_it.move_to_last ();
2112
+ box = blob_it.data ()->bounding_box ();
2113
+ xstarts[1] = box.right ();
2114
+ return FALSE;
2115
+ }
2116
+ last_state = 0;
2117
+ new_it.mark_cycle_pt ();
2118
+ for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2119
+ new_box = box_next_pre_chopped (&new_it);
2120
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2121
+ yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2122
+ //record shift
2123
+ yshifts.add (yshift, blobindex);
2124
+ if (new_it.cycled_list ()) {
2125
+ xstarts[1] = new_box.right ();
2126
+ return FALSE;
2127
+ }
2128
+ }
2129
+ for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
2130
+ box = box_next_pre_chopped (&blob_it);
2131
+ do {
2132
+ new_box = box_next_pre_chopped (&new_it);
2133
+ //get middle one
2134
+ yshift = yshifts[textord_spline_medianwin / 2];
2135
+ if (yshift > textord_spline_shift_fraction * block->line_size)
2136
+ state = 1;
2137
+ else if (-yshift > textord_spline_shift_fraction * block->line_size)
2138
+ state = -1;
2139
+ else
2140
+ state = 0;
2141
+ if (state != 0)
2142
+ needs_curve = TRUE;
2143
+ // tprintf("State=%d, prev=%d, shift=%g\n",
2144
+ // state,last_state,yshift);
2145
+ if (state != last_state && blobcount > textord_spline_minblobs) {
2146
+ xstarts[segments++] = box.left ();
2147
+ blobcount = 0;
2148
+ }
2149
+ last_state = state;
2150
+ yshifts.remove (blobindex - textord_spline_medianwin);
2151
+ box = box_next_pre_chopped (&blob_it);
2152
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2153
+ yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2154
+ yshifts.add (yshift, blobindex);
2155
+ blobindex++;
2156
+ blobcount++;
2157
+ }
2158
+ while (!new_it.cycled_list ());
2159
+ if (blobcount > textord_spline_minblobs || segments == 1) {
2160
+ xstarts[segments] = new_box.right ();
2161
+ }
2162
+ else {
2163
+ xstarts[--segments] = new_box.right ();
2164
+ }
2165
+ if (textord_oldbl_debug)
2166
+ tprintf ("Made %d segments on row at (%d,%d)\n",
2167
+ segments, box.right (), box.bottom ());
2168
+ return needs_curve;
2169
+ }
2170
+
2171
+
2172
+ /**********************************************************************
2173
+ * linear_spline_baseline
2174
+ *
2175
+ * Divide the baseline up into segments which require a different
2176
+ * quadratic fitted to them.
2177
+ * Return TRUE if enough blobs were far enough away to need a quadratic.
2178
+ **********************************************************************/
2179
+
2180
+ double *
2181
+ linear_spline_baseline ( //split baseline
2182
+ TO_ROW * row, //row to fit
2183
+ TO_BLOCK * block, //block it came from
2184
+ inT32 & segments, //no fo segments
2185
+ inT32 xstarts[] //coords of segments
2186
+ ) {
2187
+ int blobcount; //no of blobs
2188
+ int blobindex; //current blob
2189
+ int index1, index2; //blob numbers
2190
+ int blobs_per_segment; //blobs in each
2191
+ TBOX box; //blob box
2192
+ TBOX new_box; //new_it box
2193
+ float middle; //xcentre of blob
2194
+ //blobs
2195
+ BLOBNBOX_IT blob_it = row->blob_list ();
2196
+ BLOBNBOX_IT new_it = blob_it; //front end
2197
+ float b, c; //fitted curve
2198
+ LMS lms (row->blob_list ()->length ());
2199
+ double *coeffs; //quadratic coeffs
2200
+ inT32 segment; //current segment
2201
+
2202
+ box = box_next_pre_chopped (&blob_it);
2203
+ xstarts[0] = box.left ();
2204
+ blobcount = 1;
2205
+ while (!blob_it.at_first ()) {
2206
+ blobcount++;
2207
+ box = box_next_pre_chopped (&blob_it);
2208
+ }
2209
+ segments = blobcount / textord_spline_medianwin;
2210
+ if (segments < 1)
2211
+ segments = 1;
2212
+ blobs_per_segment = blobcount / segments;
2213
+ coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2214
+ if (textord_oldbl_debug)
2215
+ tprintf
2216
+ ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2217
+ blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
2218
+ segment = 1;
2219
+ for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2220
+ box_next_pre_chopped(&new_it);
2221
+ index1 = 0;
2222
+ blobindex = index2;
2223
+ do {
2224
+ blobindex += blobs_per_segment;
2225
+ lms.clear ();
2226
+ while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2227
+ box = box_next_pre_chopped (&blob_it);
2228
+ middle = (box.left () + box.right ()) / 2.0;
2229
+ lms.add (FCOORD (middle, box.bottom ()));
2230
+ index1++;
2231
+ if (index1 == blobindex - blobs_per_segment / 2
2232
+ || index1 == blobcount - 1) {
2233
+ xstarts[segment] = box.left ();
2234
+ }
2235
+ }
2236
+ lms.fit (b, c);
2237
+ coeffs[segment * 3 - 3] = 0;
2238
+ coeffs[segment * 3 - 2] = b;
2239
+ coeffs[segment * 3 - 1] = c;
2240
+ segment++;
2241
+ if (segment > segments)
2242
+ break;
2243
+
2244
+ blobindex += blobs_per_segment;
2245
+ lms.clear ();
2246
+ while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2247
+ new_box = box_next_pre_chopped (&new_it);
2248
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2249
+ lms.add (FCOORD (middle, new_box.bottom ()));
2250
+ index2++;
2251
+ if (index2 == blobindex - blobs_per_segment / 2
2252
+ || index2 == blobcount - 1) {
2253
+ xstarts[segment] = new_box.left ();
2254
+ }
2255
+ }
2256
+ lms.fit (b, c);
2257
+ coeffs[segment * 3 - 3] = 0;
2258
+ coeffs[segment * 3 - 2] = b;
2259
+ coeffs[segment * 3 - 1] = c;
2260
+ segment++;
2261
+ }
2262
+ while (segment <= segments);
2263
+ return coeffs;
2264
+ }
2265
+
2266
+
2267
+ /**********************************************************************
2268
+ * assign_blobs_to_rows
2269
+ *
2270
+ * Make enough rows to allocate all the given blobs to one.
2271
+ * If a block skew is given, use that, else attempt to track it.
2272
+ **********************************************************************/
2273
+
2274
+ void assign_blobs_to_rows( //find lines
2275
+ TO_BLOCK *block, //block to do
2276
+ float *gradient, //block skew
2277
+ int pass, //identification
2278
+ BOOL8 reject_misses, //chuck big ones out
2279
+ BOOL8 make_new_rows, //add rows for unmatched
2280
+ BOOL8 drawing_skew //draw smoothed skew
2281
+ ) {
2282
+ OVERLAP_STATE overlap_result; //what to do with it
2283
+ float ycoord; //current y
2284
+ float top, bottom; //of blob
2285
+ float g_length = 1.0f; //from gradient
2286
+ inT16 row_count; //no of rows
2287
+ inT16 left_x; //left edge
2288
+ inT16 last_x; //previous edge
2289
+ float block_skew; //y delta
2290
+ float smooth_factor; //for new coords
2291
+ float near_dist; //dist to nearest row
2292
+ ICOORD testpt; //testing only
2293
+ BLOBNBOX *blob; //current blob
2294
+ TO_ROW *row; //current row
2295
+ TO_ROW *dest_row; //row to put blob in
2296
+ //iterators
2297
+ BLOBNBOX_IT blob_it = &block->blobs;
2298
+ TO_ROW_IT row_it = block->get_rows ();
2299
+
2300
+ ycoord =
2301
+ (block->block->bounding_box ().bottom () +
2302
+ block->block->bounding_box ().top ()) / 2.0f;
2303
+ if (gradient != NULL)
2304
+ g_length = sqrt (1 + *gradient * *gradient);
2305
+ #ifndef GRAPHICS_DISABLED
2306
+ if (drawing_skew)
2307
+ to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
2308
+ #endif
2309
+ testpt = ICOORD (textord_test_x, textord_test_y);
2310
+ blob_it.sort (blob_x_order);
2311
+ smooth_factor = 1.0;
2312
+ block_skew = 0.0f;
2313
+ row_count = row_it.length (); //might have rows
2314
+ if (!blob_it.empty ()) {
2315
+ left_x = blob_it.data ()->bounding_box ().left ();
2316
+ }
2317
+ else {
2318
+ left_x = block->block->bounding_box ().left ();
2319
+ }
2320
+ last_x = left_x;
2321
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2322
+ blob = blob_it.data ();
2323
+ if (gradient != NULL) {
2324
+ block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
2325
+ + *gradient / g_length * blob->bounding_box ().left ();
2326
+ }
2327
+ else if (blob->bounding_box ().left () - last_x > block->line_size / 2
2328
+ && last_x - left_x > block->line_size * 2
2329
+ && textord_interpolating_skew) {
2330
+ // tprintf("Interpolating skew from %g",block_skew);
2331
+ block_skew *= (float) (blob->bounding_box ().left () - left_x)
2332
+ / (last_x - left_x);
2333
+ // tprintf("to %g\n",block_skew);
2334
+ }
2335
+ last_x = blob->bounding_box ().left ();
2336
+ top = blob->bounding_box ().top () - block_skew;
2337
+ bottom = blob->bounding_box ().bottom () - block_skew;
2338
+ #ifndef GRAPHICS_DISABLED
2339
+ if (drawing_skew)
2340
+ to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
2341
+ #endif
2342
+ if (!row_it.empty ()) {
2343
+ for (row_it.move_to_first ();
2344
+ !row_it.at_last () && row_it.data ()->min_y () > top;
2345
+ row_it.forward ());
2346
+ row = row_it.data ();
2347
+ if (row->min_y () <= top && row->max_y () >= bottom) {
2348
+ //any overlap
2349
+ dest_row = row;
2350
+ overlap_result = most_overlapping_row (&row_it, dest_row,
2351
+ top, bottom,
2352
+ block->line_size,
2353
+ blob->bounding_box ().
2354
+ contains (testpt));
2355
+ if (overlap_result == NEW_ROW && !reject_misses)
2356
+ overlap_result = ASSIGN;
2357
+ }
2358
+ else {
2359
+ overlap_result = NEW_ROW;
2360
+ if (!make_new_rows) {
2361
+ near_dist = row_it.data_relative (-1)->min_y () - top;
2362
+ //below bottom
2363
+ if (bottom < row->min_y ()) {
2364
+ if (row->min_y () - bottom <=
2365
+ (block->line_spacing -
2366
+ block->line_size) * textord_merge_desc) {
2367
+ //done it
2368
+ overlap_result = ASSIGN;
2369
+ dest_row = row;
2370
+ }
2371
+ }
2372
+ else if (near_dist > 0
2373
+ && near_dist < bottom - row->max_y ()) {
2374
+ row_it.backward ();
2375
+ dest_row = row_it.data ();
2376
+ if (dest_row->min_y () - bottom <=
2377
+ (block->line_spacing -
2378
+ block->line_size) * textord_merge_desc) {
2379
+ //done it
2380
+ overlap_result = ASSIGN;
2381
+ }
2382
+ }
2383
+ else {
2384
+ if (top - row->max_y () <=
2385
+ (block->line_spacing -
2386
+ block->line_size) * (textord_overlap_x +
2387
+ textord_merge_asc)) {
2388
+ //done it
2389
+ overlap_result = ASSIGN;
2390
+ dest_row = row;
2391
+ }
2392
+ }
2393
+ }
2394
+ }
2395
+ if (overlap_result == ASSIGN)
2396
+ dest_row->add_blob (blob_it.extract (), top, bottom,
2397
+ block->line_size);
2398
+ if (overlap_result == NEW_ROW) {
2399
+ if (make_new_rows && top - bottom < block->max_blob_size) {
2400
+ dest_row =
2401
+ new TO_ROW (blob_it.extract (), top, bottom,
2402
+ block->line_size);
2403
+ row_count++;
2404
+ if (bottom > row_it.data ()->min_y ())
2405
+ row_it.add_before_then_move (dest_row);
2406
+ //insert in right place
2407
+ else
2408
+ row_it.add_after_then_move (dest_row);
2409
+ smooth_factor =
2410
+ 1.0 / (row_count * textord_skew_lag +
2411
+ textord_skewsmooth_offset);
2412
+ }
2413
+ else
2414
+ overlap_result = REJECT;
2415
+ }
2416
+ }
2417
+ else if (make_new_rows && top - bottom < block->max_blob_size) {
2418
+ overlap_result = NEW_ROW;
2419
+ dest_row =
2420
+ new TO_ROW (blob_it.extract (), top, bottom, block->line_size);
2421
+ row_count++;
2422
+ row_it.add_after_then_move (dest_row);
2423
+ smooth_factor = 1.0 / (row_count * textord_skew_lag +
2424
+ textord_skewsmooth_offset2);
2425
+ }
2426
+ else
2427
+ overlap_result = REJECT;
2428
+ if (blob->bounding_box ().contains (testpt)) {
2429
+ if (overlap_result != REJECT) {
2430
+ tprintf ("Test blob assigned to row at (%g,%g) on pass %d\n",
2431
+ dest_row->min_y (), dest_row->max_y (), pass);
2432
+ }
2433
+ else {
2434
+ tprintf ("Test blob assigned to no row on pass %d\n", pass);
2435
+ }
2436
+ }
2437
+ if (overlap_result != REJECT) {
2438
+ while (!row_it.at_first ()
2439
+ && row_it.data ()->min_y () >
2440
+ row_it.data_relative (-1)->min_y ()) {
2441
+ row = row_it.extract ();
2442
+ row_it.backward ();
2443
+ row_it.add_before_then_move (row);
2444
+ }
2445
+ while (!row_it.at_last ()
2446
+ && row_it.data ()->min_y () <
2447
+ row_it.data_relative (1)->min_y ()) {
2448
+ row = row_it.extract ();
2449
+ row_it.forward ();
2450
+ //keep rows in order
2451
+ row_it.add_after_then_move (row);
2452
+ }
2453
+ block_skew = (1 - smooth_factor) * block_skew
2454
+ + smooth_factor * (blob->bounding_box ().bottom () -
2455
+ dest_row->initial_min_y ());
2456
+ }
2457
+ }
2458
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2459
+ if (row_it.data ()->blob_list ()->empty ())
2460
+ delete row_it.extract (); //discard empty rows
2461
+ }
2462
+ }
2463
+
2464
+
2465
+ /**********************************************************************
2466
+ * most_overlapping_row
2467
+ *
2468
+ * Return the row which most overlaps the blob.
2469
+ **********************************************************************/
2470
+
2471
+ OVERLAP_STATE most_overlapping_row( //find best row
2472
+ TO_ROW_IT *row_it, //iterator
2473
+ TO_ROW *&best_row, //output row
2474
+ float top, //top of blob
2475
+ float bottom, //bottom of blob
2476
+ float rowsize, //max row size
2477
+ BOOL8 testing_blob //test stuff
2478
+ ) {
2479
+ OVERLAP_STATE result; //result of tests
2480
+ float overlap; //of blob & row
2481
+ float bestover; //nearest row
2482
+ float merge_top, merge_bottom; //size of merged row
2483
+ ICOORD testpt; //testing only
2484
+ TO_ROW *row; //current row
2485
+ TO_ROW *test_row; //for multiple overlaps
2486
+ BLOBNBOX_IT blob_it; //for merging rows
2487
+
2488
+ result = ASSIGN;
2489
+ row = row_it->data ();
2490
+ bestover = top - bottom;
2491
+ if (top > row->max_y ())
2492
+ bestover -= top - row->max_y ();
2493
+ if (bottom < row->min_y ())
2494
+ //compute overlap
2495
+ bestover -= row->min_y () - bottom;
2496
+ if (testing_blob) {
2497
+ tprintf ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f\n",
2498
+ bottom, top, row->min_y (), row->max_y (), bestover);
2499
+ }
2500
+ test_row = row;
2501
+ do {
2502
+ if (!row_it->at_last ()) {
2503
+ row_it->forward ();
2504
+ test_row = row_it->data ();
2505
+ if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
2506
+ merge_top =
2507
+ test_row->max_y () >
2508
+ row->max_y ()? test_row->max_y () : row->max_y ();
2509
+ merge_bottom =
2510
+ test_row->min_y () <
2511
+ row->min_y ()? test_row->min_y () : row->min_y ();
2512
+ if (merge_top - merge_bottom <= rowsize) {
2513
+ if (testing_blob) {
2514
+ tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
2515
+ row->min_y (), row->max_y (),
2516
+ test_row->min_y (), test_row->max_y ());
2517
+ }
2518
+ test_row->set_limits (merge_bottom, merge_top);
2519
+ blob_it.set_to_list (test_row->blob_list ());
2520
+ blob_it.add_list_after (row->blob_list ());
2521
+ blob_it.sort (blob_x_order);
2522
+ row_it->backward ();
2523
+ delete row_it->extract ();
2524
+ row_it->forward ();
2525
+ bestover = -1.0f; //force replacement
2526
+ }
2527
+ overlap = top - bottom;
2528
+ if (top > test_row->max_y ())
2529
+ overlap -= top - test_row->max_y ();
2530
+ if (bottom < test_row->min_y ())
2531
+ overlap -= test_row->min_y () - bottom;
2532
+ if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2533
+ result = REJECT;
2534
+ }
2535
+ if (overlap > bestover) {
2536
+ bestover = overlap; //find biggest overlap
2537
+ row = test_row;
2538
+ }
2539
+ if (testing_blob) {
2540
+ tprintf
2541
+ ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f->%f\n",
2542
+ bottom, top, test_row->min_y (), test_row->max_y (),
2543
+ overlap, bestover);
2544
+ }
2545
+ }
2546
+ }
2547
+ }
2548
+ while (!row_it->at_last ()
2549
+ && test_row->min_y () <= top && test_row->max_y () >= bottom);
2550
+ while (row_it->data () != row)
2551
+ row_it->backward (); //make it point to row
2552
+ //doesn't overlap much
2553
+ if (top - bottom - bestover > rowsize * textord_overlap_x &&
2554
+ (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
2555
+ && result == ASSIGN)
2556
+ result = NEW_ROW; //doesn't overlap enough
2557
+ best_row = row;
2558
+ return result;
2559
+ }
2560
+
2561
+
2562
+ /**********************************************************************
2563
+ * blob_x_order
2564
+ *
2565
+ * Sort function to sort blobs in x from page left.
2566
+ **********************************************************************/
2567
+
2568
+ int blob_x_order( //sort function
2569
+ const void *item1, //items to compare
2570
+ const void *item2) {
2571
+ //converted ptr
2572
+ BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
2573
+ //converted ptr
2574
+ BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
2575
+
2576
+ if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
2577
+ return -1;
2578
+ else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
2579
+ return 1;
2580
+ else
2581
+ return 0;
2582
+ }
2583
+
2584
+
2585
+ /**********************************************************************
2586
+ * row_y_order
2587
+ *
2588
+ * Sort function to sort rows in y from page top.
2589
+ **********************************************************************/
2590
+
2591
+ int row_y_order( //sort function
2592
+ const void *item1, //items to compare
2593
+ const void *item2) {
2594
+ //converted ptr
2595
+ TO_ROW *row1 = *(TO_ROW **) item1;
2596
+ //converted ptr
2597
+ TO_ROW *row2 = *(TO_ROW **) item2;
2598
+
2599
+ if (row1->parallel_c () > row2->parallel_c ())
2600
+ return -1;
2601
+ else if (row1->parallel_c () < row2->parallel_c ())
2602
+ return 1;
2603
+ else
2604
+ return 0;
2605
+ }
2606
+
2607
+
2608
+ /**********************************************************************
2609
+ * row_spacing_order
2610
+ *
2611
+ * Qsort style function to compare 2 TO_ROWS based on their spacing value.
2612
+ **********************************************************************/
2613
+
2614
+ int row_spacing_order( //sort function
2615
+ const void *item1, //items to compare
2616
+ const void *item2) {
2617
+ //converted ptr
2618
+ TO_ROW *row1 = *(TO_ROW **) item1;
2619
+ //converted ptr
2620
+ TO_ROW *row2 = *(TO_ROW **) item2;
2621
+
2622
+ if (row1->spacing < row2->spacing)
2623
+ return -1;
2624
+ else if (row1->spacing > row2->spacing)
2625
+ return 1;
2626
+ else
2627
+ return 0;
2628
+ }