tesseract_bin 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (612):
  1. data/.document +5 -0
  2. data/Gemfile +14 -0
  3. data/Gemfile.lock +23 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +46 -0
  7. data/VERSION +1 -0
  8. data/ext/tesseract_bin/extconf.rb +17 -0
  9. data/lib/tesseract_bin.rb +12 -0
  10. data/tesseract_bin.gemspec +660 -0
  11. data/test/helper.rb +18 -0
  12. data/test/test_tesseract_bin.rb +7 -0
  13. data/vendor/tesseract-2.04/AUTHORS +8 -0
  14. data/vendor/tesseract-2.04/COPYING +23 -0
  15. data/vendor/tesseract-2.04/ChangeLog +71 -0
  16. data/vendor/tesseract-2.04/INSTALL +229 -0
  17. data/vendor/tesseract-2.04/Makefile.am +20 -0
  18. data/vendor/tesseract-2.04/Makefile.in +641 -0
  19. data/vendor/tesseract-2.04/NEWS +1 -0
  20. data/vendor/tesseract-2.04/README +138 -0
  21. data/vendor/tesseract-2.04/ReleaseNotes +213 -0
  22. data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
  23. data/vendor/tesseract-2.04/StdAfx.h +24 -0
  24. data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
  25. data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
  26. data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
  27. data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
  28. data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
  29. data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
  30. data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
  31. data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
  32. data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
  33. data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
  34. data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
  35. data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
  36. data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
  37. data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
  38. data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
  39. data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
  40. data/vendor/tesseract-2.04/ccmain/control.h +198 -0
  41. data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
  42. data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
  43. data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
  44. data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
  45. data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
  46. data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
  47. data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
  48. data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
  49. data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
  50. data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
  51. data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
  52. data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
  53. data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
  54. data/vendor/tesseract-2.04/ccmain/output.h +116 -0
  55. data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
  56. data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
  57. data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
  58. data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
  59. data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
  60. data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
  61. data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
  62. data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
  63. data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
  64. data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
  65. data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
  66. data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
  67. data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
  68. data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
  69. data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
  70. data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
  71. data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
  72. data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
  73. data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
  74. data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
  75. data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
  76. data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
  77. data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
  78. data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
  79. data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
  80. data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
  81. data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
  82. data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
  83. data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
  84. data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
  85. data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
  86. data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
  87. data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
  88. data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
  89. data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
  90. data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
  91. data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
  92. data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
  93. data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
  94. data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
  95. data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
  96. data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
  97. data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
  98. data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
  99. data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
  100. data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
  101. data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
  102. data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
  103. data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
  104. data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
  105. data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
  106. data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
  107. data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
  108. data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
  109. data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
  110. data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
  111. data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
  112. data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
  113. data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
  114. data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
  115. data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
  116. data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
  117. data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
  118. data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
  119. data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
  120. data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
  121. data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
  122. data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
  123. data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
  124. data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
  125. data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
  126. data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
  127. data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
  128. data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
  129. data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
  130. data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
  131. data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
  132. data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
  133. data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
  134. data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
  135. data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
  136. data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
  137. data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
  138. data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
  139. data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
  140. data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
  141. data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
  142. data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
  143. data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
  144. data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
  145. data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
  146. data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
  147. data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
  148. data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
  149. data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
  150. data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
  151. data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
  152. data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
  153. data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
  154. data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
  155. data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
  156. data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
  157. data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
  158. data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
  159. data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
  160. data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
  161. data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
  162. data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
  163. data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
  164. data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
  165. data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
  166. data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
  167. data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
  168. data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
  169. data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
  170. data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
  171. data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
  172. data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
  173. data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
  174. data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
  175. data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
  176. data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
  177. data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
  178. data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
  179. data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
  180. data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
  181. data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
  182. data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
  183. data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
  184. data/vendor/tesseract-2.04/ccutil/host.h +180 -0
  185. data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
  186. data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
  187. data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
  188. data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
  189. data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
  190. data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
  191. data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
  192. data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
  193. data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
  194. data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
  195. data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
  196. data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
  197. data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
  198. data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
  199. data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
  200. data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
  201. data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
  202. data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
  203. data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
  204. data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
  205. data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
  206. data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
  207. data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
  208. data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
  209. data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
  210. data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
  211. data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
  212. data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
  213. data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
  214. data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
  215. data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
  216. data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
  217. data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
  218. data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
  219. data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
  220. data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
  221. data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
  222. data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
  223. data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
  224. data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
  225. data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
  226. data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
  227. data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
  228. data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
  229. data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
  230. data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
  231. data/vendor/tesseract-2.04/classify/baseline.h +91 -0
  232. data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
  233. data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
  234. data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
  235. data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
  236. data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
  237. data/vendor/tesseract-2.04/classify/cluster.h +158 -0
  238. data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
  239. data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
  240. data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
  241. data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
  242. data/vendor/tesseract-2.04/classify/extern.h +32 -0
  243. data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
  244. data/vendor/tesseract-2.04/classify/extract.h +36 -0
  245. data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
  246. data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
  247. data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
  248. data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
  249. data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
  250. data/vendor/tesseract-2.04/classify/float2int.h +65 -0
  251. data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
  252. data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
  253. data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
  254. data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
  255. data/vendor/tesseract-2.04/classify/fxid.h +69 -0
  256. data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
  257. data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
  258. data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
  259. data/vendor/tesseract-2.04/classify/intfx.h +63 -0
  260. data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
  261. data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
  262. data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
  263. data/vendor/tesseract-2.04/classify/intproto.h +320 -0
  264. data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
  265. data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
  266. data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
  267. data/vendor/tesseract-2.04/classify/mf.h +43 -0
  268. data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
  269. data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
  270. data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
  271. data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
  272. data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
  273. data/vendor/tesseract-2.04/classify/mfx.h +52 -0
  274. data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
  275. data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
  276. data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
  277. data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
  278. data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
  279. data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
  280. data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
  281. data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
  282. data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
  283. data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
  284. data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
  285. data/vendor/tesseract-2.04/classify/protos.h +258 -0
  286. data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
  287. data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
  288. data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
  289. data/vendor/tesseract-2.04/classify/speckle.h +69 -0
  290. data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
  291. data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
  292. data/vendor/tesseract-2.04/config/config.guess +1466 -0
  293. data/vendor/tesseract-2.04/config/config.h.in +188 -0
  294. data/vendor/tesseract-2.04/config/config.sub +1579 -0
  295. data/vendor/tesseract-2.04/config/depcomp +530 -0
  296. data/vendor/tesseract-2.04/config/install-sh +269 -0
  297. data/vendor/tesseract-2.04/config/missing +198 -0
  298. data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
  299. data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
  300. data/vendor/tesseract-2.04/configure +10424 -0
  301. data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
  302. data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
  303. data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
  304. data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
  305. data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
  306. data/vendor/tesseract-2.04/cutil/const.h +108 -0
  307. data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
  308. data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
  309. data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
  310. data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
  311. data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
  312. data/vendor/tesseract-2.04/cutil/debug.h +348 -0
  313. data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
  314. data/vendor/tesseract-2.04/cutil/efio.h +32 -0
  315. data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
  316. data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
  317. data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
  318. data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
  319. data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
  320. data/vendor/tesseract-2.04/cutil/general.h +33 -0
  321. data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
  322. data/vendor/tesseract-2.04/cutil/globals.h +70 -0
  323. data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
  324. data/vendor/tesseract-2.04/cutil/listio.h +43 -0
  325. data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
  326. data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
  327. data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
  328. data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
  329. data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
  330. data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
  331. data/vendor/tesseract-2.04/cutil/structures.h +112 -0
  332. data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
  333. data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
  334. data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
  335. data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
  336. data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
  337. data/vendor/tesseract-2.04/cutil/variables.h +170 -0
  338. data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
  339. data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
  340. data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
  341. data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
  342. data/vendor/tesseract-2.04/dict/choices.h +241 -0
  343. data/vendor/tesseract-2.04/dict/context.cpp +270 -0
  344. data/vendor/tesseract-2.04/dict/context.h +82 -0
  345. data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
  346. data/vendor/tesseract-2.04/dict/dawg.h +394 -0
  347. data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
  348. data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
  349. data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
  350. data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
  351. data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
  352. data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
  353. data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
  354. data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
  355. data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
  356. data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
  357. data/vendor/tesseract-2.04/dict/permngram.h +33 -0
  358. data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
  359. data/vendor/tesseract-2.04/dict/permnum.h +83 -0
  360. data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
  361. data/vendor/tesseract-2.04/dict/permute.h +93 -0
  362. data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
  363. data/vendor/tesseract-2.04/dict/reduce.h +112 -0
  364. data/vendor/tesseract-2.04/dict/states.cpp +382 -0
  365. data/vendor/tesseract-2.04/dict/states.h +111 -0
  366. data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
  367. data/vendor/tesseract-2.04/dict/stopper.h +103 -0
  368. data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
  369. data/vendor/tesseract-2.04/dict/trie.h +190 -0
  370. data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
  371. data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
  372. data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
  373. data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
  374. data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
  375. data/vendor/tesseract-2.04/eurotext.tif +0 -0
  376. data/vendor/tesseract-2.04/image/Makefile.am +10 -0
  377. data/vendor/tesseract-2.04/image/Makefile.in +596 -0
  378. data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
  379. data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
  380. data/vendor/tesseract-2.04/image/img.h +336 -0
  381. data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
  382. data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
  383. data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
  384. data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
  385. data/vendor/tesseract-2.04/image/imgio.h +22 -0
  386. data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
  387. data/vendor/tesseract-2.04/image/imgs.h +102 -0
  388. data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
  389. data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
  390. data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
  391. data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
  392. data/vendor/tesseract-2.04/image/svshowim.h +25 -0
  393. data/vendor/tesseract-2.04/java/Makefile.am +4 -0
  394. data/vendor/tesseract-2.04/java/Makefile.in +473 -0
  395. data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
  396. data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
  397. data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
  398. data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
  399. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
  400. data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
  401. data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
  402. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
  403. data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
  404. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
  405. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
  406. data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
  407. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
  408. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
  409. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
  410. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
  411. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
  412. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
  413. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
  414. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
  415. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
  416. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
  417. data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
  418. data/vendor/tesseract-2.04/java/makefile +55 -0
  419. data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
  420. data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
  421. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
  422. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
  423. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
  424. data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
  425. data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
  426. data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
  427. data/vendor/tesseract-2.04/phototest.tif +0 -0
  428. data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
  429. data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
  430. data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
  431. data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
  432. data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
  433. data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
  434. data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
  435. data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
  436. data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
  437. data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
  438. data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
  439. data/vendor/tesseract-2.04/tessdata/confsets +3 -0
  440. data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
  441. data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
  442. data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
  443. data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
  444. data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
  445. data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
  446. data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
  447. data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
  448. data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
  449. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
  450. data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
  451. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
  452. data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
  453. data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
  454. data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
  455. data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
  456. data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
  457. data/vendor/tesseract-2.04/tessdll.cpp +351 -0
  458. data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
  459. data/vendor/tesseract-2.04/tessdll.h +143 -0
  460. data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
  461. data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
  462. data/vendor/tesseract-2.04/tesseract.dsw +116 -0
  463. data/vendor/tesseract-2.04/tesseract.sln +59 -0
  464. data/vendor/tesseract-2.04/tesseract.spec +188 -0
  465. data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
  466. data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
  467. data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
  468. data/vendor/tesseract-2.04/testing/README +43 -0
  469. data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
  470. data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
  471. data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
  472. data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
  473. data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
  474. data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
  475. data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
  476. data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
  477. data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
  478. data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
  479. data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
  480. data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
  481. data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
  482. data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
  483. data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
  484. data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
  485. data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
  486. data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
  487. data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
  488. data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
  489. data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
  490. data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
  491. data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
  492. data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
  493. data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
  494. data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
  495. data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
  496. data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
  497. data/vendor/tesseract-2.04/textord/makerow.h +295 -0
  498. data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
  499. data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
  500. data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
  501. data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
  502. data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
  503. data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
  504. data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
  505. data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
  506. data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
  507. data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
  508. data/vendor/tesseract-2.04/textord/tessout.h +76 -0
  509. data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
  510. data/vendor/tesseract-2.04/textord/topitch.h +195 -0
  511. data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
  512. data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
  513. data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
  514. data/vendor/tesseract-2.04/textord/tospace.h +193 -0
  515. data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
  516. data/vendor/tesseract-2.04/textord/tovars.h +94 -0
  517. data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
  518. data/vendor/tesseract-2.04/textord/underlin.h +53 -0
  519. data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
  520. data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
  521. data/vendor/tesseract-2.04/training/Makefile.am +54 -0
  522. data/vendor/tesseract-2.04/training/Makefile.in +720 -0
  523. data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
  524. data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
  525. data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
  526. data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
  527. data/vendor/tesseract-2.04/training/mergenf.h +106 -0
  528. data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
  529. data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
  530. data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
  531. data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
  532. data/vendor/tesseract-2.04/training/name2char.h +38 -0
  533. data/vendor/tesseract-2.04/training/training.cpp +190 -0
  534. data/vendor/tesseract-2.04/training/training.h +130 -0
  535. data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
  536. data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
  537. data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
  538. data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
  539. data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
  540. data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
  541. data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
  542. data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
  543. data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
  544. data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
  545. data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
  546. data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
  547. data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
  548. data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
  549. data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
  550. data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
  551. data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
  552. data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
  553. data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
  554. data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
  555. data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
  556. data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
  557. data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
  558. data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
  559. data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
  560. data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
  561. data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
  562. data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
  563. data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
  564. data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
  565. data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
  566. data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
  567. data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
  568. data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
  569. data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
  570. data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
  571. data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
  572. data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
  573. data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
  574. data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
  575. data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
  576. data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
  577. data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
  578. data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
  579. data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
  580. data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
  581. data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
  582. data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
  583. data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
  584. data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
  585. data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
  586. data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
  587. data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
  588. data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
  589. data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
  590. data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
  591. data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
  592. data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
  593. data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
  594. data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
  595. data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
  596. data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
  597. data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
  598. data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
  599. data/vendor/tesseract-2.04/wordrec/render.h +58 -0
  600. data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
  601. data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
  602. data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
  603. data/vendor/tesseract-2.04/wordrec/split.h +115 -0
  604. data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
  605. data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
  606. data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
  607. data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
  608. data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
  609. data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
  610. data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
  611. data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
  612. metadata +708 -0
@@ -0,0 +1,2628 @@
1
+ /**********************************************************************
2
+ * File: makerow.cpp (Formerly makerows.c)
3
+ * Description: Code to arrange blobs into rows of text.
4
+ * Author: Ray Smith
5
+ * Created: Mon Sep 21 14:34:48 BST 1992
6
+ *
7
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
8
+ ** Licensed under the Apache License, Version 2.0 (the "License");
9
+ ** you may not use this file except in compliance with the License.
10
+ ** You may obtain a copy of the License at
11
+ ** http://www.apache.org/licenses/LICENSE-2.0
12
+ ** Unless required by applicable law or agreed to in writing, software
13
+ ** distributed under the License is distributed on an "AS IS" BASIS,
14
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ ** See the License for the specific language governing permissions and
16
+ ** limitations under the License.
17
+ *
18
+ **********************************************************************/
19
+
20
+ #include "mfcpch.h"
21
+ #ifdef __UNIX__
22
+ #include <assert.h>
23
+ #endif
24
+ #include "stderr.h"
25
+ #include "blobbox.h"
26
+ #include "lmedsq.h"
27
+ #include "statistc.h"
28
+ #include "drawtord.h"
29
+ #include "blkocc.h"
30
+ #include "sortflts.h"
31
+ #include "oldbasel.h"
32
+ #include "tordmain.h"
33
+ #include "underlin.h"
34
+ #include "makerow.h"
35
+ #include "tprintf.h"
36
+
37
+ #define EXTERN
38
+
39
+ EXTERN BOOL_VAR (textord_heavy_nr, FALSE, "Vigorously remove noise");
40
+ EXTERN BOOL_VAR (textord_show_initial_rows, FALSE,
41
+ "Display row accumulation");
42
+ EXTERN BOOL_VAR (textord_show_parallel_rows, FALSE,
43
+ "Display page correlated rows");
44
+ EXTERN BOOL_VAR (textord_show_expanded_rows, FALSE,
45
+ "Display rows after expanding");
46
+ EXTERN BOOL_VAR (textord_show_final_rows, FALSE,
47
+ "Display rows after final fittin");
48
+ EXTERN BOOL_VAR (textord_show_final_blobs, FALSE,
49
+ "Display blob bounds after pre-ass");
50
+ EXTERN BOOL_VAR (textord_test_landscape, FALSE, "Tests refer to land/port");
51
+ EXTERN BOOL_VAR (textord_parallel_baselines, TRUE,
52
+ "Force parallel baselines");
53
+ EXTERN BOOL_VAR (textord_straight_baselines, FALSE,
54
+ "Force straight baselines");
55
+ EXTERN BOOL_VAR (textord_quadratic_baselines, FALSE, "Use quadratic splines");
56
+ EXTERN BOOL_VAR (textord_old_baselines, TRUE, "Use old baseline algorithm");
57
+ EXTERN BOOL_VAR (textord_old_xheight, TRUE, "Use old xheight algorithm");
58
+ EXTERN BOOL_VAR (textord_fix_xheight_bug, TRUE, "Use spline baseline");
59
+ EXTERN BOOL_VAR (textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
60
+ EXTERN BOOL_VAR (textord_row_xheights, FALSE, "Use row height policy");
61
+ EXTERN BOOL_VAR (textord_block_xheights, TRUE, "Use block height policy");
62
+ EXTERN BOOL_VAR (textord_xheight_tweak, FALSE, "New min condition on height");
63
+ EXTERN BOOL_VAR (textord_cblob_blockocc, TRUE,
64
+ "Use new projection for underlines");
65
+ EXTERN BOOL_VAR (textord_debug_xheights, FALSE, "Test xheight algorithms");
66
+ EXTERN BOOL_VAR (textord_biased_skewcalc, TRUE,
67
+ "Bias skew estimates with line length");
68
+ EXTERN BOOL_VAR (textord_interpolating_skew, TRUE, "Interpolate across gaps");
69
+ EXTERN INT_VAR (textord_skewsmooth_offset, 2, "For smooth factor");
70
+ EXTERN INT_VAR (textord_skewsmooth_offset2, 1, "For smooth factor");
71
+ EXTERN INT_VAR (textord_test_x, -1, "coord of test pt");
72
+ EXTERN INT_VAR (textord_test_y, -1, "coord of test pt");
73
+ EXTERN INT_VAR (textord_min_blobs_in_row, 4,
74
+ "Min blobs before gradient counted");
75
+ EXTERN INT_VAR (textord_spline_minblobs, 8,
76
+ "Min blobs in each spline segment");
77
+ EXTERN INT_VAR (textord_spline_medianwin, 6,
78
+ "Size of window for spline segmentation");
79
+ EXTERN INT_VAR (textord_min_xheight, 10, "Min credible pixel xheight");
80
+ EXTERN double_VAR (textord_spline_shift_fraction, 0.02,
81
+ "Fraction of line spacing for quad");
82
+ EXTERN double_VAR (textord_spline_outlier_fraction, 0.1,
83
+ "Fraction of line spacing for outlier");
84
+ EXTERN double_VAR (textord_skew_ile, 0.5, "Ile of gradients for page skew");
85
+ EXTERN double_VAR (textord_skew_lag, 0.01,
86
+ "Lag for skew on row accumulation");
87
+ EXTERN double_VAR (textord_linespace_iqrlimit, 0.2,
88
+ "Max iqr/median for linespace");
89
+ EXTERN double_VAR (textord_width_limit, 8, "Max width of blobs to make rows");
90
+ EXTERN double_VAR (textord_chop_width, 1.5, "Max width before chopping");
91
+ EXTERN double_VAR (textord_expansion_factor, 1.0,
92
+ "Factor to expand rows by in expand_rows");
93
+ EXTERN double_VAR (textord_overlap_x, 0.5,
94
+ "Fraction of linespace for good overlap");
95
+ EXTERN double_VAR (textord_merge_desc, 0.25,
96
+ "Fraction of linespace for desc drop");
97
+ EXTERN double_VAR (textord_merge_x, 0.5,
98
+ "Fraction of linespace for x height");
99
+ EXTERN double_VAR (textord_merge_asc, 0.25,
100
+ "Fraction of linespace for asc height");
101
+ EXTERN double_VAR (textord_minxh, 0.25,
102
+ "fraction of linesize for min xheight");
103
+ EXTERN double_VAR (textord_min_linesize, 1.25,
104
+ "* blob height for initial linesize");
105
+ EXTERN double_VAR (textord_excess_blobsize, 1.3,
106
+ "New row made if blob makes row this big");
107
+ EXTERN double_VAR (textord_occupancy_threshold, 0.4,
108
+ "Fraction of neighbourhood");
109
+ EXTERN double_VAR (textord_underline_width, 2.0,
110
+ "Multiple of line_size for underline");
111
+ EXTERN double_VAR (textord_xheight_mode_fraction, 0.4,
112
+ "Min pile height to make xheight");
113
+ EXTERN double_VAR (textord_ascheight_mode_fraction, 0.15,
114
+ "Min pile height to make ascheight");
115
+ EXTERN double_VAR (textord_ascx_ratio_min, 1.2, "Min cap/xheight");
116
+ EXTERN double_VAR (textord_ascx_ratio_max, 1.7, "Max cap/xheight");
117
+ EXTERN double_VAR (textord_descx_ratio_min, 0.15, "Min desc/xheight");
118
+ EXTERN double_VAR (textord_descx_ratio_max, 0.6, "Max desc/xheight");
119
+ EXTERN double_VAR (textord_xheight_error_margin, 0.1, "Accepted variation");
120
+
121
+ #define MAX_HEIGHT_MODES 12
122
+
123
+ /**********************************************************************
124
+ * make_rows
125
+ *
126
+ * Arrange the blobs into rows.
127
+ **********************************************************************/
128
+
129
+ float make_rows( //make rows
130
+ ICOORD page_tr, //top right
131
+ BLOCK_LIST *blocks, //block list
132
+ TO_BLOCK_LIST *land_blocks, //rotated for landscape
133
+ TO_BLOCK_LIST *port_blocks //output list
134
+ ) {
135
+ float port_m; //global skew
136
+ float port_err; //global noise
137
+ // float land_m; //global skew
138
+ // float land_err; //global noise
139
+ TO_BLOCK_IT block_it; //iterator
140
+
141
+ //don't do landscape for now
142
+ // block_it.set_to_list(land_blocks);
143
+ // for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
144
+ // make_initial_textrows(page_tr,block_it.data(),FCOORD(0,-1),
145
+ // (BOOL8)textord_test_landscape);
146
+ block_it.set_to_list (port_blocks);
147
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
148
+ block_it.forward ())
149
+ make_initial_textrows (page_tr, block_it.data (), FCOORD (1.0f, 0.0f),
150
+ !(BOOL8) textord_test_landscape);
151
+ //compute globally
152
+ compute_page_skew(port_blocks, port_m, port_err);
153
+ // compute_page_skew(land_blocks,land_m,land_err); //compute globally
154
+ // tprintf("Portrait skew gradient=%g, error=%g.\n",
155
+ // port_m,port_err);
156
+ // tprintf("Landscape skew gradient=%g, error=%g.\n",
157
+ // land_m,land_err);
158
+ block_it.set_to_list (port_blocks);
159
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
160
+ block_it.forward ()) {
161
+ cleanup_rows (page_tr, block_it.data (), port_m, FCOORD (1.0f, 0.0f),
162
+ block_it.data ()->block->bounding_box ().left (),
163
+ !(BOOL8) textord_test_landscape);
164
+ }
165
+ block_it.set_to_list (land_blocks);
166
+ // for (block_it.mark_cycle_pt();!block_it.cycled_list();block_it.forward())
167
+ // {
168
+ // cleanup_rows(page_tr,block_it.data(),land_m,FCOORD(0,-1),
169
+ // -block_it.data()->block->bounding_box().top(),
170
+ // (BOOL8)textord_test_landscape);
171
+ // }
172
+ return port_m; //global skew
173
+ }
174
+
175
+
176
+ /**********************************************************************
177
+ * make_initial_textrows
178
+ *
179
+ * Arrange the good blobs into rows of text.
180
+ **********************************************************************/
181
+
182
+ void make_initial_textrows( //find lines
183
+ ICOORD page_tr,
184
+ TO_BLOCK *block, //block to do
185
+ FCOORD rotation, //for drawing
186
+ BOOL8 testing_on //correct orientation
187
+ ) {
188
+ TO_ROW_IT row_it = block->get_rows ();
189
+
190
+ #ifndef GRAPHICS_DISABLED
191
+ ScrollView::Color colour; //of row
192
+
193
+ if (textord_show_initial_rows && testing_on) {
194
+ if (to_win == NULL)
195
+ create_to_win(page_tr);
196
+ }
197
+ #endif
198
+ //guess skew
199
+ assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
200
+ row_it.move_to_first ();
201
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
202
+ fit_lms_line (row_it.data ());
203
+ #ifndef GRAPHICS_DISABLED
204
+ if (textord_show_initial_rows && testing_on) {
205
+ colour = ScrollView::RED;
206
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
207
+ plot_to_row (row_it.data (), colour, rotation);
208
+ colour = (ScrollView::Color) (colour + 1);
209
+ if (colour > ScrollView::MAGENTA)
210
+ colour = ScrollView::RED;
211
+ }
212
+ }
213
+ #endif
214
+ }
215
+
216
+
217
+ /**********************************************************************
218
+ * fit_lms_line
219
+ *
220
+ * Fit an LMS line to a row.
221
+ **********************************************************************/
222
+
223
+ void fit_lms_line( //sort function
224
+ TO_ROW *row //row to fit
225
+ ) {
226
+ float m, c; //fitted line
227
+ TBOX box; //blob box
228
+ LMS lms (row->blob_list ()->length ());
229
+ //blobs
230
+ BLOBNBOX_IT blob_it = row->blob_list ();
231
+
232
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
233
+ box = blob_it.data ()->bounding_box ();
234
+ lms.add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
235
+ }
236
+ lms.fit (m, c);
237
+ row->set_line (m, c, lms.error ());
238
+ }
239
+
240
+
241
+ /**********************************************************************
242
+ * compute_page_skew
243
+ *
244
+ * Compute the skew over a full page by picking the textord_skew_ile
244
+ * item (0.5 by default) of the row gradients. The page error is picked the same way from the row errors, independently of the gradient.
246
+ **********************************************************************/
247
+
248
+ void compute_page_skew( //get average gradient
249
+ TO_BLOCK_LIST *blocks, //list of blocks
250
+ float &page_m, //average gradient
251
+ float &page_err //average error
252
+ ) {
253
+ inT32 row_count; //total rows
254
+ inT32 blob_count; //total_blobs
255
+ inT32 row_err; //integer error
256
+ float *gradients; //of rows
257
+ float *errors; //of rows
258
+ inT32 row_index; //of total
259
+ TO_ROW *row; //current row
260
+ TO_BLOCK_IT block_it = blocks; //iterator
261
+ TO_ROW_IT row_it;
262
+
263
+ row_count = 0;
264
+ blob_count = 0;
265
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
266
+ block_it.forward ()) {
267
+ row_count += block_it.data ()->get_rows ()->length ();
268
+ //count up rows
269
+ row_it.set_to_list (block_it.data ()->get_rows ());
270
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
271
+ blob_count += row_it.data ()->blob_list ()->length ();
272
+ }
273
+ if (row_count == 0) {
274
+ page_m = 0.0f;
275
+ page_err = 0.0f;
276
+ return;
277
+ }
278
+ gradients = (float *) alloc_mem (blob_count * sizeof (float));
279
+ //get mem
280
+ errors = (float *) alloc_mem (blob_count * sizeof (float));
281
+ if (gradients == NULL || errors == NULL)
282
+ MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
283
+
284
+ row_index = 0;
285
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
286
+ block_it.forward ()) {
287
+ row_it.set_to_list (block_it.data ()->get_rows ());
288
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
289
+ row = row_it.data ();
290
+ blob_count = row->blob_list ()->length ();
291
+ row_err = (inT32) ceil (row->line_error ());
292
+ if (row_err <= 0)
293
+ row_err = 1;
294
+ if (textord_biased_skewcalc) {
295
+ blob_count /= row_err;
296
+ for (blob_count /= row_err; blob_count > 0; blob_count--) {
297
+ gradients[row_index] = row->line_m ();
298
+ errors[row_index] = row->line_error ();
299
+ row_index++;
300
+ }
301
+ }
302
+ else if (blob_count >= textord_min_blobs_in_row) {
303
+ //get gradient
304
+ gradients[row_index] = row->line_m ();
305
+ errors[row_index] = row->line_error ();
306
+ row_index++;
307
+ }
308
+ }
309
+ }
310
+ if (row_index == 0) {
311
+ //desperate
312
+ for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
313
+ block_it.forward ()) {
314
+ row_it.set_to_list (block_it.data ()->get_rows ());
315
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
316
+ row_it.forward ()) {
317
+ row = row_it.data ();
318
+ gradients[row_index] = row->line_m ();
319
+ errors[row_index] = row->line_error ();
320
+ row_index++;
321
+ }
322
+ }
323
+ }
324
+ row_count = row_index;
325
+ row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
326
+ gradients, row_count);
327
+ page_m = gradients[row_index];
328
+ row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
329
+ errors, row_count);
330
+ page_err = errors[row_index];
331
+ free_mem(gradients);
332
+ free_mem(errors);
333
+ }
334
+
335
+ const double kNoiseSize = 0.5; // Fraction of xheight.
336
+ const int kMinSize = 8; // Min pixels to be xheight.
337
+
338
+ // Return true if the dot looks like it is part of the i.
339
+ // Doesn't work for any other diacritical.
340
+ static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
341
+ const TBOX& ibox = i->bounding_box();
342
+ const TBOX& dotbox = dot->bounding_box();
343
+
344
+ // Must overlap horizontally by enough and be high enough.
345
+ int overlap = MIN(dotbox.right(), ibox.right()) -
346
+ MAX(dotbox.left(), ibox.left());
347
+ if (ibox.height() <= 2 * dotbox.height() ||
348
+ (overlap * 2 < ibox.width() && overlap < dotbox.width()))
349
+ return false;
350
+
351
+ // If the i is tall and thin then it is good.
352
+ if (ibox.height() > ibox.width() * 2)
353
+ return true; // The i or ! must be tall and thin.
354
+
355
+ // It might still be tall and thin, but it might be joined to something.
356
+ // So search the outline for a piece of large height close to the edges
357
+ // of the dot.
358
+ const double kHeightFraction = 0.6;
359
+ double target_height = MIN(dotbox.bottom(), ibox.top());
360
+ target_height -= row->line_m()*dotbox.left() + row->line_c();
361
+ target_height *= kHeightFraction;
362
+ int left_min = dotbox.left() - dotbox.width();
363
+ int middle = (dotbox.left() + dotbox.right())/2;
364
+ int right_max = dotbox.right() + dotbox.width();
365
+ int left_miny = 0;
366
+ int left_maxy = 0;
367
+ int right_miny = 0;
368
+ int right_maxy = 0;
369
+ bool found_left = false;
370
+ bool found_right = false;
371
+ bool in_left = false;
372
+ bool in_right = false;
373
+ C_BLOB* blob = i->cblob();
374
+ C_OUTLINE_IT o_it = blob->out_list();
375
+ for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
376
+ C_OUTLINE* outline = o_it.data();
377
+ int length = outline->pathlength();
378
+ ICOORD pos = outline->start_pos();
379
+ for (int step = 0; step < length; pos += outline->step(step++)) {
380
+ int x = pos.x();
381
+ int y = pos.y();
382
+ if (x >= left_min && x < middle && !found_left) {
383
+ // We are in the left part so find min and max y.
384
+ if (in_left) {
385
+ if (y > left_maxy) left_maxy = y;
386
+ if (y < left_miny) left_miny = y;
387
+ } else {
388
+ left_maxy = left_miny = y;
389
+ in_left = true;
390
+ }
391
+ } else if (in_left) {
392
+ // We just left the left so look for size.
393
+ if (left_maxy - left_miny > target_height) {
394
+ if (found_right)
395
+ return true;
396
+ found_left = true;
397
+ }
398
+ in_left = false;
399
+ }
400
+ if (x <= right_max && x > middle && !found_right) {
401
+ // We are in the right part so find min and max y.
402
+ if (in_right) {
403
+ if (y > right_maxy) right_maxy = y;
404
+ if (y < right_miny) right_miny = y;
405
+ } else {
406
+ right_maxy = right_miny = y;
407
+ in_right = true;
408
+ }
409
+ } else if (in_right) {
410
+ // We just left the right so look for size.
411
+ if (right_maxy - right_miny > target_height) {
412
+ if (found_left)
413
+ return true;
414
+ found_right = true;
415
+ }
416
+ in_right = false;
417
+ }
418
+ }
419
+ }
420
+ return false;
421
+ }
422
+
423
+ static void vigorous_noise_removal(TO_BLOCK* block) {
424
+ TO_ROW_IT row_it = block->get_rows ();
425
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
426
+ TO_ROW* row = row_it.data();
427
+ BLOBNBOX_IT b_it = row->blob_list();
428
+ // Estimate the xheight on the row.
429
+ int max_height = 0;
430
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
431
+ BLOBNBOX* blob = b_it.data();
432
+ if (blob->bounding_box().height() > max_height)
433
+ max_height = blob->bounding_box().height();
434
+ }
435
+ STATS hstats(0, max_height + 1);
436
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
437
+ BLOBNBOX* blob = b_it.data();
438
+ int height = blob->bounding_box().height();
439
+ if (height >= kMinSize)
440
+ hstats.add(blob->bounding_box().height(), 1);
441
+ }
442
+ float xheight = hstats.median();
443
+ // Delete small objects.
444
+ BLOBNBOX* prev = NULL;
445
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
446
+ BLOBNBOX* blob = b_it.data();
447
+ const TBOX& box = blob->bounding_box();
448
+ if (box.height() < kNoiseSize * xheight) {
449
+ // Small so delete unless it looks like an i dot.
450
+ if (prev != NULL) {
451
+ if (dot_of_i(blob, prev, row))
452
+ continue; // Looks OK.
453
+ }
454
+ if (!b_it.at_last()) {
455
+ BLOBNBOX* next = b_it.data_relative(1);
456
+ if (dot_of_i(blob, next, row))
457
+ continue; // Looks OK.
458
+ }
459
+ // It might be noise so get rid of it.
460
+ if (blob->blob() != NULL)
461
+ delete blob->blob();
462
+ if (blob->cblob() != NULL)
463
+ delete blob->cblob();
464
+ delete b_it.extract();
465
+ } else {
466
+ prev = blob;
467
+ }
468
+ }
469
+ }
470
+ }
471
+
472
+ /**********************************************************************
473
+ * cleanup_rows
474
+ *
475
+ * Remove overlapping rows and fit all the blobs to what's left.
476
+ **********************************************************************/
477
+
478
+ void cleanup_rows( //find lines
479
+ ICOORD page_tr, //top right
480
+ TO_BLOCK *block, //block to do
481
+ float gradient, //gradient to fit
482
+ FCOORD rotation, //for drawing
483
+ inT32 block_edge, //edge of block
484
+ BOOL8 testing_on //correct orientation
485
+ ) {
486
+ //iterators
487
+ BLOBNBOX_IT blob_it = &block->blobs;
488
+ TO_ROW_IT row_it = block->get_rows ();
489
+
490
+ #ifndef GRAPHICS_DISABLED
491
+ if (textord_show_parallel_rows && testing_on) {
492
+ if (to_win == NULL)
493
+ create_to_win(page_tr);
494
+ }
495
+ #endif
496
+ //get row coords
497
+ fit_parallel_rows(block,
498
+ gradient,
499
+ rotation,
500
+ block_edge,
501
+ textord_show_parallel_rows &&testing_on);
502
+ delete_non_dropout_rows(block,
503
+ gradient,
504
+ rotation,
505
+ block_edge,
506
+ textord_show_parallel_rows &&testing_on);
507
+ expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
508
+ blob_it.set_to_list (&block->blobs);
509
+ row_it.set_to_list (block->get_rows ());
510
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
511
+ blob_it.add_list_after (row_it.data ()->blob_list ());
512
+ //give blobs back
513
+ assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
514
+ //now new rows must be genuine
515
+ blob_it.set_to_list (&block->blobs);
516
+ blob_it.add_list_after (&block->large_blobs);
517
+ assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
518
+ //safe to use big ones now
519
+ blob_it.set_to_list (&block->blobs);
520
+ //throw all blobs in
521
+ blob_it.add_list_after (&block->noise_blobs);
522
+ blob_it.add_list_after (&block->small_blobs);
523
+ assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
524
+ //no rows for noise
525
+ row_it.set_to_list (block->get_rows ());
526
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
527
+ row_it.data ()->blob_list ()->sort (blob_x_order);
528
+ fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
529
+ if (textord_heavy_nr) {
530
+ vigorous_noise_removal(block);
531
+ }
532
+ separate_underlines(block, gradient, rotation, testing_on);
533
+ pre_associate_blobs(page_tr, block, rotation, testing_on);
534
+
535
+ #ifndef GRAPHICS_DISABLED
536
+ if (textord_show_final_rows && testing_on) {
537
+ if (to_win == NULL)
538
+ create_to_win(page_tr);
539
+ }
540
+ #endif
541
+
542
+ fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
543
+ // textord_show_final_rows && testing_on);
544
+ make_spline_rows(block,
545
+ gradient,
546
+ rotation,
547
+ block_edge,
548
+ textord_show_final_rows &&testing_on);
549
+ if (!textord_old_xheight || !textord_old_baselines)
550
+ compute_block_xheight(block, gradient);
551
+ if (textord_restore_underlines)
552
+ //fix underlines
553
+ restore_underlined_blobs(block);
554
+ #ifndef GRAPHICS_DISABLED
555
+ if (textord_show_final_rows && testing_on) {
556
+ plot_blob_list (to_win, &block->blobs,
557
+ ScrollView::MAGENTA, ScrollView::WHITE);
558
+ //show discarded blobs
559
+ plot_blob_list (to_win, &block->underlines,
560
+ ScrollView::YELLOW, ScrollView::CORAL);
561
+ }
562
+ if (textord_show_final_rows && testing_on && block->blobs.length () > 0)
563
+ tprintf ("%d blobs discarded as noise\n", block->blobs.length ());
564
+ if (textord_show_final_rows && testing_on) {
565
+ draw_meanlines(block, gradient, block_edge, ScrollView::WHITE, rotation);
566
+ }
567
+ #endif
568
+ }
569
+
570
+
571
+ /**********************************************************************
572
+ * delete_non_dropout_rows
573
+ *
574
+ * Delete the rows rejected by find_best_dropout_row and give the blobs of every row back to the block.
575
+ **********************************************************************/
576
+
577
+ void delete_non_dropout_rows( //find lines
578
+ TO_BLOCK *block, //block to do
579
+ float gradient, //global skew
580
+ FCOORD rotation, //deskew vector
581
+ inT32 block_edge, //left edge
582
+ BOOL8 testing_on //correct orientation
583
+ ) {
584
+ TBOX block_box; //deskewed block
585
+ inT32 *deltas; //change in occupation
586
+ inT32 *occupation; //of pixel coords
587
+ inT32 max_y; //in block
588
+ inT32 min_y;
589
+ inT32 line_index; //of scan line
590
+ inT32 line_count; //no of scan lines
591
+ inT32 distance; //to drop-out
592
+ inT32 xleft; //of block
593
+ inT32 ybottom; //of block
594
+ TO_ROW *row; //current row
595
+ TO_ROW_IT row_it = block->get_rows ();
596
+ BLOBNBOX_IT blob_it = &block->blobs;
597
+
598
+ if (row_it.length () == 0)
599
+ return; //empty block
600
+ block_box = deskew_block_coords (block, gradient);
601
+ xleft = block->block->bounding_box ().left ();
602
+ ybottom = block->block->bounding_box ().bottom ();
603
+ min_y = block_box.bottom () - 1;
604
+ max_y = block_box.top () + 1;
605
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
606
+ line_index = (inT32) floor (row_it.data ()->intercept ());
607
+ if (line_index <= min_y)
608
+ min_y = line_index - 1;
609
+ if (line_index >= max_y)
610
+ max_y = line_index + 1;
611
+ }
612
+ line_count = max_y - min_y + 1;
613
+ if (line_count <= 0)
614
+ return; //empty block
615
+ deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
616
+ occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
617
+ if (deltas == NULL || occupation == NULL)
618
+ MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
619
+
620
+ compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
621
+ compute_occupation_threshold ((inT32)
622
+ ceil (block->line_spacing *
623
+ (textord_merge_desc +
624
+ textord_merge_asc)),
625
+ (inT32) ceil (block->line_spacing *
626
+ (textord_merge_x +
627
+ textord_merge_asc)),
628
+ max_y - min_y + 1, occupation, deltas);
629
+ #ifndef GRAPHICS_DISABLED
630
+ if (testing_on) {
631
+ draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
632
+ }
633
+ #endif
634
+ compute_dropout_distances(occupation, deltas, line_count);
635
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
636
+ row = row_it.data ();
637
+ line_index = (inT32) floor (row->intercept ());
638
+ distance = deltas[line_index - min_y];
639
+ if (find_best_dropout_row (row, distance, block->line_spacing / 2,
640
+ line_index, &row_it, testing_on)) {
641
+ #ifndef GRAPHICS_DISABLED
642
+ if (testing_on)
643
+ plot_parallel_row(row, gradient, block_edge,
644
+ ScrollView::WHITE, rotation);
645
+ #endif
646
+ blob_it.add_list_after (row_it.data ()->blob_list ());
647
+ delete row_it.extract (); //too far away
648
+ }
649
+ }
650
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
651
+ blob_it.add_list_after (row_it.data ()->blob_list ());
652
+ }
653
+
654
+ free_mem(deltas);
655
+ free_mem(occupation);
656
+ }
657
+
658
+
659
+ /**********************************************************************
660
+ * find_best_dropout_row
661
+ *
662
+ * Delete this row if it has a neighbour with better dropout characteristics.
663
+ * TRUE is returned if the row should be deleted.
664
+ **********************************************************************/
665
+
666
+ BOOL8 find_best_dropout_row( //find neighbours
667
+ TO_ROW *row, //row to test
668
+ inT32 distance, //dropout dist
669
+ float dist_limit, //threshold distance
670
+ inT32 line_index, //index of row
671
+ TO_ROW_IT *row_it, //current position
672
+ BOOL8 testing_on //correct orientation
673
+ ) {
674
+ inT32 next_index; //of neigbouring row
675
+ inT32 row_offset; //from current row
676
+ inT32 abs_dist; //absolute distance
677
+ inT8 row_inc; //increment to row_index
678
+ TO_ROW *next_row; //nextious row
679
+
680
+ if (testing_on)
681
+ tprintf ("Row at %g(%g), dropout dist=%d,",
682
+ row->intercept (), row->parallel_c (), distance);
683
+ if (distance < 0) {
684
+ row_inc = 1;
685
+ abs_dist = -distance;
686
+ }
687
+ else {
688
+ row_inc = -1;
689
+ abs_dist = distance;
690
+ }
691
+ if (abs_dist > dist_limit) {
692
+ if (testing_on) {
693
+ tprintf (" too far - deleting\n");
694
+ }
695
+ return TRUE;
696
+ }
697
+ if ((distance < 0 && !row_it->at_last ())
698
+ || (distance >= 0 && !row_it->at_first ())) {
699
+ row_offset = row_inc;
700
+ do {
701
+ next_row = row_it->data_relative (row_offset);
702
+ next_index = (inT32) floor (next_row->intercept ());
703
+ if ((distance < 0
704
+ && next_index < line_index
705
+ && next_index > line_index + distance + distance)
706
+ || (distance >= 0
707
+ && next_index > line_index
708
+ && next_index < line_index + distance + distance)) {
709
+ if (testing_on) {
710
+ tprintf (" nearer neighbour (%d) at %g\n",
711
+ line_index + distance - next_index,
712
+ next_row->intercept ());
713
+ }
714
+ return TRUE; //other is nearer
715
+ }
716
+ else if (next_index == line_index
717
+ || next_index == line_index + distance + distance) {
718
+ if (row->believability () <= next_row->believability ()) {
719
+ if (testing_on) {
720
+ tprintf (" equal but more believable at %g (%g/%g)\n",
721
+ next_row->intercept (),
722
+ row->believability (),
723
+ next_row->believability ());
724
+ }
725
+ return TRUE; //other is more believable
726
+ }
727
+ }
728
+ row_offset += row_inc;
729
+ }
730
+ while ((next_index == line_index
731
+ || next_index == line_index + distance + distance)
732
+ && row_offset < row_it->length ());
733
+ if (testing_on)
734
+ tprintf (" keeping\n");
735
+ }
736
+ return FALSE;
737
+ }
738
+
739
+
740
+ /**********************************************************************
741
+ * deskew_block_coords
742
+ *
743
+ * Compute the bounding box of all the blobs in the block
744
+ * if they were deskewed without actually doing it.
745
+ **********************************************************************/
746
+
747
+ TBOX deskew_block_coords( //block box
748
+ TO_BLOCK *block, //block to do
749
+ float gradient //global skew
750
+ ) {
751
+ TBOX result; //block bounds
752
+ TBOX blob_box; //of block
753
+ FCOORD rotation; //deskew vector
754
+ float length; //of gradient vector
755
+ TO_ROW_IT row_it = block->get_rows ();
756
+ TO_ROW *row; //current row
757
+ BLOBNBOX *blob; //current blob
758
+ BLOBNBOX_IT blob_it; //iterator
759
+
760
+ length = sqrt (gradient * gradient + 1);
761
+ rotation = FCOORD (1 / length, -gradient / length);
762
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
763
+ row = row_it.data ();
764
+ blob_it.set_to_list (row->blob_list ());
765
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
766
+ blob_it.forward ()) {
767
+ blob = blob_it.data ();
768
+ blob_box = blob->bounding_box ();
769
+ blob_box.rotate (rotation);//de-skew it
770
+ result += blob_box;
771
+ }
772
+ }
773
+ return result;
774
+ }
775
+
776
+
777
+ /**********************************************************************
778
+ * compute_line_occupation
779
+ *
780
+ * Compute the pixel projection back on the y axis given the global
781
+ * skew. Also compute the 1st derivative.
782
+ **********************************************************************/
783
+
784
+ void compute_line_occupation( //project blobs
785
+ TO_BLOCK *block, //block to do
786
+ float gradient, //global skew
787
+ inT32 min_y, //min coord in block
788
+ inT32 max_y, //in block
789
+ inT32 *occupation, //output projection
790
+ inT32 *deltas //derivative
791
+ ) {
792
+ inT32 line_count; //maxy-miny+1
793
+ inT32 line_index; //of scan line
794
+ int index; //array index for daft compilers
795
+ float top, bottom; //coords of blob
796
+ inT32 width; //of blob
797
+ TO_ROW *row; //current row
798
+ TO_ROW_IT row_it = block->get_rows ();
799
+ BLOBNBOX *blob; //current blob
800
+ BLOBNBOX_IT blob_it; //iterator
801
+ float length; //of skew vector
802
+ TBOX blob_box; //bounding box
803
+ FCOORD rotation; //inverse of skew
804
+
805
+ line_count = max_y - min_y + 1;
806
+ length = sqrt (gradient * gradient + 1);
807
+ rotation = FCOORD (1 / length, -gradient / length);
808
+ for (line_index = 0; line_index < line_count; line_index++)
809
+ deltas[line_index] = 0;
810
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
811
+ row = row_it.data ();
812
+ blob_it.set_to_list (row->blob_list ());
813
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
814
+ blob_it.forward ()) {
815
+ blob = blob_it.data ();
816
+ blob_box = blob->bounding_box ();
817
+ blob_box.rotate (rotation);//de-skew it
818
+ top = blob_box.top ();
819
+ bottom = blob_box.bottom ();
820
+ width =
821
+ (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
822
+ if ((inT32) floor (bottom) < min_y
823
+ || (inT32) floor (bottom) - min_y >= line_count)
824
+ fprintf (stderr,
825
+ "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
826
+ INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
827
+ //count transitions
828
+ index = (inT32) floor (bottom) - min_y;
829
+ deltas[index] += width;
830
+ if ((inT32) floor (top) < min_y
831
+ || (inT32) floor (top) - min_y >= line_count)
832
+ fprintf (stderr,
833
+ "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
834
+ INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
835
+ index = (inT32) floor (top) - min_y;
836
+ deltas[index] -= width;
837
+ }
838
+ }
839
+ occupation[0] = deltas[0];
840
+ for (line_index = 1; line_index < line_count; line_index++)
841
+ occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
842
+ }
843
+
844
+
845
+ /**********************************************************************
846
+ * compute_occupation_threshold
847
+ *
848
+ * Compute thresholds for textline or not for the occupation array.
849
+ **********************************************************************/
850
+
851
+ void compute_occupation_threshold( //project blobs
852
+ inT32 low_window, //below result point
853
+ inT32 high_window, //above result point
854
+ inT32 line_count, //array sizes
855
+ inT32 *occupation, //input projection
856
+ inT32 *thresholds //output thresholds
857
+ ) {
858
+ inT32 line_index; //of thresholds line
859
+ inT32 low_index; //in occupation
860
+ inT32 high_index; //in occupation
861
+ inT32 sum; //current average
862
+ inT32 divisor; //to get thresholds
863
+ inT32 min_index; //of min occ
864
+ inT32 min_occ; //min in locality
865
+ inT32 test_index; //for finding min
866
+
867
+ divisor =
868
+ (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
869
+ if (low_window + high_window < line_count) {
870
+ for (sum = 0, high_index = 0; high_index < low_window; high_index++)
871
+ sum += occupation[high_index];
872
+ for (low_index = 0; low_index < high_window; low_index++, high_index++)
873
+ sum += occupation[high_index];
874
+ min_occ = occupation[0];
875
+ min_index = 0;
876
+ for (test_index = 1; test_index < high_index; test_index++) {
877
+ if (occupation[test_index] <= min_occ) {
878
+ min_occ = occupation[test_index];
879
+ min_index = test_index; //find min in region
880
+ }
881
+ }
882
+ for (line_index = 0; line_index < low_window; line_index++)
883
+ thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
884
+ //same out to end
885
+ for (low_index = 0; high_index < line_count; low_index++, high_index++) {
886
+ sum -= occupation[low_index];
887
+ sum += occupation[high_index];
888
+ if (occupation[high_index] <= min_occ) {
889
+ //find min in region
890
+ min_occ = occupation[high_index];
891
+ min_index = high_index;
892
+ }
893
+ //lost min from region
894
+ if (min_index <= low_index) {
895
+ min_occ = occupation[low_index + 1];
896
+ min_index = low_index + 1;
897
+ for (test_index = low_index + 2; test_index <= high_index;
898
+ test_index++) {
899
+ if (occupation[test_index] <= min_occ) {
900
+ min_occ = occupation[test_index];
901
+ //find min in region
902
+ min_index = test_index;
903
+ }
904
+ }
905
+ }
906
+ thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
907
+ }
908
+ }
909
+ else {
910
+ min_occ = occupation[0];
911
+ min_index = 0;
912
+ for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
913
+ if (occupation[low_index] < min_occ) {
914
+ min_occ = occupation[low_index];
915
+ min_index = low_index;
916
+ }
917
+ sum += occupation[low_index];
918
+ }
919
+ line_index = 0;
920
+ }
921
+ for (; line_index < line_count; line_index++)
922
+ thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
923
+ //same out to end
924
+ }
925
+
926
+
927
/**********************************************************************
 * compute_dropout_distances
 *
 * Compute the distance from each coordinate to the nearest dropout.
 *
 * On entry thresholds[] holds the occupation threshold of each line;
 * on exit each entry has been overwritten with a signed distance.
 * A dropout is taken to be a line whose occupation is at or above its
 * threshold while the line before it was below its own threshold.
 * Walking up the array, each line first receives a count that goes down
 * by one per line (starting below -line_count, and restarting from 0 at
 * each dropout line). When a dropout is found, the lines before it are
 * back-filled with 1, 2, 3... for as long as that is smaller in
 * magnitude than the count already stored there.
 *
 * Both arrays must hold line_count entries. Nothing is read or written
 * when line_count is not positive.
 **********************************************************************/

void compute_dropout_distances(                    //project blobs
                               inT32 *occupation,  //input projection
                               inT32 *thresholds,  //output thresholds
                               inT32 line_count    //array sizes
                              ) {
  inT32 line_index;              //of thresholds line
  inT32 distance;                //from prev dropout
  inT32 next_dist;               //to next dropout
  inT32 back_index;              //for back filling
  inT32 prev_threshold;          //before overwrite

  if (line_count <= 0)
    return;                      //the loop below always touches entry 0

  distance = -line_count;        //no dropout seen yet: very far
  line_index = 0;
  do {
    do {
      distance--;
      //keep the threshold: the loop test needs it after the overwrite
      prev_threshold = thresholds[line_index];
      //distance from prev
      thresholds[line_index] = distance;
      line_index++;
    }
    while (line_index < line_count
      && (occupation[line_index] < thresholds[line_index]
      || occupation[line_index - 1] >= prev_threshold));
    if (line_index < line_count) {
      //found a dropout: back fill lines that are nearer to it
      back_index = line_index - 1;
      next_dist = 1;
      while (next_dist < -distance && back_index >= 0) {
        thresholds[back_index] = next_dist;
        back_index--;
        next_dist++;
        distance++;
      }
      distance = 1;              //decremented to 0 for the dropout line
    }
  }
  while (line_index < line_count);
}
971
+
972
+
973
+ /**********************************************************************
974
+ * expand_rows
975
+ *
976
+ * Expand each row to the least of its allowed size and touching its
977
+ * neighbours. If the expansion would entirely swallow a neighbouring row
978
+ * then do so.
979
+ **********************************************************************/
980
+
981
+ void expand_rows( //find lines
982
+ ICOORD page_tr, //top right
983
+ TO_BLOCK *block, //block to do
984
+ float gradient, //gradient to fit
985
+ FCOORD rotation, //for drawing
986
+ inT32 block_edge, //edge of block
987
+ BOOL8 testing_on //correct orientation
988
+ ) {
989
+ BOOL8 swallowed_row; //eaten a neighbour
990
+ float y_max, y_min; //new row limits
991
+ float y_bottom, y_top; //allowed limits
992
+ TO_ROW *test_row; //next row
993
+ TO_ROW *row; //current row
994
+ //iterators
995
+ BLOBNBOX_IT blob_it = &block->blobs;
996
+ TO_ROW_IT row_it = block->get_rows ();
997
+
998
+ #ifndef GRAPHICS_DISABLED
999
+ if (textord_show_expanded_rows && testing_on) {
1000
+ if (to_win == NULL)
1001
+ create_to_win(page_tr);
1002
+ }
1003
+ #endif
1004
+
1005
+ adjust_row_limits(block); //shift min,max.
1006
+ if (textord_new_initial_xheight) {
1007
+ if (block->get_rows ()->length () == 0)
1008
+ return;
1009
+ compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1010
+ }
1011
+ assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
1012
+ //get real membership
1013
+ if (block->get_rows ()->length () == 0)
1014
+ return;
1015
+ fit_parallel_rows(block,
1016
+ gradient,
1017
+ rotation,
1018
+ block_edge,
1019
+ textord_show_expanded_rows &&testing_on);
1020
+ if (!textord_new_initial_xheight)
1021
+ compute_row_stats(block, textord_show_expanded_rows &&testing_on);
1022
+ row_it.move_to_last ();
1023
+ do {
1024
+ row = row_it.data ();
1025
+ y_max = row->max_y (); //get current limits
1026
+ y_min = row->min_y ();
1027
+ y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
1028
+ textord_merge_desc;
1029
+ y_top = row->intercept () + block->line_size * textord_expansion_factor *
1030
+ (textord_merge_x + textord_merge_asc);
1031
+ if (y_min > y_bottom) { //expansion allowed
1032
+ if (textord_show_expanded_rows && testing_on)
1033
+ tprintf("Expanding bottom of row at %f from %f to %f\n",
1034
+ row->intercept(), y_min, y_bottom);
1035
+ //expandable
1036
+ swallowed_row = TRUE;
1037
+ while (swallowed_row && !row_it.at_last ()) {
1038
+ swallowed_row = FALSE;
1039
+ //get next one
1040
+ test_row = row_it.data_relative (1);
1041
+ //overlaps space
1042
+ if (test_row->max_y () > y_bottom) {
1043
+ if (test_row->min_y () > y_bottom) {
1044
+ if (textord_show_expanded_rows && testing_on)
1045
+ tprintf("Eating row below at %f\n", test_row->intercept());
1046
+ row_it.forward ();
1047
+ #ifndef GRAPHICS_DISABLED
1048
+ if (textord_show_expanded_rows && testing_on)
1049
+ plot_parallel_row(test_row,
1050
+ gradient,
1051
+ block_edge,
1052
+ ScrollView::WHITE,
1053
+ rotation);
1054
+ #endif
1055
+ blob_it.set_to_list (row->blob_list ());
1056
+ blob_it.add_list_after (test_row->blob_list ());
1057
+ //swallow complete row
1058
+ delete row_it.extract ();
1059
+ row_it.backward ();
1060
+ swallowed_row = TRUE;
1061
+ }
1062
+ else if (test_row->max_y () < y_min) {
1063
+ //shorter limit
1064
+ y_bottom = test_row->max_y ();
1065
+ if (textord_show_expanded_rows && testing_on)
1066
+ tprintf("Truncating limit to %f due to touching row at %f\n",
1067
+ y_bottom, test_row->intercept());
1068
+ }
1069
+ else {
1070
+ y_bottom = y_min; //can't expand it
1071
+ if (textord_show_expanded_rows && testing_on)
1072
+ tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1073
+ y_bottom, test_row->intercept());
1074
+ }
1075
+ }
1076
+ }
1077
+ y_min = y_bottom; //expand it
1078
+ }
1079
+ if (y_max < y_top) { //expansion allowed
1080
+ if (textord_show_expanded_rows && testing_on)
1081
+ tprintf("Expanding top of row at %f from %f to %f\n",
1082
+ row->intercept(), y_max, y_top);
1083
+ swallowed_row = TRUE;
1084
+ while (swallowed_row && !row_it.at_first ()) {
1085
+ swallowed_row = FALSE;
1086
+ //get one above
1087
+ test_row = row_it.data_relative (-1);
1088
+ if (test_row->min_y () < y_top) {
1089
+ if (test_row->max_y () < y_top) {
1090
+ if (textord_show_expanded_rows && testing_on)
1091
+ tprintf("Eating row above at %f\n", test_row->intercept());
1092
+ row_it.backward ();
1093
+ blob_it.set_to_list (row->blob_list ());
1094
+ #ifndef GRAPHICS_DISABLED
1095
+ if (textord_show_expanded_rows && testing_on)
1096
+ plot_parallel_row(test_row,
1097
+ gradient,
1098
+ block_edge,
1099
+ ScrollView::WHITE,
1100
+ rotation);
1101
+ #endif
1102
+ blob_it.add_list_after (test_row->blob_list ());
1103
+ //swallow complete row
1104
+ delete row_it.extract ();
1105
+ row_it.forward ();
1106
+ swallowed_row = TRUE;
1107
+ }
1108
+ else if (test_row->min_y () < y_max) {
1109
+ //shorter limit
1110
+ y_top = test_row->min_y ();
1111
+ if (textord_show_expanded_rows && testing_on)
1112
+ tprintf("Truncating limit to %f due to touching row at %f\n",
1113
+ y_top, test_row->intercept());
1114
+ }
1115
+ else {
1116
+ y_top = y_max; //can't expand it
1117
+ if (textord_show_expanded_rows && testing_on)
1118
+ tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1119
+ y_top, test_row->intercept());
1120
+ }
1121
+ }
1122
+ }
1123
+ y_max = y_top;
1124
+ }
1125
+ //new limits
1126
+ row->set_limits (y_min, y_max);
1127
+ row_it.backward ();
1128
+ }
1129
+ while (!row_it.at_last ());
1130
+ }
1131
+
1132
+
1133
+ /**********************************************************************
1134
+ * adjust_row_limits
1135
+ *
1136
+ * Change the limits of rows to suit the default fractions.
1137
+ **********************************************************************/
1138
+
1139
+ void adjust_row_limits( //tidy limits
1140
+ TO_BLOCK *block //block to do
1141
+ ) {
1142
+ TO_ROW *row; //current row
1143
+ float size; //size of row
1144
+ float ymax; //top of row
1145
+ float ymin; //bottom of row
1146
+ TO_ROW_IT row_it = block->get_rows ();
1147
+
1148
+ if (textord_show_expanded_rows)
1149
+ tprintf("Adjusting row limits for block(%d,%d)\n",
1150
+ block->block->bounding_box().left(),
1151
+ block->block->bounding_box().top());
1152
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1153
+ row = row_it.data ();
1154
+ size = row->max_y () - row->min_y ();
1155
+ if (textord_show_expanded_rows)
1156
+ tprintf("Row at %f has min %f, max %f, size %f\n",
1157
+ row->intercept(), row->min_y(), row->max_y(), size);
1158
+ size /= textord_merge_x + textord_merge_asc + textord_merge_desc;
1159
+ ymax = size * (textord_merge_x + textord_merge_asc);
1160
+ ymin = -size * textord_merge_desc;
1161
+ row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
1162
+ row->merged = FALSE;
1163
+ }
1164
+ }
1165
+
1166
+
1167
+ /**********************************************************************
1168
+ * compute_row_stats
1169
+ *
1170
+ * Compute the linespacing and offset.
1171
+ **********************************************************************/
1172
+
1173
+ void compute_row_stats( //find lines
1174
+ TO_BLOCK *block, //block to do
1175
+ BOOL8 testing_on //correct orientation
1176
+ ) {
1177
+ inT32 row_index; //of median
1178
+ TO_ROW *row; //current row
1179
+ TO_ROW *prev_row; //previous row
1180
+ float iqr; //inter quartile range
1181
+ TO_ROW_IT row_it = block->get_rows ();
1182
+ //number of rows
1183
+ inT16 rowcount = row_it.length ();
1184
+ TO_ROW **rows; //for choose nth
1185
+
1186
+ rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
1187
+ if (rows == NULL)
1188
+ MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
1189
+ rowcount = 0;
1190
+ prev_row = NULL;
1191
+ row_it.move_to_last (); //start at bottom
1192
+ do {
1193
+ row = row_it.data ();
1194
+ if (prev_row != NULL) {
1195
+ rows[rowcount++] = prev_row;
1196
+ prev_row->spacing = row->intercept () - prev_row->intercept ();
1197
+ if (testing_on)
1198
+ tprintf ("Row at %g yields spacing of %g\n",
1199
+ row->intercept (), prev_row->spacing);
1200
+ }
1201
+ prev_row = row;
1202
+ row_it.backward ();
1203
+ }
1204
+ while (!row_it.at_last ());
1205
+ block->key_row = prev_row;
1206
+ block->baseline_offset =
1207
+ fmod (prev_row->parallel_c (), block->line_spacing);
1208
+ if (testing_on)
1209
+ tprintf ("Blob based spacing=(%g,%g), offset=%g",
1210
+ block->line_size, block->line_spacing, block->baseline_offset);
1211
+ if (rowcount > 0) {
1212
+ row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
1213
+ sizeof (TO_ROW *), row_spacing_order);
1214
+ iqr = rows[row_index]->spacing;
1215
+ row_index = choose_nth_item (rowcount / 4, rows, rowcount,
1216
+ sizeof (TO_ROW *), row_spacing_order);
1217
+ iqr -= rows[row_index]->spacing;
1218
+ row_index = choose_nth_item (rowcount / 2, rows, rowcount,
1219
+ sizeof (TO_ROW *), row_spacing_order);
1220
+ block->key_row = rows[row_index];
1221
+ if (testing_on)
1222
+ tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
1223
+ if (rowcount > 2
1224
+ && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
1225
+ if (!textord_new_initial_xheight) {
1226
+ if (rows[row_index]->spacing < block->line_spacing
1227
+ && rows[row_index]->spacing > block->line_size)
1228
+ //within range
1229
+ block->line_size = rows[row_index]->spacing;
1230
+ //spacing=size
1231
+ else if (rows[row_index]->spacing > block->line_spacing)
1232
+ block->line_size = block->line_spacing;
1233
+ //too big so use max
1234
+ }
1235
+ else {
1236
+ if (rows[row_index]->spacing < block->line_spacing)
1237
+ block->line_size = rows[row_index]->spacing;
1238
+ else
1239
+ block->line_size = block->line_spacing;
1240
+ //too big so use max
1241
+ }
1242
+ if (block->line_size < textord_min_xheight)
1243
+ block->line_size = (float) textord_min_xheight;
1244
+ block->line_spacing = rows[row_index]->spacing;
1245
+ block->max_blob_size =
1246
+ block->line_spacing * textord_excess_blobsize;
1247
+ }
1248
+ block->baseline_offset = fmod (rows[row_index]->intercept (),
1249
+ block->line_spacing);
1250
+ }
1251
+ if (testing_on)
1252
+ tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
1253
+ block->line_size, block->line_spacing, block->baseline_offset);
1254
+ free_mem(rows);
1255
+ }
1256
+
1257
+
1258
+ /**********************************************************************
1259
+ * compute_block_xheight
1260
+ *
1261
+ * Compute the xheight of the individual rows, then correlate them
1262
+ * and interpret ascenderless lines, correcting xheights.
1263
+ **********************************************************************/
1264
+
1265
+ void compute_block_xheight( //find lines
1266
+ TO_BLOCK *block, //block to do
1267
+ float gradient //global skew
1268
+ ) {
1269
+ TO_ROW *row; //current row
1270
+ int xh_count, desc_count; //no of samples
1271
+ float block_median; //median blob size
1272
+ int asc_count, cap_count;
1273
+ inT32 min_size, max_size; //limits on xheight
1274
+ inT32 evidence; //no of samples on row
1275
+ float xh_sum, desc_sum; //for averages
1276
+ float asc_sum, cap_sum;
1277
+ TO_ROW_IT row_it = block->get_rows ();
1278
+ STATS row_heights; //block evidence
1279
+
1280
+ if (row_it.empty ())
1281
+ return; //no rows
1282
+ block_median = median_block_xheight (block, gradient);
1283
+ block_median *= 2;
1284
+ if (block_median < block->line_size)
1285
+ block_median = block->line_size;
1286
+ // tprintf("Block median=%g, linesize=%g\n",
1287
+ // block_median,block->line_size);
1288
+ max_size = (inT32) ceil (block_median);
1289
+ min_size = (inT32) floor (block_median * textord_minxh);
1290
+ row_heights.set_range (min_size, max_size + 1);
1291
+ xh_count = desc_count = asc_count = cap_count = 0;
1292
+ xh_sum = desc_sum = asc_sum = cap_sum = 0.0f;
1293
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1294
+ row = row_it.data ();
1295
+ evidence = compute_row_xheight (row, min_size, max_size, gradient);
1296
+ if (row->xheight > 0 && row->ascrise > 0) {
1297
+ row_heights.add ((inT32) row->xheight, evidence);
1298
+ xh_count += evidence;
1299
+ asc_sum += row->ascrise;
1300
+ asc_count++;
1301
+ }
1302
+ else if (row->xheight > 0) {
1303
+ cap_sum += row->xheight; //assume just caps
1304
+ cap_count++;
1305
+ }
1306
+ if (row->descdrop != 0) {
1307
+ desc_sum += row->descdrop;
1308
+ desc_count++;
1309
+ }
1310
+ }
1311
+ if (xh_count > 0) {
1312
+ //median
1313
+ xh_sum = row_heights.ile (0.5);
1314
+ asc_sum /= asc_count;
1315
+ }
1316
+ else if (cap_count > 0) {
1317
+ cap_sum /= cap_count; //must assume caps
1318
+ xh_sum =
1319
+ cap_sum * textord_merge_x / (textord_merge_x + textord_merge_asc);
1320
+ asc_sum =
1321
+ cap_sum * textord_merge_asc / (textord_merge_x + textord_merge_asc);
1322
+ }
1323
+ else {
1324
+ //default sizes
1325
+ xh_sum = block_median * textord_merge_x;
1326
+ asc_sum = block_median * textord_merge_asc;
1327
+ }
1328
+ if (desc_count > 0) {
1329
+ desc_sum /= desc_count;
1330
+ }
1331
+ else {
1332
+ desc_sum = xh_sum * textord_merge_desc / textord_merge_x;
1333
+ }
1334
+ // tprintf("Block average x height=%g, count=%d, asc=%g/%d, desc=%g/%d,cap=%g/%d\n",
1335
+ // xh_sum,xh_count,asc_sum,asc_count,desc_sum,desc_count,
1336
+ // cap_sum,cap_count);
1337
+ if (xh_sum < textord_min_xheight)
1338
+ xh_sum = (float) textord_min_xheight;
1339
+ block->xheight = xh_sum;
1340
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1341
+ correct_row_xheight (row_it.data (), xh_sum, asc_sum, desc_sum);
1342
+ }
1343
+ }
1344
+
1345
+
1346
+ /**********************************************************************
1347
+ * median_block_xheight
1348
+ *
1349
+ * Compute the linespacing and offset.
1350
+ **********************************************************************/
1351
+
1352
+ float median_block_xheight( //find lines
1353
+ TO_BLOCK *block, //block to do
1354
+ float gradient //global skew
1355
+ ) {
1356
+ TO_ROW *row; //current row
1357
+ float result; //output size
1358
+ float xcentre; //centre of blob
1359
+ TO_ROW_IT row_it = block->get_rows ();
1360
+ BLOBNBOX_IT blob_it;
1361
+ BLOBNBOX *blob; //current blob
1362
+ float *heights; //for choose nth
1363
+ inT32 blob_count; //blobs in block
1364
+ inT32 blob_index; //current blob
1365
+
1366
+ blob_count = 0;
1367
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
1368
+ blob_count += row_it.data ()->blob_list ()->length ();
1369
+ heights = (float *) alloc_mem (blob_count * sizeof (float));
1370
+ if (heights == NULL)
1371
+ MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
1372
+
1373
+ blob_index = 0;
1374
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1375
+ row = row_it.data ();
1376
+ blob_it.set_to_list (row->blob_list ());
1377
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1378
+ blob_it.forward ()) {
1379
+ blob = blob_it.data ();
1380
+ if (!blob->joined_to_prev ()) {
1381
+ xcentre =
1382
+ (blob->bounding_box ().left () +
1383
+ blob->bounding_box ().right ()) / 2.0f;
1384
+ heights[blob_index] =
1385
+ blob->bounding_box ().top () - gradient * xcentre -
1386
+ row->parallel_c ();
1387
+ if (heights[blob_index] > 0)
1388
+ blob_index++;
1389
+ }
1390
+ }
1391
+ }
1392
+ ASSERT_HOST (blob_index > 0); //dont expect 0
1393
+ blob_count = blob_index;
1394
+ blob_index = choose_nth_item (blob_count / 2, heights, blob_count);
1395
+ result = heights[blob_index];
1396
+ free_mem(heights);
1397
+ return result;
1398
+ }
1399
+
1400
+
1401
+ /**********************************************************************
1402
+ * compute_row_xheight
1403
+ *
1404
+ * Estimate the xheight of this row.
1405
+ * Compute the ascender rise and descender drop at the same time.
1406
+ **********************************************************************/
1407
+
1408
+ inT32 compute_row_xheight( //find lines
1409
+ TO_ROW *row, //row to do
1410
+ inT32 min_height, //min xheight
1411
+ inT32 max_height, //max xheight
1412
+ float gradient //global skew
1413
+ ) {
1414
+ BOOL8 in_best_pile; //control of mode size
1415
+ inT32 prev_size; //previous size
1416
+ float xcentre; //centre of blob
1417
+ float height; //height of blob
1418
+ BLOBNBOX_IT blob_it = row->blob_list ();
1419
+ BLOBNBOX *blob; //current blob
1420
+ inT32 blob_count; //blobs in block
1421
+ inT32 x; //xheight index
1422
+ inT32 asc; //ascender index
1423
+ inT32 blob_index; //current blob
1424
+ inT32 mode_count; //no of modes
1425
+ inT32 best_count; //count of best x so far
1426
+ float ratio; //size ratio
1427
+ inT32 modes[MAX_HEIGHT_MODES]; //biggest piles
1428
+ STATS heights (min_height, max_height + 1);
1429
+
1430
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1431
+ blob = blob_it.data ();
1432
+ if (!blob->joined_to_prev ()) {
1433
+ xcentre =
1434
+ (blob->bounding_box ().left () +
1435
+ blob->bounding_box ().right ()) / 2.0f;
1436
+ height = blob->bounding_box ().top ();
1437
+ if (textord_fix_xheight_bug)
1438
+ height -= row->baseline.y (xcentre);
1439
+ else
1440
+ height -= gradient * xcentre + row->parallel_c ();
1441
+ if (height >= min_height && height <= max_height
1442
+ && (!textord_xheight_tweak || height > textord_min_xheight))
1443
+ heights.add ((inT32) floor (height + 0.5), 1);
1444
+ }
1445
+ }
1446
+ blob_index = heights.mode (); //find mode
1447
+ //get count of mode
1448
+ blob_count = heights.pile_count (blob_index);
1449
+ if (textord_debug_xheights)
1450
+ tprintf ("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d,%d\n",
1451
+ min_height, max_height, blob_index, blob_count,
1452
+ heights.get_total (), row->blob_list ()->length ());
1453
+ row->ascrise = 0.0f;
1454
+ row->xheight = 0.0f;
1455
+ row->descdrop = 0.0f; //undefined;
1456
+ in_best_pile = FALSE;
1457
+ prev_size = -MAX_INT32;
1458
+ best_count = 0;
1459
+ if (blob_count > 0) {
1460
+ //get biggest ones
1461
+ mode_count = compute_height_modes (&heights, min_height, max_height, modes, MAX_HEIGHT_MODES);
1462
+ for (x = 0; x < mode_count - 1; x++) {
1463
+ if (modes[x] != prev_size + 1)
1464
+ in_best_pile = FALSE; //had empty height
1465
+ if (heights.pile_count (modes[x])
1466
+ >= blob_count * textord_xheight_mode_fraction
1467
+ && (in_best_pile || heights.pile_count (modes[x]) > best_count)) {
1468
+ for (asc = x + 1; asc < mode_count; asc++) {
1469
+ ratio = (float) modes[asc] / modes[x];
1470
+ if (textord_ascx_ratio_min < ratio
1471
+ && ratio < textord_ascx_ratio_max
1472
+ && heights.pile_count (modes[asc])
1473
+ >= blob_count * textord_ascheight_mode_fraction) {
1474
+ if (heights.pile_count (modes[x]) > best_count) {
1475
+ in_best_pile = TRUE;
1476
+ best_count = heights.pile_count (modes[x]);
1477
+ }
1478
+ // tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
1479
+ // modes[x],modes[asc]-modes[x],
1480
+ // heights.pile_count(modes[x]),
1481
+ // ratio);
1482
+ prev_size = modes[x];
1483
+ row->xheight = (float) modes[x];
1484
+ row->ascrise = (float) (modes[asc] - modes[x]);
1485
+ }
1486
+ }
1487
+ }
1488
+ }
1489
+ if (row->xheight == 0) {
1490
+ //single mode
1491
+ row->xheight = (float) blob_index;
1492
+ row->ascrise = 0.0f;
1493
+ if (textord_debug_xheights)
1494
+ tprintf ("Single mode xheight set to %g\n", row->xheight);
1495
+ }
1496
+ else if (textord_debug_xheights)
1497
+ tprintf ("Multi-mode xheight set to %g, asc=%g\n",
1498
+ row->xheight, row->ascrise);
1499
+ row->descdrop = (float) compute_row_descdrop (row, gradient);
1500
+ //find descenders
1501
+ }
1502
+ return best_count;
1503
+ }
1504
+
1505
+
1506
+ /**********************************************************************
1507
+ * compute_row_descdrop
1508
+ *
1509
+ * Estimate the descdrop of this row.
1510
+ **********************************************************************/
1511
+
1512
+ inT32 compute_row_descdrop( //find lines
1513
+ TO_ROW *row, //row to do
1514
+ float gradient //global skew
1515
+ ) {
1516
+ inT32 min_height = (inT32) floor (row->xheight * textord_descx_ratio_min);
1517
+ inT32 max_height = (inT32) floor (row->xheight * textord_descx_ratio_max);
1518
+ float xcentre; //centre of blob
1519
+ float height; //height of blob
1520
+ BLOBNBOX_IT blob_it = row->blob_list ();
1521
+ BLOBNBOX *blob; //current blob
1522
+ inT32 blob_count; //blobs in block
1523
+ inT32 blob_index; //current blob
1524
+ STATS heights (min_height, max_height + 1);
1525
+
1526
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1527
+ blob = blob_it.data ();
1528
+ if (!blob->joined_to_prev ()) {
1529
+ xcentre =
1530
+ (blob->bounding_box ().left () +
1531
+ blob->bounding_box ().right ()) / 2.0f;
1532
+ height =
1533
+ gradient * xcentre + row->parallel_c () -
1534
+ blob->bounding_box ().bottom ();
1535
+ if (height >= min_height && height <= max_height)
1536
+ heights.add ((inT32) floor (height + 0.5), 1);
1537
+ }
1538
+ }
1539
+ blob_index = heights.mode (); //find mode
1540
+ //get count of mode
1541
+ blob_count = heights.pile_count (blob_index);
1542
+ return blob_count > 0 ? -blob_index : 0;
1543
+ }
1544
+
1545
+
1546
+ /**********************************************************************
1547
+ * compute_height_modes
1548
+ *
1549
+ * Find the top maxmodes values in the input array and put their
1550
+ * indices in the output in the order in which they occurred.
1551
+ *
+ * Heights from min_height to max_height inclusive are visited in
+ * increasing order and only those with a non-zero pile count are
+ * considered. The values written to modes are the heights themselves
+ * (min_height + src_index), not offsets from min_height.
+ * The first maxmodes such heights fill the output. After that, a
+ * height whose pile count is at least that of the smallest retained
+ * entry replaces it: the entries after the smallest shuffle down one
+ * place and the new height goes on the end.
+ *
+ * Returns dest_count, the number of entries of modes that were filled.
+ * The rescan loop reuses dest_count as its index and leaves it equal
+ * to maxmodes, which is only reached once the output is already full.
+ **********************************************************************/
1552
+
1553
+ inT32 compute_height_modes( //find modes
1554
+ STATS *heights, //stats to search
1555
+ inT32 min_height, //bottom of range
1556
+ inT32 max_height, //top of range
1557
+ inT32 *modes, //output array
1558
+ inT32 maxmodes //size of modes
1559
+ ) {
1560
+ inT32 pile_count; //no in source pile
1561
+ inT32 src_count; //no of source entries
1562
+ inT32 src_index; //current entry
1563
+ inT32 least_count; //count of smallest retained
1564
+ inT32 least_index; //index of least
1565
+ inT32 dest_count; //index in modes
1566
+
1567
+ src_count = max_height + 1 - min_height;
1568
+ dest_count = 0;
1569
+ least_count = MAX_INT32;
1570
+ least_index = -1;
1571
+ for (src_index = 0; src_index < src_count; src_index++) {
1572
+ pile_count = heights->pile_count (min_height + src_index);
1573
+ if (pile_count > 0) {
1574
+ if (dest_count < maxmodes) { //output not yet full
1575
+ if (pile_count < least_count) {
1576
+ //find smallest in array
1577
+ least_count = pile_count;
1578
+ least_index = dest_count;
1579
+ }
1580
+ modes[dest_count++] = min_height + src_index;
1581
+ }
1582
+ else if (pile_count >= least_count) { //evict the smallest
1583
+ while (least_index < maxmodes - 1) {
1584
+ modes[least_index] = modes[least_index + 1];
1585
+ //shuffle up
1586
+ least_index++;
1587
+ }
1588
+ //new one on end
1589
+ modes[maxmodes - 1] = min_height + src_index;
1590
+ if (pile_count == least_count) {
1591
+ //new smallest
1592
+ least_index = maxmodes - 1;
1593
+ }
1594
+ else {
1595
+ least_count = heights->pile_count (modes[0]);
1596
+ least_index = 0;
1597
+ for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1598
+ pile_count = heights->pile_count (modes[dest_count]);
1599
+ if (pile_count < least_count) {
1600
+ //find smallest
1601
+ least_count = pile_count;
1602
+ least_index = dest_count;
1603
+ }
1604
+ }
1605
+ }
1606
+ }
1607
+ }
1608
+ }
1609
+ return dest_count;
1610
+ }
1611
+
1612
+
1613
+ /**********************************************************************
1614
+ * correct_row_xheight
1615
+ *
1616
+ * Adjust the xheight etc of this row if not within reasonable limits
1617
+ * of the average for the block.
1618
+ *
+ * xheight, ascrise and descdrop are the average values to fall back on.
+ * With textord_row_xheights set, a row whose ascrise is below
+ * xheight * (textord_ascx_ratio_min - 1) is compared with the average
+ * xheight and with xheight + ascrise (each within
+ * textord_xheight_error_margin) to set all_caps and to split the row
+ * height into xheight and ascrise.
+ * With textord_row_xheights clear, each of xheight, ascrise and
+ * descdrop is replaced by the average when it lies outside the error
+ * margin around that average, and all_caps is set from ascrise <= 0.
+ *
+ * NOTE(review): in the textord_row_xheights branch the statement
+ * "row->ascrise = ascrise;" follows an unbraced else, so it is not part
+ * of that else and runs on every path, replacing the ascrise values
+ * assigned in the branches above it. Confirm whether this is intended.
+ **********************************************************************/
1619
+
1620
+ void correct_row_xheight( //fix bad values
1621
+ TO_ROW *row, //row to fix
1622
+ float xheight, //average values
1623
+ float ascrise,
1624
+ float descdrop) {
1625
+ if (textord_row_xheights) {
1626
+ if (row->xheight <= 0)
1627
+ row->xheight = xheight; //no value: use average
1628
+ if (row->ascrise < row->xheight * (textord_ascx_ratio_min - 1)) {
1629
+ if (row->xheight >= xheight * (1 - textord_xheight_error_margin)
1630
+ && row->xheight <= xheight * (1 + textord_xheight_error_margin)) {
1631
+ row->all_caps = FALSE;
1632
+ row->ascrise = ascrise;
1633
+ }
1634
+ else if (row->xheight >=
1635
+ (xheight + ascrise) * (1 - textord_xheight_error_margin)
1636
+ && row->xheight <=
1637
+ (xheight + ascrise) * (1 + textord_xheight_error_margin)) {
1638
+ row->all_caps = TRUE;
1639
+ //it was caps
1640
+ row->ascrise = row->xheight - xheight;
1641
+ row->xheight = xheight;
1642
+ }
1643
+ else {
1644
+ row->all_caps = TRUE;
1645
+ row->ascrise = row->xheight * ascrise / (xheight + ascrise);
1646
+ row->xheight -= row->ascrise;
1647
+ }
1648
+ }
1649
+ else
1650
+ row->all_caps = FALSE;
1651
+ row->ascrise = ascrise; //NOTE(review): outside the else, always runs
1652
+ if (row->descdrop >= -row->xheight * (textord_ascx_ratio_min - 1))
1653
+ row->descdrop = descdrop;
1654
+ }
1655
+ else {
1656
+ if (row->xheight < xheight * (1 - textord_xheight_error_margin)
1657
+ || row->xheight > xheight * (1 + textord_xheight_error_margin))
1658
+ row->xheight = xheight; //set to average
1659
+ row->all_caps = row->ascrise <= 0;
1660
+ if (row->ascrise < ascrise * (1 - textord_xheight_error_margin)
1661
+ || row->ascrise > ascrise * (1 + textord_xheight_error_margin))
1662
+ row->ascrise = ascrise; //set to average
1663
+ if (row->descdrop < descdrop * (1 - textord_xheight_error_margin)
1664
+ || row->descdrop > descdrop * (1 + textord_xheight_error_margin))
1665
+ row->descdrop = descdrop; //set to average
1666
+ }
1667
+ }
1668
+
1669
+
1670
+ /**********************************************************************
1671
+ * separate_underlines
1672
+ *
1673
+ * Test wide objects for being potential underlines. If they are then
1674
+ * put them in a separate list in the block.
1675
+ *
+ * A blob is tested when its bounding box is wider than
+ * block->line_size * textord_underline_width. A copy of the blob is
+ * rotated by blob_rotation, which is built from the mirror of rotation
+ * (y negated) rotated by the unit vector derived from gradient, and the
+ * copy is passed to test_underline together with the row intercept.
+ * Blobs that pass are extracted from their row and added to
+ * block->underlines. The rotated copy is deleted after the test.
+ * When textord_cblob_blockocc is set and the blob has a cblob, the
+ * C_BLOB form is tested; otherwise a PBLOB form is tested.
+ * Debug text is printed when testing_on and textord_show_final_rows
+ * are both set.
+ **********************************************************************/
1676
+
1677
+ void separate_underlines( //find underlines
1678
+ TO_BLOCK *block, //block to do
1679
+ float gradient, //skew angle
1680
+ FCOORD rotation, //inverse landscape
1681
+ BOOL8 testing_on //correct orientation
1682
+ ) {
1683
+ BLOBNBOX *blob; //current blob
1684
+ PBLOB *poly_blob; //rotated blob
1685
+ C_BLOB *rotated_blob; //rotated blob
1686
+ TO_ROW *row; //current row
1687
+ float length; //of g_vec
1688
+ TBOX blob_box;
1689
+ FCOORD blob_rotation; //inverse of rotation
1690
+ FCOORD g_vec; //skew rotation
1691
+ BLOBNBOX_IT blob_it; //iterator
1692
+ //iterator
1693
+ BLOBNBOX_IT under_it = &block->underlines;
1694
+ TO_ROW_IT row_it = block->get_rows ();
1695
+
1696
+ //length of vector
1697
+ length = sqrt (1 + gradient * gradient);
1698
+ g_vec = FCOORD (1 / length, -gradient / length);
1699
+ blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1700
+ blob_rotation.rotate (g_vec); //undoing everything
1701
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1702
+ row = row_it.data ();
1703
+ //get blobs
1704
+ blob_it.set_to_list (row->blob_list ());
1705
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1706
+ blob_it.forward ()) {
1707
+ blob = blob_it.data ();
1708
+ blob_box = blob->bounding_box ();
1709
+ if (blob_box.width () > block->line_size * textord_underline_width) {
1710
+ if (textord_cblob_blockocc && blob->cblob () != NULL) {
1711
+ rotated_blob = crotate_cblob (blob->cblob (),
1712
+ blob_rotation);
1713
+ if (test_underline (testing_on && textord_show_final_rows,
1714
+ rotated_blob, (inT16) row->intercept (),
1715
+ (inT16) (block->line_size *
1716
+ (textord_merge_x +
1717
+ textord_merge_asc / 2.0f)))) {
1718
+ under_it.add_after_then_move (blob_it.extract ());
1719
+ if (testing_on && textord_show_final_rows) {
1720
+ tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
1721
+ rotated_blob->bounding_box ().left (),
1722
+ rotated_blob->bounding_box ().bottom (),
1723
+ rotated_blob->bounding_box ().right (),
1724
+ rotated_blob->bounding_box ().top ());
1725
+ tprintf ("(Was (%d,%d)->(%d,%d))\n",
1726
+ blob_box.left (), blob_box.bottom (),
1727
+ blob_box.right (), blob_box.top ());
1728
+ }
1729
+ }
1730
+ delete rotated_blob; //copy was only for the test
1731
+ }
1732
+ else {
1733
+ if (blob->blob () != NULL) {
1734
+ // if (testing_on && textord_show_final_rows)
1735
+ // tprintf("Rotating by (%g,%g)\n",
1736
+ // blob_rotation.x(),blob_rotation.y());
1737
+ poly_blob = rotate_blob (blob->blob (), blob_rotation);
1738
+ }
1739
+ else
1740
+ poly_blob = rotate_cblob (blob->cblob (),
1741
+ block->line_size,
1742
+ blob_rotation);
1743
+ if (test_underline
1744
+ (testing_on
1745
+ && textord_show_final_rows, poly_blob,
1746
+ row->intercept (),
1747
+ block->line_size * (textord_merge_x +
1748
+ textord_merge_asc / 2))) {
1749
+ if (testing_on && textord_show_final_rows) {
1750
+ tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
1751
+ poly_blob->bounding_box ().left (),
1752
+ poly_blob->bounding_box ().bottom (),
1753
+ poly_blob->bounding_box ().right (),
1754
+ poly_blob->bounding_box ().top ());
1755
+ tprintf ("(Was (%d,%d)->(%d,%d))\n",
1756
+ blob_box.left (), blob_box.bottom (),
1757
+ blob_box.right (), blob_box.top ());
1758
+ }
1759
+ under_it.add_after_then_move (blob_it.extract ());
1760
+ }
1761
+ delete poly_blob; //copy was only for the test
1762
+ }
1763
+ }
1764
+ }
1765
+ }
1766
+ }
1767
+
1768
+
1769
+ /**********************************************************************
1770
+ * pre_associate_blobs
1771
+ *
1772
+ * Associate overlapping blobs and fake chop wide blobs.
1773
+ *
+ * Within each row a blob is merged (blob->merge) with the blobs that
+ * follow it for as long as its horizontal overlap with the next blob
+ * is at least half the width (integer division) of either box.
+ * blob->chop is then called with iterators at the starting blob and at
+ * the last merged blob, the mirrored rotation, and
+ * block->line_size * textord_merge_x * textord_chop_width.
+ * Unless GRAPHICS_DISABLED is defined, when testing_on and
+ * textord_show_final_blobs are both set the boxes of blobs not joined
+ * to their predecessor are drawn in to_win (created from page_tr if it
+ * is NULL), cycling the pen colour from RED to MAGENTA row by row.
+ **********************************************************************/
1774
+
1775
+ void pre_associate_blobs( //make rough chars
1776
+ ICOORD page_tr, //top right
1777
+ TO_BLOCK *block, //block to do
1778
+ FCOORD rotation, //inverse landscape
1779
+ BOOL8 testing_on //correct orientation
1780
+ ) {
1781
+ #ifndef GRAPHICS_DISABLED
1782
+ ScrollView::Color colour; //of boxes
1783
+ #endif
1784
+ inT16 overlap; //of adjacent boxes
1785
+ BLOBNBOX *blob; //current blob
1786
+ BLOBNBOX *nextblob; //next in list
1787
+ TBOX blob_box;
1788
+ TBOX next_box; //next blob
1789
+ FCOORD blob_rotation; //inverse of rotation
1790
+ BLOBNBOX_IT blob_it; //iterator
1791
+ BLOBNBOX_IT start_it; //iterator
1792
+ TO_ROW_IT row_it = block->get_rows ();
1793
+
1794
+ #ifndef GRAPHICS_DISABLED
1795
+ colour = ScrollView::RED;
1796
+ #endif
1797
+
1798
+ blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1799
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1800
+ //get blobs
1801
+ blob_it.set_to_list (row_it.data ()->blob_list ());
1802
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1803
+ blob_it.forward ()) {
1804
+ blob = blob_it.data ();
1805
+ blob_box = blob->bounding_box ();
1806
+ start_it = blob_it; //save start point
1807
+ // if (testing_on && textord_show_final_blobs)
1808
+ // {
1809
+ // tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
1810
+ // blob_box.left(),blob_box.bottom(),
1811
+ // blob_box.right(),blob_box.top(),
1812
+ // (void*)blob,blob_it.length());
1813
+ // }
1814
+ do {
1815
+ if (!blob_it.at_last ()) {
1816
+ nextblob = blob_it.data_relative (1);
1817
+ next_box = nextblob->bounding_box ();
1818
+ overlap = next_box.width (); //start from full width
1819
+ if (blob_box.left () > next_box.left ())
1820
+ overlap -= blob_box.left () - next_box.left ();
1821
+ if (blob_box.right () < next_box.right ())
1822
+ overlap -= next_box.right () - blob_box.right ();
1823
+ if (overlap >= next_box.width () / 2
1824
+ || overlap >= blob_box.width () / 2) {
1825
+ //merge new blob
1826
+ blob->merge (nextblob);
1827
+ //get bigger box
1828
+ blob_box = blob->bounding_box ();
1829
+ blob_it.forward ();
1830
+ }
1831
+ else
1832
+ overlap = -1; //no overlap
1833
+ }
1834
+ else
1835
+ overlap = -1; //no overlap
1836
+ }
1837
+ while (overlap >= 0); //repeat while a merge happened
1838
+ blob->chop (&start_it, &blob_it,
1839
+ blob_rotation,
1840
+ block->line_size * textord_merge_x *
1841
+ textord_chop_width);
1842
+ //attempt chop
1843
+ }
1844
+ #ifndef GRAPHICS_DISABLED
1845
+ if (testing_on && textord_show_final_blobs) {
1846
+ if (to_win == NULL)
1847
+ create_to_win(page_tr);
1848
+ to_win->Pen(colour);
1849
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1850
+ blob_it.forward ()) {
1851
+ blob = blob_it.data ();
1852
+ blob_box = blob->bounding_box ();
1853
+ blob_box.rotate (rotation);
1854
+ if (!blob->joined_to_prev ()) {
1855
+ to_win->Rectangle (blob_box.left (), blob_box.bottom (),
1856
+ blob_box.right (), blob_box.top ());
1857
+ }
1858
+ }
1859
+ colour = (ScrollView::Color) (colour + 1);
1860
+ if (colour > ScrollView::MAGENTA)
1861
+ colour = ScrollView::RED;
1862
+ }
1863
+ #endif
1864
+ }
1865
+ }
1866
+
1867
+
1868
+ /**********************************************************************
1869
+ * fit_parallel_rows
1870
+ *
1871
+ * Re-fit the rows in the block to the given gradient.
1872
+ *
+ * Rows whose blob list is empty are extracted from the block and
+ * deleted; every other row is fitted with fit_parallel_lms.
+ * Unless GRAPHICS_DISABLED is defined, when testing_on is set each row
+ * is drawn with plot_parallel_row, cycling the colour from RED to
+ * MAGENTA. Finally the row list is sorted with row_y_order.
+ **********************************************************************/
1873
+
1874
+ void fit_parallel_rows( //find lines
1875
+ TO_BLOCK *block, //block to do
1876
+ float gradient, //gradient to fit
1877
+ FCOORD rotation, //for drawing
1878
+ inT32 block_edge, //edge of block
1879
+ BOOL8 testing_on //correct orientation
1880
+ ) {
1881
+ #ifndef GRAPHICS_DISABLED
1882
+ ScrollView::Color colour; //of row
1883
+ #endif
1884
+ TO_ROW_IT row_it = block->get_rows ();
1885
+
1886
+ row_it.move_to_first ();
1887
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1888
+ if (row_it.data ()->blob_list ()->empty ())
1889
+ delete row_it.extract (); //nothing in it
1890
+ else
1891
+ fit_parallel_lms (gradient, row_it.data ());
1892
+ }
1893
+ #ifndef GRAPHICS_DISABLED
1894
+ if (testing_on) {
1895
+ colour = ScrollView::RED;
1896
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1897
+ plot_parallel_row (row_it.data (), gradient,
1898
+ block_edge, colour, rotation);
1899
+ colour = (ScrollView::Color) (colour + 1);
1900
+ if (colour > ScrollView::MAGENTA)
1901
+ colour = ScrollView::RED;
1902
+ }
1903
+ }
1904
+ #endif
1905
+ row_it.sort (row_y_order); //may have gone out of order
1906
+ }
1907
+
1908
+
1909
+ /**********************************************************************
1910
+ * fit_parallel_lms
1911
+ *
1912
+ * Fit an LMS line to a row.
1913
+ * Make the fit parallel to the given gradient and set the
1914
+ * row accordingly.
1915
+ *
+ * For each blob not joined to its predecessor the point
+ * ((left + right) / 2, bottom) of its bounding box is added to an LMS
+ * fitter. constrained_fit is run with the given gradient and the
+ * result is stored with row->set_parallel_line.
+ * If textord_straight_baselines is set and more than lms_line_trials
+ * points were added, lms.fit (gradient, c) is also run. The values of
+ * gradient and c after that, together with lms.error (), are stored
+ * with row->set_line.
+ * NOTE(review): lms.fit is presumably able to overwrite the local
+ * gradient as well as c - confirm against the LMS declaration.
+ **********************************************************************/
1916
+
1917
+ void fit_parallel_lms( //fit one row
1918
+ float gradient, //forced gradient
1919
+ TO_ROW *row //row to fit
1920
+ ) {
1921
+ float c; //fitted line
1922
+ int blobcount; //no of blobs
1923
+ TBOX box; //blob box
1924
+ LMS lms (row->blob_list ()->length ());
1925
+ //blobs
1926
+ BLOBNBOX_IT blob_it = row->blob_list ();
1927
+
1928
+ blobcount = 0;
1929
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
1930
+ if (!blob_it.data ()->joined_to_prev ()) {
1931
+ box = blob_it.data ()->bounding_box ();
1932
+ lms.
1933
+ add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
1934
+ blobcount++;
1935
+ }
1936
+ }
1937
+ lms.constrained_fit (gradient, c);
1938
+ row->set_parallel_line (gradient, c, lms.error ());
1939
+ if (textord_straight_baselines && blobcount > lms_line_trials) {
1940
+ lms.fit (gradient, c);
1941
+ }
1942
+ //set the other too
1943
+ row->set_line (gradient, c, lms.error ());
1944
+ }
1945
+
1946
+
1947
+ /**********************************************************************
1948
+ * make_spline_rows
1949
+ *
1950
+ * Make the baseline spline of each row in the block.
1951
+ *
+ * Rows whose blob list is empty are extracted from the block and
+ * deleted; make_baseline_spline is called on every other row.
+ * If textord_old_baselines is set, make_old_baselines is then called
+ * on the block.
+ * Unless GRAPHICS_DISABLED is defined, when testing_on is set the row
+ * baselines are plotted in to_win, cycling the colour from RED to
+ * MAGENTA: once before make_old_baselines (when that is enabled) and
+ * once at the end.
+ * The gradient, rotation and block_edge parameters are not used in
+ * this function body.
+ **********************************************************************/
1952
+
1953
+ void make_spline_rows( //find lines
1954
+ TO_BLOCK *block, //block to do
1955
+ float gradient, //gradient to fit
1956
+ FCOORD rotation, //for drawing
1957
+ inT32 block_edge, //edge of block
1958
+ BOOL8 testing_on //correct orientation
1959
+ ) {
1960
+ #ifndef GRAPHICS_DISABLED
1961
+ ScrollView::Color colour; //of row
1962
+ #endif
1963
+ TO_ROW_IT row_it = block->get_rows ();
1964
+
1965
+ row_it.move_to_first ();
1966
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1967
+ if (row_it.data ()->blob_list ()->empty ())
1968
+ delete row_it.extract (); //nothing in it
1969
+ else
1970
+ make_baseline_spline (row_it.data (), block);
1971
+ }
1972
+ if (textord_old_baselines) {
1973
+ #ifndef GRAPHICS_DISABLED
1974
+ if (testing_on) {
1975
+ colour = ScrollView::RED;
1976
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
1977
+ row_it.forward ()) {
1978
+ row_it.data ()->baseline.plot (to_win, colour);
1979
+ colour = (ScrollView::Color) (colour + 1);
1980
+ if (colour > ScrollView::MAGENTA)
1981
+ colour = ScrollView::RED;
1982
+ }
1983
+ }
1984
+ #endif
1985
+ make_old_baselines(block, testing_on);
1986
+ }
1987
+ #ifndef GRAPHICS_DISABLED
1988
+ if (testing_on) {
1989
+ colour = ScrollView::RED;
1990
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1991
+ row_it.data ()->baseline.plot (to_win, colour);
1992
+ colour = (ScrollView::Color) (colour + 1);
1993
+ if (colour > ScrollView::MAGENTA)
1994
+ colour = ScrollView::RED;
1995
+ }
1996
+ }
1997
+ #endif
1998
+ }
1999
+
2000
+
2001
+ /**********************************************************************
2002
+ * make_baseline_spline
2003
+ *
2004
+ * Fit a baseline spline to a row.
2005
+ * Segment the row with segment_baseline, make quadratic coefficients
2006
+ * for each segment and store the result in row->baseline.
2007
+ *
+ * xstarts is allocated with one more entry than the row has blobs.
+ * If segment_baseline returns TRUE and neither
+ * textord_straight_baselines nor textord_parallel_baselines is set:
+ * - with textord_quadratic_baselines, each segment is fitted with
+ * lms.fit_quadratic over the blobs (not joined to their predecessor)
+ * whose centre x lies in [xstarts[segment], xstarts[segment + 1]);
+ * - otherwise the coefficients come from linear_spline_baseline.
+ * In all other cases a single segment is made from row->line_m () and
+ * row->line_c () with a zero quadratic term.
+ * Both work arrays are freed once the QSPLINE has been constructed.
+ *
+ * NOTE(review): the inner test of textord_quadratic_baselines repeats
+ * the enclosing one, so its else branch (lms.fit) looks unreachable
+ * unless the variable changes between the two tests.
+ **********************************************************************/
2008
+
2009
+ void make_baseline_spline( //make spline
2010
+ TO_ROW *row, //row to fit
2011
+ TO_BLOCK *block //block it came from
2012
+ ) {
2013
+ float b, c; //fitted curve
2014
+ float middle; //x middle of blob
2015
+ TBOX box; //blob box
2016
+ LMS lms (row->blob_list ()->length ());
2017
+ //blobs
2018
+ BLOBNBOX_IT blob_it = row->blob_list ();
2019
+ inT32 *xstarts; //spline boundaries
2020
+ double *coeffs; //quadratic coeffs
2021
+ inT32 segments; //no of segments
2022
+ inT32 segment; //current segment
2023
+
2024
+ xstarts =
2025
+ (inT32 *) alloc_mem ((row->blob_list ()->length () + 1) * sizeof (inT32));
2026
+ if (segment_baseline (row, block, segments, xstarts)
2027
+ && !textord_straight_baselines && !textord_parallel_baselines) {
2028
+ if (textord_quadratic_baselines) {
2029
+ coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2030
+ for (segment = 0; segment < segments; segment++) {
2031
+ lms.clear ();
2032
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
2033
+ blob_it.forward ()) {
2034
+ if (!blob_it.data ()->joined_to_prev ()) {
2035
+ box = blob_it.data ()->bounding_box ();
2036
+ middle = (box.left () + box.right ()) / 2.0;
2037
+ if (middle >= xstarts[segment]
2038
+ && middle < xstarts[segment + 1]) {
2039
+ lms.add (FCOORD (middle, box.bottom ()));
2040
+ }
2041
+ }
2042
+ }
2043
+ if (textord_quadratic_baselines)
2044
+ lms.fit_quadratic (block->line_size *
2045
+ textord_spline_outlier_fraction,
2046
+ coeffs[segment * 3], b, c);
2047
+ else {
2048
+ lms.fit (b, c);
2049
+ coeffs[segment * 3] = 0;
2050
+ }
2051
+ coeffs[segment * 3 + 1] = b;
2052
+ coeffs[segment * 3 + 2] = c;
2053
+ }
2054
+ }
2055
+ else
2056
+ coeffs = linear_spline_baseline (row, block, segments, xstarts);
2057
+ }
2058
+ else {
2059
+ xstarts[1] = xstarts[segments]; //collapse to one segment
2060
+ segments = 1;
2061
+ coeffs = (double *) alloc_mem (3 * sizeof (double));
2062
+ coeffs[0] = 0;
2063
+ coeffs[1] = row->line_m ();
2064
+ coeffs[2] = row->line_c ();
2065
+ }
2066
+ row->baseline = QSPLINE (segments, xstarts, coeffs);
2067
+ free_mem(coeffs);
2068
+ free_mem(xstarts);
2069
+ }
2070
+
2071
+
2072
+ /**********************************************************************
2073
+ * segment_baseline
2074
+ *
2075
+ * Divide the baseline up into segments which require a different
2076
+ * quadratic fitted to them.
2077
+ * Return TRUE if enough blobs were far enough away to need a quadratic.
2078
+ *
+ * On return segments holds the number of segments and
+ * xstarts[0..segments] their x boundaries; xstarts[0] is the left of
+ * the first box.
+ * If the row has no more than textord_spline_medianwin blobs, or fewer
+ * than textord_spline_minblobs, or the list cycles while the median
+ * window is being primed, one segment is returned with FALSE.
+ * Otherwise a window of textord_spline_medianwin shifts (box bottom
+ * minus row->line_m () * middle minus row->line_c ()) slides along the
+ * row. The middle entry of the sorted window, yshifts[medianwin / 2],
+ * is classified as above (1), below (-1) or on (0) the line using
+ * textord_spline_shift_fraction * block->line_size. A change of state
+ * after more than textord_spline_minblobs blobs starts a new segment,
+ * and any non-zero state makes the result TRUE.
+ * A final segment that is too short is merged into the previous one.
+ **********************************************************************/
2079
+
2080
+ BOOL8
2081
+ segment_baseline ( //split baseline
2082
+ TO_ROW * row, //row to fit
2083
+ TO_BLOCK * block, //block it came from
2084
+ inT32 & segments, //no of segments
2085
+ inT32 xstarts[] //coords of segments
2086
+ ) {
2087
+ BOOL8 needs_curve; //needs curved line
2088
+ int blobcount; //no of blobs
2089
+ int blobindex; //current blob
2090
+ int last_state; //above, on , below
2091
+ int state; //of current blob
2092
+ float yshift; //from baseline
2093
+ TBOX box; //blob box
2094
+ TBOX new_box; //new_it box
2095
+ float middle; //xcentre of blob
2096
+ //blobs
2097
+ BLOBNBOX_IT blob_it = row->blob_list ();
2098
+ BLOBNBOX_IT new_it = blob_it; //front end
2099
+ SORTED_FLOATS yshifts; //shifts from baseline
2100
+
2101
+ needs_curve = FALSE;
2102
+ box = box_next_pre_chopped (&blob_it);
2103
+ xstarts[0] = box.left ();
2104
+ segments = 1;
2105
+ blobcount = row->blob_list ()->length ();
2106
+ if (textord_oldbl_debug)
2107
+ tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
2108
+ blobcount, box.left (), box.bottom ());
2109
+ if (blobcount <= textord_spline_medianwin
2110
+ || blobcount < textord_spline_minblobs) {
2111
+ blob_it.move_to_last ();
2112
+ box = blob_it.data ()->bounding_box ();
2113
+ xstarts[1] = box.right ();
2114
+ return FALSE; //too few blobs to segment
2115
+ }
2116
+ last_state = 0;
2117
+ new_it.mark_cycle_pt ();
2118
+ for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2119
+ new_box = box_next_pre_chopped (&new_it);
2120
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2121
+ yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2122
+ //record shift
2123
+ yshifts.add (yshift, blobindex);
2124
+ if (new_it.cycled_list ()) {
2125
+ xstarts[1] = new_box.right ();
2126
+ return FALSE; //ran out while priming window
2127
+ }
2128
+ }
2129
+ for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
2130
+ box = box_next_pre_chopped (&blob_it);
2131
+ do {
2132
+ new_box = box_next_pre_chopped (&new_it);
2133
+ //get middle one
2134
+ yshift = yshifts[textord_spline_medianwin / 2];
2135
+ if (yshift > textord_spline_shift_fraction * block->line_size)
2136
+ state = 1;
2137
+ else if (-yshift > textord_spline_shift_fraction * block->line_size)
2138
+ state = -1;
2139
+ else
2140
+ state = 0;
2141
+ if (state != 0)
2142
+ needs_curve = TRUE;
2143
+ // tprintf("State=%d, prev=%d, shift=%g\n",
2144
+ // state,last_state,yshift);
2145
+ if (state != last_state && blobcount > textord_spline_minblobs) {
2146
+ xstarts[segments++] = box.left (); //start a new segment
2147
+ blobcount = 0;
2148
+ }
2149
+ last_state = state;
2150
+ yshifts.remove (blobindex - textord_spline_medianwin);
2151
+ box = box_next_pre_chopped (&blob_it);
2152
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2153
+ yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2154
+ yshifts.add (yshift, blobindex);
2155
+ blobindex++;
2156
+ blobcount++;
2157
+ }
2158
+ while (!new_it.cycled_list ());
2159
+ if (blobcount > textord_spline_minblobs || segments == 1) {
2160
+ xstarts[segments] = new_box.right ();
2161
+ }
2162
+ else {
2163
+ xstarts[--segments] = new_box.right (); //merge short last segment
2164
+ }
2165
+ if (textord_oldbl_debug)
2166
+ tprintf ("Made %d segments on row at (%d,%d)\n",
2167
+ segments, box.right (), box.bottom ());
2168
+ return needs_curve;
2169
+ }
2170
+
2171
+
2172
+ /**********************************************************************
2173
+ * linear_spline_baseline
2174
+ *
2175
+ * Divide the baseline up into segments of roughly equal blob count
2176
+ * and fit a straight line to each of them.
2177
+ * Return the array of quadratic coefficients, 3 per segment.
2178
+ *
+ * The blobs are counted by stepping through the row with
+ * box_next_pre_chopped. segments is set to
+ * blobcount / textord_spline_medianwin (at least 1) and xstarts[0] to
+ * the left of the first box; further xstarts entries are filled in as
+ * the segments are fitted.
+ * Two iterators, the second starting blobs_per_segment / 2 boxes
+ * ahead, take turns to feed a run of box bottom-centre points to
+ * lms.fit. Each segment gets the triple (0, b, c).
+ * The returned array is obtained from alloc_mem; make_baseline_spline
+ * in this file releases it with free_mem.
+ * The block parameter is not used in this function body.
+ **********************************************************************/
2179
+
2180
+ double *
2181
+ linear_spline_baseline ( //fit linear spline
2182
+ TO_ROW * row, //row to fit
2183
+ TO_BLOCK * block, //block it came from
2184
+ inT32 & segments, //no of segments
2185
+ inT32 xstarts[] //coords of segments
2186
+ ) {
2187
+ int blobcount; //no of blobs
2188
+ int blobindex; //current blob
2189
+ int index1, index2; //blob numbers
2190
+ int blobs_per_segment; //blobs in each
2191
+ TBOX box; //blob box
2192
+ TBOX new_box; //new_it box
2193
+ float middle; //xcentre of blob
2194
+ //blobs
2195
+ BLOBNBOX_IT blob_it = row->blob_list ();
2196
+ BLOBNBOX_IT new_it = blob_it; //front end
2197
+ float b, c; //fitted curve
2198
+ LMS lms (row->blob_list ()->length ());
2199
+ double *coeffs; //quadratic coeffs
2200
+ inT32 segment; //current segment
2201
+
2202
+ box = box_next_pre_chopped (&blob_it);
2203
+ xstarts[0] = box.left ();
2204
+ blobcount = 1;
2205
+ while (!blob_it.at_first ()) { //count the boxes
2206
+ blobcount++;
2207
+ box = box_next_pre_chopped (&blob_it);
2208
+ }
2209
+ segments = blobcount / textord_spline_medianwin;
2210
+ if (segments < 1)
2211
+ segments = 1;
2212
+ blobs_per_segment = blobcount / segments;
2213
+ coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
2214
+ if (textord_oldbl_debug)
2215
+ tprintf
2216
+ ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2217
+ blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
2218
+ segment = 1;
2219
+ for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2220
+ box_next_pre_chopped(&new_it);
2221
+ index1 = 0;
2222
+ blobindex = index2;
2223
+ do {
2224
+ blobindex += blobs_per_segment;
2225
+ lms.clear ();
2226
+ while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2227
+ box = box_next_pre_chopped (&blob_it);
2228
+ middle = (box.left () + box.right ()) / 2.0;
2229
+ lms.add (FCOORD (middle, box.bottom ()));
2230
+ index1++;
2231
+ if (index1 == blobindex - blobs_per_segment / 2
2232
+ || index1 == blobcount - 1) {
2233
+ xstarts[segment] = box.left ();
2234
+ }
2235
+ }
2236
+ lms.fit (b, c);
2237
+ coeffs[segment * 3 - 3] = 0; //segment is 1-based here
2238
+ coeffs[segment * 3 - 2] = b;
2239
+ coeffs[segment * 3 - 1] = c;
2240
+ segment++;
2241
+ if (segment > segments)
2242
+ break;
2243
+
2244
+ blobindex += blobs_per_segment;
2245
+ lms.clear ();
2246
+ while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2247
+ new_box = box_next_pre_chopped (&new_it);
2248
+ middle = (new_box.left () + new_box.right ()) / 2.0;
2249
+ lms.add (FCOORD (middle, new_box.bottom ()));
2250
+ index2++;
2251
+ if (index2 == blobindex - blobs_per_segment / 2
2252
+ || index2 == blobcount - 1) {
2253
+ xstarts[segment] = new_box.left ();
2254
+ }
2255
+ }
2256
+ lms.fit (b, c);
2257
+ coeffs[segment * 3 - 3] = 0;
2258
+ coeffs[segment * 3 - 2] = b;
2259
+ coeffs[segment * 3 - 1] = c;
2260
+ segment++;
2261
+ }
2262
+ while (segment <= segments);
2263
+ return coeffs;
2264
+ }
2265
+
2266
+
2267
+ /**********************************************************************
2268
+ * assign_blobs_to_rows
2269
+ *
2270
+ * Make enough rows to allocate all the given blobs to one.
2271
+ * If a block skew is given, use that, else attempt to track it.
2272
+ *
+ * The blobs of block->blobs are sorted with blob_x_order and taken in
+ * turn. The top and bottom of each blob are corrected by block_skew,
+ * which is computed from *gradient when gradient is not NULL and is
+ * otherwise a running estimate, smoothed by smooth_factor and
+ * optionally interpolated across gaps (textord_interpolating_skew).
+ * The row list is searched from the first row for a row overlapping
+ * the blob; most_overlapping_row then decides between ASSIGN, NEW_ROW
+ * and REJECT. A NEW_ROW result is turned into ASSIGN when
+ * reject_misses is FALSE. When nothing overlaps and make_new_rows is
+ * FALSE, the blob may still be assigned to a neighbouring row that is
+ * close enough.
+ * A new TO_ROW is made only when make_new_rows is set and the blob
+ * height is below block->max_blob_size; otherwise the blob is
+ * rejected and stays in block->blobs.
+ * After each assignment the affected row is moved so that the list
+ * stays in decreasing min_y order. Rows left with an empty blob list
+ * are deleted at the end.
+ * pass is used only in the debug message for the test blob, and
+ * drawing_skew draws the tracked skew in to_win unless
+ * GRAPHICS_DISABLED is defined.
+ **********************************************************************/
2273
+
2274
+ void assign_blobs_to_rows( //find lines
2275
+ TO_BLOCK *block, //block to do
2276
+ float *gradient, //block skew
2277
+ int pass, //identification
2278
+ BOOL8 reject_misses, //chuck big ones out
2279
+ BOOL8 make_new_rows, //add rows for unmatched
2280
+ BOOL8 drawing_skew //draw smoothed skew
2281
+ ) {
2282
+ OVERLAP_STATE overlap_result; //what to do with it
2283
+ float ycoord; //current y
2284
+ float top, bottom; //of blob
2285
+ float g_length = 1.0f; //from gradient
2286
+ inT16 row_count; //no of rows
2287
+ inT16 left_x; //left edge
2288
+ inT16 last_x; //previous edge
2289
+ float block_skew; //y delta
2290
+ float smooth_factor; //for new coords
2291
+ float near_dist; //dist to nearest row
2292
+ ICOORD testpt; //testing only
2293
+ BLOBNBOX *blob; //current blob
2294
+ TO_ROW *row; //current row
2295
+ TO_ROW *dest_row; //row to put blob in
2296
+ //iterators
2297
+ BLOBNBOX_IT blob_it = &block->blobs;
2298
+ TO_ROW_IT row_it = block->get_rows ();
2299
+
2300
+ ycoord =
2301
+ (block->block->bounding_box ().bottom () +
2302
+ block->block->bounding_box ().top ()) / 2.0f;
2303
+ if (gradient != NULL)
2304
+ g_length = sqrt (1 + *gradient * *gradient);
2305
+ #ifndef GRAPHICS_DISABLED
2306
+ if (drawing_skew)
2307
+ to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
2308
+ #endif
2309
+ testpt = ICOORD (textord_test_x, textord_test_y);
2310
+ blob_it.sort (blob_x_order);
2311
+ smooth_factor = 1.0;
2312
+ block_skew = 0.0f;
2313
+ row_count = row_it.length (); //might have rows
2314
+ if (!blob_it.empty ()) {
2315
+ left_x = blob_it.data ()->bounding_box ().left ();
2316
+ }
2317
+ else {
2318
+ left_x = block->block->bounding_box ().left ();
2319
+ }
2320
+ last_x = left_x;
2321
+ for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2322
+ blob = blob_it.data ();
2323
+ if (gradient != NULL) {
2324
+ block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
2325
+ + *gradient / g_length * blob->bounding_box ().left ();
2326
+ }
2327
+ else if (blob->bounding_box ().left () - last_x > block->line_size / 2
2328
+ && last_x - left_x > block->line_size * 2
2329
+ && textord_interpolating_skew) {
2330
+ // tprintf("Interpolating skew from %g",block_skew);
2331
+ block_skew *= (float) (blob->bounding_box ().left () - left_x)
2332
+ / (last_x - left_x);
2333
+ // tprintf("to %g\n",block_skew);
2334
+ }
2335
+ last_x = blob->bounding_box ().left ();
2336
+ top = blob->bounding_box ().top () - block_skew;
2337
+ bottom = blob->bounding_box ().bottom () - block_skew;
2338
+ #ifndef GRAPHICS_DISABLED
2339
+ if (drawing_skew)
2340
+ to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
2341
+ #endif
2342
+ if (!row_it.empty ()) {
2343
+ for (row_it.move_to_first ();
2344
+ !row_it.at_last () && row_it.data ()->min_y () > top;
2345
+ row_it.forward ());
2346
+ row = row_it.data ();
2347
+ if (row->min_y () <= top && row->max_y () >= bottom) {
2348
+ //any overlap
2349
+ dest_row = row;
2350
+ overlap_result = most_overlapping_row (&row_it, dest_row,
2351
+ top, bottom,
2352
+ block->line_size,
2353
+ blob->bounding_box ().
2354
+ contains (testpt));
2355
+ if (overlap_result == NEW_ROW && !reject_misses)
2356
+ overlap_result = ASSIGN;
2357
+ }
2358
+ else {
2359
+ overlap_result = NEW_ROW;
2360
+ if (!make_new_rows) {
2361
+ near_dist = row_it.data_relative (-1)->min_y () - top;
2362
+ //below bottom
2363
+ if (bottom < row->min_y ()) {
2364
+ if (row->min_y () - bottom <=
2365
+ (block->line_spacing -
2366
+ block->line_size) * textord_merge_desc) {
2367
+ //done it
2368
+ overlap_result = ASSIGN;
2369
+ dest_row = row;
2370
+ }
2371
+ }
2372
+ else if (near_dist > 0
2373
+ && near_dist < bottom - row->max_y ()) {
2374
+ row_it.backward ();
2375
+ dest_row = row_it.data ();
2376
+ if (dest_row->min_y () - bottom <=
2377
+ (block->line_spacing -
2378
+ block->line_size) * textord_merge_desc) {
2379
+ //done it
2380
+ overlap_result = ASSIGN;
2381
+ }
2382
+ }
2383
+ else {
2384
+ if (top - row->max_y () <=
2385
+ (block->line_spacing -
2386
+ block->line_size) * (textord_overlap_x +
2387
+ textord_merge_asc)) {
2388
+ //done it
2389
+ overlap_result = ASSIGN;
2390
+ dest_row = row;
2391
+ }
2392
+ }
2393
+ }
2394
+ }
2395
+ if (overlap_result == ASSIGN)
2396
+ dest_row->add_blob (blob_it.extract (), top, bottom,
2397
+ block->line_size);
2398
+ if (overlap_result == NEW_ROW) {
2399
+ if (make_new_rows && top - bottom < block->max_blob_size) {
2400
+ dest_row =
2401
+ new TO_ROW (blob_it.extract (), top, bottom,
2402
+ block->line_size);
2403
+ row_count++;
2404
+ if (bottom > row_it.data ()->min_y ())
2405
+ row_it.add_before_then_move (dest_row);
2406
+ //insert in right place
2407
+ else
2408
+ row_it.add_after_then_move (dest_row);
2409
+ smooth_factor =
2410
+ 1.0 / (row_count * textord_skew_lag +
2411
+ textord_skewsmooth_offset);
2412
+ }
2413
+ else
2414
+ overlap_result = REJECT;
2415
+ }
2416
+ }
2417
+ else if (make_new_rows && top - bottom < block->max_blob_size) {
2418
+ overlap_result = NEW_ROW; //first row of the block
2419
+ dest_row =
2420
+ new TO_ROW (blob_it.extract (), top, bottom, block->line_size);
2421
+ row_count++;
2422
+ row_it.add_after_then_move (dest_row);
2423
+ smooth_factor = 1.0 / (row_count * textord_skew_lag +
2424
+ textord_skewsmooth_offset2);
2425
+ }
2426
+ else
2427
+ overlap_result = REJECT;
2428
+ if (blob->bounding_box ().contains (testpt)) {
2429
+ if (overlap_result != REJECT) {
2430
+ tprintf ("Test blob assigned to row at (%g,%g) on pass %d\n",
2431
+ dest_row->min_y (), dest_row->max_y (), pass);
2432
+ }
2433
+ else {
2434
+ tprintf ("Test blob assigned to no row on pass %d\n", pass);
2435
+ }
2436
+ }
2437
+ if (overlap_result != REJECT) {
2438
+ while (!row_it.at_first ()
2439
+ && row_it.data ()->min_y () >
2440
+ row_it.data_relative (-1)->min_y ()) {
2441
+ row = row_it.extract ();
2442
+ row_it.backward ();
2443
+ row_it.add_before_then_move (row);
2444
+ }
2445
+ while (!row_it.at_last ()
2446
+ && row_it.data ()->min_y () <
2447
+ row_it.data_relative (1)->min_y ()) {
2448
+ row = row_it.extract ();
2449
+ row_it.forward ();
2450
+ //keep rows in order
2451
+ row_it.add_after_then_move (row);
2452
+ }
2453
+ block_skew = (1 - smooth_factor) * block_skew
2454
+ + smooth_factor * (blob->bounding_box ().bottom () -
2455
+ dest_row->initial_min_y ());
2456
+ }
2457
+ }
2458
+ for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2459
+ if (row_it.data ()->blob_list ()->empty ())
2460
+ delete row_it.extract (); //discard empty rows
2461
+ }
2462
+ }
2463
+
2464
+
2465
+ /**********************************************************************
2466
+ * most_overlapping_row
2467
+ * Scans forward from the row at row_it, merging an overlapping row into the best so far when their joint height fits in rowsize.
2468
+ * Outputs the row which most overlaps the blob in best_row, leaves row_it on it, and returns ASSIGN, NEW_ROW or REJECT.
2469
+ **********************************************************************/
2470
+
2471
+ OVERLAP_STATE most_overlapping_row( //find best row
2472
+ TO_ROW_IT *row_it, //iterator: in at first candidate row, out at best_row
2473
+ TO_ROW *&best_row, //output row
2474
+ float top, //top of blob
2475
+ float bottom, //bottom of blob
2476
+ float rowsize, //max row size: caps merges, scales the thresholds
2477
+ BOOL8 testing_blob //if set, trace decisions with tprintf
2478
+ ) {
2479
+ OVERLAP_STATE result; //result of tests
2480
+ float overlap; //of blob & test_row
2481
+ float bestover; //overlap of blob & best row so far
2482
+ float merge_top, merge_bottom; //y-limits of merged row
2483
+ ICOORD testpt; //testing only; not referenced below
2484
+ TO_ROW *row; //best row so far
2485
+ TO_ROW *test_row; //next row being compared
2486
+ BLOBNBOX_IT blob_it; //for merging rows
2487
+
2488
+ result = ASSIGN; //default verdict unless changed below
2489
+ row = row_it->data ();
2490
+ bestover = top - bottom; //start from full blob height
2491
+ if (top > row->max_y ())
2492
+ bestover -= top - row->max_y (); //less the part above the row
2493
+ if (bottom < row->min_y ())
2494
+ //less the part below the row
2495
+ bestover -= row->min_y () - bottom;
2496
+ if (testing_blob) {
2497
+ tprintf ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f\n",
2498
+ bottom, top, row->min_y (), row->max_y (), bestover);
2499
+ }
2500
+ test_row = row;
2501
+ do {
2502
+ if (!row_it->at_last ()) {
2503
+ row_it->forward ();
2504
+ test_row = row_it->data ();
2505
+ if (test_row->min_y () <= top && test_row->max_y () >= bottom) { //test_row meets blob in y
2506
+ merge_top =
2507
+ test_row->max_y () >
2508
+ row->max_y ()? test_row->max_y () : row->max_y (); //higher of the two tops
2509
+ merge_bottom =
2510
+ test_row->min_y () <
2511
+ row->min_y ()? test_row->min_y () : row->min_y (); //lower of the two bottoms
2512
+ if (merge_top - merge_bottom <= rowsize) { //merged row is not too tall
2513
+ if (testing_blob) {
2514
+ tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
2515
+ row->min_y (), row->max_y (),
2516
+ test_row->min_y (), test_row->max_y ());
2517
+ }
2518
+ test_row->set_limits (merge_bottom, merge_top);
2519
+ blob_it.set_to_list (test_row->blob_list ());
2520
+ blob_it.add_list_after (row->blob_list ()); //take row's blobs (presumably emptying row's list - confirm)
2521
+ blob_it.sort (blob_x_order); //re-sort merged blobs with blob_x_order
2522
+ row_it->backward (); //step to the element before test_row
2523
+ delete row_it->extract (); //NOTE(review): assumes that element is `row` - confirm when row is further back
2524
+ row_it->forward (); //presumably back on test_row - confirm
2525
+ bestover = -1.0f; //force replacement
2526
+ }
2527
+ overlap = top - bottom; //same overlap measure, now against test_row
2528
+ if (top > test_row->max_y ())
2529
+ overlap -= top - test_row->max_y ();
2530
+ if (bottom < test_row->min_y ())
2531
+ overlap -= test_row->min_y () - bottom;
2532
+ if (bestover >= rowsize - 1 && overlap >= rowsize - 1) { //large overlap with both rows
2533
+ result = REJECT;
2534
+ }
2535
+ if (overlap > bestover) {
2536
+ bestover = overlap; //find biggest overlap
2537
+ row = test_row;
2538
+ }
2539
+ if (testing_blob) {
2540
+ tprintf
2541
+ ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f->%f\n",
2542
+ bottom, top, test_row->min_y (), test_row->max_y (),
2543
+ overlap, bestover);
2544
+ }
2545
+ }
2546
+ }
2547
+ }
2548
+ while (!row_it->at_last ()
2549
+ && test_row->min_y () <= top && test_row->max_y () >= bottom); //stop at list end or first row clear of the blob
2550
+ while (row_it->data () != row)
2551
+ row_it->backward (); //make it point to row
2552
+ //blob part outside the best row is compared with rowsize * textord_overlap_x
2553
+ if (top - bottom - bestover > rowsize * textord_overlap_x &&
2554
+ (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
2555
+ && result == ASSIGN)
2556
+ result = NEW_ROW; //doesn't overlap enough
2557
+ best_row = row;
2558
+ return result;
2559
+ }
2560
+
2561
+
2562
+ /**********************************************************************
2563
+ * blob_x_order
2564
+ *
2565
+ * Sort comparator: orders blobs by the left edge of their bounding box, smallest first.
2566
+ **********************************************************************/
2567
+
2568
+ int blob_x_order( //sort comparator
2569
+ const void *item1, //address of a BLOBNBOX pointer
2570
+ const void *item2) { //address of a BLOBNBOX pointer
2571
+ //unwrap the two entries being compared
2572
+ BLOBNBOX *first = *(BLOBNBOX **) item1;
2573
+ BLOBNBOX *second = *(BLOBNBOX **) item2;
2574
+
2575
+ //smaller left edge sorts first
2576
+ if (first->bounding_box ().left () < second->bounding_box ().left ())
2577
+ return -1;
2578
+ //otherwise 1 when larger, 0 on a tie
2579
+ return first->bounding_box ().left () > second->bounding_box ().left ()
2580
+ ? 1
2581
+ : 0;
2582
+ }
2583
+
2584
+
2585
+ /**********************************************************************
2586
+ * row_y_order
2587
+ *
2588
+ * Sort comparator: orders rows in y from page top, i.e. larger parallel_c () first.
2589
+ **********************************************************************/
2590
+
2591
+ int row_y_order( //sort comparator
2592
+ const void *item1, //address of a TO_ROW pointer
2593
+ const void *item2) { //address of a TO_ROW pointer
2594
+ //unwrap the two entries being compared
2595
+ TO_ROW *first = *(TO_ROW **) item1;
2596
+ TO_ROW *second = *(TO_ROW **) item2;
2597
+
2598
+ //larger parallel_c sorts first
2599
+ if (first->parallel_c () > second->parallel_c ())
2600
+ return -1;
2601
+ //otherwise 1 when smaller, 0 on a tie
2602
+ return first->parallel_c () < second->parallel_c ()
2603
+ ? 1
2604
+ : 0;
2605
+ }
2606
+
2607
+
2608
+ /**********************************************************************
2609
+ * row_spacing_order
2610
+ *
2611
+ * Qsort style comparator: orders 2 TO_ROWs by their spacing value, smallest first.
2612
+ **********************************************************************/
2613
+
2614
+ int row_spacing_order( //sort comparator
2615
+ const void *item1, //address of a TO_ROW pointer
2616
+ const void *item2) { //address of a TO_ROW pointer
2617
+ //unwrap the two entries being compared
2618
+ TO_ROW *first = *(TO_ROW **) item1;
2619
+ TO_ROW *second = *(TO_ROW **) item2;
2620
+
2621
+ //smaller spacing sorts first
2622
+ if (first->spacing < second->spacing)
2623
+ return -1;
2624
+ //otherwise 1 when larger, 0 on a tie
2625
+ return first->spacing > second->spacing
2626
+ ? 1
2627
+ : 0;
2628
+ }