tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
|
@@ -0,0 +1,1939 @@
|
|
|
1
|
+
#include "mfcpch.h"
|
|
2
|
+
#include "tovars.h"
|
|
3
|
+
#include "drawtord.h"
|
|
4
|
+
#include "tospace.h"
|
|
5
|
+
#include "ndminx.h"
|
|
6
|
+
#include "statistc.h"
|
|
7
|
+
|
|
8
|
+
#define EXTERN
|
|
9
|
+
EXTERN BOOL_VAR (tosp_old_to_method, FALSE, "Space stats use prechopping?");
|
|
10
|
+
EXTERN BOOL_VAR (tosp_only_use_prop_rows, TRUE,
|
|
11
|
+
"Block stats to use fixed pitch rows?");
|
|
12
|
+
EXTERN BOOL_VAR (tosp_use_pre_chopping, FALSE,
|
|
13
|
+
"Space stats use prechopping?");
|
|
14
|
+
EXTERN BOOL_VAR (tosp_old_to_bug_fix, FALSE, "Fix suspected bug in old code");
|
|
15
|
+
EXTERN BOOL_VAR (tosp_block_use_cert_spaces, TRUE,
|
|
16
|
+
"Only stat OBVIOUS spaces");
|
|
17
|
+
EXTERN BOOL_VAR (tosp_row_use_cert_spaces, TRUE, "Only stat OBVIOUS spaces");
|
|
18
|
+
EXTERN BOOL_VAR (tosp_narrow_blobs_not_cert, TRUE,
|
|
19
|
+
"Only stat OBVIOUS spaces");
|
|
20
|
+
EXTERN BOOL_VAR (tosp_row_use_cert_spaces1, TRUE, "Only stat OBVIOUS spaces");
|
|
21
|
+
EXTERN BOOL_VAR (tosp_recovery_isolated_row_stats, TRUE,
|
|
22
|
+
"Use row alone when inadequate cert spaces");
|
|
23
|
+
EXTERN BOOL_VAR (tosp_only_small_gaps_for_kern, FALSE, "Better guess");
|
|
24
|
+
EXTERN BOOL_VAR (tosp_all_flips_fuzzy, FALSE, "Pass ANY flip to context?");
|
|
25
|
+
EXTERN BOOL_VAR (tosp_fuzzy_limit_all, TRUE,
|
|
26
|
+
"Dont restrict kn->sp fuzzy limit to tables");
|
|
27
|
+
EXTERN BOOL_VAR (tosp_stats_use_xht_gaps, TRUE,
|
|
28
|
+
"Use within xht gap for wd breaks");
|
|
29
|
+
EXTERN BOOL_VAR (tosp_use_xht_gaps, TRUE, "Use within xht gap for wd breaks");
|
|
30
|
+
EXTERN BOOL_VAR (tosp_only_use_xht_gaps, FALSE,
|
|
31
|
+
"Only use within xht gap for wd breaks");
|
|
32
|
+
EXTERN BOOL_VAR (tosp_rule_9_test_punct, FALSE,
|
|
33
|
+
"Dont chng kn to space next to punct");
|
|
34
|
+
EXTERN BOOL_VAR (tosp_flip_fuzz_kn_to_sp, TRUE, "Default flip");
|
|
35
|
+
EXTERN BOOL_VAR (tosp_flip_fuzz_sp_to_kn, TRUE, "Default flip");
|
|
36
|
+
EXTERN BOOL_VAR (tosp_improve_thresh, FALSE, "Enable improvement heuristic");
|
|
37
|
+
EXTERN INT_VAR (tosp_debug_level, 0, "Debug data");
|
|
38
|
+
EXTERN INT_VAR (tosp_enough_space_samples_for_median, 3,
|
|
39
|
+
"or should we use mean");
|
|
40
|
+
EXTERN INT_VAR (tosp_redo_kern_limit, 10,
|
|
41
|
+
"No.samples reqd to reestimate for row");
|
|
42
|
+
EXTERN INT_VAR (tosp_few_samples, 40,
|
|
43
|
+
"No.gaps reqd with 1 large gap to treat as a table");
|
|
44
|
+
EXTERN INT_VAR (tosp_short_row, 20,
|
|
45
|
+
"No.gaps reqd with few cert spaces to use certs");
|
|
46
|
+
EXTERN INT_VAR (tosp_sanity_method, 1, "How to avoid being silly");
|
|
47
|
+
EXTERN double_VAR (tosp_threshold_bias1, 0,
|
|
48
|
+
"how far between kern and space?");
|
|
49
|
+
EXTERN double_VAR (tosp_threshold_bias2, 0,
|
|
50
|
+
"how far between kern and space?");
|
|
51
|
+
EXTERN double_VAR (tosp_narrow_fraction, 0.3, "Fract of xheight for narrow");
|
|
52
|
+
EXTERN double_VAR (tosp_narrow_aspect_ratio, 0.48,
|
|
53
|
+
"narrow if w/h less than this");
|
|
54
|
+
EXTERN double_VAR (tosp_wide_fraction, 0.52, "Fract of xheight for wide");
|
|
55
|
+
EXTERN double_VAR (tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this");
|
|
56
|
+
EXTERN double_VAR (tosp_fuzzy_space_factor, 0.6,
|
|
57
|
+
"Fract of xheight for fuzz sp");
|
|
58
|
+
EXTERN double_VAR (tosp_fuzzy_space_factor1, 0.5,
|
|
59
|
+
"Fract of xheight for fuzz sp");
|
|
60
|
+
EXTERN double_VAR (tosp_fuzzy_space_factor2, 0.72,
|
|
61
|
+
"Fract of xheight for fuzz sp");
|
|
62
|
+
EXTERN double_VAR (tosp_gap_factor, 0.83, "gap ratio to flip sp->kern");
|
|
63
|
+
EXTERN double_VAR (tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp");
|
|
64
|
+
EXTERN double_VAR (tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp");
|
|
65
|
+
EXTERN double_VAR (tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp");
|
|
66
|
+
EXTERN double_VAR (tosp_ignore_big_gaps, -1, "xht multiplier");
|
|
67
|
+
EXTERN double_VAR (tosp_ignore_very_big_gaps, 3.5, "xht multiplier");
|
|
68
|
+
EXTERN double_VAR (tosp_rep_space, 1.6, "rep gap multiplier for space");
|
|
69
|
+
EXTERN double_VAR (tosp_enough_small_gaps, 0.65,
|
|
70
|
+
"Fract of kerns reqd for isolated row stats");
|
|
71
|
+
EXTERN double_VAR (tosp_table_kn_sp_ratio, 2.25,
|
|
72
|
+
"Min difference of kn & sp in table");
|
|
73
|
+
EXTERN double_VAR (tosp_table_xht_sp_ratio, 0.33,
|
|
74
|
+
"Expect spaces bigger than this");
|
|
75
|
+
EXTERN double_VAR (tosp_table_fuzzy_kn_sp_ratio, 3.0,
|
|
76
|
+
"Fuzzy if less than this");
|
|
77
|
+
EXTERN double_VAR (tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg");
|
|
78
|
+
EXTERN double_VAR (tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg");
|
|
79
|
+
EXTERN double_VAR (tosp_min_sane_kn_sp, 1.5,
|
|
80
|
+
"Dont trust spaces less than this time kn");
|
|
81
|
+
EXTERN double_VAR (tosp_init_guess_kn_mult, 2.2,
|
|
82
|
+
"Thresh guess - mult kn by this");
|
|
83
|
+
EXTERN double_VAR (tosp_init_guess_xht_mult, 0.28,
|
|
84
|
+
"Thresh guess - mult xht by this");
|
|
85
|
+
EXTERN double_VAR (tosp_max_sane_kn_thresh, 5.0,
|
|
86
|
+
"Multiplier on kn to limit thresh");
|
|
87
|
+
EXTERN double_VAR (tosp_flip_caution, 0.0,
|
|
88
|
+
"Dont autoflip kn to sp when large separation");
|
|
89
|
+
|
|
90
|
+
EXTERN double_VAR (tosp_large_kerning, 0.19,
|
|
91
|
+
"Limit use of xht gap with large kns");
|
|
92
|
+
EXTERN double_VAR (tosp_dont_fool_with_small_kerns, -1,
|
|
93
|
+
"Limit use of xht gap with odd small kns");
|
|
94
|
+
EXTERN double_VAR (tosp_near_lh_edge, 0,
|
|
95
|
+
"Dont reduce box if the top left is non blank");
|
|
96
|
+
EXTERN double_VAR (tosp_silly_kn_sp_gap, 0.2,
|
|
97
|
+
"Dont let sp minus kn get too small");
|
|
98
|
+
EXTERN double_VAR (tosp_pass_wide_fuzz_sp_to_context, 0.75,
|
|
99
|
+
"How wide fuzzies need context");
|
|
100
|
+
|
|
101
|
+
#define MAXSPACING 128 /*max expected spacing in pix */
|
|
102
|
+
/**********************************************************************
|
|
103
|
+
* to_spacing
|
|
104
|
+
*
|
|
105
|
+
* Compute fuzzy word spacing thresholds for each row.
|
|
106
|
+
* I.e. set : max_nonspace
|
|
107
|
+
* space_threshold
|
|
108
|
+
* min_space
|
|
109
|
+
* kern_size
|
|
110
|
+
* space_size for each row.
|
|
111
|
+
* ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
|
|
112
|
+
**********************************************************************/
|
|
113
|
+
|
|
114
|
+
void to_spacing( //set spacing
|
|
115
|
+
ICOORD page_tr, //topright of page
|
|
116
|
+
TO_BLOCK_LIST *blocks //blocks on page
|
|
117
|
+
) {
|
|
118
|
+
TO_BLOCK_IT block_it; //iterator
|
|
119
|
+
TO_BLOCK *block; //current block;
|
|
120
|
+
TO_ROW_IT row_it; //row iterator
|
|
121
|
+
TO_ROW *row; //current row
|
|
122
|
+
int block_index; //block number
|
|
123
|
+
int row_index; //row number
|
|
124
|
+
inT16 block_space_gap_width; //Estimated width of real spaces for whole block
|
|
125
|
+
//Estimate width ofnon space gaps for whole block
|
|
126
|
+
inT16 block_non_space_gap_width;
|
|
127
|
+
//Old fixed/prop result
|
|
128
|
+
BOOL8 old_text_ord_proportional;
|
|
129
|
+
GAPMAP *gapmap = NULL; //map of big vert gaps in blk
|
|
130
|
+
|
|
131
|
+
block_it.set_to_list (blocks);
|
|
132
|
+
block_index = 1;
|
|
133
|
+
for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
|
|
134
|
+
block_it.forward ()) {
|
|
135
|
+
block = block_it.data ();
|
|
136
|
+
gapmap = new GAPMAP (block);
|
|
137
|
+
block_spacing_stats(block,
|
|
138
|
+
gapmap,
|
|
139
|
+
old_text_ord_proportional,
|
|
140
|
+
block_space_gap_width,
|
|
141
|
+
block_non_space_gap_width);
|
|
142
|
+
row_it.set_to_list (block->get_rows ());
|
|
143
|
+
row_index = 1;
|
|
144
|
+
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
|
|
145
|
+
row = row_it.data ();
|
|
146
|
+
if ((row->pitch_decision == PITCH_DEF_PROP) ||
|
|
147
|
+
(row->pitch_decision == PITCH_CORR_PROP)) {
|
|
148
|
+
if ((tosp_debug_level > 0) && !old_text_ord_proportional)
|
|
149
|
+
tprintf ("Block %d Row %d: Now Proportional\n",
|
|
150
|
+
block_index, row_index);
|
|
151
|
+
row_spacing_stats(row,
|
|
152
|
+
gapmap,
|
|
153
|
+
block_index,
|
|
154
|
+
row_index,
|
|
155
|
+
block_space_gap_width,
|
|
156
|
+
block_non_space_gap_width);
|
|
157
|
+
}
|
|
158
|
+
else {
|
|
159
|
+
if ((tosp_debug_level > 0) && old_text_ord_proportional)
|
|
160
|
+
tprintf
|
|
161
|
+
("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
|
|
162
|
+
block_index, row_index, row->pitch_decision,
|
|
163
|
+
row->fixed_pitch);
|
|
164
|
+
}
|
|
165
|
+
#ifndef GRAPHICS_DISABLED
|
|
166
|
+
if (textord_show_initial_words)
|
|
167
|
+
plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
|
|
168
|
+
#endif
|
|
169
|
+
row_index++;
|
|
170
|
+
}
|
|
171
|
+
delete gapmap;
|
|
172
|
+
block_index++;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
/*************************************************************************
|
|
178
|
+
* block_spacing_stats()
|
|
179
|
+
*************************************************************************/
|
|
180
|
+
|
|
181
|
+
void block_spacing_stats( //DEBUG USE ONLY
|
|
182
|
+
TO_BLOCK *block,
|
|
183
|
+
GAPMAP *gapmap,
|
|
184
|
+
BOOL8 &old_text_ord_proportional,
|
|
185
|
+
inT16 &block_space_gap_width, //resulting estimate
|
|
186
|
+
inT16 &block_non_space_gap_width //resulting estimate
|
|
187
|
+
) {
|
|
188
|
+
TO_ROW_IT row_it; //row iterator
|
|
189
|
+
TO_ROW *row; //current row
|
|
190
|
+
BLOBNBOX_IT blob_it; //iterator
|
|
191
|
+
|
|
192
|
+
STATS centre_to_centre_stats (0, MAXSPACING);
|
|
193
|
+
//DEBUG USE ONLY
|
|
194
|
+
STATS all_gap_stats (0, MAXSPACING);
|
|
195
|
+
STATS space_gap_stats (0, MAXSPACING);
|
|
196
|
+
inT16 minwidth = MAX_INT16; //narrowest blob
|
|
197
|
+
TBOX blob_box;
|
|
198
|
+
TBOX prev_blob_box;
|
|
199
|
+
inT16 centre_to_centre;
|
|
200
|
+
inT16 gap_width;
|
|
201
|
+
float real_space_threshold;
|
|
202
|
+
float iqr_centre_to_centre; //DEBUG USE ONLY
|
|
203
|
+
float iqr_all_gap_stats; //DEBUG USE ONLY
|
|
204
|
+
inT32 end_of_row;
|
|
205
|
+
inT32 row_length;
|
|
206
|
+
|
|
207
|
+
row_it.set_to_list (block->get_rows ());
|
|
208
|
+
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
|
|
209
|
+
row = row_it.data ();
|
|
210
|
+
if (!row->blob_list ()->empty () &&
|
|
211
|
+
(!tosp_only_use_prop_rows ||
|
|
212
|
+
(row->pitch_decision == PITCH_DEF_PROP) ||
|
|
213
|
+
(row->pitch_decision == PITCH_CORR_PROP))) {
|
|
214
|
+
blob_it.set_to_list (row->blob_list ());
|
|
215
|
+
blob_it.mark_cycle_pt ();
|
|
216
|
+
end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
|
|
217
|
+
if (tosp_use_pre_chopping)
|
|
218
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
219
|
+
else if (tosp_stats_use_xht_gaps)
|
|
220
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
221
|
+
else
|
|
222
|
+
blob_box = box_next (&blob_it);
|
|
223
|
+
row_length = end_of_row - blob_box.left ();
|
|
224
|
+
if (blob_box.width () < minwidth)
|
|
225
|
+
minwidth = blob_box.width ();
|
|
226
|
+
prev_blob_box = blob_box;
|
|
227
|
+
while (!blob_it.cycled_list ()) {
|
|
228
|
+
if (tosp_use_pre_chopping)
|
|
229
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
230
|
+
else if (tosp_stats_use_xht_gaps)
|
|
231
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
232
|
+
else
|
|
233
|
+
blob_box = box_next (&blob_it);
|
|
234
|
+
if (blob_box.width () < minwidth)
|
|
235
|
+
minwidth = blob_box.width ();
|
|
236
|
+
gap_width = blob_box.left () - prev_blob_box.right ();
|
|
237
|
+
if (!ignore_big_gap (row, row_length, gapmap,
|
|
238
|
+
prev_blob_box.right (), blob_box.left ())) {
|
|
239
|
+
all_gap_stats.add (gap_width, 1);
|
|
240
|
+
|
|
241
|
+
centre_to_centre = (blob_box.left () + blob_box.right () -
|
|
242
|
+
(prev_blob_box.left () +
|
|
243
|
+
prev_blob_box.right ())) / 2;
|
|
244
|
+
//DEBUG
|
|
245
|
+
centre_to_centre_stats.add (centre_to_centre, 1);
|
|
246
|
+
// DEBUG
|
|
247
|
+
}
|
|
248
|
+
prev_blob_box = blob_box;
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
//Inadequate samples
|
|
254
|
+
if (all_gap_stats.get_total () <= 1) {
|
|
255
|
+
block_non_space_gap_width = minwidth;
|
|
256
|
+
block_space_gap_width = -1; //No est. space width
|
|
257
|
+
//DEBUG
|
|
258
|
+
old_text_ord_proportional = TRUE;
|
|
259
|
+
}
|
|
260
|
+
else {
|
|
261
|
+
/* For debug only ..... */
|
|
262
|
+
iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
|
|
263
|
+
centre_to_centre_stats.ile (0.25);
|
|
264
|
+
iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
|
|
265
|
+
old_text_ord_proportional =
|
|
266
|
+
iqr_centre_to_centre * 2 > iqr_all_gap_stats;
|
|
267
|
+
/* .......For debug only */
|
|
268
|
+
|
|
269
|
+
/*
|
|
270
|
+
The median of the gaps is used as an estimate of the NON-SPACE gap width.
|
|
271
|
+
This RELIES on the assumption that there are more gaps WITHIN words than
|
|
272
|
+
BETWEEN words in a block
|
|
273
|
+
|
|
274
|
+
Now try to estimate the width of a real space for all real spaces in the
|
|
275
|
+
block. Do this by using a crude threshold to ignore "narrow" gaps, then
|
|
276
|
+
find the median of the "wide" gaps and use this.
|
|
277
|
+
*/
|
|
278
|
+
block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
|
|
279
|
+
// median gap
|
|
280
|
+
|
|
281
|
+
row_it.set_to_list (block->get_rows ());
|
|
282
|
+
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
|
|
283
|
+
row = row_it.data ();
|
|
284
|
+
if (!row->blob_list ()->empty () &&
|
|
285
|
+
(!tosp_only_use_prop_rows ||
|
|
286
|
+
(row->pitch_decision == PITCH_DEF_PROP) ||
|
|
287
|
+
(row->pitch_decision == PITCH_CORR_PROP))) {
|
|
288
|
+
real_space_threshold =
|
|
289
|
+
MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
|
|
290
|
+
tosp_init_guess_xht_mult * row->xheight);
|
|
291
|
+
blob_it.set_to_list (row->blob_list ());
|
|
292
|
+
blob_it.mark_cycle_pt ();
|
|
293
|
+
end_of_row =
|
|
294
|
+
blob_it.data_relative (-1)->bounding_box ().right ();
|
|
295
|
+
if (tosp_use_pre_chopping)
|
|
296
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
297
|
+
else if (tosp_stats_use_xht_gaps)
|
|
298
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
299
|
+
else
|
|
300
|
+
blob_box = box_next (&blob_it);
|
|
301
|
+
row_length = blob_box.left () - end_of_row;
|
|
302
|
+
prev_blob_box = blob_box;
|
|
303
|
+
while (!blob_it.cycled_list ()) {
|
|
304
|
+
if (tosp_use_pre_chopping)
|
|
305
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
306
|
+
else if (tosp_stats_use_xht_gaps)
|
|
307
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
308
|
+
else
|
|
309
|
+
blob_box = box_next (&blob_it);
|
|
310
|
+
gap_width = blob_box.left () - prev_blob_box.right ();
|
|
311
|
+
if ((gap_width > real_space_threshold) &&
|
|
312
|
+
!ignore_big_gap (row, row_length, gapmap,
|
|
313
|
+
prev_blob_box.right (),
|
|
314
|
+
blob_box.left ())) {
|
|
315
|
+
/*
|
|
316
|
+
If tosp_use_cert_spaces is enabled, the estimate of the space gap is
|
|
317
|
+
restricted to obvious spaces - those wider than half the xht or those
|
|
318
|
+
with wide blobs on both sides - i.e not things that are suspect 1's or
|
|
319
|
+
punctiation that is sometimes widely spaced.
|
|
320
|
+
*/
|
|
321
|
+
if (!tosp_block_use_cert_spaces ||
|
|
322
|
+
(gap_width >
|
|
323
|
+
tosp_fuzzy_space_factor2 * row->xheight)
|
|
324
|
+
||
|
|
325
|
+
((gap_width >
|
|
326
|
+
tosp_fuzzy_space_factor1 * row->xheight)
|
|
327
|
+
&& (!tosp_narrow_blobs_not_cert
|
|
328
|
+
|| (!narrow_blob (row, prev_blob_box)
|
|
329
|
+
&& !narrow_blob (row, blob_box))))
|
|
330
|
+
|| (wide_blob (row, prev_blob_box)
|
|
331
|
+
&& wide_blob (row, blob_box)))
|
|
332
|
+
space_gap_stats.add (gap_width, 1);
|
|
333
|
+
}
|
|
334
|
+
prev_blob_box = blob_box;
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
//Inadequate samples
|
|
339
|
+
if (space_gap_stats.get_total () <= 2)
|
|
340
|
+
block_space_gap_width = -1;//No est. space width
|
|
341
|
+
else
|
|
342
|
+
block_space_gap_width =
|
|
343
|
+
MAX ((inT16) floor (space_gap_stats.median ()),
|
|
344
|
+
3 * block_non_space_gap_width);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
/*************************************************************************
|
|
350
|
+
* row_spacing_stats()
|
|
351
|
+
* Set values for min_space, max_non_space based on row stats only
|
|
352
|
+
* If failure - return 0 values.
|
|
353
|
+
*************************************************************************/
|
|
354
|
+
|
|
355
|
+
void row_spacing_stats( //estimate for block
|
|
356
|
+
TO_ROW *row,
|
|
357
|
+
GAPMAP *gapmap,
|
|
358
|
+
inT16 block_idx,
|
|
359
|
+
inT16 row_idx,
|
|
360
|
+
inT16 block_space_gap_width,
|
|
361
|
+
inT16 block_non_space_gap_width //estimate for block
|
|
362
|
+
) {
|
|
363
|
+
//iterator
|
|
364
|
+
BLOBNBOX_IT blob_it = row->blob_list ();
|
|
365
|
+
STATS all_gap_stats (0, MAXSPACING);
|
|
366
|
+
STATS cert_space_gap_stats (0, MAXSPACING);
|
|
367
|
+
STATS all_space_gap_stats (0, MAXSPACING);
|
|
368
|
+
STATS small_gap_stats (0, MAXSPACING);
|
|
369
|
+
TBOX blob_box;
|
|
370
|
+
TBOX prev_blob_box;
|
|
371
|
+
inT16 gap_width;
|
|
372
|
+
inT16 real_space_threshold = 0;
|
|
373
|
+
inT16 max = 0;
|
|
374
|
+
inT16 index;
|
|
375
|
+
inT16 large_gap_count = 0;
|
|
376
|
+
BOOL8 suspected_table;
|
|
377
|
+
inT32 max_max_nonspace; //upper bound
|
|
378
|
+
BOOL8 good_block_space_estimate = block_space_gap_width > 0;
|
|
379
|
+
inT32 end_of_row;
|
|
380
|
+
inT32 row_length = 0;
|
|
381
|
+
float sane_space;
|
|
382
|
+
inT32 sane_threshold;
|
|
383
|
+
|
|
384
|
+
/* Collect first pass stats for row */
|
|
385
|
+
|
|
386
|
+
if (!good_block_space_estimate)
|
|
387
|
+
block_space_gap_width = inT16 (floor (row->xheight / 2));
|
|
388
|
+
if (!row->blob_list ()->empty ()) {
|
|
389
|
+
if (tosp_threshold_bias1 > 0)
|
|
390
|
+
real_space_threshold =
|
|
391
|
+
block_non_space_gap_width +
|
|
392
|
+
inT16 (floor (0.5 +
|
|
393
|
+
tosp_threshold_bias1 * (block_space_gap_width -
|
|
394
|
+
block_non_space_gap_width)));
|
|
395
|
+
else
|
|
396
|
+
real_space_threshold = //Old TO method
|
|
397
|
+
(block_space_gap_width + block_non_space_gap_width) / 2;
|
|
398
|
+
blob_it.set_to_list (row->blob_list ());
|
|
399
|
+
blob_it.mark_cycle_pt ();
|
|
400
|
+
end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
|
|
401
|
+
if (tosp_use_pre_chopping)
|
|
402
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
403
|
+
else if (tosp_stats_use_xht_gaps)
|
|
404
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
405
|
+
else
|
|
406
|
+
blob_box = box_next (&blob_it);
|
|
407
|
+
row_length = end_of_row - blob_box.left ();
|
|
408
|
+
prev_blob_box = blob_box;
|
|
409
|
+
while (!blob_it.cycled_list ()) {
|
|
410
|
+
if (tosp_use_pre_chopping)
|
|
411
|
+
blob_box = box_next_pre_chopped (&blob_it);
|
|
412
|
+
else if (tosp_stats_use_xht_gaps)
|
|
413
|
+
blob_box = reduced_box_next (row, &blob_it);
|
|
414
|
+
else
|
|
415
|
+
blob_box = box_next (&blob_it);
|
|
416
|
+
gap_width = blob_box.left () - prev_blob_box.right ();
|
|
417
|
+
if (ignore_big_gap (row, row_length, gapmap,
|
|
418
|
+
prev_blob_box.right (), blob_box.left ()))
|
|
419
|
+
large_gap_count++;
|
|
420
|
+
else {
|
|
421
|
+
if (gap_width >= real_space_threshold) {
|
|
422
|
+
if (!tosp_row_use_cert_spaces ||
|
|
423
|
+
(gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
|
|
424
|
+
((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
|
|
425
|
+
&& (!tosp_narrow_blobs_not_cert
|
|
426
|
+
|| (!narrow_blob (row, prev_blob_box)
|
|
427
|
+
&& !narrow_blob (row, blob_box))))
|
|
428
|
+
|| (wide_blob (row, prev_blob_box)
|
|
429
|
+
&& wide_blob (row, blob_box)))
|
|
430
|
+
cert_space_gap_stats.add (gap_width, 1);
|
|
431
|
+
all_space_gap_stats.add (gap_width, 1);
|
|
432
|
+
}
|
|
433
|
+
else
|
|
434
|
+
small_gap_stats.add (gap_width, 1);
|
|
435
|
+
all_gap_stats.add (gap_width, 1);
|
|
436
|
+
}
|
|
437
|
+
prev_blob_box = blob_box;
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
suspected_table = (large_gap_count > 1) ||
|
|
441
|
+
((large_gap_count > 0) &&
|
|
442
|
+
(all_gap_stats.get_total () <= tosp_few_samples));
|
|
443
|
+
|
|
444
|
+
/* Now determine row kern size, space size and threshold */
|
|
445
|
+
|
|
446
|
+
if ((cert_space_gap_stats.get_total () >=
|
|
447
|
+
tosp_enough_space_samples_for_median) ||
|
|
448
|
+
((suspected_table ||
|
|
449
|
+
all_gap_stats.get_total () <= tosp_short_row) &&
|
|
450
|
+
cert_space_gap_stats.get_total () > 0))
|
|
451
|
+
old_to_method(row,
|
|
452
|
+
&all_gap_stats,
|
|
453
|
+
&cert_space_gap_stats,
|
|
454
|
+
&small_gap_stats,
|
|
455
|
+
block_space_gap_width,
|
|
456
|
+
block_non_space_gap_width);
|
|
457
|
+
else {
|
|
458
|
+
if (!tosp_recovery_isolated_row_stats ||
|
|
459
|
+
!isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
|
|
460
|
+
block_idx, row_idx)) {
|
|
461
|
+
if (tosp_row_use_cert_spaces && (tosp_debug_level > 5))
|
|
462
|
+
tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
|
|
463
|
+
block_idx, row_idx);
|
|
464
|
+
if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
|
|
465
|
+
//Use block default
|
|
466
|
+
row->space_size = block_space_gap_width;
|
|
467
|
+
if (all_gap_stats.get_total () > tosp_redo_kern_limit)
|
|
468
|
+
row->kern_size = all_gap_stats.median ();
|
|
469
|
+
else
|
|
470
|
+
row->kern_size = block_non_space_gap_width;
|
|
471
|
+
row->space_threshold =
|
|
472
|
+
inT32 (floor ((row->space_size + row->kern_size) / 2));
|
|
473
|
+
}
|
|
474
|
+
else
|
|
475
|
+
old_to_method(row,
|
|
476
|
+
&all_gap_stats,
|
|
477
|
+
&all_space_gap_stats,
|
|
478
|
+
&small_gap_stats,
|
|
479
|
+
block_space_gap_width,
|
|
480
|
+
block_non_space_gap_width);
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
if (tosp_improve_thresh && !suspected_table)
|
|
485
|
+
improve_row_threshold(row, &all_gap_stats);
|
|
486
|
+
|
|
487
|
+
/* Now lets try to be careful not to do anything silly with tables when we
|
|
488
|
+
are ignoring big gaps*/
|
|
489
|
+
if (tosp_sanity_method == 0) {
|
|
490
|
+
if (suspected_table &&
|
|
491
|
+
(row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
|
|
492
|
+
if (tosp_debug_level > 0)
|
|
493
|
+
tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
|
|
494
|
+
block_idx, row_idx,
|
|
495
|
+
row->kern_size, row->space_threshold, row->space_size);
|
|
496
|
+
row->space_threshold =
|
|
497
|
+
(inT32) (tosp_table_kn_sp_ratio * row->kern_size);
|
|
498
|
+
row->space_size = MAX (row->space_threshold + 1, row->xheight);
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
else if (tosp_sanity_method == 1) {
|
|
502
|
+
sane_space = row->space_size;
|
|
503
|
+
/* NEVER let space size get too close to kern size */
|
|
504
|
+
if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
|
|
505
|
+
|| ((row->space_size - row->kern_size) <
|
|
506
|
+
(tosp_silly_kn_sp_gap * row->xheight))) {
|
|
507
|
+
if (good_block_space_estimate &&
|
|
508
|
+
(block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
|
|
509
|
+
sane_space = block_space_gap_width;
|
|
510
|
+
else
|
|
511
|
+
sane_space =
|
|
512
|
+
MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
|
|
513
|
+
row->xheight / 2);
|
|
514
|
+
if (tosp_debug_level > 0)
|
|
515
|
+
tprintf
|
|
516
|
+
("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
|
|
517
|
+
block_idx, row_idx, row->kern_size, row->space_threshold,
|
|
518
|
+
row->space_size, sane_space);
|
|
519
|
+
row->space_size = sane_space;
|
|
520
|
+
row->space_threshold =
|
|
521
|
+
inT32 (floor ((row->space_size + row->kern_size) / 2));
|
|
522
|
+
}
|
|
523
|
+
/* NEVER let threshold get VERY far away from kern */
|
|
524
|
+
sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
|
|
525
|
+
MAX (row->kern_size, 2.5)));
|
|
526
|
+
if (row->space_threshold > sane_threshold) {
|
|
527
|
+
if (tosp_debug_level > 0)
|
|
528
|
+
tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
|
|
529
|
+
block_idx, row_idx,
|
|
530
|
+
row->kern_size,
|
|
531
|
+
row->space_threshold, row->space_size, sane_threshold);
|
|
532
|
+
row->space_threshold = sane_threshold;
|
|
533
|
+
if (row->space_size <= sane_threshold)
|
|
534
|
+
row->space_size = row->space_threshold + 1.0f;
|
|
535
|
+
}
|
|
536
|
+
/* Beware of tables - there may be NO spaces */
|
|
537
|
+
if (suspected_table) {
|
|
538
|
+
sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
|
|
539
|
+
tosp_table_xht_sp_ratio * row->xheight);
|
|
540
|
+
sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
|
|
541
|
+
|
|
542
|
+
if ((row->space_size < sane_space) ||
|
|
543
|
+
(row->space_threshold < sane_threshold)) {
|
|
544
|
+
if (tosp_debug_level > 0)
|
|
545
|
+
tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
|
|
546
|
+
block_idx, row_idx,
|
|
547
|
+
row->kern_size,
|
|
548
|
+
row->space_threshold, row->space_size);
|
|
549
|
+
//the minimum sane value
|
|
550
|
+
row->space_threshold = (inT32) sane_space;
|
|
551
|
+
row->space_size = MAX (row->space_threshold + 1, row->xheight);
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
/* Now lets try to put some error limits on the threshold */
|
|
557
|
+
|
|
558
|
+
if (tosp_old_to_method) {
|
|
559
|
+
/* Old textord made a space if gap >= threshold */
|
|
560
|
+
//NO FUZZY SPACES YET
|
|
561
|
+
row->max_nonspace = row->space_threshold;
|
|
562
|
+
//NO FUZZY SPACES YET
|
|
563
|
+
row->min_space = row->space_threshold + 1;
|
|
564
|
+
}
|
|
565
|
+
else {
|
|
566
|
+
/* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
|
|
567
|
+
row->min_space =
|
|
568
|
+
MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
|
|
569
|
+
inT32 (row->space_size));
|
|
570
|
+
if (row->min_space <= row->space_threshold)
|
|
571
|
+
//Dont be silly
|
|
572
|
+
row->min_space = row->space_threshold + 1;
|
|
573
|
+
/*
|
|
574
|
+
Lets try to guess the max certain kern gap by looking at the cluster of
|
|
575
|
+
kerns for the row. The row is proportional so the kerns should cluster
|
|
576
|
+
tightly at the bottom of the distribution. We also expect most gaps to be
|
|
577
|
+
kerns. Find the maximum of the kern piles between 0 and twice the kern
|
|
578
|
+
estimate. Piles before the first one with less than 1/10 the maximum
|
|
579
|
+
number of samples can be taken as certain kerns.
|
|
580
|
+
|
|
581
|
+
Of course, there are some cases where the kern peak and space peaks merge,
|
|
582
|
+
so we will put an UPPER limit on the max certain kern gap of some fraction
|
|
583
|
+
below the threshold.
|
|
584
|
+
*/
|
|
585
|
+
|
|
586
|
+
max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
|
|
587
|
+
|
|
588
|
+
//default
|
|
589
|
+
row->max_nonspace = max_max_nonspace;
|
|
590
|
+
for (index = 0; index <= max_max_nonspace; index++) {
|
|
591
|
+
if (all_gap_stats.pile_count (index) > max)
|
|
592
|
+
max = all_gap_stats.pile_count (index);
|
|
593
|
+
if ((index > row->kern_size) &&
|
|
594
|
+
(all_gap_stats.pile_count (index) < 0.1 * max)) {
|
|
595
|
+
row->max_nonspace = index;
|
|
596
|
+
break;
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
/* Yet another algorithm - simpler this time - just choose a fraction of the
|
|
602
|
+
threshold to space range */
|
|
603
|
+
|
|
604
|
+
if ((tosp_fuzzy_sp_fraction > 0) &&
|
|
605
|
+
(row->space_size > row->space_threshold))
|
|
606
|
+
row->min_space = MAX (row->min_space,
|
|
607
|
+
(inT32) ceil (row->space_threshold +
|
|
608
|
+
tosp_fuzzy_sp_fraction *
|
|
609
|
+
(row->space_size -
|
|
610
|
+
row->space_threshold)));
|
|
611
|
+
|
|
612
|
+
/* Ensure that ANY space less than some multiplier times the kern size is
|
|
613
|
+
fuzzy. In tables there is a risk of erroneously setting a small space size
|
|
614
|
+
when there are no real spaces. Sometimes tables have text squashed into
|
|
615
|
+
columns so that the kn->sp ratio is small anyway - this means that we cant
|
|
616
|
+
use this to force a wider separation - hence we rely on context to join any
|
|
617
|
+
dubious breaks. */
|
|
618
|
+
|
|
619
|
+
if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
|
|
620
|
+
(suspected_table || tosp_fuzzy_limit_all))
|
|
621
|
+
row->min_space = MAX (row->min_space,
|
|
622
|
+
(inT32) ceil (tosp_table_fuzzy_kn_sp_ratio *
|
|
623
|
+
row->kern_size));
|
|
624
|
+
|
|
625
|
+
if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold))
|
|
626
|
+
row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
|
|
627
|
+
tosp_fuzzy_kn_fraction *
|
|
628
|
+
(row->space_threshold -
|
|
629
|
+
row->kern_size));
|
|
630
|
+
|
|
631
|
+
if (row->max_nonspace > row->space_threshold)
|
|
632
|
+
//Dont be silly
|
|
633
|
+
row->max_nonspace = row->space_threshold;
|
|
634
|
+
|
|
635
|
+
if (tosp_debug_level > 5)
|
|
636
|
+
tprintf
|
|
637
|
+
("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
|
|
638
|
+
block_idx, row_idx, row_length, block_non_space_gap_width,
|
|
639
|
+
block_space_gap_width, real_space_threshold, row->kern_size,
|
|
640
|
+
row->max_nonspace, row->space_threshold, row->min_space,
|
|
641
|
+
row->space_size);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
/*
  old_to_method()

  Set row->space_size, row->kern_size and row->space_threshold from the
  supplied gap histograms, falling back to the block estimates when the
  row has too few samples.

  row                         row to update
  all_gap_stats               every gap in the row
  space_gap_stats             gaps taken to be spaces
  small_gap_stats             gaps taken to be kerns
  block_space_gap_width       block estimate of a space
  block_non_space_gap_width   block estimate of a within-word gap
*/
void old_to_method(TO_ROW *row,
                   STATS *all_gap_stats,
                   STATS *space_gap_stats,
                   STATS *small_gap_stats,
                   inT16 block_space_gap_width,
                   inT16 block_non_space_gap_width) {
  const inT32 space_samples = space_gap_stats->get_total ();
  /* Old to condition was > 2 */
  const BOOL8 enough_for_median =
    space_samples >= tosp_enough_space_samples_for_median;

  /* ---- Space size ---- */
  if (enough_for_median || (space_samples >= 1)) {
    //Adequate samples use the median with a lower limit of 2*kern+1;
    //with only 1 or 2 samples use the mean with a limit of 3*kern+1
    const inT32 kern_mult = enough_for_median ? 2 : 3;

    row->space_size = enough_for_median ?
      space_gap_stats->median () : space_gap_stats->mean ();

    /* Limit the space size if it seems wildly out */
    if (row->space_size > block_space_gap_width * 1.5) {
      if (tosp_old_to_bug_fix)
        row->space_size = block_space_gap_width * 1.5;
      else
        //BUG??? should be *1.5
        row->space_size = block_space_gap_width;
    }
    if (row->space_size < (block_non_space_gap_width * kern_mult) + 1)
      row->space_size = (block_non_space_gap_width * kern_mult) + 1;
  }
  else {
    //No samples at all: use block default
    row->space_size = block_space_gap_width;
  }

  /* ---- Kern size ---- */
  if ((tosp_only_small_gaps_for_kern) &&
      (small_gap_stats->get_total () > tosp_redo_kern_limit)) {
    row->kern_size = small_gap_stats->median ();
  }
  else if (all_gap_stats->get_total () > tosp_redo_kern_limit) {
    row->kern_size = all_gap_stats->median ();
  }
  else {
    //old TO -SAME FOR ALL ROWS
    row->kern_size = block_non_space_gap_width;
  }

  /* ---- Threshold ---- */
  if (tosp_threshold_bias2 > 0) {
    row->space_threshold =
      inT32 (floor (0.5 + row->kern_size +
                    tosp_threshold_bias2 * (row->space_size -
                                            row->kern_size)));
  }
  else {
    /*
      NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
      and holds this in a float. The use is with a >= test.
      NEW textord uses an integer threshold and a > test.
      It comes to the same thing.
      (Though there is a difference in that old textord has integer
      space_size and kern_size.)
    */
    row->space_threshold =
      inT32 (floor ((row->space_size + row->kern_size) / 2));
  }
}
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
/*************************************************************************
|
|
715
|
+
* isolated_row_stats()
|
|
716
|
+
 * Set values for space_size, kern_size and space_threshold based on row stats only
|
|
717
|
+
*************************************************************************/
|
|
718
|
+
|
|
719
|
+
/*
 * Estimate row->space_size, row->kern_size and row->space_threshold from the
 * gaps of this row alone.
 *
 * row             - row whose blob list is walked and whose estimates are set
 * gapmap          - passed through to ignore_big_gap()
 * all_gap_stats   - histogram of all gaps in the row (read only here)
 * suspected_table - when TRUE, the mean of the certain-space gaps is used as
 *                   soon as there is at least one such gap
 * block_idx, row_idx - only used in debug messages
 *
 * Returns FALSE (leaving the row estimates untouched) when there are too few
 * gaps to work with, FALSE (after zeroing the three row estimates) when the
 * resulting values fail the sanity check, and TRUE otherwise.
 */
BOOL8 isolated_row_stats(TO_ROW *row,
                         GAPMAP *gapmap,
                         STATS *all_gap_stats,
                         BOOL8 suspected_table,
                         inT16 block_idx,
                         inT16 row_idx) {
  BLOBNBOX_IT blob_it = row->blob_list ();
  STATS cert_space_gap_stats (0, MAXSPACING);   //gaps that are surely spaces
  STATS all_space_gap_stats (0, MAXSPACING);    //gaps above crude threshold
  STATS small_gap_stats (0, MAXSPACING);        //gaps below crude threshold
  TBOX this_box;
  TBOX last_box;

  /* Crude first guess at the kern/space boundary. */
  float kern_guess = all_gap_stats->median ();
  float rough_threshold = MAX (tosp_init_guess_kn_mult * kern_guess,
                               tosp_init_guess_xht_mult * row->xheight);
  inT16 n_small = stats_count_under (all_gap_stats,
                                     (inT16) ceil (rough_threshold));
  inT16 n_gaps = all_gap_stats->get_total ();

  /* Give up unless there are enough gaps, enough of them are small, and at
     least one is not small. */
  if ((n_gaps <= tosp_redo_kern_limit) ||
      ((n_small / (float) n_gaps) < tosp_enough_small_gaps) ||
      (n_gaps - n_small < 1)) {
    if (tosp_debug_level > 5) {
      tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
               block_idx, row_idx);
    }
    return FALSE;
  }

  blob_it.set_to_list (row->blob_list ());
  blob_it.mark_cycle_pt ();
  inT32 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();

  /* Fetch the first box using whichever box iterator is configured. */
  if (tosp_use_pre_chopping) {
    this_box = box_next_pre_chopped (&blob_it);
  }
  else if (tosp_stats_use_xht_gaps) {
    this_box = reduced_box_next (row, &blob_it);
  }
  else {
    this_box = box_next (&blob_it);
  }
  inT32 row_length = end_of_row - this_box.left ();
  last_box = this_box;

  /* Classify every gap between consecutive boxes. */
  while (!blob_it.cycled_list ()) {
    if (tosp_use_pre_chopping) {
      this_box = box_next_pre_chopped (&blob_it);
    }
    else if (tosp_stats_use_xht_gaps) {
      this_box = reduced_box_next (row, &blob_it);
    }
    else {
      this_box = box_next (&blob_it);
    }

    inT16 gap = this_box.left () - last_box.right ();

    if (!ignore_big_gap (row, row_length, gapmap,
                         last_box.right (), this_box.left ()) &&
        (gap > rough_threshold)) {
      /* Same short-circuit order as the single combined condition. */
      BOOL8 certain_space =
        (gap > tosp_fuzzy_space_factor2 * row->xheight) ||
        ((gap > tosp_fuzzy_space_factor1 * row->xheight) &&
         (!tosp_narrow_blobs_not_cert ||
          (!narrow_blob (row, last_box) &&
           !narrow_blob (row, this_box)))) ||
        (wide_blob (row, last_box) && wide_blob (row, this_box));
      if (certain_space) {
        cert_space_gap_stats.add (gap, 1);
      }
      all_space_gap_stats.add (gap, 1);
    }
    if (gap < rough_threshold) {
      small_gap_stats.add (gap, 1);
    }

    last_box = this_box;
  }

  /* Space size: prefer the certain spaces, fall back to all candidate spaces. */
  if (cert_space_gap_stats.get_total () >=
      tosp_enough_space_samples_for_median) {
    row->space_size = cert_space_gap_stats.median ();
  }
  else if (suspected_table && (cert_space_gap_stats.get_total () > 0)) {
    //to avoid spaced 1's in tables
    row->space_size = cert_space_gap_stats.mean ();
  }
  else if (all_space_gap_stats.get_total () >=
           tosp_enough_space_samples_for_median) {
    row->space_size = all_space_gap_stats.median ();
  }
  else {
    row->space_size = all_space_gap_stats.mean ();
  }

  /* Kern size. */
  if (tosp_only_small_gaps_for_kern) {
    row->kern_size = small_gap_stats.median ();
  }
  else {
    row->kern_size = all_gap_stats->median ();
  }

  /* Threshold sits midway between kern and space, rounded down. */
  row->space_threshold =
    inT32 (floor ((row->space_size + row->kern_size) / 2));

  /* Sanity check: require kern < threshold < space and threshold > 0. */
  if ((row->kern_size >= row->space_threshold) ||
      (row->space_threshold >= row->space_size) ||
      (row->space_threshold <= 0)) {
    if (tosp_debug_level > 0) {
      tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
               block_idx, row_idx,
               row->kern_size, row->space_threshold, row->space_size);
    }
    row->kern_size = 0.0f;
    row->space_threshold = 0;
    row->space_size = 0.0f;
    return FALSE;
  }

  if (tosp_debug_level > 5) {
    tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
             block_idx, row_idx,
             row->kern_size, row->space_threshold, row->space_size);
  }
  return TRUE;
}
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
/*
 * stats_count_under()
 * Add up stats->pile_count(v) for every v from 0 up to, but not including,
 * threshold, and return the sum as an inT16.
 */
inT16 stats_count_under(STATS *stats, inT16 threshold) {
  inT16 count_below = 0;
  inT16 value = 0;

  while (value < threshold) {
    count_below += stats->pile_count (value);
    value++;
  }
  return count_below;
}
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
/*************************************************************************
|
|
846
|
+
* improve_row_threshold()
|
|
847
|
+
* Try to recognise a "normal line" -
|
|
848
|
+
* > 25 gaps
|
|
849
|
+
* && space > 3 * kn && space > 10
|
|
850
|
+
* (I.e. reasonably large space and kn:sp ratio)
|
|
851
|
+
* && > 3/4 # gaps < kn + (sp - kn)/3
|
|
852
|
+
* (I.e. most gaps are well away from space estimate)
|
|
853
|
+
* && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
|
|
854
|
+
* somewhere in the histogram between kn and sp
|
|
855
|
+
* THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
|
|
856
|
+
* NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
|
|
857
|
+
* try moving the default threshold to within this band but leave the
|
|
858
|
+
* fuzzy limit calculation as at present.
|
|
859
|
+
*************************************************************************/
|
|
860
|
+
|
|
861
|
+
/*
 * Move row->space_threshold into an empty band of the gap histogram, if the
 * row looks like a "normal line" and such a band exists between the kern and
 * space estimates.
 *
 * row           - row whose space_size / kern_size are read and whose
 *                 space_threshold may be updated
 * all_gap_stats - histogram of all gaps in the row (read only here)
 *
 * Nothing is changed when the row has 25 or fewer gaps, when space_size is
 * not above both 10 and 3 * kern_size, when fewer than 3/4 of the gaps are
 * below roughly kn + (sp - kn)/3, when no wide enough run of empty histogram
 * piles is found, or when the threshold already lies inside that run.
 */
void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
  float sp = row->space_size;
  float kn = row->kern_size;
  inT16 reqd_zero_width = 0;     //min width of empty run
  inT16 zero_width = 0;          //width of current empty run
  inT16 zero_start = 0;          //first pile of current empty run
  inT16 index = 0;

  if (tosp_debug_level > 10)
    tprintf ("Improve row threshold 0");
  if ((all_gap_stats->get_total () <= 25) ||
      (sp <= 10) ||
      (sp <= 3 * kn) ||
      (stats_count_under (all_gap_stats,
                          (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
       (0.75 * all_gap_stats->get_total ())))
    return;
  if (tosp_debug_level > 10)
    tprintf (" 1");
  /*
  Look for the first region of all 0's in the histogram which is wider than
  max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
  threshold is not within it, move the threshold so that it is just inside it.
  */
  reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
  if (reqd_zero_width < 3)
    reqd_zero_width = 3;

  for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
    if (all_gap_stats->pile_count (index) == 0) {
      if (zero_width == 0)
        zero_start = index;
      zero_width++;
    }
    else {
      if (zero_width >= reqd_zero_width)
        break;
      else {
        zero_width = 0;
      }
    }
  }
  //step back to the last pile examined inside the run
  index--;
  if (tosp_debug_level > 10)
    /* Format string previously ended in "/n", which printed a literal
       slash-n instead of terminating the debug line. */
    tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d\n",
             reqd_zero_width, zero_width, zero_start, row->space_threshold);
  if ((zero_width < reqd_zero_width) ||
      ((row->space_threshold >= zero_start) &&
       (row->space_threshold <= index)))
    return;
  if (tosp_debug_level > 10)
    tprintf (" 2");
  if (row->space_threshold < zero_start) {
    //threshold is below the empty run: raise it to the run's start
    if (tosp_debug_level > 5)
      tprintf
        ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
         kn, sp, zero_start, index, row->space_threshold, zero_start);
    row->space_threshold = zero_start;
  }
  if (row->space_threshold > index) {
    //threshold is above the empty run: lower it to the run's end
    if (tosp_debug_level > 5)
      tprintf
        ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
         kn, sp, zero_start, index, row->space_threshold, index);
    row->space_threshold = index;
  }
}
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
/**********************************************************************
|
|
931
|
+
* make_prop_words
|
|
932
|
+
*
|
|
933
|
+
 * Convert a TO_ROW into a ROW by grouping its blobs into words.
|
|
934
|
+
**********************************************************************/
|
|
935
|
+
|
|
936
|
+
/*
 * Build the output ROW for a TO_ROW: walk the row's blob list, collect blobs
 * into words, and splice in the row's repeated-character words
 * (row->rep_words) according to their bounding boxes.
 *
 * A word is closed when the next real blob starts to the right of the next
 * repeated-char word, when make_a_word_break() says so, or when the blob
 * iterator has wrapped round to the first blob.
 *
 * row      - row to convert; its blobs are moved into the new words and its
 *            rep_words are extracted
 * rotation - not referenced in this function body
 *
 * Returns a newly allocated ROW holding the words, or NULL when the row's
 * blob list is empty.
 */
ROW *make_prop_words(                 //find lines
                     TO_ROW *row,     //row to make
                     FCOORD rotation  //for drawing
                    ) {
  BOOL8 bol;                     //start of line
  /* prev_ values are for start of word being built. non prev_ values are for
  the gap between the word being built and the next one. */
  BOOL8 prev_fuzzy_sp;           //probably space
  BOOL8 prev_fuzzy_non;          //probably not
  uinT8 prev_blanks;             //in front of word
  BOOL8 fuzzy_sp;                //probably space
  BOOL8 fuzzy_non;               //probably not
  uinT8 blanks;                  //in front of word
  ROW *real_row;                 //output row
  OUTLINE_IT out_it;             //outlines
  C_OUTLINE_IT cout_it;
  PBLOB_LIST blobs;              //blobs in word
  C_BLOB_LIST cblobs;
  PBLOB_IT blob_it = &blobs;     //iterator
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD_IT word_it;               //new words
  WERD *word;                    //new word
  WERD_IT rep_char_it;           //repeated char words
  inT32 next_rep_char_word_right = MAX_INT32;
  float repetition_spacing;      //gap between repetitions
  inT32 xstarts[2];              //row ends
  double coeffs[3];              //quadratic
  inT32 prev_x;                  //end of prev blob
  BLOBNBOX *bblob;               //current blob
  TBOX blob_box;                 //bounding box
  BLOBNBOX_IT box_it;            //iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  inT16 prev_gap = MAX_INT16;
  inT16 current_gap = MAX_INT16;
  inT16 next_gap = MAX_INT16;
  inT16 prev_within_xht_gap = MAX_INT16;
  inT16 current_within_xht_gap = MAX_INT16;
  inT16 next_within_xht_gap = MAX_INT16;
  inT16 word_count = 0;
  static inT16 row_count = 0;    //only used in the debug message below

  row_count++;
  rep_char_it.set_to_list (&(row->rep_words));
  if (!rep_char_it.empty ()) {
    next_rep_char_word_right =
      rep_char_it.data ()->bounding_box ().right ();
  }

  prev_x = -MAX_INT16;
  blob_it.set_to_list (&blobs);
  cblob_it.set_to_list (&cblobs);
  box_it.set_to_list (row->blob_list ());
  word_it.set_to_list (&words);
  bol = TRUE;
  prev_blanks = 0;
  prev_fuzzy_sp = FALSE;
  prev_fuzzy_non = FALSE;
  if (!box_it.empty ()) {
    xstarts[0] = box_it.data ()->bounding_box ().left ();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract ();
      word_it.add_after_then_move (word);
      /* Set spaces before repeated char word */
      word->set_flag (W_BOL, TRUE);
      bol = FALSE;
      word->set_blanks (0);
      //NO uncertainty
      word->set_flag (W_FUZZY_SP, FALSE);
      word->set_flag (W_FUZZY_NON, FALSE);
      xstarts[0] = word->bounding_box ().left ();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing (word);
      current_gap = box_it.data ()->bounding_box ().left () -
        next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        prev_blanks = (uinT8) floor (current_gap / row->space_size);
        if (prev_blanks < 1)
          prev_blanks = 1;
      }
      else
        prev_blanks = 0;
      if (tosp_debug_level > 5)
        tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
                 box_it.data ()->bounding_box ().left (),
                 box_it.data ()->bounding_box ().bottom (),
                 repetition_spacing, current_gap);
      prev_fuzzy_sp = FALSE;
      prev_fuzzy_non = FALSE;
      if (rep_char_it.empty ()) {
        next_rep_char_word_right = MAX_INT32;
      }
      else {
        rep_char_it.forward ();
        next_rep_char_word_right =
          rep_char_it.data ()->bounding_box ().right ();
      }
    }

    peek_at_next_gap(row,
                     box_it,
                     next_blob_box,
                     next_gap,
                     next_within_xht_gap);
    do {
      bblob = box_it.data ();
      blob_box = bblob->bounding_box ();
      if (bblob->joined_to_prev ()) {
        /* Merge this blob's outlines into the last blob collected. */
        if (bblob->blob () != NULL) {
          out_it.set_to_list (blob_it.data ()->out_list ());
          out_it.move_to_last ();
          out_it.add_list_after (bblob->blob ()->out_list ());
          delete bblob->blob ();
        }
        else if (bblob->cblob () != NULL) {
          cout_it.set_to_list (cblob_it.data ()->out_list ());
          cout_it.move_to_last ();
          cout_it.add_list_after (bblob->cblob ()->out_list ());
          delete bblob->cblob ();
        }
      }
      else {
        if (bblob->blob () != NULL)
          blob_it.add_after_then_move (bblob->blob ());
        else if (bblob->cblob () != NULL)
          cblob_it.add_after_then_move (bblob->cblob ());
        prev_x = blob_box.right ();
      }
      box_it.forward ();         //next one
      bblob = box_it.data ();
      blob_box = bblob->bounding_box ();

      if (!bblob->joined_to_prev () &&
          (bblob->blob () != NULL || bblob->cblob () != NULL)) {
        /* Real Blob - not multiple outlines or pre-chopped */
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row,
                         box_it,
                         next_blob_box,
                         next_gap,
                         next_within_xht_gap);

        if ((blob_box.left () > next_rep_char_word_right) ||
            (!tosp_only_use_xht_gaps &&
             make_a_word_break (row, blob_box, prev_gap, prev_blob_box,
                                current_gap, current_within_xht_gap,
                                next_blob_box, next_gap,
                                blanks, fuzzy_sp, fuzzy_non)) ||
            (tosp_only_use_xht_gaps &&
             make_a_word_break (row, blob_box, prev_within_xht_gap,
                                prev_blob_box,
                                current_gap, current_within_xht_gap,
                                next_blob_box, next_within_xht_gap,
                                blanks, fuzzy_sp, fuzzy_non)) ||
            box_it.at_first ()) {
          /* Form a new word out of the blobs collected */
          if (!blob_it.empty ()) {
            //make real word
            word = new WERD (&blobs, prev_blanks, NULL);
            word_count++;
          }
          else {
            word = new WERD (&cblobs, prev_blanks, NULL);
            word_count++;
          }
          word_it.add_after_then_move (word);
          if (bol) {
            word->set_flag (W_BOL, TRUE);
            bol = FALSE;
          }
          if (prev_fuzzy_sp)
            //probably space
            word->set_flag (W_FUZZY_SP, TRUE);
          else if (prev_fuzzy_non)
            //probably not
            word->set_flag (W_FUZZY_NON, TRUE);

          if (blob_box.left () > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            word = rep_char_it.extract ();
            word_it.add_after_then_move (word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing (word);
            current_gap = word->bounding_box ().left () - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks =
                (uinT8) floor (current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            }
            else
              blanks = 0;
            if (tosp_debug_level > 5)
              tprintf
                ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
                 word->bounding_box ().left (),
                 word->bounding_box ().bottom (),
                 repetition_spacing, current_gap, blanks);
            word->set_blanks (blanks);
            //NO uncertainty
            word->set_flag (W_FUZZY_SP, FALSE);
            word->set_flag (W_FUZZY_NON, FALSE);

            /* Set spaces after repeated char word (and leave current word set) */
            current_gap =
              blob_box.left () - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = (uinT8) (current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            }
            else
              blanks = 0;
            if (tosp_debug_level > 5)
              tprintf (" Rgap:%d (%d blanks)\n",
                       current_gap, blanks);
            fuzzy_sp = FALSE;
            fuzzy_non = FALSE;

            if (rep_char_it.empty ()) {
              next_rep_char_word_right = MAX_INT32;
            }
            else {
              rep_char_it.forward ();
              next_rep_char_word_right =
                rep_char_it.data ()->bounding_box ().right ();
            }
          }

          if (box_it.at_first () && rep_char_it.empty ()) {
            //at end of line
            word->set_flag (W_EOL, TRUE);
            xstarts[1] = prev_x;
          }
          else {
            /* Carry this gap's verdict forward to the next word built. */
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    }
    while (!box_it.at_first ()); //until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty ()) {
      word = rep_char_it.extract ();
      word_it.add_after_then_move (word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing (word);
      current_gap = word->bounding_box ().left () - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = (uinT8) floor (current_gap / row->space_size);
        if (blanks < 1)
          blanks = 1;
      }
      else
        blanks = 0;
      if (tosp_debug_level > 5)
        /* repetition_spacing is a float: it was printed with %d, which
           mismatches the argument type and corrupts the values after it. */
        tprintf
          ("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
           word->bounding_box ().left (), word->bounding_box ().bottom (),
           repetition_spacing, current_gap, blanks);
      word->set_blanks (blanks);
      //NO uncertainty
      word->set_flag (W_FUZZY_SP, FALSE);
      word->set_flag (W_FUZZY_NON, FALSE);
      prev_x = word->bounding_box ().right ();
      if (rep_char_it.empty ()) {
        //at end of line
        word->set_flag (W_EOL, TRUE);
        xstarts[1] = prev_x;
      }
      else {
        rep_char_it.forward ();
      }
    }
    coeffs[0] = 0;
    coeffs[1] = row->line_m ();
    coeffs[2] = row->line_c ();
    real_row = new ROW (row,
                        (inT16) row->kern_size, (inT16) row->space_size);
    word_it.set_to_list (real_row->word_list ());
    //put words in row
    word_it.add_list_after (&words);
    real_row->recalc_bounding_box ();
    if (tosp_debug_level > 9) {
      tprintf ("Row %d Made %d words in row ((%d,%d)(%d,%d))\n",
               row_count,
               word_count,
               real_row->bounding_box ().left (),
               real_row->bounding_box ().bottom (),
               real_row->bounding_box ().right (),
               real_row->bounding_box ().top ());
    }
    return real_row;
  }
  return NULL;
}
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
BOOL8 make_a_word_break( //decide on word break
|
|
1248
|
+
TO_ROW *row, //row being made
|
|
1249
|
+
TBOX blob_box, //for next_blob //how many blanks?
|
|
1250
|
+
inT16 prev_gap,
|
|
1251
|
+
TBOX prev_blob_box,
|
|
1252
|
+
inT16 real_current_gap,
|
|
1253
|
+
inT16 within_xht_current_gap,
|
|
1254
|
+
TBOX next_blob_box,
|
|
1255
|
+
inT16 next_gap,
|
|
1256
|
+
uinT8 &blanks,
|
|
1257
|
+
BOOL8 &fuzzy_sp,
|
|
1258
|
+
BOOL8 &fuzzy_non) {
|
|
1259
|
+
static BOOL8 prev_gap_was_a_space;
|
|
1260
|
+
BOOL8 space;
|
|
1261
|
+
inT16 current_gap;
|
|
1262
|
+
float fuzzy_sp_to_kn_limit;
|
|
1263
|
+
|
|
1264
|
+
/* Inhibit using the reduced gap if
|
|
1265
|
+
The kerning is large - chars are not kerned and reducing "f"s can cause
|
|
1266
|
+
erroneous blanks
|
|
1267
|
+
OR The real gap is less than 0
|
|
1268
|
+
OR The real gap is less than the kerning estimate
|
|
1269
|
+
*/
|
|
1270
|
+
if ((row->kern_size > tosp_large_kerning * row->xheight) ||
|
|
1271
|
+
((tosp_dont_fool_with_small_kerns >= 0) &&
|
|
1272
|
+
(real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
|
|
1273
|
+
//Ignore the difference
|
|
1274
|
+
within_xht_current_gap = real_current_gap;
|
|
1275
|
+
|
|
1276
|
+
if (tosp_use_xht_gaps && tosp_only_use_xht_gaps)
|
|
1277
|
+
current_gap = within_xht_current_gap;
|
|
1278
|
+
else
|
|
1279
|
+
current_gap = real_current_gap;
|
|
1280
|
+
|
|
1281
|
+
if (tosp_old_to_method) {
|
|
1282
|
+
//Boring old method
|
|
1283
|
+
space = current_gap > row->max_nonspace;
|
|
1284
|
+
if (space && (current_gap < MAX_INT16)) {
|
|
1285
|
+
if (current_gap < row->min_space) {
|
|
1286
|
+
if (current_gap > row->space_threshold) {
|
|
1287
|
+
blanks = 1;
|
|
1288
|
+
fuzzy_sp = TRUE;
|
|
1289
|
+
fuzzy_non = FALSE;
|
|
1290
|
+
}
|
|
1291
|
+
else {
|
|
1292
|
+
blanks = 0;
|
|
1293
|
+
fuzzy_sp = FALSE;
|
|
1294
|
+
fuzzy_non = TRUE;
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1297
|
+
else {
|
|
1298
|
+
blanks = (uinT8) (current_gap / row->space_size);
|
|
1299
|
+
if (blanks < 1)
|
|
1300
|
+
blanks = 1;
|
|
1301
|
+
fuzzy_sp = FALSE;
|
|
1302
|
+
fuzzy_non = FALSE;
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1305
|
+
return space;
|
|
1306
|
+
}
|
|
1307
|
+
else {
|
|
1308
|
+
/* New exciting heuristic method */
|
|
1309
|
+
if (prev_blob_box.null_box ())
|
|
1310
|
+
//Beginning of row
|
|
1311
|
+
prev_gap_was_a_space = TRUE;
|
|
1312
|
+
|
|
1313
|
+
//Default as old TO
|
|
1314
|
+
space = current_gap > row->space_threshold;
|
|
1315
|
+
|
|
1316
|
+
/* Set defaults for the word break incase we find one. Currently there are
|
|
1317
|
+
no fuzzy spaces. Depending on the reliability of the different heuristics
|
|
1318
|
+
we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
|
|
1319
|
+
be used if the function returns TRUE - ie the word is to be broken.
|
|
1320
|
+
*/
|
|
1321
|
+
blanks = (uinT8) (current_gap / row->space_size);
|
|
1322
|
+
if (blanks < 1)
|
|
1323
|
+
blanks = 1;
|
|
1324
|
+
fuzzy_sp = FALSE;
|
|
1325
|
+
fuzzy_non = FALSE;
|
|
1326
|
+
/*
|
|
1327
|
+
If xht measure causes gap to flip one of the 3 thresholds act accordingly -
|
|
1328
|
+
despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
|
|
1329
|
+
context.
|
|
1330
|
+
*/
|
|
1331
|
+
if (tosp_use_xht_gaps &&
|
|
1332
|
+
(real_current_gap <= row->max_nonspace) &&
|
|
1333
|
+
(within_xht_current_gap > row->max_nonspace)) {
|
|
1334
|
+
space = TRUE;
|
|
1335
|
+
fuzzy_non = TRUE;
|
|
1336
|
+
#ifndef GRAPHICS_DISABLED
|
|
1337
|
+
mark_gap (blob_box, 20,
|
|
1338
|
+
prev_gap, prev_blob_box.width (),
|
|
1339
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1340
|
+
#endif
|
|
1341
|
+
}
|
|
1342
|
+
else if (tosp_use_xht_gaps &&
|
|
1343
|
+
(real_current_gap <= row->space_threshold) &&
|
|
1344
|
+
(within_xht_current_gap > row->space_threshold)) {
|
|
1345
|
+
space = TRUE;
|
|
1346
|
+
if (tosp_flip_fuzz_kn_to_sp)
|
|
1347
|
+
fuzzy_sp = TRUE;
|
|
1348
|
+
else
|
|
1349
|
+
fuzzy_non = TRUE;
|
|
1350
|
+
#ifndef GRAPHICS_DISABLED
|
|
1351
|
+
mark_gap (blob_box, 21,
|
|
1352
|
+
prev_gap, prev_blob_box.width (),
|
|
1353
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1354
|
+
#endif
|
|
1355
|
+
}
|
|
1356
|
+
else if (tosp_use_xht_gaps &&
|
|
1357
|
+
(real_current_gap < row->min_space) &&
|
|
1358
|
+
(within_xht_current_gap >= row->min_space)) {
|
|
1359
|
+
space = TRUE;
|
|
1360
|
+
#ifndef GRAPHICS_DISABLED
|
|
1361
|
+
mark_gap (blob_box, 22,
|
|
1362
|
+
prev_gap, prev_blob_box.width (),
|
|
1363
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1364
|
+
#endif
|
|
1365
|
+
}
|
|
1366
|
+
/* Now continue with normal heuristics */
|
|
1367
|
+
else if ((current_gap < row->min_space) &&
|
|
1368
|
+
(current_gap > row->space_threshold)) {
|
|
1369
|
+
/* Heuristics to turn dubious spaces to kerns */
|
|
1370
|
+
if (tosp_pass_wide_fuzz_sp_to_context > 0)
|
|
1371
|
+
fuzzy_sp_to_kn_limit = row->kern_size +
|
|
1372
|
+
tosp_pass_wide_fuzz_sp_to_context *
|
|
1373
|
+
(row->space_size - row->kern_size);
|
|
1374
|
+
else
|
|
1375
|
+
fuzzy_sp_to_kn_limit = 99999.0f;
|
|
1376
|
+
|
|
1377
|
+
/* If current gap is significantly smaller than the previous space the other
|
|
1378
|
+
side of a narrow blob then this gap is a kern. */
|
|
1379
|
+
if ((prev_blob_box.width () > 0) &&
|
|
1380
|
+
narrow_blob (row, prev_blob_box) &&
|
|
1381
|
+
prev_gap_was_a_space &&
|
|
1382
|
+
(current_gap <= tosp_gap_factor * prev_gap)) {
|
|
1383
|
+
if ((tosp_all_flips_fuzzy) ||
|
|
1384
|
+
(current_gap > fuzzy_sp_to_kn_limit)) {
|
|
1385
|
+
if (tosp_flip_fuzz_sp_to_kn)
|
|
1386
|
+
fuzzy_non = TRUE;
|
|
1387
|
+
else
|
|
1388
|
+
fuzzy_sp = TRUE;
|
|
1389
|
+
}
|
|
1390
|
+
else
|
|
1391
|
+
space = FALSE;
|
|
1392
|
+
#ifndef GRAPHICS_DISABLED
|
|
1393
|
+
mark_gap (blob_box, 1,
|
|
1394
|
+
prev_gap, prev_blob_box.width (),
|
|
1395
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1396
|
+
#endif
|
|
1397
|
+
}
|
|
1398
|
+
/* If current gap not much bigger than the previous kern the other side of a
|
|
1399
|
+
narrow blob then this gap is a kern as well */
|
|
1400
|
+
else if ((prev_blob_box.width () > 0) &&
|
|
1401
|
+
narrow_blob (row, prev_blob_box) &&
|
|
1402
|
+
!prev_gap_was_a_space &&
|
|
1403
|
+
(current_gap * tosp_gap_factor <= prev_gap)) {
|
|
1404
|
+
if ((tosp_all_flips_fuzzy) ||
|
|
1405
|
+
(current_gap > fuzzy_sp_to_kn_limit)) {
|
|
1406
|
+
if (tosp_flip_fuzz_sp_to_kn)
|
|
1407
|
+
fuzzy_non = TRUE;
|
|
1408
|
+
else
|
|
1409
|
+
fuzzy_sp = TRUE;
|
|
1410
|
+
}
|
|
1411
|
+
else
|
|
1412
|
+
space = FALSE;
|
|
1413
|
+
#ifndef GRAPHICS_DISABLED
|
|
1414
|
+
mark_gap (blob_box, 2,
|
|
1415
|
+
prev_gap, prev_blob_box.width (),
|
|
1416
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1417
|
+
#endif
|
|
1418
|
+
}
|
|
1419
|
+
else if ((next_blob_box.width () > 0) &&
|
|
1420
|
+
narrow_blob (row, next_blob_box) &&
|
|
1421
|
+
(next_gap > row->space_threshold) &&
|
|
1422
|
+
(current_gap <= tosp_gap_factor * next_gap)) {
|
|
1423
|
+
if ((tosp_all_flips_fuzzy) ||
|
|
1424
|
+
(current_gap > fuzzy_sp_to_kn_limit)) {
|
|
1425
|
+
if (tosp_flip_fuzz_sp_to_kn)
|
|
1426
|
+
fuzzy_non = TRUE;
|
|
1427
|
+
else
|
|
1428
|
+
fuzzy_sp = TRUE;
|
|
1429
|
+
}
|
|
1430
|
+
else
|
|
1431
|
+
space = FALSE;
|
|
1432
|
+
#ifndef GRAPHICS_DISABLED
|
|
1433
|
+
mark_gap (blob_box, 3,
|
|
1434
|
+
prev_gap, prev_blob_box.width (),
|
|
1435
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1436
|
+
#endif
|
|
1437
|
+
}
|
|
1438
|
+
else if ((next_blob_box.width () > 0) &&
|
|
1439
|
+
narrow_blob (row, next_blob_box) &&
|
|
1440
|
+
(next_gap <= row->space_threshold) &&
|
|
1441
|
+
(current_gap * tosp_gap_factor <= next_gap)) {
|
|
1442
|
+
if ((tosp_all_flips_fuzzy) ||
|
|
1443
|
+
(current_gap > fuzzy_sp_to_kn_limit)) {
|
|
1444
|
+
if (tosp_flip_fuzz_sp_to_kn)
|
|
1445
|
+
fuzzy_non = TRUE;
|
|
1446
|
+
else
|
|
1447
|
+
fuzzy_sp = TRUE;
|
|
1448
|
+
}
|
|
1449
|
+
else
|
|
1450
|
+
space = FALSE;
|
|
1451
|
+
#ifndef GRAPHICS_DISABLED
|
|
1452
|
+
mark_gap (blob_box, 4,
|
|
1453
|
+
prev_gap, prev_blob_box.width (),
|
|
1454
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1455
|
+
#endif
|
|
1456
|
+
}
|
|
1457
|
+
else if ((((next_blob_box.width () > 0) &&
|
|
1458
|
+
narrow_blob (row, next_blob_box)) ||
|
|
1459
|
+
((prev_blob_box.width () > 0) &&
|
|
1460
|
+
narrow_blob (row, prev_blob_box)))) {
|
|
1461
|
+
fuzzy_sp = TRUE;
|
|
1462
|
+
#ifndef GRAPHICS_DISABLED
|
|
1463
|
+
mark_gap (blob_box, 6,
|
|
1464
|
+
prev_gap, prev_blob_box.width (),
|
|
1465
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1466
|
+
#endif
|
|
1467
|
+
}
|
|
1468
|
+
}
|
|
1469
|
+
else if ((current_gap > row->max_nonspace) &&
|
|
1470
|
+
(current_gap <= row->space_threshold)) {
|
|
1471
|
+
|
|
1472
|
+
/* Heuristics to turn dubious kerns to spaces */
|
|
1473
|
+
/* TRIED THIS BUT IT MADE THINGS WORSE
|
|
1474
|
+
if ( prev_gap == MAX_INT16 )
|
|
1475
|
+
prev_gap = 0; //start of row
|
|
1476
|
+
if ( next_gap == MAX_INT16 )
|
|
1477
|
+
next_gap = 0; //end of row
|
|
1478
|
+
*/
|
|
1479
|
+
if ((prev_blob_box.width () > 0) &&
|
|
1480
|
+
(next_blob_box.width () > 0) &&
|
|
1481
|
+
(current_gap >=
|
|
1482
|
+
tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
|
|
1483
|
+
wide_blob (row, prev_blob_box) &&
|
|
1484
|
+
wide_blob (row, next_blob_box)) {
|
|
1485
|
+
|
|
1486
|
+
space = TRUE;
|
|
1487
|
+
/*
|
|
1488
|
+
tosp_flip_caution is an attempt to stop the default changing in cases
|
|
1489
|
+
where there is a large difference between the kern and space estimates.
|
|
1490
|
+
See problem in 'chiefs' where "have" gets split in the quotation.
|
|
1491
|
+
*/
|
|
1492
|
+
if ((tosp_flip_fuzz_kn_to_sp) &&
|
|
1493
|
+
((tosp_flip_caution <= 0) ||
|
|
1494
|
+
(tosp_flip_caution * row->kern_size > row->space_size)))
|
|
1495
|
+
fuzzy_sp = TRUE;
|
|
1496
|
+
else
|
|
1497
|
+
fuzzy_non = TRUE;
|
|
1498
|
+
#ifndef GRAPHICS_DISABLED
|
|
1499
|
+
mark_gap (blob_box, 7,
|
|
1500
|
+
prev_gap, prev_blob_box.width (),
|
|
1501
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1502
|
+
#endif
|
|
1503
|
+
}
|
|
1504
|
+
else if ((prev_blob_box.width () > 0) &&
|
|
1505
|
+
(next_blob_box.width () > 0) &&
|
|
1506
|
+
(current_gap >=
|
|
1507
|
+
tosp_kern_gap_factor2 * MAX (prev_gap, next_gap)) &&
|
|
1508
|
+
!(narrow_blob (row, prev_blob_box) ||
|
|
1509
|
+
suspected_punct_blob (row, prev_blob_box)) &&
|
|
1510
|
+
!(narrow_blob (row, next_blob_box) ||
|
|
1511
|
+
suspected_punct_blob (row, next_blob_box))) {
|
|
1512
|
+
space = TRUE;
|
|
1513
|
+
fuzzy_non = TRUE;
|
|
1514
|
+
#ifndef GRAPHICS_DISABLED
|
|
1515
|
+
mark_gap (blob_box, 8,
|
|
1516
|
+
prev_gap, prev_blob_box.width (),
|
|
1517
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1518
|
+
#endif
|
|
1519
|
+
}
|
|
1520
|
+
else if ((tosp_kern_gap_factor3 > 0) &&
|
|
1521
|
+
(prev_blob_box.width () > 0) &&
|
|
1522
|
+
(next_blob_box.width () > 0) &&
|
|
1523
|
+
(current_gap >=
|
|
1524
|
+
tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
|
|
1525
|
+
(!tosp_rule_9_test_punct ||
|
|
1526
|
+
(!suspected_punct_blob (row, prev_blob_box) &&
|
|
1527
|
+
!suspected_punct_blob (row, next_blob_box)))) {
|
|
1528
|
+
space = TRUE;
|
|
1529
|
+
fuzzy_non = TRUE;
|
|
1530
|
+
#ifndef GRAPHICS_DISABLED
|
|
1531
|
+
mark_gap (blob_box, 9,
|
|
1532
|
+
prev_gap, prev_blob_box.width (),
|
|
1533
|
+
current_gap, next_blob_box.width (), next_gap);
|
|
1534
|
+
#endif
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
prev_gap_was_a_space = space && !(fuzzy_non);
|
|
1538
|
+
return space;
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
/**********************************************************************
 * narrow_blob
 *
 * Returns TRUE when the box counts as narrow for this row: either its
 * width is at most tosp_narrow_fraction times the row xheight, or its
 * width/height ratio is at most tosp_narrow_aspect_ratio.
 **********************************************************************/

BOOL8 narrow_blob(TO_ROW *row, TBOX blob_box) {
  // Absolute test first; the aspect ratio is only computed if it fails.
  if (blob_box.width () <= tosp_narrow_fraction * row->xheight)
    return TRUE;

  return (((float) blob_box.width () / blob_box.height ()) <=
    tosp_narrow_aspect_ratio);
}
|
|
1551
|
+
|
|
1552
|
+
|
|
1553
|
+
/**********************************************************************
 * wide_blob
 *
 * Returns TRUE when the box counts as wide for this row.
 *
 * If tosp_wide_fraction is not positive, "wide" simply means "not
 * narrow" (see narrow_blob). Otherwise the width must be at least
 * tosp_wide_fraction times the row xheight and, when
 * tosp_wide_aspect_ratio is positive, the width/height ratio must also
 * exceed tosp_wide_aspect_ratio.
 **********************************************************************/

BOOL8 wide_blob(TO_ROW *row, TBOX blob_box) {
  // No explicit width fraction configured: fall back on the narrow test.
  if (!(tosp_wide_fraction > 0))
    return !narrow_blob (row, blob_box);

  // Width relative to xheight is required in both remaining cases.
  if (!(blob_box.width () >= tosp_wide_fraction * row->xheight))
    return FALSE;

  // Aspect ratio test is optional.
  if (!(tosp_wide_aspect_ratio > 0))
    return TRUE;

  return (((float) blob_box.width () / blob_box.height ()) >
    tosp_wide_aspect_ratio);
}
|
|
1568
|
+
|
|
1569
|
+
|
|
1570
|
+
/**********************************************************************
 * suspected_punct_blob
 *
 * Returns TRUE when the box looks like punctuation, i.e. when any of
 * the following holds:
 *   - its height is at most 0.66 of the row xheight;
 *   - its top lies below the line half an xheight above the baseline;
 *   - its bottom lies above that same line.
 * The baseline is evaluated at the horizontal centre of the box.
 **********************************************************************/

BOOL8 suspected_punct_blob(TO_ROW *row, TBOX box) {
  // Baseline height under the centre of the blob.
  float blob_x_centre = (box.right () + box.left ()) / 2.0;
  float baseline = row->baseline.y (blob_x_centre);

  // Height of the line midway between baseline and xheight.
  double mid_xheight = baseline + row->xheight / 2.0;

  if (box.height () <= 0.66 * row->xheight)
    return TRUE;                 // too short to be a full character
  if (box.top () < mid_xheight)
    return TRUE;                 // sits entirely in the lower half
  return (box.bottom () > mid_xheight);  // floats entirely in the upper part
}
|
|
1585
|
+
|
|
1586
|
+
|
|
1587
|
+
/**********************************************************************
 * peek_at_next_gap
 *
 * Look ahead without disturbing the caller's position: box_it is passed
 * by value, so only this function's copy is advanced.
 *
 * On return next_blob_box holds the box produced by box_next for the
 * blob at box_it, next_gap the distance from its right edge to the left
 * edge of the blob after it, and next_within_xht_gap the same distance
 * measured between the boxes produced by reduced_box_next. If advancing
 * wraps the iterator to the first element, both gaps are set to
 * MAX_INT16.
 **********************************************************************/

void peek_at_next_gap(  //A COPY FOR PEEKING
                      TO_ROW *row,
                      BLOBNBOX_IT box_it,
                      TBOX &next_blob_box,
                      inT16 &next_gap,
                      inT16 &next_within_xht_gap) {
  // Second cursor, advanced in step, for the reduced boxes.
  BLOBNBOX_IT reduced_box_it = box_it;

  next_blob_box = box_next (&box_it);
  TBOX next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);

  if (box_it.at_first ()) {
    // Wrapped round: nothing follows the next blob.
    next_gap = MAX_INT16;
    next_within_xht_gap = MAX_INT16;
    return;
  }

  TBOX following_box = box_it.data ()->bounding_box ();
  next_gap = following_box.left () - next_blob_box.right ();

  TBOX following_reduced_box = reduced_box_next (row, &reduced_box_it);
  next_within_xht_gap =
    following_reduced_box.left () - next_reduced_blob_box.right ();
}
|
|
1611
|
+
|
|
1612
|
+
|
|
1613
|
+
#ifndef GRAPHICS_DISABLED
|
|
1614
|
+
void mark_gap( //Debug stuff
|
|
1615
|
+
TBOX blob, //blob following gap
|
|
1616
|
+
inT16 rule, // heuristic id
|
|
1617
|
+
inT16 prev_gap,
|
|
1618
|
+
inT16 prev_blob_width,
|
|
1619
|
+
inT16 current_gap,
|
|
1620
|
+
inT16 next_blob_width,
|
|
1621
|
+
inT16 next_gap) {
|
|
1622
|
+
ScrollView::Color col; //of ellipse marking flipped gap
|
|
1623
|
+
|
|
1624
|
+
switch (rule) {
|
|
1625
|
+
case 1:
|
|
1626
|
+
col = ScrollView::RED;
|
|
1627
|
+
break;
|
|
1628
|
+
case 2:
|
|
1629
|
+
col = ScrollView::CYAN;
|
|
1630
|
+
break;
|
|
1631
|
+
case 3:
|
|
1632
|
+
col = ScrollView::GREEN;
|
|
1633
|
+
break;
|
|
1634
|
+
case 4:
|
|
1635
|
+
col = ScrollView::BLACK;
|
|
1636
|
+
break;
|
|
1637
|
+
case 5:
|
|
1638
|
+
col = ScrollView::MAGENTA;
|
|
1639
|
+
break;
|
|
1640
|
+
case 6:
|
|
1641
|
+
col = ScrollView::BLUE;
|
|
1642
|
+
break;
|
|
1643
|
+
|
|
1644
|
+
case 7:
|
|
1645
|
+
col = ScrollView::WHITE;
|
|
1646
|
+
break;
|
|
1647
|
+
case 8:
|
|
1648
|
+
col = ScrollView::YELLOW;
|
|
1649
|
+
break;
|
|
1650
|
+
case 9:
|
|
1651
|
+
col = ScrollView::BLACK;
|
|
1652
|
+
break;
|
|
1653
|
+
|
|
1654
|
+
case 20:
|
|
1655
|
+
col = ScrollView::CYAN;
|
|
1656
|
+
break;
|
|
1657
|
+
case 21:
|
|
1658
|
+
col = ScrollView::GREEN;
|
|
1659
|
+
break;
|
|
1660
|
+
case 22:
|
|
1661
|
+
col = ScrollView::MAGENTA;
|
|
1662
|
+
break;
|
|
1663
|
+
default:
|
|
1664
|
+
col = ScrollView::BLACK;
|
|
1665
|
+
}
|
|
1666
|
+
if (textord_show_initial_words) {
|
|
1667
|
+
to_win->Pen(col);
|
|
1668
|
+
/* if (rule < 20)
|
|
1669
|
+
//interior_style(to_win, INT_SOLID, FALSE);
|
|
1670
|
+
else
|
|
1671
|
+
//interior_style(to_win, INT_HOLLOW, TRUE);*/
|
|
1672
|
+
//x radius
|
|
1673
|
+
to_win->Ellipse (current_gap / 2.0f,
|
|
1674
|
+
blob.height () / 2.0f, //y radius
|
|
1675
|
+
//x centre
|
|
1676
|
+
blob.left () - current_gap / 2.0f,
|
|
1677
|
+
//y centre
|
|
1678
|
+
blob.bottom () + blob.height () / 2.0f);
|
|
1679
|
+
}
|
|
1680
|
+
if (tosp_debug_level > 0)
|
|
1681
|
+
tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
|
|
1682
|
+
blob.left () - current_gap / 2, blob.bottom (), rule,
|
|
1683
|
+
prev_gap, prev_blob_width, current_gap,
|
|
1684
|
+
next_blob_width, next_gap);
|
|
1685
|
+
}
|
|
1686
|
+
#endif
|
|
1687
|
+
|
|
1688
|
+
|
|
1689
|
+
/**********************************************************************
 * find_mean_blob_spacing
 *
 * Returns the mean horizontal gap between consecutive blobs of the
 * word, where a gap is the left edge of a blob minus the right edge of
 * the blob before it in list order. Words flagged W_POLYGON are read
 * from blob_list(), all others from cblob_list(). Returns 0 when the
 * word has fewer than two blobs.
 **********************************************************************/

float find_mean_blob_spacing(WERD *word) {
  inT32 total_gap = 0;           //sum of all gaps seen
  inT16 num_gaps = 0;            //number of gaps summed
  inT16 last_right;              //right edge of previous blob
  TBOX box;                      //box of current blob

  if (word->flag (W_POLYGON)) {
    PBLOB_IT poly_it;

    poly_it.set_to_list (word->blob_list ());
    if (!poly_it.empty ()) {
      poly_it.mark_cycle_pt ();
                                 //first blob only supplies a right edge
      last_right = poly_it.data ()->bounding_box ().right ();
      for (poly_it.forward (); !poly_it.cycled_list (); poly_it.forward ()) {
        box = poly_it.data ()->bounding_box ();
        total_gap += box.left () - last_right;
        num_gaps++;
        last_right = box.right ();
      }
    }
  }
  else {
    C_BLOB_IT outline_it;

    outline_it.set_to_list (word->cblob_list ());
    if (!outline_it.empty ()) {
      outline_it.mark_cycle_pt ();
                                 //first blob only supplies a right edge
      last_right = outline_it.data ()->bounding_box ().right ();
      for (outline_it.forward (); !outline_it.cycled_list ();
           outline_it.forward ()) {
        box = outline_it.data ()->bounding_box ();
        total_gap += box.left () - last_right;
        num_gaps++;
        last_right = box.right ();
      }
    }
  }

  return (num_gaps > 0) ? (total_gap / (float) num_gaps) : 0.0f;
}
|
|
1732
|
+
|
|
1733
|
+
|
|
1734
|
+
/**********************************************************************
 * ignore_big_gap
 *
 * Decide whether the gap spanning left..right (inclusive) is big
 * enough to be ignored. The outcome is governed by tosp_ignore_big_gaps:
 *   > 999 : never ignore;
 *   > 0   : ignore iff the gap exceeds that many xheights;
 *   else  : always ignore a gap wider than tosp_ignore_very_big_gaps
 *           xheights; otherwise
 *     == 0 : ignore gaps > 2.1 xheights on rows longer than 20
 *            xheights, and gaps > 1.75 xheights on rows longer than 35
 *            xheights or where the gapmap reports a table gap;
 *     other: ignore gaps > gapmap_big_gaps xheights only where the
 *            gapmap reports a table gap.
 **********************************************************************/

BOOL8 ignore_big_gap(TO_ROW *row,
                     inT32 row_length,
                     GAPMAP *gapmap,
                     inT16 left,
                     inT16 right) {
  inT16 gap = right - left + 1;

  if (tosp_ignore_big_gaps > 999)
    return FALSE;                //Dont ignore
  if (tosp_ignore_big_gaps > 0)
    return (gap > tosp_ignore_big_gaps * row->xheight);
  if (gap > tosp_ignore_very_big_gaps * row->xheight)
    return TRUE;

  if (tosp_ignore_big_gaps == 0) {
    return (((gap > 2.1 * row->xheight) &&
             (row_length > 20 * row->xheight)) ||
            ((gap > 1.75 * row->xheight) &&
             ((row_length > 35 * row->xheight) ||
              gapmap->table_gap (left, right))));
  }

  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
  return ((gap > gapmap_big_gaps * row->xheight) &&
          gapmap->table_gap (left, right));
}
|
|
1763
|
+
|
|
1764
|
+
|
|
1765
|
+
/**********************************************************************
|
|
1766
|
+
* reduced_box_next
|
|
1767
|
+
*
|
|
1768
|
+
* Compute the bounding box of this blob with merging of x overlaps
|
|
1769
|
+
* but no pre-chopping.
|
|
1770
|
+
* Then move the iterator on to the start of the next blob.
|
|
1771
|
+
* DONT reduce the box for small things - eg punctuation.
|
|
1772
|
+
**********************************************************************/
|
|
1773
|
+
|
|
1774
|
+
TBOX reduced_box_next( //get bounding box
|
|
1775
|
+
TO_ROW *row, //current row
|
|
1776
|
+
BLOBNBOX_IT *it //iterator to blobds
|
|
1777
|
+
) {
|
|
1778
|
+
BLOBNBOX *blob; //current blob
|
|
1779
|
+
BLOBNBOX *head_blob; //place to store box
|
|
1780
|
+
TBOX full_box; //full blob boundg box
|
|
1781
|
+
TBOX reduced_box; //box of significant part
|
|
1782
|
+
inT16 left_above_xht; //ABOVE xht left limit
|
|
1783
|
+
inT16 new_left_above_xht; //ABOVE xht left limit
|
|
1784
|
+
|
|
1785
|
+
blob = it->data ();
|
|
1786
|
+
if (blob->red_box_set ()) {
|
|
1787
|
+
reduced_box = blob->reduced_box ();
|
|
1788
|
+
do {
|
|
1789
|
+
it->forward ();
|
|
1790
|
+
blob = it->data ();
|
|
1791
|
+
}
|
|
1792
|
+
//until next real blob
|
|
1793
|
+
while ((blob->blob () == NULL && blob->cblob () == NULL) || blob->joined_to_prev ());
|
|
1794
|
+
return reduced_box;
|
|
1795
|
+
}
|
|
1796
|
+
head_blob = blob;
|
|
1797
|
+
full_box = blob->bounding_box ();
|
|
1798
|
+
reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
|
|
1799
|
+
do {
|
|
1800
|
+
it->forward ();
|
|
1801
|
+
blob = it->data ();
|
|
1802
|
+
if (blob->blob () == NULL && blob->cblob () == NULL)
|
|
1803
|
+
//was pre-chopped
|
|
1804
|
+
full_box += blob->bounding_box ();
|
|
1805
|
+
else if (blob->joined_to_prev ()) {
|
|
1806
|
+
reduced_box +=
|
|
1807
|
+
reduced_box_for_blob(blob, row, &new_left_above_xht);
|
|
1808
|
+
left_above_xht = MIN (left_above_xht, new_left_above_xht);
|
|
1809
|
+
}
|
|
1810
|
+
}
|
|
1811
|
+
//until next real blob
|
|
1812
|
+
while ((blob->blob () == NULL && blob->cblob () == NULL) || blob->joined_to_prev ());
|
|
1813
|
+
|
|
1814
|
+
if ((reduced_box.width () > 0) &&
|
|
1815
|
+
((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
|
|
1816
|
+
< left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
|
|
1817
|
+
#ifndef GRAPHICS_DISABLED
|
|
1818
|
+
if (textord_show_initial_words)
|
|
1819
|
+
reduced_box.plot (to_win, ScrollView::YELLOW, ScrollView::YELLOW);
|
|
1820
|
+
#endif
|
|
1821
|
+
}
|
|
1822
|
+
else
|
|
1823
|
+
reduced_box = full_box;
|
|
1824
|
+
head_blob->set_reduced_box (reduced_box);
|
|
1825
|
+
return reduced_box;
|
|
1826
|
+
}
|
|
1827
|
+
|
|
1828
|
+
|
|
1829
|
+
/*************************************************************************
|
|
1830
|
+
* reduced_box_for_blob()
|
|
1831
|
+
* Find box for blob which is the same height and y position as the whole blob,
|
|
1832
|
+
* but whose left limit is the left most position of the blob ABOVE the
|
|
1833
|
+
* baseline and whose right limit is the right most position of the blob BELOW
|
|
1834
|
+
* the xheight.
|
|
1835
|
+
*
|
|
1836
|
+
*
|
|
1837
|
+
* !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
|
|
1838
|
+
* "home". Perhaps we need something which say if the width ABOVE the
|
|
1839
|
+
* xht alone includes the whole of the reduced width, then use the full
|
|
1840
|
+
* blob box - Might still fail on italic F
|
|
1841
|
+
*
|
|
1842
|
+
* Alternatively we could be a little less severe and only reduce the
|
|
1843
|
+
* left and right edges by half the difference between the full box and
|
|
1844
|
+
* the reduced box.
|
|
1845
|
+
*
|
|
1846
|
+
* NOTE that we need to rotate all the coordinates as
|
|
1847
|
+
* find_blob_limits finds the y min and max within a specified x band
|
|
1848
|
+
*************************************************************************/
|
|
1849
|
+
|
|
1850
|
+
TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, inT16 *left_above_xht) {
|
|
1851
|
+
float baseline;
|
|
1852
|
+
float blob_x_centre;
|
|
1853
|
+
float left_limit;
|
|
1854
|
+
float right_limit;
|
|
1855
|
+
float junk;
|
|
1856
|
+
TBOX blob_box;
|
|
1857
|
+
|
|
1858
|
+
/* Find baseline of centre of blob */
|
|
1859
|
+
|
|
1860
|
+
blob_box = blob->bounding_box ();
|
|
1861
|
+
blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
|
|
1862
|
+
baseline = row->baseline.y (blob_x_centre);
|
|
1863
|
+
|
|
1864
|
+
/*
|
|
1865
|
+
Find LH limit of blob ABOVE the xht. This is so that we can detect certain
|
|
1866
|
+
caps ht chars which should NOT have their box reduced: T, Y, V, W etc
|
|
1867
|
+
*/
|
|
1868
|
+
left_limit = (float) MAX_INT32;
|
|
1869
|
+
junk = (float) -MAX_INT32;
|
|
1870
|
+
if (blob->blob () != NULL)
|
|
1871
|
+
//blob to test
|
|
1872
|
+
find_blob_limits (blob->blob (),
|
|
1873
|
+
(float) -MAX_INT16, //rotated lower limit
|
|
1874
|
+
-(baseline + 1.1 * row->xheight),
|
|
1875
|
+
//rotated upper limit
|
|
1876
|
+
FCOORD (0.0, 1.0), //90deg anticlock rot
|
|
1877
|
+
left_limit, junk); //min y max_y
|
|
1878
|
+
else
|
|
1879
|
+
//blob to test
|
|
1880
|
+
find_cblob_hlimits (blob->cblob (),
|
|
1881
|
+
//rotated lower limit
|
|
1882
|
+
(baseline + 1.1 * row->xheight), (float) MAX_INT16,
|
|
1883
|
+
//rotated upper limit
|
|
1884
|
+
// FCOORD( 0.0, 1.0 ), //90deg anticlock rot
|
|
1885
|
+
left_limit, junk); //min y max_y
|
|
1886
|
+
if (left_limit > junk)
|
|
1887
|
+
*left_above_xht = MAX_INT16; //No area above xht
|
|
1888
|
+
else
|
|
1889
|
+
*left_above_xht = (inT16) floor (left_limit);
|
|
1890
|
+
/*
|
|
1891
|
+
Find reduced LH limit of blob - the left extent of the region ABOVE the
|
|
1892
|
+
baseline.
|
|
1893
|
+
*/
|
|
1894
|
+
left_limit = (float) MAX_INT32;
|
|
1895
|
+
junk = (float) -MAX_INT32;
|
|
1896
|
+
if (blob->blob () != NULL)
|
|
1897
|
+
//blob to test
|
|
1898
|
+
find_blob_limits (blob->blob (),
|
|
1899
|
+
(float) -MAX_INT16, //rotated lower limit
|
|
1900
|
+
-baseline, //rotated upper limit
|
|
1901
|
+
FCOORD (0.0, 1.0), //90deg anticlock rot
|
|
1902
|
+
left_limit, junk); //min y max_y
|
|
1903
|
+
else
|
|
1904
|
+
//blob to test
|
|
1905
|
+
find_cblob_hlimits (blob->cblob (),
|
|
1906
|
+
baseline, //rotated upper limit
|
|
1907
|
+
(float) MAX_INT16, //rotated lower limit
|
|
1908
|
+
// FCOORD( 0.0, 1.0 ), //90deg anticlock rot
|
|
1909
|
+
left_limit, junk); //min y max_y
|
|
1910
|
+
|
|
1911
|
+
if (left_limit > junk)
|
|
1912
|
+
return TBOX (); //no area within xht so return empty box
|
|
1913
|
+
/*
|
|
1914
|
+
Find reduced RH limit of blob - the right extent of the region BELOW the xht.
|
|
1915
|
+
*/
|
|
1916
|
+
junk = (float) MAX_INT32;
|
|
1917
|
+
right_limit = (float) -MAX_INT32;
|
|
1918
|
+
if (blob->blob () != NULL)
|
|
1919
|
+
//blob to test
|
|
1920
|
+
find_blob_limits (blob->blob (),
|
|
1921
|
+
-(baseline + row->xheight),
|
|
1922
|
+
//rotated lower limit
|
|
1923
|
+
(float) MAX_INT16, //rotated upper limit
|
|
1924
|
+
FCOORD (0.0, 1.0), //90deg anticlock rot
|
|
1925
|
+
junk, right_limit); //min y max_y
|
|
1926
|
+
else
|
|
1927
|
+
//blob to test
|
|
1928
|
+
find_cblob_hlimits (blob->cblob (),
|
|
1929
|
+
(float) -MAX_INT16, //rotated upper limit
|
|
1930
|
+
(baseline + row->xheight),
|
|
1931
|
+
//rotated lower limit
|
|
1932
|
+
// FCOORD( 0.0, 1.0 ), //90deg anticlock rot
|
|
1933
|
+
junk, right_limit); //min y max_y
|
|
1934
|
+
if (junk > right_limit)
|
|
1935
|
+
return TBOX (); //no area within xht so return empty box
|
|
1936
|
+
|
|
1937
|
+
return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
|
|
1938
|
+
ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
|
|
1939
|
+
}
|