tesseract_bin 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
@@ -0,0 +1,1481 @@
|
|
1
|
+
/******************************************************************
|
2
|
+
* File: docqual.cpp (Formerly docqual.c)
|
3
|
+
* Description: Document Quality Metrics
|
4
|
+
* Author: Phil Cheatle
|
5
|
+
* Created: Mon May 9 11:27:28 BST 1994
|
6
|
+
*
|
7
|
+
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
8
|
+
** Licensed under the Apache License, Version 2.0 (the "License");
|
9
|
+
** you may not use this file except in compliance with the License.
|
10
|
+
** You may obtain a copy of the License at
|
11
|
+
** http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
** Unless required by applicable law or agreed to in writing, software
|
13
|
+
** distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
** See the License for the specific language governing permissions and
|
16
|
+
** limitations under the License.
|
17
|
+
*
|
18
|
+
**********************************************************************/
|
19
|
+
|
20
|
+
#include "mfcpch.h"
|
21
|
+
#include <ctype.h>
|
22
|
+
#include "docqual.h"
|
23
|
+
#include "tstruct.h"
|
24
|
+
#include "tfacep.h"
|
25
|
+
#include "reject.h"
|
26
|
+
#include "tessvars.h"
|
27
|
+
#include "genblob.h"
|
28
|
+
#include "secname.h"
|
29
|
+
#include "globals.h"
|
30
|
+
|
31
|
+
#define EXTERN
|
32
|
+
|
33
|
+
/* Characters whose outline count is not the default of one.
   Both strings are read by count_outline_errs(). */
EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
                   "Non standard number of outlines");

/* Quality based unrejection and page / block / row rejection controls. */
EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
                 "Allow outline errs in unrejection?");
EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
                 "Reduce rejection on good docs");
EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
                   "%rej allowed before rej whole doc");
EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
                   "%rej allowed before rej whole block");
EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
                   "%rej allowed before rej whole row");
EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
                   "%of row rejects in whole word rejects which prevents whole row rejection");
EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
                 "Only rej partially rejected words in block rejection");
EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
                 "Only rej partially rejected words in row rejection");
EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
                 "Use word segmentation quality metric");
EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
                 "Use word segmentation quality metric");
EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
                "Only preserve wds longer than this");
EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
                 "Apply row rejection to good docs");
EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
                   "rej good doc wd if more than this fraction rejected");
EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
                 "Reject all bad quality wds");
EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
                 "Output data to debug file");
EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
EXTERN double_VAR (quality_rowrej_pc, 1.1,
                   "good_quality_doc gte good char limit");

/* Tilde crunching switches. */
EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
                 "Mark v.bad words for tilde crunch");
EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
                  "Take out ~^ early?");

EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
                   "crunch garbage cert lt this");
EXTERN double_VAR (crunch_poor_garbage_rate, 60,
                   "crunch garbage rating lt this");

EXTERN double_VAR (crunch_pot_poor_rate, 40,
                   "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
                   "POTENTIAL crunch cert lt this");
EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");

EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
EXTERN double_VAR (crunch_del_min_width, 3.0,
                   "Del if word width lt xht x this");
EXTERN double_VAR (crunch_del_high_word, 1.5,
                   "Del if word gt xht x this above bl");
EXTERN double_VAR (crunch_del_low_word, 0.5,
                   "Del if word gt xht x this below bl");
EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");

EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
EXTERN INT_VAR (crunch_pot_indicators, 1,
                "How many potential indicators needed");

EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
                 "Dont touch sensible strings");
EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
                 "Dont pot crunch sensible strings");
EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
EXTERN INT_VAR (crunch_leave_lc_strings, 4,
                "Dont crunch words with long lower case strings");
/* Help text previously duplicated the lower case wording of the variable
   above; this is the upper case counterpart. */
EXTERN INT_VAR (crunch_leave_uc_strings, 4,
                "Dont crunch words with long upper case strings");
EXTERN INT_VAR (crunch_long_repetitions, 3,
                "Crunch words with long repetitions");

EXTERN INT_VAR (crunch_debug, 0, "As it says");
|
121
|
+
|
122
|
+
/*************************************************************************
|
123
|
+
* word_blob_quality()
|
124
|
+
* How many blobs in the outword are identical to those of the inword?
|
125
|
+
* ASSUME blobs in both initial word and outword are in ascending order of
|
126
|
+
* left hand blob edge.
|
127
|
+
*************************************************************************/
|
128
|
+
inT16 word_blob_quality( //Blob seg changes
|
129
|
+
WERD_RES *word,
|
130
|
+
ROW *row) {
|
131
|
+
WERD *bln_word; //BL norm init word
|
132
|
+
TWERD *tessword; //tess format
|
133
|
+
WERD *init_word; //BL norm init word
|
134
|
+
PBLOB_IT outword_it;
|
135
|
+
PBLOB_IT initial_it;
|
136
|
+
inT16 i;
|
137
|
+
inT16 init_blobs_left;
|
138
|
+
inT16 match_count = 0;
|
139
|
+
BOOL8 matched;
|
140
|
+
TBOX out_box;
|
141
|
+
PBLOB *test_blob;
|
142
|
+
DENORM denorm;
|
143
|
+
float bln_xht;
|
144
|
+
|
145
|
+
if (word->word->gblob_list ()->empty ())
|
146
|
+
return 0;
|
147
|
+
//xht used for blnorm
|
148
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
149
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
150
|
+
/*
|
151
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
152
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
153
|
+
comparison
|
154
|
+
*/
|
155
|
+
// if (!bln_word->flag(W_POLYGON))
|
156
|
+
// tprintf( "NON POLYGON BLN WERD\n");
|
157
|
+
tessword = make_tess_word (bln_word, NULL);
|
158
|
+
//convert word
|
159
|
+
init_word = make_ed_word (tessword, bln_word);
|
160
|
+
// if (!init_word->flag(W_POLYGON))
|
161
|
+
// tprintf( "NON POLYGON INIT WERD\n");
|
162
|
+
// tprintf( "SOURCE BLOBS-AFTER TESS:\n");
|
163
|
+
// print_boxes( init_word );
|
164
|
+
// tprintf( "OUTPUT BLOBS:\n");
|
165
|
+
// print_boxes( word->outword );
|
166
|
+
|
167
|
+
initial_it.set_to_list (init_word->blob_list ());
|
168
|
+
init_blobs_left = initial_it.length ();
|
169
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
170
|
+
delete bln_word;
|
171
|
+
delete_word(tessword); //get rid of it
|
172
|
+
|
173
|
+
for (outword_it.mark_cycle_pt ();
|
174
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
175
|
+
out_box = outword_it.data ()->bounding_box ();
|
176
|
+
|
177
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
178
|
+
while (!initial_it.at_last () &&
|
179
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
180
|
+
initial_it.forward ();
|
181
|
+
init_blobs_left--;
|
182
|
+
}
|
183
|
+
|
184
|
+
/* See if current outword blob matches any initial blob with the same left
|
185
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
186
|
+
|
187
|
+
i = 0;
|
188
|
+
matched = FALSE;
|
189
|
+
do {
|
190
|
+
test_blob = initial_it.data_relative (i++);
|
191
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
192
|
+
if (matched)
|
193
|
+
match_count++;
|
194
|
+
}
|
195
|
+
while (!matched &&
|
196
|
+
(init_blobs_left - i > 0) &&
|
197
|
+
(i < 129) &&
|
198
|
+
!initial_it.at_last () &&
|
199
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
200
|
+
}
|
201
|
+
delete init_word;
|
202
|
+
return match_count;
|
203
|
+
}
|
204
|
+
|
205
|
+
|
206
|
+
/*************************************************************************
|
207
|
+
* crude_match_blobs()
|
208
|
+
* Check bounding boxes are the same and the number of outlines are the same.
|
209
|
+
*************************************************************************/
|
210
|
+
BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
|
211
|
+
TBOX box1 = blob1->bounding_box ();
|
212
|
+
TBOX box2 = blob2->bounding_box ();
|
213
|
+
|
214
|
+
if (box1.contains (box2) &&
|
215
|
+
box2.contains (box1) &&
|
216
|
+
(blob1->out_list ()->length () == blob1->out_list ()->length ()))
|
217
|
+
return TRUE;
|
218
|
+
else
|
219
|
+
return FALSE;
|
220
|
+
}
|
221
|
+
|
222
|
+
|
223
|
+
/* Sums count_outline_errs() over the blobs of word->outword, pairing the
   n-th blob with the n-th char of word->best_choice->string(). */
inT16 word_outline_errs(WERD_RES *word) {
  inT16 err_total = 0;
  inT16 char_index = 0;
  PBLOB_IT blob_it;

  blob_it.set_to_list(word->outword->blob_list());
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
       blob_it.forward(), char_index++) {
    err_total +=
      count_outline_errs(word->best_choice->string()[char_index],
                         blob_it.data()->out_list()->length());
  }
  return err_total;
}
|
240
|
+
|
241
|
+
|
242
|
+
/*************************************************************************
|
243
|
+
* word_char_quality()
|
244
|
+
* Combination of blob quality and outline quality - how many good chars are
|
245
|
+
* there? - I.e chars which pass the blob AND outline tests.
|
246
|
+
*************************************************************************/
|
247
|
+
void word_char_quality( //Blob seg changes
|
248
|
+
WERD_RES *word,
|
249
|
+
ROW *row,
|
250
|
+
inT16 *match_count,
|
251
|
+
inT16 *accepted_match_count) {
|
252
|
+
WERD *bln_word; //BL norm init word
|
253
|
+
TWERD *tessword; //tess format
|
254
|
+
WERD *init_word; //BL norm init word
|
255
|
+
PBLOB_IT outword_it;
|
256
|
+
PBLOB_IT initial_it;
|
257
|
+
inT16 i;
|
258
|
+
inT16 init_blobs_left;
|
259
|
+
BOOL8 matched;
|
260
|
+
TBOX out_box;
|
261
|
+
PBLOB *test_blob;
|
262
|
+
DENORM denorm;
|
263
|
+
float bln_xht;
|
264
|
+
inT16 j = 0;
|
265
|
+
|
266
|
+
*match_count = 0;
|
267
|
+
*accepted_match_count = 0;
|
268
|
+
if (word->word->gblob_list ()->empty ())
|
269
|
+
return;
|
270
|
+
|
271
|
+
//xht used for blnorm
|
272
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
273
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
274
|
+
/*
|
275
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
276
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
277
|
+
comparison
|
278
|
+
*/
|
279
|
+
tessword = make_tess_word (bln_word, NULL);
|
280
|
+
//convert word
|
281
|
+
init_word = make_ed_word (tessword, bln_word);
|
282
|
+
delete bln_word;
|
283
|
+
delete_word(tessword); //get rid of it
|
284
|
+
// tprintf( "SOURCE BLOBS-AFTER TESS:\n");
|
285
|
+
// print_boxes( init_word );
|
286
|
+
// tprintf( "OUTPUT BLOBS:\n");
|
287
|
+
// print_boxes( word->outword );
|
288
|
+
|
289
|
+
initial_it.set_to_list (init_word->blob_list ());
|
290
|
+
init_blobs_left = initial_it.length ();
|
291
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
292
|
+
|
293
|
+
for (outword_it.mark_cycle_pt ();
|
294
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
295
|
+
out_box = outword_it.data ()->bounding_box ();
|
296
|
+
|
297
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
298
|
+
while (!initial_it.at_last () &&
|
299
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
300
|
+
initial_it.forward ();
|
301
|
+
init_blobs_left--;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* See if current outword blob matches any initial blob with the same left
|
305
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
306
|
+
|
307
|
+
i = 0;
|
308
|
+
matched = FALSE;
|
309
|
+
do {
|
310
|
+
test_blob = initial_it.data_relative (i++);
|
311
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
312
|
+
if (matched &&
|
313
|
+
(count_outline_errs (word->best_choice->string ()[j],
|
314
|
+
outword_it.data ()->out_list ()->length ())
|
315
|
+
== 0)) {
|
316
|
+
(*match_count)++;
|
317
|
+
if (word->reject_map[j].accepted ())
|
318
|
+
(*accepted_match_count)++;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
while (!matched &&
|
322
|
+
(init_blobs_left - i > 0) &&
|
323
|
+
(i < 129) &&
|
324
|
+
!initial_it.at_last () &&
|
325
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
326
|
+
j++;
|
327
|
+
}
|
328
|
+
delete init_word;
|
329
|
+
}
|
330
|
+
|
331
|
+
|
332
|
+
/*************************************************************************
|
333
|
+
* unrej_good_chs()
|
334
|
+
* Unreject POTENTIAL rejects if the blob passes the blob and outline checks
|
335
|
+
*************************************************************************/
|
336
|
+
void unrej_good_chs(WERD_RES *word, ROW *row) {
|
337
|
+
WERD *bln_word; //BL norm init word
|
338
|
+
TWERD *tessword; //tess format
|
339
|
+
WERD *init_word; //BL norm init word
|
340
|
+
PBLOB_IT outword_it;
|
341
|
+
PBLOB_IT initial_it;
|
342
|
+
inT16 i;
|
343
|
+
inT16 init_blobs_left;
|
344
|
+
BOOL8 matched;
|
345
|
+
TBOX out_box;
|
346
|
+
PBLOB *test_blob;
|
347
|
+
DENORM denorm;
|
348
|
+
float bln_xht;
|
349
|
+
inT16 j = 0;
|
350
|
+
|
351
|
+
if (word->word->gblob_list ()->empty ())
|
352
|
+
return;
|
353
|
+
|
354
|
+
//xht used for blnorm
|
355
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
356
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
357
|
+
/*
|
358
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
359
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
360
|
+
comparison
|
361
|
+
*/
|
362
|
+
tessword = make_tess_word (bln_word, NULL);
|
363
|
+
//convert word
|
364
|
+
init_word = make_ed_word (tessword, bln_word);
|
365
|
+
delete bln_word;
|
366
|
+
delete_word(tessword); //get rid of it
|
367
|
+
|
368
|
+
initial_it.set_to_list (init_word->blob_list ());
|
369
|
+
init_blobs_left = initial_it.length ();
|
370
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
371
|
+
|
372
|
+
for (outword_it.mark_cycle_pt ();
|
373
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
374
|
+
out_box = outword_it.data ()->bounding_box ();
|
375
|
+
|
376
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
377
|
+
while (!initial_it.at_last () &&
|
378
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
379
|
+
initial_it.forward ();
|
380
|
+
init_blobs_left--;
|
381
|
+
}
|
382
|
+
|
383
|
+
/* See if current outword blob matches any initial blob with the same left
|
384
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
385
|
+
|
386
|
+
i = 0;
|
387
|
+
matched = FALSE;
|
388
|
+
do {
|
389
|
+
test_blob = initial_it.data_relative (i++);
|
390
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
391
|
+
if (matched &&
|
392
|
+
(word->reject_map[j].accept_if_good_quality ()) &&
|
393
|
+
(docqual_excuse_outline_errs ||
|
394
|
+
(count_outline_errs (word->best_choice->string ()[j],
|
395
|
+
outword_it.data ()->out_list ()->
|
396
|
+
length ()) == 0)))
|
397
|
+
word->reject_map[j].setrej_quality_accept ();
|
398
|
+
}
|
399
|
+
while (!matched &&
|
400
|
+
(init_blobs_left - i > 0) &&
|
401
|
+
(i < 129) &&
|
402
|
+
!initial_it.at_last () &&
|
403
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
404
|
+
j++;
|
405
|
+
}
|
406
|
+
delete init_word;
|
407
|
+
}
|
408
|
+
|
409
|
+
|
410
|
+
/* Debug aid: prints the bounding box of every blob in the word. */
void print_boxes(WERD *word) {
  PBLOB_IT blob_it;

  blob_it.set_to_list(word->blob_list());
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    TBOX blob_box = blob_it.data()->bounding_box();
    blob_box.print();
    blob_it.forward();
  }
}
|
420
|
+
|
421
|
+
|
422
|
+
/* Returns how far outline_count is from the number of outlines expected
   for char c: 2 for chars listed in outlines_2, otherwise 1. Chars listed
   in outlines_odd are not checked and always give 0. */
inT16 count_outline_errs(char c, inT16 outline_count) {
  // Dont use this char
  if (STRING(outlines_odd).contains(c))
    return 0;

  int expected_outline_count = 1;
  if (STRING(outlines_2).contains(c))
    expected_outline_count = 2;

  return abs(outline_count - expected_outline_count);
}
|
433
|
+
|
434
|
+
|
435
|
+
/* Runs the quality based passes over the page in order:
     1. unrej_good_quality_words() - only if tessedit_good_quality_unrej is
        set and good_quality_doc is true;
     2. doc_and_block_rejection();
     3. insert_rej_cblobs() on every word;
     4. tilde_crunch() then tilde_delete() - only if unlv_tilde_crunching
        is set. */
void quality_based_rejection(PAGE_RES_IT &page_res_it,
                             BOOL8 good_quality_doc) {
  if (tessedit_good_quality_unrej && good_quality_doc) {
    unrej_good_quality_words(page_res_it);
  }
  doc_and_block_rejection(page_res_it, good_quality_doc);

  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    insert_rej_cblobs(page_res_it.word());
  }

  if (unlv_tilde_crunching) {
    tilde_crunch(page_res_it);
    tilde_delete(page_res_it);
  }
}
|
452
|
+
|
453
|
+
|
454
|
+
/*************************************************************************
|
455
|
+
* unrej_good_quality_words()
|
456
|
+
* Accept potential rejects in words which pass the following checks:
|
457
|
+
* - Contains a potential reject
|
458
|
+
* - Word looks like a sensible alpha word.
|
459
|
+
* - Word segmentation is the same as the original image
|
460
|
+
* - All characters have the expected number of outlines
|
461
|
+
* NOTE - the rejection counts are recalculated after unrejection
|
462
|
+
* - CANT do it in a single pass without a bit of fiddling
|
463
|
+
* - keep it simple but inefficient
|
464
|
+
*************************************************************************/
|
465
|
+
void unrej_good_quality_words( //unreject potential
|
466
|
+
PAGE_RES_IT &page_res_it) {
|
467
|
+
WERD_RES *word;
|
468
|
+
ROW_RES *current_row;
|
469
|
+
BLOCK_RES *current_block;
|
470
|
+
int i;
|
471
|
+
|
472
|
+
page_res_it.restart_page ();
|
473
|
+
while (page_res_it.word () != NULL) {
|
474
|
+
check_debug_pt (page_res_it.word (), 100);
|
475
|
+
if (bland_unrej) {
|
476
|
+
word = page_res_it.word ();
|
477
|
+
for (i = 0; i < word->reject_map.length (); i++) {
|
478
|
+
if (word->reject_map[i].accept_if_good_quality ())
|
479
|
+
word->reject_map[i].setrej_quality_accept ();
|
480
|
+
}
|
481
|
+
page_res_it.forward ();
|
482
|
+
}
|
483
|
+
else if ((page_res_it.row ()->char_count > 0) &&
|
484
|
+
((page_res_it.row ()->rej_count /
|
485
|
+
(float) page_res_it.row ()->char_count) <=
|
486
|
+
quality_rowrej_pc)) {
|
487
|
+
word = page_res_it.word ();
|
488
|
+
if (word->reject_map.quality_recoverable_rejects () &&
|
489
|
+
(tessedit_unrej_any_wd ||
|
490
|
+
acceptable_word_string (word->best_choice->string ().string (),
|
491
|
+
word->best_choice->lengths().string())
|
492
|
+
!= AC_UNACCEPTABLE)) {
|
493
|
+
unrej_good_chs (word, page_res_it.row ()->row);
|
494
|
+
}
|
495
|
+
page_res_it.forward ();
|
496
|
+
}
|
497
|
+
else {
|
498
|
+
/* Skip to end of dodgy row */
|
499
|
+
current_row = page_res_it.row ();
|
500
|
+
while ((page_res_it.word () != NULL) &&
|
501
|
+
(page_res_it.row () == current_row))
|
502
|
+
page_res_it.forward ();
|
503
|
+
}
|
504
|
+
check_debug_pt (page_res_it.word (), 110);
|
505
|
+
}
|
506
|
+
page_res_it.restart_page ();
|
507
|
+
page_res_it.page_res->char_count = 0;
|
508
|
+
page_res_it.page_res->rej_count = 0;
|
509
|
+
current_block = NULL;
|
510
|
+
current_row = NULL;
|
511
|
+
while (page_res_it.word () != NULL) {
|
512
|
+
if (current_block != page_res_it.block ()) {
|
513
|
+
current_block = page_res_it.block ();
|
514
|
+
current_block->char_count = 0;
|
515
|
+
current_block->rej_count = 0;
|
516
|
+
}
|
517
|
+
if (current_row != page_res_it.row ()) {
|
518
|
+
current_row = page_res_it.row ();
|
519
|
+
current_row->char_count = 0;
|
520
|
+
current_row->rej_count = 0;
|
521
|
+
current_row->whole_word_rej_count = 0;
|
522
|
+
}
|
523
|
+
page_res_it.rej_stat_word ();
|
524
|
+
page_res_it.forward ();
|
525
|
+
}
|
526
|
+
}
|
527
|
+
|
528
|
+
|
529
|
+
/*************************************************************************
|
530
|
+
* doc_and_block_rejection()
|
531
|
+
*
|
532
|
+
* If the page has too many rejects - reject all of it.
|
533
|
+
* If any block has too many rejects - reject all words in the block
|
534
|
+
*************************************************************************/
|
535
|
+
|
536
|
+
void doc_and_block_rejection( //reject big chunks
|
537
|
+
PAGE_RES_IT &page_res_it,
|
538
|
+
BOOL8 good_quality_doc) {
|
539
|
+
inT16 block_no = 0;
|
540
|
+
inT16 row_no = 0;
|
541
|
+
BLOCK_RES *current_block;
|
542
|
+
ROW_RES *current_row;
|
543
|
+
|
544
|
+
BOOL8 rej_word;
|
545
|
+
BOOL8 prev_word_rejected;
|
546
|
+
inT16 char_quality;
|
547
|
+
inT16 accepted_char_quality;
|
548
|
+
|
549
|
+
if ((page_res_it.page_res->rej_count * 100.0 /
|
550
|
+
page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
|
551
|
+
reject_whole_page(page_res_it);
|
552
|
+
#ifndef SECURE_NAMES
|
553
|
+
if (tessedit_debug_doc_rejection) {
|
554
|
+
tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
|
555
|
+
page_res_it.page_res->char_count,
|
556
|
+
page_res_it.page_res->rej_count);
|
557
|
+
}
|
558
|
+
#endif
|
559
|
+
}
|
560
|
+
else {
|
561
|
+
#ifndef SECURE_NAMES
|
562
|
+
if (tessedit_debug_doc_rejection)
|
563
|
+
tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
|
564
|
+
page_res_it.page_res->char_count,
|
565
|
+
page_res_it.page_res->rej_count);
|
566
|
+
#endif
|
567
|
+
|
568
|
+
/* Walk blocks testing for block rejection */
|
569
|
+
|
570
|
+
page_res_it.restart_page ();
|
571
|
+
while (page_res_it.word () != NULL) {
|
572
|
+
current_block = page_res_it.block ();
|
573
|
+
if (current_block->block->text_region () != NULL)
|
574
|
+
block_no = current_block->block->text_region ()->id_no ();
|
575
|
+
else
|
576
|
+
block_no = -1;
|
577
|
+
if ((page_res_it.block ()->char_count > 0) &&
|
578
|
+
((page_res_it.block ()->rej_count * 100.0 /
|
579
|
+
page_res_it.block ()->char_count) >
|
580
|
+
tessedit_reject_block_percent)) {
|
581
|
+
#ifndef SECURE_NAMES
|
582
|
+
if (tessedit_debug_block_rejection)
|
583
|
+
tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
|
584
|
+
block_no,
|
585
|
+
page_res_it.block ()->char_count,
|
586
|
+
page_res_it.block ()->rej_count);
|
587
|
+
#endif
|
588
|
+
prev_word_rejected = FALSE;
|
589
|
+
while ((page_res_it.word () != NULL) &&
|
590
|
+
(page_res_it.block () == current_block)) {
|
591
|
+
if (tessedit_preserve_blk_rej_perfect_wds) {
|
592
|
+
rej_word =
|
593
|
+
(page_res_it.word ()->reject_map.reject_count () > 0)
|
594
|
+
|| (page_res_it.word ()->reject_map.length () <
|
595
|
+
tessedit_preserve_min_wd_len);
|
596
|
+
if (rej_word && tessedit_dont_blkrej_good_wds
|
597
|
+
&& !(page_res_it.word ()->reject_map.length () <
|
598
|
+
tessedit_preserve_min_wd_len)
|
599
|
+
&&
|
600
|
+
(acceptable_word_string
|
601
|
+
(page_res_it.word ()->best_choice->string ().
|
602
|
+
string (),
|
603
|
+
page_res_it.word ()->best_choice->lengths ().
|
604
|
+
string ()) != AC_UNACCEPTABLE)) {
|
605
|
+
word_char_quality (page_res_it.word (),
|
606
|
+
page_res_it.row ()->row,
|
607
|
+
&char_quality,
|
608
|
+
&accepted_char_quality);
|
609
|
+
rej_word = char_quality !=
|
610
|
+
page_res_it.word ()->reject_map.length ();
|
611
|
+
}
|
612
|
+
}
|
613
|
+
else
|
614
|
+
rej_word = TRUE;
|
615
|
+
if (rej_word) {
|
616
|
+
/*
|
617
|
+
Reject spacing if both current and prev words are rejected.
|
618
|
+
NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
|
619
|
+
more space errors.
|
620
|
+
*/
|
621
|
+
if (tessedit_use_reject_spaces &&
|
622
|
+
prev_word_rejected &&
|
623
|
+
(page_res_it.prev_row () == page_res_it.row ()) &&
|
624
|
+
(page_res_it.word ()->word->space () == 1))
|
625
|
+
page_res_it.word ()->reject_spaces = TRUE;
|
626
|
+
page_res_it.word ()->reject_map.rej_word_block_rej ();
|
627
|
+
}
|
628
|
+
prev_word_rejected = rej_word;
|
629
|
+
page_res_it.forward ();
|
630
|
+
}
|
631
|
+
}
|
632
|
+
else {
|
633
|
+
#ifndef SECURE_NAMES
|
634
|
+
if (tessedit_debug_block_rejection)
|
635
|
+
tprintf
|
636
|
+
("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
|
637
|
+
block_no, page_res_it.block ()->char_count,
|
638
|
+
page_res_it.block ()->rej_count);
|
639
|
+
#endif
|
640
|
+
|
641
|
+
/* Walk rows in block testing for row rejection */
|
642
|
+
row_no = 0;
|
643
|
+
while ((page_res_it.word () != NULL) &&
|
644
|
+
(page_res_it.block () == current_block)) {
|
645
|
+
current_row = page_res_it.row ();
|
646
|
+
row_no++;
|
647
|
+
/* Reject whole row if:
|
648
|
+
fraction of chars on row which are rejected exceed a limit AND
|
649
|
+
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
|
650
|
+
*/
|
651
|
+
if ((page_res_it.row ()->char_count > 0) &&
|
652
|
+
((page_res_it.row ()->rej_count * 100.0 /
|
653
|
+
page_res_it.row ()->char_count) >
|
654
|
+
tessedit_reject_row_percent) &&
|
655
|
+
((page_res_it.row ()->whole_word_rej_count * 100.0 /
|
656
|
+
page_res_it.row ()->rej_count) <
|
657
|
+
tessedit_whole_wd_rej_row_percent)) {
|
658
|
+
#ifndef SECURE_NAMES
|
659
|
+
if (tessedit_debug_block_rejection)
|
660
|
+
tprintf
|
661
|
+
("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
|
662
|
+
row_no, page_res_it.row ()->char_count,
|
663
|
+
page_res_it.row ()->rej_count);
|
664
|
+
#endif
|
665
|
+
prev_word_rejected = FALSE;
|
666
|
+
while ((page_res_it.word () != NULL) &&
|
667
|
+
(page_res_it.row () == current_row)) {
|
668
|
+
/* Preserve words on good docs unless they are mostly rejected*/
|
669
|
+
if (!tessedit_row_rej_good_docs && good_quality_doc) {
|
670
|
+
rej_word =
|
671
|
+
page_res_it.word ()->reject_map.
|
672
|
+
reject_count () /
|
673
|
+
(float) page_res_it.word ()->reject_map.
|
674
|
+
length () > tessedit_good_doc_still_rowrej_wd;
|
675
|
+
}
|
676
|
+
|
677
|
+
/* Preserve perfect words anyway */
|
678
|
+
else if (tessedit_preserve_row_rej_perfect_wds) {
|
679
|
+
rej_word =
|
680
|
+
(page_res_it.word ()->reject_map.
|
681
|
+
reject_count () > 0)
|
682
|
+
|| (page_res_it.word ()->reject_map.
|
683
|
+
length () < tessedit_preserve_min_wd_len);
|
684
|
+
if (rej_word && tessedit_dont_rowrej_good_wds
|
685
|
+
&& !(page_res_it.word ()->reject_map.
|
686
|
+
length () <
|
687
|
+
tessedit_preserve_min_wd_len)
|
688
|
+
&&
|
689
|
+
(acceptable_word_string
|
690
|
+
(page_res_it.word ()->best_choice->
|
691
|
+
string ().string (),
|
692
|
+
page_res_it.word ()->best_choice->
|
693
|
+
lengths ().string ()) != AC_UNACCEPTABLE)) {
|
694
|
+
word_char_quality (page_res_it.word (),
|
695
|
+
page_res_it.row ()->row,
|
696
|
+
&char_quality,
|
697
|
+
&accepted_char_quality);
|
698
|
+
rej_word = char_quality !=
|
699
|
+
page_res_it.word ()->reject_map.length ();
|
700
|
+
}
|
701
|
+
}
|
702
|
+
else
|
703
|
+
rej_word = TRUE;
|
704
|
+
if (rej_word) {
|
705
|
+
/*
|
706
|
+
Reject spacing if both current and prev words are rejected.
|
707
|
+
NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
|
708
|
+
more space errors.
|
709
|
+
*/
|
710
|
+
if (tessedit_use_reject_spaces &&
|
711
|
+
prev_word_rejected &&
|
712
|
+
(page_res_it.prev_row () ==
|
713
|
+
page_res_it.row ())
|
714
|
+
&& (page_res_it.word ()->word->space () ==
|
715
|
+
1))
|
716
|
+
page_res_it.word ()->reject_spaces = TRUE;
|
717
|
+
page_res_it.word ()->reject_map.
|
718
|
+
rej_word_row_rej();
|
719
|
+
}
|
720
|
+
prev_word_rejected = rej_word;
|
721
|
+
page_res_it.forward ();
|
722
|
+
}
|
723
|
+
}
|
724
|
+
else {
|
725
|
+
#ifndef SECURE_NAMES
|
726
|
+
if (tessedit_debug_block_rejection)
|
727
|
+
tprintf
|
728
|
+
("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
|
729
|
+
row_no, page_res_it.row ()->char_count,
|
730
|
+
page_res_it.row ()->rej_count);
|
731
|
+
#endif
|
732
|
+
while ((page_res_it.word () != NULL) &&
|
733
|
+
(page_res_it.row () == current_row))
|
734
|
+
page_res_it.forward ();
|
735
|
+
}
|
736
|
+
}
|
737
|
+
}
|
738
|
+
}
|
739
|
+
}
|
740
|
+
}
|
741
|
+
|
742
|
+
|
743
|
+
/*************************************************************************
|
744
|
+
* reject_whole_page()
|
745
|
+
* Dont believe any of it - set the reject map to 00..00 in all words
|
746
|
+
*
|
747
|
+
*************************************************************************/
|
748
|
+
|
749
|
+
void reject_whole_page(PAGE_RES_IT &page_res_it) {
|
750
|
+
page_res_it.restart_page ();
|
751
|
+
while (page_res_it.word () != NULL) {
|
752
|
+
page_res_it.word ()->reject_map.rej_word_doc_rej ();
|
753
|
+
page_res_it.forward ();
|
754
|
+
}
|
755
|
+
//whole page is rejected
|
756
|
+
page_res_it.page_res->rejected = TRUE;
|
757
|
+
}
|
758
|
+
|
759
|
+
|
760
|
+
/*************************************************************************
 * tilde_crunch()
 * Walk the page from its first word and set unlv_crunch_mode to
 * CR_KEEP_SPACE on the words judged to be garbage.
 *
 * Only words whose reject map has no accepted characters are candidates.
 * A candidate flagged by terrible_word_crunch () is marked immediately.
 * A candidate flagged only by potential_word_crunch () is marked when it
 * sits in an unbroken run of candidates that also holds a terrible word:
 * potential words seen before the terrible word are marked in retrospect,
 * those following it are marked as they are reached.  A word with accepted
 * characters, or one that is neither terrible nor potential, ends the run.
 *
 * Depending on the crunch_early_* settings, convert_bad_unlv_chs () and
 * merge_tess_fails () are applied to each word before it is examined.
 *************************************************************************/
void tilde_crunch(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;           //first queued potential word
  BOOL8 have_pending_run = FALSE;     //run_start_it is meaningful
  BOOL8 after_terrible = FALSE;       //a terrible word started this run

  for (page_res_it.restart_page ();
       page_res_it.word () != NULL;
       page_res_it.forward ()) {
    WERD_RES *cur_word = page_res_it.word ();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(cur_word);

    if (crunch_early_merge_tess_fails)
      merge_tess_fails(cur_word);

    if (cur_word->reject_map.accept_count () != 0) {
      //Something was accepted: forget earlier potential crunches
      after_terrible = FALSE;
      have_pending_run = FALSE;
      continue;
    }

    const int dict_type =
      dict_word (cur_word->best_choice->string ().string ());
    const BOOL8 ok_dict_word =
      (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
    const GARBAGE_LEVEL garbage_level =
      garbage_word (cur_word, ok_dict_word);
    const BOOL8 may_crunch = (garbage_level != G_NEVER_CRUNCH);

    if (may_crunch && terrible_word_crunch (cur_word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf ("T CRUNCHING: \"%s\"\n",
          cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = CR_KEEP_SPACE;
      if (have_pending_run) {
        //Crunch the potential words queued up ahead of this one
        for (; run_start_it.word () != cur_word; run_start_it.forward ()) {
          if (crunch_debug > 0) {
            tprintf ("P1 CRUNCHING: \"%s\"\n",
              run_start_it.word ()->best_choice->string ().string ());
          }
          run_start_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        have_pending_run = FALSE;
      }
      after_terrible = TRUE;
    }
    else if (may_crunch &&
             potential_word_crunch (cur_word, garbage_level, ok_dict_word)) {
      if (after_terrible) {
        //Potential word directly following a terrible run: crunch it now
        if (crunch_debug > 0) {
          tprintf ("P2 CRUNCHING: \"%s\"\n",
            cur_word->best_choice->string ().string ());
        }
        cur_word->unlv_crunch_mode = CR_KEEP_SPACE;
      }
      else if (!have_pending_run) {
        //Remember where this run of potential words begins
        run_start_it = page_res_it;
        have_pending_run = TRUE;
        if (crunch_debug > 1) {
          tprintf ("P3 CRUNCHING: \"%s\"\n",
            cur_word->best_choice->string ().string ());
        }
      }
    }
    else {
      //Forget earlier potential crunches
      after_terrible = FALSE;
      have_pending_run = FALSE;
      if (crunch_debug > 2) {
        tprintf ("NO CRUNCH: \"%s\"\n",
          cur_word->best_choice->string ().string ());
      }
    }
  }
}
|
842
|
+
|
843
|
+
|
844
|
+
/*************************************************************************
 * terrible_word_crunch()
 * Return TRUE if the word should be crunched outright.
 *
 * The reason is reported in the debug message (crunch_debug > 2):
 *   1  the best choice string is empty or consists only of spaces
 *   2  rating per character exceeds crunch_terrible_rating
 *   3  crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 *   4  certainty is below crunch_poor_garbage_cert and the word is not G_OK
 *   5  rating per character exceeds crunch_poor_garbage_rate and the word
 *      is not G_OK
 * The per-character rating divides by the reject map length, capped at
 * crunch_rating_max.
 *************************************************************************/
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  const STRING &text = word->best_choice->string ();
  int reason = 0;                     //0 means "do not crunch"

  if ((text.length () == 0) ||
      (strspn (text.string (), " ") == text.length ())) {
    reason = 1;
  }
  else {
    int capped_len = word->reject_map.length ();
    if (capped_len > crunch_rating_max)
      capped_len = crunch_rating_max;
    const float per_char_rating = word->best_choice->rating () / capped_len;

    if (per_char_rating > crunch_terrible_rating) {
      reason = 2;
    }
    else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      reason = 3;
    }
    else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
             (garbage_level != G_OK)) {
      reason = 4;
    }
    else if ((per_char_rating > crunch_poor_garbage_rate) &&
             (garbage_level != G_OK)) {
      reason = 5;
    }
  }

  if (reason == 0)
    return FALSE;

  if (crunch_debug > 2) {
    tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
      reason, text.string ());
  }
  return TRUE;
}
|
880
|
+
|
881
|
+
|
882
|
+
/*************************************************************************
 * potential_word_crunch()
 * Count up to three "poor" indicators for the word and return TRUE when
 * the count reaches crunch_pot_indicators:
 *   - rating per character above crunch_pot_poor_rate (the divisor is the
 *     reject map length, capped at 10);
 *   - certainty below crunch_pot_poor_cert, counted only for words deemed
 *     crunchable;
 *   - garbage_level other than G_OK.
 * A word is deemed crunchable unless crunch_leave_accept_strings is set,
 * its reject map has at least 3 entries, and it is either an acceptable
 * word string or an ok dictionary word.
 *************************************************************************/
BOOL8 potential_word_crunch(WERD_RES *word,
                            GARBAGE_LEVEL garbage_level,
                            BOOL8 ok_dict_word) {
  const int max_rating_len = 10;      //cap on the rating divisor
  const int map_len = word->reject_map.length ();
  int indicators = 0;

  //Decide first whether a poor certainty is allowed to count
  BOOL8 crunchable = TRUE;
  if (crunch_leave_accept_strings && (map_len >= 3)) {
    const char *text = word->best_choice->string ().string ();
    const char *char_lens = word->best_choice->lengths ().string ();
    crunchable =
      (acceptable_word_string (text, char_lens) == AC_UNACCEPTABLE) &&
      !ok_dict_word;
  }

  const int divisor = (map_len > max_rating_len) ? max_rating_len : map_len;
  const float per_char_rating = word->best_choice->rating () / divisor;

  if (per_char_rating > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf ("Potential poor rating on \"%s\"\n",
        word->best_choice->string ().string ());
    }
    indicators++;
  }

  if (crunchable &&
      (word->best_choice->certainty () < crunch_pot_poor_cert)) {
    if (crunch_debug > 2) {
      tprintf ("Potential poor cert on \"%s\"\n",
        word->best_choice->string ().string ());
    }
    indicators++;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf ("Potential garbage on \"%s\"\n",
        word->best_choice->string ().string ());
    }
    indicators++;
  }

  return (indicators >= crunch_pot_indicators);
}
|
929
|
+
|
930
|
+
|
931
|
+
/*************************************************************************
 * tilde_delete()
 * Walk the page from its first word and set unlv_crunch_mode to the mode
 * returned by word_deletable () on deletable words that are joined to the
 * start or the end of a row:
 *   - a deletable word flagged W_BOL, and every deletable word that
 *     follows it without interruption, is updated as it is reached;
 *   - other deletable words are remembered (first one only) until a
 *     deletable word flagged W_EOL is reached, at which point the whole
 *     remembered run and the W_EOL word are updated;
 *   - a word that is not deletable cancels both kinds of run.
 * When crunch_early_merge_tess_fails is off, merge_tess_fails () is applied
 * to every word after it has been examined.
 *************************************************************************/
void tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT pending_it;             //first word of the remembered run
  BOOL8 have_pending = FALSE;         //pending_it is meaningful
  BOOL8 in_bol_run = FALSE;           //still deleting from start of row

  for (page_res_it.restart_page ();
       page_res_it.word () != NULL;
       page_res_it.forward ()) {
    WERD_RES *cur_word = page_res_it.word ();
    inT16 cur_reason;                 //reason code, for debug only
    const CRUNCH_MODE cur_mode = word_deletable (cur_word, cur_reason);

    if (cur_mode == CR_NONE) {
      in_bol_run = FALSE;
      //Forget earlier potential crunches
      have_pending = FALSE;
    }
    else if (cur_word->word->flag (W_BOL) || in_bol_run) {
      if (crunch_debug > 0) {
        tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
          cur_reason,
          cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = cur_mode;
      in_bol_run = TRUE;
    }
    else if (cur_word->word->flag (W_EOL)) {
      if (have_pending) {
        //Update every word remembered since the run began
        for (; pending_it.word () != cur_word; pending_it.forward ()) {
          inT16 queued_reason;
          const CRUNCH_MODE queued_mode =
            word_deletable (pending_it.word (), queued_reason);
          if (crunch_debug > 0) {
            tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
              queued_reason,
              pending_it.word ()->best_choice->string ().string ());
          }
          pending_it.word ()->unlv_crunch_mode = queued_mode;
        }
      }
      if (crunch_debug > 0) {
        tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
          cur_reason,
          cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = cur_mode;
      in_bol_run = FALSE;
      have_pending = FALSE;
    }
    else if (!have_pending) {
      //Mid-row deletable word: remember where the run starts
      pending_it = page_res_it;
      have_pending = TRUE;
    }

    //This step has been left till now as the tess fails are used to
    //determine if the word is deletable.
    if (!crunch_early_merge_tess_fails)
      merge_tess_fails(cur_word);
  }
}
|
1001
|
+
|
1002
|
+
|
1003
|
+
/*************************************************************************
 * convert_bad_unlv_chs()
 * Rewrite, in place, the single-byte characters '~' and '^' of the best
 * choice string as '-' and ' ' respectively.  If the reject map entry of a
 * rewritten character is currently accepted, setrej_unlv_rej () is called
 * on it.
 *************************************************************************/
void convert_bad_unlv_chs(WERD_RES *word_res) {  //word to do
  //the string buffer is edited in place, hence the cast
  char *text = (char *) word_res->best_choice->string ().string ();
  const STRING &char_lens = word_res->best_choice->lengths ();
  int byte_pos = 0;                   //offset of character ch in text

  for (int ch = 0; ch < word_res->reject_map.length (); ch++) {
    const int ch_len = char_lens[ch];

    if (ch_len == 1) {
      char replacement = '\0';        //'\0' means "leave unchanged"
      if (text[byte_pos] == '~')
        replacement = '-';
      else if (text[byte_pos] == '^')
        replacement = ' ';

      if (replacement != '\0') {
        text[byte_pos] = replacement;
        if (word_res->reject_map[ch].accepted ())
          word_res->reject_map[ch].setrej_unlv_rej ();
      }
    }
    byte_pos += ch_len;
  }
}
|
1026
|
+
|
1027
|
+
|
1028
|
+
/**********************************************************************
|
1029
|
+
* merge_tess_fails
|
1030
|
+
*
|
1031
|
+
* Change pairs of tess failures to a single one
|
1032
|
+
**********************************************************************/
|
1033
|
+
|
1034
|
+
void merge_tess_fails( //word to do
|
1035
|
+
WERD_RES *word_res) {
|
1036
|
+
char *ptr; //string ptr
|
1037
|
+
char *ptr_lengths; //lengths ptr
|
1038
|
+
PBLOB_IT blob_it; //blobs
|
1039
|
+
int i = 0;
|
1040
|
+
int len;
|
1041
|
+
|
1042
|
+
len = strlen (word_res->best_choice->lengths ().string ());
|
1043
|
+
ASSERT_HOST (word_res->reject_map.length () == len);
|
1044
|
+
ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
|
1045
|
+
|
1046
|
+
ptr = (char *) word_res->best_choice->string ().string ();
|
1047
|
+
ptr_lengths = (char *) word_res->best_choice->lengths ().string ();
|
1048
|
+
blob_it = word_res->outword->blob_list ();
|
1049
|
+
while (*ptr != '\0') {
|
1050
|
+
if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
|
1051
|
+
strcpy (ptr + 1, ptr + 2); //shuffle up
|
1052
|
+
strcpy (ptr_lengths + 1, ptr_lengths + 2); //shuffle up
|
1053
|
+
word_res->reject_map.remove_pos (i);
|
1054
|
+
merge_blobs (blob_it.data_relative (1), blob_it.data ());
|
1055
|
+
delete blob_it.extract (); //get rid of spare
|
1056
|
+
}
|
1057
|
+
else {
|
1058
|
+
i++;
|
1059
|
+
ptr += *(ptr_lengths++);
|
1060
|
+
}
|
1061
|
+
blob_it.forward ();
|
1062
|
+
}
|
1063
|
+
len = strlen (word_res->best_choice->lengths ().string ());
|
1064
|
+
ASSERT_HOST (word_res->reject_map.length () == len);
|
1065
|
+
ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
|
1066
|
+
}
|
1067
|
+
|
1068
|
+
|
1069
|
+
/*************************************************************************
 * garbage_word()
 * Grade how much the best choice of a word looks like garbage.
 *
 * The string is scanned one character at a time (byte widths come from the
 * lengths string) with a small state machine that counts: runs of upper
 * and lower case letters, repeated letters, digits, isolated single
 * letters and digits, spaces (counted as tess rejects) and any other
 * character (counted as bad).
 *
 * Returns
 *   G_NEVER_CRUNCH  mostly alphabetic word of 4+ characters that is an
 *                   acceptable string or has a long enough case run
 *                   (subject to the crunch_leave_* / crunch_accept_ok
 *                   settings);
 *   G_OK            word looks plausible;
 *   G_TERRIBLE      tess rejects dominate the word;
 *   G_DODGY         too many suspicious characters for the word length.
 *
 * word:         word to grade.
 * ok_dict_word: caller's verdict that the word is a good dictionary word.
 *************************************************************************/
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
  enum STATES
  {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  //Start of the text and of its per-character byte lengths.  These stay
  //fixed so that the whole-word tests after the scan see the whole word;
  //the scan cursors below end up pointing at the terminators.
  const char *const word_str = word->best_choice->string ().string ();
  const char *const word_lengths = word->best_choice->lengths ().string ();
  const char *str = word_str;         //scan cursor in the text
  const char *lengths = word_lengths; //scan cursor in the lengths
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  for (; *str != '\0'; str += *(lengths++)) {
    len++;
    if (unicharset.get_isupper (str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count)
            longest_upper_run_len = upper_string_count;
          if (last_char == unicharset.unichar_to_id(str, *lengths)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          //fall through: the letter also starts a new upper case run
        default:
          state = FIRST_UPPER;
          last_char = unicharset.unichar_to_id(str, *lengths);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_islower (str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count)
            longest_lower_run_len = lower_string_count;
          if (last_char == unicharset.unichar_to_id(str, *lengths)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          //fall through: the letter also starts a new lower case run
        default:
          state = FIRST_LOWER;
          last_char = unicharset.unichar_to_id(str, *lengths);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_isdigit (str, *lengths)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          //fall through
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          //fall through: the digit also starts a new number
        default:
          state = FIRST_NUM;
          break;
      }
    }
    else {
      if (*lengths == 1 && *str == ' ')
        tess_rejs++;
      else
        bad_char_count++;
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          //fall through
        default:
          break;
      }
      state = JUNK;
    }
  }

  //Close off whatever run was open when the string ended
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      //fall through
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  //From here on the tests concern the word as a whole, so they use
  //word_str / word_lengths rather than the exhausted scan cursors.
  if (crunch_leave_ok_strings &&
    (len >= 4) &&
    (2 * (total_alpha_count - isolated_alphas) > len) &&
  (longest_alpha_repetition_count < crunch_long_repetitions)) {
    if ((crunch_accept_ok &&
      (acceptable_word_string (word_str, word_lengths) !=
      AC_UNACCEPTABLE)) ||
      (longest_lower_run_len > crunch_leave_lc_strings) ||
      (longest_upper_run_len > crunch_leave_uc_strings))
      return G_NEVER_CRUNCH;
  }
  if ((word->reject_map.length () > 1) &&
    (strpbrk (word_str, " ") == NULL) &&
    ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
    (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
    (word->best_choice->permuter () == USER_DAWG_PERM) ||
    (word->best_choice->permuter () == NUMBER_PERM) ||
    (acceptable_word_string (word_str, word_lengths) !=
    AC_UNACCEPTABLE) || ok_dict_word))
    return G_OK;

  ok_chars = len - bad_char_count - isolated_digits -
    isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf ("garbage_word: \"%s\"\n",
      word->best_choice->string ().string ());
    tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
      len,
      bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
  }
  if ((bad_char_count == 0) &&
    (tess_rejs == 0) &&
    ((len > isolated_digits + isolated_alphas) || (len <= 2)))
    return G_OK;

  if ((tess_rejs > ok_chars) ||
    ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
    return G_TERRIBLE;

  if (len > 4) {
    //Longer words: isolated letters and digits count as dodgy too
    dodgy_chars = 2 * tess_rejs + bad_char_count +
      isolated_digits + isolated_alphas;
    if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
      return G_DODGY;
    else
      return G_OK;
  }
  else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if (((len == 4) && (dodgy_chars > 2)) ||
      ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
      return G_DODGY;
    else
      return G_OK;
  }
}
|
1266
|
+
|
1267
|
+
|
1268
|
+
/*************************************************************************
|
1269
|
+
* word_deletable()
|
1270
|
+
* DELETE WERDS AT ENDS OF ROWS IF
|
1271
|
+
* Word is crunched &&
|
1272
|
+
* ( string length = 0 OR
|
1273
|
+
* > 50% of chars are "|" (before merging) OR
|
1274
|
+
* certainty < -10 OR
|
1275
|
+
* rating /char > 60 OR
|
1276
|
+
* TOP of word is more than 0.5 xht BELOW baseline OR
|
1277
|
+
* BOTTOM of word is more than 0.5 xht ABOVE xht OR
|
1278
|
+
* length of word < 3xht OR
|
1279
|
+
* height of word < 0.7 xht OR
|
1280
|
+
* height of word > 3.0 xht OR
|
1281
|
+
* >75% of the outline BBs have longest dimension < 0.5xht
|
1282
|
+
*************************************************************************/
|
1283
|
+
|
1284
|
+
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
|
1285
|
+
int word_len = word->reject_map.length ();
|
1286
|
+
float rating_per_ch;
|
1287
|
+
TBOX box; //BB of word
|
1288
|
+
|
1289
|
+
if (word->unlv_crunch_mode == CR_NONE) {
|
1290
|
+
delete_mode = 0;
|
1291
|
+
return CR_NONE;
|
1292
|
+
}
|
1293
|
+
|
1294
|
+
if (word_len == 0) {
|
1295
|
+
delete_mode = 1;
|
1296
|
+
return CR_DELETE;
|
1297
|
+
}
|
1298
|
+
|
1299
|
+
box = word->outword->bounding_box ();
|
1300
|
+
if (box.height () < crunch_del_min_ht * bln_x_height) {
|
1301
|
+
delete_mode = 4;
|
1302
|
+
return CR_DELETE;
|
1303
|
+
}
|
1304
|
+
|
1305
|
+
if (noise_outlines (word->outword)) {
|
1306
|
+
delete_mode = 5;
|
1307
|
+
return CR_DELETE;
|
1308
|
+
}
|
1309
|
+
|
1310
|
+
if ((failure_count (word) * 1.5) > word_len) {
|
1311
|
+
delete_mode = 2;
|
1312
|
+
return CR_LOOSE_SPACE;
|
1313
|
+
}
|
1314
|
+
|
1315
|
+
if (word->best_choice->certainty () < crunch_del_cert) {
|
1316
|
+
delete_mode = 7;
|
1317
|
+
return CR_LOOSE_SPACE;
|
1318
|
+
}
|
1319
|
+
|
1320
|
+
rating_per_ch = word->best_choice->rating () / word_len;
|
1321
|
+
|
1322
|
+
if (rating_per_ch > crunch_del_rating) {
|
1323
|
+
delete_mode = 8;
|
1324
|
+
return CR_LOOSE_SPACE;
|
1325
|
+
}
|
1326
|
+
|
1327
|
+
if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
|
1328
|
+
delete_mode = 9;
|
1329
|
+
return CR_LOOSE_SPACE;
|
1330
|
+
}
|
1331
|
+
|
1332
|
+
if (box.bottom () >
|
1333
|
+
bln_baseline_offset + crunch_del_high_word * bln_x_height) {
|
1334
|
+
delete_mode = 10;
|
1335
|
+
return CR_LOOSE_SPACE;
|
1336
|
+
}
|
1337
|
+
|
1338
|
+
if (box.height () > crunch_del_max_ht * bln_x_height) {
|
1339
|
+
delete_mode = 11;
|
1340
|
+
return CR_LOOSE_SPACE;
|
1341
|
+
}
|
1342
|
+
|
1343
|
+
if (box.width () < crunch_del_min_width * bln_x_height) {
|
1344
|
+
delete_mode = 3;
|
1345
|
+
return CR_LOOSE_SPACE;
|
1346
|
+
}
|
1347
|
+
|
1348
|
+
delete_mode = 0;
|
1349
|
+
return CR_NONE;
|
1350
|
+
}
|
1351
|
+
|
1352
|
+
|
1353
|
+
/*************************************************************************
 * failure_count()
 * Return the number of space bytes in the best choice string of the word.
 *************************************************************************/
inT16 failure_count(WERD_RES *word) {
  const char *cursor = word->best_choice->string ().string ();
  int space_total = 0;

  while (*cursor != '\0') {
    if (*cursor == ' ')
      space_total++;
    cursor++;
  }
  return space_total;
}
|
1363
|
+
|
1364
|
+
|
1365
|
+
/*************************************************************************
 * noise_outlines()
 * Return TRUE if every outline of every blob in the word is small, i.e.
 * the longer side of its bounding box is below
 * bln_x_height * crunch_small_outlines_size.  A word with no outlines at
 * all also yields TRUE, since both counts are then zero.
 *************************************************************************/
BOOL8 noise_outlines(WERD *word) {
  const float size_limit = bln_x_height * crunch_small_outlines_size;
  inT16 total_outlines = 0;
  inT16 small_outlines = 0;
  PBLOB_IT blob_it;
  OUTLINE_IT outline_it;

  blob_it.set_to_list (word->blob_list ());
  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
    outline_it.set_to_list (blob_it.data ()->out_list ());
    for (outline_it.mark_cycle_pt ();
         !outline_it.cycled_list ();
         outline_it.forward ()) {
      TBOX outline_box = outline_it.data ()->bounding_box ();  //BB of outline
      const inT16 longest_side =
        (outline_box.height () > outline_box.width ())
          ? outline_box.height ()
          : outline_box.width ();

      total_outlines++;
      if (longest_side < size_limit)
        small_outlines++;
    }
  }
  return (small_outlines >= total_outlines);
}
|
1391
|
+
|
1392
|
+
|
1393
|
+
/*************************************************************************
|
1394
|
+
* insert_rej_cblobs()
|
1395
|
+
* Put rejected word blobs back into the outword.
|
1396
|
+
* NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
|
1397
|
+
* OF ELEMENTS.
|
1398
|
+
*************************************************************************/
|
1399
|
+
void insert_rej_cblobs( //word to do
|
1400
|
+
WERD_RES *word) {
|
1401
|
+
PBLOB_IT blob_it; //blob iterator
|
1402
|
+
PBLOB_IT rej_blob_it;
|
1403
|
+
const STRING *word_str;
|
1404
|
+
const STRING *word_lengths;
|
1405
|
+
int old_len;
|
1406
|
+
int rej_len;
|
1407
|
+
char new_str[512 * UNICHAR_LEN];
|
1408
|
+
char new_lengths[512];
|
1409
|
+
REJMAP new_map;
|
1410
|
+
int i = 0; //new_str index
|
1411
|
+
int j = 0; //old_str index
|
1412
|
+
int i_offset = 0; //new_str offset
|
1413
|
+
int j_offset = 0; //old_str offset
|
1414
|
+
int new_len;
|
1415
|
+
|
1416
|
+
gblob_sort_list (word->outword->rej_blob_list (), TRUE);
|
1417
|
+
rej_blob_it.set_to_list (word->outword->rej_blob_list ());
|
1418
|
+
if (rej_blob_it.empty ())
|
1419
|
+
return;
|
1420
|
+
rej_len = rej_blob_it.length ();
|
1421
|
+
blob_it.set_to_list (word->outword->blob_list ());
|
1422
|
+
word_str = &(word->best_choice->string ());
|
1423
|
+
word_lengths = &(word->best_choice->lengths ());
|
1424
|
+
old_len = word->best_choice->lengths().length ();
|
1425
|
+
ASSERT_HOST (word->reject_map.length () == old_len);
|
1426
|
+
ASSERT_HOST (blob_it.length () == old_len);
|
1427
|
+
if ((old_len + rej_len) > 511)
|
1428
|
+
return; //Word is garbage anyway prevent abort
|
1429
|
+
new_map.initialise (old_len + rej_len);
|
1430
|
+
|
1431
|
+
while (!rej_blob_it.empty ()) {
|
1432
|
+
if ((j >= old_len) ||
|
1433
|
+
(rej_blob_it.data ()->bounding_box ().left () <=
|
1434
|
+
blob_it.data ()->bounding_box ().left ())) {
|
1435
|
+
/* Insert reject blob */
|
1436
|
+
if (j >= old_len)
|
1437
|
+
blob_it.add_to_end (rej_blob_it.extract ());
|
1438
|
+
else
|
1439
|
+
blob_it.add_before_stay_put (rej_blob_it.extract ());
|
1440
|
+
if (!rej_blob_it.empty ())
|
1441
|
+
rej_blob_it.forward ();
|
1442
|
+
new_str[i_offset] = ' ';
|
1443
|
+
new_lengths[i] = 1;
|
1444
|
+
new_map[i].setrej_rej_cblob ();
|
1445
|
+
i_offset += new_lengths[i++];
|
1446
|
+
}
|
1447
|
+
else {
|
1448
|
+
strncpy(new_str + i_offset, &(*word_str)[j_offset],
|
1449
|
+
(*word_lengths)[j]);
|
1450
|
+
new_lengths[i] = (*word_lengths)[j];
|
1451
|
+
new_map[i] = word->reject_map[j];
|
1452
|
+
i_offset += new_lengths[i++];
|
1453
|
+
j_offset += (*word_lengths)[j++];
|
1454
|
+
blob_it.forward ();
|
1455
|
+
}
|
1456
|
+
}
|
1457
|
+
/* Add any extra normal blobs to strings */
|
1458
|
+
while (j < word_lengths->length ()) {
|
1459
|
+
strncpy(new_str + i_offset, &(*word_str)[j_offset],
|
1460
|
+
(*word_lengths)[j]);
|
1461
|
+
new_lengths[i] = (*word_lengths)[j];
|
1462
|
+
new_map[i] = word->reject_map[j];
|
1463
|
+
i_offset += new_lengths[i++];
|
1464
|
+
j_offset += (*word_lengths)[j++];
|
1465
|
+
}
|
1466
|
+
new_str[i_offset] = '\0';
|
1467
|
+
new_lengths[i] = 0;
|
1468
|
+
/*
|
1469
|
+
tprintf(
|
1470
|
+
"\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
|
1471
|
+
old_len, i, new_str, new_map );
|
1472
|
+
*/
|
1473
|
+
ASSERT_HOST (i == blob_it.length ());
|
1474
|
+
ASSERT_HOST (i == old_len + rej_len);
|
1475
|
+
word->reject_map = new_map;
|
1476
|
+
*((STRING *) word_str) = new_str;
|
1477
|
+
*((STRING *) word_lengths) = new_lengths;
|
1478
|
+
new_len = word->best_choice->lengths ().length ();
|
1479
|
+
ASSERT_HOST (word->reject_map.length () == new_len);
|
1480
|
+
ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
|
1481
|
+
}
|