tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
|
@@ -0,0 +1,1481 @@
|
|
|
1
|
+
/******************************************************************
|
|
2
|
+
* File: docqual.cpp (Formerly docqual.c)
|
|
3
|
+
* Description: Document Quality Metrics
|
|
4
|
+
* Author: Phil Cheatle
|
|
5
|
+
* Created: Mon May 9 11:27:28 BST 1994
|
|
6
|
+
*
|
|
7
|
+
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
|
8
|
+
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
** you may not use this file except in compliance with the License.
|
|
10
|
+
** You may obtain a copy of the License at
|
|
11
|
+
** http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
** Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
** See the License for the specific language governing permissions and
|
|
16
|
+
** limitations under the License.
|
|
17
|
+
*
|
|
18
|
+
**********************************************************************/
|
|
19
|
+
|
|
20
|
+
#include "mfcpch.h"
|
|
21
|
+
#include <ctype.h>
|
|
22
|
+
#include "docqual.h"
|
|
23
|
+
#include "tstruct.h"
|
|
24
|
+
#include "tfacep.h"
|
|
25
|
+
#include "reject.h"
|
|
26
|
+
#include "tessvars.h"
|
|
27
|
+
#include "genblob.h"
|
|
28
|
+
#include "secname.h"
|
|
29
|
+
#include "globals.h"
|
|
30
|
+
|
|
31
|
+
#define EXTERN
|
|
32
|
+
|
|
33
|
+
/* Characters for which the outline count check is skipped or adjusted
   (see count_outline_errs()). */
EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
                   "Non standard number of outlines");

/* Document, block and row level rejection / unrejection controls. */
EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
                 "Allow outline errs in unrejection?");
EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
                 "Reduce rejection on good docs");
EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
                   "%rej allowed before rej whole doc");
EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
                   "%rej allowed before rej whole block");
EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
                   "%rej allowed before rej whole row");
EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
                   "%of row rejects in whole word rejects which prevents whole row rejection");
EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
                 "Only rej partially rejected words in block rejection");
EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
                 "Only rej partially rejected words in row rejection");
EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
                 "Use word segmentation quality metric");
EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
                 "Use word segmentation quality metric");
EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
                "Only preserve wds longer than this");
EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
                 "Apply row rejection to good docs");
EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
                   "rej good doc wd if more than this fraction rejected");
EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
                 "Reject all bad quality wds");
EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
                 "Output data to debug file");
/* Description typo fixed ("chekcs" -> "checks"). */
EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no checks");
EXTERN double_VAR (quality_rowrej_pc, 1.1,
                   "good_quality_doc gte good char limit");

/* Tilde crunching switches. */
EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
                 "Mark v.bad words for tilde crunch");
EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
                  "Take out ~^ early?");

/* Thresholds for definite crunches. */
EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
                   "crunch garbage cert lt this");
EXTERN double_VAR (crunch_poor_garbage_rate, 60,
                   "crunch garbage rating lt this");

/* Thresholds for POTENTIAL crunches. */
EXTERN double_VAR (crunch_pot_poor_rate, 40,
                   "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
                   "POTENTIAL crunch cert lt this");
EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");

/* Thresholds for deleting words. */
EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
EXTERN double_VAR (crunch_del_min_width, 3.0,
                   "Del if word width lt xht x this");
EXTERN double_VAR (crunch_del_high_word, 1.5,
                   "Del if word gt xht x this above bl");
EXTERN double_VAR (crunch_del_low_word, 0.5,
                   "Del if word gt xht x this below bl");
EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");

EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
EXTERN INT_VAR (crunch_pot_indicators, 1,
                "How many potential indicators needed");

/* Strings that are protected from, or singled out for, crunching. */
EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
                 "Dont touch sensible strings");
EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
                 "Dont pot crunch sensible strings");
EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
EXTERN INT_VAR (crunch_leave_lc_strings, 4,
                "Dont crunch words with long lower case strings");
/* Description corrected: it was a copy of the lower case one above. */
EXTERN INT_VAR (crunch_leave_uc_strings, 4,
                "Dont crunch words with long upper case strings");
EXTERN INT_VAR (crunch_long_repetitions, 3,
                "Crunch words with long repetitions");

EXTERN INT_VAR (crunch_debug, 0, "As it says");
|
|
121
|
+
|
|
122
|
+
/*************************************************************************
|
|
123
|
+
* word_blob_quality()
|
|
124
|
+
* How many blobs in the outword are identical to those of the inword?
|
|
125
|
+
* ASSUME blobs in both initial word and outword are in ascending order of
|
|
126
|
+
* left hand blob edge.
|
|
127
|
+
*************************************************************************/
|
|
128
|
+
inT16 word_blob_quality( //Blob seg changes
|
|
129
|
+
WERD_RES *word,
|
|
130
|
+
ROW *row) {
|
|
131
|
+
WERD *bln_word; //BL norm init word
|
|
132
|
+
TWERD *tessword; //tess format
|
|
133
|
+
WERD *init_word; //BL norm init word
|
|
134
|
+
PBLOB_IT outword_it;
|
|
135
|
+
PBLOB_IT initial_it;
|
|
136
|
+
inT16 i;
|
|
137
|
+
inT16 init_blobs_left;
|
|
138
|
+
inT16 match_count = 0;
|
|
139
|
+
BOOL8 matched;
|
|
140
|
+
TBOX out_box;
|
|
141
|
+
PBLOB *test_blob;
|
|
142
|
+
DENORM denorm;
|
|
143
|
+
float bln_xht;
|
|
144
|
+
|
|
145
|
+
if (word->word->gblob_list ()->empty ())
|
|
146
|
+
return 0;
|
|
147
|
+
//xht used for blnorm
|
|
148
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
|
149
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
|
150
|
+
/*
|
|
151
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
|
152
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
|
153
|
+
comparison
|
|
154
|
+
*/
|
|
155
|
+
// if (!bln_word->flag(W_POLYGON))
|
|
156
|
+
// tprintf( "NON POLYGON BLN WERD\n");
|
|
157
|
+
tessword = make_tess_word (bln_word, NULL);
|
|
158
|
+
//convert word
|
|
159
|
+
init_word = make_ed_word (tessword, bln_word);
|
|
160
|
+
// if (!init_word->flag(W_POLYGON))
|
|
161
|
+
// tprintf( "NON POLYGON INIT WERD\n");
|
|
162
|
+
// tprintf( "SOURCE BLOBS-AFTER TESS:\n");
|
|
163
|
+
// print_boxes( init_word );
|
|
164
|
+
// tprintf( "OUTPUT BLOBS:\n");
|
|
165
|
+
// print_boxes( word->outword );
|
|
166
|
+
|
|
167
|
+
initial_it.set_to_list (init_word->blob_list ());
|
|
168
|
+
init_blobs_left = initial_it.length ();
|
|
169
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
|
170
|
+
delete bln_word;
|
|
171
|
+
delete_word(tessword); //get rid of it
|
|
172
|
+
|
|
173
|
+
for (outword_it.mark_cycle_pt ();
|
|
174
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
|
175
|
+
out_box = outword_it.data ()->bounding_box ();
|
|
176
|
+
|
|
177
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
|
178
|
+
while (!initial_it.at_last () &&
|
|
179
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
|
180
|
+
initial_it.forward ();
|
|
181
|
+
init_blobs_left--;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/* See if current outword blob matches any initial blob with the same left
|
|
185
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
|
186
|
+
|
|
187
|
+
i = 0;
|
|
188
|
+
matched = FALSE;
|
|
189
|
+
do {
|
|
190
|
+
test_blob = initial_it.data_relative (i++);
|
|
191
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
|
192
|
+
if (matched)
|
|
193
|
+
match_count++;
|
|
194
|
+
}
|
|
195
|
+
while (!matched &&
|
|
196
|
+
(init_blobs_left - i > 0) &&
|
|
197
|
+
(i < 129) &&
|
|
198
|
+
!initial_it.at_last () &&
|
|
199
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
|
200
|
+
}
|
|
201
|
+
delete init_word;
|
|
202
|
+
return match_count;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
/*************************************************************************
|
|
207
|
+
* crude_match_blobs()
|
|
208
|
+
* Check bounding boxes are the same and the number of outlines are the same.
|
|
209
|
+
*************************************************************************/
|
|
210
|
+
BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
|
|
211
|
+
TBOX box1 = blob1->bounding_box ();
|
|
212
|
+
TBOX box2 = blob2->bounding_box ();
|
|
213
|
+
|
|
214
|
+
if (box1.contains (box2) &&
|
|
215
|
+
box2.contains (box1) &&
|
|
216
|
+
(blob1->out_list ()->length () == blob1->out_list ()->length ()))
|
|
217
|
+
return TRUE;
|
|
218
|
+
else
|
|
219
|
+
return FALSE;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
/* Sums count_outline_errs() over the blobs of word->outword, pairing the
   n-th blob with the n-th char of the best choice string. */
inT16 word_outline_errs(  //Outline count errs
                        WERD_RES *word) {
  PBLOB_IT blob_it;
  inT16 err_total = 0;
  inT16 ch_index = 0;

  blob_it.set_to_list (word->outword->blob_list ());
  blob_it.mark_cycle_pt ();
  while (!blob_it.cycled_list ()) {
    err_total +=
      count_outline_errs (word->best_choice->string ()[ch_index],
                          blob_it.data ()->out_list ()->length ());
    ch_index++;
    blob_it.forward ();
  }
  return err_total;
}
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
/*************************************************************************
|
|
243
|
+
* word_char_quality()
|
|
244
|
+
* Combination of blob quality and outline quality - how many good chars are
|
|
245
|
+
* there? - I.e chars which pass the blob AND outline tests.
|
|
246
|
+
*************************************************************************/
|
|
247
|
+
void word_char_quality( //Blob seg changes
|
|
248
|
+
WERD_RES *word,
|
|
249
|
+
ROW *row,
|
|
250
|
+
inT16 *match_count,
|
|
251
|
+
inT16 *accepted_match_count) {
|
|
252
|
+
WERD *bln_word; //BL norm init word
|
|
253
|
+
TWERD *tessword; //tess format
|
|
254
|
+
WERD *init_word; //BL norm init word
|
|
255
|
+
PBLOB_IT outword_it;
|
|
256
|
+
PBLOB_IT initial_it;
|
|
257
|
+
inT16 i;
|
|
258
|
+
inT16 init_blobs_left;
|
|
259
|
+
BOOL8 matched;
|
|
260
|
+
TBOX out_box;
|
|
261
|
+
PBLOB *test_blob;
|
|
262
|
+
DENORM denorm;
|
|
263
|
+
float bln_xht;
|
|
264
|
+
inT16 j = 0;
|
|
265
|
+
|
|
266
|
+
*match_count = 0;
|
|
267
|
+
*accepted_match_count = 0;
|
|
268
|
+
if (word->word->gblob_list ()->empty ())
|
|
269
|
+
return;
|
|
270
|
+
|
|
271
|
+
//xht used for blnorm
|
|
272
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
|
273
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
|
274
|
+
/*
|
|
275
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
|
276
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
|
277
|
+
comparison
|
|
278
|
+
*/
|
|
279
|
+
tessword = make_tess_word (bln_word, NULL);
|
|
280
|
+
//convert word
|
|
281
|
+
init_word = make_ed_word (tessword, bln_word);
|
|
282
|
+
delete bln_word;
|
|
283
|
+
delete_word(tessword); //get rid of it
|
|
284
|
+
// tprintf( "SOURCE BLOBS-AFTER TESS:\n");
|
|
285
|
+
// print_boxes( init_word );
|
|
286
|
+
// tprintf( "OUTPUT BLOBS:\n");
|
|
287
|
+
// print_boxes( word->outword );
|
|
288
|
+
|
|
289
|
+
initial_it.set_to_list (init_word->blob_list ());
|
|
290
|
+
init_blobs_left = initial_it.length ();
|
|
291
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
|
292
|
+
|
|
293
|
+
for (outword_it.mark_cycle_pt ();
|
|
294
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
|
295
|
+
out_box = outword_it.data ()->bounding_box ();
|
|
296
|
+
|
|
297
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
|
298
|
+
while (!initial_it.at_last () &&
|
|
299
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
|
300
|
+
initial_it.forward ();
|
|
301
|
+
init_blobs_left--;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
/* See if current outword blob matches any initial blob with the same left
|
|
305
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
|
306
|
+
|
|
307
|
+
i = 0;
|
|
308
|
+
matched = FALSE;
|
|
309
|
+
do {
|
|
310
|
+
test_blob = initial_it.data_relative (i++);
|
|
311
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
|
312
|
+
if (matched &&
|
|
313
|
+
(count_outline_errs (word->best_choice->string ()[j],
|
|
314
|
+
outword_it.data ()->out_list ()->length ())
|
|
315
|
+
== 0)) {
|
|
316
|
+
(*match_count)++;
|
|
317
|
+
if (word->reject_map[j].accepted ())
|
|
318
|
+
(*accepted_match_count)++;
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
while (!matched &&
|
|
322
|
+
(init_blobs_left - i > 0) &&
|
|
323
|
+
(i < 129) &&
|
|
324
|
+
!initial_it.at_last () &&
|
|
325
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
|
326
|
+
j++;
|
|
327
|
+
}
|
|
328
|
+
delete init_word;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
/*************************************************************************
|
|
333
|
+
* unrej_good_chs()
|
|
334
|
+
* Unreject POTENTIAL rejects if the blob passes the blob and outline checks
|
|
335
|
+
*************************************************************************/
|
|
336
|
+
void unrej_good_chs(WERD_RES *word, ROW *row) {
|
|
337
|
+
WERD *bln_word; //BL norm init word
|
|
338
|
+
TWERD *tessword; //tess format
|
|
339
|
+
WERD *init_word; //BL norm init word
|
|
340
|
+
PBLOB_IT outword_it;
|
|
341
|
+
PBLOB_IT initial_it;
|
|
342
|
+
inT16 i;
|
|
343
|
+
inT16 init_blobs_left;
|
|
344
|
+
BOOL8 matched;
|
|
345
|
+
TBOX out_box;
|
|
346
|
+
PBLOB *test_blob;
|
|
347
|
+
DENORM denorm;
|
|
348
|
+
float bln_xht;
|
|
349
|
+
inT16 j = 0;
|
|
350
|
+
|
|
351
|
+
if (word->word->gblob_list ()->empty ())
|
|
352
|
+
return;
|
|
353
|
+
|
|
354
|
+
//xht used for blnorm
|
|
355
|
+
bln_xht = bln_x_height / word->denorm.scale ();
|
|
356
|
+
bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
|
|
357
|
+
/*
|
|
358
|
+
NOTE: Need to convert to tess format and back again to ensure that the
|
|
359
|
+
same float -> int rounding of coords is done to source wd as out wd before
|
|
360
|
+
comparison
|
|
361
|
+
*/
|
|
362
|
+
tessword = make_tess_word (bln_word, NULL);
|
|
363
|
+
//convert word
|
|
364
|
+
init_word = make_ed_word (tessword, bln_word);
|
|
365
|
+
delete bln_word;
|
|
366
|
+
delete_word(tessword); //get rid of it
|
|
367
|
+
|
|
368
|
+
initial_it.set_to_list (init_word->blob_list ());
|
|
369
|
+
init_blobs_left = initial_it.length ();
|
|
370
|
+
outword_it.set_to_list (word->outword->blob_list ());
|
|
371
|
+
|
|
372
|
+
for (outword_it.mark_cycle_pt ();
|
|
373
|
+
!outword_it.cycled_list (); outword_it.forward ()) {
|
|
374
|
+
out_box = outword_it.data ()->bounding_box ();
|
|
375
|
+
|
|
376
|
+
/* Skip any initial blobs LEFT of current outword blob */
|
|
377
|
+
while (!initial_it.at_last () &&
|
|
378
|
+
(initial_it.data ()->bounding_box ().left () < out_box.left ())) {
|
|
379
|
+
initial_it.forward ();
|
|
380
|
+
init_blobs_left--;
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/* See if current outword blob matches any initial blob with the same left
|
|
384
|
+
coord. (Normally only one but possibly more - in unknown order) */
|
|
385
|
+
|
|
386
|
+
i = 0;
|
|
387
|
+
matched = FALSE;
|
|
388
|
+
do {
|
|
389
|
+
test_blob = initial_it.data_relative (i++);
|
|
390
|
+
matched = crude_match_blobs (test_blob, outword_it.data ());
|
|
391
|
+
if (matched &&
|
|
392
|
+
(word->reject_map[j].accept_if_good_quality ()) &&
|
|
393
|
+
(docqual_excuse_outline_errs ||
|
|
394
|
+
(count_outline_errs (word->best_choice->string ()[j],
|
|
395
|
+
outword_it.data ()->out_list ()->
|
|
396
|
+
length ()) == 0)))
|
|
397
|
+
word->reject_map[j].setrej_quality_accept ();
|
|
398
|
+
}
|
|
399
|
+
while (!matched &&
|
|
400
|
+
(init_blobs_left - i > 0) &&
|
|
401
|
+
(i < 129) &&
|
|
402
|
+
!initial_it.at_last () &&
|
|
403
|
+
test_blob->bounding_box ().left () == out_box.left ());
|
|
404
|
+
j++;
|
|
405
|
+
}
|
|
406
|
+
delete init_word;
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
/* Debug aid: prints the bounding box of every blob in the word. */
void print_boxes(WERD *word) {
  PBLOB_IT blob_it;

  blob_it.set_to_list (word->blob_list ());
  blob_it.mark_cycle_pt ();
  while (!blob_it.cycled_list ()) {
    TBOX blob_box = blob_it.data ()->bounding_box ();
    blob_box.print ();
    blob_it.forward ();
  }
}
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
/* Returns how far outline_count is from the count expected for char c:
   2 for chars listed in outlines_2, 1 otherwise. Chars listed in
   outlines_odd are not checked and always give 0. */
inT16 count_outline_errs(char c, inT16 outline_count) {
  if (STRING (outlines_odd).contains (c))
    return 0;                    //Dont use this char

  int expected_outline_count = 1;
  if (STRING (outlines_2).contains (c))
    expected_outline_count = 2;
  return abs (outline_count - expected_outline_count);
}
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
/* Runs the quality based rejection passes over the page in order:
   optional unrejection on good quality docs, doc/block/row rejection,
   insert_rej_cblobs() on every word, then optional tilde crunching. */
void quality_based_rejection(PAGE_RES_IT &page_res_it,
                             BOOL8 good_quality_doc) {
  if (tessedit_good_quality_unrej && good_quality_doc)
    unrej_good_quality_words(page_res_it);
  doc_and_block_rejection(page_res_it, good_quality_doc);

  for (page_res_it.restart_page ();
       page_res_it.word () != NULL; page_res_it.forward ())
    insert_rej_cblobs (page_res_it.word ());

  if (!unlv_tilde_crunching)
    return;
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
/*************************************************************************
|
|
455
|
+
* unrej_good_quality_words()
|
|
456
|
+
* Accept potential rejects in words which pass the following checks:
|
|
457
|
+
* - Contains a potential reject
|
|
458
|
+
* - Word looks like a sensible alpha word.
|
|
459
|
+
* - Word segmentation is the same as the original image
|
|
460
|
+
* - All characters have the expected number of outlines
|
|
461
|
+
* NOTE - the rejection counts are recalculated after unrejection
|
|
462
|
+
* - CANT do it in a single pass without a bit of fiddling
|
|
463
|
+
* - keep it simple but inefficient
|
|
464
|
+
*************************************************************************/
|
|
465
|
+
void unrej_good_quality_words( //unreject potential
|
|
466
|
+
PAGE_RES_IT &page_res_it) {
|
|
467
|
+
WERD_RES *word;
|
|
468
|
+
ROW_RES *current_row;
|
|
469
|
+
BLOCK_RES *current_block;
|
|
470
|
+
int i;
|
|
471
|
+
|
|
472
|
+
page_res_it.restart_page ();
|
|
473
|
+
while (page_res_it.word () != NULL) {
|
|
474
|
+
check_debug_pt (page_res_it.word (), 100);
|
|
475
|
+
if (bland_unrej) {
|
|
476
|
+
word = page_res_it.word ();
|
|
477
|
+
for (i = 0; i < word->reject_map.length (); i++) {
|
|
478
|
+
if (word->reject_map[i].accept_if_good_quality ())
|
|
479
|
+
word->reject_map[i].setrej_quality_accept ();
|
|
480
|
+
}
|
|
481
|
+
page_res_it.forward ();
|
|
482
|
+
}
|
|
483
|
+
else if ((page_res_it.row ()->char_count > 0) &&
|
|
484
|
+
((page_res_it.row ()->rej_count /
|
|
485
|
+
(float) page_res_it.row ()->char_count) <=
|
|
486
|
+
quality_rowrej_pc)) {
|
|
487
|
+
word = page_res_it.word ();
|
|
488
|
+
if (word->reject_map.quality_recoverable_rejects () &&
|
|
489
|
+
(tessedit_unrej_any_wd ||
|
|
490
|
+
acceptable_word_string (word->best_choice->string ().string (),
|
|
491
|
+
word->best_choice->lengths().string())
|
|
492
|
+
!= AC_UNACCEPTABLE)) {
|
|
493
|
+
unrej_good_chs (word, page_res_it.row ()->row);
|
|
494
|
+
}
|
|
495
|
+
page_res_it.forward ();
|
|
496
|
+
}
|
|
497
|
+
else {
|
|
498
|
+
/* Skip to end of dodgy row */
|
|
499
|
+
current_row = page_res_it.row ();
|
|
500
|
+
while ((page_res_it.word () != NULL) &&
|
|
501
|
+
(page_res_it.row () == current_row))
|
|
502
|
+
page_res_it.forward ();
|
|
503
|
+
}
|
|
504
|
+
check_debug_pt (page_res_it.word (), 110);
|
|
505
|
+
}
|
|
506
|
+
page_res_it.restart_page ();
|
|
507
|
+
page_res_it.page_res->char_count = 0;
|
|
508
|
+
page_res_it.page_res->rej_count = 0;
|
|
509
|
+
current_block = NULL;
|
|
510
|
+
current_row = NULL;
|
|
511
|
+
while (page_res_it.word () != NULL) {
|
|
512
|
+
if (current_block != page_res_it.block ()) {
|
|
513
|
+
current_block = page_res_it.block ();
|
|
514
|
+
current_block->char_count = 0;
|
|
515
|
+
current_block->rej_count = 0;
|
|
516
|
+
}
|
|
517
|
+
if (current_row != page_res_it.row ()) {
|
|
518
|
+
current_row = page_res_it.row ();
|
|
519
|
+
current_row->char_count = 0;
|
|
520
|
+
current_row->rej_count = 0;
|
|
521
|
+
current_row->whole_word_rej_count = 0;
|
|
522
|
+
}
|
|
523
|
+
page_res_it.rej_stat_word ();
|
|
524
|
+
page_res_it.forward ();
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
/*************************************************************************
|
|
530
|
+
* doc_and_block_rejection()
|
|
531
|
+
*
|
|
532
|
+
* If the page has too many rejects - reject all of it.
|
|
533
|
+
* If any block has too many rejects - reject all words in the block
|
|
534
|
+
*************************************************************************/
|
|
535
|
+
|
|
536
|
+
void doc_and_block_rejection( //reject big chunks
|
|
537
|
+
PAGE_RES_IT &page_res_it,
|
|
538
|
+
BOOL8 good_quality_doc) {
|
|
539
|
+
inT16 block_no = 0;
|
|
540
|
+
inT16 row_no = 0;
|
|
541
|
+
BLOCK_RES *current_block;
|
|
542
|
+
ROW_RES *current_row;
|
|
543
|
+
|
|
544
|
+
BOOL8 rej_word;
|
|
545
|
+
BOOL8 prev_word_rejected;
|
|
546
|
+
inT16 char_quality;
|
|
547
|
+
inT16 accepted_char_quality;
|
|
548
|
+
|
|
549
|
+
if ((page_res_it.page_res->rej_count * 100.0 /
|
|
550
|
+
page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
|
|
551
|
+
reject_whole_page(page_res_it);
|
|
552
|
+
#ifndef SECURE_NAMES
|
|
553
|
+
if (tessedit_debug_doc_rejection) {
|
|
554
|
+
tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
|
|
555
|
+
page_res_it.page_res->char_count,
|
|
556
|
+
page_res_it.page_res->rej_count);
|
|
557
|
+
}
|
|
558
|
+
#endif
|
|
559
|
+
}
|
|
560
|
+
else {
|
|
561
|
+
#ifndef SECURE_NAMES
|
|
562
|
+
if (tessedit_debug_doc_rejection)
|
|
563
|
+
tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
|
|
564
|
+
page_res_it.page_res->char_count,
|
|
565
|
+
page_res_it.page_res->rej_count);
|
|
566
|
+
#endif
|
|
567
|
+
|
|
568
|
+
/* Walk blocks testing for block rejection */
|
|
569
|
+
|
|
570
|
+
page_res_it.restart_page ();
|
|
571
|
+
while (page_res_it.word () != NULL) {
|
|
572
|
+
current_block = page_res_it.block ();
|
|
573
|
+
if (current_block->block->text_region () != NULL)
|
|
574
|
+
block_no = current_block->block->text_region ()->id_no ();
|
|
575
|
+
else
|
|
576
|
+
block_no = -1;
|
|
577
|
+
if ((page_res_it.block ()->char_count > 0) &&
|
|
578
|
+
((page_res_it.block ()->rej_count * 100.0 /
|
|
579
|
+
page_res_it.block ()->char_count) >
|
|
580
|
+
tessedit_reject_block_percent)) {
|
|
581
|
+
#ifndef SECURE_NAMES
|
|
582
|
+
if (tessedit_debug_block_rejection)
|
|
583
|
+
tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
|
|
584
|
+
block_no,
|
|
585
|
+
page_res_it.block ()->char_count,
|
|
586
|
+
page_res_it.block ()->rej_count);
|
|
587
|
+
#endif
|
|
588
|
+
prev_word_rejected = FALSE;
|
|
589
|
+
while ((page_res_it.word () != NULL) &&
|
|
590
|
+
(page_res_it.block () == current_block)) {
|
|
591
|
+
if (tessedit_preserve_blk_rej_perfect_wds) {
|
|
592
|
+
rej_word =
|
|
593
|
+
(page_res_it.word ()->reject_map.reject_count () > 0)
|
|
594
|
+
|| (page_res_it.word ()->reject_map.length () <
|
|
595
|
+
tessedit_preserve_min_wd_len);
|
|
596
|
+
if (rej_word && tessedit_dont_blkrej_good_wds
|
|
597
|
+
&& !(page_res_it.word ()->reject_map.length () <
|
|
598
|
+
tessedit_preserve_min_wd_len)
|
|
599
|
+
&&
|
|
600
|
+
(acceptable_word_string
|
|
601
|
+
(page_res_it.word ()->best_choice->string ().
|
|
602
|
+
string (),
|
|
603
|
+
page_res_it.word ()->best_choice->lengths ().
|
|
604
|
+
string ()) != AC_UNACCEPTABLE)) {
|
|
605
|
+
word_char_quality (page_res_it.word (),
|
|
606
|
+
page_res_it.row ()->row,
|
|
607
|
+
&char_quality,
|
|
608
|
+
&accepted_char_quality);
|
|
609
|
+
rej_word = char_quality !=
|
|
610
|
+
page_res_it.word ()->reject_map.length ();
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
else
|
|
614
|
+
rej_word = TRUE;
|
|
615
|
+
if (rej_word) {
|
|
616
|
+
/*
|
|
617
|
+
Reject spacing if both current and prev words are rejected.
|
|
618
|
+
NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
|
|
619
|
+
more space errors.
|
|
620
|
+
*/
|
|
621
|
+
if (tessedit_use_reject_spaces &&
|
|
622
|
+
prev_word_rejected &&
|
|
623
|
+
(page_res_it.prev_row () == page_res_it.row ()) &&
|
|
624
|
+
(page_res_it.word ()->word->space () == 1))
|
|
625
|
+
page_res_it.word ()->reject_spaces = TRUE;
|
|
626
|
+
page_res_it.word ()->reject_map.rej_word_block_rej ();
|
|
627
|
+
}
|
|
628
|
+
prev_word_rejected = rej_word;
|
|
629
|
+
page_res_it.forward ();
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
else {
|
|
633
|
+
#ifndef SECURE_NAMES
|
|
634
|
+
if (tessedit_debug_block_rejection)
|
|
635
|
+
tprintf
|
|
636
|
+
("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
|
|
637
|
+
block_no, page_res_it.block ()->char_count,
|
|
638
|
+
page_res_it.block ()->rej_count);
|
|
639
|
+
#endif
|
|
640
|
+
|
|
641
|
+
/* Walk rows in block testing for row rejection */
|
|
642
|
+
row_no = 0;
|
|
643
|
+
while ((page_res_it.word () != NULL) &&
|
|
644
|
+
(page_res_it.block () == current_block)) {
|
|
645
|
+
current_row = page_res_it.row ();
|
|
646
|
+
row_no++;
|
|
647
|
+
/* Reject whole row if:
|
|
648
|
+
fraction of chars on row which are rejected exceed a limit AND
|
|
649
|
+
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
|
|
650
|
+
*/
|
|
651
|
+
if ((page_res_it.row ()->char_count > 0) &&
|
|
652
|
+
((page_res_it.row ()->rej_count * 100.0 /
|
|
653
|
+
page_res_it.row ()->char_count) >
|
|
654
|
+
tessedit_reject_row_percent) &&
|
|
655
|
+
((page_res_it.row ()->whole_word_rej_count * 100.0 /
|
|
656
|
+
page_res_it.row ()->rej_count) <
|
|
657
|
+
tessedit_whole_wd_rej_row_percent)) {
|
|
658
|
+
#ifndef SECURE_NAMES
|
|
659
|
+
if (tessedit_debug_block_rejection)
|
|
660
|
+
tprintf
|
|
661
|
+
("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
|
|
662
|
+
row_no, page_res_it.row ()->char_count,
|
|
663
|
+
page_res_it.row ()->rej_count);
|
|
664
|
+
#endif
|
|
665
|
+
prev_word_rejected = FALSE;
|
|
666
|
+
while ((page_res_it.word () != NULL) &&
|
|
667
|
+
(page_res_it.row () == current_row)) {
|
|
668
|
+
/* Preserve words on good docs unless they are mostly rejected*/
|
|
669
|
+
if (!tessedit_row_rej_good_docs && good_quality_doc) {
|
|
670
|
+
rej_word =
|
|
671
|
+
page_res_it.word ()->reject_map.
|
|
672
|
+
reject_count () /
|
|
673
|
+
(float) page_res_it.word ()->reject_map.
|
|
674
|
+
length () > tessedit_good_doc_still_rowrej_wd;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
/* Preserve perfect words anyway */
|
|
678
|
+
else if (tessedit_preserve_row_rej_perfect_wds) {
|
|
679
|
+
rej_word =
|
|
680
|
+
(page_res_it.word ()->reject_map.
|
|
681
|
+
reject_count () > 0)
|
|
682
|
+
|| (page_res_it.word ()->reject_map.
|
|
683
|
+
length () < tessedit_preserve_min_wd_len);
|
|
684
|
+
if (rej_word && tessedit_dont_rowrej_good_wds
|
|
685
|
+
&& !(page_res_it.word ()->reject_map.
|
|
686
|
+
length () <
|
|
687
|
+
tessedit_preserve_min_wd_len)
|
|
688
|
+
&&
|
|
689
|
+
(acceptable_word_string
|
|
690
|
+
(page_res_it.word ()->best_choice->
|
|
691
|
+
string ().string (),
|
|
692
|
+
page_res_it.word ()->best_choice->
|
|
693
|
+
lengths ().string ()) != AC_UNACCEPTABLE)) {
|
|
694
|
+
word_char_quality (page_res_it.word (),
|
|
695
|
+
page_res_it.row ()->row,
|
|
696
|
+
&char_quality,
|
|
697
|
+
&accepted_char_quality);
|
|
698
|
+
rej_word = char_quality !=
|
|
699
|
+
page_res_it.word ()->reject_map.length ();
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
else
|
|
703
|
+
rej_word = TRUE;
|
|
704
|
+
if (rej_word) {
|
|
705
|
+
/*
|
|
706
|
+
Reject spacing if both current and prev words are rejected.
|
|
707
|
+
NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated
|
|
708
|
+
more space errors.
|
|
709
|
+
*/
|
|
710
|
+
if (tessedit_use_reject_spaces &&
|
|
711
|
+
prev_word_rejected &&
|
|
712
|
+
(page_res_it.prev_row () ==
|
|
713
|
+
page_res_it.row ())
|
|
714
|
+
&& (page_res_it.word ()->word->space () ==
|
|
715
|
+
1))
|
|
716
|
+
page_res_it.word ()->reject_spaces = TRUE;
|
|
717
|
+
page_res_it.word ()->reject_map.
|
|
718
|
+
rej_word_row_rej();
|
|
719
|
+
}
|
|
720
|
+
prev_word_rejected = rej_word;
|
|
721
|
+
page_res_it.forward ();
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
else {
|
|
725
|
+
#ifndef SECURE_NAMES
|
|
726
|
+
if (tessedit_debug_block_rejection)
|
|
727
|
+
tprintf
|
|
728
|
+
("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
|
|
729
|
+
row_no, page_res_it.row ()->char_count,
|
|
730
|
+
page_res_it.row ()->rej_count);
|
|
731
|
+
#endif
|
|
732
|
+
while ((page_res_it.word () != NULL) &&
|
|
733
|
+
(page_res_it.row () == current_row))
|
|
734
|
+
page_res_it.forward ();
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
/*************************************************************************
|
|
744
|
+
* reject_whole_page()
|
|
745
|
+
* Dont believe any of it - set the reject map to 00..00 in all words
|
|
746
|
+
*
|
|
747
|
+
*************************************************************************/
|
|
748
|
+
|
|
749
|
+
void reject_whole_page(PAGE_RES_IT &page_res_it) {
|
|
750
|
+
page_res_it.restart_page ();
|
|
751
|
+
while (page_res_it.word () != NULL) {
|
|
752
|
+
page_res_it.word ()->reject_map.rej_word_doc_rej ();
|
|
753
|
+
page_res_it.forward ();
|
|
754
|
+
}
|
|
755
|
+
//whole page is rejected
|
|
756
|
+
page_res_it.page_res->rejected = TRUE;
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
/*************************************************************************
 * tilde_crunch()
 * Walk the page and mark words for crunching by setting their
 * unlv_crunch_mode to CR_KEEP_SPACE.
 *
 * Only words with no accepted characters (reject_map.accept_count () == 0)
 * are candidates. Each candidate is graded by garbage_word () and then:
 *   - "terrible" words (terrible_word_crunch) are marked at once, together
 *     with any run of "potential" words that immediately precedes them;
 *   - "potential" words (potential_word_crunch) are marked at once when
 *     they follow a terrible word, otherwise the first one of the run is
 *     remembered so the run can be marked if a terrible word turns up;
 *   - any other word ends the current run and clears both states.
 *
 * page_res_it: page iterator; restarted here and left at the end of the
 *              page on return.
 *************************************************************************/
void tilde_crunch(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT pending_it;            // first word of a pending potential run
  BOOL8 have_pending = FALSE;        // pending_it is meaningful
  BOOL8 after_terrible = FALSE;      // previous candidate(s) were terrible

  for (page_res_it.restart_page ();
       page_res_it.word () != NULL;
       page_res_it.forward ()) {
    WERD_RES *cur_word = page_res_it.word ();

    // Optional early clean-up passes on the word text.
    if (crunch_early_convert_bad_unlv_chs) {
      convert_bad_unlv_chs(cur_word);
    }
    if (crunch_early_merge_tess_fails) {
      merge_tess_fails(cur_word);
    }

    // A word with any accepted character is never crunched and breaks
    // whatever run was being tracked.
    if (cur_word->reject_map.accept_count () != 0) {
      after_terrible = FALSE;
      have_pending = FALSE;
      continue;
    }

    const int dict_type =
      dict_word (cur_word->best_choice->string ().string ());
    const BOOL8 ok_dict_word =
      (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
    const GARBAGE_LEVEL garbage_level =
      garbage_word (cur_word, ok_dict_word);
    const BOOL8 crunch_allowed = (garbage_level != G_NEVER_CRUNCH);

    if (crunch_allowed && terrible_word_crunch (cur_word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf ("T CRUNCHING: \"%s\"\n",
                 cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = CR_KEEP_SPACE;

      // Sweep up the potential words that led up to this one.
      if (have_pending) {
        for (; pending_it.word () != cur_word; pending_it.forward ()) {
          if (crunch_debug > 0) {
            tprintf ("P1 CRUNCHING: \"%s\"\n",
                     pending_it.word ()->best_choice->string ().string ());
          }
          pending_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        have_pending = FALSE;
      }
      after_terrible = TRUE;
    }
    else if (crunch_allowed &&
             potential_word_crunch (cur_word, garbage_level, ok_dict_word)) {
      if (after_terrible) {
        // Potential word directly following terrible ones: crunch it too.
        if (crunch_debug > 0) {
          tprintf ("P2 CRUNCHING: \"%s\"\n",
                   cur_word->best_choice->string ().string ());
        }
        cur_word->unlv_crunch_mode = CR_KEEP_SPACE;
      }
      else if (!have_pending) {
        // Start of a new potential run: remember where it begins.
        pending_it = page_res_it;
        have_pending = TRUE;
        if (crunch_debug > 1) {
          tprintf ("P3 CRUNCHING: \"%s\"\n",
                   cur_word->best_choice->string ().string ());
        }
      }
    }
    else {
      // Neither terrible nor potential: forget earlier potential crunches.
      after_terrible = FALSE;
      have_pending = FALSE;
      if (crunch_debug > 2) {
        tprintf ("NO CRUNCH: \"%s\"\n",
                 cur_word->best_choice->string ().string ());
      }
    }
  }
}
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
/*************************************************************************
 * terrible_word_crunch()
 * Decide whether a word is bad enough to be crunched outright.
 *
 * The rules are tried in order and the first match wins; the rule number
 * is only used for the debug trace:
 *   1  best-choice string is empty or consists solely of spaces
 *   2  rating per character exceeds crunch_terrible_rating
 *   3  crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 *   4  certainty is below crunch_poor_garbage_cert and garbage_level
 *      is not G_OK
 *   5  rating per character exceeds crunch_poor_garbage_rate and
 *      garbage_level is not G_OK
 * The per-character rating divides by the reject map length, capped at
 * crunch_rating_max.
 *
 * Returns TRUE if any rule fired, FALSE otherwise.
 *************************************************************************/
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  int rule = 0;                  // 0 means "no rule fired"

  if ((word->best_choice->string ().length () == 0) ||
      (strspn (word->best_choice->string ().string (), " ") ==
       word->best_choice->string ().length ())) {
    rule = 1;
  }
  else {
    int capped_len = word->reject_map.length ();
    if (capped_len > crunch_rating_max) {
      capped_len = crunch_rating_max;
    }
    float per_char_rating = word->best_choice->rating () / capped_len;

    if (per_char_rating > crunch_terrible_rating) {
      rule = 2;
    }
    else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      rule = 3;
    }
    else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
             (garbage_level != G_OK)) {
      rule = 4;
    }
    else if ((per_char_rating > crunch_poor_garbage_rate) &&
             (garbage_level != G_OK)) {
      rule = 5;
    }
  }

  if (rule <= 0) {
    return FALSE;
  }
  if (crunch_debug > 2) {
    tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
             rule, word->best_choice->string ().string ());
  }
  return TRUE;
}
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
/*************************************************************************
 * potential_word_crunch()
 * Count the signs that a word is poor and report whether there are enough
 * of them (at least crunch_pot_indicators) to make it a crunch candidate.
 *
 * The three indicators are:
 *   - rating per character (reject map length capped at 10) above
 *     crunch_pot_poor_rate;
 *   - certainty below crunch_pot_poor_cert, counted only when the word is
 *     "crunchable": crunch_leave_accept_strings is off, or the word has
 *     fewer than 3 characters, or it is neither an acceptable word string
 *     nor an ok dictionary word;
 *   - garbage_level other than G_OK.
 *
 * Returns TRUE when the indicator count reaches crunch_pot_indicators.
 *************************************************************************/
BOOL8 potential_word_crunch(WERD_RES *word,
                            GARBAGE_LEVEL garbage_level,
                            BOOL8 ok_dict_word) {
  const char *text = word->best_choice->string ().string ();
  const char *char_lengths = word->best_choice->lengths ().string ();
  int indicators = 0;

  // Evaluated up front, exactly once, whatever the later tests decide.
  BOOL8 crunchable =
    !crunch_leave_accept_strings ||
    (word->reject_map.length () < 3) ||
    ((acceptable_word_string (text, char_lengths) == AC_UNACCEPTABLE) &&
     !ok_dict_word);

  int capped_len = word->reject_map.length ();
  if (capped_len > 10) {
    capped_len = 10;
  }
  float per_char_rating = word->best_choice->rating () / capped_len;

  // Indicator 1: poor rating.
  if (per_char_rating > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf ("Potential poor rating on \"%s\"\n",
               word->best_choice->string ().string ());
    }
    indicators++;
  }

  // Indicator 2: poor certainty on a crunchable word.
  if (crunchable &&
      (word->best_choice->certainty () < crunch_pot_poor_cert)) {
    if (crunch_debug > 2) {
      tprintf ("Potential poor cert on \"%s\"\n",
               word->best_choice->string ().string ());
    }
    indicators++;
  }

  // Indicator 3: the garbage grading was anything but G_OK.
  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf ("Potential garbage on \"%s\"\n",
               word->best_choice->string ().string ());
    }
    indicators++;
  }

  return (indicators >= crunch_pot_indicators);
}
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
/*************************************************************************
 * tilde_delete()
 * Walk the page and set unlv_crunch_mode on runs of deletable words that
 * touch the beginning or the end of a line.
 *
 * For each word, word_deletable () supplies the crunch mode to use:
 *   - a deletable word flagged W_BOL, or one that continues an unbroken
 *     deletable run from the beginning of a line, gets its mode at once;
 *   - a deletable word flagged W_EOL gets its mode, and so does every word
 *     of the deletable run leading up to it (each re-evaluated with
 *     word_deletable ());
 *   - any other deletable word only marks the start of a possible run;
 *   - a non-deletable word cancels both kinds of run.
 * If tess failures were not merged early (crunch_early_merge_tess_fails is
 * off), merge_tess_fails () is applied to each word after it has been
 * examined, because the failures are used to decide deletability.
 *
 * page_res_it: page iterator; restarted here and left at the end of the
 *              page on return.
 *************************************************************************/
void tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;          // first word of a mid-line run
  BOOL8 run_from_bol = FALSE;        // in a deletable run begun at BOL
  BOOL8 run_start_marked = FALSE;    // run_start_it is meaningful
  inT16 debug_mode;                  // reason code for the current word
  inT16 run_debug_mode;              // reason code for a word in the run

  for (page_res_it.restart_page ();
       page_res_it.word () != NULL;
       page_res_it.forward ()) {
    WERD_RES *cur_word = page_res_it.word ();
    CRUNCH_MODE mode = word_deletable (cur_word, debug_mode);

    if (mode == CR_NONE) {
      // Not deletable: forget earlier potential crunches.
      run_from_bol = FALSE;
      run_start_marked = FALSE;
    }
    else if (cur_word->word->flag (W_BOL) || run_from_bol) {
      if (crunch_debug > 0) {
        tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
                 debug_mode,
                 cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = mode;
      run_from_bol = TRUE;
    }
    else if (cur_word->word->flag (W_EOL)) {
      // Reached the end of the line: flush the run that led here.
      if (run_start_marked) {
        for (; run_start_it.word () != cur_word; run_start_it.forward ()) {
          CRUNCH_MODE run_mode =
            word_deletable (run_start_it.word (), run_debug_mode);
          if (crunch_debug > 0) {
            tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                     run_debug_mode,
                     run_start_it.word ()->best_choice->string ().string ());
          }
          run_start_it.word ()->unlv_crunch_mode = run_mode;
        }
      }
      if (crunch_debug > 0) {
        tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                 debug_mode,
                 cur_word->best_choice->string ().string ());
      }
      cur_word->unlv_crunch_mode = mode;
      run_from_bol = FALSE;
      run_start_marked = FALSE;
    }
    else if (!run_start_marked) {
      // Mid-line deletable word: remember where the run begins.
      run_start_it = page_res_it;
      run_start_marked = TRUE;
    }

    // Deferred until now because the tess fails are used above to decide
    // whether the word is deletable.
    if (!crunch_early_merge_tess_fails) {
      merge_tess_fails(cur_word);
    }
  }
}
|
|
1001
|
+
|
|
1002
|
+
|
|
1003
|
+
/*************************************************************************
 * convert_bad_unlv_chs()
 * Rewrite, in place, the single-byte characters '~' and '^' in the word's
 * best-choice string: '~' becomes '-' and '^' becomes ' '. If the reject
 * map entry of a rewritten character was still accepted, it is rejected
 * with setrej_unlv_rej ().
 *
 * word_res: word whose best choice string and reject map are updated.
 *************************************************************************/
void convert_bad_unlv_chs(WERD_RES *word_res) {
  // The string is edited through its own buffer, hence the cast.
  char *text = (char *) word_res->best_choice->string ().string ();
  int ch_index = 0;                  // index into lengths / reject map
  int byte_pos = 0;                  // byte offset of that char in text

  for (; ch_index < word_res->reject_map.length ();
       byte_pos += word_res->best_choice->lengths ()[ch_index++]) {
    // Only single-byte characters can be '~' or '^'.
    if (word_res->best_choice->lengths ()[ch_index] != 1) {
      continue;
    }

    char substitute;
    if (text[byte_pos] == '~') {
      substitute = '-';
    }
    else if (text[byte_pos] == '^') {
      substitute = ' ';
    }
    else {
      continue;
    }

    text[byte_pos] = substitute;
    if (word_res->reject_map[ch_index].accepted ()) {
      word_res->reject_map[ch_index].setrej_unlv_rej ();
    }
  }
}
|
|
1026
|
+
|
|
1027
|
+
|
|
1028
|
+
/**********************************************************************
|
|
1029
|
+
* merge_tess_fails
|
|
1030
|
+
*
|
|
1031
|
+
* Change pairs of tess failures to a single one
|
|
1032
|
+
**********************************************************************/
|
|
1033
|
+
|
|
1034
|
+
void merge_tess_fails( //word to do
|
|
1035
|
+
WERD_RES *word_res) {
|
|
1036
|
+
char *ptr; //string ptr
|
|
1037
|
+
char *ptr_lengths; //lengths ptr
|
|
1038
|
+
PBLOB_IT blob_it; //blobs
|
|
1039
|
+
int i = 0;
|
|
1040
|
+
int len;
|
|
1041
|
+
|
|
1042
|
+
len = strlen (word_res->best_choice->lengths ().string ());
|
|
1043
|
+
ASSERT_HOST (word_res->reject_map.length () == len);
|
|
1044
|
+
ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
|
|
1045
|
+
|
|
1046
|
+
ptr = (char *) word_res->best_choice->string ().string ();
|
|
1047
|
+
ptr_lengths = (char *) word_res->best_choice->lengths ().string ();
|
|
1048
|
+
blob_it = word_res->outword->blob_list ();
|
|
1049
|
+
while (*ptr != '\0') {
|
|
1050
|
+
if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
|
|
1051
|
+
strcpy (ptr + 1, ptr + 2); //shuffle up
|
|
1052
|
+
strcpy (ptr_lengths + 1, ptr_lengths + 2); //shuffle up
|
|
1053
|
+
word_res->reject_map.remove_pos (i);
|
|
1054
|
+
merge_blobs (blob_it.data_relative (1), blob_it.data ());
|
|
1055
|
+
delete blob_it.extract (); //get rid of spare
|
|
1056
|
+
}
|
|
1057
|
+
else {
|
|
1058
|
+
i++;
|
|
1059
|
+
ptr += *(ptr_lengths++);
|
|
1060
|
+
}
|
|
1061
|
+
blob_it.forward ();
|
|
1062
|
+
}
|
|
1063
|
+
len = strlen (word_res->best_choice->lengths ().string ());
|
|
1064
|
+
ASSERT_HOST (word_res->reject_map.length () == len);
|
|
1065
|
+
ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
/*************************************************************************
 * garbage_word()
 * Grade how garbage-like the best choice of a word looks.
 *
 * The string is scanned one character at a time (using the per-character
 * byte lengths) with a small state machine that tracks runs of upper
 * case, lower case and digits. The scan counts isolated letters, isolated
 * digits, tess rejects (single-byte ' '), other bad characters, the
 * longest upper/lower case runs and the longest repetition of one letter.
 *
 * The scan uses its own cursors so that str and lengths still address the
 * whole word when they are passed to acceptable_word_string () and
 * strpbrk () afterwards. (Advancing str/lengths themselves would leave
 * those calls looking at an empty string.)
 *
 * word:         word to grade
 * ok_dict_word: caller's verdict that the word is a good dictionary word
 *
 * Returns G_NEVER_CRUNCH, G_OK, G_DODGY or G_TERRIBLE.
 *************************************************************************/
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
  enum STATES
  {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  const char *str = word->best_choice->string ().string ();
  const char *lengths = word->best_choice->lengths ().string ();
  const char *ch = str;              // scan cursor into str
  const char *ch_len = lengths;      // scan cursor into lengths
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  for (; *ch != '\0'; ch += *(ch_len++)) {
    len++;
    if (unicharset.get_isupper (ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          // Continuing an upper case run.
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count)
            longest_upper_run_len = upper_string_count;
          if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          // The preceding digit stood alone.
          isolated_digits++;
          /* fall through */
        default:
          // Start of a new upper case run.
          state = FIRST_UPPER;
          last_char = unicharset.unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_islower (ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          // Continuing a lower case run.
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count)
            longest_lower_run_len = lower_string_count;
          if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          // The preceding digit stood alone.
          isolated_digits++;
          /* fall through */
        default:
          // Start of a new lower case run.
          state = FIRST_LOWER;
          last_char = unicharset.unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_isdigit (ch, *ch_len)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          /* fall through */
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          // The preceding letter stood alone.
          isolated_alphas++;
          /* fall through */
        default:
          state = FIRST_NUM;
          break;
      }
    }
    else {
      // Neither letter nor digit: a tess reject or some other bad char.
      if (*ch_len == 1 && *ch == ' ')
        tess_rejs++;
      else
        bad_char_count++;
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          /* fall through */
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a lone letter or digit at the very end of the word.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      /* fall through */
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  // Mostly alphabetic strings of reasonable length are protected if they
  // are acceptable words or contain a long enough single-case run.
  if (crunch_leave_ok_strings &&
      (len >= 4) &&
      (2 * (total_alpha_count - isolated_alphas) > len) &&
      (longest_alpha_repetition_count < crunch_long_repetitions)) {
    if ((crunch_accept_ok &&
         (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) ||
        (longest_lower_run_len > crunch_leave_lc_strings) ||
        (longest_upper_run_len > crunch_leave_uc_strings))
      return G_NEVER_CRUNCH;
  }

  // Words without any tess reject that came from a dictionary/number
  // permuter, or that are otherwise acceptable, are OK.
  if ((word->reject_map.length () > 1) &&
      (strpbrk (str, " ") == NULL) &&
      ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
       (word->best_choice->permuter () == USER_DAWG_PERM) ||
       (word->best_choice->permuter () == NUMBER_PERM) ||
       (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) ||
       ok_dict_word))
    return G_OK;

  ok_chars = len - bad_char_count - isolated_digits -
    isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf ("garbage_word: \"%s\"\n",
             word->best_choice->string ().string ());
    tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
             len,
             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
  }

  if ((bad_char_count == 0) &&
      (tess_rejs == 0) &&
      ((len > isolated_digits + isolated_alphas) || (len <= 2)))
    return G_OK;

  if ((tess_rejs > ok_chars) ||
      ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
    return G_TERRIBLE;

  if (len > 4) {
    // Longer words: isolated characters count as dodgy too.
    dodgy_chars = 2 * tess_rejs + bad_char_count +
      isolated_digits + isolated_alphas;
    if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
      return G_DODGY;
    else
      return G_OK;
  }
  else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if (((len == 4) && (dodgy_chars > 2)) ||
        ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
      return G_DODGY;
    else
      return G_OK;
  }
}
|
|
1266
|
+
|
|
1267
|
+
|
|
1268
|
+
/*************************************************************************
|
|
1269
|
+
* word_deletable()
|
|
1270
|
+
* DELETE WERDS AT ENDS OF ROWS IF
|
|
1271
|
+
* Word is crunched &&
|
|
1272
|
+
* ( string length = 0 OR
|
|
1273
|
+
* > 50% of chars are "|" (before merging) OR
|
|
1274
|
+
* certainty < -10 OR
|
|
1275
|
+
* rating /char > 60 OR
|
|
1276
|
+
* TOP of word is more than 0.5 xht BELOW baseline OR
|
|
1277
|
+
* BOTTOM of word is more than 0.5 xht ABOVE xht OR
|
|
1278
|
+
* length of word < 3xht OR
|
|
1279
|
+
* height of word < 0.7 xht OR
|
|
1280
|
+
* height of word > 3.0 xht OR
|
|
1281
|
+
* >75% of the outline BBs have longest dimension < 0.5xht
|
|
1282
|
+
*************************************************************************/
|
|
1283
|
+
|
|
1284
|
+
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
|
|
1285
|
+
int word_len = word->reject_map.length ();
|
|
1286
|
+
float rating_per_ch;
|
|
1287
|
+
TBOX box; //BB of word
|
|
1288
|
+
|
|
1289
|
+
if (word->unlv_crunch_mode == CR_NONE) {
|
|
1290
|
+
delete_mode = 0;
|
|
1291
|
+
return CR_NONE;
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
if (word_len == 0) {
|
|
1295
|
+
delete_mode = 1;
|
|
1296
|
+
return CR_DELETE;
|
|
1297
|
+
}
|
|
1298
|
+
|
|
1299
|
+
box = word->outword->bounding_box ();
|
|
1300
|
+
if (box.height () < crunch_del_min_ht * bln_x_height) {
|
|
1301
|
+
delete_mode = 4;
|
|
1302
|
+
return CR_DELETE;
|
|
1303
|
+
}
|
|
1304
|
+
|
|
1305
|
+
if (noise_outlines (word->outword)) {
|
|
1306
|
+
delete_mode = 5;
|
|
1307
|
+
return CR_DELETE;
|
|
1308
|
+
}
|
|
1309
|
+
|
|
1310
|
+
if ((failure_count (word) * 1.5) > word_len) {
|
|
1311
|
+
delete_mode = 2;
|
|
1312
|
+
return CR_LOOSE_SPACE;
|
|
1313
|
+
}
|
|
1314
|
+
|
|
1315
|
+
if (word->best_choice->certainty () < crunch_del_cert) {
|
|
1316
|
+
delete_mode = 7;
|
|
1317
|
+
return CR_LOOSE_SPACE;
|
|
1318
|
+
}
|
|
1319
|
+
|
|
1320
|
+
rating_per_ch = word->best_choice->rating () / word_len;
|
|
1321
|
+
|
|
1322
|
+
if (rating_per_ch > crunch_del_rating) {
|
|
1323
|
+
delete_mode = 8;
|
|
1324
|
+
return CR_LOOSE_SPACE;
|
|
1325
|
+
}
|
|
1326
|
+
|
|
1327
|
+
if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
|
|
1328
|
+
delete_mode = 9;
|
|
1329
|
+
return CR_LOOSE_SPACE;
|
|
1330
|
+
}
|
|
1331
|
+
|
|
1332
|
+
if (box.bottom () >
|
|
1333
|
+
bln_baseline_offset + crunch_del_high_word * bln_x_height) {
|
|
1334
|
+
delete_mode = 10;
|
|
1335
|
+
return CR_LOOSE_SPACE;
|
|
1336
|
+
}
|
|
1337
|
+
|
|
1338
|
+
if (box.height () > crunch_del_max_ht * bln_x_height) {
|
|
1339
|
+
delete_mode = 11;
|
|
1340
|
+
return CR_LOOSE_SPACE;
|
|
1341
|
+
}
|
|
1342
|
+
|
|
1343
|
+
if (box.width () < crunch_del_min_width * bln_x_height) {
|
|
1344
|
+
delete_mode = 3;
|
|
1345
|
+
return CR_LOOSE_SPACE;
|
|
1346
|
+
}
|
|
1347
|
+
|
|
1348
|
+
delete_mode = 0;
|
|
1349
|
+
return CR_NONE;
|
|
1350
|
+
}
|
|
1351
|
+
|
|
1352
|
+
|
|
1353
|
+
/*************************************************************************
 * failure_count()
 * Count the ' ' bytes in the word's best-choice string.
 *
 * Returns the number of spaces found.
 *************************************************************************/
inT16 failure_count(WERD_RES *word) {
  const char *cursor = word->best_choice->string ().string ();
  int space_total = 0;

  while (*cursor != '\0') {
    if (*cursor == ' ') {
      space_total++;
    }
    cursor++;
  }
  return space_total;
}
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
/*************************************************************************
 * noise_outlines()
 * Examine every outline of every blob in the word. An outline is "small"
 * when the longer side of its bounding box is below
 * bln_x_height * crunch_small_outlines_size.
 *
 * Returns TRUE when the number of small outlines is at least the number
 * of outlines, i.e. no outline reached the size limit (this includes a
 * word with no outlines at all).
 *************************************************************************/
BOOL8 noise_outlines(WERD *word) {
  PBLOB_IT blobs;
  OUTLINE_IT outlines;
  inT16 total_outlines = 0;
  inT16 tiny_outlines = 0;
  inT16 longest_side;
  float size_limit = bln_x_height * crunch_small_outlines_size;

  blobs.set_to_list (word->blob_list ());
  for (blobs.mark_cycle_pt (); !blobs.cycled_list (); blobs.forward ()) {
    outlines.set_to_list (blobs.data ()->out_list ());
    for (outlines.mark_cycle_pt ();
         !outlines.cycled_list ();
         outlines.forward ()) {
      total_outlines++;

      TBOX outline_box = outlines.data ()->bounding_box ();  //BB of outline
      longest_side = (outline_box.height () > outline_box.width ())
        ? outline_box.height ()
        : outline_box.width ();

      if (longest_side < size_limit) {
        tiny_outlines++;
      }
    }
  }
  return (tiny_outlines >= total_outlines);
}
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
/*************************************************************************
|
|
1394
|
+
* insert_rej_cblobs()
|
|
1395
|
+
* Put rejected word blobs back into the outword.
|
|
1396
|
+
* NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
|
|
1397
|
+
* OF ELEMENTS.
|
|
1398
|
+
*************************************************************************/
|
|
1399
|
+
void insert_rej_cblobs( //word to do
|
|
1400
|
+
WERD_RES *word) {
|
|
1401
|
+
PBLOB_IT blob_it; //blob iterator
|
|
1402
|
+
PBLOB_IT rej_blob_it;
|
|
1403
|
+
const STRING *word_str;
|
|
1404
|
+
const STRING *word_lengths;
|
|
1405
|
+
int old_len;
|
|
1406
|
+
int rej_len;
|
|
1407
|
+
char new_str[512 * UNICHAR_LEN];
|
|
1408
|
+
char new_lengths[512];
|
|
1409
|
+
REJMAP new_map;
|
|
1410
|
+
int i = 0; //new_str index
|
|
1411
|
+
int j = 0; //old_str index
|
|
1412
|
+
int i_offset = 0; //new_str offset
|
|
1413
|
+
int j_offset = 0; //old_str offset
|
|
1414
|
+
int new_len;
|
|
1415
|
+
|
|
1416
|
+
gblob_sort_list (word->outword->rej_blob_list (), TRUE);
|
|
1417
|
+
rej_blob_it.set_to_list (word->outword->rej_blob_list ());
|
|
1418
|
+
if (rej_blob_it.empty ())
|
|
1419
|
+
return;
|
|
1420
|
+
rej_len = rej_blob_it.length ();
|
|
1421
|
+
blob_it.set_to_list (word->outword->blob_list ());
|
|
1422
|
+
word_str = &(word->best_choice->string ());
|
|
1423
|
+
word_lengths = &(word->best_choice->lengths ());
|
|
1424
|
+
old_len = word->best_choice->lengths().length ();
|
|
1425
|
+
ASSERT_HOST (word->reject_map.length () == old_len);
|
|
1426
|
+
ASSERT_HOST (blob_it.length () == old_len);
|
|
1427
|
+
if ((old_len + rej_len) > 511)
|
|
1428
|
+
return; //Word is garbage anyway prevent abort
|
|
1429
|
+
new_map.initialise (old_len + rej_len);
|
|
1430
|
+
|
|
1431
|
+
while (!rej_blob_it.empty ()) {
|
|
1432
|
+
if ((j >= old_len) ||
|
|
1433
|
+
(rej_blob_it.data ()->bounding_box ().left () <=
|
|
1434
|
+
blob_it.data ()->bounding_box ().left ())) {
|
|
1435
|
+
/* Insert reject blob */
|
|
1436
|
+
if (j >= old_len)
|
|
1437
|
+
blob_it.add_to_end (rej_blob_it.extract ());
|
|
1438
|
+
else
|
|
1439
|
+
blob_it.add_before_stay_put (rej_blob_it.extract ());
|
|
1440
|
+
if (!rej_blob_it.empty ())
|
|
1441
|
+
rej_blob_it.forward ();
|
|
1442
|
+
new_str[i_offset] = ' ';
|
|
1443
|
+
new_lengths[i] = 1;
|
|
1444
|
+
new_map[i].setrej_rej_cblob ();
|
|
1445
|
+
i_offset += new_lengths[i++];
|
|
1446
|
+
}
|
|
1447
|
+
else {
|
|
1448
|
+
strncpy(new_str + i_offset, &(*word_str)[j_offset],
|
|
1449
|
+
(*word_lengths)[j]);
|
|
1450
|
+
new_lengths[i] = (*word_lengths)[j];
|
|
1451
|
+
new_map[i] = word->reject_map[j];
|
|
1452
|
+
i_offset += new_lengths[i++];
|
|
1453
|
+
j_offset += (*word_lengths)[j++];
|
|
1454
|
+
blob_it.forward ();
|
|
1455
|
+
}
|
|
1456
|
+
}
|
|
1457
|
+
/* Add any extra normal blobs to strings */
|
|
1458
|
+
while (j < word_lengths->length ()) {
|
|
1459
|
+
strncpy(new_str + i_offset, &(*word_str)[j_offset],
|
|
1460
|
+
(*word_lengths)[j]);
|
|
1461
|
+
new_lengths[i] = (*word_lengths)[j];
|
|
1462
|
+
new_map[i] = word->reject_map[j];
|
|
1463
|
+
i_offset += new_lengths[i++];
|
|
1464
|
+
j_offset += (*word_lengths)[j++];
|
|
1465
|
+
}
|
|
1466
|
+
new_str[i_offset] = '\0';
|
|
1467
|
+
new_lengths[i] = 0;
|
|
1468
|
+
/*
|
|
1469
|
+
tprintf(
|
|
1470
|
+
"\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
|
|
1471
|
+
old_len, i, new_str, new_map );
|
|
1472
|
+
*/
|
|
1473
|
+
ASSERT_HOST (i == blob_it.length ());
|
|
1474
|
+
ASSERT_HOST (i == old_len + rej_len);
|
|
1475
|
+
word->reject_map = new_map;
|
|
1476
|
+
*((STRING *) word_str) = new_str;
|
|
1477
|
+
*((STRING *) word_lengths) = new_lengths;
|
|
1478
|
+
new_len = word->best_choice->lengths ().length ();
|
|
1479
|
+
ASSERT_HOST (word->reject_map.length () == new_len);
|
|
1480
|
+
ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
|
|
1481
|
+
}
|