tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
@@ -0,0 +1,1458 @@
|
|
1
|
+
/******************************************************************************
|
2
|
+
** Filename: stopper.c
|
3
|
+
** Purpose: Stopping criteria for word classifier.
|
4
|
+
** Author: Dan Johnson
|
5
|
+
** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
|
6
|
+
**
|
7
|
+
** (c) Copyright Hewlett-Packard Company, 1988.
|
8
|
+
** Licensed under the Apache License, Version 2.0 (the "License");
|
9
|
+
** you may not use this file except in compliance with the License.
|
10
|
+
** You may obtain a copy of the License at
|
11
|
+
** http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
** Unless required by applicable law or agreed to in writing, software
|
13
|
+
** distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
** See the License for the specific language governing permissions and
|
16
|
+
** limitations under the License.
|
17
|
+
******************************************************************************/
|
18
|
+
/**----------------------------------------------------------------------------
|
19
|
+
Include Files and Type Defines
|
20
|
+
----------------------------------------------------------------------------**/
|
21
|
+
#include "stopper.h"
|
22
|
+
#include "emalloc.h"
|
23
|
+
#include "matchdefs.h"
|
24
|
+
#include "debug.h"
|
25
|
+
#include "callcpp.h"
|
26
|
+
#include "permute.h"
|
27
|
+
#include "context.h"
|
28
|
+
#include "permnum.h"
|
29
|
+
#include "danerror.h"
|
30
|
+
#include "const.h"
|
31
|
+
#include "freelist.h"
|
32
|
+
#include "efio.h"
|
33
|
+
#include "globals.h"
|
34
|
+
#include "scanutils.h"
|
35
|
+
#include "unichar.h"
|
36
|
+
|
37
|
+
#include <stdio.h>
|
38
|
+
#include <string.h>
|
39
|
+
#include <ctype.h>
|
40
|
+
#include <math.h>
|
41
|
+
#ifdef __UNIX__
|
42
|
+
#include <assert.h>
|
43
|
+
#endif
|
44
|
+
|
45
|
+
/* these are kludges - add appropriate .h file later */
|
46
|
+
extern float CertaintyScale; /* from subfeat.h */
|
47
|
+
|
48
|
+
#define MAX_WERD_SIZE 100
|
49
|
+
#define MAX_AMBIG_SIZE 3
|
50
|
+
#define DANGEROUS_AMBIGS "DangAmbigs"
|
51
|
+
|
52
|
+
typedef LIST AMBIG_TABLE;
|
53
|
+
|
54
|
+
typedef struct
|
55
|
+
{
|
56
|
+
UNICHAR_ID Class;
|
57
|
+
uinT16 NumChunks;
|
58
|
+
float Certainty;
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
CHAR_CHOICE;
|
63
|
+
|
64
|
+
typedef struct
|
65
|
+
{
|
66
|
+
float Rating;
|
67
|
+
float Certainty;
|
68
|
+
FLOAT32 AdjustFactor;
|
69
|
+
int Length;
|
70
|
+
CHAR_CHOICE Blob[1];
|
71
|
+
} VIABLE_CHOICE_STRUCT;
|
72
|
+
typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
|
73
|
+
|
74
|
+
typedef struct
|
75
|
+
{
|
76
|
+
VIABLE_CHOICE Choice;
|
77
|
+
float ChunkCertainty[MAX_NUM_CHUNKS];
|
78
|
+
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
|
79
|
+
}
|
80
|
+
|
81
|
+
|
82
|
+
EXPANDED_CHOICE;
|
83
|
+
|
84
|
+
typedef struct
|
85
|
+
{
|
86
|
+
char ambig[2 * (UNICHAR_LEN * MAX_AMBIG_SIZE) + 2];
|
87
|
+
char lengths[2 * (MAX_AMBIG_SIZE) + 2];
|
88
|
+
} AMBIG_SPEC;
|
89
|
+
|
90
|
+
/**----------------------------------------------------------------------------
|
91
|
+
Macros
|
92
|
+
----------------------------------------------------------------------------**/
|
93
|
+
/* Field accessors for the first node of a list of VIABLE_CHOICEs. */
#define BestCertainty(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
#define BestRating(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Rating)
#define BestFactor(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)

/* Difference of two adjust factors (F2 - F1) scaled by AmbigThresholdGain,
   less AmbigThresholdOffset. */
#define AmbigThreshold(F1,F2) \
  (((F2) - (F1)) * AmbigThresholdGain - AmbigThresholdOffset)
|
99
|
+
|
100
|
+
/*---------------------------------------------------------------------------
|
101
|
+
Private Function Prototypes
|
102
|
+
----------------------------------------------------------------------------*/
|
103
|
+
void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
|
104
|
+
|
105
|
+
int AmbigsFound(char *Word,
|
106
|
+
char *CurrentChar,
|
107
|
+
const char *Tail,
|
108
|
+
const char *Tail_lengths,
|
109
|
+
LIST Ambigs,
|
110
|
+
DANGERR *fixpt);
|
111
|
+
|
112
|
+
int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice);
|
113
|
+
|
114
|
+
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
|
115
|
+
void *arg2); //VIABLE_CHOICE Choice2);
|
116
|
+
|
117
|
+
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
|
118
|
+
|
119
|
+
AMBIG_TABLE *FillAmbigTable();
|
120
|
+
|
121
|
+
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
|
122
|
+
void *item2); //EXPANDED_CHOICE *BestChoice);
|
123
|
+
|
124
|
+
int LengthOfShortestAlphaRun(register char *Word, const char *Word_lengths);
|
125
|
+
|
126
|
+
VIABLE_CHOICE NewViableChoice (A_CHOICE * Choice,
|
127
|
+
FLOAT32 AdjustFactor, float Certainties[]);
|
128
|
+
|
129
|
+
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
|
130
|
+
|
131
|
+
void ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
|
132
|
+
A_CHOICE * NewChoice,
|
133
|
+
FLOAT32 AdjustFactor, float Certainties[]);
|
134
|
+
|
135
|
+
int StringSameAs(const char *String,
|
136
|
+
const char *String_lengths,
|
137
|
+
VIABLE_CHOICE ViableChoice);
|
138
|
+
|
139
|
+
int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice);
|
140
|
+
|
141
|
+
/**----------------------------------------------------------------------------
|
142
|
+
Global Data Definitions and Declarations
|
143
|
+
----------------------------------------------------------------------------**/
|
144
|
+
/* Name of file containing potentially dangerous ambiguities */
|
145
|
+
static const char *DangerousAmbigs = DANGEROUS_AMBIGS;
|
146
|
+
|
147
|
+
/* Word for which stopper debug information should be printed to stdout */
|
148
|
+
static char *WordToDebug = NULL;
|
149
|
+
static char *WordToDebug_lengths = NULL;
|
150
|
+
|
151
|
+
/* flag used to disable accumulation of word choices during compound word
|
152
|
+
permutation */
|
153
|
+
BOOL8 KeepWordChoices = TRUE;
|
154
|
+
|
155
|
+
/* additional certainty padding allowed before a word is rejected */
|
156
|
+
static FLOAT32 RejectOffset = 0.0;
|
157
|
+
|
158
|
+
/* structures to keep track of viable word choices */
|
159
|
+
static VIABLE_CHOICE BestRawChoice = NULL;
|
160
|
+
static LIST BestChoices = NIL;
|
161
|
+
static PIECES_STATE CurrentSegmentation;
|
162
|
+
|
163
|
+
make_float_var (NonDictCertainty, -2.50, MakeNonDictCertainty,
|
164
|
+
17, 2, SetNonDictCertainty,
|
165
|
+
"Certainty threshold for non-dict words");
|
166
|
+
|
167
|
+
make_float_var (RejectCertaintyOffset, 1.0, MakeRejectCertaintyOffset,
|
168
|
+
17, 3, SetRejectCertaintyOffset, "Reject certainty offset");
|
169
|
+
|
170
|
+
make_int_var (SmallWordSize, 2, MakeSmallWordSize,
|
171
|
+
17, 4, SetSmallWordSize,
|
172
|
+
"Size of dict word to be treated as non-dict word");
|
173
|
+
|
174
|
+
make_float_var (CertaintyPerChar, -0.50, MakeCertaintyPerChar,
|
175
|
+
17, 5, SetCertaintyPerChar,
|
176
|
+
"Certainty to add for each dict char above SmallWordSize");
|
177
|
+
|
178
|
+
make_float_var (CertaintyVariation, 3.0, MakeCertaintyVariation,
|
179
|
+
17, 6, SetCertaintyVariation,
|
180
|
+
"Max certaintly variation allowed in a word (in sigma)");
|
181
|
+
|
182
|
+
make_int_var (StopperDebugLevel, 0, MakeStopperDebugLevel,
|
183
|
+
17, 7, SetStopperDebugLevel, "Stopper debug level");
|
184
|
+
|
185
|
+
make_float_var (AmbigThresholdGain, 8.0, MakeAmbigThresholdGain,
|
186
|
+
17, 8, SetAmbigThresholdGain,
|
187
|
+
"Gain factor for ambiguity threshold");
|
188
|
+
|
189
|
+
make_float_var (AmbigThresholdOffset, 1.5, MakeAmbigThresholdOffset,
|
190
|
+
17, 9, SetAmbigThresholdOffset,
|
191
|
+
"Certainty offset for ambiguity threshold");
|
192
|
+
|
193
|
+
extern int first_pass;
|
194
|
+
INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
|
195
|
+
|
196
|
+
/**----------------------------------------------------------------------------
|
197
|
+
Public Code
|
198
|
+
----------------------------------------------------------------------------**/
|
199
|
+
/*---------------------------------------------------------------------------*/
|
200
|
+
int AcceptableChoice(CHOICES_LIST Choices,
                     A_CHOICE *BestChoice,
                     A_CHOICE *RawChoice,
                     DANGERR *fixpt) {
/*
 **	Parameters:
 **		Choices		choices for the current segmentation
 **		BestChoice	best choice for the current segmentation
 **		RawChoice	best raw choice (not referenced in this routine)
 **		fixpt		optional; its index is reset to -1 on entry and
 **				is also handed to NoDangerousAmbig
 **	Globals:
 **		NonDictCertainty	starting certainty threshold
 **		SmallWordSize		subtracted from the alpha-run length
 **		CertaintyPerChar	threshold change per extra character
 **		StopperDebugLevel	>= 1 enables the trace output
 **	Operation: Decides whether the result of this segmentation is good
 **		enough to stop.  The threshold starts at NonDictCertainty and
 **		is adjusted when the word passes the dictionary, case and
 **		punctuation checks, or (if stopper_numbers_on) the number
 **		check.  The choice is accepted only if it has no dangerous
 **		ambiguity, its certainty exceeds the threshold and
 **		UniformCertainties agrees.
 **	Return: TRUE to stop, FALSE otherwise (including a NULL BestChoice
 **		or a NULL string in BestChoice).
 */
  float Threshold = NonDictCertainty;
  int ExtraChars;

  if (fixpt != NULL)
    fixpt->index = -1;

  if (BestChoice == NULL || class_string (BestChoice) == NULL)
    return (FALSE);

  if (StopperDebugLevel >= 1) {
    cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n",
             class_string (BestChoice),
             (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
             (case_ok (class_string (BestChoice),
                       class_lengths (BestChoice)) ? 'y' : 'n'),
             ((punctuation_ok (class_string (BestChoice),
                               class_lengths (BestChoice)) != -1)
              ? 'y' : 'n'));
  }

  if (valid_word (class_string (BestChoice)) &&
      case_ok (class_string (BestChoice), class_lengths (BestChoice)) &&
      punctuation_ok (class_string (BestChoice),
                      class_lengths (BestChoice)) != -1) {
    /* only characters beyond SmallWordSize move the threshold */
    ExtraChars = LengthOfShortestAlphaRun (class_string (BestChoice),
                                           class_lengths (BestChoice));
    ExtraChars -= SmallWordSize;
    if (ExtraChars < 0)
      ExtraChars = 0;
    Threshold += ExtraChars * CertaintyPerChar;
  }
  else if (stopper_numbers_on &&
           valid_number (class_string (BestChoice),
                         class_lengths (BestChoice))) {
    Threshold += stopper_numbers_on * CertaintyPerChar;
  }

  if (StopperDebugLevel >= 1) {
    cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
             class_certainty (BestChoice), Threshold);
  }

  /* the three tests are evaluated in this order and stop at the first
     failure */
  if (!NoDangerousAmbig (class_string (BestChoice),
                         class_lengths (BestChoice), fixpt))
    return (FALSE);
  if (!(class_certainty (BestChoice) > Threshold))
    return (FALSE);
  if (!UniformCertainties (Choices, BestChoice))
    return (FALSE);

  return (TRUE);

}                                /* AcceptableChoice */
|
266
|
+
|
267
|
+
|
268
|
+
/*---------------------------------------------------------------------------*/
|
269
|
+
int AcceptableResult(A_CHOICE *BestChoice, A_CHOICE *RawChoice) {
/*
 **	Parameters:
 **		BestChoice	best choice for current word (may be NULL)
 **		RawChoice	best raw choice for current word (not referenced
 **				in this routine)
 **	Globals:
 **		NonDictCertainty	certainty for a non-dict word
 **		SmallWordSize		size of word to be treated as non-word
 **		CertaintyPerChar	certainty to add for each dict char
 **		BestChoices		list of all good choices found
 **		RejectOffset		allowed offset before a word is rejected
 **		StopperDebugLevel	>= 1 enables the trace output
 **	Operation: Return FALSE if the best choice for the current word
 **		is questionable and should be tried again on the second
 **		pass or should be flagged to the user.  A NULL BestChoice, a
 **		NULL string in BestChoice, or an ambiguous current word
 **		(CurrentWordAmbig) is always rejected.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 */
  float CertaintyThreshold = NonDictCertainty - RejectOffset;
  int WordSize;

  /* Validate BestChoice before anything touches it: the debug trace
     below passes its string to cprintf, valid_word, case_ok and
     punctuation_ok, so it must not run on a NULL choice or string. */
  if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
    return (FALSE);

  if (StopperDebugLevel >= 1)
    cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
             class_string (BestChoice),
             (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
             (case_ok (class_string (BestChoice),
                       class_lengths (BestChoice)) ? 'y' : 'n'),
             ((punctuation_ok (class_string (BestChoice),
                               class_lengths (BestChoice)) != -1) ? 'y' : 'n'),
             ((rest (BestChoices) != NIL) ? 'n' : 'y'));

  /* more than one surviving choice means the word is ambiguous */
  if (CurrentWordAmbig ())
    return (FALSE);

  if (valid_word (class_string (BestChoice)) &&
      case_ok (class_string (BestChoice), class_lengths (BestChoice)) &&
      punctuation_ok (class_string (BestChoice),
                      class_lengths (BestChoice)) != -1) {
    /* only characters beyond SmallWordSize move the threshold */
    WordSize = LengthOfShortestAlphaRun (class_string (BestChoice),
                                         class_lengths (BestChoice));
    WordSize -= SmallWordSize;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * CertaintyPerChar;
  }

  if (StopperDebugLevel >= 1)
    cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
             class_certainty (BestChoice), CertaintyThreshold);

  if (class_certainty (BestChoice) > CertaintyThreshold) {
    if (StopperDebugLevel >= 1)
      cprintf ("ACCEPTED\n");
    return (TRUE);
  }
  else {
    if (StopperDebugLevel >= 1)
      cprintf ("REJECTED\n");
    return (FALSE);
  }
}                                /* AcceptableResult */
|
331
|
+
|
332
|
+
|
333
|
+
/*---------------------------------------------------------------------------*/
|
334
|
+
int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
/*
 **	Parameters:
 **		Threshold	adjust factor every alternative must exceed
 **	Globals:
 **		BestChoices	choices for the current word; the first entry
 **				is skipped, the remainder are the alternatives
 **	Operation: Walks the alternatives and fails as soon as one has an
 **		AdjustFactor less than or equal to Threshold.
 **	Return: TRUE if there are no alternatives or all of them have an
 **		AdjustFactor greater than Threshold, FALSE otherwise.
 */
  LIST Others = rest (BestChoices);

  iterate(Others) {
    if (((VIABLE_CHOICE) first_node (Others))->AdjustFactor <= Threshold)
      return (FALSE);
  }

  return (TRUE);

}                                /* AlternativeChoicesWorseThan */
|
360
|
+
|
361
|
+
|
362
|
+
/*---------------------------------------------------------------------------*/
|
363
|
+
int CurrentBestChoiceIs(const char *Word, const char *Word_lengths) {
|
364
|
+
/*
|
365
|
+
** Parameters:
|
366
|
+
** Word string to compare to current best choice
|
367
|
+
** Word_lengths lengths of unichars in Word
|
368
|
+
** Globals:
|
369
|
+
** BestChoices set of best choices for current word
|
370
|
+
** Operation: Returns TRUE if Word is the same as the current best
|
371
|
+
** choice, FALSE otherwise.
|
372
|
+
** Return: TRUE or FALSE
|
373
|
+
** Exceptions: none
|
374
|
+
** History: Thu May 30 14:44:22 1991, DSJ, Created.
|
375
|
+
*/
|
376
|
+
return (BestChoices != NIL &&
|
377
|
+
StringSameAs (Word, Word_lengths,
|
378
|
+
(VIABLE_CHOICE) first_node (BestChoices)));
|
379
|
+
|
380
|
+
} /* CurrentBestChoiceIs */
|
381
|
+
|
382
|
+
|
383
|
+
/*---------------------------------------------------------------------------*/
|
384
|
+
FLOAT32 CurrentBestChoiceAdjustFactor() {
/*
 **	Parameters: none
 **	Globals:
 **		BestChoices	choices for the current word
 **	Operation: Looks up the AdjustFactor of the first entry of
 **		BestChoices.
 **	Return: that AdjustFactor, or MAX_FLOAT32 when BestChoices is empty.
 */
  if (BestChoices != NIL)
    return (((VIABLE_CHOICE) first_node (BestChoices))->AdjustFactor);

  return (MAX_FLOAT32);

}                                /* CurrentBestChoiceAdjustFactor */
|
404
|
+
|
405
|
+
|
406
|
+
/*---------------------------------------------------------------------------*/
|
407
|
+
int CurrentWordAmbig() {
|
408
|
+
/*
|
409
|
+
** Parameters: none
|
410
|
+
** Globals:
|
411
|
+
** BestChoices set of best choices for current word
|
412
|
+
** Operation: This routine returns TRUE if there are multiple good
|
413
|
+
** choices for the current word and FALSE otherwise.
|
414
|
+
** Return: TRUE or FALSE
|
415
|
+
** Exceptions: none
|
416
|
+
** History: Wed May 22 15:38:38 1991, DSJ, Created.
|
417
|
+
*/
|
418
|
+
return (rest (BestChoices) != NIL);
|
419
|
+
|
420
|
+
} /* CurrentWordAmbig */
|
421
|
+
|
422
|
+
|
423
|
+
/*---------------------------------------------------------------------------*/
|
424
|
+
void DebugWordChoices() {
|
425
|
+
/*
|
426
|
+
** Parameters: none
|
427
|
+
** Globals:
|
428
|
+
** BestRawChoice
|
429
|
+
** BestChoices
|
430
|
+
** Operation: Print the current choices for this word to stdout.
|
431
|
+
** Return: none
|
432
|
+
** Exceptions: none
|
433
|
+
** History: Wed May 15 13:52:08 1991, DSJ, Created.
|
434
|
+
*/
|
435
|
+
LIST Choices;
|
436
|
+
int i;
|
437
|
+
char LabelString[80];
|
438
|
+
|
439
|
+
if (StopperDebugLevel >= 1 ||
|
440
|
+
(WordToDebug && BestChoices &&
|
441
|
+
StringSameAs (WordToDebug, WordToDebug_lengths,
|
442
|
+
(VIABLE_CHOICE) first_node (BestChoices)))) {
|
443
|
+
if (BestRawChoice)
|
444
|
+
PrintViableChoice (stderr, "\nBest Raw Choice: ", BestRawChoice);
|
445
|
+
|
446
|
+
i = 1;
|
447
|
+
Choices = BestChoices;
|
448
|
+
if (Choices)
|
449
|
+
cprintf ("\nBest Cooked Choices:\n");
|
450
|
+
iterate(Choices) {
|
451
|
+
sprintf (LabelString, "Cooked Choice #%d: ", i);
|
452
|
+
PrintViableChoice (stderr, LabelString,
|
453
|
+
(VIABLE_CHOICE) first_node (Choices));
|
454
|
+
i++;
|
455
|
+
}
|
456
|
+
}
|
457
|
+
} /* DebugWordChoices */
|
458
|
+
|
459
|
+
|
460
|
+
/*---------------------------------------------------------------------------*/
|
461
|
+
void FilterWordChoices() {
|
462
|
+
/*
|
463
|
+
** Parameters: none
|
464
|
+
** Globals:
|
465
|
+
** BestChoices set of choices for current word
|
466
|
+
** Operation: This routine removes from BestChoices all choices which
|
467
|
+
** are not within a reasonable range of the best choice.
|
468
|
+
** Return: none
|
469
|
+
** Exceptions: none
|
470
|
+
** History: Wed May 15 13:08:24 1991, DSJ, Created.
|
471
|
+
*/
|
472
|
+
EXPANDED_CHOICE BestChoice;
|
473
|
+
|
474
|
+
if (BestChoices == NIL || second_node (BestChoices) == NIL)
|
475
|
+
return;
|
476
|
+
|
477
|
+
/* compute certainties and class for each chunk in best choice */
|
478
|
+
ExpandChoice ((VIABLE_CHOICE_STRUCT *) first_node (BestChoices), &BestChoice);
|
479
|
+
|
480
|
+
set_rest (BestChoices, delete_d (rest (BestChoices),
|
481
|
+
&BestChoice, FreeBadChoice));
|
482
|
+
|
483
|
+
} /* FilterWordChoices */
|
484
|
+
|
485
|
+
|
486
|
+
/*---------------------------------------------------------------------------*/
|
487
|
+
void
|
488
|
+
FindClassifierErrors (FLOAT32 MinRating,
|
489
|
+
FLOAT32 MaxRating,
|
490
|
+
FLOAT32 RatingMargin, FLOAT32 Thresholds[]) {
|
491
|
+
/*
|
492
|
+
** Parameters:
|
493
|
+
** MinRating limits how tight to make a template
|
494
|
+
** MaxRating limits how loose to make a template
|
495
|
+
** RatingMargin amount of margin to put in template
|
496
|
+
** Thresholds[] place to put error thresholds
|
497
|
+
** Globals: none
|
498
|
+
** Operation: This routine compares the best choice for the current
|
499
|
+
** word to the best raw choice to determine which characters
|
500
|
+
** were classified incorrectly by the classifier. It then
|
501
|
+
** places a separate threshold into Thresholds for each
|
502
|
+
** character in the word. If the classifier was correct,
|
503
|
+
** MaxRating is placed into Thresholds. If the
|
504
|
+
** classifier was incorrect, the avg. match rating (error
|
505
|
+
** percentage) of the classifier's incorrect choice minus
|
506
|
+
** some margin is
|
507
|
+
** placed into thresholds. This can then be used by the
|
508
|
+
** caller to try to create a new template for the desired
|
509
|
+
** class that will classify the character with a rating better
|
510
|
+
** than the threshold value. The match rating placed into
|
511
|
+
** Thresholds is never allowed to be below MinRating in order
|
512
|
+
** to prevent trying to make overly tight templates.
|
513
|
+
** Return: none (results are placed in Thresholds)
|
514
|
+
** Exceptions: none
|
515
|
+
** History: Fri May 31 16:02:57 1991, DSJ, Created.
|
516
|
+
*/
|
517
|
+
EXPANDED_CHOICE BestRaw;
|
518
|
+
VIABLE_CHOICE Choice;
|
519
|
+
int i, j, Chunk;
|
520
|
+
FLOAT32 AvgRating;
|
521
|
+
int NumErrorChunks;
|
522
|
+
|
523
|
+
assert (BestChoices != NIL);
|
524
|
+
assert (BestRawChoice != NULL);
|
525
|
+
|
526
|
+
ExpandChoice(BestRawChoice, &BestRaw);
|
527
|
+
Choice = (VIABLE_CHOICE) first_node (BestChoices);
|
528
|
+
|
529
|
+
for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
|
530
|
+
AvgRating = 0.0;
|
531
|
+
NumErrorChunks = 0;
|
532
|
+
|
533
|
+
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
|
534
|
+
if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
|
535
|
+
AvgRating += BestRaw.ChunkCertainty[Chunk];
|
536
|
+
NumErrorChunks++;
|
537
|
+
}
|
538
|
+
|
539
|
+
if (NumErrorChunks > 0) {
|
540
|
+
AvgRating /= NumErrorChunks;
|
541
|
+
*Thresholds = (AvgRating / -CertaintyScale) * (1.0 - RatingMargin);
|
542
|
+
}
|
543
|
+
else
|
544
|
+
*Thresholds = MaxRating;
|
545
|
+
|
546
|
+
if (*Thresholds > MaxRating)
|
547
|
+
*Thresholds = MaxRating;
|
548
|
+
if (*Thresholds < MinRating)
|
549
|
+
*Thresholds = MinRating;
|
550
|
+
}
|
551
|
+
} /* FindClassifierErrors */
|
552
|
+
|
553
|
+
|
554
|
+
/*---------------------------------------------------------------------------*/
|
555
|
+
void InitStopperVars() {
/*
 **	Parameters: none
 **	Globals:
 **		DangerousAmbigs, WordToDebug, WordToDebug_lengths
 **				registered as string variables
 **	Operation: Registers the three string variables with their default
 **		values and then calls the Make...() routine of each stopper
 **		control variable.
 **	Return: none
 */
  VALUE dummy;                   /* presumably referenced by the
                                    string_variable macro -- TODO confirm */

  /* string-valued controls */
  string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
  string_variable (WordToDebug, "WordToDebug", "");
  string_variable (WordToDebug_lengths, "WordToDebug_lengths", "");

  /* numeric controls */
  MakeNonDictCertainty();
  MakeRejectCertaintyOffset();
  MakeSmallWordSize();
  MakeCertaintyPerChar();
  MakeCertaintyVariation();
  MakeStopperDebugLevel();
  MakeAmbigThresholdGain();
  MakeAmbigThresholdOffset();
}                                /* InitStopperVars */
|
579
|
+
|
580
|
+
|
581
|
+
/*---------------------------------------------------------------------------*/
|
582
|
+
void InitChoiceAccum() {
|
583
|
+
/*
|
584
|
+
** Parameters: none
|
585
|
+
** Globals: none
|
586
|
+
** Operation: This routine initializes the data structures used to
|
587
|
+
** keep track the good word choices found for a word.
|
588
|
+
** Return: none
|
589
|
+
** Exceptions: none
|
590
|
+
** History: Fri May 17 07:59:00 1991, DSJ, Created.
|
591
|
+
*/
|
592
|
+
BLOB_WIDTH *BlobWidth, *End;
|
593
|
+
|
594
|
+
if (BestRawChoice)
|
595
|
+
memfree(BestRawChoice);
|
596
|
+
|
597
|
+
if (BestChoices)
|
598
|
+
destroy_nodes(BestChoices, memfree);
|
599
|
+
|
600
|
+
BestRawChoice = NULL;
|
601
|
+
BestChoices = NIL;
|
602
|
+
EnableChoiceAccum();
|
603
|
+
|
604
|
+
for (BlobWidth = CurrentSegmentation,
|
605
|
+
End = CurrentSegmentation + MAX_NUM_CHUNKS;
|
606
|
+
BlobWidth < End; *BlobWidth++ = 1);
|
607
|
+
|
608
|
+
} /* InitChoiceAccum */
|
609
|
+
|
610
|
+
|
611
|
+
/*---------------------------------------------------------------------------*/
|
612
|
+
void
|
613
|
+
LogNewRawChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
|
614
|
+
/*
|
615
|
+
** Parameters:
|
616
|
+
** Choice new raw choice for current word
|
617
|
+
** AdjustFactor adjustment factor which was applied to choice
|
618
|
+
** Certainties certainties for each char in new choice
|
619
|
+
** Globals:
|
620
|
+
** BestRawChoice best raw choice so far for current word
|
621
|
+
** Operation: This routine compares Choice to the best raw (non-dict)
|
622
|
+
** choice so far and replaces it if the new choice is better.
|
623
|
+
** Return: none
|
624
|
+
** Exceptions: none
|
625
|
+
** History: Wed May 15 09:57:19 1991, DSJ, Created.
|
626
|
+
*/
|
627
|
+
if (!KeepWordChoices)
|
628
|
+
return;
|
629
|
+
|
630
|
+
if (!BestRawChoice)
|
631
|
+
BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
|
632
|
+
else if (class_probability (Choice) < BestRawChoice->Rating) {
|
633
|
+
if (ChoiceSameAs (Choice, BestRawChoice))
|
634
|
+
ReplaceDuplicateChoice(BestRawChoice, Choice, AdjustFactor, Certainties);
|
635
|
+
else {
|
636
|
+
memfree(BestRawChoice);
|
637
|
+
BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
|
638
|
+
}
|
639
|
+
}
|
640
|
+
} /* LogNewRawChoice */
|
641
|
+
|
642
|
+
|
643
|
+
/*---------------------------------------------------------------------------*/
|
644
|
+
void LogNewSegmentation(PIECES_STATE BlobWidth) {
/*
 **	Parameters:
 **		BlobWidth[]	zero-terminated list of blob widths
 **	Globals:
 **		CurrentSegmentation	receives a copy of BlobWidth
 **	Operation: Copies entries from BlobWidth into CurrentSegmentation
 **		up to, but not including, the first zero entry, then writes
 **		a terminating zero.  No bound other than the zero entry is
 **		checked here.
 **	Return: none
 */
  int Slot = 0;

  while (BlobWidth[Slot] != 0) {
    CurrentSegmentation[Slot] = BlobWidth[Slot];
    Slot++;
  }
  CurrentSegmentation[Slot] = 0;

}                                /* LogNewSegmentation */
|
664
|
+
|
665
|
+
|
666
|
+
/*---------------------------------------------------------------------------*/
|
667
|
+
void LogNewSplit(int Blob) {
|
668
|
+
/*
|
669
|
+
** Parameters:
|
670
|
+
** Blob index of blob that was split
|
671
|
+
** Globals:
|
672
|
+
** BestRawChoice current best raw choice
|
673
|
+
** BestChoices list of best choices found so far
|
674
|
+
** Operation: This routine adds 1 chunk to the specified blob for each
|
675
|
+
** choice in BestChoices and for the BestRawChoice.
|
676
|
+
** Return: none
|
677
|
+
** Exceptions: none
|
678
|
+
** History: Mon May 20 11:38:56 1991, DSJ, Created.
|
679
|
+
*/
|
680
|
+
LIST Choices;
|
681
|
+
|
682
|
+
if (BestRawChoice) {
|
683
|
+
AddNewChunk(BestRawChoice, Blob);
|
684
|
+
}
|
685
|
+
|
686
|
+
Choices = BestChoices;
|
687
|
+
iterate(Choices) {
|
688
|
+
AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
|
689
|
+
}
|
690
|
+
|
691
|
+
} /* LogNewSplit */
|
692
|
+
|
693
|
+
|
694
|
+
/*---------------------------------------------------------------------------*/
|
695
|
+
void
|
696
|
+
LogNewWordChoice (A_CHOICE * Choice,
|
697
|
+
FLOAT32 AdjustFactor, float Certainties[]) {
|
698
|
+
/*
|
699
|
+
** Parameters:
|
700
|
+
** Choice new choice for current word
|
701
|
+
** AdjustFactor adjustment factor which was applied to choice
|
702
|
+
** Certainties certainties for each char in new choice
|
703
|
+
** Globals:
|
704
|
+
** BestChoices best choices so far for current word
|
705
|
+
** Operation: This routine adds Choice to BestChoices if the
|
706
|
+
** adjusted certainty for Choice is within a reasonable range
|
707
|
+
** of the best choice in BestChoices. The BestChoices
|
708
|
+
** list is kept in sorted order by rating. Duplicates are
|
709
|
+
** removed.
|
710
|
+
** Return: none
|
711
|
+
** Exceptions: none
|
712
|
+
** History: Wed May 15 09:57:19 1991, DSJ, Created.
|
713
|
+
*/
|
714
|
+
VIABLE_CHOICE NewChoice;
|
715
|
+
LIST Choices;
|
716
|
+
FLOAT32 Threshold;
|
717
|
+
|
718
|
+
if (!KeepWordChoices)
|
719
|
+
return;
|
720
|
+
|
721
|
+
/* throw out obviously bad choices to save some work */
|
722
|
+
if (BestChoices != NIL) {
|
723
|
+
Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
|
724
|
+
if (Threshold > -AmbigThresholdOffset)
|
725
|
+
Threshold = -AmbigThresholdOffset;
|
726
|
+
if (class_certainty (Choice) - BestCertainty (BestChoices) < Threshold)
|
727
|
+
return;
|
728
|
+
}
|
729
|
+
|
730
|
+
/* see if a choice with the same text string has already been found */
|
731
|
+
NewChoice = NULL;
|
732
|
+
Choices = BestChoices;
|
733
|
+
iterate(Choices) {
|
734
|
+
if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices))) {
|
735
|
+
if (class_probability (Choice) < BestRating (Choices))
|
736
|
+
NewChoice = (VIABLE_CHOICE) first_node (Choices);
|
737
|
+
else
|
738
|
+
return;
|
739
|
+
}
|
740
|
+
}
|
741
|
+
|
742
|
+
if (NewChoice) {
|
743
|
+
ReplaceDuplicateChoice(NewChoice, Choice, AdjustFactor, Certainties);
|
744
|
+
BestChoices = delete_d (BestChoices, NewChoice, is_same_node);
|
745
|
+
}
|
746
|
+
else {
|
747
|
+
NewChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
|
748
|
+
}
|
749
|
+
|
750
|
+
BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
|
751
|
+
if (StopperDebugLevel >= 2)
|
752
|
+
PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
|
753
|
+
if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
|
754
|
+
Choices =
|
755
|
+
(LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
|
756
|
+
destroy_nodes (rest (Choices), Efree);
|
757
|
+
set_rest(Choices, NIL);
|
758
|
+
}
|
759
|
+
|
760
|
+
} /* LogNewWordChoice */
|
761
|
+
|
762
|
+
|
763
|
+
/*---------------------------------------------------------------------------*/
|
764
|
+
static AMBIG_TABLE *AmbigFor = NULL;
|
765
|
+
|
766
|
+
int NoDangerousAmbig(const char *Word,
|
767
|
+
const char *Word_lengths,
|
768
|
+
DANGERR *fixpt) {
|
769
|
+
/*
|
770
|
+
** Parameters:
|
771
|
+
** Word word to check for dangerous ambiguities
|
772
|
+
** Word_lengths lengths of unichars in Word
|
773
|
+
** Globals: none
|
774
|
+
** Operation: This word checks each letter in word against a list
|
775
|
+
** of potentially ambiguous characters. If a match is found
|
776
|
+
** that letter is replaced with its ambiguity and tested in
|
777
|
+
** the dictionary. If the ambiguous word is found in the
|
778
|
+
** dictionary, FALSE is returned. Otherwise, the search
|
779
|
+
** continues for other ambiguities. If no ambiguities that
|
780
|
+
** match in the dictionary are found, TRUE is returned.
|
781
|
+
** Return: TRUE if Word contains no dangerous ambiguities.
|
782
|
+
** Exceptions: none
|
783
|
+
** History: Mon May 6 16:28:56 1991, DSJ, Created.
|
784
|
+
*/
|
785
|
+
|
786
|
+
char NewWord[MAX_WERD_SIZE * UNICHAR_LEN + 1];
|
787
|
+
char *NextNewChar;
|
788
|
+
int bad_index = 0;
|
789
|
+
|
790
|
+
if (!AmbigFor)
|
791
|
+
AmbigFor = FillAmbigTable ();
|
792
|
+
|
793
|
+
NextNewChar = NewWord;
|
794
|
+
while (*Word)
|
795
|
+
if (AmbigsFound (NewWord, NextNewChar,
|
796
|
+
Word + *Word_lengths, Word_lengths + 1,
|
797
|
+
AmbigFor[unicharset.unichar_to_id(Word, *Word_lengths)],
|
798
|
+
fixpt)) {
|
799
|
+
if (fixpt != NULL)
|
800
|
+
fixpt->index = bad_index;
|
801
|
+
return (FALSE);
|
802
|
+
}
|
803
|
+
else {
|
804
|
+
strncpy(NextNewChar, Word, *Word_lengths);
|
805
|
+
NextNewChar += *Word_lengths;
|
806
|
+
Word += *Word_lengths;
|
807
|
+
Word_lengths++;
|
808
|
+
bad_index++;
|
809
|
+
}
|
810
|
+
|
811
|
+
return (TRUE);
|
812
|
+
|
813
|
+
} /* NoDangerousAmbig */
|
814
|
+
|
815
|
+
void EndDangerousAmbigs() {
|
816
|
+
if (AmbigFor != NULL) {
|
817
|
+
for (int i = 0; i <= MAX_CLASS_ID; ++i) {
|
818
|
+
destroy_nodes(AmbigFor[i], Efree);
|
819
|
+
}
|
820
|
+
Efree(AmbigFor);
|
821
|
+
AmbigFor = NULL;
|
822
|
+
}
|
823
|
+
}
|
824
|
+
|
825
|
+
/*---------------------------------------------------------------------------*/
|
826
|
+
void SettupStopperPass1() {
/*
 **  Parameters: none
 **  Globals:
 **    RejectOffset  set to 0.0 by this routine
 **  Operation: Prepare the stopper variables for the first pass; the only
 **    variable touched is RejectOffset, which is reset to zero.
 **  Return: none
 **  Exceptions: none
 **  History: Mon Jun  3 12:32:00 1991, DSJ, Created.
 */
  RejectOffset = 0.0;
}                                /* SettupStopperPass1 */
|
839
|
+
|
840
|
+
|
841
|
+
/*---------------------------------------------------------------------------*/
|
842
|
+
void SettupStopperPass2() {
/*
 **  Parameters: none
 **  Globals:
 **    RejectOffset           set by this routine
 **    RejectCertaintyOffset  value copied into RejectOffset
 **  Operation: Prepare the stopper variables for the second pass; the
 **    only variable touched is RejectOffset, which takes the value of
 **    RejectCertaintyOffset.
 **  Return: none
 **  Exceptions: none
 **  History: Mon Jun  3 12:32:00 1991, DSJ, Created.
 */
  RejectOffset = RejectCertaintyOffset;
}                                /* SettupStopperPass2 */
|
855
|
+
|
856
|
+
|
857
|
+
/**----------------------------------------------------------------------------
|
858
|
+
Private Code
|
859
|
+
----------------------------------------------------------------------------**/
|
860
|
+
/*---------------------------------------------------------------------------*/
|
861
|
+
void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
/*
 **  Parameters:
 **    Choice  choice whose chunk counts are to be updated
 **    Blob    index of the blob (chunk) being split
 **  Globals: none
 **  Operation: Accumulate the chunk counts of the characters in Choice
 **    until the running total exceeds Blob; that character owns the
 **    chunk, so its NumChunks is incremented.  If no character owns the
 **    chunk a diagnostic is printed and an assertion fails.
 **  Return: none
 **  Exceptions: none
 **  History: Mon May 20 11:43:27 1991, DSJ, Created.
 */
  int CharIndex = 0;
  int ChunksSoFar = 0;

  while (CharIndex < Choice->Length) {
    ChunksSoFar += Choice->Blob[CharIndex].NumChunks;
    if (Blob < ChunksSoFar) {
      (Choice->Blob[CharIndex].NumChunks)++;
      return;
    }
    CharIndex++;
  }

  /* Blob lies beyond the last chunk of Choice */
  mem_tidy (1);
  cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
           Choice->Length, ChunksSoFar, Blob);
  assert(FALSE);                 /* this should never get executed */

}                                /* AddNewChunk */
|
888
|
+
|
889
|
+
|
890
|
+
/*---------------------------------------------------------------------------*/
|
891
|
+
int AmbigsFound(char *Word,
|
892
|
+
char *CurrentChar,
|
893
|
+
const char *Tail,
|
894
|
+
const char *Tail_lengths,
|
895
|
+
LIST Ambigs,
|
896
|
+
DANGERR *fixpt) {
|
897
|
+
/*
|
898
|
+
** Parameters:
|
899
|
+
** Word word being tested for ambiguities
|
900
|
+
** CurrentChar position in Word to put ambig replacement
|
901
|
+
** Tail end of word to place after ambiguity
|
902
|
+
** Tail_lengths lengths of the unichars in Tail
|
903
|
+
** Ambigs list of ambiguities to test at this position
|
904
|
+
** Globals: none
|
905
|
+
** Operation: For each ambiguity in Ambigs, see if the remainder of
|
906
|
+
** the test string matches the start of Tail. If it does,
|
907
|
+
** construct a word consisting of the contents of Word up to,
|
908
|
+
** but not including, CurrentChar followed by the replacement
|
909
|
+
** string for the ambiguity followed by the unmatched
|
910
|
+
** contents of Tail. Then test this word to see if it
|
911
|
+
** is a dictionary word. If it is return TRUE. If none of
|
912
|
+
** the ambiguities result in a dictionary word, return FALSE.
|
913
|
+
** Return: TRUE if the Word is ambiguous at the specified position
|
914
|
+
** Exceptions: none
|
915
|
+
** History: Thu May 9 10:10:28 1991, DSJ, Created.
|
916
|
+
*/
|
917
|
+
AMBIG_SPEC *AmbigSpec;
|
918
|
+
char *ambig;
|
919
|
+
char *ambig_lengths;
|
920
|
+
const char *UnmatchedTail;
|
921
|
+
const char *UnmatchedTail_lengths;
|
922
|
+
int Matches;
|
923
|
+
int bad_length;
|
924
|
+
|
925
|
+
iterate(Ambigs) {
|
926
|
+
AmbigSpec = (AMBIG_SPEC *) first_node (Ambigs);
|
927
|
+
ambig = AmbigSpec->ambig;
|
928
|
+
ambig_lengths = AmbigSpec->lengths;
|
929
|
+
bad_length = 1;
|
930
|
+
UnmatchedTail = Tail;
|
931
|
+
UnmatchedTail_lengths = Tail_lengths;
|
932
|
+
Matches = TRUE;
|
933
|
+
|
934
|
+
while (*ambig != ' ' && Matches)
|
935
|
+
if (*UnmatchedTail_lengths == *ambig_lengths &&
|
936
|
+
strncmp(ambig, UnmatchedTail, *ambig_lengths) == 0) {
|
937
|
+
ambig += *(ambig_lengths++);
|
938
|
+
UnmatchedTail += *(UnmatchedTail_lengths++);
|
939
|
+
bad_length++;
|
940
|
+
}
|
941
|
+
else
|
942
|
+
Matches = FALSE;
|
943
|
+
|
944
|
+
if (Matches) {
|
945
|
+
ambig += *(ambig_lengths++); /* skip over the space */
|
946
|
+
/* insert replacement string */
|
947
|
+
strcpy(CurrentChar, ambig);
|
948
|
+
/* add tail */
|
949
|
+
strcat(Word, UnmatchedTail);
|
950
|
+
if (valid_word (Word)) {
|
951
|
+
if (StopperDebugLevel >= 1)
|
952
|
+
cprintf ("Stopper: Possible ambiguous word = %s\n", Word);
|
953
|
+
if (fixpt != NULL) {
|
954
|
+
fixpt->good_length = strlen (ambig_lengths);
|
955
|
+
fixpt->bad_length = bad_length;
|
956
|
+
}
|
957
|
+
return (TRUE);
|
958
|
+
}
|
959
|
+
}
|
960
|
+
}
|
961
|
+
return (FALSE);
|
962
|
+
|
963
|
+
} /* AmbigsFound */
|
964
|
+
|
965
|
+
|
966
|
+
/*---------------------------------------------------------------------------*/
|
967
|
+
int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice) {
/*
 **  Parameters:
 **    Choice        choice whose string is compared
 **    ViableChoice  viable choice to compare against
 **  Globals: none
 **  Operation: Extract the string and unichar lengths of Choice and let
 **    StringSameAs compare them with ViableChoice.
 **  Return: the result of StringSameAs (TRUE or FALSE).
 **  Exceptions: none
 **  History: Fri May 17 08:48:04 1991, DSJ, Created.
 */
  const char *ChoiceString = class_string (Choice);
  const char *ChoiceLengths = class_lengths (Choice);

  return (StringSameAs (ChoiceString, ChoiceLengths, ViableChoice));

}                                /* ChoiceSameAs */
|
984
|
+
|
985
|
+
|
986
|
+
/*---------------------------------------------------------------------------*/
|
987
|
+
int CmpChoiceRatings(void *arg1, //VIABLE_CHOICE Choice1,
|
988
|
+
void *arg2) { //VIABLE_CHOICE Choice2)
|
989
|
+
/*
|
990
|
+
** Parameters:
|
991
|
+
** Choice1, Choice2 choices to compare ratings for
|
992
|
+
** Globals: none
|
993
|
+
** Operation: Return -1 if the rating for Choice1 is less than the
|
994
|
+
** rating for Choice2, otherwise return (1).
|
995
|
+
** Return: -1 or 1
|
996
|
+
** Exceptions: none
|
997
|
+
** History: Wed May 15 13:02:37 1991, DSJ, Created.
|
998
|
+
*/
|
999
|
+
float R1, R2;
|
1000
|
+
VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
|
1001
|
+
VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
|
1002
|
+
|
1003
|
+
R1 = Choice1->Rating;
|
1004
|
+
R2 = Choice2->Rating;
|
1005
|
+
|
1006
|
+
if (R1 < R2)
|
1007
|
+
return (-1);
|
1008
|
+
else
|
1009
|
+
return (1);
|
1010
|
+
|
1011
|
+
} /* CmpChoiceRatings */
|
1012
|
+
|
1013
|
+
|
1014
|
+
/*---------------------------------------------------------------------------*/
|
1015
|
+
void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
/*
 **  Parameters:
 **    Choice          choice to expand
 **    ExpandedChoice  receives the expanded form of Choice
 **  Globals: none
 **  Operation: Store Choice in ExpandedChoice and fill two per-chunk
 **    arrays: every chunk belonging to a character gets that character's
 **    certainty in ChunkCertainty and its class in ChunkClass.
 **  Return: none (results are placed in ExpandedChoice)
 **  Exceptions: none
 **  History: Fri May 31 15:21:57 1991, DSJ, Created.
 */
  int CharIndex;
  int Repeat;
  int Chunk = 0;

  ExpandedChoice->Choice = Choice;

  for (CharIndex = 0; CharIndex < Choice->Length; CharIndex++) {
    /* one output slot per chunk covered by this character */
    for (Repeat = 0; Repeat < Choice->Blob[CharIndex].NumChunks; Repeat++) {
      ExpandedChoice->ChunkCertainty[Chunk] =
        Choice->Blob[CharIndex].Certainty;
      ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[CharIndex].Class;
      Chunk++;
    }
  }
}                                /* ExpandChoice */
|
1039
|
+
|
1040
|
+
|
1041
|
+
/*---------------------------------------------------------------------------*/
|
1042
|
+
AMBIG_TABLE *FillAmbigTable() {
|
1043
|
+
/*
|
1044
|
+
** Parameters: none
|
1045
|
+
** Globals:
|
1046
|
+
** DangerousAmbigs filename of dangerous ambig info
|
1047
|
+
** Operation: This routine allocates a new ambiguity table and fills
|
1048
|
+
** it in from the file specified by DangerousAmbigs. An
|
1049
|
+
** ambiguity table is an array of lists. The array is indexed
|
1050
|
+
** by a class id. Therefore, each entry in the table provides
|
1051
|
+
** a list of potential ambiguities which can start with the
|
1052
|
+
** corresponding character. Each potential ambiguity is
|
1053
|
+
** described by a string which contains the remainder of the
|
1054
|
+
** test string followed by a space followed by the replacement
|
1055
|
+
** string. For example the ambiguity "rn -> m", would be
|
1056
|
+
** located in the table at index 'r'. The string corresponding
|
1057
|
+
** to this ambiguity would be "n m".
|
1058
|
+
** Return: Pointer to new ambiguity table.
|
1059
|
+
** Exceptions: none
|
1060
|
+
** History: Thu May 9 09:20:57 1991, DSJ, Created.
|
1061
|
+
*/
|
1062
|
+
FILE *AmbigFile;
|
1063
|
+
AMBIG_TABLE *NewTable;
|
1064
|
+
int i;
|
1065
|
+
int AmbigPartSize;
|
1066
|
+
char buffer[256 * UNICHAR_LEN];
|
1067
|
+
char TestString[256 * UNICHAR_LEN];
|
1068
|
+
char TestString_lengths[256];
|
1069
|
+
char ReplacementString[256 * UNICHAR_LEN];
|
1070
|
+
char ReplacementString_lengths[256];
|
1071
|
+
STRING name;
|
1072
|
+
char lengths[2];
|
1073
|
+
AMBIG_SPEC *AmbigSpec;
|
1074
|
+
UNICHAR_ID unichar_id;
|
1075
|
+
|
1076
|
+
lengths[1] = 0;
|
1077
|
+
|
1078
|
+
name = language_data_path_prefix;
|
1079
|
+
name += DangerousAmbigs;
|
1080
|
+
AmbigFile = Efopen (name.string(), "r");
|
1081
|
+
NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));
|
1082
|
+
|
1083
|
+
for (i = 0; i <= MAX_CLASS_ID; i++)
|
1084
|
+
NewTable[i] = NIL;
|
1085
|
+
|
1086
|
+
while (fscanf (AmbigFile, "%d", &AmbigPartSize) == 1) {
|
1087
|
+
TestString[0] = '\0';
|
1088
|
+
TestString_lengths[0] = 0;
|
1089
|
+
ReplacementString[0] = '\0';
|
1090
|
+
ReplacementString_lengths[0] = 0;
|
1091
|
+
bool illegal_char = false;
|
1092
|
+
for (i = 0; i < AmbigPartSize; ++i) {
|
1093
|
+
fscanf (AmbigFile, "%s", buffer);
|
1094
|
+
strcat(TestString, buffer);
|
1095
|
+
lengths[0] = strlen(buffer);
|
1096
|
+
strcat(TestString_lengths, lengths);
|
1097
|
+
if (!unicharset.contains_unichar(buffer))
|
1098
|
+
illegal_char = true;
|
1099
|
+
}
|
1100
|
+
fscanf (AmbigFile, "%d", &AmbigPartSize);
|
1101
|
+
for (i = 0; i < AmbigPartSize; ++i) {
|
1102
|
+
fscanf (AmbigFile, "%s", buffer);
|
1103
|
+
strcat(ReplacementString, buffer);
|
1104
|
+
lengths[0] = strlen(buffer);
|
1105
|
+
strcat(ReplacementString_lengths, lengths);
|
1106
|
+
if (!unicharset.contains_unichar(buffer))
|
1107
|
+
illegal_char = true;
|
1108
|
+
}
|
1109
|
+
|
1110
|
+
if (strlen (TestString_lengths) > MAX_AMBIG_SIZE ||
|
1111
|
+
strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE)
|
1112
|
+
DoError (0, "Illegal ambiguity specification!");
|
1113
|
+
if (illegal_char) {
|
1114
|
+
continue;
|
1115
|
+
}
|
1116
|
+
|
1117
|
+
AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));
|
1118
|
+
|
1119
|
+
strcpy(AmbigSpec->ambig, TestString + TestString_lengths[0]);
|
1120
|
+
strcat(AmbigSpec->ambig, " ");
|
1121
|
+
strcat(AmbigSpec->ambig, ReplacementString);
|
1122
|
+
|
1123
|
+
strcpy(AmbigSpec->lengths, TestString_lengths + 1);
|
1124
|
+
lengths[0] = 1;
|
1125
|
+
strcat(AmbigSpec->lengths, lengths);
|
1126
|
+
strcat(AmbigSpec->lengths, ReplacementString_lengths);
|
1127
|
+
unichar_id = unicharset.unichar_to_id(TestString, TestString_lengths[0]);
|
1128
|
+
NewTable[unichar_id] = push_last (NewTable[unichar_id], AmbigSpec);
|
1129
|
+
}
|
1130
|
+
|
1131
|
+
fclose(AmbigFile);
|
1132
|
+
return (NewTable);
|
1133
|
+
|
1134
|
+
} /* FillAmbigTable */
|
1135
|
+
|
1136
|
+
|
1137
|
+
/*---------------------------------------------------------------------------*/
|
1138
|
+
int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice,
|
1139
|
+
void *item2) { //EXPANDED_CHOICE *BestChoice)
|
1140
|
+
/*
|
1141
|
+
** Parameters:
|
1142
|
+
** Choice choice to be tested
|
1143
|
+
** BestChoice best choice found
|
1144
|
+
** Globals:
|
1145
|
+
** AmbigThresholdGain
|
1146
|
+
** AmbigThresholdOffset
|
1147
|
+
** Operation: If the certainty of any chunk in Choice is not ambiguous
|
1148
|
+
** with the corresponding chunk in the best choice, free
|
1149
|
+
** Choice and return TRUE. Otherwise, return FALSE.
|
1150
|
+
** Return: TRUE or FALSE.
|
1151
|
+
** Exceptions: none
|
1152
|
+
** History: Wed May 15 13:20:26 1991, DSJ, Created.
|
1153
|
+
*/
|
1154
|
+
int i, j, Chunk;
|
1155
|
+
FLOAT32 Threshold;
|
1156
|
+
VIABLE_CHOICE Choice;
|
1157
|
+
EXPANDED_CHOICE *BestChoice;
|
1158
|
+
|
1159
|
+
Choice = (VIABLE_CHOICE) item1;
|
1160
|
+
BestChoice = (EXPANDED_CHOICE *) item2;
|
1161
|
+
|
1162
|
+
Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
|
1163
|
+
Choice->AdjustFactor);
|
1164
|
+
|
1165
|
+
for (i = 0, Chunk = 0; i < Choice->Length; i++)
|
1166
|
+
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
|
1167
|
+
if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
|
1168
|
+
Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
|
1169
|
+
Threshold) {
|
1170
|
+
memfree(Choice);
|
1171
|
+
return (TRUE);
|
1172
|
+
}
|
1173
|
+
|
1174
|
+
return (FALSE);
|
1175
|
+
|
1176
|
+
} /* FreeBadChoice */
|
1177
|
+
|
1178
|
+
|
1179
|
+
/*---------------------------------------------------------------------------*/
|
1180
|
+
int LengthOfShortestAlphaRun(register char *Word, const char *Word_lengths) {
|
1181
|
+
/*
|
1182
|
+
** Parameters:
|
1183
|
+
** Word word to be tested
|
1184
|
+
** Word_lengths lengths of the unichars in Word
|
1185
|
+
** Globals: none
|
1186
|
+
** Operation: Return the length of the shortest alpha run in Word.
|
1187
|
+
** Return: Return the length of the shortest alpha run in Word.
|
1188
|
+
** Exceptions: none
|
1189
|
+
** History: Tue May 14 07:50:45 1991, DSJ, Created.
|
1190
|
+
*/
|
1191
|
+
register int Shortest = MAX_INT32;
|
1192
|
+
register int Length;
|
1193
|
+
|
1194
|
+
for (; *Word; Word += *(Word_lengths++))
|
1195
|
+
if (unicharset.get_isalpha(Word, *Word_lengths)) {
|
1196
|
+
for (Length = 1, Word += *(Word_lengths++);
|
1197
|
+
*Word && unicharset.get_isalpha(Word, *Word_lengths);
|
1198
|
+
Word += *(Word_lengths++), Length++);
|
1199
|
+
if (Length < Shortest)
|
1200
|
+
Shortest = Length;
|
1201
|
+
|
1202
|
+
if (*Word == 0)
|
1203
|
+
break;
|
1204
|
+
}
|
1205
|
+
if (Shortest == MAX_INT32)
|
1206
|
+
Shortest = 0;
|
1207
|
+
|
1208
|
+
return (Shortest);
|
1209
|
+
|
1210
|
+
} /* LengthOfShortestAlphaRun */
|
1211
|
+
|
1212
|
+
|
1213
|
+
/*---------------------------------------------------------------------------*/
|
1214
|
+
VIABLE_CHOICE
NewViableChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
/*
 **  Parameters:
 **    Choice        choice to convert into a viable choice
 **    AdjustFactor  adjust factor to record in the new viable choice
 **    Certainties   certainty of each character in Choice
 **  Globals:
 **    CurrentSegmentation  chunk count for each character of Choice
 **  Operation: Allocate a viable choice with room for one CHAR_CHOICE per
 **    unichar of Choice, copy rating, certainty, adjust factor and
 **    length into it, then fill each character entry with its class id,
 **    its chunk count from CurrentSegmentation and its certainty.
 **  Return: Ptr to new viable choice.
 **  Exceptions: asserts that the number of unichars is in
 **    1..MAX_NUM_CHUNKS.
 **  History: Thu May 16 15:28:29 1991, DSJ, Created.
 */
  VIABLE_CHOICE Result;
  const char *Cursor;
  const char *Cursor_lengths;
  int NumChars;
  int CharIndex;

  NumChars = strlen (class_lengths (Choice));
  assert (NumChars <= MAX_NUM_CHUNKS && NumChars > 0);

  /* the struct already holds one CHAR_CHOICE, hence NumChars - 1 extra */
  Result = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) +
                                    (NumChars - 1) * sizeof (CHAR_CHOICE));

  Result->Rating = class_probability (Choice);
  Result->Certainty = class_certainty (Choice);
  Result->AdjustFactor = AdjustFactor;
  Result->Length = NumChars;

  Cursor = class_string (Choice);
  Cursor_lengths = class_lengths (Choice);
  for (CharIndex = 0; *Cursor; CharIndex++) {
    Result->Blob[CharIndex].Class =
      unicharset.unichar_to_id(Cursor, *Cursor_lengths);
    Result->Blob[CharIndex].NumChunks = CurrentSegmentation[CharIndex];
    Result->Blob[CharIndex].Certainty = Certainties[CharIndex];
    Cursor += *(Cursor_lengths++);
  }

  return (Result);

}                                /* NewViableChoice */
|
1261
|
+
|
1262
|
+
|
1263
|
+
/*---------------------------------------------------------------------------*/
|
1264
|
+
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
|
1265
|
+
/*
|
1266
|
+
** Parameters:
|
1267
|
+
** File open text file to print Choice to
|
1268
|
+
** Label text label to be printed with Choice
|
1269
|
+
** Choice choice to be printed
|
1270
|
+
** Globals: none
|
1271
|
+
** Operation: This routine dumps a text representation of the
|
1272
|
+
** specified Choice to File.
|
1273
|
+
** Return: none
|
1274
|
+
** Exceptions: none
|
1275
|
+
** History: Mon May 20 11:16:44 1991, DSJ, Created.
|
1276
|
+
*/
|
1277
|
+
int i, j;
|
1278
|
+
|
1279
|
+
fprintf (File, "%s", Label);
|
1280
|
+
|
1281
|
+
fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f) ",
|
1282
|
+
Choice->Rating, Choice->Certainty, Choice->AdjustFactor);
|
1283
|
+
|
1284
|
+
for (i = 0; i < Choice->Length; i++)
|
1285
|
+
fprintf (File, "%s", unicharset.id_to_unichar(Choice->Blob[i].Class));
|
1286
|
+
fprintf (File, "\n");
|
1287
|
+
|
1288
|
+
for (i = 0; i < Choice->Length; i++) {
|
1289
|
+
fprintf (File, " %s", unicharset.id_to_unichar(Choice->Blob[i].Class));
|
1290
|
+
for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
|
1291
|
+
fprintf (File, " ");
|
1292
|
+
}
|
1293
|
+
fprintf (File, "\n");
|
1294
|
+
|
1295
|
+
for (i = 0; i < Choice->Length; i++) {
|
1296
|
+
for (j = 0; j < Choice->Blob[i].NumChunks; j++)
|
1297
|
+
fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0));
|
1298
|
+
}
|
1299
|
+
fprintf (File, "\n");
|
1300
|
+
|
1301
|
+
} /* PrintViableChoice */
|
1302
|
+
|
1303
|
+
|
1304
|
+
/*---------------------------------------------------------------------------*/
|
1305
|
+
void
|
1306
|
+
ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
|
1307
|
+
A_CHOICE * NewChoice,
|
1308
|
+
FLOAT32 AdjustFactor, float Certainties[]) {
|
1309
|
+
/*
|
1310
|
+
** Parameters:
|
1311
|
+
** OldChoice existing viable choice to be replaced
|
1312
|
+
** NewChoice choice to replace OldChoice with
|
1313
|
+
** AdjustFactor factor used to adjust ratings for OldChoice
|
1314
|
+
** Certainties certainty for each character in OldChoice
|
1315
|
+
** Globals:
|
1316
|
+
** CurrentSegmentation segmentation for NewChoice
|
1317
|
+
** Operation: This routine is used whenever a better segmentation (or
|
1318
|
+
** contextual interpretation) is found for a word which already
|
1319
|
+
** exists. The OldChoice is updated with the relevant
|
1320
|
+
** information from the new choice. The text string itself
|
1321
|
+
** does not need to be copied since, by definition, has not
|
1322
|
+
** changed.
|
1323
|
+
** Return: none
|
1324
|
+
** Exceptions: none
|
1325
|
+
** History: Fri May 17 13:35:58 1991, DSJ, Created.
|
1326
|
+
*/
|
1327
|
+
char *Word;
|
1328
|
+
char *Word_lengths;
|
1329
|
+
CHAR_CHOICE *NewChar;
|
1330
|
+
BLOB_WIDTH *BlobWidth;
|
1331
|
+
|
1332
|
+
OldChoice->Rating = class_probability (NewChoice);
|
1333
|
+
OldChoice->Certainty = class_certainty (NewChoice);
|
1334
|
+
OldChoice->AdjustFactor = AdjustFactor;
|
1335
|
+
|
1336
|
+
for (Word = class_string (NewChoice),
|
1337
|
+
Word_lengths = class_lengths (NewChoice),
|
1338
|
+
NewChar = &(OldChoice->Blob[0]),
|
1339
|
+
BlobWidth = CurrentSegmentation;
|
1340
|
+
*Word;
|
1341
|
+
Word += *(Word_lengths++), NewChar++, Certainties++, BlobWidth++) {
|
1342
|
+
NewChar->NumChunks = *BlobWidth;
|
1343
|
+
NewChar->Certainty = *Certainties;
|
1344
|
+
}
|
1345
|
+
} /* ReplaceDuplicateChoice */
|
1346
|
+
|
1347
|
+
|
1348
|
+
/*---------------------------------------------------------------------------*/
|
1349
|
+
int StringSameAs(const char *String,
|
1350
|
+
const char *String_lengths,
|
1351
|
+
VIABLE_CHOICE ViableChoice) {
|
1352
|
+
/*
|
1353
|
+
** Parameters:
|
1354
|
+
** String string to compare to ViableChoice
|
1355
|
+
** String_lengths lengths of unichars in String
|
1356
|
+
** ViableChoice viable choice to compare to String
|
1357
|
+
** Globals: none
|
1358
|
+
** Operation: This routine compares String to ViableChoice and
|
1359
|
+
** returns TRUE if they are the same, FALSE otherwise.
|
1360
|
+
** Return: TRUE or FALSE.
|
1361
|
+
** Exceptions: none
|
1362
|
+
** History: Fri May 17 08:48:04 1991, DSJ, Created.
|
1363
|
+
*/
|
1364
|
+
CHAR_CHOICE *Char;
|
1365
|
+
int i;
|
1366
|
+
int current_unichar_length;
|
1367
|
+
|
1368
|
+
for (Char = &(ViableChoice->Blob[0]), i = 0;
|
1369
|
+
i < ViableChoice->Length;
|
1370
|
+
String += *(String_lengths++), Char++, i++) {
|
1371
|
+
current_unichar_length = strlen(unicharset.id_to_unichar(Char->Class));
|
1372
|
+
if (current_unichar_length != *String_lengths ||
|
1373
|
+
strncmp(String, unicharset.id_to_unichar(Char->Class),
|
1374
|
+
current_unichar_length) != 0)
|
1375
|
+
return (FALSE);
|
1376
|
+
}
|
1377
|
+
|
1378
|
+
if (*String == 0)
|
1379
|
+
return (TRUE);
|
1380
|
+
else
|
1381
|
+
return (FALSE);
|
1382
|
+
|
1383
|
+
} /* StringSameAs */
|
1384
|
+
|
1385
|
+
|
1386
|
+
/*---------------------------------------------------------------------------*/
|
1387
|
+
int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice) {
/*
 **  Parameters:
 **    Choices     per-character choice lists for the current segmentation
 **    BestChoice  best word choice for the current segmentation
 **  Globals:
 **    CertaintyVariation  number of std deviations allowed below the mean
 **    NonDictCertainty    upper bound applied to the threshold
 **    StopperDebugLevel   a value >= 1 enables a message on rejection
 **  Operation: Take the best certainty of every character, discard the
 **    worst one, and compute mean and standard deviation of the rest.
 **    The threshold is mean - CertaintyVariation * stddev, capped at
 **    NonDictCertainty.  Words with fewer than 3 characters are always
 **    accepted.
 **  Return: FALSE if the certainty of BestChoice is below the threshold,
 **    TRUE otherwise.
 **  Exceptions: none
 **  History: Tue May 14 08:23:21 1991, DSJ, Created.
 */
  int i;
  int NumChars;
  CHOICES CharChoices;
  float Certainty;
  float Lowest = MAX_FLOAT32;
  float Threshold;
  FLOAT64 Sum;
  FLOAT64 SumOfSquares;
  FLOAT64 Variance;
  FLOAT32 Mean, StdDev;

  NumChars = array_count (Choices);
  if (NumChars < 3)
    return (TRUE);

  Sum = SumOfSquares = 0.0;
  for_each_choice(Choices, i) {
    CharChoices = (CHOICES) array_index (Choices, i);
    Certainty = best_certainty (CharChoices);
    Sum += Certainty;
    SumOfSquares += Certainty * Certainty;
    if (Certainty < Lowest)
      Lowest = Certainty;
  }

  /* remove the worst character from the statistics */
  NumChars--;
  Sum -= Lowest;
  SumOfSquares -= Lowest * Lowest;

  Mean = Sum / NumChars;
  Variance = ((NumChars * SumOfSquares -
               Sum * Sum) /
              (NumChars * (NumChars - 1)));
  if (Variance < 0.0)
    Variance = 0.0;              /* guard against rounding error */
  StdDev = sqrt (Variance);

  Threshold = Mean - CertaintyVariation * StdDev;
  if (Threshold > NonDictCertainty)
    Threshold = NonDictCertainty;

  if (class_certainty (BestChoice) < Threshold) {
    if (StopperDebugLevel >= 1)
      cprintf
        ("Stopper: Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n",
         class_certainty (BestChoice), Mean, StdDev, Threshold);
    return (FALSE);
  }

  return (TRUE);

}                                /* UniformCertainties */
|