tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
|
@@ -0,0 +1,1273 @@
|
|
|
1
|
+
/******************************************************************
|
|
2
|
+
* File: output.cpp (Formerly output.c)
|
|
3
|
+
* Description: Output pass
|
|
4
|
+
* Author: Phil Cheatle
|
|
5
|
+
* Created: Thu Aug 4 10:56:08 BST 1994
|
|
6
|
+
*
|
|
7
|
+
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
|
8
|
+
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
** you may not use this file except in compliance with the License.
|
|
10
|
+
** You may obtain a copy of the License at
|
|
11
|
+
** http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
** Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
** See the License for the specific language governing permissions and
|
|
16
|
+
** limitations under the License.
|
|
17
|
+
*
|
|
18
|
+
**********************************************************************/
|
|
19
|
+
|
|
20
|
+
#include "mfcpch.h"
|
|
21
|
+
#include "ocrshell.h"
|
|
22
|
+
#include <string.h>
|
|
23
|
+
#include <ctype.h>
|
|
24
|
+
#ifdef __UNIX__
|
|
25
|
+
#include <assert.h>
|
|
26
|
+
#include <unistd.h>
|
|
27
|
+
#include <errno.h>
|
|
28
|
+
#endif
|
|
29
|
+
#include "mainblk.h"
|
|
30
|
+
#include "tfacep.h"
|
|
31
|
+
#include "tessvars.h"
|
|
32
|
+
#include "control.h"
|
|
33
|
+
#include "secname.h"
|
|
34
|
+
#include "reject.h"
|
|
35
|
+
#include "docqual.h"
|
|
36
|
+
#include "output.h"
|
|
37
|
+
#include "bestfirst.h"
|
|
38
|
+
#include "globals.h"
|
|
39
|
+
|
|
40
|
+
// EXTERN expands to nothing in this file, so the *_VAR / *_EVAR macro
// invocations below are emitted without a storage-class prefix.
// NOTE(review): presumably the matching header declares them `extern` - confirm.
#define EXTERN

// File extension for epaper output.
#define EPAPER_EXT ".ep"
// NOTE(review): looks like a page height in pixels; not used in the visible code - confirm.
#define PAGE_YSIZE 3508
// Control characters embedded in the epaper text strings (octal ASCII codes).
#define CTRL_INSET '\024' //dc4=text inset
#define CTRL_FONT '\016' //so=font change
#define CTRL_DEFAULT '\017' //si=default font
#define CTRL_SHIFT '\022' //dc2=x shift
#define CTRL_TAB '\011' //tab
#define CTRL_NEWLINE '\012' //newline (line feed)
#define CTRL_HARDLINE '\015' //cr, used as the "hard" line break
int NO_BLOCK = 0; //don't output block information
// Offsets added to word coordinates when the image is part of a bigger
// picture and the original coordinates are wanted.
inT16 XOFFSET = 0; //x offset of the image within the original picture
inT16 YOFFSET = 0; //y offset of the image within the original picture

// Output selection switches.
EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
// Only the first character of this string is used as the reject marker
// by the output routines in this file.
EXTERN STRING_EVAR (unrecognised_char, "|",
"Output char for unidentified blobs");
// Suspect marking controls.
EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
EXTERN INT_VAR (suspect_space_level, 100,
"Min suspect level for rejecting spaces");
EXTERN INT_VAR (suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
"UNLV keep 1Il chars rejected");
EXTERN double_VAR (suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit");
EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

// Rejection override switches.
EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
"Only reject tess failures");
EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
"Make output have exactly one word per WERD");
EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
"Dont reject ANYTHING AT ALL");
EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
"Force all rep chars the same");

FILE *txt_mapfile = NULL; //reject map (.map), opened and closed by output_pass
// NOTE(review): presumably the .unlv output stream (see tessedit_write_unlv);
// it is not written anywhere in the visible code - confirm.
FILE *unlv_file = NULL; //unlv output file
|
|
92
|
+
|
|
93
|
+
/**********************************************************************
|
|
94
|
+
* pixels_to_pts
|
|
95
|
+
*
|
|
96
|
+
* Convert an integer number of pixels to the nearest integer
|
|
97
|
+
* number of points.
|
|
98
|
+
**********************************************************************/
|
|
99
|
+
|
|
100
|
+
inT32 pixels_to_pts( //convert coords
|
|
101
|
+
inT32 pixels,
|
|
102
|
+
inT32 pix_res //resolution
|
|
103
|
+
) {
|
|
104
|
+
float pts; //converted value
|
|
105
|
+
|
|
106
|
+
pts = pixels * 72.0 / pix_res;
|
|
107
|
+
return (inT32) (pts + 0.5); //round it
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**********************************************************************
 * output_pass
 *
 * Walk every word of the page and hand it to write_results().
 *
 * page_res_it      iterator over the page results; restarted here.
 * write_to_shm     passed on to write_results(); when set,
 *                  ocr_send_text(FALSE) is called after the last word.
 * target_word_box  when not NULL, only words whose bounding box centre
 *                  lies inside this box are output.
 *
 * When tessedit_write_txt_map is set the ".map" file is opened at the
 * start and closed at the end.  Block separator lines ("|^~tr<id>")
 * are written when tessedit_write_block_separators is set; they go to
 * the map file only if it is actually open.
 **********************************************************************/
void output_pass(  //Tess output pass //send to api
                 PAGE_RES_IT &page_res_it,
                 BOOL8 write_to_shm,
                 TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word;
  inT16 block_id;
  BOOL8 force_eol;               //During output
  BLOCK *nextblock;              //block of next word
  WERD *nextword;                //next word

  if (tessedit_write_txt_map)
    txt_mapfile = open_outfile (".map");

  page_res_it.restart_page ();
  block_of_last_word = NULL;
  while (page_res_it.word () != NULL) {
    check_debug_pt (page_res_it.word (), 120);

    if (target_word_box) {
      //skip words whose centre is outside the requested box
      TBOX current_word_box = page_res_it.word ()->word->bounding_box ();
      FCOORD center_pt ((current_word_box.right () + current_word_box.left ()) / 2,
                        (current_word_box.bottom () + current_word_box.top ()) / 2);
      if (!target_word_box->contains (center_pt)) {
        page_res_it.forward ();
        continue;
      }
    }
    if (tessedit_write_block_separators &&
        block_of_last_word != page_res_it.block ()) {
      //first word of a new block: emit a separator carrying the block id
      block_of_last_word = page_res_it.block ();
      if (block_of_last_word->block->text_region () == NULL) {
        if (block_of_last_word->block->poly_block () == NULL)
          block_id = 1;
        else
          block_id =
            ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
            id_no ();
      }
      else
        block_id = block_of_last_word->block->text_region ()->id_no ();
      if (!NO_BLOCK)
        fprintf (textfile, "|^~tr%d\n", block_id);
      //the map file only exists when tessedit_write_txt_map opened it
      if (txt_mapfile != NULL)
        fprintf (txt_mapfile, "|^~tr%d\n", block_id);
    }

    //force a line end at a block change (if separating) or on the last word
    force_eol = (tessedit_write_block_separators &&
                 (page_res_it.block () != page_res_it.next_block ())) ||
                (page_res_it.next_word () == NULL);

    if (page_res_it.next_word () != NULL)
      nextword = page_res_it.next_word ()->word;
    else
      nextword = NULL;
    if (page_res_it.next_block () != NULL)
      nextblock = page_res_it.next_block ()->block;
    else
      nextblock = NULL;
                                 //regardless of tilde crunching
    write_results (page_res_it,
                   determine_newline_type (page_res_it.word ()->word,
                                           page_res_it.block ()->block,
                                           nextword, nextblock),
                   force_eol, write_to_shm);
    page_res_it.forward ();
  }
  if (write_to_shm)
    ocr_send_text(FALSE);
  if (tessedit_write_block_separators) {
    if (!NO_BLOCK)
      fprintf (textfile, "|^~tr\n");
    if (txt_mapfile != NULL)
      fprintf (txt_mapfile, "|^~tr\n");
  }
  if (tessedit_write_txt_map && txt_mapfile != NULL) {
    fprintf (txt_mapfile, "\n"); //because txt gets one
#ifdef __UNIX__
    //fsync only sees data already handed to the OS, so flush stdio first
    fflush (txt_mapfile);
    fsync (fileno (txt_mapfile));
#endif
    fclose(txt_mapfile);
    txt_mapfile = NULL;          //never leave a closed stream in the global
  }
}
|
|
190
|
+
|
|
191
|
+
/*************************************************************************
|
|
192
|
+
* write_results()
|
|
193
|
+
*
|
|
194
|
+
* All recognition and rejection has now been done. Generate the following:
|
|
195
|
+
* .txt file - giving the final best choices with NO highlighting
|
|
196
|
+
* .raw file - giving the tesseract top choice output for each word
|
|
197
|
+
* .map file - showing how the .txt file has been rejected in the .ep file
|
|
198
|
+
* epchoice list - a list of one element per word, containing the text for the
|
|
199
|
+
* epaper. Reject strings are inserted.
|
|
200
|
+
* inset list - a list of bounding boxes of reject insets - indexed by the
|
|
201
|
+
* reject strings in the epchoice text.
|
|
202
|
+
*************************************************************************/
|
|
203
|
+
|
|
204
|
+
void write_results( //output a word
|
|
205
|
+
PAGE_RES_IT &page_res_it, //full info
|
|
206
|
+
char newline_type, //type of newline
|
|
207
|
+
BOOL8 force_eol, //override tilde crunch?
|
|
208
|
+
BOOL8 write_to_shm //send to api
|
|
209
|
+
) {
|
|
210
|
+
//word to do
|
|
211
|
+
WERD_RES *word = page_res_it.word ();
|
|
212
|
+
// WERD_CHOICE *ep_choice; //ep format
|
|
213
|
+
STRING repetition_code;
|
|
214
|
+
const STRING *wordstr;
|
|
215
|
+
STRING wordstr_lengths;
|
|
216
|
+
const char *text;
|
|
217
|
+
int i;
|
|
218
|
+
char unrecognised = STRING (unrecognised_char)[0];
|
|
219
|
+
char ep_chars[32]; //Only for unlv_tilde_crunch
|
|
220
|
+
int ep_chars_index = 0;
|
|
221
|
+
char txt_chs[32]; //Only for unlv_tilde_crunch
|
|
222
|
+
char map_chs[32]; //Only for unlv_tilde_crunch
|
|
223
|
+
int txt_index = 0;
|
|
224
|
+
static BOOL8 tilde_crunch_written = FALSE;
|
|
225
|
+
static BOOL8 last_char_was_newline = TRUE;
|
|
226
|
+
static BOOL8 last_char_was_tilde = FALSE;
|
|
227
|
+
static BOOL8 empty_block = TRUE;
|
|
228
|
+
BOOL8 need_reject = FALSE;
|
|
229
|
+
char *ptr; //string ptr
|
|
230
|
+
PBLOB_IT blob_it; //blobs
|
|
231
|
+
|
|
232
|
+
/* if (word->best_choice->string().length() == 0)
|
|
233
|
+
{
|
|
234
|
+
tprintf("No output: to output\n");
|
|
235
|
+
}
|
|
236
|
+
else if (word->best_choice->string()[0]==' ')
|
|
237
|
+
{
|
|
238
|
+
tprintf("spaceword to output\n");
|
|
239
|
+
}
|
|
240
|
+
else if (word->best_choice->string()[0]=='\0')
|
|
241
|
+
{
|
|
242
|
+
tprintf("null to output\n");
|
|
243
|
+
}*/
|
|
244
|
+
if (word->unlv_crunch_mode != CR_NONE
|
|
245
|
+
&& !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
|
|
246
|
+
if ((word->unlv_crunch_mode != CR_DELETE) &&
|
|
247
|
+
(!tilde_crunch_written ||
|
|
248
|
+
((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
|
|
249
|
+
(word->word->space () > 0) &&
|
|
250
|
+
!word->word->flag (W_FUZZY_NON) &&
|
|
251
|
+
!word->word->flag (W_FUZZY_SP)))) {
|
|
252
|
+
if (!word->word->flag (W_BOL) &&
|
|
253
|
+
(word->word->space () > 0) &&
|
|
254
|
+
!word->word->flag (W_FUZZY_NON) &&
|
|
255
|
+
!word->word->flag (W_FUZZY_SP)) {
|
|
256
|
+
/* Write a space to separate from preceeding good text */
|
|
257
|
+
txt_chs[txt_index] = ' ';
|
|
258
|
+
map_chs[txt_index++] = '1';
|
|
259
|
+
ep_chars[ep_chars_index++] = ' ';
|
|
260
|
+
last_char_was_tilde = FALSE;
|
|
261
|
+
}
|
|
262
|
+
need_reject = TRUE;
|
|
263
|
+
}
|
|
264
|
+
if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
|
|
265
|
+
/* Write a reject char - mark as rejected unless zero_rejection mode */
|
|
266
|
+
last_char_was_tilde = TRUE;
|
|
267
|
+
txt_chs[txt_index] = unrecognised;
|
|
268
|
+
if (tessedit_zero_rejection || (suspect_level == 0)) {
|
|
269
|
+
map_chs[txt_index++] = '1';
|
|
270
|
+
ep_chars[ep_chars_index++] = unrecognised;
|
|
271
|
+
}
|
|
272
|
+
else {
|
|
273
|
+
map_chs[txt_index++] = '0';
|
|
274
|
+
/*
|
|
275
|
+
The ep_choice string is a faked reject to allow newdiff to sync the .etx
|
|
276
|
+
with the .txt and .map files.
|
|
277
|
+
*/
|
|
278
|
+
ep_chars[ep_chars_index++] = CTRL_INSET;
|
|
279
|
+
//escape code
|
|
280
|
+
//dummy reject
|
|
281
|
+
ep_chars[ep_chars_index++] = 1;
|
|
282
|
+
//dummy reject
|
|
283
|
+
ep_chars[ep_chars_index++] = 1;
|
|
284
|
+
//type
|
|
285
|
+
ep_chars[ep_chars_index++] = 2;
|
|
286
|
+
//dummy reject
|
|
287
|
+
ep_chars[ep_chars_index++] = 1;
|
|
288
|
+
//dummy reject
|
|
289
|
+
ep_chars[ep_chars_index++] = 1;
|
|
290
|
+
}
|
|
291
|
+
tilde_crunch_written = TRUE;
|
|
292
|
+
last_char_was_newline = FALSE;
|
|
293
|
+
empty_block = FALSE;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
|
|
297
|
+
/* Add a new line output */
|
|
298
|
+
txt_chs[txt_index] = '\n';
|
|
299
|
+
map_chs[txt_index++] = '\n';
|
|
300
|
+
//end line
|
|
301
|
+
ep_chars[ep_chars_index++] = newline_type;
|
|
302
|
+
|
|
303
|
+
//Cos of the real newline
|
|
304
|
+
tilde_crunch_written = FALSE;
|
|
305
|
+
last_char_was_newline = TRUE;
|
|
306
|
+
last_char_was_tilde = FALSE;
|
|
307
|
+
}
|
|
308
|
+
txt_chs[txt_index] = '\0';
|
|
309
|
+
map_chs[txt_index] = '\0';
|
|
310
|
+
//xiaofan
|
|
311
|
+
if (tessedit_write_output && !NO_BLOCK)
|
|
312
|
+
fprintf (textfile, "%s", txt_chs);
|
|
313
|
+
|
|
314
|
+
if (tessedit_write_txt_map)
|
|
315
|
+
fprintf (txt_mapfile, "%s", map_chs);
|
|
316
|
+
|
|
317
|
+
//terminate string
|
|
318
|
+
ep_chars[ep_chars_index] = '\0';
|
|
319
|
+
word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
|
|
320
|
+
|
|
321
|
+
if (force_eol)
|
|
322
|
+
empty_block = TRUE;
|
|
323
|
+
return;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
/* NORMAL PROCESSING of non tilde crunched words */
|
|
327
|
+
|
|
328
|
+
tilde_crunch_written = FALSE;
|
|
329
|
+
if (newline_type)
|
|
330
|
+
last_char_was_newline = TRUE;
|
|
331
|
+
else
|
|
332
|
+
last_char_was_newline = FALSE;
|
|
333
|
+
empty_block = force_eol; //About to write a real word
|
|
334
|
+
|
|
335
|
+
if (unlv_tilde_crunching &&
|
|
336
|
+
last_char_was_tilde &&
|
|
337
|
+
(word->word->space () == 0) &&
|
|
338
|
+
!(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
|
|
339
|
+
(word->best_choice->string ()[0] == ' ')) {
|
|
340
|
+
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
|
341
|
+
words have been removed */
|
|
342
|
+
ptr = (char *) word->best_choice->string ().string ();
|
|
343
|
+
strcpy (ptr, ptr + 1); //shuffle up
|
|
344
|
+
ptr = (char *) word->best_choice->lengths ().string ();
|
|
345
|
+
strcpy (ptr, ptr + 1); //shuffle up
|
|
346
|
+
word->reject_map.remove_pos (0);
|
|
347
|
+
blob_it = word->outword->blob_list ();
|
|
348
|
+
delete blob_it.extract (); //get rid of reject blob
|
|
349
|
+
}
|
|
350
|
+
if (newline_type ||
|
|
351
|
+
(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
|
|
352
|
+
last_char_was_tilde = FALSE;
|
|
353
|
+
else {
|
|
354
|
+
if (word->reject_map.length () > 0) {
|
|
355
|
+
for (i = 0, ptr = (char *) word->best_choice->string().string();
|
|
356
|
+
i < word->reject_map.length () - 1; ++i)
|
|
357
|
+
ptr += word->best_choice->lengths()[i];
|
|
358
|
+
if (*ptr == ' ')
|
|
359
|
+
last_char_was_tilde = TRUE;
|
|
360
|
+
else
|
|
361
|
+
last_char_was_tilde = FALSE;
|
|
362
|
+
}
|
|
363
|
+
else if (word->word->space () > 0)
|
|
364
|
+
last_char_was_tilde = FALSE;
|
|
365
|
+
/* else it is unchanged as there are no output chars */
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
ptr = (char *) word->best_choice->lengths ().string ();
|
|
369
|
+
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
|
|
370
|
+
|
|
371
|
+
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
|
|
372
|
+
ensure_rep_chars_are_consistent(word);
|
|
373
|
+
|
|
374
|
+
set_unlv_suspects(word);
|
|
375
|
+
check_debug_pt (word, 120);
|
|
376
|
+
if (tessedit_rejection_debug) {
|
|
377
|
+
tprintf ("Dict word: \"%s\": %d\n",
|
|
378
|
+
word->best_choice->string ().string (),
|
|
379
|
+
dict_word (word->best_choice->string ().string ()));
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
#if 0
|
|
383
|
+
if (tessedit_write_unlv) {
|
|
384
|
+
write_unlv_text(word);
|
|
385
|
+
}
|
|
386
|
+
#endif
|
|
387
|
+
|
|
388
|
+
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
|
|
389
|
+
repetition_code = "|^~R";
|
|
390
|
+
wordstr_lengths = "\001\001\001\001";
|
|
391
|
+
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
|
|
392
|
+
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
|
|
393
|
+
wordstr = &repetition_code;
|
|
394
|
+
}
|
|
395
|
+
else {
|
|
396
|
+
wordstr = &(word->best_choice->string ());
|
|
397
|
+
wordstr_lengths = word->best_choice->lengths ();
|
|
398
|
+
if (tessedit_zero_rejection) {
|
|
399
|
+
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
|
400
|
+
text = wordstr->string ();
|
|
401
|
+
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
|
|
402
|
+
if (word->reject_map[i].rejected ())
|
|
403
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
if (tessedit_minimal_rejection) {
|
|
407
|
+
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
|
408
|
+
text = wordstr->string ();
|
|
409
|
+
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
|
|
410
|
+
if ((*text != ' ') && word->reject_map[i].rejected ())
|
|
411
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
if (write_to_shm)
|
|
417
|
+
write_shm_text (word, page_res_it.block ()->block,
|
|
418
|
+
page_res_it.row (), *wordstr, wordstr_lengths);
|
|
419
|
+
|
|
420
|
+
#if 0
|
|
421
|
+
if (tessedit_write_output)
|
|
422
|
+
write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
|
|
423
|
+
|
|
424
|
+
if (tessedit_write_raw_output)
|
|
425
|
+
write_cooked_text (word->word, word->raw_choice->string (),
|
|
426
|
+
TRUE, FALSE, rawfile);
|
|
427
|
+
|
|
428
|
+
if (tessedit_write_txt_map)
|
|
429
|
+
write_map(txt_mapfile, word);
|
|
430
|
+
|
|
431
|
+
ep_choice = make_epaper_choice (word, newline_type);
|
|
432
|
+
word->ep_choice = ep_choice;
|
|
433
|
+
#endif
|
|
434
|
+
|
|
435
|
+
character_count += word->best_choice->lengths ().length ();
|
|
436
|
+
word_count++;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
/**********************************************************************
 * make_epaper_choice
 *
 * Build the epaper text string for one word.  Accepted blobs contribute
 * their character (or the unrecognised marker if the character is a
 * space); each run of rejected blobs contributes a single reject escape
 * sequence made by make_reject().  The newline type, if any, is appended.
 *
 * This function is currently compiled out.
 **********************************************************************/

#if 0
WERD_CHOICE *make_epaper_choice(                   //convert one word
                                WERD_RES *word,    //word to do
                                char newline_type  //type of newline
                               ) {
  char word_string[MAX_PATH];    //converted string
  char unrecognised = STRING (unrecognised_char)[0];
  inT16 index = 0;               //to string
  inT16 blobindex;               //to word
  inT16 prevright = 0;           //right of previous blob
  inT16 nextleft;                //left of next blob
  PBLOB *blob;
  TBOX inset_box;                //bounding box
  PBLOB_IT blob_it;              //blob iterator
  BOOL8 force_total_reject;
  const STRING &best = word->best_choice->string ();

  blob_it.set_to_list (word->outword->blob_list ());

  ASSERT_HOST (word->reject_map.length () == best.length ());

  //an empty choice means every blob is rejected
  force_total_reject = (best.length () == 0);
  if (!force_total_reject) {
    ASSERT_HOST (blob_it.length () == best.length ());
  }

  if (!blob_it.empty ()) {
    while (index < word->word->space ())
      word_string[index++] = ' ';  //leading blanks
  }
  /* Why does this generate leading blanks regardless of whether the
     word_choice string is empty, when write_cooked_text only generates leading
     blanks when the string is NOT empty???. */

  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    strcpy (word_string + index, "|^~R");
    index += 4;
    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
  }
  else {
    if (!blob_it.empty ())
                                 //actually first left
      prevright = blob_it.data ()->bounding_box ().left ();
    for (blobindex = 0, blob_it.mark_cycle_pt ();
         !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
      blob = blob_it.data ();
      if (word->reject_map[blobindex].accepted ()) {
        //a space that was accepted still shows as unrecognised
        word_string[index++] =
          (best[blobindex] == ' ') ? unrecognised : best[blobindex];
      }
      else {                     // start reject
        inset_box = blob->bounding_box ();
        /* Extend reject box to include rejected neighbours */
        while (!blob_it.at_last () &&
               (force_total_reject ||
                (word->reject_map[blobindex + 1].rejected ()))) {
          blobindex++;
          blob = blob_it.forward ();
                                 //get total box
          inset_box += blob->bounding_box ();
        }
        if (blob_it.at_last ())
          nextleft = inset_box.right ();
        else
          nextleft = blob_it.data_relative (1)->bounding_box ().left ();

        index += make_reject (&inset_box, prevright, nextleft,
                              &word->denorm, &word_string[index]);
      }
      prevright = blob->bounding_box ().right ();
    }
  }
  if (newline_type)
                                 //end line
    word_string[index++] = newline_type;
  word_string[index] = '\0';     //terminate string

  //an embedded zero would make the string shorter than index
  if (strlen (word_string) != index) {
    tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
             word_string, index, strlen (word_string));
  }
                                 //don't pass any zeros
  ASSERT_HOST (strlen (word_string) == index);
  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
}
#endif
|
|
549
|
+
|
|
550
|
+
/**********************************************************************
|
|
551
|
+
* make_reject
|
|
552
|
+
*
|
|
553
|
+
* Add the escape code to the string for the reject.
|
|
554
|
+
**********************************************************************/
|
|
555
|
+
|
|
556
|
+
inT16
|
|
557
|
+
make_reject ( //make reject code
|
|
558
|
+
TBOX * inset_box, //bounding box
|
|
559
|
+
inT16 prevright, //previous char
|
|
560
|
+
inT16 nextleft, //next char
|
|
561
|
+
DENORM * denorm, //de-normalizer
|
|
562
|
+
char word_string[] //output string
|
|
563
|
+
) {
|
|
564
|
+
inT16 index; //to string
|
|
565
|
+
inT16 xpos; //start of inset
|
|
566
|
+
inT16 ypos;
|
|
567
|
+
inT16 width; //size of inset
|
|
568
|
+
inT16 height;
|
|
569
|
+
inT16 left_offset; //shift form prev char
|
|
570
|
+
inT16 right_offset; //shift to next char
|
|
571
|
+
inT16 baseline_offset; //shift from baseline
|
|
572
|
+
inT16 inset_index = 0; //number of inset
|
|
573
|
+
inT16 min_chars; //min width estimate
|
|
574
|
+
inT16 max_chars; //max width estimate
|
|
575
|
+
float x_centre; //centre of box
|
|
576
|
+
|
|
577
|
+
index = 0;
|
|
578
|
+
x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
|
|
579
|
+
left_offset =
|
|
580
|
+
(inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
|
|
581
|
+
right_offset =
|
|
582
|
+
(inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
|
|
583
|
+
xpos = (inT16) floor (denorm->x (inset_box->left ()));
|
|
584
|
+
width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
|
|
585
|
+
ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
|
|
586
|
+
height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
|
|
587
|
+
baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
|
|
588
|
+
//escape code
|
|
589
|
+
word_string[index++] = CTRL_INSET;
|
|
590
|
+
min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
|
|
591
|
+
max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
|
|
592
|
+
/*
|
|
593
|
+
Ensure min_chars and max_chars are in the range 0..254. This ensures that
|
|
594
|
+
we can add 1 to them to avoid putting \0 in a string, and still not exceed
|
|
595
|
+
the max value in a byte.
|
|
596
|
+
*/
|
|
597
|
+
if (min_chars < 0)
|
|
598
|
+
min_chars = 0;
|
|
599
|
+
if (min_chars > 254)
|
|
600
|
+
min_chars = 254;
|
|
601
|
+
if (max_chars < min_chars)
|
|
602
|
+
max_chars = min_chars;
|
|
603
|
+
if (max_chars > 254)
|
|
604
|
+
max_chars = 254;
|
|
605
|
+
//min chars
|
|
606
|
+
word_string[index++] = min_chars + 1;
|
|
607
|
+
//max chars
|
|
608
|
+
word_string[index++] = max_chars + 1;
|
|
609
|
+
word_string[index++] = 2; //type?
|
|
610
|
+
//store index
|
|
611
|
+
word_string[index++] = inset_index / 255 + 1;
|
|
612
|
+
word_string[index++] = inset_index % 255 + 1;
|
|
613
|
+
return index; //size of string
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
/**********************************************************************
|
|
618
|
+
* determine_newline_type
|
|
619
|
+
*
|
|
620
|
+
* Find whether we have a wrapping or hard newline.
|
|
621
|
+
* Return FALSE if not at end of line.
|
|
622
|
+
**********************************************************************/
|
|
623
|
+
|
|
624
|
+
char determine_newline_type( //test line ends
|
|
625
|
+
WERD *word, //word to do
|
|
626
|
+
BLOCK *block, //current block
|
|
627
|
+
WERD *next_word, //next word
|
|
628
|
+
BLOCK *next_block //block of next word
|
|
629
|
+
) {
|
|
630
|
+
inT16 end_gap; //to right edge
|
|
631
|
+
inT16 width; //of next word
|
|
632
|
+
TBOX word_box; //bounding
|
|
633
|
+
TBOX next_box; //next word
|
|
634
|
+
TBOX block_box; //block bounding
|
|
635
|
+
|
|
636
|
+
if (!word->flag (W_EOL))
|
|
637
|
+
return FALSE; //not end of line
|
|
638
|
+
if (next_word == NULL || next_block == NULL || block != next_block)
|
|
639
|
+
return CTRL_NEWLINE;
|
|
640
|
+
if (next_word->space () > 0)
|
|
641
|
+
return CTRL_HARDLINE; //it is tabbed
|
|
642
|
+
word_box = word->bounding_box ();
|
|
643
|
+
next_box = next_word->bounding_box ();
|
|
644
|
+
block_box = block->bounding_box ();
|
|
645
|
+
//gap to eol
|
|
646
|
+
end_gap = block_box.right () - word_box.right ();
|
|
647
|
+
end_gap -= (inT32) block->space ();
|
|
648
|
+
width = next_box.right () - next_box.left ();
|
|
649
|
+
// tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
|
|
650
|
+
// block_box.right(),word_box.right(),end_gap,
|
|
651
|
+
// next_box.right(),next_box.left(),width,
|
|
652
|
+
// end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
|
|
653
|
+
return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
/**********************************************************************
 * write_cooked_text
 *
 * Write the text of one word to the given file, preceded by the word's
 * leading blanks, in bold if it was done on pass2 and underlined if it
 * is not acceptable.  Spaces inside the text are shown as the
 * unrecognised character.  When NO_BLOCK is set the word's bounding box
 * is written in front of the text.  A newline follows an end-of-line
 * word, and the file is flushed after every word.
 *
 * If fp is stdout only a debug line is printed through tprintf.
 *
 * This function is currently compiled out.
 **********************************************************************/

#if 0
void write_cooked_text(                     //write output
                       WERD *word,          //word to do
                       const STRING &text,  //text to write
                       BOOL8 acceptable,    //good stuff
                       BOOL8 pass2,         //done on pass2
                       FILE *fp             //file to write
                      ) {
  static int newaline = 1;       //at the start of a line
  static int havespace = 0;      //blanks were just written
  static int old_segs = 0;       //num_popped at the last debug print
  const char *wordstr = text.string ();
  char unrecognised = STRING (unrecognised_char)[0];
  char buff[512];                //text with spaces made visible
  inT16 index;                   //blank counter
  int i = 0;
  TBOX mybox;

  //copy the text, showing unrecognised chars
  for (i = 0; wordstr[i] != '\0'; i++)
    buff[i] = (wordstr[i] == ' ') ? unrecognised : wordstr[i];
  buff[i] = '\0';

  if (fp == stdout) {
    tprintf ("Cooked=%s, %d segs, acceptable=%d",
             buff, num_popped - old_segs, acceptable);
    old_segs = num_popped;
    return;
  }

  if (text.length () > 0) {
    for (index = 0; index < word->space (); index++) {
      int wrote = fprintf (fp, " ");
      havespace = 1;
      if (wrote < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
                           "Space Errno: %d", errno);
    }
    if (pass2) {
      if (fprintf (fp, BOLD_ON) < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
                           "Bold Errno: %d", errno);
    }
    if (!acceptable) {
      if (fprintf (fp, UNDERLINE_ON) < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
                           "Underline Errno: %d", errno);
    }

                                 //xiaofan
    if (NO_BLOCK && word && strlen (buff)) {
      //prefix the word with its box, in offset page coordinates
      mybox = word->bounding_box ();
      if (newaline || !havespace) {
        fprintf (fp, " ");
        newaline = 0;
      }
      fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
               XOFFSET + mybox.left (),
               YOFFSET + page_image.get_ysize () - mybox.top (),
               XOFFSET + mybox.right (),
               YOFFSET + page_image.get_ysize () - mybox.bottom ());
      havespace = 0;
    }

    if (fprintf (fp, "%s", buff) < 0)
      WRITEFAILED.error ("write_cooked_text", EXIT,
                         "Word Errno: %d", errno);
    if (pass2) {
      if (fprintf (fp, BOLD_OFF) < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
                           "Bold off Errno: %d", errno);
    }
    if (!acceptable) {
      if (fprintf (fp, UNDERLINE_OFF) < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
                           "Underline off Errno: %d", errno);
    }
  }
  if (word->flag (W_EOL)) {
    int wrote = fprintf (fp, "\n");
    newaline = 1;
    if (wrote < 0)
      WRITEFAILED.error ("write_cooked_text", EXIT,
                         "Newline Errno: %d", errno);
  }
  if (fflush (fp) != 0)
    WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
/**********************************************************************
|
|
765
|
+
* write_shm_text
|
|
766
|
+
*
|
|
767
|
+
* Write the cooked text to the shared memory for the api.
|
|
768
|
+
**********************************************************************/
|
|
769
|
+
|
|
770
|
+
void write_shm_text( //write output
|
|
771
|
+
WERD_RES *word, //word to do
|
|
772
|
+
BLOCK *block, //block it is from
|
|
773
|
+
ROW_RES *row, //row it is from
|
|
774
|
+
const STRING &text, //text to write
|
|
775
|
+
const STRING &text_lengths
|
|
776
|
+
) {
|
|
777
|
+
inT32 index; //char counter
|
|
778
|
+
inT32 index2; //char counter
|
|
779
|
+
inT32 length; //chars in word
|
|
780
|
+
inT32 ptsize; //font size
|
|
781
|
+
inT8 blanks; //blanks in word
|
|
782
|
+
uinT8 enhancement; //bold etc
|
|
783
|
+
uinT8 font; //font index
|
|
784
|
+
char unrecognised = STRING (unrecognised_char)[0];
|
|
785
|
+
PBLOB *blob;
|
|
786
|
+
TBOX blob_box; //bounding box
|
|
787
|
+
PBLOB_IT blob_it; //blob iterator
|
|
788
|
+
WERD copy_outword; // copy to denorm
|
|
789
|
+
uinT32 rating; //of char
|
|
790
|
+
BOOL8 lineend; //end of line
|
|
791
|
+
int offset;
|
|
792
|
+
int offset2;
|
|
793
|
+
|
|
794
|
+
//point size
|
|
795
|
+
ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
|
|
796
|
+
if (word->word->flag (W_BOL) && ocr_char_space () < 128
|
|
797
|
+
&& ocr_send_text (TRUE) != OKAY)
|
|
798
|
+
return; //release failed
|
|
799
|
+
copy_outword = *(word->outword);
|
|
800
|
+
copy_outword.baseline_denormalise (&word->denorm);
|
|
801
|
+
blob_it.set_to_list (copy_outword.blob_list ());
|
|
802
|
+
length = text_lengths.length ();
|
|
803
|
+
|
|
804
|
+
if (length > 0) {
|
|
805
|
+
blanks = word->word->space ();
|
|
806
|
+
if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
|
|
807
|
+
blanks = 1;
|
|
808
|
+
for (index = 0, offset = 0; index < length;
|
|
809
|
+
offset += text_lengths[index++], blob_it.forward ()) {
|
|
810
|
+
blob = blob_it.data ();
|
|
811
|
+
blob_box = blob->bounding_box ();
|
|
812
|
+
|
|
813
|
+
enhancement = 0;
|
|
814
|
+
if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
|
|
815
|
+
enhancement |= EUC_ITALIC;
|
|
816
|
+
if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
|
|
817
|
+
enhancement |= EUC_BOLD;
|
|
818
|
+
if (tessedit_write_ratings)
|
|
819
|
+
rating = (uinT32) (-word->best_choice->certainty () / 0.035);
|
|
820
|
+
else if (tessedit_zero_rejection)
|
|
821
|
+
rating = text[offset] == ' ' ? 100 : 0;
|
|
822
|
+
else
|
|
823
|
+
rating = word->reject_map[index].accepted ()? 0 : 100;
|
|
824
|
+
if (rating > 255)
|
|
825
|
+
rating = 255;
|
|
826
|
+
if (word->font1_count > 2)
|
|
827
|
+
font = word->font1;
|
|
828
|
+
else if (row->font1_count > 8)
|
|
829
|
+
font = row->font1;
|
|
830
|
+
else
|
|
831
|
+
//font index
|
|
832
|
+
font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
|
|
833
|
+
|
|
834
|
+
lineend = word->word->flag (W_EOL) && index == length - 1;
|
|
835
|
+
if (word->word->flag (W_EOL) && tessedit_zero_rejection
|
|
836
|
+
&& index < length - 1 && text[index + text_lengths[index]] == ' ') {
|
|
837
|
+
for (index2 = index + 1, offset2 = offset + text_lengths[index];
|
|
838
|
+
index2 < length && text[offset2] == ' ';
|
|
839
|
+
offset2 += text_lengths[index2++]);
|
|
840
|
+
if (index2 == length)
|
|
841
|
+
lineend = TRUE;
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
if (!tessedit_zero_rejection || text[offset] != ' '
|
|
845
|
+
|| tessedit_word_for_word) {
|
|
846
|
+
//confidence
|
|
847
|
+
if (text[offset] == ' ') {
|
|
848
|
+
ocr_append_char (unrecognised,
|
|
849
|
+
blob_box.left (), blob_box.right (),
|
|
850
|
+
page_image.get_ysize () - 1 - blob_box.top (),
|
|
851
|
+
page_image.get_ysize () - 1 - blob_box.bottom (),
|
|
852
|
+
font, (uinT8) rating,
|
|
853
|
+
ptsize, //point size
|
|
854
|
+
blanks, enhancement, //enhancement
|
|
855
|
+
OCR_CDIR_LEFT_RIGHT,
|
|
856
|
+
OCR_LDIR_DOWN_RIGHT,
|
|
857
|
+
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
|
|
858
|
+
} else {
|
|
859
|
+
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
|
|
860
|
+
ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
|
|
861
|
+
blob_box.left (), blob_box.right (),
|
|
862
|
+
page_image.get_ysize () - 1 - blob_box.top (),
|
|
863
|
+
page_image.get_ysize () - 1 - blob_box.bottom (),
|
|
864
|
+
font, (uinT8) rating,
|
|
865
|
+
ptsize, //point size
|
|
866
|
+
blanks, enhancement, //enhancement
|
|
867
|
+
OCR_CDIR_LEFT_RIGHT,
|
|
868
|
+
OCR_LDIR_DOWN_RIGHT,
|
|
869
|
+
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
|
|
870
|
+
}
|
|
871
|
+
blanks = 0;
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
else if (tessedit_word_for_word) {
|
|
877
|
+
blanks = word->word->space ();
|
|
878
|
+
if (blanks == 0 && !word->word->flag (W_BOL))
|
|
879
|
+
blanks = 1;
|
|
880
|
+
blob_box = word->word->bounding_box ();
|
|
881
|
+
|
|
882
|
+
enhancement = 0;
|
|
883
|
+
if (word->italic > 0)
|
|
884
|
+
enhancement |= EUC_ITALIC;
|
|
885
|
+
if (word->bold > 0)
|
|
886
|
+
enhancement |= EUC_BOLD;
|
|
887
|
+
rating = 100;
|
|
888
|
+
if (word->font1_count > 2)
|
|
889
|
+
font = word->font1;
|
|
890
|
+
else if (row->font1_count > 8)
|
|
891
|
+
font = row->font1;
|
|
892
|
+
else
|
|
893
|
+
//font index
|
|
894
|
+
font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
|
|
895
|
+
|
|
896
|
+
lineend = word->word->flag (W_EOL);
|
|
897
|
+
|
|
898
|
+
//font index
|
|
899
|
+
ocr_append_char (unrecognised,
|
|
900
|
+
blob_box.left (), blob_box.right (),
|
|
901
|
+
page_image.get_ysize () - 1 - blob_box.top (),
|
|
902
|
+
page_image.get_ysize () - 1 - blob_box.bottom (),
|
|
903
|
+
font,
|
|
904
|
+
rating, //confidence
|
|
905
|
+
ptsize, //point size
|
|
906
|
+
blanks, enhancement, //enhancement
|
|
907
|
+
OCR_CDIR_LEFT_RIGHT,
|
|
908
|
+
OCR_LDIR_DOWN_RIGHT,
|
|
909
|
+
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
|
|
910
|
+
}
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
/**********************************************************************
 * write_map
 *
 * Write a map file of 0's and 1's which associates characters from the .txt
 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
 * is kept.  Note that there may be reject regions in the .etx file WITHOUT
 * .txt chars being rejected.  The map file should be the same length, and
 * the same number of lines as the .txt file
 *
 * The parameterised input is because I thought I might be able to generate
 * multiple map files in a single run.  However, it didn't work because
 * newdiff needs etx files!
 *
 * NOTE: this function is currently compiled out with #if 0.
 **********************************************************************/

#if 0
void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
  STRING mapstr = "";
  int status;
  inT16 pos;

  if (word->best_choice->string ().length () > 0) {
    // One flag per blank preceding the word.
    for (pos = 0; pos < word->word->space (); pos++) {
      const char *space_flag = "1";
      /* Write rejected spaces to .map file ONLY. Newdiff converts these back to
         accepted spaces AFTER generating basic space stats but BEFORE using .etx */
      if (word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection)
        space_flag = "0";
      status = fprintf (mapfile, "%s", space_flag);
      if (status < 0)
        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
    }

    if (!(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
      // Ordinary word: one flag per entry of the reject map.
      ASSERT_HOST (word->reject_map.length () ==
        word->best_choice->string ().length ());

      for (pos = 0; pos < word->reject_map.length (); pos++)
        mapstr += word->reject_map[pos].accepted () ? '1' : '0';
    }
    else {
      // Repeated-char word written as a rep code: five kept characters.
      for (pos = 0; pos < 5; pos++)
        mapstr += '1';
    }
    status = fprintf (mapfile, "%s", mapstr.string ());
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (mapfile, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
  }
  status = fflush (mapfile);
  if (status != 0)
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
/*************************************************************************
|
|
982
|
+
* open_file()
|
|
983
|
+
*************************************************************************/
|
|
984
|
+
|
|
985
|
+
FILE *open_outfile( //open .map & .unlv file
|
|
986
|
+
const char *extension) {
|
|
987
|
+
STRING file_name;
|
|
988
|
+
FILE *outfile;
|
|
989
|
+
|
|
990
|
+
file_name = imagebasename + extension;
|
|
991
|
+
if (!(outfile = fopen (file_name.string (), "w"))) {
|
|
992
|
+
CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
|
|
993
|
+
file_name.string (), errno);
|
|
994
|
+
}
|
|
995
|
+
return outfile;
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
#if 0
/*
 * write_unlv_text()
 * Write one word to unlv_file: its leading blanks, then its text with the
 * unrecognised character substituted for ' ', '~', '^' and '|', and a '^'
 * suspect marker in front of every rejected character.  A newline follows
 * when the word is flagged W_EOL, and the stream is flushed on every call.
 *
 * The output is built in a STRING rather than a fixed char[512]: each input
 * byte can produce two output bytes, so a fixed buffer could be overrun by
 * a long word.
 *
 * NOTE: this function is currently compiled out with #if 0.
 */
void write_unlv_text(WERD_RES *word) {
  const char *wordstr;
  STRING buff = "";              //string to output
  const char *space_str;         //text written for each leading blank
  int i;
  char unrecognised = STRING (unrecognised_char)[0];
  int status;

  wordstr = word->best_choice->string ().string ();

  /* DONT need to do anything special for repeated char words - at this stage
     the repetition char has been identified and any other chars have been
     rejected.
  */

  for (i = 0; wordstr[i] != '\0'; i++) {
    if ((wordstr[i] == ' ') ||
      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
      buff += unrecognised;
    else {
      if (word->reject_map[i].rejected ())
        buff += '^';             //Add suspect marker
      buff += wordstr[i];
    }
  }

  if (wordstr[0] != '\0') {
    if (word->reject_spaces &&
      (suspect_level >= suspect_space_level) &&
      !tessedit_minimal_rejection && !tessedit_zero_rejection)
      space_str = "^ ";          //Suspect space
    else
      space_str = " ";           //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
          "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", buff.string ());
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
        "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
|
|
1059
|
+
|
|
1060
|
+
|
|
1061
|
+
/*************************************************************************
|
|
1062
|
+
* get_rep_char()
|
|
1063
|
+
* Return the first accepted character from the repetition string. This is the
|
|
1064
|
+
* character which is repeated - as determined earlier by fix_rep_char()
|
|
1065
|
+
*************************************************************************/
|
|
1066
|
+
UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
|
|
1067
|
+
int i;
|
|
1068
|
+
int offset;
|
|
1069
|
+
|
|
1070
|
+
for (i = 0, offset = 0;
|
|
1071
|
+
((i < word->reject_map.length ()) &&
|
|
1072
|
+
(word->reject_map[i].rejected ()));
|
|
1073
|
+
offset += word->best_choice->lengths()[i++]);
|
|
1074
|
+
if (i < word->reject_map.length ())
|
|
1075
|
+
return unicharset.unichar_to_id(word->best_choice->string().string()
|
|
1076
|
+
+ offset,
|
|
1077
|
+
word->best_choice->lengths()[i]);
|
|
1078
|
+
else
|
|
1079
|
+
return unicharset.unichar_to_id(unrecognised_char.string());
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
// Currently a no-op: both implementations below are compiled out with
// #if 0.  The first overwrote every byte of the best choice string with the
// repeated character; the second rebuilt the string and its lengths from
// the repeated unichar and is marked to be reactivated.
void ensure_rep_chars_are_consistent(WERD_RES *word) {
#if 0
  char rep_char = get_rep_char (word);
  char *ptr;

  ptr = (char *) word->best_choice->string ().string ();
  for (; *ptr != '\0'; ptr++) {
    if (*ptr != rep_char)
      *ptr = rep_char;
  }
#endif

#if 0
  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
  int i;
  char *ptr;
  STRING consistent_string;
  STRING consistent_string_lengths;

  ptr = (char *) word->best_choice->string ().string ();
  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
    consistent_string += unicharset.id_to_unichar(rep_char);
    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
  }
  word->best_choice->string() = consistent_string;
  word->best_choice->lengths() = consistent_string_lengths;
#endif
}
|
|
1110
|
+
|
|
1111
|
+
/*************************************************************************
|
|
1112
|
+
* SUSPECT LEVELS
|
|
1113
|
+
*
|
|
1114
|
+
* 0 - dont reject ANYTHING
|
|
1115
|
+
* 1,2 - partial rejection
|
|
1116
|
+
* 3 - BEST
|
|
1117
|
+
*
|
|
1118
|
+
* NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
|
|
1119
|
+
* tessedit_minimal_rejection.
|
|
1120
|
+
*************************************************************************/
|
|
1121
|
+
|
|
1122
|
+
void set_unlv_suspects(WERD_RES *word) {
|
|
1123
|
+
int len = word->reject_map.length ();
|
|
1124
|
+
int i;
|
|
1125
|
+
int offset;
|
|
1126
|
+
const char *ptr;
|
|
1127
|
+
const char *lengths = word->best_choice->lengths ().string ();
|
|
1128
|
+
float rating_per_ch;
|
|
1129
|
+
|
|
1130
|
+
ptr = word->best_choice->string ().string ();
|
|
1131
|
+
|
|
1132
|
+
if (suspect_level == 0) {
|
|
1133
|
+
for (i = 0; i < len; i++) {
|
|
1134
|
+
if (word->reject_map[i].rejected ())
|
|
1135
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1136
|
+
}
|
|
1137
|
+
return;
|
|
1138
|
+
}
|
|
1139
|
+
|
|
1140
|
+
if (suspect_level >= 3)
|
|
1141
|
+
return; //Use defaults
|
|
1142
|
+
|
|
1143
|
+
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
|
|
1144
|
+
|
|
1145
|
+
if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
|
|
1146
|
+
suspect_short_words)) {
|
|
1147
|
+
/* Unreject alphas in dictionary words */
|
|
1148
|
+
for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
|
|
1149
|
+
if (word->reject_map[i].rejected () &&
|
|
1150
|
+
unicharset.get_isalpha (ptr + offset, lengths[i]))
|
|
1151
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
|
|
1156
|
+
|
|
1157
|
+
if (rating_per_ch >= suspect_rating_per_ch)
|
|
1158
|
+
return; //Dont touch bad ratings
|
|
1159
|
+
|
|
1160
|
+
if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
|
|
1161
|
+
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
|
|
1162
|
+
for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
|
|
1163
|
+
if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
|
|
1164
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
for (i = 0; i < len; i++) {
|
|
1169
|
+
if (word->reject_map[i].rejected ()) {
|
|
1170
|
+
if (word->reject_map[i].flag (R_DOC_REJ))
|
|
1171
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1172
|
+
if (word->reject_map[i].flag (R_BLOCK_REJ))
|
|
1173
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1174
|
+
if (word->reject_map[i].flag (R_ROW_REJ))
|
|
1175
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1176
|
+
}
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
if (suspect_level == 2)
|
|
1180
|
+
return;
|
|
1181
|
+
|
|
1182
|
+
if (!suspect_constrain_1Il ||
|
|
1183
|
+
(word->reject_map.length () <= suspect_short_words)) {
|
|
1184
|
+
for (i = 0; i < len; i++) {
|
|
1185
|
+
if (word->reject_map[i].rejected ()) {
|
|
1186
|
+
if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
|
|
1187
|
+
word->reject_map[i].flag (R_POSTNN_1IL)))
|
|
1188
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1189
|
+
|
|
1190
|
+
if (!suspect_constrain_1Il &&
|
|
1191
|
+
word->reject_map[i].flag (R_MM_REJECT))
|
|
1192
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1193
|
+
}
|
|
1194
|
+
}
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
if ((acceptable_word_string (word->best_choice->string ().string (),
|
|
1198
|
+
word->best_choice->lengths ().string ())
|
|
1199
|
+
!= AC_UNACCEPTABLE) ||
|
|
1200
|
+
acceptable_number_string (word->best_choice->string ().string (),
|
|
1201
|
+
word->best_choice->lengths ().string ())) {
|
|
1202
|
+
if (word->reject_map.length () > suspect_short_words) {
|
|
1203
|
+
for (i = 0; i < len; i++) {
|
|
1204
|
+
if (word->reject_map[i].rejected () &&
|
|
1205
|
+
(!word->reject_map[i].perm_rejected () ||
|
|
1206
|
+
word->reject_map[i].flag (R_1IL_CONFLICT) ||
|
|
1207
|
+
word->reject_map[i].flag (R_POSTNN_1IL) ||
|
|
1208
|
+
word->reject_map[i].flag (R_MM_REJECT))) {
|
|
1209
|
+
word->reject_map[i].setrej_minimal_rej_accept ();
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
}
|
|
1215
|
+
|
|
1216
|
+
|
|
1217
|
+
// Count the characters of s that unicharset reports as alphabetic.
// s is walked up to its terminating '\0'; lengths[i] is the number of
// bytes occupied by the i-th character of s.
inT16 count_alphas(  //how many alphas
                   const char *s,
                   const char *lengths) {
  int alphas = 0;
  int byte_pos = 0;

  for (int ch = 0; s[byte_pos] != '\0'; byte_pos += lengths[ch], ++ch) {
    if (unicharset.get_isalpha(s + byte_pos, lengths[ch]))
      ++alphas;
  }
  return alphas;
}
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
// Count the characters of s that unicharset reports as alphabetic or as
// digits.  s is walked up to its terminating '\0'; lengths[i] is the
// number of bytes occupied by the i-th character of s.
inT16 count_alphanums(  //how many alphanums
                      const char *s,
                      const char *lengths) {
  int alphanums = 0;
  int byte_pos = 0;

  for (int ch = 0; s[byte_pos] != '\0'; byte_pos += lengths[ch], ++ch) {
    const char *cur = s + byte_pos;
    if (unicharset.get_isalpha(cur, lengths[ch]) ||
        unicharset.get_isdigit(cur, lengths[ch]))
      ++alphanums;
  }
  return alphanums;
}
|
|
1242
|
+
|
|
1243
|
+
|
|
1244
|
+
// Decide whether s looks like a number.
//
// s:        the text, terminated by '\0'
// lengths:  lengths[i] is the number of bytes occupied by the i-th
//           character of s
//
// Accepted shape: an optional '(' followed by an optional one of
// '$' '.' '+' '-', then digits in which a single '.', ',' or '-' may follow
// a digit, optionally ending in '%', ')' or "%)" directly after a digit.
// Returns TRUE for such strings (including the empty string) and FALSE as
// soon as any other character is met.
BOOL8 acceptable_number_string(const char *s,
                               const char *lengths) {
  BOOL8 prev_digit = FALSE;

  // Skip the optional prefix characters.  lengths must be advanced together
  // with s, otherwise every later character would be paired with the length
  // of an earlier one.
  if (*lengths == 1 && *s == '(') {
    s++;
    lengths++;
  }

  if (*lengths == 1 &&
    ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
    s++;
    lengths++;
  }

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit (s, *lengths))
      prev_digit = TRUE;
    else if (prev_digit &&
      (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
      prev_digit = FALSE;        // separator: a digit must follow before another
    else if (prev_digit && *lengths == 1 &&
      (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
      return TRUE;               // trailing '%' or ')'
    else if (prev_digit &&
      *lengths == 1 && (*s == '%') &&
      (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
      (*(s + *lengths + *(lengths + 1)) == '\0'))
      return TRUE;               // trailing "%)"
    else
      return FALSE;
  }
  return TRUE;
}
|