tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
|
@@ -0,0 +1,1105 @@
|
|
|
1
|
+
/**********************************************************************
|
|
2
|
+
* File: baseapi.cpp
|
|
3
|
+
* Description: Simple API for calling tesseract.
|
|
4
|
+
* Author: Ray Smith
|
|
5
|
+
* Created: Fri Oct 06 15:35:01 PDT 2006
|
|
6
|
+
*
|
|
7
|
+
* (C) Copyright 2006, Google Inc.
|
|
8
|
+
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
** you may not use this file except in compliance with the License.
|
|
10
|
+
** You may obtain a copy of the License at
|
|
11
|
+
** http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
** Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
** See the License for the specific language governing permissions and
|
|
16
|
+
** limitations under the License.
|
|
17
|
+
*
|
|
18
|
+
**********************************************************************/
|
|
19
|
+
|
|
20
|
+
#include "baseapi.h"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
// Include automatically generated configuration file if running autoconf.
|
|
24
|
+
#ifdef HAVE_CONFIG_H
|
|
25
|
+
#include "config_auto.h"
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
#ifdef HAVE_LIBLEPT
|
|
29
|
+
// Include leptonica library only if autoconf (or makefile etc) tell us to.
|
|
30
|
+
#include "allheaders.h"
|
|
31
|
+
#endif
|
|
32
|
+
|
|
33
|
+
#include "tessedit.h"
|
|
34
|
+
#include "ocrclass.h"
|
|
35
|
+
#include "pageres.h"
|
|
36
|
+
#include "tessvars.h"
|
|
37
|
+
#include "control.h"
|
|
38
|
+
#include "applybox.h"
|
|
39
|
+
#include "pgedit.h"
|
|
40
|
+
#include "varabled.h"
|
|
41
|
+
#include "variables.h"
|
|
42
|
+
#include "output.h"
|
|
43
|
+
#include "globals.h"
|
|
44
|
+
#include "adaptmatch.h"
|
|
45
|
+
#include "edgblob.h"
|
|
46
|
+
#include "tessbox.h"
|
|
47
|
+
#include "tordvars.h"
|
|
48
|
+
#include "imgs.h"
|
|
49
|
+
#include "makerow.h"
|
|
50
|
+
#include "tstruct.h"
|
|
51
|
+
#include "tessout.h"
|
|
52
|
+
#include "tface.h"
|
|
53
|
+
#include "permute.h"
|
|
54
|
+
|
|
55
|
+
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
|
|
56
|
+
"Take segmentation and labeling from box file");
|
|
57
|
+
BOOL_VAR(tessedit_train_from_boxes, FALSE,
|
|
58
|
+
"Generate training data from boxed chars");
|
|
59
|
+
|
|
60
|
+
// Minimum sensible image size to be worth running tesseract.
|
|
61
|
+
const int kMinRectSize = 10;
|
|
62
|
+
|
|
63
|
+
static STRING input_file = "noname.tif";
|
|
64
|
+
|
|
65
|
+
// Set the value of an internal "variable" (of either old or new types).
|
|
66
|
+
// Supply the name of the variable and the value as a string, just as
|
|
67
|
+
// you would in a config file.
|
|
68
|
+
// Returns false if the name lookup failed.
|
|
69
|
+
bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
|
|
70
|
+
if (set_new_style_variable(variable, value))
|
|
71
|
+
return true;
|
|
72
|
+
return set_old_style_variable(variable, value);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Convenience initializer: starts tesseract for the given data path and
// language with no output base name, no config file and no argc/argv.
// The result of InitWithLanguage is discarded.
void TessBaseAPI::SimpleInit(const char* datapath,
                             const char* language,
                             bool numeric_mode) {
  const char* const no_outputbase = NULL;
  const char* const no_configfile = NULL;
  InitWithLanguage(datapath, no_outputbase, language, no_configfile,
                   numeric_mode, 0, NULL);
}
|
|
80
|
+
|
|
81
|
+
// Start tesseract.
|
|
82
|
+
// The datapath must be the name of the data directory or some other file
|
|
83
|
+
// in which the data directory resides (for instance argv[0].)
|
|
84
|
+
// The configfile is the name of a file in the tessconfigs directory
|
|
85
|
+
// (eg batch) or NULL to run on defaults.
|
|
86
|
+
// Outputbase may also be NULL, and is the basename of various output files.
|
|
87
|
+
// If the output of any of these files is enabled, then a name must be given.
|
|
88
|
+
// If numeric_mode is true, only possible digits and roman numbers are
|
|
89
|
+
// returned. Returns 0 if successful. Crashes if not.
|
|
90
|
+
// The argc and argv may be 0 and NULL respectively. They are used for
|
|
91
|
+
// providing config files for debug/display purposes.
|
|
92
|
+
// TODO(rays) get the facts straight. Is it OK to call
|
|
93
|
+
// it more than once? Make it properly check for errors and return them.
|
|
94
|
+
int TessBaseAPI::Init(const char* datapath, const char* outputbase,
|
|
95
|
+
const char* configfile, bool numeric_mode,
|
|
96
|
+
int argc, char* argv[]) {
|
|
97
|
+
return InitWithLanguage(datapath, outputbase, NULL, configfile,
|
|
98
|
+
numeric_mode, argc, argv);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Start tesseract.
|
|
102
|
+
// Similar to Init() except that it is possible to specify the language.
|
|
103
|
+
// Language is the code of the language for which the data will be loaded.
|
|
104
|
+
// (Codes follow ISO 639-3.) If it is NULL, english (eng) will be loaded.
|
|
105
|
+
int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
|
|
106
|
+
const char* language, const char* configfile,
|
|
107
|
+
bool numeric_mode, int argc, char* argv[]) {
|
|
108
|
+
int result = init_tesseract(datapath, outputbase, language,
|
|
109
|
+
configfile, argc, argv);
|
|
110
|
+
|
|
111
|
+
bln_numericmode.set_value(numeric_mode);
|
|
112
|
+
return result;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// Init the lang model component of Tesseract
|
|
116
|
+
int TessBaseAPI::InitLangMod(const char* datapath, const char* outputbase,
|
|
117
|
+
const char* language, const char* configfile,
|
|
118
|
+
bool numeric_mode, int argc, char* argv[]) {
|
|
119
|
+
return init_tesseract_lm(datapath, outputbase, language,
|
|
120
|
+
configfile, argc, argv);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Set the name of the input file. Needed only for training and
|
|
124
|
+
// loading a UNLV zone file.
|
|
125
|
+
void TessBaseAPI::SetInputName(const char* name) {
  // Stored in the file-scope input_file, which FindLines() later passes to
  // pgeditor_read_file().
  input_file = name;
}
|
|
128
|
+
|
|
129
|
+
// Recognize a rectangle from an image and return the result as a string.
|
|
130
|
+
// May be called many times for a single Init.
|
|
131
|
+
// Currently has no error checking.
|
|
132
|
+
// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
|
|
133
|
+
// Palette color images will not work properly and must be converted to
|
|
134
|
+
// 24 bit.
|
|
135
|
+
// Binary images of 1 bit per pixel may also be given but they must be
|
|
136
|
+
// byte packed with the MSB of the first byte being the first pixel, and a
|
|
137
|
+
// one pixel is WHITE. For binary images set bytes_per_pixel=0.
|
|
138
|
+
// The recognized text is returned as a char* which (in future will be coded
|
|
139
|
+
// as UTF8 and) must be freed with the delete [] operator.
|
|
140
|
+
char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
|
|
141
|
+
int bytes_per_pixel,
|
|
142
|
+
int bytes_per_line,
|
|
143
|
+
int left, int top,
|
|
144
|
+
int width, int height) {
|
|
145
|
+
if (width < kMinRectSize || height < kMinRectSize)
|
|
146
|
+
return NULL; // Nothing worth doing.
|
|
147
|
+
|
|
148
|
+
// Copy/Threshold the image to the tesseract global page_image.
|
|
149
|
+
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
|
150
|
+
left, top, width, height);
|
|
151
|
+
|
|
152
|
+
return RecognizeToString();
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// As TesseractRect but produces a box file as output.
|
|
156
|
+
char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
|
|
157
|
+
int bytes_per_pixel,
|
|
158
|
+
int bytes_per_line,
|
|
159
|
+
int left, int top,
|
|
160
|
+
int width, int height,
|
|
161
|
+
int imageheight) {
|
|
162
|
+
if (width < kMinRectSize || height < kMinRectSize)
|
|
163
|
+
return NULL; // Nothing worth doing.
|
|
164
|
+
|
|
165
|
+
// Copy/Threshold the image to the tesseract global page_image.
|
|
166
|
+
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
|
167
|
+
left, top, width, height);
|
|
168
|
+
|
|
169
|
+
BLOCK_LIST block_list;
|
|
170
|
+
|
|
171
|
+
FindLines(&block_list);
|
|
172
|
+
|
|
173
|
+
// Now run the main recognition.
|
|
174
|
+
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
|
175
|
+
|
|
176
|
+
return TesseractToBoxText(page_res, left, imageheight - (top + height));
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
|
|
180
|
+
int bytes_per_pixel,
|
|
181
|
+
int bytes_per_line,
|
|
182
|
+
int left, int top,
|
|
183
|
+
int width, int height) {
|
|
184
|
+
if (width < kMinRectSize || height < kMinRectSize)
|
|
185
|
+
return NULL; // Nothing worth doing.
|
|
186
|
+
|
|
187
|
+
// Copy/Threshold the image to the tesseract global page_image.
|
|
188
|
+
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
|
189
|
+
left, top, width, height);
|
|
190
|
+
|
|
191
|
+
BLOCK_LIST block_list;
|
|
192
|
+
|
|
193
|
+
FindLines(&block_list);
|
|
194
|
+
|
|
195
|
+
// Now run the main recognition.
|
|
196
|
+
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
|
197
|
+
|
|
198
|
+
return TesseractToUNLV(page_res);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Call between pages or documents etc to free up memory and forget
|
|
202
|
+
// adaptive data.
|
|
203
|
+
void TessBaseAPI::ClearAdaptiveClassifier() {
  // Thin public wrapper: all the work is done by ResetAdaptiveClassifier().
  ResetAdaptiveClassifier();
}
|
|
206
|
+
|
|
207
|
+
// Close down tesseract and free up memory.
|
|
208
|
+
void TessBaseAPI::End() {
  // Reset the adaptive classifier first, then shut tesseract down.
  ResetAdaptiveClassifier();
  end_tesseract();
}
|
|
212
|
+
|
|
213
|
+
// Dump the internal binary image to a PGM file.
|
|
214
|
+
void TessBaseAPI::DumpPGM(const char* filename) {
|
|
215
|
+
IMAGELINE line;
|
|
216
|
+
line.init(page_image.get_xsize());
|
|
217
|
+
FILE *fp = fopen(filename, "w");
|
|
218
|
+
fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", page_image.get_xsize(),
|
|
219
|
+
page_image.get_ysize());
|
|
220
|
+
for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
|
|
221
|
+
page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
|
|
222
|
+
for (int i = 0; i < page_image.get_xsize(); ++i) {
|
|
223
|
+
uinT8 b = line.pixels[i] ? 255 : 0;
|
|
224
|
+
fwrite(&b, 1, 1, fp);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
fclose(fp);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
#ifdef HAVE_LIBLEPT
|
|
231
|
+
// ONLY available if you have Leptonica installed.
|
|
232
|
+
// Get a copy of the thresholded global image from Tesseract.
|
|
233
|
+
Pix* TessBaseAPI::GetTesseractImage() {
  // The conversion is delegated entirely to the global page_image.
  Pix* pix = page_image.ToPix();
  return pix;
}
|
|
236
|
+
#endif // HAVE_LIBLEPT
|
|
237
|
+
|
|
238
|
+
// Copy the given image rectangle to Tesseract, with adaptive thresholding
|
|
239
|
+
// if the image is not already binary.
|
|
240
|
+
void TessBaseAPI::CopyImageToTesseract(const unsigned char* imagedata,
|
|
241
|
+
int bytes_per_pixel,
|
|
242
|
+
int bytes_per_line,
|
|
243
|
+
int left, int top,
|
|
244
|
+
int width, int height) {
|
|
245
|
+
if (bytes_per_pixel > 0) {
|
|
246
|
+
// Threshold grey or color.
|
|
247
|
+
int* thresholds = new int[bytes_per_pixel];
|
|
248
|
+
int* hi_values = new int[bytes_per_pixel];
|
|
249
|
+
|
|
250
|
+
// Compute the thresholds.
|
|
251
|
+
OtsuThreshold(imagedata, bytes_per_pixel, bytes_per_line,
|
|
252
|
+
left, top, left + width, top + height,
|
|
253
|
+
thresholds, hi_values);
|
|
254
|
+
|
|
255
|
+
// Threshold the image to the tesseract global page_image.
|
|
256
|
+
ThresholdRect(imagedata, bytes_per_pixel, bytes_per_line,
|
|
257
|
+
left, top, width, height,
|
|
258
|
+
thresholds, hi_values);
|
|
259
|
+
delete [] thresholds;
|
|
260
|
+
delete [] hi_values;
|
|
261
|
+
} else {
|
|
262
|
+
CopyBinaryRect(imagedata, bytes_per_line, left, top, width, height);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// Compute the Otsu threshold(s) for the given image rectangle, making one
|
|
267
|
+
// for each channel. Each channel is always one byte per pixel.
|
|
268
|
+
// Returns an array of threshold values and an array of hi_values, such
|
|
269
|
+
// that a pixel value >threshold[channel] is considered foreground if
|
|
270
|
+
// hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates
|
|
271
|
+
// that there is no apparent foreground. At least one hi_value will not be -1.
|
|
272
|
+
// thresholds and hi_values are assumed to be of bytes_per_pixel size.
|
|
273
|
+
void TessBaseAPI::OtsuThreshold(const unsigned char* imagedata,
|
|
274
|
+
int bytes_per_pixel,
|
|
275
|
+
int bytes_per_line,
|
|
276
|
+
int left, int top, int right, int bottom,
|
|
277
|
+
int* thresholds,
|
|
278
|
+
int* hi_values) {
|
|
279
|
+
// Of all channels with no good hi_value, keep the best so we can always
|
|
280
|
+
// produce at least one answer.
|
|
281
|
+
int best_hi_value = 0;
|
|
282
|
+
int best_hi_index = 0;
|
|
283
|
+
bool any_good_hivalue = false;
|
|
284
|
+
double best_hi_dist = 0.0;
|
|
285
|
+
|
|
286
|
+
for (int ch = 0; ch < bytes_per_pixel; ++ch) {
|
|
287
|
+
thresholds[ch] = 0;
|
|
288
|
+
hi_values[ch] = -1;
|
|
289
|
+
// Compute the histogram of the image rectangle.
|
|
290
|
+
int histogram[256];
|
|
291
|
+
HistogramRect(imagedata + ch, bytes_per_pixel, bytes_per_line,
|
|
292
|
+
left, top, right, bottom, histogram);
|
|
293
|
+
int H;
|
|
294
|
+
int best_omega_0;
|
|
295
|
+
int best_t = OtsuStats(histogram, &H, &best_omega_0);
|
|
296
|
+
if (best_omega_0 == 0 || best_omega_0 == H) {
|
|
297
|
+
// This channel is empty.
|
|
298
|
+
continue;
|
|
299
|
+
}
|
|
300
|
+
// To be a convincing foreground we must have a small fraction of H
|
|
301
|
+
// or to be a convincing background we must have a large fraction of H.
|
|
302
|
+
// In between we assume this channel contains no thresholding information.
|
|
303
|
+
int hi_value = best_omega_0 < H * 0.5;
|
|
304
|
+
thresholds[ch] = best_t;
|
|
305
|
+
if (best_omega_0 > H * 0.75) {
|
|
306
|
+
any_good_hivalue = true;
|
|
307
|
+
hi_values[ch] = 0;
|
|
308
|
+
}
|
|
309
|
+
else if (best_omega_0 < H * 0.25) {
|
|
310
|
+
any_good_hivalue = true;
|
|
311
|
+
hi_values[ch] = 1;
|
|
312
|
+
}
|
|
313
|
+
else {
|
|
314
|
+
// In case all channels are like this, keep the best of the bad lot.
|
|
315
|
+
double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
|
|
316
|
+
if (hi_dist > best_hi_dist) {
|
|
317
|
+
best_hi_dist = hi_dist;
|
|
318
|
+
best_hi_value = hi_value;
|
|
319
|
+
best_hi_index = ch;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
if (!any_good_hivalue) {
|
|
324
|
+
// Use the best of the ones that were not good enough.
|
|
325
|
+
hi_values[best_hi_index] = best_hi_value;
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
// Compute the histogram for the given image rectangle, and the given
|
|
330
|
+
// channel. (Channel pointed to by imagedata.) Each channel is always
|
|
331
|
+
// one byte per pixel.
|
|
332
|
+
// Bytes per pixel is used to skip channels not being
|
|
333
|
+
// counted with this call in a multi-channel (pixel-major) image.
|
|
334
|
+
// Histogram is always a 256 element array to count occurrences of
|
|
335
|
+
// each pixel value.
|
|
336
|
+
void TessBaseAPI::HistogramRect(const unsigned char* imagedata,
                                int bytes_per_pixel,
                                int bytes_per_line,
                                int left, int top, int right, int bottom,
                                int* histogram) {
  // Clear all 256 bins before counting.
  memset(histogram, 0, 256 * sizeof(histogram[0]));

  const int columns = right - left;
  // First sample of the rectangle's top row for the channel that imagedata
  // points at.
  const unsigned char* row = imagedata +
                             top*bytes_per_line +
                             left*bytes_per_pixel;
  int y = top;
  while (y < bottom) {
    // Stepping by bytes_per_pixel skips the other channels of each pixel.
    for (int col = 0; col < columns; ++col) {
      const unsigned char sample = row[col * bytes_per_pixel];
      ++histogram[sample];
    }
    row += bytes_per_line;
    ++y;
  }
}
|
|
353
|
+
|
|
354
|
+
// Compute the Otsu threshold(s) for the given histogram.
|
|
355
|
+
// Also returns H = total count in histogram, and
|
|
356
|
+
// omega0 = count of histogram below threshold.
|
|
357
|
+
// Computes the Otsu threshold for a 256-bin histogram.
//   histogram   256 counts, one per pixel value.
//   H_out       if not NULL, receives H, the total count in the histogram.
//   omega0_out  if not NULL, receives the count of samples with value <= the
//               returned threshold (0 if no threshold was found).
// Returns the threshold t in [0, 254] that maximizes the between-class
// variance, or -1 if the histogram cannot be split into two non-empty
// classes (it is empty or holds a single pixel value).
int TessBaseAPI::OtsuStats(const int* histogram,
                           int* H_out,
                           int* omega0_out) {
  int H = 0;
  double mu_T = 0.0;
  for (int i = 0; i < 256; ++i) {
    H += histogram[i];
    // Multiply in double: i * histogram[i] as an int can overflow when one
    // bright bin holds most of a large image.
    mu_T += static_cast<double>(i) * histogram[i];
  }

  // Now maximize sig_sq_B over t.
  // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
  int best_t = -1;
  int best_omega_0 = 0;
  double best_sig_sq_B = 0.0;
  int omega_0 = 0;    // Running count of samples with value <= t.
  double mu_t = 0.0;  // Running sum of value * count for values <= t.
  for (int t = 0; t < 255; ++t) {
    omega_0 += histogram[t];
    mu_t += t * static_cast<double>(histogram[t]);
    if (omega_0 == 0)
      continue;  // Nothing at or below t yet.
    const int omega_1 = H - omega_0;
    if (omega_1 == 0)
      break;  // Nothing above t: no further split exists, and dividing by
              // omega_1 would yield NaN.
    const double mu_0 = mu_t / omega_0;
    const double mu_1 = (mu_T - mu_t) / omega_1;
    double sig_sq_B = mu_1 - mu_0;
    sig_sq_B *= sig_sq_B * omega_0 * omega_1;
    if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
      best_sig_sq_B = sig_sq_B;
      best_t = t;
      best_omega_0 = omega_0;
    }
  }
  if (H_out != NULL) *H_out = H;
  if (omega0_out != NULL) *omega0_out = best_omega_0;
  return best_t;
}
|
|
396
|
+
|
|
397
|
+
// Threshold the given grey or color image into the tesseract global
|
|
398
|
+
// image ready for recognition. Requires thresholds and hi_value
|
|
399
|
+
// produced by OtsuThreshold above.
|
|
400
|
+
void TessBaseAPI::ThresholdRect(const unsigned char* imagedata,
                                int bytes_per_pixel,
                                int bytes_per_line,
                                int left, int top,
                                int width, int height,
                                const int* thresholds,
                                const int* hi_values) {
  IMAGELINE line;
  page_image.create(width, height, 1);
  line.init(width);

  // Tesseract stores images with the bottom at y=0 and 0 is black, so two
  // kinds of inversion are needed: source rows are read top-down while the
  // destination y counts down from height - 1.
  const unsigned char* row = imagedata + top*bytes_per_line +
                             left*bytes_per_pixel;
  int y = height;
  while (--y >= 0) {
    for (int x = 0; x < width; ++x) {
      const unsigned char* pix = row + x * bytes_per_pixel;
      // A pixel is 1 unless some channel with a valid hi_value (>= 0) votes
      // it to 0.
      unsigned char value = 1;
      for (int ch = 0; ch < bytes_per_pixel; ++ch) {
        if (hi_values[ch] < 0)
          continue;  // This channel carries no thresholding information.
        const bool above_threshold = pix[ch] > thresholds[ch];
        const bool hi_value_is_zero = hi_values[ch] == 0;
        if (above_threshold == hi_value_is_zero) {
          value = 0;
          break;
        }
      }
      line.pixels[x] = value;
    }
    page_image.put_line(0, y, width, &line, 0);
    row += bytes_per_line;
  }
}
|
|
431
|
+
|
|
432
|
+
// Cut out the requested rectangle of the binary image to the
|
|
433
|
+
// tesseract global image ready for recognition.
|
|
434
|
+
void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
|
|
435
|
+
int bytes_per_line,
|
|
436
|
+
int left, int top,
|
|
437
|
+
int width, int height) {
|
|
438
|
+
// Copy binary image, cutting out the required rectangle.
|
|
439
|
+
IMAGE image;
|
|
440
|
+
image.capture(const_cast<unsigned char*>(imagedata),
|
|
441
|
+
bytes_per_line*8, top + height, 1);
|
|
442
|
+
page_image.create(width, height, 1);
|
|
443
|
+
copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// Low-level function to recognize the current global image to a string.
|
|
447
|
+
char* TessBaseAPI::RecognizeToString() {
|
|
448
|
+
BLOCK_LIST block_list;
|
|
449
|
+
|
|
450
|
+
FindLines(&block_list);
|
|
451
|
+
|
|
452
|
+
// Now run the main recognition.
|
|
453
|
+
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
|
454
|
+
|
|
455
|
+
return TesseractToText(page_res);
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
// Find lines from the image making the BLOCK_LIST.
|
|
459
|
+
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
  // pgeditor_read_file creates a full-page block, then runs connected
  // component analysis and text line creation. The name it receives is the
  // file-scope input_file (see SetInputName).
  pgeditor_read_file(input_file, block_list);
}
|
|
464
|
+
|
|
465
|
+
// Recognize the tesseract global image and return the result as Tesseract
|
|
466
|
+
// internal structures.
|
|
467
|
+
PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
  // Optionally apply the box file to the blocks first.
  if (tessedit_resegment_from_boxes) {
    apply_boxes(block_list);
  }

  PAGE_RES* results = new PAGE_RES(block_list);

  // Exactly one of the three modes below runs.
  if (interactive_mode) {
#ifndef GRAPHICS_DISABLED
    pgeditor_main(block_list);               // pgeditor user I/F
#endif
    return results;
  }
  if (tessedit_train_from_boxes) {
    apply_box_training(block_list);
    return results;
  }

  // Default: the main recognition.
  recog_all_words(results, monitor);
  return results;
}
|
|
484
|
+
|
|
485
|
+
// Return the maximum length that the output text string might occupy.
|
|
486
|
+
int TessBaseAPI::TextLength(PAGE_RES* page_res) {
  // Starts at 2; each word with a best_choice then adds its string length,
  // one more character, and one character per rejected position in its
  // reject map.
  int needed = 2;
  PAGE_RES_IT it(page_res);
  it.restart_page();
  while (it.word() != NULL) {
    WERD_RES* word = it.word();
    WERD_CHOICE* choice = word->best_choice;
    if (choice != NULL) {
      needed += choice->string().length() + 1;
      const int map_length = word->reject_map.length();
      for (int pos = 0; pos < map_length; ++pos) {
        if (word->reject_map[pos].rejected()) {
          ++needed;
        }
      }
    }
    it.forward();
  }
  return needed;
}
|
|
504
|
+
|
|
505
|
+
// Returns an array of all word confidences, terminated by -1.
|
|
506
|
+
// Returns a new[]-allocated array with one confidence per word in page_res,
// each clamped to 0..100, terminated by -1. The caller must delete [] it.
// Returns NULL if page_res is NULL. A word with no best_choice is reported
// with confidence 0.
int* TessBaseAPI::AllTextConfidences(PAGE_RES* page_res) {
  if (!page_res) return NULL;

  // First pass: count the words so the array can be sized.
  int n_word = 0;
  PAGE_RES_IT res_it(page_res);
  for (res_it.restart_page(); res_it.word () != NULL; res_it.forward())
    n_word++;

  int* conf = new int[n_word+1];  // +1 for the -1 terminator.
  n_word = 0;
  for (res_it.restart_page(); res_it.word () != NULL; res_it.forward()) {
    WERD_RES *word = res_it.word();
    WERD_CHOICE* choice = word->best_choice;
    // best_choice can be NULL (TextLength and TesseractToText guard against
    // it as well); such a word gets the lowest confidence instead of a
    // NULL dereference.
    int w_conf = 0;
    if (choice != NULL) {
      // This is the eq for converting Tesseract certainty to 0..100.
      w_conf = static_cast<int>(100 + 5 * choice->certainty());
      if (w_conf < 0) w_conf = 0;
      if (w_conf > 100) w_conf = 100;
    }
    conf[n_word++] = w_conf;
  }
  conf[n_word] = -1;
  return conf;
}
|
|
527
|
+
|
|
528
|
+
// Returns the average word confidence for Tesseract page result.
|
|
529
|
+
int TessBaseAPI::TextConf(PAGE_RES* page_res) {
  int* conf = AllTextConfidences(page_res);
  if (conf == NULL) {
    return 0;
  }

  // Sum the entries up to (not including) the negative terminator.
  int total = 0;
  int count = 0;
  for (; conf[count] >= 0; ++count) {
    total += conf[count];
  }

  // Integer mean; with no words the result is the empty sum, 0.
  const int mean = (count != 0) ? total / count : total;
  delete [] conf;
  return mean;
}
|
|
539
|
+
|
|
540
|
+
// Make a text string from the internal data structures.
|
|
541
|
+
// The input page_res is deleted.
|
|
542
|
+
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
  if (page_res == NULL) {
    return NULL;
  }

  // The buffer is sized by TextLength().
  int total_length = TextLength(page_res);
  PAGE_RES_IT it(page_res);
  char* result = new char[total_length];
  char* out = result;

  for (it.restart_page(); it.word () != NULL; it.forward()) {
    WERD_RES* word = it.word();
    WERD_CHOICE* choice = word->best_choice;
    if (choice == NULL) {
      continue;  // Words without a best choice contribute nothing.
    }
    const char* text = choice->string().string();
    strcpy(out, text);
    out += strlen(text);
    // Each word is followed by a newline at end of line, else by a space.
    *out++ = word->word->flag(W_EOL) ? '\n' : ' ';
  }

  // Final newline and terminator.
  *out++ = '\n';
  *out = '\0';

  delete page_res;
  return result;
}
|
|
568
|
+
|
|
569
|
+
// Appends box-file text for one word to word_str: for each blob of the
// word's output, the characters of that classification unit followed by
// " left bottom right top\n", with left/bottom added to the coordinates.
// Returns the number of characters written (not counting the final NUL
// left by sprintf). The row argument is not used.
static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
                                int bottom,
                                char* word_str) {
  // Copy the output word and denormalize it back to image coords.
  WERD denormed_word;
  denormed_word = *(word->outword);
  denormed_word.baseline_denormalise(&word->denorm);
  PBLOB_IT blob_it;
  blob_it.set_to_list(denormed_word.blob_list());
  const int blob_count = denormed_word.blob_list()->length();

  int written = 0;  // Characters placed in word_str so far.
  int offset = 0;   // Start of the current unit within best_choice->string().
  for (int index = 0; index < blob_count; ++index) {
    PBLOB* blob = blob_it.data();
    TBOX blob_box = blob->bounding_box();
    // Clip the blob box to the page image.
    int box_left = MAX(blob_box.left(), 0);
    int box_right = MIN(blob_box.right(), page_image.get_xsize());
    int box_bottom = MAX(blob_box.bottom(), 0);
    int box_top = MIN(blob_box.top(), page_image.get_ysize());

    const bool empty_box = box_left >= box_right || box_bottom >= box_top;
    if (word->tess_failed || empty_box) {
      // Bounding boxes can be illegal when tess fails on a word, so clip
      // against the original word's box as a backup.
      TBOX word_box = word->word->bounding_box();
      box_left = MAX(box_left, word_box.left());
      box_right = MIN(box_right, word_box.right());
      box_bottom = MAX(box_bottom, word_box.bottom());
      box_top = MIN(box_top, word_box.top());
      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
              box_left, box_bottom, box_right, box_top);
    }

    // A single classification unit can be composed of several UTF-8
    // characters. Append each of them to the result.
    const int unit_length = word->best_choice->lengths()[index];
    for (int sub = 0; sub < unit_length; ++sub) {
      const char ch = word->best_choice->string()[offset + sub];
      // Tesseract uses space for recognition failure. Emit the reject
      // character '~' instead so we don't create illegal box files.
      word_str[written++] = (ch == ' ') ? '~' : ch;
    }
    written += sprintf(word_str + written, " %d %d %d %d\n",
                       box_left + left, box_bottom + bottom,
                       box_right + left, box_top + bottom);

    // Step to the next unit and the next blob together.
    offset += unit_length;
    blob_it.forward();
  }
  return written;
}
|
|
622
|
+
|
|
623
|
+
// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
|
|
624
|
+
// plus the newline and the original character = 4*(5+1)+2
|
|
625
|
+
const int kMaxCharsPerChar = 26;
|
|
626
|
+
|
|
627
|
+
// Make a text string from the internal data structures.
|
|
628
|
+
// The input page_res is deleted.
|
|
629
|
+
// The text string takes the form of a box file as needed for training.
|
|
630
|
+
char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
                                      int left, int bottom) {
  if (page_res == NULL) {
    return NULL;
  }

  // Buffer size: TextLength() scaled by kMaxCharsPerChar.
  int total_length = TextLength(page_res) * kMaxCharsPerChar;
  PAGE_RES_IT it(page_res);
  char* result = new char[total_length];
  char* out = result;

  // Each word appends its own box lines and reports how much it wrote.
  for (it.restart_page(); it.word () != NULL; it.forward()) {
    WERD_RES* word = it.word();
    const int written = ConvertWordToBoxText(word, it.row(), left, bottom, out);
    out += written;
  }
  *out = '\0';

  delete page_res;
  return result;
}
|
|
648
|
+
|
|
649
|
+
// Make a text string from the internal data structures.
|
|
650
|
+
// The input page_res is deleted. The text string is converted
|
|
651
|
+
// to UNLV-format: Latin-1 with specific reject and suspect codes.
|
|
652
|
+
const char kUnrecognized = '~';
|
|
653
|
+
// Conversion table for non-latin characters.
|
|
654
|
+
// Maps characters out of the latin set into the latin set.
|
|
655
|
+
// TODO(rays) incorporate this translation into unicharset.
|
|
656
|
+
const int kUniChs[] = {
|
|
657
|
+
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
|
|
658
|
+
};
|
|
659
|
+
// Latin chars corresponding to the unicode chars above.
|
|
660
|
+
const int kLatinChs[] = {
|
|
661
|
+
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
|
|
662
|
+
};
|
|
663
|
+
|
|
664
|
+
// Builds the UNLV-format text for page_res and deletes page_res.
// Returns a new[]-allocated, NUL-terminated string, or NULL if page_res is
// NULL. Unrecognized characters are written as kUnrecognized ('~') and
// rejected characters are preceded by '^'. Characters listed in kUniChs are
// mapped to the corresponding kLatinChs entry; any remaining character
// above 0xff is written as kUnrecognized.
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
  bool tilde_crunch_written = false;
  bool last_char_was_newline = true;
  bool last_char_was_tilde = false;

  if (page_res != NULL) {
    int total_length = TextLength(page_res);
    PAGE_RES_IT page_res_it(page_res);
    char* result = new char[total_length];
    char* ptr = result;
    for (page_res_it.restart_page(); page_res_it.word () != NULL;
         page_res_it.forward()) {
      WERD_RES *word = page_res_it.word();
      // Process the current word.
      if (word->unlv_crunch_mode != CR_NONE) {
        if (word->unlv_crunch_mode != CR_DELETE &&
            (!tilde_crunch_written ||
             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)))) {
          if (!word->word->flag (W_BOL) &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)) {
            /* Write a space to separate from preceding good text */
            *ptr++ = ' ';
            last_char_was_tilde = false;
          }
          if (!last_char_was_tilde) {
            // Write a reject char.
            last_char_was_tilde = true;
            *ptr++ = kUnrecognized;
            tilde_crunch_written = true;
            last_char_was_newline = false;
          }
        }
      } else {
        // NORMAL PROCESSING of non tilde crunched words.
        tilde_crunch_written = false;

        if (last_char_was_tilde &&
            word->word->space () == 0 &&
            (word->best_choice->string ()[0] == ' ')) {
          /* Prevent adjacent tilde across words - we know that adjacent
             tildes within words have been removed */
          // Drop the first character of the string and of the lengths.
          // Source and destination overlap, so memmove is required:
          // strcpy on overlapping buffers is undefined behaviour.
          char* p = (char *) word->best_choice->string().string ();
          memmove(p, p + 1, strlen(p + 1) + 1);  // shuffle up, including NUL
          p = (char *) word->best_choice->lengths().string ();
          if (*p != '\0')  // never read past the terminator of empty lengths
            memmove(p, p + 1, strlen(p + 1) + 1);  // shuffle up
          word->reject_map.remove_pos (0);
          PBLOB_IT blob_it = word->outword->blob_list ();
          delete blob_it.extract ();   // get rid of reject blob
        }

        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
          ensure_rep_chars_are_consistent(word);

        set_unlv_suspects(word);
        const char* wordstr = word->best_choice->string().string();
        if (wordstr[0] != 0) {
          if (!last_char_was_newline)
            *ptr++ = ' ';
          else
            last_char_was_newline = false;
          int offset = 0;
          const STRING& lengths = word->best_choice->lengths();
          int length = lengths.length();
          // One iteration per classification unit; offset tracks where the
          // unit starts in wordstr.
          for (int i = 0; i < length; offset += lengths[i++]) {
            if (wordstr[offset] == ' ' ||
                wordstr[offset] == '~' ||
                wordstr[offset] == '|') {
              *ptr++ = kUnrecognized;
              last_char_was_tilde = true;
            } else {
              if (word->reject_map[i].rejected())
                *ptr++ = '^';
              UNICHAR ch(wordstr + offset, lengths[i]);
              int uni_ch = ch.first_uni();
              // Map characters listed in kUniChs to their kLatinChs entry.
              for (int j = 0; kUniChs[j] != 0; ++j) {
                if (kUniChs[j] == uni_ch) {
                  uni_ch = kLatinChs[j];
                  break;
                }
              }
              if (uni_ch <= 0xff) {
                *ptr++ = static_cast<char>(uni_ch);
                last_char_was_tilde = false;
              } else {
                *ptr++ = kUnrecognized;
                last_char_was_tilde = true;
              }
            }
          }
        }
      }
      if (word->word->flag(W_EOL) && !last_char_was_newline) {
        /* Add a new line output */
        *ptr++ = '\n';
        tilde_crunch_written = false;
        last_char_was_newline = true;
        last_char_was_tilde = false;
      }
    }
    *ptr++ = '\n';
    *ptr = '\0';
    delete page_res;
    return result;
  }
  return NULL;
}
|
|
775
|
+
// ____________________________________________________________________________
|
|
776
|
+
// Ocropus add-ons.
|
|
777
|
+
|
|
778
|
+
// Find lines from the image making the BLOCK_LIST.
|
|
779
|
+
BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
|
|
780
|
+
BLOCK_LIST *block_list = new BLOCK_LIST();
|
|
781
|
+
FindLines(block_list);
|
|
782
|
+
return block_list;
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
// Delete a block list.
|
|
786
|
+
// This is to keep BLOCK_LIST pointer opaque
|
|
787
|
+
// and let go of including the other headers.
|
|
788
|
+
void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
|
|
789
|
+
delete block_list;
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
// Builds a heap-allocated ROW whose baseline spline is a single segment
// with coefficients (0, 0, baseline), i.e. the horizontal line y = baseline.
//   baseline:  y coordinate of the baseline.
//   xheight:   passed to the ROW unchanged.
//   descender: y coordinate of the descender line; the ROW receives it
//              relative to the baseline (descender - baseline).
//   ascender:  y coordinate of the ascender line; the ROW receives it
//              relative to the x-height line (ascender - (baseline + xheight)).
// The caller owns the returned ROW and must delete it.
static ROW *make_tess_ocrrow(float baseline,
                             float xheight,
                             float descender,
                             float ascender) {
  // A single spline segment is bounded by two x coordinates (the same
  // convention fill_dummy_row() uses: segments = 1, xstarts[0..1]).
  // Supplying both ends guarantees the array is never shorter than
  // segments + 1 entries.
  inT32 xstarts[] = {-32000, 32000};
  double quad_coeffs[] = {0, 0, baseline};
  return new ROW(1,
                 xstarts,
                 quad_coeffs,
                 xheight,
                 ascender - (baseline + xheight),
                 descender - baseline,
                 0,
                 0);
}
|
|
808
|
+
|
|
809
|
+
// Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
|
|
810
|
+
static void fill_dummy_row(float baseline, float xheight,
|
|
811
|
+
float descender, float ascender,
|
|
812
|
+
TEXTROW* tessrow) {
|
|
813
|
+
tessrow->baseline.segments = 1;
|
|
814
|
+
tessrow->baseline.xstarts[0] = -32767;
|
|
815
|
+
tessrow->baseline.xstarts[1] = 32767;
|
|
816
|
+
tessrow->baseline.quads[0].a = 0;
|
|
817
|
+
tessrow->baseline.quads[0].b = 0;
|
|
818
|
+
tessrow->baseline.quads[0].c = bln_baseline_offset;
|
|
819
|
+
tessrow->xheight.segments = 1;
|
|
820
|
+
tessrow->xheight.xstarts[0] = -32767;
|
|
821
|
+
tessrow->xheight.xstarts[1] = 32767;
|
|
822
|
+
tessrow->xheight.quads[0].a = 0;
|
|
823
|
+
tessrow->xheight.quads[0].b = 0;
|
|
824
|
+
tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
|
|
825
|
+
tessrow->lineheight = bln_x_height;
|
|
826
|
+
tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
|
|
827
|
+
tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
/// Return a TBLOB * from the whole page_image.
|
|
832
|
+
/// To be freed later with free_blob().
|
|
833
|
+
TBLOB *make_tesseract_blob(float baseline, float xheight, float descender, float ascender) {
|
|
834
|
+
BLOCK *block = new BLOCK ("a character",
|
|
835
|
+
TRUE,
|
|
836
|
+
0, 0,
|
|
837
|
+
0, 0,
|
|
838
|
+
page_image.get_xsize(),
|
|
839
|
+
page_image.get_ysize());
|
|
840
|
+
|
|
841
|
+
// Create C_BLOBs from the page
|
|
842
|
+
extract_edges(
|
|
843
|
+
#ifndef GRAPHICS_DISABLED
|
|
844
|
+
NULL,
|
|
845
|
+
#endif
|
|
846
|
+
&page_image, &page_image,
|
|
847
|
+
ICOORD(page_image.get_xsize(), page_image.get_ysize()),
|
|
848
|
+
block);
|
|
849
|
+
|
|
850
|
+
// Create one PBLOB from all C_BLOBs
|
|
851
|
+
C_BLOB_LIST *list = block->blob_list();
|
|
852
|
+
C_BLOB_IT c_blob_it(list);
|
|
853
|
+
PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list
|
|
854
|
+
for (c_blob_it.mark_cycle_pt();
|
|
855
|
+
!c_blob_it.cycled_list();
|
|
856
|
+
c_blob_it.forward()) {
|
|
857
|
+
C_BLOB *c_blob = c_blob_it.data();
|
|
858
|
+
PBLOB c_as_p(c_blob, baseline + xheight);
|
|
859
|
+
merge_blobs(pblob, &c_as_p);
|
|
860
|
+
}
|
|
861
|
+
PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word
|
|
862
|
+
PBLOB_IT pblob_it(pblob_list);
|
|
863
|
+
pblob_it.add_after_then_move(pblob);
|
|
864
|
+
|
|
865
|
+
// Normalize PBLOB
|
|
866
|
+
WERD word(pblob_list, 0, " ");
|
|
867
|
+
ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
|
|
868
|
+
word.baseline_normalise(row);
|
|
869
|
+
delete row;
|
|
870
|
+
|
|
871
|
+
// Create a TBLOB from PBLOB
|
|
872
|
+
return make_tess_blob(pblob, /* flatten: */ TRUE);
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
// Adapt to recognize the current image as the given character.
|
|
877
|
+
// The image must be preloaded and be just an image of a single character.
|
|
878
|
+
void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
|
|
879
|
+
int length,
|
|
880
|
+
float baseline,
|
|
881
|
+
float xheight,
|
|
882
|
+
float descender,
|
|
883
|
+
float ascender) {
|
|
884
|
+
UNICHAR_ID id = unicharset.unichar_to_id(unichar_repr, length);
|
|
885
|
+
LINE_STATS LineStats;
|
|
886
|
+
TEXTROW row;
|
|
887
|
+
fill_dummy_row(baseline, xheight, descender, ascender, &row);
|
|
888
|
+
GetLineStatsFromRow(&row, &LineStats);
|
|
889
|
+
|
|
890
|
+
TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
|
|
891
|
+
float threshold;
|
|
892
|
+
int best_class = 0;
|
|
893
|
+
float best_rating = -100;
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
// Classify to get a raw choice.
|
|
897
|
+
LIST result = AdaptiveClassifier(blob, NULL, &row);
|
|
898
|
+
LIST p;
|
|
899
|
+
for (p = result; p != NULL; p = p->next) {
|
|
900
|
+
A_CHOICE *tesschoice = (A_CHOICE *) p->node;
|
|
901
|
+
if (tesschoice->rating > best_rating) {
|
|
902
|
+
best_rating = tesschoice->rating;
|
|
903
|
+
best_class = tesschoice->string[0];
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
FLOAT32 GetBestRatingFor(TBLOB *Blob, LINE_STATS *LineStats, CLASS_ID ClassId);
|
|
908
|
+
|
|
909
|
+
// We have to use char-level adaptation because otherwise
|
|
910
|
+
// someone should do forced alignment somewhere.
|
|
911
|
+
void AdaptToChar(TBLOB *Blob,
|
|
912
|
+
LINE_STATS *LineStats,
|
|
913
|
+
CLASS_ID ClassId,
|
|
914
|
+
FLOAT32 Threshold);
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
if (id == best_class)
|
|
918
|
+
threshold = GoodAdaptiveMatch;
|
|
919
|
+
else {
|
|
920
|
+
/* the blob was incorrectly classified - find the rating threshold
|
|
921
|
+
needed to create a template which will correct the error with
|
|
922
|
+
some margin. However, don't waste time trying to make
|
|
923
|
+
templates which are too tight. */
|
|
924
|
+
threshold = GetBestRatingFor(blob, &LineStats, id);
|
|
925
|
+
threshold *= .9;
|
|
926
|
+
const float max_threshold = .125;
|
|
927
|
+
const float min_threshold = .02;
|
|
928
|
+
|
|
929
|
+
if (threshold > max_threshold)
|
|
930
|
+
threshold = max_threshold;
|
|
931
|
+
|
|
932
|
+
// I have cuddled the following line to set it out of the strike
|
|
933
|
+
// of the coverage testing tool. I have no idea how to trigger
|
|
934
|
+
// this situation nor I have any necessity to do it. --mezhirov
|
|
935
|
+
if (threshold < min_threshold) threshold = min_threshold;
|
|
936
|
+
}
|
|
937
|
+
|
|
938
|
+
if (blob->outlines)
|
|
939
|
+
AdaptToChar(blob, &LineStats, id, threshold);
|
|
940
|
+
free_blob(blob);
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
// Wraps block_list in a newly allocated PAGE_RES, runs recog_all_words()
// on it with 1 as the last argument (presumably selecting the first
// recognition pass - confirm against recog_all_words) and returns it.
// The PAGE_RES is created with new and handed to the caller.
PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
  PAGE_RES *pass1_result = new PAGE_RES(block_list);
  recog_all_words(pass1_result, NULL, NULL, 1);
  return pass1_result;
}
|
|
949
|
+
|
|
950
|
+
// Runs recog_all_words() with 2 as the last argument (presumably selecting
// the second recognition pass - confirm against recog_all_words).
//   block_list:   only used when pass1_result is NULL, to build a fresh
//                 PAGE_RES with new.
//   pass1_result: an existing PAGE_RES to continue working on, or NULL.
// Returns the PAGE_RES that was processed: pass1_result itself when it was
// given, otherwise the newly allocated one.
PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
                                        PAGE_RES* pass1_result) {
  PAGE_RES *page_res =
      pass1_result ? pass1_result : new PAGE_RES(block_list);
  recog_all_words(page_res, NULL, NULL, 2);
  return page_res;
}
|
|
957
|
+
|
|
958
|
+
struct TESS_CHAR : ELIST_LINK {
|
|
959
|
+
char *unicode_repr;
|
|
960
|
+
int length; // of unicode_repr
|
|
961
|
+
float cost;
|
|
962
|
+
TBOX box;
|
|
963
|
+
|
|
964
|
+
TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
|
|
965
|
+
length = (len == -1 ? strlen(repr) : len);
|
|
966
|
+
unicode_repr = new char[length + 1];
|
|
967
|
+
strncpy(unicode_repr, repr, length);
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
~TESS_CHAR() {
|
|
971
|
+
delete unicode_repr;
|
|
972
|
+
}
|
|
973
|
+
};
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
// Appends a TESS_CHAR holding a single " " with cost 0 after the
// iterator's current position and moves the iterator onto it.
// The element's box is left as default-constructed.
static void add_space(ELIST_ITERATOR *it) {
  TESS_CHAR *separator = new TESS_CHAR(0, " ");
  it->add_after_then_move(separator);
}
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
// Converts a rating into a non-negative cost: 100 + rating, clamped at 0.
static float rating_to_cost(float rating) {
  const float cost = 100 + rating;
  // Clamp kept on one line to stay out of the coverage profiler's way
  // (ratings worse than -100 have never been observed, but the check
  // won't hurt).
  return cost < 0 ? 0 : cost;
}
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
// Extract the OCR results, costs (penalty points for uncertainty),
|
|
993
|
+
// and the bounding boxes of the characters.
|
|
994
|
+
static void extract_result(ELIST_ITERATOR *out,
|
|
995
|
+
PAGE_RES* page_res) {
|
|
996
|
+
PAGE_RES_IT page_res_it(page_res);
|
|
997
|
+
int word_count = 0;
|
|
998
|
+
while (page_res_it.word() != NULL) {
|
|
999
|
+
WERD_RES *word = page_res_it.word();
|
|
1000
|
+
const char *str = word->best_choice->string().string();
|
|
1001
|
+
const char *len = word->best_choice->lengths().string();
|
|
1002
|
+
|
|
1003
|
+
if (word_count)
|
|
1004
|
+
add_space(out);
|
|
1005
|
+
TBOX bln_rect;
|
|
1006
|
+
PBLOB_LIST *blobs = word->outword->blob_list();
|
|
1007
|
+
PBLOB_IT it(blobs);
|
|
1008
|
+
int n = strlen(len);
|
|
1009
|
+
TBOX** boxes_to_fix = new TBOX*[n];
|
|
1010
|
+
for (int i = 0; i < n; i++) {
|
|
1011
|
+
PBLOB *blob = it.data();
|
|
1012
|
+
TBOX current = blob->bounding_box();
|
|
1013
|
+
bln_rect = bln_rect.bounding_union(current);
|
|
1014
|
+
TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
|
|
1015
|
+
str, *len);
|
|
1016
|
+
tc->box = current;
|
|
1017
|
+
boxes_to_fix[i] = &tc->box;
|
|
1018
|
+
|
|
1019
|
+
out->add_after_then_move(tc);
|
|
1020
|
+
it.forward();
|
|
1021
|
+
str += *len;
|
|
1022
|
+
len++;
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
// Find the word bbox before normalization.
|
|
1026
|
+
// Here we can't use the C_BLOB bboxes directly,
|
|
1027
|
+
// since connected letters are not yet cut.
|
|
1028
|
+
TBOX real_rect = word->word->bounding_box();
|
|
1029
|
+
|
|
1030
|
+
// Denormalize boxes by transforming the bbox of the whole bln word
|
|
1031
|
+
// into the denorm bbox (`real_rect') of the whole word.
|
|
1032
|
+
double x_stretch = double(real_rect.width()) / bln_rect.width();
|
|
1033
|
+
double y_stretch = double(real_rect.height()) / bln_rect.height();
|
|
1034
|
+
for (int j = 0; j < n; j++) {
|
|
1035
|
+
TBOX *box = boxes_to_fix[j];
|
|
1036
|
+
int x0 = int(real_rect.left() +
|
|
1037
|
+
x_stretch * (box->left() - bln_rect.left()) + 0.5);
|
|
1038
|
+
int x1 = int(real_rect.left() +
|
|
1039
|
+
x_stretch * (box->right() - bln_rect.left()) + 0.5);
|
|
1040
|
+
int y0 = int(real_rect.bottom() +
|
|
1041
|
+
y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
|
|
1042
|
+
int y1 = int(real_rect.bottom() +
|
|
1043
|
+
y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
|
|
1044
|
+
*box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
|
|
1045
|
+
}
|
|
1046
|
+
delete [] boxes_to_fix;
|
|
1047
|
+
|
|
1048
|
+
page_res_it.forward();
|
|
1049
|
+
word_count++;
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
// Extract the OCR results, costs (penalty points for uncertainty),
|
|
1055
|
+
// and the bounding boxes of the characters.
|
|
1056
|
+
int TessBaseAPI::TesseractExtractResult(char** string,
|
|
1057
|
+
int** lengths,
|
|
1058
|
+
float** costs,
|
|
1059
|
+
int** x0,
|
|
1060
|
+
int** y0,
|
|
1061
|
+
int** x1,
|
|
1062
|
+
int** y1,
|
|
1063
|
+
PAGE_RES* page_res) {
|
|
1064
|
+
ELIST tess_chars;
|
|
1065
|
+
ELIST_ITERATOR tess_chars_it(&tess_chars);
|
|
1066
|
+
extract_result(&tess_chars_it, page_res);
|
|
1067
|
+
tess_chars_it.move_to_first();
|
|
1068
|
+
int n = tess_chars.length();
|
|
1069
|
+
int string_len = 0;
|
|
1070
|
+
*lengths = new int[n];
|
|
1071
|
+
*costs = new float[n];
|
|
1072
|
+
*x0 = new int[n];
|
|
1073
|
+
*y0 = new int[n];
|
|
1074
|
+
*x1 = new int[n];
|
|
1075
|
+
*y1 = new int[n];
|
|
1076
|
+
int i = 0;
|
|
1077
|
+
for (tess_chars_it.mark_cycle_pt();
|
|
1078
|
+
!tess_chars_it.cycled_list();
|
|
1079
|
+
tess_chars_it.forward(), i++) {
|
|
1080
|
+
TESS_CHAR *tc = (TESS_CHAR *) tess_chars_it.data();
|
|
1081
|
+
string_len += (*lengths)[i] = tc->length;
|
|
1082
|
+
(*costs)[i] = tc->cost;
|
|
1083
|
+
(*x0)[i] = tc->box.left();
|
|
1084
|
+
(*y0)[i] = tc->box.bottom();
|
|
1085
|
+
(*x1)[i] = tc->box.right();
|
|
1086
|
+
(*y1)[i] = tc->box.top();
|
|
1087
|
+
}
|
|
1088
|
+
char *p = *string = new char[string_len];
|
|
1089
|
+
|
|
1090
|
+
tess_chars_it.move_to_first();
|
|
1091
|
+
for (tess_chars_it.mark_cycle_pt();
|
|
1092
|
+
!tess_chars_it.cycled_list();
|
|
1093
|
+
tess_chars_it.forward()) {
|
|
1094
|
+
TESS_CHAR *tc = (TESS_CHAR *) tess_chars_it.data();
|
|
1095
|
+
strncpy(p, tc->unicode_repr, tc->length);
|
|
1096
|
+
p += tc->length;
|
|
1097
|
+
}
|
|
1098
|
+
return n;
|
|
1099
|
+
}
|
|
1100
|
+
|
|
1101
|
+
// Check whether a word is valid according to Tesseract's language model
|
|
1102
|
+
// returns 0 if the string is invalid, non-zero if valid
|
|
1103
|
+
int TessBaseAPI::IsValidWord(const char *string) {
|
|
1104
|
+
return valid_word(string);
|
|
1105
|
+
}
|