tesseract_bin 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/ext/tesseract_bin/extconf.rb +17 -0
- data/lib/tesseract_bin.rb +12 -0
- data/tesseract_bin.gemspec +660 -0
- data/test/helper.rb +18 -0
- data/test/test_tesseract_bin.rb +7 -0
- data/vendor/tesseract-2.04/AUTHORS +8 -0
- data/vendor/tesseract-2.04/COPYING +23 -0
- data/vendor/tesseract-2.04/ChangeLog +71 -0
- data/vendor/tesseract-2.04/INSTALL +229 -0
- data/vendor/tesseract-2.04/Makefile.am +20 -0
- data/vendor/tesseract-2.04/Makefile.in +641 -0
- data/vendor/tesseract-2.04/NEWS +1 -0
- data/vendor/tesseract-2.04/README +138 -0
- data/vendor/tesseract-2.04/ReleaseNotes +213 -0
- data/vendor/tesseract-2.04/StdAfx.cpp +8 -0
- data/vendor/tesseract-2.04/StdAfx.h +24 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.am +63 -0
- data/vendor/tesseract-2.04/ccmain/Makefile.in +735 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.cpp +1082 -0
- data/vendor/tesseract-2.04/ccmain/adaptions.h +109 -0
- data/vendor/tesseract-2.04/ccmain/applybox.cpp +891 -0
- data/vendor/tesseract-2.04/ccmain/applybox.h +73 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.cpp +1105 -0
- data/vendor/tesseract-2.04/ccmain/baseapi.h +256 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.cpp +76 -0
- data/vendor/tesseract-2.04/ccmain/blobcmp.h +29 -0
- data/vendor/tesseract-2.04/ccmain/callnet.cpp +93 -0
- data/vendor/tesseract-2.04/ccmain/callnet.h +32 -0
- data/vendor/tesseract-2.04/ccmain/charcut.cpp +704 -0
- data/vendor/tesseract-2.04/ccmain/charcut.h +120 -0
- data/vendor/tesseract-2.04/ccmain/charsample.cpp +699 -0
- data/vendor/tesseract-2.04/ccmain/control.cpp +1842 -0
- data/vendor/tesseract-2.04/ccmain/control.h +198 -0
- data/vendor/tesseract-2.04/ccmain/docqual.cpp +1481 -0
- data/vendor/tesseract-2.04/ccmain/docqual.h +155 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.cpp +82 -0
- data/vendor/tesseract-2.04/ccmain/expandblob.h +13 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.cpp +989 -0
- data/vendor/tesseract-2.04/ccmain/fixspace.h +72 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.cpp +825 -0
- data/vendor/tesseract-2.04/ccmain/fixxht.h +93 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.cpp +154 -0
- data/vendor/tesseract-2.04/ccmain/imgscale.h +32 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.cpp +391 -0
- data/vendor/tesseract-2.04/ccmain/matmatch.h +48 -0
- data/vendor/tesseract-2.04/ccmain/output.cpp +1273 -0
- data/vendor/tesseract-2.04/ccmain/output.h +116 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.cpp +666 -0
- data/vendor/tesseract-2.04/ccmain/pagewalk.h +155 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.cpp +107 -0
- data/vendor/tesseract-2.04/ccmain/paircmp.h +43 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.cpp +1867 -0
- data/vendor/tesseract-2.04/ccmain/pgedit.h +181 -0
- data/vendor/tesseract-2.04/ccmain/reject.cpp +1775 -0
- data/vendor/tesseract-2.04/ccmain/reject.h +181 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.cpp +366 -0
- data/vendor/tesseract-2.04/ccmain/scaleimg.h +35 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.cpp +375 -0
- data/vendor/tesseract-2.04/ccmain/tessbox.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.cpp +278 -0
- data/vendor/tesseract-2.04/ccmain/tessedit.h +49 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.cpp +110 -0
- data/vendor/tesseract-2.04/ccmain/tessembedded.h +38 -0
- data/vendor/tesseract-2.04/ccmain/tesseractfull.cc +37 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.cpp +387 -0
- data/vendor/tesseract-2.04/ccmain/tesseractmain.h +58 -0
- data/vendor/tesseract-2.04/ccmain/tessio.h +110 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.cpp +38 -0
- data/vendor/tesseract-2.04/ccmain/tessvars.h +48 -0
- data/vendor/tesseract-2.04/ccmain/tfacep.h +62 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.cpp +443 -0
- data/vendor/tesseract-2.04/ccmain/tfacepp.h +85 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.cpp +549 -0
- data/vendor/tesseract-2.04/ccmain/tstruct.h +108 -0
- data/vendor/tesseract-2.04/ccmain/varabled.cpp +346 -0
- data/vendor/tesseract-2.04/ccmain/varabled.h +125 -0
- data/vendor/tesseract-2.04/ccmain/werdit.cpp +193 -0
- data/vendor/tesseract-2.04/ccmain/werdit.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.am +25 -0
- data/vendor/tesseract-2.04/ccstruct/Makefile.in +650 -0
- data/vendor/tesseract-2.04/ccstruct/blckerr.h +29 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.cpp +778 -0
- data/vendor/tesseract-2.04/ccstruct/blobbox.h +381 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.cpp +247 -0
- data/vendor/tesseract-2.04/ccstruct/blobs.h +119 -0
- data/vendor/tesseract-2.04/ccstruct/blread.cpp +537 -0
- data/vendor/tesseract-2.04/ccstruct/blread.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/callcpp.cpp +252 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.cpp +650 -0
- data/vendor/tesseract-2.04/ccstruct/coutln.h +186 -0
- data/vendor/tesseract-2.04/ccstruct/crakedge.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.cpp +133 -0
- data/vendor/tesseract-2.04/ccstruct/genblob.h +52 -0
- data/vendor/tesseract-2.04/ccstruct/hpddef.h +39 -0
- data/vendor/tesseract-2.04/ccstruct/hpdsizes.h +8 -0
- data/vendor/tesseract-2.04/ccstruct/ipoints.h +479 -0
- data/vendor/tesseract-2.04/ccstruct/labls.cpp +188 -0
- data/vendor/tesseract-2.04/ccstruct/labls.h +38 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.cpp +249 -0
- data/vendor/tesseract-2.04/ccstruct/linlsq.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.cpp +453 -0
- data/vendor/tesseract-2.04/ccstruct/lmedsq.h +84 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.cpp +100 -0
- data/vendor/tesseract-2.04/ccstruct/mod128.h +85 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.cpp +176 -0
- data/vendor/tesseract-2.04/ccstruct/normalis.h +108 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.cpp +369 -0
- data/vendor/tesseract-2.04/ccstruct/ocrblock.h +235 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.cpp +216 -0
- data/vendor/tesseract-2.04/ccstruct/ocrrow.h +133 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.cpp +879 -0
- data/vendor/tesseract-2.04/ccstruct/pageblk.h +318 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.cpp +330 -0
- data/vendor/tesseract-2.04/ccstruct/pageres.h +313 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.cpp +361 -0
- data/vendor/tesseract-2.04/ccstruct/pdblock.h +181 -0
- data/vendor/tesseract-2.04/ccstruct/pdclass.h +54 -0
- data/vendor/tesseract-2.04/ccstruct/points.cpp +102 -0
- data/vendor/tesseract-2.04/ccstruct/points.h +299 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.cpp +588 -0
- data/vendor/tesseract-2.04/ccstruct/polyaprx.h +51 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.cpp +398 -0
- data/vendor/tesseract-2.04/ccstruct/polyblk.h +122 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.cpp +357 -0
- data/vendor/tesseract-2.04/ccstruct/polyblob.h +102 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.cpp +23 -0
- data/vendor/tesseract-2.04/ccstruct/polyvert.h +58 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.cpp +441 -0
- data/vendor/tesseract-2.04/ccstruct/poutline.h +125 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.cpp +147 -0
- data/vendor/tesseract-2.04/ccstruct/quadlsq.h +67 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.cpp +21 -0
- data/vendor/tesseract-2.04/ccstruct/quadratc.h +63 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.cpp +382 -0
- data/vendor/tesseract-2.04/ccstruct/quspline.h +113 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.cpp +372 -0
- data/vendor/tesseract-2.04/ccstruct/ratngs.h +198 -0
- data/vendor/tesseract-2.04/ccstruct/rect.cpp +229 -0
- data/vendor/tesseract-2.04/ccstruct/rect.h +320 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.cpp +545 -0
- data/vendor/tesseract-2.04/ccstruct/rejctmap.h +284 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.cpp +89 -0
- data/vendor/tesseract-2.04/ccstruct/rwpoly.h +45 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.cpp +905 -0
- data/vendor/tesseract-2.04/ccstruct/statistc.h +135 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.cpp +296 -0
- data/vendor/tesseract-2.04/ccstruct/stepblob.h +88 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.cpp +230 -0
- data/vendor/tesseract-2.04/ccstruct/txtregn.h +155 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.cpp +63 -0
- data/vendor/tesseract-2.04/ccstruct/vecfuncs.h +91 -0
- data/vendor/tesseract-2.04/ccstruct/werd.cpp +967 -0
- data/vendor/tesseract-2.04/ccstruct/werd.h +277 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.am +19 -0
- data/vendor/tesseract-2.04/ccutil/Makefile.in +626 -0
- data/vendor/tesseract-2.04/ccutil/basedir.cpp +118 -0
- data/vendor/tesseract-2.04/ccutil/basedir.h +32 -0
- data/vendor/tesseract-2.04/ccutil/bits16.cpp +30 -0
- data/vendor/tesseract-2.04/ccutil/bits16.h +61 -0
- data/vendor/tesseract-2.04/ccutil/boxread.cpp +105 -0
- data/vendor/tesseract-2.04/ccutil/boxread.h +44 -0
- data/vendor/tesseract-2.04/ccutil/clst.cpp +626 -0
- data/vendor/tesseract-2.04/ccutil/clst.h +1085 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.cpp +500 -0
- data/vendor/tesseract-2.04/ccutil/debugwin.h +103 -0
- data/vendor/tesseract-2.04/ccutil/elst.cpp +593 -0
- data/vendor/tesseract-2.04/ccutil/elst.h +1125 -0
- data/vendor/tesseract-2.04/ccutil/elst2.cpp +606 -0
- data/vendor/tesseract-2.04/ccutil/elst2.h +1121 -0
- data/vendor/tesseract-2.04/ccutil/errcode.cpp +104 -0
- data/vendor/tesseract-2.04/ccutil/errcode.h +104 -0
- data/vendor/tesseract-2.04/ccutil/fileerr.h +34 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.cpp +115 -0
- data/vendor/tesseract-2.04/ccutil/globaloc.h +40 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.cpp +57 -0
- data/vendor/tesseract-2.04/ccutil/hashfn.h +30 -0
- data/vendor/tesseract-2.04/ccutil/host.h +180 -0
- data/vendor/tesseract-2.04/ccutil/hosthplb.h +1 -0
- data/vendor/tesseract-2.04/ccutil/lsterr.h +43 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.cpp +126 -0
- data/vendor/tesseract-2.04/ccutil/mainblk.h +39 -0
- data/vendor/tesseract-2.04/ccutil/memblk.cpp +1106 -0
- data/vendor/tesseract-2.04/ccutil/memblk.h +189 -0
- data/vendor/tesseract-2.04/ccutil/memry.cpp +532 -0
- data/vendor/tesseract-2.04/ccutil/memry.h +192 -0
- data/vendor/tesseract-2.04/ccutil/memryerr.h +38 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.cpp +5 -0
- data/vendor/tesseract-2.04/ccutil/mfcpch.h +37 -0
- data/vendor/tesseract-2.04/ccutil/ndminx.h +31 -0
- data/vendor/tesseract-2.04/ccutil/notdll.h +28 -0
- data/vendor/tesseract-2.04/ccutil/nwmain.h +176 -0
- data/vendor/tesseract-2.04/ccutil/ocrclass.h +345 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.cpp +772 -0
- data/vendor/tesseract-2.04/ccutil/ocrshell.h +191 -0
- data/vendor/tesseract-2.04/ccutil/platform.h +18 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.cpp +543 -0
- data/vendor/tesseract-2.04/ccutil/scanutils.h +55 -0
- data/vendor/tesseract-2.04/ccutil/secname.h +9 -0
- data/vendor/tesseract-2.04/ccutil/serialis.cpp +117 -0
- data/vendor/tesseract-2.04/ccutil/serialis.h +93 -0
- data/vendor/tesseract-2.04/ccutil/stderr.h +26 -0
- data/vendor/tesseract-2.04/ccutil/strngs.cpp +495 -0
- data/vendor/tesseract-2.04/ccutil/strngs.h +138 -0
- data/vendor/tesseract-2.04/ccutil/tessclas.h +135 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.cpp +61 -0
- data/vendor/tesseract-2.04/ccutil/tessopt.h +30 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.cpp +122 -0
- data/vendor/tesseract-2.04/ccutil/tprintf.h +35 -0
- data/vendor/tesseract-2.04/ccutil/unichar.cpp +144 -0
- data/vendor/tesseract-2.04/ccutil/unichar.h +84 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.cpp +172 -0
- data/vendor/tesseract-2.04/ccutil/unicharmap.h +82 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.cpp +307 -0
- data/vendor/tesseract-2.04/ccutil/unicharset.h +267 -0
- data/vendor/tesseract-2.04/ccutil/varable.cpp +672 -0
- data/vendor/tesseract-2.04/ccutil/varable.h +419 -0
- data/vendor/tesseract-2.04/classify/Makefile.am +24 -0
- data/vendor/tesseract-2.04/classify/Makefile.in +647 -0
- data/vendor/tesseract-2.04/classify/adaptive.cpp +535 -0
- data/vendor/tesseract-2.04/classify/adaptive.h +199 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.cpp +2958 -0
- data/vendor/tesseract-2.04/classify/adaptmatch.h +86 -0
- data/vendor/tesseract-2.04/classify/baseline.cpp +58 -0
- data/vendor/tesseract-2.04/classify/baseline.h +91 -0
- data/vendor/tesseract-2.04/classify/blobclass.cpp +123 -0
- data/vendor/tesseract-2.04/classify/blobclass.h +49 -0
- data/vendor/tesseract-2.04/classify/chartoname.cpp +74 -0
- data/vendor/tesseract-2.04/classify/chartoname.h +21 -0
- data/vendor/tesseract-2.04/classify/cluster.cpp +2834 -0
- data/vendor/tesseract-2.04/classify/cluster.h +158 -0
- data/vendor/tesseract-2.04/classify/clusttool.cpp +507 -0
- data/vendor/tesseract-2.04/classify/clusttool.h +70 -0
- data/vendor/tesseract-2.04/classify/cutoffs.cpp +73 -0
- data/vendor/tesseract-2.04/classify/cutoffs.h +49 -0
- data/vendor/tesseract-2.04/classify/extern.h +32 -0
- data/vendor/tesseract-2.04/classify/extract.cpp +100 -0
- data/vendor/tesseract-2.04/classify/extract.h +36 -0
- data/vendor/tesseract-2.04/classify/featdefs.cpp +244 -0
- data/vendor/tesseract-2.04/classify/featdefs.h +71 -0
- data/vendor/tesseract-2.04/classify/flexfx.cpp +87 -0
- data/vendor/tesseract-2.04/classify/flexfx.h +34 -0
- data/vendor/tesseract-2.04/classify/float2int.cpp +126 -0
- data/vendor/tesseract-2.04/classify/float2int.h +65 -0
- data/vendor/tesseract-2.04/classify/fpoint.cpp +73 -0
- data/vendor/tesseract-2.04/classify/fpoint.h +63 -0
- data/vendor/tesseract-2.04/classify/fxdefs.cpp +74 -0
- data/vendor/tesseract-2.04/classify/fxdefs.h +93 -0
- data/vendor/tesseract-2.04/classify/fxid.h +69 -0
- data/vendor/tesseract-2.04/classify/hideedge.cpp +35 -0
- data/vendor/tesseract-2.04/classify/hideedge.h +76 -0
- data/vendor/tesseract-2.04/classify/intfx.cpp +608 -0
- data/vendor/tesseract-2.04/classify/intfx.h +63 -0
- data/vendor/tesseract-2.04/classify/intmatcher.cpp +1524 -0
- data/vendor/tesseract-2.04/classify/intmatcher.h +199 -0
- data/vendor/tesseract-2.04/classify/intproto.cpp +1823 -0
- data/vendor/tesseract-2.04/classify/intproto.h +320 -0
- data/vendor/tesseract-2.04/classify/kdtree.cpp +884 -0
- data/vendor/tesseract-2.04/classify/kdtree.h +118 -0
- data/vendor/tesseract-2.04/classify/mf.cpp +106 -0
- data/vendor/tesseract-2.04/classify/mf.h +43 -0
- data/vendor/tesseract-2.04/classify/mfdefs.cpp +58 -0
- data/vendor/tesseract-2.04/classify/mfdefs.h +60 -0
- data/vendor/tesseract-2.04/classify/mfoutline.cpp +1087 -0
- data/vendor/tesseract-2.04/classify/mfoutline.h +277 -0
- data/vendor/tesseract-2.04/classify/mfx.cpp +436 -0
- data/vendor/tesseract-2.04/classify/mfx.h +52 -0
- data/vendor/tesseract-2.04/classify/normfeat.cpp +132 -0
- data/vendor/tesseract-2.04/classify/normfeat.h +63 -0
- data/vendor/tesseract-2.04/classify/normmatch.cpp +305 -0
- data/vendor/tesseract-2.04/classify/normmatch.h +38 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.cpp +310 -0
- data/vendor/tesseract-2.04/classify/ocrfeatures.h +148 -0
- data/vendor/tesseract-2.04/classify/outfeat.cpp +262 -0
- data/vendor/tesseract-2.04/classify/outfeat.h +76 -0
- data/vendor/tesseract-2.04/classify/picofeat.cpp +297 -0
- data/vendor/tesseract-2.04/classify/picofeat.h +65 -0
- data/vendor/tesseract-2.04/classify/protos.cpp +472 -0
- data/vendor/tesseract-2.04/classify/protos.h +258 -0
- data/vendor/tesseract-2.04/classify/sigmenu.cpp +225 -0
- data/vendor/tesseract-2.04/classify/sigmenu.h +39 -0
- data/vendor/tesseract-2.04/classify/speckle.cpp +127 -0
- data/vendor/tesseract-2.04/classify/speckle.h +69 -0
- data/vendor/tesseract-2.04/classify/xform2d.cpp +120 -0
- data/vendor/tesseract-2.04/classify/xform2d.h +60 -0
- data/vendor/tesseract-2.04/config/config.guess +1466 -0
- data/vendor/tesseract-2.04/config/config.h.in +188 -0
- data/vendor/tesseract-2.04/config/config.sub +1579 -0
- data/vendor/tesseract-2.04/config/depcomp +530 -0
- data/vendor/tesseract-2.04/config/install-sh +269 -0
- data/vendor/tesseract-2.04/config/missing +198 -0
- data/vendor/tesseract-2.04/config/mkinstalldirs +40 -0
- data/vendor/tesseract-2.04/config/stamp-h.in +0 -0
- data/vendor/tesseract-2.04/configure +10424 -0
- data/vendor/tesseract-2.04/cutil/Makefile.am +14 -0
- data/vendor/tesseract-2.04/cutil/Makefile.in +612 -0
- data/vendor/tesseract-2.04/cutil/bitvec.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/bitvec.h +100 -0
- data/vendor/tesseract-2.04/cutil/callcpp.h +190 -0
- data/vendor/tesseract-2.04/cutil/const.h +108 -0
- data/vendor/tesseract-2.04/cutil/cutil.cpp +92 -0
- data/vendor/tesseract-2.04/cutil/cutil.h +159 -0
- data/vendor/tesseract-2.04/cutil/danerror.cpp +144 -0
- data/vendor/tesseract-2.04/cutil/danerror.h +41 -0
- data/vendor/tesseract-2.04/cutil/debug.cpp +97 -0
- data/vendor/tesseract-2.04/cutil/debug.h +348 -0
- data/vendor/tesseract-2.04/cutil/efio.cpp +62 -0
- data/vendor/tesseract-2.04/cutil/efio.h +32 -0
- data/vendor/tesseract-2.04/cutil/emalloc.cpp +91 -0
- data/vendor/tesseract-2.04/cutil/emalloc.h +44 -0
- data/vendor/tesseract-2.04/cutil/freelist.cpp +75 -0
- data/vendor/tesseract-2.04/cutil/freelist.h +45 -0
- data/vendor/tesseract-2.04/cutil/funcdefs.h +35 -0
- data/vendor/tesseract-2.04/cutil/general.h +33 -0
- data/vendor/tesseract-2.04/cutil/globals.cpp +69 -0
- data/vendor/tesseract-2.04/cutil/globals.h +70 -0
- data/vendor/tesseract-2.04/cutil/listio.cpp +68 -0
- data/vendor/tesseract-2.04/cutil/listio.h +43 -0
- data/vendor/tesseract-2.04/cutil/minmax.h +40 -0
- data/vendor/tesseract-2.04/cutil/oldheap.cpp +337 -0
- data/vendor/tesseract-2.04/cutil/oldheap.h +126 -0
- data/vendor/tesseract-2.04/cutil/oldlist.cpp +393 -0
- data/vendor/tesseract-2.04/cutil/oldlist.h +350 -0
- data/vendor/tesseract-2.04/cutil/structures.cpp +66 -0
- data/vendor/tesseract-2.04/cutil/structures.h +112 -0
- data/vendor/tesseract-2.04/cutil/tessarray.cpp +115 -0
- data/vendor/tesseract-2.04/cutil/tessarray.h +166 -0
- data/vendor/tesseract-2.04/cutil/tordvars.cpp +95 -0
- data/vendor/tesseract-2.04/cutil/tordvars.h +61 -0
- data/vendor/tesseract-2.04/cutil/variables.cpp +317 -0
- data/vendor/tesseract-2.04/cutil/variables.h +170 -0
- data/vendor/tesseract-2.04/dict/Makefile.am +13 -0
- data/vendor/tesseract-2.04/dict/Makefile.in +609 -0
- data/vendor/tesseract-2.04/dict/choicearr.h +96 -0
- data/vendor/tesseract-2.04/dict/choices.cpp +210 -0
- data/vendor/tesseract-2.04/dict/choices.h +241 -0
- data/vendor/tesseract-2.04/dict/context.cpp +270 -0
- data/vendor/tesseract-2.04/dict/context.h +82 -0
- data/vendor/tesseract-2.04/dict/dawg.cpp +363 -0
- data/vendor/tesseract-2.04/dict/dawg.h +394 -0
- data/vendor/tesseract-2.04/dict/hyphen.cpp +84 -0
- data/vendor/tesseract-2.04/dict/hyphen.h +125 -0
- data/vendor/tesseract-2.04/dict/lookdawg.cpp +228 -0
- data/vendor/tesseract-2.04/dict/lookdawg.h +76 -0
- data/vendor/tesseract-2.04/dict/makedawg.cpp +449 -0
- data/vendor/tesseract-2.04/dict/makedawg.h +83 -0
- data/vendor/tesseract-2.04/dict/matchdefs.h +145 -0
- data/vendor/tesseract-2.04/dict/permdawg.cpp +415 -0
- data/vendor/tesseract-2.04/dict/permdawg.h +98 -0
- data/vendor/tesseract-2.04/dict/permngram.cpp +358 -0
- data/vendor/tesseract-2.04/dict/permngram.h +33 -0
- data/vendor/tesseract-2.04/dict/permnum.cpp +522 -0
- data/vendor/tesseract-2.04/dict/permnum.h +83 -0
- data/vendor/tesseract-2.04/dict/permute.cpp +1704 -0
- data/vendor/tesseract-2.04/dict/permute.h +93 -0
- data/vendor/tesseract-2.04/dict/reduce.cpp +424 -0
- data/vendor/tesseract-2.04/dict/reduce.h +112 -0
- data/vendor/tesseract-2.04/dict/states.cpp +382 -0
- data/vendor/tesseract-2.04/dict/states.h +111 -0
- data/vendor/tesseract-2.04/dict/stopper.cpp +1458 -0
- data/vendor/tesseract-2.04/dict/stopper.h +103 -0
- data/vendor/tesseract-2.04/dict/trie.cpp +683 -0
- data/vendor/tesseract-2.04/dict/trie.h +190 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.am +2 -0
- data/vendor/tesseract-2.04/dlltest/Makefile.in +388 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.cpp +163 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.dsp +186 -0
- data/vendor/tesseract-2.04/dlltest/dlltest.vcproj +637 -0
- data/vendor/tesseract-2.04/eurotext.tif +0 -0
- data/vendor/tesseract-2.04/image/Makefile.am +10 -0
- data/vendor/tesseract-2.04/image/Makefile.in +596 -0
- data/vendor/tesseract-2.04/image/bitstrm.cpp +157 -0
- data/vendor/tesseract-2.04/image/bitstrm.h +73 -0
- data/vendor/tesseract-2.04/image/img.h +336 -0
- data/vendor/tesseract-2.04/image/imgbmp.cpp +223 -0
- data/vendor/tesseract-2.04/image/imgbmp.h +50 -0
- data/vendor/tesseract-2.04/image/imgerrs.h +35 -0
- data/vendor/tesseract-2.04/image/imgio.cpp +321 -0
- data/vendor/tesseract-2.04/image/imgio.h +22 -0
- data/vendor/tesseract-2.04/image/imgs.cpp +1764 -0
- data/vendor/tesseract-2.04/image/imgs.h +102 -0
- data/vendor/tesseract-2.04/image/imgtiff.cpp +723 -0
- data/vendor/tesseract-2.04/image/imgtiff.h +89 -0
- data/vendor/tesseract-2.04/image/imgunpk.h +1377 -0
- data/vendor/tesseract-2.04/image/svshowim.cpp +40 -0
- data/vendor/tesseract-2.04/image/svshowim.h +25 -0
- data/vendor/tesseract-2.04/java/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.am +1 -0
- data/vendor/tesseract-2.04/java/com/google/Makefile.in +470 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.am +4 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/Makefile.in +473 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ScrollView.java +421 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.am +5 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/Makefile.in +474 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEvent.java +87 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventHandler.java +296 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/events/SVEventType.java +31 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.am +7 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/Makefile.in +476 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVAbstractMenuItem.java +58 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVCheckboxMenuItem.java +60 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVEmptyMenuItem.java +48 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVImageHandler.java +228 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuBar.java +130 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVMenuItem.java +61 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVPopupMenu.java +142 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVSubMenuItem.java +41 -0
- data/vendor/tesseract-2.04/java/com/google/scrollview/ui/SVWindow.java +643 -0
- data/vendor/tesseract-2.04/java/makefile +55 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.am +13 -0
- data/vendor/tesseract-2.04/pageseg/Makefile.in +596 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.cpp +363 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg.h +90 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.cpp +82 -0
- data/vendor/tesseract-2.04/pageseg/leptonica_pageseg_interface.h +30 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.cpp +170 -0
- data/vendor/tesseract-2.04/pageseg/pageseg.h +29 -0
- data/vendor/tesseract-2.04/phototest.tif +0 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.am +31 -0
- data/vendor/tesseract-2.04/tessdata/Makefile.in +529 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/configs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/configs/api_config +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train +19 -0
- data/vendor/tesseract-2.04/tessdata/configs/box.train.stderr +18 -0
- data/vendor/tesseract-2.04/tessdata/configs/inter +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/kannada +4 -0
- data/vendor/tesseract-2.04/tessdata/configs/makebox +1 -0
- data/vendor/tesseract-2.04/tessdata/configs/unlv +3 -0
- data/vendor/tesseract-2.04/tessdata/confsets +3 -0
- data/vendor/tesseract-2.04/tessdata/eng.DangAmbigs +39 -0
- data/vendor/tesseract-2.04/tessdata/eng.freq-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.inttemp +0 -0
- data/vendor/tesseract-2.04/tessdata/eng.normproto +1247 -0
- data/vendor/tesseract-2.04/tessdata/eng.pffmtable +111 -0
- data/vendor/tesseract-2.04/tessdata/eng.unicharset +113 -0
- data/vendor/tesseract-2.04/tessdata/eng.user-words +921 -0
- data/vendor/tesseract-2.04/tessdata/eng.word-dawg +0 -0
- data/vendor/tesseract-2.04/tessdata/makedummies +8 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.am +3 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/Makefile.in +344 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/batch.nochop +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/matdemo +7 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/msdemo +13 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/nobatch +2 -0
- data/vendor/tesseract-2.04/tessdata/tessconfigs/segdemo +9 -0
- data/vendor/tesseract-2.04/tessdll.cpp +351 -0
- data/vendor/tesseract-2.04/tessdll.dsp +2050 -0
- data/vendor/tesseract-2.04/tessdll.h +143 -0
- data/vendor/tesseract-2.04/tessdll.vcproj +5495 -0
- data/vendor/tesseract-2.04/tesseract.dsp +2124 -0
- data/vendor/tesseract-2.04/tesseract.dsw +116 -0
- data/vendor/tesseract-2.04/tesseract.sln +59 -0
- data/vendor/tesseract-2.04/tesseract.spec +188 -0
- data/vendor/tesseract-2.04/tesseract.vcproj +5859 -0
- data/vendor/tesseract-2.04/testing/Makefile.am +2 -0
- data/vendor/tesseract-2.04/testing/Makefile.in +312 -0
- data/vendor/tesseract-2.04/testing/README +43 -0
- data/vendor/tesseract-2.04/testing/counttestset.sh +61 -0
- data/vendor/tesseract-2.04/testing/reorgdata.sh +44 -0
- data/vendor/tesseract-2.04/testing/reports/1995.bus.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.doe3.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.mag.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/1995.news.3B.sum +1 -0
- data/vendor/tesseract-2.04/testing/reports/2.03.summary +9 -0
- data/vendor/tesseract-2.04/testing/reports/2.04.summary +9 -0
- data/vendor/tesseract-2.04/testing/runalltests.sh +110 -0
- data/vendor/tesseract-2.04/testing/runtestset.sh +61 -0
- data/vendor/tesseract-2.04/textord/Makefile.am +20 -0
- data/vendor/tesseract-2.04/textord/Makefile.in +624 -0
- data/vendor/tesseract-2.04/textord/blkocc.cpp +809 -0
- data/vendor/tesseract-2.04/textord/blkocc.h +327 -0
- data/vendor/tesseract-2.04/textord/blobcmpl.h +31 -0
- data/vendor/tesseract-2.04/textord/drawedg.cpp +77 -0
- data/vendor/tesseract-2.04/textord/drawedg.h +34 -0
- data/vendor/tesseract-2.04/textord/drawtord.cpp +469 -0
- data/vendor/tesseract-2.04/textord/drawtord.h +107 -0
- data/vendor/tesseract-2.04/textord/edgblob.cpp +412 -0
- data/vendor/tesseract-2.04/textord/edgblob.h +100 -0
- data/vendor/tesseract-2.04/textord/edgloop.cpp +211 -0
- data/vendor/tesseract-2.04/textord/edgloop.h +66 -0
- data/vendor/tesseract-2.04/textord/fpchop.cpp +1641 -0
- data/vendor/tesseract-2.04/textord/fpchop.h +238 -0
- data/vendor/tesseract-2.04/textord/gap_map.cpp +166 -0
- data/vendor/tesseract-2.04/textord/gap_map.h +40 -0
- data/vendor/tesseract-2.04/textord/makerow.cpp +2628 -0
- data/vendor/tesseract-2.04/textord/makerow.h +295 -0
- data/vendor/tesseract-2.04/textord/oldbasel.cpp +1761 -0
- data/vendor/tesseract-2.04/textord/oldbasel.h +195 -0
- data/vendor/tesseract-2.04/textord/pithsync.cpp +696 -0
- data/vendor/tesseract-2.04/textord/pithsync.h +134 -0
- data/vendor/tesseract-2.04/textord/pitsync1.cpp +425 -0
- data/vendor/tesseract-2.04/textord/pitsync1.h +135 -0
- data/vendor/tesseract-2.04/textord/scanedg.cpp +452 -0
- data/vendor/tesseract-2.04/textord/scanedg.h +74 -0
- data/vendor/tesseract-2.04/textord/sortflts.cpp +80 -0
- data/vendor/tesseract-2.04/textord/sortflts.h +64 -0
- data/vendor/tesseract-2.04/textord/tessout.h +76 -0
- data/vendor/tesseract-2.04/textord/topitch.cpp +2019 -0
- data/vendor/tesseract-2.04/textord/topitch.h +195 -0
- data/vendor/tesseract-2.04/textord/tordmain.cpp +907 -0
- data/vendor/tesseract-2.04/textord/tordmain.h +132 -0
- data/vendor/tesseract-2.04/textord/tospace.cpp +1939 -0
- data/vendor/tesseract-2.04/textord/tospace.h +193 -0
- data/vendor/tesseract-2.04/textord/tovars.cpp +87 -0
- data/vendor/tesseract-2.04/textord/tovars.h +94 -0
- data/vendor/tesseract-2.04/textord/underlin.cpp +312 -0
- data/vendor/tesseract-2.04/textord/underlin.h +53 -0
- data/vendor/tesseract-2.04/textord/wordseg.cpp +620 -0
- data/vendor/tesseract-2.04/textord/wordseg.h +70 -0
- data/vendor/tesseract-2.04/training/Makefile.am +54 -0
- data/vendor/tesseract-2.04/training/Makefile.in +720 -0
- data/vendor/tesseract-2.04/training/cnTraining.cpp +855 -0
- data/vendor/tesseract-2.04/training/cntraining.dsp +243 -0
- data/vendor/tesseract-2.04/training/cntraining.vcproj +950 -0
- data/vendor/tesseract-2.04/training/mergenf.cpp +451 -0
- data/vendor/tesseract-2.04/training/mergenf.h +106 -0
- data/vendor/tesseract-2.04/training/mfTraining.cpp +1341 -0
- data/vendor/tesseract-2.04/training/mftraining.dsp +285 -0
- data/vendor/tesseract-2.04/training/mftraining.vcproj +1055 -0
- data/vendor/tesseract-2.04/training/name2char.cpp +166 -0
- data/vendor/tesseract-2.04/training/name2char.h +38 -0
- data/vendor/tesseract-2.04/training/training.cpp +190 -0
- data/vendor/tesseract-2.04/training/training.h +130 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.cpp +140 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.dsp +335 -0
- data/vendor/tesseract-2.04/training/unicharset_extractor.vcproj +769 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.cpp +69 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.dsp +319 -0
- data/vendor/tesseract-2.04/training/wordlist2dawg.vcproj +1113 -0
- data/vendor/tesseract-2.04/viewer/Makefile.am +9 -0
- data/vendor/tesseract-2.04/viewer/Makefile.in +591 -0
- data/vendor/tesseract-2.04/viewer/scrollview.cpp +825 -0
- data/vendor/tesseract-2.04/viewer/scrollview.h +414 -0
- data/vendor/tesseract-2.04/viewer/svmnode.cpp +140 -0
- data/vendor/tesseract-2.04/viewer/svmnode.h +94 -0
- data/vendor/tesseract-2.04/viewer/svpaint.cpp +220 -0
- data/vendor/tesseract-2.04/viewer/svutil.cpp +347 -0
- data/vendor/tesseract-2.04/viewer/svutil.h +138 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.am +23 -0
- data/vendor/tesseract-2.04/wordrec/Makefile.in +641 -0
- data/vendor/tesseract-2.04/wordrec/associate.cpp +62 -0
- data/vendor/tesseract-2.04/wordrec/associate.h +93 -0
- data/vendor/tesseract-2.04/wordrec/badwords.cpp +106 -0
- data/vendor/tesseract-2.04/wordrec/badwords.h +51 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.cpp +526 -0
- data/vendor/tesseract-2.04/wordrec/bestfirst.h +203 -0
- data/vendor/tesseract-2.04/wordrec/charsample.h +208 -0
- data/vendor/tesseract-2.04/wordrec/chop.cpp +458 -0
- data/vendor/tesseract-2.04/wordrec/chop.h +153 -0
- data/vendor/tesseract-2.04/wordrec/chopper.cpp +750 -0
- data/vendor/tesseract-2.04/wordrec/chopper.h +104 -0
- data/vendor/tesseract-2.04/wordrec/closed.cpp +136 -0
- data/vendor/tesseract-2.04/wordrec/closed.h +65 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/djmenus.h +33 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.cpp +92 -0
- data/vendor/tesseract-2.04/wordrec/drawfx.h +33 -0
- data/vendor/tesseract-2.04/wordrec/findseam.cpp +566 -0
- data/vendor/tesseract-2.04/wordrec/findseam.h +69 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.cpp +226 -0
- data/vendor/tesseract-2.04/wordrec/gradechop.h +91 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.cpp +194 -0
- data/vendor/tesseract-2.04/wordrec/heuristic.h +120 -0
- data/vendor/tesseract-2.04/wordrec/makechop.cpp +281 -0
- data/vendor/tesseract-2.04/wordrec/makechop.h +69 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.cpp +191 -0
- data/vendor/tesseract-2.04/wordrec/matchtab.h +45 -0
- data/vendor/tesseract-2.04/wordrec/matrix.cpp +118 -0
- data/vendor/tesseract-2.04/wordrec/matrix.h +104 -0
- data/vendor/tesseract-2.04/wordrec/measure.h +135 -0
- data/vendor/tesseract-2.04/wordrec/metrics.cpp +363 -0
- data/vendor/tesseract-2.04/wordrec/metrics.h +130 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.cpp +51 -0
- data/vendor/tesseract-2.04/wordrec/mfvars.h +27 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.cpp +110 -0
- data/vendor/tesseract-2.04/wordrec/msmenus.h +45 -0
- data/vendor/tesseract-2.04/wordrec/olutil.cpp +153 -0
- data/vendor/tesseract-2.04/wordrec/olutil.h +128 -0
- data/vendor/tesseract-2.04/wordrec/outlines.cpp +172 -0
- data/vendor/tesseract-2.04/wordrec/outlines.h +148 -0
- data/vendor/tesseract-2.04/wordrec/pieces.cpp +410 -0
- data/vendor/tesseract-2.04/wordrec/pieces.h +154 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.cpp +134 -0
- data/vendor/tesseract-2.04/wordrec/plotedges.h +71 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.cpp +116 -0
- data/vendor/tesseract-2.04/wordrec/plotseg.h +73 -0
- data/vendor/tesseract-2.04/wordrec/render.cpp +152 -0
- data/vendor/tesseract-2.04/wordrec/render.h +58 -0
- data/vendor/tesseract-2.04/wordrec/seam.cpp +482 -0
- data/vendor/tesseract-2.04/wordrec/seam.h +136 -0
- data/vendor/tesseract-2.04/wordrec/split.cpp +182 -0
- data/vendor/tesseract-2.04/wordrec/split.h +115 -0
- data/vendor/tesseract-2.04/wordrec/tally.cpp +68 -0
- data/vendor/tesseract-2.04/wordrec/tally.h +94 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.cpp +108 -0
- data/vendor/tesseract-2.04/wordrec/tessinit.h +46 -0
- data/vendor/tesseract-2.04/wordrec/tface.cpp +272 -0
- data/vendor/tesseract-2.04/wordrec/tface.h +35 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.cpp +284 -0
- data/vendor/tesseract-2.04/wordrec/wordclass.h +64 -0
- metadata +708 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
///////////////////////////////////////////////////////////////////////
|
|
2
|
+
// File: unicharmap.cpp
|
|
3
|
+
// Description: Unicode character/ligature to integer id class.
|
|
4
|
+
// Author: Thomas Kielbus
|
|
5
|
+
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
6
|
+
//
|
|
7
|
+
// (C) Copyright 2006, Google Inc.
|
|
8
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
// you may not use this file except in compliance with the License.
|
|
10
|
+
// You may obtain a copy of the License at
|
|
11
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
// See the License for the specific language governing permissions and
|
|
16
|
+
// limitations under the License.
|
|
17
|
+
//
|
|
18
|
+
///////////////////////////////////////////////////////////////////////
|
|
19
|
+
|
|
20
|
+
#include <assert.h>
|
|
21
|
+
#include "unichar.h"
|
|
22
|
+
#include "host.h"
|
|
23
|
+
#include "unicharmap.h"
|
|
24
|
+
|
|
25
|
+
// Construct an empty map. The root node array starts out null and is only
// allocated by the first call to insert().
UNICHARMAP::UNICHARMAP()
    : nodes(0) {}
|
|
28
|
+
|
|
29
|
+
// Release the root node array. Every node's destructor frees its own child
// array, so the whole tree is released from here.
UNICHARMAP::~UNICHARMAP() {
  // delete[] on a null pointer is a no-op, so an empty map needs no check.
  delete[] nodes;
}
|
|
33
|
+
|
|
34
|
+
// Search the given unichar representation in the tree. Each character in the
|
|
35
|
+
// string is interpreted as an index in an array of nodes.
|
|
36
|
+
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
|
|
37
|
+
const char* current_char = unichar_repr;
|
|
38
|
+
UNICHARMAP_NODE* current_nodes = nodes;
|
|
39
|
+
|
|
40
|
+
assert(*unichar_repr != '\0');
|
|
41
|
+
|
|
42
|
+
do {
|
|
43
|
+
if (*(current_char + 1) == '\0')
|
|
44
|
+
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
|
45
|
+
current_nodes =
|
|
46
|
+
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
47
|
+
++current_char;
|
|
48
|
+
} while (true);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Search the given unichar representation in the tree, using length characters
|
|
52
|
+
// from it maximum. Each character in the string is interpreted as an index in
|
|
53
|
+
// an array of nodes.
|
|
54
|
+
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
|
55
|
+
int length) const {
|
|
56
|
+
const char* current_char = unichar_repr;
|
|
57
|
+
UNICHARMAP_NODE* current_nodes = nodes;
|
|
58
|
+
|
|
59
|
+
assert(*unichar_repr != '\0');
|
|
60
|
+
assert(length > 0 && length <= UNICHAR_LEN);
|
|
61
|
+
|
|
62
|
+
do {
|
|
63
|
+
if (length == 1 || *(current_char + 1) == '\0')
|
|
64
|
+
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
|
65
|
+
current_nodes =
|
|
66
|
+
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
67
|
+
++current_char;
|
|
68
|
+
--length;
|
|
69
|
+
} while (true);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Search the given unichar representation in the tree, creating the possibly
|
|
73
|
+
// missing nodes. Once the right place has been found, insert the given id and
|
|
74
|
+
// update the inserted flag to keep track of the insert. Each character in the
|
|
75
|
+
// string is interpreted as an index in an array of nodes.
|
|
76
|
+
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
|
77
|
+
const char* current_char = unichar_repr;
|
|
78
|
+
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
|
|
79
|
+
|
|
80
|
+
assert(*unichar_repr != '\0');
|
|
81
|
+
assert(id >= 0);
|
|
82
|
+
|
|
83
|
+
do {
|
|
84
|
+
if (*current_nodes_pointer == 0)
|
|
85
|
+
*current_nodes_pointer = new UNICHARMAP_NODE[256];
|
|
86
|
+
if (*(current_char + 1) == '\0') {
|
|
87
|
+
(*current_nodes_pointer)
|
|
88
|
+
[static_cast<unsigned char>(*current_char)].id = id;
|
|
89
|
+
return;
|
|
90
|
+
}
|
|
91
|
+
current_nodes_pointer =
|
|
92
|
+
&((*current_nodes_pointer)
|
|
93
|
+
[static_cast<unsigned char>(*current_char)].children);
|
|
94
|
+
++current_char;
|
|
95
|
+
} while (true);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Search the given unichar representation in the tree. Each character in the
|
|
99
|
+
// string is interpreted as an index in an array of nodes. Stop once the tree
|
|
100
|
+
// does not have anymore nodes or once we found the right unichar_repr.
|
|
101
|
+
bool UNICHARMAP::contains(const char* const unichar_repr) const {
|
|
102
|
+
const char* current_char = unichar_repr;
|
|
103
|
+
UNICHARMAP_NODE* current_nodes = nodes;
|
|
104
|
+
|
|
105
|
+
assert(*unichar_repr != '\0');
|
|
106
|
+
|
|
107
|
+
while (current_nodes != 0 && *(current_char + 1) != '\0') {
|
|
108
|
+
current_nodes =
|
|
109
|
+
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
110
|
+
++current_char;
|
|
111
|
+
}
|
|
112
|
+
return current_nodes != 0 && *(current_char + 1) == '\0' &&
|
|
113
|
+
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// Search the given unichar representation in the tree, using length characters
|
|
117
|
+
// from it maximum. Each character in the string is interpreted as an index in
|
|
118
|
+
// an array of nodes. Stop once the tree does not have anymore nodes or once we
|
|
119
|
+
// found the right unichar_repr.
|
|
120
|
+
bool UNICHARMAP::contains(const char* const unichar_repr,
|
|
121
|
+
int length) const {
|
|
122
|
+
const char* current_char = unichar_repr;
|
|
123
|
+
UNICHARMAP_NODE* current_nodes = nodes;
|
|
124
|
+
|
|
125
|
+
assert(*unichar_repr != '\0');
|
|
126
|
+
assert(length > 0 && length <= UNICHAR_LEN);
|
|
127
|
+
|
|
128
|
+
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
|
|
129
|
+
current_nodes =
|
|
130
|
+
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
131
|
+
--length;
|
|
132
|
+
++current_char;
|
|
133
|
+
}
|
|
134
|
+
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
|
|
135
|
+
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Return the minimum number of characters that must be used from this string
|
|
139
|
+
// to obtain a match in the UNICHARMAP.
|
|
140
|
+
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
|
|
141
|
+
const char* current_char = unichar_repr;
|
|
142
|
+
UNICHARMAP_NODE* current_nodes = nodes;
|
|
143
|
+
|
|
144
|
+
while (current_nodes != NULL && *current_char != '\0') {
|
|
145
|
+
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
|
|
146
|
+
return current_char + 1 - unichar_repr;
|
|
147
|
+
current_nodes =
|
|
148
|
+
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
149
|
+
++current_char;
|
|
150
|
+
}
|
|
151
|
+
return 0;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
void UNICHARMAP::clear() {
|
|
155
|
+
if (nodes != 0)
|
|
156
|
+
{
|
|
157
|
+
delete[] nodes;
|
|
158
|
+
nodes = 0;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// A fresh node has no child array and no id (-1 marks "no unichar ends
// here"; lookups test for id >= 0).
UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE()
    : children(0), id(-1) {}
|
|
166
|
+
|
|
167
|
+
// Recursively delete the children
|
|
168
|
+
UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
|
|
169
|
+
if (children != 0) {
|
|
170
|
+
delete[] children;
|
|
171
|
+
}
|
|
172
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
///////////////////////////////////////////////////////////////////////
|
|
2
|
+
// File: unicharmap.h
|
|
3
|
+
// Description: Unicode character/ligature to integer id class.
|
|
4
|
+
// Author: Thomas Kielbus
|
|
5
|
+
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
6
|
+
//
|
|
7
|
+
// (C) Copyright 2006, Google Inc.
|
|
8
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
// you may not use this file except in compliance with the License.
|
|
10
|
+
// You may obtain a copy of the License at
|
|
11
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
// See the License for the specific language governing permissions and
|
|
16
|
+
// limitations under the License.
|
|
17
|
+
//
|
|
18
|
+
///////////////////////////////////////////////////////////////////////
|
|
19
|
+
|
|
20
|
+
#ifndef TESSERACT_CCUTIL_UNICHARMAP_H__
|
|
21
|
+
#define TESSERACT_CCUTIL_UNICHARMAP_H__
|
|
22
|
+
|
|
23
|
+
#include "unichar.h"
|
|
24
|
+
|
|
25
|
+
// A UNICHARMAP stores unique unichars. Each of them is associated with one
|
|
26
|
+
// UNICHAR_ID.
|
|
27
|
+
class UNICHARMAP {
|
|
28
|
+
public:
|
|
29
|
+
|
|
30
|
+
// Create an empty UNICHARMAP
|
|
31
|
+
UNICHARMAP();
|
|
32
|
+
|
|
33
|
+
~UNICHARMAP();
|
|
34
|
+
|
|
35
|
+
// Insert the given unichar represention in the UNICHARMAP and associate it
|
|
36
|
+
// with the given id. The length of the representation MUST be non-zero.
|
|
37
|
+
void insert(const char* const unichar_repr, UNICHAR_ID id);
|
|
38
|
+
|
|
39
|
+
// Return the id associated with the given unichar representation,
|
|
40
|
+
// this representation MUST exist within the UNICHARMAP.
|
|
41
|
+
// The length of the representation MUST be non-zero.
|
|
42
|
+
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
|
|
43
|
+
|
|
44
|
+
// Return the id associated with the given unichar representation,
|
|
45
|
+
// this representation MUST exist within the UNICHARMAP. The first
|
|
46
|
+
// length characters (maximum) from unichar_repr are used. The length
|
|
47
|
+
// MUST be non-zero.
|
|
48
|
+
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
|
|
49
|
+
|
|
50
|
+
// Return true if the given unichar representation is already present in the
|
|
51
|
+
// UNICHARMAP. The length of the representation MUST be non-zero.
|
|
52
|
+
bool contains(const char* const unichar_repr) const;
|
|
53
|
+
|
|
54
|
+
// Return true if the given unichar representation is already present in the
|
|
55
|
+
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
|
|
56
|
+
// used. The length MUST be non-zero.
|
|
57
|
+
bool contains(const char* const unichar_repr, int length) const;
|
|
58
|
+
|
|
59
|
+
// Return the minimum number of characters that must be used from this string
|
|
60
|
+
// to obtain a match in the UNICHARMAP.
|
|
61
|
+
int minmatch(const char* const unichar_repr) const;
|
|
62
|
+
|
|
63
|
+
// Clear the UNICHARMAP. All previous data is lost.
|
|
64
|
+
void clear();
|
|
65
|
+
|
|
66
|
+
private:
|
|
67
|
+
|
|
68
|
+
// The UNICHARMAP is represented as a tree whose nodes are of type
|
|
69
|
+
// UNICHARMAP_NODE.
|
|
70
|
+
struct UNICHARMAP_NODE {
|
|
71
|
+
|
|
72
|
+
UNICHARMAP_NODE();
|
|
73
|
+
~UNICHARMAP_NODE();
|
|
74
|
+
|
|
75
|
+
UNICHARMAP_NODE* children;
|
|
76
|
+
UNICHAR_ID id;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
UNICHARMAP_NODE* nodes;
|
|
80
|
+
};
|
|
81
|
+
|
|
82
|
+
#endif // TESSERACT_CCUTIL_UNICHARMAP_H__
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
|
|
2
|
+
///////////////////////////////////////////////////////////////////////
|
|
3
|
+
// File: unicharset.cpp
|
|
4
|
+
// Description: Unicode character/ligature set class.
|
|
5
|
+
// Author: Thomas Kielbus
|
|
6
|
+
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
7
|
+
//
|
|
8
|
+
// (C) Copyright 2006, Google Inc.
|
|
9
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
10
|
+
// you may not use this file except in compliance with the License.
|
|
11
|
+
// You may obtain a copy of the License at
|
|
12
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
13
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
// See the License for the specific language governing permissions and
|
|
17
|
+
// limitations under the License.
|
|
18
|
+
//
|
|
19
|
+
///////////////////////////////////////////////////////////////////////
|
|
20
|
+
|
|
21
|
+
#include <assert.h>
|
|
22
|
+
#include <stdio.h>
|
|
23
|
+
#include <string.h>
|
|
24
|
+
|
|
25
|
+
#include "unichar.h"
|
|
26
|
+
#include "unicharset.h"
|
|
27
|
+
|
|
28
|
+
// Bit flags that pack a unichar's boolean properties into the single
// hexadecimal field written and read by save_to_file() / load_from_file().
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
|
|
32
|
+
|
|
33
|
+
// Construct an empty set: no slots, no script names, and "NULL" as the name
// of the script given to unichars that have none.
UNICHARSET::UNICHARSET()
    : unichars(NULL),
      ids(),
      size_used(0),
      size_reserved(0),
      script_table(0),
      script_table_size_used(0),
      script_table_size_reserved(0),
      null_script("NULL") {}
|
|
44
|
+
|
|
45
|
+
// Free the script-name strings, the script table and the slot array. Nothing
// is released unless slots have been reserved.
UNICHARSET::~UNICHARSET() {
  if (size_reserved <= 0)
    return;

  for (int s = 0; s < script_table_size_used; ++s)
    delete[] script_table[s];
  delete[] script_table;
  delete[] unichars;
}
|
|
53
|
+
|
|
54
|
+
void UNICHARSET::reserve(int unichars_number) {
|
|
55
|
+
if (unichars_number > size_reserved) {
|
|
56
|
+
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
|
|
57
|
+
for (int i = 0; i < size_used; ++i)
|
|
58
|
+
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
|
|
59
|
+
for (int j = size_used; j < unichars_number; ++j)
|
|
60
|
+
unichars_new[j].properties.script = add_script(null_script);
|
|
61
|
+
delete[] unichars;
|
|
62
|
+
unichars = unichars_new;
|
|
63
|
+
size_reserved = unichars_number;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Return the id of unichar_repr, which must already be a member of the set
// (checked with an assert).
const UNICHAR_ID
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
  assert(ids.contains(unichar_repr));
  const UNICHAR_ID found = ids.unichar_to_id(unichar_repr);
  return found;
}
|
|
72
|
+
|
|
73
|
+
// Return the id of the first length bytes (at most) of unichar_repr. That
// prefix must already be a member of the set and length must be in
// [1, UNICHAR_LEN]; both are checked with asserts.
const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                           int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  assert(ids.contains(unichar_repr, length));
  const UNICHAR_ID found = ids.unichar_to_id(unichar_repr, length);
  return found;
}
|
|
79
|
+
|
|
80
|
+
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
|
81
|
+
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
|
|
82
|
+
// is both a short and a long match to the string, return the length that
|
|
83
|
+
// ensures there is a legal match after it.
|
|
84
|
+
int UNICHARSET::step(const char* str) const {
|
|
85
|
+
// Find the length of the first matching unicharset member.
|
|
86
|
+
int minlength = ids.minmatch(str);
|
|
87
|
+
if (minlength == 0)
|
|
88
|
+
return 0; // Empty string or illegal char.
|
|
89
|
+
|
|
90
|
+
int goodlength = minlength;
|
|
91
|
+
while (goodlength <= UNICHAR_LEN) {
|
|
92
|
+
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
|
|
93
|
+
return goodlength; // This length works!
|
|
94
|
+
// The next char is illegal so find the next usable length.
|
|
95
|
+
do {
|
|
96
|
+
++goodlength;
|
|
97
|
+
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
|
|
98
|
+
!ids.contains(str, goodlength));
|
|
99
|
+
}
|
|
100
|
+
// Search to find a subsequent legal char failed so return the minlength.
|
|
101
|
+
return minlength;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Return the string representation stored for id. id must be smaller than
// size() (checked with an assert).
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  assert(id < size());
  const UNICHAR_SLOT& slot = unichars[id];
  return slot.representation;
}
|
|
108
|
+
|
|
109
|
+
// Return a STRING containing debug information on the unichar, including
|
|
110
|
+
// the id_to_unichar, its hex unicodes and the properties.
|
|
111
|
+
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
|
112
|
+
const char* str = id_to_unichar(id);
|
|
113
|
+
STRING result = str;
|
|
114
|
+
result += " [";
|
|
115
|
+
int step = 1;
|
|
116
|
+
// Chop into unicodes and code each as hex.
|
|
117
|
+
for (int i = 0; str[i] != '\0'; i += step) {
|
|
118
|
+
char hex[sizeof(int) * 2 + 1];
|
|
119
|
+
step = UNICHAR::utf8_step(str + i);
|
|
120
|
+
if (step == 0) {
|
|
121
|
+
step = 1;
|
|
122
|
+
sprintf(hex, "%x", str[i]);
|
|
123
|
+
} else {
|
|
124
|
+
UNICHAR ch(str + i, step);
|
|
125
|
+
sprintf(hex, "%x", ch.first_uni());
|
|
126
|
+
}
|
|
127
|
+
result += hex;
|
|
128
|
+
result += " ";
|
|
129
|
+
}
|
|
130
|
+
result += "]";
|
|
131
|
+
// Append a for lower alpha, A for upper alpha, and x if alpha but neither.
|
|
132
|
+
if (get_isalpha(id)) {
|
|
133
|
+
if (get_islower(id))
|
|
134
|
+
result += "a";
|
|
135
|
+
else if (get_isupper(id))
|
|
136
|
+
result += "A";
|
|
137
|
+
else
|
|
138
|
+
result += "x";
|
|
139
|
+
}
|
|
140
|
+
// Append 0 if a digit.
|
|
141
|
+
if (get_isdigit(id)) {
|
|
142
|
+
result += "0";
|
|
143
|
+
}
|
|
144
|
+
return result;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
|
148
|
+
if (!ids.contains(unichar_repr)) {
|
|
149
|
+
if (size_used == size_reserved) {
|
|
150
|
+
if (size_used == 0)
|
|
151
|
+
reserve(8);
|
|
152
|
+
else
|
|
153
|
+
reserve(2 * size_used);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
strcpy(unichars[size_used].representation, unichar_repr);
|
|
157
|
+
this->set_isalpha(size_used, false);
|
|
158
|
+
this->set_islower(size_used, false);
|
|
159
|
+
this->set_isupper(size_used, false);
|
|
160
|
+
this->set_isdigit(size_used, false);
|
|
161
|
+
this->set_script(size_used, add_script(null_script));
|
|
162
|
+
this->unichars[size_used].properties.enabled = true;
|
|
163
|
+
ids.insert(unichar_repr, size_used);
|
|
164
|
+
++size_used;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
|
|
169
|
+
return ids.contains(unichar_repr);
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
|
|
173
|
+
return ids.contains(unichar_repr, length);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Return true if the representation stored for unichar_id is exactly
// unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
  const char* stored = id_to_unichar(unichar_id);
  return strcmp(stored, unichar_repr) == 0;
}
|
|
179
|
+
|
|
180
|
+
bool UNICHARSET::save_to_file(const char* filename) const {
|
|
181
|
+
FILE* file = fopen(filename, "w+");
|
|
182
|
+
|
|
183
|
+
if (file == NULL)
|
|
184
|
+
return false;
|
|
185
|
+
|
|
186
|
+
fprintf(file, "%d\n", this->size());
|
|
187
|
+
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
|
188
|
+
unsigned int properties = 0;
|
|
189
|
+
|
|
190
|
+
if (this->get_isalpha(id))
|
|
191
|
+
properties |= ISALPHA_MASK;
|
|
192
|
+
if (this->get_islower(id))
|
|
193
|
+
properties |= ISLOWER_MASK;
|
|
194
|
+
if (this->get_isupper(id))
|
|
195
|
+
properties |= ISUPPER_MASK;
|
|
196
|
+
if (this->get_isdigit(id))
|
|
197
|
+
properties |= ISDIGIT_MASK;
|
|
198
|
+
|
|
199
|
+
if (strcmp(this->id_to_unichar(id), " ") == 0)
|
|
200
|
+
fprintf(file, "%s %x %s\n", "NULL", properties, this->get_script(id));
|
|
201
|
+
else
|
|
202
|
+
fprintf(file, "%s %x %s\n", this->id_to_unichar(id), properties,
|
|
203
|
+
this->get_script(id));
|
|
204
|
+
}
|
|
205
|
+
fclose(file);
|
|
206
|
+
return true;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
bool UNICHARSET::load_from_file(const char* filename) {
|
|
210
|
+
FILE* file = fopen(filename, "r");
|
|
211
|
+
int unicharset_size;
|
|
212
|
+
char buffer[256];
|
|
213
|
+
|
|
214
|
+
if (file == NULL)
|
|
215
|
+
return false;
|
|
216
|
+
|
|
217
|
+
this->clear();
|
|
218
|
+
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
|
219
|
+
sscanf(buffer, "%d", &unicharset_size) != 1) {
|
|
220
|
+
fclose(file);
|
|
221
|
+
return false;
|
|
222
|
+
}
|
|
223
|
+
this->reserve(unicharset_size);
|
|
224
|
+
for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
|
|
225
|
+
char unichar[256];
|
|
226
|
+
unsigned int properties;
|
|
227
|
+
char script[64];
|
|
228
|
+
|
|
229
|
+
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
|
230
|
+
(sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
|
|
231
|
+
!(sscanf(buffer, "%s %x", unichar, &properties) == 2 &&
|
|
232
|
+
strcpy(script, null_script)))) {
|
|
233
|
+
fclose(file);
|
|
234
|
+
return false;
|
|
235
|
+
}
|
|
236
|
+
if (strcmp(unichar, "NULL") == 0)
|
|
237
|
+
this->unichar_insert(" ");
|
|
238
|
+
else
|
|
239
|
+
this->unichar_insert(unichar);
|
|
240
|
+
|
|
241
|
+
this->set_isalpha(id, properties & ISALPHA_MASK);
|
|
242
|
+
this->set_islower(id, properties & ISLOWER_MASK);
|
|
243
|
+
this->set_isupper(id, properties & ISUPPER_MASK);
|
|
244
|
+
this->set_isdigit(id, properties & ISDIGIT_MASK);
|
|
245
|
+
this->set_script(id, add_script(script));
|
|
246
|
+
this->unichars[id].properties.enabled = true;
|
|
247
|
+
}
|
|
248
|
+
fclose(file);
|
|
249
|
+
return true;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
// Set a whitelist and/or blacklist of characters to recognize.
|
|
253
|
+
// An empty or NULL whitelist enables everything (minus any blacklist).
|
|
254
|
+
// An empty or NULL blacklist disables nothing.
|
|
255
|
+
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
|
256
|
+
const char* whitelist) {
|
|
257
|
+
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
|
|
258
|
+
// Set everything to default
|
|
259
|
+
for (int ch = 0; ch < size_used; ++ch)
|
|
260
|
+
unichars[ch].properties.enabled = def_enabled;
|
|
261
|
+
int ch_step;
|
|
262
|
+
if (!def_enabled) {
|
|
263
|
+
// Enable the whitelist.
|
|
264
|
+
for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
|
|
265
|
+
ch_step = step(whitelist + w_ind);
|
|
266
|
+
if (ch_step > 0) {
|
|
267
|
+
UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
|
|
268
|
+
unichars[u_id].properties.enabled = true;
|
|
269
|
+
} else {
|
|
270
|
+
ch_step = 1;
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
if (blacklist != NULL && blacklist[0] != '\0') {
|
|
275
|
+
// Disable the blacklist.
|
|
276
|
+
for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
|
|
277
|
+
ch_step = step(blacklist + b_ind);
|
|
278
|
+
if (ch_step > 0) {
|
|
279
|
+
UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
|
|
280
|
+
unichars[u_id].properties.enabled = false;
|
|
281
|
+
} else {
|
|
282
|
+
ch_step = 1;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
char* UNICHARSET::add_script(const char* script) {
|
|
289
|
+
for (int i = 0; i < script_table_size_used; ++i) {
|
|
290
|
+
if (strcmp(script, script_table[i]) == 0)
|
|
291
|
+
return script_table[i];
|
|
292
|
+
}
|
|
293
|
+
if (script_table_size_reserved == 0) {
|
|
294
|
+
script_table_size_reserved = 8;
|
|
295
|
+
script_table = new char*[script_table_size_reserved];
|
|
296
|
+
}
|
|
297
|
+
if (script_table_size_used + 1 >= script_table_size_reserved) {
|
|
298
|
+
char** new_script_table = new char*[script_table_size_reserved * 2];
|
|
299
|
+
memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
|
|
300
|
+
delete[] script_table;
|
|
301
|
+
script_table = new_script_table;
|
|
302
|
+
script_table_size_reserved = 2 * script_table_size_reserved;
|
|
303
|
+
}
|
|
304
|
+
script_table[script_table_size_used] = new char[strlen(script) + 1];
|
|
305
|
+
strcpy(script_table[script_table_size_used], script);
|
|
306
|
+
return script_table[script_table_size_used++];
|
|
307
|
+
}
|