charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
#!/bin/sh
# Run this to generate all the initial makefiles, etc.
#
# Steps performed:
#   1. verify that autoconf, automake/aclocal and (when configure.in asks
#      for them) libtool and gettext can be executed;
#   2. for every configure.in found below the script's directory, run
#      gettextize/libtoolize/aclocal/autoheader/automake/autoconf as needed;
#   3. run configure with the script's arguments unless NOCONFIGURE is set.
#
# Paths are quoted throughout so a checkout located in a directory whose
# name contains spaces is still handled correctly.

srcdir=`dirname "$0"`
PKG_NAME="the package."

DIE=0

#(grep "^:ext:" $srcdir/CVS/Root >/dev/null) || {
#  echo Use configure, autogen.sh is for developers only!
#  DIE=1
#}

(autoconf --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`autoconf' installed."
  echo "Download the appropriate package for your distribution,"
  echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/"
  DIE=1
}

(grep "^AM_PROG_LIBTOOL" "$srcdir/configure.in" >/dev/null) && {
  (libtool --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`libtool' installed."
    echo "Get ftp://ftp.gnu.org/pub/gnu/libtool-1.2d.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

# gettext is only required when configure.in uses it and has already been
# converted away from the old "sed ... POTFILES" form.
grep "^AM_GNU_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

grep "^AM_GNOME_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

(automake --version --copy) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`automake' installed."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
  NO_AUTOMAKE=yes
}


# if no automake, don't bother testing for aclocal
test -n "$NO_AUTOMAKE" || (aclocal --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: Missing \`aclocal'.  The version of \`automake'"
  echo "installed doesn't appear recent enough."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
}

if test "$DIE" -eq 1; then
  exit 1
fi

if test -z "$*"; then
  echo "**Warning**: I am going to run \`configure' with no arguments."
  echo "If you wish to pass any to it, please specify them on the"
  echo \`$0\'" command line."
  echo
fi

case $CC in
xlc )
  am_opt=--include-deps;;
esac

# NOTE: the find output is still split on whitespace by the for loop, so a
# configure.in path containing blanks below $srcdir is not supported here.
for coin in `find "$srcdir" -name configure.in -print`
do
  dr=`dirname "$coin"`
  if test -f "$dr/NO-AUTO-GEN"; then
    echo skipping $dr -- flagged as no auto-gen
  else
    echo processing $dr
    macrodirs=`sed -n -e 's,AM_ACLOCAL_INCLUDE(\(.*\)),\1,gp' < "$coin"`
    ( # Work inside the directory; give up on it if we cannot enter it
      # instead of running the autotools in the wrong place.
      cd "$dr" || exit 1
      aclocalinclude="$ACLOCAL_FLAGS"
      for k in $macrodirs; do
        if test -d "$k"; then
          aclocalinclude="$aclocalinclude -I $k"
        ##else
        ##  echo "**Warning**: No such directory \`$k'.  Ignored."
        fi
      done
      # We are already inside $dr, so aclocal.m4 is addressed relative to the
      # current directory; "$dr/aclocal.m4" would be wrong for a relative $dr.
      if grep "^AM_GNU_GETTEXT" configure.in >/dev/null; then
        if grep "sed.*POTFILES" configure.in >/dev/null; then
          : do nothing -- we still have an old unmodified configure.in
        else
          echo "Creating $dr/aclocal.m4 ..."
          test -r aclocal.m4 || touch aclocal.m4
          echo "Running gettextize...  Ignore non-fatal messages."
          echo "no" | gettextize --force --copy
          echo "Making $dr/aclocal.m4 writable ..."
          test -r aclocal.m4 && chmod u+w aclocal.m4
        fi
      fi
      if grep "^AM_GNOME_GETTEXT" configure.in >/dev/null; then
        echo "Creating $dr/aclocal.m4 ..."
        test -r aclocal.m4 || touch aclocal.m4
        echo "Running gettextize...  Ignore non-fatal messages."
        echo "no" | gettextize --force --copy
        echo "Making $dr/aclocal.m4 writable ..."
        test -r aclocal.m4 && chmod u+w aclocal.m4
      fi
      if grep "^AM_PROG_LIBTOOL" configure.in >/dev/null; then
        echo "Running libtoolize..."
        libtoolize --force --copy
      fi
      echo "Running aclocal $aclocalinclude ..."
      aclocal $aclocalinclude
      if grep "^AM_CONFIG_HEADER" configure.in >/dev/null; then
        echo "Running autoheader..."
        autoheader
      fi
      echo "Running automake --gnu $am_opt ..."
      automake --add-missing --gnu $am_opt
      echo "Running autoconf ..."
      autoconf
    )
  fi
done

#conf_flags="--enable-maintainer-mode --enable-compile-warnings" #--enable-iso-c

if test "x$NOCONFIGURE" = x; then
  echo Running $srcdir/configure $conf_flags "$@" ...
  "$srcdir/configure" $conf_flags "$@" \
  && echo Now type \`make\' to compile $PKG_NAME
else
  echo Skipping configure process.
fi
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "big5Prober.h"
|
|
22
|
+
|
|
23
|
+
void nsBig5Prober::Reset(void)
|
|
24
|
+
{
|
|
25
|
+
mCodingSM->Reset();
|
|
26
|
+
mState = eDetecting;
|
|
27
|
+
mDistributionAnalyser.Reset();
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
// Feed one chunk of input to the prober and return the resulting state.
//   aBuf - bytes to examine
//   aLen - number of bytes in aBuf; 0 is allowed and leaves the state as is
//
// Each byte is pushed through the coding state machine:
//   eError -> this is not Big5 (eNotMe), stop scanning
//   eItsMe -> positively identified (eFoundIt), stop scanning
//   eStart -> the machine is back at its start state, which presumably
//             means a complete character was just consumed; that character
//             is handed to the distribution analyser.
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  nsSMState codingState;

  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
    if (codingState == eError)
    {
      mState = eNotMe;
      break;
    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    if (codingState == eStart)
    {
      PRUint32 charLen = mCodingSM->GetCurrentCharLen();

      if (i == 0)
      {
        // The character may have started in the previous chunk: pair the
        // byte remembered in mLastChar[0] with the first byte of this one.
        mLastChar[1] = aBuf[0];
        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
      }
      else
        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
    }
  }

  // Remember the final byte for the next call. With aLen == 0 the unsigned
  // expression aLen-1 wraps around and would index far outside aBuf, so the
  // previously remembered byte is simply kept.
  if (aLen > 0)
    mLastChar[0] = aBuf[aLen-1];

  // Shortcut: once enough data has been seen and confidence is high, stop.
  if (mState == eDetecting)
    if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
      mState = eFoundIt;

  return mState;
}
|
|
69
|
+
|
|
70
|
+
float nsBig5Prober::GetConfidence(void)
|
|
71
|
+
{
|
|
72
|
+
float distribCf = mDistributionAnalyser.GetConfidence();
|
|
73
|
+
|
|
74
|
+
return (float)distribCf;
|
|
75
|
+
}
|
|
76
|
+
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsBig5Prober_h__
|
|
22
|
+
#define nsBig5Prober_h__
|
|
23
|
+
|
|
24
|
+
#include "charsetProber.h"
|
|
25
|
+
#include "codingStateMachine.h"
|
|
26
|
+
#include "charDistribution.h"
|
|
27
|
+
|
|
28
|
+
class nsBig5Prober: public nsCharSetProber {
|
|
29
|
+
public:
|
|
30
|
+
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
|
31
|
+
Reset();};
|
|
32
|
+
virtual ~nsBig5Prober(void) {delete mCodingSM;};
|
|
33
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
34
|
+
const char* GetCharSetName() {return "Big5";};
|
|
35
|
+
nsProbingState GetState(void) {return mState;};
|
|
36
|
+
void Reset(void);
|
|
37
|
+
float GetConfidence(void);
|
|
38
|
+
void SetOpion() {};
|
|
39
|
+
|
|
40
|
+
protected:
|
|
41
|
+
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
|
42
|
+
|
|
43
|
+
nsCodingStateMachine* mCodingSM;
|
|
44
|
+
nsProbingState mState;
|
|
45
|
+
|
|
46
|
+
//Big5ContextAnalysis mContextAnalyser;
|
|
47
|
+
Big5DistributionAnalysis mDistributionAnalyser;
|
|
48
|
+
char mLastChar[2];
|
|
49
|
+
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
#endif /* nsBig5Prober_h__ */
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "charDistribution.h"
|
|
22
|
+
|
|
23
|
+
#include "JISFreq.tab"
|
|
24
|
+
#include "Big5Freq.tab"
|
|
25
|
+
#include "EUCKRFreq.tab"
|
|
26
|
+
#include "EUCTWFreq.tab"
|
|
27
|
+
#include "GB2312Freq.tab"
|
|
28
|
+
|
|
29
|
+
#define SURE_YES 0.99f
|
|
30
|
+
#define SURE_NO 0.01f
|
|
31
|
+
|
|
32
|
+
//return confidence base on received data
|
|
33
|
+
float CharDistributionAnalysis::GetConfidence()
|
|
34
|
+
{
|
|
35
|
+
//if we didn't receive any character in our consideration range, return negative answer
|
|
36
|
+
if (mTotalChars <= 0)
|
|
37
|
+
return SURE_NO;
|
|
38
|
+
|
|
39
|
+
if (mTotalChars != mFreqChars) {
|
|
40
|
+
float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
|
|
41
|
+
|
|
42
|
+
if (r < SURE_YES)
|
|
43
|
+
return r;
|
|
44
|
+
}
|
|
45
|
+
//normalize confidence, (we don't want to be 100% sure)
|
|
46
|
+
return SURE_YES;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
//Bind the EUC-TW char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser.
EUCTWDistributionAnalysis::EUCTWDistributionAnalysis()
{
  mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = EUCTW_TABLE_SIZE;
  mCharToFreqOrder          = EUCTWCharToFreqOrder;
}
|
|
55
|
+
|
|
56
|
+
//Bind the EUC-KR char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser.
EUCKRDistributionAnalysis::EUCKRDistributionAnalysis()
{
  mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = EUCKR_TABLE_SIZE;
  mCharToFreqOrder          = EUCKRCharToFreqOrder;
}
|
|
62
|
+
|
|
63
|
+
//Bind the GB2312 char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser.
GB2312DistributionAnalysis::GB2312DistributionAnalysis()
{
  mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = GB2312_TABLE_SIZE;
  mCharToFreqOrder          = GB2312CharToFreqOrder;
}
|
|
69
|
+
|
|
70
|
+
//Bind the Big5 char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser.
Big5DistributionAnalysis::Big5DistributionAnalysis()
{
  mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = BIG5_TABLE_SIZE;
  mCharToFreqOrder          = Big5CharToFreqOrder;
}
|
|
76
|
+
|
|
77
|
+
//Bind the JIS char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser (the same table EUC-JP uses).
SJISDistributionAnalysis::SJISDistributionAnalysis()
{
  mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = JIS_TABLE_SIZE;
  mCharToFreqOrder          = JISCharToFreqOrder;
}
|
|
83
|
+
|
|
84
|
+
//Bind the JIS char-to-frequency-order table, its size and its typical
//distribution ratio to this analyser (the same table Shift_JIS uses).
EUCJPDistributionAnalysis::EUCJPDistributionAnalysis()
{
  mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
  mTableSize                = JIS_TABLE_SIZE;
  mCharToFreqOrder          = JISCharToFreqOrder;
}
|
|
90
|
+
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef CharDistribution_h__
|
|
22
|
+
#define CharDistribution_h__
|
|
23
|
+
|
|
24
|
+
#include "types.h"
|
|
25
|
+
|
|
26
|
+
#define ENOUGH_DATA_THRESHOLD 1024
|
|
27
|
+
|
|
28
|
+
class CharDistributionAnalysis
|
|
29
|
+
{
|
|
30
|
+
public:
|
|
31
|
+
CharDistributionAnalysis() {Reset();};
|
|
32
|
+
|
|
33
|
+
//feed a block of data and do distribution analysis
|
|
34
|
+
void HandleData(const char* aBuf, PRUint32 aLen) {};
|
|
35
|
+
|
|
36
|
+
//Feed a character with known length
|
|
37
|
+
void HandleOneChar(const char* aStr, PRUint32 aCharLen)
|
|
38
|
+
{
|
|
39
|
+
PRInt32 order;
|
|
40
|
+
|
|
41
|
+
//we only care about 2-bytes character in our distribution analysis
|
|
42
|
+
order = (aCharLen == 2) ? GetOrder(aStr) : -1;
|
|
43
|
+
|
|
44
|
+
if (order >= 0)
|
|
45
|
+
{
|
|
46
|
+
mTotalChars++;
|
|
47
|
+
//order is valid
|
|
48
|
+
if ((PRUint32)order < mTableSize)
|
|
49
|
+
{
|
|
50
|
+
if (512 > mCharToFreqOrder[order])
|
|
51
|
+
mFreqChars++;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
};
|
|
55
|
+
|
|
56
|
+
//return confidence base on existing data
|
|
57
|
+
float GetConfidence();
|
|
58
|
+
|
|
59
|
+
//Reset analyser, clear any state
|
|
60
|
+
void Reset(void)
|
|
61
|
+
{
|
|
62
|
+
mDone = PR_FALSE;
|
|
63
|
+
mTotalChars = 0;
|
|
64
|
+
mFreqChars = 0;
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
//This function is for future extension. Caller can use this function to control
|
|
68
|
+
//analyser's behavior
|
|
69
|
+
void SetOpion(){};
|
|
70
|
+
|
|
71
|
+
//It is not necessary to receive all data to draw conclusion. For charset detection,
|
|
72
|
+
// certain amount of data is enough
|
|
73
|
+
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
|
|
74
|
+
|
|
75
|
+
protected:
|
|
76
|
+
//we do not handle character base on its original encoding string, but
|
|
77
|
+
//convert this encoding string to a number, here called order.
|
|
78
|
+
//This allow multiple encoding of a language to share one frequency table
|
|
79
|
+
virtual PRInt32 GetOrder(const char* str) {return -1;};
|
|
80
|
+
|
|
81
|
+
//If this flag is set to PR_TRUE, detection is done and conclusion has been made
|
|
82
|
+
PRBool mDone;
|
|
83
|
+
|
|
84
|
+
//The number of characters whose frequency order is less than 512
|
|
85
|
+
PRUint32 mFreqChars;
|
|
86
|
+
|
|
87
|
+
//Total character encounted.
|
|
88
|
+
PRUint32 mTotalChars;
|
|
89
|
+
|
|
90
|
+
//Mapping table to get frequency order from char order (get from GetOrder())
|
|
91
|
+
const PRInt16 *mCharToFreqOrder;
|
|
92
|
+
|
|
93
|
+
//Size of above table
|
|
94
|
+
PRUint32 mTableSize;
|
|
95
|
+
|
|
96
|
+
//This is a constant value varies from language to language, it is used in
|
|
97
|
+
//calculating confidence. See my paper for further detail.
|
|
98
|
+
float mTypicalDistributionRatio;
|
|
99
|
+
};
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class EUCTWDistributionAnalysis: public CharDistributionAnalysis
|
|
103
|
+
{
|
|
104
|
+
public:
|
|
105
|
+
EUCTWDistributionAnalysis();
|
|
106
|
+
protected:
|
|
107
|
+
|
|
108
|
+
//for euc-TW encoding, we are interested
|
|
109
|
+
// first byte range: 0xc4 -- 0xfe
|
|
110
|
+
// second byte range: 0xa1 -- 0xfe
|
|
111
|
+
//no validation needed here. State machine has done that
|
|
112
|
+
PRInt32 GetOrder(const char* str)
|
|
113
|
+
{ if ((unsigned char)*str >= (unsigned char)0xc4)
|
|
114
|
+
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
|
|
115
|
+
else
|
|
116
|
+
return -1;
|
|
117
|
+
};
|
|
118
|
+
};
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class EUCKRDistributionAnalysis : public CharDistributionAnalysis
|
|
122
|
+
{
|
|
123
|
+
public:
|
|
124
|
+
EUCKRDistributionAnalysis();
|
|
125
|
+
protected:
|
|
126
|
+
//for euc-KR encoding, we are interested
|
|
127
|
+
// first byte range: 0xb0 -- 0xfe
|
|
128
|
+
// second byte range: 0xa1 -- 0xfe
|
|
129
|
+
//no validation needed here. State machine has done that
|
|
130
|
+
PRInt32 GetOrder(const char* str)
|
|
131
|
+
{ if ((unsigned char)*str >= (unsigned char)0xb0)
|
|
132
|
+
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
|
133
|
+
else
|
|
134
|
+
return -1;
|
|
135
|
+
};
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
class GB2312DistributionAnalysis : public CharDistributionAnalysis
|
|
139
|
+
{
|
|
140
|
+
public:
|
|
141
|
+
GB2312DistributionAnalysis();
|
|
142
|
+
protected:
|
|
143
|
+
//for GB2312 encoding, we are interested
|
|
144
|
+
// first byte range: 0xb0 -- 0xfe
|
|
145
|
+
// second byte range: 0xa1 -- 0xfe
|
|
146
|
+
//no validation needed here. State machine has done that
|
|
147
|
+
PRInt32 GetOrder(const char* str)
|
|
148
|
+
{ if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
|
|
149
|
+
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
|
150
|
+
else
|
|
151
|
+
return -1;
|
|
152
|
+
};
|
|
153
|
+
};
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class Big5DistributionAnalysis : public CharDistributionAnalysis
|
|
157
|
+
{
|
|
158
|
+
public:
|
|
159
|
+
Big5DistributionAnalysis();
|
|
160
|
+
protected:
|
|
161
|
+
//for big5 encoding, we are interested
|
|
162
|
+
// first byte range: 0xa4 -- 0xfe
|
|
163
|
+
// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
|
164
|
+
//no validation needed here. State machine has done that
|
|
165
|
+
PRInt32 GetOrder(const char* str)
|
|
166
|
+
{ if ((unsigned char)*str >= (unsigned char)0xa4)
|
|
167
|
+
if ((unsigned char)str[1] >= (unsigned char)0xa1)
|
|
168
|
+
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
|
|
169
|
+
else
|
|
170
|
+
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
|
|
171
|
+
else
|
|
172
|
+
return -1;
|
|
173
|
+
};
|
|
174
|
+
};
|
|
175
|
+
|
|
176
|
+
class SJISDistributionAnalysis : public CharDistributionAnalysis
|
|
177
|
+
{
|
|
178
|
+
public:
|
|
179
|
+
SJISDistributionAnalysis();
|
|
180
|
+
protected:
|
|
181
|
+
//for sjis encoding, we are interested
|
|
182
|
+
// first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
|
183
|
+
// second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
|
184
|
+
//no validation needed here. State machine has done that
|
|
185
|
+
PRInt32 GetOrder(const char* str)
|
|
186
|
+
{
|
|
187
|
+
PRInt32 order;
|
|
188
|
+
if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
|
|
189
|
+
order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
|
|
190
|
+
else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
|
|
191
|
+
order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
|
|
192
|
+
else
|
|
193
|
+
return -1;
|
|
194
|
+
order += (unsigned char)*(str+1) - 0x40;
|
|
195
|
+
if ((unsigned char)str[1] > (unsigned char)0x7f)
|
|
196
|
+
order--;
|
|
197
|
+
return order;
|
|
198
|
+
};
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
class EUCJPDistributionAnalysis : public CharDistributionAnalysis
|
|
202
|
+
{
|
|
203
|
+
public:
|
|
204
|
+
EUCJPDistributionAnalysis();
|
|
205
|
+
protected:
|
|
206
|
+
//for euc-JP encoding, we are interested
|
|
207
|
+
// first byte range: 0xa0 -- 0xfe
|
|
208
|
+
// second byte range: 0xa1 -- 0xfe
|
|
209
|
+
//no validation needed here. State machine has done that
|
|
210
|
+
PRInt32 GetOrder(const char* str)
|
|
211
|
+
{ if ((unsigned char)*str >= (unsigned char)0xa0)
|
|
212
|
+
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
|
|
213
|
+
else
|
|
214
|
+
return -1;
|
|
215
|
+
};
|
|
216
|
+
};
|
|
217
|
+
|
|
218
|
+
#endif //CharDistribution_h__
|
|
219
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "universal.h"
|
|
22
|
+
#include "string.h"
|
|
23
|
+
|
|
24
|
+
// Detector instance used by the C entry points in this file; null until
// CharGuessInit() has created it.
nsUniversalDetector* det = NULL;
|
|
25
|
+
|
|
26
|
+
extern "C" int CharGuessInit(void)
|
|
27
|
+
{
|
|
28
|
+
det = new nsUniversalDetector;
|
|
29
|
+
if (det)
|
|
30
|
+
return 0;
|
|
31
|
+
else
|
|
32
|
+
return -1;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
extern "C" const char* GuessChardet(const char *str)
|
|
36
|
+
{
|
|
37
|
+
const char* tmp;
|
|
38
|
+
|
|
39
|
+
if (!det)
|
|
40
|
+
{
|
|
41
|
+
CharGuessInit();
|
|
42
|
+
return NULL;
|
|
43
|
+
}
|
|
44
|
+
det->Reset();
|
|
45
|
+
det->HandleData(str, strlen(str));
|
|
46
|
+
det->DataEnd();
|
|
47
|
+
tmp = det->GetCharset();
|
|
48
|
+
|
|
49
|
+
return tmp;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
extern "C" int CharGuessDone(void)
|
|
53
|
+
{
|
|
54
|
+
if (det)
|
|
55
|
+
delete det;
|
|
56
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef CHARGUESS_H_INCLUDED
#define CHARGUESS_H_INCLUDED

/* The functions are defined with C linkage; declare them that way for C++
   includers as well so the names match at link time. */
#ifdef __cplusplus
extern "C" {
#endif

/* Create the detector. Returns 0 on success, -1 on failure. */
int CharGuessInit(void);

/* Guess the charset of the NUL-terminated string str.
   Returns the charset name reported by the detector, or NULL when no
   answer is available. NOTE(review): ownership of the returned string is
   not visible from this header -- presumably it must not be freed; confirm
   against the detector implementation. */
const char* GuessChardet(const char *str);

/* Destroy the detector created by CharGuessInit(). */
int CharGuessDone(void);

#ifdef __cplusplus
}
#endif

#endif /* CHARGUESS_H_INCLUDED */
|