charguess 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,153 @@
1
#!/bin/sh
# Run this to generate all the initial makefiles, etc.
#
# The script first checks that the autotools required by configure.in are
# installed, then regenerates the build system in every directory below
# $srcdir that contains a configure.in (unless that directory holds a
# NO-AUTO-GEN marker file), and finally runs configure with the arguments
# given on the command line.
#
# Variables read from the environment:
#   ACLOCAL_FLAGS  extra flags handed to aclocal
#   CC             when it is "xlc", automake is run with --include-deps
#   NOCONFIGURE    when non-empty, the final configure run is skipped
#   conf_flags     extra flags placed before "$@" on the configure command line

srcdir=`dirname "$0"`
PKG_NAME="the package."

# Set to 1 by any failed prerequisite check; all checks run before we bail out
# so that the user sees every missing tool at once.
DIE=0

#(grep "^:ext:" $srcdir/CVS/Root >/dev/null) || {
#  echo Use configure, autogen.sh is for developers only!
#  DIE=1
#}

(autoconf --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`autoconf' installed to compile $PKG_NAME"
  echo "Download the appropriate package for your distribution,"
  echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/"
  DIE=1
}

# libtool is only required when configure.in asks for it.
(grep "^AM_PROG_LIBTOOL" "$srcdir/configure.in" >/dev/null) && {
  (libtool --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`libtool' installed."
    echo "Get ftp://ftp.gnu.org/pub/gnu/libtool-1.2d.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

# gettext is only required when configure.in uses AM_GNU_GETTEXT and does not
# contain the "sed.*POTFILES" pattern.
grep "^AM_GNU_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

# Same check for the GNOME flavour of the gettext macro.
grep "^AM_GNOME_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

(automake --version --copy) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`automake' installed."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
  NO_AUTOMAKE=yes
}


# if no automake, don't bother testing for aclocal
test -n "$NO_AUTOMAKE" || (aclocal --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: Missing \`aclocal'.  The version of \`automake'"
  echo "installed doesn't appear recent enough."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
}

if test "$DIE" -eq 1; then
  exit 1
fi

if test -z "$*"; then
  echo "**Warning**: I am going to run \`configure' with no arguments."
  echo "If you wish to pass any to it, please specify them on the"
  echo \`$0\'" command line."
  echo
fi

case $CC in
xlc )
  am_opt=--include-deps;;
esac

# The find output is deliberately word-split: one configure.in path per word.
for coin in `find "$srcdir" -name configure.in -print`
do
  dr=`dirname "$coin"`
  if test -f "$dr/NO-AUTO-GEN"; then
    echo "skipping $dr -- flagged as no auto-gen"
  else
    echo "processing $dr"
    macrodirs=`sed -n -e 's,AM_ACLOCAL_INCLUDE(\(.*\)),\1,gp' < "$coin"`
    # Everything for one directory runs in a subshell so that the cd does not
    # leak into the next iteration.  If the cd fails, only this directory is
    # abandoned instead of running the autotools in the wrong place.
    ( cd "$dr" || exit 1
      aclocalinclude="$ACLOCAL_FLAGS"
      for k in $macrodirs; do
        if test -d "$k"; then
          aclocalinclude="$aclocalinclude -I $k"
        ##else
        ##  echo "**Warning**: No such directory \`$k'.  Ignored."
        fi
      done
      # From here on we are inside $dr, so aclocal.m4 is addressed relative to
      # the current directory; "$dr" is only used in the messages.
      if grep "^AM_GNU_GETTEXT" configure.in >/dev/null; then
        if grep "sed.*POTFILES" configure.in >/dev/null; then
          : do nothing -- we still have an old unmodified configure.in
        else
          echo "Creating $dr/aclocal.m4 ..."
          test -r aclocal.m4 || touch aclocal.m4
          echo "Running gettextize... Ignore non-fatal messages."
          echo "no" | gettextize --force --copy
          echo "Making $dr/aclocal.m4 writable ..."
          test -r aclocal.m4 && chmod u+w aclocal.m4
        fi
      fi
      if grep "^AM_GNOME_GETTEXT" configure.in >/dev/null; then
        echo "Creating $dr/aclocal.m4 ..."
        test -r aclocal.m4 || touch aclocal.m4
        echo "Running gettextize... Ignore non-fatal messages."
        echo "no" | gettextize --force --copy
        echo "Making $dr/aclocal.m4 writable ..."
        test -r aclocal.m4 && chmod u+w aclocal.m4
      fi
      if grep "^AM_PROG_LIBTOOL" configure.in >/dev/null; then
        echo "Running libtoolize..."
        libtoolize --force --copy
      fi
      echo "Running aclocal $aclocalinclude ..."
      # $aclocalinclude is intentionally unquoted: it is a list of flags.
      aclocal $aclocalinclude
      if grep "^AM_CONFIG_HEADER" configure.in >/dev/null; then
        echo "Running autoheader..."
        autoheader
      fi
      echo "Running automake --gnu $am_opt ..."
      automake --add-missing --gnu $am_opt
      echo "Running autoconf ..."
      autoconf
    )
  fi
done

#conf_flags="--enable-maintainer-mode --enable-compile-warnings" #--enable-iso-c

if test "x$NOCONFIGURE" = x; then
  echo Running $srcdir/configure $conf_flags "$@" ...
  "$srcdir/configure" $conf_flags "$@" \
  && echo Now type \`make\' to compile $PKG_NAME
else
  echo Skipping configure process.
fi
@@ -0,0 +1,76 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "big5Prober.h"
22
+
23
+ void nsBig5Prober::Reset(void)
24
+ {
25
+ mCodingSM->Reset();
26
+ mState = eDetecting;
27
+ mDistributionAnalyser.Reset();
28
+ }
29
+
30
+ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
31
+ {
32
+ nsSMState codingState;
33
+
34
+ for (PRUint32 i = 0; i < aLen; i++)
35
+ {
36
+ codingState = mCodingSM->NextState(aBuf[i]);
37
+ if (codingState == eError)
38
+ {
39
+ mState = eNotMe;
40
+ break;
41
+ }
42
+ if (codingState == eItsMe)
43
+ {
44
+ mState = eFoundIt;
45
+ break;
46
+ }
47
+ if (codingState == eStart)
48
+ {
49
+ PRUint32 charLen = mCodingSM->GetCurrentCharLen();
50
+
51
+ if (i == 0)
52
+ {
53
+ mLastChar[1] = aBuf[0];
54
+ mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
55
+ }
56
+ else
57
+ mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
58
+ }
59
+ }
60
+
61
+ mLastChar[0] = aBuf[aLen-1];
62
+
63
+ if (mState == eDetecting)
64
+ if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
65
+ mState = eFoundIt;
66
+
67
+ return mState;
68
+ }
69
+
70
+ float nsBig5Prober::GetConfidence(void)
71
+ {
72
+ float distribCf = mDistributionAnalyser.GetConfidence();
73
+
74
+ return (float)distribCf;
75
+ }
76
+
@@ -0,0 +1,53 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsBig5Prober_h__
22
+ #define nsBig5Prober_h__
23
+
24
+ #include "charsetProber.h"
25
+ #include "codingStateMachine.h"
26
+ #include "charDistribution.h"
27
+
28
+ class nsBig5Prober: public nsCharSetProber {
29
+ public:
30
+ nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
31
+ Reset();};
32
+ virtual ~nsBig5Prober(void) {delete mCodingSM;};
33
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
34
+ const char* GetCharSetName() {return "Big5";};
35
+ nsProbingState GetState(void) {return mState;};
36
+ void Reset(void);
37
+ float GetConfidence(void);
38
+ void SetOpion() {};
39
+
40
+ protected:
41
+ void GetDistribution(PRUint32 aCharLen, const char* aStr);
42
+
43
+ nsCodingStateMachine* mCodingSM;
44
+ nsProbingState mState;
45
+
46
+ //Big5ContextAnalysis mContextAnalyser;
47
+ Big5DistributionAnalysis mDistributionAnalyser;
48
+ char mLastChar[2];
49
+
50
+ };
51
+
52
+
53
+ #endif /* nsBig5Prober_h__ */
@@ -0,0 +1,90 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "charDistribution.h"
22
+
23
+ #include "JISFreq.tab"
24
+ #include "Big5Freq.tab"
25
+ #include "EUCKRFreq.tab"
26
+ #include "EUCTWFreq.tab"
27
+ #include "GB2312Freq.tab"
28
+
29
+ #define SURE_YES 0.99f
30
+ #define SURE_NO 0.01f
31
+
32
+ //return confidence base on received data
33
+ float CharDistributionAnalysis::GetConfidence()
34
+ {
35
+ //if we didn't receive any character in our consideration range, return negative answer
36
+ if (mTotalChars <= 0)
37
+ return SURE_NO;
38
+
39
+ if (mTotalChars != mFreqChars) {
40
+ float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
41
+
42
+ if (r < SURE_YES)
43
+ return r;
44
+ }
45
+ //normalize confidence, (we don't want to be 100% sure)
46
+ return SURE_YES;
47
+ }
48
+
49
+ EUCTWDistributionAnalysis::EUCTWDistributionAnalysis()
50
+ {
51
+ mCharToFreqOrder = EUCTWCharToFreqOrder;
52
+ mTableSize = EUCTW_TABLE_SIZE;
53
+ mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO;
54
+ };
55
+
56
+ EUCKRDistributionAnalysis::EUCKRDistributionAnalysis()
57
+ {
58
+ mCharToFreqOrder = EUCKRCharToFreqOrder;
59
+ mTableSize = EUCKR_TABLE_SIZE;
60
+ mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
61
+ };
62
+
63
+ GB2312DistributionAnalysis::GB2312DistributionAnalysis()
64
+ {
65
+ mCharToFreqOrder = GB2312CharToFreqOrder;
66
+ mTableSize = GB2312_TABLE_SIZE;
67
+ mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
68
+ };
69
+
70
+ Big5DistributionAnalysis::Big5DistributionAnalysis()
71
+ {
72
+ mCharToFreqOrder = Big5CharToFreqOrder;
73
+ mTableSize = BIG5_TABLE_SIZE;
74
+ mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
75
+ };
76
+
77
+ SJISDistributionAnalysis::SJISDistributionAnalysis()
78
+ {
79
+ mCharToFreqOrder = JISCharToFreqOrder;
80
+ mTableSize = JIS_TABLE_SIZE;
81
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
82
+ };
83
+
84
+ EUCJPDistributionAnalysis::EUCJPDistributionAnalysis()
85
+ {
86
+ mCharToFreqOrder = JISCharToFreqOrder;
87
+ mTableSize = JIS_TABLE_SIZE;
88
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
89
+ };
90
+
@@ -0,0 +1,219 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef CharDistribution_h__
22
+ #define CharDistribution_h__
23
+
24
+ #include "types.h"
25
+
26
+ #define ENOUGH_DATA_THRESHOLD 1024
27
+
28
+ class CharDistributionAnalysis
29
+ {
30
+ public:
31
+ CharDistributionAnalysis() {Reset();};
32
+
33
+ //feed a block of data and do distribution analysis
34
+ void HandleData(const char* aBuf, PRUint32 aLen) {};
35
+
36
+ //Feed a character with known length
37
+ void HandleOneChar(const char* aStr, PRUint32 aCharLen)
38
+ {
39
+ PRInt32 order;
40
+
41
+ //we only care about 2-bytes character in our distribution analysis
42
+ order = (aCharLen == 2) ? GetOrder(aStr) : -1;
43
+
44
+ if (order >= 0)
45
+ {
46
+ mTotalChars++;
47
+ //order is valid
48
+ if ((PRUint32)order < mTableSize)
49
+ {
50
+ if (512 > mCharToFreqOrder[order])
51
+ mFreqChars++;
52
+ }
53
+ }
54
+ };
55
+
56
+ //return confidence base on existing data
57
+ float GetConfidence();
58
+
59
+ //Reset analyser, clear any state
60
+ void Reset(void)
61
+ {
62
+ mDone = PR_FALSE;
63
+ mTotalChars = 0;
64
+ mFreqChars = 0;
65
+ };
66
+
67
+ //This function is for future extension. Caller can use this function to control
68
+ //analyser's behavior
69
+ void SetOpion(){};
70
+
71
+ //It is not necessary to receive all data to draw conclusion. For charset detection,
72
+ // certain amount of data is enough
73
+ PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
74
+
75
+ protected:
76
+ //we do not handle character base on its original encoding string, but
77
+ //convert this encoding string to a number, here called order.
78
+ //This allow multiple encoding of a language to share one frequency table
79
+ virtual PRInt32 GetOrder(const char* str) {return -1;};
80
+
81
+ //If this flag is set to PR_TRUE, detection is done and conclusion has been made
82
+ PRBool mDone;
83
+
84
+ //The number of characters whose frequency order is less than 512
85
+ PRUint32 mFreqChars;
86
+
87
+ //Total character encounted.
88
+ PRUint32 mTotalChars;
89
+
90
+ //Mapping table to get frequency order from char order (get from GetOrder())
91
+ const PRInt16 *mCharToFreqOrder;
92
+
93
+ //Size of above table
94
+ PRUint32 mTableSize;
95
+
96
+ //This is a constant value varies from language to language, it is used in
97
+ //calculating confidence. See my paper for further detail.
98
+ float mTypicalDistributionRatio;
99
+ };
100
+
101
+
102
+ class EUCTWDistributionAnalysis: public CharDistributionAnalysis
103
+ {
104
+ public:
105
+ EUCTWDistributionAnalysis();
106
+ protected:
107
+
108
+ //for euc-TW encoding, we are interested
109
+ // first byte range: 0xc4 -- 0xfe
110
+ // second byte range: 0xa1 -- 0xfe
111
+ //no validation needed here. State machine has done that
112
+ PRInt32 GetOrder(const char* str)
113
+ { if ((unsigned char)*str >= (unsigned char)0xc4)
114
+ return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
115
+ else
116
+ return -1;
117
+ };
118
+ };
119
+
120
+
121
+ class EUCKRDistributionAnalysis : public CharDistributionAnalysis
122
+ {
123
+ public:
124
+ EUCKRDistributionAnalysis();
125
+ protected:
126
+ //for euc-KR encoding, we are interested
127
+ // first byte range: 0xb0 -- 0xfe
128
+ // second byte range: 0xa1 -- 0xfe
129
+ //no validation needed here. State machine has done that
130
+ PRInt32 GetOrder(const char* str)
131
+ { if ((unsigned char)*str >= (unsigned char)0xb0)
132
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
133
+ else
134
+ return -1;
135
+ };
136
+ };
137
+
138
+ class GB2312DistributionAnalysis : public CharDistributionAnalysis
139
+ {
140
+ public:
141
+ GB2312DistributionAnalysis();
142
+ protected:
143
+ //for GB2312 encoding, we are interested
144
+ // first byte range: 0xb0 -- 0xfe
145
+ // second byte range: 0xa1 -- 0xfe
146
+ //no validation needed here. State machine has done that
147
+ PRInt32 GetOrder(const char* str)
148
+ { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
149
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
150
+ else
151
+ return -1;
152
+ };
153
+ };
154
+
155
+
156
+ class Big5DistributionAnalysis : public CharDistributionAnalysis
157
+ {
158
+ public:
159
+ Big5DistributionAnalysis();
160
+ protected:
161
+ //for big5 encoding, we are interested
162
+ // first byte range: 0xa4 -- 0xfe
163
+ // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
164
+ //no validation needed here. State machine has done that
165
+ PRInt32 GetOrder(const char* str)
166
+ { if ((unsigned char)*str >= (unsigned char)0xa4)
167
+ if ((unsigned char)str[1] >= (unsigned char)0xa1)
168
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
169
+ else
170
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
171
+ else
172
+ return -1;
173
+ };
174
+ };
175
+
176
+ class SJISDistributionAnalysis : public CharDistributionAnalysis
177
+ {
178
+ public:
179
+ SJISDistributionAnalysis();
180
+ protected:
181
+ //for sjis encoding, we are interested
182
+ // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
183
+ // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
184
+ //no validation needed here. State machine has done that
185
+ PRInt32 GetOrder(const char* str)
186
+ {
187
+ PRInt32 order;
188
+ if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
189
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
190
+ else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
191
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
192
+ else
193
+ return -1;
194
+ order += (unsigned char)*(str+1) - 0x40;
195
+ if ((unsigned char)str[1] > (unsigned char)0x7f)
196
+ order--;
197
+ return order;
198
+ };
199
+ };
200
+
201
+ class EUCJPDistributionAnalysis : public CharDistributionAnalysis
202
+ {
203
+ public:
204
+ EUCJPDistributionAnalysis();
205
+ protected:
206
+ //for euc-JP encoding, we are interested
207
+ // first byte range: 0xa0 -- 0xfe
208
+ // second byte range: 0xa1 -- 0xfe
209
+ //no validation needed here. State machine has done that
210
+ PRInt32 GetOrder(const char* str)
211
+ { if ((unsigned char)*str >= (unsigned char)0xa0)
212
+ return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
213
+ else
214
+ return -1;
215
+ };
216
+ };
217
+
218
+ #endif //CharDistribution_h__
219
+
@@ -0,0 +1,56 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "universal.h"
22
+ #include "string.h"
23
+
24
+ nsUniversalDetector* det;
25
+
26
+ extern "C" int CharGuessInit(void)
27
+ {
28
+ det = new nsUniversalDetector;
29
+ if (det)
30
+ return 0;
31
+ else
32
+ return -1;
33
+ }
34
+
35
+ extern "C" const char* GuessChardet(const char *str)
36
+ {
37
+ const char* tmp;
38
+
39
+ if (!det)
40
+ {
41
+ CharGuessInit();
42
+ return NULL;
43
+ }
44
+ det->Reset();
45
+ det->HandleData(str, strlen(str));
46
+ det->DataEnd();
47
+ tmp = det->GetCharset();
48
+
49
+ return tmp;
50
+ }
51
+
52
+ extern "C" int CharGuessDone(void)
53
+ {
54
+ if (det)
55
+ delete det;
56
+ }
@@ -0,0 +1,23 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
#ifndef CHARGUESS_H
#define CHARGUESS_H

/* The functions are defined with C linkage, so C++ consumers must see them
   declared that way too. */
#ifdef __cplusplus
extern "C" {
#endif

/* Creates the detector used by GuessChardet().
   Returns 0 on success and -1 on failure. */
int CharGuessInit(void);

/* Guesses the charset of the NUL-terminated string str.
   May return NULL, so callers must check the result before using it. */
const char* GuessChardet(const char *str);

/* Destroys the detector created by CharGuessInit(). */
int CharGuessDone(void);

#ifdef __cplusplus
}
#endif

#endif /* CHARGUESS_H */