charguess 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,153 @@
1
#!/bin/sh
# Run this to generate all the initial makefiles, etc.
#
# Checks that the autotools needed to bootstrap the tree are available, runs
# them in every directory under this script's directory that contains a
# configure.in (unless that directory holds a NO-AUTO-GEN file), and then runs
# configure with the arguments given on the command line unless NOCONFIGURE
# is set in the environment.

srcdir=`dirname "$0"`
PKG_NAME="the package."

# Set to 1 by any failed prerequisite check; tested once all checks have run.
DIE=0

#(grep "^:ext:" $srcdir/CVS/Root >/dev/null) || {
#  echo Use configure, autogen.sh is for developers only!
#  DIE=1
#}

(autoconf --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`autoconf' installed to compile $PKG_NAME"
  echo "Download the appropriate package for your distribution,"
  echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/"
  DIE=1
}

# libtool is only checked for when configure.in uses AM_PROG_LIBTOOL.
(grep "^AM_PROG_LIBTOOL" "$srcdir/configure.in" >/dev/null) && {
  (libtool --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`libtool' installed."
    echo "Get ftp://ftp.gnu.org/pub/gnu/libtool-1.2d.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

# gettext is only checked for when configure.in uses AM_GNU_GETTEXT and does
# not contain the "sed ... POTFILES" rule.
grep "^AM_GNU_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

# Same check for the GNOME flavour of the gettext macro.
grep "^AM_GNOME_GETTEXT" "$srcdir/configure.in" >/dev/null && {
  grep "sed.*POTFILES" "$srcdir/configure.in" >/dev/null || \
  (gettext --version) < /dev/null > /dev/null 2>&1 || {
    echo
    echo "**Error**: You must have \`gettext' installed."
    echo "Get ftp://alpha.gnu.org/gnu/gettext-0.10.35.tar.gz"
    echo "(or a newer version if it is available)"
    DIE=1
  }
}

(automake --version --copy) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: You must have \`automake' installed."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
  NO_AUTOMAKE=yes
}


# if no automake, don't bother testing for aclocal
test -n "$NO_AUTOMAKE" || (aclocal --version) < /dev/null > /dev/null 2>&1 || {
  echo
  echo "**Error**: Missing \`aclocal'.  The version of \`automake'"
  echo "installed doesn't appear recent enough."
  echo "Get ftp://ftp.gnu.org/pub/gnu/automake-1.3.tar.gz"
  echo "(or a newer version if it is available)"
  DIE=1
}

# Stop only after every check has run, so all missing tools are reported at once.
if test "$DIE" -eq 1; then
  exit 1
fi

if test -z "$*"; then
  echo "**Warning**: I am going to run \`configure' with no arguments."
  echo "If you wish to pass any to it, please specify them on the"
  echo "\`$0' command line."
  echo
fi

# Extra automake option used only when CC is exactly "xlc".
case $CC in
xlc )
  am_opt=--include-deps;;
esac

# The list of configure.in files is still split on whitespace, as before.
for coin in `find "$srcdir" -name configure.in -print`
do
  dr=`dirname "$coin"`
  if test -f "$dr/NO-AUTO-GEN"; then
    echo skipping $dr -- flagged as no auto-gen
  else
    echo processing $dr
    macrodirs=`sed -n -e 's,AM_ACLOCAL_INCLUDE(\(.*\)),\1,gp' < "$coin"`
    # Run the tools in a subshell so the directory change does not leak out;
    # give up on this directory if it cannot be entered.
    ( cd "$dr" || exit 1
      aclocalinclude="$ACLOCAL_FLAGS"
      for k in $macrodirs; do
        if test -d "$k"; then
          aclocalinclude="$aclocalinclude -I $k"
        ##else
        ##  echo "**Warning**: No such directory \`$k'.  Ignored."
        fi
      done
      if grep "^AM_GNU_GETTEXT" configure.in >/dev/null; then
        if grep "sed.*POTFILES" configure.in >/dev/null; then
          : do nothing -- we still have an old unmodified configure.in
        else
          echo "Creating $dr/aclocal.m4 ..."
          test -r "$dr/aclocal.m4" || touch "$dr/aclocal.m4"
          echo "Running gettextize...  Ignore non-fatal messages."
          echo "no" | gettextize --force --copy
          echo "Making $dr/aclocal.m4 writable ..."
          test -r "$dr/aclocal.m4" && chmod u+w "$dr/aclocal.m4"
        fi
      fi
      if grep "^AM_GNOME_GETTEXT" configure.in >/dev/null; then
        echo "Creating $dr/aclocal.m4 ..."
        test -r "$dr/aclocal.m4" || touch "$dr/aclocal.m4"
        echo "Running gettextize...  Ignore non-fatal messages."
        echo "no" | gettextize --force --copy
        echo "Making $dr/aclocal.m4 writable ..."
        test -r "$dr/aclocal.m4" && chmod u+w "$dr/aclocal.m4"
      fi
      if grep "^AM_PROG_LIBTOOL" configure.in >/dev/null; then
        echo "Running libtoolize..."
        libtoolize --force --copy
      fi
      echo "Running aclocal $aclocalinclude ..."
      aclocal $aclocalinclude
      if grep "^AM_CONFIG_HEADER" configure.in >/dev/null; then
        echo "Running autoheader..."
        autoheader
      fi
      echo "Running automake --gnu $am_opt ..."
      automake --add-missing --gnu $am_opt
      echo "Running autoconf ..."
      autoconf
    )
  fi
done

#conf_flags="--enable-maintainer-mode --enable-compile-warnings" #--enable-iso-c

# configure is skipped when NOCONFIGURE is set to any non-empty value.
if test "x$NOCONFIGURE" = x; then
  echo Running $srcdir/configure $conf_flags "$@" ...
  "$srcdir/configure" $conf_flags "$@" \
  && echo Now type \`make\' to compile $PKG_NAME
else
  echo Skipping configure process.
fi
@@ -0,0 +1,76 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "big5Prober.h"
22
+
23
+ void nsBig5Prober::Reset(void)
24
+ {
25
+ mCodingSM->Reset();
26
+ mState = eDetecting;
27
+ mDistributionAnalyser.Reset();
28
+ }
29
+
30
+ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
31
+ {
32
+ nsSMState codingState;
33
+
34
+ for (PRUint32 i = 0; i < aLen; i++)
35
+ {
36
+ codingState = mCodingSM->NextState(aBuf[i]);
37
+ if (codingState == eError)
38
+ {
39
+ mState = eNotMe;
40
+ break;
41
+ }
42
+ if (codingState == eItsMe)
43
+ {
44
+ mState = eFoundIt;
45
+ break;
46
+ }
47
+ if (codingState == eStart)
48
+ {
49
+ PRUint32 charLen = mCodingSM->GetCurrentCharLen();
50
+
51
+ if (i == 0)
52
+ {
53
+ mLastChar[1] = aBuf[0];
54
+ mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
55
+ }
56
+ else
57
+ mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
58
+ }
59
+ }
60
+
61
+ mLastChar[0] = aBuf[aLen-1];
62
+
63
+ if (mState == eDetecting)
64
+ if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
65
+ mState = eFoundIt;
66
+
67
+ return mState;
68
+ }
69
+
70
+ float nsBig5Prober::GetConfidence(void)
71
+ {
72
+ float distribCf = mDistributionAnalyser.GetConfidence();
73
+
74
+ return (float)distribCf;
75
+ }
76
+
@@ -0,0 +1,53 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsBig5Prober_h__
22
+ #define nsBig5Prober_h__
23
+
24
+ #include "charsetProber.h"
25
+ #include "codingStateMachine.h"
26
+ #include "charDistribution.h"
27
+
28
+ class nsBig5Prober: public nsCharSetProber {
29
+ public:
30
+ nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
31
+ Reset();};
32
+ virtual ~nsBig5Prober(void) {delete mCodingSM;};
33
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
34
+ const char* GetCharSetName() {return "Big5";};
35
+ nsProbingState GetState(void) {return mState;};
36
+ void Reset(void);
37
+ float GetConfidence(void);
38
+ void SetOpion() {};
39
+
40
+ protected:
41
+ void GetDistribution(PRUint32 aCharLen, const char* aStr);
42
+
43
+ nsCodingStateMachine* mCodingSM;
44
+ nsProbingState mState;
45
+
46
+ //Big5ContextAnalysis mContextAnalyser;
47
+ Big5DistributionAnalysis mDistributionAnalyser;
48
+ char mLastChar[2];
49
+
50
+ };
51
+
52
+
53
+ #endif /* nsBig5Prober_h__ */
@@ -0,0 +1,90 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "charDistribution.h"
22
+
23
+ #include "JISFreq.tab"
24
+ #include "Big5Freq.tab"
25
+ #include "EUCKRFreq.tab"
26
+ #include "EUCTWFreq.tab"
27
+ #include "GB2312Freq.tab"
28
+
29
+ #define SURE_YES 0.99f
30
+ #define SURE_NO 0.01f
31
+
32
+ //return confidence base on received data
33
+ float CharDistributionAnalysis::GetConfidence()
34
+ {
35
+ //if we didn't receive any character in our consideration range, return negative answer
36
+ if (mTotalChars <= 0)
37
+ return SURE_NO;
38
+
39
+ if (mTotalChars != mFreqChars) {
40
+ float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
41
+
42
+ if (r < SURE_YES)
43
+ return r;
44
+ }
45
+ //normalize confidence, (we don't want to be 100% sure)
46
+ return SURE_YES;
47
+ }
48
+
49
+ EUCTWDistributionAnalysis::EUCTWDistributionAnalysis()
50
+ {
51
+ mCharToFreqOrder = EUCTWCharToFreqOrder;
52
+ mTableSize = EUCTW_TABLE_SIZE;
53
+ mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO;
54
+ };
55
+
56
+ EUCKRDistributionAnalysis::EUCKRDistributionAnalysis()
57
+ {
58
+ mCharToFreqOrder = EUCKRCharToFreqOrder;
59
+ mTableSize = EUCKR_TABLE_SIZE;
60
+ mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
61
+ };
62
+
63
+ GB2312DistributionAnalysis::GB2312DistributionAnalysis()
64
+ {
65
+ mCharToFreqOrder = GB2312CharToFreqOrder;
66
+ mTableSize = GB2312_TABLE_SIZE;
67
+ mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
68
+ };
69
+
70
+ Big5DistributionAnalysis::Big5DistributionAnalysis()
71
+ {
72
+ mCharToFreqOrder = Big5CharToFreqOrder;
73
+ mTableSize = BIG5_TABLE_SIZE;
74
+ mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
75
+ };
76
+
77
+ SJISDistributionAnalysis::SJISDistributionAnalysis()
78
+ {
79
+ mCharToFreqOrder = JISCharToFreqOrder;
80
+ mTableSize = JIS_TABLE_SIZE;
81
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
82
+ };
83
+
84
+ EUCJPDistributionAnalysis::EUCJPDistributionAnalysis()
85
+ {
86
+ mCharToFreqOrder = JISCharToFreqOrder;
87
+ mTableSize = JIS_TABLE_SIZE;
88
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
89
+ };
90
+
@@ -0,0 +1,219 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef CharDistribution_h__
22
+ #define CharDistribution_h__
23
+
24
+ #include "types.h"
25
+
26
+ #define ENOUGH_DATA_THRESHOLD 1024
27
+
28
+ class CharDistributionAnalysis
29
+ {
30
+ public:
31
+ CharDistributionAnalysis() {Reset();};
32
+
33
+ //feed a block of data and do distribution analysis
34
+ void HandleData(const char* aBuf, PRUint32 aLen) {};
35
+
36
+ //Feed a character with known length
37
+ void HandleOneChar(const char* aStr, PRUint32 aCharLen)
38
+ {
39
+ PRInt32 order;
40
+
41
+ //we only care about 2-bytes character in our distribution analysis
42
+ order = (aCharLen == 2) ? GetOrder(aStr) : -1;
43
+
44
+ if (order >= 0)
45
+ {
46
+ mTotalChars++;
47
+ //order is valid
48
+ if ((PRUint32)order < mTableSize)
49
+ {
50
+ if (512 > mCharToFreqOrder[order])
51
+ mFreqChars++;
52
+ }
53
+ }
54
+ };
55
+
56
+ //return confidence base on existing data
57
+ float GetConfidence();
58
+
59
+ //Reset analyser, clear any state
60
+ void Reset(void)
61
+ {
62
+ mDone = PR_FALSE;
63
+ mTotalChars = 0;
64
+ mFreqChars = 0;
65
+ };
66
+
67
+ //This function is for future extension. Caller can use this function to control
68
+ //analyser's behavior
69
+ void SetOpion(){};
70
+
71
+ //It is not necessary to receive all data to draw conclusion. For charset detection,
72
+ // certain amount of data is enough
73
+ PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
74
+
75
+ protected:
76
+ //we do not handle character base on its original encoding string, but
77
+ //convert this encoding string to a number, here called order.
78
+ //This allow multiple encoding of a language to share one frequency table
79
+ virtual PRInt32 GetOrder(const char* str) {return -1;};
80
+
81
+ //If this flag is set to PR_TRUE, detection is done and conclusion has been made
82
+ PRBool mDone;
83
+
84
+ //The number of characters whose frequency order is less than 512
85
+ PRUint32 mFreqChars;
86
+
87
+ //Total character encounted.
88
+ PRUint32 mTotalChars;
89
+
90
+ //Mapping table to get frequency order from char order (get from GetOrder())
91
+ const PRInt16 *mCharToFreqOrder;
92
+
93
+ //Size of above table
94
+ PRUint32 mTableSize;
95
+
96
+ //This is a constant value varies from language to language, it is used in
97
+ //calculating confidence. See my paper for further detail.
98
+ float mTypicalDistributionRatio;
99
+ };
100
+
101
+
102
+ class EUCTWDistributionAnalysis: public CharDistributionAnalysis
103
+ {
104
+ public:
105
+ EUCTWDistributionAnalysis();
106
+ protected:
107
+
108
+ //for euc-TW encoding, we are interested
109
+ // first byte range: 0xc4 -- 0xfe
110
+ // second byte range: 0xa1 -- 0xfe
111
+ //no validation needed here. State machine has done that
112
+ PRInt32 GetOrder(const char* str)
113
+ { if ((unsigned char)*str >= (unsigned char)0xc4)
114
+ return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
115
+ else
116
+ return -1;
117
+ };
118
+ };
119
+
120
+
121
+ class EUCKRDistributionAnalysis : public CharDistributionAnalysis
122
+ {
123
+ public:
124
+ EUCKRDistributionAnalysis();
125
+ protected:
126
+ //for euc-KR encoding, we are interested
127
+ // first byte range: 0xb0 -- 0xfe
128
+ // second byte range: 0xa1 -- 0xfe
129
+ //no validation needed here. State machine has done that
130
+ PRInt32 GetOrder(const char* str)
131
+ { if ((unsigned char)*str >= (unsigned char)0xb0)
132
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
133
+ else
134
+ return -1;
135
+ };
136
+ };
137
+
138
+ class GB2312DistributionAnalysis : public CharDistributionAnalysis
139
+ {
140
+ public:
141
+ GB2312DistributionAnalysis();
142
+ protected:
143
+ //for GB2312 encoding, we are interested
144
+ // first byte range: 0xb0 -- 0xfe
145
+ // second byte range: 0xa1 -- 0xfe
146
+ //no validation needed here. State machine has done that
147
+ PRInt32 GetOrder(const char* str)
148
+ { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
149
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
150
+ else
151
+ return -1;
152
+ };
153
+ };
154
+
155
+
156
+ class Big5DistributionAnalysis : public CharDistributionAnalysis
157
+ {
158
+ public:
159
+ Big5DistributionAnalysis();
160
+ protected:
161
+ //for big5 encoding, we are interested
162
+ // first byte range: 0xa4 -- 0xfe
163
+ // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
164
+ //no validation needed here. State machine has done that
165
+ PRInt32 GetOrder(const char* str)
166
+ { if ((unsigned char)*str >= (unsigned char)0xa4)
167
+ if ((unsigned char)str[1] >= (unsigned char)0xa1)
168
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
169
+ else
170
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
171
+ else
172
+ return -1;
173
+ };
174
+ };
175
+
176
+ class SJISDistributionAnalysis : public CharDistributionAnalysis
177
+ {
178
+ public:
179
+ SJISDistributionAnalysis();
180
+ protected:
181
+ //for sjis encoding, we are interested
182
+ // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
183
+ // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
184
+ //no validation needed here. State machine has done that
185
+ PRInt32 GetOrder(const char* str)
186
+ {
187
+ PRInt32 order;
188
+ if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
189
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
190
+ else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
191
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
192
+ else
193
+ return -1;
194
+ order += (unsigned char)*(str+1) - 0x40;
195
+ if ((unsigned char)str[1] > (unsigned char)0x7f)
196
+ order--;
197
+ return order;
198
+ };
199
+ };
200
+
201
+ class EUCJPDistributionAnalysis : public CharDistributionAnalysis
202
+ {
203
+ public:
204
+ EUCJPDistributionAnalysis();
205
+ protected:
206
+ //for euc-JP encoding, we are interested
207
+ // first byte range: 0xa0 -- 0xfe
208
+ // second byte range: 0xa1 -- 0xfe
209
+ //no validation needed here. State machine has done that
210
+ PRInt32 GetOrder(const char* str)
211
+ { if ((unsigned char)*str >= (unsigned char)0xa0)
212
+ return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
213
+ else
214
+ return -1;
215
+ };
216
+ };
217
+
218
+ #endif //CharDistribution_h__
219
+
@@ -0,0 +1,56 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "universal.h"
22
+ #include "string.h"
23
+
24
+ nsUniversalDetector* det;
25
+
26
+ extern "C" int CharGuessInit(void)
27
+ {
28
+ det = new nsUniversalDetector;
29
+ if (det)
30
+ return 0;
31
+ else
32
+ return -1;
33
+ }
34
+
35
+ extern "C" const char* GuessChardet(const char *str)
36
+ {
37
+ const char* tmp;
38
+
39
+ if (!det)
40
+ {
41
+ CharGuessInit();
42
+ return NULL;
43
+ }
44
+ det->Reset();
45
+ det->HandleData(str, strlen(str));
46
+ det->DataEnd();
47
+ tmp = det->GetCharset();
48
+
49
+ return tmp;
50
+ }
51
+
52
+ extern "C" int CharGuessDone(void)
53
+ {
54
+ if (det)
55
+ delete det;
56
+ }
@@ -0,0 +1,23 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
#ifndef CHARGUESS_H
#define CHARGUESS_H

/* The functions are defined with C linkage, so declare them that way when
   this header is included from C++. */
#ifdef __cplusplus
extern "C" {
#endif

/* Creates the detector used by GuessChardet().
   Returns 0 on success, -1 on failure. */
int CharGuessInit(void);

/* Guesses the charset of the NUL-terminated string str.
   Returns the charset name reported by the detector; may return NULL.
   Call CharGuessInit() first.
   NOTE(review): the returned string is presumably owned by the library and
   must not be freed by the caller - confirm. */
const char* GuessChardet(const char *str);

/* Releases the detector created by CharGuessInit(). */
int CharGuessDone(void);

#ifdef __cplusplus
}
#endif

#endif /* CHARGUESS_H */