charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "Latin1Prober.h"
|
|
22
|
+
#include "types.h"
|
|
23
|
+
|
|
24
|
+
// Character classes assigned to each byte value by the Latin-1 prober.
#define UDF    0        // undefined
#define OTH    1        // other
#define ASC    2        // ascii capital letter
#define ASS    3        // ascii small letter
#define ACV    4        // accent capital vowel
#define ACO    5        // accent capital other
#define ASV    6        // accent small vowel
#define ASO    7        // accent small other
#define CLASS_NUM   8    // total classes

// Maps every byte value (0x00 - 0xFF) to one of the classes above.
// This is pure lookup data that is only ever read, so it is declared const.
static const unsigned char Latin1_CharToClass[] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
};
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
/* Likelihood of a character class (column) directly following another
   character class (row).  The table is flattened row by row, so an entry
   is looked up as  row * 8 + column.

   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely

   The data is read-only, hence const.
*/
static const unsigned char Latin1ClassModel[] =
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
|
|
88
|
+
|
|
89
|
+
void nsLatin1Prober::Reset(void)
|
|
90
|
+
{
|
|
91
|
+
mState = eDetecting;
|
|
92
|
+
mLastCharClass = OTH;
|
|
93
|
+
for (int i = 0; i < FREQ_CAT_NUM; i++)
|
|
94
|
+
mFreqCounter[i] = 0;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Copies aBuf into a freshly allocated buffer, keeping only runs of ASCII
// letters and high-bit bytes.  Every other 7-bit byte acts as a delimiter:
// a delimiter that ends a non-empty run is replaced by a single space, and
// delimiters that follow another delimiter are dropped.
//
//   aBuf, aLen  input bytes (not modified)
//   newBuf      receives the PR_MALLOC'ed result; the caller releases it
//   newLen      receives the number of bytes written to *newBuf
//
// Returns PR_FALSE when the allocation fails (newLen is then left untouched),
// PR_TRUE otherwise.  The output never exceeds aLen bytes because each kept
// byte and each inserted space corresponds to one distinct input byte.
PRBool nsLatin1Prober::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  //do filtering to reduce load to probers
  char *newptr;
  const char *prevPtr, *curPtr;
  // Tag tracking ('<' ... '>') is disabled in this version, so isInTag
  // stays PR_FALSE for the whole scan.
  PRBool isInTag = PR_FALSE;

  newptr = *newBuf = (char*)PR_MALLOC(aLen);
  if (!newptr)
    return PR_FALSE;

  for (curPtr = prevPtr = aBuf; curPtr < aBuf+aLen; curPtr++)
  {
    if (!(*curPtr & 0x80) &&
        (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    {
      if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
                                        // and it is not inside a tag, keep it
      {
        while (prevPtr < curPtr) *newptr++ = *prevPtr++;
        prevPtr++;            // skip the delimiter itself
        *newptr++ = ' ';
      }
      else
        prevPtr = curPtr+1;
    }
  }

  // Keep the final run as well: when the input does not end with a
  // delimiter, the loop above leaves [prevPtr, curPtr) uncopied and those
  // trailing bytes would otherwise be lost.
  if (!isInTag)
    while (prevPtr < curPtr) *newptr++ = *prevPtr++;

  newLen = newptr - *newBuf;

  return PR_TRUE;
}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
// Feeds a chunk of bytes to the prober.
//
// The input is first passed through FilterWithEnglishLetters(); if that
// fails, the raw input is analysed instead.  Each byte is mapped to its
// character class, and the likelihood of that class following the previous
// one is looked up in Latin1ClassModel.  A likelihood of 0 (illegal) sets the
// state to eNotMe and stops the scan; otherwise the matching frequency
// counter is incremented.  Returns the resulting state.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *filtered;
  PRUint32 filteredLen;

  if (!FilterWithEnglishLetters(aBuf, aLen, &filtered, filteredLen))
  {
    // Filtering failed: analyse the caller's buffer directly.
    filtered = (char*)aBuf;
    filteredLen = aLen;
  }

  PRUint32 pos = 0;
  while (pos < filteredLen)
  {
    unsigned char curClass = Latin1_CharToClass[(unsigned char)filtered[pos]];
    unsigned char likelihood = Latin1ClassModel[mLastCharClass*CLASS_NUM + curClass];
    if (likelihood == 0)
    {
      mState = eNotMe;
      break;
    }
    mFreqCounter[likelihood]++;
    mLastCharClass = curClass;
    pos++;
  }

  // Only release the buffer when it is the filtered copy, never the
  // caller's own data.
  if (filtered != aBuf)
  {
    PR_FREEIF(filtered);
  }

  return mState;
}
|
|
167
|
+
|
|
168
|
+
float nsLatin1Prober::GetConfidence(void)
|
|
169
|
+
{
|
|
170
|
+
if (mState == eNotMe)
|
|
171
|
+
return 0.01f;
|
|
172
|
+
|
|
173
|
+
float confidence;
|
|
174
|
+
PRUint32 total = 0;
|
|
175
|
+
for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
|
|
176
|
+
total += mFreqCounter[i];
|
|
177
|
+
|
|
178
|
+
confidence = mFreqCounter[3]*1.0f / total;
|
|
179
|
+
confidence -= mFreqCounter[1]*20.0f/total;
|
|
180
|
+
|
|
181
|
+
if (confidence < 0.0f)
|
|
182
|
+
confidence = 0.0f;
|
|
183
|
+
|
|
184
|
+
// lower the confidence of latin1 so that other more accurate detector
|
|
185
|
+
// can take priority.
|
|
186
|
+
confidence *= 0.50f;
|
|
187
|
+
|
|
188
|
+
return confidence;
|
|
189
|
+
}
|
|
190
|
+
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsLatin1Prober_h__
|
|
22
|
+
#define nsLatin1Prober_h__
|
|
23
|
+
|
|
24
|
+
#include "charsetProber.h"
|
|
25
|
+
|
|
26
|
+
#define FREQ_CAT_NUM 4
|
|
27
|
+
|
|
28
|
+
class nsLatin1Prober: public nsCharSetProber {
|
|
29
|
+
public:
|
|
30
|
+
nsLatin1Prober(void){Reset();};
|
|
31
|
+
virtual ~nsLatin1Prober(void){};
|
|
32
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
33
|
+
const char* GetCharSetName() {return "windows-1252";};
|
|
34
|
+
nsProbingState GetState(void) {return mState;};
|
|
35
|
+
void Reset(void);
|
|
36
|
+
float GetConfidence(void);
|
|
37
|
+
void SetOpion() {};
|
|
38
|
+
|
|
39
|
+
protected:
|
|
40
|
+
PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
|
41
|
+
|
|
42
|
+
nsProbingState mState;
|
|
43
|
+
char mLastCharClass;
|
|
44
|
+
PRUint32 mFreqCounter[FREQ_CAT_NUM];
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
#endif /* nsLatin1Prober_h__ */
|
|
49
|
+
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// #include <stdio.h>
|
|
22
|
+
// #include "prmem.h"
|
|
23
|
+
#include "types.h"
|
|
24
|
+
|
|
25
|
+
#include "MBCSGroupProber.h"
|
|
26
|
+
|
|
27
|
+
#ifdef DEBUG_chardet
|
|
28
|
+
// Display names for the probers, used by the debug status dump.  The entries
// point at string literals, so the element type is pointer-to-const.
const char *ProberName[] =
{
  "UTF8",
  "SJIS",
  "EUCJP",
  "GB18030",
  "EUCKR",
  "Big5",
  "EUCTW",
};
|
|
38
|
+
|
|
39
|
+
#endif
|
|
40
|
+
|
|
41
|
+
// Creates one prober per supported multi-byte encoding, filling slots 0..6
// of mProbers in a fixed order, then resets the whole group.
nsMBCSGroupProber::nsMBCSGroupProber()
{
  PRUint32 slot = 0;

  mProbers[slot++] = new nsUTF8Prober();     // slot 0
  mProbers[slot++] = new nsSJISProber();     // slot 1
  mProbers[slot++] = new nsEUCJPProber();    // slot 2
  mProbers[slot++] = new nsGB18030Prober();  // slot 3
  mProbers[slot++] = new nsEUCKRProber();    // slot 4
  mProbers[slot++] = new nsBig5Prober();     // slot 5
  mProbers[slot++] = new nsEUCTWProber();    // slot 6

  Reset();
}
|
|
52
|
+
|
|
53
|
+
// Destroys every prober created by the constructor, in slot order.
nsMBCSGroupProber::~nsMBCSGroupProber()
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_PROBERS)
  {
    delete mProbers[slot];
    ++slot;
  }
}
|
|
60
|
+
|
|
61
|
+
const char* nsMBCSGroupProber::GetCharSetName()
|
|
62
|
+
{
|
|
63
|
+
if (mBestGuess == -1)
|
|
64
|
+
{
|
|
65
|
+
GetConfidence();
|
|
66
|
+
if (mBestGuess == -1)
|
|
67
|
+
mBestGuess = 0;
|
|
68
|
+
}
|
|
69
|
+
return mProbers[mBestGuess]->GetCharSetName();
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
void nsMBCSGroupProber::Reset(void)
|
|
73
|
+
{
|
|
74
|
+
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
|
75
|
+
{
|
|
76
|
+
mProbers[i]->Reset();
|
|
77
|
+
mIsActive[i] = PR_TRUE;
|
|
78
|
+
}
|
|
79
|
+
mActiveNum = NUM_OF_PROBERS;
|
|
80
|
+
mBestGuess = -1;
|
|
81
|
+
mState = eDetecting;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Feeds a chunk of bytes to every active prober in the group.
//
// The input is first reduced to its high-bit bytes plus the single 7-bit
// byte that follows each run of them; the reduced buffer is then passed to
// each prober that is still active.
//
//   - A prober answering eFoundIt becomes the best guess, the group state
//     becomes eFoundIt and the remaining probers are skipped.
//   - A prober answering eNotMe is deactivated; once none are left the group
//     state becomes eNotMe.
//
// Returns the group state.  If the temporary buffer cannot be allocated the
// chunk is skipped and the current state is returned unchanged.
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  nsProbingState st;
  PRUint32 i;

  //do filtering to reduce load to probers
  char *highbyteBuf;
  char *hptr;
  PRBool keepNext = PR_TRUE;   //assume previous is not ascii, it will do no harm except add some noise
  hptr = highbyteBuf = (char*)PR_MALLOC(aLen);
  if (!hptr)
    return mState;             // allocation failed: never write through a null pointer

  for (i = 0; i < aLen; i++)
  {
    if (aBuf[i] & 0x80)
    {
      *hptr++ = aBuf[i];
      keepNext = PR_TRUE;
    }
    else
    {
      //if previous is highbyte, keep this even if it is ASCII
      if (keepNext)
      {
        *hptr++ = aBuf[i];
        keepNext = PR_FALSE;
      }
    }
  }

  for (i = 0; i < NUM_OF_PROBERS; i++)
  {
    if (!mIsActive[i])
      continue;
    st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
    if (st == eFoundIt)
    {
      mBestGuess = i;
      mState = eFoundIt;
      break;
    }
    else if (st == eNotMe)
    {
      mIsActive[i] = PR_FALSE;
      mActiveNum--;
      if (mActiveNum <= 0)
      {
        mState = eNotMe;
        break;
      }
    }
  }

  PR_FREEIF(highbyteBuf);

  return mState;
}
|
|
139
|
+
|
|
140
|
+
float nsMBCSGroupProber::GetConfidence(void)
|
|
141
|
+
{
|
|
142
|
+
PRUint32 i;
|
|
143
|
+
float bestConf = 0.0, cf;
|
|
144
|
+
|
|
145
|
+
switch (mState)
|
|
146
|
+
{
|
|
147
|
+
case eFoundIt:
|
|
148
|
+
return (float)0.99;
|
|
149
|
+
case eNotMe:
|
|
150
|
+
return (float)0.01;
|
|
151
|
+
default:
|
|
152
|
+
for (i = 0; i < NUM_OF_PROBERS; i++)
|
|
153
|
+
{
|
|
154
|
+
if (!mIsActive[i])
|
|
155
|
+
continue;
|
|
156
|
+
cf = mProbers[i]->GetConfidence();
|
|
157
|
+
if (bestConf < cf)
|
|
158
|
+
{
|
|
159
|
+
bestConf = cf;
|
|
160
|
+
mBestGuess = i;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return bestConf;
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
#ifdef DEBUG_chardet
|
|
168
|
+
void
|
|
169
|
+
nsMBCSGroupProber::DumpStatus()
|
|
170
|
+
{
|
|
171
|
+
PRUint32 i;
|
|
172
|
+
float cf;
|
|
173
|
+
|
|
174
|
+
GetConfidence();
|
|
175
|
+
for (i = 0; i < NUM_OF_PROBERS; i++)
|
|
176
|
+
{
|
|
177
|
+
if (!mIsActive[i])
|
|
178
|
+
printf("[%s] is inactive(ie. cofidence is too low).\r\n", ProberName[i]);
|
|
179
|
+
else
|
|
180
|
+
{
|
|
181
|
+
cf = mProbers[i]->GetConfidence();
|
|
182
|
+
printf("[%s] prober has confidence %f\r\n", ProberName[i], cf);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
#endif
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsMBCSGroupProber_h__
|
|
22
|
+
#define nsMBCSGroupProber_h__
|
|
23
|
+
|
|
24
|
+
#include "SJISProber.h"
|
|
25
|
+
#include "UTF8Prober.h"
|
|
26
|
+
#include "EUCJPProber.h"
|
|
27
|
+
#include "GB2312Prober.h"
|
|
28
|
+
#include "EUCKRProber.h"
|
|
29
|
+
#include "big5Prober.h"
|
|
30
|
+
#include "EUCTWProber.h"
|
|
31
|
+
|
|
32
|
+
#define NUM_OF_PROBERS 7
|
|
33
|
+
|
|
34
|
+
class nsMBCSGroupProber: public nsCharSetProber {
|
|
35
|
+
public:
|
|
36
|
+
nsMBCSGroupProber();
|
|
37
|
+
virtual ~nsMBCSGroupProber();
|
|
38
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
39
|
+
const char* GetCharSetName();
|
|
40
|
+
nsProbingState GetState(void) {return mState;};
|
|
41
|
+
void Reset(void);
|
|
42
|
+
float GetConfidence(void);
|
|
43
|
+
void SetOpion() {};
|
|
44
|
+
|
|
45
|
+
#ifdef DEBUG_chardet
|
|
46
|
+
void DumpStatus();
|
|
47
|
+
#endif
|
|
48
|
+
|
|
49
|
+
protected:
|
|
50
|
+
nsProbingState mState;
|
|
51
|
+
nsCharSetProber* mProbers[NUM_OF_PROBERS];
|
|
52
|
+
PRBool mIsActive[NUM_OF_PROBERS];
|
|
53
|
+
PRInt32 mBestGuess;
|
|
54
|
+
PRUint32 mActiveNum;
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
#endif /* nsMBCSGroupProber_h__ */
|
|
58
|
+
|