charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "SBCharsetProber.h"
|
|
22
|
+
#include "SBCSGroupProber.h"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
// Creates one single-byte prober per language model and then resets the
// group so that it starts out in the detecting state.
nsSBCSGroupProber::nsSBCSGroupProber()
{
  // The position of a model in this table is the slot of its prober in
  // mProbers.
  SequenceModel* const models[] =
  {
    &Win1251Model,
    &Koi8rModel,
    &Latin5Model,
    &MacCyrillicModel,
    &Ibm866Model,
    &Ibm855Model,
    &Latin7Model,
    &Win1253Model,
    &Latin5BulgarianModel,
    &Win1251BulgarianModel
    // disable latin2 before latin1 is available, otherwise all latin1
    // will be detected as latin2 because of their similarity.
    //&Latin2HungarianModel,
    //&Win1250HungarianModel
  };
  const PRUint32 modelCount = sizeof(models) / sizeof(models[0]);

  for (PRUint32 slot = 0; slot < modelCount; ++slot)
    mProbers[slot] = new nsSingleByteCharSetProber(models[slot]);

  Reset();
}
|
|
45
|
+
|
|
46
|
+
// Destroys every prober held in mProbers, in slot order.
nsSBCSGroupProber::~nsSBCSGroupProber()
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_SBCS_PROBERS)
  {
    delete mProbers[slot];
    ++slot;
  }
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
const char* nsSBCSGroupProber::GetCharSetName()
|
|
56
|
+
{
|
|
57
|
+
//if we have no answer yet
|
|
58
|
+
if (mBestGuess == -1)
|
|
59
|
+
{
|
|
60
|
+
GetConfidence();
|
|
61
|
+
//no charset seems positive
|
|
62
|
+
if (mBestGuess == -1)
|
|
63
|
+
//we will use default.
|
|
64
|
+
mBestGuess = 0;
|
|
65
|
+
}
|
|
66
|
+
return mProbers[mBestGuess]->GetCharSetName();
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
void nsSBCSGroupProber::Reset(void)
|
|
70
|
+
{
|
|
71
|
+
for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
|
72
|
+
{
|
|
73
|
+
mProbers[i]->Reset();
|
|
74
|
+
mIsActive[i] = PR_TRUE;
|
|
75
|
+
}
|
|
76
|
+
mBestGuess = -1;
|
|
77
|
+
mState = eDetecting;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
//This filter apply to all scripts that does not use latin letters (english letter)
|
|
81
|
+
PRBool nsSBCSGroupProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
|
|
82
|
+
{
|
|
83
|
+
//do filtering to reduce load to probers
|
|
84
|
+
char *newptr;
|
|
85
|
+
char *prevPtr, *curPtr;
|
|
86
|
+
|
|
87
|
+
PRBool meetMSB = PR_FALSE;
|
|
88
|
+
newptr = *newBuf = (char*)PR_MALLOC(aLen);
|
|
89
|
+
if (!newptr)
|
|
90
|
+
return PR_FALSE;
|
|
91
|
+
|
|
92
|
+
for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
|
|
93
|
+
{
|
|
94
|
+
if (*curPtr & 0x80)
|
|
95
|
+
meetMSB = PR_TRUE;
|
|
96
|
+
else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
|
|
97
|
+
{
|
|
98
|
+
//current char is a symbol, most likely a punctuation. we treat it as segment delimiter
|
|
99
|
+
if (meetMSB && curPtr > prevPtr)
|
|
100
|
+
//this segment contains more than single symbol, and it has upper ascii, we need to keep it
|
|
101
|
+
{
|
|
102
|
+
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
|
|
103
|
+
prevPtr++;
|
|
104
|
+
*newptr++ = ' ';
|
|
105
|
+
meetMSB = PR_FALSE;
|
|
106
|
+
}
|
|
107
|
+
else //ignore current segment. (either because it is just a symbol or just a english word
|
|
108
|
+
prevPtr = curPtr+1;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
newLen = newptr - *newBuf;
|
|
113
|
+
|
|
114
|
+
return PR_TRUE;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
#ifdef NO_ENGLISH_CONTAMINATION
|
|
118
|
+
//This filter apply to all scripts that does use latin letters (english letter)
|
|
119
|
+
PRBool nsSBCSGroupProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
|
|
120
|
+
{
|
|
121
|
+
//do filtering to reduce load to probers
|
|
122
|
+
char *newptr;
|
|
123
|
+
char *prevPtr, *curPtr;
|
|
124
|
+
PRBool isInTag = PR_FALSE;
|
|
125
|
+
|
|
126
|
+
newptr = *newBuf = (char*)PR_MALLOC(aLen);
|
|
127
|
+
if (!newptr)
|
|
128
|
+
return PR_FALSE;
|
|
129
|
+
|
|
130
|
+
for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
|
|
131
|
+
{
|
|
132
|
+
if (*curPtr == '>')
|
|
133
|
+
isInTag = PR_FALSE;
|
|
134
|
+
else if (*curPtr == '<')
|
|
135
|
+
isInTag = PR_TRUE;
|
|
136
|
+
|
|
137
|
+
if (!(*curPtr & 0x80) &&
|
|
138
|
+
(*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
|
|
139
|
+
{
|
|
140
|
+
if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
|
|
141
|
+
// and it is not inside a tag, keep it
|
|
142
|
+
{
|
|
143
|
+
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
|
|
144
|
+
prevPtr++;
|
|
145
|
+
*newptr++ = ' ';
|
|
146
|
+
}
|
|
147
|
+
else
|
|
148
|
+
prevPtr = curPtr+1;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
newLen = newptr - *newBuf;
|
|
153
|
+
|
|
154
|
+
return PR_TRUE;
|
|
155
|
+
}
|
|
156
|
+
#endif //NO_ENGLISH_CONTAMINATION
|
|
157
|
+
|
|
158
|
+
// Feeds a chunk of input to every still-active prober.
//
// The input is first reduced with FilterWithoutEnglishLetters(); the filtered
// copy is what the probers see. The first prober that reports eFoundIt
// becomes the best guess and the group state becomes eFoundIt. A prober that
// reports eNotMe is deactivated; when the active counter reaches zero the
// group state becomes eNotMe.
//
// Returns the group state after processing. If the filter cannot allocate
// its buffer the chunk is skipped and the current state is returned.
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  nsProbingState st;
  PRUint32 i;
  char *newBuf1 = 0;
  PRUint32 newLen1 = 0;

  //apply filter to original buffer, and we got new buffer back
  //depend on what script it is, we will feed them the new buffer
  //we got after applying proper filter
  if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
  {
    //no filtered buffer is available; never hand the probers a null
    //pointer or an undefined length
    return mState;
  }

  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
  {
    if (!mIsActive[i])
      continue;
    st = mProbers[i]->HandleData(newBuf1, newLen1);
    if (st == eFoundIt)
    {
      mBestGuess = (PRInt32)i;
      mState = eFoundIt;
      break;
    }
    else if (st == eNotMe)
    {
      mIsActive[i] = PR_FALSE;
      mActiveNum--;
      if (mActiveNum <= 0)
      {
        mState = eNotMe;
        break;
      }
    }
  }

  PR_FREEIF(newBuf1);

  return mState;
}
|
|
197
|
+
|
|
198
|
+
float nsSBCSGroupProber::GetConfidence(void)
|
|
199
|
+
{
|
|
200
|
+
PRUint32 i;
|
|
201
|
+
float bestConf = 0.0, cf;
|
|
202
|
+
|
|
203
|
+
switch (mState)
|
|
204
|
+
{
|
|
205
|
+
case eFoundIt:
|
|
206
|
+
return (float)0.99; //sure yes
|
|
207
|
+
case eNotMe:
|
|
208
|
+
return (float)0.01; //sure no
|
|
209
|
+
default:
|
|
210
|
+
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
|
211
|
+
{
|
|
212
|
+
if (!mIsActive[i])
|
|
213
|
+
continue;
|
|
214
|
+
cf = mProbers[i]->GetConfidence();
|
|
215
|
+
if (bestConf < cf)
|
|
216
|
+
{
|
|
217
|
+
bestConf = cf;
|
|
218
|
+
mBestGuess = i;
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
return bestConf;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
#ifdef DEBUG_chardet
|
|
226
|
+
void
|
|
227
|
+
nsSBCSGroupProber::DumpStatus()
|
|
228
|
+
{
|
|
229
|
+
PRUint32 i;
|
|
230
|
+
float cf;
|
|
231
|
+
|
|
232
|
+
cf = GetConfidence();
|
|
233
|
+
printf("SBCS Group Prober --------begin status \r\n");
|
|
234
|
+
for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
|
|
235
|
+
{
|
|
236
|
+
if (!mIsActive[i])
|
|
237
|
+
printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName(), i);
|
|
238
|
+
else
|
|
239
|
+
mProbers[i]->DumpStatus();
|
|
240
|
+
}
|
|
241
|
+
printf("SBCS Group found best match [%s] confidence %f.\r\n",
|
|
242
|
+
mProbers[mBestGuess]->GetCharSetName(), cf);
|
|
243
|
+
}
|
|
244
|
+
#endif
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsSBCSGroupProber_h__
|
|
22
|
+
#define nsSBCSGroupProber_h__
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#define NUM_OF_SBCS_PROBERS 10
|
|
26
|
+
|
|
27
|
+
class nsSingleByteCharSetProber;
|
|
28
|
+
class nsSBCSGroupProber: public nsCharSetProber {
|
|
29
|
+
public:
|
|
30
|
+
nsSBCSGroupProber();
|
|
31
|
+
virtual ~nsSBCSGroupProber();
|
|
32
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
33
|
+
PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
|
34
|
+
PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
|
35
|
+
const char* GetCharSetName();
|
|
36
|
+
nsProbingState GetState(void) {return mState;};
|
|
37
|
+
void Reset(void);
|
|
38
|
+
float GetConfidence(void);
|
|
39
|
+
void SetOpion() {};
|
|
40
|
+
|
|
41
|
+
#ifdef DEBUG_chardet
|
|
42
|
+
void DumpStatus();
|
|
43
|
+
#endif
|
|
44
|
+
|
|
45
|
+
protected:
|
|
46
|
+
nsProbingState mState;
|
|
47
|
+
nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
|
|
48
|
+
PRBool mIsActive[NUM_OF_SBCS_PROBERS];
|
|
49
|
+
PRInt32 mBestGuess;
|
|
50
|
+
PRUint32 mActiveNum;
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
#endif /* nsSBCSGroupProber_h__ */
|
|
54
|
+
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include <stdio.h>
|
|
22
|
+
#include "SBCharsetProber.h"
|
|
23
|
+
|
|
24
|
+
// Updates the character and two-character-sequence statistics with the bytes
// of aBuf, then, once more than SB_ENOUGH_REL_THRESHOLD sequences have been
// counted, settles the state early if the confidence is clearly high or
// clearly low. Returns the prober state.
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  PRUint32 pos = 0;
  while (pos < aLen)
  {
    // Map the byte to its order in the model.
    const unsigned char curOrder = mModel->charToOrderMap[(unsigned char)aBuf[pos]];

    if (curOrder < SYMBOL_CAT_ORDER)
      mTotalChar++;

    if (curOrder < SAMPLE_SIZE)
    {
      mFreqChar++;

      // Both this and the previous character are inside the sampled range:
      // count the pair in the category given by the precedence matrix.
      if (mLastOrder < SAMPLE_SIZE)
      {
        mTotalSeqs++;
        ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+curOrder]]);
      }
    }

    mLastOrder = curOrder;
    ++pos;
  }

  // Shortcut decisions only apply while detecting and with enough sequences.
  if (mState != eDetecting || mTotalSeqs <= SB_ENOUGH_REL_THRESHOLD)
    return mState;

  const float confidence = GetConfidence();
  if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
    mState = eFoundIt;
  else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
    mState = eNotMe;

  return mState;
}
|
|
59
|
+
|
|
60
|
+
void nsSingleByteCharSetProber::Reset(void)
|
|
61
|
+
{
|
|
62
|
+
mState = eDetecting;
|
|
63
|
+
mLastOrder = 255;
|
|
64
|
+
for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
|
|
65
|
+
mSeqCounters[i] = 0;
|
|
66
|
+
mTotalSeqs = 0;
|
|
67
|
+
mTotalChar = 0;
|
|
68
|
+
mFreqChar = 0;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
//#define NEGATIVE_APPROACH 1
|
|
72
|
+
|
|
73
|
+
float nsSingleByteCharSetProber::GetConfidence(void)
|
|
74
|
+
{
|
|
75
|
+
#ifdef NEGATIVE_APPROACH
|
|
76
|
+
if (mTotalSeqs > 0)
|
|
77
|
+
if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
|
|
78
|
+
return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
|
|
79
|
+
return (float)0.01;
|
|
80
|
+
#else //POSITIVE_APPROACH
|
|
81
|
+
float r;
|
|
82
|
+
|
|
83
|
+
if (mTotalSeqs > 0) {
|
|
84
|
+
r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
|
|
85
|
+
r = r*mFreqChar/mTotalChar;
|
|
86
|
+
if (r >= (float)1.00)
|
|
87
|
+
r = (float)0.99;
|
|
88
|
+
return r;
|
|
89
|
+
}
|
|
90
|
+
return (float)0.01;
|
|
91
|
+
#endif
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
#ifdef DEBUG_chardet
|
|
95
|
+
void
|
|
96
|
+
nsSingleByteCharSetProber::DumpStatus()
|
|
97
|
+
{
|
|
98
|
+
printf("[%s] prober has confidence %f\r\n", GetCharSetName(), GetConfidence());
|
|
99
|
+
}
|
|
100
|
+
#endif
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsSingleByteCharSetProber_h__
|
|
22
|
+
#define nsSingleByteCharSetProber_h__
|
|
23
|
+
|
|
24
|
+
#include "charsetProber.h"
|
|
25
|
+
|
|
26
|
+
#define SAMPLE_SIZE 64
|
|
27
|
+
#define SB_ENOUGH_REL_THRESHOLD 1024
|
|
28
|
+
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
|
|
29
|
+
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
|
|
30
|
+
#define SYMBOL_CAT_ORDER 250
|
|
31
|
+
#define NUMBER_OF_SEQ_CAT 4
|
|
32
|
+
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
|
|
33
|
+
#define NEGATIVE_CAT 0
|
|
34
|
+
|
|
35
|
+
typedef struct
|
|
36
|
+
{
|
|
37
|
+
unsigned char *charToOrderMap; //[256] table use to find a char's order
|
|
38
|
+
char *precedenceMatrix; //[SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
|
39
|
+
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
|
40
|
+
PRBool keepEnglishLetter; //it says if this script contains latin letters
|
|
41
|
+
const char* charsetName;
|
|
42
|
+
} SequenceModel;
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class nsSingleByteCharSetProber : public nsCharSetProber{
|
|
46
|
+
public:
|
|
47
|
+
nsSingleByteCharSetProber(SequenceModel *model){mModel = model; Reset();};
|
|
48
|
+
const char* GetCharSetName() {return mModel->charsetName;};
|
|
49
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
50
|
+
nsProbingState GetState(void) {return mState;};
|
|
51
|
+
void Reset(void);
|
|
52
|
+
float GetConfidence(void);
|
|
53
|
+
void SetOpion() {};
|
|
54
|
+
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;};
|
|
55
|
+
|
|
56
|
+
#ifdef DEBUG_chardet
|
|
57
|
+
void DumpStatus();
|
|
58
|
+
#endif
|
|
59
|
+
|
|
60
|
+
protected:
|
|
61
|
+
nsProbingState mState;
|
|
62
|
+
SequenceModel *mModel;
|
|
63
|
+
|
|
64
|
+
//char order of last character
|
|
65
|
+
unsigned char mLastOrder;
|
|
66
|
+
|
|
67
|
+
PRUint32 mTotalSeqs;
|
|
68
|
+
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
|
|
69
|
+
|
|
70
|
+
PRUint32 mTotalChar;
|
|
71
|
+
//characters that fall in our sampling range
|
|
72
|
+
PRUint32 mFreqChar;
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
extern SequenceModel Koi8rModel;
|
|
77
|
+
extern SequenceModel Win1251Model;
|
|
78
|
+
extern SequenceModel Latin5Model;
|
|
79
|
+
extern SequenceModel MacCyrillicModel;
|
|
80
|
+
extern SequenceModel Ibm866Model;
|
|
81
|
+
extern SequenceModel Ibm855Model;
|
|
82
|
+
extern SequenceModel Latin7Model;
|
|
83
|
+
extern SequenceModel Win1253Model;
|
|
84
|
+
extern SequenceModel Latin5BulgarianModel;
|
|
85
|
+
extern SequenceModel Win1251BulgarianModel;
|
|
86
|
+
extern SequenceModel Latin2HungarianModel;
|
|
87
|
+
extern SequenceModel Win1250HungarianModel;
|
|
88
|
+
|
|
89
|
+
#endif /* nsSingleByteCharSetProber_h__ */
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// for S-JIS encoding, observe these characteristics:
|
|
22
|
+
// 1, kana characters (or hankaku?) often have a high frequency of appearance
|
|
23
|
+
// 2, kana characters often exist in groups
|
|
24
|
+
// 3, certain combinations of kana are never used in the Japanese language
|
|
25
|
+
|
|
26
|
+
#include "SJISProber.h"
|
|
27
|
+
|
|
28
|
+
void nsSJISProber::Reset(void)
|
|
29
|
+
{
|
|
30
|
+
mCodingSM->Reset();
|
|
31
|
+
mState = eDetecting;
|
|
32
|
+
mContextAnalyser.Reset();
|
|
33
|
+
mDistributionAnalyser.Reset();
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Runs aBuf through the coding state machine and hands every completed
// character to the context and distribution analysers.
//
// The state becomes eNotMe on a state-machine error and eFoundIt when the
// state machine reports eItsMe, or when the context analyser has enough data
// and the confidence exceeds SHORTCUT_THRESHOLD. Returns the prober state.
// An empty buffer (aLen == 0) is accepted and leaves mLastChar untouched.
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  nsSMState codingState;

  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
    if (codingState == eError)
    {
      mState = eNotMe;
      break;
    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    if (codingState == eStart)
    {
      PRUint32 charLen = mCodingSM->GetCurrentCharLen();
      if (i == 0)
      {
        // The character completed on the first byte of this buffer, so its
        // earlier byte (if any) is the one saved in mLastChar[0] at the end
        // of the previous call; assemble the character in mLastChar.
        mLastChar[1] = aBuf[0];
        mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen);
        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
      }
      else
      {
        mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen);
        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
      }
    }
  }

  // Remember the last byte for the next call. With aLen == 0 there is no
  // last byte; aBuf[aLen-1] would index far outside the buffer because aLen
  // is unsigned.
  if (aLen > 0)
    mLastChar[0] = aBuf[aLen-1];

  if (mState == eDetecting)
    if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
      mState = eFoundIt;

  return mState;
}
|
|
78
|
+
|
|
79
|
+
float nsSJISProber::GetConfidence(void)
|
|
80
|
+
{
|
|
81
|
+
float contxtCf = mContextAnalyser.GetConfidence();
|
|
82
|
+
float distribCf = mDistributionAnalyser.GetConfidence();
|
|
83
|
+
|
|
84
|
+
return (contxtCf > distribCf ? contxtCf : distribCf);
|
|
85
|
+
}
|
|
86
|
+
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// for S-JIS encoding, observe these characteristics:
|
|
22
|
+
// 1, kana characters (or hankaku?) often have a high frequency of appearance
|
|
23
|
+
// 2, kana characters often exist in groups
|
|
24
|
+
// 3, certain combinations of kana are never used in the Japanese language
|
|
25
|
+
|
|
26
|
+
#ifndef nsSJISProber_h__
|
|
27
|
+
#define nsSJISProber_h__
|
|
28
|
+
|
|
29
|
+
#include "charsetProber.h"
|
|
30
|
+
#include "codingStateMachine.h"
|
|
31
|
+
#include "jpCntx.h"
|
|
32
|
+
#include "charDistribution.h"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class nsSJISProber: public nsCharSetProber {
|
|
36
|
+
public:
|
|
37
|
+
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
|
38
|
+
Reset();};
|
|
39
|
+
virtual ~nsSJISProber(void){delete mCodingSM;};
|
|
40
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
41
|
+
const char* GetCharSetName() {return "Shift_JIS";};
|
|
42
|
+
nsProbingState GetState(void) {return mState;};
|
|
43
|
+
void Reset(void);
|
|
44
|
+
float GetConfidence(void);
|
|
45
|
+
void SetOpion() {};
|
|
46
|
+
|
|
47
|
+
protected:
|
|
48
|
+
nsCodingStateMachine* mCodingSM;
|
|
49
|
+
nsProbingState mState;
|
|
50
|
+
|
|
51
|
+
SJISContextAnalysis mContextAnalyser;
|
|
52
|
+
SJISDistributionAnalysis mDistributionAnalyser;
|
|
53
|
+
|
|
54
|
+
char mLastChar[2];
|
|
55
|
+
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
#endif /* nsSJISProber_h__ */
|
|
60
|
+
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "UTF8Prober.h"
|
|
22
|
+
|
|
23
|
+
void nsUTF8Prober::Reset(void)
|
|
24
|
+
{
|
|
25
|
+
mCodingSM->Reset();
|
|
26
|
+
mNumOfMBChar = 0;
|
|
27
|
+
mState = eDetecting;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
// Runs aBuf through the coding state machine, counting every completed
// character that is at least two bytes long. The state becomes eNotMe on a
// state-machine error and eFoundIt when the state machine reports eItsMe or
// the confidence exceeds SHORTCUT_THRESHOLD. Returns the prober state.
nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  PRUint32 pos = 0;
  while (pos < aLen)
  {
    const nsSMState smState = mCodingSM->NextState(aBuf[pos]);
    ++pos;

    if (smState == eError)
    {
      mState = eNotMe;
      break;
    }
    else if (smState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    else if (smState == eStart)
    {
      // A character just ended; only multi-byte ones are counted.
      if (mCodingSM->GetCurrentCharLen() >= 2)
        mNumOfMBChar++;
    }
  }

  if (mState == eDetecting && GetConfidence() > SHORTCUT_THRESHOLD)
    mState = eFoundIt;

  return mState;
}
|
|
59
|
+
|
|
60
|
+
#define ONE_CHAR_PROB (float)0.50
|
|
61
|
+
|
|
62
|
+
float nsUTF8Prober::GetConfidence(void)
|
|
63
|
+
{
|
|
64
|
+
float unlike = (float)0.99;
|
|
65
|
+
|
|
66
|
+
if (mNumOfMBChar < 6)
|
|
67
|
+
{
|
|
68
|
+
for (PRUint32 i = 0; i < mNumOfMBChar; i++)
|
|
69
|
+
unlike *= ONE_CHAR_PROB;
|
|
70
|
+
return (float)1.0 - unlike;
|
|
71
|
+
}
|
|
72
|
+
else
|
|
73
|
+
return (float)0.99;
|
|
74
|
+
}
|
|
75
|
+
|