charguess 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
/*
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
4
|
+
Based on Mozilla sources
|
5
|
+
|
6
|
+
This library is free software; you can redistribute it and/or
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
8
|
+
License as published by the Free Software Foundation; either
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
10
|
+
|
11
|
+
This library is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
Lesser General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
17
|
+
License along with this library; if not, write to the Free Software
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
*/
|
20
|
+
|
21
|
+
#include "universal.h"
|
22
|
+
|
23
|
+
#include "MBCSGroupProber.h"
|
24
|
+
#include "SBCSGroupProber.h"
|
25
|
+
#include "EscCharsetProber.h"
|
26
|
+
#include "Latin1Prober.h"
|
27
|
+
|
28
|
+
// Builds a detector in its initial state: nothing seen yet, input assumed
// to be pure ASCII, and no prober objects allocated.
nsUniversalDetector::nsUniversalDetector()
  : mInputState(ePureAscii),
    mDone(PR_FALSE),
    mInTag(PR_FALSE),
    mStart(PR_TRUE),
    mGotData(PR_FALSE),
    mLastChar('\0'),
    mDetectedCharset(nsnull),
    mBestGuess(-1),               // illegal value as signal
    mEscCharSetProber(nsnull)
{
  // Probers are created lazily by HandleData(); start with empty slots.
  for (PRUint32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
  {
    mCharSetProbers[slot] = nsnull;
  }
}
|
45
|
+
|
46
|
+
// Releases every prober this detector owns.
nsUniversalDetector::~nsUniversalDetector()
{
  // Slots that were never populated hold a null pointer, and deleting a
  // null pointer is a no-op, so no per-slot check is needed.
  for (PRInt32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
  {
    delete mCharSetProbers[slot];
  }

  delete mEscCharSetProber;
}
|
54
|
+
|
55
|
+
void nsUniversalDetector::Reset()
|
56
|
+
{
|
57
|
+
mDone = PR_FALSE;
|
58
|
+
mBestGuess = -1; //illegal value as signal
|
59
|
+
mInTag = PR_FALSE;
|
60
|
+
|
61
|
+
mStart = PR_TRUE;
|
62
|
+
mDetectedCharset = nsnull;
|
63
|
+
mGotData = PR_FALSE;
|
64
|
+
mInputState = ePureAscii;
|
65
|
+
mLastChar = '\0';
|
66
|
+
|
67
|
+
if (mEscCharSetProber)
|
68
|
+
mEscCharSetProber->Reset();
|
69
|
+
|
70
|
+
PRUint32 i;
|
71
|
+
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
72
|
+
if (mCharSetProbers[i])
|
73
|
+
mCharSetProbers[i]->Reset();
|
74
|
+
}
|
75
|
+
|
76
|
+
//---------------------------------------------------------------------
|
77
|
+
#define SHORTCUT_THRESHOLD (float)0.95
|
78
|
+
#define MINIMUM_THRESHOLD (float)0.20
|
79
|
+
|
80
|
+
void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
81
|
+
{
|
82
|
+
if(mDone)
|
83
|
+
return;
|
84
|
+
|
85
|
+
if (aLen > 0)
|
86
|
+
mGotData = PR_TRUE;
|
87
|
+
|
88
|
+
//If the data starts with BOM, we know it is UTF
|
89
|
+
if (mStart)
|
90
|
+
{
|
91
|
+
mStart = PR_FALSE;
|
92
|
+
if (aLen > 3)
|
93
|
+
switch (aBuf[0])
|
94
|
+
{
|
95
|
+
case '\xEF':
|
96
|
+
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
97
|
+
// EF BB BF UTF-8 encoded BOM
|
98
|
+
mDetectedCharset = "UTF-8";
|
99
|
+
break;
|
100
|
+
case '\xFE':
|
101
|
+
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
102
|
+
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
103
|
+
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
|
104
|
+
else if ('\xFF' == aBuf[1])
|
105
|
+
// FE FF UTF-16, big endian BOM
|
106
|
+
mDetectedCharset = "UTF-16BE";
|
107
|
+
break;
|
108
|
+
case '\x00':
|
109
|
+
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
110
|
+
// 00 00 FE FF UTF-32, big-endian BOM
|
111
|
+
mDetectedCharset = "UTF-32BE";
|
112
|
+
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
113
|
+
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
114
|
+
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
|
115
|
+
break;
|
116
|
+
case '\xFF':
|
117
|
+
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
118
|
+
// FF FE 00 00 UTF-32, little-endian BOM
|
119
|
+
mDetectedCharset = "UTF-32LE";
|
120
|
+
else if ('\xFE' == aBuf[1])
|
121
|
+
// FF FE UTF-16, little endian BOM
|
122
|
+
mDetectedCharset = "UTF-16LE";
|
123
|
+
break;
|
124
|
+
} // switch
|
125
|
+
|
126
|
+
if (mDetectedCharset)
|
127
|
+
{
|
128
|
+
mDone = PR_TRUE;
|
129
|
+
return;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
PRUint32 i;
|
134
|
+
for (i = 0; i < aLen; i++)
|
135
|
+
{
|
136
|
+
//other than 0xa0, if every othe character is ascii, the page is ascii
|
137
|
+
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
|
138
|
+
{
|
139
|
+
//we got a non-ascii byte (high-byte)
|
140
|
+
if (mInputState != eHighbyte)
|
141
|
+
{
|
142
|
+
//adjust state
|
143
|
+
mInputState = eHighbyte;
|
144
|
+
|
145
|
+
//kill mEscCharSetProber if it is active
|
146
|
+
if (mEscCharSetProber) {
|
147
|
+
delete mEscCharSetProber;
|
148
|
+
mEscCharSetProber = nsnull;
|
149
|
+
}
|
150
|
+
|
151
|
+
//start multibyte and singlebyte charset prober
|
152
|
+
if (nsnull == mCharSetProbers[0])
|
153
|
+
mCharSetProbers[0] = new nsMBCSGroupProber;
|
154
|
+
if (nsnull == mCharSetProbers[1])
|
155
|
+
mCharSetProbers[1] = new nsSBCSGroupProber;
|
156
|
+
if (nsnull == mCharSetProbers[2])
|
157
|
+
mCharSetProbers[2] = new nsLatin1Prober;
|
158
|
+
}
|
159
|
+
}
|
160
|
+
else
|
161
|
+
{
|
162
|
+
//ok, just pure ascii so far
|
163
|
+
if ( ePureAscii == mInputState &&
|
164
|
+
(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
|
165
|
+
{
|
166
|
+
//found escape character or HZ "~{"
|
167
|
+
mInputState = eEscAscii;
|
168
|
+
}
|
169
|
+
mLastChar = aBuf[i];
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
nsProbingState st;
|
174
|
+
switch (mInputState)
|
175
|
+
{
|
176
|
+
case eEscAscii:
|
177
|
+
if (nsnull == mEscCharSetProber)
|
178
|
+
mEscCharSetProber = new nsEscCharSetProber;
|
179
|
+
st = mEscCharSetProber->HandleData(aBuf, aLen);
|
180
|
+
if (st == eFoundIt)
|
181
|
+
{
|
182
|
+
mDone = PR_TRUE;
|
183
|
+
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
184
|
+
}
|
185
|
+
break;
|
186
|
+
case eHighbyte:
|
187
|
+
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
188
|
+
{
|
189
|
+
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
190
|
+
if (st == eFoundIt)
|
191
|
+
{
|
192
|
+
mDone = PR_TRUE;
|
193
|
+
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
194
|
+
return;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
break;
|
198
|
+
|
199
|
+
default: //pure ascii
|
200
|
+
;//do nothing here
|
201
|
+
}
|
202
|
+
return ;
|
203
|
+
}
|
204
|
+
|
205
|
+
|
206
|
+
//---------------------------------------------------------------------
|
207
|
+
void nsUniversalDetector::DataEnd()
|
208
|
+
{
|
209
|
+
if (!mGotData)
|
210
|
+
{
|
211
|
+
// we haven't got any data yet, return immediately
|
212
|
+
// caller program sometimes call DataEnd before anything has been sent to detector
|
213
|
+
return;
|
214
|
+
}
|
215
|
+
|
216
|
+
if (mDetectedCharset)
|
217
|
+
{
|
218
|
+
mDone = PR_TRUE;
|
219
|
+
Report(mDetectedCharset);
|
220
|
+
return;
|
221
|
+
}
|
222
|
+
|
223
|
+
switch (mInputState)
|
224
|
+
{
|
225
|
+
case eHighbyte:
|
226
|
+
{
|
227
|
+
float proberConfidence;
|
228
|
+
float maxProberConfidence = (float)0.0;
|
229
|
+
PRInt32 maxProber = 0;
|
230
|
+
|
231
|
+
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
232
|
+
{
|
233
|
+
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
234
|
+
#ifdef DEBUG_chardet
|
235
|
+
mCharSetProbers[i]->DumpStatus();
|
236
|
+
#endif
|
237
|
+
|
238
|
+
if (proberConfidence > maxProberConfidence)
|
239
|
+
{
|
240
|
+
maxProberConfidence = proberConfidence;
|
241
|
+
maxProber = i;
|
242
|
+
}
|
243
|
+
}
|
244
|
+
//do not report anything because we are not confident of it, that's in fact a negative answer
|
245
|
+
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
246
|
+
Report(mCharSetProbers[maxProber]->GetCharSetName());
|
247
|
+
}
|
248
|
+
break;
|
249
|
+
case eEscAscii:
|
250
|
+
break;
|
251
|
+
default:
|
252
|
+
;
|
253
|
+
}
|
254
|
+
return;
|
255
|
+
}
|
256
|
+
|
257
|
+
|
258
|
+
void nsUniversalDetector::Report(const char* aCharset)
|
259
|
+
{
|
260
|
+
if (!mDone)
|
261
|
+
{
|
262
|
+
mDone = PR_TRUE;
|
263
|
+
mDetectedCharset = aCharset;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
|
267
|
+
const char* nsUniversalDetector::GetCharset(void)
|
268
|
+
{
|
269
|
+
if (mDone == PR_TRUE)
|
270
|
+
return (mDetectedCharset);
|
271
|
+
else
|
272
|
+
return NULL;
|
273
|
+
}
|
@@ -0,0 +1,65 @@
|
|
1
|
+
/*
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
4
|
+
Based on Mozilla sources
|
5
|
+
|
6
|
+
This library is free software; you can redistribute it and/or
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
8
|
+
License as published by the Free Software Foundation; either
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
10
|
+
|
11
|
+
This library is distributed in the hope that it will be useful,
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
Lesser General Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
17
|
+
License along with this library; if not, write to the Free Software
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
*/
|
20
|
+
|
21
|
+
#ifndef universal_h__
|
22
|
+
#define universal_h__
|
23
|
+
|
24
|
+
#include "types.h"
|
25
|
+
|
26
|
+
// #include "nsICharsetDetector.h"
|
27
|
+
// #include "nsIStringCharsetDetector.h"
|
28
|
+
// #include "nsICharsetDetectionObserver.h"
|
29
|
+
|
30
|
+
class nsCharSetProber;
|
31
|
+
|
32
|
+
#define NUM_OF_CHARSET_PROBERS 3
|
33
|
+
|
34
|
+
// Classification of the bytes the detector has seen so far.
enum nsInputState
{
  ePureAscii = 0,   // only ASCII bytes so far
  eEscAscii  = 1,   // ASCII plus an escape character or the HZ "~{" sequence
  eHighbyte  = 2    // at least one byte with the high bit set (other than 0xA0)
};
|
39
|
+
|
40
|
+
class nsUniversalDetector {
|
41
|
+
public:
|
42
|
+
nsUniversalDetector();
|
43
|
+
virtual ~nsUniversalDetector();
|
44
|
+
virtual void HandleData(const char* aBuf, PRUint32 aLen);
|
45
|
+
virtual void DataEnd(void);
|
46
|
+
virtual void Reset();
|
47
|
+
virtual const char* GetCharset(void);
|
48
|
+
|
49
|
+
protected:
|
50
|
+
virtual void Report(const char* aCharset);
|
51
|
+
nsInputState mInputState;
|
52
|
+
PRBool mDone;
|
53
|
+
PRBool mInTag;
|
54
|
+
PRBool mStart;
|
55
|
+
PRBool mGotData;
|
56
|
+
char mLastChar;
|
57
|
+
const char * mDetectedCharset;
|
58
|
+
PRInt32 mBestGuess;
|
59
|
+
|
60
|
+
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
61
|
+
nsCharSetProber *mEscCharSetProber;
|
62
|
+
};
|
63
|
+
|
64
|
+
#endif
|
65
|
+
|
data/script/console
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
#
# Replaces the current process with an irb session that has tab completion
# loaded.

# Ruby builds whose platform string contains "mswin" or "mingw" use the
# irb.bat launcher; everything else uses plain irb. The group is
# non-capturing: the previous pattern "(:?mswin|mingw)" was a mistyped "(?:"
# that created a capture group and allowed a stray optional colon.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"

# NOTE(review): the message says the gem is being loaded, but libs only
# requires irb/completion -- confirm whether charguess should be required too.
puts "Loading charguess gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script against this project.

# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly when it is already on the load path; otherwise go
# through RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments reach RubiGen.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script against this project.

# Absolute path of the project root (the parent of this script's directory).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load rubigen directly when it is already on the load path; otherwise go
# through RubyGems and try again.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments reach RubiGen.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Rake tasks that build the native extension named after this file
# (tasks/extconf/<name>.rake builds ext/<name>).
#
# RbConfig and RUBY_PLATFORM replace the older Config alias and bare PLATFORM
# constant, which later Ruby versions no longer define.
require 'rbconfig'

namespace :extconf do
  extension = File.basename(__FILE__, '.rake')

  ext = "ext/#{extension}"
  # Shared-object path, using the dynamic-library suffix of the running Ruby.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]

  # Builds the extension, then aborts with a message when no build product
  # with the extension's name can be found anywhere below the current directory.
  task :compile => extension do
    if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build. Your system is"
      STDERR.puts "NOT configured properly to build charguess."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so ]

  # The bundled libcharguess is configured and built first, then extconf.rb
  # is run to generate the extension's Makefile.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir("ext/libcharguess/cpp") do
      sh("./configure")
      sh("make")
    end
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  file ext_so => ext_files do
    Dir.chdir(ext) do
      # nmake is used when the platform string contains "win32", make otherwise.
      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
        if !ok
          # A failed build removes its partial products; the :compile task
          # above is what reports the failure.
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
|
data/tasks/extconf.rake
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Top-level wiring for building native extensions.

namespace :extconf do
  # Declared here without a body so the task exists and carries a description.
  desc "Compiles the Ruby extension"
  task :compile
end

# `rake compile` delegates to extconf:compile, and tests depend on a build.
task :compile => "extconf:compile"

task :test => :compile

# Glob matching compiled artefacts.
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"

build_products = ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']
$hoe.clean_globs = $hoe.clean_globs | build_products

$hoe.spec.require_paths = Dir.glob('{lib,ext/*}')
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,219 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: charguess
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "1.0"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Ernesto Jim\xC3\xA9nez"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-09 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: newgem
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.2.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: hoe
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.8.0
|
34
|
+
version:
|
35
|
+
description: |-
|
36
|
+
This gem builds and installs libcharguess and its binding libcharguess-ruby
|
37
|
+
|
38
|
+
* libcharguess: http://libcharguess.sourceforge.net/
|
39
|
+
* libcharguess-ruby: http://raa.ruby-lang.org/project/charguess/
|
40
|
+
email:
|
41
|
+
- erjica@gmail.com
|
42
|
+
executables: []
|
43
|
+
|
44
|
+
extensions:
|
45
|
+
- ext/charguess/extconf.rb
|
46
|
+
extra_rdoc_files:
|
47
|
+
- History.txt
|
48
|
+
- Manifest.txt
|
49
|
+
- PostInstall.txt
|
50
|
+
- README.rdoc
|
51
|
+
files:
|
52
|
+
- History.txt
|
53
|
+
- Manifest.txt
|
54
|
+
- PostInstall.txt
|
55
|
+
- README.rdoc
|
56
|
+
- Rakefile
|
57
|
+
- ext/charguess/charguess.c
|
58
|
+
- ext/charguess/extconf.rb
|
59
|
+
- ext/libcharguess/cpp/.deps/EUCJPProber.Plo
|
60
|
+
- ext/libcharguess/cpp/.deps/EUCJPProber.Po
|
61
|
+
- ext/libcharguess/cpp/.deps/EUCKRProber.Plo
|
62
|
+
- ext/libcharguess/cpp/.deps/EUCKRProber.Po
|
63
|
+
- ext/libcharguess/cpp/.deps/EUCTWProber.Plo
|
64
|
+
- ext/libcharguess/cpp/.deps/EUCTWProber.Po
|
65
|
+
- ext/libcharguess/cpp/.deps/EscCharsetProber.Plo
|
66
|
+
- ext/libcharguess/cpp/.deps/EscCharsetProber.Po
|
67
|
+
- ext/libcharguess/cpp/.deps/EscSM.Plo
|
68
|
+
- ext/libcharguess/cpp/.deps/EscSM.Po
|
69
|
+
- ext/libcharguess/cpp/.deps/GB2312Prober.Plo
|
70
|
+
- ext/libcharguess/cpp/.deps/GB2312Prober.Po
|
71
|
+
- ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo
|
72
|
+
- ext/libcharguess/cpp/.deps/LangBulgarianModel.Po
|
73
|
+
- ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo
|
74
|
+
- ext/libcharguess/cpp/.deps/LangCyrillicModel.Po
|
75
|
+
- ext/libcharguess/cpp/.deps/LangGreekModel.Plo
|
76
|
+
- ext/libcharguess/cpp/.deps/LangGreekModel.Po
|
77
|
+
- ext/libcharguess/cpp/.deps/LangHungarianModel.Plo
|
78
|
+
- ext/libcharguess/cpp/.deps/LangHungarianModel.Po
|
79
|
+
- ext/libcharguess/cpp/.deps/LangThaiModel.Plo
|
80
|
+
- ext/libcharguess/cpp/.deps/LangThaiModel.Po
|
81
|
+
- ext/libcharguess/cpp/.deps/Latin1Prober.Plo
|
82
|
+
- ext/libcharguess/cpp/.deps/Latin1Prober.Po
|
83
|
+
- ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo
|
84
|
+
- ext/libcharguess/cpp/.deps/MBCSGroupProber.Po
|
85
|
+
- ext/libcharguess/cpp/.deps/MBCSSM.Plo
|
86
|
+
- ext/libcharguess/cpp/.deps/MBCSSM.Po
|
87
|
+
- ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo
|
88
|
+
- ext/libcharguess/cpp/.deps/SBCSGroupProber.Po
|
89
|
+
- ext/libcharguess/cpp/.deps/SBCharsetProber.Plo
|
90
|
+
- ext/libcharguess/cpp/.deps/SBCharsetProber.Po
|
91
|
+
- ext/libcharguess/cpp/.deps/SJISProber.Plo
|
92
|
+
- ext/libcharguess/cpp/.deps/SJISProber.Po
|
93
|
+
- ext/libcharguess/cpp/.deps/UTF8Prober.Plo
|
94
|
+
- ext/libcharguess/cpp/.deps/UTF8Prober.Po
|
95
|
+
- ext/libcharguess/cpp/.deps/big5Prober.Plo
|
96
|
+
- ext/libcharguess/cpp/.deps/big5Prober.Po
|
97
|
+
- ext/libcharguess/cpp/.deps/charDistribution.Plo
|
98
|
+
- ext/libcharguess/cpp/.deps/charDistribution.Po
|
99
|
+
- ext/libcharguess/cpp/.deps/chardet.Plo
|
100
|
+
- ext/libcharguess/cpp/.deps/chardet.Po
|
101
|
+
- ext/libcharguess/cpp/.deps/charguess.Po
|
102
|
+
- ext/libcharguess/cpp/.deps/jpCntx.Plo
|
103
|
+
- ext/libcharguess/cpp/.deps/jpCntx.Po
|
104
|
+
- ext/libcharguess/cpp/.deps/universal.Plo
|
105
|
+
- ext/libcharguess/cpp/.deps/universal.Po
|
106
|
+
- ext/libcharguess/cpp/AUTHORS
|
107
|
+
- ext/libcharguess/cpp/Big5Freq.tab
|
108
|
+
- ext/libcharguess/cpp/COPYING
|
109
|
+
- ext/libcharguess/cpp/COPYRIGHT
|
110
|
+
- ext/libcharguess/cpp/ChangeLog
|
111
|
+
- ext/libcharguess/cpp/EUCJPProber.cpp
|
112
|
+
- ext/libcharguess/cpp/EUCJPProber.h
|
113
|
+
- ext/libcharguess/cpp/EUCKRFreq.tab
|
114
|
+
- ext/libcharguess/cpp/EUCKRProber.cpp
|
115
|
+
- ext/libcharguess/cpp/EUCKRProber.h
|
116
|
+
- ext/libcharguess/cpp/EUCTWFreq.tab
|
117
|
+
- ext/libcharguess/cpp/EUCTWProber.cpp
|
118
|
+
- ext/libcharguess/cpp/EUCTWProber.h
|
119
|
+
- ext/libcharguess/cpp/EscCharsetProber.cpp
|
120
|
+
- ext/libcharguess/cpp/EscCharsetProber.h
|
121
|
+
- ext/libcharguess/cpp/EscSM.cpp
|
122
|
+
- ext/libcharguess/cpp/GB2312Freq.tab
|
123
|
+
- ext/libcharguess/cpp/GB2312Prober.cpp
|
124
|
+
- ext/libcharguess/cpp/GB2312Prober.h
|
125
|
+
- ext/libcharguess/cpp/INSTALL
|
126
|
+
- ext/libcharguess/cpp/JISFreq.tab
|
127
|
+
- ext/libcharguess/cpp/LICENSE
|
128
|
+
- ext/libcharguess/cpp/LangBulgarianModel.cpp
|
129
|
+
- ext/libcharguess/cpp/LangCyrillicModel.cpp
|
130
|
+
- ext/libcharguess/cpp/LangGreekModel.cpp
|
131
|
+
- ext/libcharguess/cpp/LangHungarianModel.cpp
|
132
|
+
- ext/libcharguess/cpp/LangThaiModel.cpp
|
133
|
+
- ext/libcharguess/cpp/Latin1Prober.cpp
|
134
|
+
- ext/libcharguess/cpp/Latin1Prober.h
|
135
|
+
- ext/libcharguess/cpp/MBCSGroupProber.cpp
|
136
|
+
- ext/libcharguess/cpp/MBCSGroupProber.h
|
137
|
+
- ext/libcharguess/cpp/MBCSSM.cpp
|
138
|
+
- ext/libcharguess/cpp/Makefile.am
|
139
|
+
- ext/libcharguess/cpp/Makefile.in
|
140
|
+
- ext/libcharguess/cpp/NEWS
|
141
|
+
- ext/libcharguess/cpp/README
|
142
|
+
- ext/libcharguess/cpp/SBCSGroupProber.cpp
|
143
|
+
- ext/libcharguess/cpp/SBCSGroupProber.h
|
144
|
+
- ext/libcharguess/cpp/SBCharsetProber.cpp
|
145
|
+
- ext/libcharguess/cpp/SBCharsetProber.h
|
146
|
+
- ext/libcharguess/cpp/SJISProber.cpp
|
147
|
+
- ext/libcharguess/cpp/SJISProber.h
|
148
|
+
- ext/libcharguess/cpp/UTF8Prober.cpp
|
149
|
+
- ext/libcharguess/cpp/UTF8Prober.h
|
150
|
+
- ext/libcharguess/cpp/aclocal.m4
|
151
|
+
- ext/libcharguess/cpp/autogen.sh
|
152
|
+
- ext/libcharguess/cpp/big5Prober.cpp
|
153
|
+
- ext/libcharguess/cpp/big5Prober.h
|
154
|
+
- ext/libcharguess/cpp/charDistribution.cpp
|
155
|
+
- ext/libcharguess/cpp/charDistribution.h
|
156
|
+
- ext/libcharguess/cpp/charguess.cpp
|
157
|
+
- ext/libcharguess/cpp/charguess.h
|
158
|
+
- ext/libcharguess/cpp/charsetProber.h
|
159
|
+
- ext/libcharguess/cpp/codingStateMachine.h
|
160
|
+
- ext/libcharguess/cpp/config.h
|
161
|
+
- ext/libcharguess/cpp/config.h.in
|
162
|
+
- ext/libcharguess/cpp/config.status
|
163
|
+
- ext/libcharguess/cpp/configure
|
164
|
+
- ext/libcharguess/cpp/configure.in
|
165
|
+
- ext/libcharguess/cpp/depcomp
|
166
|
+
- ext/libcharguess/cpp/fix_copyright
|
167
|
+
- ext/libcharguess/cpp/install-sh
|
168
|
+
- ext/libcharguess/cpp/jpCntx.cpp
|
169
|
+
- ext/libcharguess/cpp/jpCntx.h
|
170
|
+
- ext/libcharguess/cpp/missing
|
171
|
+
- ext/libcharguess/cpp/mkinstalldirs
|
172
|
+
- ext/libcharguess/cpp/pkgInt.h
|
173
|
+
- ext/libcharguess/cpp/stamp-h1
|
174
|
+
- ext/libcharguess/cpp/test/test.cpp
|
175
|
+
- ext/libcharguess/cpp/types.h
|
176
|
+
- ext/libcharguess/cpp/universal.cpp
|
177
|
+
- ext/libcharguess/cpp/universal.h
|
178
|
+
- script/console
|
179
|
+
- script/destroy
|
180
|
+
- script/generate
|
181
|
+
- tasks/extconf.rake
|
182
|
+
- tasks/extconf/charguess.rake
|
183
|
+
- test/test_charguess.rb
|
184
|
+
- test/test_charguess_extn.rb
|
185
|
+
- test/test_helper.rb
|
186
|
+
has_rdoc: true
|
187
|
+
homepage: http://github.com/ernesto-jimenez/charguess
|
188
|
+
licenses: []
|
189
|
+
|
190
|
+
post_install_message: PostInstall.txt
|
191
|
+
rdoc_options:
|
192
|
+
- --main
|
193
|
+
- README.rdoc
|
194
|
+
require_paths:
|
195
|
+
- ext/charguess
|
196
|
+
- ext/libcharguess
|
197
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: "0"
|
202
|
+
version:
|
203
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
204
|
+
requirements:
|
205
|
+
- - ">="
|
206
|
+
- !ruby/object:Gem::Version
|
207
|
+
version: "0"
|
208
|
+
version:
|
209
|
+
requirements: []
|
210
|
+
|
211
|
+
rubyforge_project: charguess
|
212
|
+
rubygems_version: 1.3.5
|
213
|
+
signing_key:
|
214
|
+
specification_version: 3
|
215
|
+
summary: "This gem builds and installs libcharguess and its binding libcharguess-ruby * libcharguess: http://libcharguess.sourceforge.net/ * libcharguess-ruby: http://raa.ruby-lang.org/project/charguess/"
|
216
|
+
test_files:
|
217
|
+
- test/test_charguess.rb
|
218
|
+
- test/test_charguess_extn.rb
|
219
|
+
- test/test_helper.rb
|