charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "universal.h"
|
|
22
|
+
|
|
23
|
+
#include "MBCSGroupProber.h"
|
|
24
|
+
#include "SBCSGroupProber.h"
|
|
25
|
+
#include "EscCharsetProber.h"
|
|
26
|
+
#include "Latin1Prober.h"
|
|
27
|
+
|
|
28
|
+
// Builds a detector in its initial state: nothing seen yet, no charset
// decided, and no probers allocated (HandleData creates them on demand).
nsUniversalDetector::nsUniversalDetector()
  : mInputState(ePureAscii),
    mDone(PR_FALSE),
    mInTag(PR_FALSE),
    mStart(PR_TRUE),
    mGotData(PR_FALSE),
    mLastChar('\0'),
    mDetectedCharset(nsnull),
    mBestGuess(-1),              // illegal value as signal
    mEscCharSetProber(nsnull)
{
  // Every prober slot starts out empty.
  for (PRUint32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
    mCharSetProbers[slot] = nsnull;
}
|
|
45
|
+
|
|
46
|
+
// Frees every prober that was allocated by this detector.
nsUniversalDetector::~nsUniversalDetector()
{
  // Deleting a null pointer is a no-op, so empty slots need no guard.
  for (PRInt32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
    delete mCharSetProbers[slot];

  delete mEscCharSetProber;
}
|
|
54
|
+
|
|
55
|
+
void nsUniversalDetector::Reset()
|
|
56
|
+
{
|
|
57
|
+
mDone = PR_FALSE;
|
|
58
|
+
mBestGuess = -1; //illegal value as signal
|
|
59
|
+
mInTag = PR_FALSE;
|
|
60
|
+
|
|
61
|
+
mStart = PR_TRUE;
|
|
62
|
+
mDetectedCharset = nsnull;
|
|
63
|
+
mGotData = PR_FALSE;
|
|
64
|
+
mInputState = ePureAscii;
|
|
65
|
+
mLastChar = '\0';
|
|
66
|
+
|
|
67
|
+
if (mEscCharSetProber)
|
|
68
|
+
mEscCharSetProber->Reset();
|
|
69
|
+
|
|
70
|
+
PRUint32 i;
|
|
71
|
+
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
|
72
|
+
if (mCharSetProbers[i])
|
|
73
|
+
mCharSetProbers[i]->Reset();
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
//---------------------------------------------------------------------
|
|
77
|
+
#define SHORTCUT_THRESHOLD (float)0.95
|
|
78
|
+
#define MINIMUM_THRESHOLD (float)0.20
|
|
79
|
+
|
|
80
|
+
void nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|
81
|
+
{
|
|
82
|
+
if(mDone)
|
|
83
|
+
return;
|
|
84
|
+
|
|
85
|
+
if (aLen > 0)
|
|
86
|
+
mGotData = PR_TRUE;
|
|
87
|
+
|
|
88
|
+
//If the data starts with BOM, we know it is UTF
|
|
89
|
+
if (mStart)
|
|
90
|
+
{
|
|
91
|
+
mStart = PR_FALSE;
|
|
92
|
+
if (aLen > 3)
|
|
93
|
+
switch (aBuf[0])
|
|
94
|
+
{
|
|
95
|
+
case '\xEF':
|
|
96
|
+
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
|
97
|
+
// EF BB BF UTF-8 encoded BOM
|
|
98
|
+
mDetectedCharset = "UTF-8";
|
|
99
|
+
break;
|
|
100
|
+
case '\xFE':
|
|
101
|
+
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
102
|
+
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
|
103
|
+
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
|
|
104
|
+
else if ('\xFF' == aBuf[1])
|
|
105
|
+
// FE FF UTF-16, big endian BOM
|
|
106
|
+
mDetectedCharset = "UTF-16BE";
|
|
107
|
+
break;
|
|
108
|
+
case '\x00':
|
|
109
|
+
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
|
110
|
+
// 00 00 FE FF UTF-32, big-endian BOM
|
|
111
|
+
mDetectedCharset = "UTF-32BE";
|
|
112
|
+
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
|
113
|
+
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
|
114
|
+
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
|
|
115
|
+
break;
|
|
116
|
+
case '\xFF':
|
|
117
|
+
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
118
|
+
// FF FE 00 00 UTF-32, little-endian BOM
|
|
119
|
+
mDetectedCharset = "UTF-32LE";
|
|
120
|
+
else if ('\xFE' == aBuf[1])
|
|
121
|
+
// FF FE UTF-16, little endian BOM
|
|
122
|
+
mDetectedCharset = "UTF-16LE";
|
|
123
|
+
break;
|
|
124
|
+
} // switch
|
|
125
|
+
|
|
126
|
+
if (mDetectedCharset)
|
|
127
|
+
{
|
|
128
|
+
mDone = PR_TRUE;
|
|
129
|
+
return;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
PRUint32 i;
|
|
134
|
+
for (i = 0; i < aLen; i++)
|
|
135
|
+
{
|
|
136
|
+
//other than 0xa0, if every othe character is ascii, the page is ascii
|
|
137
|
+
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
|
|
138
|
+
{
|
|
139
|
+
//we got a non-ascii byte (high-byte)
|
|
140
|
+
if (mInputState != eHighbyte)
|
|
141
|
+
{
|
|
142
|
+
//adjust state
|
|
143
|
+
mInputState = eHighbyte;
|
|
144
|
+
|
|
145
|
+
//kill mEscCharSetProber if it is active
|
|
146
|
+
if (mEscCharSetProber) {
|
|
147
|
+
delete mEscCharSetProber;
|
|
148
|
+
mEscCharSetProber = nsnull;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
//start multibyte and singlebyte charset prober
|
|
152
|
+
if (nsnull == mCharSetProbers[0])
|
|
153
|
+
mCharSetProbers[0] = new nsMBCSGroupProber;
|
|
154
|
+
if (nsnull == mCharSetProbers[1])
|
|
155
|
+
mCharSetProbers[1] = new nsSBCSGroupProber;
|
|
156
|
+
if (nsnull == mCharSetProbers[2])
|
|
157
|
+
mCharSetProbers[2] = new nsLatin1Prober;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
else
|
|
161
|
+
{
|
|
162
|
+
//ok, just pure ascii so far
|
|
163
|
+
if ( ePureAscii == mInputState &&
|
|
164
|
+
(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
|
|
165
|
+
{
|
|
166
|
+
//found escape character or HZ "~{"
|
|
167
|
+
mInputState = eEscAscii;
|
|
168
|
+
}
|
|
169
|
+
mLastChar = aBuf[i];
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
nsProbingState st;
|
|
174
|
+
switch (mInputState)
|
|
175
|
+
{
|
|
176
|
+
case eEscAscii:
|
|
177
|
+
if (nsnull == mEscCharSetProber)
|
|
178
|
+
mEscCharSetProber = new nsEscCharSetProber;
|
|
179
|
+
st = mEscCharSetProber->HandleData(aBuf, aLen);
|
|
180
|
+
if (st == eFoundIt)
|
|
181
|
+
{
|
|
182
|
+
mDone = PR_TRUE;
|
|
183
|
+
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
|
184
|
+
}
|
|
185
|
+
break;
|
|
186
|
+
case eHighbyte:
|
|
187
|
+
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
|
188
|
+
{
|
|
189
|
+
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
|
190
|
+
if (st == eFoundIt)
|
|
191
|
+
{
|
|
192
|
+
mDone = PR_TRUE;
|
|
193
|
+
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
|
194
|
+
return;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
break;
|
|
198
|
+
|
|
199
|
+
default: //pure ascii
|
|
200
|
+
;//do nothing here
|
|
201
|
+
}
|
|
202
|
+
return ;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
//---------------------------------------------------------------------
|
|
207
|
+
void nsUniversalDetector::DataEnd()
|
|
208
|
+
{
|
|
209
|
+
if (!mGotData)
|
|
210
|
+
{
|
|
211
|
+
// we haven't got any data yet, return immediately
|
|
212
|
+
// caller program sometimes call DataEnd before anything has been sent to detector
|
|
213
|
+
return;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
if (mDetectedCharset)
|
|
217
|
+
{
|
|
218
|
+
mDone = PR_TRUE;
|
|
219
|
+
Report(mDetectedCharset);
|
|
220
|
+
return;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
switch (mInputState)
|
|
224
|
+
{
|
|
225
|
+
case eHighbyte:
|
|
226
|
+
{
|
|
227
|
+
float proberConfidence;
|
|
228
|
+
float maxProberConfidence = (float)0.0;
|
|
229
|
+
PRInt32 maxProber = 0;
|
|
230
|
+
|
|
231
|
+
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
|
232
|
+
{
|
|
233
|
+
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
|
234
|
+
#ifdef DEBUG_chardet
|
|
235
|
+
mCharSetProbers[i]->DumpStatus();
|
|
236
|
+
#endif
|
|
237
|
+
|
|
238
|
+
if (proberConfidence > maxProberConfidence)
|
|
239
|
+
{
|
|
240
|
+
maxProberConfidence = proberConfidence;
|
|
241
|
+
maxProber = i;
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
//do not report anything because we are not confident of it, that's in fact a negative answer
|
|
245
|
+
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
|
246
|
+
Report(mCharSetProbers[maxProber]->GetCharSetName());
|
|
247
|
+
}
|
|
248
|
+
break;
|
|
249
|
+
case eEscAscii:
|
|
250
|
+
break;
|
|
251
|
+
default:
|
|
252
|
+
;
|
|
253
|
+
}
|
|
254
|
+
return;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
void nsUniversalDetector::Report(const char* aCharset)
|
|
259
|
+
{
|
|
260
|
+
if (!mDone)
|
|
261
|
+
{
|
|
262
|
+
mDone = PR_TRUE;
|
|
263
|
+
mDetectedCharset = aCharset;
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
const char* nsUniversalDetector::GetCharset(void)
|
|
268
|
+
{
|
|
269
|
+
if (mDone == PR_TRUE)
|
|
270
|
+
return (mDetectedCharset);
|
|
271
|
+
else
|
|
272
|
+
return NULL;
|
|
273
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef universal_h__
|
|
22
|
+
#define universal_h__
|
|
23
|
+
|
|
24
|
+
#include "types.h"
|
|
25
|
+
|
|
26
|
+
// #include "nsICharsetDetector.h"
|
|
27
|
+
// #include "nsIStringCharsetDetector.h"
|
|
28
|
+
// #include "nsICharsetDetectionObserver.h"
|
|
29
|
+
|
|
30
|
+
class nsCharSetProber;
|
|
31
|
+
|
|
32
|
+
#define NUM_OF_CHARSET_PROBERS 3
|
|
33
|
+
|
|
34
|
+
// Classification of the input seen so far by nsUniversalDetector.
// Enumerators take the implicit values 0, 1 and 2, in this order.
typedef enum {
  ePureAscii,   // 0
  eEscAscii,    // 1
  eHighbyte     // 2
} nsInputState;
|
|
39
|
+
|
|
40
|
+
class nsUniversalDetector {
|
|
41
|
+
public:
|
|
42
|
+
nsUniversalDetector();
|
|
43
|
+
virtual ~nsUniversalDetector();
|
|
44
|
+
virtual void HandleData(const char* aBuf, PRUint32 aLen);
|
|
45
|
+
virtual void DataEnd(void);
|
|
46
|
+
virtual void Reset();
|
|
47
|
+
virtual const char* GetCharset(void);
|
|
48
|
+
|
|
49
|
+
protected:
|
|
50
|
+
virtual void Report(const char* aCharset);
|
|
51
|
+
nsInputState mInputState;
|
|
52
|
+
PRBool mDone;
|
|
53
|
+
PRBool mInTag;
|
|
54
|
+
PRBool mStart;
|
|
55
|
+
PRBool mGotData;
|
|
56
|
+
char mLastChar;
|
|
57
|
+
const char * mDetectedCharset;
|
|
58
|
+
PRInt32 mBestGuess;
|
|
59
|
+
|
|
60
|
+
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
|
61
|
+
nsCharSetProber *mEscCharSetProber;
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
#endif
|
|
65
|
+
|
data/script/console
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
# Replaces this process with an IRB session that has tab completion loaded.

# On mswin/mingw Ruby builds the launcher is invoked as irb.bat.
# (?:...) is a non-capturing group; the earlier "(:?" spelling was a typo
# that matched an optional literal colon instead.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
puts "Loading charguess gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script for this project.

# Project root: the parent of the directory containing this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

begin
  require 'rubigen'
rescue LoadError
  # rubigen was not directly loadable; retry after loading RubyGems.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading --help / -h argument is dropped before the arguments are passed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script for this project.

# Project root: the parent of the directory containing this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

begin
  require 'rubigen'
rescue LoadError
  # rubigen was not directly loadable; retry after loading RubyGems.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading --help / -h argument is dropped before the arguments are passed on.
ARGV.shift if %w[--help -h].include?(ARGV.first)

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources!(component_sources)
RubiGen::Scripts::Generate.new.run(ARGV)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# RbConfig supplies the platform's shared-library extension (DLEXT).
require 'rbconfig'

# Rake tasks that build the bundled libcharguess library and the Ruby
# extension that binds to it.
namespace :extconf do
  # The extension is named after this rake file.
  extension = File.basename(__FILE__, '.rake')

  ext = "ext/#{extension}"
  # RbConfig is used instead of the bare Config constant, which current
  # Rubies no longer define.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]

  # Fails loudly when the build produced no object or library file.
  task :compile => extension do
    # "bundle" is included so this check covers the same library suffixes
    # that the failed-build cleanup below removes.
    if Dir.glob("**/#{extension}.{o,so,dll,bundle}").empty?
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build.  Your system is"
      STDERR.puts "NOT configured properly to build charguess."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # Builds libcharguess first, then generates the extension's Makefile.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir("ext/libcharguess/cpp") do
      sh("./configure")
      sh("make")
    end
    Dir.chdir(ext) { ruby "extconf.rb" }
  end

  file ext_so => ext_files do
    Dir.chdir(ext) do
      # RUBY_PLATFORM replaces the PLATFORM constant, which current Rubies
      # no longer define. nmake is chosen for mswin builds; matching
      # "mswin" also covers 64-bit platform strings.
      make_program = RUBY_PLATFORM =~ /mswin/ ? 'nmake' : 'make'
      sh(make_program) do |ok, _res|
        unless ok
          # Remove partial build products left behind by the failed build.
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
|
data/tasks/extconf.rake
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Hooks extension compilation into the top-level tasks and the Hoe spec.
namespace :extconf do
  desc "Compiles the Ruby extension"
  task :compile
end

# The top-level compile task delegates to the namespaced one, and the
# tests depend on a compiled extension.
task :compile => ["extconf:compile"]
task :test => [:compile]

# Glob suffix covering the binary build products to clean.
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"

$hoe.clean_globs |= %W[ext/**/#{BIN} lib/**/#{BIN} ext/**/Makefile]
$hoe.spec.require_paths = Dir.glob('{lib,ext/*}')
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|
data/test/test_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: charguess
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: "1.0"
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- "Ernesto Jim\xC3\xA9nez"
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
|
|
12
|
+
date: 2009-12-09 00:00:00 +01:00
|
|
13
|
+
default_executable:
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: newgem
|
|
17
|
+
type: :development
|
|
18
|
+
version_requirement:
|
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
- - ">="
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: 1.2.1
|
|
24
|
+
version:
|
|
25
|
+
- !ruby/object:Gem::Dependency
|
|
26
|
+
name: hoe
|
|
27
|
+
type: :development
|
|
28
|
+
version_requirement:
|
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 1.8.0
|
|
34
|
+
version:
|
|
35
|
+
description: |-
|
|
36
|
+
This gem builds and installs libcharguess and it's binding libcharguess-ruby
|
|
37
|
+
|
|
38
|
+
* libcharguess: http://libcharguess.sourceforge.net/
|
|
39
|
+
* libcharguess-ruby: http://raa.ruby-lang.org/project/charguess/
|
|
40
|
+
email:
|
|
41
|
+
- erjica@gmail.com
|
|
42
|
+
executables: []
|
|
43
|
+
|
|
44
|
+
extensions:
|
|
45
|
+
- ext/charguess/extconf.rb
|
|
46
|
+
extra_rdoc_files:
|
|
47
|
+
- History.txt
|
|
48
|
+
- Manifest.txt
|
|
49
|
+
- PostInstall.txt
|
|
50
|
+
- README.rdoc
|
|
51
|
+
files:
|
|
52
|
+
- History.txt
|
|
53
|
+
- Manifest.txt
|
|
54
|
+
- PostInstall.txt
|
|
55
|
+
- README.rdoc
|
|
56
|
+
- Rakefile
|
|
57
|
+
- ext/charguess/charguess.c
|
|
58
|
+
- ext/charguess/extconf.rb
|
|
59
|
+
- ext/libcharguess/cpp/.deps/EUCJPProber.Plo
|
|
60
|
+
- ext/libcharguess/cpp/.deps/EUCJPProber.Po
|
|
61
|
+
- ext/libcharguess/cpp/.deps/EUCKRProber.Plo
|
|
62
|
+
- ext/libcharguess/cpp/.deps/EUCKRProber.Po
|
|
63
|
+
- ext/libcharguess/cpp/.deps/EUCTWProber.Plo
|
|
64
|
+
- ext/libcharguess/cpp/.deps/EUCTWProber.Po
|
|
65
|
+
- ext/libcharguess/cpp/.deps/EscCharsetProber.Plo
|
|
66
|
+
- ext/libcharguess/cpp/.deps/EscCharsetProber.Po
|
|
67
|
+
- ext/libcharguess/cpp/.deps/EscSM.Plo
|
|
68
|
+
- ext/libcharguess/cpp/.deps/EscSM.Po
|
|
69
|
+
- ext/libcharguess/cpp/.deps/GB2312Prober.Plo
|
|
70
|
+
- ext/libcharguess/cpp/.deps/GB2312Prober.Po
|
|
71
|
+
- ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo
|
|
72
|
+
- ext/libcharguess/cpp/.deps/LangBulgarianModel.Po
|
|
73
|
+
- ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo
|
|
74
|
+
- ext/libcharguess/cpp/.deps/LangCyrillicModel.Po
|
|
75
|
+
- ext/libcharguess/cpp/.deps/LangGreekModel.Plo
|
|
76
|
+
- ext/libcharguess/cpp/.deps/LangGreekModel.Po
|
|
77
|
+
- ext/libcharguess/cpp/.deps/LangHungarianModel.Plo
|
|
78
|
+
- ext/libcharguess/cpp/.deps/LangHungarianModel.Po
|
|
79
|
+
- ext/libcharguess/cpp/.deps/LangThaiModel.Plo
|
|
80
|
+
- ext/libcharguess/cpp/.deps/LangThaiModel.Po
|
|
81
|
+
- ext/libcharguess/cpp/.deps/Latin1Prober.Plo
|
|
82
|
+
- ext/libcharguess/cpp/.deps/Latin1Prober.Po
|
|
83
|
+
- ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo
|
|
84
|
+
- ext/libcharguess/cpp/.deps/MBCSGroupProber.Po
|
|
85
|
+
- ext/libcharguess/cpp/.deps/MBCSSM.Plo
|
|
86
|
+
- ext/libcharguess/cpp/.deps/MBCSSM.Po
|
|
87
|
+
- ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo
|
|
88
|
+
- ext/libcharguess/cpp/.deps/SBCSGroupProber.Po
|
|
89
|
+
- ext/libcharguess/cpp/.deps/SBCharsetProber.Plo
|
|
90
|
+
- ext/libcharguess/cpp/.deps/SBCharsetProber.Po
|
|
91
|
+
- ext/libcharguess/cpp/.deps/SJISProber.Plo
|
|
92
|
+
- ext/libcharguess/cpp/.deps/SJISProber.Po
|
|
93
|
+
- ext/libcharguess/cpp/.deps/UTF8Prober.Plo
|
|
94
|
+
- ext/libcharguess/cpp/.deps/UTF8Prober.Po
|
|
95
|
+
- ext/libcharguess/cpp/.deps/big5Prober.Plo
|
|
96
|
+
- ext/libcharguess/cpp/.deps/big5Prober.Po
|
|
97
|
+
- ext/libcharguess/cpp/.deps/charDistribution.Plo
|
|
98
|
+
- ext/libcharguess/cpp/.deps/charDistribution.Po
|
|
99
|
+
- ext/libcharguess/cpp/.deps/chardet.Plo
|
|
100
|
+
- ext/libcharguess/cpp/.deps/chardet.Po
|
|
101
|
+
- ext/libcharguess/cpp/.deps/charguess.Po
|
|
102
|
+
- ext/libcharguess/cpp/.deps/jpCntx.Plo
|
|
103
|
+
- ext/libcharguess/cpp/.deps/jpCntx.Po
|
|
104
|
+
- ext/libcharguess/cpp/.deps/universal.Plo
|
|
105
|
+
- ext/libcharguess/cpp/.deps/universal.Po
|
|
106
|
+
- ext/libcharguess/cpp/AUTHORS
|
|
107
|
+
- ext/libcharguess/cpp/Big5Freq.tab
|
|
108
|
+
- ext/libcharguess/cpp/COPYING
|
|
109
|
+
- ext/libcharguess/cpp/COPYRIGHT
|
|
110
|
+
- ext/libcharguess/cpp/ChangeLog
|
|
111
|
+
- ext/libcharguess/cpp/EUCJPProber.cpp
|
|
112
|
+
- ext/libcharguess/cpp/EUCJPProber.h
|
|
113
|
+
- ext/libcharguess/cpp/EUCKRFreq.tab
|
|
114
|
+
- ext/libcharguess/cpp/EUCKRProber.cpp
|
|
115
|
+
- ext/libcharguess/cpp/EUCKRProber.h
|
|
116
|
+
- ext/libcharguess/cpp/EUCTWFreq.tab
|
|
117
|
+
- ext/libcharguess/cpp/EUCTWProber.cpp
|
|
118
|
+
- ext/libcharguess/cpp/EUCTWProber.h
|
|
119
|
+
- ext/libcharguess/cpp/EscCharsetProber.cpp
|
|
120
|
+
- ext/libcharguess/cpp/EscCharsetProber.h
|
|
121
|
+
- ext/libcharguess/cpp/EscSM.cpp
|
|
122
|
+
- ext/libcharguess/cpp/GB2312Freq.tab
|
|
123
|
+
- ext/libcharguess/cpp/GB2312Prober.cpp
|
|
124
|
+
- ext/libcharguess/cpp/GB2312Prober.h
|
|
125
|
+
- ext/libcharguess/cpp/INSTALL
|
|
126
|
+
- ext/libcharguess/cpp/JISFreq.tab
|
|
127
|
+
- ext/libcharguess/cpp/LICENSE
|
|
128
|
+
- ext/libcharguess/cpp/LangBulgarianModel.cpp
|
|
129
|
+
- ext/libcharguess/cpp/LangCyrillicModel.cpp
|
|
130
|
+
- ext/libcharguess/cpp/LangGreekModel.cpp
|
|
131
|
+
- ext/libcharguess/cpp/LangHungarianModel.cpp
|
|
132
|
+
- ext/libcharguess/cpp/LangThaiModel.cpp
|
|
133
|
+
- ext/libcharguess/cpp/Latin1Prober.cpp
|
|
134
|
+
- ext/libcharguess/cpp/Latin1Prober.h
|
|
135
|
+
- ext/libcharguess/cpp/MBCSGroupProber.cpp
|
|
136
|
+
- ext/libcharguess/cpp/MBCSGroupProber.h
|
|
137
|
+
- ext/libcharguess/cpp/MBCSSM.cpp
|
|
138
|
+
- ext/libcharguess/cpp/Makefile.am
|
|
139
|
+
- ext/libcharguess/cpp/Makefile.in
|
|
140
|
+
- ext/libcharguess/cpp/NEWS
|
|
141
|
+
- ext/libcharguess/cpp/README
|
|
142
|
+
- ext/libcharguess/cpp/SBCSGroupProber.cpp
|
|
143
|
+
- ext/libcharguess/cpp/SBCSGroupProber.h
|
|
144
|
+
- ext/libcharguess/cpp/SBCharsetProber.cpp
|
|
145
|
+
- ext/libcharguess/cpp/SBCharsetProber.h
|
|
146
|
+
- ext/libcharguess/cpp/SJISProber.cpp
|
|
147
|
+
- ext/libcharguess/cpp/SJISProber.h
|
|
148
|
+
- ext/libcharguess/cpp/UTF8Prober.cpp
|
|
149
|
+
- ext/libcharguess/cpp/UTF8Prober.h
|
|
150
|
+
- ext/libcharguess/cpp/aclocal.m4
|
|
151
|
+
- ext/libcharguess/cpp/autogen.sh
|
|
152
|
+
- ext/libcharguess/cpp/big5Prober.cpp
|
|
153
|
+
- ext/libcharguess/cpp/big5Prober.h
|
|
154
|
+
- ext/libcharguess/cpp/charDistribution.cpp
|
|
155
|
+
- ext/libcharguess/cpp/charDistribution.h
|
|
156
|
+
- ext/libcharguess/cpp/charguess.cpp
|
|
157
|
+
- ext/libcharguess/cpp/charguess.h
|
|
158
|
+
- ext/libcharguess/cpp/charsetProber.h
|
|
159
|
+
- ext/libcharguess/cpp/codingStateMachine.h
|
|
160
|
+
- ext/libcharguess/cpp/config.h
|
|
161
|
+
- ext/libcharguess/cpp/config.h.in
|
|
162
|
+
- ext/libcharguess/cpp/config.status
|
|
163
|
+
- ext/libcharguess/cpp/configure
|
|
164
|
+
- ext/libcharguess/cpp/configure.in
|
|
165
|
+
- ext/libcharguess/cpp/depcomp
|
|
166
|
+
- ext/libcharguess/cpp/fix_copyright
|
|
167
|
+
- ext/libcharguess/cpp/install-sh
|
|
168
|
+
- ext/libcharguess/cpp/jpCntx.cpp
|
|
169
|
+
- ext/libcharguess/cpp/jpCntx.h
|
|
170
|
+
- ext/libcharguess/cpp/missing
|
|
171
|
+
- ext/libcharguess/cpp/mkinstalldirs
|
|
172
|
+
- ext/libcharguess/cpp/pkgInt.h
|
|
173
|
+
- ext/libcharguess/cpp/stamp-h1
|
|
174
|
+
- ext/libcharguess/cpp/test/test.cpp
|
|
175
|
+
- ext/libcharguess/cpp/types.h
|
|
176
|
+
- ext/libcharguess/cpp/universal.cpp
|
|
177
|
+
- ext/libcharguess/cpp/universal.h
|
|
178
|
+
- script/console
|
|
179
|
+
- script/destroy
|
|
180
|
+
- script/generate
|
|
181
|
+
- tasks/extconf.rake
|
|
182
|
+
- tasks/extconf/charguess.rake
|
|
183
|
+
- test/test_charguess.rb
|
|
184
|
+
- test/test_charguess_extn.rb
|
|
185
|
+
- test/test_helper.rb
|
|
186
|
+
has_rdoc: true
|
|
187
|
+
homepage: http://github.com/ernesto-jimenez/charguess
|
|
188
|
+
licenses: []
|
|
189
|
+
|
|
190
|
+
post_install_message: PostInstall.txt
|
|
191
|
+
rdoc_options:
|
|
192
|
+
- --main
|
|
193
|
+
- README.rdoc
|
|
194
|
+
require_paths:
|
|
195
|
+
- ext/charguess
|
|
196
|
+
- ext/libcharguess
|
|
197
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
198
|
+
requirements:
|
|
199
|
+
- - ">="
|
|
200
|
+
- !ruby/object:Gem::Version
|
|
201
|
+
version: "0"
|
|
202
|
+
version:
|
|
203
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
204
|
+
requirements:
|
|
205
|
+
- - ">="
|
|
206
|
+
- !ruby/object:Gem::Version
|
|
207
|
+
version: "0"
|
|
208
|
+
version:
|
|
209
|
+
requirements: []
|
|
210
|
+
|
|
211
|
+
rubyforge_project: charguess
|
|
212
|
+
rubygems_version: 1.3.5
|
|
213
|
+
signing_key:
|
|
214
|
+
specification_version: 3
|
|
215
|
+
summary: "This gem builds and installs libcharguess and it's binding libcharguess-ruby * libcharguess: http://libcharguess.sourceforge.net/ * libcharguess-ruby: http://raa.ruby-lang.org/project/charguess/"
|
|
216
|
+
test_files:
|
|
217
|
+
- test/test_charguess.rb
|
|
218
|
+
- test/test_charguess_extn.rb
|
|
219
|
+
- test/test_helper.rb
|