charguess 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +134 -0
- data/PostInstall.txt +7 -0
- data/README.rdoc +26 -0
- data/Rakefile +25 -0
- data/ext/charguess/charguess.c +29 -0
- data/ext/charguess/extconf.rb +11 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
- data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
- data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
- data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
- data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
- data/ext/libcharguess/cpp/AUTHORS +3 -0
- data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
- data/ext/libcharguess/cpp/COPYING +340 -0
- data/ext/libcharguess/cpp/COPYRIGHT +20 -0
- data/ext/libcharguess/cpp/ChangeLog +0 -0
- data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
- data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
- data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
- data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
- data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
- data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
- data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
- data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
- data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
- data/ext/libcharguess/cpp/EscSM.cpp +244 -0
- data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
- data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
- data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
- data/ext/libcharguess/cpp/INSTALL +229 -0
- data/ext/libcharguess/cpp/JISFreq.tab +574 -0
- data/ext/libcharguess/cpp/LICENSE +504 -0
- data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
- data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
- data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
- data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
- data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
- data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
- data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
- data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
- data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
- data/ext/libcharguess/cpp/Makefile.am +45 -0
- data/ext/libcharguess/cpp/Makefile.in +608 -0
- data/ext/libcharguess/cpp/NEWS +0 -0
- data/ext/libcharguess/cpp/README +0 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
- data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
- data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
- data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
- data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
- data/ext/libcharguess/cpp/SJISProber.h +60 -0
- data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
- data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
- data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
- data/ext/libcharguess/cpp/autogen.sh +153 -0
- data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
- data/ext/libcharguess/cpp/big5Prober.h +53 -0
- data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
- data/ext/libcharguess/cpp/charDistribution.h +219 -0
- data/ext/libcharguess/cpp/charguess.cpp +56 -0
- data/ext/libcharguess/cpp/charguess.h +23 -0
- data/ext/libcharguess/cpp/charsetProber.h +50 -0
- data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
- data/ext/libcharguess/cpp/config.h +36 -0
- data/ext/libcharguess/cpp/config.h.in +35 -0
- data/ext/libcharguess/cpp/config.status +1075 -0
- data/ext/libcharguess/cpp/configure +5226 -0
- data/ext/libcharguess/cpp/configure.in +49 -0
- data/ext/libcharguess/cpp/depcomp +472 -0
- data/ext/libcharguess/cpp/fix_copyright +32 -0
- data/ext/libcharguess/cpp/install-sh +294 -0
- data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
- data/ext/libcharguess/cpp/jpCntx.h +100 -0
- data/ext/libcharguess/cpp/missing +336 -0
- data/ext/libcharguess/cpp/mkinstalldirs +111 -0
- data/ext/libcharguess/cpp/pkgInt.h +72 -0
- data/ext/libcharguess/cpp/stamp-h1 +1 -0
- data/ext/libcharguess/cpp/test/test.cpp +78 -0
- data/ext/libcharguess/cpp/types.h +41 -0
- data/ext/libcharguess/cpp/universal.cpp +273 -0
- data/ext/libcharguess/cpp/universal.h +65 -0
- data/script/console +9 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf/charguess.rake +47 -0
- data/tasks/extconf.rake +13 -0
- data/test/test_charguess.rb +7 -0
- data/test/test_charguess_extn.rb +10 -0
- data/test/test_helper.rb +3 -0
- metadata +219 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "EUCTWProber.h"
|
|
22
|
+
|
|
23
|
+
void nsEUCTWProber::Reset(void)
|
|
24
|
+
{
|
|
25
|
+
mCodingSM->Reset();
|
|
26
|
+
mState = eDetecting;
|
|
27
|
+
mDistributionAnalyser.Reset();
|
|
28
|
+
//mContextAnalyser.Reset();
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
// Feeds aLen bytes from aBuf through the EUC-TW coding state machine and
// updates the probing state.
//
//   aBuf  - bytes to examine; only aBuf[0 .. aLen-1] are read.
//   aLen  - number of bytes in aBuf; may be 0.
//
// Returns the resulting probing state (also stored in mState):
//   eNotMe    - the state machine reported eError for some byte;
//   eFoundIt  - the state machine reported eItsMe, or the distribution
//               analyser has enough data and the confidence exceeds
//               SHORTCUT_THRESHOLD;
//   otherwise the state is left unchanged.
//
// The last byte of the buffer is remembered in mLastChar[0] so that a
// character completed by the first byte of the next call can be handed to
// the distribution analyser together with its preceding byte.
nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  for (PRUint32 i = 0; i < aLen; i++)
  {
    nsSMState codingState = mCodingSM->NextState(aBuf[i]);
    if (codingState == eError)
    {
      mState = eNotMe;
      break;
    }
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    if (codingState == eStart)
    {
      // Back at the start state: a complete character has just ended at i.
      PRUint32 charLen = mCodingSM->GetCurrentCharLen();

      if (i == 0)
      {
        // The byte before aBuf[0] belongs to the previous call; it was
        // saved in mLastChar[0], so complete the pair in mLastChar.
        mLastChar[1] = aBuf[0];
        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
      }
      else
      {
        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
      }
    }
  }

  // With an empty buffer there is no last byte to remember; aLen is
  // unsigned, so aLen-1 would wrap and index far outside aBuf.
  if (aLen > 0)
  {
    mLastChar[0] = aBuf[aLen-1];
  }

  if (mState == eDetecting)
  {
    if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
    {
      mState = eFoundIt;
    }
  }

  return mState;
}
|
|
72
|
+
|
|
73
|
+
float nsEUCTWProber::GetConfidence(void)
|
|
74
|
+
{
|
|
75
|
+
float distribCf = mDistributionAnalyser.GetConfidence();
|
|
76
|
+
|
|
77
|
+
return (float)distribCf;
|
|
78
|
+
}
|
|
79
|
+
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsEUCTWProber_h__
|
|
22
|
+
#define nsEUCTWProber_h__
|
|
23
|
+
|
|
24
|
+
#include "charsetProber.h"
|
|
25
|
+
#include "codingStateMachine.h"
|
|
26
|
+
#include "charDistribution.h"
|
|
27
|
+
|
|
28
|
+
class nsEUCTWProber: public nsCharSetProber {
|
|
29
|
+
public:
|
|
30
|
+
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
|
31
|
+
Reset();};
|
|
32
|
+
virtual ~nsEUCTWProber(void){delete mCodingSM;};
|
|
33
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
34
|
+
const char* GetCharSetName() {return "x-euc-tw";};
|
|
35
|
+
nsProbingState GetState(void) {return mState;};
|
|
36
|
+
void Reset(void);
|
|
37
|
+
float GetConfidence(void);
|
|
38
|
+
void SetOpion() {};
|
|
39
|
+
|
|
40
|
+
protected:
|
|
41
|
+
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
|
42
|
+
|
|
43
|
+
nsCodingStateMachine* mCodingSM;
|
|
44
|
+
nsProbingState mState;
|
|
45
|
+
|
|
46
|
+
//EUCTWContextAnalysis mContextAnalyser;
|
|
47
|
+
EUCTWDistributionAnalysis mDistributionAnalyser;
|
|
48
|
+
char mLastChar[2];
|
|
49
|
+
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
#endif /* nsEUCTWProber_h__ */
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "EscCharsetProber.h"
|
|
22
|
+
|
|
23
|
+
// Builds a prober with one coding state machine per escape-based charset
// model (HZ, ISO-2022-CN, ISO-2022-JP, ISO-2022-KR), all of them active,
// and no charset detected yet.
nsEscCharSetProber::nsEscCharSetProber(void)
{
  mState           = eDetecting;
  mDetectedCharset = nsnull;
  mActiveSM        = NUM_OF_ESC_CHARSETS;

  mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
  mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
  mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
  mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
}
|
|
33
|
+
|
|
34
|
+
// Destroys every coding state machine allocated by the constructor.
nsEscCharSetProber::~nsEscCharSetProber(void)
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_ESC_CHARSETS)
  {
    delete mCodingSM[slot];
    ++slot;
  }
}
|
|
39
|
+
|
|
40
|
+
void nsEscCharSetProber::Reset(void)
|
|
41
|
+
{
|
|
42
|
+
mState = eDetecting;
|
|
43
|
+
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
|
44
|
+
mCodingSM[i]->Reset();
|
|
45
|
+
mActiveSM = NUM_OF_ESC_CHARSETS;
|
|
46
|
+
mDetectedCharset = nsnull;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// Feeds each byte of aBuf to every still-active coding state machine.
//
//   aBuf  - bytes to examine; aBuf[0 .. aLen-1] are read.
//   aLen  - number of bytes in aBuf.
//
// Active machines occupy mCodingSM[0 .. mActiveSM-1]. A machine that
// reports eError is retired by swapping it into the slot just past the
// active range. Returns eFoundIt as soon as one machine reports eItsMe
// (recording its name in mDetectedCharset), eNotMe once no machine is left
// active, and otherwise the unchanged state.
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  for (PRUint32 pos = 0; pos < aLen && mState == eDetecting; ++pos)
  {
    // Walk the active machines from the highest slot down, so a machine
    // swapped into the current slot has already seen this byte.
    PRInt32 slot = mActiveSM-1;
    for (; slot >= 0; --slot)
    {
      nsSMState verdict = mCodingSM[slot]->NextState(aBuf[pos]);

      if (verdict == eItsMe)
      {
        // Positive answer: report the charset of the matching machine.
        mState = eFoundIt;
        mDetectedCharset = mCodingSM[slot]->GetCodingStateMachine();
        return mState;
      }

      if (verdict != eError)
        continue;

      // Negative answer for this machine: shrink the active range.
      mActiveSM--;
      if (mActiveSM == 0)
      {
        mState = eNotMe;
        return mState;
      }

      if (slot != (PRInt32)mActiveSM)
      {
        // Exchange the failed machine with the last active one.
        nsCodingStateMachine* retired = mCodingSM[slot];
        mCodingSM[slot] = mCodingSM[mActiveSM];
        mCodingSM[mActiveSM] = retired;
      }
    }
  }

  return mState;
}
|
|
89
|
+
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef nsEscCharSetProber_h__
|
|
22
|
+
#define nsEscCharSetProber_h__
|
|
23
|
+
|
|
24
|
+
#include "charsetProber.h"
|
|
25
|
+
#include "codingStateMachine.h"
|
|
26
|
+
|
|
27
|
+
#define NUM_OF_ESC_CHARSETS 4
|
|
28
|
+
|
|
29
|
+
class nsEscCharSetProber: public nsCharSetProber {
|
|
30
|
+
public:
|
|
31
|
+
nsEscCharSetProber(void);
|
|
32
|
+
virtual ~nsEscCharSetProber(void);
|
|
33
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
34
|
+
const char* GetCharSetName() {return mDetectedCharset;};
|
|
35
|
+
nsProbingState GetState(void) {return mState;};
|
|
36
|
+
void Reset(void);
|
|
37
|
+
float GetConfidence(void){return (float)0.99;};
|
|
38
|
+
void SetOpion() {};
|
|
39
|
+
|
|
40
|
+
protected:
|
|
41
|
+
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
|
42
|
+
|
|
43
|
+
nsCodingStateMachine* mCodingSM[NUM_OF_ESC_CHARSETS] ;
|
|
44
|
+
PRUint32 mActiveSM;
|
|
45
|
+
nsProbingState mState;
|
|
46
|
+
const char * mDetectedCharset;
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
#endif /* nsEscCharSetProber_h__ */
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/*
|
|
2
|
+
libcharguess - Guess the encoding/charset of a string
|
|
3
|
+
Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
|
|
4
|
+
Based on Mozilla sources
|
|
5
|
+
|
|
6
|
+
This library is free software; you can redistribute it and/or
|
|
7
|
+
modify it under the terms of the GNU Lesser General Public
|
|
8
|
+
License as published by the Free Software Foundation; either
|
|
9
|
+
version 2.1 of the License, or (at your option) any later version.
|
|
10
|
+
|
|
11
|
+
This library is distributed in the hope that it will be useful,
|
|
12
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14
|
+
Lesser General Public License for more details.
|
|
15
|
+
|
|
16
|
+
You should have received a copy of the GNU Lesser General Public
|
|
17
|
+
License along with this library; if not, write to the Free Software
|
|
18
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include "codingStateMachine.h"
|
|
22
|
+
|
|
23
|
+
/*
 * Byte -> character-class table for the HZ model, packed eight 4-bit
 * entries per word (256 bytes / 8 = 32 words). Each row is labelled with
 * the byte range it covers; entries are assumed to be listed in ascending
 * byte order within a row.
 * Non-zero classes per those labels: 0x00 -> 1, 0x1b -> 1, 0x7b -> 4,
 * 0x7d -> 5, 0x7e -> 2, 0x80..0xff -> 1.
 */
static PRUint32 HZ_cls[ 256 / 8 ] = {
  PCK4BITS(1,0,0,0,0,0,0,0),  /* 0x00 - 0x07 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x08 - 0x0f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x10 - 0x17 */
  PCK4BITS(0,0,0,1,0,0,0,0),  /* 0x18 - 0x1f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x20 - 0x27 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x28 - 0x2f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x30 - 0x37 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x38 - 0x3f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x40 - 0x47 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x48 - 0x4f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x50 - 0x57 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x58 - 0x5f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x60 - 0x67 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x68 - 0x6f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x70 - 0x77 */
  PCK4BITS(0,0,0,4,0,5,2,0),  /* 0x78 - 0x7f */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0x80 - 0x87 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0x88 - 0x8f */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0x90 - 0x97 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0x98 - 0x9f */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xa0 - 0xa7 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xa8 - 0xaf */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xb0 - 0xb7 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xb8 - 0xbf */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xc0 - 0xc7 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xc8 - 0xcf */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xd0 - 0xd7 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xd8 - 0xdf */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xe0 - 0xe7 */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xe8 - 0xef */
  PCK4BITS(1,1,1,1,1,1,1,1),  /* 0xf0 - 0xf7 */
  PCK4BITS(1,1,1,1,1,1,1,1)   /* 0xf8 - 0xff */
};
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
/*
 * State-transition table for the HZ model: 48 packed 4-bit entries in six
 * words. Entries are eStart, eError, eItsMe or a numbered intermediate
 * state (3..6). Row labels give the entry index range.
 * NOTE(review): presumably indexed by state * class factor + class -
 * confirm against nsCodingStateMachine::NextState.
 */
static PRUint32 HZ_st [ 6] = {
  PCK4BITS(eStart, eError, 3,      eStart, eStart, eStart, eError, eError),  /* 0x00 - 0x07 */
  PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe),  /* 0x08 - 0x0f */
  PCK4BITS(eItsMe, eItsMe, eError, eError, eStart, eStart, 4,      eError),  /* 0x10 - 0x17 */
  PCK4BITS(5,      eError, 6,      eError, 5,      5,      4,      eError),  /* 0x18 - 0x1f */
  PCK4BITS(4,      eError, 4,      4,      4,      eError, 4,      eError),  /* 0x20 - 0x27 */
  PCK4BITS(4,      eItsMe, eStart, eStart, eStart, eStart, eStart, eStart)   /* 0x28 - 0x2f */
};
|
|
67
|
+
|
|
68
|
+
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
|
69
|
+
|
|
70
|
+
SMModel HZSMModel = {
|
|
71
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
|
|
72
|
+
6,
|
|
73
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
|
|
74
|
+
HZCharLenTable,
|
|
75
|
+
"HZ-GB-2312",
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
/*
 * Byte -> character-class table for the ISO-2022-CN model, packed eight
 * 4-bit entries per word (32 words). Each row is labelled with the byte
 * range it covers; entries are assumed to be listed in ascending byte
 * order within a row.
 * Non-zero classes per those labels: 0x00 -> 2, 0x1b -> 1, 0x29 -> 3,
 * 0x43 -> 4, 0x80..0xff -> 2.
 */
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
  PCK4BITS(2,0,0,0,0,0,0,0),  /* 0x00 - 0x07 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x08 - 0x0f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x10 - 0x17 */
  PCK4BITS(0,0,0,1,0,0,0,0),  /* 0x18 - 0x1f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x20 - 0x27 */
  PCK4BITS(0,3,0,0,0,0,0,0),  /* 0x28 - 0x2f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x30 - 0x37 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x38 - 0x3f */
  PCK4BITS(0,0,0,4,0,0,0,0),  /* 0x40 - 0x47 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x48 - 0x4f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x50 - 0x57 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x58 - 0x5f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x60 - 0x67 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x68 - 0x6f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x70 - 0x77 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x78 - 0x7f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x80 - 0x87 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x88 - 0x8f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x90 - 0x97 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x98 - 0x9f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa0 - 0xa7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa8 - 0xaf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb0 - 0xb7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb8 - 0xbf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc0 - 0xc7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc8 - 0xcf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd0 - 0xd7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd8 - 0xdf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe0 - 0xe7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe8 - 0xef */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xf0 - 0xf7 */
  PCK4BITS(2,2,2,2,2,2,2,2)   /* 0xf8 - 0xff */
};
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
/*
 * State-transition table for the ISO-2022-CN model: 64 packed 4-bit
 * entries in eight words. Entries are eStart, eError, eItsMe or a numbered
 * intermediate state (3..6). Row labels give the entry index range.
 * NOTE(review): presumably indexed by state * class factor + class -
 * confirm against nsCodingStateMachine::NextState.
 */
static PRUint32 ISO2022CN_st [ 8] = {
  PCK4BITS(eStart, 3,      eError, eStart, eStart, eStart, eStart, eStart),  /* 0x00 - 0x07 */
  PCK4BITS(eStart, eError, eError, eError, eError, eError, eError, eError),  /* 0x08 - 0x0f */
  PCK4BITS(eError, eError, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe),  /* 0x10 - 0x17 */
  PCK4BITS(eItsMe, eItsMe, eItsMe, eError, eError, eError, 4,      eError),  /* 0x18 - 0x1f */
  PCK4BITS(eError, eError, eError, eItsMe, eError, eError, eError, eError),  /* 0x20 - 0x27 */
  PCK4BITS(5,      6,      eError, eError, eError, eError, eError, eError),  /* 0x28 - 0x2f */
  PCK4BITS(eError, eError, eError, eItsMe, eError, eError, eError, eError),  /* 0x30 - 0x37 */
  PCK4BITS(eError, eError, eError, eError, eError, eItsMe, eError, eStart)   /* 0x38 - 0x3f */
};
|
|
125
|
+
|
|
126
|
+
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
|
127
|
+
|
|
128
|
+
SMModel ISO2022CNSMModel = {
|
|
129
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
|
|
130
|
+
9,
|
|
131
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
|
|
132
|
+
ISO2022CNCharLenTable,
|
|
133
|
+
"ISO-2022-CN",
|
|
134
|
+
};
|
|
135
|
+
|
|
136
|
+
/*
 * Byte -> character-class table for the ISO-2022-JP model, packed eight
 * 4-bit entries per word (32 words). Each row is labelled with the byte
 * range it covers; entries are assumed to be listed in ascending byte
 * order within a row.
 * Non-zero classes per those labels: 0x00 -> 2, 0x0e -> 2, 0x0f -> 2,
 * 0x1b -> 1, 0x24 -> 7, 0x28 -> 3, 0x40 -> 6, 0x42 -> 4, 0x4a -> 5,
 * 0x80..0xff -> 2.
 */
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
  PCK4BITS(2,0,0,0,0,0,0,0),  /* 0x00 - 0x07 */
  PCK4BITS(0,0,0,0,0,0,2,2),  /* 0x08 - 0x0f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x10 - 0x17 */
  PCK4BITS(0,0,0,1,0,0,0,0),  /* 0x18 - 0x1f */
  PCK4BITS(0,0,0,0,7,0,0,0),  /* 0x20 - 0x27 */
  PCK4BITS(3,0,0,0,0,0,0,0),  /* 0x28 - 0x2f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x30 - 0x37 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x38 - 0x3f */
  PCK4BITS(6,0,4,0,0,0,0,0),  /* 0x40 - 0x47 */
  PCK4BITS(0,0,5,0,0,0,0,0),  /* 0x48 - 0x4f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x50 - 0x57 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x58 - 0x5f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x60 - 0x67 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x68 - 0x6f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x70 - 0x77 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x78 - 0x7f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x80 - 0x87 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x88 - 0x8f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x90 - 0x97 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x98 - 0x9f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa0 - 0xa7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa8 - 0xaf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb0 - 0xb7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb8 - 0xbf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc0 - 0xc7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc8 - 0xcf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd0 - 0xd7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd8 - 0xdf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe0 - 0xe7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe8 - 0xef */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xf0 - 0xf7 */
  PCK4BITS(2,2,2,2,2,2,2,2)   /* 0xf8 - 0xff */
};
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
/*
 * State-transition table for the ISO-2022-JP model: 48 packed 4-bit
 * entries in six words. Entries are eStart, eError, eItsMe or a numbered
 * intermediate state (3..5). Row labels give the entry index range.
 * NOTE(review): presumably indexed by state * class factor + class -
 * confirm against nsCodingStateMachine::NextState.
 */
static PRUint32 ISO2022JP_st [ 6] = {
  PCK4BITS(eStart, 3,      eError, eStart, eStart, eStart, eStart, eStart),  /* 0x00 - 0x07 */
  PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError),  /* 0x08 - 0x0f */
  PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe),  /* 0x10 - 0x17 */
  PCK4BITS(eError, eError, eError, 5,      eError, eError, eError, 4),       /* 0x18 - 0x1f */
  PCK4BITS(eError, eError, eError, eError, eItsMe, eError, eItsMe, eError),  /* 0x20 - 0x27 */
  PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eError, eError)   /* 0x28 - 0x2f */
};
|
|
180
|
+
|
|
181
|
+
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
|
182
|
+
|
|
183
|
+
SMModel ISO2022JPSMModel = {
|
|
184
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
|
|
185
|
+
8,
|
|
186
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
|
|
187
|
+
ISO2022JPCharLenTable,
|
|
188
|
+
"ISO-2022-JP",
|
|
189
|
+
};
|
|
190
|
+
|
|
191
|
+
/*
 * Byte -> character-class table for the ISO-2022-KR model, packed eight
 * 4-bit entries per word (32 words). Each row is labelled with the byte
 * range it covers; entries are assumed to be listed in ascending byte
 * order within a row.
 * Non-zero classes per those labels: 0x00 -> 2, 0x1b -> 1, 0x24 -> 3,
 * 0x29 -> 4, 0x43 -> 5, 0x80..0xff -> 2.
 */
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
  PCK4BITS(2,0,0,0,0,0,0,0),  /* 0x00 - 0x07 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x08 - 0x0f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x10 - 0x17 */
  PCK4BITS(0,0,0,1,0,0,0,0),  /* 0x18 - 0x1f */
  PCK4BITS(0,0,0,0,3,0,0,0),  /* 0x20 - 0x27 */
  PCK4BITS(0,4,0,0,0,0,0,0),  /* 0x28 - 0x2f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x30 - 0x37 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x38 - 0x3f */
  PCK4BITS(0,0,0,5,0,0,0,0),  /* 0x40 - 0x47 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x48 - 0x4f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x50 - 0x57 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x58 - 0x5f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x60 - 0x67 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x68 - 0x6f */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x70 - 0x77 */
  PCK4BITS(0,0,0,0,0,0,0,0),  /* 0x78 - 0x7f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x80 - 0x87 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x88 - 0x8f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x90 - 0x97 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0x98 - 0x9f */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa0 - 0xa7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xa8 - 0xaf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb0 - 0xb7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xb8 - 0xbf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc0 - 0xc7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xc8 - 0xcf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd0 - 0xd7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xd8 - 0xdf */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe0 - 0xe7 */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xe8 - 0xef */
  PCK4BITS(2,2,2,2,2,2,2,2),  /* 0xf0 - 0xf7 */
  PCK4BITS(2,2,2,2,2,2,2,2)   /* 0xf8 - 0xff */
};
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
/*
 * State-transition table for the ISO-2022-KR model: 40 packed 4-bit
 * entries in five words. Entries are eStart, eError, eItsMe or a numbered
 * intermediate state (3..5). Row labels give the entry index range.
 * NOTE(review): presumably indexed by state * class factor + class -
 * confirm against nsCodingStateMachine::NextState.
 */
static PRUint32 ISO2022KR_st [ 5] = {
  PCK4BITS(eStart, 3,      eError, eStart, eStart, eStart, eError, eError),  /* 0x00 - 0x07 */
  PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe),  /* 0x08 - 0x0f */
  PCK4BITS(eItsMe, eItsMe, eError, eError, eError, 4,      eError, eError),  /* 0x10 - 0x17 */
  PCK4BITS(eError, eError, eError, eError, 5,      eError, eError, eError),  /* 0x18 - 0x1f */
  PCK4BITS(eError, eError, eError, eItsMe, eStart, eStart, eStart, eStart)   /* 0x20 - 0x27 */
};
|
|
234
|
+
|
|
235
|
+
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
|
236
|
+
|
|
237
|
+
SMModel ISO2022KRSMModel = {
|
|
238
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
|
|
239
|
+
6,
|
|
240
|
+
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
|
|
241
|
+
ISO2022KRCharLenTable,
|
|
242
|
+
"ISO-2022-KR",
|
|
243
|
+
};
|
|
244
|
+
|