charguess 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,244 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "SBCharsetProber.h"
22
+ #include "SBCSGroupProber.h"
23
+
24
+
25
+ nsSBCSGroupProber::nsSBCSGroupProber()
26
+ {
27
+ mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
28
+ mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
29
+ mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
30
+ mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
31
+ mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
32
+ mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
33
+ mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
34
+ mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
35
+ mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
36
+ mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
37
+
38
+ // disable latin2 before latin1 is available, otherwise all latin1
39
+ // will be detected as latin2 because of their similarity.
40
+ //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
41
+ //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
42
+
43
+ Reset();
44
+ }
45
+
46
+ nsSBCSGroupProber::~nsSBCSGroupProber()
47
+ {
48
+ for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
49
+ {
50
+ delete mProbers[i];
51
+ }
52
+ }
53
+
54
+
55
+ const char* nsSBCSGroupProber::GetCharSetName()
56
+ {
57
+ //if we have no answer yet
58
+ if (mBestGuess == -1)
59
+ {
60
+ GetConfidence();
61
+ //no charset seems positive
62
+ if (mBestGuess == -1)
63
+ //we will use default.
64
+ mBestGuess = 0;
65
+ }
66
+ return mProbers[mBestGuess]->GetCharSetName();
67
+ }
68
+
69
+ void nsSBCSGroupProber::Reset(void)
70
+ {
71
+ for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
72
+ {
73
+ mProbers[i]->Reset();
74
+ mIsActive[i] = PR_TRUE;
75
+ }
76
+ mBestGuess = -1;
77
+ mState = eDetecting;
78
+ }
79
+
80
+ //This filter apply to all scripts that does not use latin letters (english letter)
81
+ PRBool nsSBCSGroupProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
82
+ {
83
+ //do filtering to reduce load to probers
84
+ char *newptr;
85
+ char *prevPtr, *curPtr;
86
+
87
+ PRBool meetMSB = PR_FALSE;
88
+ newptr = *newBuf = (char*)PR_MALLOC(aLen);
89
+ if (!newptr)
90
+ return PR_FALSE;
91
+
92
+ for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
93
+ {
94
+ if (*curPtr & 0x80)
95
+ meetMSB = PR_TRUE;
96
+ else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
97
+ {
98
+ //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
99
+ if (meetMSB && curPtr > prevPtr)
100
+ //this segment contains more than single symbol, and it has upper ascii, we need to keep it
101
+ {
102
+ while (prevPtr < curPtr) *newptr++ = *prevPtr++;
103
+ prevPtr++;
104
+ *newptr++ = ' ';
105
+ meetMSB = PR_FALSE;
106
+ }
107
+ else //ignore current segment. (either because it is just a symbol or just a english word
108
+ prevPtr = curPtr+1;
109
+ }
110
+ }
111
+
112
+ newLen = newptr - *newBuf;
113
+
114
+ return PR_TRUE;
115
+ }
116
+
117
#ifdef NO_ENGLISH_CONTAMINATION
// Filter for scripts that do use latin (english) letters.
//
// The input is split into segments at every 7-bit character that is not an
// ASCII letter. A non-empty segment is copied to the output, followed by a
// single space, unless the scanner is inside a '<' ... '>' tag at the
// moment the delimiter is reached.
//
//   aBuf, aLen  input bytes
//   newBuf      receives a buffer obtained from PR_MALLOC (NULL on failure);
//               the caller releases it
//   newLen      receives the number of bytes written (0 on failure)
//
// Returns PR_FALSE if the buffer could not be allocated, PR_TRUE otherwise.
//
// NOTE(review): because '<' sets isInTag before the segment test runs, the
// text immediately preceding a '<' is discarded, while the text between
// '<' and '>' is kept once '>' is reached - confirm this is intended.
PRBool nsSBCSGroupProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  const char* const endPtr = aBuf + aLen;
  const char* prevPtr = aBuf;   // start of the segment being scanned
  const char* curPtr;
  char* newptr;
  PRBool isInTag = PR_FALSE;

  newLen = 0;
  newptr = *newBuf = (char*)PR_MALLOC(aLen);
  if (!newptr)
    return PR_FALSE;

  for (curPtr = aBuf; curPtr < endPtr; curPtr++)
  {
    if (*curPtr == '>')
      isInTag = PR_FALSE;
    else if (*curPtr == '<')
      isInTag = PR_TRUE;

    if (!(*curPtr & 0x80) &&
        (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
    {
      if (curPtr > prevPtr && !isInTag)
      {
        // Current segment contains more than just a symbol and it is not
        // inside a tag: keep it.
        while (prevPtr < curPtr) *newptr++ = *prevPtr++;
        prevPtr++;               // step over the delimiter itself
        *newptr++ = ' ';
      }
      else
        prevPtr = curPtr+1;
    }
  }

  // The buffer may end without a closing delimiter; keep that last segment
  // as well, unless it lies inside a tag.
  if (!isInTag)
    while (prevPtr < curPtr) *newptr++ = *prevPtr++;

  newLen = newptr - *newBuf;

  return PR_TRUE;
}
#endif //NO_ENGLISH_CONTAMINATION
157
+
158
+ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
159
+ {
160
+ nsProbingState st;
161
+ PRUint32 i;
162
+ char *newBuf1;
163
+ PRUint32 newLen1;
164
+
165
+ //apply filter to original buffer, and we got new buffer back
166
+ //depend on what script it is, we will feed them the new buffer
167
+ //we got after applying proper filter
168
+ FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1);
169
+
170
+ for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
171
+ {
172
+ if (!mIsActive[i])
173
+ continue;
174
+ st = mProbers[i]->HandleData(newBuf1, newLen1);
175
+ if (st == eFoundIt)
176
+ {
177
+ mBestGuess = i;
178
+ mState = eFoundIt;
179
+ break;
180
+ }
181
+ else if (st == eNotMe)
182
+ {
183
+ mIsActive[i] = PR_FALSE;
184
+ mActiveNum--;
185
+ if (mActiveNum <= 0)
186
+ {
187
+ mState = eNotMe;
188
+ break;
189
+ }
190
+ }
191
+ }
192
+
193
+ PR_FREEIF(newBuf1);
194
+
195
+ return mState;
196
+ }
197
+
198
+ float nsSBCSGroupProber::GetConfidence(void)
199
+ {
200
+ PRUint32 i;
201
+ float bestConf = 0.0, cf;
202
+
203
+ switch (mState)
204
+ {
205
+ case eFoundIt:
206
+ return (float)0.99; //sure yes
207
+ case eNotMe:
208
+ return (float)0.01; //sure no
209
+ default:
210
+ for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
211
+ {
212
+ if (!mIsActive[i])
213
+ continue;
214
+ cf = mProbers[i]->GetConfidence();
215
+ if (bestConf < cf)
216
+ {
217
+ bestConf = cf;
218
+ mBestGuess = i;
219
+ }
220
+ }
221
+ }
222
+ return bestConf;
223
+ }
224
+
225
#ifdef DEBUG_chardet
// Debug aid: prints the status of every prober in the group followed by the
// current best match and its confidence.
void
nsSBCSGroupProber::DumpStatus()
{
  PRUint32 i;
  float cf;
  const char* bestName;

  cf = GetConfidence();
  printf("SBCS Group Prober --------begin status \r\n");
  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
  {
    if (!mIsActive[i])
      // The format has a single %s, so only the charset name is passed.
      printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName());
    else
      mProbers[i]->DumpStatus();
  }
  // GetConfidence() leaves mBestGuess at -1 when the state is already
  // settled or no prober reports a positive confidence; do not index
  // mProbers with it in that case.
  bestName = (mBestGuess >= 0) ? mProbers[mBestGuess]->GetCharSetName() : "(none)";
  printf("SBCS Group found best match [%s] confidence %f.\r\n",
         bestName, cf);
}
#endif
@@ -0,0 +1,54 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsSBCSGroupProber_h__
22
+ #define nsSBCSGroupProber_h__
23
+
24
+
25
+ #define NUM_OF_SBCS_PROBERS 10
26
+
27
+ class nsSingleByteCharSetProber;
28
+ class nsSBCSGroupProber: public nsCharSetProber {
29
+ public:
30
+ nsSBCSGroupProber();
31
+ virtual ~nsSBCSGroupProber();
32
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
33
+ PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
34
+ PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
35
+ const char* GetCharSetName();
36
+ nsProbingState GetState(void) {return mState;};
37
+ void Reset(void);
38
+ float GetConfidence(void);
39
+ void SetOpion() {};
40
+
41
+ #ifdef DEBUG_chardet
42
+ void DumpStatus();
43
+ #endif
44
+
45
+ protected:
46
+ nsProbingState mState;
47
+ nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
48
+ PRBool mIsActive[NUM_OF_SBCS_PROBERS];
49
+ PRInt32 mBestGuess;
50
+ PRUint32 mActiveNum;
51
+ };
52
+
53
+ #endif /* nsSBCSGroupProber_h__ */
54
+
@@ -0,0 +1,100 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include "SBCharsetProber.h"
23
+
24
+ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
25
+ {
26
+ unsigned char order;
27
+
28
+ for (PRUint32 i = 0; i < aLen; i++)
29
+ {
30
+ order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
31
+
32
+ if (order < SYMBOL_CAT_ORDER)
33
+ mTotalChar++;
34
+ if (order < SAMPLE_SIZE)
35
+ {
36
+ mFreqChar++;
37
+
38
+ if (mLastOrder < SAMPLE_SIZE)
39
+ {
40
+ mTotalSeqs++;
41
+ ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
42
+ }
43
+ }
44
+ mLastOrder = order;
45
+ }
46
+
47
+ if (mState == eDetecting)
48
+ if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
49
+ {
50
+ float cf = GetConfidence();
51
+ if (cf > POSITIVE_SHORTCUT_THRESHOLD)
52
+ mState = eFoundIt;
53
+ else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
54
+ mState = eNotMe;
55
+ }
56
+
57
+ return mState;
58
+ }
59
+
60
+ void nsSingleByteCharSetProber::Reset(void)
61
+ {
62
+ mState = eDetecting;
63
+ mLastOrder = 255;
64
+ for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
65
+ mSeqCounters[i] = 0;
66
+ mTotalSeqs = 0;
67
+ mTotalChar = 0;
68
+ mFreqChar = 0;
69
+ }
70
+
71
+ //#define NEGATIVE_APPROACH 1
72
+
73
+ float nsSingleByteCharSetProber::GetConfidence(void)
74
+ {
75
+ #ifdef NEGATIVE_APPROACH
76
+ if (mTotalSeqs > 0)
77
+ if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
78
+ return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
79
+ return (float)0.01;
80
+ #else //POSITIVE_APPROACH
81
+ float r;
82
+
83
+ if (mTotalSeqs > 0) {
84
+ r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
85
+ r = r*mFreqChar/mTotalChar;
86
+ if (r >= (float)1.00)
87
+ r = (float)0.99;
88
+ return r;
89
+ }
90
+ return (float)0.01;
91
+ #endif
92
+ }
93
+
94
#ifdef DEBUG_chardet
// Debug aid: prints this prober's charset name and current confidence.
void
nsSingleByteCharSetProber::DumpStatus()
{
  const char* const name = GetCharSetName();
  const float confidence = GetConfidence();

  printf("[%s] prober has confidence %f\r\n", name, confidence);
}
#endif
@@ -0,0 +1,89 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsSingleByteCharSetProber_h__
22
+ #define nsSingleByteCharSetProber_h__
23
+
24
+ #include "charsetProber.h"
25
+
26
+ #define SAMPLE_SIZE 64
27
+ #define SB_ENOUGH_REL_THRESHOLD 1024
28
+ #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
29
+ #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
30
+ #define SYMBOL_CAT_ORDER 250
31
+ #define NUMBER_OF_SEQ_CAT 4
32
+ #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
33
+ #define NEGATIVE_CAT 0
34
+
35
+ typedef struct
36
+ {
37
+ unsigned char *charToOrderMap; //[256] table use to find a char's order
38
+ char *precedenceMatrix; //[SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
39
+ float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
40
+ PRBool keepEnglishLetter; //it says if this script contains latin letters
41
+ const char* charsetName;
42
+ } SequenceModel;
43
+
44
+
45
+ class nsSingleByteCharSetProber : public nsCharSetProber{
46
+ public:
47
+ nsSingleByteCharSetProber(SequenceModel *model){mModel = model; Reset();};
48
+ const char* GetCharSetName() {return mModel->charsetName;};
49
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
50
+ nsProbingState GetState(void) {return mState;};
51
+ void Reset(void);
52
+ float GetConfidence(void);
53
+ void SetOpion() {};
54
+ PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;};
55
+
56
+ #ifdef DEBUG_chardet
57
+ void DumpStatus();
58
+ #endif
59
+
60
+ protected:
61
+ nsProbingState mState;
62
+ SequenceModel *mModel;
63
+
64
+ //char order of last character
65
+ unsigned char mLastOrder;
66
+
67
+ PRUint32 mTotalSeqs;
68
+ PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
69
+
70
+ PRUint32 mTotalChar;
71
+ //characters that fall in our sampling range
72
+ PRUint32 mFreqChar;
73
+ };
74
+
75
+
76
+ extern SequenceModel Koi8rModel;
77
+ extern SequenceModel Win1251Model;
78
+ extern SequenceModel Latin5Model;
79
+ extern SequenceModel MacCyrillicModel;
80
+ extern SequenceModel Ibm866Model;
81
+ extern SequenceModel Ibm855Model;
82
+ extern SequenceModel Latin7Model;
83
+ extern SequenceModel Win1253Model;
84
+ extern SequenceModel Latin5BulgarianModel;
85
+ extern SequenceModel Win1251BulgarianModel;
86
+ extern SequenceModel Latin2HungarianModel;
87
+ extern SequenceModel Win1250HungarianModel;
88
+
89
+ #endif /* nsSingleByteCharSetProber_h__ */
@@ -0,0 +1,86 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ // for S-JIS encoding, observe these characteristics:
22
+ // 1, kana characters (or hankaku?) often have a high frequency of appearance
23
+ // 2, kana characters often appear in groups
24
+ // 3, certain combinations of kana are never used in the Japanese language
25
+
26
+ #include "SJISProber.h"
27
+
28
+ void nsSJISProber::Reset(void)
29
+ {
30
+ mCodingSM->Reset();
31
+ mState = eDetecting;
32
+ mContextAnalyser.Reset();
33
+ mDistributionAnalyser.Reset();
34
+ }
35
+
36
+ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
37
+ {
38
+ nsSMState codingState;
39
+
40
+ for (PRUint32 i = 0; i < aLen; i++)
41
+ {
42
+ codingState = mCodingSM->NextState(aBuf[i]);
43
+ if (codingState == eError)
44
+ {
45
+ mState = eNotMe;
46
+ break;
47
+ }
48
+ if (codingState == eItsMe)
49
+ {
50
+ mState = eFoundIt;
51
+ break;
52
+ }
53
+ if (codingState == eStart)
54
+ {
55
+ PRUint32 charLen = mCodingSM->GetCurrentCharLen();
56
+ if (i == 0)
57
+ {
58
+ mLastChar[1] = aBuf[0];
59
+ mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen);
60
+ mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
61
+ }
62
+ else
63
+ {
64
+ mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen);
65
+ mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
66
+ }
67
+ }
68
+ }
69
+
70
+ mLastChar[0] = aBuf[aLen-1];
71
+
72
+ if (mState == eDetecting)
73
+ if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
74
+ mState = eFoundIt;
75
+
76
+ return mState;
77
+ }
78
+
79
+ float nsSJISProber::GetConfidence(void)
80
+ {
81
+ float contxtCf = mContextAnalyser.GetConfidence();
82
+ float distribCf = mDistributionAnalyser.GetConfidence();
83
+
84
+ return (contxtCf > distribCf ? contxtCf : distribCf);
85
+ }
86
+
@@ -0,0 +1,60 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ // for S-JIS encoding, observe these characteristics:
22
+ // 1, kana characters (or hankaku?) often have a high frequency of appearance
23
+ // 2, kana characters often appear in groups
24
+ // 3, certain combinations of kana are never used in the Japanese language
25
+
26
+ #ifndef nsSJISProber_h__
27
+ #define nsSJISProber_h__
28
+
29
+ #include "charsetProber.h"
30
+ #include "codingStateMachine.h"
31
+ #include "jpCntx.h"
32
+ #include "charDistribution.h"
33
+
34
+
35
+ class nsSJISProber: public nsCharSetProber {
36
+ public:
37
+ nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
38
+ Reset();};
39
+ virtual ~nsSJISProber(void){delete mCodingSM;};
40
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
41
+ const char* GetCharSetName() {return "Shift_JIS";};
42
+ nsProbingState GetState(void) {return mState;};
43
+ void Reset(void);
44
+ float GetConfidence(void);
45
+ void SetOpion() {};
46
+
47
+ protected:
48
+ nsCodingStateMachine* mCodingSM;
49
+ nsProbingState mState;
50
+
51
+ SJISContextAnalysis mContextAnalyser;
52
+ SJISDistributionAnalysis mDistributionAnalyser;
53
+
54
+ char mLastChar[2];
55
+
56
+ };
57
+
58
+
59
+ #endif /* nsSJISProber_h__ */
60
+
@@ -0,0 +1,75 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "UTF8Prober.h"
22
+
23
+ void nsUTF8Prober::Reset(void)
24
+ {
25
+ mCodingSM->Reset();
26
+ mNumOfMBChar = 0;
27
+ mState = eDetecting;
28
+ }
29
+
30
+ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
31
+ {
32
+ nsSMState codingState;
33
+
34
+ for (PRUint32 i = 0; i < aLen; i++)
35
+ {
36
+ codingState = mCodingSM->NextState(aBuf[i]);
37
+ if (codingState == eError)
38
+ {
39
+ mState = eNotMe;
40
+ break;
41
+ }
42
+ if (codingState == eItsMe)
43
+ {
44
+ mState = eFoundIt;
45
+ break;
46
+ }
47
+ if (codingState == eStart)
48
+ {
49
+ if (mCodingSM->GetCurrentCharLen() >= 2)
50
+ mNumOfMBChar++;
51
+ }
52
+ }
53
+
54
+ if (mState == eDetecting)
55
+ if (GetConfidence() > SHORTCUT_THRESHOLD)
56
+ mState = eFoundIt;
57
+ return mState;
58
+ }
59
+
60
+ #define ONE_CHAR_PROB (float)0.50
61
+
62
+ float nsUTF8Prober::GetConfidence(void)
63
+ {
64
+ float unlike = (float)0.99;
65
+
66
+ if (mNumOfMBChar < 6)
67
+ {
68
+ for (PRUint32 i = 0; i < mNumOfMBChar; i++)
69
+ unlike *= ONE_CHAR_PROB;
70
+ return (float)1.0 - unlike;
71
+ }
72
+ else
73
+ return (float)0.99;
74
+ }
75
+