charguess 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,244 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "SBCharsetProber.h"
22
+ #include "SBCSGroupProber.h"
23
+
24
+
25
+ nsSBCSGroupProber::nsSBCSGroupProber()
26
+ {
27
+ mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
28
+ mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
29
+ mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
30
+ mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
31
+ mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
32
+ mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
33
+ mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
34
+ mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
35
+ mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
36
+ mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
37
+
38
+ // disable latin2 before latin1 is available, otherwise all latin1
39
+ // will be detected as latin2 because of their similarity.
40
+ //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
41
+ //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
42
+
43
+ Reset();
44
+ }
45
+
46
+ nsSBCSGroupProber::~nsSBCSGroupProber()
47
+ {
48
+ for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
49
+ {
50
+ delete mProbers[i];
51
+ }
52
+ }
53
+
54
+
55
+ const char* nsSBCSGroupProber::GetCharSetName()
56
+ {
57
+ //if we have no answer yet
58
+ if (mBestGuess == -1)
59
+ {
60
+ GetConfidence();
61
+ //no charset seems positive
62
+ if (mBestGuess == -1)
63
+ //we will use default.
64
+ mBestGuess = 0;
65
+ }
66
+ return mProbers[mBestGuess]->GetCharSetName();
67
+ }
68
+
69
+ void nsSBCSGroupProber::Reset(void)
70
+ {
71
+ for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
72
+ {
73
+ mProbers[i]->Reset();
74
+ mIsActive[i] = PR_TRUE;
75
+ }
76
+ mBestGuess = -1;
77
+ mState = eDetecting;
78
+ }
79
+
80
+ //This filter applies to all scripts that do not use Latin letters (English letters)
81
+ PRBool nsSBCSGroupProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
82
+ {
83
+ //do filtering to reduce load to probers
84
+ char *newptr;
85
+ char *prevPtr, *curPtr;
86
+
87
+ PRBool meetMSB = PR_FALSE;
88
+ newptr = *newBuf = (char*)PR_MALLOC(aLen);
89
+ if (!newptr)
90
+ return PR_FALSE;
91
+
92
+ for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
93
+ {
94
+ if (*curPtr & 0x80)
95
+ meetMSB = PR_TRUE;
96
+ else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
97
+ {
98
+ //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
99
+ if (meetMSB && curPtr > prevPtr)
100
+ //this segment contains more than single symbol, and it has upper ascii, we need to keep it
101
+ {
102
+ while (prevPtr < curPtr) *newptr++ = *prevPtr++;
103
+ prevPtr++;
104
+ *newptr++ = ' ';
105
+ meetMSB = PR_FALSE;
106
+ }
107
+ else //ignore current segment. (either because it is just a symbol or just a english word
108
+ prevPtr = curPtr+1;
109
+ }
110
+ }
111
+
112
+ newLen = newptr - *newBuf;
113
+
114
+ return PR_TRUE;
115
+ }
116
+
117
+ #ifdef NO_ENGLISH_CONTAMINATION
118
+ //This filter applies to all scripts that do use Latin letters (English letters)
119
+ PRBool nsSBCSGroupProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
120
+ {
121
+ //do filtering to reduce load to probers
122
+ char *newptr;
123
+ char *prevPtr, *curPtr;
124
+ PRBool isInTag = PR_FALSE;
125
+
126
+ newptr = *newBuf = (char*)PR_MALLOC(aLen);
127
+ if (!newptr)
128
+ return PR_FALSE;
129
+
130
+ for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
131
+ {
132
+ if (*curPtr == '>')
133
+ isInTag = PR_FALSE;
134
+ else if (*curPtr == '<')
135
+ isInTag = PR_TRUE;
136
+
137
+ if (!(*curPtr & 0x80) &&
138
+ (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
139
+ {
140
+ if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
141
+ // and it is not inside a tag, keep it
142
+ {
143
+ while (prevPtr < curPtr) *newptr++ = *prevPtr++;
144
+ prevPtr++;
145
+ *newptr++ = ' ';
146
+ }
147
+ else
148
+ prevPtr = curPtr+1;
149
+ }
150
+ }
151
+
152
+ newLen = newptr - *newBuf;
153
+
154
+ return PR_TRUE;
155
+ }
156
+ #endif //NO_ENGLISH_CONTAMINATION
157
+
158
+ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
159
+ {
160
+ nsProbingState st;
161
+ PRUint32 i;
162
+ char *newBuf1;
163
+ PRUint32 newLen1;
164
+
165
+ //apply filter to original buffer, and we got new buffer back
166
+ //depend on what script it is, we will feed them the new buffer
167
+ //we got after applying proper filter
168
+ FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1);
169
+
170
+ for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
171
+ {
172
+ if (!mIsActive[i])
173
+ continue;
174
+ st = mProbers[i]->HandleData(newBuf1, newLen1);
175
+ if (st == eFoundIt)
176
+ {
177
+ mBestGuess = i;
178
+ mState = eFoundIt;
179
+ break;
180
+ }
181
+ else if (st == eNotMe)
182
+ {
183
+ mIsActive[i] = PR_FALSE;
184
+ mActiveNum--;
185
+ if (mActiveNum <= 0)
186
+ {
187
+ mState = eNotMe;
188
+ break;
189
+ }
190
+ }
191
+ }
192
+
193
+ PR_FREEIF(newBuf1);
194
+
195
+ return mState;
196
+ }
197
+
198
+ float nsSBCSGroupProber::GetConfidence(void)
199
+ {
200
+ PRUint32 i;
201
+ float bestConf = 0.0, cf;
202
+
203
+ switch (mState)
204
+ {
205
+ case eFoundIt:
206
+ return (float)0.99; //sure yes
207
+ case eNotMe:
208
+ return (float)0.01; //sure no
209
+ default:
210
+ for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
211
+ {
212
+ if (!mIsActive[i])
213
+ continue;
214
+ cf = mProbers[i]->GetConfidence();
215
+ if (bestConf < cf)
216
+ {
217
+ bestConf = cf;
218
+ mBestGuess = i;
219
+ }
220
+ }
221
+ }
222
+ return bestConf;
223
+ }
224
+
225
+ #ifdef DEBUG_chardet
226
+ void
227
+ nsSBCSGroupProber::DumpStatus()
228
+ {
229
+ PRUint32 i;
230
+ float cf;
231
+
232
+ cf = GetConfidence();
233
+ printf("SBCS Group Prober --------begin status \r\n");
234
+ for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
235
+ {
236
+ if (!mIsActive[i])
237
+ printf("[%s] is inactive(ie. cofidence is too low).\r\n", mProbers[i]->GetCharSetName(), i);
238
+ else
239
+ mProbers[i]->DumpStatus();
240
+ }
241
+ printf("SBCS Group found best match [%s] confidence %f.\r\n",
242
+ mProbers[mBestGuess]->GetCharSetName(), cf);
243
+ }
244
+ #endif
@@ -0,0 +1,54 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsSBCSGroupProber_h__
22
+ #define nsSBCSGroupProber_h__
23
+
24
+
25
+ #define NUM_OF_SBCS_PROBERS 10
26
+
27
+ class nsSingleByteCharSetProber;
28
+ class nsSBCSGroupProber: public nsCharSetProber {
29
+ public:
30
+ nsSBCSGroupProber();
31
+ virtual ~nsSBCSGroupProber();
32
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
33
+ PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
34
+ PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
35
+ const char* GetCharSetName();
36
+ nsProbingState GetState(void) {return mState;};
37
+ void Reset(void);
38
+ float GetConfidence(void);
39
+ void SetOpion() {};
40
+
41
+ #ifdef DEBUG_chardet
42
+ void DumpStatus();
43
+ #endif
44
+
45
+ protected:
46
+ nsProbingState mState;
47
+ nsSingleByteCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
48
+ PRBool mIsActive[NUM_OF_SBCS_PROBERS];
49
+ PRInt32 mBestGuess;
50
+ PRUint32 mActiveNum;
51
+ };
52
+
53
+ #endif /* nsSBCSGroupProber_h__ */
54
+
@@ -0,0 +1,100 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include <stdio.h>
22
+ #include "SBCharsetProber.h"
23
+
24
+ nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
25
+ {
26
+ unsigned char order;
27
+
28
+ for (PRUint32 i = 0; i < aLen; i++)
29
+ {
30
+ order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
31
+
32
+ if (order < SYMBOL_CAT_ORDER)
33
+ mTotalChar++;
34
+ if (order < SAMPLE_SIZE)
35
+ {
36
+ mFreqChar++;
37
+
38
+ if (mLastOrder < SAMPLE_SIZE)
39
+ {
40
+ mTotalSeqs++;
41
+ ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
42
+ }
43
+ }
44
+ mLastOrder = order;
45
+ }
46
+
47
+ if (mState == eDetecting)
48
+ if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
49
+ {
50
+ float cf = GetConfidence();
51
+ if (cf > POSITIVE_SHORTCUT_THRESHOLD)
52
+ mState = eFoundIt;
53
+ else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
54
+ mState = eNotMe;
55
+ }
56
+
57
+ return mState;
58
+ }
59
+
60
+ void nsSingleByteCharSetProber::Reset(void)
61
+ {
62
+ mState = eDetecting;
63
+ mLastOrder = 255;
64
+ for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
65
+ mSeqCounters[i] = 0;
66
+ mTotalSeqs = 0;
67
+ mTotalChar = 0;
68
+ mFreqChar = 0;
69
+ }
70
+
71
+ //#define NEGATIVE_APPROACH 1
72
+
73
+ float nsSingleByteCharSetProber::GetConfidence(void)
74
+ {
75
+ #ifdef NEGATIVE_APPROACH
76
+ if (mTotalSeqs > 0)
77
+ if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
78
+ return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
79
+ return (float)0.01;
80
+ #else //POSITIVE_APPROACH
81
+ float r;
82
+
83
+ if (mTotalSeqs > 0) {
84
+ r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
85
+ r = r*mFreqChar/mTotalChar;
86
+ if (r >= (float)1.00)
87
+ r = (float)0.99;
88
+ return r;
89
+ }
90
+ return (float)0.01;
91
+ #endif
92
+ }
93
+
94
+ #ifdef DEBUG_chardet
95
+ void
96
+ nsSingleByteCharSetProber::DumpStatus()
97
+ {
98
+ printf("[%s] prober has confidence %f\r\n", GetCharSetName(), GetConfidence());
99
+ }
100
+ #endif
@@ -0,0 +1,89 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsSingleByteCharSetProber_h__
22
+ #define nsSingleByteCharSetProber_h__
23
+
24
+ #include "charsetProber.h"
25
+
26
+ #define SAMPLE_SIZE 64
27
+ #define SB_ENOUGH_REL_THRESHOLD 1024
28
+ #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
29
+ #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
30
+ #define SYMBOL_CAT_ORDER 250
31
+ #define NUMBER_OF_SEQ_CAT 4
32
+ #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
33
+ #define NEGATIVE_CAT 0
34
+
35
+ typedef struct
36
+ {
37
+ unsigned char *charToOrderMap; //[256] table use to find a char's order
38
+ char *precedenceMatrix; //[SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
39
+ float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
40
+ PRBool keepEnglishLetter; //it says if this script contains latin letters
41
+ const char* charsetName;
42
+ } SequenceModel;
43
+
44
+
45
+ class nsSingleByteCharSetProber : public nsCharSetProber{
46
+ public:
47
+ nsSingleByteCharSetProber(SequenceModel *model){mModel = model; Reset();};
48
+ const char* GetCharSetName() {return mModel->charsetName;};
49
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
50
+ nsProbingState GetState(void) {return mState;};
51
+ void Reset(void);
52
+ float GetConfidence(void);
53
+ void SetOpion() {};
54
+ PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;};
55
+
56
+ #ifdef DEBUG_chardet
57
+ void DumpStatus();
58
+ #endif
59
+
60
+ protected:
61
+ nsProbingState mState;
62
+ SequenceModel *mModel;
63
+
64
+ //char order of last character
65
+ unsigned char mLastOrder;
66
+
67
+ PRUint32 mTotalSeqs;
68
+ PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
69
+
70
+ PRUint32 mTotalChar;
71
+ //characters that fall in our sampling range
72
+ PRUint32 mFreqChar;
73
+ };
74
+
75
+
76
+ extern SequenceModel Koi8rModel;
77
+ extern SequenceModel Win1251Model;
78
+ extern SequenceModel Latin5Model;
79
+ extern SequenceModel MacCyrillicModel;
80
+ extern SequenceModel Ibm866Model;
81
+ extern SequenceModel Ibm855Model;
82
+ extern SequenceModel Latin7Model;
83
+ extern SequenceModel Win1253Model;
84
+ extern SequenceModel Latin5BulgarianModel;
85
+ extern SequenceModel Win1251BulgarianModel;
86
+ extern SequenceModel Latin2HungarianModel;
87
+ extern SequenceModel Win1250HungarianModel;
88
+
89
+ #endif /* nsSingleByteCharSetProber_h__ */
@@ -0,0 +1,86 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ // for S-JIS encoding, observe these characteristics:
22
+ // 1, kana characters (or hankaku?) often have a high frequency of appearance
23
+ // 2, kana characters often exist in groups
24
+ // 3, certain combinations of kana are never used in the Japanese language
25
+
26
+ #include "SJISProber.h"
27
+
28
+ void nsSJISProber::Reset(void)
29
+ {
30
+ mCodingSM->Reset();
31
+ mState = eDetecting;
32
+ mContextAnalyser.Reset();
33
+ mDistributionAnalyser.Reset();
34
+ }
35
+
36
+ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
37
+ {
38
+ nsSMState codingState;
39
+
40
+ for (PRUint32 i = 0; i < aLen; i++)
41
+ {
42
+ codingState = mCodingSM->NextState(aBuf[i]);
43
+ if (codingState == eError)
44
+ {
45
+ mState = eNotMe;
46
+ break;
47
+ }
48
+ if (codingState == eItsMe)
49
+ {
50
+ mState = eFoundIt;
51
+ break;
52
+ }
53
+ if (codingState == eStart)
54
+ {
55
+ PRUint32 charLen = mCodingSM->GetCurrentCharLen();
56
+ if (i == 0)
57
+ {
58
+ mLastChar[1] = aBuf[0];
59
+ mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen);
60
+ mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
61
+ }
62
+ else
63
+ {
64
+ mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen);
65
+ mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
66
+ }
67
+ }
68
+ }
69
+
70
+ mLastChar[0] = aBuf[aLen-1];
71
+
72
+ if (mState == eDetecting)
73
+ if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
74
+ mState = eFoundIt;
75
+
76
+ return mState;
77
+ }
78
+
79
+ float nsSJISProber::GetConfidence(void)
80
+ {
81
+ float contxtCf = mContextAnalyser.GetConfidence();
82
+ float distribCf = mDistributionAnalyser.GetConfidence();
83
+
84
+ return (contxtCf > distribCf ? contxtCf : distribCf);
85
+ }
86
+
@@ -0,0 +1,60 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ // for S-JIS encoding, observe these characteristics:
22
+ // 1, kana characters (or hankaku?) often have a high frequency of appearance
23
+ // 2, kana characters often exist in groups
24
+ // 3, certain combinations of kana are never used in the Japanese language
25
+
26
+ #ifndef nsSJISProber_h__
27
+ #define nsSJISProber_h__
28
+
29
+ #include "charsetProber.h"
30
+ #include "codingStateMachine.h"
31
+ #include "jpCntx.h"
32
+ #include "charDistribution.h"
33
+
34
+
35
+ class nsSJISProber: public nsCharSetProber {
36
+ public:
37
+ nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
38
+ Reset();};
39
+ virtual ~nsSJISProber(void){delete mCodingSM;};
40
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
41
+ const char* GetCharSetName() {return "Shift_JIS";};
42
+ nsProbingState GetState(void) {return mState;};
43
+ void Reset(void);
44
+ float GetConfidence(void);
45
+ void SetOpion() {};
46
+
47
+ protected:
48
+ nsCodingStateMachine* mCodingSM;
49
+ nsProbingState mState;
50
+
51
+ SJISContextAnalysis mContextAnalyser;
52
+ SJISDistributionAnalysis mDistributionAnalyser;
53
+
54
+ char mLastChar[2];
55
+
56
+ };
57
+
58
+
59
+ #endif /* nsSJISProber_h__ */
60
+
@@ -0,0 +1,75 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "UTF8Prober.h"
22
+
23
+ void nsUTF8Prober::Reset(void)
24
+ {
25
+ mCodingSM->Reset();
26
+ mNumOfMBChar = 0;
27
+ mState = eDetecting;
28
+ }
29
+
30
+ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
31
+ {
32
+ nsSMState codingState;
33
+
34
+ for (PRUint32 i = 0; i < aLen; i++)
35
+ {
36
+ codingState = mCodingSM->NextState(aBuf[i]);
37
+ if (codingState == eError)
38
+ {
39
+ mState = eNotMe;
40
+ break;
41
+ }
42
+ if (codingState == eItsMe)
43
+ {
44
+ mState = eFoundIt;
45
+ break;
46
+ }
47
+ if (codingState == eStart)
48
+ {
49
+ if (mCodingSM->GetCurrentCharLen() >= 2)
50
+ mNumOfMBChar++;
51
+ }
52
+ }
53
+
54
+ if (mState == eDetecting)
55
+ if (GetConfidence() > SHORTCUT_THRESHOLD)
56
+ mState = eFoundIt;
57
+ return mState;
58
+ }
59
+
60
+ #define ONE_CHAR_PROB (float)0.50
61
+
62
+ float nsUTF8Prober::GetConfidence(void)
63
+ {
64
+ float unlike = (float)0.99;
65
+
66
+ if (mNumOfMBChar < 6)
67
+ {
68
+ for (PRUint32 i = 0; i < mNumOfMBChar; i++)
69
+ unlike *= ONE_CHAR_PROB;
70
+ return (float)1.0 - unlike;
71
+ }
72
+ else
73
+ return (float)0.99;
74
+ }
75
+