charguess 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,190 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "Latin1Prober.h"
22
+ #include "types.h"
23
+
24
// Character classes assigned to each byte value by Latin1_CharToClass and
// used as row/column indices into Latin1ClassModel.
#define UDF        0   // undefined byte
#define OTH        1   // anything that is not a letter
#define ASC        2   // ASCII capital letter
#define ASS        3   // ASCII small letter
#define ACV        4   // accented capital vowel
#define ACO        5   // accented capital, non-vowel
#define ASV        6   // accented small vowel
#define ASO        7   // accented small, non-vowel
#define CLASS_NUM  8   // number of classes above
33
+
34
+ static unsigned char Latin1_CharToClass[] =
35
+ {
36
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
37
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
38
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
39
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
40
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
41
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
42
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
43
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
44
+ OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
45
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
46
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
47
+ ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
48
+ OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
49
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
50
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
51
+ ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
52
+ OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
53
+ OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
54
+ UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
55
+ OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
56
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
57
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
58
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
59
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
60
+ ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
61
+ ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
62
+ ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
63
+ ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
64
+ ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
65
+ ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
66
+ ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
67
+ ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
68
+ };
69
+
70
+
71
+ /* 0 : illegal
72
+ 1 : very unlikely
73
+ 2 : normal
74
+ 3 : very likely
75
+ */
76
+ static unsigned char Latin1ClassModel[] =
77
+ {
78
+ /* UDF OTH ASC ASS ACV ACO ASV ASO */
79
+ /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
80
+ /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
81
+ /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
82
+ /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
83
+ /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
84
+ /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
85
+ /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
86
+ /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
87
+ };
88
+
89
+ void nsLatin1Prober::Reset(void)
90
+ {
91
+ mState = eDetecting;
92
+ mLastCharClass = OTH;
93
+ for (int i = 0; i < FREQ_CAT_NUM; i++)
94
+ mFreqCounter[i] = 0;
95
+ }
96
+
97
+ PRBool nsLatin1Prober::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
98
+ {
99
+ //do filtering to reduce load to probers
100
+ char *newptr;
101
+ char *prevPtr, *curPtr;
102
+ PRBool isInTag = PR_FALSE;
103
+
104
+ newptr = *newBuf = (char*)PR_MALLOC(aLen);
105
+ if (!newptr)
106
+ return PR_FALSE;
107
+
108
+ for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
109
+ {
110
+ /*
111
+ if (*curPtr == '>')
112
+ isInTag = PR_FALSE;
113
+ else if (*curPtr == '<')
114
+ isInTag = PR_TRUE;
115
+ */
116
+
117
+ if (!(*curPtr & 0x80) &&
118
+ (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
119
+ {
120
+ if (curPtr > prevPtr && !isInTag) //current segment contains more than just a symbol
121
+ // and it is not inside a tag, keep it
122
+ {
123
+ while (prevPtr < curPtr) *newptr++ = *prevPtr++;
124
+ prevPtr++;
125
+ *newptr++ = ' ';
126
+ }
127
+ else
128
+ prevPtr = curPtr+1;
129
+ }
130
+ }
131
+
132
+ newLen = newptr - *newBuf;
133
+
134
+ return PR_TRUE;
135
+ }
136
+
137
+
138
+ nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
139
+ {
140
+ char *newBuf1;
141
+ PRUint32 newLen1;
142
+
143
+ if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
144
+ newBuf1 = (char*)aBuf;
145
+ newLen1 = aLen;
146
+ }
147
+
148
+ unsigned char charClass;
149
+ unsigned char freq;
150
+ for (PRUint32 i = 0; i < newLen1; i++)
151
+ {
152
+ charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
153
+ freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
154
+ if (freq == 0) {
155
+ mState = eNotMe;
156
+ break;
157
+ }
158
+ mFreqCounter[freq]++;
159
+ mLastCharClass = charClass;
160
+ }
161
+
162
+ if (newBuf1 != aBuf)
163
+ PR_FREEIF(newBuf1);
164
+
165
+ return mState;
166
+ }
167
+
168
+ float nsLatin1Prober::GetConfidence(void)
169
+ {
170
+ if (mState == eNotMe)
171
+ return 0.01f;
172
+
173
+ float confidence;
174
+ PRUint32 total = 0;
175
+ for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
176
+ total += mFreqCounter[i];
177
+
178
+ confidence = mFreqCounter[3]*1.0f / total;
179
+ confidence -= mFreqCounter[1]*20.0f/total;
180
+
181
+ if (confidence < 0.0f)
182
+ confidence = 0.0f;
183
+
184
+ // lower the confidence of latin1 so that other more accurate detector
185
+ // can take priority.
186
+ confidence *= 0.50f;
187
+
188
+ return confidence;
189
+ }
190
+
@@ -0,0 +1,49 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsLatin1Prober_h__
22
+ #define nsLatin1Prober_h__
23
+
24
+ #include "charsetProber.h"
25
+
26
+ #define FREQ_CAT_NUM 4
27
+
28
+ class nsLatin1Prober: public nsCharSetProber {
29
+ public:
30
+ nsLatin1Prober(void){Reset();};
31
+ virtual ~nsLatin1Prober(void){};
32
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
33
+ const char* GetCharSetName() {return "windows-1252";};
34
+ nsProbingState GetState(void) {return mState;};
35
+ void Reset(void);
36
+ float GetConfidence(void);
37
+ void SetOpion() {};
38
+
39
+ protected:
40
+ PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
41
+
42
+ nsProbingState mState;
43
+ char mLastCharClass;
44
+ PRUint32 mFreqCounter[FREQ_CAT_NUM];
45
+ };
46
+
47
+
48
+ #endif /* nsLatin1Prober_h__ */
49
+
@@ -0,0 +1,186 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ // #include <stdio.h>
22
+ // #include "prmem.h"
23
+ #include "types.h"
24
+
25
+ #include "MBCSGroupProber.h"
26
+
27
+ #ifdef DEBUG_chardet
28
// Human-readable names for the probers, in the same order as the
// mProbers[] slots; used only by the debug dump. The elements point at
// string literals, so they are declared as pointers to const char.
const char *ProberName[] =
{
  "UTF8",
  "SJIS",
  "EUCJP",
  "GB18030",
  "EUCKR",
  "Big5",
  "EUCTW",
};
38
+
39
+ #endif
40
+
41
+ nsMBCSGroupProber::nsMBCSGroupProber()
42
+ {
43
+ mProbers[0] = new nsUTF8Prober();
44
+ mProbers[1] = new nsSJISProber();
45
+ mProbers[2] = new nsEUCJPProber();
46
+ mProbers[3] = new nsGB18030Prober();
47
+ mProbers[4] = new nsEUCKRProber();
48
+ mProbers[5] = new nsBig5Prober();
49
+ mProbers[6] = new nsEUCTWProber();
50
+ Reset();
51
+ }
52
+
53
+ nsMBCSGroupProber::~nsMBCSGroupProber()
54
+ {
55
+ for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
56
+ {
57
+ delete mProbers[i];
58
+ }
59
+ }
60
+
61
+ const char* nsMBCSGroupProber::GetCharSetName()
62
+ {
63
+ if (mBestGuess == -1)
64
+ {
65
+ GetConfidence();
66
+ if (mBestGuess == -1)
67
+ mBestGuess = 0;
68
+ }
69
+ return mProbers[mBestGuess]->GetCharSetName();
70
+ }
71
+
72
+ void nsMBCSGroupProber::Reset(void)
73
+ {
74
+ for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
75
+ {
76
+ mProbers[i]->Reset();
77
+ mIsActive[i] = PR_TRUE;
78
+ }
79
+ mActiveNum = NUM_OF_PROBERS;
80
+ mBestGuess = -1;
81
+ mState = eDetecting;
82
+ }
83
+
84
+ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
85
+ {
86
+ nsProbingState st;
87
+ PRUint32 i;
88
+
89
+ //do filtering to reduce load to probers
90
+ char *highbyteBuf;
91
+ char *hptr;
92
+ PRBool keepNext = PR_TRUE; //assume previous is not ascii, it will do not harm except add some noise
93
+ hptr = highbyteBuf = (char*)PR_MALLOC(aLen);
94
+ for (i = 0; i < aLen; i++)
95
+ {
96
+ if (aBuf[i] & 0x80)
97
+ {
98
+ *hptr++ = aBuf[i];
99
+ keepNext = PR_TRUE;
100
+ }
101
+ else
102
+ {
103
+ //if previous is highbyte, keep this even it is a ASCII
104
+ if (keepNext)
105
+ {
106
+ *hptr++ = aBuf[i];
107
+ keepNext = PR_FALSE;
108
+ }
109
+ }
110
+ }
111
+
112
+ for (i = 0; i < NUM_OF_PROBERS; i++)
113
+ {
114
+ if (!mIsActive[i])
115
+ continue;
116
+ st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
117
+ if (st == eFoundIt)
118
+ {
119
+ mBestGuess = i;
120
+ mState = eFoundIt;
121
+ break;
122
+ }
123
+ else if (st == eNotMe)
124
+ {
125
+ mIsActive[i] = PR_FALSE;
126
+ mActiveNum--;
127
+ if (mActiveNum <= 0)
128
+ {
129
+ mState = eNotMe;
130
+ break;
131
+ }
132
+ }
133
+ }
134
+
135
+ PR_FREEIF(highbyteBuf);
136
+
137
+ return mState;
138
+ }
139
+
140
+ float nsMBCSGroupProber::GetConfidence(void)
141
+ {
142
+ PRUint32 i;
143
+ float bestConf = 0.0, cf;
144
+
145
+ switch (mState)
146
+ {
147
+ case eFoundIt:
148
+ return (float)0.99;
149
+ case eNotMe:
150
+ return (float)0.01;
151
+ default:
152
+ for (i = 0; i < NUM_OF_PROBERS; i++)
153
+ {
154
+ if (!mIsActive[i])
155
+ continue;
156
+ cf = mProbers[i]->GetConfidence();
157
+ if (bestConf < cf)
158
+ {
159
+ bestConf = cf;
160
+ mBestGuess = i;
161
+ }
162
+ }
163
+ }
164
+ return bestConf;
165
+ }
166
+
167
+ #ifdef DEBUG_chardet
168
+ void
169
+ nsMBCSGroupProber::DumpStatus()
170
+ {
171
+ PRUint32 i;
172
+ float cf;
173
+
174
+ GetConfidence();
175
+ for (i = 0; i < NUM_OF_PROBERS; i++)
176
+ {
177
+ if (!mIsActive[i])
178
+ printf("[%s] is inactive(ie. cofidence is too low).\r\n", ProberName[i]);
179
+ else
180
+ {
181
+ cf = mProbers[i]->GetConfidence();
182
+ printf("[%s] prober has confidence %f\r\n", ProberName[i], cf);
183
+ }
184
+ }
185
+ }
186
+ #endif
@@ -0,0 +1,58 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #ifndef nsMBCSGroupProber_h__
22
+ #define nsMBCSGroupProber_h__
23
+
24
+ #include "SJISProber.h"
25
+ #include "UTF8Prober.h"
26
+ #include "EUCJPProber.h"
27
+ #include "GB2312Prober.h"
28
+ #include "EUCKRProber.h"
29
+ #include "big5Prober.h"
30
+ #include "EUCTWProber.h"
31
+
32
+ #define NUM_OF_PROBERS 7
33
+
34
+ class nsMBCSGroupProber: public nsCharSetProber {
35
+ public:
36
+ nsMBCSGroupProber();
37
+ virtual ~nsMBCSGroupProber();
38
+ nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
39
+ const char* GetCharSetName();
40
+ nsProbingState GetState(void) {return mState;};
41
+ void Reset(void);
42
+ float GetConfidence(void);
43
+ void SetOpion() {};
44
+
45
+ #ifdef DEBUG_chardet
46
+ void DumpStatus();
47
+ #endif
48
+
49
+ protected:
50
+ nsProbingState mState;
51
+ nsCharSetProber* mProbers[NUM_OF_PROBERS];
52
+ PRBool mIsActive[NUM_OF_PROBERS];
53
+ PRInt32 mBestGuess;
54
+ PRUint32 mActiveNum;
55
+ };
56
+
57
+ #endif /* nsMBCSGroupProber_h__ */
58
+