charguess 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. data/History.txt +4 -0
  2. data/Manifest.txt +134 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +26 -0
  5. data/Rakefile +25 -0
  6. data/ext/charguess/charguess.c +29 -0
  7. data/ext/charguess/extconf.rb +11 -0
  8. data/ext/libcharguess/cpp/.deps/EUCJPProber.Plo +1 -0
  9. data/ext/libcharguess/cpp/.deps/EUCJPProber.Po +87 -0
  10. data/ext/libcharguess/cpp/.deps/EUCKRProber.Plo +1 -0
  11. data/ext/libcharguess/cpp/.deps/EUCKRProber.Po +85 -0
  12. data/ext/libcharguess/cpp/.deps/EUCTWProber.Plo +1 -0
  13. data/ext/libcharguess/cpp/.deps/EUCTWProber.Po +85 -0
  14. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Plo +1 -0
  15. data/ext/libcharguess/cpp/.deps/EscCharsetProber.Po +83 -0
  16. data/ext/libcharguess/cpp/.deps/EscSM.Plo +1 -0
  17. data/ext/libcharguess/cpp/.deps/EscSM.Po +77 -0
  18. data/ext/libcharguess/cpp/.deps/GB2312Prober.Plo +1 -0
  19. data/ext/libcharguess/cpp/.deps/GB2312Prober.Po +85 -0
  20. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Plo +1 -0
  21. data/ext/libcharguess/cpp/.deps/LangBulgarianModel.Po +78 -0
  22. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Plo +1 -0
  23. data/ext/libcharguess/cpp/.deps/LangCyrillicModel.Po +78 -0
  24. data/ext/libcharguess/cpp/.deps/LangGreekModel.Plo +1 -0
  25. data/ext/libcharguess/cpp/.deps/LangGreekModel.Po +78 -0
  26. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Plo +1 -0
  27. data/ext/libcharguess/cpp/.deps/LangHungarianModel.Po +78 -0
  28. data/ext/libcharguess/cpp/.deps/LangThaiModel.Plo +1 -0
  29. data/ext/libcharguess/cpp/.deps/LangThaiModel.Po +78 -0
  30. data/ext/libcharguess/cpp/.deps/Latin1Prober.Plo +1 -0
  31. data/ext/libcharguess/cpp/.deps/Latin1Prober.Po +78 -0
  32. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Plo +1 -0
  33. data/ext/libcharguess/cpp/.deps/MBCSGroupProber.Po +102 -0
  34. data/ext/libcharguess/cpp/.deps/MBCSSM.Plo +1 -0
  35. data/ext/libcharguess/cpp/.deps/MBCSSM.Po +77 -0
  36. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Plo +1 -0
  37. data/ext/libcharguess/cpp/.deps/SBCSGroupProber.Po +80 -0
  38. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Plo +1 -0
  39. data/ext/libcharguess/cpp/.deps/SBCharsetProber.Po +78 -0
  40. data/ext/libcharguess/cpp/.deps/SJISProber.Plo +1 -0
  41. data/ext/libcharguess/cpp/.deps/SJISProber.Po +86 -0
  42. data/ext/libcharguess/cpp/.deps/UTF8Prober.Plo +1 -0
  43. data/ext/libcharguess/cpp/.deps/UTF8Prober.Po +82 -0
  44. data/ext/libcharguess/cpp/.deps/big5Prober.Plo +1 -0
  45. data/ext/libcharguess/cpp/.deps/big5Prober.Po +84 -0
  46. data/ext/libcharguess/cpp/.deps/charDistribution.Plo +1 -0
  47. data/ext/libcharguess/cpp/.deps/charDistribution.Po +87 -0
  48. data/ext/libcharguess/cpp/.deps/chardet.Plo +1 -0
  49. data/ext/libcharguess/cpp/.deps/chardet.Po +84 -0
  50. data/ext/libcharguess/cpp/.deps/charguess.Po +77 -0
  51. data/ext/libcharguess/cpp/.deps/jpCntx.Plo +1 -0
  52. data/ext/libcharguess/cpp/.deps/jpCntx.Po +75 -0
  53. data/ext/libcharguess/cpp/.deps/universal.Plo +1 -0
  54. data/ext/libcharguess/cpp/.deps/universal.Po +111 -0
  55. data/ext/libcharguess/cpp/AUTHORS +3 -0
  56. data/ext/libcharguess/cpp/Big5Freq.tab +928 -0
  57. data/ext/libcharguess/cpp/COPYING +340 -0
  58. data/ext/libcharguess/cpp/COPYRIGHT +20 -0
  59. data/ext/libcharguess/cpp/ChangeLog +0 -0
  60. data/ext/libcharguess/cpp/EUCJPProber.cpp +80 -0
  61. data/ext/libcharguess/cpp/EUCJPProber.h +58 -0
  62. data/ext/libcharguess/cpp/EUCKRFreq.tab +615 -0
  63. data/ext/libcharguess/cpp/EUCKRProber.cpp +80 -0
  64. data/ext/libcharguess/cpp/EUCKRProber.h +54 -0
  65. data/ext/libcharguess/cpp/EUCTWFreq.tab +448 -0
  66. data/ext/libcharguess/cpp/EUCTWProber.cpp +79 -0
  67. data/ext/libcharguess/cpp/EUCTWProber.h +53 -0
  68. data/ext/libcharguess/cpp/EscCharsetProber.cpp +89 -0
  69. data/ext/libcharguess/cpp/EscCharsetProber.h +49 -0
  70. data/ext/libcharguess/cpp/EscSM.cpp +244 -0
  71. data/ext/libcharguess/cpp/GB2312Freq.tab +476 -0
  72. data/ext/libcharguess/cpp/GB2312Prober.cpp +84 -0
  73. data/ext/libcharguess/cpp/GB2312Prober.h +56 -0
  74. data/ext/libcharguess/cpp/INSTALL +229 -0
  75. data/ext/libcharguess/cpp/JISFreq.tab +574 -0
  76. data/ext/libcharguess/cpp/LICENSE +504 -0
  77. data/ext/libcharguess/cpp/LangBulgarianModel.cpp +230 -0
  78. data/ext/libcharguess/cpp/LangCyrillicModel.cpp +340 -0
  79. data/ext/libcharguess/cpp/LangGreekModel.cpp +229 -0
  80. data/ext/libcharguess/cpp/LangHungarianModel.cpp +228 -0
  81. data/ext/libcharguess/cpp/LangThaiModel.cpp +206 -0
  82. data/ext/libcharguess/cpp/Latin1Prober.cpp +190 -0
  83. data/ext/libcharguess/cpp/Latin1Prober.h +49 -0
  84. data/ext/libcharguess/cpp/MBCSGroupProber.cpp +186 -0
  85. data/ext/libcharguess/cpp/MBCSGroupProber.h +58 -0
  86. data/ext/libcharguess/cpp/MBCSSM.cpp +610 -0
  87. data/ext/libcharguess/cpp/Makefile.am +45 -0
  88. data/ext/libcharguess/cpp/Makefile.in +608 -0
  89. data/ext/libcharguess/cpp/NEWS +0 -0
  90. data/ext/libcharguess/cpp/README +0 -0
  91. data/ext/libcharguess/cpp/SBCSGroupProber.cpp +244 -0
  92. data/ext/libcharguess/cpp/SBCSGroupProber.h +54 -0
  93. data/ext/libcharguess/cpp/SBCharsetProber.cpp +100 -0
  94. data/ext/libcharguess/cpp/SBCharsetProber.h +89 -0
  95. data/ext/libcharguess/cpp/SJISProber.cpp +86 -0
  96. data/ext/libcharguess/cpp/SJISProber.h +60 -0
  97. data/ext/libcharguess/cpp/UTF8Prober.cpp +75 -0
  98. data/ext/libcharguess/cpp/UTF8Prober.h +46 -0
  99. data/ext/libcharguess/cpp/aclocal.m4 +1008 -0
  100. data/ext/libcharguess/cpp/autogen.sh +153 -0
  101. data/ext/libcharguess/cpp/big5Prober.cpp +76 -0
  102. data/ext/libcharguess/cpp/big5Prober.h +53 -0
  103. data/ext/libcharguess/cpp/charDistribution.cpp +90 -0
  104. data/ext/libcharguess/cpp/charDistribution.h +219 -0
  105. data/ext/libcharguess/cpp/charguess.cpp +56 -0
  106. data/ext/libcharguess/cpp/charguess.h +23 -0
  107. data/ext/libcharguess/cpp/charsetProber.h +50 -0
  108. data/ext/libcharguess/cpp/codingStateMachine.h +92 -0
  109. data/ext/libcharguess/cpp/config.h +36 -0
  110. data/ext/libcharguess/cpp/config.h.in +35 -0
  111. data/ext/libcharguess/cpp/config.status +1075 -0
  112. data/ext/libcharguess/cpp/configure +5226 -0
  113. data/ext/libcharguess/cpp/configure.in +49 -0
  114. data/ext/libcharguess/cpp/depcomp +472 -0
  115. data/ext/libcharguess/cpp/fix_copyright +32 -0
  116. data/ext/libcharguess/cpp/install-sh +294 -0
  117. data/ext/libcharguess/cpp/jpCntx.cpp +194 -0
  118. data/ext/libcharguess/cpp/jpCntx.h +100 -0
  119. data/ext/libcharguess/cpp/missing +336 -0
  120. data/ext/libcharguess/cpp/mkinstalldirs +111 -0
  121. data/ext/libcharguess/cpp/pkgInt.h +72 -0
  122. data/ext/libcharguess/cpp/stamp-h1 +1 -0
  123. data/ext/libcharguess/cpp/test/test.cpp +78 -0
  124. data/ext/libcharguess/cpp/types.h +41 -0
  125. data/ext/libcharguess/cpp/universal.cpp +273 -0
  126. data/ext/libcharguess/cpp/universal.h +65 -0
  127. data/script/console +9 -0
  128. data/script/destroy +14 -0
  129. data/script/generate +14 -0
  130. data/tasks/extconf/charguess.rake +47 -0
  131. data/tasks/extconf.rake +13 -0
  132. data/test/test_charguess.rb +7 -0
  133. data/test/test_charguess_extn.rb +10 -0
  134. data/test/test_helper.rb +3 -0
  135. metadata +219 -0
@@ -0,0 +1,610 @@
1
+ /*
2
+ libcharguess - Guess the encoding/charset of a string
3
+ Copyright (C) 2003 Stephane Corbe <noubi@users.sourceforge.net>
4
+ Based on Mozilla sources
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ */
20
+
21
+ #include "codingStateMachine.h"
22
+
23
+ /*
24
+ Modifications from Frank Tang's original work:
25
+ . 0x00 is allowed as a legal character, since some web pages contain this char in
26
+ the text stream.
27
+ */
28
+
29
+ // BIG5
30
+
31
+ static PRUint32 BIG5_cls [ 256 / 8 ] = {
32
+ //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
33
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
34
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
35
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
36
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
37
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
38
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
39
+ PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37
40
+ PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f
41
+ PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47
42
+ PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f
43
+ PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57
44
+ PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f
45
+ PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67
46
+ PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f
47
+ PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77
48
+ PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f
49
+ PCK4BITS(4,4,4,4,4,4,4,4), // 80 - 87
50
+ PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f
51
+ PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97
52
+ PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f
53
+ PCK4BITS(4,3,3,3,3,3,3,3), // a0 - a7
54
+ PCK4BITS(3,3,3,3,3,3,3,3), // a8 - af
55
+ PCK4BITS(3,3,3,3,3,3,3,3), // b0 - b7
56
+ PCK4BITS(3,3,3,3,3,3,3,3), // b8 - bf
57
+ PCK4BITS(3,3,3,3,3,3,3,3), // c0 - c7
58
+ PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf
59
+ PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7
60
+ PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df
61
+ PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7
62
+ PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef
63
+ PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7
64
+ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
65
+ };
66
+
67
+
68
+ static PRUint32 BIG5_st [ 3] = {
69
+ PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
70
+ PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
71
+ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
72
+ };
73
+
74
+ static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
75
+
76
+ SMModel Big5SMModel = {
77
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
78
+ 5,
79
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
80
+ Big5CharLenTable,
81
+ "Big5",
82
+ };
83
+
84
+ static PRUint32 EUCJP_cls [ 256 / 8 ] = {
85
+ //PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
86
+ PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
87
+ PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
88
+ PCK4BITS(4,4,4,4,4,4,4,4), // 10 - 17
89
+ PCK4BITS(4,4,4,5,4,4,4,4), // 18 - 1f
90
+ PCK4BITS(4,4,4,4,4,4,4,4), // 20 - 27
91
+ PCK4BITS(4,4,4,4,4,4,4,4), // 28 - 2f
92
+ PCK4BITS(4,4,4,4,4,4,4,4), // 30 - 37
93
+ PCK4BITS(4,4,4,4,4,4,4,4), // 38 - 3f
94
+ PCK4BITS(4,4,4,4,4,4,4,4), // 40 - 47
95
+ PCK4BITS(4,4,4,4,4,4,4,4), // 48 - 4f
96
+ PCK4BITS(4,4,4,4,4,4,4,4), // 50 - 57
97
+ PCK4BITS(4,4,4,4,4,4,4,4), // 58 - 5f
98
+ PCK4BITS(4,4,4,4,4,4,4,4), // 60 - 67
99
+ PCK4BITS(4,4,4,4,4,4,4,4), // 68 - 6f
100
+ PCK4BITS(4,4,4,4,4,4,4,4), // 70 - 77
101
+ PCK4BITS(4,4,4,4,4,4,4,4), // 78 - 7f
102
+ PCK4BITS(5,5,5,5,5,5,5,5), // 80 - 87
103
+ PCK4BITS(5,5,5,5,5,5,1,3), // 88 - 8f
104
+ PCK4BITS(5,5,5,5,5,5,5,5), // 90 - 97
105
+ PCK4BITS(5,5,5,5,5,5,5,5), // 98 - 9f
106
+ PCK4BITS(5,2,2,2,2,2,2,2), // a0 - a7
107
+ PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af
108
+ PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7
109
+ PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf
110
+ PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7
111
+ PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf
112
+ PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7
113
+ PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df
114
+ PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
115
+ PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
116
+ PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
117
+ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
118
+ };
119
+
120
+
121
+ static PRUint32 EUCJP_st [ 5] = {
122
+ PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
123
+ PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
124
+ PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
125
+ PCK4BITS(eError,eError,eStart,eError,eError,eError, 3,eError),//18-1f
126
+ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
127
+ };
128
+
129
+ static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
130
+
131
+ SMModel EUCJPSMModel = {
132
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
133
+ 6,
134
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
135
+ EUCJPCharLenTable,
136
+ "EUC-JP",
137
+ };
138
+
139
+ static PRUint32 EUCKR_cls [ 256 / 8 ] = {
140
+ //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
141
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
142
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
143
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
144
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
145
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
146
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
147
+ PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37
148
+ PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f
149
+ PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47
150
+ PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f
151
+ PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57
152
+ PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f
153
+ PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67
154
+ PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f
155
+ PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77
156
+ PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f
157
+ PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
158
+ PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
159
+ PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
160
+ PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
161
+ PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7
162
+ PCK4BITS(2,2,2,2,2,3,3,3), // a8 - af
163
+ PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7
164
+ PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf
165
+ PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7
166
+ PCK4BITS(2,3,2,2,2,2,2,2), // c8 - cf
167
+ PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7
168
+ PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df
169
+ PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7
170
+ PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef
171
+ PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7
172
+ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
173
+ };
174
+
175
+
176
+ static PRUint32 EUCKR_st [ 2] = {
177
+ PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
178
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
179
+ };
180
+
181
+ static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
182
+
183
+ SMModel EUCKRSMModel = {
184
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
185
+ 4,
186
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
187
+ EUCKRCharLenTable,
188
+ "EUC-KR",
189
+ };
190
+
191
+ static PRUint32 EUCTW_cls [ 256 / 8 ] = {
192
+ //PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
193
+ PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
194
+ PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
195
+ PCK4BITS(2,2,2,2,2,2,2,2), // 10 - 17
196
+ PCK4BITS(2,2,2,0,2,2,2,2), // 18 - 1f
197
+ PCK4BITS(2,2,2,2,2,2,2,2), // 20 - 27
198
+ PCK4BITS(2,2,2,2,2,2,2,2), // 28 - 2f
199
+ PCK4BITS(2,2,2,2,2,2,2,2), // 30 - 37
200
+ PCK4BITS(2,2,2,2,2,2,2,2), // 38 - 3f
201
+ PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47
202
+ PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f
203
+ PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57
204
+ PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f
205
+ PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67
206
+ PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f
207
+ PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77
208
+ PCK4BITS(2,2,2,2,2,2,2,2), // 78 - 7f
209
+ PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
210
+ PCK4BITS(0,0,0,0,0,0,6,0), // 88 - 8f
211
+ PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
212
+ PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
213
+ PCK4BITS(0,3,4,4,4,4,4,4), // a0 - a7
214
+ PCK4BITS(5,5,1,1,1,1,1,1), // a8 - af
215
+ PCK4BITS(1,1,1,1,1,1,1,1), // b0 - b7
216
+ PCK4BITS(1,1,1,1,1,1,1,1), // b8 - bf
217
+ PCK4BITS(1,1,3,1,3,3,3,3), // c0 - c7
218
+ PCK4BITS(3,3,3,3,3,3,3,3), // c8 - cf
219
+ PCK4BITS(3,3,3,3,3,3,3,3), // d0 - d7
220
+ PCK4BITS(3,3,3,3,3,3,3,3), // d8 - df
221
+ PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7
222
+ PCK4BITS(3,3,3,3,3,3,3,3), // e8 - ef
223
+ PCK4BITS(3,3,3,3,3,3,3,3), // f0 - f7
224
+ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
225
+ };
226
+
227
+
228
+ static PRUint32 EUCTW_st [ 6] = {
229
+ PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
230
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
231
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
232
+ PCK4BITS(eStart,eStart,eStart,eError,eError,eError,eError,eError),//18-1f
233
+ PCK4BITS( 5,eError,eError,eError,eStart,eError,eStart,eStart),//20-27
234
+ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
235
+ };
236
+
237
+ static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
238
+
239
+ SMModel EUCTWSMModel = {
240
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
241
+ 7,
242
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
243
+ EUCTWCharLenTable,
244
+ "x-euc-tw",
245
+ };
246
+
247
+ /* GB2312 is obsolete; it has been superseded by GB18030
248
+ static PRUint32 GB2312_cls [ 256 / 8 ] = {
249
+ //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
250
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
251
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
252
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
253
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
254
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
255
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
256
+ PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37
257
+ PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f
258
+ PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47
259
+ PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f
260
+ PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57
261
+ PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f
262
+ PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67
263
+ PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f
264
+ PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77
265
+ PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f
266
+ PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87
267
+ PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
268
+ PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
269
+ PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
270
+ PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7
271
+ PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af
272
+ PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7
273
+ PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf
274
+ PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7
275
+ PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf
276
+ PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7
277
+ PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df
278
+ PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7
279
+ PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef
280
+ PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7
281
+ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
282
+ };
283
+
284
+
285
+ static PRUint32 GB2312_st [ 2] = {
286
+ PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
287
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
288
+ };
289
+
290
+ static const PRUint32 GB2312CharLenTable[] = {0, 1, 2, 0};
291
+
292
+ SMModel GB2312SMModel = {
293
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls },
294
+ 4,
295
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st },
296
+ GB2312CharLenTable,
297
+ "GB2312",
298
+ };
299
+ */
300
+
301
+ // The following state machine data was created by a perl script in
302
+ // intl/chardet/tools. It should be the same as in PSM detector.
303
+ static PRUint32 GB18030_cls [ 256 / 8 ] = {
304
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
305
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
306
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
307
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
308
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
309
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
310
+ PCK4BITS(3,3,3,3,3,3,3,3), // 30 - 37
311
+ PCK4BITS(3,3,1,1,1,1,1,1), // 38 - 3f
312
+ PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47
313
+ PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f
314
+ PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57
315
+ PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f
316
+ PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67
317
+ PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f
318
+ PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77
319
+ PCK4BITS(2,2,2,2,2,2,2,4), // 78 - 7f
320
+ PCK4BITS(5,6,6,6,6,6,6,6), // 80 - 87
321
+ PCK4BITS(6,6,6,6,6,6,6,6), // 88 - 8f
322
+ PCK4BITS(6,6,6,6,6,6,6,6), // 90 - 97
323
+ PCK4BITS(6,6,6,6,6,6,6,6), // 98 - 9f
324
+ PCK4BITS(6,6,6,6,6,6,6,6), // a0 - a7
325
+ PCK4BITS(6,6,6,6,6,6,6,6), // a8 - af
326
+ PCK4BITS(6,6,6,6,6,6,6,6), // b0 - b7
327
+ PCK4BITS(6,6,6,6,6,6,6,6), // b8 - bf
328
+ PCK4BITS(6,6,6,6,6,6,6,6), // c0 - c7
329
+ PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf
330
+ PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7
331
+ PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df
332
+ PCK4BITS(6,6,6,6,6,6,6,6), // e0 - e7
333
+ PCK4BITS(6,6,6,6,6,6,6,6), // e8 - ef
334
+ PCK4BITS(6,6,6,6,6,6,6,6), // f0 - f7
335
+ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
336
+ };
337
+
338
+
339
+ static PRUint32 GB18030_st [ 6] = {
340
+ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
341
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
342
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
343
+ PCK4BITS( 4,eError,eStart,eStart,eError,eError,eError,eError),//18-1f
344
+ PCK4BITS(eError,eError, 5,eError,eError,eError,eItsMe,eError),//20-27
345
+ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
346
+ };
347
+
348
+ // To be accurate, the length of class 6 can be either 2 or 4.
349
+ // But it is not necessary to discriminate between the two since
350
+ // it is used for frequency analysis only, and we are validating
351
+ // each code range there as well. So it is safe to set it to be
352
+ // 2 here.
353
+ static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
354
+
355
+ SMModel GB18030SMModel = {
356
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
357
+ 7,
358
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
359
+ GB18030CharLenTable,
360
+ "GB18030",
361
+ };
362
+
363
+ // sjis
364
+
365
+ static PRUint32 SJIS_cls [ 256 / 8 ] = {
366
+ //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
367
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
368
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
369
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
370
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
371
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
372
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
373
+ PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37
374
+ PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f
375
+ PCK4BITS(2,2,2,2,2,2,2,2), // 40 - 47
376
+ PCK4BITS(2,2,2,2,2,2,2,2), // 48 - 4f
377
+ PCK4BITS(2,2,2,2,2,2,2,2), // 50 - 57
378
+ PCK4BITS(2,2,2,2,2,2,2,2), // 58 - 5f
379
+ PCK4BITS(2,2,2,2,2,2,2,2), // 60 - 67
380
+ PCK4BITS(2,2,2,2,2,2,2,2), // 68 - 6f
381
+ PCK4BITS(2,2,2,2,2,2,2,2), // 70 - 77
382
+ PCK4BITS(2,2,2,2,2,2,2,1), // 78 - 7f
383
+ PCK4BITS(3,3,3,3,3,3,3,3), // 80 - 87
384
+ PCK4BITS(3,3,3,3,3,3,3,3), // 88 - 8f
385
+ PCK4BITS(3,3,3,3,3,3,3,3), // 90 - 97
386
+ PCK4BITS(3,3,3,3,3,3,3,3), // 98 - 9f
387
+ //0xa0 is illegal in sjis encoding, but some pages do
388
+ //contain such a byte. We need to be more error forgiving.
389
+ PCK4BITS(2,2,2,2,2,2,2,2), // a0 - a7
390
+ PCK4BITS(2,2,2,2,2,2,2,2), // a8 - af
391
+ PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7
392
+ PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf
393
+ PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7
394
+ PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf
395
+ PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7
396
+ PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df
397
+ PCK4BITS(3,3,3,3,3,3,3,3), // e0 - e7
398
+ PCK4BITS(3,3,3,3,3,4,4,4), // e8 - ef
399
+ PCK4BITS(4,4,4,4,4,4,4,4), // f0 - f7
400
+ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
401
+ };
402
+
403
+
404
+ static PRUint32 SJIS_st [ 3] = {
405
+ PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
406
+ PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
407
+ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
408
+ };
409
+
410
+ static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
411
+
412
+ SMModel SJISSMModel = {
413
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
414
+ 6,
415
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
416
+ SJISCharLenTable,
417
+ "Shift_JIS",
418
+ };
419
+
420
+
421
+ static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
422
+ PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
423
+ PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
424
+ PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
425
+ PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
426
+ PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
427
+ PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
428
+ PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
429
+ PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
430
+ PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
431
+ PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
432
+ PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
433
+ PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
434
+ PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
435
+ PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
436
+ PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
437
+ PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
438
+ PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
439
+ PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
440
+ PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
441
+ PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
442
+ PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
443
+ PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
444
+ PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
445
+ PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
446
+ PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
447
+ PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
448
+ PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
449
+ PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
450
+ PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
451
+ PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
452
+ PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
453
+ PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
454
+ };
455
+
456
+
457
+ static PRUint32 UCS2BE_st [ 7] = {
458
+ PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
459
+ PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
460
+ PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
461
+ PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
462
+ PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
463
+ PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
464
+ PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
465
+ };
466
+
467
+ static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
468
+
469
+ SMModel UCS2BESMModel = {
470
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
471
+ 6,
472
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
473
+ UCS2BECharLenTable,
474
+ "UTF-16BE",
475
+ };
476
+
477
+ static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
478
+ PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
479
+ PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
480
+ PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
481
+ PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
482
+ PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
483
+ PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
484
+ PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
485
+ PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
486
+ PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
487
+ PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
488
+ PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
489
+ PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
490
+ PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
491
+ PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
492
+ PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
493
+ PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
494
+ PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
495
+ PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
496
+ PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
497
+ PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
498
+ PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
499
+ PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
500
+ PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
501
+ PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
502
+ PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
503
+ PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
504
+ PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
505
+ PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
506
+ PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
507
+ PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
508
+ PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
509
+ PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
510
+ };
511
+
512
+
513
+ static PRUint32 UCS2LE_st [ 7] = {
514
+ PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
515
+ PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
516
+ PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
517
+ PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
518
+ PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
519
+ PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
520
+ PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
521
+ };
522
+
523
+ static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
524
+
525
+ SMModel UCS2LESMModel = {
526
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
527
+ 6,
528
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
529
+ UCS2LECharLenTable,
530
+ "UTF-16LE",
531
+ };
532
+
533
+
534
+ static PRUint32 UTF8_cls [ 256 / 8 ] = {
535
+ //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
536
+ PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
537
+ PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
538
+ PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
539
+ PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f
540
+ PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27
541
+ PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f
542
+ PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37
543
+ PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f
544
+ PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47
545
+ PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f
546
+ PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57
547
+ PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f
548
+ PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67
549
+ PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f
550
+ PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77
551
+ PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f
552
+ PCK4BITS(2,2,2,2,3,3,3,3), // 80 - 87
553
+ PCK4BITS(4,4,4,4,4,4,4,4), // 88 - 8f
554
+ PCK4BITS(4,4,4,4,4,4,4,4), // 90 - 97
555
+ PCK4BITS(4,4,4,4,4,4,4,4), // 98 - 9f
556
+ PCK4BITS(5,5,5,5,5,5,5,5), // a0 - a7
557
+ PCK4BITS(5,5,5,5,5,5,5,5), // a8 - af
558
+ PCK4BITS(5,5,5,5,5,5,5,5), // b0 - b7
559
+ PCK4BITS(5,5,5,5,5,5,5,5), // b8 - bf
560
+ PCK4BITS(0,0,6,6,6,6,6,6), // c0 - c7
561
+ PCK4BITS(6,6,6,6,6,6,6,6), // c8 - cf
562
+ PCK4BITS(6,6,6,6,6,6,6,6), // d0 - d7
563
+ PCK4BITS(6,6,6,6,6,6,6,6), // d8 - df
564
+ PCK4BITS(7,8,8,8,8,8,8,8), // e0 - e7
565
+ PCK4BITS(8,8,8,8,8,9,8,8), // e8 - ef
566
+ PCK4BITS(10,11,11,11,11,11,11,11), // f0 - f7
567
+ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
568
+ };
569
+
570
+
571
+ static PRUint32 UTF8_st [ 26] = {
572
+ PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
573
+ PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
574
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
575
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//18-1f
576
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//20-27
577
+ PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//28-2f
578
+ PCK4BITS(eError,eError, 5, 5, 5, 5,eError,eError),//30-37
579
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//38-3f
580
+ PCK4BITS(eError,eError,eError, 5, 5, 5,eError,eError),//40-47
581
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//48-4f
582
+ PCK4BITS(eError,eError, 7, 7, 7, 7,eError,eError),//50-57
583
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//58-5f
584
+ PCK4BITS(eError,eError,eError,eError, 7, 7,eError,eError),//60-67
585
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//68-6f
586
+ PCK4BITS(eError,eError, 9, 9, 9, 9,eError,eError),//70-77
587
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//78-7f
588
+ PCK4BITS(eError,eError,eError,eError,eError, 9,eError,eError),//80-87
589
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//88-8f
590
+ PCK4BITS(eError,eError, 12, 12, 12, 12,eError,eError),//90-97
591
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//98-9f
592
+ PCK4BITS(eError,eError,eError,eError,eError, 12,eError,eError),//a0-a7
593
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//a8-af
594
+ PCK4BITS(eError,eError, 12, 12, 12,eError,eError,eError),//b0-b7
595
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//b8-bf
596
+ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eError,eError),//c0-c7
597
+ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
598
+ };
599
+
600
+ static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
601
+ 3, 3, 4, 4, 5, 5, 6, 6 };
602
+
603
+ SMModel UTF8SMModel = {
604
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
605
+ 16,
606
+ {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
607
+ UTF8CharLenTable,
608
+ "UTF-8",
609
+ };
610
+
@@ -0,0 +1,45 @@
1
+ #----------------------------------------------------------------------
2
+ #
3
+
4
+ ## Process this file with automake to produce Makefile.in
5
+
6
+ AM_CPPFLAGS = $(all_includes)
7
+
8
+ lib_LIBRARIES = libcharguess.a
9
+
10
+ # libcharguess_a_LIBADD =
11
+ # libcharguess_a_LDFLAGS = $(all_libraries) -version-info 0.1 -no-undefined
12
+
13
+ libcharguess_a_SOURCES = \
14
+ types.h \
15
+ charsetProber.h \
16
+ codingStateMachine.h pkgInt.h \
17
+ charguess.cpp charguess.h \
18
+ EscSM.cpp \
19
+ MBCSSM.cpp \
20
+ GB2312Prober.cpp GB2312Prober.h \
21
+ Latin1Prober.cpp Latin1Prober.h \
22
+ SJISProber.cpp SJISProber.h \
23
+ big5Prober.cpp big5Prober.h \
24
+ EUCJPProber.cpp EUCJPProber.h \
25
+ jpCntx.cpp jpCntx.h \
26
+ MBCSGroupProber.cpp MBCSGroupProber.h \
27
+ EUCKRProber.cpp EUCKRProber.h \
28
+ charDistribution.cpp charDistribution.h \
29
+ universal.cpp universal.h \
30
+ EUCTWProber.cpp EUCTWProber.h \
31
+ SBCharsetProber.cpp SBCharsetProber.h \
32
+ EscCharsetProber.cpp EscCharsetProber.h \
33
+ SBCSGroupProber.cpp SBCSGroupProber.h \
34
+ UTF8Prober.cpp UTF8Prober.h \
35
+ LangHungarianModel.cpp \
36
+ LangThaiModel.cpp \
37
+ LangCyrillicModel.cpp \
38
+ LangBulgarianModel.cpp \
39
+ LangGreekModel.cpp
40
+
41
+ METASOURCES = AUTO
42
+
43
+ CLEANFILES = *.bak *~
44
+
45
+