chardet2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/MBCSSM.rb ADDED
@@ -0,0 +1,513 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module UniversalDetector
30
+ BIG5_cls = [ \
31
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
32
+ 1,1,1,1,1,1,0,0, # 08 - 0f
33
+ 1,1,1,1,1,1,1,1, # 10 - 17
34
+ 1,1,1,0,1,1,1,1, # 18 - 1f
35
+ 1,1,1,1,1,1,1,1, # 20 - 27
36
+ 1,1,1,1,1,1,1,1, # 28 - 2f
37
+ 1,1,1,1,1,1,1,1, # 30 - 37
38
+ 1,1,1,1,1,1,1,1, # 38 - 3f
39
+ 2,2,2,2,2,2,2,2, # 40 - 47
40
+ 2,2,2,2,2,2,2,2, # 48 - 4f
41
+ 2,2,2,2,2,2,2,2, # 50 - 57
42
+ 2,2,2,2,2,2,2,2, # 58 - 5f
43
+ 2,2,2,2,2,2,2,2, # 60 - 67
44
+ 2,2,2,2,2,2,2,2, # 68 - 6f
45
+ 2,2,2,2,2,2,2,2, # 70 - 77
46
+ 2,2,2,2,2,2,2,1, # 78 - 7f
47
+ 4,4,4,4,4,4,4,4, # 80 - 87
48
+ 4,4,4,4,4,4,4,4, # 88 - 8f
49
+ 4,4,4,4,4,4,4,4, # 90 - 97
50
+ 4,4,4,4,4,4,4,4, # 98 - 9f
51
+ 4,3,3,3,3,3,3,3, # a0 - a7
52
+ 3,3,3,3,3,3,3,3, # a8 - af
53
+ 3,3,3,3,3,3,3,3, # b0 - b7
54
+ 3,3,3,3,3,3,3,3, # b8 - bf
55
+ 3,3,3,3,3,3,3,3, # c0 - c7
56
+ 3,3,3,3,3,3,3,3, # c8 - cf
57
+ 3,3,3,3,3,3,3,3, # d0 - d7
58
+ 3,3,3,3,3,3,3,3, # d8 - df
59
+ 3,3,3,3,3,3,3,3, # e0 - e7
60
+ 3,3,3,3,3,3,3,3, # e8 - ef
61
+ 3,3,3,3,3,3,3,3, # f0 - f7
62
+ 3,3,3,3,3,3,3,0] # f8 - ff
63
+
64
+ BIG5_st = [ \
65
+ :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error,#00-07
66
+ :Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,#08-0f
67
+ :Error,:Start,:Start,:Start,:Start,:Start,:Start,:Start]#10-17
68
+
69
+ Big5CharLenTable = [0, 1, 1, 2, 0]
70
+
71
+ Big5SMModel = {'classTable' => BIG5_cls,
72
+ 'classFactor' => 5,
73
+ 'stateTable' => BIG5_st,
74
+ 'charLenTable' => Big5CharLenTable,
75
+ 'name' => 'Big5'}
76
+
77
+ # EUC-JP
78
+
79
+ EUCJP_cls = [ \
80
+ 4,4,4,4,4,4,4,4, # 00 - 07
81
+ 4,4,4,4,4,4,5,5, # 08 - 0f
82
+ 4,4,4,4,4,4,4,4, # 10 - 17
83
+ 4,4,4,5,4,4,4,4, # 18 - 1f
84
+ 4,4,4,4,4,4,4,4, # 20 - 27
85
+ 4,4,4,4,4,4,4,4, # 28 - 2f
86
+ 4,4,4,4,4,4,4,4, # 30 - 37
87
+ 4,4,4,4,4,4,4,4, # 38 - 3f
88
+ 4,4,4,4,4,4,4,4, # 40 - 47
89
+ 4,4,4,4,4,4,4,4, # 48 - 4f
90
+ 4,4,4,4,4,4,4,4, # 50 - 57
91
+ 4,4,4,4,4,4,4,4, # 58 - 5f
92
+ 4,4,4,4,4,4,4,4, # 60 - 67
93
+ 4,4,4,4,4,4,4,4, # 68 - 6f
94
+ 4,4,4,4,4,4,4,4, # 70 - 77
95
+ 4,4,4,4,4,4,4,4, # 78 - 7f
96
+ 5,5,5,5,5,5,5,5, # 80 - 87
97
+ 5,5,5,5,5,5,1,3, # 88 - 8f
98
+ 5,5,5,5,5,5,5,5, # 90 - 97
99
+ 5,5,5,5,5,5,5,5, # 98 - 9f
100
+ 5,2,2,2,2,2,2,2, # a0 - a7
101
+ 2,2,2,2,2,2,2,2, # a8 - af
102
+ 2,2,2,2,2,2,2,2, # b0 - b7
103
+ 2,2,2,2,2,2,2,2, # b8 - bf
104
+ 2,2,2,2,2,2,2,2, # c0 - c7
105
+ 2,2,2,2,2,2,2,2, # c8 - cf
106
+ 2,2,2,2,2,2,2,2, # d0 - d7
107
+ 2,2,2,2,2,2,2,2, # d8 - df
108
+ 0,0,0,0,0,0,0,0, # e0 - e7
109
+ 0,0,0,0,0,0,0,0, # e8 - ef
110
+ 0,0,0,0,0,0,0,0, # f0 - f7
111
+ 0,0,0,0,0,0,0,5] # f8 - ff
112
+
113
+ EUCJP_st = [ \
114
+ 3, 4, 3, 5,:Start,:Error,:Error,:Error,#00-07
115
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
116
+ :ItsMe,:ItsMe,:Start,:Error,:Start,:Error,:Error,:Error,#10-17
117
+ :Error,:Error,:Start,:Error,:Error,:Error, 3,:Error,#18-1f
118
+ 3,:Error,:Error,:Error,:Start,:Start,:Start,:Start]#20-27
119
+
120
+ EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
121
+
122
+ EUCJPSMModel = {'classTable' => EUCJP_cls,
123
+ 'classFactor' => 6,
124
+ 'stateTable' => EUCJP_st,
125
+ 'charLenTable' => EUCJPCharLenTable,
126
+ 'name' => 'EUC-JP'}
127
+
128
+ # EUC-KR
129
+
130
+ EUCKR_cls = [ \
131
+ 1,1,1,1,1,1,1,1, # 00 - 07
132
+ 1,1,1,1,1,1,0,0, # 08 - 0f
133
+ 1,1,1,1,1,1,1,1, # 10 - 17
134
+ 1,1,1,0,1,1,1,1, # 18 - 1f
135
+ 1,1,1,1,1,1,1,1, # 20 - 27
136
+ 1,1,1,1,1,1,1,1, # 28 - 2f
137
+ 1,1,1,1,1,1,1,1, # 30 - 37
138
+ 1,1,1,1,1,1,1,1, # 38 - 3f
139
+ 1,1,1,1,1,1,1,1, # 40 - 47
140
+ 1,1,1,1,1,1,1,1, # 48 - 4f
141
+ 1,1,1,1,1,1,1,1, # 50 - 57
142
+ 1,1,1,1,1,1,1,1, # 58 - 5f
143
+ 1,1,1,1,1,1,1,1, # 60 - 67
144
+ 1,1,1,1,1,1,1,1, # 68 - 6f
145
+ 1,1,1,1,1,1,1,1, # 70 - 77
146
+ 1,1,1,1,1,1,1,1, # 78 - 7f
147
+ 0,0,0,0,0,0,0,0, # 80 - 87
148
+ 0,0,0,0,0,0,0,0, # 88 - 8f
149
+ 0,0,0,0,0,0,0,0, # 90 - 97
150
+ 0,0,0,0,0,0,0,0, # 98 - 9f
151
+ 0,2,2,2,2,2,2,2, # a0 - a7
152
+ 2,2,2,2,2,3,3,3, # a8 - af
153
+ 2,2,2,2,2,2,2,2, # b0 - b7
154
+ 2,2,2,2,2,2,2,2, # b8 - bf
155
+ 2,2,2,2,2,2,2,2, # c0 - c7
156
+ 2,3,2,2,2,2,2,2, # c8 - cf
157
+ 2,2,2,2,2,2,2,2, # d0 - d7
158
+ 2,2,2,2,2,2,2,2, # d8 - df
159
+ 2,2,2,2,2,2,2,2, # e0 - e7
160
+ 2,2,2,2,2,2,2,2, # e8 - ef
161
+ 2,2,2,2,2,2,2,2, # f0 - f7
162
+ 2,2,2,2,2,2,2,0] # f8 - ff
163
+
164
+ EUCKR_st = [
165
+ :Error,:Start, 3,:Error,:Error,:Error,:Error,:Error,#00-07
166
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start,:Start]#08-0f
167
+
168
+ EUCKRCharLenTable = [0, 1, 2, 0]
169
+
170
+ EUCKRSMModel = {'classTable' => EUCKR_cls,
171
+ 'classFactor' => 4,
172
+ 'stateTable' => EUCKR_st,
173
+ 'charLenTable' => EUCKRCharLenTable,
174
+ 'name' => 'EUC-KR'}
175
+
176
+ # EUC-TW
177
+
178
+ EUCTW_cls = [ \
179
+ 2,2,2,2,2,2,2,2, # 00 - 07
180
+ 2,2,2,2,2,2,0,0, # 08 - 0f
181
+ 2,2,2,2,2,2,2,2, # 10 - 17
182
+ 2,2,2,0,2,2,2,2, # 18 - 1f
183
+ 2,2,2,2,2,2,2,2, # 20 - 27
184
+ 2,2,2,2,2,2,2,2, # 28 - 2f
185
+ 2,2,2,2,2,2,2,2, # 30 - 37
186
+ 2,2,2,2,2,2,2,2, # 38 - 3f
187
+ 2,2,2,2,2,2,2,2, # 40 - 47
188
+ 2,2,2,2,2,2,2,2, # 48 - 4f
189
+ 2,2,2,2,2,2,2,2, # 50 - 57
190
+ 2,2,2,2,2,2,2,2, # 58 - 5f
191
+ 2,2,2,2,2,2,2,2, # 60 - 67
192
+ 2,2,2,2,2,2,2,2, # 68 - 6f
193
+ 2,2,2,2,2,2,2,2, # 70 - 77
194
+ 2,2,2,2,2,2,2,2, # 78 - 7f
195
+ 0,0,0,0,0,0,0,0, # 80 - 87
196
+ 0,0,0,0,0,0,6,0, # 88 - 8f
197
+ 0,0,0,0,0,0,0,0, # 90 - 97
198
+ 0,0,0,0,0,0,0,0, # 98 - 9f
199
+ 0,3,4,4,4,4,4,4, # a0 - a7
200
+ 5,5,1,1,1,1,1,1, # a8 - af
201
+ 1,1,1,1,1,1,1,1, # b0 - b7
202
+ 1,1,1,1,1,1,1,1, # b8 - bf
203
+ 1,1,3,1,3,3,3,3, # c0 - c7
204
+ 3,3,3,3,3,3,3,3, # c8 - cf
205
+ 3,3,3,3,3,3,3,3, # d0 - d7
206
+ 3,3,3,3,3,3,3,3, # d8 - df
207
+ 3,3,3,3,3,3,3,3, # e0 - e7
208
+ 3,3,3,3,3,3,3,3, # e8 - ef
209
+ 3,3,3,3,3,3,3,3, # f0 - f7
210
+ 3,3,3,3,3,3,3,0] # f8 - ff
211
+
212
+ EUCTW_st = [ \
213
+ :Error,:Error,:Start, 3, 3, 3, 4,:Error,#00-07
214
+ :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe,#08-0f
215
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Start,:Error,#10-17
216
+ :Start,:Start,:Start,:Error,:Error,:Error,:Error,:Error,#18-1f
217
+ 5,:Error,:Error,:Error,:Start,:Error,:Start,:Start,#20-27
218
+ :Start,:Error,:Start,:Start,:Start,:Start,:Start,:Start]#28-2f
219
+
220
+ EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
221
+
222
+ EUCTWSMModel = {'classTable' => EUCTW_cls,
223
+ 'classFactor' => 7,
224
+ 'stateTable' => EUCTW_st,
225
+ 'charLenTable' => EUCTWCharLenTable,
226
+ 'name' => 'x-euc-tw'}
227
+
228
+ # GB2312
229
+
230
+ GB2312_cls = [ \
231
+ 1,1,1,1,1,1,1,1, # 00 - 07
232
+ 1,1,1,1,1,1,0,0, # 08 - 0f
233
+ 1,1,1,1,1,1,1,1, # 10 - 17
234
+ 1,1,1,0,1,1,1,1, # 18 - 1f
235
+ 1,1,1,1,1,1,1,1, # 20 - 27
236
+ 1,1,1,1,1,1,1,1, # 28 - 2f
237
+ 3,3,3,3,3,3,3,3, # 30 - 37
238
+ 3,3,1,1,1,1,1,1, # 38 - 3f
239
+ 2,2,2,2,2,2,2,2, # 40 - 47
240
+ 2,2,2,2,2,2,2,2, # 48 - 4f
241
+ 2,2,2,2,2,2,2,2, # 50 - 57
242
+ 2,2,2,2,2,2,2,2, # 58 - 5f
243
+ 2,2,2,2,2,2,2,2, # 60 - 67
244
+ 2,2,2,2,2,2,2,2, # 68 - 6f
245
+ 2,2,2,2,2,2,2,2, # 70 - 77
246
+ 2,2,2,2,2,2,2,4, # 78 - 7f
247
+ 5,6,6,6,6,6,6,6, # 80 - 87
248
+ 6,6,6,6,6,6,6,6, # 88 - 8f
249
+ 6,6,6,6,6,6,6,6, # 90 - 97
250
+ 6,6,6,6,6,6,6,6, # 98 - 9f
251
+ 6,6,6,6,6,6,6,6, # a0 - a7
252
+ 6,6,6,6,6,6,6,6, # a8 - af
253
+ 6,6,6,6,6,6,6,6, # b0 - b7
254
+ 6,6,6,6,6,6,6,6, # b8 - bf
255
+ 6,6,6,6,6,6,6,6, # c0 - c7
256
+ 6,6,6,6,6,6,6,6, # c8 - cf
257
+ 6,6,6,6,6,6,6,6, # d0 - d7
258
+ 6,6,6,6,6,6,6,6, # d8 - df
259
+ 6,6,6,6,6,6,6,6, # e0 - e7
260
+ 6,6,6,6,6,6,6,6, # e8 - ef
261
+ 6,6,6,6,6,6,6,6, # f0 - f7
262
+ 6,6,6,6,6,6,6,0] # f8 - ff
263
+
264
+ GB2312_st = [ \
265
+ :Error,:Start,:Start,:Start,:Start,:Start, 3,:Error,#00-07
266
+ :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe,#08-0f
267
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start,#10-17
268
+ 4,:Error,:Start,:Start,:Error,:Error,:Error,:Error,#18-1f
269
+ :Error,:Error, 5,:Error,:Error,:Error,:ItsMe,:Error,#20-27
270
+ :Error,:Error,:Start,:Start,:Start,:Start,:Start,:Start]#28-2f
271
+
272
+ # To be accurate, the length of class 6 can be either 2 or 4.
273
+ # But it is not necessary to discriminate between the two since
274
+ # it is used for frequency analysis only, and we are validating
275
+ # each code range there as well. So it is safe to set it to be
276
+ # 2 here.
277
+ GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
278
+
279
+ GB2312SMModel = {'classTable' => GB2312_cls,
280
+ 'classFactor' => 7,
281
+ 'stateTable' => GB2312_st,
282
+ 'charLenTable' => GB2312CharLenTable,
283
+ 'name' => 'GB2312'}
284
+
285
+ # Shift_JIS
286
+
287
+ SJIS_cls = [ \
288
+ 1,1,1,1,1,1,1,1, # 00 - 07
289
+ 1,1,1,1,1,1,0,0, # 08 - 0f
290
+ 1,1,1,1,1,1,1,1, # 10 - 17
291
+ 1,1,1,0,1,1,1,1, # 18 - 1f
292
+ 1,1,1,1,1,1,1,1, # 20 - 27
293
+ 1,1,1,1,1,1,1,1, # 28 - 2f
294
+ 1,1,1,1,1,1,1,1, # 30 - 37
295
+ 1,1,1,1,1,1,1,1, # 38 - 3f
296
+ 2,2,2,2,2,2,2,2, # 40 - 47
297
+ 2,2,2,2,2,2,2,2, # 48 - 4f
298
+ 2,2,2,2,2,2,2,2, # 50 - 57
299
+ 2,2,2,2,2,2,2,2, # 58 - 5f
300
+ 2,2,2,2,2,2,2,2, # 60 - 67
301
+ 2,2,2,2,2,2,2,2, # 68 - 6f
302
+ 2,2,2,2,2,2,2,2, # 70 - 77
303
+ 2,2,2,2,2,2,2,1, # 78 - 7f
304
+ 3,3,3,3,3,3,3,3, # 80 - 87
305
+ 3,3,3,3,3,3,3,3, # 88 - 8f
306
+ 3,3,3,3,3,3,3,3, # 90 - 97
307
+ 3,3,3,3,3,3,3,3, # 98 - 9f
308
+ #0xa0 is illegal in sjis encoding, but some pages do
309
+ #contain such a byte. We need to be more forgiving of errors.
310
+ 2,2,2,2,2,2,2,2, # a0 - a7
311
+ 2,2,2,2,2,2,2,2, # a8 - af
312
+ 2,2,2,2,2,2,2,2, # b0 - b7
313
+ 2,2,2,2,2,2,2,2, # b8 - bf
314
+ 2,2,2,2,2,2,2,2, # c0 - c7
315
+ 2,2,2,2,2,2,2,2, # c8 - cf
316
+ 2,2,2,2,2,2,2,2, # d0 - d7
317
+ 2,2,2,2,2,2,2,2, # d8 - df
318
+ 3,3,3,3,3,3,3,3, # e0 - e7
319
+ 3,3,3,3,3,4,4,4, # e8 - ef
320
+ 4,4,4,4,4,4,4,4, # f0 - f7
321
+ 4,4,4,4,4,0,0,0] # f8 - ff
322
+
323
+ SJIS_st = [ \
324
+ :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error,#00-07
325
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
326
+ :ItsMe,:ItsMe,:Error,:Error,:Start,:Start,:Start,:Start]#10-17
327
+
328
+ SJISCharLenTable = [0, 1, 1, 2, 0, 0]
329
+
330
+ SJISSMModel = {'classTable' => SJIS_cls,
331
+ 'classFactor' => 6,
332
+ 'stateTable' => SJIS_st,
333
+ 'charLenTable' => SJISCharLenTable,
334
+ 'name' => 'Shift_JIS'}
335
+
336
+ # UCS2-BE
337
+
338
+ UCS2BE_cls = [ \
339
+ 0,0,0,0,0,0,0,0, # 00 - 07
340
+ 0,0,1,0,0,2,0,0, # 08 - 0f
341
+ 0,0,0,0,0,0,0,0, # 10 - 17
342
+ 0,0,0,3,0,0,0,0, # 18 - 1f
343
+ 0,0,0,0,0,0,0,0, # 20 - 27
344
+ 0,3,3,3,3,3,0,0, # 28 - 2f
345
+ 0,0,0,0,0,0,0,0, # 30 - 37
346
+ 0,0,0,0,0,0,0,0, # 38 - 3f
347
+ 0,0,0,0,0,0,0,0, # 40 - 47
348
+ 0,0,0,0,0,0,0,0, # 48 - 4f
349
+ 0,0,0,0,0,0,0,0, # 50 - 57
350
+ 0,0,0,0,0,0,0,0, # 58 - 5f
351
+ 0,0,0,0,0,0,0,0, # 60 - 67
352
+ 0,0,0,0,0,0,0,0, # 68 - 6f
353
+ 0,0,0,0,0,0,0,0, # 70 - 77
354
+ 0,0,0,0,0,0,0,0, # 78 - 7f
355
+ 0,0,0,0,0,0,0,0, # 80 - 87
356
+ 0,0,0,0,0,0,0,0, # 88 - 8f
357
+ 0,0,0,0,0,0,0,0, # 90 - 97
358
+ 0,0,0,0,0,0,0,0, # 98 - 9f
359
+ 0,0,0,0,0,0,0,0, # a0 - a7
360
+ 0,0,0,0,0,0,0,0, # a8 - af
361
+ 0,0,0,0,0,0,0,0, # b0 - b7
362
+ 0,0,0,0,0,0,0,0, # b8 - bf
363
+ 0,0,0,0,0,0,0,0, # c0 - c7
364
+ 0,0,0,0,0,0,0,0, # c8 - cf
365
+ 0,0,0,0,0,0,0,0, # d0 - d7
366
+ 0,0,0,0,0,0,0,0, # d8 - df
367
+ 0,0,0,0,0,0,0,0, # e0 - e7
368
+ 0,0,0,0,0,0,0,0, # e8 - ef
369
+ 0,0,0,0,0,0,0,0, # f0 - f7
370
+ 0,0,0,0,0,0,4,5] # f8 - ff
371
+
372
+ UCS2BE_st = [ \
373
+ 5, 7, 7,:Error, 4, 3,:Error,:Error,#00-07
374
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
375
+ :ItsMe,:ItsMe, 6, 6, 6, 6,:Error,:Error,#10-17
376
+ 6, 6, 6, 6, 6,:ItsMe, 6, 6,#18-1f
377
+ 6, 6, 6, 6, 5, 7, 7,:Error,#20-27
378
+ 5, 8, 6, 6,:Error, 6, 6, 6,#28-2f
379
+ 6, 6, 6, 6,:Error,:Error,:Start,:Start]#30-37
380
+
381
+ UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
382
+
383
+ UCS2BESMModel = {'classTable' => UCS2BE_cls,
384
+ 'classFactor' => 6,
385
+ 'stateTable' => UCS2BE_st,
386
+ 'charLenTable' => UCS2BECharLenTable,
387
+ 'name' => 'UTF-16BE'}
388
+
389
+ # UCS2-LE
390
+
391
+ UCS2LE_cls = [ \
392
+ 0,0,0,0,0,0,0,0, # 00 - 07
393
+ 0,0,1,0,0,2,0,0, # 08 - 0f
394
+ 0,0,0,0,0,0,0,0, # 10 - 17
395
+ 0,0,0,3,0,0,0,0, # 18 - 1f
396
+ 0,0,0,0,0,0,0,0, # 20 - 27
397
+ 0,3,3,3,3,3,0,0, # 28 - 2f
398
+ 0,0,0,0,0,0,0,0, # 30 - 37
399
+ 0,0,0,0,0,0,0,0, # 38 - 3f
400
+ 0,0,0,0,0,0,0,0, # 40 - 47
401
+ 0,0,0,0,0,0,0,0, # 48 - 4f
402
+ 0,0,0,0,0,0,0,0, # 50 - 57
403
+ 0,0,0,0,0,0,0,0, # 58 - 5f
404
+ 0,0,0,0,0,0,0,0, # 60 - 67
405
+ 0,0,0,0,0,0,0,0, # 68 - 6f
406
+ 0,0,0,0,0,0,0,0, # 70 - 77
407
+ 0,0,0,0,0,0,0,0, # 78 - 7f
408
+ 0,0,0,0,0,0,0,0, # 80 - 87
409
+ 0,0,0,0,0,0,0,0, # 88 - 8f
410
+ 0,0,0,0,0,0,0,0, # 90 - 97
411
+ 0,0,0,0,0,0,0,0, # 98 - 9f
412
+ 0,0,0,0,0,0,0,0, # a0 - a7
413
+ 0,0,0,0,0,0,0,0, # a8 - af
414
+ 0,0,0,0,0,0,0,0, # b0 - b7
415
+ 0,0,0,0,0,0,0,0, # b8 - bf
416
+ 0,0,0,0,0,0,0,0, # c0 - c7
417
+ 0,0,0,0,0,0,0,0, # c8 - cf
418
+ 0,0,0,0,0,0,0,0, # d0 - d7
419
+ 0,0,0,0,0,0,0,0, # d8 - df
420
+ 0,0,0,0,0,0,0,0, # e0 - e7
421
+ 0,0,0,0,0,0,0,0, # e8 - ef
422
+ 0,0,0,0,0,0,0,0, # f0 - f7
423
+ 0,0,0,0,0,0,4,5] # f8 - ff
424
+
425
+ UCS2LE_st = [ \
426
+ 6, 6, 7, 6, 4, 3,:Error,:Error,#00-07
427
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
428
+ :ItsMe,:ItsMe, 5, 5, 5,:Error,:ItsMe,:Error,#10-17
429
+ 5, 5, 5,:Error, 5,:Error, 6, 6,#18-1f
430
+ 7, 6, 8, 8, 5, 5, 5,:Error,#20-27
431
+ 5, 5, 5,:Error,:Error,:Error, 5, 5,#28-2f
432
+ 5, 5, 5,:Error, 5,:Error,:Start,:Start]#30-37
433
+
434
+ UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
435
+
436
+ UCS2LESMModel = {'classTable' => UCS2LE_cls,
437
+ 'classFactor' => 6,
438
+ 'stateTable' => UCS2LE_st,
439
+ 'charLenTable' => UCS2LECharLenTable,
440
+ 'name' => 'UTF-16LE'}
441
+
442
+ # UTF-8
443
+
444
+ UTF8_cls = [ \
445
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
446
+ 1,1,1,1,1,1,0,0, # 08 - 0f
447
+ 1,1,1,1,1,1,1,1, # 10 - 17
448
+ 1,1,1,0,1,1,1,1, # 18 - 1f
449
+ 1,1,1,1,1,1,1,1, # 20 - 27
450
+ 1,1,1,1,1,1,1,1, # 28 - 2f
451
+ 1,1,1,1,1,1,1,1, # 30 - 37
452
+ 1,1,1,1,1,1,1,1, # 38 - 3f
453
+ 1,1,1,1,1,1,1,1, # 40 - 47
454
+ 1,1,1,1,1,1,1,1, # 48 - 4f
455
+ 1,1,1,1,1,1,1,1, # 50 - 57
456
+ 1,1,1,1,1,1,1,1, # 58 - 5f
457
+ 1,1,1,1,1,1,1,1, # 60 - 67
458
+ 1,1,1,1,1,1,1,1, # 68 - 6f
459
+ 1,1,1,1,1,1,1,1, # 70 - 77
460
+ 1,1,1,1,1,1,1,1, # 78 - 7f
461
+ 2,2,2,2,3,3,3,3, # 80 - 87
462
+ 4,4,4,4,4,4,4,4, # 88 - 8f
463
+ 4,4,4,4,4,4,4,4, # 90 - 97
464
+ 4,4,4,4,4,4,4,4, # 98 - 9f
465
+ 5,5,5,5,5,5,5,5, # a0 - a7
466
+ 5,5,5,5,5,5,5,5, # a8 - af
467
+ 5,5,5,5,5,5,5,5, # b0 - b7
468
+ 5,5,5,5,5,5,5,5, # b8 - bf
469
+ 0,0,6,6,6,6,6,6, # c0 - c7
470
+ 6,6,6,6,6,6,6,6, # c8 - cf
471
+ 6,6,6,6,6,6,6,6, # d0 - d7
472
+ 6,6,6,6,6,6,6,6, # d8 - df
473
+ 7,8,8,8,8,8,8,8, # e0 - e7
474
+ 8,8,8,8,8,9,8,8, # e8 - ef
475
+ 10,11,11,11,11,11,11,11, # f0 - f7
476
+ 12,13,13,13,14,15,0,0] # f8 - ff
477
+
478
+ UTF8_st = [ \
479
+ :Error,:Start,:Error,:Error,:Error,:Error, 12, 10,#00-07
480
+ 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
481
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#10-17
482
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#18-1f
483
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#20-27
484
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#28-2f
485
+ :Error,:Error, 5, 5, 5, 5,:Error,:Error,#30-37
486
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#38-3f
487
+ :Error,:Error,:Error, 5, 5, 5,:Error,:Error,#40-47
488
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#48-4f
489
+ :Error,:Error, 7, 7, 7, 7,:Error,:Error,#50-57
490
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#58-5f
491
+ :Error,:Error,:Error,:Error, 7, 7,:Error,:Error,#60-67
492
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#68-6f
493
+ :Error,:Error, 9, 9, 9, 9,:Error,:Error,#70-77
494
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#78-7f
495
+ :Error,:Error,:Error,:Error,:Error, 9,:Error,:Error,#80-87
496
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#88-8f
497
+ :Error,:Error, 12, 12, 12, 12,:Error,:Error,#90-97
498
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#98-9f
499
+ :Error,:Error,:Error,:Error,:Error, 12,:Error,:Error,#a0-a7
500
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#a8-af
501
+ :Error,:Error, 12, 12, 12,:Error,:Error,:Error,#b0-b7
502
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#b8-bf
503
+ :Error,:Error,:Start,:Start,:Start,:Start,:Error,:Error,#c0-c7
504
+ :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error]#c8-cf
505
+
506
+ UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
507
+
508
+ UTF8SMModel = {'classTable' => UTF8_cls,
509
+ 'classFactor' => 16,
510
+ 'stateTable' => UTF8_st,
511
+ 'charLenTable' => UTF8CharLenTable,
512
+ 'name' => 'UTF-8'}
513
+ end
@@ -0,0 +1,94 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
+ module UniversalDetector
33
+
34
+ class MultiByteCharSetProber < CharSetProber
35
+ def initialize
36
+ super
37
+ @_mDistributionAnalyzer = nil
38
+ @_mCodingSM = nil
39
+ @_mLastChar = ['\x00', '\x00']
40
+ end
41
+
42
+ def reset
43
+ super
44
+ if @_mCodingSM
45
+ @_mCodingSM.reset()
46
+ end
47
+ if @_mDistributionAnalyzer
48
+ @_mDistributionAnalyzer.reset()
49
+ end
50
+ @_mLastChar = ['\x00', '\x00']
51
+ end
52
+
53
+ def get_charset_name
54
+ end
55
+
56
+ def feed(aBuf)
57
+ aLen = aBuf.length
58
+ for i in 0...aLen
59
+ codingState = @_mCodingSM.next_state(aBuf[i])
60
+ if codingState == :Error
61
+ if UniversalDetector::DEBUG
62
+ p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
63
+ end
64
+ @_mState = :NotMe
65
+ break
66
+ elsif codingState == :ItsMe
67
+ @_mState = :FoundIt
68
+ break
69
+ elsif codingState == :Start
70
+ charLen = @_mCodingSM.get_current_charlen()
71
+ if i == 0
72
+ @_mLastChar[1] = aBuf[0]
73
+ @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
74
+ else
75
+ @_mDistributionAnalyzer.feed(aBuf[(i-1)..(i+1)], charLen)
76
+ end
77
+ end
78
+ end
79
+
80
+ @_mLastChar[0] = aBuf[aLen - 1]
81
+ if get_state() == :Detecting
82
+ if @_mDistributionAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
83
+ @_mState = :FoundIt
84
+ end
85
+ end
86
+
87
+ return get_state()
88
+ end
89
+
90
+ def get_confidence
91
+ return @_mDistributionAnalyzer.get_confidence()
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,71 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'SingleByteCharSetProber'
32
+ require 'LangCyrillicModel'
33
+ require 'LangGreekModel'
34
+ require 'LangHebrewModel'
35
+ require 'LangHungarianModel'
36
+ require 'LangBulgarianModel'
37
+ require 'LangThaiModel'
38
+ require 'HebrewProber'
39
+
40
+ module UniversalDetector
41
+ class SBCSGroupProber < CharSetGroupProber
42
+
43
+ attr_reader :mProbers
44
+
45
+ def initialize
46
+ super
47
+ @mProbers = [ \
48
+ SingleByteCharSetProber.new(Win1251CyrillicModel),
49
+ SingleByteCharSetProber.new(Koi8rModel),
50
+ SingleByteCharSetProber.new(Latin5CyrillicModel),
51
+ SingleByteCharSetProber.new(MacCyrillicModel),
52
+ SingleByteCharSetProber.new(Ibm866Model),
53
+ SingleByteCharSetProber.new(Ibm855Model),
54
+ SingleByteCharSetProber.new(Latin7GreekModel),
55
+ SingleByteCharSetProber.new(Win1253GreekModel),
56
+ SingleByteCharSetProber.new(Latin5BulgarianModel),
57
+ SingleByteCharSetProber.new(Win1251BulgarianModel),
58
+ SingleByteCharSetProber.new(Latin2HungarianModel),
59
+ SingleByteCharSetProber.new(Win1250HungarianModel),
60
+ SingleByteCharSetProber.new(TIS620ThaiModel),
61
+ ]
62
+ hebrewProber = HebrewProber.new
63
+ logicalHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrewProber)
64
+ visualHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrewProber)
65
+ hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
66
+ @mProbers = @mProbers + [hebrewProber, logicalHebrewProber, visualHebrewProber]
67
+
68
+ reset()
69
+ end
70
+ end
71
+ end