chardet2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/ESCSM.rb ADDED
@@ -0,0 +1,242 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module UniversalDetector
  # State-machine models for the escape-sequence based encodings
  # HZ-GB-2312, ISO-2022-CN, ISO-2022-JP and ISO-2022-KR.
  #
  # Every model is a Hash with these keys:
  #   'classTable'   - 256 entries, one per byte value (0x00..0xff), giving
  #                    the class number assigned to that byte
  #   'classFactor'  - Integer paired with the class numbers
  #   'stateTable'   - flat table whose entries are :Start, :Error, :ItsMe
  #                    or an Integer state number
  #   'charLenTable' - one entry per class number (classFactor entries)
  #   'name'         - charset name of the model
  #
  # NOTE(review): the consumer of these models is not part of this file.
  # Presumably stateTable is addressed as state * classFactor + class and
  # charLenTable is addressed by class -- confirm against CodingStateMachine.

  # Byte classes for HZ-GB-2312.
  HZ_cls = [
    1,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,0,0,0,0,  # 20 - 27
    0,0,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,0,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,4,0,5,2,0,  # 78 - 7f
    1,1,1,1,1,1,1,1,  # 80 - 87
    1,1,1,1,1,1,1,1,  # 88 - 8f
    1,1,1,1,1,1,1,1,  # 90 - 97
    1,1,1,1,1,1,1,1,  # 98 - 9f
    1,1,1,1,1,1,1,1,  # a0 - a7
    1,1,1,1,1,1,1,1,  # a8 - af
    1,1,1,1,1,1,1,1,  # b0 - b7
    1,1,1,1,1,1,1,1,  # b8 - bf
    1,1,1,1,1,1,1,1,  # c0 - c7
    1,1,1,1,1,1,1,1,  # c8 - cf
    1,1,1,1,1,1,1,1,  # d0 - d7
    1,1,1,1,1,1,1,1,  # d8 - df
    1,1,1,1,1,1,1,1,  # e0 - e7
    1,1,1,1,1,1,1,1,  # e8 - ef
    1,1,1,1,1,1,1,1,  # f0 - f7
    1,1,1,1,1,1,1,1,  # f8 - ff
  ]

  # State transitions for HZ-GB-2312 (trailing comments are flat indices).
  HZ_st = [
    :Start,:Error,     3,:Start,:Start,:Start,:Error,:Error,  # 00-07
    :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,  # 08-0f
    :ItsMe,:ItsMe,:Error,:Error,:Start,:Start,     4,:Error,  # 10-17
         5,:Error,     6,:Error,     5,     5,     4,:Error,  # 18-1f
         4,:Error,     4,     4,     4,:Error,     4,:Error,  # 20-27
         4,:ItsMe,:Start,:Start,:Start,:Start,:Start,:Start,  # 28-2f
  ]

  HZCharLenTable = [0, 0, 0, 0, 0, 0]

  HZSMModel = {'classTable'   => HZ_cls,
               'classFactor'  => 6,
               'stateTable'   => HZ_st,
               'charLenTable' => HZCharLenTable,
               'name'         => "HZ-GB-2312"}

  # Byte classes for ISO-2022-CN.
  ISO2022CN_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,0,0,0,0,  # 20 - 27
    0,3,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,4,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ]

  # State transitions for ISO-2022-CN (trailing comments are flat indices).
  ISO2022CN_st = [
    :Start,     3,:Error,:Start,:Start,:Start,:Start,:Start,  # 00-07
    :Start,:Error,:Error,:Error,:Error,:Error,:Error,:Error,  # 08-0f
    :Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,  # 10-17
    :ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Error,     4,:Error,  # 18-1f
    :Error,:Error,:Error,:ItsMe,:Error,:Error,:Error,:Error,  # 20-27
         5,     6,:Error,:Error,:Error,:Error,:Error,:Error,  # 28-2f
    :Error,:Error,:Error,:ItsMe,:Error,:Error,:Error,:Error,  # 30-37
    :Error,:Error,:Error,:Error,:Error,:ItsMe,:Error,:Start,  # 38-3f
  ]

  ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]

  ISO2022CNSMModel = {'classTable'   => ISO2022CN_cls,
                      'classFactor'  => 9,
                      'stateTable'   => ISO2022CN_st,
                      'charLenTable' => ISO2022CNCharLenTable,
                      'name'         => "ISO-2022-CN"}

  # Byte classes for ISO-2022-JP. Class numbers run from 0 up to 9
  # (byte 0x49 maps to class 9).
  ISO2022JP_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,2,2,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,7,0,0,0,  # 20 - 27
    3,0,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    6,0,4,0,8,0,0,0,  # 40 - 47
    0,9,5,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ]

  # State transitions for ISO-2022-JP (trailing comments are flat indices).
  ISO2022JP_st = [
    :Start,     3,:Error,:Start,:Start,:Start,:Start,:Start,  # 00-07
    :Start,:Start,:Error,:Error,:Error,:Error,:Error,:Error,  # 08-0f
    :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,  # 10-17
    :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,  # 18-1f
    :Error,     5,:Error,:Error,:Error,     4,:Error,:Error,  # 20-27
    :Error,:Error,:Error,     6,:ItsMe,:Error,:ItsMe,:Error,  # 28-2f
    :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe,  # 30-37
    :Error,:Error,:Error,:ItsMe,:Error,:Error,:Error,:Error,  # 38-3f
    :Error,:Error,:Error,:Error,:ItsMe,:Error,:Start,:Start,  # 40-47
  ]

  # Ten entries, matching classFactor, so that the class numbers 8 and 9
  # produced by ISO2022JP_cls have an entry instead of reading past the end
  # of the table (which yields nil in Ruby).
  ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

  ISO2022JPSMModel = {'classTable'   => ISO2022JP_cls,
                      'classFactor'  => 10,
                      'stateTable'   => ISO2022JP_st,
                      'charLenTable' => ISO2022JPCharLenTable,
                      'name'         => "ISO-2022-JP"}

  # Byte classes for ISO-2022-KR.
  ISO2022KR_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,3,0,0,0,  # 20 - 27
    0,4,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,5,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ]

  # State transitions for ISO-2022-KR (trailing comments are flat indices).
  ISO2022KR_st = [
    :Start,     3,:Error,:Start,:Start,:Start,:Error,:Error,  # 00-07
    :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,  # 08-0f
    :ItsMe,:ItsMe,:Error,:Error,:Error,     4,:Error,:Error,  # 10-17
    :Error,:Error,:Error,:Error,     5,:Error,:Error,:Error,  # 18-1f
    :Error,:Error,:Error,:ItsMe,:Start,:Start,:Start,:Start,  # 20-27
  ]

  ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]

  ISO2022KRSMModel = {'classTable'   => ISO2022KR_cls,
                      'classFactor'  => 6,
                      'stateTable'   => ISO2022KR_st,
                      'charLenTable' => ISO2022KRCharLenTable,
                      'name'         => "ISO-2022-KR"}

end
@@ -0,0 +1,97 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'CharDistributionAnalysis'
33
+ require 'MBCSSM'
34
+ require 'JapaneseContextAnalysis'
35
+
36
module UniversalDetector
  # Prober that scores a byte stream as EUC-JP. It drives a coding state
  # machine built from EUCJPSMModel and feeds both a character-distribution
  # analyzer and a context analyzer; the reported confidence is the larger
  # of the two analyzers' confidences.
  class EUCJPProber < MultiByteCharSetProber
    # Builds the state machine and both analyzers, then resets the prober.
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
      @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new
      @_mContextAnalyzer = EUCJPContextAnalysis.new
      reset()
    end

    # Resets the inherited prober state and the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset()
    end

    # Returns the name of the charset this prober detects.
    def get_charset_name
      return "EUC-JP"
    end

    # Consumes one chunk of input and returns the prober state afterwards.
    #
    # aBuf - the bytes to examine; must respond to #length and #[].
    #        NOTE(review): aBuf[i] is handed straight to the state machine.
    #        On Ruby >= 1.9 String#[] returns a one-character String rather
    #        than an Integer -- confirm what callers pass in.
    #
    # Returns the value of get_state() (e.g. :NotMe, :FoundIt, :Detecting).
    def feed(aBuf)
      aLen = aBuf.length
      for i in 0...aLen
        codingState = @_mCodingSM.next_state(aBuf[i])
        case codingState
        when :Error
          if DEBUG
            # puts instead of p: the message is plain text, and a
            # single-quoted '\n' would be a literal backslash followed by n.
            puts(get_charset_name() + ' prober hit error at byte ' + i.to_s)
          end
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          # A complete character has just been recognised.
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # The character began in the previous chunk: pair the byte
            # saved from that chunk with the first byte of this one.
            @_mLastChar[1] = aBuf[0]
            @_mContextAnalyzer.feed(@_mLastChar, charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Two-byte window (previous byte, current byte), the same width
            # as @_mLastChar. An inclusive range i-1..i+1 would also pull in
            # the not-yet-processed following byte.
            window = aBuf[i - 1, 2]
            @_mContextAnalyzer.feed(window, charLen)
            @_mDistributionAnalyzer.feed(window, charLen)
          end
        end
      end

      # Remember the final byte for the next chunk. Skipped for an empty
      # chunk, where aBuf[-1] would be nil and clobber the saved byte.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state() == :Detecting
        if @_mContextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = :FoundIt
        end
      end
      return get_state()
    end

    # Returns the higher of the context and distribution confidences.
    def get_confidence
      contxtCf = @_mContextAnalyzer.get_confidence()
      distribCf = @_mDistributionAnalyzer.get_confidence()
      return [contxtCf, distribCf].max
    end
  end
end