rchardet 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Table-driven state machine that is fed one byte at a time and reports the
  # state reached after each byte.
  #
  # The model handed to the constructor is a Hash; this class reads the keys:
  #   'classTable'   - Array indexed by byte value, giving the byte's class
  #   'classFactor'  - row width used to index the flattened state table
  #   'stateTable'   - Array indexed by (state * classFactor + byte class)
  #   'charLenTable' - Array indexed by byte class, giving a character length
  #   'name'         - value returned by #get_coding_state_machine
  class CodingStateMachine
    # Free-form flag for callers that drive several machines at once and want
    # to mark individual machines as still in play or ruled out. The machine
    # itself never consults it.
    attr_accessor :active

    # sm:: the model Hash described above.
    def initialize(sm)
      @_mModel = sm
      @_mCurrentBytePos = 0
      @_mCurrentCharLen = 0
      @active = true
      reset
    end

    # Puts the machine back into the start state.
    def reset
      @_mCurrentState = EStart
    end

    # Advances the machine by one byte and returns the new state.
    #
    # c:: either an Integer byte value or a String whose first byte is used.
    #     String#[] with an integer index returns an Integer only on Ruby 1.8;
    #     on later Rubies it returns a String, which cannot index the class
    #     table, so the byte is extracted explicitly.
    def next_state(c)
      byte = c.is_a?(Integer) ? c : c.getbyte(0)

      # Every byte has a class; when a new character starts we also record
      # the character length associated with that class.
      byte_class = @_mModel['classTable'][byte]
      if @_mCurrentState == EStart
        @_mCurrentBytePos = 0
        @_mCurrentCharLen = @_mModel['charLenTable'][byte_class]
      end

      # The next state is looked up from the current state and the byte class.
      @_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byte_class]
      @_mCurrentBytePos += 1
      @_mCurrentState
    end

    # Returns the character length recorded when the current character began.
    def get_current_charlen
      @_mCurrentCharLen
    end

    # Returns the 'name' entry of the model this machine was built from.
    def get_coding_state_machine
      @_mModel['name']
    end
  end
end
@@ -0,0 +1,42 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Global switch; probers in this package write diagnostics to $stderr only
  # when it is truthy.
  $debug = false

  # Values a prober stores as its own state.
  EDetecting, EFoundIt, ENotMe = 0, 1, 2

  # Values a coding state machine reports after consuming a byte.
  EStart, EError, EItsMe = 0, 1, 2

  # Confidence above which a prober may declare a match before the input ends.
  SHORTCUT_THRESHOLD = 0.95
end
@@ -0,0 +1,89 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober that runs four coding state machines side by side (built from
  # HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel and ISO2022KRSMModel) and
  # reports the name of the first machine that reaches EItsMe.
  #
  # @_mState and #get_state are not defined here; they are presumably
  # provided by CharSetProber -- confirm against that class.
  class EscCharSetProber < CharSetProber
    def initialize
      super()
      @_mCodingSM = [
        CodingStateMachine.new(HZSMModel),
        CodingStateMachine.new(ISO2022CNSMModel),
        CodingStateMachine.new(ISO2022JPSMModel),
        CodingStateMachine.new(ISO2022KRSMModel)
      ]
      reset
    end

    # Resets every state machine, marks them all as still in play and forgets
    # any previously detected charset.
    def reset
      super()
      # Activity is tracked here rather than on the machines so the prober
      # does not depend on the machines exposing an `active` attribute.
      @_mActiveFlags = @_mCodingSM.map do |codingSM|
        next false unless codingSM
        codingSM.reset
        true
      end
      @_mActiveSM = @_mCodingSM.length
      @_mDetectedCharset = nil
    end

    # Name of the detected charset, or nil while nothing has been detected.
    def get_charset_name
      @_mDetectedCharset
    end

    # 0.99 once a charset has been detected, 0.00 otherwise.
    def get_confidence
      @_mDetectedCharset ? 0.99 : 0.00
    end

    # Feeds aBuf to every machine that is still in play, one byte at a time,
    # and returns the prober state.
    #
    # A machine that reports EError is dropped; when none is left the state
    # becomes ENotMe. The first machine to report EItsMe sets the state to
    # EFoundIt and supplies the detected charset name.
    def feed(aBuf)
      # Iterate bytes (not characters) so the result does not depend on the
      # encoding tag of aBuf; each byte is handed on as a one-byte String.
      aBuf.each_byte do |byte|
        ch = byte.chr
        @_mCodingSM.each_with_index do |codingSM, idx|
          next unless codingSM && @_mActiveFlags[idx]

          case codingSM.next_state(ch)
          when EError
            @_mActiveFlags[idx] = false
            @_mActiveSM -= 1
            if @_mActiveSM <= 0
              @_mState = ENotMe
              return get_state
            end
          when EItsMe
            @_mState = EFoundIt
            @_mDetectedCharset = codingSM.get_coding_state_machine
            return get_state
          end
        end
      end

      get_state
    end
  end
end
@@ -0,0 +1,244 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Mark Pilgrim - port to Python
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2.1 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25
+ # 02110-1301 USA
26
+ ######################### END LICENSE BLOCK #########################
27
+
28
module CharDet
  # State machine models for four escape-sequence based encodings.
  #
  # Each model is a Hash with:
  #   'classTable'   - 256 entries, byte value => byte class
  #   'classFactor'  - number of columns per state in 'stateTable'
  #   'stateTable'   - flattened table indexed by state * classFactor + class
  #   'charLenTable' - one entry per byte class
  #   'name'         - charset name reported for the model
  #
  # All tables and model hashes are frozen: they are shared lookup data.

  HZ_cls = [
    1,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,0,0,0,0,  # 20 - 27
    0,0,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,0,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,4,0,5,2,0,  # 78 - 7f
    1,1,1,1,1,1,1,1,  # 80 - 87
    1,1,1,1,1,1,1,1,  # 88 - 8f
    1,1,1,1,1,1,1,1,  # 90 - 97
    1,1,1,1,1,1,1,1,  # 98 - 9f
    1,1,1,1,1,1,1,1,  # a0 - a7
    1,1,1,1,1,1,1,1,  # a8 - af
    1,1,1,1,1,1,1,1,  # b0 - b7
    1,1,1,1,1,1,1,1,  # b8 - bf
    1,1,1,1,1,1,1,1,  # c0 - c7
    1,1,1,1,1,1,1,1,  # c8 - cf
    1,1,1,1,1,1,1,1,  # d0 - d7
    1,1,1,1,1,1,1,1,  # d8 - df
    1,1,1,1,1,1,1,1,  # e0 - e7
    1,1,1,1,1,1,1,1,  # e8 - ef
    1,1,1,1,1,1,1,1,  # f0 - f7
    1,1,1,1,1,1,1,1,  # f8 - ff
  ].freeze

  HZ_st = [
    EStart,EError,     3,EStart,EStart,EStart,EError,EError,# 00-07
    EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
    EItsMe,EItsMe,EError,EError,EStart,EStart,     4,EError,# 10-17
         5,EError,     6,EError,     5,     5,     4,EError,# 18-1f
         4,EError,     4,     4,     4,EError,     4,EError,# 20-27
         4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
  ].freeze

  HZCharLenTable = [0, 0, 0, 0, 0, 0].freeze

  HZSMModel = {
    'classTable'   => HZ_cls,
    'classFactor'  => 6,
    'stateTable'   => HZ_st,
    'charLenTable' => HZCharLenTable,
    'name'         => "HZ-GB-2312"
  }.freeze

  ISO2022CN_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,0,0,0,0,  # 20 - 27
    0,3,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,4,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ].freeze

  ISO2022CN_st = [
    EStart,     3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
    EStart,EError,EError,EError,EError,EError,EError,EError,# 08-0f
    EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,# 10-17
    EItsMe,EItsMe,EItsMe,EError,EError,EError,     4,EError,# 18-1f
    EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
         5,     6,EError,EError,EError,EError,EError,EError,# 28-2f
    EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
    EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
  ].freeze

  ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0].freeze

  ISO2022CNSMModel = {
    'classTable'   => ISO2022CN_cls,
    'classFactor'  => 9,
    'stateTable'   => ISO2022CN_st,
    'charLenTable' => ISO2022CNCharLenTable,
    'name'         => "ISO-2022-CN"
  }.freeze

  ISO2022JP_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,2,2,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,7,0,0,0,  # 20 - 27
    3,0,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    6,0,4,0,8,0,0,0,  # 40 - 47
    0,9,5,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ].freeze

  ISO2022JP_st = [
    EStart,     3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
    EStart,EStart,EError,EError,EError,EError,EError,EError,# 08-0f
    EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 10-17
    EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,# 18-1f
    EError,     5,EError,EError,EError,     4,EError,EError,# 20-27
    EError,EError,EError,     6,EItsMe,EError,EItsMe,EError,# 28-2f
    EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
    EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
    EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
  ].freeze

  # One entry per byte class. The class table above uses classes 0..9 (8 is
  # assigned to byte 0x44 and 9 to byte 0x49), so ten entries are required;
  # a shorter table makes the lookup for classes 8 and 9 return nil.
  ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0].freeze

  ISO2022JPSMModel = {
    'classTable'   => ISO2022JP_cls,
    'classFactor'  => 10,
    'stateTable'   => ISO2022JP_st,
    'charLenTable' => ISO2022JPCharLenTable,
    'name'         => "ISO-2022-JP"
  }.freeze

  ISO2022KR_cls = [
    2,0,0,0,0,0,0,0,  # 00 - 07
    0,0,0,0,0,0,0,0,  # 08 - 0f
    0,0,0,0,0,0,0,0,  # 10 - 17
    0,0,0,1,0,0,0,0,  # 18 - 1f
    0,0,0,0,3,0,0,0,  # 20 - 27
    0,4,0,0,0,0,0,0,  # 28 - 2f
    0,0,0,0,0,0,0,0,  # 30 - 37
    0,0,0,0,0,0,0,0,  # 38 - 3f
    0,0,0,5,0,0,0,0,  # 40 - 47
    0,0,0,0,0,0,0,0,  # 48 - 4f
    0,0,0,0,0,0,0,0,  # 50 - 57
    0,0,0,0,0,0,0,0,  # 58 - 5f
    0,0,0,0,0,0,0,0,  # 60 - 67
    0,0,0,0,0,0,0,0,  # 68 - 6f
    0,0,0,0,0,0,0,0,  # 70 - 77
    0,0,0,0,0,0,0,0,  # 78 - 7f
    2,2,2,2,2,2,2,2,  # 80 - 87
    2,2,2,2,2,2,2,2,  # 88 - 8f
    2,2,2,2,2,2,2,2,  # 90 - 97
    2,2,2,2,2,2,2,2,  # 98 - 9f
    2,2,2,2,2,2,2,2,  # a0 - a7
    2,2,2,2,2,2,2,2,  # a8 - af
    2,2,2,2,2,2,2,2,  # b0 - b7
    2,2,2,2,2,2,2,2,  # b8 - bf
    2,2,2,2,2,2,2,2,  # c0 - c7
    2,2,2,2,2,2,2,2,  # c8 - cf
    2,2,2,2,2,2,2,2,  # d0 - d7
    2,2,2,2,2,2,2,2,  # d8 - df
    2,2,2,2,2,2,2,2,  # e0 - e7
    2,2,2,2,2,2,2,2,  # e8 - ef
    2,2,2,2,2,2,2,2,  # f0 - f7
    2,2,2,2,2,2,2,2,  # f8 - ff
  ].freeze

  ISO2022KR_st = [
    EStart,     3,EError,EStart,EStart,EStart,EError,EError,# 00-07
    EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
    EItsMe,EItsMe,EError,EError,EError,     4,EError,EError,# 10-17
    EError,EError,EError,EError,     5,EError,EError,EError,# 18-1f
    EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
  ].freeze

  ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0].freeze

  ISO2022KRSMModel = {
    'classTable'   => ISO2022KR_cls,
    'classFactor'  => 6,
    'stateTable'   => ISO2022KR_st,
    'charLenTable' => ISO2022KRCharLenTable,
    'name'         => "ISO-2022-KR"
  }.freeze
end
@@ -0,0 +1,88 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for EUC-JP. It combines a coding state machine built from
  # EUCJPSMModel with a distribution analyzer and a context analyzer.
  #
  # @_mState, @_mLastChar and #get_state are not defined here; they are
  # presumably provided by MultiByteCharSetProber -- confirm against that
  # class.
  class EUCJPProber < MultiByteCharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
      @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new
      @_mContextAnalyzer = EUCJPContextAnalysis.new
      reset
    end

    # Resets the inherited prober state and the context analyzer.
    def reset
      super()
      @_mContextAnalyzer.reset
    end

    def get_charset_name
      "EUC-JP"
    end

    # Runs aBuf through the state machine and, at every character boundary,
    # hands the two-byte window ending at the current byte to both analyzers.
    # Returns the prober state.
    def feed(aBuf)
      # Work on a binary-tagged copy so that length and indexing count bytes
      # regardless of the encoding tag on the caller's string. Rubies without
      # String#force_encoding already index strings by byte.
      aBuf = aBuf.dup.force_encoding('BINARY') if aBuf.respond_to?(:force_encoding)
      aLen = aBuf.length

      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i..i])
        if codingState == EError
          $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          charLen = @_mCodingSM.get_current_charlen
          if i == 0
            # The window straddles two calls: slot 0 holds the last byte of
            # the previous buffer (stored at the end of this method).
            @_mLastChar[1] = aBuf[0..0]
            @_mContextAnalyzer.feed(@_mLastChar, charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            @_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
            @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
          end
        end
      end

      # Remember the final byte for the next call. With an empty buffer the
      # slice would be nil, so the previously remembered byte is kept instead.
      @_mLastChar[0] = aBuf[aLen-1..aLen-1] if aLen > 0

      if get_state == EDetecting &&
         @_mContextAnalyzer.got_enough_data && get_confidence > SHORTCUT_THRESHOLD
        @_mState = EFoundIt
      end

      get_state
    end

    # The larger of the context analyzer's and the distribution analyzer's
    # confidence.
    def get_confidence
      [@_mContextAnalyzer.get_confidence, @_mDistributionAnalyzer.get_confidence].max
    end
  end
end