chardet2 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/Big5Prober.rb ADDED
@@ -0,0 +1,48 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'CharDistributionAnalysis'
33
+ require 'MBCSSM'
34
+
35
+ module UniversalDetector
36
# Multi-byte prober specialised for Big5: it pairs a coding state machine built
# from Big5SMModel with the Big5 character-distribution analyser.
class Big5Prober < MultiByteCharSetProber
  # Runs the parent initializer, installs the Big5-specific collaborators and
  # then calls reset (inherited; its body is not defined in this class).
  def initialize
    super
    @_mCodingSM = CodingStateMachine.new(Big5SMModel)
    @_mDistributionAnalyzer = Big5DistributionAnalysis.new
    reset
  end

  # Charset label reported by this prober.
  def get_charset_name
    "Big5"
  end
end
48
+ end
@@ -0,0 +1,245 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "UniversalDetector"
30
+ require "EUCTWFreq"
31
+ require "EUCKRFreq"
32
+ require "GB2312Freq"
33
+ require "Big5Freq"
34
+ require "JISFreq"
35
+
36
+ module UniversalDetector
37
# Base class for character-distribution analysis.
#
# Subclasses supply a frequency table, its size and a typical distribution
# ratio, and override get_order to turn a two-byte character into an index
# ("order") into that table. The base class counts how many of the characters
# it is fed are "frequent" (table value below 512) and derives a confidence
# from those counts.
class CharDistributionAnalysis
  ENOUGH_DATA_THRESHOLD = 1024
  SURE_YES = 0.99
  SURE_NO = 0.01

  def initialize
    # Mapping table to get frequency order from char order (see get_order).
    @_mCharToFreqOrder = nil
    # Size of the table above.
    @_mTableSize = nil
    # Constant that varies from language to language, used when calculating
    # confidence. See
    # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
    @_mTypicalDistributionRatio = nil
    reset
  end

  # Reset the analyser, clearing any accumulated state.
  def reset
    # Intended to flag that a conclusion has been reached; nothing in this
    # class reads it.
    @_mDone = false
    # Total characters counted (those with a non-negative order).
    @_mTotalChars = 0
    # Characters whose frequency order is less than 512.
    @_mFreqChars = 0
  end

  # Feed one character of known byte length.
  #
  # aStr     - the character's bytes (a String or an Array of Integers).
  # aCharLen - number of bytes in the character; only 2-byte characters take
  #            part in the distribution analysis.
  def feed(aStr, aCharLen)
    order = aCharLen == 2 ? get_order(aStr) : -1
    return if order < 0

    @_mTotalChars += 1
    # Orders at or beyond the table size are counted but never "frequent".
    return unless order < @_mTableSize

    @_mFreqChars += 1 if 512 > @_mCharToFreqOrder[order]
  end

  # Return a confidence value derived from the counts gathered so far.
  #
  # Returns SURE_NO when no character has been counted, otherwise the ratio
  # of frequent to non-frequent characters scaled by the typical distribution
  # ratio, capped at SURE_YES (we never want to be 100% sure).
  def get_confidence
    return SURE_NO if @_mTotalChars <= 0

    if @_mTotalChars != @_mFreqChars
      # to_f keeps the division from truncating to 0 when the ratio happens
      # to be an Integer.
      r = @_mFreqChars.to_f / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
      return r if r < SURE_YES
    end

    SURE_YES
  end

  # It is not necessary to receive all data to draw a conclusion; for charset
  # detection a certain amount of data is enough.
  def got_enough_data
    @_mTotalChars > ENOUGH_DATA_THRESHOLD
  end

  # We do not handle characters based on the original encoding string, but
  # convert the encoding string to a number, here called order. This allows
  # multiple encodings of a language to share one frequency table.
  # The base implementation recognises nothing and returns -1.
  def get_order(aStr)
    -1
  end

  private

  # Integer value of the byte at +index+. Strings are read with getbyte when
  # it is available, because String#[] returns a one-character String on
  # Ruby >= 1.9 and cannot be compared with Integers; anything else (for
  # example an Array of Integers) is indexed directly.
  def byte_at(aStr, index)
    aStr.respond_to?(:getbyte) ? aStr.getbyte(index) : aStr[index]
  end
end

# Distribution analysis for EUC-TW.
class EUCTWDistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = EUCTWCharToFreqOrder
    @_mTableSize = EUCTW_TABLE_SIZE
    @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
  end

  # for euc-TW encoding, we are interested
  #   first  byte range: 0xc4 -- 0xfe
  #   second byte range: 0xa1 -- 0xfe
  # no validation needed here. State machine has done that
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    return -1 unless lead >= 0xC4

    94 * (lead - 0xC4) + byte_at(aStr, 1) - 0xA1
  end
end

# Distribution analysis for EUC-KR.
class EUCKRDistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = EUCKRCharToFreqOrder
    @_mTableSize = EUCKR_TABLE_SIZE
    @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
  end

  # for euc-KR encoding, we are interested
  #   first  byte range: 0xb0 -- 0xfe
  #   second byte range: 0xa1 -- 0xfe
  # no validation needed here. State machine has done that
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    return -1 unless lead >= 0xB0

    94 * (lead - 0xB0) + byte_at(aStr, 1) - 0xA1
  end
end

# Distribution analysis for GB2312.
class GB2312DistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = GB2312CharToFreqOrder
    @_mTableSize = GB2312_TABLE_SIZE
    @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
  end

  # for GB2312 encoding, we are interested
  #   first  byte range: 0xb0 -- 0xfe
  #   second byte range: 0xa1 -- 0xfe
  # no validation needed here. State machine has done that
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    trail = byte_at(aStr, 1)
    return -1 unless lead >= 0xB0 && trail >= 0xA1

    94 * (lead - 0xB0) + trail - 0xA1
  end
end

# Distribution analysis for Big5.
class Big5DistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = Big5CharToFreqOrder
    @_mTableSize = BIG5_TABLE_SIZE
    @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
  end

  # for big5 encoding, we are interested
  #   first  byte range: 0xa4 -- 0xfe
  #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
  # no validation needed here. State machine has done that
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    return -1 unless lead >= 0xA4

    trail = byte_at(aStr, 1)
    row = 157 * (lead - 0xA4)
    # Trail bytes in the upper range follow the 63 slots (0x40..0x7e) of the
    # lower range within each 157-entry row.
    trail >= 0xA1 ? row + trail - 0xA1 + 63 : row + trail - 0x40
  end
end

# Distribution analysis for Shift_JIS.
class SJISDistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = JISCharToFreqOrder
    @_mTableSize = JIS_TABLE_SIZE
    @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
  end

  # for sjis encoding, we are interested
  #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
  #   second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
  # no validation needed here. State machine has done that
  # (Lead bytes above 0xEF are rejected by the code below.)
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    if lead >= 0x81 && lead <= 0x9F
      order = 188 * (lead - 0x81)
    elsif lead >= 0xE0 && lead <= 0xEF
      order = 188 * (lead - 0xE0 + 31)
    else
      return -1
    end

    trail = byte_at(aStr, 1)
    order += trail - 0x40
    # 0x7F is not a valid trail byte, so trail bytes above it sit one slot
    # lower in the table. (The previous "order =- 1" assigned -1 and threw
    # every such character away.)
    order -= 1 if trail > 0x7F
    order
  end
end

# Distribution analysis for EUC-JP.
class EUCJPDistributionAnalysis < CharDistributionAnalysis
  def initialize
    super
    @_mCharToFreqOrder = JISCharToFreqOrder
    @_mTableSize = JIS_TABLE_SIZE
    @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
  end

  # for euc-JP encoding, we are interested
  #   first  byte range: 0xa0 -- 0xfe
  #   second byte range: 0xa1 -- 0xfe
  # no validation needed here. State machine has done that
  def get_order(aStr)
    lead = byte_at(aStr, 0)
    return -1 unless lead >= 0xA0

    # The row offset is taken from 0xA1 although 0xA0 passes the guard above.
    94 * (lead - 0xA1) + byte_at(aStr, 1) - 0xa1
  end
end
244
+
245
+ end
@@ -0,0 +1,114 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
+ module UniversalDetector
33
# A prober that delegates to a list of child probers and reports the charset
# of whichever child is currently the best guess.
#
# This class never fills @_mProbers itself; it only initialises it to an
# empty Array. Each non-nil entry is expected to respond to reset, feed,
# get_confidence, get_charset_name, active and active=.
class CharSetGroupProber < CharSetProber
  def initialize
    @_mActiveNum = 0
    @_mProbers = []
    @_mBestGuessProber = nil
  end

  # Reset this prober and every child, re-activating all children and
  # forgetting the current best guess.
  def reset
    super
    @_mActiveNum = 0
    @_mProbers.each do |prober|
      next unless prober

      prober.reset
      prober.active = true
      @_mActiveNum += 1
    end
    @_mBestGuessProber = nil
  end

  # Charset name of the best-guess child, computing the best guess via
  # get_confidence first if none is recorded. Returns nil when no child
  # qualifies.
  def get_charset_name
    unless @_mBestGuessProber
      get_confidence
      return nil unless @_mBestGuessProber
      # @_mBestGuessProber = @_mProbers[0]
    end
    @_mBestGuessProber.get_charset_name
  end

  # Feed a buffer to every active child and return this prober's state.
  #
  # A child answering :FoundIt becomes the best guess and moves the group
  # into :FoundIt; a child answering :NotMe is deactivated, and the group
  # becomes :NotMe once no active children remain.
  def feed(aBuf)
    @_mProbers.each do |prober|
      next unless prober && prober.active

      st = prober.feed(aBuf)
      next unless st

      if st == :FoundIt
        @_mBestGuessProber = prober
        # Record the verdict on the group itself; without this get_state
        # and get_confidence keep reporting the previous state.
        @_mState = :FoundIt
        return get_state
      elsif st == :NotMe
        prober.active = false
        @_mActiveNum -= 1
        if @_mActiveNum <= 0
          @_mState = :NotMe
          return get_state
        end
      end
    end
    get_state
  end

  # Confidence of the group: 0.99 once in :FoundIt, 0.01 once in :NotMe,
  # otherwise the highest confidence among the active children (which also
  # becomes the best guess). Returns 0.0 when no child has a confidence
  # above zero.
  def get_confidence
    st = get_state
    if st == :FoundIt
      return 0.99
    elsif st == :NotMe
      return 0.01
    end

    bestConf = 0.0
    @_mBestGuessProber = nil
    @_mProbers.each do |prober|
      next unless prober

      unless prober.active
        p(prober.get_charset_name() + ' not active\n') if UniversalDetector::DEBUG
        next
      end

      cf = prober.get_confidence
      p('%s confidence = %s\n' % [prober.get_charset_name(), cf]) if UniversalDetector::DEBUG
      if bestConf < cf
        bestConf = cf
        @_mBestGuessProber = prober
      end
    end
    return 0.0 unless @_mBestGuessProber

    bestConf
  end
end
114
+ end
@@ -0,0 +1,70 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+
31
+ module UniversalDetector
32
+
33
# Base class for charset probers. It holds the detection state and provides
# neutral defaults for the prober interface plus a few buffer filters.
class CharSetProber
  # Activity flag read and written through prober.active / prober.active= by
  # CharSetGroupProber; declared here so every prober responds to it.
  attr_accessor :active

  # Put the prober back into the :Detecting state.
  def reset
    @_mState = :Detecting
  end

  # Name of the detected charset; the base class has none and returns nil.
  def get_charset_name
    nil
  end

  # Consume a chunk of data. The base implementation does nothing and
  # returns nil.
  def feed(data)
  end

  # Current state; nil until reset has been called.
  def get_state
    @_mState
  end

  # Confidence in the guess; the base class always answers 0.0.
  def get_confidence
    0.0
  end

  # Return a copy of aBuf with every run of bytes in 0x00..0x7F removed.
  def filter_high_bit_only(aBuf)
    matchable_copy(aBuf).gsub(/([\x00-\x7F])+/, '')
  end

  # Return a copy of aBuf with every run of ASCII letters removed.
  def filter_without_english_letters(aBuf)
    matchable_copy(aBuf).gsub(/([A-Za-z])+/, '')
  end

  def filter_with_english_letters(aBuf)
    # TODO
    aBuf
  end

  private

  # Regexp matching raises ArgumentError ("invalid byte sequence") on a
  # String whose bytes are not valid for its tagged encoding. Such buffers
  # are matched as raw bytes through a binary-tagged copy; everything else
  # is returned untouched so existing results keep their encoding.
  def matchable_copy(aBuf)
    return aBuf unless aBuf.respond_to?(:valid_encoding?)
    return aBuf if aBuf.valid_encoding?

    aBuf.dup.force_encoding('BINARY')
  end
end #class
69
+
70
+ end #module
@@ -0,0 +1,74 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Mark Pilgrim - first port to Python
11
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+
31
+ module UniversalDetector
32
# Table-driven state machine that validates a byte stream against one
# encoding model.
#
# The model is a Hash read through the keys 'classTable', 'charLenTable',
# 'stateTable', 'classFactor' and 'name'.
class CodingStateMachine
  # Row numbers of the symbolic states inside the model's stateTable. Any
  # other state value is used as its own row number.
  STATE_VALUES = { :Start => 0, :Error => 1, :ItsMe => 2 }.freeze

  # sm - the encoding model Hash described above.
  def initialize(sm)
    @_mModel = sm
    @_mCurrentBytePos = 0
    @_mCurrentCharLen = 0
    reset
  end

  # Return to the :Start state.
  def reset
    @_mCurrentState = :Start
  end

  # Advance the machine by one byte and return the new state.
  #
  # c - the byte, as an Integer or as a String whose first byte is used
  #     (String#[] yields one-character Strings on Ruby >= 1.9, which cannot
  #     index the class table directly).
  def next_state(c)
    c = c.getbyte(0) if c.respond_to?(:getbyte)

    # for each byte we get its class
    # if it is first byte, we also get byte length
    byteCls = @_mModel['classTable'][c]
    if @_mCurrentState == :Start
      @_mCurrentBytePos = 0
      @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
    end

    # from byte's class and stateTable, we get its next state
    row = STATE_VALUES.fetch(@_mCurrentState, @_mCurrentState)
    @_mCurrentState = @_mModel['stateTable'][row * @_mModel['classFactor'] + byteCls]

    @_mCurrentBytePos += 1
    @_mCurrentState
  end

  # Character length looked up for the byte that last started a character.
  def get_current_charlen
    @_mCurrentCharLen
  end

  # The model's 'name' entry.
  def get_coding_state_machine
    @_mModel['name']
  end
end
74
+ end