chardet2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/Big5Prober.rb ADDED
@@ -0,0 +1,48 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'CharDistributionAnalysis'
33
+ require 'MBCSSM'
34
+
35
+ module UniversalDetector
36
# Multi-byte charset prober specialised for Big5.
#
# All probing logic is inherited from MultiByteCharSetProber; this class only
# plugs in the Big5 state-machine model and the Big5 distribution analyser.
class Big5Prober < MultiByteCharSetProber
  # Creates the Big5 coding state machine and distribution analyser, then
  # resets the prober.
  def initialize
    super
    @_mCodingSM = CodingStateMachine.new(Big5SMModel)
    @_mDistributionAnalyzer = Big5DistributionAnalysis.new
    reset
  end

  # Returns the name of the charset this prober detects.
  def get_charset_name
    'Big5'
  end
end
48
+ end
@@ -0,0 +1,245 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "UniversalDetector"
30
+ require "EUCTWFreq"
31
+ require "EUCKRFreq"
32
+ require "GB2312Freq"
33
+ require "Big5Freq"
34
+ require "JISFreq"
35
+
36
module UniversalDetector
  # Base class for character-distribution analysis.
  #
  # A subclass supplies a frequency-order table, the table size and a typical
  # distribution ratio, and overrides #get_order to turn a two-byte character
  # into an index ("order") into that table. The analyser counts how many of
  # the characters fed to it have a frequency order below 512 and derives a
  # confidence value from those counts.
  class CharDistributionAnalysis
    # #got_enough_data becomes true once more characters than this were counted.
    ENOUGH_DATA_THRESHOLD = 1024
    # Upper bound of the reported confidence (we never claim to be 100% sure).
    SURE_YES = 0.99
    # Confidence reported when no character in the considered range was seen.
    SURE_NO = 0.01

    def initialize
      # Mapping table to get frequency order from char order (see #get_order).
      @_mCharToFreqOrder = nil
      # Size of the table above.
      @_mTableSize = nil
      # Constant which varies from language to language, used in calculating
      # confidence. See
      # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
      # for further detail.
      @_mTypicalDistributionRatio = nil
      reset
    end

    # Resets the analyser, clearing all counters.
    def reset
      # If this flag is set to true, detection is done and a conclusion has been made.
      @_mDone = false
      # Total characters encountered.
      @_mTotalChars = 0
      # The number of characters whose frequency order is less than 512.
      @_mFreqChars = 0
    end

    # Feeds one character of known byte length to the analyser.
    #
    # aStr     - the bytes of the character; a String or any object whose
    #            [] returns integer byte values.
    # aCharLen - length of the character in bytes; only 2-byte characters
    #            take part in the distribution analysis.
    def feed(aStr, aCharLen)
      order = aCharLen == 2 ? get_order(aStr) : -1
      return unless order >= 0

      # order is valid
      @_mTotalChars += 1
      return unless order < @_mTableSize

      @_mFreqChars += 1 if @_mCharToFreqOrder[order] < 512
    end

    # Returns the confidence based on the data seen so far: SURE_NO when no
    # character was counted, otherwise a ratio capped at SURE_YES.
    def get_confidence
      # if we didn't receive any character in our consideration range, return negative answer
      return SURE_NO if @_mTotalChars <= 0

      if @_mTotalChars != @_mFreqChars
        r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
        return r if r < SURE_YES
      end

      # normalize confidence (we don't want to be 100% sure)
      SURE_YES
    end

    # It is not necessary to receive all data to draw a conclusion. For
    # charset detection, a certain amount of data is enough.
    def got_enough_data
      @_mTotalChars > ENOUGH_DATA_THRESHOLD
    end

    # We do not handle characters based on the original encoding string, but
    # convert this encoding string to a number, here called order.
    # This allows multiple encodings of a language to share one frequency table.
    # The base implementation knows no encoding and always returns -1.
    def get_order(aStr)
      -1
    end

    private

    # Returns the integer value of the byte at +index+ in +aStr+.
    # Strings answer via getbyte, so this also works on Ruby versions where
    # String#[] returns a one-character String; anything else (e.g. an Array
    # of byte values) is indexed with [] as before.
    def byte_at(aStr, index)
      aStr.respond_to?(:getbyte) ? aStr.getbyte(index) : aStr[index]
    end
  end

  # Distribution analyser using the EUC-TW frequency table.
  class EUCTWDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = EUCTWCharToFreqOrder
      @_mTableSize = EUCTW_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-TW encoding, we are interested
    #   first  byte range: 0xc4 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      return -1 unless lead >= 0xC4

      94 * (lead - 0xC4) + byte_at(aStr, 1) - 0xA1
    end
  end

  # Distribution analyser using the EUC-KR frequency table.
  class EUCKRDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = EUCKRCharToFreqOrder
      @_mTableSize = EUCKR_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-KR encoding, we are interested
    #   first  byte range: 0xb0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      return -1 unless lead >= 0xB0

      94 * (lead - 0xB0) + byte_at(aStr, 1) - 0xA1
    end
  end

  # Distribution analyser using the GB2312 frequency table.
  class GB2312DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = GB2312CharToFreqOrder
      @_mTableSize = GB2312_TABLE_SIZE
      @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
    end

    # for GB2312 encoding, we are interested
    #   first  byte range: 0xb0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      return -1 unless lead >= 0xB0

      trail = byte_at(aStr, 1)
      return -1 unless trail >= 0xA1

      94 * (lead - 0xB0) + trail - 0xA1
    end
  end

  # Distribution analyser using the Big5 frequency table.
  class Big5DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = Big5CharToFreqOrder
      @_mTableSize = BIG5_TABLE_SIZE
      @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
    end

    # for big5 encoding, we are interested
    #   first  byte range: 0xa4 -- 0xfe
    #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      return -1 unless lead >= 0xA4

      trail = byte_at(aStr, 1)
      row = 157 * (lead - 0xA4)
      if trail >= 0xA1
        # the upper trail-byte range follows the 63 slots of 0x40 -- 0x7e
        row + trail - 0xA1 + 63
      else
        row + trail - 0x40
      end
    end
  end

  # Distribution analyser for Shift_JIS, using the JIS frequency table.
  class SJISDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for sjis encoding, we are interested
    #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
    #   second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      if lead >= 0x81 && lead <= 0x9F
        order = 188 * (lead - 0x81)
      elsif lead >= 0xE0 && lead <= 0xEF
        order = 188 * (lead - 0xE0 + 31)
      else
        return -1
      end

      trail = byte_at(aStr, 1)
      order += trail - 0x40
      # Trail bytes above 0x7F are shifted down by one slot. This used to read
      # `order =- 1`, which assigned -1 and threw away every such character;
      # the decrement mirrors the `order--` of the Mozilla C++ original.
      order -= 1 if trail > 0x7F
      order
    end
  end

  # Distribution analyser for EUC-JP, using the JIS frequency table.
  class EUCJPDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-JP encoding, we are interested
    #   first  byte range: 0xa0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead = byte_at(aStr, 0)
      return -1 unless lead >= 0xA0

      94 * (lead - 0xA1) + byte_at(aStr, 1) - 0xa1
    end
  end
end
@@ -0,0 +1,114 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
+ module UniversalDetector
33
# Prober that runs a list of child probers and reports the answer of the
# one with the highest confidence.
#
# Children are kept in @_mProbers (entries may be nil and are then skipped);
# @_mActiveNum counts the children that are still active and
# @_mBestGuessProber remembers the current best candidate.
class CharSetGroupProber < CharSetProber
  def initialize
    @_mActiveNum = 0
    @_mProbers = []
    @_mBestGuessProber = nil
  end

  # Resets this prober and every child, re-activating all children.
  def reset
    super
    @_mActiveNum = 0
    @_mProbers.each do |child|
      next unless child

      child.reset
      child.active = true
      @_mActiveNum += 1
    end
    @_mBestGuessProber = nil
  end

  # Returns the charset name of the best-guess child, computing the best
  # guess first if none is known yet; nil when no child qualifies.
  def get_charset_name
    unless @_mBestGuessProber
      get_confidence
      return nil unless @_mBestGuessProber
      # @_mBestGuessProber = @_mProbers[0]
    end
    @_mBestGuessProber.get_charset_name
  end

  # Feeds aBuf to every active child and returns this prober's state.
  #
  # A child answering :FoundIt becomes the best guess and ends the pass.
  # A child answering :NotMe is deactivated; once no active child is left
  # the group itself switches to :NotMe.
  def feed(aBuf)
    @_mProbers.each do |child|
      next unless child && child.active

      st = child.feed(aBuf)
      next unless st

      if st == :FoundIt
        @_mBestGuessProber = child
        return get_state
      elsif st == :NotMe
        child.active = false
        @_mActiveNum -= 1
        if @_mActiveNum <= 0
          @_mState = :NotMe
          return get_state
        end
      end
    end
    get_state
  end

  # Returns 0.99 / 0.01 when the group state is already :FoundIt / :NotMe.
  # Otherwise returns the highest confidence among the active children and
  # records that child as the best guess (0.0 if there is none).
  def get_confidence
    st = get_state
    return 0.99 if st == :FoundIt
    return 0.01 if st == :NotMe

    best_conf = 0.0
    @_mBestGuessProber = nil
    @_mProbers.each do |child|
      next unless child

      unless child.active
        p(child.get_charset_name + ' not active\n') if UniversalDetector::DEBUG
        next
      end

      cf = child.get_confidence
      p('%s confidence = %s\n' % [child.get_charset_name, cf]) if UniversalDetector::DEBUG
      if best_conf < cf
        best_conf = cf
        @_mBestGuessProber = child
      end
    end
    return 0.0 unless @_mBestGuessProber

    best_conf
  end
end
114
+ end
@@ -0,0 +1,70 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+
31
+ module UniversalDetector
32
+
33
# Common base class of all charset probers.
#
# It holds the detection state in @_mState and provides neutral default
# answers plus a few buffer-filtering helpers.
class CharSetProber
  # Puts the prober back into the :Detecting state.
  def reset
    @_mState = :Detecting
  end

  # Name of the detected charset; the base class knows none and returns nil.
  def get_charset_name
    nil
  end

  # Consumes a chunk of data. The base implementation does nothing.
  def feed(data)
  end

  # Returns the current detection state.
  def get_state
    @_mState
  end

  # Confidence of the current guess; always 0.0 in the base class.
  def get_confidence
    0.0
  end

  # Returns a copy of aBuf with every run of bytes in the range
  # 0x00 -- 0x7F removed.
  def filter_high_bit_only(aBuf)
    aBuf.gsub(/([\x00-\x7F])+/, '')
  end

  # Returns a copy of aBuf with every run of ASCII letters (A-Z, a-z) removed.
  def filter_without_english_letters(aBuf)
    aBuf.gsub(/([A-Za-z])+/, '')
  end

  # TODO: not implemented yet; currently hands aBuf back unchanged.
  def filter_with_english_letters(aBuf)
    aBuf
  end
end #class
69
+
70
+ end #module
@@ -0,0 +1,74 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Mark Pilgrim - first port to Python
11
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+
31
+ module UniversalDetector
32
# Table-driven state machine that is advanced one byte at a time.
#
# The model passed to the constructor is indexed with the keys
# 'classTable', 'charLenTable', 'stateTable', 'classFactor' and 'name'.
class CodingStateMachine
  # sm - the state machine model (see the keys listed on the class).
  def initialize(sm)
    @_mModel = sm
    @_mCurrentBytePos = 0
    @_mCurrentCharLen = 0
    reset
  end

  # Puts the machine back into the :Start state.
  def reset
    @_mCurrentState = :Start
  end

  # Advances the machine by one byte and returns the new state.
  #
  # c - the byte, used as an index into the model's 'classTable'.
  def next_state(c)
    # for each byte we get its class
    byte_class = @_mModel['classTable'][c]

    # if it is first byte, we also get byte length
    if @_mCurrentState == :Start
      @_mCurrentBytePos = 0
      @_mCurrentCharLen = @_mModel['charLenTable'][byte_class]
    end

    # from byte's class and stateTable, we get its next state.
    # The three named states map to rows 0..2; any other state value is
    # used as the row number directly.
    named_rows = { :Start => 0, :Error => 1, :ItsMe => 2 }
    row = named_rows.fetch(@_mCurrentState, @_mCurrentState)
    @_mCurrentState = @_mModel['stateTable'][row * @_mModel['classFactor'] + byte_class]

    @_mCurrentBytePos += 1
    @_mCurrentState
  end

  # Byte length of the character currently being read, as looked up when
  # the machine last left the :Start state.
  def get_current_charlen
    @_mCurrentCharLen
  end

  # Returns the 'name' entry of the model.
  def get_coding_state_machine
    @_mModel['name']
  end
end
74
+ end