rchardet 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
data/lib/rchardet.rb
CHANGED
@@ -15,8 +15,7 @@
|
|
15
15
|
# 02110-1301 USA
|
16
16
|
######################### END LICENSE BLOCK #########################
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
require 'rchardet/version'
|
20
19
|
require 'rchardet/charsetprober'
|
21
20
|
require 'rchardet/mbcharsetprober'
|
22
21
|
|
@@ -56,7 +55,6 @@ require 'rchardet/universaldetector'
|
|
56
55
|
require 'rchardet/utf8prober'
|
57
56
|
|
58
57
|
module CharDet
|
59
|
-
VERSION = "1.3.1"
|
60
58
|
def CharDet.detect(aBuf)
|
61
59
|
u = UniversalDetector.new
|
62
60
|
u.reset
|
data/lib/rchardet/big5freq.rb
CHANGED
@@ -922,6 +922,6 @@ Big5CharToFreqOrder = [
|
|
922
922
|
13920,13921,13922,13923,13924,13925,13926,13927,13928,13929,13930,13931,13932,13933,13934,13935, #13936
|
923
923
|
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
|
924
924
|
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
|
925
|
-
13968,13969,13970,13971,13972
|
925
|
+
13968,13969,13970,13971,13972 #13973
|
926
|
+
].freeze
|
926
927
|
end
|
927
|
-
|
data/lib/rchardet/big5prober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class Big5Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(Big5SMModel)
|
34
|
+
@distributionAnalyzer = Big5DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
@@ -33,50 +33,50 @@ module CharDet
|
|
33
33
|
|
34
34
|
class CharDistributionAnalysis
|
35
35
|
def initialize
|
36
|
-
@
|
37
|
-
@
|
38
|
-
@
|
36
|
+
@charToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
|
37
|
+
@tableSize = nil # Size of above table
|
38
|
+
@typicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
|
39
39
|
reset()
|
40
40
|
end
|
41
41
|
|
42
42
|
def reset
|
43
43
|
# # """reset analyser, clear any state"""
|
44
|
-
@
|
45
|
-
@
|
46
|
-
@
|
44
|
+
@done = false # If this flag is set to true, detection is done and a conclusion has been made
|
45
|
+
@totalChars = 0 # Total characters encountered
|
46
|
+
@freqChars = 0 # The number of characters whose frequency order is less than 512
|
47
47
|
end
|
48
48
|
|
49
49
|
def feed(aStr, aCharLen)
|
50
50
|
# # """feed a character with known length"""
|
51
51
|
if aCharLen == 2
|
52
|
-
|
53
|
-
|
52
|
+
# we only care about 2-bytes character in our distribution analysis
|
53
|
+
order = get_order(aStr)
|
54
54
|
else
|
55
|
-
|
55
|
+
order = -1
|
56
56
|
end
|
57
57
|
if order >= 0
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
58
|
+
@totalChars += 1
|
59
|
+
# order is valid
|
60
|
+
if order < @tableSize
|
61
|
+
if 512 > @charToFreqOrder[order]
|
62
|
+
@freqChars += 1
|
63
|
+
end
|
64
|
+
end
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
68
|
def get_confidence
|
69
69
|
# """return confidence based on existing data"""
|
70
70
|
# if we didn't receive any character in our consideration range, return negative answer
|
71
|
-
if @
|
72
|
-
|
71
|
+
if @totalChars <= 0
|
72
|
+
return SURE_NO
|
73
73
|
end
|
74
74
|
|
75
|
-
if @
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
75
|
+
if @totalChars != @freqChars
|
76
|
+
r = @freqChars / ((@totalChars - @freqChars) * @typicalDistributionRatio)
|
77
|
+
if r < SURE_YES
|
78
|
+
return r
|
79
|
+
end
|
80
80
|
end
|
81
81
|
|
82
82
|
# normalize confidence (we don't want to be 100% sure)
|
@@ -86,7 +86,7 @@ module CharDet
|
|
86
86
|
def got_enough_data
|
87
87
|
# It is not necessary to receive all data to draw a conclusion. For charset detection,
|
88
88
|
# a certain amount of data is enough
|
89
|
-
return @
|
89
|
+
return @totalChars > ENOUGH_DATA_THRESHOLD
|
90
90
|
end
|
91
91
|
|
92
92
|
def get_order(aStr)
|
@@ -100,9 +100,9 @@ module CharDet
|
|
100
100
|
class EUCTWDistributionAnalysis < CharDistributionAnalysis
|
101
101
|
def initialize
|
102
102
|
super()
|
103
|
-
@
|
104
|
-
@
|
105
|
-
@
|
103
|
+
@charToFreqOrder = EUCTWCharToFreqOrder
|
104
|
+
@tableSize = EUCTW_TABLE_SIZE
|
105
|
+
@typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
106
106
|
end
|
107
107
|
|
108
108
|
def get_order(aStr)
|
@@ -110,10 +110,11 @@ module CharDet
|
|
110
110
|
# first byte range: 0xc4 -- 0xfe
|
111
111
|
# second byte range: 0xa1 -- 0xfe
|
112
112
|
# no validation needed here. State machine has done that
|
113
|
-
if aStr[0
|
114
|
-
|
113
|
+
if aStr[0, 1] >= "\xC4"
|
114
|
+
bytes = aStr.bytes
|
115
|
+
return 94 * (bytes[0] - 0xC4) + bytes[1] - 0xA1
|
115
116
|
else
|
116
|
-
|
117
|
+
return -1
|
117
118
|
end
|
118
119
|
end
|
119
120
|
end
|
@@ -121,9 +122,9 @@ module CharDet
|
|
121
122
|
class EUCKRDistributionAnalysis < CharDistributionAnalysis
|
122
123
|
def initialize
|
123
124
|
super()
|
124
|
-
@
|
125
|
-
@
|
126
|
-
@
|
125
|
+
@charToFreqOrder = EUCKRCharToFreqOrder
|
126
|
+
@tableSize = EUCKR_TABLE_SIZE
|
127
|
+
@typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
127
128
|
end
|
128
129
|
|
129
130
|
def get_order(aStr)
|
@@ -131,10 +132,11 @@ module CharDet
|
|
131
132
|
# first byte range: 0xb0 -- 0xfe
|
132
133
|
# second byte range: 0xa1 -- 0xfe
|
133
134
|
# no validation needed here. State machine has done that
|
134
|
-
if aStr[0
|
135
|
-
|
135
|
+
if aStr[0, 1] >= "\xB0"
|
136
|
+
bytes = aStr.bytes
|
137
|
+
return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
|
136
138
|
else
|
137
|
-
|
139
|
+
return -1
|
138
140
|
end
|
139
141
|
end
|
140
142
|
end
|
@@ -142,9 +144,9 @@ module CharDet
|
|
142
144
|
class GB2312DistributionAnalysis < CharDistributionAnalysis
|
143
145
|
def initialize
|
144
146
|
super()
|
145
|
-
@
|
146
|
-
@
|
147
|
-
@
|
147
|
+
@charToFreqOrder = GB2312CharToFreqOrder
|
148
|
+
@tableSize = GB2312_TABLE_SIZE
|
149
|
+
@typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
148
150
|
end
|
149
151
|
|
150
152
|
def get_order(aStr)
|
@@ -152,10 +154,11 @@ module CharDet
|
|
152
154
|
# first byte range: 0xb0 -- 0xfe
|
153
155
|
# second byte range: 0xa1 -- 0xfe
|
154
156
|
# no validation needed here. State machine has done that
|
155
|
-
if (aStr[0
|
156
|
-
|
157
|
+
if (aStr[0, 1] >= "\xB0") and (aStr[1, 1] >= "\xA1")
|
158
|
+
bytes = aStr.bytes
|
159
|
+
return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
|
157
160
|
else
|
158
|
-
|
161
|
+
return -1
|
159
162
|
end
|
160
163
|
end
|
161
164
|
end
|
@@ -163,9 +166,9 @@ module CharDet
|
|
163
166
|
class Big5DistributionAnalysis < CharDistributionAnalysis
|
164
167
|
def initialize
|
165
168
|
super
|
166
|
-
@
|
167
|
-
@
|
168
|
-
@
|
169
|
+
@charToFreqOrder = Big5CharToFreqOrder
|
170
|
+
@tableSize = BIG5_TABLE_SIZE
|
171
|
+
@typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
169
172
|
end
|
170
173
|
|
171
174
|
def get_order(aStr)
|
@@ -173,14 +176,15 @@ module CharDet
|
|
173
176
|
# first byte range: 0xa4 -- 0xfe
|
174
177
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
175
178
|
# no validation needed here. State machine has done that
|
176
|
-
if aStr[0
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
179
|
+
if aStr[0, 1] >= "\xA4"
|
180
|
+
bytes = aStr.bytes
|
181
|
+
if aStr[1, 1] >= "\xA1"
|
182
|
+
return 157 * (bytes[0] - 0xA4) + bytes[1] - 0xA1 + 63
|
183
|
+
else
|
184
|
+
return 157 * (bytes[0] - 0xA4) + bytes[1] - 0x40
|
185
|
+
end
|
182
186
|
else
|
183
|
-
|
187
|
+
return -1
|
184
188
|
end
|
185
189
|
end
|
186
190
|
end
|
@@ -188,9 +192,9 @@ module CharDet
|
|
188
192
|
class SJISDistributionAnalysis < CharDistributionAnalysis
|
189
193
|
def initialize
|
190
194
|
super()
|
191
|
-
@
|
192
|
-
@
|
193
|
-
@
|
195
|
+
@charToFreqOrder = JISCharToFreqOrder
|
196
|
+
@tableSize = JIS_TABLE_SIZE
|
197
|
+
@typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
194
198
|
end
|
195
199
|
|
196
200
|
def get_order(aStr)
|
@@ -198,17 +202,17 @@ module CharDet
|
|
198
202
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
199
203
|
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
200
204
|
# no validation needed here. State machine has done that
|
201
|
-
|
202
|
-
if (aStr[0
|
203
|
-
|
204
|
-
elsif (aStr[0
|
205
|
-
|
205
|
+
bytes = aStr.bytes
|
206
|
+
if (aStr[0, 1] >= "\x81") and (aStr[0, 1] <= "\x9F")
|
207
|
+
order = 188 * (bytes[0] - 0x81)
|
208
|
+
elsif (aStr[0, 1] >= "\xE0") and (aStr[0, 1] <= "\xEF")
|
209
|
+
order = 188 * (bytes[0] - 0xE0 + 31)
|
206
210
|
else
|
207
|
-
|
211
|
+
return -1
|
208
212
|
end
|
209
|
-
order = order +
|
210
|
-
if aStr[1
|
211
|
-
|
213
|
+
order = order + bytes[1] - 0x40
|
214
|
+
if aStr[1, 1] > "\x7F"
|
215
|
+
order =- 1
|
212
216
|
end
|
213
217
|
return order
|
214
218
|
end
|
@@ -217,9 +221,9 @@ module CharDet
|
|
217
221
|
class EUCJPDistributionAnalysis < CharDistributionAnalysis
|
218
222
|
def initialize
|
219
223
|
super()
|
220
|
-
@
|
221
|
-
@
|
222
|
-
@
|
224
|
+
@charToFreqOrder = JISCharToFreqOrder
|
225
|
+
@tableSize = JIS_TABLE_SIZE
|
226
|
+
@typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
223
227
|
end
|
224
228
|
|
225
229
|
def get_order(aStr)
|
@@ -227,8 +231,9 @@ module CharDet
|
|
227
231
|
# first byte range: 0xa0 -- 0xfe
|
228
232
|
# second byte range: 0xa1 -- 0xfe
|
229
233
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0
|
231
|
-
|
234
|
+
if aStr[0, 1] >= "\xA0"
|
235
|
+
bytes = aStr.bytes
|
236
|
+
return 94 * (bytes[0] - 0xA1) + bytes[1] - 0xa1
|
232
237
|
else
|
233
238
|
return -1
|
234
239
|
end
|
@@ -28,54 +28,55 @@
|
|
28
28
|
|
29
29
|
module CharDet
|
30
30
|
class CharSetGroupProber < CharSetProber
|
31
|
-
attr_accessor :
|
31
|
+
attr_accessor :probers
|
32
32
|
def initialize
|
33
33
|
super
|
34
|
-
@
|
35
|
-
@
|
36
|
-
@
|
34
|
+
@activeNum = 0
|
35
|
+
@probers = []
|
36
|
+
@bestGuessProber = nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def reset
|
40
40
|
super
|
41
|
-
@
|
41
|
+
@activeNum = 0
|
42
42
|
|
43
|
-
for prober in @
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
for prober in @probers
|
44
|
+
if prober
|
45
|
+
prober.reset()
|
46
|
+
prober.active = true
|
47
|
+
@activeNum += 1
|
48
|
+
end
|
49
49
|
end
|
50
|
-
@
|
50
|
+
@bestGuessProber = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
def get_charset_name
|
54
|
-
if
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
if !@bestGuessProber
|
55
|
+
get_confidence()
|
56
|
+
if !@bestGuessProber
|
57
|
+
return nil
|
58
|
+
end
|
58
59
|
end
|
59
|
-
return @
|
60
|
+
return @bestGuessProber.get_charset_name()
|
60
61
|
end
|
61
62
|
|
62
63
|
def feed(aBuf)
|
63
|
-
for prober in @
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
64
|
+
for prober in @probers
|
65
|
+
next unless prober
|
66
|
+
next unless prober.active
|
67
|
+
st = prober.feed(aBuf)
|
68
|
+
next unless st
|
69
|
+
if st == EFoundIt
|
70
|
+
@bestGuessProber = prober
|
71
|
+
return get_state()
|
72
|
+
elsif st == ENotMe
|
73
|
+
prober.active = false
|
74
|
+
@activeNum -= 1
|
75
|
+
if @activeNum <= 0
|
76
|
+
@state = ENotMe
|
77
|
+
return get_state()
|
78
|
+
end
|
79
|
+
end
|
79
80
|
end
|
80
81
|
return get_state()
|
81
82
|
end
|
@@ -83,30 +84,27 @@ module CharDet
|
|
83
84
|
def get_confidence()
|
84
85
|
st = get_state()
|
85
86
|
if st == EFoundIt
|
86
|
-
|
87
|
+
return 0.99
|
87
88
|
elsif st == ENotMe
|
88
|
-
|
89
|
+
return 0.01
|
89
90
|
end
|
90
91
|
bestConf = 0.0
|
91
|
-
@
|
92
|
-
for prober in @
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
92
|
+
@bestGuessProber = nil
|
93
|
+
for prober in @probers
|
94
|
+
next unless prober
|
95
|
+
unless prober.active
|
96
|
+
$stderr << "#{prober.get_charset_name()} not active\n" if $debug
|
97
|
+
next
|
98
|
+
end
|
99
|
+
cf = prober.get_confidence()
|
100
|
+
$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
|
101
|
+
if bestConf < cf
|
102
|
+
bestConf = cf
|
103
|
+
@bestGuessProber = prober
|
104
|
+
end
|
104
105
|
end
|
105
|
-
return 0.0 unless @
|
106
|
+
return 0.0 unless @bestGuessProber
|
106
107
|
return bestConf
|
107
|
-
# else:
|
108
|
-
# self._mBestGuessProber = self._mProbers[0]
|
109
|
-
# return self._mBestGuessProber.get_confidence()
|
110
108
|
end
|
111
109
|
end
|
112
110
|
end
|
@@ -34,7 +34,7 @@ module CharDet
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def reset
|
37
|
-
@
|
37
|
+
@state = EDetecting
|
38
38
|
end
|
39
39
|
|
40
40
|
def get_charset_name
|
@@ -45,7 +45,7 @@ module CharDet
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def get_state
|
48
|
-
return @
|
48
|
+
return @state
|
49
49
|
end
|
50
50
|
|
51
51
|
def get_confidence
|
@@ -53,11 +53,6 @@ module CharDet
|
|
53
53
|
end
|
54
54
|
|
55
55
|
def filter_high_bit_only(aBuf)
|
56
|
-
# DO NOT USE `gsub!`
|
57
|
-
# It will remove all characters from the buffer that is later used by
|
58
|
-
# other probers. This is because gsub! removes data from the instance variable
|
59
|
-
# that will be passed to later probers, while gsub makes a new instance variable
|
60
|
-
# that will not.
|
61
56
|
newBuf = aBuf.gsub(/([\x00-\x7F])+/, ' ')
|
62
57
|
return newBuf
|
63
58
|
end
|