rchardet 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
data/lib/rchardet.rb
CHANGED
@@ -15,8 +15,7 @@
|
|
15
15
|
# 02110-1301 USA
|
16
16
|
######################### END LICENSE BLOCK #########################
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
require 'rchardet/version'
|
20
19
|
require 'rchardet/charsetprober'
|
21
20
|
require 'rchardet/mbcharsetprober'
|
22
21
|
|
@@ -56,7 +55,6 @@ require 'rchardet/universaldetector'
|
|
56
55
|
require 'rchardet/utf8prober'
|
57
56
|
|
58
57
|
module CharDet
|
59
|
-
VERSION = "1.3.1"
|
60
58
|
def CharDet.detect(aBuf)
|
61
59
|
u = UniversalDetector.new
|
62
60
|
u.reset
|
data/lib/rchardet/big5freq.rb
CHANGED
@@ -922,6 +922,6 @@ Big5CharToFreqOrder = [
|
|
922
922
|
13920,13921,13922,13923,13924,13925,13926,13927,13928,13929,13930,13931,13932,13933,13934,13935, #13936
|
923
923
|
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
|
924
924
|
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
|
925
|
-
13968,13969,13970,13971,13972
|
925
|
+
13968,13969,13970,13971,13972 #13973
|
926
|
+
].freeze
|
926
927
|
end
|
927
|
-
|
data/lib/rchardet/big5prober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class Big5Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(Big5SMModel)
|
34
|
+
@distributionAnalyzer = Big5DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
@@ -33,50 +33,50 @@ module CharDet
|
|
33
33
|
|
34
34
|
class CharDistributionAnalysis
|
35
35
|
def initialize
|
36
|
-
@
|
37
|
-
@
|
38
|
-
@
|
36
|
+
@charToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
|
37
|
+
@tableSize = nil # Size of above table
|
38
|
+
@typicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
|
39
39
|
reset()
|
40
40
|
end
|
41
41
|
|
42
42
|
def reset
|
43
43
|
# # """reset analyser, clear any state"""
|
44
|
-
@
|
45
|
-
@
|
46
|
-
@
|
44
|
+
@done = false # If this flag is set to constants.True, detection is done and conclusion has been made
|
45
|
+
@totalChars = 0 # Total characters encountered
|
46
|
+
@freqChars = 0 # The number of characters whose frequency order is less than 512
|
47
47
|
end
|
48
48
|
|
49
49
|
def feed(aStr, aCharLen)
|
50
50
|
# # """feed a character with known length"""
|
51
51
|
if aCharLen == 2
|
52
|
-
|
53
|
-
|
52
|
+
# we only care about 2-bytes character in our distribution analysis
|
53
|
+
order = get_order(aStr)
|
54
54
|
else
|
55
|
-
|
55
|
+
order = -1
|
56
56
|
end
|
57
57
|
if order >= 0
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
58
|
+
@totalChars += 1
|
59
|
+
# order is valid
|
60
|
+
if order < @tableSize
|
61
|
+
if 512 > @charToFreqOrder[order]
|
62
|
+
@freqChars += 1
|
63
|
+
end
|
64
|
+
end
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
68
|
def get_confidence
|
69
69
|
# """return confidence based on existing data"""
|
70
70
|
# if we didn't receive any character in our consideration range, return negative answer
|
71
|
-
if @
|
72
|
-
|
71
|
+
if @totalChars <= 0
|
72
|
+
return SURE_NO
|
73
73
|
end
|
74
74
|
|
75
|
-
if @
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
75
|
+
if @totalChars != @freqChars
|
76
|
+
r = @freqChars / ((@totalChars - @freqChars) * @typicalDistributionRatio)
|
77
|
+
if r < SURE_YES
|
78
|
+
return r
|
79
|
+
end
|
80
80
|
end
|
81
81
|
|
82
82
|
# normalize confidence (we don't want to be 100% sure)
|
@@ -86,7 +86,7 @@ module CharDet
|
|
86
86
|
def got_enough_data
|
87
87
|
# It is not necessary to receive all data to draw a conclusion. For charset detection,
|
88
88
|
# a certain amount of data is enough
|
89
|
-
return @
|
89
|
+
return @totalChars > ENOUGH_DATA_THRESHOLD
|
90
90
|
end
|
91
91
|
|
92
92
|
def get_order(aStr)
|
@@ -100,9 +100,9 @@ module CharDet
|
|
100
100
|
class EUCTWDistributionAnalysis < CharDistributionAnalysis
|
101
101
|
def initialize
|
102
102
|
super()
|
103
|
-
@
|
104
|
-
@
|
105
|
-
@
|
103
|
+
@charToFreqOrder = EUCTWCharToFreqOrder
|
104
|
+
@tableSize = EUCTW_TABLE_SIZE
|
105
|
+
@typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
106
106
|
end
|
107
107
|
|
108
108
|
def get_order(aStr)
|
@@ -110,10 +110,11 @@ module CharDet
|
|
110
110
|
# first byte range: 0xc4 -- 0xfe
|
111
111
|
# second byte range: 0xa1 -- 0xfe
|
112
112
|
# no validation needed here. State machine has done that
|
113
|
-
if aStr[0
|
114
|
-
|
113
|
+
if aStr[0, 1] >= "\xC4"
|
114
|
+
bytes = aStr.bytes
|
115
|
+
return 94 * (bytes[0] - 0xC4) + bytes[1] - 0xA1
|
115
116
|
else
|
116
|
-
|
117
|
+
return -1
|
117
118
|
end
|
118
119
|
end
|
119
120
|
end
|
@@ -121,9 +122,9 @@ module CharDet
|
|
121
122
|
class EUCKRDistributionAnalysis < CharDistributionAnalysis
|
122
123
|
def initialize
|
123
124
|
super()
|
124
|
-
@
|
125
|
-
@
|
126
|
-
@
|
125
|
+
@charToFreqOrder = EUCKRCharToFreqOrder
|
126
|
+
@tableSize = EUCKR_TABLE_SIZE
|
127
|
+
@typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
127
128
|
end
|
128
129
|
|
129
130
|
def get_order(aStr)
|
@@ -131,10 +132,11 @@ module CharDet
|
|
131
132
|
# first byte range: 0xb0 -- 0xfe
|
132
133
|
# second byte range: 0xa1 -- 0xfe
|
133
134
|
# no validation needed here. State machine has done that
|
134
|
-
if aStr[0
|
135
|
-
|
135
|
+
if aStr[0, 1] >= "\xB0"
|
136
|
+
bytes = aStr.bytes
|
137
|
+
return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
|
136
138
|
else
|
137
|
-
|
139
|
+
return -1
|
138
140
|
end
|
139
141
|
end
|
140
142
|
end
|
@@ -142,9 +144,9 @@ module CharDet
|
|
142
144
|
class GB2312DistributionAnalysis < CharDistributionAnalysis
|
143
145
|
def initialize
|
144
146
|
super()
|
145
|
-
@
|
146
|
-
@
|
147
|
-
@
|
147
|
+
@charToFreqOrder = GB2312CharToFreqOrder
|
148
|
+
@tableSize = GB2312_TABLE_SIZE
|
149
|
+
@typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
148
150
|
end
|
149
151
|
|
150
152
|
def get_order(aStr)
|
@@ -152,10 +154,11 @@ module CharDet
|
|
152
154
|
# first byte range: 0xb0 -- 0xfe
|
153
155
|
# second byte range: 0xa1 -- 0xfe
|
154
156
|
# no validation needed here. State machine has done that
|
155
|
-
if (aStr[0
|
156
|
-
|
157
|
+
if (aStr[0, 1] >= "\xB0") and (aStr[1, 1] >= "\xA1")
|
158
|
+
bytes = aStr.bytes
|
159
|
+
return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
|
157
160
|
else
|
158
|
-
|
161
|
+
return -1
|
159
162
|
end
|
160
163
|
end
|
161
164
|
end
|
@@ -163,9 +166,9 @@ module CharDet
|
|
163
166
|
class Big5DistributionAnalysis < CharDistributionAnalysis
|
164
167
|
def initialize
|
165
168
|
super
|
166
|
-
@
|
167
|
-
@
|
168
|
-
@
|
169
|
+
@charToFreqOrder = Big5CharToFreqOrder
|
170
|
+
@tableSize = BIG5_TABLE_SIZE
|
171
|
+
@typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
169
172
|
end
|
170
173
|
|
171
174
|
def get_order(aStr)
|
@@ -173,14 +176,15 @@ module CharDet
|
|
173
176
|
# first byte range: 0xa4 -- 0xfe
|
174
177
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
175
178
|
# no validation needed here. State machine has done that
|
176
|
-
if aStr[0
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
179
|
+
if aStr[0, 1] >= "\xA4"
|
180
|
+
bytes = aStr.bytes
|
181
|
+
if aStr[1, 1] >= "\xA1"
|
182
|
+
return 157 * (bytes[0] - 0xA4) + bytes[1] - 0xA1 + 63
|
183
|
+
else
|
184
|
+
return 157 * (bytes[0] - 0xA4) + bytes[1] - 0x40
|
185
|
+
end
|
182
186
|
else
|
183
|
-
|
187
|
+
return -1
|
184
188
|
end
|
185
189
|
end
|
186
190
|
end
|
@@ -188,9 +192,9 @@ module CharDet
|
|
188
192
|
class SJISDistributionAnalysis < CharDistributionAnalysis
|
189
193
|
def initialize
|
190
194
|
super()
|
191
|
-
@
|
192
|
-
@
|
193
|
-
@
|
195
|
+
@charToFreqOrder = JISCharToFreqOrder
|
196
|
+
@tableSize = JIS_TABLE_SIZE
|
197
|
+
@typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
194
198
|
end
|
195
199
|
|
196
200
|
def get_order(aStr)
|
@@ -198,17 +202,17 @@ module CharDet
|
|
198
202
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
199
203
|
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
200
204
|
# no validation needed here. State machine has done that
|
201
|
-
|
202
|
-
if (aStr[0
|
203
|
-
|
204
|
-
elsif (aStr[0
|
205
|
-
|
205
|
+
bytes = aStr.bytes
|
206
|
+
if (aStr[0, 1] >= "\x81") and (aStr[0, 1] <= "\x9F")
|
207
|
+
order = 188 * (bytes[0] - 0x81)
|
208
|
+
elsif (aStr[0, 1] >= "\xE0") and (aStr[0, 1] <= "\xEF")
|
209
|
+
order = 188 * (bytes[0] - 0xE0 + 31)
|
206
210
|
else
|
207
|
-
|
211
|
+
return -1
|
208
212
|
end
|
209
|
-
order = order +
|
210
|
-
if aStr[1
|
211
|
-
|
213
|
+
order = order + bytes[1] - 0x40
|
214
|
+
if aStr[1, 1] > "\x7F"
|
215
|
+
order =- 1
|
212
216
|
end
|
213
217
|
return order
|
214
218
|
end
|
@@ -217,9 +221,9 @@ module CharDet
|
|
217
221
|
class EUCJPDistributionAnalysis < CharDistributionAnalysis
|
218
222
|
def initialize
|
219
223
|
super()
|
220
|
-
@
|
221
|
-
@
|
222
|
-
@
|
224
|
+
@charToFreqOrder = JISCharToFreqOrder
|
225
|
+
@tableSize = JIS_TABLE_SIZE
|
226
|
+
@typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
223
227
|
end
|
224
228
|
|
225
229
|
def get_order(aStr)
|
@@ -227,8 +231,9 @@ module CharDet
|
|
227
231
|
# first byte range: 0xa0 -- 0xfe
|
228
232
|
# second byte range: 0xa1 -- 0xfe
|
229
233
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0
|
231
|
-
|
234
|
+
if aStr[0, 1] >= "\xA0"
|
235
|
+
bytes = aStr.bytes
|
236
|
+
return 94 * (bytes[0] - 0xA1) + bytes[1] - 0xa1
|
232
237
|
else
|
233
238
|
return -1
|
234
239
|
end
|
@@ -28,54 +28,55 @@
|
|
28
28
|
|
29
29
|
module CharDet
|
30
30
|
class CharSetGroupProber < CharSetProber
|
31
|
-
attr_accessor :
|
31
|
+
attr_accessor :probers
|
32
32
|
def initialize
|
33
33
|
super
|
34
|
-
@
|
35
|
-
@
|
36
|
-
@
|
34
|
+
@activeNum = 0
|
35
|
+
@probers = []
|
36
|
+
@bestGuessProber = nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def reset
|
40
40
|
super
|
41
|
-
@
|
41
|
+
@activeNum = 0
|
42
42
|
|
43
|
-
for prober in @
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
for prober in @probers
|
44
|
+
if prober
|
45
|
+
prober.reset()
|
46
|
+
prober.active = true
|
47
|
+
@activeNum += 1
|
48
|
+
end
|
49
49
|
end
|
50
|
-
@
|
50
|
+
@bestGuessProber = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
def get_charset_name
|
54
|
-
if
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
if !@bestGuessProber
|
55
|
+
get_confidence()
|
56
|
+
if !@bestGuessProber
|
57
|
+
return nil
|
58
|
+
end
|
58
59
|
end
|
59
|
-
return @
|
60
|
+
return @bestGuessProber.get_charset_name()
|
60
61
|
end
|
61
62
|
|
62
63
|
def feed(aBuf)
|
63
|
-
for prober in @
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
64
|
+
for prober in @probers
|
65
|
+
next unless prober
|
66
|
+
next unless prober.active
|
67
|
+
st = prober.feed(aBuf)
|
68
|
+
next unless st
|
69
|
+
if st == EFoundIt
|
70
|
+
@bestGuessProber = prober
|
71
|
+
return get_state()
|
72
|
+
elsif st == ENotMe
|
73
|
+
prober.active = false
|
74
|
+
@activeNum -= 1
|
75
|
+
if @activeNum <= 0
|
76
|
+
@state = ENotMe
|
77
|
+
return get_state()
|
78
|
+
end
|
79
|
+
end
|
79
80
|
end
|
80
81
|
return get_state()
|
81
82
|
end
|
@@ -83,30 +84,27 @@ module CharDet
|
|
83
84
|
def get_confidence()
|
84
85
|
st = get_state()
|
85
86
|
if st == EFoundIt
|
86
|
-
|
87
|
+
return 0.99
|
87
88
|
elsif st == ENotMe
|
88
|
-
|
89
|
+
return 0.01
|
89
90
|
end
|
90
91
|
bestConf = 0.0
|
91
|
-
@
|
92
|
-
for prober in @
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
92
|
+
@bestGuessProber = nil
|
93
|
+
for prober in @probers
|
94
|
+
next unless prober
|
95
|
+
unless prober.active
|
96
|
+
$stderr << "#{prober.get_charset_name()} not active\n" if $debug
|
97
|
+
next
|
98
|
+
end
|
99
|
+
cf = prober.get_confidence()
|
100
|
+
$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
|
101
|
+
if bestConf < cf
|
102
|
+
bestConf = cf
|
103
|
+
@bestGuessProber = prober
|
104
|
+
end
|
104
105
|
end
|
105
|
-
return 0.0 unless @
|
106
|
+
return 0.0 unless @bestGuessProber
|
106
107
|
return bestConf
|
107
|
-
# else:
|
108
|
-
# self._mBestGuessProber = self._mProbers[0]
|
109
|
-
# return self._mBestGuessProber.get_confidence()
|
110
108
|
end
|
111
109
|
end
|
112
110
|
end
|
@@ -34,7 +34,7 @@ module CharDet
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def reset
|
37
|
-
@
|
37
|
+
@state = EDetecting
|
38
38
|
end
|
39
39
|
|
40
40
|
def get_charset_name
|
@@ -45,7 +45,7 @@ module CharDet
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def get_state
|
48
|
-
return @
|
48
|
+
return @state
|
49
49
|
end
|
50
50
|
|
51
51
|
def get_confidence
|
@@ -53,11 +53,6 @@ module CharDet
|
|
53
53
|
end
|
54
54
|
|
55
55
|
def filter_high_bit_only(aBuf)
|
56
|
-
# DO NOT USE `gsub!`
|
57
|
-
# It will remove all characters from the buffer that is later used by
|
58
|
-
# other probers. This is because gsub! removes data from the instance variable
|
59
|
-
# that will be passed to later probers, while gsub makes a new instance variable
|
60
|
-
# that will not.
|
61
56
|
newBuf = aBuf.gsub(/([\x00-\x7F])+/, ' ')
|
62
57
|
return newBuf
|
63
58
|
end
|