rchardet 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
@@ -51,7 +51,7 @@ module CharDet
|
|
51
51
|
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0
|
52
52
|
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0
|
53
53
|
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
54
|
-
]
|
54
|
+
].freeze
|
55
55
|
|
56
56
|
Win1253_CharToOrderMap = [
|
57
57
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
@@ -70,7 +70,7 @@ module CharDet
|
|
70
70
|
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0
|
71
71
|
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0
|
72
72
|
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
73
|
-
]
|
73
|
+
].freeze
|
74
74
|
|
75
75
|
# Model Table:
|
76
76
|
# total sequences: 100%
|
@@ -207,7 +207,7 @@ module CharDet
|
|
207
207
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
208
208
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
209
209
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
210
|
-
]
|
210
|
+
].freeze
|
211
211
|
|
212
212
|
Latin7GreekModel = {
|
213
213
|
'charToOrderMap' => Latin7_CharToOrderMap,
|
@@ -215,7 +215,7 @@ module CharDet
|
|
215
215
|
'mTypicalPositiveRatio' => 0.982851,
|
216
216
|
'keepEnglishLetter' => false,
|
217
217
|
'charsetName' => "ISO-8859-7"
|
218
|
-
}
|
218
|
+
}.freeze
|
219
219
|
|
220
220
|
Win1253GreekModel = {
|
221
221
|
'charToOrderMap' => Win1253_CharToOrderMap,
|
@@ -223,5 +223,5 @@ module CharDet
|
|
223
223
|
'mTypicalPositiveRatio' => 0.982851,
|
224
224
|
'keepEnglishLetter' => false,
|
225
225
|
'charsetName' => "windows-1253"
|
226
|
-
}
|
226
|
+
}.freeze
|
227
227
|
end
|
@@ -53,7 +53,7 @@ Win1255_CharToOrderMap = [
|
|
53
53
|
238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,
|
54
54
|
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
|
55
55
|
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
|
56
|
-
]
|
56
|
+
].freeze
|
57
57
|
|
58
58
|
# Model Table:
|
59
59
|
# total sequences: 100%
|
@@ -190,7 +190,7 @@ HebrewLangModel = [
|
|
190
190
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
191
191
|
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
192
192
|
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
193
|
-
]
|
193
|
+
].freeze
|
194
194
|
|
195
195
|
Win1255HebrewModel = {
|
196
196
|
'charToOrderMap' => Win1255_CharToOrderMap,
|
@@ -198,5 +198,5 @@ Win1255HebrewModel = {
|
|
198
198
|
'mTypicalPositiveRatio' => 0.984004,
|
199
199
|
'keepEnglishLetter' => false,
|
200
200
|
'charsetName' => "windows-1255"
|
201
|
-
}
|
201
|
+
}.freeze
|
202
202
|
end
|
@@ -50,7 +50,7 @@ Latin2_HungarianCharToOrderMap = [
|
|
50
50
|
232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,
|
51
51
|
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
|
52
52
|
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
53
|
-
]
|
53
|
+
].freeze
|
54
54
|
|
55
55
|
Win1250HungarianCharToOrderMap = [
|
56
56
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
@@ -69,7 +69,7 @@ Win1250HungarianCharToOrderMap = [
|
|
69
69
|
232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,
|
70
70
|
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
|
71
71
|
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
72
|
-
]
|
72
|
+
].freeze
|
73
73
|
|
74
74
|
# Model Table:
|
75
75
|
# total sequences: 100%
|
@@ -206,7 +206,7 @@ HungarianLangModel = [
|
|
206
206
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
207
207
|
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
208
208
|
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
209
|
-
]
|
209
|
+
].freeze
|
210
210
|
|
211
211
|
Latin2HungarianModel = {
|
212
212
|
'charToOrderMap' => Latin2_HungarianCharToOrderMap,
|
@@ -214,7 +214,7 @@ Latin2HungarianModel = {
|
|
214
214
|
'mTypicalPositiveRatio' => 0.947368,
|
215
215
|
'keepEnglishLetter' => true,
|
216
216
|
'charsetName' => "ISO-8859-2"
|
217
|
-
}
|
217
|
+
}.freeze
|
218
218
|
|
219
219
|
Win1250HungarianModel = {
|
220
220
|
'charToOrderMap' => Win1250HungarianCharToOrderMap,
|
@@ -222,5 +222,5 @@ Win1250HungarianModel = {
|
|
222
222
|
'mTypicalPositiveRatio' => 0.947368,
|
223
223
|
'keepEnglishLetter' => true,
|
224
224
|
'charsetName' => "windows-1250"
|
225
|
-
}
|
225
|
+
}.freeze
|
226
226
|
end
|
@@ -52,7 +52,7 @@ TIS620CharToOrderMap = [
|
|
52
52
|
22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244,
|
53
53
|
11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247,
|
54
54
|
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
|
55
|
-
]
|
55
|
+
].freeze
|
56
56
|
|
57
57
|
# Model Table:
|
58
58
|
# total sequences: 100%
|
@@ -189,7 +189,7 @@ ThaiLangModel = [
|
|
189
189
|
0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
190
190
|
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
191
191
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
192
|
-
]
|
192
|
+
].freeze
|
193
193
|
|
194
194
|
TIS620ThaiModel = {
|
195
195
|
'charToOrderMap' => TIS620CharToOrderMap,
|
@@ -197,5 +197,5 @@ TIS620ThaiModel = {
|
|
197
197
|
'mTypicalPositiveRatio' => 0.926386,
|
198
198
|
'keepEnglishLetter' => false,
|
199
199
|
'charsetName' => "TIS-620"
|
200
|
-
}
|
200
|
+
}.freeze
|
201
201
|
end
|
@@ -73,7 +73,7 @@ module CharDet
|
|
73
73
|
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
74
74
|
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
75
75
|
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
76
|
-
]
|
76
|
+
].freeze
|
77
77
|
|
78
78
|
# 0 : illegal
|
79
79
|
# 1 : very unlikely
|
@@ -89,7 +89,7 @@ module CharDet
|
|
89
89
|
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
90
90
|
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
91
91
|
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
92
|
-
]
|
92
|
+
].freeze
|
93
93
|
|
94
94
|
class Latin1Prober < CharSetProber
|
95
95
|
def initialize
|
@@ -98,8 +98,8 @@ module CharDet
|
|
98
98
|
end
|
99
99
|
|
100
100
|
def reset
|
101
|
-
@
|
102
|
-
@
|
101
|
+
@lastCharClass = OTH
|
102
|
+
@freqCounter = [0] * FREQ_CAT_NUM
|
103
103
|
super
|
104
104
|
end
|
105
105
|
|
@@ -110,15 +110,15 @@ module CharDet
|
|
110
110
|
def feed(aBuf)
|
111
111
|
aBuf = filter_with_english_letters(aBuf)
|
112
112
|
aBuf.each_byte do |b|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
113
|
+
c = b.chr
|
114
|
+
charClass = Latin1_CharToClass[c.bytes.first]
|
115
|
+
freq = Latin1ClassModel[(@lastCharClass * CLASS_NUM) + charClass]
|
116
|
+
if freq == 0
|
117
|
+
@state = ENotMe
|
118
|
+
break
|
119
|
+
end
|
120
|
+
@freqCounter[freq] += 1
|
121
|
+
@lastCharClass = charClass
|
122
122
|
end
|
123
123
|
|
124
124
|
return get_state()
|
@@ -126,17 +126,17 @@ module CharDet
|
|
126
126
|
|
127
127
|
def get_confidence
|
128
128
|
if get_state() == ENotMe
|
129
|
-
|
129
|
+
return 0.01
|
130
130
|
end
|
131
131
|
|
132
|
-
total = @
|
132
|
+
total = @freqCounter.inject{|a,b| a+b}
|
133
133
|
if total < 0.01
|
134
|
-
|
134
|
+
confidence = 0.0
|
135
135
|
else
|
136
|
-
|
136
|
+
confidence = (@freqCounter[3] / total) - (@freqCounter[1] * 20.0 / total)
|
137
137
|
end
|
138
138
|
if confidence < 0.0
|
139
|
-
|
139
|
+
confidence = 0.0
|
140
140
|
end
|
141
141
|
# lower the confidence of latin1 so that other more accurate detector
|
142
142
|
# can take priority.
|
@@ -32,20 +32,20 @@ module CharDet
|
|
32
32
|
class MultiByteCharSetProber < CharSetProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@
|
36
|
-
@
|
37
|
-
@
|
35
|
+
@distributionAnalyzer = nil
|
36
|
+
@codingSM = nil
|
37
|
+
@lastChar = "\x00\x00"
|
38
38
|
end
|
39
39
|
|
40
40
|
def reset
|
41
41
|
super
|
42
|
-
if @
|
43
|
-
|
42
|
+
if @codingSM
|
43
|
+
@codingSM.reset()
|
44
44
|
end
|
45
|
-
if @
|
46
|
-
|
45
|
+
if @distributionAnalyzer
|
46
|
+
@distributionAnalyzer.reset()
|
47
47
|
end
|
48
|
-
@
|
48
|
+
@lastChar = "\x00\x00"
|
49
49
|
end
|
50
50
|
|
51
51
|
def get_charset_name
|
@@ -54,36 +54,36 @@ module CharDet
|
|
54
54
|
def feed(aBuf)
|
55
55
|
aLen = aBuf.length
|
56
56
|
for i in (0...aLen)
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
57
|
+
codingState = @codingSM.next_state(aBuf[i, 1])
|
58
|
+
if codingState == EError
|
59
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
60
|
+
@state = ENotMe
|
61
|
+
break
|
62
|
+
elsif codingState == EItsMe
|
63
|
+
@state = EFoundIt
|
64
|
+
break
|
65
|
+
elsif codingState == EStart
|
66
|
+
charLen = @codingSM.get_current_charlen()
|
67
|
+
if i == 0
|
68
|
+
@lastChar[1] = aBuf[0, 1]
|
69
|
+
@distributionAnalyzer.feed(@lastChar, charLen)
|
70
|
+
else
|
71
|
+
@distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
|
72
|
+
end
|
73
|
+
end
|
74
74
|
end
|
75
|
-
@
|
75
|
+
@lastChar[0] = aBuf[aLen-1, 1]
|
76
76
|
|
77
77
|
if get_state() == EDetecting
|
78
|
-
|
79
|
-
|
80
|
-
|
78
|
+
if @distributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
79
|
+
@state = EFoundIt
|
80
|
+
end
|
81
81
|
end
|
82
82
|
return get_state()
|
83
83
|
end
|
84
84
|
|
85
85
|
def get_confidence
|
86
|
-
return @
|
86
|
+
return @distributionAnalyzer.get_confidence()
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
@@ -32,15 +32,15 @@ module CharDet
|
|
32
32
|
class MBCSGroupProber < CharSetGroupProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
35
|
+
@probers = [
|
36
|
+
UTF8Prober.new,
|
37
|
+
SJISProber.new,
|
38
|
+
EUCJPProber.new,
|
39
|
+
GB2312Prober.new,
|
40
|
+
EUCKRProber.new,
|
41
|
+
Big5Prober.new,
|
42
|
+
EUCTWProber.new
|
43
|
+
]
|
44
44
|
reset()
|
45
45
|
end
|
46
46
|
end
|
data/lib/rchardet/mbcssm.rb
CHANGED
@@ -62,22 +62,22 @@ module CharDet
|
|
62
62
|
3,3,3,3,3,3,3,3, # e8 - ef
|
63
63
|
3,3,3,3,3,3,3,3, # f0 - f7
|
64
64
|
3,3,3,3,3,3,3,0 # f8 - ff
|
65
|
-
]
|
65
|
+
].freeze
|
66
66
|
|
67
67
|
BIG5_st = [
|
68
68
|
EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
|
69
69
|
EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
|
70
70
|
EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
|
71
|
-
]
|
71
|
+
].freeze
|
72
72
|
|
73
|
-
Big5CharLenTable = [0, 1, 1, 2, 0]
|
73
|
+
Big5CharLenTable = [0, 1, 1, 2, 0].freeze
|
74
74
|
|
75
75
|
Big5SMModel = {'classTable' => BIG5_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}
|
76
|
+
'classFactor' => 5,
|
77
|
+
'stateTable' => BIG5_st,
|
78
|
+
'charLenTable' => Big5CharLenTable,
|
79
|
+
'name' => 'Big5'
|
80
|
+
}.freeze
|
81
81
|
|
82
82
|
# EUC-JP
|
83
83
|
|
@@ -114,7 +114,7 @@ module CharDet
|
|
114
114
|
0,0,0,0,0,0,0,0, # e8 - ef
|
115
115
|
0,0,0,0,0,0,0,0, # f0 - f7
|
116
116
|
0,0,0,0,0,0,0,5 # f8 - ff
|
117
|
-
]
|
117
|
+
].freeze
|
118
118
|
|
119
119
|
EUCJP_st = [
|
120
120
|
3, 4, 3, 5,EStart,EError,EError,EError,#00-07
|
@@ -122,16 +122,16 @@ module CharDet
|
|
122
122
|
EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
|
123
123
|
EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
|
124
124
|
3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
|
125
|
-
]
|
125
|
+
].freeze
|
126
126
|
|
127
|
-
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
|
127
|
+
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0].freeze
|
128
128
|
|
129
129
|
EUCJPSMModel = {'classTable' => EUCJP_cls,
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
}
|
130
|
+
'classFactor' => 6,
|
131
|
+
'stateTable' => EUCJP_st,
|
132
|
+
'charLenTable' => EUCJPCharLenTable,
|
133
|
+
'name' => 'EUC-JP'
|
134
|
+
}.freeze
|
135
135
|
|
136
136
|
# EUC-KR
|
137
137
|
|
@@ -168,21 +168,21 @@ module CharDet
|
|
168
168
|
2,2,2,2,2,2,2,2, # e8 - ef
|
169
169
|
2,2,2,2,2,2,2,2, # f0 - f7
|
170
170
|
2,2,2,2,2,2,2,0 # f8 - ff
|
171
|
-
]
|
171
|
+
].freeze
|
172
172
|
|
173
173
|
EUCKR_st = [
|
174
174
|
EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
|
175
175
|
EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
|
176
|
-
]
|
176
|
+
].freeze
|
177
177
|
|
178
|
-
EUCKRCharLenTable = [0, 1, 2, 0]
|
178
|
+
EUCKRCharLenTable = [0, 1, 2, 0].freeze
|
179
179
|
|
180
180
|
EUCKRSMModel = {'classTable' => EUCKR_cls,
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
}
|
181
|
+
'classFactor' => 4,
|
182
|
+
'stateTable' => EUCKR_st,
|
183
|
+
'charLenTable' => EUCKRCharLenTable,
|
184
|
+
'name' => 'EUC-KR'
|
185
|
+
}.freeze
|
186
186
|
|
187
187
|
# EUC-TW
|
188
188
|
|
@@ -219,7 +219,7 @@ module CharDet
|
|
219
219
|
3,3,3,3,3,3,3,3, # e8 - ef
|
220
220
|
3,3,3,3,3,3,3,3, # f0 - f7
|
221
221
|
3,3,3,3,3,3,3,0 # f8 - ff
|
222
|
-
]
|
222
|
+
].freeze
|
223
223
|
|
224
224
|
EUCTW_st = [
|
225
225
|
EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
|
@@ -228,16 +228,16 @@ module CharDet
|
|
228
228
|
EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
|
229
229
|
5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
|
230
230
|
EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
|
231
|
-
]
|
231
|
+
].freeze
|
232
232
|
|
233
|
-
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
|
233
|
+
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3].freeze
|
234
234
|
|
235
235
|
EUCTWSMModel = {'classTable' => EUCTW_cls,
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
}
|
236
|
+
'classFactor' => 7,
|
237
|
+
'stateTable' => EUCTW_st,
|
238
|
+
'charLenTable' => EUCTWCharLenTable,
|
239
|
+
'name' => 'x-euc-tw'
|
240
|
+
}.freeze
|
241
241
|
|
242
242
|
# GB2312
|
243
243
|
|
@@ -274,7 +274,7 @@ module CharDet
|
|
274
274
|
6,6,6,6,6,6,6,6, # e8 - ef
|
275
275
|
6,6,6,6,6,6,6,6, # f0 - f7
|
276
276
|
6,6,6,6,6,6,6,0 # f8 - ff
|
277
|
-
]
|
277
|
+
].freeze
|
278
278
|
|
279
279
|
GB2312_st = [
|
280
280
|
EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
|
@@ -283,21 +283,21 @@ module CharDet
|
|
283
283
|
4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
|
284
284
|
EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
|
285
285
|
EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
|
286
|
-
]
|
286
|
+
].freeze
|
287
287
|
|
288
288
|
# To be accurate, the length of class 6 can be either 2 or 4.
|
289
289
|
# But it is not necessary to discriminate between the two since
|
290
290
|
# it is used for frequency analysis only, and we are validing
|
291
291
|
# each code range there as well. So it is safe to set it to be
|
292
292
|
# 2 here.
|
293
|
-
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
|
293
|
+
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2].freeze
|
294
294
|
|
295
295
|
GB2312SMModel = {'classTable' => GB2312_cls,
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
}
|
296
|
+
'classFactor' => 7,
|
297
|
+
'stateTable' => GB2312_st,
|
298
|
+
'charLenTable' => GB2312CharLenTable,
|
299
|
+
'name' => 'GB2312'
|
300
|
+
}.freeze
|
301
301
|
|
302
302
|
# Shift_JIS
|
303
303
|
|
@@ -336,22 +336,22 @@ module CharDet
|
|
336
336
|
3,3,3,3,3,4,4,4, # e8 - ef
|
337
337
|
4,4,4,4,4,4,4,4, # f0 - f7
|
338
338
|
4,4,4,4,4,0,0,0 # f8 - ff
|
339
|
-
]
|
339
|
+
].freeze
|
340
340
|
|
341
341
|
SJIS_st = [
|
342
342
|
EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
|
343
343
|
EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
|
344
344
|
EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
|
345
|
-
]
|
345
|
+
].freeze
|
346
346
|
|
347
|
-
SJISCharLenTable = [0, 1, 1, 2, 0, 0]
|
347
|
+
SJISCharLenTable = [0, 1, 1, 2, 0, 0].freeze
|
348
348
|
|
349
349
|
SJISSMModel = {'classTable' => SJIS_cls,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
}
|
350
|
+
'classFactor' => 6,
|
351
|
+
'stateTable' => SJIS_st,
|
352
|
+
'charLenTable' => SJISCharLenTable,
|
353
|
+
'name' => 'Shift_JIS'
|
354
|
+
}.freeze
|
355
355
|
|
356
356
|
# UCS2-BE
|
357
357
|
|
@@ -388,7 +388,7 @@ module CharDet
|
|
388
388
|
0,0,0,0,0,0,0,0, # e8 - ef
|
389
389
|
0,0,0,0,0,0,0,0, # f0 - f7
|
390
390
|
0,0,0,0,0,0,4,5 # f8 - ff
|
391
|
-
]
|
391
|
+
].freeze
|
392
392
|
|
393
393
|
UCS2BE_st = [
|
394
394
|
5, 7, 7,EError, 4, 3,EError,EError,#00-07
|
@@ -398,16 +398,16 @@ module CharDet
|
|
398
398
|
6, 6, 6, 6, 5, 7, 7,EError,#20-27
|
399
399
|
5, 8, 6, 6,EError, 6, 6, 6,#28-2f
|
400
400
|
6, 6, 6, 6,EError,EError,EStart,EStart#30-37
|
401
|
-
]
|
401
|
+
].freeze
|
402
402
|
|
403
|
-
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
|
403
|
+
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2].freeze
|
404
404
|
|
405
405
|
UCS2BESMModel = {'classTable' => UCS2BE_cls,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
}
|
406
|
+
'classFactor' => 6,
|
407
|
+
'stateTable' => UCS2BE_st,
|
408
|
+
'charLenTable' => UCS2BECharLenTable,
|
409
|
+
'name' => 'UTF-16BE'
|
410
|
+
}.freeze
|
411
411
|
|
412
412
|
# UCS2-LE
|
413
413
|
|
@@ -444,7 +444,7 @@ module CharDet
|
|
444
444
|
0,0,0,0,0,0,0,0, # e8 - ef
|
445
445
|
0,0,0,0,0,0,0,0, # f0 - f7
|
446
446
|
0,0,0,0,0,0,4,5 # f8 - ff
|
447
|
-
]
|
447
|
+
].freeze
|
448
448
|
|
449
449
|
UCS2LE_st = [
|
450
450
|
6, 6, 7, 6, 4, 3,EError,EError,#00-07
|
@@ -454,16 +454,16 @@ module CharDet
|
|
454
454
|
7, 6, 8, 8, 5, 5, 5,EError,#20-27
|
455
455
|
5, 5, 5,EError,EError,EError, 5, 5,#28-2f
|
456
456
|
5, 5, 5,EError, 5,EError,EStart,EStart#30-37
|
457
|
-
]
|
457
|
+
].freeze
|
458
458
|
|
459
|
-
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
|
459
|
+
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2].freeze
|
460
460
|
|
461
461
|
UCS2LESMModel = {'classTable' => UCS2LE_cls,
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
}
|
462
|
+
'classFactor' => 6,
|
463
|
+
'stateTable' => UCS2LE_st,
|
464
|
+
'charLenTable' => UCS2LECharLenTable,
|
465
|
+
'name' => 'UTF-16LE'
|
466
|
+
}.freeze
|
467
467
|
|
468
468
|
# UTF-8
|
469
469
|
|
@@ -500,7 +500,7 @@ module CharDet
|
|
500
500
|
8,8,8,8,8,9,8,8, # e8 - ef
|
501
501
|
10,11,11,11,11,11,11,11, # f0 - f7
|
502
502
|
12,13,13,13,14,15,0,0 # f8 - ff
|
503
|
-
]
|
503
|
+
].freeze
|
504
504
|
|
505
505
|
UTF8_st = [
|
506
506
|
EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
|
@@ -529,14 +529,14 @@ module CharDet
|
|
529
529
|
EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
|
530
530
|
EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
|
531
531
|
EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
|
532
|
-
]
|
532
|
+
].freeze
|
533
533
|
|
534
|
-
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
|
534
|
+
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6].freeze
|
535
535
|
|
536
536
|
UTF8SMModel = {'classTable' => UTF8_cls,
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
}
|
537
|
+
'classFactor' => 16,
|
538
|
+
'stateTable' => UTF8_st,
|
539
|
+
'charLenTable' => UTF8CharLenTable,
|
540
|
+
'name' => 'UTF-8'
|
541
|
+
}.freeze
|
542
542
|
end
|