rchardet 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
@@ -51,7 +51,7 @@ module CharDet
|
|
51
51
|
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0
|
52
52
|
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0
|
53
53
|
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
54
|
-
]
|
54
|
+
].freeze
|
55
55
|
|
56
56
|
Win1253_CharToOrderMap = [
|
57
57
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
@@ -70,7 +70,7 @@ module CharDet
|
|
70
70
|
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0
|
71
71
|
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0
|
72
72
|
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
|
73
|
-
]
|
73
|
+
].freeze
|
74
74
|
|
75
75
|
# Model Table:
|
76
76
|
# total sequences: 100%
|
@@ -207,7 +207,7 @@ module CharDet
|
|
207
207
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
208
208
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
209
209
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
210
|
-
]
|
210
|
+
].freeze
|
211
211
|
|
212
212
|
Latin7GreekModel = {
|
213
213
|
'charToOrderMap' => Latin7_CharToOrderMap,
|
@@ -215,7 +215,7 @@ module CharDet
|
|
215
215
|
'mTypicalPositiveRatio' => 0.982851,
|
216
216
|
'keepEnglishLetter' => false,
|
217
217
|
'charsetName' => "ISO-8859-7"
|
218
|
-
}
|
218
|
+
}.freeze
|
219
219
|
|
220
220
|
Win1253GreekModel = {
|
221
221
|
'charToOrderMap' => Win1253_CharToOrderMap,
|
@@ -223,5 +223,5 @@ module CharDet
|
|
223
223
|
'mTypicalPositiveRatio' => 0.982851,
|
224
224
|
'keepEnglishLetter' => false,
|
225
225
|
'charsetName' => "windows-1253"
|
226
|
-
}
|
226
|
+
}.freeze
|
227
227
|
end
|
@@ -53,7 +53,7 @@ Win1255_CharToOrderMap = [
|
|
53
53
|
238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,
|
54
54
|
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
|
55
55
|
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
|
56
|
-
]
|
56
|
+
].freeze
|
57
57
|
|
58
58
|
# Model Table:
|
59
59
|
# total sequences: 100%
|
@@ -190,7 +190,7 @@ HebrewLangModel = [
|
|
190
190
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
191
191
|
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
192
192
|
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
193
|
-
]
|
193
|
+
].freeze
|
194
194
|
|
195
195
|
Win1255HebrewModel = {
|
196
196
|
'charToOrderMap' => Win1255_CharToOrderMap,
|
@@ -198,5 +198,5 @@ Win1255HebrewModel = {
|
|
198
198
|
'mTypicalPositiveRatio' => 0.984004,
|
199
199
|
'keepEnglishLetter' => false,
|
200
200
|
'charsetName' => "windows-1255"
|
201
|
-
}
|
201
|
+
}.freeze
|
202
202
|
end
|
@@ -50,7 +50,7 @@ Latin2_HungarianCharToOrderMap = [
|
|
50
50
|
232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,
|
51
51
|
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
|
52
52
|
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
53
|
-
]
|
53
|
+
].freeze
|
54
54
|
|
55
55
|
Win1250HungarianCharToOrderMap = [
|
56
56
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
|
@@ -69,7 +69,7 @@ Win1250HungarianCharToOrderMap = [
|
|
69
69
|
232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,
|
70
70
|
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
|
71
71
|
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
72
|
-
]
|
72
|
+
].freeze
|
73
73
|
|
74
74
|
# Model Table:
|
75
75
|
# total sequences: 100%
|
@@ -206,7 +206,7 @@ HungarianLangModel = [
|
|
206
206
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
207
207
|
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
208
208
|
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
209
|
-
]
|
209
|
+
].freeze
|
210
210
|
|
211
211
|
Latin2HungarianModel = {
|
212
212
|
'charToOrderMap' => Latin2_HungarianCharToOrderMap,
|
@@ -214,7 +214,7 @@ Latin2HungarianModel = {
|
|
214
214
|
'mTypicalPositiveRatio' => 0.947368,
|
215
215
|
'keepEnglishLetter' => true,
|
216
216
|
'charsetName' => "ISO-8859-2"
|
217
|
-
}
|
217
|
+
}.freeze
|
218
218
|
|
219
219
|
Win1250HungarianModel = {
|
220
220
|
'charToOrderMap' => Win1250HungarianCharToOrderMap,
|
@@ -222,5 +222,5 @@ Win1250HungarianModel = {
|
|
222
222
|
'mTypicalPositiveRatio' => 0.947368,
|
223
223
|
'keepEnglishLetter' => true,
|
224
224
|
'charsetName' => "windows-1250"
|
225
|
-
}
|
225
|
+
}.freeze
|
226
226
|
end
|
@@ -52,7 +52,7 @@ TIS620CharToOrderMap = [
|
|
52
52
|
22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244,
|
53
53
|
11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247,
|
54
54
|
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
|
55
|
-
]
|
55
|
+
].freeze
|
56
56
|
|
57
57
|
# Model Table:
|
58
58
|
# total sequences: 100%
|
@@ -189,7 +189,7 @@ ThaiLangModel = [
|
|
189
189
|
0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
190
190
|
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
191
191
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
192
|
-
]
|
192
|
+
].freeze
|
193
193
|
|
194
194
|
TIS620ThaiModel = {
|
195
195
|
'charToOrderMap' => TIS620CharToOrderMap,
|
@@ -197,5 +197,5 @@ TIS620ThaiModel = {
|
|
197
197
|
'mTypicalPositiveRatio' => 0.926386,
|
198
198
|
'keepEnglishLetter' => false,
|
199
199
|
'charsetName' => "TIS-620"
|
200
|
-
}
|
200
|
+
}.freeze
|
201
201
|
end
|
@@ -73,7 +73,7 @@ module CharDet
|
|
73
73
|
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
|
74
74
|
ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
|
75
75
|
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
|
76
|
-
]
|
76
|
+
].freeze
|
77
77
|
|
78
78
|
# 0 : illegal
|
79
79
|
# 1 : very unlikely
|
@@ -89,7 +89,7 @@ module CharDet
|
|
89
89
|
0, 3, 3, 3, 3, 3, 3, 3, # ACO
|
90
90
|
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
91
91
|
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
92
|
-
]
|
92
|
+
].freeze
|
93
93
|
|
94
94
|
class Latin1Prober < CharSetProber
|
95
95
|
def initialize
|
@@ -98,8 +98,8 @@ module CharDet
|
|
98
98
|
end
|
99
99
|
|
100
100
|
def reset
|
101
|
-
@
|
102
|
-
@
|
101
|
+
@lastCharClass = OTH
|
102
|
+
@freqCounter = [0] * FREQ_CAT_NUM
|
103
103
|
super
|
104
104
|
end
|
105
105
|
|
@@ -110,15 +110,15 @@ module CharDet
|
|
110
110
|
def feed(aBuf)
|
111
111
|
aBuf = filter_with_english_letters(aBuf)
|
112
112
|
aBuf.each_byte do |b|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
113
|
+
c = b.chr
|
114
|
+
charClass = Latin1_CharToClass[c.bytes.first]
|
115
|
+
freq = Latin1ClassModel[(@lastCharClass * CLASS_NUM) + charClass]
|
116
|
+
if freq == 0
|
117
|
+
@state = ENotMe
|
118
|
+
break
|
119
|
+
end
|
120
|
+
@freqCounter[freq] += 1
|
121
|
+
@lastCharClass = charClass
|
122
122
|
end
|
123
123
|
|
124
124
|
return get_state()
|
@@ -126,17 +126,17 @@ module CharDet
|
|
126
126
|
|
127
127
|
def get_confidence
|
128
128
|
if get_state() == ENotMe
|
129
|
-
|
129
|
+
return 0.01
|
130
130
|
end
|
131
131
|
|
132
|
-
total = @
|
132
|
+
total = @freqCounter.inject{|a,b| a+b}
|
133
133
|
if total < 0.01
|
134
|
-
|
134
|
+
confidence = 0.0
|
135
135
|
else
|
136
|
-
|
136
|
+
confidence = (@freqCounter[3] / total) - (@freqCounter[1] * 20.0 / total)
|
137
137
|
end
|
138
138
|
if confidence < 0.0
|
139
|
-
|
139
|
+
confidence = 0.0
|
140
140
|
end
|
141
141
|
# lower the confidence of latin1 so that other more accurate detector
|
142
142
|
# can take priority.
|
@@ -32,20 +32,20 @@ module CharDet
|
|
32
32
|
class MultiByteCharSetProber < CharSetProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@
|
36
|
-
@
|
37
|
-
@
|
35
|
+
@distributionAnalyzer = nil
|
36
|
+
@codingSM = nil
|
37
|
+
@lastChar = "\x00\x00"
|
38
38
|
end
|
39
39
|
|
40
40
|
def reset
|
41
41
|
super
|
42
|
-
if @
|
43
|
-
|
42
|
+
if @codingSM
|
43
|
+
@codingSM.reset()
|
44
44
|
end
|
45
|
-
if @
|
46
|
-
|
45
|
+
if @distributionAnalyzer
|
46
|
+
@distributionAnalyzer.reset()
|
47
47
|
end
|
48
|
-
@
|
48
|
+
@lastChar = "\x00\x00"
|
49
49
|
end
|
50
50
|
|
51
51
|
def get_charset_name
|
@@ -54,36 +54,36 @@ module CharDet
|
|
54
54
|
def feed(aBuf)
|
55
55
|
aLen = aBuf.length
|
56
56
|
for i in (0...aLen)
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
57
|
+
codingState = @codingSM.next_state(aBuf[i, 1])
|
58
|
+
if codingState == EError
|
59
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
60
|
+
@state = ENotMe
|
61
|
+
break
|
62
|
+
elsif codingState == EItsMe
|
63
|
+
@state = EFoundIt
|
64
|
+
break
|
65
|
+
elsif codingState == EStart
|
66
|
+
charLen = @codingSM.get_current_charlen()
|
67
|
+
if i == 0
|
68
|
+
@lastChar[1] = aBuf[0, 1]
|
69
|
+
@distributionAnalyzer.feed(@lastChar, charLen)
|
70
|
+
else
|
71
|
+
@distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
|
72
|
+
end
|
73
|
+
end
|
74
74
|
end
|
75
|
-
@
|
75
|
+
@lastChar[0] = aBuf[aLen-1, 1]
|
76
76
|
|
77
77
|
if get_state() == EDetecting
|
78
|
-
|
79
|
-
|
80
|
-
|
78
|
+
if @distributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
79
|
+
@state = EFoundIt
|
80
|
+
end
|
81
81
|
end
|
82
82
|
return get_state()
|
83
83
|
end
|
84
84
|
|
85
85
|
def get_confidence
|
86
|
-
return @
|
86
|
+
return @distributionAnalyzer.get_confidence()
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
@@ -32,15 +32,15 @@ module CharDet
|
|
32
32
|
class MBCSGroupProber < CharSetGroupProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
35
|
+
@probers = [
|
36
|
+
UTF8Prober.new,
|
37
|
+
SJISProber.new,
|
38
|
+
EUCJPProber.new,
|
39
|
+
GB2312Prober.new,
|
40
|
+
EUCKRProber.new,
|
41
|
+
Big5Prober.new,
|
42
|
+
EUCTWProber.new
|
43
|
+
]
|
44
44
|
reset()
|
45
45
|
end
|
46
46
|
end
|
data/lib/rchardet/mbcssm.rb
CHANGED
@@ -62,22 +62,22 @@ module CharDet
|
|
62
62
|
3,3,3,3,3,3,3,3, # e8 - ef
|
63
63
|
3,3,3,3,3,3,3,3, # f0 - f7
|
64
64
|
3,3,3,3,3,3,3,0 # f8 - ff
|
65
|
-
]
|
65
|
+
].freeze
|
66
66
|
|
67
67
|
BIG5_st = [
|
68
68
|
EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
|
69
69
|
EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
|
70
70
|
EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
|
71
|
-
]
|
71
|
+
].freeze
|
72
72
|
|
73
|
-
Big5CharLenTable = [0, 1, 1, 2, 0]
|
73
|
+
Big5CharLenTable = [0, 1, 1, 2, 0].freeze
|
74
74
|
|
75
75
|
Big5SMModel = {'classTable' => BIG5_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}
|
76
|
+
'classFactor' => 5,
|
77
|
+
'stateTable' => BIG5_st,
|
78
|
+
'charLenTable' => Big5CharLenTable,
|
79
|
+
'name' => 'Big5'
|
80
|
+
}.freeze
|
81
81
|
|
82
82
|
# EUC-JP
|
83
83
|
|
@@ -114,7 +114,7 @@ module CharDet
|
|
114
114
|
0,0,0,0,0,0,0,0, # e8 - ef
|
115
115
|
0,0,0,0,0,0,0,0, # f0 - f7
|
116
116
|
0,0,0,0,0,0,0,5 # f8 - ff
|
117
|
-
]
|
117
|
+
].freeze
|
118
118
|
|
119
119
|
EUCJP_st = [
|
120
120
|
3, 4, 3, 5,EStart,EError,EError,EError,#00-07
|
@@ -122,16 +122,16 @@ module CharDet
|
|
122
122
|
EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
|
123
123
|
EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
|
124
124
|
3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
|
125
|
-
]
|
125
|
+
].freeze
|
126
126
|
|
127
|
-
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
|
127
|
+
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0].freeze
|
128
128
|
|
129
129
|
EUCJPSMModel = {'classTable' => EUCJP_cls,
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
}
|
130
|
+
'classFactor' => 6,
|
131
|
+
'stateTable' => EUCJP_st,
|
132
|
+
'charLenTable' => EUCJPCharLenTable,
|
133
|
+
'name' => 'EUC-JP'
|
134
|
+
}.freeze
|
135
135
|
|
136
136
|
# EUC-KR
|
137
137
|
|
@@ -168,21 +168,21 @@ module CharDet
|
|
168
168
|
2,2,2,2,2,2,2,2, # e8 - ef
|
169
169
|
2,2,2,2,2,2,2,2, # f0 - f7
|
170
170
|
2,2,2,2,2,2,2,0 # f8 - ff
|
171
|
-
]
|
171
|
+
].freeze
|
172
172
|
|
173
173
|
EUCKR_st = [
|
174
174
|
EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
|
175
175
|
EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
|
176
|
-
]
|
176
|
+
].freeze
|
177
177
|
|
178
|
-
EUCKRCharLenTable = [0, 1, 2, 0]
|
178
|
+
EUCKRCharLenTable = [0, 1, 2, 0].freeze
|
179
179
|
|
180
180
|
EUCKRSMModel = {'classTable' => EUCKR_cls,
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
}
|
181
|
+
'classFactor' => 4,
|
182
|
+
'stateTable' => EUCKR_st,
|
183
|
+
'charLenTable' => EUCKRCharLenTable,
|
184
|
+
'name' => 'EUC-KR'
|
185
|
+
}.freeze
|
186
186
|
|
187
187
|
# EUC-TW
|
188
188
|
|
@@ -219,7 +219,7 @@ module CharDet
|
|
219
219
|
3,3,3,3,3,3,3,3, # e8 - ef
|
220
220
|
3,3,3,3,3,3,3,3, # f0 - f7
|
221
221
|
3,3,3,3,3,3,3,0 # f8 - ff
|
222
|
-
]
|
222
|
+
].freeze
|
223
223
|
|
224
224
|
EUCTW_st = [
|
225
225
|
EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
|
@@ -228,16 +228,16 @@ module CharDet
|
|
228
228
|
EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
|
229
229
|
5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
|
230
230
|
EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
|
231
|
-
]
|
231
|
+
].freeze
|
232
232
|
|
233
|
-
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
|
233
|
+
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3].freeze
|
234
234
|
|
235
235
|
EUCTWSMModel = {'classTable' => EUCTW_cls,
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
}
|
236
|
+
'classFactor' => 7,
|
237
|
+
'stateTable' => EUCTW_st,
|
238
|
+
'charLenTable' => EUCTWCharLenTable,
|
239
|
+
'name' => 'x-euc-tw'
|
240
|
+
}.freeze
|
241
241
|
|
242
242
|
# GB2312
|
243
243
|
|
@@ -274,7 +274,7 @@ module CharDet
|
|
274
274
|
6,6,6,6,6,6,6,6, # e8 - ef
|
275
275
|
6,6,6,6,6,6,6,6, # f0 - f7
|
276
276
|
6,6,6,6,6,6,6,0 # f8 - ff
|
277
|
-
]
|
277
|
+
].freeze
|
278
278
|
|
279
279
|
GB2312_st = [
|
280
280
|
EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
|
@@ -283,21 +283,21 @@ module CharDet
|
|
283
283
|
4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
|
284
284
|
EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
|
285
285
|
EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
|
286
|
-
]
|
286
|
+
].freeze
|
287
287
|
|
288
288
|
# To be accurate, the length of class 6 can be either 2 or 4.
|
289
289
|
# But it is not necessary to discriminate between the two since
|
290
290
|
# it is used for frequency analysis only, and we are validing
|
291
291
|
# each code range there as well. So it is safe to set it to be
|
292
292
|
# 2 here.
|
293
|
-
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
|
293
|
+
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2].freeze
|
294
294
|
|
295
295
|
GB2312SMModel = {'classTable' => GB2312_cls,
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
}
|
296
|
+
'classFactor' => 7,
|
297
|
+
'stateTable' => GB2312_st,
|
298
|
+
'charLenTable' => GB2312CharLenTable,
|
299
|
+
'name' => 'GB2312'
|
300
|
+
}.freeze
|
301
301
|
|
302
302
|
# Shift_JIS
|
303
303
|
|
@@ -336,22 +336,22 @@ module CharDet
|
|
336
336
|
3,3,3,3,3,4,4,4, # e8 - ef
|
337
337
|
4,4,4,4,4,4,4,4, # f0 - f7
|
338
338
|
4,4,4,4,4,0,0,0 # f8 - ff
|
339
|
-
]
|
339
|
+
].freeze
|
340
340
|
|
341
341
|
SJIS_st = [
|
342
342
|
EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
|
343
343
|
EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
|
344
344
|
EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
|
345
|
-
]
|
345
|
+
].freeze
|
346
346
|
|
347
|
-
SJISCharLenTable = [0, 1, 1, 2, 0, 0]
|
347
|
+
SJISCharLenTable = [0, 1, 1, 2, 0, 0].freeze
|
348
348
|
|
349
349
|
SJISSMModel = {'classTable' => SJIS_cls,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
}
|
350
|
+
'classFactor' => 6,
|
351
|
+
'stateTable' => SJIS_st,
|
352
|
+
'charLenTable' => SJISCharLenTable,
|
353
|
+
'name' => 'Shift_JIS'
|
354
|
+
}.freeze
|
355
355
|
|
356
356
|
# UCS2-BE
|
357
357
|
|
@@ -388,7 +388,7 @@ module CharDet
|
|
388
388
|
0,0,0,0,0,0,0,0, # e8 - ef
|
389
389
|
0,0,0,0,0,0,0,0, # f0 - f7
|
390
390
|
0,0,0,0,0,0,4,5 # f8 - ff
|
391
|
-
]
|
391
|
+
].freeze
|
392
392
|
|
393
393
|
UCS2BE_st = [
|
394
394
|
5, 7, 7,EError, 4, 3,EError,EError,#00-07
|
@@ -398,16 +398,16 @@ module CharDet
|
|
398
398
|
6, 6, 6, 6, 5, 7, 7,EError,#20-27
|
399
399
|
5, 8, 6, 6,EError, 6, 6, 6,#28-2f
|
400
400
|
6, 6, 6, 6,EError,EError,EStart,EStart#30-37
|
401
|
-
]
|
401
|
+
].freeze
|
402
402
|
|
403
|
-
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
|
403
|
+
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2].freeze
|
404
404
|
|
405
405
|
UCS2BESMModel = {'classTable' => UCS2BE_cls,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
}
|
406
|
+
'classFactor' => 6,
|
407
|
+
'stateTable' => UCS2BE_st,
|
408
|
+
'charLenTable' => UCS2BECharLenTable,
|
409
|
+
'name' => 'UTF-16BE'
|
410
|
+
}.freeze
|
411
411
|
|
412
412
|
# UCS2-LE
|
413
413
|
|
@@ -444,7 +444,7 @@ module CharDet
|
|
444
444
|
0,0,0,0,0,0,0,0, # e8 - ef
|
445
445
|
0,0,0,0,0,0,0,0, # f0 - f7
|
446
446
|
0,0,0,0,0,0,4,5 # f8 - ff
|
447
|
-
]
|
447
|
+
].freeze
|
448
448
|
|
449
449
|
UCS2LE_st = [
|
450
450
|
6, 6, 7, 6, 4, 3,EError,EError,#00-07
|
@@ -454,16 +454,16 @@ module CharDet
|
|
454
454
|
7, 6, 8, 8, 5, 5, 5,EError,#20-27
|
455
455
|
5, 5, 5,EError,EError,EError, 5, 5,#28-2f
|
456
456
|
5, 5, 5,EError, 5,EError,EStart,EStart#30-37
|
457
|
-
]
|
457
|
+
].freeze
|
458
458
|
|
459
|
-
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
|
459
|
+
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2].freeze
|
460
460
|
|
461
461
|
UCS2LESMModel = {'classTable' => UCS2LE_cls,
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
}
|
462
|
+
'classFactor' => 6,
|
463
|
+
'stateTable' => UCS2LE_st,
|
464
|
+
'charLenTable' => UCS2LECharLenTable,
|
465
|
+
'name' => 'UTF-16LE'
|
466
|
+
}.freeze
|
467
467
|
|
468
468
|
# UTF-8
|
469
469
|
|
@@ -500,7 +500,7 @@ module CharDet
|
|
500
500
|
8,8,8,8,8,9,8,8, # e8 - ef
|
501
501
|
10,11,11,11,11,11,11,11, # f0 - f7
|
502
502
|
12,13,13,13,14,15,0,0 # f8 - ff
|
503
|
-
]
|
503
|
+
].freeze
|
504
504
|
|
505
505
|
UTF8_st = [
|
506
506
|
EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
|
@@ -529,14 +529,14 @@ module CharDet
|
|
529
529
|
EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
|
530
530
|
EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
|
531
531
|
EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
|
532
|
-
]
|
532
|
+
].freeze
|
533
533
|
|
534
|
-
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
|
534
|
+
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6].freeze
|
535
535
|
|
536
536
|
UTF8SMModel = {'classTable' => UTF8_cls,
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
}
|
537
|
+
'classFactor' => 16,
|
538
|
+
'stateTable' => UTF8_st,
|
539
|
+
'charLenTable' => UTF8CharLenTable,
|
540
|
+
'name' => 'UTF-8'
|
541
|
+
}.freeze
|
542
542
|
end
|