rchardet 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
data/lib/rchardet/sbcharsetprober.rb
CHANGED
@@ -40,68 +40,68 @@ module CharDet
|
|
40
40
|
class SingleByteCharSetProber < CharSetProber
|
41
41
|
def initialize(model, reversed=false, nameProber=nil)
|
42
42
|
super()
|
43
|
-
@
|
44
|
-
@
|
45
|
-
@
|
43
|
+
@model = model
|
44
|
+
@reversed = reversed # TRUE if we need to reverse every pair in the model lookup
|
45
|
+
@nameProber = nameProber # Optional auxiliary prober for name decision
|
46
46
|
reset()
|
47
47
|
end
|
48
48
|
|
49
49
|
def reset
|
50
50
|
super()
|
51
|
-
@
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@
|
55
|
-
@
|
51
|
+
@lastOrder = 255 # char order of last character
|
52
|
+
@seqCounters = [0] * NUMBER_OF_SEQ_CAT
|
53
|
+
@totalSeqs = 0
|
54
|
+
@totalChar = 0
|
55
|
+
@freqChar = 0 # characters that fall in our sampling range
|
56
56
|
end
|
57
57
|
|
58
58
|
def get_charset_name
|
59
|
-
if @
|
60
|
-
|
59
|
+
if @nameProber
|
60
|
+
return @nameProber.get_charset_name()
|
61
61
|
else
|
62
|
-
|
62
|
+
return @model['charsetName']
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
66
66
|
def feed(aBuf)
|
67
|
-
if
|
68
|
-
|
67
|
+
if !@model['keepEnglishLetter']
|
68
|
+
aBuf = filter_without_english_letters(aBuf)
|
69
69
|
end
|
70
70
|
aLen = aBuf.length
|
71
|
-
if
|
72
|
-
|
71
|
+
if aLen == 0
|
72
|
+
return get_state()
|
73
73
|
end
|
74
74
|
aBuf.each_byte do |b|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
75
|
+
c = b.chr
|
76
|
+
order = @model['charToOrderMap'][c.bytes.first]
|
77
|
+
if order < SYMBOL_CAT_ORDER
|
78
|
+
@totalChar += 1
|
79
|
+
end
|
80
|
+
if order < SAMPLE_SIZE
|
81
|
+
@freqChar += 1
|
82
|
+
if @lastOrder < SAMPLE_SIZE
|
83
|
+
@totalSeqs += 1
|
84
|
+
if !@reversed
|
85
|
+
@seqCounters[@model['precedenceMatrix'][(@lastOrder * SAMPLE_SIZE) + order]] += 1
|
86
|
+
else # reverse the order of the letters in the lookup
|
87
|
+
@seqCounters[@model['precedenceMatrix'][(order * SAMPLE_SIZE) + @lastOrder]] += 1
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
@lastOrder = order
|
92
92
|
end
|
93
93
|
|
94
94
|
if get_state() == EDetecting
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
95
|
+
if @totalSeqs > SB_ENOUGH_REL_THRESHOLD
|
96
|
+
cf = get_confidence()
|
97
|
+
if cf > POSITIVE_SHORTCUT_THRESHOLD
|
98
|
+
$stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
|
99
|
+
@state = EFoundIt
|
100
|
+
elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
|
101
|
+
$stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
|
102
|
+
@state = ENotMe
|
103
|
+
end
|
104
|
+
end
|
105
105
|
end
|
106
106
|
|
107
107
|
return get_state()
|
@@ -109,14 +109,12 @@ module CharDet
|
|
109
109
|
|
110
110
|
def get_confidence
|
111
111
|
r = 0.01
|
112
|
-
if @
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
r = 0.99
|
119
|
-
end
|
112
|
+
if @totalSeqs > 0
|
113
|
+
r = (1.0 * @seqCounters[POSITIVE_CAT]) / @totalSeqs / @model['mTypicalPositiveRatio']
|
114
|
+
r = r * @freqChar / @totalChar
|
115
|
+
if r >= 1.0
|
116
|
+
r = 0.99
|
117
|
+
end
|
120
118
|
end
|
121
119
|
return r
|
122
120
|
end
|
data/lib/rchardet/sbcsgroupprober.rb
CHANGED
@@ -31,26 +31,26 @@ module CharDet
|
|
31
31
|
class SBCSGroupProber < CharSetGroupProber
|
32
32
|
def initialize
|
33
33
|
super
|
34
|
-
@
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
@probers = [
|
35
|
+
SingleByteCharSetProber.new(Win1251CyrillicModel),
|
36
|
+
SingleByteCharSetProber.new(Koi8rModel),
|
37
|
+
SingleByteCharSetProber.new(Latin5CyrillicModel),
|
38
|
+
SingleByteCharSetProber.new(MacCyrillicModel),
|
39
|
+
SingleByteCharSetProber.new(Ibm866Model),
|
40
|
+
SingleByteCharSetProber.new(Ibm855Model),
|
41
|
+
SingleByteCharSetProber.new(Latin7GreekModel),
|
42
|
+
SingleByteCharSetProber.new(Win1253GreekModel),
|
43
|
+
SingleByteCharSetProber.new(Latin5BulgarianModel),
|
44
|
+
SingleByteCharSetProber.new(Win1251BulgarianModel),
|
45
|
+
SingleByteCharSetProber.new(Latin2HungarianModel),
|
46
|
+
SingleByteCharSetProber.new(Win1250HungarianModel),
|
47
|
+
SingleByteCharSetProber.new(TIS620ThaiModel),
|
48
|
+
]
|
49
49
|
hebrewProber = HebrewProber.new()
|
50
50
|
logicalHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrewProber)
|
51
51
|
visualHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrewProber)
|
52
52
|
hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
|
53
|
-
@
|
53
|
+
@probers += [hebrewProber, logicalHebrewProber, visualHebrewProber]
|
54
54
|
|
55
55
|
reset()
|
56
56
|
end
|
data/lib/rchardet/sjisprober.rb
CHANGED
@@ -30,15 +30,15 @@ module CharDet
|
|
30
30
|
class SJISProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
35
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(SJISSMModel)
|
34
|
+
@distributionAnalyzer = SJISDistributionAnalysis.new()
|
35
|
+
@contextAnalyzer = SJISContextAnalysis.new()
|
36
36
|
reset()
|
37
37
|
end
|
38
38
|
|
39
39
|
def reset
|
40
40
|
super()
|
41
|
-
@
|
41
|
+
@contextAnalyzer.reset()
|
42
42
|
end
|
43
43
|
|
44
44
|
def get_charset_name
|
@@ -48,40 +48,40 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @codingSM.next_state(aBuf[i,1])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@state = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@state = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @codingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@lastChar[1] = aBuf[0, 1]
|
63
|
+
@contextAnalyzer.feed(@lastChar[2-charLen, 1], charLen)
|
64
|
+
@distributionAnalyzer.feed(@lastChar, charLen)
|
65
|
+
else
|
66
|
+
@contextAnalyzer.feed(aBuf[i+1-charLen, 2], charLen)
|
67
|
+
@distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
|
-
@
|
72
|
+
@lastChar[0] = aBuf[aLen-1, 1]
|
73
73
|
|
74
74
|
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@state = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
81
81
|
end
|
82
82
|
|
83
83
|
def get_confidence
|
84
|
-
l = [@
|
84
|
+
l = [@contextAnalyzer.get_confidence(), @distributionAnalyzer.get_confidence()]
|
85
85
|
return l.max
|
86
86
|
end
|
87
87
|
end
|
data/lib/rchardet/universaldetector.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: US-ASCII
|
1
2
|
######################## BEGIN LICENSE BLOCK ########################
|
2
3
|
# The Original Code is Mozilla Universal charset detector code.
|
3
4
|
#
|
@@ -34,27 +35,28 @@ module CharDet
|
|
34
35
|
EHighbyte = 2
|
35
36
|
|
36
37
|
class UniversalDetector
|
37
|
-
|
38
|
+
attr_reader :done, :result
|
39
|
+
|
38
40
|
def initialize
|
39
|
-
@
|
40
|
-
@
|
41
|
-
@
|
42
|
-
@
|
41
|
+
@highBitDetector = /[\x80-\xFF]/
|
42
|
+
@escDetector = /(\033|\~\{)/
|
43
|
+
@escCharSetProber = nil
|
44
|
+
@charSetProbers = []
|
43
45
|
reset()
|
44
46
|
end
|
45
47
|
|
46
48
|
def reset
|
47
49
|
@result = {'encoding' => nil, 'confidence' => 0.0}
|
48
50
|
@done = false
|
49
|
-
@
|
50
|
-
@
|
51
|
-
@
|
52
|
-
@
|
53
|
-
if @
|
54
|
-
|
51
|
+
@start = true
|
52
|
+
@gotData = false
|
53
|
+
@inputState = EPureAscii
|
54
|
+
@lastChar = ''
|
55
|
+
if @escCharSetProber
|
56
|
+
@escCharSetProber.reset()
|
55
57
|
end
|
56
|
-
for prober in @
|
57
|
-
|
58
|
+
for prober in @charSetProbers
|
59
|
+
prober.reset()
|
58
60
|
end
|
59
61
|
end
|
60
62
|
|
@@ -62,104 +64,104 @@ module CharDet
|
|
62
64
|
return if @done
|
63
65
|
|
64
66
|
aLen = aBuf.length
|
65
|
-
return if
|
67
|
+
return if aLen == 0
|
66
68
|
|
67
|
-
if
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
69
|
+
if !@gotData
|
70
|
+
# If the data starts with BOM, we know it is UTF
|
71
|
+
if aBuf[0, 3] == "\xEF\xBB\xBF"
|
72
|
+
# EF BB BF UTF-8 with BOM
|
73
|
+
@result = {'encoding' => "UTF-8", 'confidence' => 1.0}
|
74
|
+
elsif aBuf[0, 4] == "\xFF\xFE\x00\x00"
|
75
|
+
# FF FE 00 00 UTF-32, little-endian BOM
|
76
|
+
@result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
|
77
|
+
elsif aBuf[0, 4] == "\x00\x00\xFE\xFF"
|
78
|
+
# 00 00 FE FF UTF-32, big-endian BOM
|
79
|
+
@result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
|
80
|
+
elsif aBuf[0, 4] == "\xFE\xFF\x00\x00"
|
81
|
+
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
82
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
|
83
|
+
elsif aBuf[0, 4] == "\x00\x00\xFF\xFE"
|
84
|
+
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
85
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0}
|
86
|
+
elsif aBuf[0, 2] == "\xFF\xFE"
|
87
|
+
# FF FE UTF-16, little endian BOM
|
88
|
+
@result = {'encoding' => "UTF-16LE", 'confidence' => 1.0}
|
89
|
+
elsif aBuf[0, 2] == "\xFE\xFF"
|
90
|
+
# FE FF UTF-16, big endian BOM
|
91
|
+
@result = {'encoding' => "UTF-16BE", 'confidence' => 1.0}
|
92
|
+
end
|
91
93
|
end
|
92
94
|
|
93
|
-
@
|
95
|
+
@gotData = true
|
94
96
|
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
|
-
|
96
|
-
|
97
|
+
@done = true
|
98
|
+
return
|
97
99
|
end
|
98
|
-
if @
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
100
|
+
if @inputState == EPureAscii
|
101
|
+
if @highBitDetector =~ (aBuf)
|
102
|
+
@inputState = EHighbyte
|
103
|
+
elsif (@inputState == EPureAscii) and @escDetector =~ (@lastChar + aBuf)
|
104
|
+
@inputState = EEscAscii
|
105
|
+
end
|
104
106
|
end
|
105
107
|
|
106
|
-
@
|
107
|
-
if @
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
elsif @
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
108
|
+
@lastChar = aBuf[-1, 1]
|
109
|
+
if @inputState == EEscAscii
|
110
|
+
if !@escCharSetProber
|
111
|
+
@escCharSetProber = EscCharSetProber.new()
|
112
|
+
end
|
113
|
+
if @escCharSetProber.feed(aBuf) == EFoundIt
|
114
|
+
@result = {'encoding' => @escCharSetProber.get_charset_name(),
|
115
|
+
'confidence' => @escCharSetProber.get_confidence()
|
116
|
+
}
|
117
|
+
@done = true
|
118
|
+
end
|
119
|
+
elsif @inputState == EHighbyte
|
120
|
+
if @charSetProbers.nil? || @charSetProbers.empty?
|
121
|
+
@charSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
|
122
|
+
end
|
123
|
+
for prober in @charSetProbers
|
124
|
+
if prober.feed(aBuf) == EFoundIt
|
125
|
+
@result = {'encoding' => prober.get_charset_name(),
|
126
|
+
'confidence' => prober.get_confidence()}
|
127
|
+
@done = true
|
128
|
+
break
|
129
|
+
end
|
130
|
+
end
|
129
131
|
end
|
130
132
|
|
131
133
|
end
|
132
134
|
|
133
135
|
def close
|
134
136
|
return if @done
|
135
|
-
if
|
136
|
-
|
137
|
-
|
137
|
+
if !@gotData
|
138
|
+
$stderr << "no data received!\n" if $debug
|
139
|
+
return
|
138
140
|
end
|
139
141
|
@done = true
|
140
142
|
|
141
|
-
if @
|
142
|
-
|
143
|
-
|
143
|
+
if @inputState == EPureAscii
|
144
|
+
@result = {'encoding' => 'ascii', 'confidence' => 1.0}
|
145
|
+
return @result
|
144
146
|
end
|
145
147
|
|
146
|
-
if @
|
147
|
-
|
148
|
-
@
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
148
|
+
if @inputState == EHighbyte
|
149
|
+
confidences = {}
|
150
|
+
@charSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
|
151
|
+
maxProber = @charSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
|
152
|
+
if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
|
153
|
+
@result = {'encoding' => maxProber.get_charset_name(),
|
154
|
+
'confidence' => maxProber.get_confidence()}
|
155
|
+
return @result
|
156
|
+
end
|
155
157
|
end
|
156
158
|
|
157
159
|
if $debug
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
160
|
+
$stderr << "no probers hit minimum threshhold\n" if $debug
|
161
|
+
for prober in @charSetProbers[0].probers
|
162
|
+
next if !prober
|
163
|
+
$stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
|
164
|
+
end
|
163
165
|
end
|
164
166
|
end
|
165
167
|
end
|