rchardet 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
@@ -40,68 +40,68 @@ module CharDet
|
|
40
40
|
class SingleByteCharSetProber < CharSetProber
|
41
41
|
# Builds a prober around one single-byte language model.
#
# model      - the language model this prober scores input against
# reversed   - true when every character pair must be looked up in
#              reverse order in the model
# nameProber - optional auxiliary prober that decides the charset name
def initialize(model, reversed=false, nameProber=nil)
  super()
  @model, @reversed, @nameProber = model, reversed, nameProber
  reset
end
|
48
48
|
|
49
49
|
# Puts the prober back into its initial state: runs the superclass reset,
# forgets the previous character and zeroes every counter.
def reset
  super()
  @lastOrder = 255 # order of the previously seen character
  @seqCounters = Array.new(NUMBER_OF_SEQ_CAT, 0)
  # @freqChar counts the characters that fall inside the sampling range.
  @totalSeqs = @totalChar = @freqChar = 0
end
|
57
57
|
|
58
58
|
# Reports the charset name. The auxiliary name prober, when one was given
# at construction time, has the final say; otherwise the name comes from
# the model itself.
def get_charset_name
  return @model['charsetName'] unless @nameProber

  @nameProber.get_charset_name()
end
|
65
65
|
|
66
66
|
def feed(aBuf)
|
67
|
-
if
|
68
|
-
|
67
|
+
if !@model['keepEnglishLetter']
|
68
|
+
aBuf = filter_without_english_letters(aBuf)
|
69
69
|
end
|
70
70
|
aLen = aBuf.length
|
71
|
-
if
|
72
|
-
|
71
|
+
if aLen == 0
|
72
|
+
return get_state()
|
73
73
|
end
|
74
74
|
aBuf.each_byte do |b|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
75
|
+
c = b.chr
|
76
|
+
order = @model['charToOrderMap'][c.bytes.first]
|
77
|
+
if order < SYMBOL_CAT_ORDER
|
78
|
+
@totalChar += 1
|
79
|
+
end
|
80
|
+
if order < SAMPLE_SIZE
|
81
|
+
@freqChar += 1
|
82
|
+
if @lastOrder < SAMPLE_SIZE
|
83
|
+
@totalSeqs += 1
|
84
|
+
if !@reversed
|
85
|
+
@seqCounters[@model['precedenceMatrix'][(@lastOrder * SAMPLE_SIZE) + order]] += 1
|
86
|
+
else # reverse the order of the letters in the lookup
|
87
|
+
@seqCounters[@model['precedenceMatrix'][(order * SAMPLE_SIZE) + @lastOrder]] += 1
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
@lastOrder = order
|
92
92
|
end
|
93
93
|
|
94
94
|
if get_state() == EDetecting
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
95
|
+
if @totalSeqs > SB_ENOUGH_REL_THRESHOLD
|
96
|
+
cf = get_confidence()
|
97
|
+
if cf > POSITIVE_SHORTCUT_THRESHOLD
|
98
|
+
$stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
|
99
|
+
@state = EFoundIt
|
100
|
+
elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
|
101
|
+
$stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
|
102
|
+
@state = ENotMe
|
103
|
+
end
|
104
|
+
end
|
105
105
|
end
|
106
106
|
|
107
107
|
return get_state()
|
@@ -109,14 +109,12 @@ module CharDet
|
|
109
109
|
|
110
110
|
# Confidence that the input is in this prober's charset.
#
# Returns 0.01 while no sequence has been counted. Otherwise the share of
# positive-category sequences is divided by the model's typical positive
# ratio and scaled by the fraction of counted characters that fell in the
# sampling range. Results of 1.0 or more are reported as 0.99.
def get_confidence
  return 0.01 unless @totalSeqs > 0

  positive_share = @seqCounters[POSITIVE_CAT].to_f / @totalSeqs
  confidence = positive_share / @model['mTypicalPositiveRatio'] * @freqChar / @totalChar
  confidence >= 1.0 ? 0.99 : confidence
end
|
@@ -31,26 +31,26 @@ module CharDet
|
|
31
31
|
class SBCSGroupProber < CharSetGroupProber
|
32
32
|
# Registers one SingleByteCharSetProber per language model, followed by
# the Hebrew probers. The logical and visual Hebrew probers share the
# Windows-1255 model (the visual one with reversed pair lookup) and both
# use the HebrewProber as their name prober.
def initialize
  super
  models = [
    Win1251CyrillicModel,
    Koi8rModel,
    Latin5CyrillicModel,
    MacCyrillicModel,
    Ibm866Model,
    Ibm855Model,
    Latin7GreekModel,
    Win1253GreekModel,
    Latin5BulgarianModel,
    Win1251BulgarianModel,
    Latin2HungarianModel,
    Win1250HungarianModel,
    TIS620ThaiModel,
  ]
  @probers = models.map { |model| SingleByteCharSetProber.new(model) }

  hebrew = HebrewProber.new
  logical = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrew)
  visual = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrew)
  hebrew.set_model_probers(logical, visual)
  @probers.push(hebrew, logical, visual)

  reset
end
|
data/lib/rchardet/sjisprober.rb
CHANGED
@@ -30,15 +30,15 @@ module CharDet
|
|
30
30
|
class SJISProber < MultiByteCharSetProber
|
31
31
|
# Wires up the Shift_JIS state machine and the two analyzers (character
# distribution and context) that feed and get_confidence consult, then
# resets the prober.
def initialize
  super()
  @codingSM = CodingStateMachine.new(SJISSMModel)
  @distributionAnalyzer = SJISDistributionAnalysis.new
  @contextAnalyzer = SJISContextAnalysis.new
  reset
end
|
38
38
|
|
39
39
|
# Resets the inherited prober state and then the context analyzer.
def reset
  super()
  @contextAnalyzer.reset
end
|
43
43
|
|
44
44
|
def get_charset_name
|
@@ -48,40 +48,40 @@ module CharDet
|
|
48
48
|
# Feeds a chunk of input through the Shift_JIS state machine and analyzers.
#
# Each one-character slice of aBuf goes to the coding state machine:
# * EError  - the state becomes ENotMe and scanning stops;
# * EItsMe  - the state becomes EFoundIt and scanning stops;
# * EStart  - a character is complete, so it is handed to the context and
#             distribution analyzers together with its length.
# Afterwards the last character of the buffer is saved in @lastChar, and
# if the prober is still detecting, the context analyzer has enough data
# and the confidence exceeds SHORTCUT_THRESHOLD, the state becomes EFoundIt.
#
# An empty buffer leaves all state untouched.
#
# aBuf - String with the next chunk of input
#
# Returns the prober state after this chunk (get_state).
def feed(aBuf)
  aLen = aBuf.length
  # An empty chunk has nothing to scan, and aBuf[aLen-1, 1] would be nil,
  # which must not be stored into @lastChar below.
  return get_state() if aLen == 0

  aLen.times do |i|
    codingState = @codingSM.next_state(aBuf[i, 1])
    if codingState == EError
      $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
      @state = ENotMe
      break
    elsif codingState == EItsMe
      @state = EFoundIt
      break
    elsif codingState == EStart
      charLen = @codingSM.get_current_charlen()
      if i == 0
        # The first character of this chunk is analysed through @lastChar,
        # whose slot 0 was saved from the end of the previous chunk
        # (presumably so a character split across chunks is seen whole --
        # confirm against MultiByteCharSetProber).
        @lastChar[1] = aBuf[0, 1]
        @contextAnalyzer.feed(@lastChar[2 - charLen, 1], charLen)
        @distributionAnalyzer.feed(@lastChar, charLen)
      else
        @contextAnalyzer.feed(aBuf[i + 1 - charLen, 2], charLen)
        @distributionAnalyzer.feed(aBuf[i - 1, 2], charLen)
      end
    end
  end

  @lastChar[0] = aBuf[aLen - 1, 1]

  if get_state() == EDetecting
    if @contextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
      @state = EFoundIt
    end
  end

  return get_state()
end
|
82
82
|
|
83
83
|
# The prober's confidence is the larger of the context analyzer's and the
# distribution analyzer's confidence.
def get_confidence
  [@contextAnalyzer, @distributionAnalyzer].map { |analyzer| analyzer.get_confidence() }.max
end
|
87
87
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: US-ASCII
|
1
2
|
######################## BEGIN LICENSE BLOCK ########################
|
2
3
|
# The Original Code is Mozilla Universal charset detector code.
|
3
4
|
#
|
@@ -34,27 +35,28 @@ module CharDet
|
|
34
35
|
EHighbyte = 2
|
35
36
|
|
36
37
|
class UniversalDetector
|
37
|
-
|
38
|
+
attr_reader :done, :result
|
39
|
+
|
38
40
|
# Prepares the detector: no probers exist yet (feed creates them on
# demand), the two regexps spot high-bit bytes and escape sequences
# (ESC or "~{"), and reset supplies the per-run state.
def initialize
  @charSetProbers = []
  @escCharSetProber = nil
  @escDetector = /(\033|\~\{)/
  @highBitDetector = /[\x80-\xFF]/
  reset
end
|
45
47
|
|
46
48
|
# Returns the detector to its pre-input state so it can be reused: clears
# the result and flags, goes back to the pure-ASCII input state and resets
# every prober that has already been created.
def reset
  @result = {'encoding' => nil, 'confidence' => 0.0}
  @done = false
  @start = true
  @gotData = false
  @inputState = EPureAscii
  @lastChar = ''
  @escCharSetProber.reset if @escCharSetProber
  @charSetProbers.each { |prober| prober.reset }
end
|
60
62
|
|
@@ -62,104 +64,104 @@ module CharDet
|
|
62
64
|
return if @done
|
63
65
|
|
64
66
|
aLen = aBuf.length
|
65
|
-
return if
|
67
|
+
return if aLen == 0
|
66
68
|
|
67
|
-
if
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
69
|
+
if !@gotData
|
70
|
+
# If the data starts with BOM, we know it is UTF
|
71
|
+
if aBuf[0, 3] == "\xEF\xBB\xBF"
|
72
|
+
# EF BB BF UTF-8 with BOM
|
73
|
+
@result = {'encoding' => "UTF-8", 'confidence' => 1.0}
|
74
|
+
elsif aBuf[0, 4] == "\xFF\xFE\x00\x00"
|
75
|
+
# FF FE 00 00 UTF-32, little-endian BOM
|
76
|
+
@result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
|
77
|
+
elsif aBuf[0, 4] == "\x00\x00\xFE\xFF"
|
78
|
+
# 00 00 FE FF UTF-32, big-endian BOM
|
79
|
+
@result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
|
80
|
+
elsif aBuf[0, 4] == "\xFE\xFF\x00\x00"
|
81
|
+
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
82
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
|
83
|
+
elsif aBuf[0, 4] == "\x00\x00\xFF\xFE"
|
84
|
+
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
85
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0}
|
86
|
+
elsif aBuf[0, 2] == "\xFF\xFE"
|
87
|
+
# FF FE UTF-16, little endian BOM
|
88
|
+
@result = {'encoding' => "UTF-16LE", 'confidence' => 1.0}
|
89
|
+
elsif aBuf[0, 2] == "\xFE\xFF"
|
90
|
+
# FE FF UTF-16, big endian BOM
|
91
|
+
@result = {'encoding' => "UTF-16BE", 'confidence' => 1.0}
|
92
|
+
end
|
91
93
|
end
|
92
94
|
|
93
|
-
@
|
95
|
+
@gotData = true
|
94
96
|
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
|
-
|
96
|
-
|
97
|
+
@done = true
|
98
|
+
return
|
97
99
|
end
|
98
|
-
if @
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
100
|
+
if @inputState == EPureAscii
|
101
|
+
if @highBitDetector =~ (aBuf)
|
102
|
+
@inputState = EHighbyte
|
103
|
+
elsif (@inputState == EPureAscii) and @escDetector =~ (@lastChar + aBuf)
|
104
|
+
@inputState = EEscAscii
|
105
|
+
end
|
104
106
|
end
|
105
107
|
|
106
|
-
@
|
107
|
-
if @
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
elsif @
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
108
|
+
@lastChar = aBuf[-1, 1]
|
109
|
+
if @inputState == EEscAscii
|
110
|
+
if !@escCharSetProber
|
111
|
+
@escCharSetProber = EscCharSetProber.new()
|
112
|
+
end
|
113
|
+
if @escCharSetProber.feed(aBuf) == EFoundIt
|
114
|
+
@result = {'encoding' => @escCharSetProber.get_charset_name(),
|
115
|
+
'confidence' => @escCharSetProber.get_confidence()
|
116
|
+
}
|
117
|
+
@done = true
|
118
|
+
end
|
119
|
+
elsif @inputState == EHighbyte
|
120
|
+
if @charSetProbers.nil? || @charSetProbers.empty?
|
121
|
+
@charSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
|
122
|
+
end
|
123
|
+
for prober in @charSetProbers
|
124
|
+
if prober.feed(aBuf) == EFoundIt
|
125
|
+
@result = {'encoding' => prober.get_charset_name(),
|
126
|
+
'confidence' => prober.get_confidence()}
|
127
|
+
@done = true
|
128
|
+
break
|
129
|
+
end
|
130
|
+
end
|
129
131
|
end
|
130
132
|
|
131
133
|
end
|
132
134
|
|
133
135
|
# Finishes detection and settles on a result.
#
# * Does nothing if detection is already done or no data was ever fed.
# * Pure-ASCII input yields {'encoding' => 'ascii', 'confidence' => 1.0}.
# * High-byte input yields the prober with the highest confidence,
#   provided that confidence exceeds MINIMUM_THRESHOLD.
# * Otherwise @result is left as it was; with $debug set, the confidences
#   of the first prober's sub-probers are written to $stderr.
#
# Returns the result Hash when one was decided here, nil otherwise.
# The final result is also stored in @result.
def close
  return if @done
  unless @gotData
    $stderr << "no data received!\n" if $debug
    return
  end
  @done = true

  if @inputState == EPureAscii
    @result = {'encoding' => 'ascii', 'confidence' => 1.0}
    return @result
  end

  if @inputState == EHighbyte
    # Ask each prober for its confidence once and keep the best pair.
    scored = (@charSetProbers || []).map { |prober| [prober, prober.get_confidence] }
    best_prober, best_confidence = scored.max_by { |_prober, confidence| confidence }
    if best_prober && best_confidence > MINIMUM_THRESHOLD
      @result = {'encoding' => best_prober.get_charset_name(),
                 'confidence' => best_confidence}
      return @result
    end
  end

  return unless $debug

  $stderr << "no probers hit minimum threshhold\n"
  # The prober list is empty when only escape sequences were seen, and a
  # prober need not be a group with sub-probers, so check before iterating.
  group = (@charSetProbers || []).first
  return unless group.respond_to?(:probers)

  group.probers.each do |prober|
    next unless prober
    $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
  end
  nil
end
|
165
167
|
end
|