rchardet 1.3 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rchardet/chardistribution.rb +6 -6
- data/lib/rchardet/charsetgroupprober.rb +1 -1
- data/lib/rchardet/codingstatemachine.rb +2 -0
- data/lib/rchardet/escprober.rb +1 -1
- data/lib/rchardet/eucjpprober.rb +1 -1
- data/lib/rchardet/jpcntx.rb +6 -6
- data/lib/rchardet/sjisprober.rb +1 -1
- data/lib/rchardet/universaldetector.rb +5 -5
- data/lib/rchardet/utf8prober.rb +1 -1
- data/lib/rchardet.rb +1 -1
- metadata +17 -7
@@ -57,8 +57,8 @@ module CharDet
|
|
57
57
|
if order >= 0
|
58
58
|
@_mTotalChars += 1
|
59
59
|
# order is valid
|
60
|
-
if order < @_mTableSize
|
61
|
-
if 512 > @_mCharToFreqOrder[order]
|
60
|
+
if order < @_mTableSize
|
61
|
+
if 512 > @_mCharToFreqOrder[order]
|
62
62
|
@_mFreqChars += 1
|
63
63
|
end
|
64
64
|
end
|
@@ -72,7 +72,7 @@ module CharDet
|
|
72
72
|
return SURE_NO
|
73
73
|
end
|
74
74
|
|
75
|
-
if @_mTotalChars != @_mFreqChars
|
75
|
+
if @_mTotalChars != @_mFreqChars
|
76
76
|
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
|
77
77
|
if r < SURE_YES
|
78
78
|
return r
|
@@ -227,10 +227,10 @@ module CharDet
|
|
227
227
|
# first byte range: 0xa0 -- 0xfe
|
228
228
|
# second byte range: 0xa1 -- 0xfe
|
229
229
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0..0] >= "\xA0"
|
231
|
-
|
230
|
+
if aStr[0..0] >= "\xA0"
|
231
|
+
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
|
232
232
|
else
|
233
|
-
|
233
|
+
return -1
|
234
234
|
end
|
235
235
|
end
|
236
236
|
end
|
data/lib/rchardet/escprober.rb
CHANGED
data/lib/rchardet/eucjpprober.rb
CHANGED
data/lib/rchardet/jpcntx.rb
CHANGED
@@ -34,7 +34,7 @@ module CharDet
|
|
34
34
|
MINIMUM_DATA_THRESHOLD = 4
|
35
35
|
|
36
36
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
37
|
-
|
37
|
+
JP2_CHAR_CONTEXT = [
|
38
38
|
[0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1],
|
39
39
|
[2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4],
|
40
40
|
[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2],
|
@@ -150,13 +150,13 @@ module CharDet
|
|
150
150
|
@_mNeedToSkipCharNum = i - aLen
|
151
151
|
@_mLastCharOrder = -1
|
152
152
|
else
|
153
|
-
if (order != -1) and (@_mLastCharOrder != -1)
|
153
|
+
if (order != -1) and (@_mLastCharOrder != -1)
|
154
154
|
@_mTotalRel += 1
|
155
|
-
if @_mTotalRel > MAX_REL_THRESHOLD
|
155
|
+
if @_mTotalRel > MAX_REL_THRESHOLD
|
156
156
|
@_mDone = true
|
157
157
|
break
|
158
158
|
end
|
159
|
-
@_mRelSample[
|
159
|
+
@_mRelSample[JP2_CHAR_CONTEXT[@_mLastCharOrder][order]] += 1
|
160
160
|
end
|
161
161
|
@_mLastCharOrder = order
|
162
162
|
end
|
@@ -169,7 +169,7 @@ module CharDet
|
|
169
169
|
|
170
170
|
def get_confidence
|
171
171
|
# This is just one way to calculate confidence. It works well for me.
|
172
|
-
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
172
|
+
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
173
|
return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
|
174
174
|
else
|
175
175
|
return DONT_KNOW
|
@@ -208,7 +208,7 @@ module CharDet
|
|
208
208
|
return -1, 1 unless aStr
|
209
209
|
# find out current char's byte length
|
210
210
|
aStr = aStr[0..1].join if aStr.class == Array
|
211
|
-
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
211
|
+
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
212
|
charLen = 2
|
213
213
|
elsif aStr[0..0] == "\x8F"
|
214
214
|
charLen = 3
|
data/lib/rchardet/sjisprober.rb
CHANGED
@@ -91,11 +91,11 @@ module CharDet
|
|
91
91
|
end
|
92
92
|
|
93
93
|
@_mGotData = true
|
94
|
-
if @result['encoding'] and (@result['confidence'] > 0.0)
|
94
|
+
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
95
|
@done = true
|
96
96
|
return
|
97
97
|
end
|
98
|
-
if @_mInputState == EPureAscii
|
98
|
+
if @_mInputState == EPureAscii
|
99
99
|
if @_highBitDetector =~ (aBuf)
|
100
100
|
@_mInputState = EHighbyte
|
101
101
|
elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
|
@@ -109,7 +109,7 @@ module CharDet
|
|
109
109
|
@_mEscCharSetProber = EscCharSetProber.new()
|
110
110
|
end
|
111
111
|
if @_mEscCharSetProber.feed(aBuf) == EFoundIt
|
112
|
-
@result = {'encoding' =>
|
112
|
+
@result = {'encoding' => @_mEscCharSetProber.get_charset_name(),
|
113
113
|
'confidence' => @_mEscCharSetProber.get_confidence()
|
114
114
|
}
|
115
115
|
@done = true
|
@@ -138,12 +138,12 @@ module CharDet
|
|
138
138
|
end
|
139
139
|
@done = true
|
140
140
|
|
141
|
-
if @_mInputState == EPureAscii
|
141
|
+
if @_mInputState == EPureAscii
|
142
142
|
@result = {'encoding' => 'ascii', 'confidence' => 1.0}
|
143
143
|
return @result
|
144
144
|
end
|
145
145
|
|
146
|
-
if @_mInputState == EHighbyte
|
146
|
+
if @_mInputState == EHighbyte
|
147
147
|
confidences = {}
|
148
148
|
@_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
|
149
149
|
maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
|
data/lib/rchardet/utf8prober.rb
CHANGED
data/lib/rchardet.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rchardet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 3
|
9
|
+
- 1
|
10
|
+
version: 1.3.1
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Jeff Hodges
|
@@ -9,8 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2012-12-02 00:00:00 Z
|
14
19
|
dependencies: []
|
15
20
|
|
16
21
|
description:
|
@@ -60,7 +65,6 @@ files:
|
|
60
65
|
- lib/rchardet.rb
|
61
66
|
- README
|
62
67
|
- COPYING
|
63
|
-
has_rdoc: true
|
64
68
|
homepage: http://github.com/jmhodges/rchardet/tree/master
|
65
69
|
licenses: []
|
66
70
|
|
@@ -70,21 +74,27 @@ rdoc_options: []
|
|
70
74
|
require_paths:
|
71
75
|
- lib
|
72
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
73
78
|
requirements:
|
74
79
|
- - ">="
|
75
80
|
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
82
|
+
segments:
|
83
|
+
- 0
|
76
84
|
version: "0"
|
77
|
-
version:
|
78
85
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
79
87
|
requirements:
|
80
88
|
- - ">="
|
81
89
|
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
91
|
+
segments:
|
92
|
+
- 0
|
82
93
|
version: "0"
|
83
|
-
version:
|
84
94
|
requirements: []
|
85
95
|
|
86
96
|
rubyforge_project: rchardet
|
87
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.8.15
|
88
98
|
signing_key:
|
89
99
|
specification_version: 3
|
90
100
|
summary: Character encoding auto-detection in Ruby. As smart as your browser. Open source.
|