rchardet 1.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rchardet/chardistribution.rb +6 -6
- data/lib/rchardet/charsetgroupprober.rb +1 -1
- data/lib/rchardet/codingstatemachine.rb +2 -0
- data/lib/rchardet/escprober.rb +1 -1
- data/lib/rchardet/eucjpprober.rb +1 -1
- data/lib/rchardet/jpcntx.rb +6 -6
- data/lib/rchardet/sjisprober.rb +1 -1
- data/lib/rchardet/universaldetector.rb +5 -5
- data/lib/rchardet/utf8prober.rb +1 -1
- data/lib/rchardet.rb +1 -1
- metadata +17 -7
@@ -57,8 +57,8 @@ module CharDet
|
|
57
57
|
if order >= 0
|
58
58
|
@_mTotalChars += 1
|
59
59
|
# order is valid
|
60
|
-
if order < @_mTableSize
|
61
|
-
if 512 > @_mCharToFreqOrder[order]
|
60
|
+
if order < @_mTableSize
|
61
|
+
if 512 > @_mCharToFreqOrder[order]
|
62
62
|
@_mFreqChars += 1
|
63
63
|
end
|
64
64
|
end
|
@@ -72,7 +72,7 @@ module CharDet
|
|
72
72
|
return SURE_NO
|
73
73
|
end
|
74
74
|
|
75
|
-
if @_mTotalChars != @_mFreqChars
|
75
|
+
if @_mTotalChars != @_mFreqChars
|
76
76
|
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
|
77
77
|
if r < SURE_YES
|
78
78
|
return r
|
@@ -227,10 +227,10 @@ module CharDet
|
|
227
227
|
# first byte range: 0xa0 -- 0xfe
|
228
228
|
# second byte range: 0xa1 -- 0xfe
|
229
229
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0..0] >= "\xA0"
|
231
|
-
|
230
|
+
if aStr[0..0] >= "\xA0"
|
231
|
+
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
|
232
232
|
else
|
233
|
-
|
233
|
+
return -1
|
234
234
|
end
|
235
235
|
end
|
236
236
|
end
|
data/lib/rchardet/escprober.rb
CHANGED
data/lib/rchardet/eucjpprober.rb
CHANGED
data/lib/rchardet/jpcntx.rb
CHANGED
@@ -34,7 +34,7 @@ module CharDet
|
|
34
34
|
MINIMUM_DATA_THRESHOLD = 4
|
35
35
|
|
36
36
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
37
|
-
|
37
|
+
JP2_CHAR_CONTEXT = [
|
38
38
|
[0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1],
|
39
39
|
[2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4],
|
40
40
|
[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2],
|
@@ -150,13 +150,13 @@ module CharDet
|
|
150
150
|
@_mNeedToSkipCharNum = i - aLen
|
151
151
|
@_mLastCharOrder = -1
|
152
152
|
else
|
153
|
-
if (order != -1) and (@_mLastCharOrder != -1)
|
153
|
+
if (order != -1) and (@_mLastCharOrder != -1)
|
154
154
|
@_mTotalRel += 1
|
155
|
-
if @_mTotalRel > MAX_REL_THRESHOLD
|
155
|
+
if @_mTotalRel > MAX_REL_THRESHOLD
|
156
156
|
@_mDone = true
|
157
157
|
break
|
158
158
|
end
|
159
|
-
@_mRelSample[
|
159
|
+
@_mRelSample[JP2_CHAR_CONTEXT[@_mLastCharOrder][order]] += 1
|
160
160
|
end
|
161
161
|
@_mLastCharOrder = order
|
162
162
|
end
|
@@ -169,7 +169,7 @@ module CharDet
|
|
169
169
|
|
170
170
|
def get_confidence
|
171
171
|
# This is just one way to calculate confidence. It works well for me.
|
172
|
-
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
172
|
+
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
173
|
return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
|
174
174
|
else
|
175
175
|
return DONT_KNOW
|
@@ -208,7 +208,7 @@ module CharDet
|
|
208
208
|
return -1, 1 unless aStr
|
209
209
|
# find out current char's byte length
|
210
210
|
aStr = aStr[0..1].join if aStr.class == Array
|
211
|
-
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
211
|
+
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
212
|
charLen = 2
|
213
213
|
elsif aStr[0..0] == "\x8F"
|
214
214
|
charLen = 3
|
data/lib/rchardet/sjisprober.rb
CHANGED
@@ -91,11 +91,11 @@ module CharDet
|
|
91
91
|
end
|
92
92
|
|
93
93
|
@_mGotData = true
|
94
|
-
if @result['encoding'] and (@result['confidence'] > 0.0)
|
94
|
+
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
95
|
@done = true
|
96
96
|
return
|
97
97
|
end
|
98
|
-
if @_mInputState == EPureAscii
|
98
|
+
if @_mInputState == EPureAscii
|
99
99
|
if @_highBitDetector =~ (aBuf)
|
100
100
|
@_mInputState = EHighbyte
|
101
101
|
elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
|
@@ -109,7 +109,7 @@ module CharDet
|
|
109
109
|
@_mEscCharSetProber = EscCharSetProber.new()
|
110
110
|
end
|
111
111
|
if @_mEscCharSetProber.feed(aBuf) == EFoundIt
|
112
|
-
@result = {'encoding' =>
|
112
|
+
@result = {'encoding' => @_mEscCharSetProber.get_charset_name(),
|
113
113
|
'confidence' => @_mEscCharSetProber.get_confidence()
|
114
114
|
}
|
115
115
|
@done = true
|
@@ -138,12 +138,12 @@ module CharDet
|
|
138
138
|
end
|
139
139
|
@done = true
|
140
140
|
|
141
|
-
if @_mInputState == EPureAscii
|
141
|
+
if @_mInputState == EPureAscii
|
142
142
|
@result = {'encoding' => 'ascii', 'confidence' => 1.0}
|
143
143
|
return @result
|
144
144
|
end
|
145
145
|
|
146
|
-
if @_mInputState == EHighbyte
|
146
|
+
if @_mInputState == EHighbyte
|
147
147
|
confidences = {}
|
148
148
|
@_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
|
149
149
|
maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
|
data/lib/rchardet/utf8prober.rb
CHANGED
data/lib/rchardet.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rchardet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 3
|
9
|
+
- 1
|
10
|
+
version: 1.3.1
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Jeff Hodges
|
@@ -9,8 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2012-12-02 00:00:00 Z
|
14
19
|
dependencies: []
|
15
20
|
|
16
21
|
description:
|
@@ -60,7 +65,6 @@ files:
|
|
60
65
|
- lib/rchardet.rb
|
61
66
|
- README
|
62
67
|
- COPYING
|
63
|
-
has_rdoc: true
|
64
68
|
homepage: http://github.com/jmhodges/rchardet/tree/master
|
65
69
|
licenses: []
|
66
70
|
|
@@ -70,21 +74,27 @@ rdoc_options: []
|
|
70
74
|
require_paths:
|
71
75
|
- lib
|
72
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
73
78
|
requirements:
|
74
79
|
- - ">="
|
75
80
|
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
82
|
+
segments:
|
83
|
+
- 0
|
76
84
|
version: "0"
|
77
|
-
version:
|
78
85
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
79
87
|
requirements:
|
80
88
|
- - ">="
|
81
89
|
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
91
|
+
segments:
|
92
|
+
- 0
|
82
93
|
version: "0"
|
83
|
-
version:
|
84
94
|
requirements: []
|
85
95
|
|
86
96
|
rubyforge_project: rchardet
|
87
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.8.15
|
88
98
|
signing_key:
|
89
99
|
specification_version: 3
|
90
100
|
summary: Character encoding auto-detection in Ruby. As smart as your browser. Open source.
|