rchardet 1.7.0 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/rchardet.rb +2 -2
- data/lib/rchardet/chardistribution.rb +5 -5
- data/lib/rchardet/{gb2312freq.rb → gb18030freq.rb} +4 -4
- data/lib/rchardet/{gb2312prober.rb → gb18030prober.rb} +4 -4
- data/lib/rchardet/mbcsgroupprober.rb +1 -1
- data/lib/rchardet/mbcssm.rb +8 -8
- data/lib/rchardet/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b03009ca37e41a17ddc0f4f62cd5889b11b59b1f2100998f191a0be7ede5d000
|
4
|
+
data.tar.gz: ea8473dc5f1b7c4d24f858de90c591f1dec66055cc05b30b475e8256fa0c7a41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7829042566b227306274d03219456cda42b8cd3767a085bec2dd72e0d6d0d4fa78a165fd911768599c2c255f5879a7c25d2cde74bee2091a70e5fbf2ca77e586
|
7
|
+
data.tar.gz: 249b2027502f888713c57bcae850178adcdf60db9463a5357b396b3a2d9fd76a65d9a12247f8cddf8bf4e877d1c1a427c05b0c040c3f329b604106775af85f27
|
data/lib/rchardet.rb
CHANGED
@@ -33,8 +33,8 @@ require 'rchardet/euckrfreq'
|
|
33
33
|
require 'rchardet/euckrprober'
|
34
34
|
require 'rchardet/euctwfreq'
|
35
35
|
require 'rchardet/euctwprober'
|
36
|
-
require 'rchardet/
|
37
|
-
require 'rchardet/
|
36
|
+
require 'rchardet/gb18030freq'
|
37
|
+
require 'rchardet/gb18030prober'
|
38
38
|
require 'rchardet/hebrewprober'
|
39
39
|
require 'rchardet/jisfreq'
|
40
40
|
require 'rchardet/jpcntx'
|
@@ -149,16 +149,16 @@ module CharDet
|
|
149
149
|
end
|
150
150
|
end
|
151
151
|
|
152
|
-
class
|
152
|
+
class GB18030DistributionAnalysis < CharDistributionAnalysis
|
153
153
|
def initialize
|
154
154
|
super()
|
155
|
-
@charToFreqOrder =
|
156
|
-
@tableSize =
|
157
|
-
@typicalDistributionRatio =
|
155
|
+
@charToFreqOrder = GB18030CharToFreqOrder
|
156
|
+
@tableSize = GB18030_TABLE_SIZE
|
157
|
+
@typicalDistributionRatio = GB18030_TYPICAL_DISTRIBUTION_RATIO
|
158
158
|
end
|
159
159
|
|
160
160
|
def get_order(aStr)
|
161
|
-
# for
|
161
|
+
# for GB18030 encoding, we are interested
|
162
162
|
# first byte range: 0xb0 -- 0xfe
|
163
163
|
# second byte range: 0xa1 -- 0xfe
|
164
164
|
# no validation needed here. State machine has done that
|
@@ -26,7 +26,7 @@
|
|
26
26
|
# 02110-1301 USA
|
27
27
|
######################### END LICENSE BLOCK #########################
|
28
28
|
|
29
|
-
#
|
29
|
+
# GB18030 most frequently used character table
|
30
30
|
#
|
31
31
|
# Char to FreqOrder table , from hz6763
|
32
32
|
|
@@ -41,11 +41,11 @@
|
|
41
41
|
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
42
42
|
|
43
43
|
module CharDet
|
44
|
-
|
44
|
+
GB18030_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
45
45
|
|
46
|
-
|
46
|
+
GB18030_TABLE_SIZE = 3760
|
47
47
|
|
48
|
-
|
48
|
+
GB18030CharToFreqOrder = [
|
49
49
|
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
50
50
|
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
51
51
|
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
|
@@ -27,16 +27,16 @@
|
|
27
27
|
######################### END LICENSE BLOCK #########################
|
28
28
|
|
29
29
|
module CharDet
|
30
|
-
class
|
30
|
+
class GB18030Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@codingSM = CodingStateMachine.new(
|
34
|
-
@distributionAnalyzer =
|
33
|
+
@codingSM = CodingStateMachine.new(GB18030SMModel)
|
34
|
+
@distributionAnalyzer = GB18030DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
38
38
|
def get_charset_name
|
39
|
-
return "
|
39
|
+
return "GB18030"
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
data/lib/rchardet/mbcssm.rb
CHANGED
@@ -239,9 +239,9 @@ module CharDet
|
|
239
239
|
'name' => 'x-euc-tw'
|
240
240
|
}.freeze
|
241
241
|
|
242
|
-
#
|
242
|
+
# GB18030
|
243
243
|
|
244
|
-
|
244
|
+
GB18030_cls = [
|
245
245
|
1,1,1,1,1,1,1,1, # 00 - 07
|
246
246
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
247
247
|
1,1,1,1,1,1,1,1, # 10 - 17
|
@@ -276,7 +276,7 @@ module CharDet
|
|
276
276
|
6,6,6,6,6,6,6,0 # f8 - ff
|
277
277
|
].freeze
|
278
278
|
|
279
|
-
|
279
|
+
GB18030_st = [
|
280
280
|
EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
|
281
281
|
EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
|
282
282
|
EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
|
@@ -290,13 +290,13 @@ module CharDet
|
|
290
290
|
# it is used for frequency analysis only, and we are validating
|
291
291
|
# each code range there as well. So it is safe to set it to be
|
292
292
|
# 2 here.
|
293
|
-
|
293
|
+
GB18030CharLenTable = [0, 1, 1, 1, 1, 1, 2].freeze
|
294
294
|
|
295
|
-
|
295
|
+
GB18030SMModel = {'classTable' => GB18030_cls,
|
296
296
|
'classFactor' => 7,
|
297
|
-
'stateTable' =>
|
298
|
-
'charLenTable' =>
|
299
|
-
'name' => '
|
297
|
+
'stateTable' => GB18030_st,
|
298
|
+
'charLenTable' => GB18030CharLenTable,
|
299
|
+
'name' => 'GB18030'
|
300
300
|
}.freeze
|
301
301
|
|
302
302
|
# Shift_JIS
|
data/lib/rchardet/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rchardet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Grosser
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-01
|
12
|
+
date: 2018-06-01 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description:
|
15
15
|
email:
|
@@ -34,8 +34,8 @@ files:
|
|
34
34
|
- lib/rchardet/euckrprober.rb
|
35
35
|
- lib/rchardet/euctwfreq.rb
|
36
36
|
- lib/rchardet/euctwprober.rb
|
37
|
-
- lib/rchardet/
|
38
|
-
- lib/rchardet/
|
37
|
+
- lib/rchardet/gb18030freq.rb
|
38
|
+
- lib/rchardet/gb18030prober.rb
|
39
39
|
- lib/rchardet/hebrewprober.rb
|
40
40
|
- lib/rchardet/jisfreq.rb
|
41
41
|
- lib/rchardet/jpcntx.rb
|
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
75
|
version: '0'
|
76
76
|
requirements: []
|
77
77
|
rubyforge_project:
|
78
|
-
rubygems_version: 2.6
|
78
|
+
rubygems_version: 2.7.6
|
79
79
|
signing_key:
|
80
80
|
specification_version: 4
|
81
81
|
summary: Character encoding auto-detection in Ruby. As smart as your browser. Open
|