rchardet 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/rchardet.rb +2 -2
- data/lib/rchardet/chardistribution.rb +5 -5
- data/lib/rchardet/{gb2312freq.rb → gb18030freq.rb} +4 -4
- data/lib/rchardet/{gb2312prober.rb → gb18030prober.rb} +4 -4
- data/lib/rchardet/mbcsgroupprober.rb +1 -1
- data/lib/rchardet/mbcssm.rb +8 -8
- data/lib/rchardet/version.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b03009ca37e41a17ddc0f4f62cd5889b11b59b1f2100998f191a0be7ede5d000
|
4
|
+
data.tar.gz: ea8473dc5f1b7c4d24f858de90c591f1dec66055cc05b30b475e8256fa0c7a41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7829042566b227306274d03219456cda42b8cd3767a085bec2dd72e0d6d0d4fa78a165fd911768599c2c255f5879a7c25d2cde74bee2091a70e5fbf2ca77e586
|
7
|
+
data.tar.gz: 249b2027502f888713c57bcae850178adcdf60db9463a5357b396b3a2d9fd76a65d9a12247f8cddf8bf4e877d1c1a427c05b0c040c3f329b604106775af85f27
|
data/lib/rchardet.rb
CHANGED
@@ -33,8 +33,8 @@ require 'rchardet/euckrfreq'
|
|
33
33
|
require 'rchardet/euckrprober'
|
34
34
|
require 'rchardet/euctwfreq'
|
35
35
|
require 'rchardet/euctwprober'
|
36
|
-
require 'rchardet/gb2312freq'
|
37
|
-
require 'rchardet/gb2312prober'
|
36
|
+
require 'rchardet/gb18030freq'
|
37
|
+
require 'rchardet/gb18030prober'
|
38
38
|
require 'rchardet/hebrewprober'
|
39
39
|
require 'rchardet/jisfreq'
|
40
40
|
require 'rchardet/jpcntx'
|
@@ -149,16 +149,16 @@ module CharDet
|
|
149
149
|
end
|
150
150
|
end
|
151
151
|
|
152
|
-
class GB2312DistributionAnalysis < CharDistributionAnalysis
|
152
|
+
class GB18030DistributionAnalysis < CharDistributionAnalysis
|
153
153
|
def initialize
|
154
154
|
super()
|
155
|
-
@charToFreqOrder = GB2312CharToFreqOrder
|
156
|
-
@tableSize = GB2312_TABLE_SIZE
|
157
|
-
@typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
155
|
+
@charToFreqOrder = GB18030CharToFreqOrder
|
156
|
+
@tableSize = GB18030_TABLE_SIZE
|
157
|
+
@typicalDistributionRatio = GB18030_TYPICAL_DISTRIBUTION_RATIO
|
158
158
|
end
|
159
159
|
|
160
160
|
def get_order(aStr)
|
161
|
-
# for
|
161
|
+
# for GB18030 encoding, we are interested
|
162
162
|
# first byte range: 0xb0 -- 0xfe
|
163
163
|
# second byte range: 0xa1 -- 0xfe
|
164
164
|
# no validation needed here. State machine has done that
|
@@ -26,7 +26,7 @@
|
|
26
26
|
# 02110-1301 USA
|
27
27
|
######################### END LICENSE BLOCK #########################
|
28
28
|
|
29
|
-
#
|
29
|
+
# GB18030 most frequently used character table
|
30
30
|
#
|
31
31
|
# Char to FreqOrder table , from hz6763
|
32
32
|
|
@@ -41,11 +41,11 @@
|
|
41
41
|
# Typical Distribution Ratio about 25% of Ideal one, still much higher that RDR
|
42
42
|
|
43
43
|
module CharDet
|
44
|
-
|
44
|
+
GB18030_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
45
45
|
|
46
|
-
|
46
|
+
GB18030_TABLE_SIZE = 3760
|
47
47
|
|
48
|
-
|
48
|
+
GB18030CharToFreqOrder = [
|
49
49
|
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
50
50
|
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
51
51
|
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
|
@@ -27,16 +27,16 @@
|
|
27
27
|
######################### END LICENSE BLOCK #########################
|
28
28
|
|
29
29
|
module CharDet
|
30
|
-
class GB2312Prober < MultiByteCharSetProber
|
30
|
+
class GB18030Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@codingSM = CodingStateMachine.new(GB2312SMModel)
|
34
|
-
@distributionAnalyzer = GB2312DistributionAnalysis.new()
|
33
|
+
@codingSM = CodingStateMachine.new(GB18030SMModel)
|
34
|
+
@distributionAnalyzer = GB18030DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
38
38
|
def get_charset_name
|
39
|
-
return "GB2312"
|
39
|
+
return "GB18030"
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
data/lib/rchardet/mbcssm.rb
CHANGED
@@ -239,9 +239,9 @@ module CharDet
|
|
239
239
|
'name' => 'x-euc-tw'
|
240
240
|
}.freeze
|
241
241
|
|
242
|
-
#
|
242
|
+
# GB18030
|
243
243
|
|
244
|
-
|
244
|
+
GB18030_cls = [
|
245
245
|
1,1,1,1,1,1,1,1, # 00 - 07
|
246
246
|
1,1,1,1,1,1,0,0, # 08 - 0f
|
247
247
|
1,1,1,1,1,1,1,1, # 10 - 17
|
@@ -276,7 +276,7 @@ module CharDet
|
|
276
276
|
6,6,6,6,6,6,6,0 # f8 - ff
|
277
277
|
].freeze
|
278
278
|
|
279
|
-
|
279
|
+
GB18030_st = [
|
280
280
|
EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
|
281
281
|
EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
|
282
282
|
EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
|
@@ -290,13 +290,13 @@ module CharDet
|
|
290
290
|
# it is used for frequency analysis only, and we are validing
|
291
291
|
# each code range there as well. So it is safe to set it to be
|
292
292
|
# 2 here.
|
293
|
-
|
293
|
+
GB18030CharLenTable = [0, 1, 1, 1, 1, 1, 2].freeze
|
294
294
|
|
295
|
-
|
295
|
+
GB18030SMModel = {'classTable' => GB18030_cls,
|
296
296
|
'classFactor' => 7,
|
297
|
-
'stateTable' => GB2312_st,
|
298
|
-
'charLenTable' => GB2312CharLenTable,
|
299
|
-
'name' => 'GB2312'
|
297
|
+
'stateTable' => GB18030_st,
|
298
|
+
'charLenTable' => GB18030CharLenTable,
|
299
|
+
'name' => 'GB18030'
|
300
300
|
}.freeze
|
301
301
|
|
302
302
|
# Shift_JIS
|
data/lib/rchardet/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rchardet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.7.0
|
4
|
+
version: 1.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Grosser
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-01
|
12
|
+
date: 2018-06-01 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description:
|
15
15
|
email:
|
@@ -34,8 +34,8 @@ files:
|
|
34
34
|
- lib/rchardet/euckrprober.rb
|
35
35
|
- lib/rchardet/euctwfreq.rb
|
36
36
|
- lib/rchardet/euctwprober.rb
|
37
|
-
- lib/rchardet/gb2312freq.rb
|
38
|
-
- lib/rchardet/gb2312prober.rb
|
37
|
+
- lib/rchardet/gb18030freq.rb
|
38
|
+
- lib/rchardet/gb18030prober.rb
|
39
39
|
- lib/rchardet/hebrewprober.rb
|
40
40
|
- lib/rchardet/jisfreq.rb
|
41
41
|
- lib/rchardet/jpcntx.rb
|
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
75
|
version: '0'
|
76
76
|
requirements: []
|
77
77
|
rubyforge_project:
|
78
|
-
rubygems_version: 2.6
|
78
|
+
rubygems_version: 2.7.6
|
79
79
|
signing_key:
|
80
80
|
specification_version: 4
|
81
81
|
summary: Character encoding auto-detection in Ruby. As smart as your browser. Open
|