chardet2 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/CharDistributionAnalysis.rb +4 -2
- data/lib/EUCJPProber.rb +2 -2
- data/lib/HebrewProber.rb +2 -2
- data/lib/JapaneseContextAnalysis.rb +4 -5
- data/lib/Latin1Prober.rb +2 -2
- data/lib/MultiByteCharSetProber.rb +2 -2
- data/lib/SJISProber.rb +2 -2
- data/lib/SingleByteCharSetProber.rb +2 -2
- data/lib/UTF8Prober.rb +2 -2
- data/lib/UniversalDetector.rb +4 -3
- data/lib/shim.rb +37 -0
- metadata +43 -21
@@ -58,7 +58,9 @@ module UniversalDetector
|
|
58
58
|
#"""feed a character with known length"""
|
59
59
|
if aCharLen == 2
|
60
60
|
# we only care about 2-bytes character in our distribution analysis
|
61
|
-
|
61
|
+
b1 = aStr.get_byte(0)
|
62
|
+
b2 = aStr.get_byte(1)
|
63
|
+
order = get_order([b1, b2])
|
62
64
|
else
|
63
65
|
order = -1
|
64
66
|
end
|
@@ -242,4 +244,4 @@ module UniversalDetector
|
|
242
244
|
end
|
243
245
|
end
|
244
246
|
|
245
|
-
end
|
247
|
+
end
|
data/lib/EUCJPProber.rb
CHANGED
@@ -55,7 +55,7 @@ module UniversalDetector
|
|
55
55
|
def feed(aBuf)
|
56
56
|
aLen = aBuf.length
|
57
57
|
for i in 0...aLen
|
58
|
-
codingState = @_mCodingSM.next_state(aBuf
|
58
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
59
59
|
if codingState == :Error
|
60
60
|
if DEBUG
|
61
61
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -94,4 +94,4 @@ module UniversalDetector
|
|
94
94
|
return [contxtCf, distribCf].max
|
95
95
|
end
|
96
96
|
end
|
97
|
-
end
|
97
|
+
end
|
data/lib/HebrewProber.rb
CHANGED
@@ -224,7 +224,7 @@ module UniversalDetector
|
|
224
224
|
|
225
225
|
aBuf = filter_high_bit_only(aBuf)
|
226
226
|
|
227
|
-
|
227
|
+
aBuf.each_char do |cur|
|
228
228
|
if cur == ' '
|
229
229
|
# We stand on a space - a word just ended
|
230
230
|
if @_mBeforePrev != ' '
|
@@ -289,4 +289,4 @@ module UniversalDetector
|
|
289
289
|
return :Detecting
|
290
290
|
end
|
291
291
|
end
|
292
|
-
end
|
292
|
+
end
|
@@ -145,7 +145,7 @@ module UniversalDetector
|
|
145
145
|
# this character will simply our logic and improve performance.
|
146
146
|
i = @_mNeedToSkipCharNum
|
147
147
|
while i < aLen
|
148
|
-
order, charLen = get_order(aBuf[i..i+2])
|
148
|
+
order, charLen = get_order(aBuf[i..i+2].to_bytes)
|
149
149
|
i += charLen
|
150
150
|
if i > aLen
|
151
151
|
@_mNeedToSkipCharNum = i - aLen
|
@@ -195,10 +195,10 @@ module UniversalDetector
|
|
195
195
|
|
196
196
|
# return its order if it is hiragana
|
197
197
|
if aStr.length > 1
|
198
|
-
if (aStr[0] ==
|
198
|
+
if (aStr[0] == 0x82) and \
|
199
199
|
(aStr[1] >= 0x9F) and \
|
200
200
|
(aStr[1] <= 0xF1)
|
201
|
-
return
|
201
|
+
return aStr[1] - 0x9F, charLen
|
202
202
|
end
|
203
203
|
end
|
204
204
|
|
@@ -210,7 +210,6 @@ module UniversalDetector
|
|
210
210
|
def get_order(aStr)
|
211
211
|
unless aStr then return -1, 1 end
|
212
212
|
# find out current char's byte length
|
213
|
-
aStr = aStr.to_s
|
214
213
|
if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
|
215
214
|
charLen = 2
|
216
215
|
elsif aStr[0] == 0x8F
|
@@ -224,7 +223,7 @@ module UniversalDetector
|
|
224
223
|
if (aStr[0] == 0xA4) and \
|
225
224
|
(aStr[1] >= 0xA1) and \
|
226
225
|
(aStr[1] <= 0xF3)
|
227
|
-
return aStr[1]
|
226
|
+
return aStr[1] - 0xA1, charLen
|
228
227
|
end
|
229
228
|
end
|
230
229
|
|
data/lib/Latin1Prober.rb
CHANGED
@@ -118,8 +118,8 @@ module UniversalDetector
|
|
118
118
|
|
119
119
|
def feed(aBuf)
|
120
120
|
aBuf = filter_with_english_letters(aBuf)
|
121
|
-
|
122
|
-
charClass = Latin1_CharToClass[c
|
121
|
+
aBuf.each_char do |c|
|
122
|
+
charClass = Latin1_CharToClass[c.get_byte(0)]
|
123
123
|
freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
|
124
124
|
if freq == 0
|
125
125
|
@_mState = :NotMe
|
@@ -56,7 +56,7 @@ module UniversalDetector
|
|
56
56
|
def feed(aBuf)
|
57
57
|
aLen = aBuf.length
|
58
58
|
for i in 0...aLen
|
59
|
-
codingState = @_mCodingSM.next_state(aBuf
|
59
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
60
60
|
if codingState == :Error
|
61
61
|
if UniversalDetector::DEBUG
|
62
62
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -91,4 +91,4 @@ module UniversalDetector
|
|
91
91
|
return @_mDistributionAnalyzer.get_confidence()
|
92
92
|
end
|
93
93
|
end
|
94
|
-
end
|
94
|
+
end
|
data/lib/SJISProber.rb
CHANGED
@@ -55,7 +55,7 @@ module UniversalDetector
|
|
55
55
|
def feed(aBuf)
|
56
56
|
aLen = aBuf.length
|
57
57
|
for i in 0...aLen
|
58
|
-
codingState = @_mCodingSM.next_state(aBuf
|
58
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
59
59
|
if codingState == :Error
|
60
60
|
if DEBUG
|
61
61
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -96,4 +96,4 @@ module UniversalDetector
|
|
96
96
|
return [contxtCf, distribCf].max
|
97
97
|
end
|
98
98
|
end
|
99
|
-
end
|
99
|
+
end
|
@@ -75,7 +75,7 @@ module UniversalDetector
|
|
75
75
|
end
|
76
76
|
|
77
77
|
for i in 0...aLen
|
78
|
-
c = aBuf
|
78
|
+
c = aBuf.get_byte(i)
|
79
79
|
order = @_mModel['charToOrderMap'][c]
|
80
80
|
if order < SYMBOL_CAT_ORDER
|
81
81
|
@_mTotalChar += 1
|
@@ -128,4 +128,4 @@ module UniversalDetector
|
|
128
128
|
return r
|
129
129
|
end
|
130
130
|
end
|
131
|
-
end
|
131
|
+
end
|
data/lib/UTF8Prober.rb
CHANGED
@@ -52,9 +52,9 @@ module UniversalDetector
|
|
52
52
|
end
|
53
53
|
|
54
54
|
def feed(aBuf)
|
55
|
-
aLen = aBuf.length
|
55
|
+
aLen = aBuf.length
|
56
56
|
for i in 0...aLen
|
57
|
-
codingState = @_mCodingSM.next_state(aBuf
|
57
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
58
58
|
if codingState == :Error
|
59
59
|
@_mState = :NotMe
|
60
60
|
break
|
data/lib/UniversalDetector.rb
CHANGED
@@ -30,15 +30,16 @@ require "EscCharSetProber"
|
|
30
30
|
require "MBCSGroupProber"
|
31
31
|
require "SBCSGroupProber"
|
32
32
|
require "Latin1Prober"
|
33
|
+
require "shim"
|
33
34
|
require "singleton"
|
34
35
|
|
35
|
-
module UniversalDetector
|
36
|
-
|
36
|
+
module UniversalDetector
|
37
|
+
|
37
38
|
class << self
|
38
39
|
def encoding(data)
|
39
40
|
chardet(data)['encoding']
|
40
41
|
end
|
41
|
-
|
42
|
+
|
42
43
|
def chardet(data)
|
43
44
|
u = UniversalDetector::Detector.instance
|
44
45
|
u.reset()
|
data/lib/shim.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module UniversalDetector
|
2
|
+
|
3
|
+
def self.is18?
|
4
|
+
RUBY_VERSION =~ /^1\.8/
|
5
|
+
end
|
6
|
+
|
7
|
+
end
|
8
|
+
|
9
|
+
class String
|
10
|
+
|
11
|
+
if UniversalDetector.is18?
|
12
|
+
alias :get_byte :[]
|
13
|
+
else
|
14
|
+
def get_byte(i)
|
15
|
+
self[i].ord
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_bytes
|
20
|
+
bytes.to_a
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
class Array
|
26
|
+
|
27
|
+
def get_byte(i)
|
28
|
+
v = self[i]
|
29
|
+
v = v.bytes.to_a.first if v.is_a?(String)
|
30
|
+
v
|
31
|
+
end
|
32
|
+
|
33
|
+
def to_bytes
|
34
|
+
map {|v| v.is_a?(String) ? v.get_byte(0) : v}
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
metadata
CHANGED
@@ -1,31 +1,43 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: chardet2
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 1.0.1
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Jan Xie
|
9
14
|
- Felipe Tanus
|
10
15
|
- Hui
|
11
16
|
autorequire: UniversalDetector
|
12
17
|
bindir: bin
|
13
18
|
cert_chain: []
|
14
|
-
|
19
|
+
|
20
|
+
date: 2013-05-17 00:00:00 +08:00
|
21
|
+
default_executable:
|
15
22
|
dependencies: []
|
23
|
+
|
16
24
|
description:
|
17
|
-
email:
|
25
|
+
email:
|
18
26
|
- jan.h.xie@gmail.com
|
19
27
|
executables: []
|
28
|
+
|
20
29
|
extensions: []
|
30
|
+
|
21
31
|
extra_rdoc_files: []
|
22
|
-
|
32
|
+
|
33
|
+
files:
|
23
34
|
- lib/MBCSSM.rb
|
24
35
|
- lib/MultiByteCharSetProber.rb
|
25
36
|
- lib/JapaneseContextAnalysis.rb
|
26
37
|
- lib/LangCyrillicModel.rb
|
27
38
|
- lib/EUCKRFreq.rb
|
28
39
|
- lib/GB2312Freq.rb
|
40
|
+
- lib/shim.rb
|
29
41
|
- lib/EUCKRProber.rb
|
30
42
|
- lib/CodingStateMachine.rb
|
31
43
|
- lib/LangHungarianModel.rb
|
@@ -55,29 +67,39 @@ files:
|
|
55
67
|
- lib/CharSetProber.rb
|
56
68
|
- COPYING
|
57
69
|
- README.markdown
|
70
|
+
has_rdoc: true
|
58
71
|
homepage: https://github.com/janx/chardet
|
59
72
|
licenses: []
|
73
|
+
|
60
74
|
post_install_message:
|
61
75
|
rdoc_options: []
|
62
|
-
|
76
|
+
|
77
|
+
require_paths:
|
63
78
|
- lib
|
64
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
80
|
none: false
|
66
|
-
requirements:
|
67
|
-
- -
|
68
|
-
- !ruby/object:Gem::Version
|
69
|
-
|
70
|
-
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
89
|
none: false
|
72
|
-
requirements:
|
73
|
-
- -
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
hash: 3
|
94
|
+
segments:
|
95
|
+
- 0
|
96
|
+
version: "0"
|
76
97
|
requirements: []
|
98
|
+
|
77
99
|
rubyforge_project:
|
78
|
-
rubygems_version: 1.
|
100
|
+
rubygems_version: 1.6.2
|
79
101
|
signing_key:
|
80
102
|
specification_version: 3
|
81
|
-
summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base
|
82
|
-
on Mark Pilgrim's Python port and Hui's ruby port.
|
103
|
+
summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base on Mark Pilgrim's Python port and Hui's ruby port.
|
83
104
|
test_files: []
|
105
|
+
|