chardet2 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/CharDistributionAnalysis.rb +4 -2
- data/lib/EUCJPProber.rb +2 -2
- data/lib/HebrewProber.rb +2 -2
- data/lib/JapaneseContextAnalysis.rb +4 -5
- data/lib/Latin1Prober.rb +2 -2
- data/lib/MultiByteCharSetProber.rb +2 -2
- data/lib/SJISProber.rb +2 -2
- data/lib/SingleByteCharSetProber.rb +2 -2
- data/lib/UTF8Prober.rb +2 -2
- data/lib/UniversalDetector.rb +4 -3
- data/lib/shim.rb +37 -0
- metadata +43 -21
@@ -58,7 +58,9 @@ module UniversalDetector
|
|
58
58
|
#"""feed a character with known length"""
|
59
59
|
if aCharLen == 2
|
60
60
|
# we only care about 2-bytes character in our distribution analysis
|
61
|
-
|
61
|
+
b1 = aStr.get_byte(0)
|
62
|
+
b2 = aStr.get_byte(1)
|
63
|
+
order = get_order([b1, b2])
|
62
64
|
else
|
63
65
|
order = -1
|
64
66
|
end
|
@@ -242,4 +244,4 @@ module UniversalDetector
|
|
242
244
|
end
|
243
245
|
end
|
244
246
|
|
245
|
-
end
|
247
|
+
end
|
data/lib/EUCJPProber.rb
CHANGED
@@ -55,7 +55,7 @@ module UniversalDetector
|
|
55
55
|
def feed(aBuf)
|
56
56
|
aLen = aBuf.length
|
57
57
|
for i in 0...aLen
|
58
|
-
codingState = @_mCodingSM.next_state(aBuf
|
58
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
59
59
|
if codingState == :Error
|
60
60
|
if DEBUG
|
61
61
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -94,4 +94,4 @@ module UniversalDetector
|
|
94
94
|
return [contxtCf, distribCf].max
|
95
95
|
end
|
96
96
|
end
|
97
|
-
end
|
97
|
+
end
|
data/lib/HebrewProber.rb
CHANGED
@@ -224,7 +224,7 @@ module UniversalDetector
|
|
224
224
|
|
225
225
|
aBuf = filter_high_bit_only(aBuf)
|
226
226
|
|
227
|
-
|
227
|
+
aBuf.each_char do |cur|
|
228
228
|
if cur == ' '
|
229
229
|
# We stand on a space - a word just ended
|
230
230
|
if @_mBeforePrev != ' '
|
@@ -289,4 +289,4 @@ module UniversalDetector
|
|
289
289
|
return :Detecting
|
290
290
|
end
|
291
291
|
end
|
292
|
-
end
|
292
|
+
end
|
@@ -145,7 +145,7 @@ module UniversalDetector
|
|
145
145
|
# this character will simply our logic and improve performance.
|
146
146
|
i = @_mNeedToSkipCharNum
|
147
147
|
while i < aLen
|
148
|
-
order, charLen = get_order(aBuf[i..i+2])
|
148
|
+
order, charLen = get_order(aBuf[i..i+2].to_bytes)
|
149
149
|
i += charLen
|
150
150
|
if i > aLen
|
151
151
|
@_mNeedToSkipCharNum = i - aLen
|
@@ -195,10 +195,10 @@ module UniversalDetector
|
|
195
195
|
|
196
196
|
# return its order if it is hiragana
|
197
197
|
if aStr.length > 1
|
198
|
-
if (aStr[0] ==
|
198
|
+
if (aStr[0] == 0x82) and \
|
199
199
|
(aStr[1] >= 0x9F) and \
|
200
200
|
(aStr[1] <= 0xF1)
|
201
|
-
return
|
201
|
+
return aStr[1] - 0x9F, charLen
|
202
202
|
end
|
203
203
|
end
|
204
204
|
|
@@ -210,7 +210,6 @@ module UniversalDetector
|
|
210
210
|
def get_order(aStr)
|
211
211
|
unless aStr then return -1, 1 end
|
212
212
|
# find out current char's byte length
|
213
|
-
aStr = aStr.to_s
|
214
213
|
if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
|
215
214
|
charLen = 2
|
216
215
|
elsif aStr[0] == 0x8F
|
@@ -224,7 +223,7 @@ module UniversalDetector
|
|
224
223
|
if (aStr[0] == 0xA4) and \
|
225
224
|
(aStr[1] >= 0xA1) and \
|
226
225
|
(aStr[1] <= 0xF3)
|
227
|
-
return aStr[1]
|
226
|
+
return aStr[1] - 0xA1, charLen
|
228
227
|
end
|
229
228
|
end
|
230
229
|
|
data/lib/Latin1Prober.rb
CHANGED
@@ -118,8 +118,8 @@ module UniversalDetector
|
|
118
118
|
|
119
119
|
def feed(aBuf)
|
120
120
|
aBuf = filter_with_english_letters(aBuf)
|
121
|
-
|
122
|
-
charClass = Latin1_CharToClass[c
|
121
|
+
aBuf.each_char do |c|
|
122
|
+
charClass = Latin1_CharToClass[c.get_byte(0)]
|
123
123
|
freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
|
124
124
|
if freq == 0
|
125
125
|
@_mState = :NotMe
|
@@ -56,7 +56,7 @@ module UniversalDetector
|
|
56
56
|
def feed(aBuf)
|
57
57
|
aLen = aBuf.length
|
58
58
|
for i in 0...aLen
|
59
|
-
codingState = @_mCodingSM.next_state(aBuf
|
59
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
60
60
|
if codingState == :Error
|
61
61
|
if UniversalDetector::DEBUG
|
62
62
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -91,4 +91,4 @@ module UniversalDetector
|
|
91
91
|
return @_mDistributionAnalyzer.get_confidence()
|
92
92
|
end
|
93
93
|
end
|
94
|
-
end
|
94
|
+
end
|
data/lib/SJISProber.rb
CHANGED
@@ -55,7 +55,7 @@ module UniversalDetector
|
|
55
55
|
def feed(aBuf)
|
56
56
|
aLen = aBuf.length
|
57
57
|
for i in 0...aLen
|
58
|
-
codingState = @_mCodingSM.next_state(aBuf
|
58
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
59
59
|
if codingState == :Error
|
60
60
|
if DEBUG
|
61
61
|
p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
|
@@ -96,4 +96,4 @@ module UniversalDetector
|
|
96
96
|
return [contxtCf, distribCf].max
|
97
97
|
end
|
98
98
|
end
|
99
|
-
end
|
99
|
+
end
|
@@ -75,7 +75,7 @@ module UniversalDetector
|
|
75
75
|
end
|
76
76
|
|
77
77
|
for i in 0...aLen
|
78
|
-
c = aBuf
|
78
|
+
c = aBuf.get_byte(i)
|
79
79
|
order = @_mModel['charToOrderMap'][c]
|
80
80
|
if order < SYMBOL_CAT_ORDER
|
81
81
|
@_mTotalChar += 1
|
@@ -128,4 +128,4 @@ module UniversalDetector
|
|
128
128
|
return r
|
129
129
|
end
|
130
130
|
end
|
131
|
-
end
|
131
|
+
end
|
data/lib/UTF8Prober.rb
CHANGED
@@ -52,9 +52,9 @@ module UniversalDetector
|
|
52
52
|
end
|
53
53
|
|
54
54
|
def feed(aBuf)
|
55
|
-
aLen = aBuf.length
|
55
|
+
aLen = aBuf.length
|
56
56
|
for i in 0...aLen
|
57
|
-
codingState = @_mCodingSM.next_state(aBuf
|
57
|
+
codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
|
58
58
|
if codingState == :Error
|
59
59
|
@_mState = :NotMe
|
60
60
|
break
|
data/lib/UniversalDetector.rb
CHANGED
@@ -30,15 +30,16 @@ require "EscCharSetProber"
|
|
30
30
|
require "MBCSGroupProber"
|
31
31
|
require "SBCSGroupProber"
|
32
32
|
require "Latin1Prober"
|
33
|
+
require "shim"
|
33
34
|
require "singleton"
|
34
35
|
|
35
|
-
module UniversalDetector
|
36
|
-
|
36
|
+
module UniversalDetector
|
37
|
+
|
37
38
|
class << self
|
38
39
|
def encoding(data)
|
39
40
|
chardet(data)['encoding']
|
40
41
|
end
|
41
|
-
|
42
|
+
|
42
43
|
def chardet(data)
|
43
44
|
u = UniversalDetector::Detector.instance
|
44
45
|
u.reset()
|
data/lib/shim.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module UniversalDetector
|
2
|
+
|
3
|
+
def self.is18?
|
4
|
+
RUBY_VERSION =~ /^1\.8/
|
5
|
+
end
|
6
|
+
|
7
|
+
end
|
8
|
+
|
9
|
+
class String
|
10
|
+
|
11
|
+
if UniversalDetector.is18?
|
12
|
+
alias :get_byte :[]
|
13
|
+
else
|
14
|
+
def get_byte(i)
|
15
|
+
self[i].ord
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_bytes
|
20
|
+
bytes.to_a
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
class Array
|
26
|
+
|
27
|
+
def get_byte(i)
|
28
|
+
v = self[i]
|
29
|
+
v = v.bytes.to_a.first if v.is_a?(String)
|
30
|
+
v
|
31
|
+
end
|
32
|
+
|
33
|
+
def to_bytes
|
34
|
+
map {|v| v.is_a?(String) ? v.get_byte(0) : v}
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
metadata
CHANGED
@@ -1,31 +1,43 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: chardet2
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 1.0.1
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Jan Xie
|
9
14
|
- Felipe Tanus
|
10
15
|
- Hui
|
11
16
|
autorequire: UniversalDetector
|
12
17
|
bindir: bin
|
13
18
|
cert_chain: []
|
14
|
-
|
19
|
+
|
20
|
+
date: 2013-05-17 00:00:00 +08:00
|
21
|
+
default_executable:
|
15
22
|
dependencies: []
|
23
|
+
|
16
24
|
description:
|
17
|
-
email:
|
25
|
+
email:
|
18
26
|
- jan.h.xie@gmail.com
|
19
27
|
executables: []
|
28
|
+
|
20
29
|
extensions: []
|
30
|
+
|
21
31
|
extra_rdoc_files: []
|
22
|
-
|
32
|
+
|
33
|
+
files:
|
23
34
|
- lib/MBCSSM.rb
|
24
35
|
- lib/MultiByteCharSetProber.rb
|
25
36
|
- lib/JapaneseContextAnalysis.rb
|
26
37
|
- lib/LangCyrillicModel.rb
|
27
38
|
- lib/EUCKRFreq.rb
|
28
39
|
- lib/GB2312Freq.rb
|
40
|
+
- lib/shim.rb
|
29
41
|
- lib/EUCKRProber.rb
|
30
42
|
- lib/CodingStateMachine.rb
|
31
43
|
- lib/LangHungarianModel.rb
|
@@ -55,29 +67,39 @@ files:
|
|
55
67
|
- lib/CharSetProber.rb
|
56
68
|
- COPYING
|
57
69
|
- README.markdown
|
70
|
+
has_rdoc: true
|
58
71
|
homepage: https://github.com/janx/chardet
|
59
72
|
licenses: []
|
73
|
+
|
60
74
|
post_install_message:
|
61
75
|
rdoc_options: []
|
62
|
-
|
76
|
+
|
77
|
+
require_paths:
|
63
78
|
- lib
|
64
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
80
|
none: false
|
66
|
-
requirements:
|
67
|
-
- -
|
68
|
-
- !ruby/object:Gem::Version
|
69
|
-
|
70
|
-
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
89
|
none: false
|
72
|
-
requirements:
|
73
|
-
- -
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
hash: 3
|
94
|
+
segments:
|
95
|
+
- 0
|
96
|
+
version: "0"
|
76
97
|
requirements: []
|
98
|
+
|
77
99
|
rubyforge_project:
|
78
|
-
rubygems_version: 1.
|
100
|
+
rubygems_version: 1.6.2
|
79
101
|
signing_key:
|
80
102
|
specification_version: 3
|
81
|
-
summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base
|
82
|
-
on Mark Pilgrim's Python port and Hui's ruby port.
|
103
|
+
summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base on Mark Pilgrim's Python port and Hui's ruby port.
|
83
104
|
test_files: []
|
105
|
+
|