chardet2 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -58,7 +58,9 @@ module UniversalDetector
58
58
  #"""feed a character with known length"""
59
59
  if aCharLen == 2
60
60
  # we only care about 2-bytes character in our distribution analysis
61
- order = get_order(aStr)
61
+ b1 = aStr.get_byte(0)
62
+ b2 = aStr.get_byte(1)
63
+ order = get_order([b1, b2])
62
64
  else
63
65
  order = -1
64
66
  end
@@ -242,4 +244,4 @@ module UniversalDetector
242
244
  end
243
245
  end
244
246
 
245
- end
247
+ end
@@ -55,7 +55,7 @@ module UniversalDetector
55
55
  def feed(aBuf)
56
56
  aLen = aBuf.length
57
57
  for i in 0...aLen
58
- codingState = @_mCodingSM.next_state(aBuf[i])
58
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
59
59
  if codingState == :Error
60
60
  if DEBUG
61
61
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -94,4 +94,4 @@ module UniversalDetector
94
94
  return [contxtCf, distribCf].max
95
95
  end
96
96
  end
97
- end
97
+ end
@@ -224,7 +224,7 @@ module UniversalDetector
224
224
 
225
225
  aBuf = filter_high_bit_only(aBuf)
226
226
 
227
- for cur in aBuf
227
+ aBuf.each_char do |cur|
228
228
  if cur == ' '
229
229
  # We stand on a space - a word just ended
230
230
  if @_mBeforePrev != ' '
@@ -289,4 +289,4 @@ module UniversalDetector
289
289
  return :Detecting
290
290
  end
291
291
  end
292
- end
292
+ end
@@ -145,7 +145,7 @@ module UniversalDetector
145
145
  # this character will simply our logic and improve performance.
146
146
  i = @_mNeedToSkipCharNum
147
147
  while i < aLen
148
- order, charLen = get_order(aBuf[i..i+2])
148
+ order, charLen = get_order(aBuf[i..i+2].to_bytes)
149
149
  i += charLen
150
150
  if i > aLen
151
151
  @_mNeedToSkipCharNum = i - aLen
@@ -195,10 +195,10 @@ module UniversalDetector
195
195
 
196
196
  # return its order if it is hiragana
197
197
  if aStr.length > 1
198
- if (aStr[0] == '\202') and \
198
+ if (aStr[0] == 0x82) and \
199
199
  (aStr[1] >= 0x9F) and \
200
200
  (aStr[1] <= 0xF1)
201
- return ord(aStr[1]) - 0x9F, charLen
201
+ return aStr[1] - 0x9F, charLen
202
202
  end
203
203
  end
204
204
 
@@ -210,7 +210,6 @@ module UniversalDetector
210
210
  def get_order(aStr)
211
211
  unless aStr then return -1, 1 end
212
212
  # find out current char's byte length
213
- aStr = aStr.to_s
214
213
  if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
215
214
  charLen = 2
216
215
  elsif aStr[0] == 0x8F
@@ -224,7 +223,7 @@ module UniversalDetector
224
223
  if (aStr[0] == 0xA4) and \
225
224
  (aStr[1] >= 0xA1) and \
226
225
  (aStr[1] <= 0xF3)
227
- return aStr[1][0] - 0xA1, charLen
226
+ return aStr[1] - 0xA1, charLen
228
227
  end
229
228
  end
230
229
 
@@ -118,8 +118,8 @@ module UniversalDetector
118
118
 
119
119
  def feed(aBuf)
120
120
  aBuf = filter_with_english_letters(aBuf)
121
- for c in aBuf
122
- charClass = Latin1_CharToClass[c[0]]
121
+ aBuf.each_char do |c|
122
+ charClass = Latin1_CharToClass[c.get_byte(0)]
123
123
  freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
124
124
  if freq == 0
125
125
  @_mState = :NotMe
@@ -56,7 +56,7 @@ module UniversalDetector
56
56
  def feed(aBuf)
57
57
  aLen = aBuf.length
58
58
  for i in 0...aLen
59
- codingState = @_mCodingSM.next_state(aBuf[i])
59
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
60
60
  if codingState == :Error
61
61
  if UniversalDetector::DEBUG
62
62
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -91,4 +91,4 @@ module UniversalDetector
91
91
  return @_mDistributionAnalyzer.get_confidence()
92
92
  end
93
93
  end
94
- end
94
+ end
@@ -55,7 +55,7 @@ module UniversalDetector
55
55
  def feed(aBuf)
56
56
  aLen = aBuf.length
57
57
  for i in 0...aLen
58
- codingState = @_mCodingSM.next_state(aBuf[i])
58
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
59
59
  if codingState == :Error
60
60
  if DEBUG
61
61
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -96,4 +96,4 @@ module UniversalDetector
96
96
  return [contxtCf, distribCf].max
97
97
  end
98
98
  end
99
- end
99
+ end
@@ -75,7 +75,7 @@ module UniversalDetector
75
75
  end
76
76
 
77
77
  for i in 0...aLen
78
- c = aBuf[i]
78
+ c = aBuf.get_byte(i)
79
79
  order = @_mModel['charToOrderMap'][c]
80
80
  if order < SYMBOL_CAT_ORDER
81
81
  @_mTotalChar += 1
@@ -128,4 +128,4 @@ module UniversalDetector
128
128
  return r
129
129
  end
130
130
  end
131
- end
131
+ end
@@ -52,9 +52,9 @@ module UniversalDetector
52
52
  end
53
53
 
54
54
  def feed(aBuf)
55
- aLen = aBuf.length
55
+ aLen = aBuf.length
56
56
  for i in 0...aLen
57
- codingState = @_mCodingSM.next_state(aBuf[i])
57
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
58
58
  if codingState == :Error
59
59
  @_mState = :NotMe
60
60
  break
@@ -30,15 +30,16 @@ require "EscCharSetProber"
30
30
  require "MBCSGroupProber"
31
31
  require "SBCSGroupProber"
32
32
  require "Latin1Prober"
33
+ require "shim"
33
34
  require "singleton"
34
35
 
35
- module UniversalDetector
36
-
36
+ module UniversalDetector
37
+
37
38
  class << self
38
39
  def encoding(data)
39
40
  chardet(data)['encoding']
40
41
  end
41
-
42
+
42
43
  def chardet(data)
43
44
  u = UniversalDetector::Detector.instance
44
45
  u.reset()
@@ -0,0 +1,37 @@
1
+ module UniversalDetector
2
+
3
+ def self.is18?
4
+ RUBY_VERSION =~ /^1\.8/
5
+ end
6
+
7
+ end
8
+
9
+ class String
10
+
11
+ if UniversalDetector.is18?
12
+ alias :get_byte :[]
13
+ else
14
+ def get_byte(i)
15
+ self[i].ord
16
+ end
17
+ end
18
+
19
+ def to_bytes
20
+ bytes.to_a
21
+ end
22
+
23
+ end
24
+
25
+ class Array
26
+
27
+ def get_byte(i)
28
+ v = self[i]
29
+ v = v.bytes.to_a.first if v.is_a?(String)
30
+ v
31
+ end
32
+
33
+ def to_bytes
34
+ map {|v| v.is_a?(String) ? v.get_byte(0) : v}
35
+ end
36
+
37
+ end
metadata CHANGED
@@ -1,31 +1,43 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: chardet2
3
- version: !ruby/object:Gem::Version
4
- version: 1.0.0
3
+ version: !ruby/object:Gem::Version
4
+ hash: 21
5
5
  prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 1
10
+ version: 1.0.1
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Jan Xie
9
14
  - Felipe Tanus
10
15
  - Hui
11
16
  autorequire: UniversalDetector
12
17
  bindir: bin
13
18
  cert_chain: []
14
- date: 2013-05-17 00:00:00.000000000 Z
19
+
20
+ date: 2013-05-17 00:00:00 +08:00
21
+ default_executable:
15
22
  dependencies: []
23
+
16
24
  description:
17
- email:
25
+ email:
18
26
  - jan.h.xie@gmail.com
19
27
  executables: []
28
+
20
29
  extensions: []
30
+
21
31
  extra_rdoc_files: []
22
- files:
32
+
33
+ files:
23
34
  - lib/MBCSSM.rb
24
35
  - lib/MultiByteCharSetProber.rb
25
36
  - lib/JapaneseContextAnalysis.rb
26
37
  - lib/LangCyrillicModel.rb
27
38
  - lib/EUCKRFreq.rb
28
39
  - lib/GB2312Freq.rb
40
+ - lib/shim.rb
29
41
  - lib/EUCKRProber.rb
30
42
  - lib/CodingStateMachine.rb
31
43
  - lib/LangHungarianModel.rb
@@ -55,29 +67,39 @@ files:
55
67
  - lib/CharSetProber.rb
56
68
  - COPYING
57
69
  - README.markdown
70
+ has_rdoc: true
58
71
  homepage: https://github.com/janx/chardet
59
72
  licenses: []
73
+
60
74
  post_install_message:
61
75
  rdoc_options: []
62
- require_paths:
76
+
77
+ require_paths:
63
78
  - lib
64
- required_ruby_version: !ruby/object:Gem::Requirement
79
+ required_ruby_version: !ruby/object:Gem::Requirement
65
80
  none: false
66
- requirements:
67
- - - ! '>='
68
- - !ruby/object:Gem::Version
69
- version: '0'
70
- required_rubygems_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ hash: 3
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
89
  none: false
72
- requirements:
73
- - - ! '>='
74
- - !ruby/object:Gem::Version
75
- version: '0'
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ hash: 3
94
+ segments:
95
+ - 0
96
+ version: "0"
76
97
  requirements: []
98
+
77
99
  rubyforge_project:
78
- rubygems_version: 1.8.23
100
+ rubygems_version: 1.6.2
79
101
  signing_key:
80
102
  specification_version: 3
81
- summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base
82
- on Mark Pilgrim's Python port and Hui's ruby port.
103
+ summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base on Mark Pilgrim's Python port and Hui's ruby port.
83
104
  test_files: []
105
+