chardet2 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,7 +58,9 @@ module UniversalDetector
58
58
  #"""feed a character with known length"""
59
59
  if aCharLen == 2
60
60
  # we only care about 2-bytes character in our distribution analysis
61
- order = get_order(aStr)
61
+ b1 = aStr.get_byte(0)
62
+ b2 = aStr.get_byte(1)
63
+ order = get_order([b1, b2])
62
64
  else
63
65
  order = -1
64
66
  end
@@ -242,4 +244,4 @@ module UniversalDetector
242
244
  end
243
245
  end
244
246
 
245
- end
247
+ end
@@ -55,7 +55,7 @@ module UniversalDetector
55
55
  def feed(aBuf)
56
56
  aLen = aBuf.length
57
57
  for i in 0...aLen
58
- codingState = @_mCodingSM.next_state(aBuf[i])
58
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
59
59
  if codingState == :Error
60
60
  if DEBUG
61
61
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -94,4 +94,4 @@ module UniversalDetector
94
94
  return [contxtCf, distribCf].max
95
95
  end
96
96
  end
97
- end
97
+ end
@@ -224,7 +224,7 @@ module UniversalDetector
224
224
 
225
225
  aBuf = filter_high_bit_only(aBuf)
226
226
 
227
- for cur in aBuf
227
+ aBuf.each_char do |cur|
228
228
  if cur == ' '
229
229
  # We stand on a space - a word just ended
230
230
  if @_mBeforePrev != ' '
@@ -289,4 +289,4 @@ module UniversalDetector
289
289
  return :Detecting
290
290
  end
291
291
  end
292
- end
292
+ end
@@ -145,7 +145,7 @@ module UniversalDetector
145
145
  # this character will simply our logic and improve performance.
146
146
  i = @_mNeedToSkipCharNum
147
147
  while i < aLen
148
- order, charLen = get_order(aBuf[i..i+2])
148
+ order, charLen = get_order(aBuf[i..i+2].to_bytes)
149
149
  i += charLen
150
150
  if i > aLen
151
151
  @_mNeedToSkipCharNum = i - aLen
@@ -195,10 +195,10 @@ module UniversalDetector
195
195
 
196
196
  # return its order if it is hiragana
197
197
  if aStr.length > 1
198
- if (aStr[0] == '\202') and \
198
+ if (aStr[0] == 0x82) and \
199
199
  (aStr[1] >= 0x9F) and \
200
200
  (aStr[1] <= 0xF1)
201
- return ord(aStr[1]) - 0x9F, charLen
201
+ return aStr[1] - 0x9F, charLen
202
202
  end
203
203
  end
204
204
 
@@ -210,7 +210,6 @@ module UniversalDetector
210
210
  def get_order(aStr)
211
211
  unless aStr then return -1, 1 end
212
212
  # find out current char's byte length
213
- aStr = aStr.to_s
214
213
  if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
215
214
  charLen = 2
216
215
  elsif aStr[0] == 0x8F
@@ -224,7 +223,7 @@ module UniversalDetector
224
223
  if (aStr[0] == 0xA4) and \
225
224
  (aStr[1] >= 0xA1) and \
226
225
  (aStr[1] <= 0xF3)
227
- return aStr[1][0] - 0xA1, charLen
226
+ return aStr[1] - 0xA1, charLen
228
227
  end
229
228
  end
230
229
 
@@ -118,8 +118,8 @@ module UniversalDetector
118
118
 
119
119
  def feed(aBuf)
120
120
  aBuf = filter_with_english_letters(aBuf)
121
- for c in aBuf
122
- charClass = Latin1_CharToClass[c[0]]
121
+ aBuf.each_char do |c|
122
+ charClass = Latin1_CharToClass[c.get_byte(0)]
123
123
  freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
124
124
  if freq == 0
125
125
  @_mState = :NotMe
@@ -56,7 +56,7 @@ module UniversalDetector
56
56
  def feed(aBuf)
57
57
  aLen = aBuf.length
58
58
  for i in 0...aLen
59
- codingState = @_mCodingSM.next_state(aBuf[i])
59
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
60
60
  if codingState == :Error
61
61
  if UniversalDetector::DEBUG
62
62
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -91,4 +91,4 @@ module UniversalDetector
91
91
  return @_mDistributionAnalyzer.get_confidence()
92
92
  end
93
93
  end
94
- end
94
+ end
@@ -55,7 +55,7 @@ module UniversalDetector
55
55
  def feed(aBuf)
56
56
  aLen = aBuf.length
57
57
  for i in 0...aLen
58
- codingState = @_mCodingSM.next_state(aBuf[i])
58
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
59
59
  if codingState == :Error
60
60
  if DEBUG
61
61
  p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
@@ -96,4 +96,4 @@ module UniversalDetector
96
96
  return [contxtCf, distribCf].max
97
97
  end
98
98
  end
99
- end
99
+ end
@@ -75,7 +75,7 @@ module UniversalDetector
75
75
  end
76
76
 
77
77
  for i in 0...aLen
78
- c = aBuf[i]
78
+ c = aBuf.get_byte(i)
79
79
  order = @_mModel['charToOrderMap'][c]
80
80
  if order < SYMBOL_CAT_ORDER
81
81
  @_mTotalChar += 1
@@ -128,4 +128,4 @@ module UniversalDetector
128
128
  return r
129
129
  end
130
130
  end
131
- end
131
+ end
@@ -52,9 +52,9 @@ module UniversalDetector
52
52
  end
53
53
 
54
54
  def feed(aBuf)
55
- aLen = aBuf.length
55
+ aLen = aBuf.length
56
56
  for i in 0...aLen
57
- codingState = @_mCodingSM.next_state(aBuf[i])
57
+ codingState = @_mCodingSM.next_state(aBuf.get_byte(i))
58
58
  if codingState == :Error
59
59
  @_mState = :NotMe
60
60
  break
@@ -30,15 +30,16 @@ require "EscCharSetProber"
30
30
  require "MBCSGroupProber"
31
31
  require "SBCSGroupProber"
32
32
  require "Latin1Prober"
33
+ require "shim"
33
34
  require "singleton"
34
35
 
35
- module UniversalDetector
36
-
36
+ module UniversalDetector
37
+
37
38
  class << self
38
39
  def encoding(data)
39
40
  chardet(data)['encoding']
40
41
  end
41
-
42
+
42
43
  def chardet(data)
43
44
  u = UniversalDetector::Detector.instance
44
45
  u.reset()
@@ -0,0 +1,37 @@
1
+ module UniversalDetector
2
+
3
+ def self.is18?
4
+ RUBY_VERSION =~ /^1\.8/
5
+ end
6
+
7
+ end
8
+
9
+ class String
10
+
11
+ if UniversalDetector.is18?
12
+ alias :get_byte :[]
13
+ else
14
+ def get_byte(i)
15
+ self[i].ord
16
+ end
17
+ end
18
+
19
+ def to_bytes
20
+ bytes.to_a
21
+ end
22
+
23
+ end
24
+
25
+ class Array
26
+
27
+ def get_byte(i)
28
+ v = self[i]
29
+ v = v.bytes.to_a.first if v.is_a?(String)
30
+ v
31
+ end
32
+
33
+ def to_bytes
34
+ map {|v| v.is_a?(String) ? v.get_byte(0) : v}
35
+ end
36
+
37
+ end
metadata CHANGED
@@ -1,31 +1,43 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: chardet2
3
- version: !ruby/object:Gem::Version
4
- version: 1.0.0
3
+ version: !ruby/object:Gem::Version
4
+ hash: 21
5
5
  prerelease:
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 1
10
+ version: 1.0.1
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Jan Xie
9
14
  - Felipe Tanus
10
15
  - Hui
11
16
  autorequire: UniversalDetector
12
17
  bindir: bin
13
18
  cert_chain: []
14
- date: 2013-05-17 00:00:00.000000000 Z
19
+
20
+ date: 2013-05-17 00:00:00 +08:00
21
+ default_executable:
15
22
  dependencies: []
23
+
16
24
  description:
17
- email:
25
+ email:
18
26
  - jan.h.xie@gmail.com
19
27
  executables: []
28
+
20
29
  extensions: []
30
+
21
31
  extra_rdoc_files: []
22
- files:
32
+
33
+ files:
23
34
  - lib/MBCSSM.rb
24
35
  - lib/MultiByteCharSetProber.rb
25
36
  - lib/JapaneseContextAnalysis.rb
26
37
  - lib/LangCyrillicModel.rb
27
38
  - lib/EUCKRFreq.rb
28
39
  - lib/GB2312Freq.rb
40
+ - lib/shim.rb
29
41
  - lib/EUCKRProber.rb
30
42
  - lib/CodingStateMachine.rb
31
43
  - lib/LangHungarianModel.rb
@@ -55,29 +67,39 @@ files:
55
67
  - lib/CharSetProber.rb
56
68
  - COPYING
57
69
  - README.markdown
70
+ has_rdoc: true
58
71
  homepage: https://github.com/janx/chardet
59
72
  licenses: []
73
+
60
74
  post_install_message:
61
75
  rdoc_options: []
62
- require_paths:
76
+
77
+ require_paths:
63
78
  - lib
64
- required_ruby_version: !ruby/object:Gem::Requirement
79
+ required_ruby_version: !ruby/object:Gem::Requirement
65
80
  none: false
66
- requirements:
67
- - - ! '>='
68
- - !ruby/object:Gem::Version
69
- version: '0'
70
- required_rubygems_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ hash: 3
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
89
  none: false
72
- requirements:
73
- - - ! '>='
74
- - !ruby/object:Gem::Version
75
- version: '0'
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ hash: 3
94
+ segments:
95
+ - 0
96
+ version: "0"
76
97
  requirements: []
98
+
77
99
  rubyforge_project:
78
- rubygems_version: 1.8.23
100
+ rubygems_version: 1.6.2
79
101
  signing_key:
80
102
  specification_version: 3
81
- summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base
82
- on Mark Pilgrim's Python port and Hui's ruby port.
103
+ summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Base on Mark Pilgrim's Python port and Hui's ruby port.
83
104
  test_files: []
105
+