rchardet 1.3 → 1.3.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -57,8 +57,8 @@ module CharDet
57
57
  if order >= 0
58
58
  @_mTotalChars += 1
59
59
  # order is valid
60
- if order < @_mTableSize:
61
- if 512 > @_mCharToFreqOrder[order]:
60
+ if order < @_mTableSize
61
+ if 512 > @_mCharToFreqOrder[order]
62
62
  @_mFreqChars += 1
63
63
  end
64
64
  end
@@ -72,7 +72,7 @@ module CharDet
72
72
  return SURE_NO
73
73
  end
74
74
 
75
- if @_mTotalChars != @_mFreqChars:
75
+ if @_mTotalChars != @_mFreqChars
76
76
  r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
77
77
  if r < SURE_YES
78
78
  return r
@@ -227,10 +227,10 @@ module CharDet
227
227
  # first byte range: 0xa0 -- 0xfe
228
228
  # second byte range: 0xa1 -- 0xfe
229
229
  # no validation needed here. State machine has done that
230
- if aStr[0..0] >= "\xA0":
231
- return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
230
+ if aStr[0..0] >= "\xA0"
231
+ return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
232
232
  else
233
- return -1
233
+ return -1
234
234
  end
235
235
  end
236
236
  end
@@ -40,7 +40,7 @@ module CharDet
40
40
  super
41
41
  @_mActiveNum = 0
42
42
 
43
- for prober in @_mProbers:
43
+ for prober in @_mProbers
44
44
  if prober
45
45
  prober.reset()
46
46
  prober.active = true
@@ -28,6 +28,8 @@
28
28
 
29
29
  module CharDet
30
30
  class CodingStateMachine
31
+ attr_accessor :active
32
+
31
33
  def initialize(sm)
32
34
  @_mModel = sm
33
35
  @_mCurrentBytePos = 0
@@ -41,7 +41,7 @@ module CharDet
41
41
 
42
42
  def reset
43
43
  super()
44
- for codingSM in @_mCodingSM:
44
+ for codingSM in @_mCodingSM
45
45
  next if not codingSM
46
46
  codingSM.active = true
47
47
  codingSM.reset()
@@ -56,7 +56,7 @@ module CharDet
56
56
  elsif codingState == EItsMe
57
57
  @_mState = EFoundIt
58
58
  break
59
- elsif codingState == EStart:
59
+ elsif codingState == EStart
60
60
  charLen = @_mCodingSM.get_current_charlen()
61
61
  if i == 0
62
62
  @_mLastChar[1] = aBuf[0..0]
@@ -34,7 +34,7 @@ module CharDet
34
34
  MINIMUM_DATA_THRESHOLD = 4
35
35
 
36
36
  # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
37
- jp2CharContext = [
37
+ JP2_CHAR_CONTEXT = [
38
38
  [0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1],
39
39
  [2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4],
40
40
  [0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2],
@@ -150,13 +150,13 @@ module CharDet
150
150
  @_mNeedToSkipCharNum = i - aLen
151
151
  @_mLastCharOrder = -1
152
152
  else
153
- if (order != -1) and (@_mLastCharOrder != -1):
153
+ if (order != -1) and (@_mLastCharOrder != -1)
154
154
  @_mTotalRel += 1
155
- if @_mTotalRel > MAX_REL_THRESHOLD:
155
+ if @_mTotalRel > MAX_REL_THRESHOLD
156
156
  @_mDone = true
157
157
  break
158
158
  end
159
- @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
159
+ @_mRelSample[JP2_CHAR_CONTEXT[@_mLastCharOrder][order]] += 1
160
160
  end
161
161
  @_mLastCharOrder = order
162
162
  end
@@ -169,7 +169,7 @@ module CharDet
169
169
 
170
170
  def get_confidence
171
171
  # This is just one way to calculate confidence. It works well for me.
172
- if @_mTotalRel > MINIMUM_DATA_THRESHOLD:
172
+ if @_mTotalRel > MINIMUM_DATA_THRESHOLD
173
173
  return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
174
174
  else
175
175
  return DONT_KNOW
@@ -208,7 +208,7 @@ module CharDet
208
208
  return -1, 1 unless aStr
209
209
  # find out current char's byte length
210
210
  aStr = aStr[0..1].join if aStr.class == Array
211
- if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE")):
211
+ if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
212
212
  charLen = 2
213
213
  elsif aStr[0..0] == "\x8F"
214
214
  charLen = 3
@@ -71,7 +71,7 @@ module CharDet
71
71
 
72
72
  @_mLastChar[0] = aBuf[aLen - 1.. aLen-1]
73
73
 
74
- if get_state() == EDetecting:
74
+ if get_state() == EDetecting
75
75
  if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
76
76
  @_mState = EFoundIt
77
77
  end
@@ -91,11 +91,11 @@ module CharDet
91
91
  end
92
92
 
93
93
  @_mGotData = true
94
- if @result['encoding'] and (@result['confidence'] > 0.0):
94
+ if @result['encoding'] and (@result['confidence'] > 0.0)
95
95
  @done = true
96
96
  return
97
97
  end
98
- if @_mInputState == EPureAscii:
98
+ if @_mInputState == EPureAscii
99
99
  if @_highBitDetector =~ (aBuf)
100
100
  @_mInputState = EHighbyte
101
101
  elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
@@ -109,7 +109,7 @@ module CharDet
109
109
  @_mEscCharSetProber = EscCharSetProber.new()
110
110
  end
111
111
  if @_mEscCharSetProber.feed(aBuf) == EFoundIt
112
- @result = {'encoding' => self._mEscCharSetProber.get_charset_name(),
112
+ @result = {'encoding' => @_mEscCharSetProber.get_charset_name(),
113
113
  'confidence' => @_mEscCharSetProber.get_confidence()
114
114
  }
115
115
  @done = true
@@ -138,12 +138,12 @@ module CharDet
138
138
  end
139
139
  @done = true
140
140
 
141
- if @_mInputState == EPureAscii:
141
+ if @_mInputState == EPureAscii
142
142
  @result = {'encoding' => 'ascii', 'confidence' => 1.0}
143
143
  return @result
144
144
  end
145
145
 
146
- if @_mInputState == EHighbyte:
146
+ if @_mInputState == EHighbyte
147
147
  confidences = {}
148
148
  @_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
149
149
  maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
@@ -63,7 +63,7 @@ module CharDet
63
63
  end
64
64
  end
65
65
 
66
- if get_state() == EDetecting:
66
+ if get_state() == EDetecting
67
67
  if get_confidence() > SHORTCUT_THRESHOLD
68
68
  @_mState = EFoundIt
69
69
  end
data/lib/rchardet.rb CHANGED
@@ -56,7 +56,7 @@ require 'rchardet/universaldetector'
56
56
  require 'rchardet/utf8prober'
57
57
 
58
58
  module CharDet
59
- VERSION = "1.3"
59
+ VERSION = "1.3.1"
60
60
  def CharDet.detect(aBuf)
61
61
  u = UniversalDetector.new
62
62
  u.reset
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rchardet
3
3
  version: !ruby/object:Gem::Version
4
- version: "1.3"
4
+ hash: 25
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 3
9
+ - 1
10
+ version: 1.3.1
5
11
  platform: ruby
6
12
  authors:
7
13
  - Jeff Hodges
@@ -9,8 +15,7 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2009-07-19 00:00:00 -07:00
13
- default_executable:
18
+ date: 2012-12-02 00:00:00 Z
14
19
  dependencies: []
15
20
 
16
21
  description:
@@ -60,7 +65,6 @@ files:
60
65
  - lib/rchardet.rb
61
66
  - README
62
67
  - COPYING
63
- has_rdoc: true
64
68
  homepage: http://github.com/jmhodges/rchardet/tree/master
65
69
  licenses: []
66
70
 
@@ -70,21 +74,27 @@ rdoc_options: []
70
74
  require_paths:
71
75
  - lib
72
76
  required_ruby_version: !ruby/object:Gem::Requirement
77
+ none: false
73
78
  requirements:
74
79
  - - ">="
75
80
  - !ruby/object:Gem::Version
81
+ hash: 3
82
+ segments:
83
+ - 0
76
84
  version: "0"
77
- version:
78
85
  required_rubygems_version: !ruby/object:Gem::Requirement
86
+ none: false
79
87
  requirements:
80
88
  - - ">="
81
89
  - !ruby/object:Gem::Version
90
+ hash: 3
91
+ segments:
92
+ - 0
82
93
  version: "0"
83
- version:
84
94
  requirements: []
85
95
 
86
96
  rubyforge_project: rchardet
87
- rubygems_version: 1.3.4
97
+ rubygems_version: 1.8.15
88
98
  signing_key:
89
99
  specification_version: 3
90
100
  summary: Character encoding auto-detection in Ruby. As smart as your browser. Open source.