rchardet 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39)
  1. data/lib/rchardet.rb +1 -3
  2. data/lib/rchardet/big5freq.rb +2 -2
  3. data/lib/rchardet/big5prober.rb +2 -2
  4. data/lib/rchardet/chardistribution.rb +74 -69
  5. data/lib/rchardet/charsetgroupprober.rb +50 -52
  6. data/lib/rchardet/charsetprober.rb +2 -7
  7. data/lib/rchardet/codingstatemachine.rb +14 -13
  8. data/lib/rchardet/constants.rb +0 -0
  9. data/lib/rchardet/escprober.rb +34 -34
  10. data/lib/rchardet/escsm.rb +33 -32
  11. data/lib/rchardet/eucjpprober.rb +28 -28
  12. data/lib/rchardet/euckrfreq.rb +2 -1
  13. data/lib/rchardet/euckrprober.rb +2 -2
  14. data/lib/rchardet/euctwfreq.rb +2 -1
  15. data/lib/rchardet/euctwprober.rb +2 -2
  16. data/lib/rchardet/gb2312freq.rb +2 -2
  17. data/lib/rchardet/gb2312prober.rb +2 -2
  18. data/lib/rchardet/hebrewprober.rb +40 -40
  19. data/lib/rchardet/jisfreq.rb +2 -1
  20. data/lib/rchardet/jpcntx.rb +131 -130
  21. data/lib/rchardet/langbulgarianmodel.rb +6 -6
  22. data/lib/rchardet/langcyrillicmodel.rb +13 -13
  23. data/lib/rchardet/langgreekmodel.rb +5 -5
  24. data/lib/rchardet/langhebrewmodel.rb +3 -3
  25. data/lib/rchardet/langhungarianmodel.rb +5 -5
  26. data/lib/rchardet/langthaimodel.rb +3 -3
  27. data/lib/rchardet/latin1prober.rb +18 -18
  28. data/lib/rchardet/mbcharsetprober.rb +30 -30
  29. data/lib/rchardet/mbcsgroupprober.rb +9 -9
  30. data/lib/rchardet/mbcssm.rb +72 -72
  31. data/lib/rchardet/sbcharsetprober.rb +48 -50
  32. data/lib/rchardet/sbcsgroupprober.rb +16 -16
  33. data/lib/rchardet/sjisprober.rb +28 -28
  34. data/lib/rchardet/universaldetector.rb +92 -90
  35. data/lib/rchardet/utf8prober.rb +25 -25
  36. data/lib/rchardet/version.rb +3 -0
  37. metadata +30 -47
  38. data/COPYING +0 -504
  39. data/README +0 -12
@@ -15,8 +15,7 @@
15
15
  # 02110-1301 USA
16
16
  ######################### END LICENSE BLOCK #########################
17
17
 
18
- $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
19
-
18
+ require 'rchardet/version'
20
19
  require 'rchardet/charsetprober'
21
20
  require 'rchardet/mbcharsetprober'
22
21
 
@@ -56,7 +55,6 @@ require 'rchardet/universaldetector'
56
55
  require 'rchardet/utf8prober'
57
56
 
58
57
  module CharDet
59
- VERSION = "1.3.1"
60
58
  def CharDet.detect(aBuf)
61
59
  u = UniversalDetector.new
62
60
  u.reset
@@ -922,6 +922,6 @@ Big5CharToFreqOrder = [
922
922
  13920,13921,13922,13923,13924,13925,13926,13927,13928,13929,13930,13931,13932,13933,13934,13935, #13936
923
923
  13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
924
924
  13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
925
- 13968,13969,13970,13971,13972] #13973
925
+ 13968,13969,13970,13971,13972 #13973
926
+ ].freeze
926
927
  end
927
-
@@ -30,8 +30,8 @@ module CharDet
30
30
  class Big5Prober < MultiByteCharSetProber
31
31
  def initialize
32
32
  super
33
- @_mCodingSM = CodingStateMachine.new(Big5SMModel)
34
- @_mDistributionAnalyzer = Big5DistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(Big5SMModel)
34
+ @distributionAnalyzer = Big5DistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -33,50 +33,50 @@ module CharDet
33
33
 
34
34
  class CharDistributionAnalysis
35
35
  def initialize
36
- @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
37
- @_mTableSize = nil # Size of above table
38
- @_mTypicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
36
+ @charToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
37
+ @tableSize = nil # Size of above table
38
+ @typicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
39
39
  reset()
40
40
  end
41
41
 
42
42
  def reset
43
43
  # # """reset analyser, clear any state"""
44
- @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
45
- @_mTotalChars = 0 # Total characters encountered
46
- @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
44
+ @done = false # If this flag is set to constants.True, detection is done and conclusion has been made
45
+ @totalChars = 0 # Total characters encountered
46
+ @freqChars = 0 # The number of characters whose frequency order is less than 512
47
47
  end
48
48
 
49
49
  def feed(aStr, aCharLen)
50
50
  # # """feed a character with known length"""
51
51
  if aCharLen == 2
52
- # we only care about 2-bytes character in our distribution analysis
53
- order = get_order(aStr)
52
+ # we only care about 2-bytes character in our distribution analysis
53
+ order = get_order(aStr)
54
54
  else
55
- order = -1
55
+ order = -1
56
56
  end
57
57
  if order >= 0
58
- @_mTotalChars += 1
59
- # order is valid
60
- if order < @_mTableSize
61
- if 512 > @_mCharToFreqOrder[order]
62
- @_mFreqChars += 1
63
- end
64
- end
58
+ @totalChars += 1
59
+ # order is valid
60
+ if order < @tableSize
61
+ if 512 > @charToFreqOrder[order]
62
+ @freqChars += 1
63
+ end
64
+ end
65
65
  end
66
66
  end
67
67
 
68
68
  def get_confidence
69
69
  # """return confidence based on existing data"""
70
70
  # if we didn't receive any character in our consideration range, return negative answer
71
- if @_mTotalChars <= 0
72
- return SURE_NO
71
+ if @totalChars <= 0
72
+ return SURE_NO
73
73
  end
74
74
 
75
- if @_mTotalChars != @_mFreqChars
76
- r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
77
- if r < SURE_YES
78
- return r
79
- end
75
+ if @totalChars != @freqChars
76
+ r = @freqChars / ((@totalChars - @freqChars) * @typicalDistributionRatio)
77
+ if r < SURE_YES
78
+ return r
79
+ end
80
80
  end
81
81
 
82
82
  # normalize confidence (we don't want to be 100% sure)
@@ -86,7 +86,7 @@ module CharDet
86
86
  def got_enough_data
87
87
  # It is not necessary to receive all data to draw conclusion. For charset detection,
88
88
  # certain amount of data is enough
89
- return @_mTotalChars > ENOUGH_DATA_THRESHOLD
89
+ return @totalChars > ENOUGH_DATA_THRESHOLD
90
90
  end
91
91
 
92
92
  def get_order(aStr)
@@ -100,9 +100,9 @@ module CharDet
100
100
  class EUCTWDistributionAnalysis < CharDistributionAnalysis
101
101
  def initialize
102
102
  super()
103
- @_mCharToFreqOrder = EUCTWCharToFreqOrder
104
- @_mTableSize = EUCTW_TABLE_SIZE
105
- @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
103
+ @charToFreqOrder = EUCTWCharToFreqOrder
104
+ @tableSize = EUCTW_TABLE_SIZE
105
+ @typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
106
106
  end
107
107
 
108
108
  def get_order(aStr)
@@ -110,10 +110,11 @@ module CharDet
110
110
  # first byte range: 0xc4 -- 0xfe
111
111
  # second byte range: 0xa1 -- 0xfe
112
112
  # no validation needed here. State machine has done that
113
- if aStr[0..0] >= "\xC4"
114
- return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
113
+ if aStr[0, 1] >= "\xC4"
114
+ bytes = aStr.bytes
115
+ return 94 * (bytes[0] - 0xC4) + bytes[1] - 0xA1
115
116
  else
116
- return -1
117
+ return -1
117
118
  end
118
119
  end
119
120
  end
@@ -121,9 +122,9 @@ module CharDet
121
122
  class EUCKRDistributionAnalysis < CharDistributionAnalysis
122
123
  def initialize
123
124
  super()
124
- @_mCharToFreqOrder = EUCKRCharToFreqOrder
125
- @_mTableSize = EUCKR_TABLE_SIZE
126
- @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
125
+ @charToFreqOrder = EUCKRCharToFreqOrder
126
+ @tableSize = EUCKR_TABLE_SIZE
127
+ @typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
127
128
  end
128
129
 
129
130
  def get_order(aStr)
@@ -131,10 +132,11 @@ module CharDet
131
132
  # first byte range: 0xb0 -- 0xfe
132
133
  # second byte range: 0xa1 -- 0xfe
133
134
  # no validation needed here. State machine has done that
134
- if aStr[0..0] >= "\xB0"
135
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
135
+ if aStr[0, 1] >= "\xB0"
136
+ bytes = aStr.bytes
137
+ return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
136
138
  else
137
- return -1
139
+ return -1
138
140
  end
139
141
  end
140
142
  end
@@ -142,9 +144,9 @@ module CharDet
142
144
  class GB2312DistributionAnalysis < CharDistributionAnalysis
143
145
  def initialize
144
146
  super()
145
- @_mCharToFreqOrder = GB2312CharToFreqOrder
146
- @_mTableSize = GB2312_TABLE_SIZE
147
- @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
147
+ @charToFreqOrder = GB2312CharToFreqOrder
148
+ @tableSize = GB2312_TABLE_SIZE
149
+ @typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
148
150
  end
149
151
 
150
152
  def get_order(aStr)
@@ -152,10 +154,11 @@ module CharDet
152
154
  # first byte range: 0xb0 -- 0xfe
153
155
  # second byte range: 0xa1 -- 0xfe
154
156
  # no validation needed here. State machine has done that
155
- if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
156
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
157
+ if (aStr[0, 1] >= "\xB0") and (aStr[1, 1] >= "\xA1")
158
+ bytes = aStr.bytes
159
+ return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
157
160
  else
158
- return -1
161
+ return -1
159
162
  end
160
163
  end
161
164
  end
@@ -163,9 +166,9 @@ module CharDet
163
166
  class Big5DistributionAnalysis < CharDistributionAnalysis
164
167
  def initialize
165
168
  super
166
- @_mCharToFreqOrder = Big5CharToFreqOrder
167
- @_mTableSize = BIG5_TABLE_SIZE
168
- @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
169
+ @charToFreqOrder = Big5CharToFreqOrder
170
+ @tableSize = BIG5_TABLE_SIZE
171
+ @typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
169
172
  end
170
173
 
171
174
  def get_order(aStr)
@@ -173,14 +176,15 @@ module CharDet
173
176
  # first byte range: 0xa4 -- 0xfe
174
177
  # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
175
178
  # no validation needed here. State machine has done that
176
- if aStr[0..0] >= "\xA4"
177
- if aStr[1..1] >= "\xA1"
178
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
179
- else
180
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
181
- end
179
+ if aStr[0, 1] >= "\xA4"
180
+ bytes = aStr.bytes
181
+ if aStr[1, 1] >= "\xA1"
182
+ return 157 * (bytes[0] - 0xA4) + bytes[1] - 0xA1 + 63
183
+ else
184
+ return 157 * (bytes[0] - 0xA4) + bytes[1] - 0x40
185
+ end
182
186
  else
183
- return -1
187
+ return -1
184
188
  end
185
189
  end
186
190
  end
@@ -188,9 +192,9 @@ module CharDet
188
192
  class SJISDistributionAnalysis < CharDistributionAnalysis
189
193
  def initialize
190
194
  super()
191
- @_mCharToFreqOrder = JISCharToFreqOrder
192
- @_mTableSize = JIS_TABLE_SIZE
193
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
195
+ @charToFreqOrder = JISCharToFreqOrder
196
+ @tableSize = JIS_TABLE_SIZE
197
+ @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
194
198
  end
195
199
 
196
200
  def get_order(aStr)
@@ -198,17 +202,17 @@ module CharDet
198
202
  # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
199
203
  # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
200
204
  # no validation needed here. State machine has done that
201
- aStr = aStr[0..1].join if aStr.class == Array
202
- if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
203
- order = 188 * (aStr[0] - 0x81)
204
- elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
205
- order = 188 * (aStr[0] - 0xE0 + 31)
205
+ bytes = aStr.bytes
206
+ if (aStr[0, 1] >= "\x81") and (aStr[0, 1] <= "\x9F")
207
+ order = 188 * (bytes[0] - 0x81)
208
+ elsif (aStr[0, 1] >= "\xE0") and (aStr[0, 1] <= "\xEF")
209
+ order = 188 * (bytes[0] - 0xE0 + 31)
206
210
  else
207
- return -1
211
+ return -1
208
212
  end
209
- order = order + aStr[1] - 0x40
210
- if aStr[1..1] > "\x7F"
211
- order =- 1
213
+ order = order + bytes[1] - 0x40
214
+ if aStr[1, 1] > "\x7F"
215
+ order =- 1
212
216
  end
213
217
  return order
214
218
  end
@@ -217,9 +221,9 @@ module CharDet
217
221
  class EUCJPDistributionAnalysis < CharDistributionAnalysis
218
222
  def initialize
219
223
  super()
220
- @_mCharToFreqOrder = JISCharToFreqOrder
221
- @_mTableSize = JIS_TABLE_SIZE
222
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
224
+ @charToFreqOrder = JISCharToFreqOrder
225
+ @tableSize = JIS_TABLE_SIZE
226
+ @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
223
227
  end
224
228
 
225
229
  def get_order(aStr)
@@ -227,8 +231,9 @@ module CharDet
227
231
  # first byte range: 0xa0 -- 0xfe
228
232
  # second byte range: 0xa1 -- 0xfe
229
233
  # no validation needed here. State machine has done that
230
- if aStr[0..0] >= "\xA0"
231
- return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
234
+ if aStr[0, 1] >= "\xA0"
235
+ bytes = aStr.bytes
236
+ return 94 * (bytes[0] - 0xA1) + bytes[1] - 0xa1
232
237
  else
233
238
  return -1
234
239
  end
@@ -28,54 +28,55 @@
28
28
 
29
29
  module CharDet
30
30
  class CharSetGroupProber < CharSetProber
31
- attr_accessor :_mProbers
31
+ attr_accessor :probers
32
32
  def initialize
33
33
  super
34
- @_mActiveNum = 0
35
- @_mProbers = []
36
- @_mBestGuessProber = nil
34
+ @activeNum = 0
35
+ @probers = []
36
+ @bestGuessProber = nil
37
37
  end
38
38
 
39
39
  def reset
40
40
  super
41
- @_mActiveNum = 0
41
+ @activeNum = 0
42
42
 
43
- for prober in @_mProbers
44
- if prober
45
- prober.reset()
46
- prober.active = true
47
- @_mActiveNum += 1
48
- end
43
+ for prober in @probers
44
+ if prober
45
+ prober.reset()
46
+ prober.active = true
47
+ @activeNum += 1
48
+ end
49
49
  end
50
- @_mBestGuessProber = nil
50
+ @bestGuessProber = nil
51
51
  end
52
52
 
53
53
  def get_charset_name
54
- if not @_mBestGuessProber
55
- get_confidence()
56
- return nil unless @_mBestGuessProber
57
- # self._mBestGuessProber = self._mProbers[0]
54
+ if !@bestGuessProber
55
+ get_confidence()
56
+ if !@bestGuessProber
57
+ return nil
58
+ end
58
59
  end
59
- return @_mBestGuessProber.get_charset_name()
60
+ return @bestGuessProber.get_charset_name()
60
61
  end
61
62
 
62
63
  def feed(aBuf)
63
- for prober in @_mProbers
64
- next unless prober
65
- next unless prober.active
66
- st = prober.feed(aBuf)
67
- next unless st
68
- if st == EFoundIt
69
- @_mBestGuessProber = prober
70
- return get_state()
71
- elsif st == ENotMe
72
- prober.active = false
73
- @_mActiveNum -= 1
74
- if @_mActiveNum <= 0
75
- @_mState = ENotMe
76
- return get_state()
77
- end
78
- end
64
+ for prober in @probers
65
+ next unless prober
66
+ next unless prober.active
67
+ st = prober.feed(aBuf)
68
+ next unless st
69
+ if st == EFoundIt
70
+ @bestGuessProber = prober
71
+ return get_state()
72
+ elsif st == ENotMe
73
+ prober.active = false
74
+ @activeNum -= 1
75
+ if @activeNum <= 0
76
+ @state = ENotMe
77
+ return get_state()
78
+ end
79
+ end
79
80
  end
80
81
  return get_state()
81
82
  end
@@ -83,30 +84,27 @@ module CharDet
83
84
  def get_confidence()
84
85
  st = get_state()
85
86
  if st == EFoundIt
86
- return 0.99
87
+ return 0.99
87
88
  elsif st == ENotMe
88
- return 0.01
89
+ return 0.01
89
90
  end
90
91
  bestConf = 0.0
91
- @_mBestGuessProber = nil
92
- for prober in @_mProbers
93
- next unless prober
94
- unless prober.active
95
- $stderr << "#{prober.get_charset_name()} not active\n" if $debug
96
- next
97
- end
98
- cf = prober.get_confidence()
99
- $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
100
- if bestConf < cf
101
- bestConf = cf
102
- @_mBestGuessProber = prober
103
- end
92
+ @bestGuessProber = nil
93
+ for prober in @probers
94
+ next unless prober
95
+ unless prober.active
96
+ $stderr << "#{prober.get_charset_name()} not active\n" if $debug
97
+ next
98
+ end
99
+ cf = prober.get_confidence()
100
+ $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
101
+ if bestConf < cf
102
+ bestConf = cf
103
+ @bestGuessProber = prober
104
+ end
104
105
  end
105
- return 0.0 unless @_mBestGuessProber
106
+ return 0.0 unless @bestGuessProber
106
107
  return bestConf
107
- # else:
108
- # self._mBestGuessProber = self._mProbers[0]
109
- # return self._mBestGuessProber.get_confidence()
110
108
  end
111
109
  end
112
110
  end
@@ -34,7 +34,7 @@ module CharDet
34
34
  end
35
35
 
36
36
  def reset
37
- @_mState = EDetecting
37
+ @state = EDetecting
38
38
  end
39
39
 
40
40
  def get_charset_name
@@ -45,7 +45,7 @@ module CharDet
45
45
  end
46
46
 
47
47
  def get_state
48
- return @_mState
48
+ return @state
49
49
  end
50
50
 
51
51
  def get_confidence
@@ -53,11 +53,6 @@ module CharDet
53
53
  end
54
54
 
55
55
  def filter_high_bit_only(aBuf)
56
- # DO NOT USE `gsub!`
57
- # It will remove all characters from the buffer that is later used by
58
- # other probers. This is because gsub! removes data from the instance variable
59
- # that will be passed to later probers, while gsub makes a new instance variable
60
- # that will not.
61
56
  newBuf = aBuf.gsub(/([\x00-\x7F])+/, ' ')
62
57
  return newBuf
63
58
  end