rchardet 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/lib/rchardet.rb +1 -3
  2. data/lib/rchardet/big5freq.rb +2 -2
  3. data/lib/rchardet/big5prober.rb +2 -2
  4. data/lib/rchardet/chardistribution.rb +74 -69
  5. data/lib/rchardet/charsetgroupprober.rb +50 -52
  6. data/lib/rchardet/charsetprober.rb +2 -7
  7. data/lib/rchardet/codingstatemachine.rb +14 -13
  8. data/lib/rchardet/constants.rb +0 -0
  9. data/lib/rchardet/escprober.rb +34 -34
  10. data/lib/rchardet/escsm.rb +33 -32
  11. data/lib/rchardet/eucjpprober.rb +28 -28
  12. data/lib/rchardet/euckrfreq.rb +2 -1
  13. data/lib/rchardet/euckrprober.rb +2 -2
  14. data/lib/rchardet/euctwfreq.rb +2 -1
  15. data/lib/rchardet/euctwprober.rb +2 -2
  16. data/lib/rchardet/gb2312freq.rb +2 -2
  17. data/lib/rchardet/gb2312prober.rb +2 -2
  18. data/lib/rchardet/hebrewprober.rb +40 -40
  19. data/lib/rchardet/jisfreq.rb +2 -1
  20. data/lib/rchardet/jpcntx.rb +131 -130
  21. data/lib/rchardet/langbulgarianmodel.rb +6 -6
  22. data/lib/rchardet/langcyrillicmodel.rb +13 -13
  23. data/lib/rchardet/langgreekmodel.rb +5 -5
  24. data/lib/rchardet/langhebrewmodel.rb +3 -3
  25. data/lib/rchardet/langhungarianmodel.rb +5 -5
  26. data/lib/rchardet/langthaimodel.rb +3 -3
  27. data/lib/rchardet/latin1prober.rb +18 -18
  28. data/lib/rchardet/mbcharsetprober.rb +30 -30
  29. data/lib/rchardet/mbcsgroupprober.rb +9 -9
  30. data/lib/rchardet/mbcssm.rb +72 -72
  31. data/lib/rchardet/sbcharsetprober.rb +48 -50
  32. data/lib/rchardet/sbcsgroupprober.rb +16 -16
  33. data/lib/rchardet/sjisprober.rb +28 -28
  34. data/lib/rchardet/universaldetector.rb +92 -90
  35. data/lib/rchardet/utf8prober.rb +25 -25
  36. data/lib/rchardet/version.rb +3 -0
  37. metadata +30 -47
  38. data/COPYING +0 -504
  39. data/README +0 -12
@@ -15,8 +15,7 @@
15
15
  # 02110-1301 USA
16
16
  ######################### END LICENSE BLOCK #########################
17
17
 
18
- $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
19
-
18
+ require 'rchardet/version'
20
19
  require 'rchardet/charsetprober'
21
20
  require 'rchardet/mbcharsetprober'
22
21
 
@@ -56,7 +55,6 @@ require 'rchardet/universaldetector'
56
55
  require 'rchardet/utf8prober'
57
56
 
58
57
  module CharDet
59
- VERSION = "1.3.1"
60
58
  def CharDet.detect(aBuf)
61
59
  u = UniversalDetector.new
62
60
  u.reset
@@ -922,6 +922,6 @@ Big5CharToFreqOrder = [
922
922
  13920,13921,13922,13923,13924,13925,13926,13927,13928,13929,13930,13931,13932,13933,13934,13935, #13936
923
923
  13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
924
924
  13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
925
- 13968,13969,13970,13971,13972] #13973
925
+ 13968,13969,13970,13971,13972 #13973
926
+ ].freeze
926
927
  end
927
-
@@ -30,8 +30,8 @@ module CharDet
30
30
  class Big5Prober < MultiByteCharSetProber
31
31
  def initialize
32
32
  super
33
- @_mCodingSM = CodingStateMachine.new(Big5SMModel)
34
- @_mDistributionAnalyzer = Big5DistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(Big5SMModel)
34
+ @distributionAnalyzer = Big5DistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -33,50 +33,50 @@ module CharDet
33
33
 
34
34
  class CharDistributionAnalysis
35
35
  def initialize
36
- @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
37
- @_mTableSize = nil # Size of above table
38
- @_mTypicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
36
+ @charToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
37
+ @tableSize = nil # Size of above table
38
+ @typicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
39
39
  reset()
40
40
  end
41
41
 
42
42
  def reset
43
43
  # # """reset analyser, clear any state"""
44
- @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
45
- @_mTotalChars = 0 # Total characters encountered
46
- @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
44
+ @done = false # If this flag is set to constants.True, detection is done and conclusion has been made
45
+ @totalChars = 0 # Total characters encountered
46
+ @freqChars = 0 # The number of characters whose frequency order is less than 512
47
47
  end
48
48
 
49
49
  def feed(aStr, aCharLen)
50
50
  # # """feed a character with known length"""
51
51
  if aCharLen == 2
52
- # we only care about 2-bytes character in our distribution analysis
53
- order = get_order(aStr)
52
+ # we only care about 2-bytes character in our distribution analysis
53
+ order = get_order(aStr)
54
54
  else
55
- order = -1
55
+ order = -1
56
56
  end
57
57
  if order >= 0
58
- @_mTotalChars += 1
59
- # order is valid
60
- if order < @_mTableSize
61
- if 512 > @_mCharToFreqOrder[order]
62
- @_mFreqChars += 1
63
- end
64
- end
58
+ @totalChars += 1
59
+ # order is valid
60
+ if order < @tableSize
61
+ if 512 > @charToFreqOrder[order]
62
+ @freqChars += 1
63
+ end
64
+ end
65
65
  end
66
66
  end
67
67
 
68
68
  def get_confidence
69
69
  # """return confidence based on existing data"""
70
70
  # if we didn't receive any character in our consideration range, return negative answer
71
- if @_mTotalChars <= 0
72
- return SURE_NO
71
+ if @totalChars <= 0
72
+ return SURE_NO
73
73
  end
74
74
 
75
- if @_mTotalChars != @_mFreqChars
76
- r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
77
- if r < SURE_YES
78
- return r
79
- end
75
+ if @totalChars != @freqChars
76
+ r = @freqChars / ((@totalChars - @freqChars) * @typicalDistributionRatio)
77
+ if r < SURE_YES
78
+ return r
79
+ end
80
80
  end
81
81
 
82
82
  # normalize confidence (we don't want to be 100% sure)
@@ -86,7 +86,7 @@ module CharDet
86
86
  def got_enough_data
87
87
  # It is not necessary to receive all data to draw conclusion. For charset detection,
88
88
  # certain amount of data is enough
89
- return @_mTotalChars > ENOUGH_DATA_THRESHOLD
89
+ return @totalChars > ENOUGH_DATA_THRESHOLD
90
90
  end
91
91
 
92
92
  def get_order(aStr)
@@ -100,9 +100,9 @@ module CharDet
100
100
  class EUCTWDistributionAnalysis < CharDistributionAnalysis
101
101
  def initialize
102
102
  super()
103
- @_mCharToFreqOrder = EUCTWCharToFreqOrder
104
- @_mTableSize = EUCTW_TABLE_SIZE
105
- @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
103
+ @charToFreqOrder = EUCTWCharToFreqOrder
104
+ @tableSize = EUCTW_TABLE_SIZE
105
+ @typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
106
106
  end
107
107
 
108
108
  def get_order(aStr)
@@ -110,10 +110,11 @@ module CharDet
110
110
  # first byte range: 0xc4 -- 0xfe
111
111
  # second byte range: 0xa1 -- 0xfe
112
112
  # no validation needed here. State machine has done that
113
- if aStr[0..0] >= "\xC4"
114
- return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
113
+ if aStr[0, 1] >= "\xC4"
114
+ bytes = aStr.bytes
115
+ return 94 * (bytes[0] - 0xC4) + bytes[1] - 0xA1
115
116
  else
116
- return -1
117
+ return -1
117
118
  end
118
119
  end
119
120
  end
@@ -121,9 +122,9 @@ module CharDet
121
122
  class EUCKRDistributionAnalysis < CharDistributionAnalysis
122
123
  def initialize
123
124
  super()
124
- @_mCharToFreqOrder = EUCKRCharToFreqOrder
125
- @_mTableSize = EUCKR_TABLE_SIZE
126
- @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
125
+ @charToFreqOrder = EUCKRCharToFreqOrder
126
+ @tableSize = EUCKR_TABLE_SIZE
127
+ @typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
127
128
  end
128
129
 
129
130
  def get_order(aStr)
@@ -131,10 +132,11 @@ module CharDet
131
132
  # first byte range: 0xb0 -- 0xfe
132
133
  # second byte range: 0xa1 -- 0xfe
133
134
  # no validation needed here. State machine has done that
134
- if aStr[0..0] >= "\xB0"
135
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
135
+ if aStr[0, 1] >= "\xB0"
136
+ bytes = aStr.bytes
137
+ return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
136
138
  else
137
- return -1
139
+ return -1
138
140
  end
139
141
  end
140
142
  end
@@ -142,9 +144,9 @@ module CharDet
142
144
  class GB2312DistributionAnalysis < CharDistributionAnalysis
143
145
  def initialize
144
146
  super()
145
- @_mCharToFreqOrder = GB2312CharToFreqOrder
146
- @_mTableSize = GB2312_TABLE_SIZE
147
- @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
147
+ @charToFreqOrder = GB2312CharToFreqOrder
148
+ @tableSize = GB2312_TABLE_SIZE
149
+ @typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
148
150
  end
149
151
 
150
152
  def get_order(aStr)
@@ -152,10 +154,11 @@ module CharDet
152
154
  # first byte range: 0xb0 -- 0xfe
153
155
  # second byte range: 0xa1 -- 0xfe
154
156
  # no validation needed here. State machine has done that
155
- if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
156
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
157
+ if (aStr[0, 1] >= "\xB0") and (aStr[1, 1] >= "\xA1")
158
+ bytes = aStr.bytes
159
+ return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
157
160
  else
158
- return -1
161
+ return -1
159
162
  end
160
163
  end
161
164
  end
@@ -163,9 +166,9 @@ module CharDet
163
166
  class Big5DistributionAnalysis < CharDistributionAnalysis
164
167
  def initialize
165
168
  super
166
- @_mCharToFreqOrder = Big5CharToFreqOrder
167
- @_mTableSize = BIG5_TABLE_SIZE
168
- @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
169
+ @charToFreqOrder = Big5CharToFreqOrder
170
+ @tableSize = BIG5_TABLE_SIZE
171
+ @typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
169
172
  end
170
173
 
171
174
  def get_order(aStr)
@@ -173,14 +176,15 @@ module CharDet
173
176
  # first byte range: 0xa4 -- 0xfe
174
177
  # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
175
178
  # no validation needed here. State machine has done that
176
- if aStr[0..0] >= "\xA4"
177
- if aStr[1..1] >= "\xA1"
178
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
179
- else
180
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
181
- end
179
+ if aStr[0, 1] >= "\xA4"
180
+ bytes = aStr.bytes
181
+ if aStr[1, 1] >= "\xA1"
182
+ return 157 * (bytes[0] - 0xA4) + bytes[1] - 0xA1 + 63
183
+ else
184
+ return 157 * (bytes[0] - 0xA4) + bytes[1] - 0x40
185
+ end
182
186
  else
183
- return -1
187
+ return -1
184
188
  end
185
189
  end
186
190
  end
@@ -188,9 +192,9 @@ module CharDet
188
192
  class SJISDistributionAnalysis < CharDistributionAnalysis
189
193
  def initialize
190
194
  super()
191
- @_mCharToFreqOrder = JISCharToFreqOrder
192
- @_mTableSize = JIS_TABLE_SIZE
193
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
195
+ @charToFreqOrder = JISCharToFreqOrder
196
+ @tableSize = JIS_TABLE_SIZE
197
+ @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
194
198
  end
195
199
 
196
200
  def get_order(aStr)
@@ -198,17 +202,17 @@ module CharDet
198
202
  # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
199
203
  # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
200
204
  # no validation needed here. State machine has done that
201
- aStr = aStr[0..1].join if aStr.class == Array
202
- if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
203
- order = 188 * (aStr[0] - 0x81)
204
- elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
205
- order = 188 * (aStr[0] - 0xE0 + 31)
205
+ bytes = aStr.bytes
206
+ if (aStr[0, 1] >= "\x81") and (aStr[0, 1] <= "\x9F")
207
+ order = 188 * (bytes[0] - 0x81)
208
+ elsif (aStr[0, 1] >= "\xE0") and (aStr[0, 1] <= "\xEF")
209
+ order = 188 * (bytes[0] - 0xE0 + 31)
206
210
  else
207
- return -1
211
+ return -1
208
212
  end
209
- order = order + aStr[1] - 0x40
210
- if aStr[1..1] > "\x7F"
211
- order =- 1
213
+ order = order + bytes[1] - 0x40
214
+ if aStr[1, 1] > "\x7F"
215
+ order =- 1
212
216
  end
213
217
  return order
214
218
  end
@@ -217,9 +221,9 @@ module CharDet
217
221
  class EUCJPDistributionAnalysis < CharDistributionAnalysis
218
222
  def initialize
219
223
  super()
220
- @_mCharToFreqOrder = JISCharToFreqOrder
221
- @_mTableSize = JIS_TABLE_SIZE
222
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
224
+ @charToFreqOrder = JISCharToFreqOrder
225
+ @tableSize = JIS_TABLE_SIZE
226
+ @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
223
227
  end
224
228
 
225
229
  def get_order(aStr)
@@ -227,8 +231,9 @@ module CharDet
227
231
  # first byte range: 0xa0 -- 0xfe
228
232
  # second byte range: 0xa1 -- 0xfe
229
233
  # no validation needed here. State machine has done that
230
- if aStr[0..0] >= "\xA0"
231
- return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
234
+ if aStr[0, 1] >= "\xA0"
235
+ bytes = aStr.bytes
236
+ return 94 * (bytes[0] - 0xA1) + bytes[1] - 0xa1
232
237
  else
233
238
  return -1
234
239
  end
@@ -28,54 +28,55 @@
28
28
 
29
29
  module CharDet
30
30
  class CharSetGroupProber < CharSetProber
31
- attr_accessor :_mProbers
31
+ attr_accessor :probers
32
32
  def initialize
33
33
  super
34
- @_mActiveNum = 0
35
- @_mProbers = []
36
- @_mBestGuessProber = nil
34
+ @activeNum = 0
35
+ @probers = []
36
+ @bestGuessProber = nil
37
37
  end
38
38
 
39
39
  def reset
40
40
  super
41
- @_mActiveNum = 0
41
+ @activeNum = 0
42
42
 
43
- for prober in @_mProbers
44
- if prober
45
- prober.reset()
46
- prober.active = true
47
- @_mActiveNum += 1
48
- end
43
+ for prober in @probers
44
+ if prober
45
+ prober.reset()
46
+ prober.active = true
47
+ @activeNum += 1
48
+ end
49
49
  end
50
- @_mBestGuessProber = nil
50
+ @bestGuessProber = nil
51
51
  end
52
52
 
53
53
  def get_charset_name
54
- if not @_mBestGuessProber
55
- get_confidence()
56
- return nil unless @_mBestGuessProber
57
- # self._mBestGuessProber = self._mProbers[0]
54
+ if !@bestGuessProber
55
+ get_confidence()
56
+ if !@bestGuessProber
57
+ return nil
58
+ end
58
59
  end
59
- return @_mBestGuessProber.get_charset_name()
60
+ return @bestGuessProber.get_charset_name()
60
61
  end
61
62
 
62
63
  def feed(aBuf)
63
- for prober in @_mProbers
64
- next unless prober
65
- next unless prober.active
66
- st = prober.feed(aBuf)
67
- next unless st
68
- if st == EFoundIt
69
- @_mBestGuessProber = prober
70
- return get_state()
71
- elsif st == ENotMe
72
- prober.active = false
73
- @_mActiveNum -= 1
74
- if @_mActiveNum <= 0
75
- @_mState = ENotMe
76
- return get_state()
77
- end
78
- end
64
+ for prober in @probers
65
+ next unless prober
66
+ next unless prober.active
67
+ st = prober.feed(aBuf)
68
+ next unless st
69
+ if st == EFoundIt
70
+ @bestGuessProber = prober
71
+ return get_state()
72
+ elsif st == ENotMe
73
+ prober.active = false
74
+ @activeNum -= 1
75
+ if @activeNum <= 0
76
+ @state = ENotMe
77
+ return get_state()
78
+ end
79
+ end
79
80
  end
80
81
  return get_state()
81
82
  end
@@ -83,30 +84,27 @@ module CharDet
83
84
  def get_confidence()
84
85
  st = get_state()
85
86
  if st == EFoundIt
86
- return 0.99
87
+ return 0.99
87
88
  elsif st == ENotMe
88
- return 0.01
89
+ return 0.01
89
90
  end
90
91
  bestConf = 0.0
91
- @_mBestGuessProber = nil
92
- for prober in @_mProbers
93
- next unless prober
94
- unless prober.active
95
- $stderr << "#{prober.get_charset_name()} not active\n" if $debug
96
- next
97
- end
98
- cf = prober.get_confidence()
99
- $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
100
- if bestConf < cf
101
- bestConf = cf
102
- @_mBestGuessProber = prober
103
- end
92
+ @bestGuessProber = nil
93
+ for prober in @probers
94
+ next unless prober
95
+ unless prober.active
96
+ $stderr << "#{prober.get_charset_name()} not active\n" if $debug
97
+ next
98
+ end
99
+ cf = prober.get_confidence()
100
+ $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
101
+ if bestConf < cf
102
+ bestConf = cf
103
+ @bestGuessProber = prober
104
+ end
104
105
  end
105
- return 0.0 unless @_mBestGuessProber
106
+ return 0.0 unless @bestGuessProber
106
107
  return bestConf
107
- # else:
108
- # self._mBestGuessProber = self._mProbers[0]
109
- # return self._mBestGuessProber.get_confidence()
110
108
  end
111
109
  end
112
110
  end
@@ -34,7 +34,7 @@ module CharDet
34
34
  end
35
35
 
36
36
  def reset
37
- @_mState = EDetecting
37
+ @state = EDetecting
38
38
  end
39
39
 
40
40
  def get_charset_name
@@ -45,7 +45,7 @@ module CharDet
45
45
  end
46
46
 
47
47
  def get_state
48
- return @_mState
48
+ return @state
49
49
  end
50
50
 
51
51
  def get_confidence
@@ -53,11 +53,6 @@ module CharDet
53
53
  end
54
54
 
55
55
  def filter_high_bit_only(aBuf)
56
- # DO NOT USE `gsub!`
57
- # It will remove all characters from the buffer that is later used by
58
- # other probers. This is because gsub! removes data from the instance variable
59
- # that will be passed to later probers, while gsub makes a new instance variable
60
- # that will not.
61
56
  newBuf = aBuf.gsub(/([\x00-\x7F])+/, ' ')
62
57
  return newBuf
63
58
  end