tmail 1.2.7 → 1.2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -144,22 +144,22 @@ module CharDet
144
144
  # this character will simplify our logic and improve performance.
145
145
  i = @_mNeedToSkipCharNum
146
146
  while i < aLen
147
- order, charLen = get_order(aBuf[i...i+2])
148
- i += charLen
149
- if i > aLen
150
- @_mNeedToSkipCharNum = i - aLen
151
- @_mLastCharOrder = -1
152
- else
153
- if (order != -1) and (@_mLastCharOrder != -1):
154
- @_mTotalRel += 1
155
- if @_mTotalRel > MAX_REL_THRESHOLD:
156
- @_mDone = true
157
- break
158
- end
159
- @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
160
- end
161
- @_mLastCharOrder = order
162
- end
147
+ order, charLen = get_order(aBuf[i...i+2])
148
+ i += charLen
149
+ if i > aLen
150
+ @_mNeedToSkipCharNum = i - aLen
151
+ @_mLastCharOrder = -1
152
+ else
153
+ if (order != -1) and (@_mLastCharOrder != -1)
154
+ @_mTotalRel += 1
155
+ if @_mTotalRel > MAX_REL_THRESHOLD
156
+ @_mDone = true
157
+ break
158
+ end
159
+ @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
160
+ end
161
+ @_mLastCharOrder = order
162
+ end
163
163
  end
164
164
  end
165
165
 
@@ -169,10 +169,10 @@ module CharDet
169
169
 
170
170
  def get_confidence
171
171
  # This is just one way to calculate confidence. It works well for me.
172
- if @_mTotalRel > MINIMUM_DATA_THRESHOLD:
173
- return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
172
+ if @_mTotalRel > MINIMUM_DATA_THRESHOLD
173
+ return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
174
174
  else
175
- return DONT_KNOW
175
+ return DONT_KNOW
176
176
  end
177
177
  end
178
178
 
@@ -188,15 +188,15 @@ module CharDet
188
188
  # find out current char's byte length
189
189
  aStr = aStr[0..1].join if aStr.class == Array
190
190
  if ((aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")) or ((aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xFC"))
191
- charLen = 2
191
+ charLen = 2
192
192
  else
193
- charLen = 1
193
+ charLen = 1
194
194
  end
195
195
  # return its order if it is hiragana
196
196
  if aStr.length > 1
197
- if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
198
- return aStr[1] - 0x9F, charLen
199
- end
197
+ if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
198
+ return aStr[1] - 0x9F, charLen
199
+ end
200
200
  end
201
201
 
202
202
  return -1, charLen
@@ -208,19 +208,19 @@ module CharDet
208
208
  return -1, 1 unless aStr
209
209
  # find out current char's byte length
210
210
  aStr = aStr[0..1].join if aStr.class == Array
211
- if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE")):
212
- charLen = 2
211
+ if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
212
+ charLen = 2
213
213
  elsif aStr[0..0] == "\x8F"
214
- charLen = 3
214
+ charLen = 3
215
215
  else
216
- charLen = 1
216
+ charLen = 1
217
217
  end
218
218
 
219
219
  # return its order if it is hiragana
220
220
  if aStr.length > 1
221
- if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
222
- return aStr[1] - 0xA1, charLen
223
- end
221
+ if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
222
+ return aStr[1] - 0xA1, charLen
223
+ end
224
224
  end
225
225
 
226
226
  return -1, charLen
@@ -110,15 +110,15 @@ module CharDet
110
110
  def feed(aBuf)
111
111
  aBuf = filter_with_english_letters(aBuf)
112
112
  aBuf.each_byte do |b|
113
- c = b.chr
114
- charClass = Latin1_CharToClass[c[0]]
115
- freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
- if freq == 0
117
- @_mState = ENotMe
118
- break
119
- end
120
- @_mFreqCounter[freq] += 1
121
- @_mLastCharClass = charClass
113
+ c = b.chr
114
+ charClass = Latin1_CharToClass[c[0]]
115
+ freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
+ if freq == 0
117
+ @_mState = ENotMe
118
+ break
119
+ end
120
+ @_mFreqCounter[freq] += 1
121
+ @_mLastCharClass = charClass
122
122
  end
123
123
 
124
124
  return get_state()
@@ -126,17 +126,17 @@ module CharDet
126
126
 
127
127
  def get_confidence
128
128
  if get_state() == ENotMe
129
- return 0.01
129
+ return 0.01
130
130
  end
131
131
 
132
132
  total = @_mFreqCounter.inject{|a,b| a+b}
133
133
  if total < 0.01
134
- confidence = 0.0
134
+ confidence = 0.0
135
135
  else
136
- confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
136
+ confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
137
137
  end
138
138
  if confidence < 0.0
139
- confidence = 0.0
139
+ confidence = 0.0
140
140
  end
141
141
  # lower the confidence of latin1 so that other more accurate detectors
142
142
  # can take priority.
@@ -40,10 +40,10 @@ module CharDet
40
40
  def reset
41
41
  super
42
42
  if @_mCodingSM
43
- @_mCodingSM.reset()
43
+ @_mCodingSM.reset()
44
44
  end
45
45
  if @_mDistributionAnalyzer
46
- @_mDistributionAnalyzer.reset()
46
+ @_mDistributionAnalyzer.reset()
47
47
  end
48
48
  @_mLastChar = "\x00\x00"
49
49
  end
@@ -54,30 +54,30 @@ module CharDet
54
54
  def feed(aBuf)
55
55
  aLen = aBuf.length
56
56
  for i in (0...aLen)
57
- codingState = @_mCodingSM.next_state(aBuf[i..i])
58
- if codingState == EError
59
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
60
- @_mState = ENotMe
61
- break
62
- elsif codingState == EItsMe
63
- @_mState = EFoundIt
64
- break
65
- elsif codingState == EStart
66
- charLen = @_mCodingSM.get_current_charlen()
67
- if i == 0
68
- @_mLastChar[1] = aBuf[0..0]
69
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
70
- else
71
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
72
- end
73
- end
57
+ codingState = @_mCodingSM.next_state(aBuf[i..i])
58
+ if codingState == EError
59
+ $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
60
+ @_mState = ENotMe
61
+ break
62
+ elsif codingState == EItsMe
63
+ @_mState = EFoundIt
64
+ break
65
+ elsif codingState == EStart
66
+ charLen = @_mCodingSM.get_current_charlen()
67
+ if i == 0
68
+ @_mLastChar[1] = aBuf[0..0]
69
+ @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
70
+ else
71
+ @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
72
+ end
73
+ end
74
74
  end
75
75
  @_mLastChar[0] = aBuf[aLen-1..aLen-1]
76
76
 
77
77
  if get_state() == EDetecting
78
- if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
79
- @_mState = EFoundIt
80
- end
78
+ if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
79
+ @_mState = EFoundIt
80
+ end
81
81
  end
82
82
  return get_state()
83
83
  end
@@ -32,15 +32,13 @@ module CharDet
32
32
  class MBCSGroupProber < CharSetGroupProber
33
33
  def initialize
34
34
  super
35
- @_mProbers = [
36
- UTF8Prober.new,
37
- SJISProber.new,
38
- EUCJPProber.new,
39
- GB2312Prober.new,
40
- EUCKRProber.new,
41
- Big5Prober.new,
42
- EUCTWProber.new
43
- ]
35
+ @_mProbers = [ UTF8Prober.new,
36
+ SJISProber.new,
37
+ EUCJPProber.new,
38
+ GB2312Prober.new,
39
+ EUCKRProber.new,
40
+ Big5Prober.new,
41
+ EUCTWProber.new ]
44
42
  reset()
45
43
  end
46
44
  end
@@ -73,10 +73,10 @@ module CharDet
73
73
  Big5CharLenTable = [0, 1, 1, 2, 0]
74
74
 
75
75
  Big5SMModel = {'classTable' => BIG5_cls,
76
- 'classFactor' => 5,
77
- 'stateTable' => BIG5_st,
78
- 'charLenTable' => Big5CharLenTable,
79
- 'name' => 'Big5'
76
+ 'classFactor' => 5,
77
+ 'stateTable' => BIG5_st,
78
+ 'charLenTable' => Big5CharLenTable,
79
+ 'name' => 'Big5'
80
80
  }
81
81
 
82
82
  # EUC-JP
@@ -127,10 +127,10 @@ module CharDet
127
127
  EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
128
 
129
129
  EUCJPSMModel = {'classTable' => EUCJP_cls,
130
- 'classFactor' => 6,
131
- 'stateTable' => EUCJP_st,
132
- 'charLenTable' => EUCJPCharLenTable,
133
- 'name' => 'EUC-JP'
130
+ 'classFactor' => 6,
131
+ 'stateTable' => EUCJP_st,
132
+ 'charLenTable' => EUCJPCharLenTable,
133
+ 'name' => 'EUC-JP'
134
134
  }
135
135
 
136
136
  # EUC-KR
@@ -178,10 +178,10 @@ module CharDet
178
178
  EUCKRCharLenTable = [0, 1, 2, 0]
179
179
 
180
180
  EUCKRSMModel = {'classTable' => EUCKR_cls,
181
- 'classFactor' => 4,
182
- 'stateTable' => EUCKR_st,
183
- 'charLenTable' => EUCKRCharLenTable,
184
- 'name' => 'EUC-KR'
181
+ 'classFactor' => 4,
182
+ 'stateTable' => EUCKR_st,
183
+ 'charLenTable' => EUCKRCharLenTable,
184
+ 'name' => 'EUC-KR'
185
185
  }
186
186
 
187
187
  # EUC-TW
@@ -233,10 +233,10 @@ module CharDet
233
233
  EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
234
 
235
235
  EUCTWSMModel = {'classTable' => EUCTW_cls,
236
- 'classFactor' => 7,
237
- 'stateTable' => EUCTW_st,
238
- 'charLenTable' => EUCTWCharLenTable,
239
- 'name' => 'x-euc-tw'
236
+ 'classFactor' => 7,
237
+ 'stateTable' => EUCTW_st,
238
+ 'charLenTable' => EUCTWCharLenTable,
239
+ 'name' => 'x-euc-tw'
240
240
  }
241
241
 
242
242
  # GB2312
@@ -293,10 +293,10 @@ module CharDet
293
293
  GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
294
 
295
295
  GB2312SMModel = {'classTable' => GB2312_cls,
296
- 'classFactor' => 7,
297
- 'stateTable' => GB2312_st,
298
- 'charLenTable' => GB2312CharLenTable,
299
- 'name' => 'GB2312'
296
+ 'classFactor' => 7,
297
+ 'stateTable' => GB2312_st,
298
+ 'charLenTable' => GB2312CharLenTable,
299
+ 'name' => 'GB2312'
300
300
  }
301
301
 
302
302
  # Shift_JIS
@@ -347,10 +347,10 @@ module CharDet
347
347
  SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
348
 
349
349
  SJISSMModel = {'classTable' => SJIS_cls,
350
- 'classFactor' => 6,
351
- 'stateTable' => SJIS_st,
352
- 'charLenTable' => SJISCharLenTable,
353
- 'name' => 'Shift_JIS'
350
+ 'classFactor' => 6,
351
+ 'stateTable' => SJIS_st,
352
+ 'charLenTable' => SJISCharLenTable,
353
+ 'name' => 'Shift_JIS'
354
354
  }
355
355
 
356
356
  # UCS2-BE
@@ -403,10 +403,10 @@ module CharDet
403
403
  UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
404
 
405
405
  UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
- 'classFactor' => 6,
407
- 'stateTable' => UCS2BE_st,
408
- 'charLenTable' => UCS2BECharLenTable,
409
- 'name' => 'UTF-16BE'
406
+ 'classFactor' => 6,
407
+ 'stateTable' => UCS2BE_st,
408
+ 'charLenTable' => UCS2BECharLenTable,
409
+ 'name' => 'UTF-16BE'
410
410
  }
411
411
 
412
412
  # UCS2-LE
@@ -459,10 +459,10 @@ module CharDet
459
459
  UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
460
460
 
461
461
  UCS2LESMModel = {'classTable' => UCS2LE_cls,
462
- 'classFactor' => 6,
463
- 'stateTable' => UCS2LE_st,
464
- 'charLenTable' => UCS2LECharLenTable,
465
- 'name' => 'UTF-16LE'
462
+ 'classFactor' => 6,
463
+ 'stateTable' => UCS2LE_st,
464
+ 'charLenTable' => UCS2LECharLenTable,
465
+ 'name' => 'UTF-16LE'
466
466
  }
467
467
 
468
468
  # UTF-8
@@ -534,9 +534,9 @@ module CharDet
534
534
  UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
535
535
 
536
536
  UTF8SMModel = {'classTable' => UTF8_cls,
537
- 'classFactor' => 16,
538
- 'stateTable' => UTF8_st,
539
- 'charLenTable' => UTF8CharLenTable,
540
- 'name' => 'UTF-8'
537
+ 'classFactor' => 16,
538
+ 'stateTable' => UTF8_st,
539
+ 'charLenTable' => UTF8CharLenTable,
540
+ 'name' => 'UTF-8'
541
541
  }
542
542
  end
@@ -57,51 +57,51 @@ module CharDet
57
57
 
58
58
  def get_charset_name
59
59
  if @_mNameProber
60
- return @_mNameProber.get_charset_name()
60
+ return @_mNameProber.get_charset_name()
61
61
  else
62
- return @_mModel['charsetName']
62
+ return @_mModel['charsetName']
63
63
  end
64
64
  end
65
65
 
66
66
  def feed(aBuf)
67
67
  if not @_mModel['keepEnglishLetter']
68
- aBuf = filter_without_english_letters(aBuf)
68
+ aBuf = filter_without_english_letters(aBuf)
69
69
  end
70
70
  aLen = aBuf.length
71
71
  if not aLen
72
- return get_state()
72
+ return get_state()
73
73
  end
74
74
  aBuf.each_byte do |b|
75
- c = b.chr
76
- order = @_mModel['charToOrderMap'][c[0]]
77
- if order < SYMBOL_CAT_ORDER
78
- @_mTotalChar += 1
79
- end
80
- if order < SAMPLE_SIZE
81
- @_mFreqChar += 1
82
- if @_mLastOrder < SAMPLE_SIZE
83
- @_mTotalSeqs += 1
84
- if not @_mReversed
85
- @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
86
- else # reverse the order of the letters in the lookup
87
- @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
88
- end
89
- end
90
- end
91
- @_mLastOrder = order
75
+ c = b.chr
76
+ order = @_mModel['charToOrderMap'][c[0]]
77
+ if order < SYMBOL_CAT_ORDER
78
+ @_mTotalChar += 1
79
+ end
80
+ if order < SAMPLE_SIZE
81
+ @_mFreqChar += 1
82
+ if @_mLastOrder < SAMPLE_SIZE
83
+ @_mTotalSeqs += 1
84
+ if not @_mReversed
85
+ @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
86
+ else # reverse the order of the letters in the lookup
87
+ @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
88
+ end
89
+ end
90
+ end
91
+ @_mLastOrder = order
92
92
  end
93
93
 
94
94
  if get_state() == EDetecting
95
- if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
96
- cf = get_confidence()
97
- if cf > POSITIVE_SHORTCUT_THRESHOLD
98
- $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
99
- @_mState = EFoundIt
100
- elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
101
- $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
102
- @_mState = ENotMe
103
- end
104
- end
95
+ if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
96
+ cf = get_confidence()
97
+ if cf > POSITIVE_SHORTCUT_THRESHOLD
98
+ $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
99
+ @_mState = EFoundIt
100
+ elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
101
+ $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
102
+ @_mState = ENotMe
103
+ end
104
+ end
105
105
  end
106
106
 
107
107
  return get_state()
@@ -110,13 +110,13 @@ module CharDet
110
110
  def get_confidence
111
111
  r = 0.01
112
112
  if @_mTotalSeqs > 0
113
- # print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
114
- r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
115
- # print r, self._mFreqChar, self._mTotalChar
116
- r = r * @_mFreqChar / @_mTotalChar
117
- if r >= 1.0
118
- r = 0.99
119
- end
113
+ # print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
114
+ r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
115
+ # print r, self._mFreqChar, self._mTotalChar
116
+ r = r * @_mFreqChar / @_mTotalChar
117
+ if r >= 1.0
118
+ r = 0.99
119
+ end
120
120
  end
121
121
  return r
122
122
  end