tmail 1.2.7 → 1.2.7.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -144,22 +144,22 @@ module CharDet
144
144
  # this character will simplify our logic and improve performance.
145
145
  i = @_mNeedToSkipCharNum
146
146
  while i < aLen
147
- order, charLen = get_order(aBuf[i...i+2])
148
- i += charLen
149
- if i > aLen
150
- @_mNeedToSkipCharNum = i - aLen
151
- @_mLastCharOrder = -1
152
- else
153
- if (order != -1) and (@_mLastCharOrder != -1):
154
- @_mTotalRel += 1
155
- if @_mTotalRel > MAX_REL_THRESHOLD:
156
- @_mDone = true
157
- break
158
- end
159
- @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
160
- end
161
- @_mLastCharOrder = order
162
- end
147
+ order, charLen = get_order(aBuf[i...i+2])
148
+ i += charLen
149
+ if i > aLen
150
+ @_mNeedToSkipCharNum = i - aLen
151
+ @_mLastCharOrder = -1
152
+ else
153
+ if (order != -1) and (@_mLastCharOrder != -1)
154
+ @_mTotalRel += 1
155
+ if @_mTotalRel > MAX_REL_THRESHOLD
156
+ @_mDone = true
157
+ break
158
+ end
159
+ @_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
160
+ end
161
+ @_mLastCharOrder = order
162
+ end
163
163
  end
164
164
  end
165
165
 
@@ -169,10 +169,10 @@ module CharDet
169
169
 
170
170
  def get_confidence
171
171
  # This is just one way to calculate confidence. It works well for me.
172
- if @_mTotalRel > MINIMUM_DATA_THRESHOLD:
173
- return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
172
+ if @_mTotalRel > MINIMUM_DATA_THRESHOLD
173
+ return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
174
174
  else
175
- return DONT_KNOW
175
+ return DONT_KNOW
176
176
  end
177
177
  end
178
178
 
@@ -188,15 +188,15 @@ module CharDet
188
188
  # find out current char's byte length
189
189
  aStr = aStr[0..1].join if aStr.class == Array
190
190
  if ((aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")) or ((aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xFC"))
191
- charLen = 2
191
+ charLen = 2
192
192
  else
193
- charLen = 1
193
+ charLen = 1
194
194
  end
195
195
  # return its order if it is hiragana
196
196
  if aStr.length > 1
197
- if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
198
- return aStr[1] - 0x9F, charLen
199
- end
197
+ if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
198
+ return aStr[1] - 0x9F, charLen
199
+ end
200
200
  end
201
201
 
202
202
  return -1, charLen
@@ -208,19 +208,19 @@ module CharDet
208
208
  return -1, 1 unless aStr
209
209
  # find out current char's byte length
210
210
  aStr = aStr[0..1].join if aStr.class == Array
211
- if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE")):
212
- charLen = 2
211
+ if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
212
+ charLen = 2
213
213
  elsif aStr[0..0] == "\x8F"
214
- charLen = 3
214
+ charLen = 3
215
215
  else
216
- charLen = 1
216
+ charLen = 1
217
217
  end
218
218
 
219
219
  # return its order if it is hiragana
220
220
  if aStr.length > 1
221
- if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
222
- return aStr[1] - 0xA1, charLen
223
- end
221
+ if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
222
+ return aStr[1] - 0xA1, charLen
223
+ end
224
224
  end
225
225
 
226
226
  return -1, charLen
@@ -110,15 +110,15 @@ module CharDet
110
110
  def feed(aBuf)
111
111
  aBuf = filter_with_english_letters(aBuf)
112
112
  aBuf.each_byte do |b|
113
- c = b.chr
114
- charClass = Latin1_CharToClass[c[0]]
115
- freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
- if freq == 0
117
- @_mState = ENotMe
118
- break
119
- end
120
- @_mFreqCounter[freq] += 1
121
- @_mLastCharClass = charClass
113
+ c = b.chr
114
+ charClass = Latin1_CharToClass[c[0]]
115
+ freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
+ if freq == 0
117
+ @_mState = ENotMe
118
+ break
119
+ end
120
+ @_mFreqCounter[freq] += 1
121
+ @_mLastCharClass = charClass
122
122
  end
123
123
 
124
124
  return get_state()
@@ -126,17 +126,17 @@ module CharDet
126
126
 
127
127
  def get_confidence
128
128
  if get_state() == ENotMe
129
- return 0.01
129
+ return 0.01
130
130
  end
131
131
 
132
132
  total = @_mFreqCounter.inject{|a,b| a+b}
133
133
  if total < 0.01
134
- confidence = 0.0
134
+ confidence = 0.0
135
135
  else
136
- confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
136
+ confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
137
137
  end
138
138
  if confidence < 0.0
139
- confidence = 0.0
139
+ confidence = 0.0
140
140
  end
141
141
  # lower the confidence of latin1 so that other more accurate detectors
142
142
  # can take priority.
@@ -40,10 +40,10 @@ module CharDet
40
40
  def reset
41
41
  super
42
42
  if @_mCodingSM
43
- @_mCodingSM.reset()
43
+ @_mCodingSM.reset()
44
44
  end
45
45
  if @_mDistributionAnalyzer
46
- @_mDistributionAnalyzer.reset()
46
+ @_mDistributionAnalyzer.reset()
47
47
  end
48
48
  @_mLastChar = "\x00\x00"
49
49
  end
@@ -54,30 +54,30 @@ module CharDet
54
54
  def feed(aBuf)
55
55
  aLen = aBuf.length
56
56
  for i in (0...aLen)
57
- codingState = @_mCodingSM.next_state(aBuf[i..i])
58
- if codingState == EError
59
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
60
- @_mState = ENotMe
61
- break
62
- elsif codingState == EItsMe
63
- @_mState = EFoundIt
64
- break
65
- elsif codingState == EStart
66
- charLen = @_mCodingSM.get_current_charlen()
67
- if i == 0
68
- @_mLastChar[1] = aBuf[0..0]
69
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
70
- else
71
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
72
- end
73
- end
57
+ codingState = @_mCodingSM.next_state(aBuf[i..i])
58
+ if codingState == EError
59
+ $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
60
+ @_mState = ENotMe
61
+ break
62
+ elsif codingState == EItsMe
63
+ @_mState = EFoundIt
64
+ break
65
+ elsif codingState == EStart
66
+ charLen = @_mCodingSM.get_current_charlen()
67
+ if i == 0
68
+ @_mLastChar[1] = aBuf[0..0]
69
+ @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
70
+ else
71
+ @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
72
+ end
73
+ end
74
74
  end
75
75
  @_mLastChar[0] = aBuf[aLen-1..aLen-1]
76
76
 
77
77
  if get_state() == EDetecting
78
- if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
79
- @_mState = EFoundIt
80
- end
78
+ if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
79
+ @_mState = EFoundIt
80
+ end
81
81
  end
82
82
  return get_state()
83
83
  end
@@ -32,15 +32,13 @@ module CharDet
32
32
  class MBCSGroupProber < CharSetGroupProber
33
33
  def initialize
34
34
  super
35
- @_mProbers = [
36
- UTF8Prober.new,
37
- SJISProber.new,
38
- EUCJPProber.new,
39
- GB2312Prober.new,
40
- EUCKRProber.new,
41
- Big5Prober.new,
42
- EUCTWProber.new
43
- ]
35
+ @_mProbers = [ UTF8Prober.new,
36
+ SJISProber.new,
37
+ EUCJPProber.new,
38
+ GB2312Prober.new,
39
+ EUCKRProber.new,
40
+ Big5Prober.new,
41
+ EUCTWProber.new ]
44
42
  reset()
45
43
  end
46
44
  end
@@ -73,10 +73,10 @@ module CharDet
73
73
  Big5CharLenTable = [0, 1, 1, 2, 0]
74
74
 
75
75
  Big5SMModel = {'classTable' => BIG5_cls,
76
- 'classFactor' => 5,
77
- 'stateTable' => BIG5_st,
78
- 'charLenTable' => Big5CharLenTable,
79
- 'name' => 'Big5'
76
+ 'classFactor' => 5,
77
+ 'stateTable' => BIG5_st,
78
+ 'charLenTable' => Big5CharLenTable,
79
+ 'name' => 'Big5'
80
80
  }
81
81
 
82
82
  # EUC-JP
@@ -127,10 +127,10 @@ module CharDet
127
127
  EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
128
 
129
129
  EUCJPSMModel = {'classTable' => EUCJP_cls,
130
- 'classFactor' => 6,
131
- 'stateTable' => EUCJP_st,
132
- 'charLenTable' => EUCJPCharLenTable,
133
- 'name' => 'EUC-JP'
130
+ 'classFactor' => 6,
131
+ 'stateTable' => EUCJP_st,
132
+ 'charLenTable' => EUCJPCharLenTable,
133
+ 'name' => 'EUC-JP'
134
134
  }
135
135
 
136
136
  # EUC-KR
@@ -178,10 +178,10 @@ module CharDet
178
178
  EUCKRCharLenTable = [0, 1, 2, 0]
179
179
 
180
180
  EUCKRSMModel = {'classTable' => EUCKR_cls,
181
- 'classFactor' => 4,
182
- 'stateTable' => EUCKR_st,
183
- 'charLenTable' => EUCKRCharLenTable,
184
- 'name' => 'EUC-KR'
181
+ 'classFactor' => 4,
182
+ 'stateTable' => EUCKR_st,
183
+ 'charLenTable' => EUCKRCharLenTable,
184
+ 'name' => 'EUC-KR'
185
185
  }
186
186
 
187
187
  # EUC-TW
@@ -233,10 +233,10 @@ module CharDet
233
233
  EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
234
 
235
235
  EUCTWSMModel = {'classTable' => EUCTW_cls,
236
- 'classFactor' => 7,
237
- 'stateTable' => EUCTW_st,
238
- 'charLenTable' => EUCTWCharLenTable,
239
- 'name' => 'x-euc-tw'
236
+ 'classFactor' => 7,
237
+ 'stateTable' => EUCTW_st,
238
+ 'charLenTable' => EUCTWCharLenTable,
239
+ 'name' => 'x-euc-tw'
240
240
  }
241
241
 
242
242
  # GB2312
@@ -293,10 +293,10 @@ module CharDet
293
293
  GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
294
 
295
295
  GB2312SMModel = {'classTable' => GB2312_cls,
296
- 'classFactor' => 7,
297
- 'stateTable' => GB2312_st,
298
- 'charLenTable' => GB2312CharLenTable,
299
- 'name' => 'GB2312'
296
+ 'classFactor' => 7,
297
+ 'stateTable' => GB2312_st,
298
+ 'charLenTable' => GB2312CharLenTable,
299
+ 'name' => 'GB2312'
300
300
  }
301
301
 
302
302
  # Shift_JIS
@@ -347,10 +347,10 @@ module CharDet
347
347
  SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
348
 
349
349
  SJISSMModel = {'classTable' => SJIS_cls,
350
- 'classFactor' => 6,
351
- 'stateTable' => SJIS_st,
352
- 'charLenTable' => SJISCharLenTable,
353
- 'name' => 'Shift_JIS'
350
+ 'classFactor' => 6,
351
+ 'stateTable' => SJIS_st,
352
+ 'charLenTable' => SJISCharLenTable,
353
+ 'name' => 'Shift_JIS'
354
354
  }
355
355
 
356
356
  # UCS2-BE
@@ -403,10 +403,10 @@ module CharDet
403
403
  UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
404
 
405
405
  UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
- 'classFactor' => 6,
407
- 'stateTable' => UCS2BE_st,
408
- 'charLenTable' => UCS2BECharLenTable,
409
- 'name' => 'UTF-16BE'
406
+ 'classFactor' => 6,
407
+ 'stateTable' => UCS2BE_st,
408
+ 'charLenTable' => UCS2BECharLenTable,
409
+ 'name' => 'UTF-16BE'
410
410
  }
411
411
 
412
412
  # UCS2-LE
@@ -459,10 +459,10 @@ module CharDet
459
459
  UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
460
460
 
461
461
  UCS2LESMModel = {'classTable' => UCS2LE_cls,
462
- 'classFactor' => 6,
463
- 'stateTable' => UCS2LE_st,
464
- 'charLenTable' => UCS2LECharLenTable,
465
- 'name' => 'UTF-16LE'
462
+ 'classFactor' => 6,
463
+ 'stateTable' => UCS2LE_st,
464
+ 'charLenTable' => UCS2LECharLenTable,
465
+ 'name' => 'UTF-16LE'
466
466
  }
467
467
 
468
468
  # UTF-8
@@ -534,9 +534,9 @@ module CharDet
534
534
  UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
535
535
 
536
536
  UTF8SMModel = {'classTable' => UTF8_cls,
537
- 'classFactor' => 16,
538
- 'stateTable' => UTF8_st,
539
- 'charLenTable' => UTF8CharLenTable,
540
- 'name' => 'UTF-8'
537
+ 'classFactor' => 16,
538
+ 'stateTable' => UTF8_st,
539
+ 'charLenTable' => UTF8CharLenTable,
540
+ 'name' => 'UTF-8'
541
541
  }
542
542
  end
@@ -57,51 +57,51 @@ module CharDet
57
57
 
58
58
  def get_charset_name
59
59
  if @_mNameProber
60
- return @_mNameProber.get_charset_name()
60
+ return @_mNameProber.get_charset_name()
61
61
  else
62
- return @_mModel['charsetName']
62
+ return @_mModel['charsetName']
63
63
  end
64
64
  end
65
65
 
66
66
  def feed(aBuf)
67
67
  if not @_mModel['keepEnglishLetter']
68
- aBuf = filter_without_english_letters(aBuf)
68
+ aBuf = filter_without_english_letters(aBuf)
69
69
  end
70
70
  aLen = aBuf.length
71
71
  if not aLen
72
- return get_state()
72
+ return get_state()
73
73
  end
74
74
  aBuf.each_byte do |b|
75
- c = b.chr
76
- order = @_mModel['charToOrderMap'][c[0]]
77
- if order < SYMBOL_CAT_ORDER
78
- @_mTotalChar += 1
79
- end
80
- if order < SAMPLE_SIZE
81
- @_mFreqChar += 1
82
- if @_mLastOrder < SAMPLE_SIZE
83
- @_mTotalSeqs += 1
84
- if not @_mReversed
85
- @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
86
- else # reverse the order of the letters in the lookup
87
- @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
88
- end
89
- end
90
- end
91
- @_mLastOrder = order
75
+ c = b.chr
76
+ order = @_mModel['charToOrderMap'][c[0]]
77
+ if order < SYMBOL_CAT_ORDER
78
+ @_mTotalChar += 1
79
+ end
80
+ if order < SAMPLE_SIZE
81
+ @_mFreqChar += 1
82
+ if @_mLastOrder < SAMPLE_SIZE
83
+ @_mTotalSeqs += 1
84
+ if not @_mReversed
85
+ @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
86
+ else # reverse the order of the letters in the lookup
87
+ @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
88
+ end
89
+ end
90
+ end
91
+ @_mLastOrder = order
92
92
  end
93
93
 
94
94
  if get_state() == EDetecting
95
- if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
96
- cf = get_confidence()
97
- if cf > POSITIVE_SHORTCUT_THRESHOLD
98
- $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
99
- @_mState = EFoundIt
100
- elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
101
- $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
102
- @_mState = ENotMe
103
- end
104
- end
95
+ if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
96
+ cf = get_confidence()
97
+ if cf > POSITIVE_SHORTCUT_THRESHOLD
98
+ $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
99
+ @_mState = EFoundIt
100
+ elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
101
+ $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
102
+ @_mState = ENotMe
103
+ end
104
+ end
105
105
  end
106
106
 
107
107
  return get_state()
@@ -110,13 +110,13 @@ module CharDet
110
110
  def get_confidence
111
111
  r = 0.01
112
112
  if @_mTotalSeqs > 0
113
- # print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
114
- r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
115
- # print r, self._mFreqChar, self._mTotalChar
116
- r = r * @_mFreqChar / @_mTotalChar
117
- if r >= 1.0
118
- r = 0.99
119
- end
113
+ # print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
114
+ r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
115
+ # print r, self._mFreqChar, self._mTotalChar
116
+ r = r * @_mFreqChar / @_mTotalChar
117
+ if r >= 1.0
118
+ r = 0.99
119
+ end
120
120
  end
121
121
  return r
122
122
  end