tmail 1.2.7 → 1.2.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +5 -0
- data/lib/tmail/attachments.rb +1 -1
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +37 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +39 -39
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +2 -2
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +30 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +29 -29
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +31 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +13 -13
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +7 -9
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +36 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +38 -38
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +13 -15
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +23 -23
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +79 -78
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +22 -22
- data/lib/tmail/version.rb +2 -1
- data/tmail.gemspec +1 -1
- metadata +1 -1
@@ -144,22 +144,22 @@ module CharDet
|
|
144
144
|
# this character will simply our logic and improve performance.
|
145
145
|
i = @_mNeedToSkipCharNum
|
146
146
|
while i < aLen
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
147
|
+
order, charLen = get_order(aBuf[i...i+2])
|
148
|
+
i += charLen
|
149
|
+
if i > aLen
|
150
|
+
@_mNeedToSkipCharNum = i - aLen
|
151
|
+
@_mLastCharOrder = -1
|
152
|
+
else
|
153
|
+
if (order != -1) and (@_mLastCharOrder != -1)
|
154
|
+
@_mTotalRel += 1
|
155
|
+
if @_mTotalRel > MAX_REL_THRESHOLD
|
156
|
+
@_mDone = true
|
157
|
+
break
|
158
|
+
end
|
159
|
+
@_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
|
160
|
+
end
|
161
|
+
@_mLastCharOrder = order
|
162
|
+
end
|
163
163
|
end
|
164
164
|
end
|
165
165
|
|
@@ -169,10 +169,10 @@ module CharDet
|
|
169
169
|
|
170
170
|
def get_confidence
|
171
171
|
# This is just one way to calculate confidence. It works well for me.
|
172
|
-
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
|
-
|
172
|
+
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
|
+
return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
|
174
174
|
else
|
175
|
-
|
175
|
+
return DONT_KNOW
|
176
176
|
end
|
177
177
|
end
|
178
178
|
|
@@ -188,15 +188,15 @@ module CharDet
|
|
188
188
|
# find out current char's byte length
|
189
189
|
aStr = aStr[0..1].join if aStr.class == Array
|
190
190
|
if ((aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")) or ((aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xFC"))
|
191
|
-
|
191
|
+
charLen = 2
|
192
192
|
else
|
193
|
-
|
193
|
+
charLen = 1
|
194
194
|
end
|
195
195
|
# return its order if it is hiragana
|
196
196
|
if aStr.length > 1
|
197
|
-
|
198
|
-
|
199
|
-
|
197
|
+
if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
|
198
|
+
return aStr[1] - 0x9F, charLen
|
199
|
+
end
|
200
200
|
end
|
201
201
|
|
202
202
|
return -1, charLen
|
@@ -208,19 +208,19 @@ module CharDet
|
|
208
208
|
return -1, 1 unless aStr
|
209
209
|
# find out current char's byte length
|
210
210
|
aStr = aStr[0..1].join if aStr.class == Array
|
211
|
-
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
|
-
|
211
|
+
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
|
+
charLen = 2
|
213
213
|
elsif aStr[0..0] == "\x8F"
|
214
|
-
|
214
|
+
charLen = 3
|
215
215
|
else
|
216
|
-
|
216
|
+
charLen = 1
|
217
217
|
end
|
218
218
|
|
219
219
|
# return its order if it is hiragana
|
220
220
|
if aStr.length > 1
|
221
|
-
|
222
|
-
|
223
|
-
|
221
|
+
if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
|
222
|
+
return aStr[1] - 0xA1, charLen
|
223
|
+
end
|
224
224
|
end
|
225
225
|
|
226
226
|
return -1, charLen
|
@@ -110,15 +110,15 @@ module CharDet
|
|
110
110
|
def feed(aBuf)
|
111
111
|
aBuf = filter_with_english_letters(aBuf)
|
112
112
|
aBuf.each_byte do |b|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
113
|
+
c = b.chr
|
114
|
+
charClass = Latin1_CharToClass[c[0]]
|
115
|
+
freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
|
116
|
+
if freq == 0
|
117
|
+
@_mState = ENotMe
|
118
|
+
break
|
119
|
+
end
|
120
|
+
@_mFreqCounter[freq] += 1
|
121
|
+
@_mLastCharClass = charClass
|
122
122
|
end
|
123
123
|
|
124
124
|
return get_state()
|
@@ -126,17 +126,17 @@ module CharDet
|
|
126
126
|
|
127
127
|
def get_confidence
|
128
128
|
if get_state() == ENotMe
|
129
|
-
|
129
|
+
return 0.01
|
130
130
|
end
|
131
131
|
|
132
132
|
total = @_mFreqCounter.inject{|a,b| a+b}
|
133
133
|
if total < 0.01
|
134
|
-
|
134
|
+
confidence = 0.0
|
135
135
|
else
|
136
|
-
|
136
|
+
confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
|
137
137
|
end
|
138
138
|
if confidence < 0.0
|
139
|
-
|
139
|
+
confidence = 0.0
|
140
140
|
end
|
141
141
|
# lower the confidence of latin1 so that other more accurate detector
|
142
142
|
# can take priority.
|
@@ -40,10 +40,10 @@ module CharDet
|
|
40
40
|
def reset
|
41
41
|
super
|
42
42
|
if @_mCodingSM
|
43
|
-
|
43
|
+
@_mCodingSM.reset()
|
44
44
|
end
|
45
45
|
if @_mDistributionAnalyzer
|
46
|
-
|
46
|
+
@_mDistributionAnalyzer.reset()
|
47
47
|
end
|
48
48
|
@_mLastChar = "\x00\x00"
|
49
49
|
end
|
@@ -54,30 +54,30 @@ module CharDet
|
|
54
54
|
def feed(aBuf)
|
55
55
|
aLen = aBuf.length
|
56
56
|
for i in (0...aLen)
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
57
|
+
codingState = @_mCodingSM.next_state(aBuf[i..i])
|
58
|
+
if codingState == EError
|
59
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
60
|
+
@_mState = ENotMe
|
61
|
+
break
|
62
|
+
elsif codingState == EItsMe
|
63
|
+
@_mState = EFoundIt
|
64
|
+
break
|
65
|
+
elsif codingState == EStart
|
66
|
+
charLen = @_mCodingSM.get_current_charlen()
|
67
|
+
if i == 0
|
68
|
+
@_mLastChar[1] = aBuf[0..0]
|
69
|
+
@_mDistributionAnalyzer.feed(@_mLastChar, charLen)
|
70
|
+
else
|
71
|
+
@_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
72
|
+
end
|
73
|
+
end
|
74
74
|
end
|
75
75
|
@_mLastChar[0] = aBuf[aLen-1..aLen-1]
|
76
76
|
|
77
77
|
if get_state() == EDetecting
|
78
|
-
|
79
|
-
|
80
|
-
|
78
|
+
if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
79
|
+
@_mState = EFoundIt
|
80
|
+
end
|
81
81
|
end
|
82
82
|
return get_state()
|
83
83
|
end
|
@@ -32,15 +32,13 @@ module CharDet
|
|
32
32
|
class MBCSGroupProber < CharSetGroupProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@_mProbers = [
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
EUCTWProber.new
|
43
|
-
]
|
35
|
+
@_mProbers = [ UTF8Prober.new,
|
36
|
+
SJISProber.new,
|
37
|
+
EUCJPProber.new,
|
38
|
+
GB2312Prober.new,
|
39
|
+
EUCKRProber.new,
|
40
|
+
Big5Prober.new,
|
41
|
+
EUCTWProber.new ]
|
44
42
|
reset()
|
45
43
|
end
|
46
44
|
end
|
@@ -73,10 +73,10 @@ module CharDet
|
|
73
73
|
Big5CharLenTable = [0, 1, 1, 2, 0]
|
74
74
|
|
75
75
|
Big5SMModel = {'classTable' => BIG5_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
76
|
+
'classFactor' => 5,
|
77
|
+
'stateTable' => BIG5_st,
|
78
|
+
'charLenTable' => Big5CharLenTable,
|
79
|
+
'name' => 'Big5'
|
80
80
|
}
|
81
81
|
|
82
82
|
# EUC-JP
|
@@ -127,10 +127,10 @@ module CharDet
|
|
127
127
|
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
|
128
128
|
|
129
129
|
EUCJPSMModel = {'classTable' => EUCJP_cls,
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
130
|
+
'classFactor' => 6,
|
131
|
+
'stateTable' => EUCJP_st,
|
132
|
+
'charLenTable' => EUCJPCharLenTable,
|
133
|
+
'name' => 'EUC-JP'
|
134
134
|
}
|
135
135
|
|
136
136
|
# EUC-KR
|
@@ -178,10 +178,10 @@ module CharDet
|
|
178
178
|
EUCKRCharLenTable = [0, 1, 2, 0]
|
179
179
|
|
180
180
|
EUCKRSMModel = {'classTable' => EUCKR_cls,
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
181
|
+
'classFactor' => 4,
|
182
|
+
'stateTable' => EUCKR_st,
|
183
|
+
'charLenTable' => EUCKRCharLenTable,
|
184
|
+
'name' => 'EUC-KR'
|
185
185
|
}
|
186
186
|
|
187
187
|
# EUC-TW
|
@@ -233,10 +233,10 @@ module CharDet
|
|
233
233
|
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
|
234
234
|
|
235
235
|
EUCTWSMModel = {'classTable' => EUCTW_cls,
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
236
|
+
'classFactor' => 7,
|
237
|
+
'stateTable' => EUCTW_st,
|
238
|
+
'charLenTable' => EUCTWCharLenTable,
|
239
|
+
'name' => 'x-euc-tw'
|
240
240
|
}
|
241
241
|
|
242
242
|
# GB2312
|
@@ -293,10 +293,10 @@ module CharDet
|
|
293
293
|
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
|
294
294
|
|
295
295
|
GB2312SMModel = {'classTable' => GB2312_cls,
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
296
|
+
'classFactor' => 7,
|
297
|
+
'stateTable' => GB2312_st,
|
298
|
+
'charLenTable' => GB2312CharLenTable,
|
299
|
+
'name' => 'GB2312'
|
300
300
|
}
|
301
301
|
|
302
302
|
# Shift_JIS
|
@@ -347,10 +347,10 @@ module CharDet
|
|
347
347
|
SJISCharLenTable = [0, 1, 1, 2, 0, 0]
|
348
348
|
|
349
349
|
SJISSMModel = {'classTable' => SJIS_cls,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
350
|
+
'classFactor' => 6,
|
351
|
+
'stateTable' => SJIS_st,
|
352
|
+
'charLenTable' => SJISCharLenTable,
|
353
|
+
'name' => 'Shift_JIS'
|
354
354
|
}
|
355
355
|
|
356
356
|
# UCS2-BE
|
@@ -403,10 +403,10 @@ module CharDet
|
|
403
403
|
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
|
404
404
|
|
405
405
|
UCS2BESMModel = {'classTable' => UCS2BE_cls,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
406
|
+
'classFactor' => 6,
|
407
|
+
'stateTable' => UCS2BE_st,
|
408
|
+
'charLenTable' => UCS2BECharLenTable,
|
409
|
+
'name' => 'UTF-16BE'
|
410
410
|
}
|
411
411
|
|
412
412
|
# UCS2-LE
|
@@ -459,10 +459,10 @@ module CharDet
|
|
459
459
|
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
|
460
460
|
|
461
461
|
UCS2LESMModel = {'classTable' => UCS2LE_cls,
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
462
|
+
'classFactor' => 6,
|
463
|
+
'stateTable' => UCS2LE_st,
|
464
|
+
'charLenTable' => UCS2LECharLenTable,
|
465
|
+
'name' => 'UTF-16LE'
|
466
466
|
}
|
467
467
|
|
468
468
|
# UTF-8
|
@@ -534,9 +534,9 @@ module CharDet
|
|
534
534
|
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
|
535
535
|
|
536
536
|
UTF8SMModel = {'classTable' => UTF8_cls,
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
537
|
+
'classFactor' => 16,
|
538
|
+
'stateTable' => UTF8_st,
|
539
|
+
'charLenTable' => UTF8CharLenTable,
|
540
|
+
'name' => 'UTF-8'
|
541
541
|
}
|
542
542
|
end
|
@@ -57,51 +57,51 @@ module CharDet
|
|
57
57
|
|
58
58
|
def get_charset_name
|
59
59
|
if @_mNameProber
|
60
|
-
|
60
|
+
return @_mNameProber.get_charset_name()
|
61
61
|
else
|
62
|
-
|
62
|
+
return @_mModel['charsetName']
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
66
66
|
def feed(aBuf)
|
67
67
|
if not @_mModel['keepEnglishLetter']
|
68
|
-
|
68
|
+
aBuf = filter_without_english_letters(aBuf)
|
69
69
|
end
|
70
70
|
aLen = aBuf.length
|
71
71
|
if not aLen
|
72
|
-
|
72
|
+
return get_state()
|
73
73
|
end
|
74
74
|
aBuf.each_byte do |b|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
75
|
+
c = b.chr
|
76
|
+
order = @_mModel['charToOrderMap'][c[0]]
|
77
|
+
if order < SYMBOL_CAT_ORDER
|
78
|
+
@_mTotalChar += 1
|
79
|
+
end
|
80
|
+
if order < SAMPLE_SIZE
|
81
|
+
@_mFreqChar += 1
|
82
|
+
if @_mLastOrder < SAMPLE_SIZE
|
83
|
+
@_mTotalSeqs += 1
|
84
|
+
if not @_mReversed
|
85
|
+
@_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
|
86
|
+
else # reverse the order of the letters in the lookup
|
87
|
+
@_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
@_mLastOrder = order
|
92
92
|
end
|
93
93
|
|
94
94
|
if get_state() == EDetecting
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
95
|
+
if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
|
96
|
+
cf = get_confidence()
|
97
|
+
if cf > POSITIVE_SHORTCUT_THRESHOLD
|
98
|
+
$stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
|
99
|
+
@_mState = EFoundIt
|
100
|
+
elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
|
101
|
+
$stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
|
102
|
+
@_mState = ENotMe
|
103
|
+
end
|
104
|
+
end
|
105
105
|
end
|
106
106
|
|
107
107
|
return get_state()
|
@@ -110,13 +110,13 @@ module CharDet
|
|
110
110
|
def get_confidence
|
111
111
|
r = 0.01
|
112
112
|
if @_mTotalSeqs > 0
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
113
|
+
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
|
114
|
+
r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
|
115
|
+
# print r, self._mFreqChar, self._mTotalChar
|
116
|
+
r = r * @_mFreqChar / @_mTotalChar
|
117
|
+
if r >= 1.0
|
118
|
+
r = 0.99
|
119
|
+
end
|
120
120
|
end
|
121
121
|
return r
|
122
122
|
end
|