tmail 1.2.7 → 1.2.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +5 -0
- data/lib/tmail/attachments.rb +1 -1
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +37 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +39 -39
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +2 -2
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +30 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +29 -29
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +31 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +13 -13
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +7 -9
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +36 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +38 -38
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +13 -15
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +23 -23
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +79 -78
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +22 -22
- data/lib/tmail/version.rb +2 -1
- data/tmail.gemspec +1 -1
- metadata +1 -1
data/CHANGES
CHANGED
data/lib/tmail/attachments.rb
CHANGED
@@ -6,7 +6,8 @@
|
|
6
6
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
|
-
# Contributor(s)
|
9
|
+
# Contributor(s)
|
10
|
+
|
10
11
|
# Jeff Hodges
|
11
12
|
# Mark Pilgrim - port to Python
|
12
13
|
#
|
@@ -49,19 +50,19 @@ module CharDet
|
|
49
50
|
# Feed one character of known byte length into the distribution analysis.
#
# Only characters whose length is exactly 2 are looked up; get_order maps
# such a character to an index, and a negative index means the character
# is not counted at all.
#
# aStr     - the character, passed unchanged to get_order
# aCharLen - byte length of the character
#
# Updates @_mTotalChars and, for indexes inside the table whose entry is
# below 512, @_mFreqChars.
def feed(aStr, aCharLen)
  # we only care about 2-byte characters in our distribution analysis
  order = aCharLen == 2 ? get_order(aStr) : -1
  return if order < 0

  # order is valid
  @_mTotalChars += 1
  return unless order < @_mTableSize

  @_mFreqChars += 1 if 512 > @_mCharToFreqOrder[order]
end
|
67
68
|
|
@@ -69,14 +70,14 @@ module CharDet
|
|
69
70
|
# """return confidence based on existing data"""
|
70
71
|
# if we didn't receive any character in our consideration range, return negative answer
|
71
72
|
if @_mTotalChars <= 0
|
72
|
-
|
73
|
+
return SURE_NO
|
73
74
|
end
|
74
75
|
|
75
|
-
if @_mTotalChars != @_mFreqChars
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
76
|
+
if @_mTotalChars != @_mFreqChars
|
77
|
+
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
|
78
|
+
if r < SURE_YES
|
79
|
+
return r
|
80
|
+
end
|
80
81
|
end
|
81
82
|
|
82
83
|
# normalize confidence (we don't want to be 100% sure)
|
@@ -111,9 +112,9 @@ module CharDet
|
|
111
112
|
# second byte range: 0xa1 -- 0xfe
|
112
113
|
# no validation needed here. State machine has done that
|
113
114
|
if aStr[0..0] >= "\xC4"
|
114
|
-
|
115
|
+
return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
|
115
116
|
else
|
116
|
-
|
117
|
+
return -1
|
117
118
|
end
|
118
119
|
end
|
119
120
|
end
|
@@ -132,9 +133,9 @@ module CharDet
|
|
132
133
|
# second byte range: 0xa1 -- 0xfe
|
133
134
|
# no validation needed here. State machine has done that
|
134
135
|
if aStr[0..0] >= "\xB0"
|
135
|
-
|
136
|
+
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
|
136
137
|
else
|
137
|
-
|
138
|
+
return -1
|
138
139
|
end
|
139
140
|
end
|
140
141
|
end
|
@@ -153,9 +154,9 @@ module CharDet
|
|
153
154
|
# second byte range: 0xa1 -- 0xfe
|
154
155
|
# no validation needed here. State machine has done that
|
155
156
|
if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
|
156
|
-
|
157
|
+
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
|
157
158
|
else
|
158
|
-
|
159
|
+
return -1
|
159
160
|
end
|
160
161
|
end
|
161
162
|
end
|
@@ -174,13 +175,13 @@ module CharDet
|
|
174
175
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
175
176
|
# no validation needed here. State machine has done that
|
176
177
|
if aStr[0..0] >= "\xA4"
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
178
|
+
if aStr[1..1] >= "\xA1"
|
179
|
+
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
|
180
|
+
else
|
181
|
+
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
|
182
|
+
end
|
182
183
|
else
|
183
|
-
|
184
|
+
return -1
|
184
185
|
end
|
185
186
|
end
|
186
187
|
end
|
@@ -200,15 +201,15 @@ module CharDet
|
|
200
201
|
# no validation needed here. State machine has done that
|
201
202
|
aStr = aStr[0..1].join if aStr.class == Array
|
202
203
|
if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
|
203
|
-
|
204
|
+
order = 188 * (aStr[0] - 0x81)
|
204
205
|
elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
|
205
|
-
|
206
|
+
order = 188 * (aStr[0] - 0xE0 + 31)
|
206
207
|
else
|
207
|
-
|
208
|
+
return -1
|
208
209
|
end
|
209
210
|
order = order + aStr[1] - 0x40
|
210
211
|
if aStr[1..1] > "\x7F"
|
211
|
-
|
212
|
+
order =- 1
|
212
213
|
end
|
213
214
|
return order
|
214
215
|
end
|
@@ -227,10 +228,10 @@ module CharDet
|
|
227
228
|
# first byte range: 0xa0 -- 0xfe
|
228
229
|
# second byte range: 0xa1 -- 0xfe
|
229
230
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0..0] >= "\xA0"
|
231
|
-
|
231
|
+
if aStr[0..0] >= "\xA0"
|
232
|
+
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
|
232
233
|
else
|
233
|
-
|
234
|
+
return -1
|
234
235
|
end
|
235
236
|
end
|
236
237
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
|
-
# Contributor(s)
|
9
|
+
# Contributor(s)
|
10
10
|
# Jeff Hodges - port to Ruby
|
11
11
|
# Mark Pilgrim - port to Python
|
12
12
|
#
|
@@ -40,42 +40,42 @@ module CharDet
|
|
40
40
|
super
|
41
41
|
@_mActiveNum = 0
|
42
42
|
|
43
|
-
for prober in @_mProbers
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
for prober in @_mProbers
|
44
|
+
if prober
|
45
|
+
prober.reset()
|
46
|
+
prober.active = true
|
47
|
+
@_mActiveNum += 1
|
48
|
+
end
|
49
49
|
end
|
50
50
|
@_mBestGuessProber = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
# Charset name reported by the current best-guess prober.
#
# When @_mBestGuessProber is not set, get_confidence is called first to
# give it a chance to pick one. Returns nil if there is still no best
# guess afterwards.
def get_charset_name
  get_confidence unless @_mBestGuessProber
  # self._mBestGuessProber = self._mProbers[0]
  return nil unless @_mBestGuessProber

  @_mBestGuessProber.get_charset_name
end
|
61
61
|
|
62
62
|
# Pass aBuf to every active prober in the group and update the group's
# bookkeeping from each prober's answer.
#
# - nil entries, inactive probers and probers whose feed returns nil are
#   skipped.
# - EFoundIt: that prober becomes @_mBestGuessProber and the pass stops.
# - ENotMe:   the prober is deactivated and @_mActiveNum is decremented;
#             once it reaches zero (or below) @_mState is set to ENotMe
#             and the pass stops.
#
# Always returns get_state.
def feed(aBuf)
  @_mProbers.each do |prober|
    next unless prober && prober.active

    st = prober.feed(aBuf)
    next unless st

    if st == EFoundIt
      @_mBestGuessProber = prober
      return get_state
    elsif st == ENotMe
      prober.active = false
      @_mActiveNum -= 1
      if @_mActiveNum <= 0
        @_mState = ENotMe
        return get_state
      end
    end
  end
  get_state
end
|
@@ -83,28 +83,28 @@ module CharDet
|
|
83
83
|
# Confidence of the whole prober group.
#
# Returns 0.99 when get_state is EFoundIt and 0.01 when it is ENotMe.
# Otherwise every active prober is asked for its confidence; the highest
# value above 0.0 is returned and the prober that produced it is stored in
# @_mBestGuessProber. If no prober qualifies, @_mBestGuessProber is left
# nil and 0.0 is returned.
def get_confidence()
  st = get_state
  return 0.99 if st == EFoundIt
  return 0.01 if st == ENotMe

  best_conf = 0.0
  @_mBestGuessProber = nil
  @_mProbers.each do |prober|
    next unless prober

    unless prober.active
      $stderr << "#{prober.get_charset_name()} not active\n" if $debug
      next
    end

    cf = prober.get_confidence
    $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
    if best_conf < cf
      best_conf = cf
      @_mBestGuessProber = prober
    end
  end

  # best_conf only moves together with @_mBestGuessProber, so it is still
  # 0.0 when no prober was selected.
  best_conf
  # Fallback from the Python original, left commented out in this port:
  # else
  # self._mBestGuessProber = self._mProbers[0]
  # return self._mBestGuessProber.get_confidence()
end
|
@@ -44,8 +44,8 @@ module CharDet
|
|
44
44
|
# if it is first byte, we also get byte length
|
45
45
|
byteCls = @_mModel['classTable'][c[0]]
|
46
46
|
if @_mCurrentState == EStart
|
47
|
-
|
48
|
-
|
47
|
+
@_mCurrentBytePos = 0
|
48
|
+
@_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
|
49
49
|
end
|
50
50
|
# from byte's class and stateTable, we get its next state
|
51
51
|
@_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
|
@@ -30,21 +30,19 @@ module CharDet
|
|
30
30
|
class EscCharSetProber < CharSetProber
|
31
31
|
# Create one CodingStateMachine per model constant listed below, in that
# order, store them in @_mCodingSM, then call reset.
def initialize
  super()
  models = [HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel]
  @_mCodingSM = models.map { |model| CodingStateMachine.new(model) }
  reset
end
|
41
39
|
|
42
40
|
def reset
|
43
41
|
super()
|
44
|
-
for codingSM in @_mCodingSM
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
for codingSM in @_mCodingSM
|
43
|
+
next if not codingSM
|
44
|
+
codingSM.active = true
|
45
|
+
codingSM.reset()
|
48
46
|
end
|
49
47
|
@_mActiveSM = @_mCodingSM.length
|
50
48
|
@_mDetectedCharset = nil
|
@@ -56,35 +54,36 @@ module CharDet
|
|
56
54
|
|
57
55
|
# Confidence of this prober: 0.99 once @_mDetectedCharset has been set,
# 0.00 otherwise.
def get_confidence
  @_mDetectedCharset ? 0.99 : 0.00
end
|
64
62
|
|
65
63
|
# Run every byte of aBuf through each active coding state machine.
#
# - nil entries and inactive state machines are skipped.
# - EError:  the state machine is deactivated and @_mActiveSM decremented;
#            once it reaches zero (or below) @_mState becomes ENotMe and
#            feeding stops.
# - EItsMe:  @_mState becomes EFoundIt, @_mDetectedCharset is taken from
#            the state machine's get_coding_state_machine, and feeding
#            stops.
#
# Always returns get_state.
def feed(aBuf)
  aBuf.each_byte do |b|
    c = b.chr
    @_mCodingSM.each do |codingSM|
      next unless codingSM && codingSM.active

      codingState = codingSM.next_state(c)
      if codingState == EError
        codingSM.active = false
        @_mActiveSM -= 1
        if @_mActiveSM <= 0
          @_mState = ENotMe
          return get_state
        end
      elsif codingState == EItsMe
        @_mState = EFoundIt
        @_mDetectedCharset = codingSM.get_coding_state_machine
        return get_state
      end
    end
  end

  get_state
end
|
87
|
+
|
89
88
|
end
|
90
89
|
end
|
@@ -48,33 +48,33 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @_mCodingSM.next_state(aBuf[i..i])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@_mState = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@_mState = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @_mCodingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@_mLastChar[1] = aBuf[0..0]
|
63
|
+
@_mContextAnalyzer.feed(@_mLastChar, charLen)
|
64
|
+
@_mDistributionAnalyzer.feed(@_mLastChar, charLen)
|
65
|
+
else
|
66
|
+
@_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
67
|
+
@_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
72
|
@_mLastChar[0] = aBuf[aLen-1..aLen-1]
|
73
73
|
|
74
74
|
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@_mState = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
@@ -215,34 +215,34 @@ module CharDet
|
|
215
215
|
# so the word boundary detection works properly. [MAP]
|
216
216
|
|
217
217
|
if get_state() == ENotMe
|
218
|
-
|
219
|
-
|
218
|
+
# Both model probers say it's not them. No reason to continue.
|
219
|
+
return ENotMe
|
220
220
|
end
|
221
221
|
|
222
222
|
aBuf = filter_high_bit_only(aBuf)
|
223
223
|
|
224
224
|
for cur in aBuf.split(' ')
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
225
|
+
if cur == ' '
|
226
|
+
# We stand on a space - a word just ended
|
227
|
+
if @_mBeforePrev != ' '
|
228
|
+
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
229
|
+
if is_final(@_mPrev)
|
230
|
+
# case (1) [-2:not space][-1:final letter][cur:space]
|
231
|
+
@_mFinalCharLogicalScore += 1
|
232
|
+
elsif is_non_final(@_mPrev)
|
233
|
+
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
234
|
+
@_mFinalCharVisualScore += 1
|
235
|
+
end
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# Not standing on a space
|
239
|
+
if (@_mBeforePrev == ' ') and (is_final(@_mPrev)) and (cur != ' ')
|
240
|
+
# case (3) [-2:space][-1:final letter][cur:not space]
|
241
|
+
@_mFinalCharVisualScore += 1
|
242
|
+
end
|
243
|
+
end
|
244
|
+
@_mBeforePrev = @_mPrev
|
245
|
+
@_mPrev = cur
|
246
246
|
end
|
247
247
|
|
248
248
|
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
@@ -254,24 +254,24 @@ module CharDet
|
|
254
254
|
# If the final letter score distance is dominant enough, rely on it.
|
255
255
|
finalsub = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
|
256
256
|
if finalsub >= MIN_FINAL_CHAR_DISTANCE
|
257
|
-
|
257
|
+
return LOGICAL_HEBREW_NAME
|
258
258
|
end
|
259
259
|
if finalsub <= -MIN_FINAL_CHAR_DISTANCE
|
260
|
-
|
260
|
+
return VISUAL_HEBREW_NAME
|
261
261
|
end
|
262
262
|
|
263
263
|
# It's not dominant enough, try to rely on the model scores instead.
|
264
264
|
modelsub = @_mLogicalProber.get_confidence() - @_mVisualProber.get_confidence()
|
265
265
|
if modelsub > MIN_MODEL_DISTANCE
|
266
|
-
|
266
|
+
return LOGICAL_HEBREW_NAME
|
267
267
|
end
|
268
268
|
if modelsub < -MIN_MODEL_DISTANCE
|
269
|
-
|
269
|
+
return VISUAL_HEBREW_NAME
|
270
270
|
end
|
271
271
|
|
272
272
|
# Still no good, back to final letter distance, maybe it'll save the day.
|
273
273
|
if finalsub < 0.0
|
274
|
-
|
274
|
+
return VISUAL_HEBREW_NAME
|
275
275
|
end
|
276
276
|
|
277
277
|
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
@@ -281,7 +281,7 @@ module CharDet
|
|
281
281
|
# State of the Hebrew prober, derived from its two model probers.
#
# Returns ENotMe only when both @_mLogicalProber and @_mVisualProber report
# ENotMe; otherwise EDetecting, i.e. the prober remains active as long as
# either model prober has not ruled itself out.
def get_state
  both_rejected = @_mLogicalProber.get_state == ENotMe &&
                  @_mVisualProber.get_state == ENotMe
  both_rejected ? ENotMe : EDetecting
end
|