tmail 1.2.7 → 1.2.7.1
This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
- data/CHANGES +5 -0
- data/lib/tmail/attachments.rb +1 -1
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +37 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +39 -39
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +2 -2
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +30 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +29 -29
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +31 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +13 -13
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +7 -9
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +36 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +38 -38
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +13 -15
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +23 -23
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +79 -78
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +22 -22
- data/lib/tmail/version.rb +2 -1
- data/tmail.gemspec +1 -1
- metadata +1 -1
data/CHANGES
CHANGED
data/lib/tmail/attachments.rb
CHANGED
@@ -6,7 +6,8 @@
|
|
6
6
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
|
-
# Contributor(s)
|
9
|
+
# Contributor(s)
|
10
|
+
|
10
11
|
# Jeff Hodges
|
11
12
|
# Mark Pilgrim - port to Python
|
12
13
|
#
|
@@ -49,19 +50,19 @@ module CharDet
|
|
49
50
|
def feed(aStr, aCharLen)
|
50
51
|
# # """feed a character with known length"""
|
51
52
|
if aCharLen == 2
|
52
|
-
|
53
|
-
|
53
|
+
# we only care about 2-byte characters in our distribution analysis
|
54
|
+
order = get_order(aStr)
|
54
55
|
else
|
55
|
-
|
56
|
+
order = -1
|
56
57
|
end
|
57
58
|
if order >= 0
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
59
|
+
@_mTotalChars += 1
|
60
|
+
# order is valid
|
61
|
+
if order < @_mTableSize
|
62
|
+
if 512 > @_mCharToFreqOrder[order]
|
63
|
+
@_mFreqChars += 1
|
64
|
+
end
|
65
|
+
end
|
65
66
|
end
|
66
67
|
end
|
67
68
|
|
@@ -69,14 +70,14 @@ module CharDet
|
|
69
70
|
# """return confidence based on existing data"""
|
70
71
|
# if we didn't receive any character in our consideration range, return negative answer
|
71
72
|
if @_mTotalChars <= 0
|
72
|
-
|
73
|
+
return SURE_NO
|
73
74
|
end
|
74
75
|
|
75
|
-
if @_mTotalChars != @_mFreqChars
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
76
|
+
if @_mTotalChars != @_mFreqChars
|
77
|
+
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
|
78
|
+
if r < SURE_YES
|
79
|
+
return r
|
80
|
+
end
|
80
81
|
end
|
81
82
|
|
82
83
|
# normalize confidence (we don't want to be 100% sure)
|
@@ -111,9 +112,9 @@ module CharDet
|
|
111
112
|
# second byte range: 0xa1 -- 0xfe
|
112
113
|
# no validation needed here. State machine has done that
|
113
114
|
if aStr[0..0] >= "\xC4"
|
114
|
-
|
115
|
+
return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
|
115
116
|
else
|
116
|
-
|
117
|
+
return -1
|
117
118
|
end
|
118
119
|
end
|
119
120
|
end
|
@@ -132,9 +133,9 @@ module CharDet
|
|
132
133
|
# second byte range: 0xa1 -- 0xfe
|
133
134
|
# no validation needed here. State machine has done that
|
134
135
|
if aStr[0..0] >= "\xB0"
|
135
|
-
|
136
|
+
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
|
136
137
|
else
|
137
|
-
|
138
|
+
return -1
|
138
139
|
end
|
139
140
|
end
|
140
141
|
end
|
@@ -153,9 +154,9 @@ module CharDet
|
|
153
154
|
# second byte range: 0xa1 -- 0xfe
|
154
155
|
# no validation needed here. State machine has done that
|
155
156
|
if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
|
156
|
-
|
157
|
+
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
|
157
158
|
else
|
158
|
-
|
159
|
+
return -1
|
159
160
|
end
|
160
161
|
end
|
161
162
|
end
|
@@ -174,13 +175,13 @@ module CharDet
|
|
174
175
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
175
176
|
# no validation needed here. State machine has done that
|
176
177
|
if aStr[0..0] >= "\xA4"
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
178
|
+
if aStr[1..1] >= "\xA1"
|
179
|
+
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
|
180
|
+
else
|
181
|
+
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
|
182
|
+
end
|
182
183
|
else
|
183
|
-
|
184
|
+
return -1
|
184
185
|
end
|
185
186
|
end
|
186
187
|
end
|
@@ -200,15 +201,15 @@ module CharDet
|
|
200
201
|
# no validation needed here. State machine has done that
|
201
202
|
aStr = aStr[0..1].join if aStr.class == Array
|
202
203
|
if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
|
203
|
-
|
204
|
+
order = 188 * (aStr[0] - 0x81)
|
204
205
|
elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
|
205
|
-
|
206
|
+
order = 188 * (aStr[0] - 0xE0 + 31)
|
206
207
|
else
|
207
|
-
|
208
|
+
return -1
|
208
209
|
end
|
209
210
|
order = order + aStr[1] - 0x40
|
210
211
|
if aStr[1..1] > "\x7F"
|
211
|
-
|
212
|
+
order =- 1
|
212
213
|
end
|
213
214
|
return order
|
214
215
|
end
|
@@ -227,10 +228,10 @@ module CharDet
|
|
227
228
|
# first byte range: 0xa0 -- 0xfe
|
228
229
|
# second byte range: 0xa1 -- 0xfe
|
229
230
|
# no validation needed here. State machine has done that
|
230
|
-
if aStr[0..0] >= "\xA0"
|
231
|
-
|
231
|
+
if aStr[0..0] >= "\xA0"
|
232
|
+
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
|
232
233
|
else
|
233
|
-
|
234
|
+
return -1
|
234
235
|
end
|
235
236
|
end
|
236
237
|
end
|
@@ -6,7 +6,7 @@
|
|
6
6
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
|
-
# Contributor(s)
|
9
|
+
# Contributor(s)
|
10
10
|
# Jeff Hodges - port to Ruby
|
11
11
|
# Mark Pilgrim - port to Python
|
12
12
|
#
|
@@ -40,42 +40,42 @@ module CharDet
|
|
40
40
|
super
|
41
41
|
@_mActiveNum = 0
|
42
42
|
|
43
|
-
for prober in @_mProbers
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
for prober in @_mProbers
|
44
|
+
if prober
|
45
|
+
prober.reset()
|
46
|
+
prober.active = true
|
47
|
+
@_mActiveNum += 1
|
48
|
+
end
|
49
49
|
end
|
50
50
|
@_mBestGuessProber = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
def get_charset_name
|
54
54
|
if not @_mBestGuessProber
|
55
|
-
|
56
|
-
|
57
|
-
|
55
|
+
get_confidence()
|
56
|
+
return nil unless @_mBestGuessProber
|
57
|
+
# self._mBestGuessProber = self._mProbers[0]
|
58
58
|
end
|
59
59
|
return @_mBestGuessProber.get_charset_name()
|
60
60
|
end
|
61
61
|
|
62
62
|
def feed(aBuf)
|
63
63
|
for prober in @_mProbers
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
64
|
+
next unless prober
|
65
|
+
next unless prober.active
|
66
|
+
st = prober.feed(aBuf)
|
67
|
+
next unless st
|
68
|
+
if st == EFoundIt
|
69
|
+
@_mBestGuessProber = prober
|
70
|
+
return get_state()
|
71
|
+
elsif st == ENotMe
|
72
|
+
prober.active = false
|
73
|
+
@_mActiveNum -= 1
|
74
|
+
if @_mActiveNum <= 0
|
75
|
+
@_mState = ENotMe
|
76
|
+
return get_state()
|
77
|
+
end
|
78
|
+
end
|
79
79
|
end
|
80
80
|
return get_state()
|
81
81
|
end
|
@@ -83,28 +83,28 @@ module CharDet
|
|
83
83
|
def get_confidence()
|
84
84
|
st = get_state()
|
85
85
|
if st == EFoundIt
|
86
|
-
|
86
|
+
return 0.99
|
87
87
|
elsif st == ENotMe
|
88
|
-
|
88
|
+
return 0.01
|
89
89
|
end
|
90
90
|
bestConf = 0.0
|
91
91
|
@_mBestGuessProber = nil
|
92
92
|
for prober in @_mProbers
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
93
|
+
next unless prober
|
94
|
+
unless prober.active
|
95
|
+
$stderr << "#{prober.get_charset_name()} not active\n" if $debug
|
96
|
+
next
|
97
|
+
end
|
98
|
+
cf = prober.get_confidence()
|
99
|
+
$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
|
100
|
+
if bestConf < cf
|
101
|
+
bestConf = cf
|
102
|
+
@_mBestGuessProber = prober
|
103
|
+
end
|
104
104
|
end
|
105
105
|
return 0.0 unless @_mBestGuessProber
|
106
106
|
return bestConf
|
107
|
-
# else
|
107
|
+
# else
|
108
108
|
# self._mBestGuessProber = self._mProbers[0]
|
109
109
|
# return self._mBestGuessProber.get_confidence()
|
110
110
|
end
|
@@ -44,8 +44,8 @@ module CharDet
|
|
44
44
|
# if it is first byte, we also get byte length
|
45
45
|
byteCls = @_mModel['classTable'][c[0]]
|
46
46
|
if @_mCurrentState == EStart
|
47
|
-
|
48
|
-
|
47
|
+
@_mCurrentBytePos = 0
|
48
|
+
@_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
|
49
49
|
end
|
50
50
|
# from byte's class and stateTable, we get its next state
|
51
51
|
@_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
|
@@ -30,21 +30,19 @@ module CharDet
|
|
30
30
|
class EscCharSetProber < CharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@_mCodingSM = [
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
CodingStateMachine.new(ISO2022KRSMModel)
|
38
|
-
]
|
33
|
+
@_mCodingSM = [ CodingStateMachine.new(HZSMModel),
|
34
|
+
CodingStateMachine.new(ISO2022CNSMModel),
|
35
|
+
CodingStateMachine.new(ISO2022JPSMModel),
|
36
|
+
CodingStateMachine.new(ISO2022KRSMModel) ]
|
39
37
|
reset()
|
40
38
|
end
|
41
39
|
|
42
40
|
def reset
|
43
41
|
super()
|
44
|
-
for codingSM in @_mCodingSM
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
for codingSM in @_mCodingSM
|
43
|
+
next if not codingSM
|
44
|
+
codingSM.active = true
|
45
|
+
codingSM.reset()
|
48
46
|
end
|
49
47
|
@_mActiveSM = @_mCodingSM.length
|
50
48
|
@_mDetectedCharset = nil
|
@@ -56,35 +54,36 @@ module CharDet
|
|
56
54
|
|
57
55
|
def get_confidence
|
58
56
|
if @_mDetectedCharset
|
59
|
-
|
57
|
+
return 0.99
|
60
58
|
else
|
61
|
-
|
59
|
+
return 0.00
|
62
60
|
end
|
63
61
|
end
|
64
62
|
|
65
63
|
def feed(aBuf)
|
66
64
|
aBuf.each_byte do |b|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
65
|
+
c = b.chr
|
66
|
+
for codingSM in @_mCodingSM
|
67
|
+
next unless codingSM
|
68
|
+
next unless codingSM.active
|
69
|
+
codingState = codingSM.next_state(c)
|
70
|
+
if codingState == EError
|
71
|
+
codingSM.active = false
|
72
|
+
@_mActiveSM -= 1
|
73
|
+
if @_mActiveSM <= 0
|
74
|
+
@_mState = ENotMe
|
75
|
+
return get_state()
|
76
|
+
end
|
77
|
+
elsif codingState == EItsMe
|
78
|
+
@_mState = EFoundIt
|
79
|
+
@_mDetectedCharset = codingSM.get_coding_state_machine()
|
80
|
+
return get_state()
|
81
|
+
end
|
82
|
+
end
|
85
83
|
end
|
86
|
-
|
87
84
|
return get_state()
|
85
|
+
|
88
86
|
end
|
87
|
+
|
89
88
|
end
|
90
89
|
end
|
@@ -48,33 +48,33 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @_mCodingSM.next_state(aBuf[i..i])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@_mState = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@_mState = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @_mCodingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@_mLastChar[1] = aBuf[0..0]
|
63
|
+
@_mContextAnalyzer.feed(@_mLastChar, charLen)
|
64
|
+
@_mDistributionAnalyzer.feed(@_mLastChar, charLen)
|
65
|
+
else
|
66
|
+
@_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
67
|
+
@_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
72
|
@_mLastChar[0] = aBuf[aLen-1..aLen-1]
|
73
73
|
|
74
74
|
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@_mState = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
@@ -215,34 +215,34 @@ module CharDet
|
|
215
215
|
# so the word boundary detection works properly. [MAP]
|
216
216
|
|
217
217
|
if get_state() == ENotMe
|
218
|
-
|
219
|
-
|
218
|
+
# Both model probers say it's not them. No reason to continue.
|
219
|
+
return ENotMe
|
220
220
|
end
|
221
221
|
|
222
222
|
aBuf = filter_high_bit_only(aBuf)
|
223
223
|
|
224
224
|
for cur in aBuf.split(' ')
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
225
|
+
if cur == ' '
|
226
|
+
# We stand on a space - a word just ended
|
227
|
+
if @_mBeforePrev != ' '
|
228
|
+
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
229
|
+
if is_final(@_mPrev)
|
230
|
+
# case (1) [-2:not space][-1:final letter][cur:space]
|
231
|
+
@_mFinalCharLogicalScore += 1
|
232
|
+
elsif is_non_final(@_mPrev)
|
233
|
+
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
234
|
+
@_mFinalCharVisualScore += 1
|
235
|
+
end
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# Not standing on a space
|
239
|
+
if (@_mBeforePrev == ' ') and (is_final(@_mPrev)) and (cur != ' ')
|
240
|
+
# case (3) [-2:space][-1:final letter][cur:not space]
|
241
|
+
@_mFinalCharVisualScore += 1
|
242
|
+
end
|
243
|
+
end
|
244
|
+
@_mBeforePrev = @_mPrev
|
245
|
+
@_mPrev = cur
|
246
246
|
end
|
247
247
|
|
248
248
|
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
@@ -254,24 +254,24 @@ module CharDet
|
|
254
254
|
# If the final letter score distance is dominant enough, rely on it.
|
255
255
|
finalsub = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
|
256
256
|
if finalsub >= MIN_FINAL_CHAR_DISTANCE
|
257
|
-
|
257
|
+
return LOGICAL_HEBREW_NAME
|
258
258
|
end
|
259
259
|
if finalsub <= -MIN_FINAL_CHAR_DISTANCE
|
260
|
-
|
260
|
+
return VISUAL_HEBREW_NAME
|
261
261
|
end
|
262
262
|
|
263
263
|
# It's not dominant enough, try to rely on the model scores instead.
|
264
264
|
modelsub = @_mLogicalProber.get_confidence() - @_mVisualProber.get_confidence()
|
265
265
|
if modelsub > MIN_MODEL_DISTANCE
|
266
|
-
|
266
|
+
return LOGICAL_HEBREW_NAME
|
267
267
|
end
|
268
268
|
if modelsub < -MIN_MODEL_DISTANCE
|
269
|
-
|
269
|
+
return VISUAL_HEBREW_NAME
|
270
270
|
end
|
271
271
|
|
272
272
|
# Still no good, back to final letter distance, maybe it'll save the day.
|
273
273
|
if finalsub < 0.0
|
274
|
-
|
274
|
+
return VISUAL_HEBREW_NAME
|
275
275
|
end
|
276
276
|
|
277
277
|
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
@@ -281,7 +281,7 @@ module CharDet
|
|
281
281
|
def get_state
|
282
282
|
# Remain active as long as any of the model probers are active.
|
283
283
|
if (@_mLogicalProber.get_state() == ENotMe) and (@_mVisualProber.get_state() == ENotMe)
|
284
|
-
|
284
|
+
return ENotMe
|
285
285
|
end
|
286
286
|
return EDetecting
|
287
287
|
end
|