tmail 1.2.7 → 1.2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +5 -0
- data/lib/tmail/attachments.rb +1 -1
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +37 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +39 -39
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +2 -2
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +30 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +29 -29
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +31 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +13 -13
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +7 -9
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +36 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +38 -38
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +13 -15
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +23 -23
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +79 -78
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +22 -22
- data/lib/tmail/version.rb +2 -1
- data/tmail.gemspec +1 -1
- metadata +1 -1
@@ -144,22 +144,22 @@ module CharDet
|
|
144
144
|
# this character will simply our logic and improve performance.
|
145
145
|
i = @_mNeedToSkipCharNum
|
146
146
|
while i < aLen
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
147
|
+
order, charLen = get_order(aBuf[i...i+2])
|
148
|
+
i += charLen
|
149
|
+
if i > aLen
|
150
|
+
@_mNeedToSkipCharNum = i - aLen
|
151
|
+
@_mLastCharOrder = -1
|
152
|
+
else
|
153
|
+
if (order != -1) and (@_mLastCharOrder != -1)
|
154
|
+
@_mTotalRel += 1
|
155
|
+
if @_mTotalRel > MAX_REL_THRESHOLD
|
156
|
+
@_mDone = true
|
157
|
+
break
|
158
|
+
end
|
159
|
+
@_mRelSample[jp2CharContext[@_mLastCharOrder][order]] += 1
|
160
|
+
end
|
161
|
+
@_mLastCharOrder = order
|
162
|
+
end
|
163
163
|
end
|
164
164
|
end
|
165
165
|
|
@@ -169,10 +169,10 @@ module CharDet
|
|
169
169
|
|
170
170
|
def get_confidence
|
171
171
|
# This is just one way to calculate confidence. It works well for me.
|
172
|
-
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
|
-
|
172
|
+
if @_mTotalRel > MINIMUM_DATA_THRESHOLD
|
173
|
+
return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
|
174
174
|
else
|
175
|
-
|
175
|
+
return DONT_KNOW
|
176
176
|
end
|
177
177
|
end
|
178
178
|
|
@@ -188,15 +188,15 @@ module CharDet
|
|
188
188
|
# find out current char's byte length
|
189
189
|
aStr = aStr[0..1].join if aStr.class == Array
|
190
190
|
if ((aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")) or ((aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xFC"))
|
191
|
-
|
191
|
+
charLen = 2
|
192
192
|
else
|
193
|
-
|
193
|
+
charLen = 1
|
194
194
|
end
|
195
195
|
# return its order if it is hiragana
|
196
196
|
if aStr.length > 1
|
197
|
-
|
198
|
-
|
199
|
-
|
197
|
+
if (aStr[0..0] == "\202") and (aStr[1..1] >= "\x9F") and (aStr[1..1] <= "\xF1")
|
198
|
+
return aStr[1] - 0x9F, charLen
|
199
|
+
end
|
200
200
|
end
|
201
201
|
|
202
202
|
return -1, charLen
|
@@ -208,19 +208,19 @@ module CharDet
|
|
208
208
|
return -1, 1 unless aStr
|
209
209
|
# find out current char's byte length
|
210
210
|
aStr = aStr[0..1].join if aStr.class == Array
|
211
|
-
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
|
-
|
211
|
+
if (aStr[0..0] == "\x8E") or ((aStr[0..0] >= "\xA1") and (aStr[0..0] <= "\xFE"))
|
212
|
+
charLen = 2
|
213
213
|
elsif aStr[0..0] == "\x8F"
|
214
|
-
|
214
|
+
charLen = 3
|
215
215
|
else
|
216
|
-
|
216
|
+
charLen = 1
|
217
217
|
end
|
218
218
|
|
219
219
|
# return its order if it is hiragana
|
220
220
|
if aStr.length > 1
|
221
|
-
|
222
|
-
|
223
|
-
|
221
|
+
if (aStr[0..0] == "\xA4") and (aStr[1..1] >= "\xA1") and (aStr[1..1] <= "\xF3")
|
222
|
+
return aStr[1] - 0xA1, charLen
|
223
|
+
end
|
224
224
|
end
|
225
225
|
|
226
226
|
return -1, charLen
|
@@ -110,15 +110,15 @@ module CharDet
|
|
110
110
|
def feed(aBuf)
|
111
111
|
aBuf = filter_with_english_letters(aBuf)
|
112
112
|
aBuf.each_byte do |b|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
113
|
+
c = b.chr
|
114
|
+
charClass = Latin1_CharToClass[c[0]]
|
115
|
+
freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
|
116
|
+
if freq == 0
|
117
|
+
@_mState = ENotMe
|
118
|
+
break
|
119
|
+
end
|
120
|
+
@_mFreqCounter[freq] += 1
|
121
|
+
@_mLastCharClass = charClass
|
122
122
|
end
|
123
123
|
|
124
124
|
return get_state()
|
@@ -126,17 +126,17 @@ module CharDet
|
|
126
126
|
|
127
127
|
def get_confidence
|
128
128
|
if get_state() == ENotMe
|
129
|
-
|
129
|
+
return 0.01
|
130
130
|
end
|
131
131
|
|
132
132
|
total = @_mFreqCounter.inject{|a,b| a+b}
|
133
133
|
if total < 0.01
|
134
|
-
|
134
|
+
confidence = 0.0
|
135
135
|
else
|
136
|
-
|
136
|
+
confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
|
137
137
|
end
|
138
138
|
if confidence < 0.0
|
139
|
-
|
139
|
+
confidence = 0.0
|
140
140
|
end
|
141
141
|
# lower the confidence of latin1 so that other more accurate detector
|
142
142
|
# can take priority.
|
@@ -40,10 +40,10 @@ module CharDet
|
|
40
40
|
def reset
|
41
41
|
super
|
42
42
|
if @_mCodingSM
|
43
|
-
|
43
|
+
@_mCodingSM.reset()
|
44
44
|
end
|
45
45
|
if @_mDistributionAnalyzer
|
46
|
-
|
46
|
+
@_mDistributionAnalyzer.reset()
|
47
47
|
end
|
48
48
|
@_mLastChar = "\x00\x00"
|
49
49
|
end
|
@@ -54,30 +54,30 @@ module CharDet
|
|
54
54
|
def feed(aBuf)
|
55
55
|
aLen = aBuf.length
|
56
56
|
for i in (0...aLen)
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
57
|
+
codingState = @_mCodingSM.next_state(aBuf[i..i])
|
58
|
+
if codingState == EError
|
59
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
60
|
+
@_mState = ENotMe
|
61
|
+
break
|
62
|
+
elsif codingState == EItsMe
|
63
|
+
@_mState = EFoundIt
|
64
|
+
break
|
65
|
+
elsif codingState == EStart
|
66
|
+
charLen = @_mCodingSM.get_current_charlen()
|
67
|
+
if i == 0
|
68
|
+
@_mLastChar[1] = aBuf[0..0]
|
69
|
+
@_mDistributionAnalyzer.feed(@_mLastChar, charLen)
|
70
|
+
else
|
71
|
+
@_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
|
72
|
+
end
|
73
|
+
end
|
74
74
|
end
|
75
75
|
@_mLastChar[0] = aBuf[aLen-1..aLen-1]
|
76
76
|
|
77
77
|
if get_state() == EDetecting
|
78
|
-
|
79
|
-
|
80
|
-
|
78
|
+
if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
79
|
+
@_mState = EFoundIt
|
80
|
+
end
|
81
81
|
end
|
82
82
|
return get_state()
|
83
83
|
end
|
@@ -32,15 +32,13 @@ module CharDet
|
|
32
32
|
class MBCSGroupProber < CharSetGroupProber
|
33
33
|
def initialize
|
34
34
|
super
|
35
|
-
@_mProbers = [
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
EUCTWProber.new
|
43
|
-
]
|
35
|
+
@_mProbers = [ UTF8Prober.new,
|
36
|
+
SJISProber.new,
|
37
|
+
EUCJPProber.new,
|
38
|
+
GB2312Prober.new,
|
39
|
+
EUCKRProber.new,
|
40
|
+
Big5Prober.new,
|
41
|
+
EUCTWProber.new ]
|
44
42
|
reset()
|
45
43
|
end
|
46
44
|
end
|
@@ -73,10 +73,10 @@ module CharDet
|
|
73
73
|
Big5CharLenTable = [0, 1, 1, 2, 0]
|
74
74
|
|
75
75
|
Big5SMModel = {'classTable' => BIG5_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
76
|
+
'classFactor' => 5,
|
77
|
+
'stateTable' => BIG5_st,
|
78
|
+
'charLenTable' => Big5CharLenTable,
|
79
|
+
'name' => 'Big5'
|
80
80
|
}
|
81
81
|
|
82
82
|
# EUC-JP
|
@@ -127,10 +127,10 @@ module CharDet
|
|
127
127
|
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
|
128
128
|
|
129
129
|
EUCJPSMModel = {'classTable' => EUCJP_cls,
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
130
|
+
'classFactor' => 6,
|
131
|
+
'stateTable' => EUCJP_st,
|
132
|
+
'charLenTable' => EUCJPCharLenTable,
|
133
|
+
'name' => 'EUC-JP'
|
134
134
|
}
|
135
135
|
|
136
136
|
# EUC-KR
|
@@ -178,10 +178,10 @@ module CharDet
|
|
178
178
|
EUCKRCharLenTable = [0, 1, 2, 0]
|
179
179
|
|
180
180
|
EUCKRSMModel = {'classTable' => EUCKR_cls,
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
181
|
+
'classFactor' => 4,
|
182
|
+
'stateTable' => EUCKR_st,
|
183
|
+
'charLenTable' => EUCKRCharLenTable,
|
184
|
+
'name' => 'EUC-KR'
|
185
185
|
}
|
186
186
|
|
187
187
|
# EUC-TW
|
@@ -233,10 +233,10 @@ module CharDet
|
|
233
233
|
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
|
234
234
|
|
235
235
|
EUCTWSMModel = {'classTable' => EUCTW_cls,
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
236
|
+
'classFactor' => 7,
|
237
|
+
'stateTable' => EUCTW_st,
|
238
|
+
'charLenTable' => EUCTWCharLenTable,
|
239
|
+
'name' => 'x-euc-tw'
|
240
240
|
}
|
241
241
|
|
242
242
|
# GB2312
|
@@ -293,10 +293,10 @@ module CharDet
|
|
293
293
|
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
|
294
294
|
|
295
295
|
GB2312SMModel = {'classTable' => GB2312_cls,
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
296
|
+
'classFactor' => 7,
|
297
|
+
'stateTable' => GB2312_st,
|
298
|
+
'charLenTable' => GB2312CharLenTable,
|
299
|
+
'name' => 'GB2312'
|
300
300
|
}
|
301
301
|
|
302
302
|
# Shift_JIS
|
@@ -347,10 +347,10 @@ module CharDet
|
|
347
347
|
SJISCharLenTable = [0, 1, 1, 2, 0, 0]
|
348
348
|
|
349
349
|
SJISSMModel = {'classTable' => SJIS_cls,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
350
|
+
'classFactor' => 6,
|
351
|
+
'stateTable' => SJIS_st,
|
352
|
+
'charLenTable' => SJISCharLenTable,
|
353
|
+
'name' => 'Shift_JIS'
|
354
354
|
}
|
355
355
|
|
356
356
|
# UCS2-BE
|
@@ -403,10 +403,10 @@ module CharDet
|
|
403
403
|
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
|
404
404
|
|
405
405
|
UCS2BESMModel = {'classTable' => UCS2BE_cls,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
406
|
+
'classFactor' => 6,
|
407
|
+
'stateTable' => UCS2BE_st,
|
408
|
+
'charLenTable' => UCS2BECharLenTable,
|
409
|
+
'name' => 'UTF-16BE'
|
410
410
|
}
|
411
411
|
|
412
412
|
# UCS2-LE
|
@@ -459,10 +459,10 @@ module CharDet
|
|
459
459
|
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
|
460
460
|
|
461
461
|
UCS2LESMModel = {'classTable' => UCS2LE_cls,
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
462
|
+
'classFactor' => 6,
|
463
|
+
'stateTable' => UCS2LE_st,
|
464
|
+
'charLenTable' => UCS2LECharLenTable,
|
465
|
+
'name' => 'UTF-16LE'
|
466
466
|
}
|
467
467
|
|
468
468
|
# UTF-8
|
@@ -534,9 +534,9 @@ module CharDet
|
|
534
534
|
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
|
535
535
|
|
536
536
|
UTF8SMModel = {'classTable' => UTF8_cls,
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
537
|
+
'classFactor' => 16,
|
538
|
+
'stateTable' => UTF8_st,
|
539
|
+
'charLenTable' => UTF8CharLenTable,
|
540
|
+
'name' => 'UTF-8'
|
541
541
|
}
|
542
542
|
end
|
@@ -57,51 +57,51 @@ module CharDet
|
|
57
57
|
|
58
58
|
def get_charset_name
|
59
59
|
if @_mNameProber
|
60
|
-
|
60
|
+
return @_mNameProber.get_charset_name()
|
61
61
|
else
|
62
|
-
|
62
|
+
return @_mModel['charsetName']
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
66
66
|
def feed(aBuf)
|
67
67
|
if not @_mModel['keepEnglishLetter']
|
68
|
-
|
68
|
+
aBuf = filter_without_english_letters(aBuf)
|
69
69
|
end
|
70
70
|
aLen = aBuf.length
|
71
71
|
if not aLen
|
72
|
-
|
72
|
+
return get_state()
|
73
73
|
end
|
74
74
|
aBuf.each_byte do |b|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
75
|
+
c = b.chr
|
76
|
+
order = @_mModel['charToOrderMap'][c[0]]
|
77
|
+
if order < SYMBOL_CAT_ORDER
|
78
|
+
@_mTotalChar += 1
|
79
|
+
end
|
80
|
+
if order < SAMPLE_SIZE
|
81
|
+
@_mFreqChar += 1
|
82
|
+
if @_mLastOrder < SAMPLE_SIZE
|
83
|
+
@_mTotalSeqs += 1
|
84
|
+
if not @_mReversed
|
85
|
+
@_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
|
86
|
+
else # reverse the order of the letters in the lookup
|
87
|
+
@_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
@_mLastOrder = order
|
92
92
|
end
|
93
93
|
|
94
94
|
if get_state() == EDetecting
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
95
|
+
if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
|
96
|
+
cf = get_confidence()
|
97
|
+
if cf > POSITIVE_SHORTCUT_THRESHOLD
|
98
|
+
$stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
|
99
|
+
@_mState = EFoundIt
|
100
|
+
elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
|
101
|
+
$stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
|
102
|
+
@_mState = ENotMe
|
103
|
+
end
|
104
|
+
end
|
105
105
|
end
|
106
106
|
|
107
107
|
return get_state()
|
@@ -110,13 +110,13 @@ module CharDet
|
|
110
110
|
def get_confidence
|
111
111
|
r = 0.01
|
112
112
|
if @_mTotalSeqs > 0
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
113
|
+
# print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
|
114
|
+
r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
|
115
|
+
# print r, self._mFreqChar, self._mTotalChar
|
116
|
+
r = r * @_mFreqChar / @_mTotalChar
|
117
|
+
if r >= 1.0
|
118
|
+
r = 0.99
|
119
|
+
end
|
120
120
|
end
|
121
121
|
return r
|
122
122
|
end
|