tmail 1.2.7 → 1.2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +5 -0
- data/lib/tmail/attachments.rb +1 -1
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +37 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +39 -39
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +2 -2
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +30 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +29 -29
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +31 -31
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +13 -13
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +22 -22
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +7 -9
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +36 -36
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +38 -38
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +13 -15
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +23 -23
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +79 -78
- data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +22 -22
- data/lib/tmail/version.rb +2 -1
- data/tmail.gemspec +1 -1
- metadata +1 -1
@@ -31,21 +31,19 @@ module CharDet
|
|
31
31
|
class SBCSGroupProber < CharSetGroupProber
|
32
32
|
def initialize
|
33
33
|
super
|
34
|
-
@_mProbers = [
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
SingleByteCharSetProber.new(TIS620ThaiModel),
|
48
|
-
]
|
34
|
+
@_mProbers = [ SingleByteCharSetProber.new(Win1251CyrillicModel),
|
35
|
+
SingleByteCharSetProber.new(Koi8rModel),
|
36
|
+
SingleByteCharSetProber.new(Latin5CyrillicModel),
|
37
|
+
SingleByteCharSetProber.new(MacCyrillicModel),
|
38
|
+
SingleByteCharSetProber.new(Ibm866Model),
|
39
|
+
SingleByteCharSetProber.new(Ibm855Model),
|
40
|
+
SingleByteCharSetProber.new(Latin7GreekModel),
|
41
|
+
SingleByteCharSetProber.new(Win1253GreekModel),
|
42
|
+
SingleByteCharSetProber.new(Latin5BulgarianModel),
|
43
|
+
SingleByteCharSetProber.new(Win1251BulgarianModel),
|
44
|
+
SingleByteCharSetProber.new(Latin2HungarianModel),
|
45
|
+
SingleByteCharSetProber.new(Win1250HungarianModel),
|
46
|
+
SingleByteCharSetProber.new(TIS620ThaiModel) ]
|
49
47
|
hebrewProber = HebrewProber.new()
|
50
48
|
logicalHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrewProber)
|
51
49
|
visualHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrewProber)
|
@@ -48,33 +48,33 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @_mCodingSM.next_state(aBuf[i..i])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@_mState = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@_mState = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @_mCodingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@_mLastChar[1] = aBuf[0..0]
|
63
|
+
@_mContextAnalyzer.feed(@_mLastChar[2 - charLen..-1], charLen)
|
64
|
+
@_mDistributionAnalyzer.feed(@_mLastChar, charLen)
|
65
|
+
else
|
66
|
+
@_mContextAnalyzer.feed(aBuf[i + 1 - charLen ... i + 3 - charLen], charLen)
|
67
|
+
@_mDistributionAnalyzer.feed(aBuf[i - 1 ... i + 1], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
72
|
@_mLastChar[0] = aBuf[aLen - 1.. aLen-1]
|
73
73
|
|
74
|
-
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
74
|
+
if get_state() == EDetecting
|
75
|
+
if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@_mState = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
@@ -51,10 +51,10 @@ module CharDet
|
|
51
51
|
@_mInputState = EPureAscii
|
52
52
|
@_mLastChar = ''
|
53
53
|
if @_mEscCharSetProber
|
54
|
-
|
54
|
+
@_mEscCharSetProber.reset()
|
55
55
|
end
|
56
56
|
for prober in @_mCharSetProbers
|
57
|
-
|
57
|
+
prober.reset()
|
58
58
|
end
|
59
59
|
end
|
60
60
|
|
@@ -65,101 +65,102 @@ module CharDet
|
|
65
65
|
return if not aLen
|
66
66
|
|
67
67
|
if not @_mGotData
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
68
|
+
# If the data starts with BOM, we know it is UTF
|
69
|
+
if aBuf[0...3] == "\xEF\xBB\xBF"
|
70
|
+
# EF BB BF UTF-8 with BOM
|
71
|
+
@result = {'encoding' => "UTF-8", 'confidence' => 1.0}
|
72
|
+
elsif aBuf[0...4] == "\xFF\xFE\x00\x00"
|
73
|
+
# FF FE 00 00 UTF-32, little-endian BOM
|
74
|
+
@result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
|
75
|
+
elsif aBuf[0...4] == "\x00\x00\xFE\xFF"
|
76
|
+
# 00 00 FE FF UTF-32, big-endian BOM
|
77
|
+
@result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
|
78
|
+
elsif aBuf[0...4] == "\xFE\xFF\x00\x00"
|
79
|
+
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
80
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
|
81
|
+
elsif aBuf[0...4] == "\x00\x00\xFF\xFE"
|
82
|
+
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
83
|
+
@result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0}
|
84
|
+
elsif aBuf[0...2] == "\xFF\xFE"
|
85
|
+
# FF FE UTF-16, little endian BOM
|
86
|
+
@result = {'encoding' => "UTF-16LE", 'confidence' => 1.0}
|
87
|
+
elsif aBuf[0...2] == "\xFE\xFF"
|
88
|
+
# FE FF UTF-16, big endian BOM
|
89
|
+
@result = {'encoding' => "UTF-16BE", 'confidence' => 1.0}
|
90
|
+
end
|
91
91
|
end
|
92
|
-
|
92
|
+
|
93
93
|
@_mGotData = true
|
94
|
-
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
|
-
|
96
|
-
|
94
|
+
if @result['encoding'] and (@result['confidence'] > 0.0)
|
95
|
+
@done = true
|
96
|
+
return
|
97
97
|
end
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
98
|
+
|
99
|
+
if @_mInputState == EPureAscii
|
100
|
+
if @_highBitDetector =~ (aBuf)
|
101
|
+
@_mInputState = EHighbyte
|
102
|
+
elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
|
103
|
+
@_mInputState = EEscAscii
|
104
|
+
end
|
104
105
|
end
|
105
|
-
|
106
|
+
|
106
107
|
@_mLastChar = aBuf[-1..-1]
|
107
108
|
if @_mInputState == EEscAscii
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
109
|
+
if not @_mEscCharSetProber
|
110
|
+
@_mEscCharSetProber = EscCharSetProber.new()
|
111
|
+
end
|
112
|
+
if @_mEscCharSetProber.feed(aBuf) == EFoundIt
|
113
|
+
@result = {'encoding' => self._mEscCharSetProber.get_charset_name(),
|
114
|
+
'confidence' => @_mEscCharSetProber.get_confidence()
|
115
|
+
}
|
116
|
+
@done = true
|
117
|
+
end
|
117
118
|
elsif @_mInputState == EHighbyte
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
119
|
+
if not @_mCharSetProbers or @_mCharSetProbers.empty?
|
120
|
+
@_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
|
121
|
+
end
|
122
|
+
for prober in @_mCharSetProbers
|
123
|
+
if prober.feed(aBuf) == EFoundIt
|
124
|
+
@result = {'encoding' => prober.get_charset_name(),
|
125
|
+
'confidence' => prober.get_confidence()}
|
126
|
+
@done = true
|
127
|
+
break
|
128
|
+
end
|
129
|
+
end
|
129
130
|
end
|
130
|
-
|
131
|
+
|
131
132
|
end
|
132
|
-
|
133
|
+
|
133
134
|
def close
|
134
135
|
return if @done
|
135
136
|
if not @_mGotData
|
136
|
-
|
137
|
-
|
137
|
+
$stderr << "no data received!\n" if $debug
|
138
|
+
return
|
138
139
|
end
|
139
140
|
@done = true
|
140
|
-
|
141
|
-
if @_mInputState == EPureAscii
|
142
|
-
|
143
|
-
|
141
|
+
|
142
|
+
if @_mInputState == EPureAscii
|
143
|
+
@result = {'encoding' => 'ascii', 'confidence' => 1.0}
|
144
|
+
return @result
|
144
145
|
end
|
145
|
-
|
146
|
-
if @_mInputState == EHighbyte
|
147
|
-
|
146
|
+
|
147
|
+
if @_mInputState == EHighbyte
|
148
|
+
confidences = {}
|
148
149
|
@_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
150
|
+
maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
|
151
|
+
if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
|
152
|
+
@result = {'encoding' => maxProber.get_charset_name(),
|
153
|
+
'confidence' => maxProber.get_confidence()}
|
154
|
+
return @result
|
155
|
+
end
|
155
156
|
end
|
156
157
|
|
157
158
|
if $debug
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
159
|
+
$stderr << "no probers hit minimum threshhold\n" if $debug
|
160
|
+
for prober in @_mCharSetProbers[0]._mProbers
|
161
|
+
next if not prober
|
162
|
+
$stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
|
163
|
+
end
|
163
164
|
end
|
164
165
|
end
|
165
166
|
end
|
@@ -48,25 +48,25 @@ module CharDet
|
|
48
48
|
|
49
49
|
def feed(aBuf)
|
50
50
|
aBuf.each_byte do |b|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
51
|
+
c = b.chr
|
52
|
+
codingState = @_mCodingSM.next_state(c)
|
53
|
+
if codingState == EError
|
54
|
+
@_mState = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@_mState = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
if @_mCodingSM.get_current_charlen() >= 2
|
61
|
+
@_mNumOfMBChar += 1
|
62
|
+
end
|
63
|
+
end
|
64
64
|
end
|
65
65
|
|
66
|
-
if get_state() == EDetecting
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
if get_state() == EDetecting
|
67
|
+
if get_confidence() > SHORTCUT_THRESHOLD
|
68
|
+
@_mState = EFoundIt
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
72
|
return get_state()
|
@@ -75,12 +75,12 @@ module CharDet
|
|
75
75
|
def get_confidence
|
76
76
|
unlike = 0.99
|
77
77
|
if @_mNumOfMBChar < 6
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
78
|
+
for i in (0...@_mNumOfMBChar)
|
79
|
+
unlike = unlike * ONE_CHAR_PROB
|
80
|
+
end
|
81
|
+
return 1.0 - unlike
|
82
82
|
else
|
83
|
-
|
83
|
+
return unlike
|
84
84
|
end
|
85
85
|
end
|
86
86
|
end
|
data/lib/tmail/version.rb
CHANGED
data/tmail.gemspec
CHANGED