mail 2.2.20 → 2.3.0

Sign up to get free protection for your applications and to gain access to all of the features.

Potentially problematic release.


This version of mail might be problematic. Click here for more details.

# encoding: utf-8

module Mail #:nodoc:
  module Multibyte #:nodoc:
    # Raised when a string's bytes do not form a valid sequence in the
    # expected encoding (e.g. malformed UTF-8 detected while unpacking).
    class EncodingError < StandardError
    end
  end
end
module Mail
  module Multibyte
    # Unicode-aware helpers for UTF-8 strings: codepoint and grapheme-cluster
    # unpacking, canonical/compatibility normalization, Hangul (de)composition
    # and clean-up of stray ISO-8859-1/CP1252 bytes. Character data is loaded
    # lazily from unicode_tables.dat via UnicodeDatabase.
    module Unicode

      extend self

      # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
      # information about normalization.
      NORMALIZATION_FORMS = [:c, :kc, :d, :kd]

      # The Unicode version that is supported by the implementation
      UNICODE_VERSION = '5.2.0'

      # The default normalization used for operations that require normalization. It can be set to any of the
      # normalizations in NORMALIZATION_FORMS.
      #
      # Example:
      #   Mail::Multibyte::Unicode.default_normalization_form = :c
      attr_accessor :default_normalization_form
      @default_normalization_form = :kc

      # Hangul character boundaries and properties (Unicode Standard, section 3.12)
      HANGUL_SBASE = 0xAC00
      HANGUL_LBASE = 0x1100
      HANGUL_VBASE = 0x1161
      HANGUL_TBASE = 0x11A7
      HANGUL_LCOUNT = 19
      HANGUL_VCOUNT = 21
      HANGUL_TCOUNT = 28
      HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
      HANGUL_SCOUNT = 11172
      HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
      HANGUL_JAMO_FIRST = 0x1100
      HANGUL_JAMO_LAST = 0x11FF

      # All the unicode whitespace
      WHITESPACE = [
        (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
        0x0020,                # White_Space # Zs       SPACE
        0x0085,                # White_Space # Cc       <control-0085>
        0x00A0,                # White_Space # Zs       NO-BREAK SPACE
        0x1680,                # White_Space # Zs       OGHAM SPACE MARK
        0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
        (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
        0x2028,                # White_Space # Zl       LINE SEPARATOR
        0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
        0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
        0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
        0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
      ].flatten.freeze

      # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
      # between little and big endian. This is not an issue in utf-8, so it must be ignored.
      LEADERS_AND_TRAILERS = WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM

      # Returns a regular expression pattern that matches the passed Unicode codepoints
      def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
        array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
      end
      TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u
      LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u

      # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
      # valid UTF-8.
      #
      # Example:
      #   Unicode.u_unpack('Café') # => [67, 97, 102, 233]
      def u_unpack(string)
        begin
          string.unpack 'U*'
        rescue ArgumentError
          # unpack('U*') raises ArgumentError on malformed UTF-8; surface it
          # as the library's own EncodingError so callers have one thing to rescue.
          raise EncodingError, 'malformed UTF-8 character'
        end
      end

      # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
      # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
      # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
      #
      # Primarily used by the grapheme cluster support.
      def in_char_class?(codepoint, classes)
        classes.detect { |c| database.boundary[c] === codepoint } ? true : false
      end

      # Unpack the string at grapheme boundaries. Returns a list of character lists.
      #
      # Example:
      #   Unicode.g_unpack('क्षि') # => [[2325, 2381], [2359], [2367]]
      #   Unicode.g_unpack('Café') # => [[67], [97], [102], [233]]
      def g_unpack(string)
        codepoints = u_unpack(string)
        unpacked = []
        pos = 0
        marker = 0
        eoc = codepoints.length
        while(pos < eoc)
          pos += 1
          previous = codepoints[pos-1]
          current = codepoints[pos]
          if (
            # CR X LF
            ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or
            # L X (L|V|LV|LVT)
            ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
            # (LV|V) X (V|T)
            ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
            # (LVT|T) X (T)
            ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or
            # X Extend
            (database.boundary[:extend] === current)
          )
            # No grapheme boundary between previous and current: keep scanning.
          else
            unpacked << codepoints[marker..pos-1]
            marker = pos
          end
        end
        unpacked
      end

      # Reverse operation of g_unpack.
      #
      # Example:
      #   Unicode.g_pack(Unicode.g_unpack('क्षि')) # => 'क्षि'
      def g_pack(unpacked)
        (unpacked.flatten).pack('U*')
      end

      # Re-order codepoints so the string becomes canonical (sort combining
      # marks by combining class, bubbling out-of-order pairs back into place).
      def reorder_characters(codepoints)
        length = codepoints.length - 1
        pos = 0
        while pos < length do
          cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]]
          if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
            codepoints[pos..pos+1] = cp2.code, cp1.code
            # Step back one position after a swap so the moved codepoint is
            # re-compared against its new predecessor.
            pos += (pos > 0 ? -1 : 1)
          else
            pos += 1
          end
        end
        codepoints
      end

      # Decompose composed characters to the decomposed form.
      # +type+ is +:canonical+ or +:compatability+ (spelling kept for
      # compatibility with the existing callers and data tables).
      def decompose_codepoints(type, codepoints)
        codepoints.inject([]) do |decomposed, cp|
          # if it's a hangul syllable starter character
          if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
            # Arithmetic Hangul decomposition per the Unicode Standard, section 3.12
            sindex = cp - HANGUL_SBASE
            ncp = [] # new codepoints
            ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
            ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
            tindex = sindex % HANGUL_TCOUNT
            ncp << (HANGUL_TBASE + tindex) unless tindex == 0
            decomposed.concat ncp
          # if the codepoint is decomposable with the current decomposition type
          elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability)
            decomposed.concat decompose_codepoints(type, ncp.dup)
          else
            decomposed << cp
          end
        end
      end

      # Compose decomposed characters to the composed form.
      def compose_codepoints(codepoints)
        pos = 0
        eoa = codepoints.length - 1
        starter_pos = 0
        starter_char = codepoints[0]
        previous_combining_class = -1
        while pos < eoa
          pos += 1
          lindex = starter_char - HANGUL_LBASE
          # -- Hangul
          if 0 <= lindex and lindex < HANGUL_LCOUNT
            # Explicit nil guards replace inline `rescue` modifiers, which
            # silently swallowed every StandardError instead of just the
            # missing-index case they were meant to cover.
            vcp = codepoints[starter_pos + 1]
            vindex = vcp ? vcp - HANGUL_VBASE : -1
            if 0 <= vindex and vindex < HANGUL_VCOUNT
              tcp = codepoints[starter_pos + 2]
              tindex = tcp ? tcp - HANGUL_TBASE : -1
              if 0 <= tindex and tindex < HANGUL_TCOUNT
                j = starter_pos + 2
                eoa -= 2
              else
                tindex = 0
                j = starter_pos + 1
                eoa -= 1
              end
              codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
            end
            starter_pos += 1
            starter_char = codepoints[starter_pos]
          # -- Other characters
          else
            current_char = codepoints[pos]
            current = database.codepoints[current_char]
            if current.combining_class > previous_combining_class
              if ref = database.composition_map[starter_char]
                composition = ref[current_char]
              else
                composition = nil
              end
              unless composition.nil?
                codepoints[starter_pos] = composition
                starter_char = composition
                codepoints.delete_at pos
                eoa -= 1
                pos -= 1
                previous_combining_class = -1
              else
                previous_combining_class = current.combining_class
              end
            else
              previous_combining_class = current.combining_class
            end
            if current.combining_class == 0
              starter_pos = pos
              starter_char = codepoints[pos]
            end
          end
        end
        codepoints
      end

      # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
      #
      # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
      def tidy_bytes(string, force = false)
        if force
          return string.unpack("C*").map do |b|
            tidy_byte(b)
          end.flatten.compact.pack("C*").unpack("U*").pack("U*")
        end

        bytes = string.unpack("C*")
        conts_expected = 0
        last_lead = 0

        bytes.each_index do |i|

          byte = bytes[i]
          is_cont = byte > 127 && byte < 192
          is_lead = byte > 191 && byte < 245
          is_unused = byte > 240
          is_restricted = byte > 244

          # Impossible or highly unlikely byte? Clean it.
          if is_unused || is_restricted
            bytes[i] = tidy_byte(byte)
          elsif is_cont
            # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
          else
            if conts_expected > 0
              # Expected continuation, but got ASCII or leading? Clean backwards up to
              # the leading byte.
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
              conts_expected = 0
            end
            if is_lead
              # Final byte is leading? Clean it.
              if i == bytes.length - 1
                bytes[i] = tidy_byte(bytes.last)
              else
                # Valid leading byte? Expect continuations determined by position of
                # first zero bit, with max of 3.
                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
                last_lead = i
              end
            end
          end
        end
        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
      # passing strings to databases and validations.
      #
      # * <tt>string</tt> - The string to perform normalization on.
      # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
      #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
      #   Mail::Multibyte.default_normalization_form
      def normalize(string, form=nil)
        form ||= @default_normalization_form
        # See http://www.unicode.org/reports/tr15, Table 1
        codepoints = u_unpack(string)
        case form
          when :d
            reorder_characters(decompose_codepoints(:canonical, codepoints))
          when :c
            compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints)))
          when :kd
            reorder_characters(decompose_codepoints(:compatability, codepoints))
          when :kc
            compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints)))
          else
            raise ArgumentError, "#{form} is not a valid normalization variant", caller
        end.pack('U*')
      end

      # Maps each codepoint through the given Codepoint attribute
      # (:uppercase_mapping or :lowercase_mapping), leaving codepoints with no
      # mapping (or a 0 mapping) unchanged.
      def apply_mapping(string, mapping) #:nodoc:
        u_unpack(string).map do |codepoint|
          cp = database.codepoints[codepoint]
          if cp and (ncp = cp.send(mapping)) and ncp > 0
            ncp
          else
            codepoint
          end
        end.pack('U*')
      end

      # Holds data about a codepoint in the Unicode database
      class Codepoint
        attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
      end

      # Holds static data from the Unicode database
      class UnicodeDatabase
        ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252

        attr_writer(*ATTRIBUTES)

        def initialize
          @codepoints = Hash.new(Codepoint.new)
          @composition_exclusion = []
          @composition_map = {}
          @boundary = {}
          @cp1252 = {}
        end

        # Lazy load the Unicode database so it's only loaded when it's actually used
        ATTRIBUTES.each do |attr_name|
          class_eval(<<-EOS, __FILE__, __LINE__ + 1)
            def #{attr_name}     # def codepoints
              load               #   load
              @#{attr_name}      #   @codepoints
            end                  # end
          EOS
        end

        # Loads the Unicode database and returns all the internal objects of UnicodeDatabase.
        def load
          begin
            @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
          rescue => e
            # Narrowed from `rescue Exception`, which also swallowed
            # SignalException/SystemExit/NoMemoryError; Marshal and IO
            # failures are all StandardError subclasses.
            raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), Mail::Multibyte is unusable")
          end

          # Redefine the === method so we can write shorter rules for grapheme cluster breaks
          @boundary.each do |k,_|
            @boundary[k].instance_eval do
              def ===(other)
                detect { |i| i === other } ? true : false
              end
            end if @boundary[k].kind_of?(Array)
          end

          # Define singleton attr_readers on this instance, replacing the lazy
          # loaders above so subsequent reads skip the reload.
          class << self
            attr_reader(*ATTRIBUTES)
          end
        end

        # Returns the directory in which the data files are stored
        def self.dirname
          File.dirname(__FILE__) + '/../values/'
        end

        # Returns the filename for the data file for this version
        def self.filename
          File.expand_path File.join(dirname, "unicode_tables.dat")
        end
      end

      private

      # Maps a single CP1252/ISO-8859-1 byte to the byte sequence of its
      # UTF-8 equivalent.
      def tidy_byte(byte)
        if byte < 160
          [database.cp1252[byte] || byte].pack("U").unpack("C*")
        elsif byte < 192
          [194, byte]
        else
          [195, byte - 64]
        end
      end

      # Memoized accessor for the lazily-loaded Unicode database.
      def database
        @database ||= UnicodeDatabase.new
      end

    end
  end
end
# encoding: utf-8

module Mail #:nodoc:
  module Multibyte #:nodoc:
    if Kernel.const_defined?(:Encoding)
      # Returns a regular expression that matches valid characters in the current encoding
      def self.valid_character
        VALID_CHARACTER[Encoding.default_external.to_s]
      end
    else
      # Pre-1.9 fallback: pick the validation pattern from the global $KCODE.
      def self.valid_character
        if $KCODE == 'UTF8'
          VALID_CHARACTER['UTF-8']
        elsif $KCODE == 'SJIS'
          VALID_CHARACTER['Shift_JIS']
        end
      end
    end

    if String.method_defined?(:valid_encoding?)
      # Verifies the encoding of a string
      def self.verify(string)
        string.valid_encoding?
      end
    else
      def self.verify(string)
        expression = valid_character
        # Without a known pattern we cannot judge, so accept the string.
        return true unless expression
        # Splits the string on character boundaries, which are determined based on $KCODE.
        string.split(//).all? { |c| expression =~ c }
      end
    end

    # Verifies the encoding of the string and raises an exception when it's not valid
    def self.verify!(string)
      unless verify(string)
        raise EncodingError.new("Found characters with invalid encoding")
      end
    end

    if String.method_defined?(:force_encoding)
      # Removes all invalid characters from the string.
      #
      # Note: this method is a no-op in Ruby 1.9
      def self.clean(string)
        string
      end
    else
      def self.clean(string)
        expression = valid_character
        # Splits the string on character boundaries, which are determined based on $KCODE.
        expression ? string.split(//).grep(expression).join : string
      end
    end
  end
end