mail 1.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of mail might be problematic. Click here for more details.

Files changed (107) hide show
  1. data/.gitignore +4 -0
  2. data/Manifest.txt +106 -0
  3. data/README.rdoc +441 -0
  4. data/Rakefile +38 -0
  5. data/lib/mail.rb +86 -0
  6. data/lib/mail/attachment.rb +90 -0
  7. data/lib/mail/body.rb +149 -0
  8. data/lib/mail/configuration.rb +90 -0
  9. data/lib/mail/core_extensions.rb +6 -0
  10. data/lib/mail/core_extensions/blank.rb +41 -0
  11. data/lib/mail/core_extensions/nil.rb +15 -0
  12. data/lib/mail/core_extensions/string.rb +31 -0
  13. data/lib/mail/elements/address.rb +293 -0
  14. data/lib/mail/elements/address_list.rb +62 -0
  15. data/lib/mail/elements/content_disposition_element.rb +34 -0
  16. data/lib/mail/elements/content_transfer_encoding_element.rb +21 -0
  17. data/lib/mail/elements/content_type_element.rb +39 -0
  18. data/lib/mail/elements/date_time_element.rb +26 -0
  19. data/lib/mail/elements/envelope_from_element.rb +34 -0
  20. data/lib/mail/elements/message_ids_element.rb +29 -0
  21. data/lib/mail/elements/mime_version_element.rb +26 -0
  22. data/lib/mail/elements/phrase_list.rb +21 -0
  23. data/lib/mail/elements/received_element.rb +30 -0
  24. data/lib/mail/encodings/base64.rb +17 -0
  25. data/lib/mail/encodings/encodings.rb +24 -0
  26. data/lib/mail/encodings/quoted_printable.rb +26 -0
  27. data/lib/mail/envelope.rb +35 -0
  28. data/lib/mail/field.rb +202 -0
  29. data/lib/mail/field_list.rb +33 -0
  30. data/lib/mail/fields/bcc_field.rb +40 -0
  31. data/lib/mail/fields/cc_field.rb +40 -0
  32. data/lib/mail/fields/comments_field.rb +41 -0
  33. data/lib/mail/fields/common/common_address.rb +62 -0
  34. data/lib/mail/fields/common/common_date.rb +35 -0
  35. data/lib/mail/fields/common/common_field.rb +128 -0
  36. data/lib/mail/fields/common/common_message_id.rb +35 -0
  37. data/lib/mail/fields/content_description_field.rb +15 -0
  38. data/lib/mail/fields/content_disposition_field.rb +34 -0
  39. data/lib/mail/fields/content_id_field.rb +50 -0
  40. data/lib/mail/fields/content_transfer_encoding_field.rb +28 -0
  41. data/lib/mail/fields/content_type_field.rb +50 -0
  42. data/lib/mail/fields/date_field.rb +44 -0
  43. data/lib/mail/fields/from_field.rb +40 -0
  44. data/lib/mail/fields/in_reply_to_field.rb +42 -0
  45. data/lib/mail/fields/keywords_field.rb +22 -0
  46. data/lib/mail/fields/message_id_field.rb +70 -0
  47. data/lib/mail/fields/mime_version_field.rb +42 -0
  48. data/lib/mail/fields/optional_field.rb +11 -0
  49. data/lib/mail/fields/received_field.rb +49 -0
  50. data/lib/mail/fields/references_field.rb +42 -0
  51. data/lib/mail/fields/reply_to_field.rb +40 -0
  52. data/lib/mail/fields/resent_bcc_field.rb +40 -0
  53. data/lib/mail/fields/resent_cc_field.rb +40 -0
  54. data/lib/mail/fields/resent_date_field.rb +16 -0
  55. data/lib/mail/fields/resent_from_field.rb +40 -0
  56. data/lib/mail/fields/resent_message_id_field.rb +20 -0
  57. data/lib/mail/fields/resent_sender_field.rb +48 -0
  58. data/lib/mail/fields/resent_to_field.rb +40 -0
  59. data/lib/mail/fields/return_path_field.rb +34 -0
  60. data/lib/mail/fields/sender_field.rb +48 -0
  61. data/lib/mail/fields/structured_field.rb +32 -0
  62. data/lib/mail/fields/subject_field.rb +14 -0
  63. data/lib/mail/fields/to_field.rb +40 -0
  64. data/lib/mail/fields/unstructured_field.rb +27 -0
  65. data/lib/mail/header.rb +213 -0
  66. data/lib/mail/mail.rb +120 -0
  67. data/lib/mail/message.rb +648 -0
  68. data/lib/mail/network/deliverable.rb +42 -0
  69. data/lib/mail/network/retrievable.rb +63 -0
  70. data/lib/mail/parsers/address_lists.rb +61 -0
  71. data/lib/mail/parsers/address_lists.treetop +19 -0
  72. data/lib/mail/parsers/content_disposition.rb +358 -0
  73. data/lib/mail/parsers/content_disposition.treetop +45 -0
  74. data/lib/mail/parsers/content_transfer_encoding.rb +179 -0
  75. data/lib/mail/parsers/content_transfer_encoding.treetop +25 -0
  76. data/lib/mail/parsers/content_type.rb +507 -0
  77. data/lib/mail/parsers/content_type.treetop +58 -0
  78. data/lib/mail/parsers/date_time.rb +111 -0
  79. data/lib/mail/parsers/date_time.treetop +11 -0
  80. data/lib/mail/parsers/envelope_from.rb +188 -0
  81. data/lib/mail/parsers/envelope_from.treetop +32 -0
  82. data/lib/mail/parsers/message_ids.rb +42 -0
  83. data/lib/mail/parsers/message_ids.treetop +15 -0
  84. data/lib/mail/parsers/mime_version.rb +141 -0
  85. data/lib/mail/parsers/mime_version.treetop +19 -0
  86. data/lib/mail/parsers/phrase_lists.rb +42 -0
  87. data/lib/mail/parsers/phrase_lists.treetop +15 -0
  88. data/lib/mail/parsers/received.rb +68 -0
  89. data/lib/mail/parsers/received.treetop +11 -0
  90. data/lib/mail/parsers/rfc2045.rb +406 -0
  91. data/lib/mail/parsers/rfc2045.treetop +35 -0
  92. data/lib/mail/parsers/rfc2822.rb +5005 -0
  93. data/lib/mail/parsers/rfc2822.treetop +402 -0
  94. data/lib/mail/parsers/rfc2822_obsolete.rb +3607 -0
  95. data/lib/mail/parsers/rfc2822_obsolete.treetop +241 -0
  96. data/lib/mail/part.rb +120 -0
  97. data/lib/mail/patterns.rb +42 -0
  98. data/lib/mail/utilities.rb +142 -0
  99. data/lib/mail/version.rb +10 -0
  100. data/lib/mail/version_specific/multibyte.rb +62 -0
  101. data/lib/mail/version_specific/multibyte/chars.rb +701 -0
  102. data/lib/mail/version_specific/multibyte/exceptions.rb +8 -0
  103. data/lib/mail/version_specific/multibyte/unicode_database.rb +71 -0
  104. data/lib/mail/version_specific/ruby_1_8.rb +61 -0
  105. data/lib/mail/version_specific/ruby_1_8_string.rb +88 -0
  106. data/lib/mail/version_specific/ruby_1_9.rb +49 -0
  107. metadata +192 -0
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
module Mail
  # Version components of the Mail gem plus the dotted version string.
  module VERSION
    MAJOR, MINOR, TINY = 1, 0, 0

    # Dotted form of the three components, e.g. "1.0.0".
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,62 @@
1
+ # encoding: utf-8
2
+ #:nodoc:
3
+
4
+ # OK... serious code smell in here... I just took the whole multibyte_chars code out of
5
+ # ActiveSupport.... hacked it to fit... like a mallet bashing a square peg... the thing
6
+ # fits in the hole... really!
7
+ #
8
+ # Bah... I'll get the first gem out and we'll fix this up.
9
+
10
+ require File.join(File.dirname(__FILE__), 'multibyte/chars')
11
+ require File.join(File.dirname(__FILE__), 'multibyte/exceptions')
12
+ require File.join(File.dirname(__FILE__), 'multibyte/unicode_database')
13
+
14
module Mail #:nodoc:
  # Configuration entry point for the multibyte (UTF-8) string support.
  module Multibyte
    # All available normalization forms. See
    # http://www.unicode.org/reports/tr15/tr15-29.html for details.
    NORMALIZATION_FORMS = [:c, :kc, :d, :kd]

    # The Unicode version that is supported by the implementation.
    UNICODE_VERSION = '5.1.0'

    class << self
      # Setters for the module-level configuration.
      #
      # Examples:
      #   Mail::Multibyte.default_normalization_form = :c
      #   Mail::Multibyte.proxy_class = CharsForUTF32
      attr_writer :default_normalization_form, :proxy_class

      # Normalization form used when none is given explicitly; :kc unless
      # configured otherwise. Expected to be one of NORMALIZATION_FORMS.
      def default_normalization_form
        @default_normalization_form ||= :kc
      end

      # Proxy class returned by mb_chars; Mail::Multibyte::Chars unless
      # configured otherwise.
      def proxy_class
        @proxy_class ||= Mail::Multibyte::Chars
      end
    end

    # The instance methods below delegate to the +mb_chars+ proxy of the
    # object this module is mixed into.

    def length
      self.mb_chars.length
    end

    def slice!(*args)
      self.mb_chars.slice!(*args)
    end

    def slice(*args)
      self.mb_chars.slice(*args)
    end
  end
end
@@ -0,0 +1,701 @@
1
+ # encoding: utf-8
2
+
3
+ module Mail #:nodoc:
4
+ module Multibyte #:nodoc:
5
+ # Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive
6
+ # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
7
+ # encoding safe manner. All the normal String methods are also implemented on the proxy.
8
+ #
9
+ # String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
10
+ # which would normally return a String object now return a Chars object so methods can be chained.
11
+ #
12
+ # "The Perfect String ".mb_chars.downcase.strip.normalize #=> "the perfect string"
13
+ #
14
+ # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
15
+ # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
16
+ #
17
+ # bad.explicit_checking_method "T".mb_chars.downcase.to_s
18
+ #
19
+ # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
20
+ # encodings you can write your own multibyte string handler and configure it through
21
+ # Mail::Multibyte.proxy_class.
22
+ #
23
+ # class CharsForUTF32
24
+ # def size
25
+ # @wrapped_string.size / 4
26
+ # end
27
+ #
28
+ # def self.accepts?(string)
29
+ # string.length % 4 == 0
30
+ # end
31
+ # end
32
+ #
33
+ # Mail::Multibyte.proxy_class = CharsForUTF32
34
+ class Chars
35
# Hangul boundaries and counts used by the algorithmic Hangul
# decomposition/composition in decompose_codepoints and compose_codepoints.
HANGUL_SBASE = 0xAC00 # base of the syllable range
HANGUL_LBASE = 0x1100 # base of the leading jamo
HANGUL_VBASE = 0x1161 # base of the vowel jamo
HANGUL_TBASE = 0x11A7 # base of the trailing jamo (index 0 means "no trailing jamo")
HANGUL_LCOUNT = 19
HANGUL_VCOUNT = 21
HANGUL_TCOUNT = 28
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT
HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
HANGUL_JAMO_FIRST = 0x1100
HANGUL_JAMO_LAST = 0x11FF
48
+
49
# All codepoints carrying the Unicode White_Space property.
UNICODE_WHITESPACE = (
  (0x0009..0x000D).to_a +   # Cc [5] <control-0009>..<control-000D>
  [
    0x0020,                 # Zs SPACE
    0x0085,                 # Cc <control-0085>
    0x00A0,                 # Zs NO-BREAK SPACE
    0x1680,                 # Zs OGHAM SPACE MARK
    0x180E                  # Zs MONGOLIAN VOWEL SEPARATOR
  ] +
  (0x2000..0x200A).to_a +   # Zs [11] EN QUAD..HAIR SPACE
  [
    0x2028,                 # Zl LINE SEPARATOR
    0x2029,                 # Zp PARAGRAPH SEPARATOR
    0x202F,                 # Zs NARROW NO-BREAK SPACE
    0x205F,                 # Zs MEDIUM MATHEMATICAL SPACE
    0x3000                  # Zs IDEOGRAPHIC SPACE
  ]
).freeze

# The BOM (byte order mark, ZERO-WIDTH NO-BREAK SPACE) is a non-rendering
# character that distinguishes little from big endian. That is irrelevant for
# UTF-8, so it is stripped together with the whitespace.
UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [0xFEFF]
68
+
69
# Builds the body of a regular expression alternation ("a|b|c") that matches
# any one of the given Unicode codepoints. Returns a String, not a Regexp.
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
  alternatives = array_of_codepoints.map { |codepoint| [codepoint].pack('U') }
  alternatives.join('|')
end
73
# Patterns matching a run of whitespace/BOM characters at the very end
# (TRAILERS) or the very start (LEADERS) of a string.
leaders_and_trailers = codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)
UNICODE_TRAILERS_PAT = /(#{leaders_and_trailers})+\Z/
UNICODE_LEADERS_PAT = /\A(#{leaders_and_trailers})+/
75
+
76
+ # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
77
+ UTF8_PAT = /\A(?:
78
+ [\x00-\x7f] |
79
+ [\xc2-\xdf] [\x80-\xbf] |
80
+ \xe0 [\xa0-\xbf] [\x80-\xbf] |
81
+ [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
82
+ \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
83
+ [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
84
+ \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
85
+ )*\z/xn
86
+
87
# The String being proxied. +to_s+ and +to_str+ return that same object, so a
# Chars instance can be handed to code expecting a String.
attr_reader :wrapped_string
alias_method :to_s, :wrapped_string
alias_method :to_str, :wrapped_string
90
+
91
if String.method_defined?(:force_encoding)
  # Creates a new Chars instance by wrapping _string_. On Rubies whose
  # strings carry an encoding, the wrapped string itself is tagged as UTF-8
  # in place, unless it is frozen.
  def initialize(string)
    @wrapped_string = string
    return if @wrapped_string.frozen?
    @wrapped_string.force_encoding(Encoding::UTF_8)
  end
else
  # Creates a new Chars instance by wrapping _string_.
  def initialize(string) #:nodoc:
    @wrapped_string = string
  end
end
102
+
103
# Forwards every method Chars does not define itself to the wrapped string.
#
# * Bang methods (name ending in "!") are run for their side effect on the
#   wrapped string and return the Chars object itself.
# * Other methods return their result, re-wrapped in a Chars object when the
#   result is a String so calls can be chained.
def method_missing(method, *args, &block)
  outcome = @wrapped_string.__send__(method, *args, &block)
  return self if method.to_s =~ /!$/
  return chars(outcome) if outcome.kind_of?(String)
  outcome
end
113
+
114
# Returns +true+ if either the Chars object or the wrapped string responds to
# the given method. Private methods are included in the search only if the
# optional second parameter evaluates to +true+.
def respond_to?(method, include_private=false)
  return true if super
  @wrapped_string.respond_to?(method, include_private) ? true : false
end
119
+
120
# Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
#
# Always returns +true+.
def acts_like_string?
  true
end
124
+
125
# Returns +true+ if the Chars class can and should act as a proxy for the
# string _string_: $KCODE must be 'UTF8' and the string must be consumable
# (see consumes?). Returns +false+ otherwise.
def self.wants?(string)
  return false unless $KCODE == 'UTF8'
  consumes?(string)
end
130
+
131
# Returns +true+ when the proxy class can handle the string, i.e. when it can
# be unpacked as UTF-8 codepoints. Returns +false+ otherwise.
def self.consumes?(string)
  begin
    # Unpack is a little bit faster than regular expressions.
    string.unpack('U*')
  rescue ArgumentError
    return false
  end
  true
end
139
+
140
+ include Comparable
141
+
142
# Compares the wrapped string with the string form of _other_ and returns
# <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt>, like <tt>String#<=></tt>. Any object
# that implements +to_s+ is accepted.
#
# Example:
#   'é'.mb_chars <=> 'ü'.mb_chars #=> -1
def <=>(other)
  right_hand_side = other.to_s
  @wrapped_string <=> right_hand_side
end
151
+
152
# Returns a new Chars object containing the _other_ object concatenated to the
# string. The receiver is left untouched.
#
# Example:
#   ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
def +(other)
  # Append to a copy: appending to the wrapped string itself would silently
  # modify the receiver (and the String it wraps).
  chars(@wrapped_string.dup << other)
end
159
+
160
# Like <tt>String#=~</tt>, only it returns the character offset (in
# codepoints) instead of the byte offset. Returns +nil+ when nothing matches.
#
# Example:
#   'Café périferôl'.mb_chars =~ /ô/ #=> 12
def =~(other)
  raw_offset = (@wrapped_string =~ other)
  translate_offset(raw_offset)
end
167
+
168
# Works just like <tt>String#split</tt>, except that every item in the
# resulting list is converted with +mb_chars+ instead of being a plain String.
# This makes chaining methods easier.
#
# Example:
#   'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
def split(*args)
  pieces = @wrapped_string.split(*args)
  pieces.collect { |piece| piece.mb_chars }
end
176
+
177
# Inserts _fragment_ into the wrapped string at the given codepoint offset,
# modifying the wrapped string in place. Returns the Chars object.
#
# Raises IndexError when _offset_ lies beyond the end of the string.
#
# Example:
#   'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
def insert(offset, fragment)
  codepoints = self.class.u_unpack(@wrapped_string)
  raise IndexError, "index #{offset} out of string" if offset > codepoints.length
  codepoints.insert(offset, *self.class.u_unpack(fragment))
  @wrapped_string.replace(codepoints.pack('U*'))
  self
end
192
+
193
# Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
# The check is delegated directly to the wrapped string.
#
# Example:
#   'Café'.mb_chars.include?('é') #=> true
def include?(other)
  # We have to redefine this method because Enumerable defines it.
  @wrapped_string.include?(other)
end
201
+
202
# Returns the position of _needle_ in the string, counted in codepoints, or
# +nil+ if _needle_ isn't found. _offset_ is handed to <tt>String#index</tt>
# unchanged.
#
# Example:
#   'Café périferôl'.mb_chars.index('ô') #=> 12
#   'Café périferôl'.mb_chars.index(/\w/u) #=> 0
def index(needle, offset=0)
  position = @wrapped_string.index(needle, offset)
  return nil unless position
  leading = @wrapped_string.slice(0...position)
  self.class.u_unpack(leading).size
end
211
+
212
# Like <tt>String#[]=</tt>, except instead of byte offsets you specify
# character offsets. The wrapped string is modified in place.
#
# The target may be given as a Regexp (delegated to String), an Integer
# offset with an optional length, a Range of character offsets, or any other
# object, whose string form is searched for and replaced.
#
# Raises IndexError when an Integer offset is out of the string or a string
# needle is not found, and RangeError when a Range starts beyond the end.
#
# Example:
#
#   s = "Müller"
#   s.mb_chars[2] = "e" # Replace character with offset 2
#   s
#   #=> "Müeler"
#
#   s = "Müller"
#   s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
#   s
#   #=> "Möler"
def []=(*args)
  replace_by = args.pop
  # Indexed replace with regular expressions already works
  if args.first.is_a?(Regexp)
    @wrapped_string[*args] = replace_by
  else
    result = self.class.u_unpack(@wrapped_string)
    # Integer rather than Fixnum: Fixnum is gone from current Rubies, and
    # Integer matches the same values on old ones.
    if args[0].is_a?(Integer)
      raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
      min = args[0]
      max = args[1].nil? ? min : (min + args[1] - 1)
      range = Range.new(min, max)
      replace_by = [replace_by].pack('U') if replace_by.is_a?(Integer)
    elsif args.first.is_a?(Range)
      # Range#first, not #min: #min is nil for ranges such as 2..-1.
      raise RangeError, "#{args[0]} out of range" if args[0].first >= result.length
      range = args[0]
    else
      needle = args[0].to_s
      min = index(needle)
      # Mirror String#[]= instead of failing on nil arithmetic.
      raise IndexError, "string not matched" if min.nil?
      max = min + self.class.u_unpack(needle).length - 1
      range = Range.new(min, max)
    end
    result[range] = self.class.u_unpack(replace_by)
    @wrapped_string.replace(result.pack('U*'))
  end
end
251
+
252
# Works just like <tt>String#rjust</tt>, only integer specifies characters instead of bytes.
# _padstr_ may be any non-empty string, for example a non-breaking space.
#
# Example:
#
#   "¾ cup".mb_chars.rjust(8).to_s
#   #=> "   ¾ cup"
def rjust(integer, padstr=' ')
  justify(integer, :right, padstr)
end
264
+
265
# Works just like <tt>String#ljust</tt>, only integer specifies characters instead of bytes.
# _padstr_ may be any non-empty string, for example a non-breaking space.
#
# Example:
#
#   "¾ cup".mb_chars.ljust(8).to_s
#   #=> "¾ cup   "
def ljust(integer, padstr=' ')
  justify(integer, :left, padstr)
end
277
+
278
# Works just like <tt>String#center</tt>, only integer specifies characters instead of bytes.
# _padstr_ may be any non-empty string, for example a non-breaking space.
# When the padding cannot be split evenly, the extra character goes on the right.
#
# Example:
#
#   "¾ cup".mb_chars.center(8).to_s
#   #=> " ¾ cup  "
def center(integer, padstr=' ')
  justify(integer, :center, padstr)
end
290
+
291
# Returns a new Chars object with the entire range of Unicode whitespace
# (and BOMs) removed from the right of the string.
def rstrip
  stripped = @wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')
  chars(stripped)
end
295
+
296
# Returns a new Chars object with the entire range of Unicode whitespace
# (and BOMs) removed from the left of the string.
def lstrip
  stripped = @wrapped_string.gsub(UNICODE_LEADERS_PAT, '')
  chars(stripped)
end
300
+
301
# Returns a new Chars object with Unicode whitespace removed from both ends:
# first from the right, then from the left.
def strip
  right_trimmed = rstrip
  right_trimmed.lstrip
end
305
+
306
# Returns the number of codepoints in the string.
def size
  codepoints = self.class.u_unpack(@wrapped_string)
  codepoints.size
end
310
+ alias_method :length, :size
311
+
312
# Returns a new Chars object with the codepoints of the string in reverse
# order.
#
# Example:
#   'Café'.mb_chars.reverse.to_s #=> 'éfaC'
def reverse
  reversed = self.class.u_unpack(@wrapped_string).reverse
  chars(reversed.pack('U*'))
end
319
+
320
# Implements Unicode-aware slice with codepoints. Slicing on one point
# returns the character at that codepoint offset.
#
# Accepts the same argument shapes as <tt>String#slice</tt>: an offset, an
# offset and a length, a Range, a Regexp (with optional capture index) or a
# String to look for. Returns a Chars object, or +nil+ when nothing matches
# or the requested position lies outside the string.
#
# Raises ArgumentError or TypeError for argument lists a native String would
# reject.
#
# Example:
#   'こんにちは'.mb_chars.slice(2..3).to_s #=> "にち"
def slice(*args)
  if args.size > 2
    raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
  elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
    raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
  elsif (args.size == 2 && !args[1].is_a?(Numeric))
    raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
  elsif args[0].kind_of?(Regexp) || args[0].kind_of?(String)
    # Pattern and substring lookups do not depend on offsets, so the native
    # implementation can be used directly.
    result = @wrapped_string.slice(*args)
  elsif args.size == 1 && args[0].kind_of?(Numeric)
    character = self.class.u_unpack(@wrapped_string)[args[0]]
    result = character.nil? ? nil : [character].pack('U')
  else
    # Range or (offset, length). Array#slice returns nil when the start lies
    # outside the array; pass that on as nil like String#slice does.
    cps = self.class.u_unpack(@wrapped_string).slice(*args)
    result = cps.nil? ? nil : cps.pack('U*')
  end
  result.nil? ? nil : chars(result)
end
345
+ alias_method :[], :slice
346
+
347
# Like <tt>String#slice!</tt>, except instead of byte offsets you specify
# character offsets. Removes the selected part from the wrapped string and
# returns it; returns +nil+ and leaves the string alone when nothing was
# selected.
#
# Example:
#   s = 'こんにちは'
#   s.mb_chars.slice!(2..3).to_s #=> "にち"
#   s #=> "こんは"
def slice!(*args)
  removed = self[*args]
  # Nothing matched: do not attempt the replacement, which would raise.
  return nil if removed.nil?
  self[*args] = ''
  removed
end
358
+
359
# Returns the codepoint of the first character in the string, or +nil+ when
# the string is empty.
#
# Example:
#   'こんにちは'.mb_chars.ord #=> 12371
def ord
  codepoints = self.class.u_unpack(@wrapped_string)
  codepoints.first
end
366
+
367
# Returns a new Chars object with the characters converted to uppercase,
# using the +uppercase_mapping+ recorded for each codepoint.
#
# Example:
#   'Laurent, òu sont les tests?'.mb_chars.upcase.to_s #=> "LAURENT, ÒU SONT LES TESTS?"
def upcase
  apply_mapping(:uppercase_mapping)
end
374
+
375
# Returns a new Chars object with the characters converted to lowercase,
# using the +lowercase_mapping+ recorded for each codepoint.
#
# Example:
#   'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
def downcase
  apply_mapping(:lowercase_mapping)
end
382
+
383
# Converts the first character to uppercase and the remainder to lowercase.
# An empty string yields an empty result.
#
# Example:
#   'über'.mb_chars.capitalize.to_s #=> "Über"
def capitalize
  head = slice(0) || chars('')
  tail = slice(1..-1) || chars('')
  head.upcase + tail.downcase
end
390
+
391
# Returns the KC normalization of the string by default. NFKC is considered
# the best normalization form for passing strings to databases and
# validations.
#
# * <tt>form</tt> - The form you want to normalize in. Should be one of the
#   following: <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>.
#   Default is Mail::Multibyte.default_normalization_form.
#
# Returns a new Chars object. Raises ArgumentError for any other form.
def normalize(form=Mail::Multibyte.default_normalization_form)
  # See http://www.unicode.org/reports/tr15, Table 1
  codepoints = self.class.u_unpack(@wrapped_string)
  chars(case form
    when :d
      self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))
    when :c
      self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)))
    when :kd
      self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))
    when :kc
      self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)))
    else
      raise ArgumentError, "#{form} is not a valid normalization variant", caller
  end.pack('U*'))
end
414
+
415
# Performs canonical decomposition on all the characters and returns the
# result as a new Chars object.
#
# Example:
#   'é'.length #=> 2
#   'é'.mb_chars.decompose.to_s.length #=> 3
def decompose
  codepoints = self.class.u_unpack(@wrapped_string)
  decomposed = self.class.decompose_codepoints(:canonical, codepoints)
  chars(decomposed.pack('U*'))
end
423
+
424
# Performs composition on all the characters and returns the result as a new
# Chars object.
#
# Example:
#   'é'.length #=> 3
#   'é'.mb_chars.compose.to_s.length #=> 2
def compose
  codepoints = self.class.u_unpack(@wrapped_string)
  composed = self.class.compose_codepoints(codepoints)
  chars(composed.pack('U*'))
end
432
+
433
# Returns the number of grapheme clusters in the string, as determined by
# Chars.g_unpack.
#
# Example:
#   'क्षि'.mb_chars.length #=> 4
#   'क्षि'.mb_chars.g_length #=> 3
def g_length
  clusters = self.class.g_unpack(@wrapped_string)
  clusters.length
end
441
+
442
# Returns a new Chars object in which all ISO-8859-1 or CP1252 characters are
# replaced by their UTF-8 equivalent, resulting in a valid UTF-8 string.
def tidy_bytes
  tidied = self.class.tidy_bytes(@wrapped_string)
  chars(tidied)
end
446
+
447
# Defines the bang variant of each listed method. A bang variant replaces the
# wrapped string with the string form of the non-bang result and returns the
# Chars object itself.
%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
  define_method("#{method}!") do |*args|
    # A splat is always an Array (possibly empty), so one call covers both
    # the with-arguments and the without-arguments case.
    @wrapped_string = send(method, *args).to_s
    self
  end
end
457
+
458
+ class << self
459
+
460
# Unpacks the string at codepoint boundaries and returns the codepoints as an
# Array of Integers. Raises an EncodingError when the encoding of the string
# isn't valid UTF-8.
#
# Example:
#   Chars.u_unpack('Café') #=> [67, 97, 102, 233]
def u_unpack(string)
  string.unpack('U*')
rescue ArgumentError
  raise EncodingError, 'malformed UTF-8 character'
end
472
+
473
# Detects whether the codepoint is in one of the given character classes.
# Returns +true+ when it is and +false+ otherwise. Valid character classes
# are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>, <tt>:v</tt>, <tt>:lv</tt>,
# <tt>:lvt</tt> and <tt>:t</tt>.
#
# Primarily used by the grapheme cluster support.
def in_char_class?(codepoint, classes)
  match = classes.detect { |char_class| UCD.boundary[char_class] === codepoint }
  !!match
end
481
+
482
# Unpacks the string at grapheme boundaries. Returns a list of codepoint
# lists, one per grapheme cluster.
#
# Example:
#   Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
#   Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
def g_unpack(string)
  codepoints = u_unpack(string)
  clusters = []
  cluster_start = 0
  1.upto(codepoints.length) do |pos|
    previous = codepoints[pos - 1]
    current = codepoints[pos]
    # No boundary between previous and current when any of these rules holds.
    joined =
      # CR X LF
      (previous == UCD.boundary[:cr] && current == UCD.boundary[:lf]) ||
      # L X (L|V|LV|LVT)
      (UCD.boundary[:l] === previous && in_char_class?(current, [:l, :v, :lv, :lvt])) ||
      # (LV|V) X (V|T)
      (in_char_class?(previous, [:lv, :v]) && in_char_class?(current, [:v, :t])) ||
      # (LVT|T) X (T)
      (in_char_class?(previous, [:lvt, :t]) && UCD.boundary[:t] === current) ||
      # X Extend
      (UCD.boundary[:extend] === current)
    next if joined
    clusters << codepoints[cluster_start..pos - 1]
    cluster_start = pos
  end
  clusters
end
516
+
517
# Reverse operation of g_unpack: flattens the list of codepoint lists and
# packs it back into a string.
#
# Example:
#   Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
def g_pack(unpacked)
  codepoints = unpacked.flatten
  codepoints.pack('U*')
end
524
+
525
# Builds padding that is _padsize_ characters long by repeating _padstr_ and
# cutting the result to length. A _padsize_ of zero yields an empty String.
def padding(padsize, padstr=' ') #:nodoc:
  return '' if padsize == 0
  repetitions = (padsize / u_unpack(padstr).size) + 1
  new(padstr * repetitions).slice(0, padsize)
end
532
+
533
# Re-orders codepoints so the string becomes canonical: adjacent codepoints
# are swapped whenever the first has a higher combining class than the second
# and the second's combining class is non-zero. The given array is modified
# in place and returned.
def reorder_characters(codepoints)
  last_pair_index = codepoints.length - 1
  pos = 0
  while pos < last_pair_index do
    left = UCD.codepoints[codepoints[pos]]
    right = UCD.codepoints[codepoints[pos + 1]]
    out_of_order = (left.combining_class > right.combining_class) && (right.combining_class > 0)
    if out_of_order
      codepoints[pos], codepoints[pos + 1] = right.code, left.code
      # Step back so the moved codepoint is compared with its new neighbour.
      pos += (pos > 0 ? -1 : 1)
    else
      pos += 1
    end
  end
  codepoints
end
548
+
549
# Decomposes composed characters to the decomposed form and returns the
# resulting codepoints as a new Array.
#
# _type_ selects the decomposition: with <tt>:compatability</tt> every
# recorded mapping is applied, otherwise only mappings without a
# decomposition type are. Hangul syllables are decomposed arithmetically.
def decompose_codepoints(type, codepoints)
  decomposed = []
  codepoints.each do |cp|
    # if it's a hangul syllable starter character
    if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
      sindex = cp - HANGUL_SBASE
      tindex = sindex % HANGUL_TCOUNT
      decomposed << HANGUL_LBASE + sindex / HANGUL_NCOUNT
      decomposed << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
      decomposed << (HANGUL_TBASE + tindex) unless tindex == 0
    # if the codepoint is decomposable in with the current decomposition type
    elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability)
      # Mappings may themselves contain decomposable codepoints.
      decomposed.concat decompose_codepoints(type, ncp.dup)
    else
      decomposed << cp
    end
  end
  decomposed
end
569
+
570
# Compose decomposed characters to the composed form.
#
# Walks the array once, tracking the current "starter" (the codepoint that
# following codepoints may be combined into). The given array is modified in
# place (entries are replaced and deleted) and returned.
def compose_codepoints(codepoints)
  pos = 0
  eoa = codepoints.length - 1 # index of the last element; shrinks as codepoints are merged
  starter_pos = 0
  starter_char = codepoints[0]
  previous_combining_class = -1
  while pos < eoa
    pos += 1
    lindex = starter_char - HANGUL_LBASE
    # -- Hangul
    # The starter is a leading jamo: try to merge it with a following vowel
    # jamo and optional trailing jamo into a single syllable codepoint.
    if 0 <= lindex and lindex < HANGUL_LCOUNT
      # The rescue covers a missing (nil) codepoint past the end of the array.
      vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
      if 0 <= vindex and vindex < HANGUL_VCOUNT
        tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
        if 0 <= tindex and tindex < HANGUL_TCOUNT
          j = starter_pos + 2
          eoa -= 2
        else
          tindex = 0
          j = starter_pos + 1
          eoa -= 1
        end
        # Replace the jamo run starter_pos..j by the computed syllable.
        codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
      end
      starter_pos += 1
      starter_char = codepoints[starter_pos]
    # -- Other characters
    else
      current_char = codepoints[pos]
      current = UCD.codepoints[current_char]
      # Only attempt a composition when the combining class increased.
      if current.combining_class > previous_combining_class
        if ref = UCD.composition_map[starter_char]
          composition = ref[current_char]
        else
          composition = nil
        end
        unless composition.nil?
          # Merge current into the starter and drop it from the array.
          codepoints[starter_pos] = composition
          starter_char = composition
          codepoints.delete_at pos
          eoa -= 1
          pos -= 1
          previous_combining_class = -1
        else
          previous_combining_class = current.combining_class
        end
      else
        previous_combining_class = current.combining_class
      end
      # A codepoint with combining class 0 becomes the new starter.
      if current.combining_class == 0
        starter_pos = pos
        starter_char = codepoints[pos]
      end
    end
  end
  codepoints
end
628
+
629
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
#
# The string is split into pieces with the //u pattern. A piece that matches
# UTF8_PAT is kept as is; any other piece is read as a single byte value and
# converted by value range (see the inline notes).
def tidy_bytes(string)
  string.split(//u).map do |c|
    # Only available on Rubies whose strings carry an encoding.
    c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)

    if !UTF8_PAT.match(c)
      n = c.unpack('C')[0]
      # < 128: returned as the single character n.chr
      # < 160: looked up in UCD.cp1252, falling back to n itself as codepoint
      # < 192: prefixed with "\xC2"; otherwise "\xC3" followed by (n - 64)
      n < 128 ? n.chr :
      n < 160 ? [UCD.cp1252[n] || n].pack('U') :
      n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
    else
      c
    end
  end.join
end
644
+ end
645
+
646
+ protected
647
+
648
# Converts an offset into the wrapped string, as returned by String#=~, into
# an offset counted in codepoints. Returns +nil+ for a +nil+ offset and 0 for
# an empty wrapped string. Raises EncodingError when the leading part of the
# string cannot be unpacked as UTF-8.
def translate_offset(byte_offset) #:nodoc:
  return nil if byte_offset.nil?
  return 0 if @wrapped_string == ''
  chunk = @wrapped_string[0..byte_offset]
  begin
    begin
      # The chunk includes the character at the offset, hence the - 1.
      chunk.unpack('U*').length - 1
    rescue ArgumentError => e
      # Extend the chunk by one position and try again.
      chunk = @wrapped_string[0..(byte_offset+=1)]
      # Stop retrying at the end of the string
      raise e unless byte_offset < chunk.length
      # We damaged a character, retry
      retry
    end
  # Catch the ArgumentError so we can throw our own
  rescue ArgumentError
    raise EncodingError, 'malformed UTF-8 character'
  end
end
667
+
668
# Shared implementation of rjust, ljust and center. Pads a copy of the
# wrapped string with _padstr_ up to _integer_ characters; _way_ is
# <tt>:right</tt>, <tt>:left</tt> or <tt>:center</tt>. Strings already at
# least that long are returned unpadded.
#
# Raises ArgumentError when _padstr_ is empty.
def justify(integer, way, padstr=' ') #:nodoc:
  raise ArgumentError, "zero width padding" if padstr.length == 0
  padsize = integer - size
  padsize = 0 unless padsize > 0
  result = case way
    when :right
      @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr))
    when :left
      @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr))
    when :center
      # An odd amount of padding puts the extra character on the right.
      lpad = self.class.padding((padsize / 2.0).floor, padstr)
      rpad = self.class.padding((padsize / 2.0).ceil, padstr)
      @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
  end
  chars(result)
end
684
+
685
# Maps every codepoint of the wrapped string through the given mapping (the
# name of a method on the codepoint's UCD entry) and returns the result as a
# new Chars object. Codepoints without an entry, or whose mapping is missing
# or not positive, are kept unchanged.
def apply_mapping(mapping) #:nodoc:
  mapped = self.class.u_unpack(@wrapped_string).map do |codepoint|
    entry = UCD.codepoints[codepoint]
    replacement = entry && entry.send(mapping)
    (replacement && replacement > 0) ? replacement : codepoint
  end
  chars(mapped.pack('U*'))
end
695
+
696
# Wraps _string_ in a new instance of the receiver's own class, so results
# keep the same proxy class as the object they came from.
def chars(string) #:nodoc:
  self.class.new(string)
end
699
+ end
700
+ end
701
+ end