activesupport 3.0.0.beta3 → 3.0.0.beta4

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of activesupport might be problematic. Click here for more details.

Files changed (63) hide show
  1. data/CHANGELOG +57 -0
  2. data/lib/active_support/builder.rb +6 -0
  3. data/lib/active_support/cache.rb +428 -70
  4. data/lib/active_support/cache/compressed_mem_cache_store.rb +6 -15
  5. data/lib/active_support/cache/file_store.rb +139 -41
  6. data/lib/active_support/cache/mem_cache_store.rb +115 -76
  7. data/lib/active_support/cache/memory_store.rb +127 -27
  8. data/lib/active_support/cache/strategy/local_cache.rb +109 -57
  9. data/lib/active_support/cache/synchronized_memory_store.rb +2 -38
  10. data/lib/active_support/callbacks.rb +27 -27
  11. data/lib/active_support/configurable.rb +19 -18
  12. data/lib/active_support/core_ext/array/conversions.rb +30 -26
  13. data/lib/active_support/core_ext/array/random_access.rb +19 -5
  14. data/lib/active_support/core_ext/benchmark.rb +0 -12
  15. data/lib/active_support/core_ext/class/attribute.rb +1 -4
  16. data/lib/active_support/core_ext/class/inheritable_attributes.rb +3 -0
  17. data/lib/active_support/core_ext/date/calculations.rb +27 -8
  18. data/lib/active_support/core_ext/date/conversions.rb +1 -0
  19. data/lib/active_support/core_ext/date_time/conversions.rb +9 -3
  20. data/lib/active_support/core_ext/file.rb +1 -0
  21. data/lib/active_support/core_ext/hash/conversions.rb +14 -137
  22. data/lib/active_support/core_ext/kernel/debugger.rb +1 -1
  23. data/lib/active_support/core_ext/kernel/reporting.rb +2 -1
  24. data/lib/active_support/core_ext/load_error.rb +1 -0
  25. data/lib/active_support/core_ext/logger.rb +1 -1
  26. data/lib/active_support/core_ext/module/attr_internal.rb +2 -2
  27. data/lib/active_support/core_ext/object/to_param.rb +2 -2
  28. data/lib/active_support/core_ext/object/with_options.rb +2 -0
  29. data/lib/active_support/core_ext/string.rb +1 -0
  30. data/lib/active_support/core_ext/string/conversions.rb +35 -1
  31. data/lib/active_support/core_ext/string/encoding.rb +11 -0
  32. data/lib/active_support/core_ext/string/filters.rb +29 -0
  33. data/lib/active_support/core_ext/string/inflections.rb +0 -11
  34. data/lib/active_support/core_ext/string/interpolation.rb +1 -0
  35. data/lib/active_support/core_ext/string/multibyte.rb +16 -19
  36. data/lib/active_support/core_ext/time/calculations.rb +7 -6
  37. data/lib/active_support/core_ext/uri.rb +8 -3
  38. data/lib/active_support/dependencies.rb +33 -1
  39. data/lib/active_support/duration.rb +1 -0
  40. data/lib/active_support/hash_with_indifferent_access.rb +5 -1
  41. data/lib/active_support/i18n.rb +7 -2
  42. data/lib/active_support/inflector/transliterate.rb +58 -38
  43. data/lib/active_support/json/encoding.rb +28 -5
  44. data/lib/active_support/lazy_load_hooks.rb +14 -4
  45. data/lib/active_support/locale/en.yml +4 -1
  46. data/lib/active_support/message_verifier.rb +4 -4
  47. data/lib/active_support/multibyte.rb +1 -19
  48. data/lib/active_support/multibyte/chars.rb +143 -427
  49. data/lib/active_support/multibyte/unicode.rb +393 -0
  50. data/lib/active_support/notifications/fanout.rb +15 -5
  51. data/lib/active_support/notifications/instrumenter.rb +10 -4
  52. data/lib/active_support/railtie.rb +36 -0
  53. data/lib/active_support/rescuable.rb +1 -0
  54. data/lib/active_support/ruby/shim.rb +1 -0
  55. data/lib/active_support/testing/declarative.rb +1 -1
  56. data/lib/active_support/testing/isolation.rb +2 -1
  57. data/lib/active_support/testing/setup_and_teardown.rb +3 -0
  58. data/lib/active_support/values/time_zone.rb +20 -30
  59. data/lib/active_support/values/unicode_tables.dat +0 -0
  60. data/lib/active_support/version.rb +1 -1
  61. data/lib/active_support/xml_mini.rb +126 -1
  62. metadata +8 -61
  63. data/lib/active_support/multibyte/unicode_database.rb +0 -71
@@ -3,45 +3,64 @@ require 'active_support/core_ext/string/multibyte'
3
3
 
4
4
  module ActiveSupport
5
5
  module Inflector
6
- extend self
7
6
 
8
- # UTF-8 byte => ASCII approximate UTF-8 byte(s)
9
- ASCII_APPROXIMATIONS = {
10
- 198 => [65, 69], # Æ => AE
11
- 208 => 68, # Ð => D
12
- 216 => 79, # Ø => O
13
- 222 => [84, 104], # Þ => Þ
14
- 223 => [115, 115], # ß => ss
15
- 230 => [97, 101], # æ => ae
16
- 240 => 100, # ð => d
17
- 248 => 111, # ø => o
18
- 254 => [116, 104], # þ => th
19
- 272 => 68, # Đ => D
20
- 273 => 100, # đ => đ
21
- 294 => 72, # Ħ => H
22
- 295 => 104, # ħ => h
23
- 305 => 105, # ı => i
24
- 306 => [73, 74], # Ĳ =>IJ
25
- 307 => [105, 106], # ĳ => ij
26
- 312 => 107, # ĸ => k
27
- 319 => 76, # Ŀ => L
28
- 320 => 108, # ŀ => l
29
- 321 => 76, # Ł => L
30
- 322 => 108, # ł => l
31
- 329 => 110, # ŉ => n
32
- 330 => [78, 71], # Ŋ => NG
33
- 331 => [110, 103], # ŋ => ng
34
- 338 => [79, 69], # Œ => OE
35
- 339 => [111, 101], # œ => oe
36
- 358 => 84, # Ŧ => T
37
- 359 => 116 # ŧ => t
38
- }
39
-
40
- # Replaces accented characters with an ASCII approximation, or deletes it if none exsits.
41
- def transliterate(string)
42
- ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
43
- ASCII_APPROXIMATIONS[char] || (char if char < 128)
44
- end.compact.flatten.pack("U*")
7
+ # Replaces non-ASCII characters with an ASCII approximation, or if none
8
+ # exists, a replacement character which defaults to "?".
9
+ #
10
+ # transliterate("Ærøskøbing")
11
+ # # => "AEroskobing"
12
+ #
13
+ # Default approximations are provided for Western/Latin characters,
14
+ # e.g, "ø", "ñ", "é", "ß", etc.
15
+ #
16
+ # This method is I18n aware, so you can set up custom approximations for a
17
+ # locale. This can be useful, for example, to transliterate German's "ü"
18
+ # and "ö" to "ue" and "oe", or to add support for transliterating Russian
19
+ # to ASCII.
20
+ #
21
+ # In order to make your custom transliterations available, you must set
22
+ # them as the <tt>i18n.transliterate.rule</tt> i18n key:
23
+ #
24
+ # # Store the transliterations in locales/de.yml
25
+ # i18n:
26
+ # transliterate:
27
+ # rule:
28
+ # ü: "ue"
29
+ # ö: "oe"
30
+ #
31
+ # # Or set them using Ruby
32
+ # I18n.backend.store_translations(:de, :i18n => {
33
+ # :transliterate => {
34
+ # :rule => {
35
+ # "ü" => "ue",
36
+ # "ö" => "oe"
37
+ # }
38
+ # }
39
+ # })
40
+ #
41
+ # The value for <tt>i18n.transliterate.rule</tt> can be a simple Hash that maps
42
+ # characters to ASCII approximations as shown above, or, for more complex
43
+ # requirements, a Proc:
44
+ #
45
+ # I18n.backend.store_translations(:de, :i18n => {
46
+ # :transliterate => {
47
+ # :rule => lambda {|string| MyTransliterator.transliterate(string)}
48
+ # }
49
+ # })
50
+ #
51
+ # Now you can have different transliterations for each locale:
52
+ #
53
+ # I18n.locale = :en
54
+ # transliterate("Jürgen")
55
+ # # => "Jurgen"
56
+ #
57
+ # I18n.locale = :de
58
+ # transliterate("Jürgen")
59
+ # # => "Juergen"
60
+ def transliterate(string, replacement = "?")
61
+ I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
62
+ ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
63
+ :replacement => replacement)
45
64
  end
46
65
 
47
66
  # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@@ -73,5 +92,6 @@ module ActiveSupport
73
92
  end
74
93
  parameterized_string.downcase
75
94
  end
95
+
76
96
  end
77
97
  end
@@ -1,5 +1,7 @@
1
1
  # encoding: utf-8
2
+ require 'bigdecimal'
2
3
  require 'active_support/core_ext/array/wrap'
4
+ require 'active_support/core_ext/big_decimal/conversions' # for #to_s
3
5
  require 'active_support/core_ext/hash/except'
4
6
  require 'active_support/core_ext/hash/slice'
5
7
  require 'active_support/core_ext/module/delegation'
@@ -102,7 +104,9 @@ module ActiveSupport
102
104
  end
103
105
 
104
106
  def escape(string)
105
- string = string.dup.force_encoding(::Encoding::BINARY) if string.respond_to?(:force_encoding)
107
+ if string.respond_to?(:force_encoding)
108
+ string = string.encode(::Encoding::UTF_8, :undef => :replace).force_encoding(::Encoding::BINARY)
109
+ end
106
110
  json = string.
107
111
  gsub(escape_regex) { |s| ESCAPED_CHARS[s] }.
108
112
  gsub(/([\xC0-\xDF][\x80-\xBF]|
@@ -110,7 +114,9 @@ module ActiveSupport
110
114
  [\xF0-\xF7][\x80-\xBF]{3})+/nx) { |s|
111
115
  s.unpack("U*").pack("n*").unpack("H*")[0].gsub(/.{4}/n, '\\\\u\&')
112
116
  }
113
- %("#{json}")
117
+ json = %("#{json}")
118
+ json.force_encoding(::Encoding::UTF_8) if json.respond_to?(:force_encoding)
119
+ json
114
120
  end
115
121
  end
116
122
 
@@ -128,7 +134,13 @@ class Object
128
134
  ActiveSupport::JSON.encode(self, options)
129
135
  end
130
136
 
131
- def as_json(options = nil) instance_values end #:nodoc:
137
+ def as_json(options = nil) #:nodoc:
138
+ if respond_to?(:to_hash)
139
+ to_hash
140
+ else
141
+ instance_values
142
+ end
143
+ end
132
144
  end
133
145
 
134
146
  # A string that returns itself as its JSON-encoded form.
@@ -166,9 +178,20 @@ class Numeric
166
178
  def encode_json(encoder) to_s end #:nodoc:
167
179
  end
168
180
 
181
+ class BigDecimal
182
+ # A BigDecimal would be naturally represented as a JSON number. Most libraries,
183
+ # however, parse non-integer JSON numbers directly as floats. Clients using
184
+ # those libraries would get in general a wrong number and no way to recover
185
+ # other than manually inspecting the string with the JSON code itself.
186
+ #
187
+ # That's why a JSON string is returned. The JSON literal is not numeric, but if
188
+ # the other end knows by contract that the data is supposed to be a BigDecimal,
189
+ # it still has the chance to post-process the string and get the real value.
190
+ def as_json(options = nil) to_s end #:nodoc:
191
+ end
192
+
169
193
  class Regexp
170
- def as_json(options = nil) self end #:nodoc:
171
- def encode_json(encoder) inspect end #:nodoc:
194
+ def as_json(options = nil) to_s end #:nodoc:
172
195
  end
173
196
 
174
197
  module Enumerable
@@ -2,16 +2,26 @@ module ActiveSupport
2
2
  @load_hooks = Hash.new {|h,k| h[k] = [] }
3
3
  @loaded = {}
4
4
 
5
- def self.on_load(name, &block)
5
+ def self.on_load(name, options = {}, &block)
6
6
  if base = @loaded[name]
7
- base.instance_eval(&block)
7
+ execute_hook(base, options, block)
8
8
  else
9
- @load_hooks[name] << block
9
+ @load_hooks[name] << [block, options]
10
+ end
11
+ end
12
+
13
+ def self.execute_hook(base, options, block)
14
+ if options[:yield]
15
+ block.call(base)
16
+ else
17
+ base.instance_eval(&block)
10
18
  end
11
19
  end
12
20
 
13
21
  def self.run_load_hooks(name, base = Object)
14
- @load_hooks[name].each { |hook| base.instance_eval(&hook) }
15
22
  @loaded[name] = base
23
+ @load_hooks[name].each do |hook, options|
24
+ execute_hook(base, options, hook)
25
+ end
16
26
  end
17
27
  end
@@ -15,7 +15,10 @@ en:
15
15
  month_names: [~, January, February, March, April, May, June, July, August, September, October, November, December]
16
16
  abbr_month_names: [~, Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec]
17
17
  # Used in date_select and datime_select.
18
- order: [ :year, :month, :day ]
18
+ order:
19
+ - :year
20
+ - :month
21
+ - :day
19
22
 
20
23
  time:
21
24
  formats:
@@ -47,11 +47,11 @@ module ActiveSupport
47
47
  def secure_compare(a, b)
48
48
  return false unless a.bytesize == b.bytesize
49
49
 
50
- l = a.unpack "C#{a.bytesize}"
50
+ l = a.unpack "C*"
51
51
 
52
- res = 0
53
- b.each_byte { |byte| res |= byte ^ l.shift }
54
- res == 0
52
+ res = true
53
+ b.each_byte { |byte| res = (byte == l.shift) && res }
54
+ res
55
55
  end
56
56
 
57
57
  def generate_digest(data)
@@ -1,30 +1,12 @@
1
1
  # encoding: utf-8
2
-
3
2
  require 'active_support/core_ext/module/attribute_accessors'
4
3
 
5
4
  module ActiveSupport #:nodoc:
6
5
  module Multibyte
7
6
  autoload :EncodingError, 'active_support/multibyte/exceptions'
8
7
  autoload :Chars, 'active_support/multibyte/chars'
9
- autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database'
10
- autoload :Codepoint, 'active_support/multibyte/unicode_database'
11
- autoload :UCD, 'active_support/multibyte/unicode_database'
8
+ autoload :Unicode, 'active_support/multibyte/unicode'
12
9
 
13
- # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
14
- # information about normalization.
15
- NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
16
-
17
- # The Unicode version that is supported by the implementation
18
- UNICODE_VERSION = '5.1.0'
19
-
20
- # The default normalization used for operations that require normalization. It can be set to any of the
21
- # normalizations in NORMALIZATION_FORMS.
22
- #
23
- # Example:
24
- # ActiveSupport::Multibyte.default_normalization_form = :c
25
- mattr_accessor :default_normalization_form
26
- self.default_normalization_form = :kc
27
-
28
10
  # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
29
11
  # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
30
12
  # an example how to do this.
@@ -34,59 +34,19 @@ module ActiveSupport #:nodoc:
34
34
  #
35
35
  # ActiveSupport::Multibyte.proxy_class = CharsForUTF32
36
36
  class Chars
37
- # Hangul character boundaries and properties
38
- HANGUL_SBASE = 0xAC00
39
- HANGUL_LBASE = 0x1100
40
- HANGUL_VBASE = 0x1161
41
- HANGUL_TBASE = 0x11A7
42
- HANGUL_LCOUNT = 19
43
- HANGUL_VCOUNT = 21
44
- HANGUL_TCOUNT = 28
45
- HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
46
- HANGUL_SCOUNT = 11172
47
- HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
48
- HANGUL_JAMO_FIRST = 0x1100
49
- HANGUL_JAMO_LAST = 0x11FF
50
-
51
- # All the unicode whitespace
52
- UNICODE_WHITESPACE = [
53
- (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
54
- 0x0020, # White_Space # Zs SPACE
55
- 0x0085, # White_Space # Cc <control-0085>
56
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
57
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
58
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
59
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
60
- 0x2028, # White_Space # Zl LINE SEPARATOR
61
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
62
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
63
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
64
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
65
- ].flatten.freeze
66
-
67
- # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
68
- # between little and big endian. This is not an issue in utf-8, so it must be ignored.
69
- UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
70
-
71
- # Returns a regular expression pattern that matches the passed Unicode codepoints
72
- def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
73
- array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
74
- end
75
- UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
76
- UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
77
-
78
- UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
79
37
 
80
38
  attr_reader :wrapped_string
81
39
  alias to_s wrapped_string
82
40
  alias to_str wrapped_string
83
41
 
84
- if '1.9'.respond_to?(:force_encoding)
42
+ if RUBY_VERSION >= "1.9"
85
43
  # Creates a new Chars instance by wrapping _string_.
86
44
  def initialize(string)
87
45
  @wrapped_string = string
88
46
  @wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen?
89
47
  end
48
+
49
+ undef <=>
90
50
  else
91
51
  def initialize(string) #:nodoc:
92
52
  @wrapped_string = string
@@ -115,12 +75,6 @@ module ActiveSupport #:nodoc:
115
75
  true
116
76
  end
117
77
 
118
- # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
119
- # +false+ otherwise.
120
- def self.wants?(string)
121
- $KCODE == 'UTF8' && consumes?(string)
122
- end
123
-
124
78
  # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
125
79
  def self.consumes?(string)
126
80
  # Unpack is a little bit faster than regular expressions.
@@ -132,89 +86,131 @@ module ActiveSupport #:nodoc:
132
86
 
133
87
  include Comparable
134
88
 
135
- # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
136
- # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
137
- # See <tt>String#<=></tt> for more details.
138
- #
139
- # Example:
140
- # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1
141
- def <=>(other)
142
- @wrapped_string <=> other.to_s
143
- end
89
+ if RUBY_VERSION < "1.9"
90
+ # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
91
+ # +false+ otherwise.
92
+ def self.wants?(string)
93
+ $KCODE == 'UTF8' && consumes?(string)
94
+ end
144
95
 
145
- # Returns a new Chars object containing the _other_ object concatenated to the string.
146
- #
147
- # Example:
148
- # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
149
- def +(other)
150
- self << other
151
- end
96
+ # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
97
+ # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
98
+ # See <tt>String#<=></tt> for more details.
99
+ #
100
+ # Example:
101
+ # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1
102
+ def <=>(other)
103
+ @wrapped_string <=> other.to_s
104
+ end
152
105
 
153
- # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
154
- #
155
- # Example:
156
- # 'Café périferôl'.mb_chars =~ /ô/ #=> 12
157
- def =~(other)
158
- translate_offset(@wrapped_string =~ other)
159
- end
106
+ # Returns a new Chars object containing the _other_ object concatenated to the string.
107
+ #
108
+ # Example:
109
+ # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
110
+ def +(other)
111
+ self << other
112
+ end
160
113
 
161
- # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
162
- # instances instead of String. This makes chaining methods easier.
163
- #
164
- # Example:
165
- # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
166
- def split(*args)
167
- @wrapped_string.split(*args).map { |i| i.mb_chars }
168
- end
114
+ # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
115
+ #
116
+ # Example:
117
+ # 'Café périferôl'.mb_chars =~ /ô/ #=> 12
118
+ def =~(other)
119
+ translate_offset(@wrapped_string =~ other)
120
+ end
169
121
 
170
- # Inserts the passed string at specified codepoint offsets.
171
- #
172
- # Example:
173
- # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
174
- def insert(offset, fragment)
175
- unpacked = self.class.u_unpack(@wrapped_string)
176
- unless offset > unpacked.length
177
- @wrapped_string.replace(
178
- self.class.u_unpack(@wrapped_string).insert(offset, *self.class.u_unpack(fragment)).pack('U*')
179
- )
180
- else
181
- raise IndexError, "index #{offset} out of string"
122
+ # Inserts the passed string at specified codepoint offsets.
123
+ #
124
+ # Example:
125
+ # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
126
+ def insert(offset, fragment)
127
+ unpacked = Unicode.u_unpack(@wrapped_string)
128
+ unless offset > unpacked.length
129
+ @wrapped_string.replace(
130
+ Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*')
131
+ )
132
+ else
133
+ raise IndexError, "index #{offset} out of string"
134
+ end
135
+ self
182
136
  end
183
- self
184
- end
185
137
 
186
- # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
187
- #
188
- # Example:
189
- # 'Café'.mb_chars.include?('é') #=> true
190
- def include?(other)
191
- # We have to redefine this method because Enumerable defines it.
192
- @wrapped_string.include?(other)
193
- end
138
+ # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
139
+ #
140
+ # Example:
141
+ # 'Café'.mb_chars.include?('é') #=> true
142
+ def include?(other)
143
+ # We have to redefine this method because Enumerable defines it.
144
+ @wrapped_string.include?(other)
145
+ end
194
146
 
195
- # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
196
- #
197
- # Example:
198
- # 'Café périferôl'.mb_chars.index('ô') #=> 12
199
- # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0
200
- def index(needle, offset=0)
201
- wrapped_offset = first(offset).wrapped_string.length
202
- index = @wrapped_string.index(needle, wrapped_offset)
203
- index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
147
+ # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
148
+ #
149
+ # Example:
150
+ # 'Café périferôl'.mb_chars.index('ô') #=> 12
151
+ # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0
152
+ def index(needle, offset=0)
153
+ wrapped_offset = first(offset).wrapped_string.length
154
+ index = @wrapped_string.index(needle, wrapped_offset)
155
+ index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
156
+ end
157
+
158
+ # Returns the position _needle_ in the string, counting in
159
+ # codepoints, searching backward from _offset_ or the end of the
160
+ # string. Returns +nil+ if _needle_ isn't found.
161
+ #
162
+ # Example:
163
+ # 'Café périferôl'.mb_chars.rindex('é') #=> 6
164
+ # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
165
+ def rindex(needle, offset=nil)
166
+ offset ||= length
167
+ wrapped_offset = first(offset).wrapped_string.length
168
+ index = @wrapped_string.rindex(needle, wrapped_offset)
169
+ index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
170
+ end
171
+
172
+ # Returns the number of codepoints in the string
173
+ def size
174
+ Unicode.u_unpack(@wrapped_string).size
175
+ end
176
+ alias_method :length, :size
177
+
178
+ # Strips entire range of Unicode whitespace from the right of the string.
179
+ def rstrip
180
+ chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, ''))
181
+ end
182
+
183
+ # Strips entire range of Unicode whitespace from the left of the string.
184
+ def lstrip
185
+ chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, ''))
186
+ end
187
+
188
+ # Strips entire range of Unicode whitespace from the right and left of the string.
189
+ def strip
190
+ rstrip.lstrip
191
+ end
192
+
193
+ # Returns the codepoint of the first character in the string.
194
+ #
195
+ # Example:
196
+ # 'こんにちは'.mb_chars.ord #=> 12371
197
+ def ord
198
+ Unicode.u_unpack(@wrapped_string)[0]
199
+ end
200
+
201
+ else
202
+ def =~(other)
203
+ @wrapped_string =~ other
204
+ end
204
205
  end
205
206
 
206
- # Returns the position _needle_ in the string, counting in
207
- # codepoints, searching backward from _offset_ or the end of the
208
- # string. Returns +nil+ if _needle_ isn't found.
207
+ # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
208
+ # instances instead of String. This makes chaining methods easier.
209
209
  #
210
210
  # Example:
211
- # 'Café périferôl'.mb_chars.rindex('é') #=> 6
212
- # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
213
- def rindex(needle, offset=nil)
214
- offset ||= length
215
- wrapped_offset = first(offset).wrapped_string.length
216
- index = @wrapped_string.rindex(needle, wrapped_offset)
217
- index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
211
+ # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
212
+ def split(*args)
213
+ @wrapped_string.split(*args).map { |i| i.mb_chars }
218
214
  end
219
215
 
220
216
  # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
@@ -236,7 +232,7 @@ module ActiveSupport #:nodoc:
236
232
  if args.first.is_a?(Regexp)
237
233
  @wrapped_string[*args] = replace_by
238
234
  else
239
- result = self.class.u_unpack(@wrapped_string)
235
+ result = Unicode.u_unpack(@wrapped_string)
240
236
  if args[0].is_a?(Fixnum)
241
237
  raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
242
238
  min = args[0]
@@ -249,10 +245,10 @@ module ActiveSupport #:nodoc:
249
245
  else
250
246
  needle = args[0].to_s
251
247
  min = index(needle)
252
- max = min + self.class.u_unpack(needle).length - 1
248
+ max = min + Unicode.u_unpack(needle).length - 1
253
249
  range = Range.new(min, max)
254
250
  end
255
- result[range] = self.class.u_unpack(replace_by)
251
+ result[range] = Unicode.u_unpack(replace_by)
256
252
  @wrapped_string.replace(result.pack('U*'))
257
253
  end
258
254
  end
@@ -296,33 +292,13 @@ module ActiveSupport #:nodoc:
296
292
  justify(integer, :center, padstr)
297
293
  end
298
294
 
299
- # Strips entire range of Unicode whitespace from the right of the string.
300
- def rstrip
301
- chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
302
- end
303
-
304
- # Strips entire range of Unicode whitespace from the left of the string.
305
- def lstrip
306
- chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
307
- end
308
-
309
- # Strips entire range of Unicode whitespace from the right and left of the string.
310
- def strip
311
- rstrip.lstrip
312
- end
313
-
314
- # Returns the number of codepoints in the string
315
- def size
316
- self.class.u_unpack(@wrapped_string).size
317
- end
318
- alias_method :length, :size
319
295
 
320
296
  # Reverses all characters in the string.
321
297
  #
322
298
  # Example:
323
299
  # 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
324
300
  def reverse
325
- chars(self.class.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
301
+ chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
326
302
  end
327
303
 
328
304
  # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
@@ -338,15 +314,15 @@ module ActiveSupport #:nodoc:
338
314
  elsif (args.size == 2 && !args[1].is_a?(Numeric))
339
315
  raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
340
316
  elsif args[0].kind_of? Range
341
- cps = self.class.u_unpack(@wrapped_string).slice(*args)
317
+ cps = Unicode.u_unpack(@wrapped_string).slice(*args)
342
318
  result = cps.nil? ? nil : cps.pack('U*')
343
319
  elsif args[0].kind_of? Regexp
344
320
  result = @wrapped_string.slice(*args)
345
321
  elsif args.size == 1 && args[0].kind_of?(Numeric)
346
- character = self.class.u_unpack(@wrapped_string)[args[0]]
322
+ character = Unicode.u_unpack(@wrapped_string)[args[0]]
347
323
  result = character.nil? ? nil : [character].pack('U')
348
324
  else
349
- result = self.class.u_unpack(@wrapped_string).slice(*args).pack('U*')
325
+ result = Unicode.u_unpack(@wrapped_string).slice(*args).pack('U*')
350
326
  end
351
327
  result.nil? ? nil : chars(result)
352
328
  end
@@ -374,20 +350,12 @@ module ActiveSupport #:nodoc:
374
350
  slice(0...translate_offset(limit))
375
351
  end
376
352
 
377
- # Returns the codepoint of the first character in the string.
378
- #
379
- # Example:
380
- # 'こんにちは'.mb_chars.ord #=> 12371
381
- def ord
382
- self.class.u_unpack(@wrapped_string)[0]
383
- end
384
-
385
353
  # Convert characters in the string to uppercase.
386
354
  #
387
355
  # Example:
388
356
  # 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s #=> "LAURENT, OÙ SONT LES TESTS ?"
389
357
  def upcase
390
- apply_mapping :uppercase_mapping
358
+ chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping)
391
359
  end
392
360
 
393
361
  # Convert characters in the string to lowercase.
@@ -395,7 +363,7 @@ module ActiveSupport #:nodoc:
395
363
  # Example:
396
364
  # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
397
365
  def downcase
398
- apply_mapping :lowercase_mapping
366
+ chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping)
399
367
  end
400
368
 
401
369
  # Converts the first character to uppercase and the remainder to lowercase.
@@ -409,25 +377,11 @@ module ActiveSupport #:nodoc:
409
377
  # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
410
378
  # passing strings to databases and validations.
411
379
  #
412
- # * <tt>str</tt> - The string to perform normalization on.
413
380
  # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
414
381
  # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
415
- # ActiveSupport::Multibyte.default_normalization_form
416
- def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
417
- # See http://www.unicode.org/reports/tr15, Table 1
418
- codepoints = self.class.u_unpack(@wrapped_string)
419
- chars(case form
420
- when :d
421
- self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))
422
- when :c
423
- self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)))
424
- when :kd
425
- self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))
426
- when :kc
427
- self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)))
428
- else
429
- raise ArgumentError, "#{form} is not a valid normalization variant", caller
430
- end.pack('U*'))
382
+ # ActiveSupport::Multibyte::Unicode.default_normalization_form
383
+ def normalize(form = nil)
384
+ chars(Unicode.normalize(@wrapped_string, form))
431
385
  end
432
386
 
433
387
  # Performs canonical decomposition on all the characters.
@@ -436,7 +390,7 @@ module ActiveSupport #:nodoc:
436
390
  # 'é'.length #=> 2
437
391
  # 'é'.mb_chars.decompose.to_s.length #=> 3
438
392
  def decompose
439
- chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*'))
393
+ chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*'))
440
394
  end
441
395
 
442
396
  # Performs composition on all the characters.
@@ -445,7 +399,7 @@ module ActiveSupport #:nodoc:
445
399
  # 'é'.length #=> 3
446
400
  # 'é'.mb_chars.compose.to_s.length #=> 2
447
401
  def compose
448
- chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*'))
402
+ chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*'))
449
403
  end
450
404
 
451
405
  # Returns the number of grapheme clusters in the string.
@@ -454,14 +408,14 @@ module ActiveSupport #:nodoc:
454
408
  # 'क्षि'.mb_chars.length #=> 4
455
409
  # 'क्षि'.mb_chars.g_length #=> 3
456
410
  def g_length
457
- self.class.g_unpack(@wrapped_string).length
411
+ Unicode.g_unpack(@wrapped_string).length
458
412
  end
459
413
 
460
414
  # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
461
415
  #
462
416
  # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
463
417
  def tidy_bytes(force = false)
464
- chars(self.class.tidy_bytes(@wrapped_string, force))
418
+ chars(Unicode.tidy_bytes(@wrapped_string, force))
465
419
  end
466
420
 
467
421
  %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
@@ -475,241 +429,6 @@ module ActiveSupport #:nodoc:
475
429
  end
476
430
  end
477
431
 
478
- class << self
479
-
480
- # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
481
- # valid UTF-8.
482
- #
483
- # Example:
484
- # Chars.u_unpack('Café') #=> [67, 97, 102, 233]
485
- def u_unpack(string)
486
- begin
487
- string.unpack 'U*'
488
- rescue ArgumentError
489
- raise EncodingError, 'malformed UTF-8 character'
490
- end
491
- end
492
-
493
- # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
494
- # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
495
- # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
496
- #
497
- # Primarily used by the grapheme cluster support.
498
- def in_char_class?(codepoint, classes)
499
- classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
500
- end
501
-
502
- # Unpack the string at grapheme boundaries. Returns a list of character lists.
503
- #
504
- # Example:
505
- # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
506
- # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
507
- def g_unpack(string)
508
- codepoints = u_unpack(string)
509
- unpacked = []
510
- pos = 0
511
- marker = 0
512
- eoc = codepoints.length
513
- while(pos < eoc)
514
- pos += 1
515
- previous = codepoints[pos-1]
516
- current = codepoints[pos]
517
- if (
518
- # CR X LF
519
- one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
520
- # L X (L|V|LV|LVT)
521
- two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
522
- # (LV|V) X (V|T)
523
- three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
524
- # (LVT|T) X (T)
525
- four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
526
- # X Extend
527
- five = (UCD.boundary[:extend] === current)
528
- )
529
- else
530
- unpacked << codepoints[marker..pos-1]
531
- marker = pos
532
- end
533
- end
534
- unpacked
535
- end
536
-
537
- # Reverse operation of g_unpack.
538
- #
539
- # Example:
540
- # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
541
- def g_pack(unpacked)
542
- (unpacked.flatten).pack('U*')
543
- end
544
-
545
- def padding(padsize, padstr=' ') #:nodoc:
546
- if padsize != 0
547
- new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize)
548
- else
549
- ''
550
- end
551
- end
552
-
553
- # Re-order codepoints so the string becomes canonical.
554
- def reorder_characters(codepoints)
555
- length = codepoints.length- 1
556
- pos = 0
557
- while pos < length do
558
- cp1, cp2 = UCD.codepoints[codepoints[pos]], UCD.codepoints[codepoints[pos+1]]
559
- if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
560
- codepoints[pos..pos+1] = cp2.code, cp1.code
561
- pos += (pos > 0 ? -1 : 1)
562
- else
563
- pos += 1
564
- end
565
- end
566
- codepoints
567
- end
568
-
569
- # Decompose composed characters to the decomposed form.
570
- def decompose_codepoints(type, codepoints)
571
- codepoints.inject([]) do |decomposed, cp|
572
- # if it's a hangul syllable starter character
573
- if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
574
- sindex = cp - HANGUL_SBASE
575
- ncp = [] # new codepoints
576
- ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
577
- ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
578
- tindex = sindex % HANGUL_TCOUNT
579
- ncp << (HANGUL_TBASE + tindex) unless tindex == 0
580
- decomposed.concat ncp
581
- # if the codepoint is decomposable in with the current decomposition type
582
- elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability)
583
- decomposed.concat decompose_codepoints(type, ncp.dup)
584
- else
585
- decomposed << cp
586
- end
587
- end
588
- end
589
-
590
- # Compose decomposed characters to the composed form.
591
- def compose_codepoints(codepoints)
592
- pos = 0
593
- eoa = codepoints.length - 1
594
- starter_pos = 0
595
- starter_char = codepoints[0]
596
- previous_combining_class = -1
597
- while pos < eoa
598
- pos += 1
599
- lindex = starter_char - HANGUL_LBASE
600
- # -- Hangul
601
- if 0 <= lindex and lindex < HANGUL_LCOUNT
602
- vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
603
- if 0 <= vindex and vindex < HANGUL_VCOUNT
604
- tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
605
- if 0 <= tindex and tindex < HANGUL_TCOUNT
606
- j = starter_pos + 2
607
- eoa -= 2
608
- else
609
- tindex = 0
610
- j = starter_pos + 1
611
- eoa -= 1
612
- end
613
- codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
614
- end
615
- starter_pos += 1
616
- starter_char = codepoints[starter_pos]
617
- # -- Other characters
618
- else
619
- current_char = codepoints[pos]
620
- current = UCD.codepoints[current_char]
621
- if current.combining_class > previous_combining_class
622
- if ref = UCD.composition_map[starter_char]
623
- composition = ref[current_char]
624
- else
625
- composition = nil
626
- end
627
- unless composition.nil?
628
- codepoints[starter_pos] = composition
629
- starter_char = composition
630
- codepoints.delete_at pos
631
- eoa -= 1
632
- pos -= 1
633
- previous_combining_class = -1
634
- else
635
- previous_combining_class = current.combining_class
636
- end
637
- else
638
- previous_combining_class = current.combining_class
639
- end
640
- if current.combining_class == 0
641
- starter_pos = pos
642
- starter_char = codepoints[pos]
643
- end
644
- end
645
- end
646
- codepoints
647
- end
648
-
649
- def tidy_byte(byte)
650
- if byte < 160
651
- [UCD.cp1252[byte] || byte].pack("U").unpack("C*")
652
- elsif byte < 192
653
- [194, byte]
654
- else
655
- [195, byte - 64]
656
- end
657
- end
658
- private :tidy_byte
659
-
660
- # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
661
- #
662
- # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
663
- def tidy_bytes(string, force = false)
664
- if force
665
- return string.unpack("C*").map do |b|
666
- tidy_byte(b)
667
- end.flatten.compact.pack("C*").unpack("U*").pack("U*")
668
- end
669
-
670
- bytes = string.unpack("C*")
671
- conts_expected = 0
672
- last_lead = 0
673
-
674
- bytes.each_index do |i|
675
-
676
- byte = bytes[i]
677
- is_ascii = byte < 128
678
- is_cont = byte > 127 && byte < 192
679
- is_lead = byte > 191 && byte < 245
680
- is_unused = byte > 240
681
- is_restricted = byte > 244
682
-
683
- # Impossible or highly unlikely byte? Clean it.
684
- if is_unused || is_restricted
685
- bytes[i] = tidy_byte(byte)
686
- elsif is_cont
687
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
688
- conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
689
- else
690
- if conts_expected > 0
691
- # Expected continuation, but got ASCII or leading? Clean backwards up to
692
- # the leading byte.
693
- (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
694
- conts_expected = 0
695
- end
696
- if is_lead
697
- # Final byte is leading? Clean it.
698
- if i == bytes.length - 1
699
- bytes[i] = tidy_byte(bytes.last)
700
- else
701
- # Valid leading byte? Expect continuations determined by position of
702
- # first zero bit, with max of 3.
703
- conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
704
- last_lead = i
705
- end
706
- end
707
- end
708
- end
709
- bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
710
- end
711
- end
712
-
713
432
  protected
714
433
 
715
434
  def translate_offset(byte_offset) #:nodoc:
@@ -734,26 +453,23 @@ module ActiveSupport #:nodoc:
734
453
  padsize = padsize > 0 ? padsize : 0
735
454
  case way
736
455
  when :right
737
- result = @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr))
456
+ result = @wrapped_string.dup.insert(0, padding(padsize, padstr))
738
457
  when :left
739
- result = @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr))
458
+ result = @wrapped_string.dup.insert(-1, padding(padsize, padstr))
740
459
  when :center
741
- lpad = self.class.padding((padsize / 2.0).floor, padstr)
742
- rpad = self.class.padding((padsize / 2.0).ceil, padstr)
460
+ lpad = padding((padsize / 2.0).floor, padstr)
461
+ rpad = padding((padsize / 2.0).ceil, padstr)
743
462
  result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
744
463
  end
745
464
  chars(result)
746
465
  end
747
466
 
748
- def apply_mapping(mapping) #:nodoc:
749
- chars(self.class.u_unpack(@wrapped_string).map do |codepoint|
750
- cp = UCD.codepoints[codepoint]
751
- if cp and (ncp = cp.send(mapping)) and ncp > 0
752
- ncp
753
- else
754
- codepoint
755
- end
756
- end.pack('U*'))
467
+ def padding(padsize, padstr=' ') #:nodoc:
468
+ if padsize != 0
469
+ chars(padstr * ((padsize / Unicode.u_unpack(padstr).size) + 1)).slice(0, padsize)
470
+ else
471
+ ''
472
+ end
757
473
  end
758
474
 
759
475
  def chars(string) #:nodoc: