activesupport 3.0.0.beta3 → 3.0.0.beta4
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of activesupport might be problematic. Click here for more details.
- data/CHANGELOG +57 -0
- data/lib/active_support/builder.rb +6 -0
- data/lib/active_support/cache.rb +428 -70
- data/lib/active_support/cache/compressed_mem_cache_store.rb +6 -15
- data/lib/active_support/cache/file_store.rb +139 -41
- data/lib/active_support/cache/mem_cache_store.rb +115 -76
- data/lib/active_support/cache/memory_store.rb +127 -27
- data/lib/active_support/cache/strategy/local_cache.rb +109 -57
- data/lib/active_support/cache/synchronized_memory_store.rb +2 -38
- data/lib/active_support/callbacks.rb +27 -27
- data/lib/active_support/configurable.rb +19 -18
- data/lib/active_support/core_ext/array/conversions.rb +30 -26
- data/lib/active_support/core_ext/array/random_access.rb +19 -5
- data/lib/active_support/core_ext/benchmark.rb +0 -12
- data/lib/active_support/core_ext/class/attribute.rb +1 -4
- data/lib/active_support/core_ext/class/inheritable_attributes.rb +3 -0
- data/lib/active_support/core_ext/date/calculations.rb +27 -8
- data/lib/active_support/core_ext/date/conversions.rb +1 -0
- data/lib/active_support/core_ext/date_time/conversions.rb +9 -3
- data/lib/active_support/core_ext/file.rb +1 -0
- data/lib/active_support/core_ext/hash/conversions.rb +14 -137
- data/lib/active_support/core_ext/kernel/debugger.rb +1 -1
- data/lib/active_support/core_ext/kernel/reporting.rb +2 -1
- data/lib/active_support/core_ext/load_error.rb +1 -0
- data/lib/active_support/core_ext/logger.rb +1 -1
- data/lib/active_support/core_ext/module/attr_internal.rb +2 -2
- data/lib/active_support/core_ext/object/to_param.rb +2 -2
- data/lib/active_support/core_ext/object/with_options.rb +2 -0
- data/lib/active_support/core_ext/string.rb +1 -0
- data/lib/active_support/core_ext/string/conversions.rb +35 -1
- data/lib/active_support/core_ext/string/encoding.rb +11 -0
- data/lib/active_support/core_ext/string/filters.rb +29 -0
- data/lib/active_support/core_ext/string/inflections.rb +0 -11
- data/lib/active_support/core_ext/string/interpolation.rb +1 -0
- data/lib/active_support/core_ext/string/multibyte.rb +16 -19
- data/lib/active_support/core_ext/time/calculations.rb +7 -6
- data/lib/active_support/core_ext/uri.rb +8 -3
- data/lib/active_support/dependencies.rb +33 -1
- data/lib/active_support/duration.rb +1 -0
- data/lib/active_support/hash_with_indifferent_access.rb +5 -1
- data/lib/active_support/i18n.rb +7 -2
- data/lib/active_support/inflector/transliterate.rb +58 -38
- data/lib/active_support/json/encoding.rb +28 -5
- data/lib/active_support/lazy_load_hooks.rb +14 -4
- data/lib/active_support/locale/en.yml +4 -1
- data/lib/active_support/message_verifier.rb +4 -4
- data/lib/active_support/multibyte.rb +1 -19
- data/lib/active_support/multibyte/chars.rb +143 -427
- data/lib/active_support/multibyte/unicode.rb +393 -0
- data/lib/active_support/notifications/fanout.rb +15 -5
- data/lib/active_support/notifications/instrumenter.rb +10 -4
- data/lib/active_support/railtie.rb +36 -0
- data/lib/active_support/rescuable.rb +1 -0
- data/lib/active_support/ruby/shim.rb +1 -0
- data/lib/active_support/testing/declarative.rb +1 -1
- data/lib/active_support/testing/isolation.rb +2 -1
- data/lib/active_support/testing/setup_and_teardown.rb +3 -0
- data/lib/active_support/values/time_zone.rb +20 -30
- data/lib/active_support/values/unicode_tables.dat +0 -0
- data/lib/active_support/version.rb +1 -1
- data/lib/active_support/xml_mini.rb +126 -1
- metadata +8 -61
- data/lib/active_support/multibyte/unicode_database.rb +0 -71
@@ -3,45 +3,64 @@ require 'active_support/core_ext/string/multibyte'
|
|
3
3
|
|
4
4
|
module ActiveSupport
|
5
5
|
module Inflector
|
6
|
-
extend self
|
7
6
|
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
}
|
39
|
-
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
7
|
+
# Replaces non-ASCII characters with an ASCII approximation, or if none
|
8
|
+
# exists, a replacement character which defaults to "?".
|
9
|
+
#
|
10
|
+
# transliterate("Ærøskøbing")
|
11
|
+
# # => "AEroskobing"
|
12
|
+
#
|
13
|
+
# Default approximations are provided for Western/Latin characters,
|
14
|
+
# e.g., "ø", "ñ", "é", "ß", etc.
|
15
|
+
#
|
16
|
+
# This method is I18n aware, so you can set up custom approximations for a
|
17
|
+
# locale. This can be useful, for example, to transliterate German's "ü"
|
18
|
+
# and "ö" to "ue" and "oe", or to add support for transliterating Russian
|
19
|
+
# to ASCII.
|
20
|
+
#
|
21
|
+
# In order to make your custom transliterations available, you must set
|
22
|
+
# them as the <tt>i18n.transliterate.rule</tt> i18n key:
|
23
|
+
#
|
24
|
+
# # Store the transliterations in locales/de.yml
|
25
|
+
# i18n:
|
26
|
+
# transliterate:
|
27
|
+
# rule:
|
28
|
+
# ü: "ue"
|
29
|
+
# ö: "oe"
|
30
|
+
#
|
31
|
+
# # Or set them using Ruby
|
32
|
+
# I18n.backend.store_translations(:de, :i18n => {
|
33
|
+
# :transliterate => {
|
34
|
+
# :rule => {
|
35
|
+
# "ü" => "ue",
|
36
|
+
# "ö" => "oe"
|
37
|
+
# }
|
38
|
+
# }
|
39
|
+
# })
|
40
|
+
#
|
41
|
+
# The value for <tt>i18n.transliterate.rule</tt> can be a simple Hash that maps
|
42
|
+
# characters to ASCII approximations as shown above, or, for more complex
|
43
|
+
# requirements, a Proc:
|
44
|
+
#
|
45
|
+
# I18n.backend.store_translations(:de, :i18n => {
|
46
|
+
# :transliterate => {
|
47
|
+
# :rule => lambda {|string| MyTransliterator.transliterate(string)}
|
48
|
+
# }
|
49
|
+
# })
|
50
|
+
#
|
51
|
+
# Now you can have different transliterations for each locale:
|
52
|
+
#
|
53
|
+
# I18n.locale = :en
|
54
|
+
# transliterate("Jürgen")
|
55
|
+
# # => "Jurgen"
|
56
|
+
#
|
57
|
+
# I18n.locale = :de
|
58
|
+
# transliterate("Jürgen")
|
59
|
+
# # => "Juergen"
|
60
|
+
def transliterate(string, replacement = "?")
|
61
|
+
I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
|
62
|
+
ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
|
63
|
+
:replacement => replacement)
|
45
64
|
end
|
46
65
|
|
47
66
|
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
|
@@ -73,5 +92,6 @@ module ActiveSupport
|
|
73
92
|
end
|
74
93
|
parameterized_string.downcase
|
75
94
|
end
|
95
|
+
|
76
96
|
end
|
77
97
|
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require 'bigdecimal'
|
2
3
|
require 'active_support/core_ext/array/wrap'
|
4
|
+
require 'active_support/core_ext/big_decimal/conversions' # for #to_s
|
3
5
|
require 'active_support/core_ext/hash/except'
|
4
6
|
require 'active_support/core_ext/hash/slice'
|
5
7
|
require 'active_support/core_ext/module/delegation'
|
@@ -102,7 +104,9 @@ module ActiveSupport
|
|
102
104
|
end
|
103
105
|
|
104
106
|
def escape(string)
|
105
|
-
|
107
|
+
if string.respond_to?(:force_encoding)
|
108
|
+
string = string.encode(::Encoding::UTF_8, :undef => :replace).force_encoding(::Encoding::BINARY)
|
109
|
+
end
|
106
110
|
json = string.
|
107
111
|
gsub(escape_regex) { |s| ESCAPED_CHARS[s] }.
|
108
112
|
gsub(/([\xC0-\xDF][\x80-\xBF]|
|
@@ -110,7 +114,9 @@ module ActiveSupport
|
|
110
114
|
[\xF0-\xF7][\x80-\xBF]{3})+/nx) { |s|
|
111
115
|
s.unpack("U*").pack("n*").unpack("H*")[0].gsub(/.{4}/n, '\\\\u\&')
|
112
116
|
}
|
113
|
-
%("#{json}")
|
117
|
+
json = %("#{json}")
|
118
|
+
json.force_encoding(::Encoding::UTF_8) if json.respond_to?(:force_encoding)
|
119
|
+
json
|
114
120
|
end
|
115
121
|
end
|
116
122
|
|
@@ -128,7 +134,13 @@ class Object
|
|
128
134
|
ActiveSupport::JSON.encode(self, options)
|
129
135
|
end
|
130
136
|
|
131
|
-
def as_json(options = nil)
|
137
|
+
def as_json(options = nil) #:nodoc:
|
138
|
+
if respond_to?(:to_hash)
|
139
|
+
to_hash
|
140
|
+
else
|
141
|
+
instance_values
|
142
|
+
end
|
143
|
+
end
|
132
144
|
end
|
133
145
|
|
134
146
|
# A string that returns itself as its JSON-encoded form.
|
@@ -166,9 +178,20 @@ class Numeric
|
|
166
178
|
def encode_json(encoder) to_s end #:nodoc:
|
167
179
|
end
|
168
180
|
|
181
|
+
class BigDecimal
|
182
|
+
# A BigDecimal would be naturally represented as a JSON number. Most libraries,
|
183
|
+
# however, parse non-integer JSON numbers directly as floats. Clients using
|
184
|
+
# those libraries would get in general a wrong number and no way to recover
|
185
|
+
# other than manually inspecting the string with the JSON code itself.
|
186
|
+
#
|
187
|
+
# That's why a JSON string is returned. The JSON literal is not numeric, but if
|
188
|
+
# the other end knows by contract that the data is supposed to be a BigDecimal,
|
189
|
+
# it still has the chance to post-process the string and get the real value.
|
190
|
+
def as_json(options = nil) to_s end #:nodoc:
|
191
|
+
end
|
192
|
+
|
169
193
|
class Regexp
|
170
|
-
def as_json(options = nil)
|
171
|
-
def encode_json(encoder) inspect end #:nodoc:
|
194
|
+
def as_json(options = nil) to_s end #:nodoc:
|
172
195
|
end
|
173
196
|
|
174
197
|
module Enumerable
|
@@ -2,16 +2,26 @@ module ActiveSupport
|
|
2
2
|
@load_hooks = Hash.new {|h,k| h[k] = [] }
|
3
3
|
@loaded = {}
|
4
4
|
|
5
|
-
def self.on_load(name, &block)
|
5
|
+
def self.on_load(name, options = {}, &block)
|
6
6
|
if base = @loaded[name]
|
7
|
-
base
|
7
|
+
execute_hook(base, options, block)
|
8
8
|
else
|
9
|
-
@load_hooks[name] << block
|
9
|
+
@load_hooks[name] << [block, options]
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.execute_hook(base, options, block)
|
14
|
+
if options[:yield]
|
15
|
+
block.call(base)
|
16
|
+
else
|
17
|
+
base.instance_eval(&block)
|
10
18
|
end
|
11
19
|
end
|
12
20
|
|
13
21
|
def self.run_load_hooks(name, base = Object)
|
14
|
-
@load_hooks[name].each { |hook| base.instance_eval(&hook) }
|
15
22
|
@loaded[name] = base
|
23
|
+
@load_hooks[name].each do |hook, options|
|
24
|
+
execute_hook(base, options, hook)
|
25
|
+
end
|
16
26
|
end
|
17
27
|
end
|
@@ -15,7 +15,10 @@ en:
|
|
15
15
|
month_names: [~, January, February, March, April, May, June, July, August, September, October, November, December]
|
16
16
|
abbr_month_names: [~, Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec]
|
17
17
|
# Used in date_select and datetime_select.
|
18
|
-
order:
|
18
|
+
order:
|
19
|
+
- :year
|
20
|
+
- :month
|
21
|
+
- :day
|
19
22
|
|
20
23
|
time:
|
21
24
|
formats:
|
@@ -47,11 +47,11 @@ module ActiveSupport
|
|
47
47
|
def secure_compare(a, b)
|
48
48
|
return false unless a.bytesize == b.bytesize
|
49
49
|
|
50
|
-
l = a.unpack "C
|
50
|
+
l = a.unpack "C*"
|
51
51
|
|
52
|
-
res =
|
53
|
-
b.each_byte { |byte| res
|
54
|
-
res
|
52
|
+
res = true
|
53
|
+
b.each_byte { |byte| res = (byte == l.shift) && res }
|
54
|
+
res
|
55
55
|
end
|
56
56
|
|
57
57
|
def generate_digest(data)
|
@@ -1,30 +1,12 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
2
|
require 'active_support/core_ext/module/attribute_accessors'
|
4
3
|
|
5
4
|
module ActiveSupport #:nodoc:
|
6
5
|
module Multibyte
|
7
6
|
autoload :EncodingError, 'active_support/multibyte/exceptions'
|
8
7
|
autoload :Chars, 'active_support/multibyte/chars'
|
9
|
-
autoload :
|
10
|
-
autoload :Codepoint, 'active_support/multibyte/unicode_database'
|
11
|
-
autoload :UCD, 'active_support/multibyte/unicode_database'
|
8
|
+
autoload :Unicode, 'active_support/multibyte/unicode'
|
12
9
|
|
13
|
-
# A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
|
14
|
-
# information about normalization.
|
15
|
-
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
|
16
|
-
|
17
|
-
# The Unicode version that is supported by the implementation
|
18
|
-
UNICODE_VERSION = '5.1.0'
|
19
|
-
|
20
|
-
# The default normalization used for operations that require normalization. It can be set to any of the
|
21
|
-
# normalizations in NORMALIZATION_FORMS.
|
22
|
-
#
|
23
|
-
# Example:
|
24
|
-
# ActiveSupport::Multibyte.default_normalization_form = :c
|
25
|
-
mattr_accessor :default_normalization_form
|
26
|
-
self.default_normalization_form = :kc
|
27
|
-
|
28
10
|
# The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
|
29
11
|
# class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
|
30
12
|
# an example of how to do this.
|
@@ -34,59 +34,19 @@ module ActiveSupport #:nodoc:
|
|
34
34
|
#
|
35
35
|
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
|
36
36
|
class Chars
|
37
|
-
# Hangul character boundaries and properties
|
38
|
-
HANGUL_SBASE = 0xAC00
|
39
|
-
HANGUL_LBASE = 0x1100
|
40
|
-
HANGUL_VBASE = 0x1161
|
41
|
-
HANGUL_TBASE = 0x11A7
|
42
|
-
HANGUL_LCOUNT = 19
|
43
|
-
HANGUL_VCOUNT = 21
|
44
|
-
HANGUL_TCOUNT = 28
|
45
|
-
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
|
46
|
-
HANGUL_SCOUNT = 11172
|
47
|
-
HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
|
48
|
-
HANGUL_JAMO_FIRST = 0x1100
|
49
|
-
HANGUL_JAMO_LAST = 0x11FF
|
50
|
-
|
51
|
-
# All the unicode whitespace
|
52
|
-
UNICODE_WHITESPACE = [
|
53
|
-
(0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
|
54
|
-
0x0020, # White_Space # Zs SPACE
|
55
|
-
0x0085, # White_Space # Cc <control-0085>
|
56
|
-
0x00A0, # White_Space # Zs NO-BREAK SPACE
|
57
|
-
0x1680, # White_Space # Zs OGHAM SPACE MARK
|
58
|
-
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
59
|
-
(0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
60
|
-
0x2028, # White_Space # Zl LINE SEPARATOR
|
61
|
-
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
|
62
|
-
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
|
63
|
-
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
64
|
-
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
|
65
|
-
].flatten.freeze
|
66
|
-
|
67
|
-
# BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
|
68
|
-
# between little and big endian. This is not an issue in utf-8, so it must be ignored.
|
69
|
-
UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
|
70
|
-
|
71
|
-
# Returns a regular expression pattern that matches the passed Unicode codepoints
|
72
|
-
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
|
73
|
-
array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
|
74
|
-
end
|
75
|
-
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
|
76
|
-
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
|
77
|
-
|
78
|
-
UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
|
79
37
|
|
80
38
|
attr_reader :wrapped_string
|
81
39
|
alias to_s wrapped_string
|
82
40
|
alias to_str wrapped_string
|
83
41
|
|
84
|
-
if
|
42
|
+
if RUBY_VERSION >= "1.9"
|
85
43
|
# Creates a new Chars instance by wrapping _string_.
|
86
44
|
def initialize(string)
|
87
45
|
@wrapped_string = string
|
88
46
|
@wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen?
|
89
47
|
end
|
48
|
+
|
49
|
+
undef <=>
|
90
50
|
else
|
91
51
|
def initialize(string) #:nodoc:
|
92
52
|
@wrapped_string = string
|
@@ -115,12 +75,6 @@ module ActiveSupport #:nodoc:
|
|
115
75
|
true
|
116
76
|
end
|
117
77
|
|
118
|
-
# Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
|
119
|
-
# +false+ otherwise.
|
120
|
-
def self.wants?(string)
|
121
|
-
$KCODE == 'UTF8' && consumes?(string)
|
122
|
-
end
|
123
|
-
|
124
78
|
# Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
|
125
79
|
def self.consumes?(string)
|
126
80
|
# Unpack is a little bit faster than regular expressions.
|
@@ -132,89 +86,131 @@ module ActiveSupport #:nodoc:
|
|
132
86
|
|
133
87
|
include Comparable
|
134
88
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
def <=>(other)
|
142
|
-
@wrapped_string <=> other.to_s
|
143
|
-
end
|
89
|
+
if RUBY_VERSION < "1.9"
|
90
|
+
# Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
|
91
|
+
# +false+ otherwise.
|
92
|
+
def self.wants?(string)
|
93
|
+
$KCODE == 'UTF8' && consumes?(string)
|
94
|
+
end
|
144
95
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
96
|
+
# Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
|
97
|
+
# equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
|
98
|
+
# See <tt>String#<=></tt> for more details.
|
99
|
+
#
|
100
|
+
# Example:
|
101
|
+
# 'é'.mb_chars <=> 'ü'.mb_chars #=> -1
|
102
|
+
def <=>(other)
|
103
|
+
@wrapped_string <=> other.to_s
|
104
|
+
end
|
152
105
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
106
|
+
# Returns a new Chars object containing the _other_ object concatenated to the string.
|
107
|
+
#
|
108
|
+
# Example:
|
109
|
+
# ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
|
110
|
+
def +(other)
|
111
|
+
self << other
|
112
|
+
end
|
160
113
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
end
|
114
|
+
# Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
|
115
|
+
#
|
116
|
+
# Example:
|
117
|
+
# 'Café périferôl'.mb_chars =~ /ô/ #=> 12
|
118
|
+
def =~(other)
|
119
|
+
translate_offset(@wrapped_string =~ other)
|
120
|
+
end
|
169
121
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
122
|
+
# Inserts the passed string at specified codepoint offsets.
|
123
|
+
#
|
124
|
+
# Example:
|
125
|
+
# 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
|
126
|
+
def insert(offset, fragment)
|
127
|
+
unpacked = Unicode.u_unpack(@wrapped_string)
|
128
|
+
unless offset > unpacked.length
|
129
|
+
@wrapped_string.replace(
|
130
|
+
Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*')
|
131
|
+
)
|
132
|
+
else
|
133
|
+
raise IndexError, "index #{offset} out of string"
|
134
|
+
end
|
135
|
+
self
|
182
136
|
end
|
183
|
-
self
|
184
|
-
end
|
185
137
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
138
|
+
# Returns +true+ if the wrapped string contains _other_. Returns +false+ otherwise.
|
139
|
+
#
|
140
|
+
# Example:
|
141
|
+
# 'Café'.mb_chars.include?('é') #=> true
|
142
|
+
def include?(other)
|
143
|
+
# We have to redefine this method because Enumerable defines it.
|
144
|
+
@wrapped_string.include?(other)
|
145
|
+
end
|
194
146
|
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
147
|
+
# Returns the position of _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
|
148
|
+
#
|
149
|
+
# Example:
|
150
|
+
# 'Café périferôl'.mb_chars.index('ô') #=> 12
|
151
|
+
# 'Café périferôl'.mb_chars.index(/\w/u) #=> 0
|
152
|
+
def index(needle, offset=0)
|
153
|
+
wrapped_offset = first(offset).wrapped_string.length
|
154
|
+
index = @wrapped_string.index(needle, wrapped_offset)
|
155
|
+
index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
|
156
|
+
end
|
157
|
+
|
158
|
+
# Returns the position of _needle_ in the string, counting in
|
159
|
+
# codepoints, searching backward from _offset_ or the end of the
|
160
|
+
# string. Returns +nil+ if _needle_ isn't found.
|
161
|
+
#
|
162
|
+
# Example:
|
163
|
+
# 'Café périferôl'.mb_chars.rindex('é') #=> 6
|
164
|
+
# 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
|
165
|
+
def rindex(needle, offset=nil)
|
166
|
+
offset ||= length
|
167
|
+
wrapped_offset = first(offset).wrapped_string.length
|
168
|
+
index = @wrapped_string.rindex(needle, wrapped_offset)
|
169
|
+
index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
|
170
|
+
end
|
171
|
+
|
172
|
+
# Returns the number of codepoints in the string
|
173
|
+
def size
|
174
|
+
Unicode.u_unpack(@wrapped_string).size
|
175
|
+
end
|
176
|
+
alias_method :length, :size
|
177
|
+
|
178
|
+
# Strips entire range of Unicode whitespace from the right of the string.
|
179
|
+
def rstrip
|
180
|
+
chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, ''))
|
181
|
+
end
|
182
|
+
|
183
|
+
# Strips entire range of Unicode whitespace from the left of the string.
|
184
|
+
def lstrip
|
185
|
+
chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, ''))
|
186
|
+
end
|
187
|
+
|
188
|
+
# Strips entire range of Unicode whitespace from the right and left of the string.
|
189
|
+
def strip
|
190
|
+
rstrip.lstrip
|
191
|
+
end
|
192
|
+
|
193
|
+
# Returns the codepoint of the first character in the string.
|
194
|
+
#
|
195
|
+
# Example:
|
196
|
+
# 'こんにちは'.mb_chars.ord #=> 12371
|
197
|
+
def ord
|
198
|
+
Unicode.u_unpack(@wrapped_string)[0]
|
199
|
+
end
|
200
|
+
|
201
|
+
else
|
202
|
+
def =~(other)
|
203
|
+
@wrapped_string =~ other
|
204
|
+
end
|
204
205
|
end
|
205
206
|
|
206
|
-
#
|
207
|
-
#
|
208
|
-
# string. Returns +nil+ if _needle_ isn't found.
|
207
|
+
# Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
|
208
|
+
# instances instead of String. This makes chaining methods easier.
|
209
209
|
#
|
210
210
|
# Example:
|
211
|
-
# 'Café périferôl'.mb_chars.
|
212
|
-
|
213
|
-
|
214
|
-
offset ||= length
|
215
|
-
wrapped_offset = first(offset).wrapped_string.length
|
216
|
-
index = @wrapped_string.rindex(needle, wrapped_offset)
|
217
|
-
index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
|
211
|
+
# 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
|
212
|
+
def split(*args)
|
213
|
+
@wrapped_string.split(*args).map { |i| i.mb_chars }
|
218
214
|
end
|
219
215
|
|
220
216
|
# Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
|
@@ -236,7 +232,7 @@ module ActiveSupport #:nodoc:
|
|
236
232
|
if args.first.is_a?(Regexp)
|
237
233
|
@wrapped_string[*args] = replace_by
|
238
234
|
else
|
239
|
-
result =
|
235
|
+
result = Unicode.u_unpack(@wrapped_string)
|
240
236
|
if args[0].is_a?(Fixnum)
|
241
237
|
raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
|
242
238
|
min = args[0]
|
@@ -249,10 +245,10 @@ module ActiveSupport #:nodoc:
|
|
249
245
|
else
|
250
246
|
needle = args[0].to_s
|
251
247
|
min = index(needle)
|
252
|
-
max = min +
|
248
|
+
max = min + Unicode.u_unpack(needle).length - 1
|
253
249
|
range = Range.new(min, max)
|
254
250
|
end
|
255
|
-
result[range] =
|
251
|
+
result[range] = Unicode.u_unpack(replace_by)
|
256
252
|
@wrapped_string.replace(result.pack('U*'))
|
257
253
|
end
|
258
254
|
end
|
@@ -296,33 +292,13 @@ module ActiveSupport #:nodoc:
|
|
296
292
|
justify(integer, :center, padstr)
|
297
293
|
end
|
298
294
|
|
299
|
-
# Strips entire range of Unicode whitespace from the right of the string.
|
300
|
-
def rstrip
|
301
|
-
chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
|
302
|
-
end
|
303
|
-
|
304
|
-
# Strips entire range of Unicode whitespace from the left of the string.
|
305
|
-
def lstrip
|
306
|
-
chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
|
307
|
-
end
|
308
|
-
|
309
|
-
# Strips entire range of Unicode whitespace from the right and left of the string.
|
310
|
-
def strip
|
311
|
-
rstrip.lstrip
|
312
|
-
end
|
313
|
-
|
314
|
-
# Returns the number of codepoints in the string
|
315
|
-
def size
|
316
|
-
self.class.u_unpack(@wrapped_string).size
|
317
|
-
end
|
318
|
-
alias_method :length, :size
|
319
295
|
|
320
296
|
# Reverses all characters in the string.
|
321
297
|
#
|
322
298
|
# Example:
|
323
299
|
# 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
|
324
300
|
def reverse
|
325
|
-
chars(
|
301
|
+
chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
|
326
302
|
end
|
327
303
|
|
328
304
|
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
|
@@ -338,15 +314,15 @@ module ActiveSupport #:nodoc:
|
|
338
314
|
elsif (args.size == 2 && !args[1].is_a?(Numeric))
|
339
315
|
raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
|
340
316
|
elsif args[0].kind_of? Range
|
341
|
-
cps =
|
317
|
+
cps = Unicode.u_unpack(@wrapped_string).slice(*args)
|
342
318
|
result = cps.nil? ? nil : cps.pack('U*')
|
343
319
|
elsif args[0].kind_of? Regexp
|
344
320
|
result = @wrapped_string.slice(*args)
|
345
321
|
elsif args.size == 1 && args[0].kind_of?(Numeric)
|
346
|
-
character =
|
322
|
+
character = Unicode.u_unpack(@wrapped_string)[args[0]]
|
347
323
|
result = character.nil? ? nil : [character].pack('U')
|
348
324
|
else
|
349
|
-
result =
|
325
|
+
result = Unicode.u_unpack(@wrapped_string).slice(*args).pack('U*')
|
350
326
|
end
|
351
327
|
result.nil? ? nil : chars(result)
|
352
328
|
end
|
@@ -374,20 +350,12 @@ module ActiveSupport #:nodoc:
|
|
374
350
|
slice(0...translate_offset(limit))
|
375
351
|
end
|
376
352
|
|
377
|
-
# Returns the codepoint of the first character in the string.
|
378
|
-
#
|
379
|
-
# Example:
|
380
|
-
# 'こんにちは'.mb_chars.ord #=> 12371
|
381
|
-
def ord
|
382
|
-
self.class.u_unpack(@wrapped_string)[0]
|
383
|
-
end
|
384
|
-
|
385
353
|
# Convert characters in the string to uppercase.
|
386
354
|
#
|
387
355
|
# Example:
|
388
356
|
# 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s #=> "LAURENT, OÙ SONT LES TESTS ?"
|
389
357
|
def upcase
|
390
|
-
apply_mapping :uppercase_mapping
|
358
|
+
chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping)
|
391
359
|
end
|
392
360
|
|
393
361
|
# Convert characters in the string to lowercase.
|
@@ -395,7 +363,7 @@ module ActiveSupport #:nodoc:
|
|
395
363
|
# Example:
|
396
364
|
# 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
|
397
365
|
def downcase
|
398
|
-
apply_mapping :lowercase_mapping
|
366
|
+
chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping)
|
399
367
|
end
|
400
368
|
|
401
369
|
# Converts the first character to uppercase and the remainder to lowercase.
|
@@ -409,25 +377,11 @@ module ActiveSupport #:nodoc:
|
|
409
377
|
# Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
|
410
378
|
# passing strings to databases and validations.
|
411
379
|
#
|
412
|
-
# * <tt>str</tt> - The string to perform normalization on.
|
413
380
|
# * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
|
414
381
|
# <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
|
415
|
-
# ActiveSupport::Multibyte.default_normalization_form
|
416
|
-
def normalize(form=
|
417
|
-
|
418
|
-
codepoints = self.class.u_unpack(@wrapped_string)
|
419
|
-
chars(case form
|
420
|
-
when :d
|
421
|
-
self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))
|
422
|
-
when :c
|
423
|
-
self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)))
|
424
|
-
when :kd
|
425
|
-
self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))
|
426
|
-
when :kc
|
427
|
-
self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)))
|
428
|
-
else
|
429
|
-
raise ArgumentError, "#{form} is not a valid normalization variant", caller
|
430
|
-
end.pack('U*'))
|
382
|
+
# ActiveSupport::Multibyte::Unicode.default_normalization_form
|
383
|
+
def normalize(form = nil)
|
384
|
+
chars(Unicode.normalize(@wrapped_string, form))
|
431
385
|
end
|
432
386
|
|
433
387
|
# Performs canonical decomposition on all the characters.
|
@@ -436,7 +390,7 @@ module ActiveSupport #:nodoc:
|
|
436
390
|
# 'é'.length #=> 2
|
437
391
|
# 'é'.mb_chars.decompose.to_s.length #=> 3
|
438
392
|
def decompose
|
439
|
-
chars(
|
393
|
+
chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*'))
|
440
394
|
end
|
441
395
|
|
442
396
|
# Performs composition on all the characters.
|
@@ -445,7 +399,7 @@ module ActiveSupport #:nodoc:
|
|
445
399
|
# 'é'.length #=> 3
|
446
400
|
# 'é'.mb_chars.compose.to_s.length #=> 2
|
447
401
|
def compose
|
448
|
-
chars(
|
402
|
+
chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*'))
|
449
403
|
end
|
450
404
|
|
451
405
|
# Returns the number of grapheme clusters in the string.
|
@@ -454,14 +408,14 @@ module ActiveSupport #:nodoc:
|
|
454
408
|
# 'क्षि'.mb_chars.length #=> 4
|
455
409
|
# 'क्षि'.mb_chars.g_length #=> 3
|
456
410
|
def g_length
|
457
|
-
|
411
|
+
Unicode.g_unpack(@wrapped_string).length
|
458
412
|
end
|
459
413
|
|
460
414
|
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
|
461
415
|
#
|
462
416
|
# Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
|
463
417
|
def tidy_bytes(force = false)
|
464
|
-
chars(
|
418
|
+
chars(Unicode.tidy_bytes(@wrapped_string, force))
|
465
419
|
end
|
466
420
|
|
467
421
|
%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
|
@@ -475,241 +429,6 @@ module ActiveSupport #:nodoc:
|
|
475
429
|
end
|
476
430
|
end
|
477
431
|
|
478
|
-
class << self
|
479
|
-
|
480
|
-
# Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
|
481
|
-
# valid UTF-8.
|
482
|
-
#
|
483
|
-
# Example:
|
484
|
-
# Chars.u_unpack('Café') #=> [67, 97, 102, 233]
|
485
|
-
def u_unpack(string)
|
486
|
-
begin
|
487
|
-
string.unpack 'U*'
|
488
|
-
rescue ArgumentError
|
489
|
-
raise EncodingError, 'malformed UTF-8 character'
|
490
|
-
end
|
491
|
-
end
|
492
|
-
|
493
|
-
# Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
|
494
|
-
# character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
|
495
|
-
# <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
|
496
|
-
#
|
497
|
-
# Primarily used by the grapheme cluster support.
|
498
|
-
def in_char_class?(codepoint, classes)
|
499
|
-
classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
|
500
|
-
end
|
501
|
-
|
502
|
-
# Unpack the string at grapheme boundaries. Returns a list of character lists.
|
503
|
-
#
|
504
|
-
# Example:
|
505
|
-
# Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
|
506
|
-
# Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
|
507
|
-
def g_unpack(string)
|
508
|
-
codepoints = u_unpack(string)
|
509
|
-
unpacked = []
|
510
|
-
pos = 0
|
511
|
-
marker = 0
|
512
|
-
eoc = codepoints.length
|
513
|
-
while(pos < eoc)
|
514
|
-
pos += 1
|
515
|
-
previous = codepoints[pos-1]
|
516
|
-
current = codepoints[pos]
|
517
|
-
if (
|
518
|
-
# CR X LF
|
519
|
-
one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
|
520
|
-
# L X (L|V|LV|LVT)
|
521
|
-
two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
|
522
|
-
# (LV|V) X (V|T)
|
523
|
-
three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
|
524
|
-
# (LVT|T) X (T)
|
525
|
-
four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
|
526
|
-
# X Extend
|
527
|
-
five = (UCD.boundary[:extend] === current)
|
528
|
-
)
|
529
|
-
else
|
530
|
-
unpacked << codepoints[marker..pos-1]
|
531
|
-
marker = pos
|
532
|
-
end
|
533
|
-
end
|
534
|
-
unpacked
|
535
|
-
end
|
536
|
-
|
537
|
-
# Reverse operation of g_unpack: flattens the list of codepoint lists and
# packs the codepoints back into a string.
#
# Example:
#   Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
def g_pack(unpacked)
  flat_codepoints = unpacked.flatten
  flat_codepoints.pack('U*')
end
|
544
|
-
|
545
|
-
# Builds the padding used when justifying: +padstr+ repeated often enough to
# cover +padsize+, wrapped with +new+ and sliced down to +padsize+.
# Returns a plain empty string when +padsize+ is 0.
def padding(padsize, padstr=' ') #:nodoc:
  return '' if padsize == 0

  # One repetition more than the integer quotient, so the slice never runs short.
  repetitions = (padsize / u_unpack(padstr).size) + 1
  new(padstr * repetitions).slice(0, padsize)
end
|
552
|
-
|
553
|
-
# Re-order codepoints so the string becomes canonical.
#
# Walks over adjacent pairs, looking each codepoint up in <tt>UCD.codepoints</tt>.
# A pair is swapped in place when the left entry has a higher combining class
# than the right one and the right one's combining class is greater than 0.
# After a swap the scan steps back one position (or forward when already at
# the start); otherwise it advances. Returns the mutated +codepoints+ array.
def reorder_characters(codepoints)
  last_pair_start = codepoints.length - 1
  idx = 0
  while idx < last_pair_start
    left = UCD.codepoints[codepoints[idx]]
    right = UCD.codepoints[codepoints[idx + 1]]
    out_of_order = (left.combining_class > right.combining_class) && (right.combining_class > 0)
    unless out_of_order
      idx += 1
      next
    end

    codepoints[idx..idx + 1] = right.code, left.code
    idx += (idx > 0 ? -1 : 1)
  end
  codepoints
end
|
568
|
-
|
569
|
-
# Decompose composed characters to the decomposed form.
#
# Returns a new array. Codepoints in the range HANGUL_SBASE...HANGUL_SLAST
# are split arithmetically into their L, V and (when present) T parts. Other
# codepoints are replaced by the recursive decomposition of their
# +decomp_mapping+ when they have one and either carry no +decomp_type+ or
# +type+ is <tt>:compatability</tt>. Everything else is copied through.
def decompose_codepoints(type, codepoints)
  codepoints.each_with_object([]) do |cp, decomposed|
    if HANGUL_SBASE <= cp && cp < HANGUL_SLAST
      # Hangul syllable: derive the jamo from the syllable index.
      sindex = cp - HANGUL_SBASE
      tindex = sindex % HANGUL_TCOUNT
      jamo = [
        HANGUL_LBASE + sindex / HANGUL_NCOUNT,
        HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
      ]
      jamo << (HANGUL_TBASE + tindex) unless tindex == 0
      decomposed.concat(jamo)
      next
    end

    entry = UCD.codepoints[cp]
    mapping = entry.decomp_mapping
    if mapping && (!entry.decomp_type || type == :compatability)
      # Decomposable with the requested decomposition type: expand recursively.
      decomposed.concat(decompose_codepoints(type, mapping.dup))
    else
      decomposed << cp
    end
  end
end
|
589
|
-
|
590
|
-
# Compose decomposed characters to the composed form.
#
# Works in place on +codepoints+ and returns it. Keeps track of the current
# starter (position and value). When the starter is a Hangul leading
# consonant, a following vowel and optional trailing consonant are folded
# into a single syllable arithmetically. Otherwise the codepoint under the
# cursor is combined with the starter through <tt>UCD.composition_map</tt>
# whenever its combining class is higher than the previous one.
def compose_codepoints(codepoints)
  cursor = 0
  last_index = codepoints.length - 1
  starter_pos = 0
  starter_char = codepoints[0]
  previous_class = -1
  while cursor < last_index
    cursor += 1
    lindex = starter_char - HANGUL_LBASE
    if 0 <= lindex && lindex < HANGUL_LCOUNT
      # -- Hangul
      # A missing neighbour makes the subtraction raise; treat that as "no match".
      vindex = begin
        codepoints[starter_pos + 1] - HANGUL_VBASE
      rescue StandardError
        -1
      end
      if 0 <= vindex && vindex < HANGUL_VCOUNT
        tindex = begin
          codepoints[starter_pos + 2] - HANGUL_TBASE
        rescue StandardError
          -1
        end
        if 0 <= tindex && tindex < HANGUL_TCOUNT
          span_end = starter_pos + 2
          last_index -= 2
        else
          tindex = 0
          span_end = starter_pos + 1
          last_index -= 1
        end
        # Replace the L V (T) run with the single composed syllable.
        codepoints[starter_pos..span_end] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
      end
      starter_pos += 1
      starter_char = codepoints[starter_pos]
    else
      # -- Other characters
      current_char = codepoints[cursor]
      current = UCD.codepoints[current_char]

      composition = nil
      if current.combining_class > previous_class
        candidates = UCD.composition_map[starter_char]
        composition = candidates[current_char] if candidates
      end

      if composition.nil?
        previous_class = current.combining_class
      else
        # Fold the current codepoint into the starter and re-examine this slot.
        codepoints[starter_pos] = composition
        starter_char = composition
        codepoints.delete_at cursor
        last_index -= 1
        cursor -= 1
        previous_class = -1
      end

      if current.combining_class == 0
        starter_pos = cursor
        starter_char = codepoints[cursor]
      end
    end
  end
  codepoints
end
|
648
|
-
|
649
|
-
# Converts one byte into an array of UTF-8 bytes.
#
# * below 160: the byte is looked up in <tt>UCD.cp1252</tt> (falling back to
#   the byte itself) and the resulting codepoint is encoded as UTF-8
# * 160 to 191: prefixed with 194
# * 192 and above: 195 followed by the byte minus 64
def tidy_byte(byte)
  return [195, byte - 64] unless byte < 192
  return [194, byte] unless byte < 160

  codepoint = UCD.cp1252[byte] || byte
  [codepoint].pack("U").unpack("C*")
end
private :tidy_byte
|
659
|
-
|
660
|
-
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
#
# Without +force+ the bytes are scanned once. Bytes that form a complete
# multibyte sequence (a lead byte followed by the number of continuation
# bytes it announces) are kept; every other non-ASCII byte is passed through
# +tidy_byte+. A sequence is abandoned, and all of its bytes tidied, when it is
# interrupted by any non-continuation byte or when the string ends before the
# sequence is complete.
def tidy_bytes(string, force = false)
  if force
    forced = string.unpack("C*").map { |b| tidy_byte(b) }
    return forced.flatten.compact.pack("C*").unpack("U*").pack("U*")
  end

  bytes = string.unpack("C*")
  conts_expected = 0
  last_lead = 0

  bytes.each_index do |i|
    byte = bytes[i]
    is_cont = byte > 127 && byte < 192
    is_lead = byte > 191 && byte < 245
    # Everything above 240 is treated as impossible or highly unlikely
    # (this also covers the restricted bytes above 244).
    is_unlikely = byte > 240

    if is_cont
      # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
      if conts_expected == 0
        bytes[i] = tidy_byte(byte)
      else
        conts_expected -= 1
      end
      next
    end

    if conts_expected > 0
      # Expected continuation, but got something else? Clean backwards up to
      # the leading byte. Every slot in that span still holds a raw byte,
      # because a sequence is abandoned as soon as it is interrupted.
      (last_lead...i).each { |k| bytes[k] = tidy_byte(bytes[k]) }
      conts_expected = 0
    end

    if is_unlikely
      bytes[i] = tidy_byte(byte)
    elsif is_lead
      if i == bytes.length - 1
        # Final byte is leading? Clean it.
        bytes[i] = tidy_byte(byte)
      else
        # Valid leading byte? Expect continuations determined by position of
        # first zero bit, with max of 3.
        conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
        last_lead = i
      end
    end
  end

  # String ended in the middle of a sequence? Clean the dangling lead byte
  # and the continuation bytes collected so far.
  if conts_expected > 0
    (last_lead...bytes.length).each { |k| bytes[k] = tidy_byte(bytes[k]) }
  end

  bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end
|
711
|
-
end
|
712
|
-
|
713
432
|
protected
|
714
433
|
|
715
434
|
def translate_offset(byte_offset) #:nodoc:
|
@@ -734,26 +453,23 @@ module ActiveSupport #:nodoc:
|
|
734
453
|
padsize = padsize > 0 ? padsize : 0
|
735
454
|
case way
|
736
455
|
when :right
|
737
|
-
result = @wrapped_string.dup.insert(0,
|
456
|
+
result = @wrapped_string.dup.insert(0, padding(padsize, padstr))
|
738
457
|
when :left
|
739
|
-
result = @wrapped_string.dup.insert(-1,
|
458
|
+
result = @wrapped_string.dup.insert(-1, padding(padsize, padstr))
|
740
459
|
when :center
|
741
|
-
lpad =
|
742
|
-
rpad =
|
460
|
+
lpad = padding((padsize / 2.0).floor, padstr)
|
461
|
+
rpad = padding((padsize / 2.0).ceil, padstr)
|
743
462
|
result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
|
744
463
|
end
|
745
464
|
chars(result)
|
746
465
|
end
|
747
466
|
|
748
|
-
def
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
codepoint
|
755
|
-
end
|
756
|
-
end.pack('U*'))
|
467
|
+
# Builds the padding used when justifying: +padstr+ repeated often enough to
# cover +padsize+, wrapped with +chars+ and sliced down to +padsize+.
# Returns a plain empty string when +padsize+ is 0.
def padding(padsize, padstr=' ') #:nodoc:
  return '' if padsize == 0

  # One repetition more than the integer quotient, so the slice never runs short.
  repetitions = (padsize / Unicode.u_unpack(padstr).size) + 1
  chars(padstr * repetitions).slice(0, padsize)
end
|
758
474
|
|
759
475
|
def chars(string) #:nodoc:
|