bioinform 0.1.11 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (27) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -1
  3. data/TODO.txt +2 -0
  4. data/bioinform.gemspec +0 -6
  5. data/lib/bioinform/data_models/motif.rb +1 -1
  6. data/lib/bioinform/data_models/pm.rb +2 -1
  7. data/lib/bioinform/data_models/pwm.rb +4 -5
  8. data/lib/bioinform/support/multiline_squish.rb +1 -1
  9. data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +29 -0
  10. data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +23 -0
  11. data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +54 -0
  12. data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +64 -0
  13. data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +57 -0
  14. data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +99 -0
  15. data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +6 -0
  16. data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +49 -0
  17. data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +72 -0
  18. data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +178 -0
  19. data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +476 -0
  20. data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +8 -0
  21. data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +393 -0
  22. data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +60 -0
  23. data/lib/bioinform/support/third_part/active_support/multibyte.rb +44 -0
  24. data/lib/bioinform/support.rb +2 -2
  25. data/lib/bioinform/version.rb +1 -1
  26. data/spec/spec_helper.rb +10 -20
  27. metadata +21 -77
@@ -0,0 +1,178 @@
1
+ require_relative 'core_ext/hash/keys'
2
+
3
+ # This class has dubious semantics and we only have it so that
4
+ # people can write <tt>params[:key]</tt> instead of <tt>params['key']</tt>
5
+ # and they get the same value for both keys.
6
+
7
+ module ActiveSupport
8
+ class HashWithIndifferentAccess < Hash
9
+
10
+ # Always returns true, so that <tt>Array#extract_options!</tt> finds members of this class.
11
+ def extractable_options?
12
+ true
13
+ end
14
+
15
+ def with_indifferent_access
16
+ dup
17
+ end
18
+
19
+ def nested_under_indifferent_access
20
+ self
21
+ end
22
+
23
+ def initialize(constructor = {})
24
+ if constructor.is_a?(Hash)
25
+ super()
26
+ update(constructor)
27
+ else
28
+ super(constructor)
29
+ end
30
+ end
31
+
32
+ def default(key = nil)
33
+ if key.is_a?(Symbol) && include?(key = key.to_s)
34
+ self[key]
35
+ else
36
+ super
37
+ end
38
+ end
39
+
40
+ def self.new_from_hash_copying_default(hash)
41
+ new(hash).tap do |new_hash|
42
+ new_hash.default = hash.default
43
+ end
44
+ end
45
+
46
+ alias_method :regular_writer, :[]= unless method_defined?(:regular_writer)
47
+ alias_method :regular_update, :update unless method_defined?(:regular_update)
48
+
49
+ # Assigns a new value to the hash:
50
+ #
51
+ # hash = HashWithIndifferentAccess.new
52
+ # hash[:key] = "value"
53
+ #
54
+ def []=(key, value)
55
+ regular_writer(convert_key(key), convert_value(value))
56
+ end
57
+
58
+ alias_method :store, :[]=
59
+
60
+ # Updates the receiver in place with the values from the second hash:
61
+ #
62
+ # hash_1 = HashWithIndifferentAccess.new
63
+ # hash_1[:key] = "value"
64
+ #
65
+ # hash_2 = HashWithIndifferentAccess.new
66
+ # hash_2[:key] = "New Value!"
67
+ #
68
+ # hash_1.update(hash_2) # => {"key"=>"New Value!"}
69
+ #
70
+ def update(other_hash)
71
+ if other_hash.is_a? HashWithIndifferentAccess
72
+ super(other_hash)
73
+ else
74
+ other_hash.each_pair { |key, value| regular_writer(convert_key(key), convert_value(value)) }
75
+ self
76
+ end
77
+ end
78
+
79
+ alias_method :merge!, :update
80
+
81
+ # Checks the hash for a key matching the argument passed in:
82
+ #
83
+ # hash = HashWithIndifferentAccess.new
84
+ # hash["key"] = "value"
85
+ # hash.key? :key # => true
86
+ # hash.key? "key" # => true
87
+ #
88
+ def key?(key)
89
+ super(convert_key(key))
90
+ end
91
+
92
+ alias_method :include?, :key?
93
+ alias_method :has_key?, :key?
94
+ alias_method :member?, :key?
95
+
96
+ # Same as <tt>Hash#fetch</tt> where the key passed as argument can be
97
+ # either a string or a symbol:
98
+ #
99
+ # counters = HashWithIndifferentAccess.new
100
+ # counters[:foo] = 1
101
+ #
102
+ # counters.fetch("foo") # => 1
103
+ # counters.fetch(:bar, 0) # => 0
104
+ # counters.fetch(:bar) {|key| 0} # => 0
105
+ # counters.fetch(:zoo) # => KeyError: key not found: "zoo"
106
+ #
107
+ def fetch(key, *extras)
108
+ super(convert_key(key), *extras)
109
+ end
110
+
111
+ # Returns an array of the values at the specified indices:
112
+ #
113
+ # hash = HashWithIndifferentAccess.new
114
+ # hash[:a] = "x"
115
+ # hash[:b] = "y"
116
+ # hash.values_at("a", "b") # => ["x", "y"]
117
+ #
118
+ def values_at(*indices)
119
+ indices.collect {|key| self[convert_key(key)]}
120
+ end
121
+
122
+ # Returns an exact copy of the hash.
123
+ def dup
124
+ self.class.new(self).tap do |new_hash|
125
+ new_hash.default = default
126
+ end
127
+ end
128
+
129
+ # Merges the receiver and the specified hash together, giving precedence to the values from the second hash.
130
+ # Does not overwrite the existing hash.
131
+ def merge(hash)
132
+ self.dup.update(hash)
133
+ end
134
+
135
+ # Performs the opposite of merge, with the keys and values from the first hash taking precedence over the second.
136
+ # This overloaded definition prevents returning a regular hash, if reverse_merge is called on a <tt>HashWithIndifferentAccess</tt>.
137
+ def reverse_merge(other_hash)
138
+ super self.class.new_from_hash_copying_default(other_hash)
139
+ end
140
+
141
+ def reverse_merge!(other_hash)
142
+ replace(reverse_merge( other_hash ))
143
+ end
144
+
145
+ # Removes a specified key from the hash.
146
+ def delete(key)
147
+ super(convert_key(key))
148
+ end
149
+
150
+ def stringify_keys!; self end
151
+ def stringify_keys; dup end
152
+ undef :symbolize_keys!
153
+ def symbolize_keys; to_hash.symbolize_keys end
154
+ def to_options!; self end
155
+
156
+ # Convert to a Hash with String keys.
157
+ def to_hash
158
+ Hash.new(default).merge!(self)
159
+ end
160
+
161
+ protected
162
+ def convert_key(key)
163
+ key.kind_of?(Symbol) ? key.to_s : key
164
+ end
165
+
166
+ def convert_value(value)
167
+ if value.is_a? Hash
168
+ value.nested_under_indifferent_access
169
+ elsif value.is_a?(Array)
170
+ value.dup.replace(value.map { |e| convert_value(e) })
171
+ else
172
+ value
173
+ end
174
+ end
175
+ end
176
+ end
177
+
178
+ HashWithIndifferentAccess = ActiveSupport::HashWithIndifferentAccess
@@ -0,0 +1,476 @@
1
+ # encoding: utf-8
2
+ require_relative '../core_ext/string/access'
3
+ require_relative '../core_ext/string/behavior'
4
+
5
+ module ActiveSupport #:nodoc:
6
+ module Multibyte #:nodoc:
7
+ # Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive
8
+ # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
9
+ # encoding safe manner. All the normal String methods are also implemented on the proxy.
10
+ #
11
+ # String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
12
+ # which would normally return a String object now return a Chars object so methods can be chained.
13
+ #
14
+ # "The Perfect String ".mb_chars.downcase.strip.normalize # => "the perfect string"
15
+ #
16
+ # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
17
+ # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
18
+ #
19
+ # bad.explicit_checking_method "T".mb_chars.downcase.to_s
20
+ #
21
+ # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
22
+ # encodings you can write your own multibyte string handler and configure it through
23
+ # ActiveSupport::Multibyte.proxy_class.
24
+ #
25
+ # class CharsForUTF32
26
+ # def size
27
+ # @wrapped_string.size / 4
28
+ # end
29
+ #
30
+ # def self.accepts?(string)
31
+ # string.length % 4 == 0
32
+ # end
33
+ # end
34
+ #
35
+ # ActiveSupport::Multibyte.proxy_class = CharsForUTF32
36
+ class Chars
37
+ attr_reader :wrapped_string
38
+ alias to_s wrapped_string
39
+ alias to_str wrapped_string
40
+
41
+ if RUBY_VERSION >= "1.9"
42
+ # Creates a new Chars instance by wrapping _string_.
43
+ def initialize(string)
44
+ @wrapped_string = string
45
+ @wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen?
46
+ end
47
+ else
48
+ def initialize(string) #:nodoc:
49
+ @wrapped_string = string
50
+ end
51
+ end
52
+
53
+ # Forward all undefined methods to the wrapped string.
54
+ def method_missing(method, *args, &block)
55
+ if method.to_s =~ /!$/
56
+ @wrapped_string.__send__(method, *args, &block)
57
+ self
58
+ else
59
+ result = @wrapped_string.__send__(method, *args, &block)
60
+ result.kind_of?(String) ? chars(result) : result
61
+ end
62
+ end
63
+
64
+ # Returns +true+ if _obj_ responds to the given method. Private methods are included in the search
65
+ # only if the optional second parameter evaluates to +true+.
66
+ def respond_to?(method, include_private=false)
67
+ super || @wrapped_string.respond_to?(method, include_private)
68
+ end
69
+
70
+ # Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
71
+ def acts_like_string?
72
+ true
73
+ end
74
+
75
+ # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
76
+ def self.consumes?(string)
77
+ # Unpack is a little bit faster than regular expressions.
78
+ string.unpack('U*')
79
+ true
80
+ rescue ArgumentError
81
+ false
82
+ end
83
+
84
+ include Comparable
85
+
86
+ # Returns -1, 0, or 1, depending on whether the Chars object is to be sorted before,
87
+ # equal or after the object on the right side of the operation. It accepts any object
88
+ # that implements +to_s+:
89
+ #
90
+ # 'é'.mb_chars <=> 'ü'.mb_chars # => -1
91
+ #
92
+ # See <tt>String#<=></tt> for more details.
93
+ def <=>(other)
94
+ @wrapped_string <=> other.to_s
95
+ end
96
+
97
+ if RUBY_VERSION < "1.9"
98
+ # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
99
+ # +false+ otherwise.
100
+ def self.wants?(string)
101
+ $KCODE == 'UTF8' && consumes?(string)
102
+ end
103
+
104
+ # Returns a new Chars object containing the _other_ object concatenated to the string.
105
+ #
106
+ # Example:
107
+ # ('Café'.mb_chars + ' périferôl').to_s # => "Café périferôl"
108
+ def +(other)
109
+ chars(@wrapped_string + other)
110
+ end
111
+
112
+ # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
113
+ #
114
+ # Example:
115
+ # 'Café périferôl'.mb_chars =~ /ô/ # => 12
116
+ def =~(other)
117
+ translate_offset(@wrapped_string =~ other)
118
+ end
119
+
120
+ # Inserts the passed string at specified codepoint offsets.
121
+ #
122
+ # Example:
123
+ # 'Café'.mb_chars.insert(4, ' périferôl').to_s # => "Café périferôl"
124
+ def insert(offset, fragment)
125
+ unpacked = Unicode.u_unpack(@wrapped_string)
126
+ unless offset > unpacked.length
127
+ @wrapped_string.replace(
128
+ Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*')
129
+ )
130
+ else
131
+ raise IndexError, "index #{offset} out of string"
132
+ end
133
+ self
134
+ end
135
+
136
+ # Returns +true+ if the wrapped string contains _other_. Returns +false+ otherwise.
137
+ #
138
+ # Example:
139
+ # 'Café'.mb_chars.include?('é') # => true
140
+ def include?(other)
141
+ # We have to redefine this method because Enumerable defines it.
142
+ @wrapped_string.include?(other)
143
+ end
144
+
145
+ # Returns the position of _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
146
+ #
147
+ # Example:
148
+ # 'Café périferôl'.mb_chars.index('ô') # => 12
149
+ # 'Café périferôl'.mb_chars.index(/\w/u) # => 0
150
+ def index(needle, offset=0)
151
+ wrapped_offset = first(offset).wrapped_string.length
152
+ index = @wrapped_string.index(needle, wrapped_offset)
153
+ index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
154
+ end
155
+
156
+ # Returns the position of _needle_ in the string, counting in
157
+ # codepoints, searching backward from _offset_ or the end of the
158
+ # string. Returns +nil+ if _needle_ isn't found.
159
+ #
160
+ # Example:
161
+ # 'Café périferôl'.mb_chars.rindex('é') # => 6
162
+ # 'Café périferôl'.mb_chars.rindex(/\w/u) # => 13
163
+ def rindex(needle, offset=nil)
164
+ offset ||= length
165
+ wrapped_offset = first(offset).wrapped_string.length
166
+ index = @wrapped_string.rindex(needle, wrapped_offset)
167
+ index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
168
+ end
169
+
170
+ # Returns the number of codepoints in the string
171
+ def size
172
+ Unicode.u_unpack(@wrapped_string).size
173
+ end
174
+ alias_method :length, :size
175
+
176
+ # Strips entire range of Unicode whitespace from the right of the string.
177
+ def rstrip
178
+ chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, ''))
179
+ end
180
+
181
+ # Strips entire range of Unicode whitespace from the left of the string.
182
+ def lstrip
183
+ chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, ''))
184
+ end
185
+
186
+ # Strips entire range of Unicode whitespace from the right and left of the string.
187
+ def strip
188
+ rstrip.lstrip
189
+ end
190
+
191
+ # Returns the codepoint of the first character in the string.
192
+ #
193
+ # Example:
194
+ # 'こんにちは'.mb_chars.ord # => 12371
195
+ def ord
196
+ Unicode.u_unpack(@wrapped_string)[0]
197
+ end
198
+
199
+ # Works just like <tt>String#rjust</tt>, only integer specifies characters instead of bytes.
200
+ #
201
+ # Example:
202
+ #
203
+ # "¾ cup".mb_chars.rjust(8).to_s
204
+ # # => " ¾ cup"
205
+ #
206
+ # "¾ cup".mb_chars.rjust(8, " ").to_s # Use non-breaking whitespace
207
+ # # => "   ¾ cup"
208
+ def rjust(integer, padstr=' ')
209
+ justify(integer, :right, padstr)
210
+ end
211
+
212
+ # Works just like <tt>String#ljust</tt>, only integer specifies characters instead of bytes.
213
+ #
214
+ # Example:
215
+ #
216
+ # "¾ cup".mb_chars.ljust(8).to_s
217
+ # # => "¾ cup "
218
+ #
219
+ # "¾ cup".mb_chars.ljust(8, " ").to_s # Use non-breaking whitespace
220
+ # # => "¾ cup   "
221
+ def ljust(integer, padstr=' ')
222
+ justify(integer, :left, padstr)
223
+ end
224
+
225
+ # Works just like <tt>String#center</tt>, only integer specifies characters instead of bytes.
226
+ #
227
+ # Example:
228
+ #
229
+ # "¾ cup".mb_chars.center(8).to_s
230
+ # # => " ¾ cup "
231
+ #
232
+ # "¾ cup".mb_chars.center(8, " ").to_s # Use non-breaking whitespace
233
+ # # => " ¾ cup  "
234
+ def center(integer, padstr=' ')
235
+ justify(integer, :center, padstr)
236
+ end
237
+
238
+ else
239
+ def =~(other)
240
+ @wrapped_string =~ other
241
+ end
242
+ end
243
+
244
+ # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
245
+ # instances instead of String. This makes chaining methods easier.
246
+ #
247
+ # Example:
248
+ # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } # => ["CAF", " P", "RIFERÔL"]
249
+ def split(*args)
250
+ @wrapped_string.split(*args).map { |i| i.mb_chars }
251
+ end
252
+
253
+ # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
254
+ #
255
+ # Example:
256
+ #
257
+ # s = "Müller"
258
+ # s.mb_chars[2] = "e" # Replace character with offset 2
259
+ # s
260
+ # # => "Müeler"
261
+ #
262
+ # s = "Müller"
263
+ # s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
264
+ # s
265
+ # # => "Möler"
266
+ def []=(*args)
267
+ replace_by = args.pop
268
+ # Indexed replace with regular expressions already works
269
+ if args.first.is_a?(Regexp)
270
+ @wrapped_string[*args] = replace_by
271
+ else
272
+ result = Unicode.u_unpack(@wrapped_string)
273
+ case args.first
274
+ when Fixnum
275
+ raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
276
+ min = args[0]
277
+ max = args[1].nil? ? min : (min + args[1] - 1)
278
+ range = Range.new(min, max)
279
+ replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
280
+ when Range
281
+ raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
282
+ range = args[0]
283
+ else
284
+ needle = args[0].to_s
285
+ min = index(needle)
286
+ max = min + Unicode.u_unpack(needle).length - 1
287
+ range = Range.new(min, max)
288
+ end
289
+ result[range] = Unicode.u_unpack(replace_by)
290
+ @wrapped_string.replace(result.pack('U*'))
291
+ end
292
+ end
293
+
294
+ # Reverses all characters in the string.
295
+ #
296
+ # Example:
297
+ # 'Café'.mb_chars.reverse.to_s # => 'éfaC'
298
+ def reverse
299
+ chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
300
+ end
301
+
302
+ # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
303
+ # character.
304
+ #
305
+ # Example:
306
+ # 'こんにちは'.mb_chars.slice(2..3).to_s # => "にち"
307
+ def slice(*args)
308
+ if args.size > 2
309
+ raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
310
+ elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
311
+ raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
312
+ elsif (args.size == 2 && !args[1].is_a?(Numeric))
313
+ raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
314
+ elsif args[0].kind_of? Range
315
+ cps = Unicode.u_unpack(@wrapped_string).slice(*args)
316
+ result = cps.nil? ? nil : cps.pack('U*')
317
+ elsif args[0].kind_of? Regexp
318
+ result = @wrapped_string.slice(*args)
319
+ elsif args.size == 1 && args[0].kind_of?(Numeric)
320
+ character = Unicode.u_unpack(@wrapped_string)[args[0]]
321
+ result = character && [character].pack('U')
322
+ else
323
+ cps = Unicode.u_unpack(@wrapped_string).slice(*args)
324
+ result = cps && cps.pack('U*')
325
+ end
326
+ result && chars(result)
327
+ end
328
+ alias_method :[], :slice
329
+
330
+ # Limit the byte size of the string to a number of bytes without breaking characters. Usable
331
+ # when the storage for a string is limited for some reason.
332
+ #
333
+ # Example:
334
+ # 'こんにちは'.mb_chars.limit(7).to_s # => "こん"
335
+ def limit(limit)
336
+ slice(0...translate_offset(limit))
337
+ end
338
+
339
+ # Convert characters in the string to uppercase.
340
+ #
341
+ # Example:
342
+ # 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s # => "LAURENT, OÙ SONT LES TESTS ?"
343
+ def upcase
344
+ chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping)
345
+ end
346
+
347
+ # Convert characters in the string to lowercase.
348
+ #
349
+ # Example:
350
+ # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s # => "věda a výzkum"
351
+ def downcase
352
+ chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping)
353
+ end
354
+
355
+ # Converts the first character to uppercase and the remainder to lowercase.
356
+ #
357
+ # Example:
358
+ # 'über'.mb_chars.capitalize.to_s # => "Über"
359
+ def capitalize
360
+ (slice(0) || chars('')).upcase + (slice(1..-1) || chars('')).downcase
361
+ end
362
+
363
+ # Capitalizes the first letter of every word, when possible.
364
+ #
365
+ # Example:
366
+ # "ÉL QUE SE ENTERÓ".mb_chars.titleize # => "Él Que Se Enteró"
367
+ # "日本語".mb_chars.titleize # => "日本語"
368
+ def titleize
369
+ chars(downcase.to_s.gsub(/\b('?[\S])/u) { Unicode.apply_mapping $1, :uppercase_mapping })
370
+ end
371
+ alias_method :titlecase, :titleize
372
+
373
+ # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
374
+ # passing strings to databases and validations.
375
+ #
376
+ # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
377
+ # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
378
+ # ActiveSupport::Multibyte::Unicode.default_normalization_form
379
+ def normalize(form = nil)
380
+ chars(Unicode.normalize(@wrapped_string, form))
381
+ end
382
+
383
+ # Performs canonical decomposition on all the characters.
384
+ #
385
+ # Example:
386
+ # 'é'.length # => 2
387
+ # 'é'.mb_chars.decompose.to_s.length # => 3
388
+ def decompose
389
+ chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*'))
390
+ end
391
+
392
+ # Performs composition on all the characters.
393
+ #
394
+ # Example:
395
+ # 'é'.length # => 3
396
+ # 'é'.mb_chars.compose.to_s.length # => 2
397
+ def compose
398
+ chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*'))
399
+ end
400
+
401
+ # Returns the number of grapheme clusters in the string.
402
+ #
403
+ # Example:
404
+ # 'क्षि'.mb_chars.length # => 4
405
+ # 'क्षि'.mb_chars.g_length # => 3
406
+ def g_length
407
+ Unicode.g_unpack(@wrapped_string).length
408
+ end
409
+
410
+ # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
411
+ #
412
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
413
+ def tidy_bytes(force = false)
414
+ chars(Unicode.tidy_bytes(@wrapped_string, force))
415
+ end
416
+
417
+ %w(capitalize downcase lstrip reverse rstrip slice strip tidy_bytes upcase).each do |method|
418
+ # Only define a corresponding bang method for methods defined in the proxy; On 1.9 the proxy will
419
+ # exclude lstrip!, rstrip! and strip! because they already work as expected on multibyte strings.
420
+ if public_method_defined?(method)
421
+ define_method("#{method}!") do |*args|
422
+ @wrapped_string = send(args.nil? ? method : method, *args).to_s
423
+ self
424
+ end
425
+ end
426
+ end
427
+
428
+ protected
429
+
430
+ def translate_offset(byte_offset) #:nodoc:
431
+ return nil if byte_offset.nil?
432
+ return 0 if @wrapped_string == ''
433
+
434
+ if @wrapped_string.respond_to?(:force_encoding)
435
+ @wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
436
+ end
437
+
438
+ begin
439
+ @wrapped_string[0...byte_offset].unpack('U*').length
440
+ rescue ArgumentError
441
+ byte_offset -= 1
442
+ retry
443
+ end
444
+ end
445
+
446
+ def justify(integer, way, padstr=' ') #:nodoc:
447
+ raise ArgumentError, "zero width padding" if padstr.length == 0
448
+ padsize = integer - size
449
+ padsize = padsize > 0 ? padsize : 0
450
+ case way
451
+ when :right
452
+ result = @wrapped_string.dup.insert(0, padding(padsize, padstr))
453
+ when :left
454
+ result = @wrapped_string.dup.insert(-1, padding(padsize, padstr))
455
+ when :center
456
+ lpad = padding((padsize / 2.0).floor, padstr)
457
+ rpad = padding((padsize / 2.0).ceil, padstr)
458
+ result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
459
+ end
460
+ chars(result)
461
+ end
462
+
463
+ def padding(padsize, padstr=' ') #:nodoc:
464
+ if padsize != 0
465
+ chars(padstr * ((padsize / Unicode.u_unpack(padstr).size) + 1)).slice(0, padsize)
466
+ else
467
+ ''
468
+ end
469
+ end
470
+
471
+ def chars(string) #:nodoc:
472
+ self.class.new(string)
473
+ end
474
+ end
475
+ end
476
+ end
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+
3
+ module ActiveSupport #:nodoc:
4
+ module Multibyte #:nodoc:
5
+ # Raised when a problem with the encoding was found.
6
+ class EncodingError < StandardError; end
7
+ end
8
+ end