bioinform 0.1.17 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -3
- data/LICENSE +0 -1
- data/README.md +1 -1
- data/TODO.txt +23 -30
- data/bin/convert_motif +4 -0
- data/bin/pcm2pwm +1 -1
- data/bin/split_motifs +1 -1
- data/bioinform.gemspec +0 -2
- data/lib/bioinform.rb +54 -16
- data/lib/bioinform/alphabet.rb +85 -0
- data/lib/bioinform/background.rb +90 -0
- data/lib/bioinform/cli.rb +1 -2
- data/lib/bioinform/cli/convert_motif.rb +52 -17
- data/lib/bioinform/cli/pcm2pwm.rb +32 -26
- data/lib/bioinform/cli/split_motifs.rb +31 -30
- data/lib/bioinform/conversion_algorithms.rb +6 -0
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +13 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +39 -11
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +26 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +30 -0
- data/lib/bioinform/conversion_algorithms/pwm2iupac_pwm_converter.rb +23 -0
- data/lib/bioinform/conversion_algorithms/pwm2pcm_converter.rb +85 -0
- data/lib/bioinform/data_models.rb +1 -7
- data/lib/bioinform/data_models/named_model.rb +38 -0
- data/lib/bioinform/data_models/pcm.rb +18 -28
- data/lib/bioinform/data_models/pm.rb +73 -170
- data/lib/bioinform/data_models/ppm.rb +11 -24
- data/lib/bioinform/data_models/pwm.rb +30 -56
- data/lib/bioinform/errors.rb +17 -0
- data/lib/bioinform/formatters.rb +4 -2
- data/lib/bioinform/formatters/consensus_formatter.rb +35 -0
- data/lib/bioinform/formatters/motif_formatter.rb +69 -0
- data/lib/bioinform/formatters/pretty_matrix_formatter.rb +36 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +29 -37
- data/lib/bioinform/parsers.rb +1 -8
- data/lib/bioinform/parsers/matrix_parser.rb +44 -36
- data/lib/bioinform/parsers/motif_splitter.rb +45 -0
- data/lib/bioinform/support.rb +46 -14
- data/lib/bioinform/support/strip_doc.rb +1 -1
- data/lib/bioinform/version.rb +1 -1
- data/spec/alphabet_spec.rb +79 -0
- data/spec/background_spec.rb +57 -0
- data/spec/cli/cli_spec.rb +6 -6
- data/spec/cli/convert_motif_spec.rb +88 -88
- data/spec/cli/data/pcm2pwm/KLF4_f2.pwm.result +9 -9
- data/spec/cli/data/pcm2pwm/SP1_f1.pwm.result +11 -11
- data/spec/cli/pcm2pwm_spec.rb +22 -23
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +1 -1
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +5 -5
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +2 -2
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +3 -3
- data/spec/cli/split_motifs_spec.rb +6 -21
- data/spec/converters/pcm2ppm_converter_spec.rb +32 -0
- data/spec/converters/pcm2pwm_converter_spec.rb +71 -0
- data/spec/converters/ppm2pcm_converter_spec.rb +32 -0
- data/spec/converters/pwm2iupac_pwm_converter_spec.rb +65 -0
- data/spec/converters/pwm2pcm_converter_spec.rb +57 -0
- data/spec/data_models/named_model_spec.rb +41 -0
- data/spec/data_models/pcm_spec.rb +114 -45
- data/spec/data_models/pm_spec.rb +132 -333
- data/spec/data_models/ppm_spec.rb +47 -44
- data/spec/data_models/pwm_spec.rb +85 -77
- data/spec/fabricators/motif_formats_fabricator.rb +116 -116
- data/spec/formatters/consensus_formatter_spec.rb +26 -0
- data/spec/formatters/raw_formatter_spec.rb +169 -0
- data/spec/parsers/matrix_parser_spec.rb +216 -0
- data/spec/parsers/motif_splitter_spec.rb +87 -0
- data/spec/spec_helper.rb +2 -2
- data/spec/spec_helper_source.rb +25 -5
- data/spec/support_spec.rb +31 -0
- metadata +43 -124
- data/bin/merge_into_collection +0 -4
- data/lib/bioinform/cli/merge_into_collection.rb +0 -80
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +0 -75
- data/lib/bioinform/data_models/motif.rb +0 -56
- data/lib/bioinform/formatters/raw_formatter.rb +0 -41
- data/lib/bioinform/parsers/jaspar_parser.rb +0 -35
- data/lib/bioinform/parsers/parser.rb +0 -92
- data/lib/bioinform/parsers/splittable_parser.rb +0 -57
- data/lib/bioinform/parsers/string_fantom_parser.rb +0 -35
- data/lib/bioinform/parsers/string_parser.rb +0 -72
- data/lib/bioinform/parsers/trivial_parser.rb +0 -34
- data/lib/bioinform/parsers/yaml_parser.rb +0 -35
- data/lib/bioinform/support/advanced_scan.rb +0 -8
- data/lib/bioinform/support/array_product.rb +0 -6
- data/lib/bioinform/support/array_zip.rb +0 -6
- data/lib/bioinform/support/collect_hash.rb +0 -7
- data/lib/bioinform/support/deep_dup.rb +0 -5
- data/lib/bioinform/support/delete_many.rb +0 -14
- data/lib/bioinform/support/inverf.rb +0 -13
- data/lib/bioinform/support/multiline_squish.rb +0 -6
- data/lib/bioinform/support/parameters.rb +0 -28
- data/lib/bioinform/support/partial_sums.rb +0 -16
- data/lib/bioinform/support/same_by.rb +0 -12
- data/lib/bioinform/support/third_part/active_support/core_ext/array/extract_options.rb +0 -29
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/indifferent_access.rb +0 -23
- data/lib/bioinform/support/third_part/active_support/core_ext/hash/keys.rb +0 -54
- data/lib/bioinform/support/third_part/active_support/core_ext/module/attribute_accessors.rb +0 -64
- data/lib/bioinform/support/third_part/active_support/core_ext/object/try.rb +0 -57
- data/lib/bioinform/support/third_part/active_support/core_ext/string/access.rb +0 -99
- data/lib/bioinform/support/third_part/active_support/core_ext/string/behavior.rb +0 -6
- data/lib/bioinform/support/third_part/active_support/core_ext/string/filters.rb +0 -49
- data/lib/bioinform/support/third_part/active_support/core_ext/string/multibyte.rb +0 -72
- data/lib/bioinform/support/third_part/active_support/hash_with_indifferent_access.rb +0 -181
- data/lib/bioinform/support/third_part/active_support/multibyte.rb +0 -44
- data/lib/bioinform/support/third_part/active_support/multibyte/chars.rb +0 -476
- data/lib/bioinform/support/third_part/active_support/multibyte/exceptions.rb +0 -8
- data/lib/bioinform/support/third_part/active_support/multibyte/unicode.rb +0 -393
- data/lib/bioinform/support/third_part/active_support/multibyte/utils.rb +0 -60
- data/spec/cli/data/merge_into_collection/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/SP1_f1.pwm +0 -12
- data/spec/cli/data/merge_into_collection/collection.txt.result +0 -40
- data/spec/cli/data/merge_into_collection/collection.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +0 -188
- data/spec/cli/data/merge_into_collection/pwm_folder/GABPA_f1.pwm +0 -14
- data/spec/cli/data/merge_into_collection/pwm_folder/KLF4_f2.pwm +0 -11
- data/spec/cli/data/merge_into_collection/pwm_folder/SP1_f1.pwm +0 -12
- data/spec/cli/data/split_motifs/collection.yaml +0 -188
- data/spec/cli/merge_into_collection_spec.rb +0 -100
- data/spec/data_models/collection_spec.rb +0 -98
- data/spec/data_models/motif_spec.rb +0 -224
- data/spec/fabricators/collection_fabricator.rb +0 -8
- data/spec/fabricators/motif_fabricator.rb +0 -33
- data/spec/fabricators/pcm_fabricator.rb +0 -25
- data/spec/fabricators/pm_fabricator.rb +0 -52
- data/spec/fabricators/ppm_fabricator.rb +0 -14
- data/spec/fabricators/pwm_fabricator.rb +0 -16
- data/spec/parsers/parser_spec.rb +0 -152
- data/spec/parsers/string_fantom_parser_spec.rb +0 -70
- data/spec/parsers/string_parser_spec.rb +0 -77
- data/spec/parsers/trivial_parser_spec.rb +0 -64
- data/spec/parsers/yaml_parser_spec.rb +0 -50
- data/spec/support/advanced_scan_spec.rb +0 -32
- data/spec/support/array_product_spec.rb +0 -15
- data/spec/support/array_zip_spec.rb +0 -15
- data/spec/support/collect_hash_spec.rb +0 -15
- data/spec/support/delete_many_spec.rb +0 -44
- data/spec/support/inverf_spec.rb +0 -19
- data/spec/support/multiline_squish_spec.rb +0 -25
- data/spec/support/partial_sums_spec.rb +0 -30
- data/spec/support/same_by_spec.rb +0 -36
@@ -1,476 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require_relative '../core_ext/string/access'
|
3
|
-
require_relative '../core_ext/string/behavior'
|
4
|
-
|
5
|
-
module ActiveSupport #:nodoc:
|
6
|
-
module Multibyte #:nodoc:
|
7
|
-
# Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive
|
8
|
-
# knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
|
9
|
-
# encoding safe manner. All the normal String methods are also implemented on the proxy.
|
10
|
-
#
|
11
|
-
# String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
|
12
|
-
# which would normally return a String object now return a Chars object so methods can be chained.
|
13
|
-
#
|
14
|
-
# "The Perfect String ".mb_chars.downcase.strip.normalize # => "the perfect string"
|
15
|
-
#
|
16
|
-
# Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
|
17
|
-
# If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
|
18
|
-
#
|
19
|
-
# bad.explicit_checking_method "T".mb_chars.downcase.to_s
|
20
|
-
#
|
21
|
-
# The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
|
22
|
-
# encodings you can write your own multibyte string handler and configure it through
|
23
|
-
# ActiveSupport::Multibyte.proxy_class.
|
24
|
-
#
|
25
|
-
# class CharsForUTF32
|
26
|
-
# def size
|
27
|
-
# @wrapped_string.size / 4
|
28
|
-
# end
|
29
|
-
#
|
30
|
-
# def self.accepts?(string)
|
31
|
-
# string.length % 4 == 0
|
32
|
-
# end
|
33
|
-
# end
|
34
|
-
#
|
35
|
-
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
|
36
|
-
class Chars
|
37
|
-
attr_reader :wrapped_string
|
38
|
-
alias to_s wrapped_string
|
39
|
-
alias to_str wrapped_string
|
40
|
-
|
41
|
-
if RUBY_VERSION >= "1.9"
|
42
|
-
# Creates a new Chars instance by wrapping _string_.
|
43
|
-
def initialize(string)
|
44
|
-
@wrapped_string = string
|
45
|
-
@wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen?
|
46
|
-
end
|
47
|
-
else
|
48
|
-
def initialize(string) #:nodoc:
|
49
|
-
@wrapped_string = string
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
# Forward all undefined methods to the wrapped string.
|
54
|
-
def method_missing(method, *args, &block)
|
55
|
-
if method.to_s =~ /!$/
|
56
|
-
@wrapped_string.__send__(method, *args, &block)
|
57
|
-
self
|
58
|
-
else
|
59
|
-
result = @wrapped_string.__send__(method, *args, &block)
|
60
|
-
result.kind_of?(String) ? chars(result) : result
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
# Returns +true+ if _obj_ responds to the given method. Private methods are included in the search
|
65
|
-
# only if the optional second parameter evaluates to +true+.
|
66
|
-
def respond_to?(method, include_private=false)
|
67
|
-
super || @wrapped_string.respond_to?(method, include_private)
|
68
|
-
end
|
69
|
-
|
70
|
-
# Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
|
71
|
-
def acts_like_string?
|
72
|
-
true
|
73
|
-
end
|
74
|
-
|
75
|
-
# Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
|
76
|
-
def self.consumes?(string)
|
77
|
-
# Unpack is a little bit faster than regular expressions.
|
78
|
-
string.unpack('U*')
|
79
|
-
true
|
80
|
-
rescue ArgumentError
|
81
|
-
false
|
82
|
-
end
|
83
|
-
|
84
|
-
include Comparable
|
85
|
-
|
86
|
-
# Returns -1, 0, or 1, depending on whether the Chars object is to be sorted before,
|
87
|
-
# equal or after the object on the right side of the operation. It accepts any object
|
88
|
-
# that implements +to_s+:
|
89
|
-
#
|
90
|
-
# 'é'.mb_chars <=> 'ü'.mb_chars # => -1
|
91
|
-
#
|
92
|
-
# See <tt>String#<=></tt> for more details.
|
93
|
-
def <=>(other)
|
94
|
-
@wrapped_string <=> other.to_s
|
95
|
-
end
|
96
|
-
|
97
|
-
if RUBY_VERSION < "1.9"
|
98
|
-
# Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
|
99
|
-
# +false+ otherwise.
|
100
|
-
def self.wants?(string)
|
101
|
-
$KCODE == 'UTF8' && consumes?(string)
|
102
|
-
end
|
103
|
-
|
104
|
-
# Returns a new Chars object containing the _other_ object concatenated to the string.
|
105
|
-
#
|
106
|
-
# Example:
|
107
|
-
# ('Café'.mb_chars + ' périferôl').to_s # => "Café périferôl"
|
108
|
-
def +(other)
|
109
|
-
chars(@wrapped_string + other)
|
110
|
-
end
|
111
|
-
|
112
|
-
# Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
|
113
|
-
#
|
114
|
-
# Example:
|
115
|
-
# 'Café périferôl'.mb_chars =~ /ô/ # => 12
|
116
|
-
def =~(other)
|
117
|
-
translate_offset(@wrapped_string =~ other)
|
118
|
-
end
|
119
|
-
|
120
|
-
# Inserts the passed string at specified codepoint offsets.
|
121
|
-
#
|
122
|
-
# Example:
|
123
|
-
# 'Café'.mb_chars.insert(4, ' périferôl').to_s # => "Café périferôl"
|
124
|
-
def insert(offset, fragment)
|
125
|
-
unpacked = Unicode.u_unpack(@wrapped_string)
|
126
|
-
unless offset > unpacked.length
|
127
|
-
@wrapped_string.replace(
|
128
|
-
Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*')
|
129
|
-
)
|
130
|
-
else
|
131
|
-
raise IndexError, "index #{offset} out of string"
|
132
|
-
end
|
133
|
-
self
|
134
|
-
end
|
135
|
-
|
136
|
-
# Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
|
137
|
-
#
|
138
|
-
# Example:
|
139
|
-
# 'Café'.mb_chars.include?('é') # => true
|
140
|
-
def include?(other)
|
141
|
-
# We have to redefine this method because Enumerable defines it.
|
142
|
-
@wrapped_string.include?(other)
|
143
|
-
end
|
144
|
-
|
145
|
-
# Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
|
146
|
-
#
|
147
|
-
# Example:
|
148
|
-
# 'Café périferôl'.mb_chars.index('ô') # => 12
|
149
|
-
# 'Café périferôl'.mb_chars.index(/\w/u) # => 0
|
150
|
-
def index(needle, offset=0)
|
151
|
-
wrapped_offset = first(offset).wrapped_string.length
|
152
|
-
index = @wrapped_string.index(needle, wrapped_offset)
|
153
|
-
index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
|
154
|
-
end
|
155
|
-
|
156
|
-
# Returns the position _needle_ in the string, counting in
|
157
|
-
# codepoints, searching backward from _offset_ or the end of the
|
158
|
-
# string. Returns +nil+ if _needle_ isn't found.
|
159
|
-
#
|
160
|
-
# Example:
|
161
|
-
# 'Café périferôl'.mb_chars.rindex('é') # => 6
|
162
|
-
# 'Café périferôl'.mb_chars.rindex(/\w/u) # => 13
|
163
|
-
def rindex(needle, offset=nil)
|
164
|
-
offset ||= length
|
165
|
-
wrapped_offset = first(offset).wrapped_string.length
|
166
|
-
index = @wrapped_string.rindex(needle, wrapped_offset)
|
167
|
-
index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
|
168
|
-
end
|
169
|
-
|
170
|
-
# Returns the number of codepoints in the string
|
171
|
-
def size
|
172
|
-
Unicode.u_unpack(@wrapped_string).size
|
173
|
-
end
|
174
|
-
alias_method :length, :size
|
175
|
-
|
176
|
-
# Strips entire range of Unicode whitespace from the right of the string.
|
177
|
-
def rstrip
|
178
|
-
chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, ''))
|
179
|
-
end
|
180
|
-
|
181
|
-
# Strips entire range of Unicode whitespace from the left of the string.
|
182
|
-
def lstrip
|
183
|
-
chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, ''))
|
184
|
-
end
|
185
|
-
|
186
|
-
# Strips entire range of Unicode whitespace from the right and left of the string.
|
187
|
-
def strip
|
188
|
-
rstrip.lstrip
|
189
|
-
end
|
190
|
-
|
191
|
-
# Returns the codepoint of the first character in the string.
|
192
|
-
#
|
193
|
-
# Example:
|
194
|
-
# 'こんにちは'.mb_chars.ord # => 12371
|
195
|
-
def ord
|
196
|
-
Unicode.u_unpack(@wrapped_string)[0]
|
197
|
-
end
|
198
|
-
|
199
|
-
# Works just like <tt>String#rjust</tt>, only integer specifies characters instead of bytes.
|
200
|
-
#
|
201
|
-
# Example:
|
202
|
-
#
|
203
|
-
# "¾ cup".mb_chars.rjust(8).to_s
|
204
|
-
# # => " ¾ cup"
|
205
|
-
#
|
206
|
-
# "¾ cup".mb_chars.rjust(8, " ").to_s # Use non-breaking whitespace
|
207
|
-
# # => " ¾ cup"
|
208
|
-
def rjust(integer, padstr=' ')
|
209
|
-
justify(integer, :right, padstr)
|
210
|
-
end
|
211
|
-
|
212
|
-
# Works just like <tt>String#ljust</tt>, only integer specifies characters instead of bytes.
|
213
|
-
#
|
214
|
-
# Example:
|
215
|
-
#
|
216
|
-
# "¾ cup".mb_chars.rjust(8).to_s
|
217
|
-
# # => "¾ cup "
|
218
|
-
#
|
219
|
-
# "¾ cup".mb_chars.rjust(8, " ").to_s # Use non-breaking whitespace
|
220
|
-
# # => "¾ cup "
|
221
|
-
def ljust(integer, padstr=' ')
|
222
|
-
justify(integer, :left, padstr)
|
223
|
-
end
|
224
|
-
|
225
|
-
# Works just like <tt>String#center</tt>, only integer specifies characters instead of bytes.
|
226
|
-
#
|
227
|
-
# Example:
|
228
|
-
#
|
229
|
-
# "¾ cup".mb_chars.center(8).to_s
|
230
|
-
# # => " ¾ cup "
|
231
|
-
#
|
232
|
-
# "¾ cup".mb_chars.center(8, " ").to_s # Use non-breaking whitespace
|
233
|
-
# # => " ¾ cup "
|
234
|
-
def center(integer, padstr=' ')
|
235
|
-
justify(integer, :center, padstr)
|
236
|
-
end
|
237
|
-
|
238
|
-
else
|
239
|
-
def =~(other)
|
240
|
-
@wrapped_string =~ other
|
241
|
-
end
|
242
|
-
end
|
243
|
-
|
244
|
-
# Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
|
245
|
-
# instances instead of String. This makes chaining methods easier.
|
246
|
-
#
|
247
|
-
# Example:
|
248
|
-
# 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } # => ["CAF", " P", "RIFERÔL"]
|
249
|
-
def split(*args)
|
250
|
-
@wrapped_string.split(*args).map { |i| i.mb_chars }
|
251
|
-
end
|
252
|
-
|
253
|
-
# Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
|
254
|
-
#
|
255
|
-
# Example:
|
256
|
-
#
|
257
|
-
# s = "Müller"
|
258
|
-
# s.mb_chars[2] = "e" # Replace character with offset 2
|
259
|
-
# s
|
260
|
-
# # => "Müeler"
|
261
|
-
#
|
262
|
-
# s = "Müller"
|
263
|
-
# s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
|
264
|
-
# s
|
265
|
-
# # => "Möler"
|
266
|
-
def []=(*args)
|
267
|
-
replace_by = args.pop
|
268
|
-
# Indexed replace with regular expressions already works
|
269
|
-
if args.first.is_a?(Regexp)
|
270
|
-
@wrapped_string[*args] = replace_by
|
271
|
-
else
|
272
|
-
result = Unicode.u_unpack(@wrapped_string)
|
273
|
-
case args.first
|
274
|
-
when Fixnum
|
275
|
-
raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
|
276
|
-
min = args[0]
|
277
|
-
max = args[1].nil? ? min : (min + args[1] - 1)
|
278
|
-
range = Range.new(min, max)
|
279
|
-
replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
|
280
|
-
when Range
|
281
|
-
raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
|
282
|
-
range = args[0]
|
283
|
-
else
|
284
|
-
needle = args[0].to_s
|
285
|
-
min = index(needle)
|
286
|
-
max = min + Unicode.u_unpack(needle).length - 1
|
287
|
-
range = Range.new(min, max)
|
288
|
-
end
|
289
|
-
result[range] = Unicode.u_unpack(replace_by)
|
290
|
-
@wrapped_string.replace(result.pack('U*'))
|
291
|
-
end
|
292
|
-
end
|
293
|
-
|
294
|
-
# Reverses all characters in the string.
|
295
|
-
#
|
296
|
-
# Example:
|
297
|
-
# 'Café'.mb_chars.reverse.to_s # => 'éfaC'
|
298
|
-
def reverse
|
299
|
-
chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
|
300
|
-
end
|
301
|
-
|
302
|
-
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
|
303
|
-
# character.
|
304
|
-
#
|
305
|
-
# Example:
|
306
|
-
# 'こんにちは'.mb_chars.slice(2..3).to_s # => "にち"
|
307
|
-
def slice(*args)
|
308
|
-
if args.size > 2
|
309
|
-
raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
|
310
|
-
elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
|
311
|
-
raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
|
312
|
-
elsif (args.size == 2 && !args[1].is_a?(Numeric))
|
313
|
-
raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
|
314
|
-
elsif args[0].kind_of? Range
|
315
|
-
cps = Unicode.u_unpack(@wrapped_string).slice(*args)
|
316
|
-
result = cps.nil? ? nil : cps.pack('U*')
|
317
|
-
elsif args[0].kind_of? Regexp
|
318
|
-
result = @wrapped_string.slice(*args)
|
319
|
-
elsif args.size == 1 && args[0].kind_of?(Numeric)
|
320
|
-
character = Unicode.u_unpack(@wrapped_string)[args[0]]
|
321
|
-
result = character && [character].pack('U')
|
322
|
-
else
|
323
|
-
cps = Unicode.u_unpack(@wrapped_string).slice(*args)
|
324
|
-
result = cps && cps.pack('U*')
|
325
|
-
end
|
326
|
-
result && chars(result)
|
327
|
-
end
|
328
|
-
alias_method :[], :slice
|
329
|
-
|
330
|
-
# Limit the byte size of the string to a number of bytes without breaking characters. Usable
|
331
|
-
# when the storage for a string is limited for some reason.
|
332
|
-
#
|
333
|
-
# Example:
|
334
|
-
# 'こんにちは'.mb_chars.limit(7).to_s # => "こん"
|
335
|
-
def limit(limit)
|
336
|
-
slice(0...translate_offset(limit))
|
337
|
-
end
|
338
|
-
|
339
|
-
# Convert characters in the string to uppercase.
|
340
|
-
#
|
341
|
-
# Example:
|
342
|
-
# 'Laurent, où sont les tests ?'.mb_chars.upcase.to_s # => "LAURENT, OÙ SONT LES TESTS ?"
|
343
|
-
def upcase
|
344
|
-
chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping)
|
345
|
-
end
|
346
|
-
|
347
|
-
# Convert characters in the string to lowercase.
|
348
|
-
#
|
349
|
-
# Example:
|
350
|
-
# 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s # => "věda a výzkum"
|
351
|
-
def downcase
|
352
|
-
chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping)
|
353
|
-
end
|
354
|
-
|
355
|
-
# Converts the first character to uppercase and the remainder to lowercase.
|
356
|
-
#
|
357
|
-
# Example:
|
358
|
-
# 'über'.mb_chars.capitalize.to_s # => "Über"
|
359
|
-
def capitalize
|
360
|
-
(slice(0) || chars('')).upcase + (slice(1..-1) || chars('')).downcase
|
361
|
-
end
|
362
|
-
|
363
|
-
# Capitalizes the first letter of every word, when possible.
|
364
|
-
#
|
365
|
-
# Example:
|
366
|
-
# "ÉL QUE SE ENTERÓ".mb_chars.titleize # => "Él Que Se Enteró"
|
367
|
-
# "日本語".mb_chars.titleize # => "日本語"
|
368
|
-
def titleize
|
369
|
-
chars(downcase.to_s.gsub(/\b('?[\S])/u) { Unicode.apply_mapping $1, :uppercase_mapping })
|
370
|
-
end
|
371
|
-
alias_method :titlecase, :titleize
|
372
|
-
|
373
|
-
# Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
|
374
|
-
# passing strings to databases and validations.
|
375
|
-
#
|
376
|
-
# * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
|
377
|
-
# <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
|
378
|
-
# ActiveSupport::Multibyte::Unicode.default_normalization_form
|
379
|
-
def normalize(form = nil)
|
380
|
-
chars(Unicode.normalize(@wrapped_string, form))
|
381
|
-
end
|
382
|
-
|
383
|
-
# Performs canonical decomposition on all the characters.
|
384
|
-
#
|
385
|
-
# Example:
|
386
|
-
# 'é'.length # => 2
|
387
|
-
# 'é'.mb_chars.decompose.to_s.length # => 3
|
388
|
-
def decompose
|
389
|
-
chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*'))
|
390
|
-
end
|
391
|
-
|
392
|
-
# Performs composition on all the characters.
|
393
|
-
#
|
394
|
-
# Example:
|
395
|
-
# 'é'.length # => 3
|
396
|
-
# 'é'.mb_chars.compose.to_s.length # => 2
|
397
|
-
def compose
|
398
|
-
chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*'))
|
399
|
-
end
|
400
|
-
|
401
|
-
# Returns the number of grapheme clusters in the string.
|
402
|
-
#
|
403
|
-
# Example:
|
404
|
-
# 'क्षि'.mb_chars.length # => 4
|
405
|
-
# 'क्षि'.mb_chars.g_length # => 3
|
406
|
-
def g_length
|
407
|
-
Unicode.g_unpack(@wrapped_string).length
|
408
|
-
end
|
409
|
-
|
410
|
-
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
|
411
|
-
#
|
412
|
-
# Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
|
413
|
-
def tidy_bytes(force = false)
|
414
|
-
chars(Unicode.tidy_bytes(@wrapped_string, force))
|
415
|
-
end
|
416
|
-
|
417
|
-
%w(capitalize downcase lstrip reverse rstrip slice strip tidy_bytes upcase).each do |method|
|
418
|
-
# Only define a corresponding bang method for methods defined in the proxy; On 1.9 the proxy will
|
419
|
-
# exclude lstrip!, rstrip! and strip! because they are already work as expected on multibyte strings.
|
420
|
-
if public_method_defined?(method)
|
421
|
-
define_method("#{method}!") do |*args|
|
422
|
-
@wrapped_string = send(args.nil? ? method : method, *args).to_s
|
423
|
-
self
|
424
|
-
end
|
425
|
-
end
|
426
|
-
end
|
427
|
-
|
428
|
-
protected
|
429
|
-
|
430
|
-
def translate_offset(byte_offset) #:nodoc:
|
431
|
-
return nil if byte_offset.nil?
|
432
|
-
return 0 if @wrapped_string == ''
|
433
|
-
|
434
|
-
if @wrapped_string.respond_to?(:force_encoding)
|
435
|
-
@wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
|
436
|
-
end
|
437
|
-
|
438
|
-
begin
|
439
|
-
@wrapped_string[0...byte_offset].unpack('U*').length
|
440
|
-
rescue ArgumentError
|
441
|
-
byte_offset -= 1
|
442
|
-
retry
|
443
|
-
end
|
444
|
-
end
|
445
|
-
|
446
|
-
def justify(integer, way, padstr=' ') #:nodoc:
|
447
|
-
raise ArgumentError, "zero width padding" if padstr.length == 0
|
448
|
-
padsize = integer - size
|
449
|
-
padsize = padsize > 0 ? padsize : 0
|
450
|
-
case way
|
451
|
-
when :right
|
452
|
-
result = @wrapped_string.dup.insert(0, padding(padsize, padstr))
|
453
|
-
when :left
|
454
|
-
result = @wrapped_string.dup.insert(-1, padding(padsize, padstr))
|
455
|
-
when :center
|
456
|
-
lpad = padding((padsize / 2.0).floor, padstr)
|
457
|
-
rpad = padding((padsize / 2.0).ceil, padstr)
|
458
|
-
result = @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
|
459
|
-
end
|
460
|
-
chars(result)
|
461
|
-
end
|
462
|
-
|
463
|
-
def padding(padsize, padstr=' ') #:nodoc:
|
464
|
-
if padsize != 0
|
465
|
-
chars(padstr * ((padsize / Unicode.u_unpack(padstr).size) + 1)).slice(0, padsize)
|
466
|
-
else
|
467
|
-
''
|
468
|
-
end
|
469
|
-
end
|
470
|
-
|
471
|
-
def chars(string) #:nodoc:
|
472
|
-
self.class.new(string)
|
473
|
-
end
|
474
|
-
end
|
475
|
-
end
|
476
|
-
end
|