unicode_utils 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL.txt +37 -0
- data/README.txt +11 -22
- data/cdata/cond_tc_map +16 -0
- data/cdata/grapheme_break_property +1 -0
- data/cdata/simple_tc_map +1 -0
- data/cdata/special_tc_map +1 -0
- data/cdata/word_break_property +1 -0
- data/lib/unicode_utils.rb +31 -3
- data/lib/unicode_utils/canonical_decomposition.rb +27 -20
- data/lib/unicode_utils/canonical_equivalents_q.rb +3 -2
- data/lib/unicode_utils/casefold.rb +1 -0
- data/lib/unicode_utils/char_name.rb +3 -2
- data/lib/unicode_utils/combining_class.rb +4 -21
- data/lib/unicode_utils/compatibility_decomposition.rb +1 -0
- data/lib/unicode_utils/conditional_casing.rb +16 -18
- data/lib/unicode_utils/downcase.rb +10 -3
- data/lib/unicode_utils/each_grapheme.rb +85 -0
- data/lib/unicode_utils/each_word.rb +118 -0
- data/lib/unicode_utils/grep.rb +1 -0
- data/lib/unicode_utils/hangul_syllable_decomposition.rb +2 -1
- data/lib/unicode_utils/jamo_short_name.rb +2 -1
- data/lib/unicode_utils/nfc.rb +3 -6
- data/lib/unicode_utils/nfkc.rb +1 -0
- data/lib/unicode_utils/read_cdata.rb +49 -2
- data/lib/unicode_utils/simple_casefold.rb +1 -0
- data/lib/unicode_utils/simple_downcase.rb +5 -6
- data/lib/unicode_utils/simple_upcase.rb +5 -6
- data/lib/unicode_utils/titlecase.rb +70 -0
- data/lib/unicode_utils/upcase.rb +11 -4
- data/lib/unicode_utils/version.rb +1 -1
- data/test/test_unicode_utils.rb +46 -0
- metadata +13 -3
@@ -0,0 +1,118 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
|
5
|
+
module UnicodeUtils
|
6
|
+
|
7
|
+
# Maps codepoints to integer codes. For the integer code to property
|
8
|
+
# mapping, see #compile_word_break_property in data/compile.rb.
|
9
|
+
WORD_BREAK_MAP =
|
10
|
+
Impl.read_hexdigit_map("word_break_property") # :nodoc:
|
11
|
+
|
12
|
+
# Split +str+ along word boundaries according to Unicode's Default
|
13
|
+
# Word Boundary Specification, calling the given block with each
|
14
|
+
# word. Returns +str+, or an enumerator if no block is given.
|
15
|
+
#
|
16
|
+
# Example:
|
17
|
+
#
|
18
|
+
# require "unicode_utils/each_word"
|
19
|
+
# UnicodeUtils.each_word("Hello, world!").to_a => ["Hello", ",", " ", "world", "!"]
|
20
|
+
def each_word(str)
|
21
|
+
return enum_for(__method__, str) unless block_given?
|
22
|
+
cs = str.each_codepoint.map { |c| WORD_BREAK_MAP[c] }
|
23
|
+
cs << nil << nil # for negative indices
|
24
|
+
word = String.new.force_encoding(str.encoding)
|
25
|
+
i = 0
|
26
|
+
str.each_codepoint { |c|
|
27
|
+
word << c
|
28
|
+
if Impl.word_break?(cs, i) && !word.empty?
|
29
|
+
yield word
|
30
|
+
word = String.new.force_encoding(str.encoding)
|
31
|
+
end
|
32
|
+
i += 1
|
33
|
+
}
|
34
|
+
yield word unless word.empty?
|
35
|
+
str
|
36
|
+
end
|
37
|
+
module_function :each_word
|
38
|
+
|
39
|
+
module Impl # :nodoc:all
|
40
|
+
|
41
|
+
def self.word_break?(cs, i)
|
42
|
+
# wb3
|
43
|
+
cs_i = cs[i]
|
44
|
+
i1 = i + 1
|
45
|
+
cs_i1 = cs[i1]
|
46
|
+
if cs_i == 0x0 && cs_i1 == 0x1
|
47
|
+
return false
|
48
|
+
end
|
49
|
+
# wb3a
|
50
|
+
if cs_i == 0x2 || cs_i == 0x0 || cs_i == 0x1
|
51
|
+
return true
|
52
|
+
end
|
53
|
+
# wb3b
|
54
|
+
if cs_i1 == 0x2 || cs_i1 == 0x0 || cs_i1 == 0x1
|
55
|
+
return true
|
56
|
+
end
|
57
|
+
# wb5
|
58
|
+
i0 = i
|
59
|
+
# inline skip_l
|
60
|
+
c = nil
|
61
|
+
loop { c = cs[i0]; break unless c == 0x3 || c == 0x4; i0 -= 1 }
|
62
|
+
ci0 = c
|
63
|
+
if ci0 == 0x6 && cs_i1 == 0x6
|
64
|
+
return false
|
65
|
+
end
|
66
|
+
# wb6
|
67
|
+
i2 = i1 + 1
|
68
|
+
# inline skip_r
|
69
|
+
loop { c = cs[i2]; break unless c == 0x3 || c == 0x4; i2 += 1 }
|
70
|
+
if ci0 == 0x6 && (cs_i1 == 0x7 || cs_i1 == 0x9) && cs[i2] == 0x6
|
71
|
+
return false
|
72
|
+
end
|
73
|
+
# wb7
|
74
|
+
i_1 = i0 - 1
|
75
|
+
# inline skip_l
|
76
|
+
loop { c = cs[i_1]; break unless c == 0x3 || c == 0x4; i_1 -= 1 }
|
77
|
+
if cs[i_1] == 0x6 && (ci0 == 0x7 || ci0 == 0x9) && cs_i1 == 0x6
|
78
|
+
return false
|
79
|
+
end
|
80
|
+
# wb8
|
81
|
+
if ci0 == 0xA && cs_i1 == 0xA
|
82
|
+
return false
|
83
|
+
end
|
84
|
+
# wb9
|
85
|
+
if ci0 == 0x6 && cs_i1 == 0xA
|
86
|
+
return false
|
87
|
+
end
|
88
|
+
# wb10
|
89
|
+
if ci0 == 0xA && cs_i1 == 0x6
|
90
|
+
return false
|
91
|
+
end
|
92
|
+
# wb11
|
93
|
+
if cs[i_1] == 0xA && (ci0 == 0x8 || ci0 == 0x9) && cs_i1 == 0xA
|
94
|
+
return false
|
95
|
+
end
|
96
|
+
# wb12
|
97
|
+
if ci0 == 0xA && (cs_i1 == 0x8 || cs_i1 == 0x9) && cs[i2] == 0xA
|
98
|
+
return false
|
99
|
+
end
|
100
|
+
# wb13
|
101
|
+
if ci0 == 0x5 && cs_i1 == 0x5
|
102
|
+
return false
|
103
|
+
end
|
104
|
+
# wb13a
|
105
|
+
if (ci0 == 0x6 || ci0 == 0xA || ci0 == 0x5 || ci0 == 0xB) && cs_i1 == 0xB
|
106
|
+
return false
|
107
|
+
end
|
108
|
+
# wb13b
|
109
|
+
if ci0 == 0xB && (cs_i1 == 0x6 || cs_i1 == 0xA || cs_i1 == 0x5)
|
110
|
+
return false
|
111
|
+
end
|
112
|
+
# break unless next char is Extend/Format
|
113
|
+
cs_i1 != 0x3 && cs_i1 != 0x4
|
114
|
+
end
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
end
|
data/lib/unicode_utils/grep.rb
CHANGED
@@ -7,6 +7,7 @@ module UnicodeUtils
|
|
7
7
|
# Get an array of all Codepoint instances in Codepoint::RANGE whose
|
8
8
|
# name matches regexp. Matching is case insensitive.
|
9
9
|
#
|
10
|
+
# require "unicode_utils/grep"
|
10
11
|
# UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
|
11
12
|
def grep(regexp)
|
12
13
|
unless regexp.casefold?
|
@@ -6,7 +6,8 @@ module UnicodeUtils
|
|
6
6
|
#
|
7
7
|
# Example:
|
8
8
|
#
|
9
|
-
#
|
9
|
+
# require "unicode_utils/hangul_syllable_decomposition"
|
10
|
+
# UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
|
10
11
|
def hangul_syllable_decomposition(char)
|
11
12
|
String.new.force_encoding(char.encoding).tap do |str|
|
12
13
|
Impl.append_hangul_syllable_decomposition(str , char.ord)
|
@@ -11,7 +11,8 @@ module UnicodeUtils
|
|
11
11
|
#
|
12
12
|
# Example:
|
13
13
|
#
|
14
|
-
#
|
14
|
+
# require "unicode_utils/jamo_short_name"
|
15
|
+
# UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
|
15
16
|
def jamo_short_name(char)
|
16
17
|
JAMO_SHORT_NAME_MAP[char.ord]
|
17
18
|
end
|
data/lib/unicode_utils/nfc.rb
CHANGED
@@ -21,10 +21,6 @@ module UnicodeUtils
|
|
21
21
|
|
22
22
|
module NFC
|
23
23
|
|
24
|
-
def self.starter?(cp)
|
25
|
-
(COMBINING_CLASS_MAP[cp] || 0) == 0
|
26
|
-
end
|
27
|
-
|
28
24
|
# does b block c?
|
29
25
|
def self.blocked?(b, c)
|
30
26
|
# From the standard:
|
@@ -33,7 +29,7 @@ module UnicodeUtils
|
|
33
29
|
# at only the immediately preceding character."
|
34
30
|
# cpary is in canonical order (since it comes out of
|
35
31
|
# canonical_decomposition).
|
36
|
-
|
32
|
+
COMBINING_CLASS_MAP[b] >= COMBINING_CLASS_MAP[c]
|
37
33
|
end
|
38
34
|
|
39
35
|
def self.primary_composite?(cp)
|
@@ -64,7 +60,7 @@ module UnicodeUtils
|
|
64
60
|
last_starter = nil
|
65
61
|
uncomposable_non_starters = []
|
66
62
|
str.each_codepoint { |cp|
|
67
|
-
if
|
63
|
+
if COMBINING_CLASS_MAP[cp] == 0 # starter?
|
68
64
|
combined = false
|
69
65
|
if last_starter && uncomposable_non_starters.empty?
|
70
66
|
### hangul ###
|
@@ -135,6 +131,7 @@ module UnicodeUtils
|
|
135
131
|
#
|
136
132
|
# Example:
|
137
133
|
#
|
134
|
+
# require "unicode_utils/nfc"
|
138
135
|
# UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
|
139
136
|
def nfc(str)
|
140
137
|
str = UnicodeUtils.canonical_decomposition(str)
|
data/lib/unicode_utils/nfkc.rb
CHANGED
@@ -4,8 +4,7 @@ module UnicodeUtils
|
|
4
4
|
|
5
5
|
# Absolute path to the directory from which UnicodeUtils loads its
|
6
6
|
# compiled Unicode data files at runtime.
|
7
|
-
CDATA_DIR =
|
8
|
-
File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
|
7
|
+
CDATA_DIR = File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
|
9
8
|
|
10
9
|
module Impl # :nodoc:
|
11
10
|
|
@@ -66,6 +65,54 @@ module UnicodeUtils
|
|
66
65
|
}
|
67
66
|
end
|
68
67
|
|
68
|
+
def self.read_conditional_casings(filename)
|
69
|
+
Hash.new.tap { |cp_map|
|
70
|
+
open_cdata_file(filename) do |input|
|
71
|
+
input.each_line { |line|
|
72
|
+
line.chomp!
|
73
|
+
record = line.split(";")
|
74
|
+
cp = record[0].to_i(16)
|
75
|
+
mapping = record[1].split(",").map { |c| c.to_i(16) }
|
76
|
+
language_id = record[2].empty? ? nil : record[2].to_sym
|
77
|
+
context = record[3] && record[3].gsub('_', '')
|
78
|
+
casing = Impl.const_get("#{context}ConditionalCasing").new(mapping)
|
79
|
+
(cp_map[cp] ||= {})[language_id] = casing
|
80
|
+
}
|
81
|
+
end
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
def self.read_combining_class_map
|
86
|
+
Hash.new.tap { |map|
|
87
|
+
open_cdata_file("combining_class_map") do |input|
|
88
|
+
buffer = "x" * 6
|
89
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
90
|
+
cc_buffer = "x" * 2
|
91
|
+
cc_buffer.force_encoding(Encoding::US_ASCII)
|
92
|
+
while input.read(6, buffer)
|
93
|
+
map[buffer.to_i(16)] = input.read(2, cc_buffer).to_i(16)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
}
|
97
|
+
end
|
98
|
+
|
99
|
+
# Read a map whose keys are codepoints (6 hexdigits, converted to
|
100
|
+
# integer) and whose values are single hexdigits (converted to
|
101
|
+
# integer).
|
102
|
+
def self.read_hexdigit_map(filename)
|
103
|
+
Hash.new.tap { |map|
|
104
|
+
open_cdata_file(filename) do |input|
|
105
|
+
buffer = "x" * 6
|
106
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
107
|
+
val_buffer = "x"
|
108
|
+
val_buffer.force_encoding(Encoding::US_ASCII)
|
109
|
+
while input.read(6, buffer)
|
110
|
+
map[buffer.to_i(16)] = input.read(1, val_buffer).to_i(16)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
}
|
114
|
+
end
|
115
|
+
|
69
116
|
end
|
70
117
|
|
71
118
|
end
|
@@ -16,6 +16,7 @@ module UnicodeUtils
|
|
16
16
|
#
|
17
17
|
# Examples:
|
18
18
|
#
|
19
|
+
# require "unicode_utils/simple_casefold"
|
19
20
|
# UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
|
20
21
|
# UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
|
21
22
|
#
|
@@ -7,17 +7,16 @@ module UnicodeUtils
|
|
7
7
|
SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
|
8
8
|
|
9
9
|
# Map each codepoint in +str+ that has a single codepoint
|
10
|
-
# lowercase-mapping to that lowercase mapping.
|
11
|
-
#
|
12
|
-
# returned string has the same encoding and same length as the
|
13
|
-
# original string.
|
10
|
+
# lowercase-mapping to that lowercase mapping. The returned string
|
11
|
+
# has the same length as the original string.
|
14
12
|
#
|
15
13
|
# This function is locale independent.
|
16
14
|
#
|
17
15
|
# Examples:
|
18
16
|
#
|
19
|
-
#
|
20
|
-
#
|
17
|
+
# require "unicode_utils/simple_downcase"
|
18
|
+
# UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
|
19
|
+
# UnicodeUtils.simple_downcase("STRASSE") => "strasse"
|
21
20
|
def simple_downcase(str)
|
22
21
|
String.new.force_encoding(str.encoding).tap { |res|
|
23
22
|
str.each_codepoint { |cp|
|
@@ -7,17 +7,16 @@ module UnicodeUtils
|
|
7
7
|
SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
|
8
8
|
|
9
9
|
# Map each codepoint in +str+ that has a single codepoint
|
10
|
-
# uppercase-mapping to that uppercase mapping.
|
11
|
-
#
|
12
|
-
# returned string has the same encoding and same length as the
|
13
|
-
# original string.
|
10
|
+
# uppercase-mapping to that uppercase mapping. The returned string
|
11
|
+
# has the same length as the original string.
|
14
12
|
#
|
15
13
|
# This function is locale independent.
|
16
14
|
#
|
17
15
|
# Examples:
|
18
16
|
#
|
19
|
-
#
|
20
|
-
#
|
17
|
+
# require "unicode_utils/simple_upcase"
|
18
|
+
# UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
|
19
|
+
# UnicodeUtils.simple_upcase("weiß") => "WEIß"
|
21
20
|
def simple_upcase(str)
|
22
21
|
String.new.force_encoding(str.encoding).tap { |res|
|
23
22
|
str.each_codepoint { |cp|
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
require "unicode_utils/conditional_casing"
|
5
|
+
require "unicode_utils/each_word"
|
6
|
+
require "unicode_utils/cased_char_q"
|
7
|
+
require "unicode_utils/downcase"
|
8
|
+
|
9
|
+
module UnicodeUtils
|
10
|
+
|
11
|
+
SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
|
12
|
+
SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
|
13
|
+
|
14
|
+
# Convert the first cased character after each word boundary to
|
15
|
+
# titlecase and all other cased characters to lowercase. For many,
|
16
|
+
# but not all characters, the titlecase mapping is the same as the
|
17
|
+
# uppercase mapping.
|
18
|
+
#
|
19
|
+
# Some conversion rules are language dependent; these are in effect
|
20
|
+
# when a non-nil +language_id+ is given. If non-nil, the
|
21
|
+
# +language_id+ must be a two letter language code as defined in BCP
|
22
|
+
# 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
|
23
|
+
# language doesn't have a two letter code, the three letter code is
|
24
|
+
# to be used. If locale independent behaviour is required, +nil+
|
25
|
+
# should be passed explicitly, because a later version of
|
26
|
+
# UnicodeUtils may default to something else.
|
27
|
+
#
|
28
|
+
# Example:
|
29
|
+
#
|
30
|
+
# require "unicode_utils/titlecase"
|
31
|
+
# UnicodeUtils.titlecase("hello, world!") => "Hello, World!"
|
32
|
+
def titlecase(str, language_id = nil)
|
33
|
+
String.new.force_encoding(str.encoding).tap do |res|
|
34
|
+
# ensure O(1) lookup by index
|
35
|
+
str = str.encode(Encoding::UTF_32LE)
|
36
|
+
i = 0
|
37
|
+
each_word(str) { |word|
|
38
|
+
cased_char_found = false
|
39
|
+
word.each_codepoint { |cp|
|
40
|
+
cased = cased_char?(cp)
|
41
|
+
if !cased_char_found && cased
|
42
|
+
cased_char_found = true
|
43
|
+
special_mapping =
|
44
|
+
Impl.conditional_titlecase_mapping(cp, str, i, language_id) ||
|
45
|
+
SPECIAL_TITLECASE_MAP[cp]
|
46
|
+
if special_mapping
|
47
|
+
special_mapping.each { |m| res << m }
|
48
|
+
else
|
49
|
+
res << (SIMPLE_TITLECASE_MAP[cp] || cp)
|
50
|
+
end
|
51
|
+
elsif cased
|
52
|
+
special_mapping =
|
53
|
+
Impl.conditional_downcase_mapping(cp, str, i, language_id) ||
|
54
|
+
SPECIAL_DOWNCASE_MAP[cp]
|
55
|
+
if special_mapping
|
56
|
+
special_mapping.each { |m| res << m }
|
57
|
+
else
|
58
|
+
res << (SIMPLE_DOWNCASE_MAP[cp] || cp)
|
59
|
+
end
|
60
|
+
else
|
61
|
+
res << cp
|
62
|
+
end
|
63
|
+
i += 1
|
64
|
+
}
|
65
|
+
}
|
66
|
+
end
|
67
|
+
end
|
68
|
+
module_function :titlecase
|
69
|
+
|
70
|
+
end
|
data/lib/unicode_utils/upcase.rb
CHANGED
@@ -16,15 +16,22 @@ module UnicodeUtils
|
|
16
16
|
# +language_id+ must be a two letter language code as defined in BCP
|
17
17
|
# 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
|
18
18
|
# language doesn't have a two letter code, the three letter code is
|
19
|
-
# to be used.
|
19
|
+
# to be used. If locale independent behaviour is required, +nil+
|
20
|
+
# should be passed explicitly, because a later version of
|
21
|
+
# UnicodeUtils may default to something else.
|
20
22
|
#
|
21
23
|
# Examples:
|
22
24
|
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
25
|
+
# require "unicode_utils/upcase"
|
26
|
+
# UnicodeUtils.upcase("weiß") => "WEISS"
|
27
|
+
# UnicodeUtils.upcase("i", :en) => "I"
|
28
|
+
# UnicodeUtils.upcase("i", :tr) => "İ"
|
26
29
|
def upcase(str, language_id = nil)
|
27
30
|
String.new.force_encoding(str.encoding).tap { |res|
|
31
|
+
if Impl::LANGS_WITH_RULES.include?(language_id)
|
32
|
+
# ensure O(1) lookup by index
|
33
|
+
str = str.encode(Encoding::UTF_32LE)
|
34
|
+
end
|
28
35
|
pos = 0
|
29
36
|
str.each_codepoint { |cp|
|
30
37
|
special_mapping =
|
data/test/test_unicode_utils.rb
CHANGED
@@ -177,4 +177,50 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
177
177
|
UnicodeUtils.casefold("weiß")
|
178
178
|
end
|
179
179
|
|
180
|
+
def test_each_grapheme
|
181
|
+
graphemes = []
|
182
|
+
UnicodeUtils.each_grapheme("word") { |g| graphemes << g }
|
183
|
+
assert_equal ["w", "o", "r", "d"], graphemes
|
184
|
+
UnicodeUtils.each_grapheme("") { |g| flunk }
|
185
|
+
graphemes = []
|
186
|
+
UnicodeUtils.each_grapheme("u\u{308}mit") { |g| graphemes << g }
|
187
|
+
# diaeresis
|
188
|
+
assert_equal ["u\u{308}", "m", "i", "t"], graphemes
|
189
|
+
# hangul syllable
|
190
|
+
graphemes = []
|
191
|
+
UnicodeUtils.each_grapheme("\u{1111}\u{1171}\u{11b6}\u{d4db}") { |g| graphemes << g }
|
192
|
+
assert_equal ["\u{1111}\u{1171}\u{11b6}", "\u{d4db}"], graphemes
|
193
|
+
assert_equal ["a", "\r\n", "b"], UnicodeUtils.each_grapheme("a\r\nb").to_a
|
194
|
+
end
|
195
|
+
|
196
|
+
def test_each_word
|
197
|
+
words = []
|
198
|
+
UnicodeUtils.each_word("two words") { |w| words << w }
|
199
|
+
assert_equal ["two", " ", "words"], words
|
200
|
+
assert_equal ["a", " ", "b"], UnicodeUtils.each_word("a b").to_a
|
201
|
+
assert_equal [" ", "b"], UnicodeUtils.each_word(" b").to_a
|
202
|
+
assert_equal ["a", " "], UnicodeUtils.each_word("a ").to_a
|
203
|
+
assert_equal [" "], UnicodeUtils.each_word(" ").to_a
|
204
|
+
assert_equal ["a"], UnicodeUtils.each_word("a").to_a
|
205
|
+
assert_equal [], UnicodeUtils.each_word("").to_a
|
206
|
+
assert_equal ["Hello", ",", " ", "world", "!"],
|
207
|
+
UnicodeUtils.each_word("Hello, world!").to_a
|
208
|
+
assert_equal ["o\u{308}12"],
|
209
|
+
UnicodeUtils.each_word("o\u{308}12").to_a
|
210
|
+
assert_equal ["o\u{308}1"],
|
211
|
+
UnicodeUtils.each_word("o\u{308}1").to_a
|
212
|
+
assert_equal ["o\u{308}"],
|
213
|
+
UnicodeUtils.each_word("o\u{308}").to_a
|
214
|
+
assert_equal ["\u{308}", "o"],
|
215
|
+
UnicodeUtils.each_word("\u{308}o").to_a
|
216
|
+
end
|
217
|
+
|
218
|
+
def test_titlecase
|
219
|
+
assert_equal "Hello, World!", UnicodeUtils.titlecase("heLlo, world!")
|
220
|
+
assert_equal "Find", UnicodeUtils.titlecase("finD")
|
221
|
+
assert_equal "Ümit Huber Jandl", UnicodeUtils.titlecase("ümit huber jandl")
|
222
|
+
assert_equal "İ Can Has 1Kg Cheesburger",
|
223
|
+
UnicodeUtils.titlecase("i can has 1kg CHEESBURGER", :tr)
|
224
|
+
end
|
225
|
+
|
180
226
|
end
|