unicode_utils 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL.txt +37 -0
- data/README.txt +11 -22
- data/cdata/cond_tc_map +16 -0
- data/cdata/grapheme_break_property +1 -0
- data/cdata/simple_tc_map +1 -0
- data/cdata/special_tc_map +1 -0
- data/cdata/word_break_property +1 -0
- data/lib/unicode_utils.rb +31 -3
- data/lib/unicode_utils/canonical_decomposition.rb +27 -20
- data/lib/unicode_utils/canonical_equivalents_q.rb +3 -2
- data/lib/unicode_utils/casefold.rb +1 -0
- data/lib/unicode_utils/char_name.rb +3 -2
- data/lib/unicode_utils/combining_class.rb +4 -21
- data/lib/unicode_utils/compatibility_decomposition.rb +1 -0
- data/lib/unicode_utils/conditional_casing.rb +16 -18
- data/lib/unicode_utils/downcase.rb +10 -3
- data/lib/unicode_utils/each_grapheme.rb +85 -0
- data/lib/unicode_utils/each_word.rb +118 -0
- data/lib/unicode_utils/grep.rb +1 -0
- data/lib/unicode_utils/hangul_syllable_decomposition.rb +2 -1
- data/lib/unicode_utils/jamo_short_name.rb +2 -1
- data/lib/unicode_utils/nfc.rb +3 -6
- data/lib/unicode_utils/nfkc.rb +1 -0
- data/lib/unicode_utils/read_cdata.rb +49 -2
- data/lib/unicode_utils/simple_casefold.rb +1 -0
- data/lib/unicode_utils/simple_downcase.rb +5 -6
- data/lib/unicode_utils/simple_upcase.rb +5 -6
- data/lib/unicode_utils/titlecase.rb +70 -0
- data/lib/unicode_utils/upcase.rb +11 -4
- data/lib/unicode_utils/version.rb +1 -1
- data/test/test_unicode_utils.rb +46 -0
- metadata +13 -3
@@ -0,0 +1,118 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
|
5
|
+
module UnicodeUtils
|
6
|
+
|
7
|
+
# Maps codepoints to integer codes. For the integer code to property
|
8
|
+
# mapping, see #compile_word_break_property in data/compile.rb.
|
9
|
+
WORD_BREAK_MAP =
|
10
|
+
Impl.read_hexdigit_map("word_break_property") # :nodoc:
|
11
|
+
|
12
|
+
# Split +str+ along word boundaries according to Unicode's Default
|
13
|
+
# Word Boundary Specification, calling the given block with each
|
14
|
+
# word. Returns +str+, or an enumerator if no block is given.
|
15
|
+
#
|
16
|
+
# Example:
|
17
|
+
#
|
18
|
+
# require "unicode_utils/each_word"
|
19
|
+
# UnicodeUtils.each_word("Hello, world!").to_a => ["Hello", ",", " ", "world", "!"]
|
20
|
+
def each_word(str)
|
21
|
+
return enum_for(__method__, str) unless block_given?
|
22
|
+
cs = str.each_codepoint.map { |c| WORD_BREAK_MAP[c] }
|
23
|
+
cs << nil << nil # for negative indices
|
24
|
+
word = String.new.force_encoding(str.encoding)
|
25
|
+
i = 0
|
26
|
+
str.each_codepoint { |c|
|
27
|
+
word << c
|
28
|
+
if Impl.word_break?(cs, i) && !word.empty?
|
29
|
+
yield word
|
30
|
+
word = String.new.force_encoding(str.encoding)
|
31
|
+
end
|
32
|
+
i += 1
|
33
|
+
}
|
34
|
+
yield word unless word.empty?
|
35
|
+
str
|
36
|
+
end
|
37
|
+
module_function :each_word
|
38
|
+
|
39
|
+
module Impl # :nodoc:all
|
40
|
+
|
41
|
+
def self.word_break?(cs, i)
|
42
|
+
# wb3
|
43
|
+
cs_i = cs[i]
|
44
|
+
i1 = i + 1
|
45
|
+
cs_i1 = cs[i1]
|
46
|
+
if cs_i == 0x0 && cs_i1 == 0x1
|
47
|
+
return false
|
48
|
+
end
|
49
|
+
# wb3a
|
50
|
+
if cs_i == 0x2 || cs_i == 0x0 || cs_i == 0x1
|
51
|
+
return true
|
52
|
+
end
|
53
|
+
# wb3b
|
54
|
+
if cs_i1 == 0x2 || cs_i1 == 0x0 || cs_i1 == 0x1
|
55
|
+
return true
|
56
|
+
end
|
57
|
+
# wb5
|
58
|
+
i0 = i
|
59
|
+
# inline skip_l
|
60
|
+
c = nil
|
61
|
+
loop { c = cs[i0]; break unless c == 0x3 || c == 0x4; i0 -= 1 }
|
62
|
+
ci0 = c
|
63
|
+
if ci0 == 0x6 && cs_i1 == 0x6
|
64
|
+
return false
|
65
|
+
end
|
66
|
+
# wb6
|
67
|
+
i2 = i1 + 1
|
68
|
+
# inline skip_r
|
69
|
+
loop { c = cs[i2]; break unless c == 0x3 || c == 0x4; i2 += 1 }
|
70
|
+
if ci0 == 0x6 && (cs_i1 == 0x7 || cs_i1 == 0x9) && cs[i2] == 0x6
|
71
|
+
return false
|
72
|
+
end
|
73
|
+
# wb7
|
74
|
+
i_1 = i0 - 1
|
75
|
+
# inline skip_l
|
76
|
+
loop { c = cs[i_1]; break unless c == 0x3 || c == 0x4; i_1 -= 1 }
|
77
|
+
if cs[i_1] == 0x6 && (ci0 == 0x7 || ci0 == 0x9) && cs_i1 == 0x6
|
78
|
+
return false
|
79
|
+
end
|
80
|
+
# wb8
|
81
|
+
if ci0 == 0xA && cs_i1 == 0xA
|
82
|
+
return false
|
83
|
+
end
|
84
|
+
# wb9
|
85
|
+
if ci0 == 0x6 && cs_i1 == 0xA
|
86
|
+
return false
|
87
|
+
end
|
88
|
+
# wb10
|
89
|
+
if ci0 == 0xA && cs_i1 == 0x6
|
90
|
+
return false
|
91
|
+
end
|
92
|
+
# wb11
|
93
|
+
if cs[i_1] == 0xA && (ci0 == 0x8 || ci0 == 0x9) && cs_i1 == 0xA
|
94
|
+
return false
|
95
|
+
end
|
96
|
+
# wb12
|
97
|
+
if ci0 == 0xA && (cs_i1 == 0x8 || cs_i1 == 0x9) && cs[i2] == 0xA
|
98
|
+
return false
|
99
|
+
end
|
100
|
+
# wb13
|
101
|
+
if ci0 == 0x5 && cs_i1 == 0x5
|
102
|
+
return false
|
103
|
+
end
|
104
|
+
# wb13a
|
105
|
+
if (ci0 == 0x6 || ci0 == 0xA || ci0 == 0x5 || ci0 == 0xB) && cs_i1 == 0xB
|
106
|
+
return false
|
107
|
+
end
|
108
|
+
# wb13b
|
109
|
+
if ci0 == 0xB && (cs_i1 == 0x6 || cs_i1 == 0xA || cs_i1 == 0x5)
|
110
|
+
return false
|
111
|
+
end
|
112
|
+
# break unless next char is Extend/Format
|
113
|
+
cs_i1 != 0x3 && cs_i1 != 0x4
|
114
|
+
end
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
end
|
data/lib/unicode_utils/grep.rb
CHANGED
@@ -7,6 +7,7 @@ module UnicodeUtils
|
|
7
7
|
# Get an array of all Codepoint instances in Codepoint::RANGE whose
|
8
8
|
# name matches regexp. Matching is case insensitive.
|
9
9
|
#
|
10
|
+
# require "unicode_utils/grep"
|
10
11
|
# UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
|
11
12
|
def grep(regexp)
|
12
13
|
unless regexp.casefold?
|
@@ -6,7 +6,8 @@ module UnicodeUtils
|
|
6
6
|
#
|
7
7
|
# Example:
|
8
8
|
#
|
9
|
-
#
|
9
|
+
# require "unicode_utils/hangul_syllable_decomposition"
|
10
|
+
# UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
|
10
11
|
def hangul_syllable_decomposition(char)
|
11
12
|
String.new.force_encoding(char.encoding).tap do |str|
|
12
13
|
Impl.append_hangul_syllable_decomposition(str , char.ord)
|
@@ -11,7 +11,8 @@ module UnicodeUtils
|
|
11
11
|
#
|
12
12
|
# Example:
|
13
13
|
#
|
14
|
-
#
|
14
|
+
# require "unicode_utils/jamo_short_name"
|
15
|
+
# UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
|
15
16
|
def jamo_short_name(char)
|
16
17
|
JAMO_SHORT_NAME_MAP[char.ord]
|
17
18
|
end
|
data/lib/unicode_utils/nfc.rb
CHANGED
@@ -21,10 +21,6 @@ module UnicodeUtils
|
|
21
21
|
|
22
22
|
module NFC
|
23
23
|
|
24
|
-
def self.starter?(cp)
|
25
|
-
(COMBINING_CLASS_MAP[cp] || 0) == 0
|
26
|
-
end
|
27
|
-
|
28
24
|
# does b block c?
|
29
25
|
def self.blocked?(b, c)
|
30
26
|
# From the standard:
|
@@ -33,7 +29,7 @@ module UnicodeUtils
|
|
33
29
|
# at only the immediately preceding character."
|
34
30
|
# cpary is in canonical order (since it comes out of
|
35
31
|
# canonical_decomposition).
|
36
|
-
|
32
|
+
COMBINING_CLASS_MAP[b] >= COMBINING_CLASS_MAP[c]
|
37
33
|
end
|
38
34
|
|
39
35
|
def self.primary_composite?(cp)
|
@@ -64,7 +60,7 @@ module UnicodeUtils
|
|
64
60
|
last_starter = nil
|
65
61
|
uncomposable_non_starters = []
|
66
62
|
str.each_codepoint { |cp|
|
67
|
-
if
|
63
|
+
if COMBINING_CLASS_MAP[cp] == 0 # starter?
|
68
64
|
combined = false
|
69
65
|
if last_starter && uncomposable_non_starters.empty?
|
70
66
|
### hangul ###
|
@@ -135,6 +131,7 @@ module UnicodeUtils
|
|
135
131
|
#
|
136
132
|
# Example:
|
137
133
|
#
|
134
|
+
# require "unicode_utils/nfc"
|
138
135
|
# UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
|
139
136
|
def nfc(str)
|
140
137
|
str = UnicodeUtils.canonical_decomposition(str)
|
data/lib/unicode_utils/nfkc.rb
CHANGED
@@ -4,8 +4,7 @@ module UnicodeUtils
|
|
4
4
|
|
5
5
|
# Absolute path to the directory from which UnicodeUtils loads its
|
6
6
|
# compiled Unicode data files at runtime.
|
7
|
-
CDATA_DIR =
|
8
|
-
File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
|
7
|
+
CDATA_DIR = File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
|
9
8
|
|
10
9
|
module Impl # :nodoc:
|
11
10
|
|
@@ -66,6 +65,54 @@ module UnicodeUtils
|
|
66
65
|
}
|
67
66
|
end
|
68
67
|
|
68
|
+
def self.read_conditional_casings(filename)
|
69
|
+
Hash.new.tap { |cp_map|
|
70
|
+
open_cdata_file(filename) do |input|
|
71
|
+
input.each_line { |line|
|
72
|
+
line.chomp!
|
73
|
+
record = line.split(";")
|
74
|
+
cp = record[0].to_i(16)
|
75
|
+
mapping = record[1].split(",").map { |c| c.to_i(16) }
|
76
|
+
language_id = record[2].empty? ? nil : record[2].to_sym
|
77
|
+
context = record[3] && record[3].gsub('_', '')
|
78
|
+
casing = Impl.const_get("#{context}ConditionalCasing").new(mapping)
|
79
|
+
(cp_map[cp] ||= {})[language_id] = casing
|
80
|
+
}
|
81
|
+
end
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
def self.read_combining_class_map
|
86
|
+
Hash.new.tap { |map|
|
87
|
+
open_cdata_file("combining_class_map") do |input|
|
88
|
+
buffer = "x" * 6
|
89
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
90
|
+
cc_buffer = "x" * 2
|
91
|
+
cc_buffer.force_encoding(Encoding::US_ASCII)
|
92
|
+
while input.read(6, buffer)
|
93
|
+
map[buffer.to_i(16)] = input.read(2, cc_buffer).to_i(16)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
}
|
97
|
+
end
|
98
|
+
|
99
|
+
# Read a map whose keys are codepoints (6 hexdigits, converted to
|
100
|
+
# integer) and whose values are single hexdigits (converted to
|
101
|
+
# integer).
|
102
|
+
def self.read_hexdigit_map(filename)
|
103
|
+
Hash.new.tap { |map|
|
104
|
+
open_cdata_file(filename) do |input|
|
105
|
+
buffer = "x" * 6
|
106
|
+
buffer.force_encoding(Encoding::US_ASCII)
|
107
|
+
val_buffer = "x"
|
108
|
+
val_buffer.force_encoding(Encoding::US_ASCII)
|
109
|
+
while input.read(6, buffer)
|
110
|
+
map[buffer.to_i(16)] = input.read(1, val_buffer).to_i(16)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
}
|
114
|
+
end
|
115
|
+
|
69
116
|
end
|
70
117
|
|
71
118
|
end
|
@@ -16,6 +16,7 @@ module UnicodeUtils
|
|
16
16
|
#
|
17
17
|
# Examples:
|
18
18
|
#
|
19
|
+
# require "unicode_utils/simple_casefold"
|
19
20
|
# UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
|
20
21
|
# UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
|
21
22
|
#
|
@@ -7,17 +7,16 @@ module UnicodeUtils
|
|
7
7
|
SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
|
8
8
|
|
9
9
|
# Map each codepoint in +str+ that has a single codepoint
|
10
|
-
# lowercase-mapping to that lowercase mapping.
|
11
|
-
#
|
12
|
-
# returned string has the same encoding and same length as the
|
13
|
-
# original string.
|
10
|
+
# lowercase-mapping to that lowercase mapping. The returned string
|
11
|
+
# has the same length as the original string.
|
14
12
|
#
|
15
13
|
# This function is locale independent.
|
16
14
|
#
|
17
15
|
# Examples:
|
18
16
|
#
|
19
|
-
#
|
20
|
-
#
|
17
|
+
# require "unicode_utils/simple_downcase"
|
18
|
+
# UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
|
19
|
+
# UnicodeUtils.simple_downcase("STRASSE") => "strasse"
|
21
20
|
def simple_downcase(str)
|
22
21
|
String.new.force_encoding(str.encoding).tap { |res|
|
23
22
|
str.each_codepoint { |cp|
|
@@ -7,17 +7,16 @@ module UnicodeUtils
|
|
7
7
|
SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
|
8
8
|
|
9
9
|
# Map each codepoint in +str+ that has a single codepoint
|
10
|
-
# uppercase-mapping to that uppercase mapping.
|
11
|
-
#
|
12
|
-
# returned string has the same encoding and same length as the
|
13
|
-
# original string.
|
10
|
+
# uppercase-mapping to that uppercase mapping. The returned string
|
11
|
+
# has the same length as the original string.
|
14
12
|
#
|
15
13
|
# This function is locale independent.
|
16
14
|
#
|
17
15
|
# Examples:
|
18
16
|
#
|
19
|
-
#
|
20
|
-
#
|
17
|
+
# require "unicode_utils/simple_upcase"
|
18
|
+
# UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
|
19
|
+
# UnicodeUtils.simple_upcase("weiß") => "WEIß"
|
21
20
|
def simple_upcase(str)
|
22
21
|
String.new.force_encoding(str.encoding).tap { |res|
|
23
22
|
str.each_codepoint { |cp|
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_cdata"
|
4
|
+
require "unicode_utils/conditional_casing"
|
5
|
+
require "unicode_utils/each_word"
|
6
|
+
require "unicode_utils/cased_char_q"
|
7
|
+
require "unicode_utils/downcase"
|
8
|
+
|
9
|
+
module UnicodeUtils
|
10
|
+
|
11
|
+
SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
|
12
|
+
SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
|
13
|
+
|
14
|
+
# Convert the first cased character after each word boundary to
|
15
|
+
# titlecase and all other cased characters to lowercase. For many,
|
16
|
+
# but not all characters, the titlecase mapping is the same as the
|
17
|
+
# uppercase mapping.
|
18
|
+
#
|
19
|
+
# Some conversion rules are language dependent; these are in effect
|
20
|
+
# when a non-nil +language_id+ is given. If non-nil, the
|
21
|
+
# +language_id+ must be a two letter language code as defined in BCP
|
22
|
+
# 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
|
23
|
+
# language doesn't have a two letter code, the three letter code is
|
24
|
+
# to be used. If locale independent behaviour is required, +nil+
|
25
|
+
# should be passed explicitly, because a later version of
|
26
|
+
# UnicodeUtils may default to something else.
|
27
|
+
#
|
28
|
+
# Example:
|
29
|
+
#
|
30
|
+
# require "unicode_utils/titlecase"
|
31
|
+
# UnicodeUtils.titlecase("hello, world!") => "Hello, World!"
|
32
|
+
def titlecase(str, language_id = nil)
|
33
|
+
String.new.force_encoding(str.encoding).tap do |res|
|
34
|
+
# ensure O(1) lookup by index
|
35
|
+
str = str.encode(Encoding::UTF_32LE)
|
36
|
+
i = 0
|
37
|
+
each_word(str) { |word|
|
38
|
+
cased_char_found = false
|
39
|
+
word.each_codepoint { |cp|
|
40
|
+
cased = cased_char?(cp)
|
41
|
+
if !cased_char_found && cased
|
42
|
+
cased_char_found = true
|
43
|
+
special_mapping =
|
44
|
+
Impl.conditional_titlecase_mapping(cp, str, i, language_id) ||
|
45
|
+
SPECIAL_TITLECASE_MAP[cp]
|
46
|
+
if special_mapping
|
47
|
+
special_mapping.each { |m| res << m }
|
48
|
+
else
|
49
|
+
res << (SIMPLE_TITLECASE_MAP[cp] || cp)
|
50
|
+
end
|
51
|
+
elsif cased
|
52
|
+
special_mapping =
|
53
|
+
Impl.conditional_downcase_mapping(cp, str, i, language_id) ||
|
54
|
+
SPECIAL_DOWNCASE_MAP[cp]
|
55
|
+
if special_mapping
|
56
|
+
special_mapping.each { |m| res << m }
|
57
|
+
else
|
58
|
+
res << (SIMPLE_DOWNCASE_MAP[cp] || cp)
|
59
|
+
end
|
60
|
+
else
|
61
|
+
res << cp
|
62
|
+
end
|
63
|
+
i += 1
|
64
|
+
}
|
65
|
+
}
|
66
|
+
end
|
67
|
+
end
|
68
|
+
module_function :titlecase
|
69
|
+
|
70
|
+
end
|
data/lib/unicode_utils/upcase.rb
CHANGED
@@ -16,15 +16,22 @@ module UnicodeUtils
|
|
16
16
|
# +language_id+ must be a two letter language code as defined in BCP
|
17
17
|
# 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
|
18
18
|
# language doesn't have a two letter code, the three letter code is
|
19
|
-
# to be used.
|
19
|
+
# to be used. If locale independent behaviour is required, +nil+
|
20
|
+
# should be passed explicitly, because a later version of
|
21
|
+
# UnicodeUtils may default to something else.
|
20
22
|
#
|
21
23
|
# Examples:
|
22
24
|
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
25
|
+
# require "unicode_utils/upcase"
|
26
|
+
# UnicodeUtils.upcase("weiß") => "WEISS"
|
27
|
+
# UnicodeUtils.upcase("i", :en) => "I"
|
28
|
+
# UnicodeUtils.upcase("i", :tr) => "İ"
|
26
29
|
def upcase(str, language_id = nil)
|
27
30
|
String.new.force_encoding(str.encoding).tap { |res|
|
31
|
+
if Impl::LANGS_WITH_RULES.include?(language_id)
|
32
|
+
# ensure O(1) lookup by index
|
33
|
+
str = str.encode(Encoding::UTF_32LE)
|
34
|
+
end
|
28
35
|
pos = 0
|
29
36
|
str.each_codepoint { |cp|
|
30
37
|
special_mapping =
|
data/test/test_unicode_utils.rb
CHANGED
@@ -177,4 +177,50 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
177
177
|
UnicodeUtils.casefold("weiß")
|
178
178
|
end
|
179
179
|
|
180
|
+
def test_each_grapheme
|
181
|
+
graphemes = []
|
182
|
+
UnicodeUtils.each_grapheme("word") { |g| graphemes << g }
|
183
|
+
assert_equal ["w", "o", "r", "d"], graphemes
|
184
|
+
UnicodeUtils.each_grapheme("") { |g| flunk }
|
185
|
+
graphemes = []
|
186
|
+
UnicodeUtils.each_grapheme("u\u{308}mit") { |g| graphemes << g }
|
187
|
+
# diaeresis
|
188
|
+
assert_equal ["u\u{308}", "m", "i", "t"], graphemes
|
189
|
+
# hangul syllable
|
190
|
+
graphemes = []
|
191
|
+
UnicodeUtils.each_grapheme("\u{1111}\u{1171}\u{11b6}\u{d4db}") { |g| graphemes << g }
|
192
|
+
assert_equal ["\u{1111}\u{1171}\u{11b6}", "\u{d4db}"], graphemes
|
193
|
+
assert_equal ["a", "\r\n", "b"], UnicodeUtils.each_grapheme("a\r\nb").to_a
|
194
|
+
end
|
195
|
+
|
196
|
+
def test_each_word
|
197
|
+
words = []
|
198
|
+
UnicodeUtils.each_word("two words") { |w| words << w }
|
199
|
+
assert_equal ["two", " ", "words"], words
|
200
|
+
assert_equal ["a", " ", "b"], UnicodeUtils.each_word("a b").to_a
|
201
|
+
assert_equal [" ", "b"], UnicodeUtils.each_word(" b").to_a
|
202
|
+
assert_equal ["a", " "], UnicodeUtils.each_word("a ").to_a
|
203
|
+
assert_equal [" "], UnicodeUtils.each_word(" ").to_a
|
204
|
+
assert_equal ["a"], UnicodeUtils.each_word("a").to_a
|
205
|
+
assert_equal [], UnicodeUtils.each_word("").to_a
|
206
|
+
assert_equal ["Hello", ",", " ", "world", "!"],
|
207
|
+
UnicodeUtils.each_word("Hello, world!").to_a
|
208
|
+
assert_equal ["o\u{308}12"],
|
209
|
+
UnicodeUtils.each_word("o\u{308}12").to_a
|
210
|
+
assert_equal ["o\u{308}1"],
|
211
|
+
UnicodeUtils.each_word("o\u{308}1").to_a
|
212
|
+
assert_equal ["o\u{308}"],
|
213
|
+
UnicodeUtils.each_word("o\u{308}").to_a
|
214
|
+
assert_equal ["\u{308}", "o"],
|
215
|
+
UnicodeUtils.each_word("\u{308}o").to_a
|
216
|
+
end
|
217
|
+
|
218
|
+
def test_titlecase
|
219
|
+
assert_equal "Hello, World!", UnicodeUtils.titlecase("heLlo, world!")
|
220
|
+
assert_equal "Find", UnicodeUtils.titlecase("finD")
|
221
|
+
assert_equal "Ümit Huber Jandl", UnicodeUtils.titlecase("ümit huber jandl")
|
222
|
+
assert_equal "İ Can Has 1Kg Cheesburger",
|
223
|
+
UnicodeUtils.titlecase("i can has 1kg CHEESBURGER", :tr)
|
224
|
+
end
|
225
|
+
|
180
226
|
end
|