unicode_utils 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/cdata/canonical_decomposition_map +1 -0
- data/cdata/composition_exclusion_set +1 -0
- data/cdata/jamo_short_names +66 -0
- data/cdata/names +0 -20
- data/lib/unicode_utils/canonical_decomposition.rb +86 -0
- data/lib/unicode_utils/canonical_equivalents_q.rb +20 -0
- data/lib/unicode_utils/combining_class.rb +1 -1
- data/lib/unicode_utils/conditional_casing.rb +8 -8
- data/lib/unicode_utils/downcase.rb +2 -2
- data/lib/unicode_utils/hangul_syllable_decomposition.rb +44 -0
- data/lib/unicode_utils/jamo_short_name.rb +20 -0
- data/lib/unicode_utils/name.rb +26 -21
- data/lib/unicode_utils/nfc.rb +142 -0
- data/lib/unicode_utils/nfd.rb +15 -0
- data/lib/unicode_utils/{read_special_casing_map.rb → read_multivalued_map.rb} +1 -1
- data/lib/unicode_utils/read_names.rb +22 -0
- data/lib/unicode_utils/upcase.rb +2 -2
- data/lib/unicode_utils/version.rb +1 -1
- data/lib/unicode_utils.rb +18 -0
- data/test/test_normalization.rb +75 -0
- data/test/test_unicode_utils.rb +34 -0
- metadata +16 -4
@@ -0,0 +1 @@
|
|
1
|
+

|
@@ -0,0 +1 @@
|
|
1
|
+
00034000034100034300034400037400037e00038700095800095900095a00095b00095c00095d00095e00095f0009dc0009dd0009df000a33000a36000a59000a5a000a5b000a5e000b5c000b5d000f43000f4d000f52000f57000f5c000f69000f73000f75000f76000f78000f81000f93000f9d000fa2000fa7000fac000fb9001f71001f73001f75001f77001f79001f7b001f7d001fbb001fbe001fc9001fcb001fd3001fdb001fe3001feb001fee001fef001ff9001ffb001ffd00200000200100212600212a00212b00232900232a002adc00f90000f90100f90200f90300f90400f90500f90600f90700f90800f90900f90a00f90b00f90c00f90d00f90e00f90f00f91000f91100f91200f91300f91400f91500f91600f91700f91800f91900f91a00f91b00f91c00f91d00f91e00f91f00f92000f92100f92200f92300f92400f92500f92600f92700f92800f92900f92a00f92b00f92c00f92d00f92e00f92f00f93000f93100f93200f93300f93400f93500f93600f93700f93800f93900f93a00f93b00f93c00f93d00f93e00f93f00f94000f94100f94200f94300f94400f94500f94600f94700f94800f94900f94a00f94b00f94c00f94d00f94e00f94f00f95000f95100f95200f95300f95400f95500f95600f95700f95800f95900f95a00f95b00f95c00f95d00f95e00f95f00f96000f96100f96200f96300f96400f96500f96600f96700f96800f96900f96a00f96b00f96c00f96d00f96e00f96f00f97000f97100f97200f97300f97400f97500f97600f97700f97800f97900f97a00f97b00f97c00f97d00f97e00f97f00f98000f98100f98200f98300f98400f98500f98600f98700f98800f98900f98a00f98b00f98c00f98d00f98e00f98f00f99000f99100f99200f99300f99400f99500f99600f99700f99800f99900f99a00f99b00f99c00f99d00f99e00f99f00f9a000f9a100f9a200f9a300f9a400f9a500f9a600f9a700f9a800f9a900f9aa00f9ab00f9ac00f9ad00f9ae00f9af00f9b000f9b100f9b200f9b300f9b400f9b500f9b600f9b700f9b800f9b900f9ba00f9bb00f9bc00f9bd00f9be00f9bf00f9c000f9c100f9c200f9c300f9c400f9c500f9c600f9c700f9c800f9c900f9ca00f9cb00f9cc00f9cd00f9ce00f9cf00f9d000f9d100f9d200f9d300f9d400f9d500f9d600f9d700f9d800f9d900f9da00f9db00f9dc00f9dd00f9de00f9df00f9e000f9e100f9e200f9e300f9e400f9e500f9e600f9e700f9e800f9e900f9ea00f9eb00f9ec00f9ed00f9ee00f9ef00f9f000f9f100f9f200f9f300f9f400f9f500f9f600f9f700f9f800f9f900f9fa00f9fb00f9fc00f9fd00f9fe00f9ff00fa0000fa0100fa0200fa0300fa0400fa0500
fa0600fa0700fa0800fa0900fa0a00fa0b00fa0c00fa0d00fa1000fa1200fa1500fa1600fa1700fa1800fa1900fa1a00fa1b00fa1c00fa1d00fa1e00fa2000fa2200fa2500fa2600fa2a00fa2b00fa2c00fa2d00fa3000fa3100fa3200fa3300fa3400fa3500fa3600fa3700fa3800fa3900fa3a00fa3b00fa3c00fa3d00fa3e00fa3f00fa4000fa4100fa4200fa4300fa4400fa4500fa4600fa4700fa4800fa4900fa4a00fa4b00fa4c00fa4d00fa4e00fa4f00fa5000fa5100fa5200fa5300fa5400fa5500fa5600fa5700fa5800fa5900fa5a00fa5b00fa5c00fa5d00fa5e00fa5f00fa6000fa6100fa6200fa6300fa6400fa6500fa6600fa6700fa6800fa6900fa6a00fa7000fa7100fa7200fa7300fa7400fa7500fa7600fa7700fa7800fa7900fa7a00fa7b00fa7c00fa7d00fa7e00fa7f00fa8000fa8100fa8200fa8300fa8400fa8500fa8600fa8700fa8800fa8900fa8a00fa8b00fa8c00fa8d00fa8e00fa8f00fa9000fa9100fa9200fa9300fa9400fa9500fa9600fa9700fa9800fa9900fa9a00fa9b00fa9c00fa9d00fa9e00fa9f00faa000faa100faa200faa300faa400faa500faa600faa700faa800faa900faaa00faab00faac00faad00faae00faaf00fab000fab100fab200fab300fab400fab500fab600fab700fab800fab900faba00fabb00fabc00fabd00fabe00fabf00fac000fac100fac200fac300fac400fac500fac600fac700fac800fac900faca00facb00facc00facd00face00facf00fad000fad100fad200fad300fad400fad500fad600fad700fad800fad900fb1d00fb1f00fb2a00fb2b00fb2c00fb2d00fb2e00fb2f00fb3000fb3100fb3200fb3300fb3400fb3500fb3600fb3800fb3900fb3a00fb3b00fb3c00fb3e00fb4000fb4100fb4300fb4400fb4600fb4700fb4800fb4900fb4a00fb4b00fb4c00fb4d00fb4e01d15e01d15f01d16001d16101d16201d16301d16401d1bb01d1bc01d1bd01d1be01d1bf01d1c002f80002f80102f80202f80302f80402f80502f80602f80702f80802f80902f80a02f80b02f80c02f80d02f80e02f80f02f81002f81102f81202f81302f81402f81502f81602f81702f81802f81902f81a02f81b02f81c02f81d02f81e02f81f02f82002f82102f82202f82302f82402f82502f82602f82702f82802f82902f82a02f82b02f82c02f82d02f82e02f82f02f83002f83102f83202f83302f83402f83502f83602f83702f83802f83902f83a02f83b02f83c02f83d02f83e02f83f02f84002f84102f84202f84302f84402f84502f84602f84702f84802f84902f84a02f84b02f84c02f84d02f84e02f84f02f85002f85102f85202f85302f85402f85502f85602f85702f85802f85902f85a02f85b02f85c02f8
5d02f85e02f85f02f86002f86102f86202f86302f86402f86502f86602f86702f86802f86902f86a02f86b02f86c02f86d02f86e02f86f02f87002f87102f87202f87302f87402f87502f87602f87702f87802f87902f87a02f87b02f87c02f87d02f87e02f87f02f88002f88102f88202f88302f88402f88502f88602f88702f88802f88902f88a02f88b02f88c02f88d02f88e02f88f02f89002f89102f89202f89302f89402f89502f89602f89702f89802f89902f89a02f89b02f89c02f89d02f89e02f89f02f8a002f8a102f8a202f8a302f8a402f8a502f8a602f8a702f8a802f8a902f8aa02f8ab02f8ac02f8ad02f8ae02f8af02f8b002f8b102f8b202f8b302f8b402f8b502f8b602f8b702f8b802f8b902f8ba02f8bb02f8bc02f8bd02f8be02f8bf02f8c002f8c102f8c202f8c302f8c402f8c502f8c602f8c702f8c802f8c902f8ca02f8cb02f8cc02f8cd02f8ce02f8cf02f8d002f8d102f8d202f8d302f8d402f8d502f8d602f8d702f8d802f8d902f8da02f8db02f8dc02f8dd02f8de02f8df02f8e002f8e102f8e202f8e302f8e402f8e502f8e602f8e702f8e802f8e902f8ea02f8eb02f8ec02f8ed02f8ee02f8ef02f8f002f8f102f8f202f8f302f8f402f8f502f8f602f8f702f8f802f8f902f8fa02f8fb02f8fc02f8fd02f8fe02f8ff02f90002f90102f90202f90302f90402f90502f90602f90702f90802f90902f90a02f90b02f90c02f90d02f90e02f90f02f91002f91102f91202f91302f91402f91502f91602f91702f91802f91902f91a02f91b02f91c02f91d02f91e02f91f02f92002f92102f92202f92302f92402f92502f92602f92702f92802f92902f92a02f92b02f92c02f92d02f92e02f92f02f93002f93102f93202f93302f93402f93502f93602f93702f93802f93902f93a02f93b02f93c02f93d02f93e02f93f02f94002f94102f94202f94302f94402f94502f94602f94702f94802f94902f94a02f94b02f94c02f94d02f94e02f94f02f95002f95102f95202f95302f95402f95502f95602f95702f95802f95902f95a02f95b02f95c02f95d02f95e02f95f02f96002f96102f96202f96302f96402f96502f96602f96702f96802f96902f96a02f96b02f96c02f96d02f96e02f96f02f97002f97102f97202f97302f97402f97502f97602f97702f97802f97902f97a02f97b02f97c02f97d02f97e02f97f02f98002f98102f98202f98302f98402f98502f98602f98702f98802f98902f98a02f98b02f98c02f98d02f98e02f98f02f99002f99102f99202f99302f99402f99502f99602f99702f99802f99902f99a02f99b02f99c02f99d02f99e02f99f02f9a002f9a102f9a202f9a302f9a402f9a502f9a602f9a702f9a802f9a902f9aa
02f9ab02f9ac02f9ad02f9ae02f9af02f9b002f9b102f9b202f9b302f9b402f9b502f9b602f9b702f9b802f9b902f9ba02f9bb02f9bc02f9bd02f9be02f9bf02f9c002f9c102f9c202f9c302f9c402f9c502f9c602f9c702f9c802f9c902f9ca02f9cb02f9cc02f9cd02f9ce02f9cf02f9d002f9d102f9d202f9d302f9d402f9d502f9d602f9d702f9d802f9d902f9da02f9db02f9dc02f9dd02f9de02f9df02f9e002f9e102f9e202f9e302f9e402f9e502f9e602f9e702f9e802f9e902f9ea02f9eb02f9ec02f9ed02f9ee02f9ef02f9f002f9f102f9f202f9f302f9f402f9f502f9f602f9f702f9f802f9f902f9fa02f9fb02f9fc02f9fd02f9fe02f9ff02fa0002fa0102fa0202fa0302fa0402fa0502fa0602fa0702fa0802fa0902fa0a02fa0b02fa0c02fa0d02fa0e02fa0f02fa1002fa1102fa1202fa1302fa1402fa1502fa1602fa1702fa1802fa1902fa1a02fa1b02fa1c02fa1d
|
@@ -0,0 +1,66 @@
|
|
1
|
+
001100G
|
2
|
+
001101GG
|
3
|
+
001102N
|
4
|
+
001103D
|
5
|
+
001104DD
|
6
|
+
001105R
|
7
|
+
001106M
|
8
|
+
001107B
|
9
|
+
001108BB
|
10
|
+
001109S
|
11
|
+
00110aSS
|
12
|
+
00110cJ
|
13
|
+
00110dJJ
|
14
|
+
00110eC
|
15
|
+
00110fK
|
16
|
+
001110T
|
17
|
+
001111P
|
18
|
+
001112H
|
19
|
+
001161A
|
20
|
+
001162AE
|
21
|
+
001163YA
|
22
|
+
001164YAE
|
23
|
+
001165EO
|
24
|
+
001166E
|
25
|
+
001167YEO
|
26
|
+
001168YE
|
27
|
+
001169O
|
28
|
+
00116aWA
|
29
|
+
00116bWAE
|
30
|
+
00116cOE
|
31
|
+
00116dYO
|
32
|
+
00116eU
|
33
|
+
00116fWEO
|
34
|
+
001170WE
|
35
|
+
001171WI
|
36
|
+
001172YU
|
37
|
+
001173EU
|
38
|
+
001174YI
|
39
|
+
001175I
|
40
|
+
0011a8G
|
41
|
+
0011a9GG
|
42
|
+
0011aaGS
|
43
|
+
0011abN
|
44
|
+
0011acNJ
|
45
|
+
0011adNH
|
46
|
+
0011aeD
|
47
|
+
0011afL
|
48
|
+
0011b0LG
|
49
|
+
0011b1LM
|
50
|
+
0011b2LB
|
51
|
+
0011b3LS
|
52
|
+
0011b4LT
|
53
|
+
0011b5LP
|
54
|
+
0011b6LH
|
55
|
+
0011b7M
|
56
|
+
0011b8B
|
57
|
+
0011b9BS
|
58
|
+
0011baS
|
59
|
+
0011bbSS
|
60
|
+
0011bcNG
|
61
|
+
0011bdJ
|
62
|
+
0011beC
|
63
|
+
0011bfK
|
64
|
+
0011c0T
|
65
|
+
0011c1P
|
66
|
+
0011c2H
|
data/cdata/names
CHANGED
@@ -10969,8 +10969,6 @@
|
|
10969
10969
|
0033fdIDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY
|
10970
10970
|
0033feIDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
|
10971
10971
|
0033ffSQUARE GAL
|
10972
|
-
003400<CJK Ideograph Extension A, First>
|
10973
|
-
004db5<CJK Ideograph Extension A, Last>
|
10974
10972
|
004dc0HEXAGRAM FOR THE CREATIVE HEAVEN
|
10975
10973
|
004dc1HEXAGRAM FOR THE RECEPTIVE EARTH
|
10976
10974
|
004dc2HEXAGRAM FOR DIFFICULTY AT THE BEGINNING
|
@@ -11035,8 +11033,6 @@
|
|
11035
11033
|
004dfdHEXAGRAM FOR SMALL PREPONDERANCE
|
11036
11034
|
004dfeHEXAGRAM FOR AFTER COMPLETION
|
11037
11035
|
004dffHEXAGRAM FOR BEFORE COMPLETION
|
11038
|
-
004e00<CJK Ideograph, First>
|
11039
|
-
009fc3<CJK Ideograph, Last>
|
11040
11036
|
00a000YI SYLLABLE IT
|
11041
11037
|
00a001YI SYLLABLE IX
|
11042
11038
|
00a002YI SYLLABLE I
|
@@ -13130,16 +13126,6 @@
|
|
13130
13126
|
00aa5dCHAM PUNCTUATION DANDA
|
13131
13127
|
00aa5eCHAM PUNCTUATION DOUBLE DANDA
|
13132
13128
|
00aa5fCHAM PUNCTUATION TRIPLE DANDA
|
13133
|
-
00ac00<Hangul Syllable, First>
|
13134
|
-
00d7a3<Hangul Syllable, Last>
|
13135
|
-
00d800<Non Private Use High Surrogate, First>
|
13136
|
-
00db7f<Non Private Use High Surrogate, Last>
|
13137
|
-
00db80<Private Use High Surrogate, First>
|
13138
|
-
00dbff<Private Use High Surrogate, Last>
|
13139
|
-
00dc00<Low Surrogate, First>
|
13140
|
-
00dfff<Low Surrogate, Last>
|
13141
|
-
00e000<Private Use, First>
|
13142
|
-
00f8ff<Private Use, Last>
|
13143
13129
|
00f900CJK COMPATIBILITY IDEOGRAPH-F900
|
13144
13130
|
00f901CJK COMPATIBILITY IDEOGRAPH-F901
|
13145
13131
|
00f902CJK COMPATIBILITY IDEOGRAPH-F902
|
@@ -18449,8 +18435,6 @@
|
|
18449
18435
|
01f091DOMINO TILE VERTICAL-06-04
|
18450
18436
|
01f092DOMINO TILE VERTICAL-06-05
|
18451
18437
|
01f093DOMINO TILE VERTICAL-06-06
|
18452
|
-
020000<CJK Ideograph Extension B, First>
|
18453
|
-
02a6d6<CJK Ideograph Extension B, Last>
|
18454
18438
|
02f800CJK COMPATIBILITY IDEOGRAPH-2F800
|
18455
18439
|
02f801CJK COMPATIBILITY IDEOGRAPH-2F801
|
18456
18440
|
02f802CJK COMPATIBILITY IDEOGRAPH-2F802
|
@@ -19330,7 +19314,3 @@
|
|
19330
19314
|
0e01edVARIATION SELECTOR-254
|
19331
19315
|
0e01eeVARIATION SELECTOR-255
|
19332
19316
|
0e01efVARIATION SELECTOR-256
|
19333
|
-
0f0000<Plane 15 Private Use, First>
|
19334
|
-
0ffffd<Plane 15 Private Use, Last>
|
19335
|
-
100000<Plane 16 Private Use, First>
|
19336
|
-
10fffd<Plane 16 Private Use, Last>
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_multivalued_map"
|
4
|
+
require "unicode_utils/hangul_syllable_decomposition"
|
5
|
+
require "unicode_utils/combining_class"
|
6
|
+
|
7
|
+
module UnicodeUtils

  # Table of canonical decomposition mappings, loaded from the
  # "canonical_decomposition_map" data file. Looked up by codepoint; a
  # hit yields the codepoints of the mapping, a miss yields nil.
  CANONICAL_DECOMPOSITION_MAP =
    Impl.read_multivalued_map("canonical_decomposition_map") # :nodoc:

  # Get the canonical decomposition of the given string, also called
  # Normalization Form D or short NFD.
  #
  # The Unicode standard has multiple representations for some
  # characters. One representation as a single codepoint and other
  # representation(s) as a combination of multiple codepoints. This
  # function "decomposes" these characters in +str+ into the latter
  # representation and puts combining characters into canonical order.
  #
  # Returns a new string with the same encoding as +str+; +str+ itself
  # is not modified.
  #
  # Example:
  #
  #   # LATIN SMALL LETTER A WITH ACUTE => LATIN SMALL LETTER A, COMBINING ACUTE ACCENT
  #   UnicodeUtils.canonical_decomposition("\u{E1}") => "\u{61}\u{301}"
  def canonical_decomposition(str)
    res = String.new.force_encoding(str.encoding)
    str.each_codepoint { |cp|
      if cp >= 0xAC00 && cp <= 0xD7A3 # hangul syllable
        # Hangul syllables are decomposed arithmetically, not via the map.
        Impl.append_hangul_syllable_decomposition(res, cp)
      else
        Impl.append_recursive_canonical_decomposition_mapping(res, cp)
      end
    }
    Impl.put_into_canonical_order(res)
  end
  module_function :canonical_decomposition

  module Impl # :nodoc:

    # Append the full canonical decomposition of codepoint +cp+ to
    # +str+. Mappings are expanded recursively because the codepoints
    # of a mapping may have decomposition mappings themselves. A
    # codepoint without a mapping is appended unchanged.
    def self.append_recursive_canonical_decomposition_mapping(str, cp)
      mapping = CANONICAL_DECOMPOSITION_MAP[cp]
      if mapping
        mapping.each { |c|
          append_recursive_canonical_decomposition_mapping(str, c)
        }
      else
        str << cp
      end
    end

    # Return the codepoints of +str+ in canonical order: within every
    # maximal run of characters with non-zero combining class, the
    # characters are sorted by ascending combining class, keeping the
    # relative order of characters with equal class.
    #
    # Returns +str+ itself if it is already in canonical order,
    # otherwise a new string with the same encoding.
    #
    # This is done iteratively with a stable sort per run. The earlier
    # implementation performed one recursive call per bubble pass, so
    # recursion depth and running time grew with the length of a
    # mis-ordered run of combining characters.
    def self.put_into_canonical_order(str)
      # Fast path: look for an adjacent pair that is out of order.
      in_order = true
      prev_cc = 0
      str.each_codepoint { |cp|
        cc = COMBINING_CLASS_MAP[cp] || 0
        if cc != 0 && prev_cc > cc
          in_order = false
          break
        end
        prev_cc = cc
      }
      return str if in_order

      res = String.new.force_encoding(str.encoding)
      # Pending run of non-starters as [combining class, position in
      # run, codepoint]. The position makes the sort keys unique, which
      # turns Array#sort into a stable sort by combining class.
      run = []
      flush_run = lambda {
        run.sort.each { |entry| res << entry[2] }
        run.clear
      }
      str.each_codepoint { |cp|
        cc = COMBINING_CLASS_MAP[cp] || 0
        if cc == 0
          # A starter terminates the current run of combining characters.
          flush_run.call
          res << cp
        else
          run << [cc, run.length, cp]
        end
      }
      flush_run.call
      res
    end

  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/canonical_decomposition"
|
4
|
+
|
5
|
+
module UnicodeUtils

  # Test two strings for canonical equivalence.
  #
  # +a+ and +b+ are canonical equivalents if
  # UnicodeUtils.canonical_decomposition yields equal strings for
  # both of them.
  #
  # Example:
  #
  #   UnicodeUtils.canonical_equivalents?("Äste", "A\u{308}ste") => true
  #   UnicodeUtils.canonical_equivalents?("Äste", "Aste") => false
  def canonical_equivalents?(a, b)
    decomposed_a = UnicodeUtils.canonical_decomposition(a)
    decomposed_b = UnicodeUtils.canonical_decomposition(b)
    decomposed_a == decomposed_b
  end
  module_function :canonical_equivalents?

end
|
@@ -7,9 +7,9 @@ require "unicode_utils/combining_class"
|
|
7
7
|
|
8
8
|
module UnicodeUtils
|
9
9
|
|
10
|
-
module Impl # :nodoc:
|
10
|
+
module Impl # :nodoc:all
|
11
11
|
|
12
|
-
class ConditionalCasing
|
12
|
+
class ConditionalCasing
|
13
13
|
|
14
14
|
attr_reader :mapping
|
15
15
|
|
@@ -23,7 +23,7 @@ module UnicodeUtils
|
|
23
23
|
|
24
24
|
end
|
25
25
|
|
26
|
-
class BeforeDotConditionalCasing < ConditionalCasing
|
26
|
+
class BeforeDotConditionalCasing < ConditionalCasing
|
27
27
|
|
28
28
|
def context_match?(str, pos)
|
29
29
|
(pos + 1).upto(str.length - 1) { |i|
|
@@ -37,7 +37,7 @@ module UnicodeUtils
|
|
37
37
|
|
38
38
|
end
|
39
39
|
|
40
|
-
class NotBeforeDotConditionalCasing < BeforeDotConditionalCasing
|
40
|
+
class NotBeforeDotConditionalCasing < BeforeDotConditionalCasing
|
41
41
|
|
42
42
|
def context_match?(str, pos)
|
43
43
|
!super
|
@@ -45,7 +45,7 @@ module UnicodeUtils
|
|
45
45
|
|
46
46
|
end
|
47
47
|
|
48
|
-
class MoreAboveConditionalCasing < ConditionalCasing
|
48
|
+
class MoreAboveConditionalCasing < ConditionalCasing
|
49
49
|
|
50
50
|
def context_match?(str, pos)
|
51
51
|
(pos + 1).upto(str.length - 1) { |i|
|
@@ -59,7 +59,7 @@ module UnicodeUtils
|
|
59
59
|
|
60
60
|
end
|
61
61
|
|
62
|
-
class AfterIConditionalCasing < ConditionalCasing
|
62
|
+
class AfterIConditionalCasing < ConditionalCasing
|
63
63
|
|
64
64
|
def context_match?(str, pos)
|
65
65
|
(pos - 1).downto(0) { |i|
|
@@ -73,7 +73,7 @@ module UnicodeUtils
|
|
73
73
|
|
74
74
|
end
|
75
75
|
|
76
|
-
class AfterSoftDottedConditionalCasing < ConditionalCasing
|
76
|
+
class AfterSoftDottedConditionalCasing < ConditionalCasing
|
77
77
|
|
78
78
|
def context_match?(str, pos)
|
79
79
|
(pos - 1).downto(0) { |i|
|
@@ -87,7 +87,7 @@ module UnicodeUtils
|
|
87
87
|
|
88
88
|
end
|
89
89
|
|
90
|
-
class FinalSigmaConditionalCasing < ConditionalCasing
|
90
|
+
class FinalSigmaConditionalCasing < ConditionalCasing
|
91
91
|
|
92
92
|
def context_match?(str, pos)
|
93
93
|
before_match?(str, pos) && !after_match?(str, pos)
|
@@ -1,12 +1,12 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
require "unicode_utils/simple_downcase"
|
4
|
-
require "unicode_utils/
|
4
|
+
require "unicode_utils/read_multivalued_map"
|
5
5
|
require "unicode_utils/conditional_casing"
|
6
6
|
|
7
7
|
module UnicodeUtils
|
8
8
|
|
9
|
-
SPECIAL_DOWNCASE_MAP = Impl.
|
9
|
+
SPECIAL_DOWNCASE_MAP = Impl.read_multivalued_map("special_lc_map") # :nodoc:
|
10
10
|
|
11
11
|
# Perform a full case-conversion of +str+ to lowercase according to
|
12
12
|
# the Unicode standard.
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module UnicodeUtils

  # Derives the canonical decomposition of the given Hangul syllable.
  #
  # Only the first character of +char+ is considered. The result is a
  # new string with the same encoding as +char+. If the character is
  # not a precomposed Hangul syllable (U+AC00..U+D7A3), the result
  # contains just that character.
  #
  # Example:
  #
  #   UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
  #   UnicodeUtils.hangul_syllable_decomposition("\u{ac00}") => "\u{1100}\u{1161}"
  def hangul_syllable_decomposition(char)
    Impl.append_hangul_syllable_decomposition(
      String.new.force_encoding(char.encoding), char.ord)
  end
  module_function :hangul_syllable_decomposition

  module Impl # :nodoc:

    # Append the decomposition of the codepoint +s+ to +str+ and
    # return +str+.
    #
    # For a precomposed Hangul syllable this appends the leading
    # consonant, the vowel and - if the syllable has one - the
    # trailing consonant. Any other codepoint is appended unchanged.
    def self.append_hangul_syllable_decomposition(str, s)
      # constants
      sbase = 0xAC00
      lbase = 0x1100
      vbase = 0x1161
      tbase = 0x11A7
      scount = 11172
      lcount = 19
      vcount = 21
      tcount = 28
      ncount = vcount * tcount

      sindex = s - sbase
      if 0 <= sindex && sindex < scount
        l = lbase + sindex / ncount
        v = vbase + (sindex % ncount) / tcount
        t = tbase + sindex % tcount
        str << l << v
        # t == tbase means "no trailing consonant" (LV syllable).
        str << t if t != tbase
      else
        str << s
      end
      # Return str explicitly: the modifier-if above evaluates to nil
      # for LV syllables, which used to leak out as the return value.
      str
    end

  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/read_names"
|
4
|
+
|
5
|
+
module UnicodeUtils

  # Lookup table built by Impl.read_names from the "jamo_short_names"
  # data file; indexed by codepoint.
  JAMO_SHORT_NAME_MAP = Impl.read_names("jamo_short_names") # :nodoc:

  # Look up the Jamo Short Name property of the given character.
  # Characters without an entry in the table yield nil.
  #
  # Example:
  #
  #   UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
  def jamo_short_name(char)
    codepoint = char.ord
    JAMO_SHORT_NAME_MAP[codepoint]
  end
  module_function :jamo_short_name

end
|
data/lib/unicode_utils/name.rb
CHANGED
@@ -1,33 +1,38 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
require "unicode_utils/read_names"
|
4
|
+
require "unicode_utils/hangul_syllable_decomposition"
|
5
|
+
require "unicode_utils/jamo_short_name"
|
6
6
|
|
7
|
-
|
8
|
-
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", "names")
|
9
|
-
Hash.new.tap { |map|
|
10
|
-
File.open(path, "r:US-ASCII:-") do |input|
|
11
|
-
buffer = "x" * 6
|
12
|
-
buffer.force_encoding(Encoding::US_ASCII)
|
13
|
-
while input.read(6, buffer)
|
14
|
-
map[buffer.to_i(16)] = input.gets.tap { |x| x.chomp! }
|
15
|
-
end
|
16
|
-
end
|
17
|
-
}
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
7
|
+
module UnicodeUtils
|
21
8
|
|
22
|
-
NAME_MAP = Impl.read_names # :nodoc:
|
9
|
+
NAME_MAP = Impl.read_names("names") # :nodoc:
|
23
10
|
|
24
|
-
# Get the Unicode name of the
|
11
|
+
# Get the normative Unicode name of the given character.
|
12
|
+
#
|
13
|
+
# Private Use codepoints have no name, this function returns nil for
|
14
|
+
# such codepoints.
|
15
|
+
#
|
16
|
+
# All control characters have the special name "<control>". All
|
17
|
+
# other characters have a unique name.
|
25
18
|
#
|
26
19
|
# Example:
|
27
20
|
#
|
28
21
|
# UnicodeUtils.name "ᾀ" => "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"
|
29
|
-
|
30
|
-
|
22
|
+
# UnicodeUtils.name "\t" => "<control>"
|
23
|
+
def name(char)
  codepoint = char.ord

  # Names listed explicitly in the table take precedence.
  listed = NAME_MAP[codepoint]
  return listed if listed

  if (0x3400..0x4DB5).include?(codepoint) ||
     (0x4E00..0x9FC3).include?(codepoint) ||
     (0x20000..0x2A6D6).include?(codepoint)
    # Name derived from the codepoint, at least four uppercase hex digits.
    "CJK UNIFIED IDEOGRAPH-#{sprintf('%04X', codepoint)}"
  elsif (0xAC00..0xD7A3).include?(codepoint)
    # Name derived from the short names of the syllable's jamo.
    syllable_name = "HANGUL SYLLABLE "
    hangul_syllable_decomposition(char).each_char { |jamo|
      syllable_name << (jamo_short_name(jamo) || '')
    }
    syllable_name
  end
end
|
32
37
|
module_function :name
|
33
38
|
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/canonical_decomposition"
|
4
|
+
require "unicode_utils/combining_class"
|
5
|
+
require "unicode_utils/read_codepoint_set"
|
6
|
+
|
7
|
+
module UnicodeUtils

  module Impl # :nodoc:all

    # Codepoints read from the "composition_exclusion_set" data file;
    # a composite contained in this set is never produced by nfc.
    COMPOSITION_EXCLUSION_SET =
      Impl.read_codepoint_set("composition_exclusion_set")

    # Inverse of CANONICAL_DECOMPOSITION_MAP, restricted to mappings of
    # exactly two codepoints:
    #
    #   first codepoint => { second codepoint => composite }
    CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
      CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
        if decomp.length == 2
          (m[decomp[0]] ||= {})[decomp[1]] = comp
        end
      }
    end

    module NFC

      # A starter is a codepoint with combining class 0. Codepoints
      # missing from COMBINING_CLASS_MAP count as class 0.
      def self.starter?(cp)
        (COMBINING_CLASS_MAP[cp] || 0) == 0
      end

      # does b block c?
      def self.blocked?(b, c)
        # From the standard:
        # "If a combining character sequence is in canonical order,
        # then testing whether a character is blocked requires looking
        # at only the immediately preceding character."
        # nfc only calls this for codepoints of a string that comes out
        # of canonical_decomposition and thus is in canonical order.
        (COMBINING_CLASS_MAP[b] || 0) >= (COMBINING_CLASS_MAP[c] || 0)
      end

      # True if +cp+ has a canonical decomposition (via the mapping
      # table or as a Hangul syllable) and is not in the composition
      # exclusion set.
      def self.primary_composite?(cp)
        unless CANONICAL_DECOMPOSITION_MAP[cp] ||
            # has hangul syllable decomposition?
            (cp >= 0xAC00 && cp <= 0xD7A3)
          return false
        end
        !COMPOSITION_EXCLUSION_SET.include?(cp)
      end

    end

  end

  # Get +str+ in Normalization Form C.
  #
  # The Unicode standard has multiple representations for some
  # characters. One representation as a single codepoint and other
  # representation(s) as a combination of multiple codepoints. This
  # function "composes" these characters into the former
  # representation.
  #
  # +str+ is first brought into canonical decomposition, then composed
  # again. Returns a new string with the same encoding as +str+.
  #
  # Example:
  #
  #   UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
  def nfc(str)
    str = UnicodeUtils.canonical_decomposition(str)

    ### constants for hangul composition ###
    sbase = 0xAC00
    lbase = 0x1100
    vbase = 0x1161
    tbase = 0x11A7
    lcount = 19
    vcount = 21
    tcount = 28
    ncount = vcount * tcount
    scount = lcount * ncount
    ########################################

    String.new.force_encoding(str.encoding).tap do |res|
      # The most recent starter, not yet written to res because
      # following codepoints may still be composed into it.
      last_starter = nil
      # Non-starters seen since last_starter that could not be
      # composed with it; written to res right after last_starter.
      uncomposable_non_starters = []
      str.each_codepoint { |cp|
        if Impl::NFC.starter?(cp)
          combined = false
          # A starter can only combine with the previous starter if
          # nothing uncomposable stands between the two.
          if last_starter && uncomposable_non_starters.empty?
            ### hangul ###
            # leading consonant + vowel => LV syllable
            lindex = last_starter - lbase
            if 0 <= lindex && lindex < lcount
              vindex = cp - vbase
              # Valid vowel indexes are 0...vcount. vindex == vcount is
              # U+1176, which is not part of any precomposed syllable.
              if 0 <= vindex && vindex < vcount
                last_starter =
                  sbase + (lindex * vcount + vindex) * tcount
                combined = true
              end
            end
            # LV syllable + trailing consonant => LVT syllable
            unless combined
              sindex = last_starter - sbase
              if 0 <= sindex && sindex < scount && (sindex % tcount) == 0
                tindex = cp - tbase
                # tindex 0 is tbase itself (U+11A7), which stands for
                # "no trailing consonant" in the arithmetic and is not
                # a trailing consonant. Accepting it would drop the
                # codepoint from the output.
                if 0 < tindex && tindex < tcount
                  last_starter += tindex
                  combined = true
                end
              end
            end
            ##############
            unless combined
              map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
              composition = map && map[cp]
              if composition && Impl::NFC.primary_composite?(composition)
                last_starter = composition
                combined = true
              end
            end
          end
          unless combined
            # Flush the finished sequence, cp starts a new one.
            res << last_starter if last_starter
            uncomposable_non_starters.each { |nc| res << nc }
            uncomposable_non_starters.clear
            last_starter = cp
          end
        else
          last_non_starter = uncomposable_non_starters.last
          if last_non_starter && Impl::NFC.blocked?(last_non_starter, cp)
            uncomposable_non_starters << cp
          else
            map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
            composition = map && map[cp]
            if composition && Impl::NFC.primary_composite?(composition)
              last_starter = composition
            else
              uncomposable_non_starters << cp
            end
          end
        end
      }
      res << last_starter if last_starter
      uncomposable_non_starters.each { |nc| res << nc }
    end
  end
  module_function :nfc

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/canonical_decomposition"
|
4
|
+
|
5
|
+
module UnicodeUtils

  # Convert +str+ to Normalization Form D.
  #
  # This is just another name for
  # UnicodeUtils.canonical_decomposition; the call is passed through
  # unchanged.
  def nfd(str)
    decomposed = UnicodeUtils.canonical_decomposition(str)
    decomposed
  end
  module_function :nfd

end
|
@@ -4,7 +4,7 @@ module UnicodeUtils
|
|
4
4
|
|
5
5
|
module Impl # :nodoc:
|
6
6
|
|
7
|
-
def self.
|
7
|
+
def self.read_multivalued_map(filename)
|
8
8
|
path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
|
9
9
|
Hash.new.tap { |map|
|
10
10
|
File.open(path, "r:US-ASCII:-") do |input|
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module UnicodeUtils

  module Impl # :nodoc:

    # Load a names table from the cdata file +filename+.
    #
    # Each record is read as six characters, interpreted as a
    # hexadecimal codepoint, followed by the rest of the line, which
    # becomes the name (line terminator removed). Returns a Hash from
    # codepoint (Integer) to name (String).
    def self.read_names(filename)
      path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
      names = Hash.new
      File.open(path, "r:US-ASCII:-") do |io|
        # Reusable buffer for the six character codepoint field.
        key = "x" * 6
        key.force_encoding(Encoding::US_ASCII)
        while io.read(6, key)
          value = io.gets
          value.chomp!
          names[key.to_i(16)] = value
        end
      end
      names
    end

  end

end
|
data/lib/unicode_utils/upcase.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
require "unicode_utils/simple_upcase"
|
4
|
-
require "unicode_utils/
|
4
|
+
require "unicode_utils/read_multivalued_map"
|
5
5
|
require "unicode_utils/conditional_casing"
|
6
6
|
|
7
7
|
module UnicodeUtils
|
8
8
|
|
9
|
-
SPECIAL_UPCASE_MAP = Impl.
|
9
|
+
SPECIAL_UPCASE_MAP = Impl.read_multivalued_map("special_uc_map") # :nodoc:
|
10
10
|
|
11
11
|
# Perform a full case-conversion of +str+ to uppercase according to
|
12
12
|
# the Unicode standard.
|
data/lib/unicode_utils.rb
CHANGED
@@ -13,3 +13,21 @@ require "unicode_utils/cased_char_q"
|
|
13
13
|
require "unicode_utils/case_ignorable_char_q"
|
14
14
|
require "unicode_utils/soft_dotted_char_q"
|
15
15
|
require "unicode_utils/combining_class"
|
16
|
+
require "unicode_utils/hangul_syllable_decomposition"
|
17
|
+
require "unicode_utils/jamo_short_name"
|
18
|
+
require "unicode_utils/canonical_decomposition"
|
19
|
+
require "unicode_utils/nfd"
|
20
|
+
require "unicode_utils/canonical_equivalents_q"
|
21
|
+
require "unicode_utils/nfc"
|
22
|
+
|
23
|
+
# Read the README[link:files/README_txt.html] for an introduction.
|
24
|
+
#
|
25
|
+
# High-level functions are:
|
26
|
+
#
|
27
|
+
# UnicodeUtils.upcase:: full conversion to uppercase
|
28
|
+
# UnicodeUtils.downcase:: full conversion to lowercase
|
29
|
+
# UnicodeUtils.nfd:: Normalization Form D
|
30
|
+
# UnicodeUtils.nfc:: Normalization Form C
|
31
|
+
# UnicodeUtils.name:: character names
|
32
|
+
module UnicodeUtils
|
33
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "test/unit"
|
4
|
+
|
5
|
+
require "unicode_utils/nfd"
|
6
|
+
require "unicode_utils/nfc"
|
7
|
+
|
8
|
+
# See data/NormalizationTest.txt
|
9
|
+
class TestNormalization < Test::Unit::TestCase

  # One record of the test data file. The readers c1 to c5 return the
  # first to fifth column of the record.
  class Record
    def initialize(ary)
      @ary = ary
    end

    %w[c1 c2 c3 c4 c5].each_with_index { |reader, idx|
      define_method(reader) { @ary[idx] }
    }
  end

  # Parse NormalizationTest.txt and yield a Record for every data
  # line. Everything from the first "#" on a line is dropped, and
  # empty lines and "@Part" lines are skipped. Columns are separated
  # by ";" and hold space separated hexadecimal codepoints, which are
  # assembled into UTF-8 strings.
  def each_testdata_record
    fn = File.join(File.dirname(__FILE__),
                   "..", "data", "NormalizationTest.txt")
    File.open(fn, "r:utf-8:-") do |input|
      input.each_line { |line|
        line = $1 if line =~ /^([^#]*)#/
        line.strip!
        next if line.empty? || line =~ /^@Part/
        ary = line.split(";").map { |column|
          column.split(" ").each_with_object(
            String.new.force_encoding(Encoding::UTF_8)) { |hex, str|
            str << hex.strip.to_i(16)
          }
        }
        yield Record.new(ary)
      }
    end
  end

  # Columns 1-3 must decompose to column 3, columns 4-5 to column 5.
  def test_nfd
    each_testdata_record { |r|
      [r.c1, r.c2, r.c3].each { |source|
        assert_equal r.c3, UnicodeUtils.nfd(source)
      }
      [r.c4, r.c5].each { |source|
        assert_equal r.c5, UnicodeUtils.nfd(source)
      }
    }
  end

  # Columns 1-3 must compose to column 2, columns 4-5 to column 4.
  def test_nfc
    each_testdata_record { |r|
      [r.c1, r.c2, r.c3].each { |source|
        assert_equal r.c2, UnicodeUtils.nfc(source)
      }
      [r.c4, r.c5].each { |source|
        assert_equal r.c4, UnicodeUtils.nfc(source)
      }
    }
  end

end
|
data/test/test_unicode_utils.rb
CHANGED
@@ -9,6 +9,12 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
9
9
|
def test_name
|
10
10
|
assert_equal "LATIN SMALL LETTER F", UnicodeUtils.name("f")
|
11
11
|
assert_equal Encoding::US_ASCII, UnicodeUtils.name("f").encoding
|
12
|
+
assert_equal nil, UnicodeUtils.name("\u{e000}") # private use
|
13
|
+
assert_equal "<control>", UnicodeUtils.name("\t")
|
14
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-4E00", UnicodeUtils.name("\u{4e00}")
|
15
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-2A6D6", UnicodeUtils.name("\u{2a6d6}")
|
16
|
+
assert_equal "CJK UNIFIED IDEOGRAPH-2A3D6", UnicodeUtils.name("\u{2a3d6}")
|
17
|
+
assert_equal "HANGUL SYLLABLE PWILH", UnicodeUtils.name("\u{d4db}")
|
12
18
|
end
|
13
19
|
|
14
20
|
def test_simple_upcase
|
@@ -104,4 +110,32 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
104
110
|
assert_equal false, UnicodeUtils.soft_dotted_char?("a")
|
105
111
|
end
|
106
112
|
|
113
|
+
def test_hangul_syllable_decomposition
|
114
|
+
assert_equal "\u{1111}\u{1171}\u{11b6}", UnicodeUtils.hangul_syllable_decomposition("\u{d4db}")
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_jamo_short_name
|
118
|
+
assert_equal "GG", UnicodeUtils.jamo_short_name("\u{1101}")
|
119
|
+
end
|
120
|
+
|
121
|
+
def test_canonical_decomposition
|
122
|
+
assert_equal "\u{61}\u{301}",
|
123
|
+
UnicodeUtils.canonical_decomposition("\u{E1}")
|
124
|
+
assert_equal "\u{61}\u{301}\u{63}\u{327}\u{301}",
|
125
|
+
UnicodeUtils.canonical_decomposition("\u{e1}\u{63}\u{301}\u{327}")
|
126
|
+
end
|
127
|
+
|
128
|
+
def test_nfd
|
129
|
+
assert_equal "\u{61}\u{301}", UnicodeUtils.nfd("\u{E1}")
|
130
|
+
end
|
131
|
+
|
132
|
+
def test_canonical_equivalents?
|
133
|
+
assert_equal true, UnicodeUtils.canonical_equivalents?("Äste", "A\u{308}ste")
|
134
|
+
assert_equal false, UnicodeUtils.canonical_equivalents?("Äste", "Aste")
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_nfc
|
138
|
+
assert_equal "Häschen", UnicodeUtils.nfc("Ha\u{308}schen")
|
139
|
+
end
|
140
|
+
|
107
141
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode_utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stefan Lang
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-11-
|
12
|
+
date: 2008-11-16 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -22,20 +22,27 @@ extensions: []
|
|
22
22
|
extra_rdoc_files:
|
23
23
|
- README.txt
|
24
24
|
files:
|
25
|
-
- lib/unicode_utils/read_special_casing_map.rb
|
26
25
|
- lib/unicode_utils/conditional_casing.rb
|
26
|
+
- lib/unicode_utils/hangul_syllable_decomposition.rb
|
27
27
|
- lib/unicode_utils/simple_downcase.rb
|
28
28
|
- lib/unicode_utils/read_codepoint_map.rb
|
29
|
+
- lib/unicode_utils/read_names.rb
|
29
30
|
- lib/unicode_utils/read_codepoint_set.rb
|
30
31
|
- lib/unicode_utils/titlecase_char_q.rb
|
31
32
|
- lib/unicode_utils/cased_char_q.rb
|
32
33
|
- lib/unicode_utils/downcase.rb
|
33
34
|
- lib/unicode_utils/name.rb
|
34
35
|
- lib/unicode_utils/uppercase_char_q.rb
|
36
|
+
- lib/unicode_utils/read_multivalued_map.rb
|
37
|
+
- lib/unicode_utils/canonical_equivalents_q.rb
|
38
|
+
- lib/unicode_utils/canonical_decomposition.rb
|
35
39
|
- lib/unicode_utils/upcase.rb
|
40
|
+
- lib/unicode_utils/nfc.rb
|
41
|
+
- lib/unicode_utils/nfd.rb
|
36
42
|
- lib/unicode_utils/case_ignorable_char_q.rb
|
37
43
|
- lib/unicode_utils/simple_upcase.rb
|
38
44
|
- lib/unicode_utils/lowercase_char_q.rb
|
45
|
+
- lib/unicode_utils/jamo_short_name.rb
|
39
46
|
- lib/unicode_utils/combining_class.rb
|
40
47
|
- lib/unicode_utils/version.rb
|
41
48
|
- lib/unicode_utils/soft_dotted_char_q.rb
|
@@ -48,11 +55,15 @@ files:
|
|
48
55
|
- cdata/names
|
49
56
|
- cdata/cond_uc_map
|
50
57
|
- cdata/special_uc_map
|
58
|
+
- cdata/canonical_decomposition_map
|
51
59
|
- cdata/soft_dotted_set
|
52
60
|
- cdata/simple_lc_map
|
53
61
|
- cdata/case_ignorable_set
|
62
|
+
- cdata/composition_exclusion_set
|
54
63
|
- cdata/simple_uc_map
|
64
|
+
- cdata/jamo_short_names
|
55
65
|
- cdata/prop_set_uppercase
|
66
|
+
- test/test_normalization.rb
|
56
67
|
- test/test_unicode_utils.rb
|
57
68
|
- README.txt
|
58
69
|
- LICENSE.txt
|
@@ -78,10 +89,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
89
|
version:
|
79
90
|
requirements: []
|
80
91
|
|
81
|
-
rubyforge_project:
|
92
|
+
rubyforge_project: unicode-utils
|
82
93
|
rubygems_version: 1.3.1
|
83
94
|
signing_key:
|
84
95
|
specification_version: 2
|
85
96
|
summary: additional Unicode aware functions for Ruby 1.9
|
86
97
|
test_files:
|
98
|
+
- test/test_normalization.rb
|
87
99
|
- test/test_unicode_utils.rb
|