unicode_utils 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ 
@@ -0,0 +1 @@
1
+ 00034000034100034300034400037400037e00038700095800095900095a00095b00095c00095d00095e00095f0009dc0009dd0009df000a33000a36000a59000a5a000a5b000a5e000b5c000b5d000f43000f4d000f52000f57000f5c000f69000f73000f75000f76000f78000f81000f93000f9d000fa2000fa7000fac000fb9001f71001f73001f75001f77001f79001f7b001f7d001fbb001fbe001fc9001fcb001fd3001fdb001fe3001feb001fee001fef001ff9001ffb001ffd00200000200100212600212a00212b00232900232a002adc00f90000f90100f90200f90300f90400f90500f90600f90700f90800f90900f90a00f90b00f90c00f90d00f90e00f90f00f91000f91100f91200f91300f91400f91500f91600f91700f91800f91900f91a00f91b00f91c00f91d00f91e00f91f00f92000f92100f92200f92300f92400f92500f92600f92700f92800f92900f92a00f92b00f92c00f92d00f92e00f92f00f93000f93100f93200f93300f93400f93500f93600f93700f93800f93900f93a00f93b00f93c00f93d00f93e00f93f00f94000f94100f94200f94300f94400f94500f94600f94700f94800f94900f94a00f94b00f94c00f94d00f94e00f94f00f95000f95100f95200f95300f95400f95500f95600f95700f95800f95900f95a00f95b00f95c00f95d00f95e00f95f00f96000f96100f96200f96300f96400f96500f96600f96700f96800f96900f96a00f96b00f96c00f96d00f96e00f96f00f97000f97100f97200f97300f97400f97500f97600f97700f97800f97900f97a00f97b00f97c00f97d00f97e00f97f00f98000f98100f98200f98300f98400f98500f98600f98700f98800f98900f98a00f98b00f98c00f98d00f98e00f98f00f99000f99100f99200f99300f99400f99500f99600f99700f99800f99900f99a00f99b00f99c00f99d00f99e00f99f00f9a000f9a100f9a200f9a300f9a400f9a500f9a600f9a700f9a800f9a900f9aa00f9ab00f9ac00f9ad00f9ae00f9af00f9b000f9b100f9b200f9b300f9b400f9b500f9b600f9b700f9b800f9b900f9ba00f9bb00f9bc00f9bd00f9be00f9bf00f9c000f9c100f9c200f9c300f9c400f9c500f9c600f9c700f9c800f9c900f9ca00f9cb00f9cc00f9cd00f9ce00f9cf00f9d000f9d100f9d200f9d300f9d400f9d500f9d600f9d700f9d800f9d900f9da00f9db00f9dc00f9dd00f9de00f9df00f9e000f9e100f9e200f9e300f9e400f9e500f9e600f9e700f9e800f9e900f9ea00f9eb00f9ec00f9ed00f9ee00f9ef00f9f000f9f100f9f200f9f300f9f400f9f500f9f600f9f700f9f800f9f900f9fa00f9fb00f9fc00f9fd00f9fe00f9ff00fa0000fa0100fa0200fa0300fa0400fa05
00fa0600fa0700fa0800fa0900fa0a00fa0b00fa0c00fa0d00fa1000fa1200fa1500fa1600fa1700fa1800fa1900fa1a00fa1b00fa1c00fa1d00fa1e00fa2000fa2200fa2500fa2600fa2a00fa2b00fa2c00fa2d00fa3000fa3100fa3200fa3300fa3400fa3500fa3600fa3700fa3800fa3900fa3a00fa3b00fa3c00fa3d00fa3e00fa3f00fa4000fa4100fa4200fa4300fa4400fa4500fa4600fa4700fa4800fa4900fa4a00fa4b00fa4c00fa4d00fa4e00fa4f00fa5000fa5100fa5200fa5300fa5400fa5500fa5600fa5700fa5800fa5900fa5a00fa5b00fa5c00fa5d00fa5e00fa5f00fa6000fa6100fa6200fa6300fa6400fa6500fa6600fa6700fa6800fa6900fa6a00fa7000fa7100fa7200fa7300fa7400fa7500fa7600fa7700fa7800fa7900fa7a00fa7b00fa7c00fa7d00fa7e00fa7f00fa8000fa8100fa8200fa8300fa8400fa8500fa8600fa8700fa8800fa8900fa8a00fa8b00fa8c00fa8d00fa8e00fa8f00fa9000fa9100fa9200fa9300fa9400fa9500fa9600fa9700fa9800fa9900fa9a00fa9b00fa9c00fa9d00fa9e00fa9f00faa000faa100faa200faa300faa400faa500faa600faa700faa800faa900faaa00faab00faac00faad00faae00faaf00fab000fab100fab200fab300fab400fab500fab600fab700fab800fab900faba00fabb00fabc00fabd00fabe00fabf00fac000fac100fac200fac300fac400fac500fac600fac700fac800fac900faca00facb00facc00facd00face00facf00fad000fad100fad200fad300fad400fad500fad600fad700fad800fad900fb1d00fb1f00fb2a00fb2b00fb2c00fb2d00fb2e00fb2f00fb3000fb3100fb3200fb3300fb3400fb3500fb3600fb3800fb3900fb3a00fb3b00fb3c00fb3e00fb4000fb4100fb4300fb4400fb4600fb4700fb4800fb4900fb4a00fb4b00fb4c00fb4d00fb4e01d15e01d15f01d16001d16101d16201d16301d16401d1bb01d1bc01d1bd01d1be01d1bf01d1c002f80002f80102f80202f80302f80402f80502f80602f80702f80802f80902f80a02f80b02f80c02f80d02f80e02f80f02f81002f81102f81202f81302f81402f81502f81602f81702f81802f81902f81a02f81b02f81c02f81d02f81e02f81f02f82002f82102f82202f82302f82402f82502f82602f82702f82802f82902f82a02f82b02f82c02f82d02f82e02f82f02f83002f83102f83202f83302f83402f83502f83602f83702f83802f83902f83a02f83b02f83c02f83d02f83e02f83f02f84002f84102f84202f84302f84402f84502f84602f84702f84802f84902f84a02f84b02f84c02f84d02f84e02f84f02f85002f85102f85202f85302f85402f85502f85602f85702f85802f85902f85a02f85b02f85c02
f85d02f85e02f85f02f86002f86102f86202f86302f86402f86502f86602f86702f86802f86902f86a02f86b02f86c02f86d02f86e02f86f02f87002f87102f87202f87302f87402f87502f87602f87702f87802f87902f87a02f87b02f87c02f87d02f87e02f87f02f88002f88102f88202f88302f88402f88502f88602f88702f88802f88902f88a02f88b02f88c02f88d02f88e02f88f02f89002f89102f89202f89302f89402f89502f89602f89702f89802f89902f89a02f89b02f89c02f89d02f89e02f89f02f8a002f8a102f8a202f8a302f8a402f8a502f8a602f8a702f8a802f8a902f8aa02f8ab02f8ac02f8ad02f8ae02f8af02f8b002f8b102f8b202f8b302f8b402f8b502f8b602f8b702f8b802f8b902f8ba02f8bb02f8bc02f8bd02f8be02f8bf02f8c002f8c102f8c202f8c302f8c402f8c502f8c602f8c702f8c802f8c902f8ca02f8cb02f8cc02f8cd02f8ce02f8cf02f8d002f8d102f8d202f8d302f8d402f8d502f8d602f8d702f8d802f8d902f8da02f8db02f8dc02f8dd02f8de02f8df02f8e002f8e102f8e202f8e302f8e402f8e502f8e602f8e702f8e802f8e902f8ea02f8eb02f8ec02f8ed02f8ee02f8ef02f8f002f8f102f8f202f8f302f8f402f8f502f8f602f8f702f8f802f8f902f8fa02f8fb02f8fc02f8fd02f8fe02f8ff02f90002f90102f90202f90302f90402f90502f90602f90702f90802f90902f90a02f90b02f90c02f90d02f90e02f90f02f91002f91102f91202f91302f91402f91502f91602f91702f91802f91902f91a02f91b02f91c02f91d02f91e02f91f02f92002f92102f92202f92302f92402f92502f92602f92702f92802f92902f92a02f92b02f92c02f92d02f92e02f92f02f93002f93102f93202f93302f93402f93502f93602f93702f93802f93902f93a02f93b02f93c02f93d02f93e02f93f02f94002f94102f94202f94302f94402f94502f94602f94702f94802f94902f94a02f94b02f94c02f94d02f94e02f94f02f95002f95102f95202f95302f95402f95502f95602f95702f95802f95902f95a02f95b02f95c02f95d02f95e02f95f02f96002f96102f96202f96302f96402f96502f96602f96702f96802f96902f96a02f96b02f96c02f96d02f96e02f96f02f97002f97102f97202f97302f97402f97502f97602f97702f97802f97902f97a02f97b02f97c02f97d02f97e02f97f02f98002f98102f98202f98302f98402f98502f98602f98702f98802f98902f98a02f98b02f98c02f98d02f98e02f98f02f99002f99102f99202f99302f99402f99502f99602f99702f99802f99902f99a02f99b02f99c02f99d02f99e02f99f02f9a002f9a102f9a202f9a302f9a402f9a502f9a602f9a702f9a802f9a902f9
aa02f9ab02f9ac02f9ad02f9ae02f9af02f9b002f9b102f9b202f9b302f9b402f9b502f9b602f9b702f9b802f9b902f9ba02f9bb02f9bc02f9bd02f9be02f9bf02f9c002f9c102f9c202f9c302f9c402f9c502f9c602f9c702f9c802f9c902f9ca02f9cb02f9cc02f9cd02f9ce02f9cf02f9d002f9d102f9d202f9d302f9d402f9d502f9d602f9d702f9d802f9d902f9da02f9db02f9dc02f9dd02f9de02f9df02f9e002f9e102f9e202f9e302f9e402f9e502f9e602f9e702f9e802f9e902f9ea02f9eb02f9ec02f9ed02f9ee02f9ef02f9f002f9f102f9f202f9f302f9f402f9f502f9f602f9f702f9f802f9f902f9fa02f9fb02f9fc02f9fd02f9fe02f9ff02fa0002fa0102fa0202fa0302fa0402fa0502fa0602fa0702fa0802fa0902fa0a02fa0b02fa0c02fa0d02fa0e02fa0f02fa1002fa1102fa1202fa1302fa1402fa1502fa1602fa1702fa1802fa1902fa1a02fa1b02fa1c02fa1d
@@ -0,0 +1,66 @@
1
+ 001100G
2
+ 001101GG
3
+ 001102N
4
+ 001103D
5
+ 001104DD
6
+ 001105R
7
+ 001106M
8
+ 001107B
9
+ 001108BB
10
+ 001109S
11
+ 00110aSS
12
+ 00110cJ
13
+ 00110dJJ
14
+ 00110eC
15
+ 00110fK
16
+ 001110T
17
+ 001111P
18
+ 001112H
19
+ 001161A
20
+ 001162AE
21
+ 001163YA
22
+ 001164YAE
23
+ 001165EO
24
+ 001166E
25
+ 001167YEO
26
+ 001168YE
27
+ 001169O
28
+ 00116aWA
29
+ 00116bWAE
30
+ 00116cOE
31
+ 00116dYO
32
+ 00116eU
33
+ 00116fWEO
34
+ 001170WE
35
+ 001171WI
36
+ 001172YU
37
+ 001173EU
38
+ 001174YI
39
+ 001175I
40
+ 0011a8G
41
+ 0011a9GG
42
+ 0011aaGS
43
+ 0011abN
44
+ 0011acNJ
45
+ 0011adNH
46
+ 0011aeD
47
+ 0011afL
48
+ 0011b0LG
49
+ 0011b1LM
50
+ 0011b2LB
51
+ 0011b3LS
52
+ 0011b4LT
53
+ 0011b5LP
54
+ 0011b6LH
55
+ 0011b7M
56
+ 0011b8B
57
+ 0011b9BS
58
+ 0011baS
59
+ 0011bbSS
60
+ 0011bcNG
61
+ 0011bdJ
62
+ 0011beC
63
+ 0011bfK
64
+ 0011c0T
65
+ 0011c1P
66
+ 0011c2H
data/cdata/names CHANGED
@@ -10969,8 +10969,6 @@
10969
10969
  0033fdIDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY
10970
10970
  0033feIDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10971
10971
  0033ffSQUARE GAL
10972
- 003400<CJK Ideograph Extension A, First>
10973
- 004db5<CJK Ideograph Extension A, Last>
10974
10972
  004dc0HEXAGRAM FOR THE CREATIVE HEAVEN
10975
10973
  004dc1HEXAGRAM FOR THE RECEPTIVE EARTH
10976
10974
  004dc2HEXAGRAM FOR DIFFICULTY AT THE BEGINNING
@@ -11035,8 +11033,6 @@
11035
11033
  004dfdHEXAGRAM FOR SMALL PREPONDERANCE
11036
11034
  004dfeHEXAGRAM FOR AFTER COMPLETION
11037
11035
  004dffHEXAGRAM FOR BEFORE COMPLETION
11038
- 004e00<CJK Ideograph, First>
11039
- 009fc3<CJK Ideograph, Last>
11040
11036
  00a000YI SYLLABLE IT
11041
11037
  00a001YI SYLLABLE IX
11042
11038
  00a002YI SYLLABLE I
@@ -13130,16 +13126,6 @@
13130
13126
  00aa5dCHAM PUNCTUATION DANDA
13131
13127
  00aa5eCHAM PUNCTUATION DOUBLE DANDA
13132
13128
  00aa5fCHAM PUNCTUATION TRIPLE DANDA
13133
- 00ac00<Hangul Syllable, First>
13134
- 00d7a3<Hangul Syllable, Last>
13135
- 00d800<Non Private Use High Surrogate, First>
13136
- 00db7f<Non Private Use High Surrogate, Last>
13137
- 00db80<Private Use High Surrogate, First>
13138
- 00dbff<Private Use High Surrogate, Last>
13139
- 00dc00<Low Surrogate, First>
13140
- 00dfff<Low Surrogate, Last>
13141
- 00e000<Private Use, First>
13142
- 00f8ff<Private Use, Last>
13143
13129
  00f900CJK COMPATIBILITY IDEOGRAPH-F900
13144
13130
  00f901CJK COMPATIBILITY IDEOGRAPH-F901
13145
13131
  00f902CJK COMPATIBILITY IDEOGRAPH-F902
@@ -18449,8 +18435,6 @@
18449
18435
  01f091DOMINO TILE VERTICAL-06-04
18450
18436
  01f092DOMINO TILE VERTICAL-06-05
18451
18437
  01f093DOMINO TILE VERTICAL-06-06
18452
- 020000<CJK Ideograph Extension B, First>
18453
- 02a6d6<CJK Ideograph Extension B, Last>
18454
18438
  02f800CJK COMPATIBILITY IDEOGRAPH-2F800
18455
18439
  02f801CJK COMPATIBILITY IDEOGRAPH-2F801
18456
18440
  02f802CJK COMPATIBILITY IDEOGRAPH-2F802
@@ -19330,7 +19314,3 @@
19330
19314
  0e01edVARIATION SELECTOR-254
19331
19315
  0e01eeVARIATION SELECTOR-255
19332
19316
  0e01efVARIATION SELECTOR-256
19333
- 0f0000<Plane 15 Private Use, First>
19334
- 0ffffd<Plane 15 Private Use, Last>
19335
- 100000<Plane 16 Private Use, First>
19336
- 10fffd<Plane 16 Private Use, Last>
@@ -0,0 +1,86 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_multivalued_map"
4
+ require "unicode_utils/hangul_syllable_decomposition"
5
+ require "unicode_utils/combining_class"
6
+
7
+ module UnicodeUtils
8
+
9
+ CANONICAL_DECOMPOSITION_MAP =
10
+ Impl.read_multivalued_map("canonical_decomposition_map") # :nodoc:
11
+
12
+ # Get the canonical decomposition of the given string, also called
13
+ # Normalization Form D, or NFD for short.
14
+ #
15
+ # The Unicode standard has multiple representations for some
16
+ # characters. One representation as a single codepoint and other
17
+ # representation(s) as a combination of multiple codepoints. This
18
+ # function "decomposes" these characters in +str+ into the latter
19
+ # representation.
20
+ #
21
+ # Example:
22
+ #
23
+ # # LATIN SMALL LETTER A WITH ACUTE => LATIN SMALL LETTER A, COMBINING ACUTE ACCENT
24
+ # UnicodeUtils.canonical_decomposition("\u{E1}") => "\u{61}\u{301}"
25
+ def canonical_decomposition(str)
26
+ res = String.new.force_encoding(str.encoding)
27
+ str.each_codepoint { |cp|
28
+ if cp >= 0xAC00 && cp <= 0xD7A3 # hangul syllable
29
+ Impl.append_hangul_syllable_decomposition(res, cp)
30
+ else
31
+ Impl.append_recursive_canonical_decomposition_mapping(res, cp)
32
+ end
33
+ }
34
+ Impl.put_into_canonical_order(res)
35
+ end
36
+ module_function :canonical_decomposition
37
+
38
+ module Impl # :nodoc:
39
+
40
+ def self.append_recursive_canonical_decomposition_mapping(str, cp)
41
+ mapping = CANONICAL_DECOMPOSITION_MAP[cp]
42
+ if mapping
43
+ mapping.each { |c|
44
+ append_recursive_canonical_decomposition_mapping(str, c)
45
+ }
46
+ else
47
+ str << cp
48
+ end
49
+ end
50
+
51
+ def self.put_into_canonical_order(str)
52
+ reorder_needed = false
53
+ last_cp = nil
54
+ str.each_codepoint { |cp|
55
+ if last_cp
56
+ cc = COMBINING_CLASS_MAP[cp] || 0
57
+ if cc != 0
58
+ if (COMBINING_CLASS_MAP[last_cp] || 0) > cc
59
+ reorder_needed = true
60
+ break
61
+ end
62
+ end
63
+ end
64
+ last_cp = cp
65
+ }
66
+ return str unless reorder_needed
67
+ res = String.new.force_encoding(str.encoding)
68
+ last_cp = nil
69
+ str.each_codepoint { |cp|
70
+ if last_cp
71
+ cc = COMBINING_CLASS_MAP[cp] || 0
72
+ if cc != 0 && (COMBINING_CLASS_MAP[last_cp] || 0) > cc
73
+ res << cp
74
+ cp = nil
75
+ end
76
+ res << last_cp
77
+ end
78
+ last_cp = cp
79
+ }
80
+ res << last_cp if last_cp
81
+ put_into_canonical_order(res)
82
+ end
83
+
84
+ end
85
+
86
+ end
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/canonical_decomposition"
4
+
5
+ module UnicodeUtils
6
+
7
+ # The strings +a+ and +b+ are canonical equivalents if their
8
+ # canonical decompositions are equal.
9
+ #
10
+ # Example:
11
+ #
12
+ # UnicodeUtils.canonical_equivalents?("Äste", "A\u{308}ste") => true
13
+ # UnicodeUtils.canonical_equivalents?("Äste", "Aste") => false
14
+ def canonical_equivalents?(a, b)
15
+ UnicodeUtils.canonical_decomposition(a) ==
16
+ UnicodeUtils.canonical_decomposition(b)
17
+ end
18
+ module_function :canonical_equivalents?
19
+
20
+ end
@@ -22,7 +22,7 @@ module UnicodeUtils
22
22
 
23
23
  end
24
24
 
25
- COMBINING_CLASS_MAP = Impl.read_combining_class_map # :nodoc:
25
+ COMBINING_CLASS_MAP = Impl.read_combining_class_map() # :nodoc:
26
26
 
27
27
  # Get the combining class of the given character as an integer in
28
28
  # the range 0..255.
@@ -7,9 +7,9 @@ require "unicode_utils/combining_class"
7
7
 
8
8
  module UnicodeUtils
9
9
 
10
- module Impl # :nodoc:
10
+ module Impl # :nodoc:all
11
11
 
12
- class ConditionalCasing # :nodoc:
12
+ class ConditionalCasing
13
13
 
14
14
  attr_reader :mapping
15
15
 
@@ -23,7 +23,7 @@ module UnicodeUtils
23
23
 
24
24
  end
25
25
 
26
- class BeforeDotConditionalCasing < ConditionalCasing # :nodoc:
26
+ class BeforeDotConditionalCasing < ConditionalCasing
27
27
 
28
28
  def context_match?(str, pos)
29
29
  (pos + 1).upto(str.length - 1) { |i|
@@ -37,7 +37,7 @@ module UnicodeUtils
37
37
 
38
38
  end
39
39
 
40
- class NotBeforeDotConditionalCasing < BeforeDotConditionalCasing # :nodoc:
40
+ class NotBeforeDotConditionalCasing < BeforeDotConditionalCasing
41
41
 
42
42
  def context_match?(str, pos)
43
43
  !super
@@ -45,7 +45,7 @@ module UnicodeUtils
45
45
 
46
46
  end
47
47
 
48
- class MoreAboveConditionalCasing < ConditionalCasing # :nodoc:
48
+ class MoreAboveConditionalCasing < ConditionalCasing
49
49
 
50
50
  def context_match?(str, pos)
51
51
  (pos + 1).upto(str.length - 1) { |i|
@@ -59,7 +59,7 @@ module UnicodeUtils
59
59
 
60
60
  end
61
61
 
62
- class AfterIConditionalCasing < ConditionalCasing # :nodoc:
62
+ class AfterIConditionalCasing < ConditionalCasing
63
63
 
64
64
  def context_match?(str, pos)
65
65
  (pos - 1).downto(0) { |i|
@@ -73,7 +73,7 @@ module UnicodeUtils
73
73
 
74
74
  end
75
75
 
76
- class AfterSoftDottedConditionalCasing < ConditionalCasing # :nodoc:
76
+ class AfterSoftDottedConditionalCasing < ConditionalCasing
77
77
 
78
78
  def context_match?(str, pos)
79
79
  (pos - 1).downto(0) { |i|
@@ -87,7 +87,7 @@ module UnicodeUtils
87
87
 
88
88
  end
89
89
 
90
- class FinalSigmaConditionalCasing < ConditionalCasing # :nodoc:
90
+ class FinalSigmaConditionalCasing < ConditionalCasing
91
91
 
92
92
  def context_match?(str, pos)
93
93
  before_match?(str, pos) && !after_match?(str, pos)
@@ -1,12 +1,12 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  require "unicode_utils/simple_downcase"
4
- require "unicode_utils/read_special_casing_map"
4
+ require "unicode_utils/read_multivalued_map"
5
5
  require "unicode_utils/conditional_casing"
6
6
 
7
7
  module UnicodeUtils
8
8
 
9
- SPECIAL_DOWNCASE_MAP = Impl.read_special_casing_map("special_lc_map") # :nodoc:
9
+ SPECIAL_DOWNCASE_MAP = Impl.read_multivalued_map("special_lc_map") # :nodoc:
10
10
 
11
11
  # Perform a full case-conversion of +str+ to lowercase according to
12
12
  # the Unicode standard.
@@ -0,0 +1,44 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module UnicodeUtils
4
+
5
+ # Derives the canonical decomposition of the given Hangul syllable.
6
+ #
7
+ # Example:
8
+ #
9
+ # UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
10
+ def hangul_syllable_decomposition(char)
11
+ Impl.append_hangul_syllable_decomposition(
12
+ String.new.force_encoding(char.encoding), char.ord)
13
+ end
14
+ module_function :hangul_syllable_decomposition
15
+
16
+ module Impl # :nodoc:
17
+
18
+ def self.append_hangul_syllable_decomposition(str, s)
19
+ # constants
20
+ sbase = 0xAC00
21
+ lbase = 0x1100
22
+ vbase = 0x1161
23
+ tbase = 0x11A7
24
+ scount = 11172
25
+ lcount = 19
26
+ vcount = 21
27
+ tcount = 28
28
+ ncount = vcount * tcount
29
+
30
+ sindex = s - sbase
31
+ if 0 <= sindex && sindex < scount
32
+ l = lbase + sindex / ncount
33
+ v = vbase + (sindex % ncount) / tcount
34
+ t = tbase + sindex % tcount
35
+ str << l << v
36
+ str << t if t != tbase
37
+ else
38
+ str << s
39
+ end
40
+ end
41
+
42
+ end
43
+
44
+ end
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_names"
4
+
5
+ module UnicodeUtils
6
+
7
+ JAMO_SHORT_NAME_MAP = Impl.read_names("jamo_short_names") # :nodoc:
8
+
9
+ # Get the Jamo Short Name property of the given character (defaults
10
+ # to nil).
11
+ #
12
+ # Example:
13
+ #
14
+ # UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
15
+ def jamo_short_name(char)
16
+ JAMO_SHORT_NAME_MAP[char.ord]
17
+ end
18
+ module_function :jamo_short_name
19
+
20
+ end
@@ -1,33 +1,38 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
- module UnicodeUtils
4
-
5
- module Impl # :nodoc:
3
+ require "unicode_utils/read_names"
4
+ require "unicode_utils/hangul_syllable_decomposition"
5
+ require "unicode_utils/jamo_short_name"
6
6
 
7
- def self.read_names
8
- path = File.join(File.dirname(__FILE__), "..", "..", "cdata", "names")
9
- Hash.new.tap { |map|
10
- File.open(path, "r:US-ASCII:-") do |input|
11
- buffer = "x" * 6
12
- buffer.force_encoding(Encoding::US_ASCII)
13
- while input.read(6, buffer)
14
- map[buffer.to_i(16)] = input.gets.tap { |x| x.chomp! }
15
- end
16
- end
17
- }
18
- end
19
-
20
- end
7
+ module UnicodeUtils
21
8
 
22
- NAME_MAP = Impl.read_names # :nodoc:
9
+ NAME_MAP = Impl.read_names("names") # :nodoc:
23
10
 
24
- # Get the Unicode name of the single codepoint in str.
11
+ # Get the normative Unicode name of the given character.
12
+ #
13
+ # Private Use codepoints have no name; this function returns nil for
14
+ # such codepoints.
15
+ #
16
+ # All control characters have the special name "<control>". All
17
+ # other characters have a unique name.
25
18
  #
26
19
  # Example:
27
20
  #
28
21
  # UnicodeUtils.name "ᾀ" => "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"
29
- def name(str)
30
- NAME_MAP[str.codepoints.first]
22
+ # UnicodeUtils.name "\t" => "<control>"
23
+ def name(char)
24
+ cp = char.ord
25
+ NAME_MAP[cp] ||
26
+ case cp
27
+ when 0x3400..0x4DB5, 0x4E00..0x9FC3, 0x20000..0x2A6D6
28
+ "CJK UNIFIED IDEOGRAPH-#{sprintf('%04x', cp).upcase}"
29
+ when 0xAC00..0xD7A3
30
+ "HANGUL SYLLABLE ".tap do |n|
31
+ hangul_syllable_decomposition(char).each_char { |c|
32
+ n << (jamo_short_name(c) || '')
33
+ }
34
+ end
35
+ end
31
36
  end
32
37
  module_function :name
33
38
 
@@ -0,0 +1,142 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/canonical_decomposition"
4
+ require "unicode_utils/combining_class"
5
+ require "unicode_utils/read_codepoint_set"
6
+
7
+ module UnicodeUtils
8
+
9
+ module Impl # :nodoc:all
10
+
11
+ COMPOSITION_EXCLUSION_SET =
12
+ Impl.read_codepoint_set("composition_exclusion_set")
13
+
14
+ CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
15
+ CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
16
+ if decomp.length == 2
17
+ (m[decomp[0]] ||= {})[decomp[1]] = comp
18
+ end
19
+ }
20
+ end
21
+
22
+ module NFC
23
+
24
+ def self.starter?(cp)
25
+ (COMBINING_CLASS_MAP[cp] || 0) == 0
26
+ end
27
+
28
+ # does b block c?
29
+ def self.blocked?(b, c)
30
+ # From the standard:
31
+ # "If a combining character sequence is in canonical order,
32
+ # then testing whether a character is blocked requires looking
33
+ # at only the immediately preceding character."
34
+ # The input seen by nfc is in canonical order (since it comes out of
35
+ # canonical_decomposition).
36
+ (COMBINING_CLASS_MAP[b] || 0) >= (COMBINING_CLASS_MAP[c] || 0)
37
+ end
38
+
39
+ def self.primary_composite?(cp)
40
+ unless CANONICAL_DECOMPOSITION_MAP[cp] ||
41
+ # has hangul syllable decomposition?
42
+ (cp >= 0xAC00 && cp <= 0xD7A3)
43
+ return false
44
+ end
45
+ !COMPOSITION_EXCLUSION_SET.include?(cp)
46
+ end
47
+
48
+ end
49
+
50
+ end
51
+
52
+ # Get +str+ in Normalization Form C.
53
+ #
54
+ # The Unicode standard has multiple representations for some
55
+ # characters. One representation as a single codepoint and other
56
+ # representation(s) as a combination of multiple codepoints. This
57
+ # function "composes" these characters into the former
58
+ # representation.
59
+ #
60
+ # Example:
61
+ #
62
+ # UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
63
+ def nfc(str)
64
+ str = UnicodeUtils.canonical_decomposition(str)
65
+
66
+ ### constants for hangul composition ###
67
+ sbase = 0xAC00
68
+ lbase = 0x1100
69
+ vbase = 0x1161
70
+ tbase = 0x11A7
71
+ lcount = 19
72
+ vcount = 21
73
+ tcount = 28
74
+ ncount = vcount * tcount
75
+ scount = lcount * ncount
76
+ ########################################
77
+
78
+ String.new.force_encoding(str.encoding).tap do |res|
79
+ last_starter = nil
80
+ uncomposable_non_starters = []
81
+ str.each_codepoint { |cp|
82
+ if Impl::NFC.starter?(cp)
83
+ combined = false
84
+ if last_starter && uncomposable_non_starters.empty?
85
+ ### hangul ###
86
+ lindex = last_starter - lbase
87
+ if 0 <= lindex && lindex < lcount
88
+ vindex = cp - vbase
89
+ if 0 <= vindex && vindex <= vcount
90
+ last_starter =
91
+ sbase + (lindex * vcount + vindex) * tcount
92
+ combined = true
93
+ end
94
+ end
95
+ unless combined
96
+ sindex = last_starter - sbase
97
+ if 0 <= sindex && sindex < scount && (sindex % tcount) == 0
98
+ tindex = cp - tbase
99
+ if 0 <= tindex && tindex < tcount
100
+ last_starter += tindex
101
+ combined = true
102
+ end
103
+ end
104
+ end
105
+ ##############
106
+ unless combined
107
+ map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
108
+ composition = map && map[cp]
109
+ if composition && Impl::NFC.primary_composite?(composition)
110
+ last_starter = composition
111
+ combined = true
112
+ end
113
+ end
114
+ end
115
+ unless combined
116
+ res << last_starter if last_starter
117
+ uncomposable_non_starters.each { |nc| res << nc }
118
+ uncomposable_non_starters.clear
119
+ last_starter = cp
120
+ end
121
+ else
122
+ last_non_starter = uncomposable_non_starters.last
123
+ if last_non_starter && Impl::NFC.blocked?(last_non_starter, cp)
124
+ uncomposable_non_starters << cp
125
+ else
126
+ map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
127
+ composition = map && map[cp]
128
+ if composition && Impl::NFC.primary_composite?(composition)
129
+ last_starter = composition
130
+ else
131
+ uncomposable_non_starters << cp
132
+ end
133
+ end
134
+ end
135
+ }
136
+ res << last_starter if last_starter
137
+ uncomposable_non_starters.each { |nc| res << nc }
138
+ end
139
+ end
140
+ module_function :nfc
141
+
142
+ end
@@ -0,0 +1,15 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/canonical_decomposition"
4
+
5
+ module UnicodeUtils
6
+
7
+ # Get +str+ in Normalization Form D.
8
+ #
9
+ # Alias for UnicodeUtils.canonical_decomposition.
10
+ def nfd(str)
11
+ UnicodeUtils.canonical_decomposition(str)
12
+ end
13
+ module_function :nfd
14
+
15
+ end
@@ -4,7 +4,7 @@ module UnicodeUtils
4
4
 
5
5
  module Impl # :nodoc:
6
6
 
7
- def self.read_special_casing_map(filename)
7
+ def self.read_multivalued_map(filename)
8
8
  path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
9
9
  Hash.new.tap { |map|
10
10
  File.open(path, "r:US-ASCII:-") do |input|
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ module UnicodeUtils
4
+
5
+ module Impl # :nodoc:
6
+
7
+ def self.read_names(filename)
8
+ path = File.join(File.dirname(__FILE__), "..", "..", "cdata", filename)
9
+ Hash.new.tap { |map|
10
+ File.open(path, "r:US-ASCII:-") do |input|
11
+ buffer = "x" * 6
12
+ buffer.force_encoding(Encoding::US_ASCII)
13
+ while input.read(6, buffer)
14
+ map[buffer.to_i(16)] = input.gets.tap { |x| x.chomp! }
15
+ end
16
+ end
17
+ }
18
+ end
19
+
20
+ end
21
+
22
+ end
@@ -1,12 +1,12 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  require "unicode_utils/simple_upcase"
4
- require "unicode_utils/read_special_casing_map"
4
+ require "unicode_utils/read_multivalued_map"
5
5
  require "unicode_utils/conditional_casing"
6
6
 
7
7
  module UnicodeUtils
8
8
 
9
- SPECIAL_UPCASE_MAP = Impl.read_special_casing_map("special_uc_map") # :nodoc:
9
+ SPECIAL_UPCASE_MAP = Impl.read_multivalued_map("special_uc_map") # :nodoc:
10
10
 
11
11
  # Perform a full case-conversion of +str+ to uppercase according to
12
12
  # the Unicode standard.
@@ -3,6 +3,6 @@
3
3
  module UnicodeUtils
4
4
 
5
5
  # Corresponds to the unicode_utils gem version.
6
- VERSION = "0.2.0"
6
+ VERSION = "0.3.0"
7
7
 
8
8
  end
data/lib/unicode_utils.rb CHANGED
@@ -13,3 +13,21 @@ require "unicode_utils/cased_char_q"
13
13
  require "unicode_utils/case_ignorable_char_q"
14
14
  require "unicode_utils/soft_dotted_char_q"
15
15
  require "unicode_utils/combining_class"
16
+ require "unicode_utils/hangul_syllable_decomposition"
17
+ require "unicode_utils/jamo_short_name"
18
+ require "unicode_utils/canonical_decomposition"
19
+ require "unicode_utils/nfd"
20
+ require "unicode_utils/canonical_equivalents_q"
21
+ require "unicode_utils/nfc"
22
+
23
+ # Read the README[link:files/README_txt.html] for an introduction.
24
+ #
25
+ # High-level functions are:
26
+ #
27
+ # UnicodeUtils.upcase:: full conversion to uppercase
28
+ # UnicodeUtils.downcase:: full conversion to lowercase
29
+ # UnicodeUtils.nfd:: Normalization Form D
30
+ # UnicodeUtils.nfc:: Normalization Form C
31
+ # UnicodeUtils.name:: character names
32
+ module UnicodeUtils
33
+ end
@@ -0,0 +1,75 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "test/unit"
4
+
5
+ require "unicode_utils/nfd"
6
+ require "unicode_utils/nfc"
7
+
8
+ # See data/NormalizationTest.txt
9
+ class TestNormalization < Test::Unit::TestCase
10
+
11
+ class Record
12
+ def initialize(ary)
13
+ @ary = ary
14
+ end
15
+ def c1
16
+ @ary[0]
17
+ end
18
+ def c2
19
+ @ary[1]
20
+ end
21
+ def c3
22
+ @ary[2]
23
+ end
24
+ def c4
25
+ @ary[3]
26
+ end
27
+ def c5
28
+ @ary[4]
29
+ end
30
+ end
31
+
32
+ def each_testdata_record
33
+ fn = File.join(File.dirname(__FILE__),
34
+ "..", "data", "NormalizationTest.txt")
35
+ File.open(fn, "r:utf-8:-") do |input|
36
+ input.each_line { |line|
37
+ if line =~ /^([^#]*)#/
38
+ line = $1
39
+ end
40
+ line.strip!
41
+ next if line.empty? || line =~ /^@Part/
42
+ columns = line.split(";")
43
+ ary = columns.map { |column|
44
+ String.new.force_encoding(Encoding::UTF_8).tap do |str|
45
+ column.split(" ").each { |c|
46
+ str << c.strip.to_i(16)
47
+ }
48
+ end
49
+ }
50
+ yield Record.new(ary)
51
+ }
52
+ end
53
+ end
54
+
55
+ def test_nfd
56
+ each_testdata_record { |r|
57
+ assert_equal r.c3, UnicodeUtils.nfd(r.c1)
58
+ assert_equal r.c3, UnicodeUtils.nfd(r.c2)
59
+ assert_equal r.c3, UnicodeUtils.nfd(r.c3)
60
+ assert_equal r.c5, UnicodeUtils.nfd(r.c4)
61
+ assert_equal r.c5, UnicodeUtils.nfd(r.c5)
62
+ }
63
+ end
64
+
65
+ def test_nfc
66
+ each_testdata_record { |r|
67
+ assert_equal r.c2, UnicodeUtils.nfc(r.c1)
68
+ assert_equal r.c2, UnicodeUtils.nfc(r.c2)
69
+ assert_equal r.c2, UnicodeUtils.nfc(r.c3)
70
+ assert_equal r.c4, UnicodeUtils.nfc(r.c4)
71
+ assert_equal r.c4, UnicodeUtils.nfc(r.c5)
72
+ }
73
+ end
74
+
75
+ end
@@ -9,6 +9,12 @@ class TestUnicodeUtils < Test::Unit::TestCase
9
9
  def test_name
10
10
  assert_equal "LATIN SMALL LETTER F", UnicodeUtils.name("f")
11
11
  assert_equal Encoding::US_ASCII, UnicodeUtils.name("f").encoding
12
+ assert_equal nil, UnicodeUtils.name("\u{e000}") # private use
13
+ assert_equal "<control>", UnicodeUtils.name("\t")
14
+ assert_equal "CJK UNIFIED IDEOGRAPH-4E00", UnicodeUtils.name("\u{4e00}")
15
+ assert_equal "CJK UNIFIED IDEOGRAPH-2A6D6", UnicodeUtils.name("\u{2a6d6}")
16
+ assert_equal "CJK UNIFIED IDEOGRAPH-2A3D6", UnicodeUtils.name("\u{2a3d6}")
17
+ assert_equal "HANGUL SYLLABLE PWILH", UnicodeUtils.name("\u{d4db}")
12
18
  end
13
19
 
14
20
  def test_simple_upcase
@@ -104,4 +110,32 @@ class TestUnicodeUtils < Test::Unit::TestCase
104
110
  assert_equal false, UnicodeUtils.soft_dotted_char?("a")
105
111
  end
106
112
 
113
+ def test_hangul_syllable_decomposition
114
+ assert_equal "\u{1111}\u{1171}\u{11b6}", UnicodeUtils.hangul_syllable_decomposition("\u{d4db}")
115
+ end
116
+
117
+ def test_jamo_short_name
118
+ assert_equal "GG", UnicodeUtils.jamo_short_name("\u{1101}")
119
+ end
120
+
121
+ def test_canonical_decomposition
122
+ assert_equal "\u{61}\u{301}",
123
+ UnicodeUtils.canonical_decomposition("\u{E1}")
124
+ assert_equal "\u{61}\u{301}\u{63}\u{327}\u{301}",
125
+ UnicodeUtils.canonical_decomposition("\u{e1}\u{63}\u{301}\u{327}")
126
+ end
127
+
128
+ def test_nfd
129
+ assert_equal "\u{61}\u{301}", UnicodeUtils.nfd("\u{E1}")
130
+ end
131
+
132
+ def test_canonical_equivalents?
133
+ assert_equal true, UnicodeUtils.canonical_equivalents?("Äste", "A\u{308}ste")
134
+ assert_equal false, UnicodeUtils.canonical_equivalents?("Äste", "Aste")
135
+ end
136
+
137
+ def test_nfc
138
+ assert_equal "Häschen", UnicodeUtils.nfc("Ha\u{308}schen")
139
+ end
140
+
107
141
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stefan Lang
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-11-02 00:00:00 +01:00
12
+ date: 2008-11-16 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -22,20 +22,27 @@ extensions: []
22
22
  extra_rdoc_files:
23
23
  - README.txt
24
24
  files:
25
- - lib/unicode_utils/read_special_casing_map.rb
26
25
  - lib/unicode_utils/conditional_casing.rb
26
+ - lib/unicode_utils/hangul_syllable_decomposition.rb
27
27
  - lib/unicode_utils/simple_downcase.rb
28
28
  - lib/unicode_utils/read_codepoint_map.rb
29
+ - lib/unicode_utils/read_names.rb
29
30
  - lib/unicode_utils/read_codepoint_set.rb
30
31
  - lib/unicode_utils/titlecase_char_q.rb
31
32
  - lib/unicode_utils/cased_char_q.rb
32
33
  - lib/unicode_utils/downcase.rb
33
34
  - lib/unicode_utils/name.rb
34
35
  - lib/unicode_utils/uppercase_char_q.rb
36
+ - lib/unicode_utils/read_multivalued_map.rb
37
+ - lib/unicode_utils/canonical_equivalents_q.rb
38
+ - lib/unicode_utils/canonical_decomposition.rb
35
39
  - lib/unicode_utils/upcase.rb
40
+ - lib/unicode_utils/nfc.rb
41
+ - lib/unicode_utils/nfd.rb
36
42
  - lib/unicode_utils/case_ignorable_char_q.rb
37
43
  - lib/unicode_utils/simple_upcase.rb
38
44
  - lib/unicode_utils/lowercase_char_q.rb
45
+ - lib/unicode_utils/jamo_short_name.rb
39
46
  - lib/unicode_utils/combining_class.rb
40
47
  - lib/unicode_utils/version.rb
41
48
  - lib/unicode_utils/soft_dotted_char_q.rb
@@ -48,11 +55,15 @@ files:
48
55
  - cdata/names
49
56
  - cdata/cond_uc_map
50
57
  - cdata/special_uc_map
58
+ - cdata/canonical_decomposition_map
51
59
  - cdata/soft_dotted_set
52
60
  - cdata/simple_lc_map
53
61
  - cdata/case_ignorable_set
62
+ - cdata/composition_exclusion_set
54
63
  - cdata/simple_uc_map
64
+ - cdata/jamo_short_names
55
65
  - cdata/prop_set_uppercase
66
+ - test/test_normalization.rb
56
67
  - test/test_unicode_utils.rb
57
68
  - README.txt
58
69
  - LICENSE.txt
@@ -78,10 +89,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
89
  version:
79
90
  requirements: []
80
91
 
81
- rubyforge_project:
92
+ rubyforge_project: unicode-utils
82
93
  rubygems_version: 1.3.1
83
94
  signing_key:
84
95
  specification_version: 2
85
96
  summary: additional Unicode aware functions for Ruby 1.9
86
97
  test_files:
98
+ - test/test_normalization.rb
87
99
  - test/test_unicode_utils.rb