unicode_utils 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_cdata"
4
+
5
+ module UnicodeUtils
6
+
7
+ # Maps codepoints to integer codes. For the integer code to property
8
+ # mapping, see #compile_word_break_property in data/compile.rb.
9
+ WORD_BREAK_MAP =
10
+ Impl.read_hexdigit_map("word_break_property") # :nodoc:
11
+
12
+ # Split +str+ along word boundaries according to Unicode's Default
13
+ # Word Boundary Specification, calling the given block with each
14
+ # word. Returns +str+, or an enumerator if no block is given.
15
+ #
16
+ # Example:
17
+ #
18
+ # require "unicode_utils/each_word"
19
+ # UnicodeUtils.each_word("Hello, world!").to_a => ["Hello", ",", " ", "world", "!"]
20
+ def each_word(str)
21
+ return enum_for(__method__, str) unless block_given?
22
+ cs = str.each_codepoint.map { |c| WORD_BREAK_MAP[c] }
23
+ cs << nil << nil # for negative indices
24
+ word = String.new.force_encoding(str.encoding)
25
+ i = 0
26
+ str.each_codepoint { |c|
27
+ word << c
28
+ if Impl.word_break?(cs, i) && !word.empty?
29
+ yield word
30
+ word = String.new.force_encoding(str.encoding)
31
+ end
32
+ i += 1
33
+ }
34
+ yield word unless word.empty?
35
+ str
36
+ end
37
+ module_function :each_word
38
+
39
+ module Impl # :nodoc:all
40
+
41
+ def self.word_break?(cs, i)
42
+ # wb3
43
+ cs_i = cs[i]
44
+ i1 = i + 1
45
+ cs_i1 = cs[i1]
46
+ if cs_i == 0x0 && cs_i1 == 0x1
47
+ return false
48
+ end
49
+ # wb3a
50
+ if cs_i == 0x2 || cs_i == 0x0 || cs_i == 0x1
51
+ return true
52
+ end
53
+ # wb3b
54
+ if cs_i1 == 0x2 || cs_i1 == 0x0 || cs_i1 == 0x1
55
+ return true
56
+ end
57
+ # wb5
58
+ i0 = i
59
+ # inline skip_l
60
+ c = nil
61
+ loop { c = cs[i0]; break unless c == 0x3 || c == 0x4; i0 -= 1 }
62
+ ci0 = c
63
+ if ci0 == 0x6 && cs_i1 == 0x6
64
+ return false
65
+ end
66
+ # wb6
67
+ i2 = i1 + 1
68
+ # inline skip_r
69
+ loop { c = cs[i2]; break unless c == 0x3 || c == 0x4; i2 += 1 }
70
+ if ci0 == 0x6 && (cs_i1 == 0x7 || cs_i1 == 0x9) && cs[i2] == 0x6
71
+ return false
72
+ end
73
+ # wb7
74
+ i_1 = i0 - 1
75
+ # inline skip_l
76
+ loop { c = cs[i_1]; break unless c == 0x3 || c == 0x4; i_1 -= 1 }
77
+ if cs[i_1] == 0x6 && (ci0 == 0x7 || ci0 == 0x9) && cs_i1 == 0x6
78
+ return false
79
+ end
80
+ # wb8
81
+ if ci0 == 0xA && cs_i1 == 0xA
82
+ return false
83
+ end
84
+ # wb9
85
+ if ci0 == 0x6 && cs_i1 == 0xA
86
+ return false
87
+ end
88
+ # wb10
89
+ if ci0 == 0xA && cs_i1 == 0x6
90
+ return false
91
+ end
92
+ # wb11
93
+ if cs[i_1] == 0xA && (ci0 == 0x8 || ci0 == 0x9) && cs_i1 == 0xA
94
+ return false
95
+ end
96
+ # wb12
97
+ if ci0 == 0xA && (cs_i1 == 0x8 || cs_i1 == 0x9) && cs[i2] == 0xA
98
+ return false
99
+ end
100
+ # wb13
101
+ if ci0 == 0x5 && cs_i1 == 0x5
102
+ return false
103
+ end
104
+ # wb13a
105
+ if (ci0 == 0x6 || ci0 == 0xA || ci0 == 0x5 || ci0 == 0xB) && cs_i1 == 0xB
106
+ return false
107
+ end
108
+ # wb13b
109
+ if ci0 == 0xB && (cs_i1 == 0x6 || cs_i1 == 0xA || cs_i1 == 0x5)
110
+ return false
111
+ end
112
+ # break unless next char is Extend/Format
113
+ cs_i1 != 0x3 && cs_i1 != 0x4
114
+ end
115
+
116
+ end
117
+
118
+ end
@@ -7,6 +7,7 @@ module UnicodeUtils
7
7
  # Get an array of all Codepoint instances in Codepoint::RANGE whose
8
8
  # name matches regexp. Matching is case insensitive.
9
9
  #
10
+ # require "unicode_utils/grep"
10
11
  # UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
11
12
  def grep(regexp)
12
13
  unless regexp.casefold?
@@ -6,7 +6,8 @@ module UnicodeUtils
6
6
  #
7
7
  # Example:
8
8
  #
9
- # UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
9
+ # require "unicode_utils/hangul_syllable_decomposition"
10
+ # UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
10
11
  def hangul_syllable_decomposition(char)
11
12
  String.new.force_encoding(char.encoding).tap do |str|
12
13
  Impl.append_hangul_syllable_decomposition(str , char.ord)
@@ -11,7 +11,8 @@ module UnicodeUtils
11
11
  #
12
12
  # Example:
13
13
  #
14
- # UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
14
+ # require "unicode_utils/jamo_short_name"
15
+ # UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
15
16
  def jamo_short_name(char)
16
17
  JAMO_SHORT_NAME_MAP[char.ord]
17
18
  end
@@ -21,10 +21,6 @@ module UnicodeUtils
21
21
 
22
22
  module NFC
23
23
 
24
- def self.starter?(cp)
25
- (COMBINING_CLASS_MAP[cp] || 0) == 0
26
- end
27
-
28
24
  # does b block c?
29
25
  def self.blocked?(b, c)
30
26
  # From the standard:
@@ -33,7 +29,7 @@ module UnicodeUtils
33
29
  # at only the immediately preceding character."
34
30
  # cpary is in canonical order (since it comes out of
35
31
  # canonical_decomposition).
36
- (COMBINING_CLASS_MAP[b] || 0) >= (COMBINING_CLASS_MAP[c] || 0)
32
+ COMBINING_CLASS_MAP[b] >= COMBINING_CLASS_MAP[c]
37
33
  end
38
34
 
39
35
  def self.primary_composite?(cp)
@@ -64,7 +60,7 @@ module UnicodeUtils
64
60
  last_starter = nil
65
61
  uncomposable_non_starters = []
66
62
  str.each_codepoint { |cp|
67
- if Impl::NFC.starter?(cp)
63
+ if COMBINING_CLASS_MAP[cp] == 0 # starter?
68
64
  combined = false
69
65
  if last_starter && uncomposable_non_starters.empty?
70
66
  ### hangul ###
@@ -135,6 +131,7 @@ module UnicodeUtils
135
131
  #
136
132
  # Example:
137
133
  #
134
+ # require "unicode_utils/nfc"
138
135
  # UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
139
136
  def nfc(str)
140
137
  str = UnicodeUtils.canonical_decomposition(str)
@@ -13,6 +13,7 @@ module UnicodeUtils
13
13
  #
14
14
  # Example:
15
15
  #
16
+ # require "unicode_utils/nfkc"
16
17
  # # LATIN SMALL LIGATURE FI => LATIN SMALL LETTER F, LATIN SMALL LETTER I
17
18
  # UnicodeUtils.nfkc("ﬁ") => "fi"
18
19
  #
@@ -4,8 +4,7 @@ module UnicodeUtils
4
4
 
5
5
  # Absolute path to the directory from which UnicodeUtils loads its
6
6
  # compiled Unicode data files at runtime.
7
- CDATA_DIR =
8
- File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
7
+ CDATA_DIR = File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
9
8
 
10
9
  module Impl # :nodoc:
11
10
 
@@ -66,6 +65,54 @@ module UnicodeUtils
66
65
  }
67
66
  end
68
67
 
68
+ def self.read_conditional_casings(filename)
69
+ Hash.new.tap { |cp_map|
70
+ open_cdata_file(filename) do |input|
71
+ input.each_line { |line|
72
+ line.chomp!
73
+ record = line.split(";")
74
+ cp = record[0].to_i(16)
75
+ mapping = record[1].split(",").map { |c| c.to_i(16) }
76
+ language_id = record[2].empty? ? nil : record[2].to_sym
77
+ context = record[3] && record[3].gsub('_', '')
78
+ casing = Impl.const_get("#{context}ConditionalCasing").new(mapping)
79
+ (cp_map[cp] ||= {})[language_id] = casing
80
+ }
81
+ end
82
+ }
83
+ end
84
+
85
+ def self.read_combining_class_map
86
+ Hash.new.tap { |map|
87
+ open_cdata_file("combining_class_map") do |input|
88
+ buffer = "x" * 6
89
+ buffer.force_encoding(Encoding::US_ASCII)
90
+ cc_buffer = "x" * 2
91
+ cc_buffer.force_encoding(Encoding::US_ASCII)
92
+ while input.read(6, buffer)
93
+ map[buffer.to_i(16)] = input.read(2, cc_buffer).to_i(16)
94
+ end
95
+ end
96
+ }
97
+ end
98
+
99
+ # Read a map whose keys are codepoints (6 hexdigits, converted to
100
+ # integer) and whose values are single hexdigits (converted to
101
+ # integer).
102
+ def self.read_hexdigit_map(filename)
103
+ Hash.new.tap { |map|
104
+ open_cdata_file(filename) do |input|
105
+ buffer = "x" * 6
106
+ buffer.force_encoding(Encoding::US_ASCII)
107
+ val_buffer = "x"
108
+ val_buffer.force_encoding(Encoding::US_ASCII)
109
+ while input.read(6, buffer)
110
+ map[buffer.to_i(16)] = input.read(1, val_buffer).to_i(16)
111
+ end
112
+ end
113
+ }
114
+ end
115
+
69
116
  end
70
117
 
71
118
  end
@@ -16,6 +16,7 @@ module UnicodeUtils
16
16
  #
17
17
  # Examples:
18
18
  #
19
+ # require "unicode_utils/simple_casefold"
19
20
  # UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
20
21
  # UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
21
22
  #
@@ -7,17 +7,16 @@ module UnicodeUtils
7
7
  SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
8
8
 
9
9
  # Map each codepoint in +str+ that has a single codepoint
10
- # lowercase-mapping to that lowercase mapping. +str+ is assumed to be
11
- # in a unicode encoding. The original string is not modified. The
12
- # returned string has the same encoding and same length as the
13
- # original string.
10
+ # lowercase-mapping to that lowercase mapping. The returned string
11
+ # has the same length as the original string.
14
12
  #
15
13
  # This function is locale independent.
16
14
  #
17
15
  # Examples:
18
16
  #
19
- # UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
20
- # UnicodeUtils.simple_downcase("STRASSE") => "strasse"
17
+ # require "unicode_utils/simple_downcase"
18
+ # UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
19
+ # UnicodeUtils.simple_downcase("STRASSE") => "strasse"
21
20
  def simple_downcase(str)
22
21
  String.new.force_encoding(str.encoding).tap { |res|
23
22
  str.each_codepoint { |cp|
@@ -7,17 +7,16 @@ module UnicodeUtils
7
7
  SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
8
8
 
9
9
  # Map each codepoint in +str+ that has a single codepoint
10
- # uppercase-mapping to that uppercase mapping. +str+ is assumed to be
11
- # in a unicode encoding. The original string is not modified. The
12
- # returned string has the same encoding and same length as the
13
- # original string.
10
+ # uppercase-mapping to that uppercase mapping. The returned string
11
+ # has the same length as the original string.
14
12
  #
15
13
  # This function is locale independent.
16
14
  #
17
15
  # Examples:
18
16
  #
19
- # UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
20
- # UnicodeUtils.simple_upcase("weiß") => "WEIß"
17
+ # require "unicode_utils/simple_upcase"
18
+ # UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
19
+ # UnicodeUtils.simple_upcase("weiß") => "WEIß"
21
20
  def simple_upcase(str)
22
21
  String.new.force_encoding(str.encoding).tap { |res|
23
22
  str.each_codepoint { |cp|
@@ -0,0 +1,70 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_cdata"
4
+ require "unicode_utils/conditional_casing"
5
+ require "unicode_utils/each_word"
6
+ require "unicode_utils/cased_char_q"
7
+ require "unicode_utils/downcase"
8
+
9
+ module UnicodeUtils
10
+
11
+ SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
12
+ SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
13
+
14
+ # Convert the first cased character after each word boundary to
15
+ # titlecase and all other cased characters to lowercase. For many,
16
+ # but not all characters, the titlecase mapping is the same as the
17
+ # uppercase mapping.
18
+ #
19
+ # Some conversion rules are language dependent; these are in effect
20
+ # when a non-nil +language_id+ is given. If non-nil, the
21
+ # +language_id+ must be a two letter language code as defined in BCP
22
+ # 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
23
+ # language doesn't have a two letter code, the three letter code is
24
+ # to be used. If locale independent behaviour is required, +nil+
25
+ # should be passed explicitly, because a later version of
26
+ # UnicodeUtils may default to something else.
27
+ #
28
+ # Example:
29
+ #
30
+ # require "unicode_utils/titlecase"
31
+ # UnicodeUtils.titlecase("hello, world!") => "Hello, World!"
32
+ def titlecase(str, language_id = nil)
33
+ String.new.force_encoding(str.encoding).tap do |res|
34
+ # ensure O(1) lookup by index
35
+ str = str.encode(Encoding::UTF_32LE)
36
+ i = 0
37
+ each_word(str) { |word|
38
+ cased_char_found = false
39
+ word.each_codepoint { |cp|
40
+ cased = cased_char?(cp)
41
+ if !cased_char_found && cased
42
+ cased_char_found = true
43
+ special_mapping =
44
+ Impl.conditional_titlecase_mapping(cp, str, i, language_id) ||
45
+ SPECIAL_TITLECASE_MAP[cp]
46
+ if special_mapping
47
+ special_mapping.each { |m| res << m }
48
+ else
49
+ res << (SIMPLE_TITLECASE_MAP[cp] || cp)
50
+ end
51
+ elsif cased
52
+ special_mapping =
53
+ Impl.conditional_downcase_mapping(cp, str, i, language_id) ||
54
+ SPECIAL_DOWNCASE_MAP[cp]
55
+ if special_mapping
56
+ special_mapping.each { |m| res << m }
57
+ else
58
+ res << (SIMPLE_DOWNCASE_MAP[cp] || cp)
59
+ end
60
+ else
61
+ res << cp
62
+ end
63
+ i += 1
64
+ }
65
+ }
66
+ end
67
+ end
68
+ module_function :titlecase
69
+
70
+ end
@@ -16,15 +16,22 @@ module UnicodeUtils
16
16
  # +language_id+ must be a two letter language code as defined in BCP
17
17
  # 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
18
18
  # language doesn't have a two letter code, the three letter code is
19
- # to be used.
19
+ # to be used. If locale independent behaviour is required, +nil+
20
+ # should be passed explicitly, because a later version of
21
+ # UnicodeUtils may default to something else.
20
22
  #
21
23
  # Examples:
22
24
  #
23
- # UnicodeUtils.upcase("weiß") => "WEISS"
24
- # UnicodeUtils.upcase("i", :en) => "I"
25
- # UnicodeUtils.upcase("i", :tr) => "İ"
25
+ # require "unicode_utils/upcase"
26
+ # UnicodeUtils.upcase("weiß") => "WEISS"
27
+ # UnicodeUtils.upcase("i", :en) => "I"
28
+ # UnicodeUtils.upcase("i", :tr) => "İ"
26
29
  def upcase(str, language_id = nil)
27
30
  String.new.force_encoding(str.encoding).tap { |res|
31
+ if Impl::LANGS_WITH_RULES.include?(language_id)
32
+ # ensure O(1) lookup by index
33
+ str = str.encode(Encoding::UTF_32LE)
34
+ end
28
35
  pos = 0
29
36
  str.each_codepoint { |cp|
30
37
  special_mapping =
@@ -3,6 +3,6 @@
3
3
  module UnicodeUtils
4
4
 
5
5
  # Corresponds to the unicode_utils gem version.
6
- VERSION = "0.5.0"
6
+ VERSION = "1.0.0"
7
7
 
8
8
  end
@@ -177,4 +177,50 @@ class TestUnicodeUtils < Test::Unit::TestCase
177
177
  UnicodeUtils.casefold("weiß")
178
178
  end
179
179
 
180
+ def test_each_grapheme
181
+ graphemes = []
182
+ UnicodeUtils.each_grapheme("word") { |g| graphemes << g }
183
+ assert_equal ["w", "o", "r", "d"], graphemes
184
+ UnicodeUtils.each_grapheme("") { |g| flunk }
185
+ graphemes = []
186
+ UnicodeUtils.each_grapheme("u\u{308}mit") { |g| graphemes << g }
187
+ # diaeresis
188
+ assert_equal ["u\u{308}", "m", "i", "t"], graphemes
189
+ # hangul syllable
190
+ graphemes = []
191
+ UnicodeUtils.each_grapheme("\u{1111}\u{1171}\u{11b6}\u{d4db}") { |g| graphemes << g }
192
+ assert_equal ["\u{1111}\u{1171}\u{11b6}", "\u{d4db}"], graphemes
193
+ assert_equal ["a", "\r\n", "b"], UnicodeUtils.each_grapheme("a\r\nb").to_a
194
+ end
195
+
196
+ def test_each_word
197
+ words = []
198
+ UnicodeUtils.each_word("two words") { |w| words << w }
199
+ assert_equal ["two", " ", "words"], words
200
+ assert_equal ["a", " ", "b"], UnicodeUtils.each_word("a b").to_a
201
+ assert_equal [" ", "b"], UnicodeUtils.each_word(" b").to_a
202
+ assert_equal ["a", " "], UnicodeUtils.each_word("a ").to_a
203
+ assert_equal [" "], UnicodeUtils.each_word(" ").to_a
204
+ assert_equal ["a"], UnicodeUtils.each_word("a").to_a
205
+ assert_equal [], UnicodeUtils.each_word("").to_a
206
+ assert_equal ["Hello", ",", " ", "world", "!"],
207
+ UnicodeUtils.each_word("Hello, world!").to_a
208
+ assert_equal ["o\u{308}12"],
209
+ UnicodeUtils.each_word("o\u{308}12").to_a
210
+ assert_equal ["o\u{308}1"],
211
+ UnicodeUtils.each_word("o\u{308}1").to_a
212
+ assert_equal ["o\u{308}"],
213
+ UnicodeUtils.each_word("o\u{308}").to_a
214
+ assert_equal ["\u{308}", "o"],
215
+ UnicodeUtils.each_word("\u{308}o").to_a
216
+ end
217
+
218
+ def test_titlecase
219
+ assert_equal "Hello, World!", UnicodeUtils.titlecase("heLlo, world!")
220
+ assert_equal "Find", UnicodeUtils.titlecase("finD")
221
+ assert_equal "Ümit Huber Jandl", UnicodeUtils.titlecase("ümit huber jandl")
222
+ assert_equal "İ Can Has 1Kg Cheesburger",
223
+ UnicodeUtils.titlecase("i can has 1kg CHEESBURGER", :tr)
224
+ end
225
+
180
226
  end