unicode_utils 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_cdata"
4
+
5
+ module UnicodeUtils
6
+
7
+ # Maps codepoints to integer codes. For the integer code to property
8
+ # mapping, see #compile_word_break_property in data/compile.rb.
9
+ WORD_BREAK_MAP =
10
+ Impl.read_hexdigit_map("word_break_property") # :nodoc:
11
+
12
# Break +str+ into words as defined by Unicode's Default Word
# Boundary Specification and pass each word to the given block.
# Every codepoint of +str+ ends up in exactly one yielded word, so
# whitespace and punctuation are yielded as words of their own.
#
# Returns +str+, or an enumerator if no block is given.
#
# Example:
#
#   require "unicode_utils/each_word"
#   UnicodeUtils.each_word("Hello, world!").to_a => ["Hello", ",", " ", "world", "!"]
def each_word(str)
  return enum_for(__method__, str) unless block_given?

  # One word break property code (or nil) per codepoint of str.
  props = str.each_codepoint.map { |cp| WORD_BREAK_MAP[cp] }
  # Two trailing nils make the negative indices -1 and -2, which the
  # boundary test may look at near the start of the string, resolve
  # to nil instead of wrapping around to real property codes.
  props.push(nil, nil)

  current = String.new.force_encoding(str.encoding)
  str.each_codepoint.with_index do |cp, index|
    current << cp
    next unless Impl.word_break?(props, index)

    yield current
    # The yielded string now belongs to the caller; start a fresh one.
    current = String.new.force_encoding(str.encoding)
  end
  yield current unless current.empty?
  str
end
37
+ module_function :each_word
38
+
39
+ module Impl # :nodoc:all
40
+
41
# Tell whether a word boundary lies between index +i+ and index
# <tt>i + 1</tt> of +cs+.
#
# +cs+ holds one word break property code per codepoint. Lookups
# before the start (negative indices) and past the end must yield
# nil; each_word appends two nils to the array for that purpose.
#
# NOTE(review): the meaning of the codes is inferred from the rule
# each comparison implements (0x0 CR, 0x1 LF, 0x2 Newline, 0x3/0x4
# Extend/Format, 0x5 Katakana, 0x6 ALetter, 0x7 MidLetter, 0x8 MidNum,
# 0x9 MidNumLet, 0xA Numeric, 0xB ExtendNumLet) - confirm against
# compile_word_break_property in data/compile.rb.
def self.word_break?(cs, i)
  cur = cs[i]
  nxt = cs[i + 1]

  # wb3: 0x0 directly followed by 0x1 is never split.
  return false if cur == 0x0 && nxt == 0x1
  # wb3a: otherwise always break after 0x0, 0x1 and 0x2 ...
  return true if cur == 0x0 || cur == 0x1 || cur == 0x2
  # wb3b: ... and always break before them.
  return true if nxt == 0x0 || nxt == 0x1 || nxt == 0x2

  # Walk left over 0x3/0x4 codes so the remaining rules see the code
  # they are attached to.
  left = i
  left -= 1 while cs[left] == 0x3 || cs[left] == 0x4
  prev = cs[left]

  # wb5
  return false if prev == 0x6 && nxt == 0x6

  # Code after the next one, stepping right over 0x3/0x4.
  right = i + 2
  right += 1 while cs[right] == 0x3 || cs[right] == 0x4
  after = cs[right]

  # wb6
  return false if prev == 0x6 && (nxt == 0x7 || nxt == 0x9) && after == 0x6

  # Code before +prev+, again stepping left over 0x3/0x4.
  back = left - 1
  back -= 1 while cs[back] == 0x3 || cs[back] == 0x4
  before = cs[back]

  # wb7
  return false if before == 0x6 && (prev == 0x7 || prev == 0x9) && nxt == 0x6
  # wb8
  return false if prev == 0xA && nxt == 0xA
  # wb9
  return false if prev == 0x6 && nxt == 0xA
  # wb10
  return false if prev == 0xA && nxt == 0x6
  # wb11
  return false if before == 0xA && (prev == 0x8 || prev == 0x9) && nxt == 0xA
  # wb12
  return false if prev == 0xA && (nxt == 0x8 || nxt == 0x9) && after == 0xA
  # wb13
  return false if prev == 0x5 && nxt == 0x5
  # wb13a
  return false if nxt == 0xB && (prev == 0x6 || prev == 0xA || prev == 0x5 || prev == 0xB)
  # wb13b
  return false if prev == 0xB && (nxt == 0x6 || nxt == 0xA || nxt == 0x5)

  # Everything else is a boundary, except in front of a 0x3/0x4 code.
  !(nxt == 0x3 || nxt == 0x4)
end
115
+
116
+ end
117
+
118
+ end
@@ -7,6 +7,7 @@ module UnicodeUtils
7
7
  # Get an array of all Codepoint instances in Codepoint::RANGE whose
8
8
  # name matches regexp. Matching is case insensitive.
9
9
  #
10
+ # require "unicode_utils/grep"
10
11
  # UnicodeUtils.grep(/angstrom/) => [#<U+212B "Å" ANGSTROM SIGN utf8:e2,84,ab>]
11
12
  def grep(regexp)
12
13
  unless regexp.casefold?
@@ -6,7 +6,8 @@ module UnicodeUtils
6
6
  #
7
7
  # Example:
8
8
  #
9
- # UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
9
+ # require "unicode_utils/hangul_syllable_decomposition"
10
+ # UnicodeUtils.hangul_syllable_decomposition("\u{d4db}") => "\u{1111}\u{1171}\u{11b6}"
10
11
  def hangul_syllable_decomposition(char)
11
12
  String.new.force_encoding(char.encoding).tap do |str|
12
13
  Impl.append_hangul_syllable_decomposition(str , char.ord)
@@ -11,7 +11,8 @@ module UnicodeUtils
11
11
  #
12
12
  # Example:
13
13
  #
14
- # UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
14
+ # require "unicode_utils/jamo_short_name"
15
+ # UnicodeUtils.jamo_short_name("\u{1101}") => "GG"
15
16
  def jamo_short_name(char)
16
17
  JAMO_SHORT_NAME_MAP[char.ord]
17
18
  end
@@ -21,10 +21,6 @@ module UnicodeUtils
21
21
 
22
22
  module NFC
23
23
 
24
- def self.starter?(cp)
25
- (COMBINING_CLASS_MAP[cp] || 0) == 0
26
- end
27
-
28
24
  # does b block c?
29
25
  def self.blocked?(b, c)
30
26
  # From the standard:
@@ -33,7 +29,7 @@ module UnicodeUtils
33
29
  # at only the immediately preceding character."
34
30
  # cpary is in canonical order (since it comes out of
35
31
  # canonical_decomposition).
36
- (COMBINING_CLASS_MAP[b] || 0) >= (COMBINING_CLASS_MAP[c] || 0)
32
+ COMBINING_CLASS_MAP[b] >= COMBINING_CLASS_MAP[c]
37
33
  end
38
34
 
39
35
  def self.primary_composite?(cp)
@@ -64,7 +60,7 @@ module UnicodeUtils
64
60
  last_starter = nil
65
61
  uncomposable_non_starters = []
66
62
  str.each_codepoint { |cp|
67
- if Impl::NFC.starter?(cp)
63
+ if COMBINING_CLASS_MAP[cp] == 0 # starter?
68
64
  combined = false
69
65
  if last_starter && uncomposable_non_starters.empty?
70
66
  ### hangul ###
@@ -135,6 +131,7 @@ module UnicodeUtils
135
131
  #
136
132
  # Example:
137
133
  #
134
+ # require "unicode_utils/nfc"
138
135
  # UnicodeUtils.nfc("La\u{308}mpchen") => "Lämpchen"
139
136
  def nfc(str)
140
137
  str = UnicodeUtils.canonical_decomposition(str)
@@ -13,6 +13,7 @@ module UnicodeUtils
13
13
  #
14
14
  # Example:
15
15
  #
16
+ # require "unicode_utils/nfkc"
16
17
  # # LATIN SMALL LIGATURE FI => LATIN SMALL LETTER F, LATIN SMALL LETTER I
17
18
  # UnicodeUtils.nfkc("fi") => "fi"
18
19
  #
@@ -4,8 +4,7 @@ module UnicodeUtils
4
4
 
5
5
  # Absolute path to the directory from which UnicodeUtils loads its
6
6
  # compiled Unicode data files at runtime.
7
- CDATA_DIR =
8
- File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
7
+ CDATA_DIR = File.absolute_path(File.join(File.dirname(__FILE__), "..", "..", "cdata"))
9
8
 
10
9
  module Impl # :nodoc:
11
10
 
@@ -66,6 +65,54 @@ module UnicodeUtils
66
65
  }
67
66
  end
68
67
 
68
# Read the conditional casing records from the cdata file +filename+.
#
# Each line is a ";"-separated record: codepoint (hex), mapping
# (","-separated hex codepoints), language id (may be empty) and an
# optional context name. The context name, with underscores removed,
# selects the Impl class "<Context>ConditionalCasing", which is
# instantiated with the mapping.
#
# Returns a Hash of the form
# <tt>{ codepoint => { language_id_or_nil => casing } }</tt>.
def self.read_conditional_casings(filename)
  casings = {}
  open_cdata_file(filename) do |input|
    input.each_line do |line|
      cp_hex, mapping_hex, lang, raw_context = line.chomp.split(";")
      codepoint = cp_hex.to_i(16)
      mapping = mapping_hex.split(",").map { |hex| hex.to_i(16) }
      # An empty language field means the rule applies to any language.
      language_id = lang.empty? ? nil : lang.to_sym
      context = raw_context ? raw_context.gsub('_', '') : nil
      casing_class = Impl.const_get("#{context}ConditionalCasing")
      by_language = (casings[codepoint] ||= {})
      by_language[language_id] = casing_class.new(mapping)
    end
  end
  casings
end
84
+
85
# Read the "combining_class_map" cdata file into a Hash.
#
# The file is consumed as a run of fixed-width records without
# separators: 6 hex digits for the codepoint followed by 2 hex digits
# for its value. Both fields are converted to integers.
def self.read_combining_class_map
  classes = {}
  open_cdata_file("combining_class_map") do |input|
    # The same two buffers are handed to every read call.
    cp_hex = ("x" * 6).force_encoding(Encoding::US_ASCII)
    cc_hex = ("x" * 2).force_encoding(Encoding::US_ASCII)
    until input.read(6, cp_hex).nil?
      codepoint = cp_hex.to_i(16)
      classes[codepoint] = input.read(2, cc_hex).to_i(16)
    end
  end
  classes
end
98
+
99
# Read the cdata file +filename+ into a Hash whose keys are codepoints
# (stored as 6 hexdigits, converted to integer) and whose values are
# single hexdigits (converted to integer). The file is a run of such
# 7 character records without separators.
def self.read_hexdigit_map(filename)
  digits = {}
  open_cdata_file(filename) do |input|
    # The same two buffers are handed to every read call.
    cp_hex = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_hex = String.new("x").force_encoding(Encoding::US_ASCII)
    until input.read(6, cp_hex).nil?
      codepoint = cp_hex.to_i(16)
      digits[codepoint] = input.read(1, digit_hex).to_i(16)
    end
  end
  digits
end
115
+
69
116
  end
70
117
 
71
118
  end
@@ -16,6 +16,7 @@ module UnicodeUtils
16
16
  #
17
17
  # Examples:
18
18
  #
19
+ # require "unicode_utils/simple_casefold"
19
20
  # UnicodeUtils.simple_casefold("Ümit") == UnicodeUtils.simple_casefold("ümit") => true
20
21
  # UnicodeUtils.simple_casefold("WEISS") == UnicodeUtils.simple_casefold("weiß") => false
21
22
  #
@@ -7,17 +7,16 @@ module UnicodeUtils
7
7
  SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
8
8
 
9
9
  # Map each codepoint in +str+ that has a single codepoint
10
- # lowercase-mapping to that lowercase mapping. +str+ is assumed to be
11
- # in a unicode encoding. The original string is not modified. The
12
- # returned string has the same encoding and same length as the
13
- # original string.
10
+ # lowercase-mapping to that lowercase mapping. The returned string
11
+ # has the same length as the original string.
14
12
  #
15
13
  # This function is locale independent.
16
14
  #
17
15
  # Examples:
18
16
  #
19
- # UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
20
- # UnicodeUtils.simple_downcase("STRASSE") => "strasse"
17
+ # require "unicode_utils/simple_downcase"
18
+ # UnicodeUtils.simple_downcase("ÜMIT: 123") => "ümit: 123"
19
+ # UnicodeUtils.simple_downcase("STRASSE") => "strasse"
21
20
  def simple_downcase(str)
22
21
  String.new.force_encoding(str.encoding).tap { |res|
23
22
  str.each_codepoint { |cp|
@@ -7,17 +7,16 @@ module UnicodeUtils
7
7
  SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
8
8
 
9
9
  # Map each codepoint in +str+ that has a single codepoint
10
- # uppercase-mapping to that uppercase mapping. +str+ is assumed to be
11
- # in a unicode encoding. The original string is not modified. The
12
- # returned string has the same encoding and same length as the
13
- # original string.
10
+ # uppercase-mapping to that uppercase mapping. The returned string
11
+ # has the same length as the original string.
14
12
  #
15
13
  # This function is locale independent.
16
14
  #
17
15
  # Examples:
18
16
  #
19
- # UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
20
- # UnicodeUtils.simple_upcase("weiß") => "WEIß"
17
+ # require "unicode_utils/simple_upcase"
18
+ # UnicodeUtils.simple_upcase("ümit: 123") => "ÜMIT: 123"
19
+ # UnicodeUtils.simple_upcase("weiß") => "WEIß"
21
20
  def simple_upcase(str)
22
21
  String.new.force_encoding(str.encoding).tap { |res|
23
22
  str.each_codepoint { |cp|
@@ -0,0 +1,70 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/read_cdata"
4
+ require "unicode_utils/conditional_casing"
5
+ require "unicode_utils/each_word"
6
+ require "unicode_utils/cased_char_q"
7
+ require "unicode_utils/downcase"
8
+
9
+ module UnicodeUtils
10
+
11
+ SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
12
+ SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
13
+
14
# Titlecase +str+: within each word (as produced by each_word) the
# first cased character is mapped to titlecase and every later cased
# character to lowercase; uncased characters are copied unchanged.
# For many, but not all characters, the titlecase mapping is the same
# as the uppercase mapping.
#
# Some conversion rules are language dependent, these are in effect
# when a non-nil +language_id+ is given. If non-nil, the
# +language_id+ must be a two letter language code as defined in BCP
# 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
# language doesn't have a two letter code, the three letter code is
# to be used. If locale independent behaviour is required, +nil+
# should be passed explicitly, because a later version of
# UnicodeUtils may default to something else.
#
# Returns a new string with the encoding of +str+.
#
# Example:
#
#   require "unicode_utils/titlecase"
#   UnicodeUtils.titlecase("hello, world!") => "Hello, World!"
def titlecase(str, language_id = nil)
  result = String.new.force_encoding(str.encoding)
  # ensure O(1) lookup by index
  str = str.encode(Encoding::UTF_32LE)
  # Index of the current codepoint within the whole string; it is
  # handed to the conditional mapping lookups together with str.
  pos = 0
  each_word(str) do |word|
    seen_cased = false
    word.each_codepoint do |cp|
      if !cased_char?(cp)
        result << cp
      elsif seen_cased
        # Cased character after the first one in this word: lowercase.
        mapping =
          Impl.conditional_downcase_mapping(cp, str, pos, language_id) ||
          SPECIAL_DOWNCASE_MAP[cp]
        if mapping
          mapping.each { |m| result << m }
        else
          result << (SIMPLE_DOWNCASE_MAP[cp] || cp)
        end
      else
        # First cased character of this word: titlecase.
        seen_cased = true
        mapping =
          Impl.conditional_titlecase_mapping(cp, str, pos, language_id) ||
          SPECIAL_TITLECASE_MAP[cp]
        if mapping
          mapping.each { |m| result << m }
        else
          result << (SIMPLE_TITLECASE_MAP[cp] || cp)
        end
      end
      pos += 1
    end
  end
  result
end
68
+ module_function :titlecase
69
+
70
+ end
@@ -16,15 +16,22 @@ module UnicodeUtils
16
16
  # +language_id+ must be a two letter language code as defined in BCP
17
17
  # 47 (http://tools.ietf.org/rfc/bcp/bcp47.txt) as a symbol. If a
18
18
  # language doesn't have a two letter code, the three letter code is
19
- # to be used.
19
+ # to be used. If locale independent behaviour is required, +nil+
20
+ # should be passed explicitly, because a later version of
21
+ # UnicodeUtils may default to something else.
20
22
  #
21
23
  # Examples:
22
24
  #
23
- # UnicodeUtils.upcase("weiß") => "WEISS"
24
- # UnicodeUtils.upcase("i", :en) => "I"
25
- # UnicodeUtils.upcase("i", :tr) => "İ"
25
+ # require "unicode_utils/upcase"
26
+ # UnicodeUtils.upcase("weiß") => "WEISS"
27
+ # UnicodeUtils.upcase("i", :en) => "I"
28
+ # UnicodeUtils.upcase("i", :tr) => "İ"
26
29
  def upcase(str, language_id = nil)
27
30
  String.new.force_encoding(str.encoding).tap { |res|
31
+ if Impl::LANGS_WITH_RULES.include?(language_id)
32
+ # ensure O(1) lookup by index
33
+ str = str.encode(Encoding::UTF_32LE)
34
+ end
28
35
  pos = 0
29
36
  str.each_codepoint { |cp|
30
37
  special_mapping =
@@ -3,6 +3,6 @@
3
3
  module UnicodeUtils
4
4
 
5
5
  # Corresponds to the unicode_utils gem version.
6
- VERSION = "0.5.0"
6
+ VERSION = "1.0.0"
7
7
 
8
8
  end
@@ -177,4 +177,50 @@ class TestUnicodeUtils < Test::Unit::TestCase
177
177
  UnicodeUtils.casefold("weiß")
178
178
  end
179
179
 
180
# Exercise UnicodeUtils.each_grapheme in block form and in
# enumerator form.
def test_each_grapheme
  # Collect everything each_grapheme yields for +text+ via a block.
  collect = lambda { |text|
    yielded = []
    UnicodeUtils.each_grapheme(text) { |g| yielded << g }
    yielded
  }

  assert_equal ["w", "o", "r", "d"], collect.call("word")
  # the block must never run for an empty string
  UnicodeUtils.each_grapheme("") { |g| flunk }
  # diaeresis
  assert_equal ["u\u{308}", "m", "i", "t"], collect.call("u\u{308}mit")
  # hangul syllable
  assert_equal ["\u{1111}\u{1171}\u{11b6}", "\u{d4db}"],
    collect.call("\u{1111}\u{1171}\u{11b6}\u{d4db}")
  # enumerator form
  assert_equal ["a", "\r\n", "b"], UnicodeUtils.each_grapheme("a\r\nb").to_a
end
195
+
196
# Exercise UnicodeUtils.each_word: once in block form, then a table
# of inputs through the enumerator form.
def test_each_word
  yielded = []
  UnicodeUtils.each_word("two words") { |w| yielded << w }
  assert_equal ["two", " ", "words"], yielded

  # [input, expected words]
  [
    ["a b", ["a", " ", "b"]],
    [" b", [" ", "b"]],
    ["a ", ["a", " "]],
    [" ", [" "]],
    ["a", ["a"]],
    ["", []],
    ["Hello, world!", ["Hello", ",", " ", "world", "!"]],
    ["o\u{308}12", ["o\u{308}12"]],
    ["o\u{308}1", ["o\u{308}1"]],
    ["o\u{308}", ["o\u{308}"]],
    ["\u{308}o", ["\u{308}", "o"]],
  ].each do |input, expected|
    assert_equal expected, UnicodeUtils.each_word(input).to_a
  end
end
217
+
218
# Exercise UnicodeUtils.titlecase with and without a language id.
def test_titlecase
  # [expected, arguments passed to titlecase]
  [
    ["Hello, World!", ["heLlo, world!"]],
    ["Find", ["finD"]],
    ["Ümit Huber Jandl", ["ümit huber jandl"]],
    ["İ Can Has 1Kg Cheesburger", ["i can has 1kg CHEESBURGER", :tr]],
  ].each do |expected, args|
    assert_equal expected, UnicodeUtils.titlecase(*args)
  end
end
225
+
180
226
  end