unicode_utils 1.2.2 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES.txt +14 -0
- data/LICENSE.txt +1 -1
- data/cdata/canonical_decomposition_map +1 -1
- data/cdata/case_ignorable_set +1 -1
- data/cdata/casefold_c_map +1 -1
- data/cdata/combining_class_map +1 -1
- data/cdata/compatibility_decomposition_map +1 -1
- data/cdata/composition_exclusion_set +1 -1
- data/cdata/east_asian_width_property_per_cp +1 -1
- data/cdata/east_asian_width_property_ranges +1 -1
- data/cdata/general_category_per_cp +1 -1
- data/cdata/general_category_ranges +1 -1
- data/cdata/grapheme_break_property +1 -1
- data/cdata/name_aliases +1 -0
- data/cdata/names +731 -0
- data/cdata/prop_set_lowercase +1 -1
- data/cdata/prop_set_uppercase +1 -1
- data/cdata/simple_lc_map +1 -1
- data/cdata/simple_tc_map +1 -1
- data/cdata/simple_uc_map +1 -1
- data/cdata/word_break_property +1 -1
- data/lib/unicode_utils.rb +6 -3
- data/lib/unicode_utils/canonical_decomposition.rb +2 -2
- data/lib/unicode_utils/case_ignorable_char_q.rb +1 -1
- data/lib/unicode_utils/char_display_width.rb +2 -2
- data/lib/unicode_utils/char_name.rb +13 -3
- data/lib/unicode_utils/char_type.rb +1 -1
- data/lib/unicode_utils/code_point_type.rb +70 -0
- data/lib/unicode_utils/codepoint.rb +5 -5
- data/lib/unicode_utils/compatibility_decomposition.rb +1 -1
- data/lib/unicode_utils/debug.rb +5 -5
- data/lib/unicode_utils/default_ignorable_char_q.rb +2 -2
- data/lib/unicode_utils/display_width.rb +3 -3
- data/lib/unicode_utils/each_grapheme.rb +2 -2
- data/lib/unicode_utils/each_word.rb +1 -1
- data/lib/unicode_utils/east_asian_width.rb +2 -2
- data/lib/unicode_utils/gc.rb +1 -1
- data/lib/unicode_utils/general_category.rb +1 -1
- data/lib/unicode_utils/lowercase_char_q.rb +1 -1
- data/lib/unicode_utils/name_alias.rb +46 -0
- data/lib/unicode_utils/name_aliases.rb +29 -0
- data/lib/unicode_utils/nfc.rb +3 -3
- data/lib/unicode_utils/read_cdata.rb +36 -4
- data/lib/unicode_utils/sid.rb +63 -0
- data/lib/unicode_utils/simple_casefold.rb +2 -2
- data/lib/unicode_utils/simple_downcase.rb +2 -2
- data/lib/unicode_utils/simple_upcase.rb +2 -2
- data/lib/unicode_utils/soft_dotted_char_q.rb +1 -1
- data/lib/unicode_utils/titlecase.rb +1 -1
- data/lib/unicode_utils/titlecase_char_q.rb +1 -1
- data/lib/unicode_utils/uppercase_char_q.rb +1 -1
- data/lib/unicode_utils/version.rb +10 -3
- data/test/test_unicode_utils.rb +109 -5
- metadata +26 -39
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/name_alias"
|
4
|
+
require "unicode_utils/read_cdata"
|
5
|
+
|
6
|
+
module UnicodeUtils
|
7
|
+
|
8
|
+
NAME_ALIASES_MAP = Impl.read_name_aliases("name_aliases") # :nodoc:
|
9
|
+
NAME_ALIASES_MAP.default = [].freeze
|
10
|
+
|
11
|
+
# Get an Enumerable of formal name aliases of the given character. Returns an
|
12
|
+
# empty Enumerable if the character doesn't have an alias.
|
13
|
+
#
|
14
|
+
# The aliases are instances of UnicodeUtils::NameAlias, the order of the
|
15
|
+
# aliases in the returned Enumerable is preserved from NameAliases.txt in the
|
16
|
+
# Unicode Character Database.
|
17
|
+
#
|
18
|
+
# Example:
|
19
|
+
#
|
20
|
+
# require "unicode_utils/name_aliases"
|
21
|
+
# UnicodeUtils.name_aliases("\n").map(&:name) # => ["LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL"]
|
22
|
+
#
|
23
|
+
# See also: UnicodeUtils.char_name
|
24
|
+
def name_aliases(char)
|
25
|
+
NAME_ALIASES_MAP[char.ord]
|
26
|
+
end
|
27
|
+
module_function :name_aliases
|
28
|
+
|
29
|
+
end
|
data/lib/unicode_utils/nfc.rb
CHANGED
@@ -9,7 +9,7 @@ module UnicodeUtils
|
|
9
9
|
module Impl # :nodoc:all
|
10
10
|
|
11
11
|
COMPOSITION_EXCLUSION_SET =
|
12
|
-
Impl.
|
12
|
+
Impl.read_code_point_set("composition_exclusion_set")
|
13
13
|
|
14
14
|
CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
|
15
15
|
CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
|
@@ -124,8 +124,8 @@ module UnicodeUtils
|
|
124
124
|
# Get +str+ in Normalization Form C.
|
125
125
|
#
|
126
126
|
# The Unicode standard has multiple representations for some
|
127
|
-
# characters. One representation as a single
|
128
|
-
# representation(s) as a combination of multiple
|
127
|
+
# characters. One representation as a single code point and other
|
128
|
+
# representation(s) as a combination of multiple code points. This
|
129
129
|
# function "composes" these characters into the former
|
130
130
|
# representation.
|
131
131
|
#
|
@@ -16,11 +16,19 @@ module UnicodeUtils
|
|
16
16
|
5 => :Narrow
|
17
17
|
}.freeze
|
18
18
|
|
19
|
+
NAME_ALIAS_TYPE_TO_SYMBOL_MAP = {
|
20
|
+
1 => :correction,
|
21
|
+
2 => :control,
|
22
|
+
3 => :alternate,
|
23
|
+
4 => :figment,
|
24
|
+
5 => :abbreviation
|
25
|
+
}.freeze
|
26
|
+
|
19
27
|
def self.open_cdata_file(filename, &block)
|
20
28
|
File.open(File.join(CDATA_DIR, filename), "r:US-ASCII:-", &block)
|
21
29
|
end
|
22
30
|
|
23
|
-
def self.
|
31
|
+
def self.read_code_point_set(filename)
|
24
32
|
Hash.new.tap { |set|
|
25
33
|
open_cdata_file(filename) do |input|
|
26
34
|
buffer = "x" * 6
|
@@ -32,7 +40,7 @@ module UnicodeUtils
|
|
32
40
|
}
|
33
41
|
end
|
34
42
|
|
35
|
-
def self.
|
43
|
+
def self.read_code_point_map(filename)
|
36
44
|
Hash.new.tap { |map|
|
37
45
|
open_cdata_file(filename) do |input|
|
38
46
|
buffer = "x" * 6
|
@@ -104,7 +112,7 @@ module UnicodeUtils
|
|
104
112
|
}
|
105
113
|
end
|
106
114
|
|
107
|
-
# Read a map whose keys are
|
115
|
+
# Read a map whose keys are code points (6 hexdigits, converted to
|
108
116
|
# integer) and whose values are single hexdigits (converted to
|
109
117
|
# integer).
|
110
118
|
def self.read_hexdigit_map(filename)
|
@@ -122,7 +130,7 @@ module UnicodeUtils
|
|
122
130
|
end
|
123
131
|
|
124
132
|
# Returns a list (array) of pairs (two element Arrays) of Range
|
125
|
-
# (
|
133
|
+
# (code points) and associated integer value.
|
126
134
|
def self.read_range_to_hexdigit_list(filename)
|
127
135
|
Array.new.tap { |list|
|
128
136
|
open_cdata_file(filename) do |input|
|
@@ -208,6 +216,30 @@ module UnicodeUtils
|
|
208
216
|
}
|
209
217
|
end
|
210
218
|
|
219
|
+
def self.read_name_aliases(filename)
|
220
|
+
Hash.new.tap { |map|
|
221
|
+
open_cdata_file(filename) do |input|
|
222
|
+
cp_buffer = "x" * 6
|
223
|
+
cp_buffer.force_encoding(Encoding::US_ASCII)
|
224
|
+
ac_buffer = "x" * 1
|
225
|
+
ac_buffer.force_encoding(Encoding::US_ASCII)
|
226
|
+
at_buffer = "x" * 1
|
227
|
+
at_buffer.force_encoding(Encoding::US_ASCII)
|
228
|
+
al_buffer = "x" * 2
|
229
|
+
al_buffer.force_encoding(Encoding::US_ASCII)
|
230
|
+
while input.read(6, cp_buffer)
|
231
|
+
aliases = Array.new(input.read(1, ac_buffer).to_i(16))
|
232
|
+
0.upto(aliases.length - 1) { |i|
|
233
|
+
type = NAME_ALIAS_TYPE_TO_SYMBOL_MAP[input.read(1, at_buffer).to_i(16)]
|
234
|
+
name = input.read(input.read(2, al_buffer).to_i(16))
|
235
|
+
aliases[i] = NameAlias.new(name.freeze, type)
|
236
|
+
}
|
237
|
+
map[cp_buffer.to_i(16)] = aliases.freeze
|
238
|
+
end
|
239
|
+
end
|
240
|
+
}
|
241
|
+
end
|
242
|
+
|
211
243
|
end
|
212
244
|
|
213
245
|
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/name_aliases"
|
4
|
+
require "unicode_utils/code_point_type"
|
5
|
+
|
6
|
+
module UnicodeUtils
|
7
|
+
|
8
|
+
CP_PREFERRED_ALIAS_STRING_MAP = Hash.new.tap do |map|
|
9
|
+
NAME_ALIASES_MAP.each { |cp, aliases|
|
10
|
+
al =
|
11
|
+
(aliases.find { |al| al.type == :correction } ||
|
12
|
+
aliases.find { |al| al.type == :control } ||
|
13
|
+
aliases.find { |al| al.type == :figment } ||
|
14
|
+
aliases.find { |al| al.type == :alternate })
|
15
|
+
map[cp] = al.name if al
|
16
|
+
}
|
17
|
+
end #:nodoc:
|
18
|
+
|
19
|
+
# Returns a unique string identifier for every code point. Returns
|
20
|
+
# nil if +code_point+ is not in the Unicode codespace. +code_point+
|
21
|
+
# must be an Integer.
|
22
|
+
#
|
23
|
+
# The returned string identifier is either the non-empty Name
|
24
|
+
# property value of +code_point+, a non-empty Name_Alias string
|
25
|
+
# property value of +code_point+, or the code point label as
|
26
|
+
# described by section "Code Point Labels" in chapter 4.8 "Name" of
|
27
|
+
# the Unicode standard.
|
28
|
+
#
|
29
|
+
# If the returned identifier starts with "<", it is a code point
|
30
|
+
# label and it ends with ">". Otherwise it is the normative name or
|
31
|
+
# a formal alias string.
|
32
|
+
#
|
33
|
+
# The exact name/alias/label selection algorithm may change even in
|
34
|
+
# minor UnicodeUtils releases, but overall behaviour will stay the
|
35
|
+
# same in spirit.
|
36
|
+
#
|
37
|
+
# The selection process in this version of UnicodeUtils is:
|
38
|
+
# 1. Use an alias of type :correction, :control, :figment or
|
39
|
+
# :alternate (with listed precedence) if available
|
40
|
+
# 2. Use the Unicode Name property value if it is not empty
|
41
|
+
# 3. Construct a code point label in angle brackets.
|
42
|
+
#
|
43
|
+
# Examples:
|
44
|
+
#
|
45
|
+
# require "unicode_utils/sid"
|
46
|
+
#
|
47
|
+
# U.sid 0xa # => "LINE FEED"
|
48
|
+
# U.sid 0x0 # => "NULL"
|
49
|
+
# U.sid 0xfeff # => "BYTE ORDER MARK"
|
50
|
+
# U.sid 0xe000 # => "<private-use-E000>"
|
51
|
+
# U.sid 0x61 # => "LATIN SMALL LETTER A"
|
52
|
+
# U.sid -1 # => nil
|
53
|
+
def sid(code_point)
|
54
|
+
s = CP_PREFERRED_ALIAS_STRING_MAP[code_point] and return s
|
55
|
+
cn = UnicodeUtils.char_name(code_point)
|
56
|
+
return cn if cn && cn !~ /\A(\<|\z)/
|
57
|
+
ct = UnicodeUtils.code_point_type(code_point) or return nil
|
58
|
+
ts = ct.to_s.downcase.gsub('_', '-')
|
59
|
+
"<#{ts}-#{code_point.to_s(16).upcase.rjust(4, '0')}>"
|
60
|
+
end
|
61
|
+
module_function :sid
|
62
|
+
|
63
|
+
end
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
CASEFOLD_C_MAP = Impl.
|
7
|
+
CASEFOLD_C_MAP = Impl.read_code_point_map("casefold_c_map") # :nodoc:
|
8
8
|
|
9
|
-
CASEFOLD_S_MAP = Impl.
|
9
|
+
CASEFOLD_S_MAP = Impl.read_code_point_map("casefold_s_map") # :nodoc:
|
10
10
|
|
11
11
|
# Perform simple case folding. Contrary to full case folding, this
|
12
12
|
# uses only one to one mappings, so that the length of the returned
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SIMPLE_DOWNCASE_MAP = Impl.
|
7
|
+
SIMPLE_DOWNCASE_MAP = Impl.read_code_point_map("simple_lc_map") # :nodoc:
|
8
8
|
|
9
|
-
# Map each
|
9
|
+
# Map each code point in +str+ that has a single code point
|
10
10
|
# lowercase-mapping to that lowercase mapping. The returned string
|
11
11
|
# has the same length as the original string.
|
12
12
|
#
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SIMPLE_UPCASE_MAP = Impl.
|
7
|
+
SIMPLE_UPCASE_MAP = Impl.read_code_point_map("simple_uc_map") # :nodoc:
|
8
8
|
|
9
|
-
# Map each
|
9
|
+
# Map each code point in +str+ that has a single code point
|
10
10
|
# uppercase-mapping to that uppercase mapping. The returned string
|
11
11
|
# has the same length as the original string.
|
12
12
|
#
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SOFT_DOTTED_SET = Impl.
|
7
|
+
SOFT_DOTTED_SET = Impl.read_code_point_set("soft_dotted_set") # :nodoc:
|
8
8
|
|
9
9
|
# Returns true if the given character has the Unicode property
|
10
10
|
# Soft_Dotted.
|
@@ -8,7 +8,7 @@ require "unicode_utils/downcase"
|
|
8
8
|
|
9
9
|
module UnicodeUtils
|
10
10
|
|
11
|
-
SIMPLE_TITLECASE_MAP = Impl.
|
11
|
+
SIMPLE_TITLECASE_MAP = Impl.read_code_point_map("simple_tc_map") # :nodoc:
|
12
12
|
SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
|
13
13
|
|
14
14
|
# Convert the first cased character after each word boundary to
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
TITLECASE_LETTER_SET = Impl.
|
7
|
+
TITLECASE_LETTER_SET = Impl.read_code_point_set("cat_set_titlecase") # :nodoc:
|
8
8
|
|
9
9
|
# True if the given character has the General_Category
|
10
10
|
# Titlecase_Letter (Lt).
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
PROP_UPPERCASE_SET = Impl.
|
7
|
+
PROP_UPPERCASE_SET = Impl.read_code_point_set("prop_set_uppercase") # :nodoc:
|
8
8
|
|
9
9
|
# True if the given character has the Unicode property Uppercase.
|
10
10
|
def uppercase_char?(char)
|
@@ -4,13 +4,20 @@ module UnicodeUtils
|
|
4
4
|
|
5
5
|
# Corresponds to the unicode_utils gem version.
|
6
6
|
#
|
7
|
+
# Conforms to Semantic Versioning as documented at semver.org.
|
8
|
+
#
|
9
|
+
# Summary:
|
7
10
|
# MAJOR.MINOR.PATCHLEVEL
|
8
11
|
# - A backwards incompatible change causes a change in MAJOR
|
9
12
|
# - New features or non-bugfix improvements cause a change in MINOR
|
10
13
|
# - Bugfixes increase only PATCHLEVEL.
|
14
|
+
# - Pre-release versions append more info after a dash.
|
15
|
+
VERSION = "1.3.0"
|
16
|
+
|
17
|
+
# The version of Unicode implemented by this version of UnicodeUtils.
|
11
18
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
|
19
|
+
# require "unicode_utils/version"
|
20
|
+
# puts "Unicode #{UnicodeUtils::UNICODE_VERSION}"
|
21
|
+
UNICODE_VERSION = "6.1.0"
|
15
22
|
|
16
23
|
end
|
data/test/test_unicode_utils.rb
CHANGED
@@ -8,6 +8,10 @@ require "unicode_utils"
|
|
8
8
|
# Fast tests for almost all UnicodeUtils functions.
|
9
9
|
class TestUnicodeUtils < Test::Unit::TestCase
|
10
10
|
|
11
|
+
def test_unicode_version
|
12
|
+
assert_match /\A\d+\.\d+\.\d+\z/, UnicodeUtils::UNICODE_VERSION
|
13
|
+
end
|
14
|
+
|
11
15
|
def test_name
|
12
16
|
assert_equal "LATIN SMALL LETTER F", UnicodeUtils.char_name("f")
|
13
17
|
assert_equal Encoding::US_ASCII, UnicodeUtils.char_name("f").encoding
|
@@ -421,19 +425,119 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
421
425
|
io = StringIO.new
|
422
426
|
UnicodeUtils.debug("", io: io)
|
423
427
|
assert_equal <<-'EOF', io.string
|
424
|
-
Char | Ordinal |
|
425
|
-
|
428
|
+
Char | Ordinal | Sid | General Category | UTF-8
|
429
|
+
------+---------+-----+------------------+-------
|
426
430
|
EOF
|
427
431
|
io = StringIO.new
|
428
432
|
UnicodeUtils.debug("一 \u{100000}\n", io: io)
|
429
433
|
assert_equal <<-'EOF', io.string
|
430
|
-
Char | Ordinal |
|
434
|
+
Char | Ordinal | Sid | General Category | UTF-8
|
431
435
|
------+---------+----------------------------+------------------+-------------
|
432
436
|
"一" | 4E00 | CJK UNIFIED IDEOGRAPH-4E00 | Other_Letter | E4 B8 80
|
433
437
|
" " | 20 | SPACE | Space_Separator | 20
|
434
|
-
N/A | 100000 |
|
435
|
-
"\n" | A |
|
438
|
+
N/A | 100000 | <private-use-100000> | Private_Use | F4 80 80 80
|
439
|
+
"\n" | A | LINE FEED | Control | 0A
|
436
440
|
EOF
|
437
441
|
end
|
438
442
|
|
443
|
+
def test_code_point_type
|
444
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type("A")
|
445
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type("a")
|
446
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x1cb)
|
447
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2b5)
|
448
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10923)
|
449
|
+
|
450
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x5a0)
|
451
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x93f)
|
452
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x20dd)
|
453
|
+
|
454
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xa901)
|
455
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10144)
|
456
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10917)
|
457
|
+
|
458
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x5f)
|
459
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2011)
|
460
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2329)
|
461
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xfe38)
|
462
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x201c)
|
463
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x201d)
|
464
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2e10)
|
465
|
+
|
466
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xff0b)
|
467
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xa3)
|
468
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2c2)
|
469
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x60f)
|
470
|
+
|
471
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2001)
|
472
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x2028)
|
473
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x2029)
|
474
|
+
|
475
|
+
assert_equal :Control, UnicodeUtils.code_point_type(0x0)
|
476
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x70f)
|
477
|
+
assert_equal :Surrogate, UnicodeUtils.code_point_type(0xdb82)
|
478
|
+
assert_equal :Private_Use, UnicodeUtils.code_point_type(0xf1020)
|
479
|
+
assert_equal :Private_Use, UnicodeUtils.code_point_type(0x10fffd)
|
480
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0x10ffff)
|
481
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xfffe)
|
482
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xffff)
|
483
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbfffe)
|
484
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbffff)
|
485
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0x380)
|
486
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xeeb)
|
487
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xfff)
|
488
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0x7fffd)
|
489
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xeffef)
|
490
|
+
### above is at least one assertion for every general category ###
|
491
|
+
|
492
|
+
assert_equal nil, UnicodeUtils.code_point_type(-1)
|
493
|
+
assert_equal nil, UnicodeUtils.code_point_type(0x110000)
|
494
|
+
end
|
495
|
+
|
496
|
+
def test_name_aliases
|
497
|
+
assert_equal [UnicodeUtils::NameAlias.new("NULL", :control),
|
498
|
+
UnicodeUtils::NameAlias.new("NUL", :abbreviation)],
|
499
|
+
UnicodeUtils.name_aliases(0x0)
|
500
|
+
assert_equal [UnicodeUtils::NameAlias.new("LATIN CAPITAL LETTER GHA", :correction)],
|
501
|
+
UnicodeUtils.name_aliases(0x1a2)
|
502
|
+
assert_equal [UnicodeUtils::NameAlias.new("BYTE ORDER MARK", :alternate),
|
503
|
+
UnicodeUtils::NameAlias.new("BOM", :abbreviation),
|
504
|
+
UnicodeUtils::NameAlias.new("ZWNBSP", :abbreviation)],
|
505
|
+
UnicodeUtils.name_aliases(0xfeff)
|
506
|
+
assert_equal [UnicodeUtils::NameAlias.new("PADDING CHARACTER", :figment),
|
507
|
+
UnicodeUtils::NameAlias.new("PAD", :abbreviation)],
|
508
|
+
UnicodeUtils.name_aliases(0x80)
|
509
|
+
assert_equal [UnicodeUtils::NameAlias.new("VS256", :abbreviation)],
|
510
|
+
UnicodeUtils.name_aliases(0xe01ef)
|
511
|
+
assert_equal [UnicodeUtils::NameAlias.new("LINE FEED", :control),
|
512
|
+
UnicodeUtils::NameAlias.new("NEW LINE", :control),
|
513
|
+
UnicodeUtils::NameAlias.new("END OF LINE", :control),
|
514
|
+
UnicodeUtils::NameAlias.new("LF", :abbreviation),
|
515
|
+
UnicodeUtils::NameAlias.new("NL", :abbreviation),
|
516
|
+
UnicodeUtils::NameAlias.new("EOL", :abbreviation)],
|
517
|
+
UnicodeUtils.name_aliases(0xa)
|
518
|
+
assert_equal [UnicodeUtils::NameAlias.new("CHARACTER TABULATION", :control),
|
519
|
+
UnicodeUtils::NameAlias.new("HORIZONTAL TABULATION", :control),
|
520
|
+
UnicodeUtils::NameAlias.new("HT", :abbreviation),
|
521
|
+
UnicodeUtils::NameAlias.new("TAB", :abbreviation)],
|
522
|
+
UnicodeUtils.name_aliases("\t")
|
523
|
+
assert_equal [],
|
524
|
+
UnicodeUtils.name_aliases("a")
|
525
|
+
end
|
526
|
+
|
527
|
+
def test_sid
|
528
|
+
assert_equal nil, UnicodeUtils.sid(-1)
|
529
|
+
assert_equal "NULL", UnicodeUtils.sid(0x0)
|
530
|
+
assert_equal "LATIN CAPITAL LETTER GHA", UnicodeUtils.sid(0x1a2)
|
531
|
+
assert_equal "LINE FEED", UnicodeUtils.sid(0xa)
|
532
|
+
assert_equal "PADDING CHARACTER", UnicodeUtils.sid(0x80)
|
533
|
+
assert_equal "BYTE ORDER MARK", UnicodeUtils.sid(0xfeff)
|
534
|
+
assert_equal "SPACE", UnicodeUtils.sid(0x20)
|
535
|
+
assert_equal "<reserved-0380>", UnicodeUtils.sid(0x380)
|
536
|
+
assert_equal "<surrogate-D800>", UnicodeUtils.sid(0xd800)
|
537
|
+
assert_equal "<private-use-F0000>", UnicodeUtils.sid(0xf0000)
|
538
|
+
assert_equal "<private-use-10FFFD>", UnicodeUtils.sid(0x10fffd)
|
539
|
+
assert_equal "<noncharacter-10FFFF>", UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end)
|
540
|
+
assert_equal nil, UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end + 1)
|
541
|
+
end
|
542
|
+
|
439
543
|
end
|
metadata
CHANGED
@@ -1,34 +1,25 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode_utils
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 1
|
7
|
-
- 2
|
8
|
-
- 2
|
9
|
-
version: 1.2.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.0
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- Stefan Lang
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
date: 2011-11-27 00:00:00 +01:00
|
18
|
-
default_executable:
|
12
|
+
date: 2012-03-07 00:00:00.000000000 Z
|
19
13
|
dependencies: []
|
20
|
-
|
21
14
|
description:
|
22
15
|
email: langstefan@gmx.at
|
23
16
|
executables: []
|
24
|
-
|
25
17
|
extensions: []
|
26
|
-
|
27
|
-
extra_rdoc_files:
|
18
|
+
extra_rdoc_files:
|
28
19
|
- README.txt
|
29
20
|
- INSTALL.txt
|
30
21
|
- CHANGES.txt
|
31
|
-
files:
|
22
|
+
files:
|
32
23
|
- lib/unicode_utils.rb
|
33
24
|
- lib/unicode_utils/conditional_casing.rb
|
34
25
|
- lib/unicode_utils/version.rb
|
@@ -43,8 +34,12 @@ files:
|
|
43
34
|
- lib/unicode_utils/general_category.rb
|
44
35
|
- lib/unicode_utils/uppercase_char_q.rb
|
45
36
|
- lib/unicode_utils/upcase.rb
|
37
|
+
- lib/unicode_utils/sid.rb
|
46
38
|
- lib/unicode_utils/u.rb
|
39
|
+
- lib/unicode_utils/code_point_type.rb
|
47
40
|
- lib/unicode_utils/hangul_syllable_decomposition.rb
|
41
|
+
- lib/unicode_utils/name_aliases.rb
|
42
|
+
- lib/unicode_utils/name_alias.rb
|
48
43
|
- lib/unicode_utils/soft_dotted_char_q.rb
|
49
44
|
- lib/unicode_utils/lowercase_char_q.rb
|
50
45
|
- lib/unicode_utils/read_cdata.rb
|
@@ -87,6 +82,7 @@ files:
|
|
87
82
|
- cdata/general_category_aliases
|
88
83
|
- cdata/canonical_decomposition_map
|
89
84
|
- cdata/cat_set_titlecase
|
85
|
+
- cdata/name_aliases
|
90
86
|
- cdata/casefold_f_map
|
91
87
|
- cdata/special_uc_map
|
92
88
|
- cdata/special_tc_map
|
@@ -107,40 +103,31 @@ files:
|
|
107
103
|
- INSTALL.txt
|
108
104
|
- LICENSE.txt
|
109
105
|
- CHANGES.txt
|
110
|
-
has_rdoc: true
|
111
106
|
homepage: http://github.com/lang/unicode_utils
|
112
107
|
licenses: []
|
113
|
-
|
114
108
|
post_install_message:
|
115
|
-
rdoc_options:
|
109
|
+
rdoc_options:
|
116
110
|
- --main=README.txt
|
117
111
|
- --charset=UTF-8
|
118
|
-
require_paths:
|
112
|
+
require_paths:
|
119
113
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
115
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
segments:
|
126
|
-
- 1
|
127
|
-
- 9
|
128
|
-
- 1
|
116
|
+
requirements:
|
117
|
+
- - ! '>='
|
118
|
+
- !ruby/object:Gem::Version
|
129
119
|
version: 1.9.1
|
130
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
121
|
none: false
|
132
|
-
requirements:
|
133
|
-
- -
|
134
|
-
- !ruby/object:Gem::Version
|
135
|
-
|
136
|
-
- 0
|
137
|
-
version: "0"
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
138
126
|
requirements: []
|
139
|
-
|
140
127
|
rubyforge_project: unicode-utils
|
141
|
-
rubygems_version: 1.
|
128
|
+
rubygems_version: 1.8.11
|
142
129
|
signing_key:
|
143
130
|
specification_version: 3
|
144
131
|
summary: additional Unicode aware functions for Ruby 1.9
|
145
|
-
test_files:
|
132
|
+
test_files:
|
146
133
|
- test/test_unicode_utils.rb
|