unicode_utils 1.2.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES.txt +14 -0
- data/LICENSE.txt +1 -1
- data/cdata/canonical_decomposition_map +1 -1
- data/cdata/case_ignorable_set +1 -1
- data/cdata/casefold_c_map +1 -1
- data/cdata/combining_class_map +1 -1
- data/cdata/compatibility_decomposition_map +1 -1
- data/cdata/composition_exclusion_set +1 -1
- data/cdata/east_asian_width_property_per_cp +1 -1
- data/cdata/east_asian_width_property_ranges +1 -1
- data/cdata/general_category_per_cp +1 -1
- data/cdata/general_category_ranges +1 -1
- data/cdata/grapheme_break_property +1 -1
- data/cdata/name_aliases +1 -0
- data/cdata/names +731 -0
- data/cdata/prop_set_lowercase +1 -1
- data/cdata/prop_set_uppercase +1 -1
- data/cdata/simple_lc_map +1 -1
- data/cdata/simple_tc_map +1 -1
- data/cdata/simple_uc_map +1 -1
- data/cdata/word_break_property +1 -1
- data/lib/unicode_utils.rb +6 -3
- data/lib/unicode_utils/canonical_decomposition.rb +2 -2
- data/lib/unicode_utils/case_ignorable_char_q.rb +1 -1
- data/lib/unicode_utils/char_display_width.rb +2 -2
- data/lib/unicode_utils/char_name.rb +13 -3
- data/lib/unicode_utils/char_type.rb +1 -1
- data/lib/unicode_utils/code_point_type.rb +70 -0
- data/lib/unicode_utils/codepoint.rb +5 -5
- data/lib/unicode_utils/compatibility_decomposition.rb +1 -1
- data/lib/unicode_utils/debug.rb +5 -5
- data/lib/unicode_utils/default_ignorable_char_q.rb +2 -2
- data/lib/unicode_utils/display_width.rb +3 -3
- data/lib/unicode_utils/each_grapheme.rb +2 -2
- data/lib/unicode_utils/each_word.rb +1 -1
- data/lib/unicode_utils/east_asian_width.rb +2 -2
- data/lib/unicode_utils/gc.rb +1 -1
- data/lib/unicode_utils/general_category.rb +1 -1
- data/lib/unicode_utils/lowercase_char_q.rb +1 -1
- data/lib/unicode_utils/name_alias.rb +46 -0
- data/lib/unicode_utils/name_aliases.rb +29 -0
- data/lib/unicode_utils/nfc.rb +3 -3
- data/lib/unicode_utils/read_cdata.rb +36 -4
- data/lib/unicode_utils/sid.rb +63 -0
- data/lib/unicode_utils/simple_casefold.rb +2 -2
- data/lib/unicode_utils/simple_downcase.rb +2 -2
- data/lib/unicode_utils/simple_upcase.rb +2 -2
- data/lib/unicode_utils/soft_dotted_char_q.rb +1 -1
- data/lib/unicode_utils/titlecase.rb +1 -1
- data/lib/unicode_utils/titlecase_char_q.rb +1 -1
- data/lib/unicode_utils/uppercase_char_q.rb +1 -1
- data/lib/unicode_utils/version.rb +10 -3
- data/test/test_unicode_utils.rb +109 -5
- metadata +26 -39
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/name_alias"
|
4
|
+
require "unicode_utils/read_cdata"
|
5
|
+
|
6
|
+
module UnicodeUtils
|
7
|
+
|
8
|
+
NAME_ALIASES_MAP = Impl.read_name_aliases("name_aliases") # :nodoc:
|
9
|
+
NAME_ALIASES_MAP.default = [].freeze
|
10
|
+
|
11
|
+
# Get an Enumerable of formal name aliases of the given character. Returns an
|
12
|
+
# empty Enumerable if the character doesn't have an alias.
|
13
|
+
#
|
14
|
+
# The aliases are instances of UnicodeUtils::NameAlias, the order of the
|
15
|
+
# aliases in the returned Enumerable is preserved from NameAliases.txt in the
|
16
|
+
# Unicode Character Database.
|
17
|
+
#
|
18
|
+
# Example:
|
19
|
+
#
|
20
|
+
# require "unicode_utils/name_aliases"
|
21
|
+
# UnicodeUtils.name_aliases("\n").map(&:name) # => ["LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL"]
|
22
|
+
#
|
23
|
+
# See also: UnicodeUtils.char_name
|
24
|
+
def name_aliases(char)
|
25
|
+
NAME_ALIASES_MAP[char.ord]
|
26
|
+
end
|
27
|
+
module_function :name_aliases
|
28
|
+
|
29
|
+
end
|
data/lib/unicode_utils/nfc.rb
CHANGED
@@ -9,7 +9,7 @@ module UnicodeUtils
|
|
9
9
|
module Impl # :nodoc:all
|
10
10
|
|
11
11
|
COMPOSITION_EXCLUSION_SET =
|
12
|
-
Impl.
|
12
|
+
Impl.read_code_point_set("composition_exclusion_set")
|
13
13
|
|
14
14
|
CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
|
15
15
|
CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
|
@@ -124,8 +124,8 @@ module UnicodeUtils
|
|
124
124
|
# Get +str+ in Normalization Form C.
|
125
125
|
#
|
126
126
|
# The Unicode standard has multiple representations for some
|
127
|
-
# characters. One representation as a single
|
128
|
-
# representation(s) as a combination of multiple
|
127
|
+
# characters. One representation as a single code point and other
|
128
|
+
# representation(s) as a combination of multiple code points. This
|
129
129
|
# function "composes" these characters into the former
|
130
130
|
# representation.
|
131
131
|
#
|
@@ -16,11 +16,19 @@ module UnicodeUtils
|
|
16
16
|
5 => :Narrow
|
17
17
|
}.freeze
|
18
18
|
|
19
|
+
NAME_ALIAS_TYPE_TO_SYMBOL_MAP = {
|
20
|
+
1 => :correction,
|
21
|
+
2 => :control,
|
22
|
+
3 => :alternate,
|
23
|
+
4 => :figment,
|
24
|
+
5 => :abbreviation
|
25
|
+
}.freeze
|
26
|
+
|
19
27
|
def self.open_cdata_file(filename, &block)
|
20
28
|
File.open(File.join(CDATA_DIR, filename), "r:US-ASCII:-", &block)
|
21
29
|
end
|
22
30
|
|
23
|
-
def self.
|
31
|
+
def self.read_code_point_set(filename)
|
24
32
|
Hash.new.tap { |set|
|
25
33
|
open_cdata_file(filename) do |input|
|
26
34
|
buffer = "x" * 6
|
@@ -32,7 +40,7 @@ module UnicodeUtils
|
|
32
40
|
}
|
33
41
|
end
|
34
42
|
|
35
|
-
def self.
|
43
|
+
def self.read_code_point_map(filename)
|
36
44
|
Hash.new.tap { |map|
|
37
45
|
open_cdata_file(filename) do |input|
|
38
46
|
buffer = "x" * 6
|
@@ -104,7 +112,7 @@ module UnicodeUtils
|
|
104
112
|
}
|
105
113
|
end
|
106
114
|
|
107
|
-
# Read a map whose keys are
|
115
|
+
# Read a map whose keys are code points (6 hexdigits, converted to
|
108
116
|
# integer) and whose values are single hexdigits (converted to
|
109
117
|
# integer).
|
110
118
|
def self.read_hexdigit_map(filename)
|
@@ -122,7 +130,7 @@ module UnicodeUtils
|
|
122
130
|
end
|
123
131
|
|
124
132
|
# Returns a list (array) of pairs (two element Arrays) of Range
|
125
|
-
# (
|
133
|
+
# (code points) and associated integer value.
|
126
134
|
def self.read_range_to_hexdigit_list(filename)
|
127
135
|
Array.new.tap { |list|
|
128
136
|
open_cdata_file(filename) do |input|
|
@@ -208,6 +216,30 @@ module UnicodeUtils
|
|
208
216
|
}
|
209
217
|
end
|
210
218
|
|
219
|
+
def self.read_name_aliases(filename)
|
220
|
+
Hash.new.tap { |map|
|
221
|
+
open_cdata_file(filename) do |input|
|
222
|
+
cp_buffer = "x" * 6
|
223
|
+
cp_buffer.force_encoding(Encoding::US_ASCII)
|
224
|
+
ac_buffer = "x" * 1
|
225
|
+
ac_buffer.force_encoding(Encoding::US_ASCII)
|
226
|
+
at_buffer = "x" * 1
|
227
|
+
at_buffer.force_encoding(Encoding::US_ASCII)
|
228
|
+
al_buffer = "x" * 2
|
229
|
+
al_buffer.force_encoding(Encoding::US_ASCII)
|
230
|
+
while input.read(6, cp_buffer)
|
231
|
+
aliases = Array.new(input.read(1, ac_buffer).to_i(16))
|
232
|
+
0.upto(aliases.length - 1) { |i|
|
233
|
+
type = NAME_ALIAS_TYPE_TO_SYMBOL_MAP[input.read(1, at_buffer).to_i(16)]
|
234
|
+
name = input.read(input.read(2, al_buffer).to_i(16))
|
235
|
+
aliases[i] = NameAlias.new(name.freeze, type)
|
236
|
+
}
|
237
|
+
map[cp_buffer.to_i(16)] = aliases.freeze
|
238
|
+
end
|
239
|
+
end
|
240
|
+
}
|
241
|
+
end
|
242
|
+
|
211
243
|
end
|
212
244
|
|
213
245
|
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require "unicode_utils/name_aliases"
|
4
|
+
require "unicode_utils/code_point_type"
|
5
|
+
|
6
|
+
module UnicodeUtils
|
7
|
+
|
8
|
+
CP_PREFERRED_ALIAS_STRING_MAP = Hash.new.tap do |map|
|
9
|
+
NAME_ALIASES_MAP.each { |cp, aliases|
|
10
|
+
al =
|
11
|
+
(aliases.find { |al| al.type == :correction } ||
|
12
|
+
aliases.find { |al| al.type == :control } ||
|
13
|
+
aliases.find { |al| al.type == :figment } ||
|
14
|
+
aliases.find { |al| al.type == :alternate })
|
15
|
+
map[cp] = al.name if al
|
16
|
+
}
|
17
|
+
end #:nodoc:
|
18
|
+
|
19
|
+
# Returns a unique string identifier for every code point. Returns
|
20
|
+
# nil if +code_point+ is not in the Unicode codespace. +code_point+
|
21
|
+
# must be an Integer.
|
22
|
+
#
|
23
|
+
# The returned string identifier is either the non-empty Name
|
24
|
+
# property value of +code_point+, a non-empty Name_Alias string
|
25
|
+
# property value of +code_point+, or the code point label as
|
26
|
+
# described by section "Code Point Labels" in chapter 4.8 "Name" of
|
27
|
+
# the Unicode standard.
|
28
|
+
#
|
29
|
+
# If the returned identifier starts with "<", it is a code point
|
30
|
+
# label and it ends with ">". Otherwise it is the normative name or
|
31
|
+
# a formal alias string.
|
32
|
+
#
|
33
|
+
# The exact name/alias/label selection algorithm may change even in
|
34
|
+
# minor UnicodeUtils releases, but overall behaviour will stay the
|
35
|
+
# same in spirit.
|
36
|
+
#
|
37
|
+
# The selection process in this version of UnicodeUtils is:
|
38
|
+
# 1. Use an alias of type :correction, :control, :figment or
|
39
|
+
# :alternate (with listed precedence) if available
|
40
|
+
# 2. Use the Unicode Name property value if it is not empty
|
41
|
+
# 3. Construct a code point label in angle brackets.
|
42
|
+
#
|
43
|
+
# Examples:
|
44
|
+
#
|
45
|
+
# require "unicode_utils/sid"
|
46
|
+
#
|
47
|
+
# U.sid 0xa # => "LINE FEED"
|
48
|
+
# U.sid 0x0 # => "NULL"
|
49
|
+
# U.sid 0xfeff # => "BYTE ORDER MARK"
|
50
|
+
# U.sid 0xe000 # => "<private-use-E000>"
|
51
|
+
# U.sid 0x61 # => "LATIN SMALL LETTER A"
|
52
|
+
# U.sid -1 # => nil
|
53
|
+
def sid(code_point)
|
54
|
+
s = CP_PREFERRED_ALIAS_STRING_MAP[code_point] and return s
|
55
|
+
cn = UnicodeUtils.char_name(code_point)
|
56
|
+
return cn if cn && cn !~ /\A(\<|\z)/
|
57
|
+
ct = UnicodeUtils.code_point_type(code_point) or return nil
|
58
|
+
ts = ct.to_s.downcase.gsub('_', '-')
|
59
|
+
"<#{ts}-#{code_point.to_s(16).upcase.rjust(4, '0')}>"
|
60
|
+
end
|
61
|
+
module_function :sid
|
62
|
+
|
63
|
+
end
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
CASEFOLD_C_MAP = Impl.
|
7
|
+
CASEFOLD_C_MAP = Impl.read_code_point_map("casefold_c_map") # :nodoc:
|
8
8
|
|
9
|
-
CASEFOLD_S_MAP = Impl.
|
9
|
+
CASEFOLD_S_MAP = Impl.read_code_point_map("casefold_s_map") # :nodoc:
|
10
10
|
|
11
11
|
# Perform simple case folding. Contrary to full case folding, this
|
12
12
|
# uses only one to one mappings, so that the length of the returned
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SIMPLE_DOWNCASE_MAP = Impl.
|
7
|
+
SIMPLE_DOWNCASE_MAP = Impl.read_code_point_map("simple_lc_map") # :nodoc:
|
8
8
|
|
9
|
-
# Map each
|
9
|
+
# Map each code point in +str+ that has a single code point
|
10
10
|
# lowercase-mapping to that lowercase mapping. The returned string
|
11
11
|
# has the same length as the original string.
|
12
12
|
#
|
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SIMPLE_UPCASE_MAP = Impl.
|
7
|
+
SIMPLE_UPCASE_MAP = Impl.read_code_point_map("simple_uc_map") # :nodoc:
|
8
8
|
|
9
|
-
# Map each
|
9
|
+
# Map each code point in +str+ that has a single code point
|
10
10
|
# uppercase-mapping to that uppercase mapping. The returned string
|
11
11
|
# has the same length as the original string.
|
12
12
|
#
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
SOFT_DOTTED_SET = Impl.
|
7
|
+
SOFT_DOTTED_SET = Impl.read_code_point_set("soft_dotted_set") # :nodoc:
|
8
8
|
|
9
9
|
# Returns true if the given character has the Unicode property
|
10
10
|
# Soft_Dotted.
|
@@ -8,7 +8,7 @@ require "unicode_utils/downcase"
|
|
8
8
|
|
9
9
|
module UnicodeUtils
|
10
10
|
|
11
|
-
SIMPLE_TITLECASE_MAP = Impl.
|
11
|
+
SIMPLE_TITLECASE_MAP = Impl.read_code_point_map("simple_tc_map") # :nodoc:
|
12
12
|
SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
|
13
13
|
|
14
14
|
# Convert the first cased character after each word boundary to
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
TITLECASE_LETTER_SET = Impl.
|
7
|
+
TITLECASE_LETTER_SET = Impl.read_code_point_set("cat_set_titlecase") # :nodoc:
|
8
8
|
|
9
9
|
# True if the given character has the General_Category
|
10
10
|
# Titlecase_Letter (Lt).
|
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
|
|
4
4
|
|
5
5
|
module UnicodeUtils
|
6
6
|
|
7
|
-
PROP_UPPERCASE_SET = Impl.
|
7
|
+
PROP_UPPERCASE_SET = Impl.read_code_point_set("prop_set_uppercase") # :nodoc:
|
8
8
|
|
9
9
|
# True if the given character has the Unicode property Uppercase.
|
10
10
|
def uppercase_char?(char)
|
@@ -4,13 +4,20 @@ module UnicodeUtils
|
|
4
4
|
|
5
5
|
# Corresponds to the unicode_utils gem version.
|
6
6
|
#
|
7
|
+
# Conforms to Semantic Versioning as documented at semver.org.
|
8
|
+
#
|
9
|
+
# Summary:
|
7
10
|
# MAJOR.MINOR.PATCHLEVEL
|
8
11
|
# - A backwards incompatible change causes a change in MAJOR
|
9
12
|
# - New features or non-bugfix improvements cause a change in MINOR
|
10
13
|
# - Bugfixes increase only PATCHLEVEL.
|
14
|
+
# - Pre-release versions append more info after a dash.
|
15
|
+
VERSION = "1.3.0"
|
16
|
+
|
17
|
+
# The version of Unicode implemented by this version of UnicodeUtils.
|
11
18
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
|
19
|
+
# require "unicode_utils/version"
|
20
|
+
# puts "Unicode #{UnicodeUtils::UNICODE_VERSION}"
|
21
|
+
UNICODE_VERSION = "6.1.0"
|
15
22
|
|
16
23
|
end
|
data/test/test_unicode_utils.rb
CHANGED
@@ -8,6 +8,10 @@ require "unicode_utils"
|
|
8
8
|
# Fast tests for almost all UnicodeUtils functions.
|
9
9
|
class TestUnicodeUtils < Test::Unit::TestCase
|
10
10
|
|
11
|
+
def test_unicode_version
|
12
|
+
assert_match /\A\d+\.\d+\.\d+\z/, UnicodeUtils::UNICODE_VERSION
|
13
|
+
end
|
14
|
+
|
11
15
|
def test_name
|
12
16
|
assert_equal "LATIN SMALL LETTER F", UnicodeUtils.char_name("f")
|
13
17
|
assert_equal Encoding::US_ASCII, UnicodeUtils.char_name("f").encoding
|
@@ -421,19 +425,119 @@ class TestUnicodeUtils < Test::Unit::TestCase
|
|
421
425
|
io = StringIO.new
|
422
426
|
UnicodeUtils.debug("", io: io)
|
423
427
|
assert_equal <<-'EOF', io.string
|
424
|
-
Char | Ordinal |
|
425
|
-
|
428
|
+
Char | Ordinal | Sid | General Category | UTF-8
|
429
|
+
------+---------+-----+------------------+-------
|
426
430
|
EOF
|
427
431
|
io = StringIO.new
|
428
432
|
UnicodeUtils.debug("一 \u{100000}\n", io: io)
|
429
433
|
assert_equal <<-'EOF', io.string
|
430
|
-
Char | Ordinal |
|
434
|
+
Char | Ordinal | Sid | General Category | UTF-8
|
431
435
|
------+---------+----------------------------+------------------+-------------
|
432
436
|
"一" | 4E00 | CJK UNIFIED IDEOGRAPH-4E00 | Other_Letter | E4 B8 80
|
433
437
|
" " | 20 | SPACE | Space_Separator | 20
|
434
|
-
N/A | 100000 |
|
435
|
-
"\n" | A |
|
438
|
+
N/A | 100000 | <private-use-100000> | Private_Use | F4 80 80 80
|
439
|
+
"\n" | A | LINE FEED | Control | 0A
|
436
440
|
EOF
|
437
441
|
end
|
438
442
|
|
443
|
+
def test_code_point_type
|
444
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type("A")
|
445
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type("a")
|
446
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x1cb)
|
447
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2b5)
|
448
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10923)
|
449
|
+
|
450
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x5a0)
|
451
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x93f)
|
452
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x20dd)
|
453
|
+
|
454
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xa901)
|
455
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10144)
|
456
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x10917)
|
457
|
+
|
458
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x5f)
|
459
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2011)
|
460
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2329)
|
461
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xfe38)
|
462
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x201c)
|
463
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x201d)
|
464
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2e10)
|
465
|
+
|
466
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xff0b)
|
467
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0xa3)
|
468
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2c2)
|
469
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x60f)
|
470
|
+
|
471
|
+
assert_equal :Graphic, UnicodeUtils.code_point_type(0x2001)
|
472
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x2028)
|
473
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x2029)
|
474
|
+
|
475
|
+
assert_equal :Control, UnicodeUtils.code_point_type(0x0)
|
476
|
+
assert_equal :Format, UnicodeUtils.code_point_type(0x70f)
|
477
|
+
assert_equal :Surrogate, UnicodeUtils.code_point_type(0xdb82)
|
478
|
+
assert_equal :Private_Use, UnicodeUtils.code_point_type(0xf1020)
|
479
|
+
assert_equal :Private_Use, UnicodeUtils.code_point_type(0x10fffd)
|
480
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0x10ffff)
|
481
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xfffe)
|
482
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xffff)
|
483
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbfffe)
|
484
|
+
assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbffff)
|
485
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0x380)
|
486
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xeeb)
|
487
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xfff)
|
488
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0x7fffd)
|
489
|
+
assert_equal :Reserved, UnicodeUtils.code_point_type(0xeffef)
|
490
|
+
### above is at least one assertion for every general category ###
|
491
|
+
|
492
|
+
assert_equal nil, UnicodeUtils.code_point_type(-1)
|
493
|
+
assert_equal nil, UnicodeUtils.code_point_type(0x110000)
|
494
|
+
end
|
495
|
+
|
496
|
+
def test_name_aliases
|
497
|
+
assert_equal [UnicodeUtils::NameAlias.new("NULL", :control),
|
498
|
+
UnicodeUtils::NameAlias.new("NUL", :abbreviation)],
|
499
|
+
UnicodeUtils.name_aliases(0x0)
|
500
|
+
assert_equal [UnicodeUtils::NameAlias.new("LATIN CAPITAL LETTER GHA", :correction)],
|
501
|
+
UnicodeUtils.name_aliases(0x1a2)
|
502
|
+
assert_equal [UnicodeUtils::NameAlias.new("BYTE ORDER MARK", :alternate),
|
503
|
+
UnicodeUtils::NameAlias.new("BOM", :abbreviation),
|
504
|
+
UnicodeUtils::NameAlias.new("ZWNBSP", :abbreviation)],
|
505
|
+
UnicodeUtils.name_aliases(0xfeff)
|
506
|
+
assert_equal [UnicodeUtils::NameAlias.new("PADDING CHARACTER", :figment),
|
507
|
+
UnicodeUtils::NameAlias.new("PAD", :abbreviation)],
|
508
|
+
UnicodeUtils.name_aliases(0x80)
|
509
|
+
assert_equal [UnicodeUtils::NameAlias.new("VS256", :abbreviation)],
|
510
|
+
UnicodeUtils.name_aliases(0xe01ef)
|
511
|
+
assert_equal [UnicodeUtils::NameAlias.new("LINE FEED", :control),
|
512
|
+
UnicodeUtils::NameAlias.new("NEW LINE", :control),
|
513
|
+
UnicodeUtils::NameAlias.new("END OF LINE", :control),
|
514
|
+
UnicodeUtils::NameAlias.new("LF", :abbreviation),
|
515
|
+
UnicodeUtils::NameAlias.new("NL", :abbreviation),
|
516
|
+
UnicodeUtils::NameAlias.new("EOL", :abbreviation)],
|
517
|
+
UnicodeUtils.name_aliases(0xa)
|
518
|
+
assert_equal [UnicodeUtils::NameAlias.new("CHARACTER TABULATION", :control),
|
519
|
+
UnicodeUtils::NameAlias.new("HORIZONTAL TABULATION", :control),
|
520
|
+
UnicodeUtils::NameAlias.new("HT", :abbreviation),
|
521
|
+
UnicodeUtils::NameAlias.new("TAB", :abbreviation)],
|
522
|
+
UnicodeUtils.name_aliases("\t")
|
523
|
+
assert_equal [],
|
524
|
+
UnicodeUtils.name_aliases("a")
|
525
|
+
end
|
526
|
+
|
527
|
+
def test_sid
|
528
|
+
assert_equal nil, UnicodeUtils.sid(-1)
|
529
|
+
assert_equal "NULL", UnicodeUtils.sid(0x0)
|
530
|
+
assert_equal "LATIN CAPITAL LETTER GHA", UnicodeUtils.sid(0x1a2)
|
531
|
+
assert_equal "LINE FEED", UnicodeUtils.sid(0xa)
|
532
|
+
assert_equal "PADDING CHARACTER", UnicodeUtils.sid(0x80)
|
533
|
+
assert_equal "BYTE ORDER MARK", UnicodeUtils.sid(0xfeff)
|
534
|
+
assert_equal "SPACE", UnicodeUtils.sid(0x20)
|
535
|
+
assert_equal "<reserved-0380>", UnicodeUtils.sid(0x380)
|
536
|
+
assert_equal "<surrogate-D800>", UnicodeUtils.sid(0xd800)
|
537
|
+
assert_equal "<private-use-F0000>", UnicodeUtils.sid(0xf0000)
|
538
|
+
assert_equal "<private-use-10FFFD>", UnicodeUtils.sid(0x10fffd)
|
539
|
+
assert_equal "<noncharacter-10FFFF>", UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end)
|
540
|
+
assert_equal nil, UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end + 1)
|
541
|
+
end
|
542
|
+
|
439
543
|
end
|
metadata
CHANGED
@@ -1,34 +1,25 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode_utils
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 1
|
7
|
-
- 2
|
8
|
-
- 2
|
9
|
-
version: 1.2.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.0
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- Stefan Lang
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
date: 2011-11-27 00:00:00 +01:00
|
18
|
-
default_executable:
|
12
|
+
date: 2012-03-07 00:00:00.000000000 Z
|
19
13
|
dependencies: []
|
20
|
-
|
21
14
|
description:
|
22
15
|
email: langstefan@gmx.at
|
23
16
|
executables: []
|
24
|
-
|
25
17
|
extensions: []
|
26
|
-
|
27
|
-
extra_rdoc_files:
|
18
|
+
extra_rdoc_files:
|
28
19
|
- README.txt
|
29
20
|
- INSTALL.txt
|
30
21
|
- CHANGES.txt
|
31
|
-
files:
|
22
|
+
files:
|
32
23
|
- lib/unicode_utils.rb
|
33
24
|
- lib/unicode_utils/conditional_casing.rb
|
34
25
|
- lib/unicode_utils/version.rb
|
@@ -43,8 +34,12 @@ files:
|
|
43
34
|
- lib/unicode_utils/general_category.rb
|
44
35
|
- lib/unicode_utils/uppercase_char_q.rb
|
45
36
|
- lib/unicode_utils/upcase.rb
|
37
|
+
- lib/unicode_utils/sid.rb
|
46
38
|
- lib/unicode_utils/u.rb
|
39
|
+
- lib/unicode_utils/code_point_type.rb
|
47
40
|
- lib/unicode_utils/hangul_syllable_decomposition.rb
|
41
|
+
- lib/unicode_utils/name_aliases.rb
|
42
|
+
- lib/unicode_utils/name_alias.rb
|
48
43
|
- lib/unicode_utils/soft_dotted_char_q.rb
|
49
44
|
- lib/unicode_utils/lowercase_char_q.rb
|
50
45
|
- lib/unicode_utils/read_cdata.rb
|
@@ -87,6 +82,7 @@ files:
|
|
87
82
|
- cdata/general_category_aliases
|
88
83
|
- cdata/canonical_decomposition_map
|
89
84
|
- cdata/cat_set_titlecase
|
85
|
+
- cdata/name_aliases
|
90
86
|
- cdata/casefold_f_map
|
91
87
|
- cdata/special_uc_map
|
92
88
|
- cdata/special_tc_map
|
@@ -107,40 +103,31 @@ files:
|
|
107
103
|
- INSTALL.txt
|
108
104
|
- LICENSE.txt
|
109
105
|
- CHANGES.txt
|
110
|
-
has_rdoc: true
|
111
106
|
homepage: http://github.com/lang/unicode_utils
|
112
107
|
licenses: []
|
113
|
-
|
114
108
|
post_install_message:
|
115
|
-
rdoc_options:
|
109
|
+
rdoc_options:
|
116
110
|
- --main=README.txt
|
117
111
|
- --charset=UTF-8
|
118
|
-
require_paths:
|
112
|
+
require_paths:
|
119
113
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
115
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
segments:
|
126
|
-
- 1
|
127
|
-
- 9
|
128
|
-
- 1
|
116
|
+
requirements:
|
117
|
+
- - ! '>='
|
118
|
+
- !ruby/object:Gem::Version
|
129
119
|
version: 1.9.1
|
130
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
121
|
none: false
|
132
|
-
requirements:
|
133
|
-
- -
|
134
|
-
- !ruby/object:Gem::Version
|
135
|
-
|
136
|
-
- 0
|
137
|
-
version: "0"
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
138
126
|
requirements: []
|
139
|
-
|
140
127
|
rubyforge_project: unicode-utils
|
141
|
-
rubygems_version: 1.
|
128
|
+
rubygems_version: 1.8.11
|
142
129
|
signing_key:
|
143
130
|
specification_version: 3
|
144
131
|
summary: additional Unicode aware functions for Ruby 1.9
|
145
|
-
test_files:
|
132
|
+
test_files:
|
146
133
|
- test/test_unicode_utils.rb
|