unicode_utils 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. data/CHANGES.txt +14 -0
  2. data/LICENSE.txt +1 -1
  3. data/cdata/canonical_decomposition_map +1 -1
  4. data/cdata/case_ignorable_set +1 -1
  5. data/cdata/casefold_c_map +1 -1
  6. data/cdata/combining_class_map +1 -1
  7. data/cdata/compatibility_decomposition_map +1 -1
  8. data/cdata/composition_exclusion_set +1 -1
  9. data/cdata/east_asian_width_property_per_cp +1 -1
  10. data/cdata/east_asian_width_property_ranges +1 -1
  11. data/cdata/general_category_per_cp +1 -1
  12. data/cdata/general_category_ranges +1 -1
  13. data/cdata/grapheme_break_property +1 -1
  14. data/cdata/name_aliases +1 -0
  15. data/cdata/names +731 -0
  16. data/cdata/prop_set_lowercase +1 -1
  17. data/cdata/prop_set_uppercase +1 -1
  18. data/cdata/simple_lc_map +1 -1
  19. data/cdata/simple_tc_map +1 -1
  20. data/cdata/simple_uc_map +1 -1
  21. data/cdata/word_break_property +1 -1
  22. data/lib/unicode_utils.rb +6 -3
  23. data/lib/unicode_utils/canonical_decomposition.rb +2 -2
  24. data/lib/unicode_utils/case_ignorable_char_q.rb +1 -1
  25. data/lib/unicode_utils/char_display_width.rb +2 -2
  26. data/lib/unicode_utils/char_name.rb +13 -3
  27. data/lib/unicode_utils/char_type.rb +1 -1
  28. data/lib/unicode_utils/code_point_type.rb +70 -0
  29. data/lib/unicode_utils/codepoint.rb +5 -5
  30. data/lib/unicode_utils/compatibility_decomposition.rb +1 -1
  31. data/lib/unicode_utils/debug.rb +5 -5
  32. data/lib/unicode_utils/default_ignorable_char_q.rb +2 -2
  33. data/lib/unicode_utils/display_width.rb +3 -3
  34. data/lib/unicode_utils/each_grapheme.rb +2 -2
  35. data/lib/unicode_utils/each_word.rb +1 -1
  36. data/lib/unicode_utils/east_asian_width.rb +2 -2
  37. data/lib/unicode_utils/gc.rb +1 -1
  38. data/lib/unicode_utils/general_category.rb +1 -1
  39. data/lib/unicode_utils/lowercase_char_q.rb +1 -1
  40. data/lib/unicode_utils/name_alias.rb +46 -0
  41. data/lib/unicode_utils/name_aliases.rb +29 -0
  42. data/lib/unicode_utils/nfc.rb +3 -3
  43. data/lib/unicode_utils/read_cdata.rb +36 -4
  44. data/lib/unicode_utils/sid.rb +63 -0
  45. data/lib/unicode_utils/simple_casefold.rb +2 -2
  46. data/lib/unicode_utils/simple_downcase.rb +2 -2
  47. data/lib/unicode_utils/simple_upcase.rb +2 -2
  48. data/lib/unicode_utils/soft_dotted_char_q.rb +1 -1
  49. data/lib/unicode_utils/titlecase.rb +1 -1
  50. data/lib/unicode_utils/titlecase_char_q.rb +1 -1
  51. data/lib/unicode_utils/uppercase_char_q.rb +1 -1
  52. data/lib/unicode_utils/version.rb +10 -3
  53. data/test/test_unicode_utils.rb +109 -5
  54. metadata +26 -39
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/name_alias"
4
+ require "unicode_utils/read_cdata"
5
+
6
+ module UnicodeUtils
7
+
8
+ NAME_ALIASES_MAP = Impl.read_name_aliases("name_aliases") # :nodoc:
9
+ NAME_ALIASES_MAP.default = [].freeze
10
+
11
+ # Get an Enumerable of formal name aliases of the given character. Returns an
12
+ # empty Enumerable if the character doesn't have an alias.
13
+ #
14
+ # The aliases are instances of UnicodeUtils::NameAlias, the order of the
15
+ # aliases in the returned Enumerable is preserved from NameAliases.txt in the
16
+ # Unicode Character Database.
17
+ #
18
+ # Example:
19
+ #
20
+ # require "unicode_utils/name_aliases"
21
+ # UnicodeUtils.name_aliases("\n").map(&:name) # => ["LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL"]
22
+ #
23
+ # See also: UnicodeUtils.char_name
24
+ def name_aliases(char)
25
+ NAME_ALIASES_MAP[char.ord]
26
+ end
27
+ module_function :name_aliases
28
+
29
+ end
@@ -9,7 +9,7 @@ module UnicodeUtils
9
9
  module Impl # :nodoc:all
10
10
 
11
11
  COMPOSITION_EXCLUSION_SET =
12
- Impl.read_codepoint_set("composition_exclusion_set")
12
+ Impl.read_code_point_set("composition_exclusion_set")
13
13
 
14
14
  CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
15
15
  CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
@@ -124,8 +124,8 @@ module UnicodeUtils
124
124
  # Get +str+ in Normalization Form C.
125
125
  #
126
126
  # The Unicode standard has multiple representations for some
127
- # characters. One representation as a single codepoint and other
128
- # representation(s) as a combination of multiple codepoints. This
127
+ # characters. One representation as a single code point and other
128
+ # representation(s) as a combination of multiple code points. This
129
129
  # function "composes" these characters into the former
130
130
  # representation.
131
131
  #
@@ -16,11 +16,19 @@ module UnicodeUtils
16
16
  5 => :Narrow
17
17
  }.freeze
18
18
 
19
+ NAME_ALIAS_TYPE_TO_SYMBOL_MAP = {
20
+ 1 => :correction,
21
+ 2 => :control,
22
+ 3 => :alternate,
23
+ 4 => :figment,
24
+ 5 => :abbreviation
25
+ }.freeze
26
+
19
27
  def self.open_cdata_file(filename, &block)
20
28
  File.open(File.join(CDATA_DIR, filename), "r:US-ASCII:-", &block)
21
29
  end
22
30
 
23
- def self.read_codepoint_set(filename)
31
+ def self.read_code_point_set(filename)
24
32
  Hash.new.tap { |set|
25
33
  open_cdata_file(filename) do |input|
26
34
  buffer = "x" * 6
@@ -32,7 +40,7 @@ module UnicodeUtils
32
40
  }
33
41
  end
34
42
 
35
- def self.read_codepoint_map(filename)
43
+ def self.read_code_point_map(filename)
36
44
  Hash.new.tap { |map|
37
45
  open_cdata_file(filename) do |input|
38
46
  buffer = "x" * 6
@@ -104,7 +112,7 @@ module UnicodeUtils
104
112
  }
105
113
  end
106
114
 
107
- # Read a map whose keys are codepoints (6 hexgdigits, converted to
115
+ # Read a map whose keys are code points (6 hexdigits, converted to
108
116
  # integer) and whose values are single hexdigits (converted to
109
117
  # integer).
110
118
  def self.read_hexdigit_map(filename)
@@ -122,7 +130,7 @@ module UnicodeUtils
122
130
  end
123
131
 
124
132
  # Returns a list (array) of pairs (two element Arrays) of Range
125
- # (codepoints) and associated integer value.
133
+ # (code points) and associated integer value.
126
134
  def self.read_range_to_hexdigit_list(filename)
127
135
  Array.new.tap { |list|
128
136
  open_cdata_file(filename) do |input|
@@ -208,6 +216,30 @@ module UnicodeUtils
208
216
  }
209
217
  end
210
218
 
219
+ def self.read_name_aliases(filename)
220
+ Hash.new.tap { |map|
221
+ open_cdata_file(filename) do |input|
222
+ cp_buffer = "x" * 6
223
+ cp_buffer.force_encoding(Encoding::US_ASCII)
224
+ ac_buffer = "x" * 1
225
+ ac_buffer.force_encoding(Encoding::US_ASCII)
226
+ at_buffer = "x" * 1
227
+ at_buffer.force_encoding(Encoding::US_ASCII)
228
+ al_buffer = "x" * 2
229
+ al_buffer.force_encoding(Encoding::US_ASCII)
230
+ while input.read(6, cp_buffer)
231
+ aliases = Array.new(input.read(1, ac_buffer).to_i(16))
232
+ 0.upto(aliases.length - 1) { |i|
233
+ type = NAME_ALIAS_TYPE_TO_SYMBOL_MAP[input.read(1, at_buffer).to_i(16)]
234
+ name = input.read(input.read(2, al_buffer).to_i(16))
235
+ aliases[i] = NameAlias.new(name.freeze, type)
236
+ }
237
+ map[cp_buffer.to_i(16)] = aliases.freeze
238
+ end
239
+ end
240
+ }
241
+ end
242
+
211
243
  end
212
244
 
213
245
  end
@@ -0,0 +1,63 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/name_aliases"
4
+ require "unicode_utils/code_point_type"
5
+
6
+ module UnicodeUtils
7
+
8
+ CP_PREFERRED_ALIAS_STRING_MAP = Hash.new.tap do |map|
9
+ NAME_ALIASES_MAP.each { |cp, aliases|
10
+ al =
11
+ (aliases.find { |al| al.type == :correction } ||
12
+ aliases.find { |al| al.type == :control } ||
13
+ aliases.find { |al| al.type == :figment } ||
14
+ aliases.find { |al| al.type == :alternate })
15
+ map[cp] = al.name if al
16
+ }
17
+ end #:nodoc:
18
+
19
+ # Returns a unique string identifier for every code point. Returns
20
+ # nil if +code_point+ is not in the Unicode codespace. +code_point+
21
+ # must be an Integer.
22
+ #
23
+ # The returned string identifier is either the non-empty Name
24
+ # property value of +code_point+, a non-empty Name_Alias string
25
+ # property value of +code_point+, or the code point label as
26
+ # described by section "Code Point Labels" in chapter 4.8 "Name" of
27
+ # the Unicode standard.
28
+ #
29
+ # If the returned identifier starts with "<", it is a code point
30
+ # label and it ends with ">". Otherwise it is the normative name or
31
+ # a formal alias string.
32
+ #
33
+ # The exact name/alias/label selection algorithm may change even in
34
+ # minor UnicodeUtils releases, but overall behaviour will stay the
35
+ # same in spirit.
36
+ #
37
+ # The selection process in this version of UnicodeUtils is:
38
+ # 1. Use an alias of type :correction, :control, :figment or
39
+ # :alternate (with listed precedence) if available
40
+ # 2. Use the Unicode Name property value if it is not empty
41
+ # 3. Construct a code point label in angle brackets.
42
+ #
43
+ # Examples:
44
+ #
45
+ # require "unicode_utils/sid"
46
+ #
47
+ # U.sid 0xa # => "LINE FEED"
48
+ # U.sid 0x0 # => "NULL"
49
+ # U.sid 0xfeff # => "BYTE ORDER MARK"
50
+ # U.sid 0xe000 # => "<private-use-E000>"
51
+ # U.sid 0x61 # => "LATIN SMALL LETTER A"
52
+ # U.sid -1 # => nil
53
+ def sid(code_point)
54
+ s = CP_PREFERRED_ALIAS_STRING_MAP[code_point] and return s
55
+ cn = UnicodeUtils.char_name(code_point)
56
+ return cn if cn && cn !~ /\A(\<|\z)/
57
+ ct = UnicodeUtils.code_point_type(code_point) or return nil
58
+ ts = ct.to_s.downcase.gsub('_', '-')
59
+ "<#{ts}-#{code_point.to_s(16).upcase.rjust(4, '0')}>"
60
+ end
61
+ module_function :sid
62
+
63
+ end
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- CASEFOLD_C_MAP = Impl.read_codepoint_map("casefold_c_map") # :nodoc:
7
+ CASEFOLD_C_MAP = Impl.read_code_point_map("casefold_c_map") # :nodoc:
8
8
 
9
- CASEFOLD_S_MAP = Impl.read_codepoint_map("casefold_s_map") # :nodoc:
9
+ CASEFOLD_S_MAP = Impl.read_code_point_map("casefold_s_map") # :nodoc:
10
10
 
11
11
  # Perform simple case folding. Contrary to full case folding, this
12
12
  # uses only one to one mappings, so that the length of the returned
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
7
+ SIMPLE_DOWNCASE_MAP = Impl.read_code_point_map("simple_lc_map") # :nodoc:
8
8
 
9
- # Map each codepoint in +str+ that has a single codepoint
9
+ # Map each code point in +str+ that has a single code point
10
10
  # lowercase-mapping to that lowercase mapping. The returned string
11
11
  # has the same length as the original string.
12
12
  #
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
7
+ SIMPLE_UPCASE_MAP = Impl.read_code_point_map("simple_uc_map") # :nodoc:
8
8
 
9
- # Map each codepoint in +str+ that has a single codepoint
9
+ # Map each code point in +str+ that has a single code point
10
10
  # uppercase-mapping to that uppercase mapping. The returned string
11
11
  # has the same length as the original string.
12
12
  #
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SOFT_DOTTED_SET = Impl.read_codepoint_set("soft_dotted_set") # :nodoc:
7
+ SOFT_DOTTED_SET = Impl.read_code_point_set("soft_dotted_set") # :nodoc:
8
8
 
9
9
  # Returns true if the given character has the Unicode property
10
10
  # Soft_Dotted.
@@ -8,7 +8,7 @@ require "unicode_utils/downcase"
8
8
 
9
9
  module UnicodeUtils
10
10
 
11
- SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
11
+ SIMPLE_TITLECASE_MAP = Impl.read_code_point_map("simple_tc_map") # :nodoc:
12
12
  SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
13
13
 
14
14
  # Convert the first cased character after each word boundary to
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- TITLECASE_LETTER_SET = Impl.read_codepoint_set("cat_set_titlecase") # :nodoc:
7
+ TITLECASE_LETTER_SET = Impl.read_code_point_set("cat_set_titlecase") # :nodoc:
8
8
 
9
9
  # True if the given character has the General_Category
10
10
  # Titlecase_Letter (Lt).
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- PROP_UPPERCASE_SET = Impl.read_codepoint_set("prop_set_uppercase") # :nodoc:
7
+ PROP_UPPERCASE_SET = Impl.read_code_point_set("prop_set_uppercase") # :nodoc:
8
8
 
9
9
  # True if the given character has the Unicode property Uppercase.
10
10
  def uppercase_char?(char)
@@ -4,13 +4,20 @@ module UnicodeUtils
4
4
 
5
5
  # Corresponds to the unicode_utils gem version.
6
6
  #
7
+ # Conforms to Semantic Versioning as documented at semver.org.
8
+ #
9
+ # Summary:
7
10
  # MAJOR.MINOR.PATCHLEVEL
8
11
  # - A backwards incompatible change causes a change in MAJOR
9
12
  # - New features or non-bugfix improvements cause a change in MINOR
10
13
  # - Bugfixes increase only PATCHLEVEL.
14
+ # - Pre-release versions append more info after a dash.
15
+ VERSION = "1.3.0"
16
+
17
+ # The version of Unicode implemented by this version of UnicodeUtils.
11
18
  #
12
- # A release always has an even PATCHLEVEL. PATCHLEVEL is uneven
13
- # during development.
14
- VERSION = "1.2.2"
19
+ # require "unicode_utils/version"
20
+ # puts "Unicode #{UnicodeUtils::UNICODE_VERSION}"
21
+ UNICODE_VERSION = "6.1.0"
15
22
 
16
23
  end
@@ -8,6 +8,10 @@ require "unicode_utils"
8
8
  # Fast tests for almost all UnicodeUtils functions.
9
9
  class TestUnicodeUtils < Test::Unit::TestCase
10
10
 
11
+ def test_unicode_version
12
+ assert_match /\A\d+\.\d+\.\d+\z/, UnicodeUtils::UNICODE_VERSION
13
+ end
14
+
11
15
  def test_name
12
16
  assert_equal "LATIN SMALL LETTER F", UnicodeUtils.char_name("f")
13
17
  assert_equal Encoding::US_ASCII, UnicodeUtils.char_name("f").encoding
@@ -421,19 +425,119 @@ class TestUnicodeUtils < Test::Unit::TestCase
421
425
  io = StringIO.new
422
426
  UnicodeUtils.debug("", io: io)
423
427
  assert_equal <<-'EOF', io.string
424
- Char | Ordinal | Name | General Category | UTF-8
425
- ------+---------+------+------------------+-------
428
+ Char | Ordinal | Sid | General Category | UTF-8
429
+ ------+---------+-----+------------------+-------
426
430
  EOF
427
431
  io = StringIO.new
428
432
  UnicodeUtils.debug("一 \u{100000}\n", io: io)
429
433
  assert_equal <<-'EOF', io.string
430
- Char | Ordinal | Name | General Category | UTF-8
434
+ Char | Ordinal | Sid | General Category | UTF-8
431
435
  ------+---------+----------------------------+------------------+-------------
432
436
  "一" | 4E00 | CJK UNIFIED IDEOGRAPH-4E00 | Other_Letter | E4 B8 80
433
437
  " " | 20 | SPACE | Space_Separator | 20
434
- N/A | 100000 | N/A | Private_Use | F4 80 80 80
435
- "\n" | A | <control> | Control | 0A
438
+ N/A | 100000 | <private-use-100000> | Private_Use | F4 80 80 80
439
+ "\n" | A | LINE FEED | Control | 0A
436
440
  EOF
437
441
  end
438
442
 
443
+ def test_code_point_type
444
+ assert_equal :Graphic, UnicodeUtils.code_point_type("A")
445
+ assert_equal :Graphic, UnicodeUtils.code_point_type("a")
446
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x1cb)
447
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2b5)
448
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10923)
449
+
450
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x5a0)
451
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x93f)
452
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x20dd)
453
+
454
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xa901)
455
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10144)
456
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10917)
457
+
458
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x5f)
459
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2011)
460
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2329)
461
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xfe38)
462
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x201c)
463
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x201d)
464
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2e10)
465
+
466
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xff0b)
467
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xa3)
468
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2c2)
469
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x60f)
470
+
471
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2001)
472
+ assert_equal :Format, UnicodeUtils.code_point_type(0x2028)
473
+ assert_equal :Format, UnicodeUtils.code_point_type(0x2029)
474
+
475
+ assert_equal :Control, UnicodeUtils.code_point_type(0x0)
476
+ assert_equal :Format, UnicodeUtils.code_point_type(0x70f)
477
+ assert_equal :Surrogate, UnicodeUtils.code_point_type(0xdb82)
478
+ assert_equal :Private_Use, UnicodeUtils.code_point_type(0xf1020)
479
+ assert_equal :Private_Use, UnicodeUtils.code_point_type(0x10fffd)
480
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0x10ffff)
481
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xfffe)
482
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xffff)
483
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbfffe)
484
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbffff)
485
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0x380)
486
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xeeb)
487
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xfff)
488
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0x7fffd)
489
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xeffef)
490
+ ### above is at least one assertion for every general category ###
491
+
492
+ assert_equal nil, UnicodeUtils.code_point_type(-1)
493
+ assert_equal nil, UnicodeUtils.code_point_type(0x110000)
494
+ end
495
+
496
+ def test_name_aliases
497
+ assert_equal [UnicodeUtils::NameAlias.new("NULL", :control),
498
+ UnicodeUtils::NameAlias.new("NUL", :abbreviation)],
499
+ UnicodeUtils.name_aliases(0x0)
500
+ assert_equal [UnicodeUtils::NameAlias.new("LATIN CAPITAL LETTER GHA", :correction)],
501
+ UnicodeUtils.name_aliases(0x1a2)
502
+ assert_equal [UnicodeUtils::NameAlias.new("BYTE ORDER MARK", :alternate),
503
+ UnicodeUtils::NameAlias.new("BOM", :abbreviation),
504
+ UnicodeUtils::NameAlias.new("ZWNBSP", :abbreviation)],
505
+ UnicodeUtils.name_aliases(0xfeff)
506
+ assert_equal [UnicodeUtils::NameAlias.new("PADDING CHARACTER", :figment),
507
+ UnicodeUtils::NameAlias.new("PAD", :abbreviation)],
508
+ UnicodeUtils.name_aliases(0x80)
509
+ assert_equal [UnicodeUtils::NameAlias.new("VS256", :abbreviation)],
510
+ UnicodeUtils.name_aliases(0xe01ef)
511
+ assert_equal [UnicodeUtils::NameAlias.new("LINE FEED", :control),
512
+ UnicodeUtils::NameAlias.new("NEW LINE", :control),
513
+ UnicodeUtils::NameAlias.new("END OF LINE", :control),
514
+ UnicodeUtils::NameAlias.new("LF", :abbreviation),
515
+ UnicodeUtils::NameAlias.new("NL", :abbreviation),
516
+ UnicodeUtils::NameAlias.new("EOL", :abbreviation)],
517
+ UnicodeUtils.name_aliases(0xa)
518
+ assert_equal [UnicodeUtils::NameAlias.new("CHARACTER TABULATION", :control),
519
+ UnicodeUtils::NameAlias.new("HORIZONTAL TABULATION", :control),
520
+ UnicodeUtils::NameAlias.new("HT", :abbreviation),
521
+ UnicodeUtils::NameAlias.new("TAB", :abbreviation)],
522
+ UnicodeUtils.name_aliases("\t")
523
+ assert_equal [],
524
+ UnicodeUtils.name_aliases("a")
525
+ end
526
+
527
+ def test_sid
528
+ assert_equal nil, UnicodeUtils.sid(-1)
529
+ assert_equal "NULL", UnicodeUtils.sid(0x0)
530
+ assert_equal "LATIN CAPITAL LETTER GHA", UnicodeUtils.sid(0x1a2)
531
+ assert_equal "LINE FEED", UnicodeUtils.sid(0xa)
532
+ assert_equal "PADDING CHARACTER", UnicodeUtils.sid(0x80)
533
+ assert_equal "BYTE ORDER MARK", UnicodeUtils.sid(0xfeff)
534
+ assert_equal "SPACE", UnicodeUtils.sid(0x20)
535
+ assert_equal "<reserved-0380>", UnicodeUtils.sid(0x380)
536
+ assert_equal "<surrogate-D800>", UnicodeUtils.sid(0xd800)
537
+ assert_equal "<private-use-F0000>", UnicodeUtils.sid(0xf0000)
538
+ assert_equal "<private-use-10FFFD>", UnicodeUtils.sid(0x10fffd)
539
+ assert_equal "<noncharacter-10FFFF>", UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end)
540
+ assert_equal nil, UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end + 1)
541
+ end
542
+
439
543
  end
metadata CHANGED
@@ -1,34 +1,25 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: unicode_utils
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 1
7
- - 2
8
- - 2
9
- version: 1.2.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.0
5
+ prerelease:
10
6
  platform: ruby
11
- authors:
7
+ authors:
12
8
  - Stefan Lang
13
9
  autorequire:
14
10
  bindir: bin
15
11
  cert_chain: []
16
-
17
- date: 2011-11-27 00:00:00 +01:00
18
- default_executable:
12
+ date: 2012-03-07 00:00:00.000000000 Z
19
13
  dependencies: []
20
-
21
14
  description:
22
15
  email: langstefan@gmx.at
23
16
  executables: []
24
-
25
17
  extensions: []
26
-
27
- extra_rdoc_files:
18
+ extra_rdoc_files:
28
19
  - README.txt
29
20
  - INSTALL.txt
30
21
  - CHANGES.txt
31
- files:
22
+ files:
32
23
  - lib/unicode_utils.rb
33
24
  - lib/unicode_utils/conditional_casing.rb
34
25
  - lib/unicode_utils/version.rb
@@ -43,8 +34,12 @@ files:
43
34
  - lib/unicode_utils/general_category.rb
44
35
  - lib/unicode_utils/uppercase_char_q.rb
45
36
  - lib/unicode_utils/upcase.rb
37
+ - lib/unicode_utils/sid.rb
46
38
  - lib/unicode_utils/u.rb
39
+ - lib/unicode_utils/code_point_type.rb
47
40
  - lib/unicode_utils/hangul_syllable_decomposition.rb
41
+ - lib/unicode_utils/name_aliases.rb
42
+ - lib/unicode_utils/name_alias.rb
48
43
  - lib/unicode_utils/soft_dotted_char_q.rb
49
44
  - lib/unicode_utils/lowercase_char_q.rb
50
45
  - lib/unicode_utils/read_cdata.rb
@@ -87,6 +82,7 @@ files:
87
82
  - cdata/general_category_aliases
88
83
  - cdata/canonical_decomposition_map
89
84
  - cdata/cat_set_titlecase
85
+ - cdata/name_aliases
90
86
  - cdata/casefold_f_map
91
87
  - cdata/special_uc_map
92
88
  - cdata/special_tc_map
@@ -107,40 +103,31 @@ files:
107
103
  - INSTALL.txt
108
104
  - LICENSE.txt
109
105
  - CHANGES.txt
110
- has_rdoc: true
111
106
  homepage: http://github.com/lang/unicode_utils
112
107
  licenses: []
113
-
114
108
  post_install_message:
115
- rdoc_options:
109
+ rdoc_options:
116
110
  - --main=README.txt
117
111
  - --charset=UTF-8
118
- require_paths:
112
+ require_paths:
119
113
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
114
+ required_ruby_version: !ruby/object:Gem::Requirement
121
115
  none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- segments:
126
- - 1
127
- - 9
128
- - 1
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
129
119
  version: 1.9.1
130
- required_rubygems_version: !ruby/object:Gem::Requirement
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
121
  none: false
132
- requirements:
133
- - - ">="
134
- - !ruby/object:Gem::Version
135
- segments:
136
- - 0
137
- version: "0"
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
138
126
  requirements: []
139
-
140
127
  rubyforge_project: unicode-utils
141
- rubygems_version: 1.3.7
128
+ rubygems_version: 1.8.11
142
129
  signing_key:
143
130
  specification_version: 3
144
131
  summary: additional Unicode aware functions for Ruby 1.9
145
- test_files:
132
+ test_files:
146
133
  - test/test_unicode_utils.rb