unicode_utils 1.2.2 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGES.txt +14 -0
  2. data/LICENSE.txt +1 -1
  3. data/cdata/canonical_decomposition_map +1 -1
  4. data/cdata/case_ignorable_set +1 -1
  5. data/cdata/casefold_c_map +1 -1
  6. data/cdata/combining_class_map +1 -1
  7. data/cdata/compatibility_decomposition_map +1 -1
  8. data/cdata/composition_exclusion_set +1 -1
  9. data/cdata/east_asian_width_property_per_cp +1 -1
  10. data/cdata/east_asian_width_property_ranges +1 -1
  11. data/cdata/general_category_per_cp +1 -1
  12. data/cdata/general_category_ranges +1 -1
  13. data/cdata/grapheme_break_property +1 -1
  14. data/cdata/name_aliases +1 -0
  15. data/cdata/names +731 -0
  16. data/cdata/prop_set_lowercase +1 -1
  17. data/cdata/prop_set_uppercase +1 -1
  18. data/cdata/simple_lc_map +1 -1
  19. data/cdata/simple_tc_map +1 -1
  20. data/cdata/simple_uc_map +1 -1
  21. data/cdata/word_break_property +1 -1
  22. data/lib/unicode_utils.rb +6 -3
  23. data/lib/unicode_utils/canonical_decomposition.rb +2 -2
  24. data/lib/unicode_utils/case_ignorable_char_q.rb +1 -1
  25. data/lib/unicode_utils/char_display_width.rb +2 -2
  26. data/lib/unicode_utils/char_name.rb +13 -3
  27. data/lib/unicode_utils/char_type.rb +1 -1
  28. data/lib/unicode_utils/code_point_type.rb +70 -0
  29. data/lib/unicode_utils/codepoint.rb +5 -5
  30. data/lib/unicode_utils/compatibility_decomposition.rb +1 -1
  31. data/lib/unicode_utils/debug.rb +5 -5
  32. data/lib/unicode_utils/default_ignorable_char_q.rb +2 -2
  33. data/lib/unicode_utils/display_width.rb +3 -3
  34. data/lib/unicode_utils/each_grapheme.rb +2 -2
  35. data/lib/unicode_utils/each_word.rb +1 -1
  36. data/lib/unicode_utils/east_asian_width.rb +2 -2
  37. data/lib/unicode_utils/gc.rb +1 -1
  38. data/lib/unicode_utils/general_category.rb +1 -1
  39. data/lib/unicode_utils/lowercase_char_q.rb +1 -1
  40. data/lib/unicode_utils/name_alias.rb +46 -0
  41. data/lib/unicode_utils/name_aliases.rb +29 -0
  42. data/lib/unicode_utils/nfc.rb +3 -3
  43. data/lib/unicode_utils/read_cdata.rb +36 -4
  44. data/lib/unicode_utils/sid.rb +63 -0
  45. data/lib/unicode_utils/simple_casefold.rb +2 -2
  46. data/lib/unicode_utils/simple_downcase.rb +2 -2
  47. data/lib/unicode_utils/simple_upcase.rb +2 -2
  48. data/lib/unicode_utils/soft_dotted_char_q.rb +1 -1
  49. data/lib/unicode_utils/titlecase.rb +1 -1
  50. data/lib/unicode_utils/titlecase_char_q.rb +1 -1
  51. data/lib/unicode_utils/uppercase_char_q.rb +1 -1
  52. data/lib/unicode_utils/version.rb +10 -3
  53. data/test/test_unicode_utils.rb +109 -5
  54. metadata +26 -39
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/name_alias"
4
+ require "unicode_utils/read_cdata"
5
+
6
+ module UnicodeUtils
7
+
8
+ NAME_ALIASES_MAP = Impl.read_name_aliases("name_aliases") # :nodoc:
9
+ NAME_ALIASES_MAP.default = [].freeze
10
+
11
+ # Get an Enumerable of formal name aliases of the given character. Returns an
12
+ # empty Enumerable if the character doesn't have an alias.
13
+ #
14
+ # The aliases are instances of UnicodeUtils::NameAlias, the order of the
15
+ # aliases in the returned Enumerable is preserved from NameAliases.txt in the
16
+ # Unicode Character Database.
17
+ #
18
+ # Example:
19
+ #
20
+ # require "unicode_utils/name_aliases"
21
+ # UnicodeUtils.name_aliases("\n").map(&:name) # => ["LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL"]
22
+ #
23
+ # See also: UnicodeUtils.char_name
24
+ def name_aliases(char)
25
+ NAME_ALIASES_MAP[char.ord]
26
+ end
27
+ module_function :name_aliases
28
+
29
+ end
@@ -9,7 +9,7 @@ module UnicodeUtils
9
9
  module Impl # :nodoc:all
10
10
 
11
11
  COMPOSITION_EXCLUSION_SET =
12
- Impl.read_codepoint_set("composition_exclusion_set")
12
+ Impl.read_code_point_set("composition_exclusion_set")
13
13
 
14
14
  CANONICAL_COMPOSITION_MAP = Hash.new.tap do |m|
15
15
  CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
@@ -124,8 +124,8 @@ module UnicodeUtils
124
124
  # Get +str+ in Normalization Form C.
125
125
  #
126
126
  # The Unicode standard has multiple representations for some
127
- # characters. One representation as a single codepoint and other
128
- # representation(s) as a combination of multiple codepoints. This
127
+ # characters. One representation as a single code point and other
128
+ # representation(s) as a combination of multiple code points. This
129
129
  # function "composes" these characters into the former
130
130
  # representation.
131
131
  #
@@ -16,11 +16,19 @@ module UnicodeUtils
16
16
  5 => :Narrow
17
17
  }.freeze
18
18
 
19
+ NAME_ALIAS_TYPE_TO_SYMBOL_MAP = {
20
+ 1 => :correction,
21
+ 2 => :control,
22
+ 3 => :alternate,
23
+ 4 => :figment,
24
+ 5 => :abbreviation
25
+ }.freeze
26
+
19
27
  def self.open_cdata_file(filename, &block)
20
28
  File.open(File.join(CDATA_DIR, filename), "r:US-ASCII:-", &block)
21
29
  end
22
30
 
23
- def self.read_codepoint_set(filename)
31
+ def self.read_code_point_set(filename)
24
32
  Hash.new.tap { |set|
25
33
  open_cdata_file(filename) do |input|
26
34
  buffer = "x" * 6
@@ -32,7 +40,7 @@ module UnicodeUtils
32
40
  }
33
41
  end
34
42
 
35
- def self.read_codepoint_map(filename)
43
+ def self.read_code_point_map(filename)
36
44
  Hash.new.tap { |map|
37
45
  open_cdata_file(filename) do |input|
38
46
  buffer = "x" * 6
@@ -104,7 +112,7 @@ module UnicodeUtils
104
112
  }
105
113
  end
106
114
 
107
- # Read a map whose keys are codepoints (6 hexdigits, converted to
115
+ # Read a map whose keys are code points (6 hexdigits, converted to
108
116
  # integer) and whose values are single hexdigits (converted to
109
117
  # integer).
110
118
  def self.read_hexdigit_map(filename)
@@ -122,7 +130,7 @@ module UnicodeUtils
122
130
  end
123
131
 
124
132
  # Returns a list (array) of pairs (two element Arrays) of Range
125
- # (codepoints) and associated integer value.
133
+ # (code points) and associated integer value.
126
134
  def self.read_range_to_hexdigit_list(filename)
127
135
  Array.new.tap { |list|
128
136
  open_cdata_file(filename) do |input|
@@ -208,6 +216,30 @@ module UnicodeUtils
208
216
  }
209
217
  end
210
218
 
219
+ def self.read_name_aliases(filename)
220
+ Hash.new.tap { |map|
221
+ open_cdata_file(filename) do |input|
222
+ cp_buffer = "x" * 6
223
+ cp_buffer.force_encoding(Encoding::US_ASCII)
224
+ ac_buffer = "x" * 1
225
+ ac_buffer.force_encoding(Encoding::US_ASCII)
226
+ at_buffer = "x" * 1
227
+ at_buffer.force_encoding(Encoding::US_ASCII)
228
+ al_buffer = "x" * 2
229
+ al_buffer.force_encoding(Encoding::US_ASCII)
230
+ while input.read(6, cp_buffer)
231
+ aliases = Array.new(input.read(1, ac_buffer).to_i(16))
232
+ 0.upto(aliases.length - 1) { |i|
233
+ type = NAME_ALIAS_TYPE_TO_SYMBOL_MAP[input.read(1, at_buffer).to_i(16)]
234
+ name = input.read(input.read(2, al_buffer).to_i(16))
235
+ aliases[i] = NameAlias.new(name.freeze, type)
236
+ }
237
+ map[cp_buffer.to_i(16)] = aliases.freeze
238
+ end
239
+ end
240
+ }
241
+ end
242
+
211
243
  end
212
244
 
213
245
  end
@@ -0,0 +1,63 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require "unicode_utils/name_aliases"
4
+ require "unicode_utils/code_point_type"
5
+
6
+ module UnicodeUtils
7
+
8
+ CP_PREFERRED_ALIAS_STRING_MAP = Hash.new.tap do |map|
9
+ NAME_ALIASES_MAP.each { |cp, aliases|
10
+ al =
11
+ (aliases.find { |al| al.type == :correction } ||
12
+ aliases.find { |al| al.type == :control } ||
13
+ aliases.find { |al| al.type == :figment } ||
14
+ aliases.find { |al| al.type == :alternate })
15
+ map[cp] = al.name if al
16
+ }
17
+ end #:nodoc:
18
+
19
+ # Returns a unique string identifier for every code point. Returns
20
+ # nil if +code_point+ is not in the Unicode codespace. +code_point+
21
+ # must be an Integer.
22
+ #
23
+ # The returned string identifier is either the non-empty Name
24
+ # property value of +code_point+, a non-empty Name_Alias string
25
+ # property value of +code_point+, or the code point label as
26
+ # described by section "Code Point Labels" in chapter 4.8 "Name" of
27
+ # the Unicode standard.
28
+ #
29
+ # If the returned identifier starts with "<", it is a code point
30
+ # label and it ends with ">". Otherwise it is the normative name or
31
+ # a formal alias string.
32
+ #
33
+ # The exact name/alias/label selection algorithm may change even in
34
+ # minor UnicodeUtils releases, but overall behaviour will stay the
35
+ # same in spirit.
36
+ #
37
+ # The selection process in this version of UnicodeUtils is:
38
+ # 1. Use an alias of type :correction, :control, :figment or
39
+ # :alternate (with listed precedence) if available
40
+ # 2. Use the Unicode Name property value if it is not empty
41
+ # 3. Construct a code point label in angle brackets.
42
+ #
43
+ # Examples:
44
+ #
45
+ # require "unicode_utils/sid"
46
+ #
47
+ # U.sid 0xa # => "LINE FEED"
48
+ # U.sid 0x0 # => "NULL"
49
+ # U.sid 0xfeff # => "BYTE ORDER MARK"
50
+ # U.sid 0xe000 # => "<private-use-E000>"
51
+ # U.sid 0x61 # => "LATIN SMALL LETTER A"
52
+ # U.sid -1 # => nil
53
+ def sid(code_point)
54
+ s = CP_PREFERRED_ALIAS_STRING_MAP[code_point] and return s
55
+ cn = UnicodeUtils.char_name(code_point)
56
+ return cn if cn && cn !~ /\A(\<|\z)/
57
+ ct = UnicodeUtils.code_point_type(code_point) or return nil
58
+ ts = ct.to_s.downcase.gsub('_', '-')
59
+ "<#{ts}-#{code_point.to_s(16).upcase.rjust(4, '0')}>"
60
+ end
61
+ module_function :sid
62
+
63
+ end
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- CASEFOLD_C_MAP = Impl.read_codepoint_map("casefold_c_map") # :nodoc:
7
+ CASEFOLD_C_MAP = Impl.read_code_point_map("casefold_c_map") # :nodoc:
8
8
 
9
- CASEFOLD_S_MAP = Impl.read_codepoint_map("casefold_s_map") # :nodoc:
9
+ CASEFOLD_S_MAP = Impl.read_code_point_map("casefold_s_map") # :nodoc:
10
10
 
11
11
  # Perform simple case folding. Contrary to full case folding, this
12
12
  # uses only one to one mappings, so that the length of the returned
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SIMPLE_DOWNCASE_MAP = Impl.read_codepoint_map("simple_lc_map") # :nodoc:
7
+ SIMPLE_DOWNCASE_MAP = Impl.read_code_point_map("simple_lc_map") # :nodoc:
8
8
 
9
- # Map each codepoint in +str+ that has a single codepoint
9
+ # Map each code point in +str+ that has a single code point
10
10
  # lowercase-mapping to that lowercase mapping. The returned string
11
11
  # has the same length as the original string.
12
12
  #
@@ -4,9 +4,9 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SIMPLE_UPCASE_MAP = Impl.read_codepoint_map("simple_uc_map") # :nodoc:
7
+ SIMPLE_UPCASE_MAP = Impl.read_code_point_map("simple_uc_map") # :nodoc:
8
8
 
9
- # Map each codepoint in +str+ that has a single codepoint
9
+ # Map each code point in +str+ that has a single code point
10
10
  # uppercase-mapping to that uppercase mapping. The returned string
11
11
  # has the same length as the original string.
12
12
  #
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- SOFT_DOTTED_SET = Impl.read_codepoint_set("soft_dotted_set") # :nodoc:
7
+ SOFT_DOTTED_SET = Impl.read_code_point_set("soft_dotted_set") # :nodoc:
8
8
 
9
9
  # Returns true if the given character has the Unicode property
10
10
  # Soft_Dotted.
@@ -8,7 +8,7 @@ require "unicode_utils/downcase"
8
8
 
9
9
  module UnicodeUtils
10
10
 
11
- SIMPLE_TITLECASE_MAP = Impl.read_codepoint_map("simple_tc_map") # :nodoc:
11
+ SIMPLE_TITLECASE_MAP = Impl.read_code_point_map("simple_tc_map") # :nodoc:
12
12
  SPECIAL_TITLECASE_MAP = Impl.read_multivalued_map("special_tc_map") # :nodoc:
13
13
 
14
14
  # Convert the first cased character after each word boundary to
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- TITLECASE_LETTER_SET = Impl.read_codepoint_set("cat_set_titlecase") # :nodoc:
7
+ TITLECASE_LETTER_SET = Impl.read_code_point_set("cat_set_titlecase") # :nodoc:
8
8
 
9
9
  # True if the given character has the General_Category
10
10
  # Titlecase_Letter (Lt).
@@ -4,7 +4,7 @@ require "unicode_utils/read_cdata"
4
4
 
5
5
  module UnicodeUtils
6
6
 
7
- PROP_UPPERCASE_SET = Impl.read_codepoint_set("prop_set_uppercase") # :nodoc:
7
+ PROP_UPPERCASE_SET = Impl.read_code_point_set("prop_set_uppercase") # :nodoc:
8
8
 
9
9
  # True if the given character has the Unicode property Uppercase.
10
10
  def uppercase_char?(char)
@@ -4,13 +4,20 @@ module UnicodeUtils
4
4
 
5
5
  # Corresponds to the unicode_utils gem version.
6
6
  #
7
+ # Conforms to Semantic Versioning as documented at semver.org.
8
+ #
9
+ # Summary:
7
10
  # MAJOR.MINOR.PATCHLEVEL
8
11
  # - A backwards incompatible change causes a change in MAJOR
9
12
  # - New features or non-bugfix improvements cause a change in MINOR
10
13
  # - Bugfixes increase only PATCHLEVEL.
14
+ # - Pre-release versions append more info after a dash.
15
+ VERSION = "1.3.0"
16
+
17
+ # The version of Unicode implemented by this version of UnicodeUtils.
11
18
  #
12
- # A release always has an even PATCHLEVEL. PATCHLEVEL is uneven
13
- # during development.
14
- VERSION = "1.2.2"
19
+ # require "unicode_utils/version"
20
+ # puts "Unicode #{UnicodeUtils::UNICODE_VERSION}"
21
+ UNICODE_VERSION = "6.1.0"
15
22
 
16
23
  end
@@ -8,6 +8,10 @@ require "unicode_utils"
8
8
  # Fast tests for almost all UnicodeUtils functions.
9
9
  class TestUnicodeUtils < Test::Unit::TestCase
10
10
 
11
+ def test_unicode_version
12
+ assert_match /\A\d+\.\d+\.\d+\z/, UnicodeUtils::UNICODE_VERSION
13
+ end
14
+
11
15
  def test_name
12
16
  assert_equal "LATIN SMALL LETTER F", UnicodeUtils.char_name("f")
13
17
  assert_equal Encoding::US_ASCII, UnicodeUtils.char_name("f").encoding
@@ -421,19 +425,119 @@ class TestUnicodeUtils < Test::Unit::TestCase
421
425
  io = StringIO.new
422
426
  UnicodeUtils.debug("", io: io)
423
427
  assert_equal <<-'EOF', io.string
424
- Char | Ordinal | Name | General Category | UTF-8
425
- ------+---------+------+------------------+-------
428
+ Char | Ordinal | Sid | General Category | UTF-8
429
+ ------+---------+-----+------------------+-------
426
430
  EOF
427
431
  io = StringIO.new
428
432
  UnicodeUtils.debug("一 \u{100000}\n", io: io)
429
433
  assert_equal <<-'EOF', io.string
430
- Char | Ordinal | Name | General Category | UTF-8
434
+ Char | Ordinal | Sid | General Category | UTF-8
431
435
  ------+---------+----------------------------+------------------+-------------
432
436
  "一" | 4E00 | CJK UNIFIED IDEOGRAPH-4E00 | Other_Letter | E4 B8 80
433
437
  " " | 20 | SPACE | Space_Separator | 20
434
- N/A | 100000 | N/A | Private_Use | F4 80 80 80
435
- "\n" | A | <control> | Control | 0A
438
+ N/A | 100000 | <private-use-100000> | Private_Use | F4 80 80 80
439
+ "\n" | A | LINE FEED | Control | 0A
436
440
  EOF
437
441
  end
438
442
 
443
+ def test_code_point_type
444
+ assert_equal :Graphic, UnicodeUtils.code_point_type("A")
445
+ assert_equal :Graphic, UnicodeUtils.code_point_type("a")
446
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x1cb)
447
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2b5)
448
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10923)
449
+
450
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x5a0)
451
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x93f)
452
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x20dd)
453
+
454
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xa901)
455
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10144)
456
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x10917)
457
+
458
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x5f)
459
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2011)
460
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2329)
461
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xfe38)
462
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x201c)
463
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x201d)
464
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2e10)
465
+
466
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xff0b)
467
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0xa3)
468
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2c2)
469
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x60f)
470
+
471
+ assert_equal :Graphic, UnicodeUtils.code_point_type(0x2001)
472
+ assert_equal :Format, UnicodeUtils.code_point_type(0x2028)
473
+ assert_equal :Format, UnicodeUtils.code_point_type(0x2029)
474
+
475
+ assert_equal :Control, UnicodeUtils.code_point_type(0x0)
476
+ assert_equal :Format, UnicodeUtils.code_point_type(0x70f)
477
+ assert_equal :Surrogate, UnicodeUtils.code_point_type(0xdb82)
478
+ assert_equal :Private_Use, UnicodeUtils.code_point_type(0xf1020)
479
+ assert_equal :Private_Use, UnicodeUtils.code_point_type(0x10fffd)
480
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0x10ffff)
481
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xfffe)
482
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xffff)
483
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbfffe)
484
+ assert_equal :Noncharacter, UnicodeUtils.code_point_type(0xbffff)
485
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0x380)
486
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xeeb)
487
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xfff)
488
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0x7fffd)
489
+ assert_equal :Reserved, UnicodeUtils.code_point_type(0xeffef)
490
+ ### above is at least one assertion for every general category ###
491
+
492
+ assert_equal nil, UnicodeUtils.code_point_type(-1)
493
+ assert_equal nil, UnicodeUtils.code_point_type(0x110000)
494
+ end
495
+
496
+ def test_name_aliases
497
+ assert_equal [UnicodeUtils::NameAlias.new("NULL", :control),
498
+ UnicodeUtils::NameAlias.new("NUL", :abbreviation)],
499
+ UnicodeUtils.name_aliases(0x0)
500
+ assert_equal [UnicodeUtils::NameAlias.new("LATIN CAPITAL LETTER GHA", :correction)],
501
+ UnicodeUtils.name_aliases(0x1a2)
502
+ assert_equal [UnicodeUtils::NameAlias.new("BYTE ORDER MARK", :alternate),
503
+ UnicodeUtils::NameAlias.new("BOM", :abbreviation),
504
+ UnicodeUtils::NameAlias.new("ZWNBSP", :abbreviation)],
505
+ UnicodeUtils.name_aliases(0xfeff)
506
+ assert_equal [UnicodeUtils::NameAlias.new("PADDING CHARACTER", :figment),
507
+ UnicodeUtils::NameAlias.new("PAD", :abbreviation)],
508
+ UnicodeUtils.name_aliases(0x80)
509
+ assert_equal [UnicodeUtils::NameAlias.new("VS256", :abbreviation)],
510
+ UnicodeUtils.name_aliases(0xe01ef)
511
+ assert_equal [UnicodeUtils::NameAlias.new("LINE FEED", :control),
512
+ UnicodeUtils::NameAlias.new("NEW LINE", :control),
513
+ UnicodeUtils::NameAlias.new("END OF LINE", :control),
514
+ UnicodeUtils::NameAlias.new("LF", :abbreviation),
515
+ UnicodeUtils::NameAlias.new("NL", :abbreviation),
516
+ UnicodeUtils::NameAlias.new("EOL", :abbreviation)],
517
+ UnicodeUtils.name_aliases(0xa)
518
+ assert_equal [UnicodeUtils::NameAlias.new("CHARACTER TABULATION", :control),
519
+ UnicodeUtils::NameAlias.new("HORIZONTAL TABULATION", :control),
520
+ UnicodeUtils::NameAlias.new("HT", :abbreviation),
521
+ UnicodeUtils::NameAlias.new("TAB", :abbreviation)],
522
+ UnicodeUtils.name_aliases("\t")
523
+ assert_equal [],
524
+ UnicodeUtils.name_aliases("a")
525
+ end
526
+
527
+ def test_sid
528
+ assert_equal nil, UnicodeUtils.sid(-1)
529
+ assert_equal "NULL", UnicodeUtils.sid(0x0)
530
+ assert_equal "LATIN CAPITAL LETTER GHA", UnicodeUtils.sid(0x1a2)
531
+ assert_equal "LINE FEED", UnicodeUtils.sid(0xa)
532
+ assert_equal "PADDING CHARACTER", UnicodeUtils.sid(0x80)
533
+ assert_equal "BYTE ORDER MARK", UnicodeUtils.sid(0xfeff)
534
+ assert_equal "SPACE", UnicodeUtils.sid(0x20)
535
+ assert_equal "<reserved-0380>", UnicodeUtils.sid(0x380)
536
+ assert_equal "<surrogate-D800>", UnicodeUtils.sid(0xd800)
537
+ assert_equal "<private-use-F0000>", UnicodeUtils.sid(0xf0000)
538
+ assert_equal "<private-use-10FFFD>", UnicodeUtils.sid(0x10fffd)
539
+ assert_equal "<noncharacter-10FFFF>", UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end)
540
+ assert_equal nil, UnicodeUtils.sid(UnicodeUtils::Codepoint::RANGE.end + 1)
541
+ end
542
+
439
543
  end
metadata CHANGED
@@ -1,34 +1,25 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: unicode_utils
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 1
7
- - 2
8
- - 2
9
- version: 1.2.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.0
5
+ prerelease:
10
6
  platform: ruby
11
- authors:
7
+ authors:
12
8
  - Stefan Lang
13
9
  autorequire:
14
10
  bindir: bin
15
11
  cert_chain: []
16
-
17
- date: 2011-11-27 00:00:00 +01:00
18
- default_executable:
12
+ date: 2012-03-07 00:00:00.000000000 Z
19
13
  dependencies: []
20
-
21
14
  description:
22
15
  email: langstefan@gmx.at
23
16
  executables: []
24
-
25
17
  extensions: []
26
-
27
- extra_rdoc_files:
18
+ extra_rdoc_files:
28
19
  - README.txt
29
20
  - INSTALL.txt
30
21
  - CHANGES.txt
31
- files:
22
+ files:
32
23
  - lib/unicode_utils.rb
33
24
  - lib/unicode_utils/conditional_casing.rb
34
25
  - lib/unicode_utils/version.rb
@@ -43,8 +34,12 @@ files:
43
34
  - lib/unicode_utils/general_category.rb
44
35
  - lib/unicode_utils/uppercase_char_q.rb
45
36
  - lib/unicode_utils/upcase.rb
37
+ - lib/unicode_utils/sid.rb
46
38
  - lib/unicode_utils/u.rb
39
+ - lib/unicode_utils/code_point_type.rb
47
40
  - lib/unicode_utils/hangul_syllable_decomposition.rb
41
+ - lib/unicode_utils/name_aliases.rb
42
+ - lib/unicode_utils/name_alias.rb
48
43
  - lib/unicode_utils/soft_dotted_char_q.rb
49
44
  - lib/unicode_utils/lowercase_char_q.rb
50
45
  - lib/unicode_utils/read_cdata.rb
@@ -87,6 +82,7 @@ files:
87
82
  - cdata/general_category_aliases
88
83
  - cdata/canonical_decomposition_map
89
84
  - cdata/cat_set_titlecase
85
+ - cdata/name_aliases
90
86
  - cdata/casefold_f_map
91
87
  - cdata/special_uc_map
92
88
  - cdata/special_tc_map
@@ -107,40 +103,31 @@ files:
107
103
  - INSTALL.txt
108
104
  - LICENSE.txt
109
105
  - CHANGES.txt
110
- has_rdoc: true
111
106
  homepage: http://github.com/lang/unicode_utils
112
107
  licenses: []
113
-
114
108
  post_install_message:
115
- rdoc_options:
109
+ rdoc_options:
116
110
  - --main=README.txt
117
111
  - --charset=UTF-8
118
- require_paths:
112
+ require_paths:
119
113
  - lib
120
- required_ruby_version: !ruby/object:Gem::Requirement
114
+ required_ruby_version: !ruby/object:Gem::Requirement
121
115
  none: false
122
- requirements:
123
- - - ">="
124
- - !ruby/object:Gem::Version
125
- segments:
126
- - 1
127
- - 9
128
- - 1
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
129
119
  version: 1.9.1
130
- required_rubygems_version: !ruby/object:Gem::Requirement
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
121
  none: false
132
- requirements:
133
- - - ">="
134
- - !ruby/object:Gem::Version
135
- segments:
136
- - 0
137
- version: "0"
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
138
126
  requirements: []
139
-
140
127
  rubyforge_project: unicode-utils
141
- rubygems_version: 1.3.7
128
+ rubygems_version: 1.8.11
142
129
  signing_key:
143
130
  specification_version: 3
144
131
  summary: additional Unicode aware functions for Ruby 1.9
145
- test_files:
132
+ test_files:
146
133
  - test/test_unicode_utils.rb