unicode-name 1.13.1 → 1.13.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ad37a86c74a53f8ed0fd5b7d99958e63f06537392c1156810405d8cf83e98082
4
- data.tar.gz: ec21c556d90d33b7dbef6ac61cfca62805505f4a3afba068a5efc8f3d967b686
3
+ metadata.gz: 7d5b492a10c5d32a1452b2c01a98dce50e1dbeedbe75b5fc3d73aae7cac013d8
4
+ data.tar.gz: d1b1cd8ced4f9685c31833b15e7d441469ec3117911f46a13b07cbd7ea33426b
5
5
  SHA512:
6
- metadata.gz: 5a86c60936a59105991a4e769ecfa2ea6cff752ab85a4ad87ca800d2455d13cf2b276cfa20a4d281bc4b91eb4f9c7155293b4da8a454d544a992045b66b9006e
7
- data.tar.gz: 8e2ad3afb63a2dda11dee3b2a5b3e7ea24e275e3744fbfe14615b8cca698ae14b2d9784cf6180e4d4136eb058fe1ae4b1a1fddd1ff34fc220ac059a0639586fb
6
+ metadata.gz: 20e926fc2271e4421359f767a0b601c52e37c012c8bb804c1318df4bf8d50686bfd8206d6c0bf26454921e14aae237c255c734e15aaede0ef875a398a91757e7
7
+ data.tar.gz: c2c46f3bd4dd9b83f32343c7d18e15a16a1af440a51db7c0512f3acdd0382a6b5cb7653653b3ed9695ebd277172b7d8c15bedda517ce9d8028b4db0bf475f4bb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.13.3
4
+
5
+ - Fix regression introduced in 1.13.2 that caused some CJK ideographs to be missing
6
+
7
+ ### 1.13.2
8
+
9
+ - Optimize index size by removing ranges of characters whose names have the codepoint embedded
10
+ - Optimize index size by substituting common words
11
+ - Fix missing Tangut ideographs
12
+
3
13
  ### 1.13.1
4
14
 
5
15
  Bugfix release:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-name (1.13.1)
4
+ unicode-name (1.13.3)
5
5
  unicode-types (~> 1.10)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -42,7 +42,11 @@ Unicode::Name.readable("\0") # => "NULL"
42
42
  Unicode::Name.readable("\u{FFFFD}") # => "<private-use-FFFFD>"
43
43
  ```
44
44
 
45
- See [unicode-sequence_names](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences.
45
+ See [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences. This is how you could use both libraries together to get the most relevant name of a character:
46
+
47
+ ```ruby
48
+ name = Unicode::SequenceName.of(char) || Unicode::Name.readable(char)
49
+ ```
46
50
 
47
51
  See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries.
48
52
 
data/data/name.marshal.gz CHANGED
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Unicode
4
4
  module Name
5
- VERSION = "1.13.1"
5
+ VERSION = "1.13.3"
6
6
  UNICODE_VERSION = "16.0.0"
7
7
  DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/").freeze
8
8
  INDEX_FILENAME = (DATA_DIRECTORY + "/name.marshal.gz").freeze
data/lib/unicode/name.rb CHANGED
@@ -11,11 +11,18 @@ module Unicode
11
11
  def self.unicode_name(char)
12
12
  codepoint = char.unpack("U")[0]
13
13
  require_relative "name/index" unless defined? ::Unicode::Name::INDEX
14
+
14
15
  if res = INDEX[:NAMES][codepoint]
15
- res
16
- elsif INDEX[:CJK].any?{ |cjk_range| codepoint >= cjk_range[0] && codepoint <= cjk_range[1] }
17
- "CJK UNIFIED IDEOGRAPH-%.4X" % codepoint
18
- elsif codepoint >= HANGUL_START && codepoint <= HANGUL_END
16
+ return insert_words(res)
17
+ end
18
+
19
+ INDEX[:CP_RANGES].each{|prefix, range|
20
+ if range.any?{ |range| codepoint >= range[0] && codepoint <= range[1] }
21
+ return "%s%.4X" %[prefix, codepoint]
22
+ end
23
+ }
24
+
25
+ if codepoint >= HANGUL_START && codepoint <= HANGUL_END
19
26
  "HANGUL SYLLABLE %s" % hangul_decomposition(codepoint)
20
27
  else
21
28
  nil
@@ -72,8 +79,6 @@ module Unicode
72
79
  label(char)
73
80
  end
74
81
 
75
- private
76
-
77
82
  # See https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
78
83
  def self.hangul_decomposition(codepoint)
79
84
  base = codepoint - HANGUL_START
@@ -82,6 +87,22 @@ module Unicode
82
87
  initial = base / HANGUL_MEDIAL_MAX
83
88
  "#{INDEX[:JAMO][:INITIAL][initial]}#{INDEX[:JAMO][:MEDIAL][medial]}#{INDEX[:JAMO][:FINAL][final]}"
84
89
  end
90
+
91
+ def self.insert_words(raw_name)
92
+ raw_name.chars.map{ |char|
93
+ codepoint = char.ord
94
+ if codepoint < INDEX[:REPLACE_BASE]
95
+ char
96
+ else
97
+ "#{INDEX[:COMMON_WORDS][codepoint - INDEX[:REPLACE_BASE]]} "
98
+ end
99
+ }.join.chomp
100
+ end
101
+
102
+ class << self
103
+ private :hangul_decomposition
104
+ private :insert_words
105
+ end
85
106
  end
86
107
  end
87
108
 
@@ -9,11 +9,12 @@ describe Unicode::Name do
9
9
  assert_equal "REPLACEMENT CHARACTER", Unicode::Name.of("�")
10
10
  end
11
11
 
12
- it "works for CJK Ideographs" do
12
+ it "works for CJK unified ideographs" do
13
13
  assert_equal "CJK UNIFIED IDEOGRAPH-4E01", Unicode::Name.of("丁")
14
+ assert_equal "SQUARED CJK UNIFIED IDEOGRAPH-6709", Unicode::Name.of("🈶")
14
15
  end
15
16
 
16
- it "works for Hangul Syllables" do
17
+ it "works for Hangul syllables" do
17
18
  assert_equal "HANGUL SYLLABLE HAN", Unicode::Name.of("한")
18
19
  assert_equal "HANGUL SYLLABLE GAG", Unicode::Name.of("각")
19
20
  assert_equal "HANGUL SYLLABLE GAE", Unicode::Name.of("개")
@@ -21,6 +22,14 @@ describe Unicode::Name do
21
22
  assert_equal "HANGUL SYLLABLE DWALB", Unicode::Name.of("돫")
22
23
  end
23
24
 
25
+ it "works with some ranges that have the codepoint embedded" do
26
+ assert_equal "EGYPTIAN HIEROGLYPH-143F5", Unicode::Name.of("𔏵")
27
+ assert_equal "KHITAN SMALL SCRIPT CHARACTER-18C12", Unicode::Name.of("𘰒")
28
+ assert_equal "TANGUT IDEOGRAPH-18D00", Unicode::Name.of("𘴀")
29
+ assert_equal "NUSHU CHARACTER-1B171", Unicode::Name.of("𛅱")
30
+ assert_equal "CJK COMPATIBILITY IDEOGRAPH-2F9B1", Unicode::Name.of("𧃒")
31
+ end
32
+
24
33
  it "will return nil for characters without name" do
25
34
  assert_nil Unicode::Name.of("\u{10c50}")
26
35
  assert_nil Unicode::Name.of("\0")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode-name
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.13.1
4
+ version: 1.13.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-03 00:00:00.000000000 Z
11
+ date: 2024-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode-types
@@ -68,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0'
70
70
  requirements: []
71
- rubygems_version: 3.5.20
71
+ rubygems_version: 3.5.21
72
72
  signing_key:
73
73
  specification_version: 4
74
74
  summary: Returns name/aliases/label of a Unicode codepoint