unicode-name 1.13.1 → 1.13.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ad37a86c74a53f8ed0fd5b7d99958e63f06537392c1156810405d8cf83e98082
4
- data.tar.gz: ec21c556d90d33b7dbef6ac61cfca62805505f4a3afba068a5efc8f3d967b686
3
+ metadata.gz: ca6d8f90ce7c5fa9c9da362be1d90b10260da01d8ac97e2412e0699fa69ca40a
4
+ data.tar.gz: a3a2a417c76906c32fe429ce51e16c543696687bb0078340aecb293a65595800
5
5
  SHA512:
6
- metadata.gz: 5a86c60936a59105991a4e769ecfa2ea6cff752ab85a4ad87ca800d2455d13cf2b276cfa20a4d281bc4b91eb4f9c7155293b4da8a454d544a992045b66b9006e
7
- data.tar.gz: 8e2ad3afb63a2dda11dee3b2a5b3e7ea24e275e3744fbfe14615b8cca698ae14b2d9784cf6180e4d4136eb058fe1ae4b1a1fddd1ff34fc220ac059a0639586fb
6
+ metadata.gz: 9ad0910912fcf5e226e00955c72cd3325796acc49137ce2e9c141fdcaa5518585fb38795de6587cd792d22b428110d08592a212220dc09a91f3e92016140a86a
7
+ data.tar.gz: 5b8de2a4c57c893d6e18ef4ce5b876a032f0cc0f3726504753a0dfbf9b7b4e4bf18d2b6b7aadf1a976231079d285872a6203e352d66fe26154883c31237d9aca
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.13.2
4
+
5
+ - Optimize index size by removing ranges that have codepoints embedded
6
+ - Optimize index size by substituting common words
7
+ - Fix missing Tangut ideographs
8
+
3
9
  ### 1.13.1
4
10
 
5
11
  Bugfix release:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-name (1.13.1)
4
+ unicode-name (1.13.2)
5
5
  unicode-types (~> 1.10)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -42,7 +42,11 @@ Unicode::Name.readable("\0") # => "NULL"
42
42
  Unicode::Name.readable("\u{FFFFD}") # => "<private-use-FFFFD>"
43
43
  ```
44
44
 
45
- See [unicode-sequence_names](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences.
45
+ See [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences. This is how you could use both libraries together to get the most relevant name of a character:
46
+
47
+ ```ruby
48
+ name = Unicode::SequenceName.of(char) || Unicode::Name.readable(char)
49
+ ```
46
50
 
47
51
  See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries.
48
52
 
data/data/name.marshal.gz CHANGED
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Unicode
4
4
  module Name
5
- VERSION = "1.13.1"
5
+ VERSION = "1.13.2"
6
6
  UNICODE_VERSION = "16.0.0"
7
7
  DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/").freeze
8
8
  INDEX_FILENAME = (DATA_DIRECTORY + "/name.marshal.gz").freeze
data/lib/unicode/name.rb CHANGED
@@ -11,11 +11,18 @@ module Unicode
11
11
  def self.unicode_name(char)
12
12
  codepoint = char.unpack("U")[0]
13
13
  require_relative "name/index" unless defined? ::Unicode::Name::INDEX
14
+
14
15
  if res = INDEX[:NAMES][codepoint]
15
- res
16
- elsif INDEX[:CJK].any?{ |cjk_range| codepoint >= cjk_range[0] && codepoint <= cjk_range[1] }
17
- "CJK UNIFIED IDEOGRAPH-%.4X" % codepoint
18
- elsif codepoint >= HANGUL_START && codepoint <= HANGUL_END
16
+ return insert_words(res)
17
+ end
18
+
19
+ INDEX[:CP_RANGES].each{|prefix, range|
20
+ if range.any?{ |range| codepoint >= range[0] && codepoint <= range[1] }
21
+ return "%s%.4X" %[prefix, codepoint]
22
+ end
23
+ }
24
+
25
+ if codepoint >= HANGUL_START && codepoint <= HANGUL_END
19
26
  "HANGUL SYLLABLE %s" % hangul_decomposition(codepoint)
20
27
  else
21
28
  nil
@@ -82,6 +89,17 @@ module Unicode
82
89
  initial = base / HANGUL_MEDIAL_MAX
83
90
  "#{INDEX[:JAMO][:INITIAL][initial]}#{INDEX[:JAMO][:MEDIAL][medial]}#{INDEX[:JAMO][:FINAL][final]}"
84
91
  end
92
+
93
# Expands a compressed index name back into the full character name.
#
# Characters whose codepoint is below INDEX[:REPLACE_BASE] are copied through
# unchanged. Every other character is a placeholder: its offset from
# INDEX[:REPLACE_BASE] selects an entry of INDEX[:COMMON_WORDS], which is
# emitted followed by a single space.
#
# @param raw_name [String] name as stored in the index
# @return [String] the name with every placeholder replaced by its word and
#   without trailing whitespace
def self.insert_words(raw_name)
  # Look the index entries up once instead of once per character.
  replace_base = INDEX[:REPLACE_BASE]
  common_words = INDEX[:COMMON_WORDS]

  pieces = raw_name.each_char.map do |char|
    offset = char.ord - replace_base
    offset.negative? ? char : "#{common_words[offset]} "
  end

  # A placeholder in final position leaves a dangling separator space. A bare
  # String#chomp only removes line terminators, so strip trailing whitespace
  # explicitly.
  pieces.join.rstrip
end
85
103
  end
86
104
  end
87
105
 
@@ -9,11 +9,11 @@ describe Unicode::Name do
9
9
  assert_equal "REPLACEMENT CHARACTER", Unicode::Name.of("�")
10
10
  end
11
11
 
12
- it "works for CJK Ideographs" do
12
+ it "works for CJK unified ideographs" do
13
13
  assert_equal "CJK UNIFIED IDEOGRAPH-4E01", Unicode::Name.of("丁")
14
14
  end
15
15
 
16
- it "works for Hangul Syllables" do
16
+ it "works for Hangul syllables" do
17
17
  assert_equal "HANGUL SYLLABLE HAN", Unicode::Name.of("한")
18
18
  assert_equal "HANGUL SYLLABLE GAG", Unicode::Name.of("각")
19
19
  assert_equal "HANGUL SYLLABLE GAE", Unicode::Name.of("개")
@@ -21,6 +21,14 @@ describe Unicode::Name do
21
21
  assert_equal "HANGUL SYLLABLE DWALB", Unicode::Name.of("돫")
22
22
  end
23
23
 
24
+ it "works with some ranges that have the codepoint embedded" do
25
+ assert_equal "EGYPTIAN HIEROGLYPH-143F5", Unicode::Name.of("𔏵")
26
+ assert_equal "KHITAN SMALL SCRIPT CHARACTER-18C12", Unicode::Name.of("𘰒")
27
+ assert_equal "TANGUT IDEOGRAPH-18D00", Unicode::Name.of("𘴀")
28
+ assert_equal "NUSHU CHARACTER-1B171", Unicode::Name.of("𛅱")
29
+ assert_equal "CJK COMPATIBILITY IDEOGRAPH-2F9B1", Unicode::Name.of("𧃒")
30
+ end
31
+
24
32
  it "will return nil for characters without name" do
25
33
  assert_nil Unicode::Name.of("\u{10c50}")
26
34
  assert_nil Unicode::Name.of("\0")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode-name
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.13.1
4
+ version: 1.13.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-03 00:00:00.000000000 Z
11
+ date: 2024-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode-types
@@ -68,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0'
70
70
  requirements: []
71
- rubygems_version: 3.5.20
71
+ rubygems_version: 3.5.21
72
72
  signing_key:
73
73
  specification_version: 4
74
74
  summary: Returns name/aliases/label of a Unicode codepoint