RubyGems - unicode-name - Versions diffs - 1.13.1 → 1.13.3 - Mend

unicode-name 1.13.1 → 1.13.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/Gemfile.lock +1 -1
data/README.md +5 -1
data/data/name.marshal.gz +0 -0
data/lib/unicode/name/constants.rb +1 -1
data/lib/unicode/name.rb +27 -6
data/spec/unicode_name_spec.rb +11 -2
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ad37a86c74a53f8ed0fd5b7d99958e63f06537392c1156810405d8cf83e98082
-  data.tar.gz: ec21c556d90d33b7dbef6ac61cfca62805505f4a3afba068a5efc8f3d967b686
+  metadata.gz: 7d5b492a10c5d32a1452b2c01a98dce50e1dbeedbe75b5fc3d73aae7cac013d8
+  data.tar.gz: d1b1cd8ced4f9685c31833b15e7d441469ec3117911f46a13b07cbd7ea33426b
 SHA512:
-  metadata.gz: 5a86c60936a59105991a4e769ecfa2ea6cff752ab85a4ad87ca800d2455d13cf2b276cfa20a4d281bc4b91eb4f9c7155293b4da8a454d544a992045b66b9006e
-  data.tar.gz: 8e2ad3afb63a2dda11dee3b2a5b3e7ea24e275e3744fbfe14615b8cca698ae14b2d9784cf6180e4d4136eb058fe1ae4b1a1fddd1ff34fc220ac059a0639586fb
+  metadata.gz: 20e926fc2271e4421359f767a0b601c52e37c012c8bb804c1318df4bf8d50686bfd8206d6c0bf26454921e14aae237c255c734e15aaede0ef875a398a91757e7
+  data.tar.gz: c2c46f3bd4dd9b83f32343c7d18e15a16a1af440a51db7c0512f3acdd0382a6b5cb7653653b3ed9695ebd277172b7d8c15bedda517ce9d8028b4db0bf475f4bb

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,15 @@
 ## CHANGELOG
+### 1.13.3
+- Fix regression introduced in 1.13.2 that some CJK ideographs were missing
+### 1.13.2
+- Optimize index size by removing ranges that have codepoints embedded
+- Optimize index size by substituting common words
+- Fix missing Tangut ideographs
 ### 1.13.1
 Bugfix release:

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicode-name (1.13.1)
+    unicode-name (1.13.3)
       unicode-types (~> 1.10)
 GEM

data/README.md CHANGED Viewed

@@ -42,7 +42,11 @@ Unicode::Name.readable("\0") # => "NULL"
 Unicode::Name.readable("\u{FFFFD}") # => "<private-use-FFFFD>"
 ```
-See [unicode-sequence_names](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences.
+See [unicode-sequence_names](https://github.com/janlelis/unicode-sequence_name) for character names of more complex codepoint sequences. This is how you could use both libraries together to get the most relevant name of a character:
+```ruby
+name = Unicode::SequenceName.of(char) || Unicode::Name.readable(char)
+```
 See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries.

data/data/name.marshal.gz CHANGED Viewed

Binary file

data/lib/unicode/name/constants.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Unicode
   module Name
-    VERSION = "1.13.1"
+    VERSION = "1.13.3"
     UNICODE_VERSION = "16.0.0"
     DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/").freeze
     INDEX_FILENAME = (DATA_DIRECTORY + "/name.marshal.gz").freeze

data/lib/unicode/name.rb CHANGED Viewed

@@ -11,11 +11,18 @@ module Unicode
     def self.unicode_name(char)
       codepoint = char.unpack("U")[0]
       require_relative "name/index" unless defined? ::Unicode::Name::INDEX
       if res = INDEX[:NAMES][codepoint]
-        res
-      elsif INDEX[:CJK].any?{ |cjk_range| codepoint >= cjk_range[0] && codepoint <= cjk_range[1] }
-        "CJK UNIFIED IDEOGRAPH-%.4X" % codepoint
-      elsif codepoint >= HANGUL_START && codepoint <= HANGUL_END
+        return insert_words(res)
+      end
+      INDEX[:CP_RANGES].each{|prefix, range|
+        if range.any?{ |range| codepoint >= range[0] && codepoint <= range[1] }
+          return "%s%.4X" %[prefix, codepoint]
+        end
+      }
+      if codepoint >= HANGUL_START && codepoint <= HANGUL_END
         "HANGUL SYLLABLE %s" % hangul_decomposition(codepoint)
       else
         nil
@@ -72,8 +79,6 @@ module Unicode
       label(char)
     end
-    private
     # See https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
     def self.hangul_decomposition(codepoint)
       base = codepoint - HANGUL_START
@@ -82,6 +87,22 @@ module Unicode
       initial = base / HANGUL_MEDIAL_MAX
       "#{INDEX[:JAMO][:INITIAL][initial]}#{INDEX[:JAMO][:MEDIAL][medial]}#{INDEX[:JAMO][:FINAL][final]}"
     end
+    def self.insert_words(raw_name)
+      raw_name.chars.map{ |char|
+        codepoint = char.ord
+        if codepoint < INDEX[:REPLACE_BASE]
+          char
+        else
+          "#{INDEX[:COMMON_WORDS][codepoint - INDEX[:REPLACE_BASE]]} "
+        end
+      }.join.chomp
+    end
+    class << self
+      private :hangul_decomposition
+      private :insert_words
+    end
   end
 end

data/spec/unicode_name_spec.rb CHANGED Viewed

@@ -9,11 +9,12 @@ describe Unicode::Name do
       assert_equal "REPLACEMENT CHARACTER", Unicode::Name.of("�")
     end
-    it "works for CJK Ideographs" do
+    it "works for CJK unified ideographs" do
       assert_equal "CJK UNIFIED IDEOGRAPH-4E01", Unicode::Name.of("丁")
+      assert_equal "SQUARED CJK UNIFIED IDEOGRAPH-6709", Unicode::Name.of("🈶")
     end
-    it "works for Hangul Syllables" do
+    it "works for Hangul syllables" do
       assert_equal "HANGUL SYLLABLE HAN", Unicode::Name.of("한")
       assert_equal "HANGUL SYLLABLE GAG", Unicode::Name.of("각")
       assert_equal "HANGUL SYLLABLE GAE", Unicode::Name.of("개")
@@ -21,6 +22,14 @@ describe Unicode::Name do
       assert_equal "HANGUL SYLLABLE DWALB", Unicode::Name.of("돫")
     end
+    it "works with some ranges that have the codepoint embedded" do
+      assert_equal "EGYPTIAN HIEROGLYPH-143F5", Unicode::Name.of("𔏵")
+      assert_equal "KHITAN SMALL SCRIPT CHARACTER-18C12", Unicode::Name.of("𘰒")
+      assert_equal "TANGUT IDEOGRAPH-18D00", Unicode::Name.of("𘴀")
+      assert_equal "NUSHU CHARACTER-1B171", Unicode::Name.of("𛅱")
+      assert_equal "CJK COMPATIBILITY IDEOGRAPH-2F9B1", Unicode::Name.of("𧃒")
+    end
     it "will return nil for characters without name" do
       assert_nil Unicode::Name.of("\u{10c50}")
       assert_nil Unicode::Name.of("\0")

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicode-name
 version: !ruby/object:Gem::Version
-  version: 1.13.1
+  version: 1.13.3
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-03 00:00:00.000000000 Z
+date: 2024-10-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode-types
@@ -68,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.20
+rubygems_version: 3.5.21
 signing_key:
 specification_version: 4
 summary: Returns name/aliases/label of a Unicode codepoint