RubyGems - unicoder - Versions diffs - 1.3.0 → 1.5.0 - Mend

unicoder 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +13 -0
data/Gemfile.lock +1 -1
data/lib/unicoder/builders/display_width.rb +43 -18
data/lib/unicoder/constants.rb +15 -8
data/unicoder.gemspec +1 -1
metadata +4 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
-  data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
+  metadata.gz: 1b4aa3a99c2805fe8caa17f90210132a2e3a3e1df511e8a63a7cc83af0fd6e74
+  data.tar.gz: 1f2174a23878ac589e80fd544f374f56ac4881d6d8416c47b5e1eb0db6b5daa2
 SHA512:
-  metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
-  data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
+  metadata.gz: 4e44aa4e9d0328d27f337c1ee012ae60fc8dda50ac685db9c1c944420155e1741502f0a555fdcc821732ed2e7a3980aeb3bbe9642dded0ab14a2f9c33605bdb6
+  data.tar.gz: fe152b4e2966b64e5810866cf6b74c50866d6e034c9c59cb9009cbb7d3108770f4c9d3297b495e15b4b1b724c482200a93ee3fa5a31e91fc66c7cb0834c2866e

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,18 @@
 ## CHANGELOG
+### 1.5.0
+- New Emoji locations
+- Update CLDR to v48
+- Allow Ruby 4.0
+### 1.4.0
+- Update Unicode and Emoji to 17.0
+  - Some files now have a new location in UCD
+- Update CLDR to v46
+- Update IVD to 2025-07-14
 ### 1.3.0
 - confusable: Add ignorables

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.3.0)
+    unicoder (1.5.0)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/lib/unicoder/builders/display_width.rb CHANGED Viewed

@@ -4,15 +4,11 @@ module Unicoder
       include Builder
       include MultiDimensionalArrayBuilder
-      IGNORE_CATEGORIES     = %w[Cs Co Cn].freeze
-      ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
+      ZERO_WIDTH_CATEGORIES = %w[Mn Me Zl Zp Cf].freeze
-      ZERO_WIDTH_RANGES = [
+      ZERO_WIDTH_HANGUL = [
         *0x1160..0x11FF, # HANGUL JUNGSEONG
         *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
-        *0x2060..0x206F, # Ignorables
-        *0xFFF0..0xFFF8, # Ignorables
-        *0xE0000..0xE0FFF, # Ignorables
       ].freeze
       WIDE_RANGES = [
@@ -34,19 +30,36 @@ module Unicoder
         0xD    =>  0, # \r CARRIAGE RETURN
         0xE    =>  0, #    SHIFT OUT
         0xF    =>  0, #    SHIFT IN
-        0x00AD =>  nil, #    SOFT HYPHEN
+        # 0x85   =>  0, #    NEXT LINE
+        0xAD   =>  nil, #  SOFT HYPHEN, nil = 1 (default)
         0x2E3A =>  2, #    TWO-EM DASH
         0x2E3B =>  3, #    THREE-EM DASH
       }.freeze
       def initialize_index
-        @index = []
+        @index = {
+          WIDTH_ONE: [],
+          WIDTH_TWO: [],
+        }
+        @ignorable = []
       end
       def parse!
-        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
-          next if IGNORE_CATEGORIES.include?(line["category"])
+        # Find Ignorables
+        parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
+          if line["codepoints"]['..']
+            single_or_multiple_codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
+              codepoint.to_i(16)
+            })
+          else
+            single_or_multiple_codepoints = line["codepoints"].to_i(16)
+          end
+          @ignorable += [*single_or_multiple_codepoints]
+        end
+        # Assign based on East Asian Width
+        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
           if line["codepoints"]['..']
             codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
               codepoint.to_i(16)
@@ -56,33 +69,45 @@ module Unicoder
           end
           codepoints.each{ |codepoint|
-            assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
+            assign :WIDTH_ONE, codepoint, determine_width(codepoint, line["category"], line["width"], 1)
+            assign :WIDTH_TWO, codepoint, determine_width(codepoint, line["category"], line["width"], 2)
           }
         end
-        ZERO_WIDTH_RANGES.each{ |codepoint|
-          assign_codepoint codepoint, 0
+        # Assign Ranges
+        ## Zero-width
+        (ZERO_WIDTH_HANGUL | @ignorable).each{ |codepoint|
+          assign :WIDTH_ONE, codepoint, 0
+          assign :WIDTH_TWO, codepoint, 0
         }
+        ## Full-width
         WIDE_RANGES.each{ |codepoint|
-          assign_codepoint codepoint, 2
+          assign :WIDTH_ONE, codepoint, 2
+          assign :WIDTH_TWO, codepoint, 2
         }
+        ## Table
         SPECIAL_WIDTHS.each{ |codepoint, value|
-          assign_codepoint codepoint, value
+          assign :WIDTH_ONE, codepoint, value
+          assign :WIDTH_TWO, codepoint, value
         }
-        4.times{ compress! }
+        # Compress Index
+        4.times{ compress! @index[:WIDTH_ONE] }
+        4.times{ compress! @index[:WIDTH_TWO] }
+        remove_trailing_nils! @index[:WIDTH_ONE]
+        remove_trailing_nils! @index[:WIDTH_TWO]
       end
-      def determine_width(codepoint, category, east_asian_width)
+      def determine_width(codepoint, category, east_asian_width, ambiguous)
         if  ( ZERO_WIDTH_CATEGORIES.include?(category) &&
               [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
           0
         elsif east_asian_width == "F" || east_asian_width == "W"
           2
         elsif east_asian_width == "A"
-          :A
+          ambiguous == 1 ? nil : ambiguous
         else
           nil
         end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,9 +1,10 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.3.0"
+  VERSION = "1.5.0"
   UNICODE_VERSIONS = %w[
+    17.0.0
     16.0.0
     15.1.0
     15.0.0
@@ -22,6 +23,7 @@ module Unicoder
   CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
   EMOJI_VERSIONS = %w[
+   17.0
    16.0
    15.1
    15.0
@@ -38,6 +40,7 @@ module Unicoder
   ].freeze
   EMOJI_RELATED_UNICODE_VERSIONS = {
+   "17.0" => "17.0.0",
    "16.0" => "16.0.0",
    "15.1" => "15.1.0",
    "15.0" => "15.0.0",
@@ -55,11 +58,11 @@ module Unicoder
   CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
-  IVD_VERSION = "2022-09-13"
+  IVD_VERSION = "2025-07-14"
-  CLDR_VERSION = "46"
+  CLDR_VERSION = "48"
-  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
+  UNICODE_DATA_ENDPOINT = "http://ftp.unicode.org/Public"
   LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
@@ -67,7 +70,8 @@ module Unicoder
     east_asian_width:          "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
     unicode_data:              "/UNICODE_VERSION/ucd/UnicodeData.txt",
     name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
-    confusables:               "/security/UNICODE_VERSION/confusables.txt",
+    confusables:               "/UNICODE_VERSION/security/confusables.txt",
+    confusables_before_17:     "/security/UNICODE_VERSION/confusables.txt",
     blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
     core_properties:           "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
     scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
@@ -82,11 +86,14 @@ module Unicoder
     ivd_sequences:             "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
     # emoji_data:                "/EMOJI_VERSION/ucd/emoji/",
     emoji_data:                "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
-    emoji_sequences:           "/emoji/EMOJI_VERSION/emoji-sequences.txt",
+    # emoji_sequences:           "/emoji/EMOJI_VERSION/emoji-sequences.txt",
+    emoji_sequences:           "/EMOJI_RELATED_VERSION/emoji/emoji-sequences.txt",
     # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
     emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
-    emoji_zwj_sequences:       "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
-    emoji_test:                "/emoji/EMOJI_VERSION/emoji-test.txt",
+    # emoji_zwj_sequences:       "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
+    emoji_zwj_sequences:       "/EMOJI_RELATED_VERSION/emoji/emoji-zwj-sequences.txt",
+    # emoji_test:                "/emoji/EMOJI_VERSION/emoji-test.txt",
+    emoji_test:                "/EMOJI_RELATED_VERSION/emoji/emoji-test.txt",
     # valid_subdivisions:        "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
     valid_subdivisions:        "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
     # ""

data/unicoder.gemspec CHANGED Viewed

@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.required_ruby_version = ">= 3.0", "< 4.0"
+  gem.required_ruby_version = ">= 3.0", "< 5.0"
   gem.add_dependency "rationalist", "~> 2.0"
   gem.add_dependency "rubyzip", "~> 1.2"
   gem.add_dependency "oga", "~> 2.9"

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.5.0
 platform: ruby
 authors:
 - Jan Lelis
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-04 00:00:00.000000000 Z
+date: 2026-01-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -91,7 +90,6 @@ homepage: https://github.com/janlelis/unicoder
 licenses:
 - MIT
 metadata: {}
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -102,15 +100,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '3.0'
   - - "<"
     - !ruby/object:Gem::Version
-      version: '4.0'
+      version: '5.0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.21
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Creates specialized indexes for Unicode data lookup
 test_files: []