RubyGems - unicoder - Versions diffs - 1.1.1 → 1.4.0 - Mend

unicoder 1.1.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +24 -0
data/Gemfile.lock +1 -1
data/README.md +3 -1
data/lib/unicoder/builder.rb +11 -1
data/lib/unicoder/builders/categories.rb +6 -0
data/lib/unicoder/builders/confusable.rb +24 -3
data/lib/unicoder/builders/display_width.rb +43 -18
data/lib/unicoder/builders/name.rb +1 -1
data/lib/unicoder/builders/sequence_name.rb +8 -4
data/lib/unicoder/constants.rb +10 -5
metadata +2 -3
data/.travis.yml +0 -20

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
-  data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
+  metadata.gz: c698f0042604828d6acf19123dba21935d65387c030df78774885e0e0084c6ef
+  data.tar.gz: f8c1b180273b758079232066ecd4729adbfb41901464414ad56bab6df3c83ee5
 SHA512:
-  metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
-  data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
+  metadata.gz: 486f62f96ed10a3dac0703f62da4c85fb1b6a594ff922e7bae668bdf62878f3c2b5704bb89f491164d0e416d66e316eb8f7fbb23c5eb971f9231edef26bb5162
+  data.tar.gz: cc7bdff24d99a31021d0b1321447da769b0e25d5209b79e75d6d04c5d1590b858f9f9195c68dd7e35a5817f7dbd00dc7f5f3dae3eaaa268542e4608d7095b035

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,29 @@
 ## CHANGELOG
+### 1.4.0
+- Update Unicode and Emoji to 17.0
+  - Some files now have a new location in UCD
+- Update CLDR to v47
+- Update IVD to 2025-07-14
+### 1.3.0
+- confusable: Add ignorables
+- confusable: Nest index and make ESM/charkeys version, fix ";"
+### 1.2.1
+- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
+### 1.2.0
+- Change format for sequence_name's sub-index for unqualified Emoji sequences
+### 1.1.2
+- Update CLDR to v46
 ### 1.1.1
 - Fix bug related to unsafe characters

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.1.1)
+    unicoder (1.3.0)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/README.md CHANGED Viewed

@@ -39,7 +39,9 @@ Index Name    | Module
 --------------|----
 name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
 numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
-scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
+scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
+blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
+categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
 ## MIT License

data/lib/unicoder/builder.rb CHANGED Viewed

@@ -73,8 +73,18 @@ module Unicoder
       file = File.read(LOCAL_DATA_DIRECTORY + filename)
       if parse_mode == :line
+        active = !parse_options[:begin]
         file.each_line{ |line|
-          yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          if !active && parse_options[:begin] && line.match?(parse_options[:begin])
+            active = true
+          elsif active && parse_options[:end] && line.match?(parse_options[:end])
+            active = false
+          end
+          if active
+            yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          end
         }
       elsif parse_mode == :xml
         require "oga"

data/lib/unicoder/builders/categories.rb CHANGED Viewed

@@ -9,6 +9,12 @@ module Unicoder
         @index = {
           CATEGORIES: [],
           CATEGORY_NAMES: {},
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @range_start = nil
       end

data/lib/unicoder/builders/confusable.rb CHANGED Viewed

@@ -3,17 +3,38 @@ module Unicoder
     class Confusable
       include Builder
+      def initialize_index
+        @index = {
+          CONFUSABLE: {},
+          IGNORABLE: [],
+        }
+      end
       def parse!
-        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
+        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
           source = line["from"].to_i(16)
           if line["to"].include?(" ")
             replace_with = line["to"].split(" ").map{ |codepoint|
+              cp = codepoint.to_i(16)
+              option =~ /charvalues/ ? [cp].pack("U") : cp
+            }
+          else
+            cp = line["to"].to_i(16)
+            replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
+          end
+          assign :CONFUSABLE, source, replace_with
+        end
+        parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
+          if line["codepoints"]['..']
+            single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
               codepoint.to_i(16)
             }
           else
-            replace_with = line["to"].to_i(16)
+            single_or_multiple_codepoints = line["codepoints"].to_i(16)
           end
-          @index[source] = replace_with
+          @index[:IGNORABLE] << single_or_multiple_codepoints
         end
       end
     end

data/lib/unicoder/builders/display_width.rb CHANGED Viewed

@@ -4,15 +4,11 @@ module Unicoder
       include Builder
       include MultiDimensionalArrayBuilder
-      IGNORE_CATEGORIES     = %w[Cs Co Cn].freeze
-      ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
+      ZERO_WIDTH_CATEGORIES = %w[Mn Me Zl Zp Cf].freeze
-      ZERO_WIDTH_RANGES = [
+      ZERO_WIDTH_HANGUL = [
         *0x1160..0x11FF, # HANGUL JUNGSEONG
         *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
-        *0x2060..0x206F, # Ignorables
-        *0xFFF0..0xFFF8, # Ignorables
-        *0xE0000..0xE0FFF, # Ignorables
       ].freeze
       WIDE_RANGES = [
@@ -34,19 +30,36 @@ module Unicoder
         0xD    =>  0, # \r CARRIAGE RETURN
         0xE    =>  0, #    SHIFT OUT
         0xF    =>  0, #    SHIFT IN
-        0x00AD =>  nil, #    SOFT HYPHEN
+        # 0x85   =>  0, #    NEXT LINE
+        0xAD   =>  nil, #  SOFT HYPHEN, nil = 1 (default)
         0x2E3A =>  2, #    TWO-EM DASH
         0x2E3B =>  3, #    THREE-EM DASH
       }.freeze
       def initialize_index
-        @index = []
+        @index = {
+          WIDTH_ONE: [],
+          WIDTH_TWO: [],
+        }
+        @ignorable = []
       end
       def parse!
-        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
-          next if IGNORE_CATEGORIES.include?(line["category"])
+        # Find Ignorables
+        parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
+          if line["codepoints"]['..']
+            single_or_multiple_codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
+              codepoint.to_i(16)
+            })
+          else
+            single_or_multiple_codepoints = line["codepoints"].to_i(16)
+          end
+          @ignorable += [*single_or_multiple_codepoints]
+        end
+        # Assign based on East Asian Width
+        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
           if line["codepoints"]['..']
             codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
               codepoint.to_i(16)
@@ -56,33 +69,45 @@ module Unicoder
           end
           codepoints.each{ |codepoint|
-            assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
+            assign :WIDTH_ONE, codepoint, determine_width(codepoint, line["category"], line["width"], 1)
+            assign :WIDTH_TWO, codepoint, determine_width(codepoint, line["category"], line["width"], 2)
           }
         end
-        ZERO_WIDTH_RANGES.each{ |codepoint|
-          assign_codepoint codepoint, 0
+        # Assign Ranges
+        ## Zero-width
+        (ZERO_WIDTH_HANGUL | @ignorable).each{ |codepoint|
+          assign :WIDTH_ONE, codepoint, 0
+          assign :WIDTH_TWO, codepoint, 0
         }
+        ## Full-width
         WIDE_RANGES.each{ |codepoint|
-          assign_codepoint codepoint, 2
+          assign :WIDTH_ONE, codepoint, 2
+          assign :WIDTH_TWO, codepoint, 2
         }
+        ## Table
         SPECIAL_WIDTHS.each{ |codepoint, value|
-          assign_codepoint codepoint, value
+          assign :WIDTH_ONE, codepoint, value
+          assign :WIDTH_TWO, codepoint, value
         }
-        4.times{ compress! }
+        # Compress Index
+        4.times{ compress! @index[:WIDTH_ONE] }
+        4.times{ compress! @index[:WIDTH_TWO] }
+        remove_trailing_nils! @index[:WIDTH_ONE]
+        remove_trailing_nils! @index[:WIDTH_TWO]
       end
-      def determine_width(codepoint, category, east_asian_width)
+      def determine_width(codepoint, category, east_asian_width, ambiguous)
         if  ( ZERO_WIDTH_CATEGORIES.include?(category) &&
               [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
           0
         elsif east_asian_width == "F" || east_asian_width == "W"
           2
         elsif east_asian_width == "A"
-          :A
+          ambiguous == 1 ? nil : ambiguous
         else
           nil
         end

data/lib/unicoder/builders/name.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Unicoder
             "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
             "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
             "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
-            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
           },
           # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
           JAMO: {

data/lib/unicoder/builders/sequence_name.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Unicoder
       def initialize_index
         @index = {
           SEQUENCES: {},
-          SEQUENCES_NOT_QUALIFIED: {},
+          EMOJI_NOT_QUALIFIED: {},
         }
         @words = []
       end
@@ -74,8 +74,12 @@ module Unicoder
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
           codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
           assign_codepoint codepoints, name
+          # Build all combinations of VS16 present and missing and add to second index
           if codepoints.include?(0xFE0F)
-            # Build all combinations of VS16 present and missing
+            sequence = codepoints.pack("U*")
             codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
               if cur.include? 0xFE0F
                 acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
             }.
             select {|sub_codepoints| sub_codepoints != codepoints }.
             each { |sub_codepoints|
-              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+              sub_sequence = sub_codepoints.pack("U*")
+              @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
             }
           end
         end
         replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
-        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,9 +1,10 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.1.1"
+  VERSION = "1.4.0"
   UNICODE_VERSIONS = %w[
+    17.0.0
     16.0.0
     15.1.0
     15.0.0
@@ -22,6 +23,7 @@ module Unicoder
   CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
   EMOJI_VERSIONS = %w[
+   17.0
    16.0
    15.1
    15.0
@@ -38,6 +40,7 @@ module Unicoder
   ].freeze
   EMOJI_RELATED_UNICODE_VERSIONS = {
+   "17.0" => "17.0.0",
    "16.0" => "16.0.0",
    "15.1" => "15.1.0",
    "15.0" => "15.0.0",
@@ -55,11 +58,11 @@ module Unicoder
   CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
-  IVD_VERSION = "2022-09-13"
+  IVD_VERSION = "2025-07-14"
-  CLDR_VERSION = "45"
+  CLDR_VERSION = "47"
-  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
+  UNICODE_DATA_ENDPOINT = "http://ftp.unicode.org/Public"
   LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
@@ -67,8 +70,10 @@ module Unicoder
     east_asian_width:          "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
     unicode_data:              "/UNICODE_VERSION/ucd/UnicodeData.txt",
     name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
-    confusables:               "/security/UNICODE_VERSION/confusables.txt",
+    confusables:               "/UNICODE_VERSION/security/confusables.txt",
+    confusables_before_17:     "/security/UNICODE_VERSION/confusables.txt",
     blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
+    core_properties:           "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
     scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
     script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
     property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.4.0
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-14 00:00:00.000000000 Z
+date: 2025-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -61,7 +61,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".travis.yml"
 - CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile

data/.travis.yml DELETED Viewed

@@ -1,20 +0,0 @@
-sudo: false
-language: ruby
-rvm:
-- 2.7
-- 2.6
-- 2.5
-- 2.4
-- 2.3
-- ruby-head
-- jruby-9.2.9.0
-- truffleruby
-matrix:
-  allow_failures:
-    - rvm: 2.3
-    - rvm: ruby-head
-    - rvm: jruby-9.2.9.0
-    - rvm: truffleruby
-#   fast_finish: true