RubyGems - unicoder - Versions diffs - 1.1.0 → 1.3.0 - Mend

unicoder 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +23 -0
data/Gemfile.lock +1 -1
data/README.md +3 -0
data/lib/unicoder/builder.rb +11 -1
data/lib/unicoder/builders/blocks.rb +4 -2
data/lib/unicoder/builders/categories.rb +6 -0
data/lib/unicoder/builders/confusable.rb +24 -3
data/lib/unicoder/builders/name.rb +2 -2
data/lib/unicoder/builders/scripts.rb +19 -2
data/lib/unicoder/builders/sequence_name.rb +8 -4
data/lib/unicoder/constants.rb +3 -2
data/lib/unicoder/replace_common_words.rb +3 -2
metadata +2 -3
data/.travis.yml +0 -20

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
-  data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
+  metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
+  data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
 SHA512:
-  metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
-  data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
+  metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
+  data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,28 @@
 ## CHANGELOG
+### 1.3.0
+- confusable: Add ignorables
+- confusable: Nest index and make ESM/charkeys version, fix ";"
+### 1.2.1
+- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
+### 1.2.0
+- Change format for sequence_name's sub-index for unqualified Emoji sequences
+### 1.1.2
+- Update CLDR to v46
+### 1.1.1
+- Fix bug related to unsafe characters
+- Fix squared CJK
+- Small adjustments for scripts and blocks index builders
 ### 1.1.0
 - Improve name index size: Support ranges

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.1.0)
+    unicoder (1.3.0)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/README.md CHANGED Viewed

@@ -39,6 +39,9 @@ Index Name    | Module
 --------------|----
 name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
 numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
+scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
+blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
+categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
 ## MIT License

data/lib/unicoder/builder.rb CHANGED Viewed

@@ -73,8 +73,18 @@ module Unicoder
       file = File.read(LOCAL_DATA_DIRECTORY + filename)
       if parse_mode == :line
+        active = !parse_options[:begin]
         file.each_line{ |line|
-          yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          if !active && parse_options[:begin] && line.match?(parse_options[:begin])
+            active = true
+          elsif active && parse_options[:end] && line.match?(parse_options[:end])
+            active = false
+          end
+          if active
+            yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          end
         }
       elsif parse_mode == :xml
         require "oga"

data/lib/unicoder/builders/blocks.rb CHANGED Viewed

@@ -4,12 +4,14 @@ module Unicoder
       include Builder
       def initialize_index
-        @index = []
+        @index = {
+          BLOCKS: []
+        }
       end
       def parse!
         parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
-          @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
+          @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
         end
       end
     end

data/lib/unicoder/builders/categories.rb CHANGED Viewed

@@ -9,6 +9,12 @@ module Unicoder
         @index = {
           CATEGORIES: [],
           CATEGORY_NAMES: {},
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @range_start = nil
       end

data/lib/unicoder/builders/confusable.rb CHANGED Viewed

@@ -3,17 +3,38 @@ module Unicoder
     class Confusable
       include Builder
+      def initialize_index
+        @index = {
+          CONFUSABLE: {},
+          IGNORABLE: [],
+        }
+      end
       def parse!
-        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
+        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
           source = line["from"].to_i(16)
           if line["to"].include?(" ")
             replace_with = line["to"].split(" ").map{ |codepoint|
+              cp = codepoint.to_i(16)
+              option =~ /charvalues/ ? [cp].pack("U") : cp
+            }
+          else
+            cp = line["to"].to_i(16)
+            replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
+          end
+          assign :CONFUSABLE, source, replace_with
+        end
+        parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
+          if line["codepoints"]['..']
+            single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
               codepoint.to_i(16)
             }
           else
-            replace_with = line["to"].to_i(16)
+            single_or_multiple_codepoints = line["codepoints"].to_i(16)
           end
-          @index[source] = replace_with
+          @index[:IGNORABLE] << single_or_multiple_codepoints
         end
       end
     end

data/lib/unicoder/builders/name.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Unicoder
             "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
             "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
             "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
-            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
           },
           # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
           JAMO: {
@@ -68,7 +68,7 @@ module Unicoder
             elsif line["name"] != "<control>"
               raise ArgumentError, "inconsistent range found in data, don't know what to do"
             end
-          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
+          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
             # ignore
           else
             assign :NAMES, line["codepoint"].to_i(16), line["name"]

data/lib/unicoder/builders/scripts.rb CHANGED Viewed

@@ -10,6 +10,12 @@ module Unicoder
           SCRIPT_EXTENSIONS: {},
           SCRIPT_ALIASES: {},
           SCRIPT_NAMES: [],
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @reverse_script_names = {}
         @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
         }
       end
+      # TODO refactor how multiple indexes are organized
+      def assign_classic(sub_index_name, codepoint, value)
+        idx = @index[sub_index_name]
+        if option =~ /charkeys/
+          idx[[codepoint].pack("U*")] = value
+        else
+          idx[codepoint] = value
+        end
+      end
       def parse!
         parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
           @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
         parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
           if line["to"]
             (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
-              @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
+              assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
             }
           else
-            @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
+            assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
           end
         end
       end

data/lib/unicoder/builders/sequence_name.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Unicoder
       def initialize_index
         @index = {
           SEQUENCES: {},
-          SEQUENCES_NOT_QUALIFIED: {},
+          EMOJI_NOT_QUALIFIED: {},
         }
         @words = []
       end
@@ -74,8 +74,12 @@ module Unicoder
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
           codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
           assign_codepoint codepoints, name
+          # Build all combinations of VS16 present and missing and add to second index
           if codepoints.include?(0xFE0F)
-            # Build all combinations of VS16 present and missing
+            sequence = codepoints.pack("U*")
             codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
               if cur.include? 0xFE0F
                 acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
             }.
             select {|sub_codepoints| sub_codepoints != codepoints }.
             each { |sub_codepoints|
-              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+              sub_sequence = sub_codepoints.pack("U*")
+              @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
             }
           end
         end
         replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
-        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.1.0"
+  VERSION = "1.3.0"
   UNICODE_VERSIONS = %w[
     16.0.0
@@ -57,7 +57,7 @@ module Unicoder
   IVD_VERSION = "2022-09-13"
-  CLDR_VERSION = "45"
+  CLDR_VERSION = "46"
   UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
@@ -69,6 +69,7 @@ module Unicoder
     name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
     confusables:               "/security/UNICODE_VERSION/confusables.txt",
     blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
+    core_properties:           "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
     scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
     script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
     property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",

data/lib/unicoder/replace_common_words.rb CHANGED Viewed

@@ -2,8 +2,9 @@ require "json"
 module Unicoder
   module ReplaceCommonWords
-  	def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
-  	  puts "Starting to replace the #{count} most common words"
+  	def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
+      base = @words.join.chars.max.ord + 1
+  	  puts "Starting to replace the #{count} most common words (replace base: #{base})"
   	  @index[:REPLACE_BASE] = base
   	  @index[:COMMON_WORDS] = words.
   	    select{_1.size >= min_word_length}.

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.3.0
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-09 00:00:00.000000000 Z
+date: 2024-11-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -61,7 +61,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".travis.yml"
 - CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile

data/.travis.yml DELETED Viewed

@@ -1,20 +0,0 @@
-sudo: false
-language: ruby
-rvm:
-- 2.7
-- 2.6
-- 2.5
-- 2.4
-- 2.3
-- ruby-head
-- jruby-9.2.9.0
-- truffleruby
-matrix:
-  allow_failures:
-    - rvm: 2.3
-    - rvm: ruby-head
-    - rvm: jruby-2.9.2.0
-    - rvm: truffleruby
-#   fast_finish: true