RubyGems - unicoder - Versions diffs - 1.1.0 → 1.3.0 - Mend

unicoder 1.1.0 → 1.3.0

Files changed (15) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +23 -0
data/Gemfile.lock +1 -1
data/README.md +3 -0
data/lib/unicoder/builder.rb +11 -1
data/lib/unicoder/builders/blocks.rb +4 -2
data/lib/unicoder/builders/categories.rb +6 -0
data/lib/unicoder/builders/confusable.rb +24 -3
data/lib/unicoder/builders/name.rb +2 -2
data/lib/unicoder/builders/scripts.rb +19 -2
data/lib/unicoder/builders/sequence_name.rb +8 -4
data/lib/unicoder/constants.rb +3 -2
data/lib/unicoder/replace_common_words.rb +3 -2
metadata +2 -3
data/.travis.yml +0 -20

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
-  data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
+  metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
+  data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
 SHA512:
-  metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
-  data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
+  metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
+  data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,28 @@
 ## CHANGELOG
+### 1.3.0
+- confusable: Add ignorables
+- confusable: Nest index and make ESM/charkeys version, fix ";"
+### 1.2.1
+- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
+### 1.2.0
+- Change format for sequence_name's sub-index for unqualified Emoji sequences
+### 1.1.2
+- Update CLDR to v46
+### 1.1.1
+- Fix bug related to unsafe characters
+- Fix squared CJK
+- Small adjustments for scripts and blocks index builders
 ### 1.1.0
 - Improve name index size: Support ranges

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.1.0)
+    unicoder (1.3.0)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/README.md CHANGED Viewed

@@ -39,6 +39,9 @@ Index Name    | Module
 --------------|----
 name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
 numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
+scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
+blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
+categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
 ## MIT License

data/lib/unicoder/builder.rb CHANGED Viewed

@@ -73,8 +73,18 @@ module Unicoder
       file = File.read(LOCAL_DATA_DIRECTORY + filename)
       if parse_mode == :line
+        active = !parse_options[:begin]
         file.each_line{ |line|
-          yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          if !active && parse_options[:begin] && line.match?(parse_options[:begin])
+            active = true
+          elsif active && parse_options[:end] && line.match?(parse_options[:end])
+            active = false
+          end
+          if active
+            yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
+          end
         }
       elsif parse_mode == :xml
         require "oga"

data/lib/unicoder/builders/blocks.rb CHANGED Viewed

@@ -4,12 +4,14 @@ module Unicoder
       include Builder
       def initialize_index
-        @index = []
+        @index = {
+          BLOCKS: []
+        }
       end
       def parse!
         parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
-          @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
+          @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
         end
       end
     end

data/lib/unicoder/builders/categories.rb CHANGED Viewed

@@ -9,6 +9,12 @@ module Unicoder
         @index = {
           CATEGORIES: [],
           CATEGORY_NAMES: {},
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @range_start = nil
       end

data/lib/unicoder/builders/confusable.rb CHANGED Viewed

@@ -3,17 +3,38 @@ module Unicoder
     class Confusable
       include Builder
+      def initialize_index
+        @index = {
+          CONFUSABLE: {},
+          IGNORABLE: [],
+        }
+      end
       def parse!
-        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
+        parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
           source = line["from"].to_i(16)
           if line["to"].include?(" ")
             replace_with = line["to"].split(" ").map{ |codepoint|
+              cp = codepoint.to_i(16)
+              option =~ /charvalues/ ? [cp].pack("U") : cp
+            }
+          else
+            cp = line["to"].to_i(16)
+            replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
+          end
+          assign :CONFUSABLE, source, replace_with
+        end
+        parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
+          if line["codepoints"]['..']
+            single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
               codepoint.to_i(16)
             }
           else
-            replace_with = line["to"].to_i(16)
+            single_or_multiple_codepoints = line["codepoints"].to_i(16)
           end
-          @index[source] = replace_with
+          @index[:IGNORABLE] << single_or_multiple_codepoints
         end
       end
     end

data/lib/unicoder/builders/name.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Unicoder
             "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
             "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
             "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
-            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
           },
           # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
           JAMO: {
@@ -68,7 +68,7 @@ module Unicoder
             elsif line["name"] != "<control>"
               raise ArgumentError, "inconsistent range found in data, don't know what to do"
             end
-          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
+          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
             # ignore
           else
             assign :NAMES, line["codepoint"].to_i(16), line["name"]

data/lib/unicoder/builders/scripts.rb CHANGED Viewed

@@ -10,6 +10,12 @@ module Unicoder
           SCRIPT_EXTENSIONS: {},
           SCRIPT_ALIASES: {},
           SCRIPT_NAMES: [],
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @reverse_script_names = {}
         @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
         }
       end
+      # TODO refactor how multiple indexes are organized
+      def assign_classic(sub_index_name, codepoint, value)
+        idx = @index[sub_index_name]
+        if option =~ /charkeys/
+          idx[[codepoint].pack("U*")] = value
+        else
+          idx[codepoint] = value
+        end
+      end
       def parse!
         parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
           @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
         parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
           if line["to"]
             (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
-              @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
+              assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
             }
           else
-            @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
+            assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
           end
         end
       end

data/lib/unicoder/builders/sequence_name.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Unicoder
       def initialize_index
         @index = {
           SEQUENCES: {},
-          SEQUENCES_NOT_QUALIFIED: {},
+          EMOJI_NOT_QUALIFIED: {},
         }
         @words = []
       end
@@ -74,8 +74,12 @@ module Unicoder
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
           codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
           assign_codepoint codepoints, name
+          # Build all combinations of VS16 present and missing and add to second index
           if codepoints.include?(0xFE0F)
-            # Build all combinations of VS16 present and missing
+            sequence = codepoints.pack("U*")
             codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
               if cur.include? 0xFE0F
                 acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
             }.
             select {|sub_codepoints| sub_codepoints != codepoints }.
             each { |sub_codepoints|
-              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+              sub_sequence = sub_codepoints.pack("U*")
+              @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
             }
           end
         end
         replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
-        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.1.0"
+  VERSION = "1.3.0"
   UNICODE_VERSIONS = %w[
     16.0.0
@@ -57,7 +57,7 @@ module Unicoder
   IVD_VERSION = "2022-09-13"
-  CLDR_VERSION = "45"
+  CLDR_VERSION = "46"
   UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
@@ -69,6 +69,7 @@ module Unicoder
     name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
     confusables:               "/security/UNICODE_VERSION/confusables.txt",
     blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
+    core_properties:           "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
     scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
     script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
     property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",

data/lib/unicoder/replace_common_words.rb CHANGED Viewed

@@ -2,8 +2,9 @@ require "json"
 module Unicoder
   module ReplaceCommonWords
-  	def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
-  	  puts "Starting to replace the #{count} most common words"
+  	def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
+      base = @words.join.chars.max.ord + 1
+  	  puts "Starting to replace the #{count} most common words (replace base: #{base})"
   	  @index[:REPLACE_BASE] = base
   	  @index[:COMMON_WORDS] = words.
   	    select{_1.size >= min_word_length}.

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.3.0
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-09 00:00:00.000000000 Z
+date: 2024-11-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -61,7 +61,6 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
-- ".travis.yml"
 - CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile

data/.travis.yml DELETED Viewed

@@ -1,20 +0,0 @@
-sudo: false
-language: ruby
-rvm:
-- 2.7
-- 2.6
-- 2.5
-- 2.4
-- 2.3
-- ruby-head
-- jruby-9.2.9.0
-- truffleruby
-matrix:
-  allow_failures:
-    - rvm: 2.3
-    - rvm: ruby-head
-    - rvm: jruby-2.9.2.0
-    - rvm: truffleruby
-#   fast_finish: true