RubyGems - unicoder - Versions diffs - 1.0.0 → 1.1.1 - Mend

unicoder 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/CHANGELOG.md +11 -0
data/Gemfile.lock +1 -1
data/README.md +3 -2
data/lib/unicoder/builders/blocks.rb +4 -2
data/lib/unicoder/builders/name.rb +32 -6
data/lib/unicoder/builders/scripts.rb +19 -2
data/lib/unicoder/builders/sequence_name.rb +29 -2
data/lib/unicoder/constants.rb +1 -1
data/lib/unicoder/replace_common_words.rb +21 -0
data/lib/unicoder.rb +1 -0
data/unicoder.gemspec +1 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
-  data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
+  metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
+  data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
 SHA512:
-  metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
-  data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
+  metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
+  data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,16 @@
 ## CHANGELOG
+### 1.1.1
+- Fix bug related to unsafe characters
+- Fix squared CJK
+- Small adjustments for scripts and blocks index builders
+### 1.1.0
+- Improve name index size: Support ranges
+- Improve name index size: Replace common words
 ### 1.0.0
 With the first 1.0 release, unicoder supports 10 indexes:

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.0.0)
+    unicoder (1.1.1)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/README.md CHANGED Viewed

@@ -37,8 +37,9 @@ types         | [unicode-types](https://github.com/janlelis/unicode-types)
 Index Name    | Module
 --------------|----
-numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
-name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
+name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
+numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
+scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
 ## MIT License

data/lib/unicoder/builders/blocks.rb CHANGED Viewed

@@ -4,12 +4,14 @@ module Unicoder
       include Builder
       def initialize_index
-        @index = []
+        @index = {
+          BLOCKS: []
+        }
       end
       def parse!
         parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
-          @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
+          @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
         end
       end
     end

data/lib/unicoder/builders/name.rb CHANGED Viewed

@@ -1,19 +1,34 @@
 module Unicoder
   module Builder
     class Name
       include Builder
+      include ReplaceCommonWords
       JAMO_INITIAL = 4352
       JAMO_MEDIAL = 4449
       JAMO_FINAL = 4520
       JAMO_END = 4697
+      CJK = "CJK UNIFIED IDEOGRAPH-"
+      TANGUT = "TANGUT IDEOGRAPH-"
+      REPLACE_COUNT = 500
+      REPLACE_BASE = ?[.ord
       def initialize_index
         @index = {
           NAMES: {},
           ALIASES: {},
-          CJK: [],
-          HANGUL: [],
+          # HANGUL: [],
+          CP_RANGES: {
+            CJK => [], # filled while parsing
+            TANGUT => [], # filled while parsing
+            "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
+            "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
+            "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+          },
           # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
           JAMO: {
             INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
             FINAL: [""],
           },
         }
+        @words = []
         @range_start = nil
       end
@@ -36,22 +52,32 @@ module Unicoder
             if line["name"] =~ /First/
               @range_start = line["codepoint"].to_i(16)
             elsif line["name"] =~ /Last/ && @range_start
-              if line["name"] =~ /Hangul/
-                @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
-              elsif line["name"] =~ /CJK/
-                @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
+              case line["name"]
+              when /Hangul/
+                # currently not necessary
+                # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
+              when /CJK/
+                @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
+              when /Tangut/
+                @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
               else
                 # no name
+                warn "ignoring range: #{line["name"]}"
               end
               @range_start = nil
             elsif line["name"] != "<control>"
               raise ArgumentError, "inconsistent range found in data, don't know what to do"
             end
+          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
+            # ignore
           else
             assign :NAMES, line["codepoint"].to_i(16), line["name"]
+            @words += line["name"].split
           end
         end
+        replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
         parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
           @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
           @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []

data/lib/unicoder/builders/scripts.rb CHANGED Viewed

@@ -10,6 +10,12 @@ module Unicoder
           SCRIPT_EXTENSIONS: {},
           SCRIPT_ALIASES: {},
           SCRIPT_NAMES: [],
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
         }
         @reverse_script_names = {}
         @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
         }
       end
+      # TODO refactor how multiple indexes are organized
+      def assign_classic(sub_index_name, codepoint, value)
+        idx = @index[sub_index_name]
+        if option =~ /charkeys/
+          idx[[codepoint].pack("U*")] = value
+        else
+          idx[codepoint] = value
+        end
+      end
       def parse!
         parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
           @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
         parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
           if line["to"]
             (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
-              @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
+              assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
             }
           else
-            @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
+            assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
           end
         end
       end

data/lib/unicoder/builders/sequence_name.rb CHANGED Viewed

@@ -2,11 +2,18 @@ module Unicoder
   module Builder
     class SequenceName
       include Builder
+      include ReplaceCommonWords
+      REPLACE_COUNT = 100
+      REPLACE_BASE = ?{.ord
+      REPLACE_MIN_WORD_LENGTH = 3
       def initialize_index
         @index = {
           SEQUENCES: {},
+          SEQUENCES_NOT_QUALIFIED: {},
         }
+        @words = []
       end
       def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
         else
           idx[key] = value
         end
+        @words += value.split
       end
       def parse!
@@ -61,10 +70,28 @@ module Unicoder
           assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
         end
-        parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
-          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+          codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
+          assign_codepoint codepoints, name
+          if codepoints.include?(0xFE0F)
+            # Build all combinations of VS16 present and missing
+            codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
+              if cur.include? 0xFE0F
+                acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
+              else
+                acc.map{|prev| prev + cur}
+              end
+            }.
+            select {|sub_codepoints| sub_codepoints != codepoints }.
+            each { |sub_codepoints|
+              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+            }
+          end
         end
+        replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.0.0"
+  VERSION = "1.1.1"
   UNICODE_VERSIONS = %w[
     16.0.0

data/lib/unicoder/replace_common_words.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require "json"
+module Unicoder
+  module ReplaceCommonWords
+  	def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
+      base = @words.join.chars.max.ord + 1
+  	  puts "Starting to replace the #{count} most common words (replace base: #{base})"
+  	  @index[:REPLACE_BASE] = base
+  	  @index[:COMMON_WORDS] = words.
+  	    select{_1.size >= min_word_length}.
+  	    tally.
+  	    max_by(count){_2}.
+  	    map(&:first)
+  	  @index[which_index].each{|_, name|
+  	    @index[:COMMON_WORDS].each_with_index{|word, index|
+  	      name.gsub! word + " ", [base + index].pack("U")
+  	    }
+  	  }
+  	end
+  end
+end

data/lib/unicoder.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
 require_relative "unicoder/downloader"
 require_relative "unicoder/builder"
 require_relative "unicoder/multi_dimensional_array_builder"
+require_relative "unicoder/replace_common_words"
 if defined?(Rake)
   Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))

data/unicoder.gemspec CHANGED Viewed

@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.required_ruby_version = ">= 2.0", "< 4.0"
+  gem.required_ruby_version = ">= 3.0", "< 4.0"
   gem.add_dependency "rationalist", "~> 2.0"
   gem.add_dependency "rubyzip", "~> 1.2"
   gem.add_dependency "oga", "~> 2.9"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-04 00:00:00.000000000 Z
+date: 2024-10-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -85,6 +85,7 @@ files:
 - lib/unicoder/constants.rb
 - lib/unicoder/downloader.rb
 - lib/unicoder/multi_dimensional_array_builder.rb
+- lib/unicoder/replace_common_words.rb
 - lib/unicoder/tasks.rake
 - unicoder.gemspec
 homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.0'
+      version: '3.0'
   - - "<"
     - !ruby/object:Gem::Version
       version: '4.0'