RubyGems - unicoder - Versions diffs - 1.0.0 → 1.1.0 - Mend

unicoder 1.0.0 → 1.1.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.

Files changed (11)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/Gemfile.lock +1 -1
data/README.md +2 -2
data/lib/unicoder/builders/name.rb +32 -6
data/lib/unicoder/builders/sequence_name.rb +29 -2
data/lib/unicoder/constants.rb +1 -1
data/lib/unicoder/replace_common_words.rb +20 -0
data/lib/unicoder.rb +1 -0
data/unicoder.gemspec +1 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
-  data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
+  metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
+  data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
 SHA512:
-  metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
-  data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
+  metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
+  data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,10 @@
 ## CHANGELOG
+### 1.1.0
+- Improve name index size: Support ranges
+- Improve name index size: Replace common words
 ### 1.0.0
 With the first 1.0 release, unicoder supports 10 indexes:

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    unicoder (1.0.0)
+    unicoder (1.1.0)
       oga (~> 2.9)
       rationalist (~> 2.0)
       rubyzip (~> 1.2)

data/README.md CHANGED Viewed

@@ -37,8 +37,8 @@ types         | [unicode-types](https://github.com/janlelis/unicode-types)
 Index Name    | Module
 --------------|----
-numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
-name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
+name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
+numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
 ## MIT License

data/lib/unicoder/builders/name.rb CHANGED Viewed

@@ -1,19 +1,34 @@
 module Unicoder
   module Builder
     class Name
       include Builder
+      include ReplaceCommonWords
       JAMO_INITIAL = 4352
       JAMO_MEDIAL = 4449
       JAMO_FINAL = 4520
       JAMO_END = 4697
+      CJK = "CJK UNIFIED IDEOGRAPH-"
+      TANGUT = "TANGUT IDEOGRAPH-"
+      REPLACE_COUNT = 500
+      REPLACE_BASE = ?[.ord
       def initialize_index
         @index = {
           NAMES: {},
           ALIASES: {},
-          CJK: [],
-          HANGUL: [],
+          # HANGUL: [],
+          CP_RANGES: {
+            CJK => [], # filled while parsing
+            TANGUT => [], # filled while parsing
+            "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
+            "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
+            "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+          },
           # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
           JAMO: {
             INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
             FINAL: [""],
           },
         }
+        @words = []
         @range_start = nil
       end
@@ -36,22 +52,32 @@ module Unicoder
             if line["name"] =~ /First/
               @range_start = line["codepoint"].to_i(16)
             elsif line["name"] =~ /Last/ && @range_start
-              if line["name"] =~ /Hangul/
-                @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
-              elsif line["name"] =~ /CJK/
-                @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
+              case line["name"]
+              when /Hangul/
+                # currently not necessary
+                # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
+              when /CJK/
+                @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
+              when /Tangut/
+                @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
               else
                 # no name
+                warn "ignoring range: #{line["name"]}"
               end
               @range_start = nil
             elsif line["name"] != "<control>"
               raise ArgumentError, "inconsistent range found in data, don't know what to do"
             end
+          elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
+            # ignore
           else
             assign :NAMES, line["codepoint"].to_i(16), line["name"]
+            @words += line["name"].split
           end
         end
+        replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
         parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
           @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
           @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []

data/lib/unicoder/builders/sequence_name.rb CHANGED Viewed

@@ -2,11 +2,18 @@ module Unicoder
   module Builder
     class SequenceName
       include Builder
+      include ReplaceCommonWords
+      REPLACE_COUNT = 100
+      REPLACE_BASE = ?{.ord
+      REPLACE_MIN_WORD_LENGTH = 3
       def initialize_index
         @index = {
           SEQUENCES: {},
+          SEQUENCES_NOT_QUALIFIED: {},
         }
+        @words = []
       end
       def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
         else
           idx[key] = value
         end
+        @words += value.split
       end
       def parse!
@@ -61,10 +70,28 @@ module Unicoder
           assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
         end
-        parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
-          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+          codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
+          assign_codepoint codepoints, name
+          if codepoints.include?(0xFE0F)
+            # Build all combinations of VS16 present and missing
+            codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
+              if cur.include? 0xFE0F
+                acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
+              else
+                acc.map{|prev| prev + cur}
+              end
+            }.
+            select {|sub_codepoints| sub_codepoints != codepoints }.
+            each { |sub_codepoints|
+              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+            }
+          end
         end
+        replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Unicoder
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
   UNICODE_VERSIONS = %w[
     16.0.0

data/lib/unicoder/replace_common_words.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require "json"
+module Unicoder
+  module ReplaceCommonWords
+  	def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
+  	  puts "Starting to replace the #{count} most common words"
+  	  @index[:REPLACE_BASE] = base
+  	  @index[:COMMON_WORDS] = words.
+  	    select{_1.size >= min_word_length}.
+  	    tally.
+  	    max_by(count){_2}.
+  	    map(&:first)
+  	  @index[which_index].each{|_, name|
+  	    @index[:COMMON_WORDS].each_with_index{|word, index|
+  	      name.gsub! word + " ", [base + index].pack("U")
+  	    }
+  	  }
+  	end
+  end
+end

data/lib/unicoder.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
 require_relative "unicoder/downloader"
 require_relative "unicoder/builder"
 require_relative "unicoder/multi_dimensional_array_builder"
+require_relative "unicoder/replace_common_words"
 if defined?(Rake)
   Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))

data/unicoder.gemspec CHANGED Viewed

@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.required_ruby_version = ">= 2.0", "< 4.0"
+  gem.required_ruby_version = ">= 3.0", "< 4.0"
   gem.add_dependency "rationalist", "~> 2.0"
   gem.add_dependency "rubyzip", "~> 1.2"
   gem.add_dependency "oga", "~> 2.9"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Jan Lelis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-10-04 00:00:00.000000000 Z
+date: 2024-10-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -85,6 +85,7 @@ files:
 - lib/unicoder/constants.rb
 - lib/unicoder/downloader.rb
 - lib/unicoder/multi_dimensional_array_builder.rb
+- lib/unicoder/replace_common_words.rb
 - lib/unicoder/tasks.rake
 - unicoder.gemspec
 homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.0'
+      version: '3.0'
   - - "<"
     - !ruby/object:Gem::Version
       version: '4.0'