RubyGems - unicoder - Versions diffs - 0.1.0 → 1.0.0 - Mend

unicoder 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +5 -5
data/.gitignore +6 -1
data/.travis.yml +13 -13
data/CHANGELOG.md +19 -1
data/Gemfile +2 -0
data/Gemfile.lock +99 -0
data/MIT-LICENSE.txt +1 -1
data/README.md +35 -5
data/bin/unicoder +1 -1
data/lib/unicoder/builder.rb +77 -15
data/lib/unicoder/builders/categories.rb +7 -12
data/lib/unicoder/builders/display_width.rb +28 -7
data/lib/unicoder/builders/emoji.rb +97 -0
data/lib/unicoder/builders/name.rb +75 -0
data/lib/unicoder/builders/numeric_value.rb +30 -0
data/lib/unicoder/builders/sequence_name.rb +72 -0
data/lib/unicoder/builders/types.rb +83 -0
data/lib/unicoder/constants.rb +81 -16
data/lib/unicoder/downloader.rb +54 -8
data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
data/unicoder.gemspec +7 -5
metadata +48 -25
data/data/.keep +0 -0
data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
data/data/unicode/security/8.0.0/confusables.txt +0 -9274
data/spec/unicoder_spec.rb +0 -9

data/lib/unicoder/builders/numeric_value.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Unicoder
+  module Builder
+    class NumericValue
+      include Builder
+      def initialize_index
+        @index = {
+          NUMBERS: {},
+        }
+      end
+      def parse!
+        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/ do |line|
+          unless line["value"].empty?
+            if line["value"] =~ %r</>
+              assign :NUMBERS, line["codepoint"].to_i(16), option =~ /stringfractions/ ? "#{line["value"]}" : line["value"].to_r
+            else
+              assign :NUMBERS, line["codepoint"].to_i(16), line["value"].to_i
+            end
+          end
+        end
+        parse_file :unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/ do |line|
+          assign :NUMBERS, line["codepoint"].to_i(16), line["value"].to_i
+        end
+      end
+    end
+  end
+end

data/lib/unicoder/builders/sequence_name.rb ADDED Viewed

@@ -0,0 +1,72 @@
+module Unicoder
+  module Builder
+    class SequenceName
+      include Builder
+      def initialize_index
+        @index = {
+          SEQUENCES: {},
+        }
+      end
+      def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
+        if option =~ /charkeys/
+          key = codepoints.pack("U*")
+        else
+          key = codepoints
+        end
+        if idx.has_key?(codepoints)
+          if combine
+            idx[key] << " / #{value}"
+          else
+            # ignore new one
+          end
+        else
+          idx[key] = value
+        end
+      end
+      def parse!
+        parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :named_sequences_prov, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
+          name = "#{line["name"].strip} (#{line["variant"]})"
+          name << " [#{line["context"]}]" if line["context"] && !line["context"].empty?
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name, combine: true
+        end
+        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"], combine: true
+        end
+        parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
+          name = "#{line["name"].strip} (#{line["variant"]})"
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
+          next if line["type"] == "Basic_Emoji"
+          name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+        end
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+          name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+        end
+      end
+    end
+  end
+end

data/lib/unicoder/builders/types.rb ADDED Viewed

@@ -0,0 +1,83 @@
+module Unicoder
+  module Builder
+    class Types
+      include Builder
+      include MultiDimensionalArrayBuilder
+      NONCHARACTERS = [
+          *0xFDD0..0xFDEF,
+          0xFFFE,  0xFFFF,
+         0x1FFFE, 0x1FFFF,
+         0x2FFFE, 0x2FFFF,
+         0x3FFFE, 0x3FFFF,
+         0x4FFFE, 0x4FFFF,
+         0x5FFFE, 0x5FFFF,
+         0x6FFFE, 0x6FFFF,
+         0x7FFFE, 0x7FFFF,
+         0x8FFFE, 0x8FFFF,
+         0x9FFFE, 0x9FFFF,
+         0xAFFFE, 0xAFFFF,
+         0xBFFFE, 0xBFFFF,
+         0xCFFFE, 0xCFFFF,
+         0xDFFFE, 0xDFFFF,
+         0xEFFFE, 0xEFFFF,
+         0xFFFFE, 0xFFFFF,
+        0x10FFFE, 0x10FFFF,
+      ]
+      def initialize_index
+        @index = {
+          TYPES: [],
+          TYPE_NAMES: %w[
+            Graphic
+            Format
+            Control
+            Private-use
+            Surrogate
+            Noncharacter
+            Reserved
+          ],
+          OFFSETS: [
+            0x10000,
+            0x1000,
+            0x100,
+            0x10
+          ],
+        }
+      end
+      def parse!
+        parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
+          if line["to"]
+            codepoints = Range.new(line["from"].to_i(16), line["to"].to_i(16))
+          else
+            codepoints = [line["from"].to_i(16)]
+          end
+          codepoints.each{ |codepoint|
+            case line["category"]
+            when "Cf", "Zl", "Zp"
+              type = 1
+            when "Cc"
+              type = 2
+            when "Co"
+              type = 3
+            when "Cs"
+              type = 4
+            when "Cn"
+              if NONCHARACTERS.include?(codepoint)
+                type = 5
+              else
+                type = 6
+              end
+            end
+            assign :TYPES, codepoint, type
+          }
+        end
+        4.times{ compress! @index[:TYPES] }
+      end
+    end
+  end
+end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,29 +1,94 @@
-module Unicoder
-  VERSION = "0.1.0".freeze
+# frozen_string_literal: true
-  CURRENT_UNICODE_VERSION = "8.0.0".freeze
+module Unicoder
+  VERSION = "1.0.0"
+  # Known Unicode versions, newest first (the first entry is CURRENT_UNICODE_VERSION)
   UNICODE_VERSIONS = %w[
-     6.3.0
-     7.0.0
-     8.0.0
+    16.0.0
+    15.1.0
+    15.0.0
+    14.0.0
+    13.0.0
+    12.1.0
+    12.0.0
+    11.0.0
+    10.0.0
      9.0.0
+     8.0.0
+     7.0.0
+     6.3.0
   ].freeze
-  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public".freeze
+  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
+  # Known emoji versions, newest first (the first entry is CURRENT_EMOJI_VERSION)
+  EMOJI_VERSIONS = %w[
+   16.0
+   15.1
+   15.0
+   14.0
+   13.1
+   13.0
+   12.1
+   12.0
+   11.0
+    5.0
+    4.0
+    3.0
+    2.0
+  ].freeze
+  # Emoji version => Unicode version; the downloader substitutes the mapped
+  # value for the EMOJI_RELATED_VERSION placeholder in UNICODE_FILES paths
+  EMOJI_RELATED_UNICODE_VERSIONS = {
+   "16.0" => "16.0.0",
+   "15.1" => "15.1.0",
+   "15.0" => "15.0.0",
+   "14.0" => "14.0.0",
+   "13.1" => "13.0.0",
+   "13.0" => "13.0.0",
+   "12.1" => "12.1.0",
+   "12.0" => "12.0.0",
+   "11.0" => "11.0.0",
+    "5.0" => "10.0.0",
+    "4.0" => "9.0.0",
+    "3.0" => "9.0.0",
+    "2.0" => "8.0.0",
+  }.freeze
+  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
+  # Versions interpolated into the ivd_sequences and valid_subdivisions URLs below
+  IVD_VERSION = "2022-09-13"
+  CLDR_VERSION = "45"
+  # Base URL prepended by the downloader to every UNICODE_FILES entry that is not a full URL
+  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
   LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
+  # Data file locations by identifier: either a path below UNICODE_DATA_ENDPOINT
+  # or a full http(s)/ftp URL. UNICODE_VERSION, EMOJI_VERSION and
+  # EMOJI_RELATED_VERSION are placeholders replaced by the downloader; a
+  # ".zip/" segment makes the downloader fetch the archive and extract the
+  # file named after it.
   UNICODE_FILES = {
-    east_asian_width:         "/VERSION/ucd/EastAsianWidth.txt",
-    unicode_data:             "/VERSION/ucd/UnicodeData.txt",
-    name_aliases:             "/VERSION/ucd/NameAliases.txt",
-    confusables:              "/security/VERSION/confusables.txt",
-    blocks:                   "/VERSION/ucd/Blocks.txt",
-    scripts:                  "/VERSION/ucd/Scripts.txt",
-    script_extensions:        "/VERSION/ucd/ScriptExtensions.txt",
-    property_value_aliases:   "/VERSION/ucd/PropertyValueAliases.txt",
-    general_categories:       "/VERSION/ucd/extracted/DerivedGeneralCategory.txt",
+    east_asian_width:          "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
+    unicode_data:              "/UNICODE_VERSION/ucd/UnicodeData.txt",
+    name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
+    confusables:               "/security/UNICODE_VERSION/confusables.txt",
+    blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
+    scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
+    script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
+    property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
+    general_categories:        "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
+    unihan_numeric_values:     "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
+    jamo:                      "/UNICODE_VERSION/ucd/Jamo.txt",
+    named_sequences:           "/UNICODE_VERSION/ucd/NamedSequences.txt",
+    named_sequences_prov:      "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
+    standardized_variants:     "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
+    ivd_sequences:             "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
+    # emoji_data:                "/EMOJI_VERSION/ucd/emoji/",
+    emoji_data:                "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
+    emoji_sequences:           "/emoji/EMOJI_VERSION/emoji-sequences.txt",
+    # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
+    emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
+    emoji_zwj_sequences:       "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
+    emoji_test:                "/emoji/EMOJI_VERSION/emoji-test.txt",
+    # valid_subdivisions:        "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
+    valid_subdivisions:        "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
+    # ""
   }
 end

data/lib/unicoder/downloader.rb CHANGED Viewed

@@ -1,28 +1,74 @@
 require "open-uri"
 require "fileutils"
+require "zip"
 module Unicoder
   module Downloader
     def self.fetch(identifier,
         unicode_version: CURRENT_UNICODE_VERSION,
+        emoji_version: CURRENT_EMOJI_VERSION,
         destination_directory: LOCAL_DATA_DIRECTORY,
         destination: nil,
         filename: nil
       )
       filename = UNICODE_FILES[identifier.to_sym] || filename
       raise ArgumentError, "No valid file identifier or filename given" if !filename
-      filename.sub! 'VERSION', unicode_version
-      source = UNICODE_DATA_ENDPOINT + filename
-      destination ||= destination_directory + filename
-      open(source){ |f|
-        FileUtils.mkdir_p(File.dirname(destination))
-        File.write(destination, f.read)
-      }
+      filename = filename.dup
+      filename.sub! 'UNICODE_VERSION', unicode_version
+      filename.sub! 'EMOJI_VERSION', emoji_version
+      filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
+      if filename =~ /\A(https?|ftp):\/\//
+        source = filename
+        destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
+      else
+        source = UNICODE_DATA_ENDPOINT + filename
+        destination ||= destination_directory + filename
+      end
       puts "GET #{source} => #{destination}"
+      if source =~ %r[^(?<outer_path>.*).zip/(?<inner_path>.*)$]
+        # Too much magic, download unzip zip files
+        zip = true
+        source = $~[:outer_path] + ".zip"
+        inner_zip_filename = $~[:inner_path]
+        if destination =~ %r[^(?<outer_path>.*).zip/(?<inner_path>.*)$]
+          destination = $~[:outer_path] + ".zip"
+          destination_files = $~[:outer_path]
+        else
+          raise "uncoder bug"
+        end
+      else
+        zip = false
+      end
+      if File.exist?(destination)
+        puts "Skipping download of #{source} (already exists)"
+      else
+        URI.open(source){ |f|
+          FileUtils.mkdir_p(File.dirname(destination))
+          File.write(destination, f.read)
+        }
+      end
+      if zip
+        unzip(destination, [inner_zip_filename], destination_files)
+      end
     rescue => e
       $stderr.puts "#{e.class}: #{e.message}"
     end
+    def self.unzip(archive, files, destination_dir)
+      Zip::File.open(archive) do |zip|
+        zip.each do |file_in_zip|
+          if files.include?(file_in_zip.name)
+            FileUtils.mkdir_p(destination_dir)
+            puts "Extract #{file_in_zip.name}"
+            file_in_zip.extract(destination_dir + "/#{file_in_zip.name}")
+          end
+        end
+        # entry = zip.glob('*.csv').first
+      end
+    end
   end
 end

data/lib/unicoder/multi_dimensional_array_builder.rb CHANGED Viewed

@@ -59,6 +59,28 @@ module Unicoder
         end
       }
     end
+    def remove_trailing_nils!(index = @index)
+      index.each{ |plane|
+        if plane.is_a?(Array)
+          plane.pop while plane[-1] == nil
+          plane.each{ |row|
+            if row.is_a?(Array)
+            row.pop while row[-1] == nil
+            row.each{ |byte|
+              if byte.is_a?(Array)
+                byte.pop while byte[-1] == nil
+                byte.each{ |nibble|
+                  if nibble.is_a?(Array)
+                    nibble.pop while nibble[-1] == nil
+                  end
+                }
+              end
+            }
+            end
+        }
+        end
+      }
+    end
   end
-end
+end

data/unicoder.gemspec CHANGED Viewed

@@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants"
 Gem::Specification.new do |gem|
   gem.name          = "unicoder"
   gem.version       = Unicoder::VERSION
-  gem.summary       = "Create specialized indexes for Unicode data lookup"
-  gem.description   = "Generate specialized indexes for Unicode data lookup"
+  gem.summary       = "Creates specialized indexes for Unicode data lookup"
+  gem.description   = "Generates specialized indexes for Unicode data lookup"
   gem.authors       = ["Jan Lelis"]
-  gem.email         = ["mail@janlelis.de"]
+  gem.email         = ["hi@ruby.consulting"]
   gem.homepage      = "https://github.com/janlelis/unicoder"
   gem.license       = "MIT"
-  gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ }
+  gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^(pkg|data)/ && path !~ /(marshal|mjs|json)(.gz)?$/ }
   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.required_ruby_version = "~> 2.0"
+  gem.required_ruby_version = ">= 2.0", "< 4.0"
   gem.add_dependency "rationalist", "~> 2.0"
+  gem.add_dependency "rubyzip", "~> 1.2"
+  gem.add_dependency "oga", "~> 2.9"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicoder
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Jan Lelis
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-13 00:00:00.000000000 Z
+date: 2024-10-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rationalist
@@ -24,9 +24,37 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '2.0'
-description: Generate specialized indexes for Unicode data lookup
+- !ruby/object:Gem::Dependency
+  name: rubyzip
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: oga
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.9'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.9'
+description: Generates specialized indexes for Unicode data lookup
 email:
-- mail@janlelis.de
+- hi@ruby.consulting
 executables:
 - unicoder
 extensions: []
@@ -37,57 +65,52 @@ files:
 - CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - Gemfile
+- Gemfile.lock
 - MIT-LICENSE.txt
 - README.md
 - Rakefile
 - bin/unicoder
-- data/.keep
-- data/unicode/8.0.0/ucd/Blocks.txt
-- data/unicode/8.0.0/ucd/EastAsianWidth.txt
-- data/unicode/8.0.0/ucd/NameAliases.txt
-- data/unicode/8.0.0/ucd/PropertyValueAliases.txt
-- data/unicode/8.0.0/ucd/ScriptExtensions.txt
-- data/unicode/8.0.0/ucd/Scripts.txt
-- data/unicode/8.0.0/ucd/UnicodeData.txt
-- data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt
-- data/unicode/security/8.0.0/confusables.txt
 - lib/unicoder.rb
 - lib/unicoder/builder.rb
 - lib/unicoder/builders/blocks.rb
 - lib/unicoder/builders/categories.rb
 - lib/unicoder/builders/confusable.rb
 - lib/unicoder/builders/display_width.rb
+- lib/unicoder/builders/emoji.rb
+- lib/unicoder/builders/name.rb
+- lib/unicoder/builders/numeric_value.rb
 - lib/unicoder/builders/scripts.rb
+- lib/unicoder/builders/sequence_name.rb
+- lib/unicoder/builders/types.rb
 - lib/unicoder/constants.rb
 - lib/unicoder/downloader.rb
 - lib/unicoder/multi_dimensional_array_builder.rb
 - lib/unicoder/tasks.rake
-- spec/unicoder_spec.rb
 - unicoder.gemspec
 homepage: https://github.com/janlelis/unicoder
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "~>"
+  - - ">="
     - !ruby/object:Gem::Version
       version: '2.0'
+  - - "<"
+    - !ruby/object:Gem::Version
+      version: '4.0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.6.3
-signing_key:
+rubygems_version: 3.5.21
+signing_key:
 specification_version: 4
-summary: Create specialized indexes for Unicode data lookup
-test_files:
-- spec/unicoder_spec.rb
-has_rdoc:
+summary: Creates specialized indexes for Unicode data lookup
+test_files: []

data/data/.keep DELETED Viewed

File without changes