RubyGems - unicoder - Versions diffs - 0.1.0 → 1.1.0 - Mend

unicoder 0.1.0 → 1.1.0

Files changed (35) hide show

checksums.yaml +5 -5
data/.gitignore +6 -1
data/.travis.yml +13 -13
data/CHANGELOG.md +24 -1
data/Gemfile +2 -0
data/Gemfile.lock +99 -0
data/MIT-LICENSE.txt +1 -1
data/README.md +35 -5
data/bin/unicoder +1 -1
data/lib/unicoder/builder.rb +77 -15
data/lib/unicoder/builders/categories.rb +7 -12
data/lib/unicoder/builders/display_width.rb +28 -7
data/lib/unicoder/builders/emoji.rb +97 -0
data/lib/unicoder/builders/name.rb +101 -0
data/lib/unicoder/builders/numeric_value.rb +30 -0
data/lib/unicoder/builders/sequence_name.rb +99 -0
data/lib/unicoder/builders/types.rb +83 -0
data/lib/unicoder/constants.rb +81 -16
data/lib/unicoder/downloader.rb +54 -8
data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
data/lib/unicoder/replace_common_words.rb +20 -0
data/lib/unicoder.rb +1 -0
data/unicoder.gemspec +7 -5
metadata +50 -26
data/data/.keep +0 -0
data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
data/data/unicode/security/8.0.0/confusables.txt +0 -9274
data/spec/unicoder_spec.rb +0 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: f79eb48ad06b13b61fc4ceb7fc5e176ee4e9e984
-  data.tar.gz: 94a62eb108e01e1d7da774b58352ab0585235bc7
+SHA256:
+  metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
+  data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
 SHA512:
-  metadata.gz: 01714742c72568ab92a9c3df0b700f3918e32482b7f658da8f099e2cfb54359e098e90fa1caa72a343cbdf2ede36081a9c01a6d65ee76cee841e65b87c9083ad
-  data.tar.gz: dd5b55100962d9408a503b338ebf25062c3dee7dc1ff9ceaccd97e30d57f97d131191ef96ce266e267cc729d70e6a0860702f22fbb1c9a6e4b512547ff1b5805
+  metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
+  data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1

data/.gitignore CHANGED Viewed

@@ -1,3 +1,8 @@
 Gemfile.lock
 /pkg
-/data
+/data*
+*.marshal
+*.marshal.gz
+*.json
+*.mjs
+/old-data

data/.travis.yml CHANGED Viewed

@@ -1,20 +1,20 @@
 sudo: false
 language: ruby
-script: bundle exec ruby spec/unicoder_spec.rb
 rvm:
-- 2.3.0
-- 2.2
-- 2.1
-- 2.0
+- 2.7
+- 2.6
+- 2.5
+- 2.4
+- 2.3
 - ruby-head
-- rbx-2
-- jruby-head
-- jruby-9000
-cache:
-- bundler
+- jruby-9.2.9.0
+- truffleruby
-# matrix:
+matrix:
+  allow_failures:
+    - rvm: 2.3
+    - rvm: ruby-head
+    - rvm: jruby-2.9.2.0
+    - rvm: truffleruby
 #   fast_finish: true

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,28 @@
 ## CHANGELOG
+### 1.1.0
+- Improve name index size: Support ranges
+- Improve name index size: Replace common words
+### 1.0.0
+With the first 1.0 release, unicoder supports 10 indexes:
+- blocks
+- categories
+- confusable
+- display_width
+- emoji
+- name
+- numeric_value
+- scripts
+- sequence_name
+- types
+All indexes can be built in `marshal` format (Ruby's internal
+serialization format) and some now support `esm` (JavaScript module)
 ### 0.1.0
-* WIP
+* Initial release

data/Gemfile CHANGED Viewed

@@ -3,3 +3,5 @@ source 'https://rubygems.org'
 gemspec
 gem 'minitest'
+gem 'rake'
+gem 'irbtools', require: "irbtools/binding"

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,99 @@
+PATH
+  remote: .
+  specs:
+    unicoder (1.1.0)
+      oga (~> 2.9)
+      rationalist (~> 2.0)
+      rubyzip (~> 1.2)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ansi (1.5.0)
+    ast (2.4.2)
+    cd (1.0.2)
+    clipboard (2.0.0)
+    code (0.9.4)
+      coderay (~> 1.1)
+      method_source (>= 0.9, < 2.0)
+    coderay (1.1.3)
+    core_docs (0.9.11)
+      yard (~> 0.9.11)
+    debugging (2.1.0)
+      paint (>= 0.9, < 3.0)
+    every_day_irb (2.2.0)
+      cd (~> 1.0)
+    fancy_irb (2.1.2)
+      irb (>= 1.7, < 2.0)
+      paint (>= 0.9, < 3.0)
+      unicode-display_width (>= 2.5)
+    ffi (1.17.0)
+    hirb (0.7.3)
+    interactive_editor (0.0.12)
+      spoon (~> 0.0.6)
+    io-console (0.7.2)
+    irb (1.14.1)
+      rdoc (>= 4.0.0)
+      reline (>= 0.4.2)
+    irbtools (4.1.0)
+      clipboard (>= 1.4, < 3.0)
+      code (>= 0.9.4, < 2.0)
+      coderay (~> 1.1)
+      core_docs (~> 0.9.11)
+      debugging (~> 2.1)
+      every_day_irb (~> 2.2)
+      fancy_irb (~> 2.1)
+      hirb (~> 0.7, >= 0.7.3)
+      interactive_editor (~> 0.0, >= 0.0.12)
+      irb (>= 1.13.0, < 1.15)
+      looksee (~> 5.0)
+      methodfinder (~> 2.2, >= 2.2.5)
+      object_shadow (~> 1.1)
+      os (~> 1.1, >= 1.1.4)
+      paint (>= 0.9, < 3.0)
+      ruby_engine (~> 2.0)
+      ruby_version (~> 1.0)
+      wirb (~> 2.0, >= 2.2.1)
+    looksee (5.0.0)
+    method_source (1.1.0)
+    methodfinder (2.2.5)
+    minitest (5.25.1)
+    object_shadow (1.1.1)
+    oga (2.15)
+      ast
+      ruby-ll (~> 2.1)
+    os (1.1.4)
+    paint (2.3.0)
+    psych (5.1.2)
+      stringio
+    rake (13.2.1)
+    rationalist (2.0.1)
+    rdoc (6.7.0)
+      psych (>= 4.0.0)
+    reline (0.5.10)
+      io-console (~> 0.5)
+    ruby-ll (2.1.3)
+      ansi
+      ast
+    ruby_engine (2.0.3)
+    ruby_version (1.0.3)
+    rubyzip (1.3.0)
+    spoon (0.0.6)
+      ffi
+    stringio (3.1.1)
+    unicode-display_width (2.6.0)
+    wirb (2.2.2)
+      paint (>= 0.9, < 3.0)
+    yard (0.9.37)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  irbtools
+  minitest
+  rake
+  unicoder!
+BUNDLED WITH
+   2.5.21

data/MIT-LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2016 Jan Lelis, mail@janlelis.de
+Copyright (c) 2016-2020 Jan Lelis, https://janlelis.com
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED Viewed

@@ -1,15 +1,45 @@
-# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](http://badge.fury.io/rb/unicoder)
-WIP
+# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](https://badge.fury.io/rb/unicoder)
+unicoder turns Unicode data into bundles for programming libraries.
 ## Usage
 ```
-$ unicoder build index_name
+$ unicoder build <index_name> [--gzip]
+```
+Examples:
+```
+$ unicoder build emoji --format marshal --gzip
+$ unicoder build numeric_value --format esm
 ```
+## Libraries With unicoder-based Indexes
+### Ruby
+Index Name    | Gem
+--------------|----
+blocks        | [unicode-blocks](https://github.com/janlelis/unicode-blocks)
+categories    | [unicode-categories](https://github.com/janlelis/unicode-categories)
+confusable    | [unicode-confusable](https://github.com/janlelis/unicode-confusable)
+emoji         | [unicode-emoji](https://github.com/janlelis/unicode-emoji)
+display\_width| [unicode-display_width](https://github.com/janlelis/unicode-display_width)
+name          | [unicode-name](https://github.com/janlelis/unicode-name)
+numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-numeric_value)
+scripts       | [unicode-scripts](https://github.com/janlelis/unicode-scripts)
+sequence\_name| [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name)
+types         | [unicode-types](https://github.com/janlelis/unicode-types)
+### JavaScript (ESM)
+Index Name    | Module
+--------------|----
+name, sequence\_name, types | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
+numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
 ## MIT License
-Copyright (C) 2016 Jan Lelis <http://janlelis.com>. Released under the MIT license.
+Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.

data/bin/unicoder CHANGED Viewed

@@ -6,7 +6,7 @@ require "rationalist"
 args = Rationalist.parse
 command = args[:_][0]
 identifier = args[:_][1]
-KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip]
+KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip, :option, :meta]
 options = args.select { |option,| KNOWN_OPTIONS.include? option }
 if options.has_key?(:version)

data/lib/unicoder/builder.rb CHANGED Viewed

@@ -1,13 +1,42 @@
 require "json"
+require "rubygems/util"
 module Unicoder
   # A builder defines a parse function which translates one (or more) unicode data
   # files into an index hash
   module Builder
-    attr_reader :index
+    attr_reader :index, :formats, :option
+    attr_writer :option
-    def initialize(unicode_version = nil)
-      @unicode_version = unicode_version
+    def formats
+      {
+        marshal: {
+          ext: ".marshal",
+        },
+        json: {
+          ext: ".json",
+          option: "charkeys+stringfractions"
+        },
+        esm: {
+          ext: ".mjs",
+          option: "charkeys+stringfractions"
+        }
+      }
+    end
+    def meta
+      {
+        META: {
+          generator: "unicoder v#{Unicoder::VERSION}",
+          unicodeVersion: @unicode_version,
+        },
+      }
+    end
+    def initialize(unicode_version = nil, emoji_version = nil, format = nil)
+      @unicode_version = unicode_version || CURRENT_UNICODE_VERSION
+      @emoji_version = emoji_version || CURRENT_EMOJI_VERSION
+      @option = formats[format.to_sym] ? formats[format.to_sym][:option] || "" : ""
       initialize_index
     end
@@ -15,8 +44,16 @@ module Unicoder
       @index = {}
     end
-    def assign_codepoint(codepoint, value, index = @index)
-      index[codepoint] = value
+    def assign_codepoint(codepoint, value, idx = @index)
+      if option =~ /charkeys/
+        idx[[codepoint].pack("U*")] = value
+      else
+        idx[codepoint] = value
+      end
+    end
+    def assign(sub_index_name, codepoint, value)
+      assign_codepoint(codepoint, value, index[sub_index_name])
     end
     def parse!
@@ -26,47 +63,72 @@ module Unicoder
     def parse_file(identifier, parse_mode, **parse_options)
       filename = UNICODE_FILES[identifier.to_sym] || filename
       raise ArgumentError, "No valid file identifier or filename given" if !filename
-      filename.sub! 'VERSION', @unicode_version
-      Downloader.fetch(identifier) unless File.exists?(filename)
+      filename = filename.dup
+      filename.sub! 'UNICODE_VERSION', @unicode_version
+      filename.sub! 'EMOJI_VERSION', @emoji_version
+      filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[@emoji_version]
+      filename.sub! '.zip', ''
+      filename.sub! /\A(https?|ftp):\//, ""
+      Downloader.fetch(identifier) unless File.exist?(LOCAL_DATA_DIRECTORY + filename)
       file = File.read(LOCAL_DATA_DIRECTORY + filename)
       if parse_mode == :line
         file.each_line{ |line|
           yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
         }
+      elsif parse_mode == :xml
+        require "oga"
+        yield Oga.parse_xml(file)
+      else
+        yield file
       end
     end
     def export(format: :marshal, **options)
       p index if options[:verbose]
+      if options[:meta]
+        idx = meta.merge(index)
+      else
+        idx = index
+      end
       case format.to_sym
       when :marshal
-        index_file = Marshal.dump(index)
+        index_file = Marshal.dump(idx)
       when :json
-        index_file = JSON.dump(index)
+        index_file = JSON.dump(idx)
+      when :esm
+        index_file = "export default " + JSON.dump(idx)
       end
-      # if false# || options[:gzip]
       if options[:gzip]
-        Gem.gzip(index_file)
+        Gem::Util.gzip(index_file)
       else
         index_file
       end
     end
     def self.build(identifier, **options)
       format = options[:format] || :marshal
       require_relative "builders/#{identifier}"
       # require "unicoder/builders/#{identifier}"
       builder_class = self.const_get(identifier.to_s.gsub(/(?:^|_)([a-z])/){ $1.upcase })
-      builder = builder_class.new(options[:unicode_version] || CURRENT_UNICODE_VERSION)
+      builder = builder_class.new(
+        options[:unicode_version],
+        options[:emoji_version],
+        format
+      )
       puts "Building index for #{identifier}…"
+      if options[:option]
+        builder.option = options[:option]
+      end
       builder.parse!
-      index_file = builder.export(options)
+      index_file = builder.export(**options)
       destination ||= options[:destination] || identifier.to_s
-      destination += ".#{format}"
+      destination += "#{builder.formats.dig(format.to_sym, :ext)}"
       destination += ".gz" if options[:gzip]
       bytes = File.write destination, index_file

data/lib/unicoder/builders/categories.rb CHANGED Viewed

@@ -14,23 +14,18 @@ module Unicoder
       end
       def parse!
-        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<range><(?!control).+>)?.*?;(?<category>.+?);.*$/ do |line|
-          if line["range"]
-            if line["range"] =~ /First/
-              @range_start = line["codepoint"].to_i(16)
-            elsif line["range"] =~ /Last/ && @range_start
-              (@range_start..line["codepoint"].to_i(16)).each{ |codepoint|
-                assign_codepoint(codepoint, line["category"], @index[:CATEGORIES])
-              }
-            else
-              raise ArgumentError, "inconsistent range found in data, don't know what to do"
-            end
+        parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
+          if line["to"]
+            (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
+              assign_codepoint(codepoint, line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
+            }
           else
-            assign_codepoint(line["codepoint"].to_i(16), line["category"], @index[:CATEGORIES])
+            assign_codepoint(line["from"].to_i(16), line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
           end
         end
         4.times{ compress! @index[:CATEGORIES] }
+        remove_trailing_nils! @index[:CATEGORIES]
         parse_file :property_value_aliases, :line, regex: /^gc ; (?<short>\S{2}?) *; (?<long>\S+).*$/ do |line|
           @index[:CATEGORY_NAMES][line["short"]] = line["long"]

data/lib/unicoder/builders/display_width.rb CHANGED Viewed

@@ -6,7 +6,23 @@ module Unicoder
       IGNORE_CATEGORIES     = %w[Cs Co Cn].freeze
       ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
-      ZERO_WIDTH_CODEPOINTS = [*0x1160..0x11FF].freeze
+      ZERO_WIDTH_RANGES = [
+        *0x1160..0x11FF, # HANGUL JUNGSEONG
+        *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
+        *0x2060..0x206F, # Ignorables
+        *0xFFF0..0xFFF8, # Ignorables
+        *0xE0000..0xE0FFF, # Ignorables
+      ].freeze
+      WIDE_RANGES = [
+        *0x3400..0x4DBF,
+        *0x4E00..0x9FFF,
+        *0xF900..0xFAFF,
+        *0x20000..0x2FFFD,
+        *0x30000..0x3FFFD,
+      ].freeze
       SPECIAL_WIDTHS = {
         0x0    =>  0, # \0 NULL
         0x5    =>  0, #    ENQUIRY
@@ -18,7 +34,7 @@ module Unicoder
         0xD    =>  0, # \r CARRIAGE RETURN
         0xE    =>  0, #    SHIFT OUT
         0xF    =>  0, #    SHIFT IN
-        0x00AD =>  1, #    SOFT HYPHEN
+        0x00AD =>  nil, #    SOFT HYPHEN
         0x2E3A =>  2, #    TWO-EM DASH
         0x2E3B =>  3, #    THREE-EM DASH
       }.freeze
@@ -28,7 +44,7 @@ module Unicoder
       end
       def parse!
-        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?);(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
+        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
           next if IGNORE_CATEGORIES.include?(line["category"])
           if line["codepoints"]['..']
@@ -44,19 +60,24 @@ module Unicoder
           }
         end
+        ZERO_WIDTH_RANGES.each{ |codepoint|
+          assign_codepoint codepoint, 0
+        }
+        WIDE_RANGES.each{ |codepoint|
+          assign_codepoint codepoint, 2
+        }
         SPECIAL_WIDTHS.each{ |codepoint, value|
           assign_codepoint codepoint, value
         }
         4.times{ compress! }
-        p @index
       end
       def determine_width(codepoint, category, east_asian_width)
         if  ( ZERO_WIDTH_CATEGORIES.include?(category) &&
-              [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ ) ||
-            ZERO_WIDTH_CODEPOINTS.include?(codepoint)
+              [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
           0
         elsif east_asian_width == "F" || east_asian_width == "W"
           2

data/lib/unicoder/builders/emoji.rb ADDED Viewed

@@ -0,0 +1,97 @@
+module Unicoder
+  module Builder
+    class Emoji
+      include Builder
+      REVERSE_PROPERTY_NAMES = {
+        "Emoji" => :E,
+        "Emoji_Modifier_Base" => :B,
+        "Emoji_Modifier" => :M,
+        "Emoji_Component" => :C,
+        "Emoji_Presentation" => :P,
+        "Extended_Pictographic" => :X,
+      }
+      def initialize_index
+        @index = {
+          PROPERTIES: {},
+          FLAGS: [],
+          TAGS: [],
+          KEYCAPS: [],
+          ZWJ: [],
+          SD: [],
+          LIST: {},
+        }
+      end
+      def parse!
+        parse_file :emoji_data, :line, regex: /^(?<codepoints>\S+?) +; (?<property>\S+) *#/ do |line|
+          if line["codepoints"]['..']
+            codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
+              codepoint.to_i(16)
+            })
+          else
+            codepoints = [line["codepoints"].to_i(16)]
+          end
+          codepoints.each{ |codepoint|
+            @index[:PROPERTIES][codepoint] ||= []
+            @index[:PROPERTIES][codepoint] << (REVERSE_PROPERTY_NAMES[line["property"]] || line["property"])
+          }
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Flag_Sequence/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:FLAGS] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Tag_Sequence/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:TAGS] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; Emoji_Keycap_Sequence/ do |line|
+          @index[:KEYCAPS] << line["codepoints"].split[0].to_i(16)
+        end
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:ZWJ] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :valid_subdivisions, :xml do |xml|
+          subdivisions = []
+          xml.css('[idStatus="regular"], [idStatus="deprecated"]').each{ |id|
+            subdivisions += id.text.split
+          }
+          @index[:SD] = subdivisions.uniq
+        end
+        parse_file :emoji_test, :line, regex: /^(?:# (?<sub>sub)?group: (?<group_name>.*)$)|(?:(?<codepoints>.+?)\s*; fully-qualified )/ do |line|
+          if line["group_name"]
+            if !line["sub"]
+              @current_group_name = line["group_name"]
+              @index[:LIST][@current_group_name] = {}
+            else
+              @current_subgroup_name = line["group_name"]
+              @index[:LIST][@current_group_name][@current_subgroup_name] = []
+            end
+          else
+            codepoints = line["codepoints"].split
+            @index[:LIST][@current_group_name][@current_subgroup_name] << codepoints.map{|e| e.to_i(16)}.pack("U*")
+          end
+        end
+      end
+    end
+  end
+end
+=begin alternative
+current_index_level = @index[:SEQUENCES]
+codepoints.each{ |cp|
+  ord = cp.to_i(16)
+  current_index_level[ord] ||= {}
+  current_index_level = current_index_level[ord]
+}
+current_index_level[true] = true # end mark
+=end