RubyGems - unicoder - Versions diffs - 0.1.0 → 1.1.0 - Mend

unicoder 0.1.0 → 1.1.0

Files changed (35) hide show

checksums.yaml +5 -5
data/.gitignore +6 -1
data/.travis.yml +13 -13
data/CHANGELOG.md +24 -1
data/Gemfile +2 -0
data/Gemfile.lock +99 -0
data/MIT-LICENSE.txt +1 -1
data/README.md +35 -5
data/bin/unicoder +1 -1
data/lib/unicoder/builder.rb +77 -15
data/lib/unicoder/builders/categories.rb +7 -12
data/lib/unicoder/builders/display_width.rb +28 -7
data/lib/unicoder/builders/emoji.rb +97 -0
data/lib/unicoder/builders/name.rb +101 -0
data/lib/unicoder/builders/numeric_value.rb +30 -0
data/lib/unicoder/builders/sequence_name.rb +99 -0
data/lib/unicoder/builders/types.rb +83 -0
data/lib/unicoder/constants.rb +81 -16
data/lib/unicoder/downloader.rb +54 -8
data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
data/lib/unicoder/replace_common_words.rb +20 -0
data/lib/unicoder.rb +1 -0
data/unicoder.gemspec +7 -5
metadata +50 -26
data/data/.keep +0 -0
data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
data/data/unicode/security/8.0.0/confusables.txt +0 -9274
data/spec/unicoder_spec.rb +0 -9

data/lib/unicoder/builders/name.rb ADDED Viewed

@@ -0,0 +1,101 @@
+module Unicoder
+  module Builder
+    # Builds the character name index from the :unicode_data, :name_aliases
+    # and :jamo data files.
+    #
+    # Keys written to @index:
+    #   NAMES     - filled through `assign` with one name per codepoint, then
+    #               post-processed by `replace_common_words!`
+    #   ALIASES   - key => { alias type (Symbol) => [alias, ...] }
+    #   CP_RANGES - name prefix => [[first, last], ...] codepoint ranges
+    #               (presumably the consumer appends the codepoint to the
+    #               prefix - verify against the reader of this index)
+    #   JAMO      - short names for initial / medial / final jamo
+    #
+    # `parse_file`, `assign` and `option` are not defined in this class; they
+    # are expected to come from the included Builder module.
+    class Name
+      include Builder
+      include ReplaceCommonWords
+      # Codepoint boundaries used to sort Jamo.txt entries into
+      # initial / medial / final lists in #parse!
+      JAMO_INITIAL = 4352 # 0x1100
+      JAMO_MEDIAL = 4449 # 0x1161
+      JAMO_FINAL = 4520 # 0x11A8
+      JAMO_END = 4697 # 0x1259
+      CJK = "CJK UNIFIED IDEOGRAPH-"
+      TANGUT = "TANGUT IDEOGRAPH-"
+      # Arguments handed to replace_common_words! for the NAMES index
+      REPLACE_COUNT = 500
+      REPLACE_BASE = ?[.ord
+      # Sets up the empty index plus the parser state (@words, @range_start).
+      def initialize_index
+        @index = {
+          NAMES: {},
+          ALIASES: {},
+          # HANGUL: [],
+          CP_RANGES: {
+            CJK => [], # filled while parsing
+            TANGUT => [], # filled while parsing
+            "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
+            "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
+            "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
+            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
+          },
+          # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
+          JAMO: {
+            INITIAL: [],
+            MEDIAL: [],
+            FINAL: [""],
+          },
+        }
+        @words = []
+        @range_start = nil
+      end
+      # Reads the three data files and fills @index.
+      #
+      # Raises ArgumentError when a "<...>" name in the unicode data is neither
+      # a range start, a range end following a start, nor "<control>".
+      def parse!
+        # With the charkeys option, ALIASES is keyed by the character itself
+        # instead of the integer codepoint
+        if option =~ /charkeys/
+          get_key = ->(codepoint){ [codepoint].pack("U*") }
+        else
+          get_key = -> (codepoint){ codepoint }
+        end
+        # The set of prefixes is fixed by initialize_index, so the matcher is
+        # built once instead of once per data line
+        range_name = Regexp.union(@index[:CP_RANGES].keys)
+        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
+          if line["name"][0] == "<" && line["name"][-1] == ">"
+            if line["name"] =~ /First/
+              # Remember the start, the matching "Last" line closes the range
+              @range_start = line["codepoint"].to_i(16)
+            elsif line["name"] =~ /Last/ && @range_start
+              case line["name"]
+              when /Hangul/
+                # currently not necessary
+                # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
+              when /CJK/
+                @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
+              when /Tangut/
+                @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
+              else
+                # no name
+                warn "ignoring range: #{line["name"]}"
+              end
+              @range_start = nil
+            elsif line["name"] != "<control>"
+              raise ArgumentError, "inconsistent range found in data, don't know what to do"
+            end
+          elsif line["name"] =~ range_name
+            # ignore, names containing one of the prefixes are covered by CP_RANGES
+          else
+            assign :NAMES, line["codepoint"].to_i(16), line["name"]
+            @words += line["name"].split
+          end
+        end
+        replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
+        parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
+          key = get_key[line["codepoint"].to_i(16)]
+          aliases_by_type = (@index[:ALIASES][key] ||= {})
+          (aliases_by_type[line["type"].to_sym] ||= []) << line["alias"]
+        end
+        parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
+          case line["codepoint"].to_i(16)
+          when JAMO_INITIAL...JAMO_MEDIAL
+            @index[:JAMO][:INITIAL] << line["short_name"]
+          when JAMO_MEDIAL...JAMO_FINAL
+            @index[:JAMO][:MEDIAL] << line["short_name"]
+          when JAMO_FINAL..JAMO_END
+            @index[:JAMO][:FINAL] << line["short_name"]
+          end
+        end
+      end
+    end
+  end
+end

data/lib/unicoder/builders/numeric_value.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Unicoder
+  module Builder
+    # Collects numeric values of codepoints into @index[:NUMBERS], reading the
+    # :unicode_data file first and the :unihan_numeric_values file second.
+    #
+    # `parse_file`, `assign` and `option` are not defined in this class; they
+    # are expected to come from the included Builder module.
+    class NumericValue
+      include Builder
+      # Starts with an empty NUMBERS index.
+      def initialize_index
+        @index = { NUMBERS: {} }
+      end
+      # Fills NUMBERS. Values containing a slash become a Rational, or stay a
+      # String when the stringfractions option is set; everything else is
+      # converted with to_i.
+      def parse!
+        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/ do |line|
+          raw = line["value"]
+          unless raw.empty?
+            codepoint = line["codepoint"].to_i(16)
+            number =
+              if raw =~ %r</>
+                option =~ /stringfractions/ ? "#{raw}" : raw.to_r
+              else
+                raw.to_i
+              end
+            assign :NUMBERS, codepoint, number
+          end
+        end
+        parse_file :unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/ do |line|
+          codepoint = line["codepoint"].to_i(16)
+          assign :NUMBERS, codepoint, line["value"].to_i
+        end
+      end
+    end
+  end
+end

data/lib/unicoder/builders/sequence_name.rb ADDED Viewed

@@ -0,0 +1,99 @@
+module Unicoder
+  module Builder
+    # Builds the index of names for codepoint sequences (named sequences,
+    # standardized variants, IVD sequences and the emoji sequence files).
+    #
+    # Keys written to @index:
+    #   SEQUENCES               - sequence key => name
+    #   SEQUENCES_NOT_QUALIFIED - emoji ZWJ sequences with some or all U+FE0F
+    #                             removed => name
+    #
+    # `parse_file`, `option` and `replace_common_words!` are not defined in
+    # this class; they are expected to come from the included modules.
+    class SequenceName
+      include Builder
+      include ReplaceCommonWords
+      # Arguments handed to replace_common_words!
+      REPLACE_COUNT = 100
+      REPLACE_BASE = ?{.ord
+      REPLACE_MIN_WORD_LENGTH = 3
+      # Sets up the two empty indexes and the word list used for replacement.
+      def initialize_index
+        @index = {
+          SEQUENCES: {},
+          SEQUENCES_NOT_QUALIFIED: {},
+        }
+        @words = []
+      end
+      # Stores a name for a sequence of codepoints.
+      #
+      # codepoints - Array of Integer codepoints
+      # value      - name String
+      # idx        - Hash to store into, defaults to @index[:SEQUENCES]
+      # combine:   - when the sequence already has a name: true appends
+      #              " / value" to it, false keeps the existing name
+      #
+      # The words of value are added to @word in every case.
+      def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
+        if option =~ /charkeys/
+          key = codepoints.pack("U*")
+        else
+          key = codepoints
+        end
+        # Look up the key that is actually stored: with charkeys this is the
+        # packed String, so testing for the codepoints Array would never match
+        if idx.key?(key)
+          if combine
+            idx[key] << " / #{value}"
+          else
+            # ignore new one
+          end
+        else
+          idx[key] = value
+        end
+        @words += value.split
+      end
+      # Reads all sequence data files in a fixed order (for sequences present in
+      # more than one file the first name wins, unless combine is used) and
+      # finally replaces common words in both indexes.
+      def parse!
+        parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :named_sequences_prov, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
+          name = "#{line["name"].strip} (#{line["variant"]})"
+          name << " [#{line["context"]}]" if line["context"] && !line["context"].empty?
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name, combine: true
+        end
+        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
+        end
+        parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"], combine: true
+        end
+        parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
+          name = "#{line["name"].strip} (#{line["variant"]})"
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
+          next if line["type"] == "Basic_Emoji"
+          # Names may contain \x{...} escapes, turn them into the actual character
+          name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
+          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+        end
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+          name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
+          codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
+          assign_codepoint codepoints, name
+          if codepoints.include?(0xFE0F)
+            # Build all combinations of VS16 present and missing
+            codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
+              if cur.include? 0xFE0F
+                acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
+              else
+                acc.map{|prev| prev + cur}
+              end
+            }.
+            # The unchanged sequence is already part of SEQUENCES
+            select {|sub_codepoints| sub_codepoints != codepoints }.
+            each { |sub_codepoints|
+              assign_codepoint sub_codepoints, name, @index[:SEQUENCES_NOT_QUALIFIED]
+            }
+          end
+        end
+        replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+      end
+    end
+  end
+end

data/lib/unicoder/builders/types.rb ADDED Viewed

@@ -0,0 +1,83 @@
+module Unicoder
+  module Builder
+    # Builds a codepoint => type number index from the :general_categories
+    # file and compresses it four times.
+    #
+    # `parse_file`, `assign` and `compress!` are not defined in this class;
+    # they are expected to come from the included modules.
+    class Types
+      include Builder
+      include MultiDimensionalArrayBuilder
+      # Codepoints reported as type 5 instead of 6 when their category is Cn
+      NONCHARACTERS = [
+          *0xFDD0..0xFDEF,
+          0xFFFE,  0xFFFF,
+         0x1FFFE, 0x1FFFF,
+         0x2FFFE, 0x2FFFF,
+         0x3FFFE, 0x3FFFF,
+         0x4FFFE, 0x4FFFF,
+         0x5FFFE, 0x5FFFF,
+         0x6FFFE, 0x6FFFF,
+         0x7FFFE, 0x7FFFF,
+         0x8FFFE, 0x8FFFF,
+         0x9FFFE, 0x9FFFF,
+         0xAFFFE, 0xAFFFF,
+         0xBFFFE, 0xBFFFF,
+         0xCFFFE, 0xCFFFF,
+         0xDFFFE, 0xDFFFF,
+         0xEFFFE, 0xEFFFF,
+         0xFFFFE, 0xFFFFF,
+        0x10FFFE, 0x10FFFF,
+      ]
+      # Sets up the index. The numbers assigned in #parse! presumably refer to
+      # positions in TYPE_NAMES - verify against the consumer of this index.
+      def initialize_index
+        type_names = %w[
+          Graphic
+          Format
+          Control
+          Private-use
+          Surrogate
+          Noncharacter
+          Reserved
+        ]
+        offsets = [
+          0x10000,
+          0x1000,
+          0x100,
+          0x10
+        ]
+        @index = {
+          TYPES: [],
+          TYPE_NAMES: type_names,
+          OFFSETS: offsets,
+        }
+      end
+      # Assigns a type to every codepoint listed in the data file (single
+      # codepoints as well as from..to ranges). Categories without a branch
+      # below are assigned nil.
+      def parse!
+        parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
+          first = line["from"].to_i(16)
+          codepoints = line["to"] ? (first..line["to"].to_i(16)) : [first]
+          codepoints.each{ |codepoint|
+            type =
+              case line["category"]
+              when "Cf", "Zl", "Zp" then 1
+              when "Cc" then 2
+              when "Co" then 3
+              when "Cs" then 4
+              when "Cn" then NONCHARACTERS.include?(codepoint) ? 5 : 6
+              end
+            assign :TYPES, codepoint, type
+          }
+        end
+        4.times{ compress! @index[:TYPES] }
+      end
+    end
+  end
+end

data/lib/unicoder/constants.rb CHANGED Viewed

@@ -1,29 +1,94 @@
-module Unicoder
-  VERSION = "0.1.0".freeze
+# frozen_string_literal: true
-  CURRENT_UNICODE_VERSION = "8.0.0".freeze
+module Unicoder
+  VERSION = "1.1.0"
   UNICODE_VERSIONS = %w[
-     6.3.0
-     7.0.0
-     8.0.0
+    16.0.0
+    15.1.0
+    15.0.0
+    14.0.0
+    13.0.0
+    12.1.0
+    12.0.0
+    11.0.0
+    10.0.0
      9.0.0
+     8.0.0
+     7.0.0
+     6.3.0
   ].freeze
-  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public".freeze
+  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
+  EMOJI_VERSIONS = %w[
+   16.0
+   15.1
+   15.0
+   14.0
+   13.1
+   13.0
+   12.1
+   12.0
+   11.0
+    5.0
+    4.0
+    3.0
+    2.0
+  ].freeze
+  EMOJI_RELATED_UNICODE_VERSIONS = {
+   "16.0" => "16.0.0",
+   "15.1" => "15.1.0",
+   "15.0" => "15.0.0",
+   "14.0" => "14.0.0",
+   "13.1" => "13.0.0",
+   "13.0" => "13.0.0",
+   "12.1" => "12.1.0",
+   "12.0" => "12.0.0",
+   "11.0" => "11.0.0",
+    "5.0" => "10.0.0",
+    "4.0" => "9.0.0",
+    "3.0" => "9.0.0",
+    "2.0" => "8.0.0",
+  }.freeze
+  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
+  IVD_VERSION = "2022-09-13"
+  CLDR_VERSION = "45"
+  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
   LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
   UNICODE_FILES = {
-    east_asian_width:         "/VERSION/ucd/EastAsianWidth.txt",
-    unicode_data:             "/VERSION/ucd/UnicodeData.txt",
-    name_aliases:             "/VERSION/ucd/NameAliases.txt",
-    confusables:              "/security/VERSION/confusables.txt",
-    blocks:                   "/VERSION/ucd/Blocks.txt",
-    scripts:                  "/VERSION/ucd/Scripts.txt",
-    script_extensions:        "/VERSION/ucd/ScriptExtensions.txt",
-    property_value_aliases:   "/VERSION/ucd/PropertyValueAliases.txt",
-    general_categories:       "/VERSION/ucd/extracted/DerivedGeneralCategory.txt",
+    east_asian_width:          "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
+    unicode_data:              "/UNICODE_VERSION/ucd/UnicodeData.txt",
+    name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
+    confusables:               "/security/UNICODE_VERSION/confusables.txt",
+    blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
+    scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
+    script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
+    property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
+    general_categories:        "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
+    unihan_numeric_values:     "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
+    jamo:                      "/UNICODE_VERSION/ucd/Jamo.txt",
+    named_sequences:           "/UNICODE_VERSION/ucd/NamedSequences.txt",
+    named_sequences_prov:      "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
+    standardized_variants:     "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
+    ivd_sequences:             "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
+    # emoji_data:                "/EMOJI_VERSION/ucd/emoji/",
+    emoji_data:                "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
+    emoji_sequences:           "/emoji/EMOJI_VERSION/emoji-sequences.txt",
+    # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
+    emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
+    emoji_zwj_sequences:       "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
+    emoji_test:                "/emoji/EMOJI_VERSION/emoji-test.txt",
+    # valid_subdivisions:        "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
+    valid_subdivisions:        "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
+    # ""
   }
 end

data/lib/unicoder/downloader.rb CHANGED Viewed

@@ -1,28 +1,74 @@
 require "open-uri"
 require "fileutils"
+require "zip"
 module Unicoder
   module Downloader
     def self.fetch(identifier,
         unicode_version: CURRENT_UNICODE_VERSION,
+        emoji_version: CURRENT_EMOJI_VERSION,
         destination_directory: LOCAL_DATA_DIRECTORY,
         destination: nil,
         filename: nil
       )
       filename = UNICODE_FILES[identifier.to_sym] || filename
       raise ArgumentError, "No valid file identifier or filename given" if !filename
-      filename.sub! 'VERSION', unicode_version
-      source = UNICODE_DATA_ENDPOINT + filename
-      destination ||= destination_directory + filename
-      open(source){ |f|
-        FileUtils.mkdir_p(File.dirname(destination))
-        File.write(destination, f.read)
-      }
+      filename = filename.dup
+      filename.sub! 'UNICODE_VERSION', unicode_version
+      filename.sub! 'EMOJI_VERSION', emoji_version
+      filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
+      if filename =~ /\A(https?|ftp):\/\//
+        source = filename
+        destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
+      else
+        source = UNICODE_DATA_ENDPOINT + filename
+        destination ||= destination_directory + filename
+      end
       puts "GET #{source} => #{destination}"
+      if source =~ %r[^(?<outer_path>.*).zip/(?<inner_path>.*)$]
+        # Too much magic, download unzip zip files
+        zip = true
+        source = $~[:outer_path] + ".zip"
+        inner_zip_filename = $~[:inner_path]
+        if destination =~ %r[^(?<outer_path>.*).zip/(?<inner_path>.*)$]
+          destination = $~[:outer_path] + ".zip"
+          destination_files = $~[:outer_path]
+        else
+          raise "uncoder bug"
+        end
+      else
+        zip = false
+      end
+      if File.exist?(destination)
+        puts "Skipping download of #{source} (already exists)"
+      else
+        URI.open(source){ |f|
+          FileUtils.mkdir_p(File.dirname(destination))
+          File.write(destination, f.read)
+        }
+      end
+      if zip
+        unzip(destination, [inner_zip_filename], destination_files)
+      end
     rescue => e
       $stderr.puts "#{e.class}: #{e.message}"
     end
+    # Extracts selected entries of a zip archive.
+    #
+    # archive         - path of the zip file
+    # files           - entry names to extract, entries with other names are skipped
+    # destination_dir - directory the entries are written to, created when needed
+    def self.unzip(archive, files, destination_dir)
+      Zip::File.open(archive) do |zip|
+        zip.each do |entry|
+          next unless files.include?(entry.name)
+          FileUtils.mkdir_p(destination_dir)
+          puts "Extract #{entry.name}"
+          target = destination_dir + "/#{entry.name}"
+          entry.extract(target)
+        end
+      end
+    end
   end
 end

data/lib/unicoder/multi_dimensional_array_builder.rb CHANGED Viewed

@@ -59,6 +59,28 @@ module Unicoder
         end
       }
     end
+    # Strips trailing nil entries from the nested arrays reachable from `index`.
+    #
+    # Every element of `index` that is an Array is trimmed, and so are the
+    # Arrays nested up to three further levels below it (plane, row, byte,
+    # nibble). Elements that are not Arrays are left untouched. All changes
+    # happen in place.
+    #
+    # index - collection to walk, defaults to @index
+    #
+    # Returns the result of index.each.
+    def remove_trailing_nils!(index = @index)
+      # The emptiness check is required: [][-1] is nil as well, so popping
+      # "while the last element is nil" never terminates once an array is empty
+      # (which happens for every array consisting of nils only)
+      trim = ->(node, levels_left){
+        next unless node.is_a?(Array)
+        node.pop while !node.empty? && node[-1].nil?
+        node.each{ |child| trim.(child, levels_left - 1) } if levels_left > 1
+      }
+      index.each{ |plane| trim.(plane, 4) }
+    end
   end
-end
+end

data/lib/unicoder/replace_common_words.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require "json"
+module Unicoder
+  # Mixin for builders: shortens the strings of an index by replacing frequent
+  # words with single characters. Expects the including object to keep its
+  # data in the @index Hash.
+  module ReplaceCommonWords
+    # Replaces the most common words in all values of @index[which_index].
+    #
+    # which_index     - key of the index in @index whose values get rewritten in place
+    # words           - list of words (with repetitions) used to determine frequency
+    # count           - number of words to replace
+    # base            - codepoint of the character standing in for the most
+    #                   common word, the following words use the following codepoints
+    # min_word_length - words shorter than this are never replaced
+    #
+    # Stores base in @index[:REPLACE_BASE] and the chosen words in
+    # @index[:COMMON_WORDS]. Only occurrences followed by a space are
+    # replaced, and the space is removed along with the word.
+    def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
+      puts "Starting to replace the #{count} most common words"
+      @index[:REPLACE_BASE] = base
+      candidates = words.select{ |word| word.size >= min_word_length }
+      ranked = candidates.tally.max_by(count){ |_word, occurrences| occurrences }
+      @index[:COMMON_WORDS] = ranked.map(&:first)
+      # One [search string, stand-in character] pair per common word, in rank order
+      substitutions = @index[:COMMON_WORDS].each_with_index.map{ |word, position|
+        [word + " ", [base + position].pack("U")]
+      }
+      @index[which_index].each{ |_, name|
+        substitutions.each{ |search, stand_in| name.gsub! search, stand_in }
+      }
+    end
+  end
+end

data/lib/unicoder.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
 require_relative "unicoder/downloader"
 require_relative "unicoder/builder"
 require_relative "unicoder/multi_dimensional_array_builder"
+require_relative "unicoder/replace_common_words"
 if defined?(Rake)
   Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))

data/unicoder.gemspec CHANGED Viewed

@@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants"
 Gem::Specification.new do |gem|
   gem.name          = "unicoder"
   gem.version       = Unicoder::VERSION
-  gem.summary       = "Create specialized indexes for Unicode data lookup"
-  gem.description   = "Generate specialized indexes for Unicode data lookup"
+  gem.summary       = "Creates specialized indexes for Unicode data lookup"
+  gem.description   = "Generates specialized indexes for Unicode data lookup"
   gem.authors       = ["Jan Lelis"]
-  gem.email         = ["mail@janlelis.de"]
+  gem.email         = ["hi@ruby.consulting"]
   gem.homepage      = "https://github.com/janlelis/unicoder"
   gem.license       = "MIT"
-  gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ }
+  gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^(pkg|data)/ && path !~ /(marshal|mjs|json)(.gz)?$/ }
   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
-  gem.required_ruby_version = "~> 2.0"
+  gem.required_ruby_version = ">= 3.0", "< 4.0"
   gem.add_dependency "rationalist", "~> 2.0"
+  gem.add_dependency "rubyzip", "~> 1.2"
+  gem.add_dependency "oga", "~> 2.9"
 end