unicoder 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +6 -1
- data/.travis.yml +13 -13
- data/CHANGELOG.md +19 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +99 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +35 -5
- data/bin/unicoder +1 -1
- data/lib/unicoder/builder.rb +77 -15
- data/lib/unicoder/builders/categories.rb +7 -12
- data/lib/unicoder/builders/display_width.rb +28 -7
- data/lib/unicoder/builders/emoji.rb +97 -0
- data/lib/unicoder/builders/name.rb +75 -0
- data/lib/unicoder/builders/numeric_value.rb +30 -0
- data/lib/unicoder/builders/sequence_name.rb +72 -0
- data/lib/unicoder/builders/types.rb +83 -0
- data/lib/unicoder/constants.rb +81 -16
- data/lib/unicoder/downloader.rb +54 -8
- data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
- data/unicoder.gemspec +7 -5
- metadata +48 -25
- data/data/.keep +0 -0
- data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
- data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
- data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
- data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
- data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
- data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
- data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
- data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
- data/data/unicode/security/8.0.0/confusables.txt +0 -9274
- data/spec/unicoder_spec.rb +0 -9
| @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            module Unicoder
              module Builder
                # Builder that fills the NUMBERS index with one numeric value per
                # codepoint, read from the :unicode_data and :unihan_numeric_values
                # files.
                class NumericValue
                  include Builder

                  # Starts with an empty codepoint => number mapping.
                  def initialize_index
                    @index = { NUMBERS: {} }
                  end

                  # Reads both source files and records every numeric value found.
                  #
                  # For :unicode_data the value is the field that follows the codepoint
                  # and seven further semicolon-terminated fields. Values containing a
                  # slash are fractions: they are stored as Rational, or kept as the
                  # original string when the builder option matches /stringfractions/.
                  # Every other value is stored as Integer.
                  def parse!
                    parse_file(:unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/) do |line|
                      raw = line["value"]
                      next if raw.empty?

                      number =
                        if !raw.include?("/")
                          raw.to_i
                        elsif option =~ /stringfractions/
                          raw.to_s
                        else
                          raw.to_r
                        end

                      assign :NUMBERS, line["codepoint"].to_i(16), number
                    end

                    # Unihan lines look like "U+<hex> <property> <value>"; the value is
                    # converted with to_i.
                    parse_file(:unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/) do |line|
                      assign :NUMBERS, line["codepoint"].to_i(16), line["value"].to_i
                    end
                  end
                end
              end
            end
         | 
| @@ -0,0 +1,72 @@ | |
| 1 | 
            +
            module Unicoder
              module Builder
                # Builder that fills the SEQUENCES index with names for codepoint
                # sequences (named sequences, standardized variants, IVD sequences and
                # the emoji sequence files).
                class SequenceName
                  include Builder

                  # Starts with an empty sequence => name mapping.
                  def initialize_index
                    @index = {
                      SEQUENCES: {},
                    }
                  end

                  # Stores +value+ as the name of the sequence +codepoints+.
                  #
                  # codepoints - Array of Integer codepoints forming the sequence.
                  # value      - String name to store.
                  # idx        - Hash to write into (defaults to the SEQUENCES index).
                  # combine    - when the sequence already has a name: true appends
                  #              " / <value>" to it, false keeps the existing name.
                  #
                  # When the builder option matches /charkeys/ the hash key is the
                  # sequence packed into a UTF-8 string, otherwise the codepoint array.
                  def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
                    key = option =~ /charkeys/ ? codepoints.pack("U*") : codepoints

                    # The existence check must use the same key that is written below;
                    # checking the raw codepoint array never matches in charkeys mode.
                    if idx.has_key?(key)
                      idx[key] << " / #{value}" if combine
                      # without combine: the first name wins, the new one is ignored
                    else
                      idx[key] = value
                    end
                  end

                  # Parses all sequence sources in a fixed order. Because existing
                  # entries are never replaced, sources parsed earlier take precedence.
                  def parse!
                    parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
                      assign_codepoint hex_codepoints(line["codepoints"]), line["name"]
                    end

                    parse_file :named_sequences_prov, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
                      assign_codepoint hex_codepoints(line["codepoints"]), line["name"]
                    end

                    # Lines carrying a "# name" comment: "<name> (<variant>) [<context>]"
                    parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
                      name = "#{line["name"].strip} (#{line["variant"]})"
                      name << " [#{line["context"]}]" if line["context"] && !line["context"].empty?
                      assign_codepoint hex_codepoints(line["codepoints"]), name, combine: true
                    end

                    # Second pass over the same file for lines ending in ";"
                    parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
                      assign_codepoint hex_codepoints(line["codepoints"]), line["name"]
                    end

                    parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
                      assign_codepoint hex_codepoints(line["codepoints"]), line["name"], combine: true
                    end

                    parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
                      name = "#{line["name"].strip} (#{line["variant"]})"
                      assign_codepoint hex_codepoints(line["codepoints"]), name
                    end

                    parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
                      next if line["type"] == "Basic_Emoji"
                      assign_codepoint hex_codepoints(line["codepoints"]), emoji_name(line["name"])
                    end

                    parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
                      assign_codepoint hex_codepoints(line["codepoints"]), emoji_name(line["name"])
                    end
                  end

                  private

                  # Turns a whitespace-separated list of hex codepoints into Integers.
                  def hex_codepoints(field)
                    field.split.map{ |cp| cp.to_i(16) }
                  end

                  # Replaces \x{HEX} escapes with the character they denote and
                  # upcases the resulting name.
                  def emoji_name(raw)
                    raw.gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
                  end
                end
              end
            end
         | 
| 72 | 
            +
             | 
| @@ -0,0 +1,83 @@ | |
| 1 | 
            +
            module Unicoder
              module Builder
                # Builder that fills the TYPES index with a per-codepoint type number
                # derived from the general category file. The numbers correspond to
                # positions in TYPE_NAMES.
                class Types
                  include Builder
                  include MultiDimensionalArrayBuilder

                  # U+FDD0..U+FDEF plus the last two codepoints of each of the 17 planes.
                  NONCHARACTERS = [
                    *0xFDD0..0xFDEF,
                    0xFFFE, 0xFFFF,
                    0x1FFFE, 0x1FFFF,
                    0x2FFFE, 0x2FFFF,
                    0x3FFFE, 0x3FFFF,
                    0x4FFFE, 0x4FFFF,
                    0x5FFFE, 0x5FFFF,
                    0x6FFFE, 0x6FFFF,
                    0x7FFFE, 0x7FFFF,
                    0x8FFFE, 0x8FFFF,
                    0x9FFFE, 0x9FFFF,
                    0xAFFFE, 0xAFFFF,
                    0xBFFFE, 0xBFFFF,
                    0xCFFFE, 0xCFFFF,
                    0xDFFFE, 0xDFFFF,
                    0xEFFFE, 0xEFFFF,
                    0xFFFFE, 0xFFFFF,
                    0x10FFFE, 0x10FFFF,
                  ]

                  # Sets up the empty TYPES table together with the type names and the
                  # OFFSETS list stored alongside it.
                  def initialize_index
                    @index = {
                      TYPES: [],
                      TYPE_NAMES: %w[
                        Graphic
                        Format
                        Control
                        Private-use
                        Surrogate
                        Noncharacter
                        Reserved
                      ],
                      OFFSETS: [0x10000, 0x1000, 0x100, 0x10],
                    }
                  end

                  # Assigns a type to every codepoint (or codepoint range) listed in
                  # the general category file, then compresses the table four times.
                  def parse!
                    parse_file(:general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/) do |line|
                      first = line["from"].to_i(16)
                      last  = line["to"] ? line["to"].to_i(16) : first
                      category = line["category"]

                      (first..last).each do |codepoint|
                        assign :TYPES, codepoint, type_for(category, codepoint)
                      end
                    end

                    4.times{ compress! @index[:TYPES] }
                  end

                  private

                  # Returns the type number for a general category, or nil for any
                  # category not listed here (nothing is assigned a 0 explicitly;
                  # presumably readers treat nil as "Graphic" - confirm in the consumer).
                  def type_for(category, codepoint)
                    case category
                    when "Cf", "Zl", "Zp" then 1
                    when "Cc"             then 2
                    when "Co"             then 3
                    when "Cs"             then 4
                    when "Cn"             then NONCHARACTERS.include?(codepoint) ? 5 : 6
                    end
                  end
                end
              end
            end
         | 
    
        data/lib/unicoder/constants.rb
    CHANGED
    
    | @@ -1,29 +1,94 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
              VERSION = "0.1.0".freeze
         | 
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 3 2 |  | 
| 4 | 
            -
             | 
| 3 | 
            +
            module Unicoder
              VERSION = "1.0.0"

              # Supported Unicode versions, newest first.
              UNICODE_VERSIONS = %w[
                16.0.0
                15.1.0
                15.0.0
                14.0.0
                13.0.0
                12.1.0
                12.0.0
                11.0.0
                10.0.0
                 9.0.0
                 8.0.0
                 7.0.0
                 6.3.0
              ].freeze

              CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first

              # Supported Emoji versions, newest first.
              EMOJI_VERSIONS = %w[
               16.0
               15.1
               15.0
               14.0
               13.1
               13.0
               12.1
               12.0
               11.0
                5.0
                4.0
                3.0
                2.0
              ].freeze

              # Emoji version => Unicode version whose directory replaces the
              # EMOJI_RELATED_VERSION placeholder in UNICODE_FILES paths.
              EMOJI_RELATED_UNICODE_VERSIONS = {
               "16.0" => "16.0.0",
               "15.1" => "15.1.0",
               "15.0" => "15.0.0",
               "14.0" => "14.0.0",
               "13.1" => "13.0.0",
               "13.0" => "13.0.0",
               "12.1" => "12.1.0",
               "12.0" => "12.0.0",
               "11.0" => "11.0.0",
                "5.0" => "10.0.0",
                "4.0" => "9.0.0",
                "3.0" => "9.0.0",
                "2.0" => "8.0.0",
              }.freeze

              CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first

              IVD_VERSION = "2022-09-13"

              CLDR_VERSION = "45"

              # Base location prepended to every UNICODE_FILES entry that is not
              # already an absolute URL.
              UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"

              LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze

              # File identifier => path below UNICODE_DATA_ENDPOINT, or a full URL.
              # UNICODE_VERSION, EMOJI_VERSION and EMOJI_RELATED_VERSION are
              # placeholders filled in by the downloader. A ".zip/" segment names a
              # member inside a zip archive. Frozen like the other collection
              # constants in this module.
              UNICODE_FILES = {
                east_asian_width:          "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
                unicode_data:              "/UNICODE_VERSION/ucd/UnicodeData.txt",
                name_aliases:              "/UNICODE_VERSION/ucd/NameAliases.txt",
                confusables:               "/security/UNICODE_VERSION/confusables.txt",
                blocks:                    "/UNICODE_VERSION/ucd/Blocks.txt",
                scripts:                   "/UNICODE_VERSION/ucd/Scripts.txt",
                script_extensions:         "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
                property_value_aliases:    "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
                general_categories:        "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
                unihan_numeric_values:     "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
                jamo:                      "/UNICODE_VERSION/ucd/Jamo.txt",
                named_sequences:           "/UNICODE_VERSION/ucd/NamedSequences.txt",
                named_sequences_prov:      "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
                standardized_variants:     "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
                ivd_sequences:             "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
                # Alternative location, currently unused:
                # emoji_data:                "/EMOJI_VERSION/ucd/emoji/",
                emoji_data:                "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
                emoji_sequences:           "/emoji/EMOJI_VERSION/emoji-sequences.txt",
                # Alternative location, currently unused:
                # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
                emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
                emoji_zwj_sequences:       "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
                emoji_test:                "/emoji/EMOJI_VERSION/emoji-test.txt",
                # Alternative location, currently unused:
                # valid_subdivisions:        "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
                valid_subdivisions:        "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
              }.freeze
            end
         | 
| 29 94 |  | 
    
        data/lib/unicoder/downloader.rb
    CHANGED
    
    | @@ -1,28 +1,74 @@ | |
| 1 1 | 
             
            require "open-uri"
         | 
| 2 2 | 
             
            require "fileutils"
         | 
| 3 | 
            +
            require "zip"
         | 
| 3 4 |  | 
| 4 5 | 
             
            module Unicoder
              # Fetches Unicode data files into the local data directory.
              module Downloader
                # Downloads the file registered under +identifier+ in UNICODE_FILES
                # (or the explicitly given +filename+) unless it already exists
                # locally.
                #
                # identifier            - key into UNICODE_FILES (anything responding to to_sym).
                # unicode_version       - replaces the UNICODE_VERSION placeholder.
                # emoji_version         - replaces the EMOJI_VERSION placeholder and selects
                #                         the EMOJI_RELATED_VERSION replacement.
                # destination_directory - local root the remote path is mirrored under.
                # destination           - explicit local target path (overrides the mirror path).
                # filename              - remote path/URL used when identifier is unknown.
                #
                # A path of the form "<archive>.zip/<member>" downloads the archive
                # and extracts only that member next to it.
                #
                # Any StandardError (including the ArgumentErrors raised here) is
                # reported on $stderr instead of being propagated.
                def self.fetch(identifier,
                    unicode_version: CURRENT_UNICODE_VERSION,
                    emoji_version: CURRENT_EMOJI_VERSION,
                    destination_directory: LOCAL_DATA_DIRECTORY,
                    destination: nil,
                    filename: nil
                  )
                  filename = UNICODE_FILES[identifier.to_sym] || filename
                  raise ArgumentError, "No valid file identifier or filename given" if !filename

                  # Work on a copy: the templates in UNICODE_FILES may be frozen.
                  filename = filename.dup
                  filename.sub! 'UNICODE_VERSION', unicode_version
                  filename.sub! 'EMOJI_VERSION', emoji_version

                  # Only look up the related Unicode version when the placeholder is
                  # present, so an unmapped emoji version does not break unrelated files.
                  if filename.include?('EMOJI_RELATED_VERSION')
                    related_version = EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
                    if !related_version
                      raise ArgumentError, "No related Unicode version known for Emoji version #{emoji_version}"
                    end
                    filename.sub! 'EMOJI_RELATED_VERSION', related_version
                  end

                  if filename =~ /\A(https?|ftp):\/\//
                    source = filename
                    destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
                  else
                    source = UNICODE_DATA_ENDPOINT + filename
                    destination ||= destination_directory + filename
                  end

                  puts "GET #{source} => #{destination}"

                  # "<archive>.zip/<member>": download the archive, extract the member.
                  zip_member = %r[\A(?<outer_path>.*)\.zip/(?<inner_path>.*)\z]
                  source_match = source.match(zip_member)
                  zip = !source_match.nil?

                  if zip
                    destination_match = destination.match(zip_member)
                    raise "unicoder bug" if !destination_match

                    source = source_match[:outer_path] + ".zip"
                    inner_zip_filename = source_match[:inner_path]
                    destination = destination_match[:outer_path] + ".zip"
                    destination_files = destination_match[:outer_path]
                  end

                  if File.exist?(destination)
                    puts "Skipping download of #{source} (already exists)"
                  else
                    URI.open(source){ |f|
                      FileUtils.mkdir_p(File.dirname(destination))
                      # Binary write: the payload may be a zip archive.
                      File.binwrite(destination, f.read)
                    }
                  end

                  if zip
                    unzip(destination, [inner_zip_filename], destination_files)
                  end
                rescue => e
                  $stderr.puts "#{e.class}: #{e.message}"
                end

                # Extracts the entries of +archive+ whose names are listed in +files+
                # into +destination_dir+ (created on demand).
                def self.unzip(archive, files, destination_dir)
                  Zip::File.open(archive) do |zip|
                    zip.each do |file_in_zip|
                      if files.include?(file_in_zip.name)
                        FileUtils.mkdir_p(destination_dir)
                        puts "Extract #{file_in_zip.name}"
                        file_in_zip.extract(destination_dir + "/#{file_in_zip.name}")
                      end
                    end
                  end
                end
              end
            end
         | 
| @@ -59,6 +59,28 @@ module Unicoder | |
| 59 59 | 
             
                    end
         | 
| 60 60 | 
             
                  }
         | 
| 61 61 | 
             
                end
         | 
| 62 | 
            -
             | 
| 62 | 
            +
             | 
| 63 | 
            +
                # Strips trailing nil entries, in place, from the arrays nested up to
                # four levels below +index+ (the elements of index, their elements,
                # and so on). Non-array elements are left alone; +index+ itself is
                # not trimmed. Returns +index+.
                def remove_trailing_nils!(index = @index)
                  strip = lambda { |container, levels_left|
                    container.each{ |child|
                      next unless child.is_a?(Array)

                      # The emptiness check is essential: [][-1] is nil, so popping
                      # "while the last element is nil" alone never terminates on an
                      # empty or all-nil array.
                      child.pop while !child.empty? && child.last.nil?
                      strip.call(child, levels_left - 1) if levels_left > 1
                    }
                  }

                  strip.call(index, 4)
                end
         | 
| 63 85 | 
             
              end
         | 
| 64 | 
            -
            end
         | 
| 86 | 
            +
            end
         | 
    
        data/unicoder.gemspec
    CHANGED
    
    | @@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants" | |
| 5 5 | 
             
            Gem::Specification.new do |gem|
         | 
| 6 6 | 
             
              gem.name          = "unicoder"
         | 
| 7 7 | 
             
              gem.version       = Unicoder::VERSION
         | 
| 8 | 
            -
              gem.summary       = " | 
| 9 | 
            -
              gem.description   = " | 
| 8 | 
            +
              gem.summary       = "Creates specialized indexes for Unicode data lookup"
         | 
| 9 | 
            +
              gem.description   = "Generates specialized indexes for Unicode data lookup"
         | 
| 10 10 | 
             
              gem.authors       = ["Jan Lelis"]
         | 
| 11 | 
            -
              gem.email         = [" | 
| 11 | 
            +
              gem.email         = ["hi@ruby.consulting"]
         | 
| 12 12 | 
             
              gem.homepage      = "https://github.com/janlelis/unicoder"
         | 
| 13 13 | 
             
              gem.license       = "MIT"
         | 
| 14 14 |  | 
| 15 | 
            -
              gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ }
         | 
| 15 | 
            +
              gem.files         = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^(pkg|data)/ && path !~ /(marshal|mjs|json)(.gz)?$/ }
         | 
| 16 16 | 
             
              gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
         | 
| 17 17 | 
             
              gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
         | 
| 18 18 | 
             
              gem.require_paths = ["lib"]
         | 
| 19 19 |  | 
| 20 | 
            -
              gem.required_ruby_version = " | 
| 20 | 
            +
              gem.required_ruby_version = ">= 2.0", "< 4.0"
         | 
| 21 21 | 
             
              gem.add_dependency "rationalist", "~> 2.0"
         | 
| 22 | 
            +
              gem.add_dependency "rubyzip", "~> 1.2"
         | 
| 23 | 
            +
              gem.add_dependency "oga", "~> 2.9"
         | 
| 22 24 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: unicoder
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version:  | 
| 4 | 
            +
              version: 1.0.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jan Lelis
         | 
| 8 | 
            -
            autorequire: | 
| 8 | 
            +
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2024-10-04 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rationalist
         | 
| @@ -24,9 +24,37 @@ dependencies: | |
| 24 24 | 
             
                - - "~>"
         | 
| 25 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 26 26 | 
             
                    version: '2.0'
         | 
| 27 | 
            -
             | 
| 27 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 28 | 
            +
              name: rubyzip
         | 
| 29 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 | 
            +
                requirements:
         | 
| 31 | 
            +
                - - "~>"
         | 
| 32 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            +
                    version: '1.2'
         | 
| 34 | 
            +
              type: :runtime
         | 
| 35 | 
            +
              prerelease: false
         | 
| 36 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 | 
            +
                requirements:
         | 
| 38 | 
            +
                - - "~>"
         | 
| 39 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            +
                    version: '1.2'
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: oga
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - "~>"
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: '2.9'
         | 
| 48 | 
            +
              type: :runtime
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - "~>"
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: '2.9'
         | 
| 55 | 
            +
            description: Generates specialized indexes for Unicode data lookup
         | 
| 28 56 | 
             
            email:
         | 
| 29 | 
            -
            -  | 
| 57 | 
            +
            - hi@ruby.consulting
         | 
| 30 58 | 
             
            executables:
         | 
| 31 59 | 
             
            - unicoder
         | 
| 32 60 | 
             
            extensions: []
         | 
| @@ -37,57 +65,52 @@ files: | |
| 37 65 | 
             
            - CHANGELOG.md
         | 
| 38 66 | 
             
            - CODE_OF_CONDUCT.md
         | 
| 39 67 | 
             
            - Gemfile
         | 
| 68 | 
            +
            - Gemfile.lock
         | 
| 40 69 | 
             
            - MIT-LICENSE.txt
         | 
| 41 70 | 
             
            - README.md
         | 
| 42 71 | 
             
            - Rakefile
         | 
| 43 72 | 
             
            - bin/unicoder
         | 
| 44 | 
            -
            - data/.keep
         | 
| 45 | 
            -
            - data/unicode/8.0.0/ucd/Blocks.txt
         | 
| 46 | 
            -
            - data/unicode/8.0.0/ucd/EastAsianWidth.txt
         | 
| 47 | 
            -
            - data/unicode/8.0.0/ucd/NameAliases.txt
         | 
| 48 | 
            -
            - data/unicode/8.0.0/ucd/PropertyValueAliases.txt
         | 
| 49 | 
            -
            - data/unicode/8.0.0/ucd/ScriptExtensions.txt
         | 
| 50 | 
            -
            - data/unicode/8.0.0/ucd/Scripts.txt
         | 
| 51 | 
            -
            - data/unicode/8.0.0/ucd/UnicodeData.txt
         | 
| 52 | 
            -
            - data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt
         | 
| 53 | 
            -
            - data/unicode/security/8.0.0/confusables.txt
         | 
| 54 73 | 
             
            - lib/unicoder.rb
         | 
| 55 74 | 
             
            - lib/unicoder/builder.rb
         | 
| 56 75 | 
             
            - lib/unicoder/builders/blocks.rb
         | 
| 57 76 | 
             
            - lib/unicoder/builders/categories.rb
         | 
| 58 77 | 
             
            - lib/unicoder/builders/confusable.rb
         | 
| 59 78 | 
             
            - lib/unicoder/builders/display_width.rb
         | 
| 79 | 
            +
            - lib/unicoder/builders/emoji.rb
         | 
| 80 | 
            +
            - lib/unicoder/builders/name.rb
         | 
| 81 | 
            +
            - lib/unicoder/builders/numeric_value.rb
         | 
| 60 82 | 
             
            - lib/unicoder/builders/scripts.rb
         | 
| 83 | 
            +
            - lib/unicoder/builders/sequence_name.rb
         | 
| 84 | 
            +
            - lib/unicoder/builders/types.rb
         | 
| 61 85 | 
             
            - lib/unicoder/constants.rb
         | 
| 62 86 | 
             
            - lib/unicoder/downloader.rb
         | 
| 63 87 | 
             
            - lib/unicoder/multi_dimensional_array_builder.rb
         | 
| 64 88 | 
             
            - lib/unicoder/tasks.rake
         | 
| 65 | 
            -
            - spec/unicoder_spec.rb
         | 
| 66 89 | 
             
            - unicoder.gemspec
         | 
| 67 90 | 
             
            homepage: https://github.com/janlelis/unicoder
         | 
| 68 91 | 
             
            licenses:
         | 
| 69 92 | 
             
            - MIT
         | 
| 70 93 | 
             
            metadata: {}
         | 
| 71 | 
            -
            post_install_message: | 
| 94 | 
            +
            post_install_message:
         | 
| 72 95 | 
             
            rdoc_options: []
         | 
| 73 96 | 
             
            require_paths:
         | 
| 74 97 | 
             
            - lib
         | 
| 75 98 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 76 99 | 
             
              requirements:
         | 
| 77 | 
            -
              - - " | 
| 100 | 
            +
              - - ">="
         | 
| 78 101 | 
             
                - !ruby/object:Gem::Version
         | 
| 79 102 | 
             
                  version: '2.0'
         | 
| 103 | 
            +
              - - "<"
         | 
| 104 | 
            +
                - !ruby/object:Gem::Version
         | 
| 105 | 
            +
                  version: '4.0'
         | 
| 80 106 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 81 107 | 
             
              requirements:
         | 
| 82 108 | 
             
              - - ">="
         | 
| 83 109 | 
             
                - !ruby/object:Gem::Version
         | 
| 84 110 | 
             
                  version: '0'
         | 
| 85 111 | 
             
            requirements: []
         | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
            signing_key: 
         | 
| 112 | 
            +
            rubygems_version: 3.5.21
         | 
| 113 | 
            +
            signing_key:
         | 
| 89 114 | 
             
            specification_version: 4
         | 
| 90 | 
            -
            summary:  | 
| 91 | 
            -
            test_files:
         | 
| 92 | 
            -
            - spec/unicoder_spec.rb
         | 
| 93 | 
            -
            has_rdoc: 
         | 
| 115 | 
            +
            summary: Creates specialized indexes for Unicode data lookup
         | 
| 116 | 
            +
            test_files: []
         | 
    
        data/data/.keep
    DELETED
    
    | 
            File without changes
         |