RubyGems - unicoder - Versions diffs - 0.1.0 → 1.0.0 - Mend

unicoder 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +5 -5
data/.gitignore +6 -1
data/.travis.yml +13 -13
data/CHANGELOG.md +19 -1
data/Gemfile +2 -0
data/Gemfile.lock +99 -0
data/MIT-LICENSE.txt +1 -1
data/README.md +35 -5
data/bin/unicoder +1 -1
data/lib/unicoder/builder.rb +77 -15
data/lib/unicoder/builders/categories.rb +7 -12
data/lib/unicoder/builders/display_width.rb +28 -7
data/lib/unicoder/builders/emoji.rb +97 -0
data/lib/unicoder/builders/name.rb +75 -0
data/lib/unicoder/builders/numeric_value.rb +30 -0
data/lib/unicoder/builders/sequence_name.rb +72 -0
data/lib/unicoder/builders/types.rb +83 -0
data/lib/unicoder/constants.rb +81 -16
data/lib/unicoder/downloader.rb +54 -8
data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
data/unicoder.gemspec +7 -5
metadata +48 -25
data/data/.keep +0 -0
data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
data/data/unicode/security/8.0.0/confusables.txt +0 -9274
data/spec/unicoder_spec.rb +0 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: f79eb48ad06b13b61fc4ceb7fc5e176ee4e9e984
-  data.tar.gz: 94a62eb108e01e1d7da774b58352ab0585235bc7
+SHA256:
+  metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
+  data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
 SHA512:
-  metadata.gz: 01714742c72568ab92a9c3df0b700f3918e32482b7f658da8f099e2cfb54359e098e90fa1caa72a343cbdf2ede36081a9c01a6d65ee76cee841e65b87c9083ad
-  data.tar.gz: dd5b55100962d9408a503b338ebf25062c3dee7dc1ff9ceaccd97e30d57f97d131191ef96ce266e267cc729d70e6a0860702f22fbb1c9a6e4b512547ff1b5805
+  metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
+  data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1

data/.gitignore CHANGED Viewed

@@ -1,3 +1,8 @@
 Gemfile.lock
 /pkg
-/data
+/data*
+*.marshal
+*.marshal.gz
+*.json
+*.mjs
+/old-data

data/.travis.yml CHANGED Viewed

@@ -1,20 +1,20 @@
 sudo: false
 language: ruby
-script: bundle exec ruby spec/unicoder_spec.rb
 rvm:
-- 2.3.0
-- 2.2
-- 2.1
-- 2.0
+- 2.7
+- 2.6
+- 2.5
+- 2.4
+- 2.3
 - ruby-head
-- rbx-2
-- jruby-head
-- jruby-9000
-cache:
-- bundler
+- jruby-9.2.9.0
+- truffleruby
-# matrix:
+matrix:
+  allow_failures:
+    - rvm: 2.3
+    - rvm: ruby-head
+    - rvm: jruby-2.9.2.0
+    - rvm: truffleruby
 #   fast_finish: true

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,23 @@
 ## CHANGELOG
+### 1.0.0
+With the first 1.0 release, unicoder supports 10 indexes:
+- blocks
+- categories
+- confusable
+- display_width
+- emoji
+- name
+- numeric_value
+- scripts
+- sequence_name
+- types
+All indexes can be built in `marshal` format (Ruby's internal
+serialization format) and some now support `esm` (JavaScript module)
 ### 0.1.0
-* WIP
+* Initial release

data/Gemfile CHANGED Viewed

@@ -3,3 +3,5 @@ source 'https://rubygems.org'
 gemspec
 gem 'minitest'
+gem 'rake'
+gem 'irbtools', require: "irbtools/binding"

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,99 @@
+PATH
+  remote: .
+  specs:
+    unicoder (1.0.0)
+      oga (~> 2.9)
+      rationalist (~> 2.0)
+      rubyzip (~> 1.2)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ansi (1.5.0)
+    ast (2.4.2)
+    cd (1.0.2)
+    clipboard (2.0.0)
+    code (0.9.4)
+      coderay (~> 1.1)
+      method_source (>= 0.9, < 2.0)
+    coderay (1.1.3)
+    core_docs (0.9.11)
+      yard (~> 0.9.11)
+    debugging (2.1.0)
+      paint (>= 0.9, < 3.0)
+    every_day_irb (2.2.0)
+      cd (~> 1.0)
+    fancy_irb (2.1.2)
+      irb (>= 1.7, < 2.0)
+      paint (>= 0.9, < 3.0)
+      unicode-display_width (>= 2.5)
+    ffi (1.17.0)
+    hirb (0.7.3)
+    interactive_editor (0.0.12)
+      spoon (~> 0.0.6)
+    io-console (0.7.2)
+    irb (1.14.1)
+      rdoc (>= 4.0.0)
+      reline (>= 0.4.2)
+    irbtools (4.1.0)
+      clipboard (>= 1.4, < 3.0)
+      code (>= 0.9.4, < 2.0)
+      coderay (~> 1.1)
+      core_docs (~> 0.9.11)
+      debugging (~> 2.1)
+      every_day_irb (~> 2.2)
+      fancy_irb (~> 2.1)
+      hirb (~> 0.7, >= 0.7.3)
+      interactive_editor (~> 0.0, >= 0.0.12)
+      irb (>= 1.13.0, < 1.15)
+      looksee (~> 5.0)
+      methodfinder (~> 2.2, >= 2.2.5)
+      object_shadow (~> 1.1)
+      os (~> 1.1, >= 1.1.4)
+      paint (>= 0.9, < 3.0)
+      ruby_engine (~> 2.0)
+      ruby_version (~> 1.0)
+      wirb (~> 2.0, >= 2.2.1)
+    looksee (5.0.0)
+    method_source (1.1.0)
+    methodfinder (2.2.5)
+    minitest (5.25.1)
+    object_shadow (1.1.1)
+    oga (2.15)
+      ast
+      ruby-ll (~> 2.1)
+    os (1.1.4)
+    paint (2.3.0)
+    psych (5.1.2)
+      stringio
+    rake (13.2.1)
+    rationalist (2.0.1)
+    rdoc (6.7.0)
+      psych (>= 4.0.0)
+    reline (0.5.10)
+      io-console (~> 0.5)
+    ruby-ll (2.1.3)
+      ansi
+      ast
+    ruby_engine (2.0.3)
+    ruby_version (1.0.3)
+    rubyzip (1.3.0)
+    spoon (0.0.6)
+      ffi
+    stringio (3.1.1)
+    unicode-display_width (2.6.0)
+    wirb (2.2.2)
+      paint (>= 0.9, < 3.0)
+    yard (0.9.37)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  irbtools
+  minitest
+  rake
+  unicoder!
+BUNDLED WITH
+   2.5.21

data/MIT-LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2016 Jan Lelis, mail@janlelis.de
+Copyright (c) 2016-2020 Jan Lelis, https://janlelis.com
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/README.md CHANGED Viewed

@@ -1,15 +1,45 @@
-# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](http://badge.fury.io/rb/unicoder)
-WIP
+# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](https://badge.fury.io/rb/unicoder)
+unicoder turns Unicode data into bundles for programming libraries.
 ## Usage
 ```
-$ unicoder build index_name
+$ unicoder build <index_name> [--gzip]
+```
+Examples:
+```
+$ unicoder build emoji --format marshal --gzip
+$ unicoder build numeric_value --format esm
 ```
+## Libraries With unicoder-based Indexes
+### Ruby
+Index Name    | Gem
+--------------|----
+blocks        | [unicode-blocks](https://github.com/janlelis/unicode-blocks)
+categories    | [unicode-categories](https://github.com/janlelis/unicode-categories)
+confusable    | [unicode-confusable](https://github.com/janlelis/unicode-confusable)
+emoji         | [unicode-emoji](https://github.com/janlelis/unicode-emoji)
+display\_width| [unicode-display_width](https://github.com/janlelis/unicode-display_width)
+name          | [unicode-name](https://github.com/janlelis/unicode-name)
+numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-numeric_value)
+scripts       | [unicode-scripts](https://github.com/janlelis/unicode-scripts)
+sequence\_name| [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name)
+types         | [unicode-types](https://github.com/janlelis/unicode-types)
+### JavaScript (ESM)
+Index Name    | Module
+--------------|----
+numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
+name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
 ## MIT License
-Copyright (C) 2016 Jan Lelis <http://janlelis.com>. Released under the MIT license.
+Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.

data/bin/unicoder CHANGED Viewed

@@ -6,7 +6,7 @@ require "rationalist"
 args = Rationalist.parse
 command = args[:_][0]
 identifier = args[:_][1]
-KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip]
+KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip, :option, :meta]
 options = args.select { |option,| KNOWN_OPTIONS.include? option }
 if options.has_key?(:version)

data/lib/unicoder/builder.rb CHANGED Viewed

@@ -1,13 +1,42 @@
 require "json"
+require "rubygems/util"
 module Unicoder
  # A builder defines a parse function which translates one (or more) unicode data
   # files into an index hash
   module Builder
-    attr_reader :index
+    attr_reader :index, :formats, :option
+    attr_writer :option
-    def initialize(unicode_version = nil)
-      @unicode_version = unicode_version
+    def formats
+      {
+        marshal: {
+          ext: ".marshal",
+        },
+        json: {
+          ext: ".json",
+          option: "charkeys+stringfractions"
+        },
+        esm: {
+          ext: ".mjs",
+          option: "charkeys+stringfractions"
+        }
+      }
+    end
+    def meta
+      {
+        META: {
+          generator: "unicoder v#{Unicoder::VERSION}",
+          unicodeVersion: @unicode_version,
+        },
+      }
+    end
+    def initialize(unicode_version = nil, emoji_version = nil, format = nil)
+      @unicode_version = unicode_version || CURRENT_UNICODE_VERSION
+      @emoji_version = emoji_version || CURRENT_EMOJI_VERSION
+      @option = formats[format.to_sym] ? formats[format.to_sym][:option] || "" : ""
       initialize_index
     end
@@ -15,8 +44,16 @@ module Unicoder
       @index = {}
     end
-    def assign_codepoint(codepoint, value, index = @index)
-      index[codepoint] = value
+    def assign_codepoint(codepoint, value, idx = @index)
+      if option =~ /charkeys/
+        idx[[codepoint].pack("U*")] = value
+      else
+        idx[codepoint] = value
+      end
+    end
+    def assign(sub_index_name, codepoint, value)
+      assign_codepoint(codepoint, value, index[sub_index_name])
     end
     def parse!
@@ -26,47 +63,72 @@ module Unicoder
     def parse_file(identifier, parse_mode, **parse_options)
       filename = UNICODE_FILES[identifier.to_sym] || filename
       raise ArgumentError, "No valid file identifier or filename given" if !filename
-      filename.sub! 'VERSION', @unicode_version
-      Downloader.fetch(identifier) unless File.exists?(filename)
+      filename = filename.dup
+      filename.sub! 'UNICODE_VERSION', @unicode_version
+      filename.sub! 'EMOJI_VERSION', @emoji_version
+      filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[@emoji_version]
+      filename.sub! '.zip', ''
+      filename.sub! /\A(https?|ftp):\//, ""
+      Downloader.fetch(identifier) unless File.exist?(LOCAL_DATA_DIRECTORY + filename)
       file = File.read(LOCAL_DATA_DIRECTORY + filename)
       if parse_mode == :line
         file.each_line{ |line|
           yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
         }
+      elsif parse_mode == :xml
+        require "oga"
+        yield Oga.parse_xml(file)
+      else
+        yield file
       end
     end
     def export(format: :marshal, **options)
       p index if options[:verbose]
+      if options[:meta]
+        idx = meta.merge(index)
+      else
+        idx = index
+      end
       case format.to_sym
       when :marshal
-        index_file = Marshal.dump(index)
+        index_file = Marshal.dump(idx)
       when :json
-        index_file = JSON.dump(index)
+        index_file = JSON.dump(idx)
+      when :esm
+        index_file = "export default " + JSON.dump(idx)
       end
-      # if false# || options[:gzip]
       if options[:gzip]
-        Gem.gzip(index_file)
+        Gem::Util.gzip(index_file)
       else
         index_file
       end
     end
     def self.build(identifier, **options)
       format = options[:format] || :marshal
       require_relative "builders/#{identifier}"
       # require "unicoder/builders/#{identifier}"
       builder_class = self.const_get(identifier.to_s.gsub(/(?:^|_)([a-z])/){ $1.upcase })
-      builder = builder_class.new(options[:unicode_version] || CURRENT_UNICODE_VERSION)
+      builder = builder_class.new(
+        options[:unicode_version],
+        options[:emoji_version],
+        format
+      )
       puts "Building index for #{identifier}…"
+      if options[:option]
+        builder.option = options[:option]
+      end
       builder.parse!
-      index_file = builder.export(options)
+      index_file = builder.export(**options)
       destination ||= options[:destination] || identifier.to_s
-      destination += ".#{format}"
+      destination += "#{builder.formats.dig(format.to_sym, :ext)}"
       destination += ".gz" if options[:gzip]
       bytes = File.write destination, index_file

data/lib/unicoder/builders/categories.rb CHANGED Viewed

@@ -14,23 +14,18 @@ module Unicoder
       end
       def parse!
-        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<range><(?!control).+>)?.*?;(?<category>.+?);.*$/ do |line|
-          if line["range"]
-            if line["range"] =~ /First/
-              @range_start = line["codepoint"].to_i(16)
-            elsif line["range"] =~ /Last/ && @range_start
-              (@range_start..line["codepoint"].to_i(16)).each{ |codepoint|
-                assign_codepoint(codepoint, line["category"], @index[:CATEGORIES])
-              }
-            else
-              raise ArgumentError, "inconsistent range found in data, don't know what to do"
-            end
+        parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
+          if line["to"]
+            (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
+              assign_codepoint(codepoint, line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
+            }
           else
-            assign_codepoint(line["codepoint"].to_i(16), line["category"], @index[:CATEGORIES])
+            assign_codepoint(line["from"].to_i(16), line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
           end
         end
         4.times{ compress! @index[:CATEGORIES] }
+        remove_trailing_nils! @index[:CATEGORIES]
         parse_file :property_value_aliases, :line, regex: /^gc ; (?<short>\S{2}?) *; (?<long>\S+).*$/ do |line|
           @index[:CATEGORY_NAMES][line["short"]] = line["long"]

data/lib/unicoder/builders/display_width.rb CHANGED Viewed

@@ -6,7 +6,23 @@ module Unicoder
       IGNORE_CATEGORIES     = %w[Cs Co Cn].freeze
       ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
-      ZERO_WIDTH_CODEPOINTS = [*0x1160..0x11FF].freeze
+      ZERO_WIDTH_RANGES = [
+        *0x1160..0x11FF, # HANGUL JUNGSEONG
+        *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
+        *0x2060..0x206F, # Ignorables
+        *0xFFF0..0xFFF8, # Ignorables
+        *0xE0000..0xE0FFF, # Ignorables
+      ].freeze
+      WIDE_RANGES = [
+        *0x3400..0x4DBF,
+        *0x4E00..0x9FFF,
+        *0xF900..0xFAFF,
+        *0x20000..0x2FFFD,
+        *0x30000..0x3FFFD,
+      ].freeze
       SPECIAL_WIDTHS = {
         0x0    =>  0, # \0 NULL
         0x5    =>  0, #    ENQUIRY
@@ -18,7 +34,7 @@ module Unicoder
         0xD    =>  0, # \r CARRIAGE RETURN
         0xE    =>  0, #    SHIFT OUT
         0xF    =>  0, #    SHIFT IN
-        0x00AD =>  1, #    SOFT HYPHEN
+        0x00AD =>  nil, #    SOFT HYPHEN
         0x2E3A =>  2, #    TWO-EM DASH
         0x2E3B =>  3, #    THREE-EM DASH
       }.freeze
@@ -28,7 +44,7 @@ module Unicoder
       end
       def parse!
-        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?);(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
+        parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
           next if IGNORE_CATEGORIES.include?(line["category"])
           if line["codepoints"]['..']
@@ -44,19 +60,24 @@ module Unicoder
           }
         end
+        ZERO_WIDTH_RANGES.each{ |codepoint|
+          assign_codepoint codepoint, 0
+        }
+        WIDE_RANGES.each{ |codepoint|
+          assign_codepoint codepoint, 2
+        }
         SPECIAL_WIDTHS.each{ |codepoint, value|
           assign_codepoint codepoint, value
         }
         4.times{ compress! }
-        p @index
       end
       def determine_width(codepoint, category, east_asian_width)
         if  ( ZERO_WIDTH_CATEGORIES.include?(category) &&
-              [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ ) ||
-            ZERO_WIDTH_CODEPOINTS.include?(codepoint)
+              [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
           0
         elsif east_asian_width == "F" || east_asian_width == "W"
           2

data/lib/unicoder/builders/emoji.rb ADDED Viewed

@@ -0,0 +1,97 @@
+module Unicoder
+  module Builder
+    class Emoji
+      include Builder
+      REVERSE_PROPERTY_NAMES = {
+        "Emoji" => :E,
+        "Emoji_Modifier_Base" => :B,
+        "Emoji_Modifier" => :M,
+        "Emoji_Component" => :C,
+        "Emoji_Presentation" => :P,
+        "Extended_Pictographic" => :X,
+      }
+      def initialize_index
+        @index = {
+          PROPERTIES: {},
+          FLAGS: [],
+          TAGS: [],
+          KEYCAPS: [],
+          ZWJ: [],
+          SD: [],
+          LIST: {},
+        }
+      end
+      def parse!
+        parse_file :emoji_data, :line, regex: /^(?<codepoints>\S+?) +; (?<property>\S+) *#/ do |line|
+          if line["codepoints"]['..']
+            codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
+              codepoint.to_i(16)
+            })
+          else
+            codepoints = [line["codepoints"].to_i(16)]
+          end
+          codepoints.each{ |codepoint|
+            @index[:PROPERTIES][codepoint] ||= []
+            @index[:PROPERTIES][codepoint] << (REVERSE_PROPERTY_NAMES[line["property"]] || line["property"])
+          }
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Flag_Sequence/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:FLAGS] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Tag_Sequence/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:TAGS] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; Emoji_Keycap_Sequence/ do |line|
+          @index[:KEYCAPS] << line["codepoints"].split[0].to_i(16)
+        end
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;/ do |line|
+          codepoints = line["codepoints"].split
+          @index[:ZWJ] << codepoints.map{|e| e.to_i(16)}
+        end
+        parse_file :valid_subdivisions, :xml do |xml|
+          subdivisions = []
+          xml.css('[idStatus="regular"], [idStatus="deprecated"]').each{ |id|
+            subdivisions += id.text.split
+          }
+          @index[:SD] = subdivisions.uniq
+        end
+        parse_file :emoji_test, :line, regex: /^(?:# (?<sub>sub)?group: (?<group_name>.*)$)|(?:(?<codepoints>.+?)\s*; fully-qualified )/ do |line|
+          if line["group_name"]
+            if !line["sub"]
+              @current_group_name = line["group_name"]
+              @index[:LIST][@current_group_name] = {}
+            else
+              @current_subgroup_name = line["group_name"]
+              @index[:LIST][@current_group_name][@current_subgroup_name] = []
+            end
+          else
+            codepoints = line["codepoints"].split
+            @index[:LIST][@current_group_name][@current_subgroup_name] << codepoints.map{|e| e.to_i(16)}.pack("U*")
+          end
+        end
+      end
+    end
+  end
+end
+=begin alternative
+current_index_level = @index[:SEQUENCES]
+codepoints.each{ |cp|
+  ord = cp.to_i(16)
+  current_index_level[ord] ||= {}
+  current_index_level = current_index_level[ord]
+}
+current_index_level[true] = true # end mark
+=end

data/lib/unicoder/builders/name.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module Unicoder
+  module Builder
+    class Name
+      include Builder
+      JAMO_INITIAL = 4352
+      JAMO_MEDIAL = 4449
+      JAMO_FINAL = 4520
+      JAMO_END = 4697
+      def initialize_index
+        @index = {
+          NAMES: {},
+          ALIASES: {},
+          CJK: [],
+          HANGUL: [],
+          # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
+          JAMO: {
+            INITIAL: [],
+            MEDIAL: [],
+            FINAL: [""],
+          },
+        }
+        @range_start = nil
+      end
+      def parse!
+        if option =~ /charkeys/
+          get_key = ->(codepoint){ [codepoint].pack("U*") }
+        else
+          get_key = -> (codepoint){ codepoint }
+        end
+        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
+          if line["name"][0] == "<" && line["name"][-1] == ">"
+            if line["name"] =~ /First/
+              @range_start = line["codepoint"].to_i(16)
+            elsif line["name"] =~ /Last/ && @range_start
+              if line["name"] =~ /Hangul/
+                @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
+              elsif line["name"] =~ /CJK/
+                @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
+              else
+                # no name
+              end
+              @range_start = nil
+            elsif line["name"] != "<control>"
+              raise ArgumentError, "inconsistent range found in data, don't know what to do"
+            end
+          else
+            assign :NAMES, line["codepoint"].to_i(16), line["name"]
+          end
+        end
+        parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
+          @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
+          @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
+          @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] << line["alias"]
+        end
+        parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
+          case line["codepoint"].to_i(16)
+          when JAMO_INITIAL...JAMO_MEDIAL
+            @index[:JAMO][:INITIAL] << line["short_name"]
+          when JAMO_MEDIAL...JAMO_FINAL
+            @index[:JAMO][:MEDIAL] << line["short_name"]
+          when JAMO_FINAL..JAMO_END
+            @index[:JAMO][:FINAL] << line["short_name"]
+          end
+        end
+      end
+    end
+  end
+end