RubyGems - ruby-jdict - Versions diffs - 0.0.1 - Mend

ruby-jdict 0.0.1

Files changed (26) hide show

checksums.yaml +7 -0
data/LICENSING +28 -0
data/README.md +10 -0
data/Rakefile +30 -0
data/examples/query.rb +29 -0
data/lib/#sense.rb# +14 -0
data/lib/configuration.rb +20 -0
data/lib/constants.rb +64 -0
data/lib/dictionaries/jmdict.rb +14 -0
data/lib/dictionary.rb +62 -0
data/lib/entry.rb +79 -0
data/lib/index.rb +346 -0
data/lib/jdict.rb +20 -0
data/lib/kana.rb +4 -0
data/lib/kanji.rb +4 -0
data/lib/ruby-jdict/version.rb +3 -0
data/lib/sense.rb +14 -0
data/lib/unicode.rb +63 -0
data/spec/configuration_spec.rb +20 -0
data/spec/dictionary_spec.rb +117 -0
data/spec/fixtures/feeds/sample_entry.xml +33 -0
data/spec/index_spec.rb +84 -0
data/spec/jdict_spec.rb +17 -0
data/spec/jmdict_spec.rb +19 -0
data/spec/spec_helper.rb +50 -0
metadata +124 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 48dbfb86f9f72639eae7cecdde05da6953afdc8c
+  data.tar.gz: 01bae383b6df3ae0e9a524e094d7f1f1890663cd
+SHA512:
+  metadata.gz: 4253b05fc65786103431707d298711170b8cc4cd426919b1dee06ba37a767766021ac80d1207a31a61bf8ea9a15466cf5e173f3b155328986dc674978793b5cd
+  data.tar.gz: 47a4b27fe519e1284bfd5311404f18489d701ddedc754d9203eaec07101bb4485378760de4f51e56f4fbe7aa235925e01505b18943e81d22af1cbb5464eca801

data/LICENSING ADDED

@@ -0,0 +1,28 @@
+Copyright (C) 2015 Ian Pickering
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+3. The name of the author may not be used to endorse or promote
+   products derived from this software without specific prior
+   written permission.
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED

@@ -0,0 +1,10 @@
+# Ruby-JDict
+Ruby gem for accessing Jim Breen's Japanese dictionaries. Can currently access the following:
+  * JMdict (Japanese-English dictionary)
+Dictionary files are located [here](http://www.csse.monash.edu.au/~jwb/wwwjdicinf.html#dicfil_tag).
+## Install
+```
+gem install ruby-jdict
+```

data/Rakefile ADDED

@@ -0,0 +1,30 @@
+require 'rubygems'
+require 'rake'   #task runner
+INDEX_PATH  = 'index'
+JMDICT_PATH = 'dictionaries/JMdict'
+namespace :index do
+  desc "Build the dictionary's search index"
+  task :build do
+    raise "Index already exists at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
+    @index = DictIndex.new(INDEX_PATH,
+                           JMDICT_PATH,
+                           false) # lazy_loadind? no. don't lazy load
+    puts "Index created at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
+    puts "Index with #{@index.size} entries."
+  end
+  desc "Destroy the dictionary's search index"
+  task :destroy do
+    puts 'TODO: destory the index'
+    `sudo rm -R index`
+    # This will not work, because we don't have sudooooo.
+    # How do you delete folders in Ruby without sudo? Probably
+    # can't... that'd be more consistent actually.
+    # if File.exists? INDEX_PATH
+    #   File.delete INDEX_PATH
+    # end
+  end
+end

data/examples/query.rb ADDED

@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+require 'jdict'
+BASE_PATH   = ENV["HOME"]
+DICT_PATH   = File.join(BASE_PATH, '.dicts')
+INDEX_PATH  = DICT_PATH
+JDict.configure do |config|
+  config.dictionary_path    = DICT_PATH                                  # directory containing dictionary files
+  config.index_path         = INDEX_PATH                                 # directory containing the full text search index
+  config.language           = JDict::JMDictConstants::Languages::ENGLISH # language for search results
+  config.num_results        = 50                                         # maximum results to return from searching
+end
+dict = JDict::JMDict.new
+query = "日本語"
+results = dict.search(query)
+results.each do |entry|
+  puts entry.kanji.join(", ")
+  puts entry.kana.join(", ")
+  entry.senses.each do |sense|
+    glosses = sense.glosses.join(", ")
+    parts_of_speech = sense.parts_of_speech.join(", ")
+    puts "(" + parts_of_speech + ") " + glosses
+  end
+  puts
+end

data/lib/#sense.rb# ADDED

@@ -0,0 +1,14 @@
+# The sense element will record the translational equivalent
+# of the Japanese word, plus other related information. Where there
+# are several distinctly different meanings of the word, multiple
+# sense elements will be employed.
+module JDict
+  class Sense
+    attr_reader :parts_of_speech, :glosses
+    #
+    # Create a new +Sense+
+    def initialize(parts_of_speech, glosses, language)
+      @parts_of_speech, @glosses = parts_of_speech, glosses
+    end
+  end
+end

data/lib/configuration.rb ADDED

@@ -0,0 +1,20 @@
+require 'constants'
+module JDict
+  class Configuration
+    attr_accessor :dictionary_path, :index_path, :num_results, :language, :lazy_index_loading, :debug
+    BASE_PATH   = ENV["HOME"]
+    DICT_PATH   = File.join(BASE_PATH, '.dicts')
+    INDEX_PATH  = DICT_PATH
+    def initialize
+      @dictionary_path    = DICT_PATH                                  # directory containing dictionary files
+      @index_path         = INDEX_PATH                                 # directory containing the full text search index
+      @num_results        = 50                                         # maximum results to return from searching
+      @language           = JDict::JMDictConstants::Languages::ENGLISH # language to return search results in
+      @lazy_index_loading = false                                      # load the index only on attempting to access it
+      @debug              = false                                      # limit number of entries indexed, rebuild index on instantiation
+    end
+  end
+end

data/lib/constants.rb ADDED

@@ -0,0 +1,64 @@
+# Constants and descriptions for important elements/attributes
+# of the JMdict XML dictionary.
+# Descriptions come from JMdict.dtd (document type definition)
+module JDict
+  module JMDictConstants
+    # TODO: consider changing these strings to symbols
+    # Names of the XML elements found in the JMdict file
+    module Elements
+      # Entries consist of kanji elements, kana elements,
+      # general information and sense elements. Each entry must have at
+      # least one kana element and one sense element. Others are optional.
+      ENTRY          = 'entry'
+      SEQUENCE       = 'ent_seq' # element holding the entry's sequence number
+      # This element will contain a word or short phrase in Japanese
+      # which is written using at least one kanji. The valid characters are
+      # kanji, kana, related characters such as chouon and kurikaeshi, and
+      # in exceptional cases, letters from other alphabets.
+      KANJI          = 'keb'
+      # This element content is restricted to kana and related
+      # characters such as chouon and kurikaeshi. Kana usage will be
+      # consistent between the keb and reb elements; e.g. if the keb
+      # contains katakana, so too will the reb.
+      KANA           = 'reb'
+      # The sense element will record the translational equivalent
+      # of the Japanese word, plus other related information. Where there
+      # are several distinctly different meanings of the word, multiple
+      # sense elements will be employed.
+      SENSE          = 'sense'
+      # Part-of-speech information about the entry/sense. Should use
+      # appropriate entity codes.
+      PART_OF_SPEECH = 'pos'
+      # Within each sense will be one or more "glosses", i.e.
+      # target-language words or phrases which are equivalents to the
+      # Japanese word. This element would normally be present, however it
+      # may be omitted in entries which are purely for a cross-reference.
+      GLOSS          = 'gloss'
+      CROSSREFERENCE = 'xref' # cross-reference element; DictIndex#build reads its text but does not store it
+    end
+    # Constants for selecting the search language.
+    # Used in the "gloss" element's xml:lang attribute.
+    #   :eng never appears as an xml:lang value because a gloss is assumed to be English when none is specified
+    #   :jpn never appears as an xml:lang value because the dictionary itself pivots around Japanese
+    module Languages
+      JAPANESE  = :jpn
+      ENGLISH   = :eng
+      DUTCH     = :dut
+      FRENCH    = :fre
+      GERMAN    = :ger
+      RUSSIAN   = :rus
+      SPANISH   = :spa
+      SLOVENIAN = :slv
+      SWEDISH   = :swe
+      HUNGARIAN = :hun
+    end
+  end
+end

data/lib/dictionaries/jmdict.rb ADDED

@@ -0,0 +1,14 @@
+require 'jdict'
+require 'dictionary'
+module JDict
+  class JMDict < Dictionary
+    private
+    # DICT_PATH = JDict.configuration.dictionary_path + '/JMdict'
+    def initialize(index_path = JDict.configuration.index_path, lazy_index_loading=JDict.configuration.lazy_index_loading)
+      path = JDict.configuration.dictionary_path + '/JMdict'
+      super(index_path, path, lazy_index_loading)
+    end
+  end
+end

data/lib/dictionary.rb ADDED

@@ -0,0 +1,62 @@
+require 'jdict'
+require 'index'
+module JDict
+  class Dictionary
+    attr_reader :entries_cache, :lazy_index_loading
+    def initialize(index_path = JDict.configuration.index_path, dictionary_path = nil, lazy_index_loading = JDict.configuration.lazy_index_loading)
+      path_specified = dictionary_path.nil? ? false : true
+      if path_specified and not File.exists? dictionary_path
+        raise "Dictionary not found at path #{dictionary_path}"
+      end
+      #store some args for future reference
+      @dictionary_path    = dictionary_path
+      @lazy_index_loading = lazy_index_loading
+      @entries       = []
+      @entries_cache = []
+      #instantiate and load the full-text search index
+      @index = DictIndex.new(index_path, dictionary_path, lazy_index_loading)
+    end
+    def size
+      @entries.size
+    end
+    def loaded?
+      @index.built?
+    end
+    # Search this dictionary's index for the given string.
+    # @param query [String] the search query
+    # @return [Array(Entry)] the results of the search
+    def search(query)
+      results = []
+      return results if query.empty?
+      load_index if lazy_index_loading and not loaded?
+      results = @index.search(query)
+    end
+    # Retrieves the definition of a part-of-speech from its abbreviation
+    # @param pos [String] the abbreviation for the part-of-speech
+    # @return [String] the full description of the part-of-speech
+    def get_pos(pos)
+      @index.get_pos(pos)
+    end
+    private
+    def load_index
+      if loaded?
+        Exception.new("Dictionary index is already loaded")
+      else
+        @index.build
+      end
+    end
+  end
+end

data/lib/entry.rb ADDED

@@ -0,0 +1,79 @@
+#include Constants #XML constants from the dictionary file
+# Entries consist of kanji elements, kana elements,
+# general information and sense elements. Each entry must have at
+# least one kana element and one sense element. Others are optional.
+module JDict
+  class Entry
+    attr_accessor :kanji, :kana, :senses
+    # Create a new Entry
+    #  entry = initialize(kanji, kana, senses)
+    def initialize(kanji, kana, senses)
+      @kanji, @kana, @senses = kanji, kana, senses
+    end
+    KANA_RE = /^kana/
+    SENSE_RE = /^sense/
+    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
+    MEANING_SENTINEL = '**'
+    PART_OF_SPEECH_SENTINEL = '$$'
+    SENSE_SENTINEL = '%%'
+    LANGUAGE_SENTINEL = '&&'
+    GLOSS_SENTINEL = '@@'
+    # Converts an SQLite row from the index to the Entry format
+    def self.from_sql(row)
+      kanji = row["kanji"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
+      kana = row["kana"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
+      senses = []
+      row["senses"].split(SENSE_SENTINEL).sort.each do |txt|
+        ary = txt.scan(PART_OF_SPEECH_RE)
+        if ary.size == 1
+          parts_of_speech = ary[0][0].split(PART_OF_SPEECH_SENTINEL)
+          gloss_strings = txt[(ary.to_s.length-1)..-1]
+        else
+          parts_of_speech = nil
+          gloss_strings = txt[5..-1]
+        end
+        gloss_strings = gloss_strings.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL)
+        glosses = {}
+        gloss_strings.each do |str|
+          lang, meaning_string = str.split(LANGUAGE_SENTINEL)
+          lang = lang.to_sym
+          meanings = meaning_string.split(MEANING_SENTINEL)
+          (glosses[lang] ||= []) << meanings
+        end
+        glosses_for_lang = glosses[JDict.configuration.language] || glosses[JDict::JMDictConstants::Languages::ENGLISH]
+        senses << Sense.new(parts_of_speech, glosses_for_lang) # ** is the sentinel sequence
+      end
+      self.new(kanji, kana, senses)
+    end
+    # Converts an Entry to a string to be indexed into the SQLite database
+    # @return [String] the serialized string for this Entry
+    def to_sql
+      sense_strings = senses.map do |s|
+        sense = ''
+        sense << "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " if s.parts_of_speech
+        sense << s.glosses.collect { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }.compact.join(GLOSS_SENTINEL)
+      end
+      insert_data  = {
+        ':kanji'   => kanji.join(", "),
+        ':kana' => kana.join(", "),
+        ':senses' => sense_strings.join(SENSE_SENTINEL)
+      }
+      return insert_data
+    end
+    # Get an array of +Senses+ for the specified language
+    def senses_by_language(l)
+      senses.select { |s| s.language == l }
+    end
+  end
+end

data/lib/index.rb ADDED

@@ -0,0 +1,346 @@
+# encoding: utf-8
+require 'rubygems'      #use gems
+require 'bundler/setup' #load up the bundled environment
+require 'amalgalite'
+require 'libxml'    #XML parsing
+require 'fileutils'
+require_relative 'constants' #XML constants from the dictionary file
+require_relative 'entry'     #dictionary elements
+require_relative 'kanji'     #...
+require_relative 'kana'      #...
+require_relative 'sense'
+require 'amalgalite'
+include LibXML
+module JDict
+  class DictIndex
+    LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
+    NUM_ENTRIES_TO_INDEX = 50
+    ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
+    attr_reader :path
+    # Initialize a full-text search index backend for JMdict
+    # @param index_path [String] desired filesystem path where you'd like the *search index* stored
+    # @param dictionary_path [String] desired filesystem path where you'd like the *dictionary* stored
+    # @param lazy_loading [Boolean] lazily load the index just when it's needed, instead of building it ahead of time
+    def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
+      raise "Index path was nil" if index_path.nil?
+      path_specified = dictionary_path.nil? ? false : true
+      if path_specified and not File.exists? dictionary_path
+        raise "Dictionary not found at path #{dictionary_path}"
+      end
+      @path = index_path
+      @dictionary_path = dictionary_path
+      @pos_hash = {}
+      # create path if nonexistent
+      FileUtils.mkdir_p(@path)
+      db_file = File.join(@path, "fts5.db")
+      File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)
+      @index = Amalgalite::Database.new(db_file)
+      create_schema
+      #check if the index has already been built before Ferret creates it
+      already_built = built?
+      #build the index right now if "lazy loading" isn't on and the index is empty
+      build unless lazy_loading or (already_built && !JDict.configuration.debug)
+      #make the hash from abbreviated parts of speech to full definitions
+      build_pos_hash
+    end
+    # Creates the SQL schema for the Amalgalite database
+    def create_schema
+      schema = @index.schema
+      unless schema.tables['search']
+        @index.execute_batch <<-SQL
+        CREATE VIRTUAL TABLE search USING fts5(
+            kanji,
+            kana,
+            senses
+        );
+        SQL
+        @index.reload_schema!
+      end
+    end
+    # Returns the search results as an array of +Entry+
+    # @param term [String] the search string
+    # @param language [Symbol] the language to return results in
+    # @return [Array(Entry)] the results of the search
+    def search(term, language=LANGUAGE_DEFAULT, exact=false)
+      raise "Index not found at path #{@path}" unless File.exists? @path
+      # no results yet...
+      results = []
+      @entries_cache = []
+      # search for:
+      #   kanji... one field
+      #   kana ... up to 10 fields
+      #   sense... up to 10 fields
+      # query = 'kanji OR ' + (0..10).map { |x| "kana_#{x} OR sense_#{x}" }.join(' OR ') + ":\"#{term}\""
+      query = "{kanji kana senses} : \"#{term}\""
+      query += "*" unless exact
+      @index.execute("SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH '#{query}' LIMIT #{JDict.configuration.num_results}") do |row|
+        entry = Entry.from_sql(row)
+        score = 0.0
+        # load entry from the index. from cache, if it's available
+        # load from cache if it's available
+        # if entry = @entries_cache[docid]
+        #   entry = Entry.from_index_doc(@ferret_index[docid].load)
+        #   @entries_cache[docid] = entry
+        # end
+        # # load entry from the index
+        # if entry.nil?
+        #   entry = Entry.from_index_doc(@ferret_index[docid].load)
+        #   @entries_cache[docid] = entry
+        # end
+        is_exact_match = false
+        is_exact_match = entry.kanji == term ||
+          entry.kana.any? { |k| k == term }
+        re = Regexp.new("#{term}", Regexp::IGNORECASE) # match the search term, ignoring case
+        # entry.senses.each do |s|
+        #   s.glosses.each { |g| is_exact_match = is_exact_match || g.force_encoding("UTF-8").match(re) }
+        # end
+        # score = 1.0 if is_exact_match
+        # add the result
+        results << [score, entry]
+      end
+      @entries_cache = []
+      results.sort { |x, y| y[0] <=> x[0] }.map { |x| x[1] }
+    end
+    def built?; @index.first_value_from( "SELECT count(*) from search" ) != 0; end
+    # Builds the full-text search index
+    # @param overwrite [Boolean] accepted for compatibility; not consulted by this method
+    # @param dictionary_path [String] path to the dictionary file; replaces the stored path when given
+    # @return [Integer] the number of indexed entries
+    # @raise [RuntimeError] if no dictionary path is known, the file is missing, or an entry has no kana
+    def build(overwrite=false, dictionary_path=nil)
+      @dictionary_path = dictionary_path unless dictionary_path.nil?
+      raise "No dictionary path was provided" if @dictionary_path.nil?
+      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
+      raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)
+      reader = open_reader(@dictionary_path)
+      puts "Building index..."
+      # whenever there is a reader error, print its block parameters
+      XML::Error.set_handler { |*args| p args }
+      # components of an entry
+      kanji, kana, senses = [], [], []
+      glosses = {}
+      parts_of_speech = []
+      entries_added = 0
+      @index.transaction do |db_transaction|
+        # read until the end
+        while reader.read
+          # check what type of node we're currently on
+          case reader.node_type
+            # start-of-element node
+          when XML::Reader::TYPE_ELEMENT
+            case reader.name
+            when JDict::JMDictConstants::Elements::SEQUENCE
+              # read the sequence number; it is not stored yet (see TODO below)
+              reader.next_text
+              # TODO: Raise an exception if reader.next_text.empty? inside the when's
+              #       JMdict shouldn't have any empty elements, I believe.
+            when JDict::JMDictConstants::Elements::KANJI
+              text = reader.next_text
+              kanji << text unless text.empty?
+            when JDict::JMDictConstants::Elements::KANA
+              text = reader.next_text
+              kana << text unless text.empty?
+            when JDict::JMDictConstants::Elements::GLOSS
+              # glosses without a language attribute count as the default language
+              language = (reader.node.lang || LANGUAGE_DEFAULT).intern
+              text = reader.next_text
+              (glosses[language] ||= []) << text unless text.empty?
+            when JDict::JMDictConstants::Elements::CROSSREFERENCE
+              # read the cross-reference text; it is not stored
+              reader.next_text
+            end
+            # XML entity references are treated as a different node type
+            # the parent node of the entity reference itself has the actual tag name
+          when XML::Reader::TYPE_ENTITY_REFERENCE
+            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
+              text = reader.name
+              parts_of_speech << text unless text.empty?
+            end
+            # end-of-element node
+          when XML::Reader::TYPE_END_ELEMENT
+            case reader.name
+            when JDict::JMDictConstants::Elements::SENSE
+              # build sense
+              senses << Sense.new(parts_of_speech, glosses)
+              # clear data for the next sense
+              glosses = {}
+              parts_of_speech = []
+              # we're at the end of the entry element, so index it
+            when JDict::JMDictConstants::Elements::ENTRY
+              raise "No kana found for this entry!" if kana.empty?
+              #index
+              insert_data = Entry.new(kanji, kana, senses).to_sql
+              db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
+                stmt.execute( insert_data )
+              end
+              # TODO: add entry_sequence_num to the entry
+              # clear data for the next entry
+              kanji, kana, senses = [], [], []
+              entries_added += 1
+              # in debug mode stop after a fixed number of entries
+              break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
+            end
+          end
+        end
+      end
+      # Done reading & indexing; hand back the count promised by @return
+      # (this used to be whatever reader.close returned).
+      entries_added
+    ensure
+      # Close the reader even when parsing or indexing raised.
+      reader.close if reader
+    end
+    def rebuild
+      raise "Index already exists at path #{@path}" if File.exists? @path
+      build
+    end
+    # Creates an XML::Reader object for the given path
+    # @param dictionary_path [String] path to the dictionary file
+    # @return [XML::Reader] the reader for the given dictionary
+    def open_reader(dictionary_path)
+      # open reader
+      reader = nil
+      Dir.chdir(Dir.pwd) do
+        jmdict_path = File.join(dictionary_path)
+        reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
+        raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
+      end
+      reader
+    end
+    # Creates the hash of part-of-speech symbols to full definitions from the dictionary
+    def build_pos_hash
+      @pos_hash ||= begin
+        pos_hash = {}
+        reader = open_reader(@dictionary_path)
+        done = false
+        while done == false
+            reader.read
+            case reader.node_type
+            when XML::Reader::TYPE_DOCUMENT_TYPE
+                # random segfault when attempting this
+                # cs.each do |child|
+                #   p child.to_s
+                # end
+                doctype_string = reader.node.to_s
+                entities = doctype_string.scan(ENTITY_REGEX)
+                entities.map do |entity|
+                  abbrev = entity[0]
+                  full = entity[1]
+                  sym = pos_to_sym(abbrev)
+                  pos_hash[sym] = full
+                end
+                done = true
+            when XML::Reader::TYPE_ELEMENT
+                done = true
+            end
+        end
+        pos_hash
+      end
+    end
+    # Converts a part-of-speech entity reference string into a symbol
+    # @param entity [String] the entity reference string
+    # @return [Symbol] the part-of-speech symbol
+    def pos_to_sym(entity)
+      entity.gsub('-', '_').to_sym
+    end
+    # Retrieves the definition of a part-of-speech from its abbreviation
+    # @param pos [String] the abbreviation for the part-of-speech
+    # @return [String] the full description of the part-of-speech
+    def get_pos(pos)
+      build_pos_hash if @pos_hash.empty?
+      @pos_hash[pos_to_sym(pos)]
+    end
+  end
+  # Add custom parsing methods to XML::Reader
+  class XML::Reader
+    public
+    # Get the next text node
+    def next_text
+      # read until a text node
+      while (self.node_type != XML::Reader::TYPE_TEXT and self.read); end
+      self.value
+    end
+    # Get the next entity node
+    def next_entity
+      # read until an entity node
+      while (self.node_type != XML::Reader::TYPE_ENTITY and
+        self.node_type != XML::Reader::TYPE_ENTITY_REFERENCE and
+        self.read); end
+      self.value
+    end
+  end
+end

data/lib/jdict.rb ADDED

@@ -0,0 +1,20 @@
+require 'configuration'
+require 'dictionaries/jmdict'
+module JDict
+  class << self
+    attr_accessor :configuration
+  end
+  def self.configuration
+    @configuration ||= Configuration.new
+  end
+  def self.reset
+    @configuration = Configuration.new
+  end
+  def self.configure
+    yield(configuration)
+  end
+end

data/lib/kana.rb ADDED

@@ -0,0 +1,4 @@
+module JDict
+  class Kana # empty placeholder: no attributes or methods are defined here
+  end
+end

data/lib/kanji.rb ADDED

@@ -0,0 +1,4 @@
+module JDict
+  class Kanji # empty placeholder: no attributes or methods are defined here
+  end
+end

data/lib/ruby-jdict/version.rb ADDED

@@ -0,0 +1,3 @@
+module JDict
+  Version = '0.0.1' # version string of the ruby-jdict gem
+end

data/lib/sense.rb ADDED

@@ -0,0 +1,14 @@
+# The sense element will record the translational equivalent
+# of the Japanese word, plus other related information. Where there
+# are several distinctly different meanings of the word, multiple
+# sense elements will be employed.
+module JDict
+  class Sense
+    attr_reader :parts_of_speech, :glosses
+    #
+    # Create a new +Sense+
+    def initialize(parts_of_speech, glosses)
+      @parts_of_speech, @glosses = parts_of_speech, glosses
+    end
+  end
+end

data/lib/unicode.rb ADDED

@@ -0,0 +1,63 @@
+module JDict
+  module Unicode
+    # Codepoint ranges for japanese unicode characters (in decimal)
+    # from: http://unicode.org/charts/
+    module CodepointRanges
+      HIRAGANA           = 12352..12447
+      KATAKANA           = 12448..12543
+      KATAKANA_PHONETIC  = 12784..12799
+      HALFWIDTH_KATAKANA = 65280..65519
+      UNIFIED_CJK        = 19968..40911
+      UNIFIED_CJK_EXT_A  = 13312..19903
+      UNIFIED_CJK_EXT_B  = 131072..173791
+      PUNCTUATION        = 12288..12351
+    end
+    # Get Unicode hex codepoint from a Unicode character
+    def hex_codepoint(unicode_char)
+      unicode_char.unpack("U0U*")[0]
+    end
+    # TODO: write unit test with a variety of strings to ensure this method
+    #       returns the expected output
+    # Determine the script of the specified string:
+    #   :kanji
+    #   :kana
+    #   :english
+    def script_type?(unicode_string)
+      type = ''
+      unicode_string.each_char do |c|
+        code = hex_codepoint(c)
+        #kana
+        if CodepointRanges::HIRAGANA.include?(code)           ||
+           CodepointRanges::KATAKANA.include?(code)           ||
+           CodepointRanges::KATAKANA_PHONETIC.include?(code)  ||
+           CodepointRanges::HALFWIDTH_KATAKANA.include?(code) ||
+           CodepointRanges::PUNCTUATION.include?(code) then
+          type = :kana
+          break
+        #kanji
+        elsif CodepointRanges::UNIFIED_CJK.include?(code)        ||
+              CodepointRanges::UNIFIED_CJK_EXT_A.include?(code)  ||
+              CodepointRanges::UNIFIED_CJK_EXT_B.include?(code) then
+          type = :kanji
+        #english
+        else
+          type = :english
+        end
+      end
+      type
+    end
+    def japanese?(unicode_string)
+      type = script_type?(unicode_string)
+      type == :kanji || type == :kana
+    end
+    def english?(unicode_string)
+      type = script_type?(unicode_string)
+      type == :english
+    end
+  end
+end

data/spec/configuration_spec.rb ADDED

@@ -0,0 +1,20 @@
+require "spec_helper"
+require 'configuration'
+module JDict
+  describe Configuration do
+    describe "#debug" do
+      it "default value is false" do
+        Configuration.new.debug = false
+      end
+    end
+    describe "#debug=" do
+      it "can set value" do
+        config = Configuration.new
+        config.debug = true
+        expect(config.debug).to eq(true)
+      end
+    end
+  end
+end

data/spec/dictionary_spec.rb ADDED

@@ -0,0 +1,117 @@
+require 'spec_helper'
+require BASE_PATH + '/lib/dictionary'
+#require BASE_PATH + '/lib/jmdict'
+module DictionarySpecHelper
+  JMDICT_PATH = File.join(BASE_PATH+'/dictionaries/JMdict')
+  INDEX_PATH  = File.join(BASE_PATH+'/index')
+  def mock_index
+  end
+  class Increase
+    def initialize(&measure_proc) # + args
+      @measure_proc = measure_proc
+    end
+    def matches?(target)
+      @target = target
+      @original_value = @measure_proc.call
+      target.call
+      @new_value = @measure_proc.call
+      return @new_value.to_i > @original_value.to_i
+    end
+    def failure_message
+      "expected #{@new_value} to be greater than #{@original_value}"
+    end
+    def negative_failure_message
+      "expected #{@new_value} to not be greater than #{@original_value}"
+    end
+    def description
+      "increase #{@original_value}"
+    end
+  end
+  def increase(&measure_proc) # + args
+    Increase.new(&measure_proc)
+  end
+end
+module DictionarySpec
+include DictionarySpecHelper
+describe JDict::Dictionary do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "is searchable" do
+    @dictionary.should respond_to(:search)
+  end
+  it "can tell you whether or not it's loaded" do
+    @dictionary.should respond_to(:loaded?)
+  end
+  it "should generate fixtures" do
+    pending
+    @dictionary.should respond_to(:generate_fixtures)
+  end
+end
+describe JDict::Dictionary, "after initialization" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has no entries" do
+    @dictionary.size.should == 0
+  end
+  it "has an empty entries cache" do
+    @dictionary.entries_cache.empty?
+  end
+end
+describe JDict::Dictionary, "when loading from a dictionary file" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has at least 1 entry" do
+    pending("implement loading from index")
+    @dictionary.load(JMDICT_PATH)
+    @dictionary.size.should > 0
+  end
+  it "says it's loaded" do
+    pending("implement loading from index")
+    @dictionary.load(JMDICT_PATH)
+    # @dictionary.loaded?.should == true
+    @dictionary.loaded?.should equal(true)
+  end
+end
+describe JDict::Dictionary, "when loading from a dictionary file (already loaded)" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has the same size as it did before being loaded"
+end
+describe JDict::Dictionary, "when searching" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "should raise an error if an index isn't built yet"
+  it "should give no results if the search phrase is empty" do
+    @dictionary.search('').should be_empty
+  end
+end
+end

data/spec/fixtures/feeds/sample_entry.xml ADDED

@@ -0,0 +1,33 @@
+<JMdict>
+<entry>
+<ent_seq>1171270</ent_seq>
+<k_ele>
+<keb>右翼</keb>
+<ke_pri>ichi1</ke_pri>
+<ke_pri>news1</ke_pri>
+<ke_pri>nf04</ke_pri>
+</k_ele>
+<r_ele>
+<reb>うよく</reb>
+<re_pri>ichi1</re_pri>
+<re_pri>news1</re_pri>
+<re_pri>nf04</re_pri>
+</r_ele>
+<sense>
+<pos>&n;</pos>
+<gloss>right-wing</gloss>
+<gloss g_lang="fr">aile droite (oiseau, armée, parti politique, base-ball)</gloss>
+<gloss g_lang="ru">пра́вое крыло́</gloss>
+<gloss g_lang="ru">пра́вый фланг</gloss>
+<gloss g_lang="de">rechter Flügel</gloss>
+</sense>
+<sense>
+<gloss g_lang="de">{Sport}</gloss>
+<gloss g_lang="de">rechte Flanke</gloss>
+<gloss g_lang="de">rechter Flügel</gloss>
+</sense>
+<sense>
+<gloss g_lang="de">die Rechte</gloss>
+</sense>
+</entry>
+</JMdict>

data/spec/index_spec.rb ADDED

@@ -0,0 +1,84 @@
+require 'rubygems'
+require File.dirname(__FILE__) + '/spec_helper'
+require BASE_PATH + '/lib/dictionary'
+require BASE_PATH + '/lib/jmdict'
+require BASE_PATH + '/lib/index'
+require 'fileutils'
+module IndexSpecHelper # currently empty; it is included by the first DictIndex example group
+end
+# Interface checks for JDict::DictIndex: every example only verifies that the
+# index answers a message, except the last one, which expects construction
+# with a bad dictionary path to raise.
+describe JDict::DictIndex do
+  include IndexSpecHelper
+  before { @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH) }
+  # Searching
+  it "is searchable" do
+    expect(@index).to respond_to(:search)
+  end
+  # Building
+  it "is buildable" do
+    expect(@index).to respond_to(:build) # and return an index
+  end
+  it "is rebuildable" do
+    expect(@index).to respond_to(:rebuild)
+  end
+  it "tells whether it's built or not" do
+    expect(@index).to respond_to(:built?)
+  end
+  # Destroying
+  it "is destroyable" do
+    expect(@index).to respond_to(:destroy)
+  end
+  it "raises an error if an invalid dictionary path is specified" do
+    expect { JDict::DictIndex.new(INDEX_PATH, 'bad_dictionary_path') }.to raise_error
+  end
+end
+describe JDict::DictIndex, "after initialization" do
+  it "the path should be set" do
+    @index = JDict::DictIndex.new(INDEX_PATH)
+    @index.path.should_not be(nil)
+    @index.path.should_not be('')
+  end
+end
+describe JDict::DictIndex, "when building," do
+  it "it is created on the file system" do
+    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
+    @index.build
+    File.exists?(INDEX_PATH).should == true
+  end
+  it "its directory on the file system shouldn't be empty" do
+    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH,
+                           false) #no lazy loading
+    @index.build
+    # .
+    # ..
+    # ^-------- an empty directory has only these 2 entries
+    expect(Dir.entries(INDEX_PATH).size).to be >= 3
+  end
+  it "loads from a dictionary file"
+end
+# Rebuilding requires an existing index; without one it is expected to raise.
+describe JDict::DictIndex, "when rebuilding" do
+  include FileUtils
+  it "raises an error if it doesn't already exist" do
+    # Remove any index left behind by earlier examples, then confirm it is gone.
+    rm_rf(INDEX_PATH)
+    # File.exists? is deprecated and was removed in Ruby 3.2; use File.exist?.
+    File.exist?(INDEX_PATH).should == false
+    lambda {
+      JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH).rebuild
+      }.should raise_error
+  end
+end

data/spec/jdict_spec.rb ADDED

@@ -0,0 +1,17 @@
+require 'spec_helper'
+describe JDict do
+  describe "#configure" do
+    before do
+      JDict.configure do |config|
+        config.dictionary_path = DICT_PATH # NOTE(review): DICT_PATH is not one of the constants spec_helper defines (BASE_PATH, INDEX_PATH, JMDICT_PATH) - confirm where it is meant to come from
+        config.debug = true
+      end
+    end
+    it "uses the configured path" do
+      expect(JDICT.dictionary_path).to eq(DICT_PATH) # NOTE(review): JDICT (all caps) is not the JDict module configured above - looks like a typo; confirm the intended reader for the configured path
+    end
+  end
+end

data/spec/jmdict_spec.rb ADDED

@@ -0,0 +1,19 @@
+require 'spec_helper'
+require BASE_PATH + '/lib/dictionary'
+require BASE_PATH + '/lib/jmdict'
+module JMDictSpecHelper
+  INDEX_PATH  = File.join(BASE_PATH+'/index')
+end
+describe JDict::JMDict do
+  include JMDictSpecHelper
+  before do
+    @jmdict = JDict::JMDict.new(JMDictSpecHelper::INDEX_PATH)
+  end
+  it do
+    @jmdict.should be_a_kind_of(JDict::Dictionary)
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,50 @@
+require 'rubygems'      # use gems
+require 'bundler/setup' # load up the bundled environment
+# require 'spec'          # test framework (left commented out)
+$DEBUG = true # switches on Ruby's global debug flag for the whole spec run
+BASE_PATH   = File.dirname(__FILE__) + '/..' # parent of the directory that holds this helper
+INDEX_PATH  = BASE_PATH + '/test_index' # index location the specs pass to Dictionary.new / DictIndex.new
+JMDICT_PATH = BASE_PATH + '/dictionaries/JMdict' # dictionary path the specs pass to DictIndex.new and Dictionary#load
+##
+# rSpec Hash additions.
+#
+# From
+#   * http://wincent.com/knowledge-base/Fixtures_considered_harmful%3F
+#   * Neil Rahilly
+class Hash
+  ##
+  # Filter keys out of a Hash.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.except(:a)
+  #   => { :b => 2, :c => 3 }
+  def except(*keys)
+    self.reject { |k,v| keys.include?(k || k.to_sym) }
+  end
+  ##
+  # Override some keys.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.with(:a => 4)
+  #   => { :a => 4, :b => 2, :c => 3 }
+  def with(overrides = {})
+    self.merge overrides
+  end
+  ##
+  # Returns a Hash with only the pairs identified by +keys+.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.only(:a)
+  #   => { :a => 1 }
+  def only(*keys)
+    self.reject { |k,v| !keys.include?(k || k.to_sym) }
+  end
+end

metadata ADDED

@@ -0,0 +1,124 @@
+--- !ruby/object:Gem::Specification
+name: ruby-jdict
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Ian Pickering
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-22 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: libxml-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+- !ruby/object:Gem::Dependency
+  name: amalgalite
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+- !ruby/object:Gem::Dependency
+  name: autotest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+description:
+email:
+- ipickering2@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSING
+- README.md
+- Rakefile
+- examples/query.rb
+- lib/#sense.rb#
+- lib/configuration.rb
+- lib/constants.rb
+- lib/dictionaries/jmdict.rb
+- lib/dictionary.rb
+- lib/entry.rb
+- lib/index.rb
+- lib/jdict.rb
+- lib/kana.rb
+- lib/kanji.rb
+- lib/ruby-jdict/version.rb
+- lib/sense.rb
+- lib/unicode.rb
+- spec/configuration_spec.rb
+- spec/dictionary_spec.rb
+- spec/fixtures/feeds/sample_entry.xml
+- spec/index_spec.rb
+- spec/jdict_spec.rb
+- spec/jmdict_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/Ruin0x11/ruby-jdict
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5.1
+signing_key:
+specification_version: 4
+summary: Ruby gem for accessing Jim Breen's Japanese dictionaries
+test_files: []
+has_rdoc: