RubyGems - ruby-jdict - Versions diffs - 0.0.1 - Mend

ruby-jdict 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +7 -0
data/LICENSING +28 -0
data/README.md +10 -0
data/Rakefile +30 -0
data/examples/query.rb +29 -0
data/lib/#sense.rb# +14 -0
data/lib/configuration.rb +20 -0
data/lib/constants.rb +64 -0
data/lib/dictionaries/jmdict.rb +14 -0
data/lib/dictionary.rb +62 -0
data/lib/entry.rb +79 -0
data/lib/index.rb +346 -0
data/lib/jdict.rb +20 -0
data/lib/kana.rb +4 -0
data/lib/kanji.rb +4 -0
data/lib/ruby-jdict/version.rb +3 -0
data/lib/sense.rb +14 -0
data/lib/unicode.rb +63 -0
data/spec/configuration_spec.rb +20 -0
data/spec/dictionary_spec.rb +117 -0
data/spec/fixtures/feeds/sample_entry.xml +33 -0
data/spec/index_spec.rb +84 -0
data/spec/jdict_spec.rb +17 -0
data/spec/jmdict_spec.rb +19 -0
data/spec/spec_helper.rb +50 -0
metadata +124 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 48dbfb86f9f72639eae7cecdde05da6953afdc8c
+  data.tar.gz: 01bae383b6df3ae0e9a524e094d7f1f1890663cd
+SHA512:
+  metadata.gz: 4253b05fc65786103431707d298711170b8cc4cd426919b1dee06ba37a767766021ac80d1207a31a61bf8ea9a15466cf5e173f3b155328986dc674978793b5cd
+  data.tar.gz: 47a4b27fe519e1284bfd5311404f18489d701ddedc754d9203eaec07101bb4485378760de4f51e56f4fbe7aa235925e01505b18943e81d22af1cbb5464eca801

data/LICENSING ADDED

@@ -0,0 +1,28 @@
+Copyright (C) 2015 Ian Pickering
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+3. The name of the author may not be used to endorse or promote
+   products derived from this software without specific prior
+   written permission.
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED

@@ -0,0 +1,10 @@
+# Ruby-JDict
+Ruby gem for accessing Jim Breen's Japanese dictionaries. Can currently access the following:
+  * JMdict (Japanese-English dictionary)
+Dictionary files are located [here](http://www.csse.monash.edu.au/~jwb/wwwjdicinf.html#dicfil_tag).
+## Install
+```
+gem install ruby-jdict
+```

data/Rakefile ADDED

@@ -0,0 +1,30 @@
+require 'rubygems'
+require 'rake'   #task runner
+INDEX_PATH  = 'index'
+JMDICT_PATH = 'dictionaries/JMdict'
+namespace :index do
+  desc "Build the dictionary's search index"
+  task :build do
+    raise "Index already exists at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
+    @index = DictIndex.new(INDEX_PATH,
+                           JMDICT_PATH,
+                           false) # lazy_loadind? no. don't lazy load
+    puts "Index created at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
+    puts "Index with #{@index.size} entries."
+  end
+  desc "Destroy the dictionary's search index"
+  task :destroy do
+    puts 'TODO: destory the index'
+    `sudo rm -R index`
+    # This will not work, because we don't have sudooooo.
+    # How do you delete folders in Ruby without sudo? Probably
+    # can't... that'd be more consistent actually.
+    # if File.exists? INDEX_PATH
+    #   File.delete INDEX_PATH
+    # end
+  end
+end

data/examples/query.rb ADDED

@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+require 'jdict'
+BASE_PATH   = ENV["HOME"]
+DICT_PATH   = File.join(BASE_PATH, '.dicts')
+INDEX_PATH  = DICT_PATH
+JDict.configure do |config|
+  config.dictionary_path    = DICT_PATH                                  # directory containing dictionary files
+  config.index_path         = INDEX_PATH                                 # directory containing the full text search index
+  config.language           = JDict::JMDictConstants::Languages::ENGLISH # language for search results
+  config.num_results        = 50                                         # maximum results to return from searching
+end
+dict = JDict::JMDict.new
+query = "日本語"
+results = dict.search(query)
+results.each do |entry|
+  puts entry.kanji.join(", ")
+  puts entry.kana.join(", ")
+  entry.senses.each do |sense|
+    glosses = sense.glosses.join(", ")
+    parts_of_speech = sense.parts_of_speech.join(", ")
+    puts "(" + parts_of_speech + ") " + glosses
+  end
+  puts
+end

data/lib/#sense.rb# ADDED

@@ -0,0 +1,14 @@
+# The sense element will record the translational equivalent
+# of the Japanese word, plus other related information. Where there
+# are several distinctly different meanings of the word, multiple
+# sense elements will be employed.
+module JDict
+  class Sense
+    attr_reader :parts_of_speech, :glosses
+    #
+    # Create a new +Sense+
+    def initialize(parts_of_speech, glosses, language)
+      @parts_of_speech, @glosses = parts_of_speech, glosses
+    end
+  end
+end

data/lib/configuration.rb ADDED

@@ -0,0 +1,20 @@
+require 'constants'
+module JDict
+  class Configuration
+    attr_accessor :dictionary_path, :index_path, :num_results, :language, :lazy_index_loading, :debug
+    BASE_PATH   = ENV["HOME"]
+    DICT_PATH   = File.join(BASE_PATH, '.dicts')
+    INDEX_PATH  = DICT_PATH
+    def initialize
+      @dictionary_path    = DICT_PATH                                  # directory containing dictionary files
+      @index_path         = INDEX_PATH                                 # directory containing the full text search index
+      @num_results        = 50                                         # maximum results to return from searching
+      @language           = JDict::JMDictConstants::Languages::ENGLISH # language to return search results in
+      @lazy_index_loading = false                                      # load the index only on attempting to access it
+      @debug              = false                                      # limit number of entries indexed, rebuild index on instantiation
+    end
+  end
+end

data/lib/constants.rb ADDED

@@ -0,0 +1,64 @@
+# Constants and descriptions for important elements/attributes
+# of the JMdict XML dictionary.
+# Descriptions come from JMdict.dtd (document type definition)
+module JDict
+  module JMDictConstants
+    # TODO: change these strings to symbols ?
+    # XML elements of the JMDict file
+    module Elements
+      # Entries consist of kanji elements, kana elements,
+      # general information and sense elements. Each entry must have at
+      # least one kana element and one sense element. Others are optional.
+      ENTRY          = 'entry'
+      SEQUENCE       = 'ent_seq'
+      # This element will contain a word or short phrase in Japanese
+      # which is written using at least one kanji. The valid characters are
+      # kanji, kana, related characters such as chouon and kurikaeshi, and
+      # in exceptional cases, letters from other alphabets.
+      KANJI          = 'keb'
+      # This element content is restricted to kana and related
+      # characters such as chouon and kurikaeshi. Kana usage will be
+      # consistent between the keb and reb elements; e.g. if the keb
+      # contains katakana, so too will the reb.
+      KANA           = 'reb'
+      # The sense element will record the translational equivalent
+      # of the Japanese word, plus other related information. Where there
+      # are several distinctly different meanings of the word, multiple
+      # sense elements will be employed.
+      SENSE          = 'sense'
+      # Part-of-speech information about the entry/sense. Should use
+      # appropriate entity codes.
+      PART_OF_SPEECH = 'pos'
+      # Within each sense will be one or more "glosses", i.e.
+      # target-language words or phrases which are equivalents to the
+      # Japanese word. This element would normally be present, however it
+      # may be omitted in entries which are purely for a cross-reference.
+      GLOSS          = 'gloss'
+      CROSSREFERENCE = 'xref'
+    end
+    # Constants for selecting the search language.
+    # Used in the "gloss" element's xml:lang attribute.
+    #   :eng never appears as a xml:lang constant because gloss is assumed to be English when not specified
+    #   :jpn never appears as a xml:lang because the dictionary itself pivots around Japanese
+    module Languages
+      JAPANESE  = :jpn
+      ENGLISH   = :eng
+      DUTCH     = :dut
+      FRENCH    = :fre
+      GERMAN    = :ger
+      RUSSIAN   = :rus
+      SPANISH   = :spa
+      SLOVENIAN = :slv
+      SWEDISH   = :swe
+      HUNGARIAN = :hun
+    end
+  end
+end

data/lib/dictionaries/jmdict.rb ADDED

@@ -0,0 +1,14 @@
+require 'jdict'
+require 'dictionary'
+module JDict
+  class JMDict < Dictionary
+    private
+    # DICT_PATH = JDict.configuration.dictionary_path + '/JMdict'
+    def initialize(index_path = JDict.configuration.index_path, lazy_index_loading=JDict.configuration.lazy_index_loading)
+      path = JDict.configuration.dictionary_path + '/JMdict'
+      super(index_path, path, lazy_index_loading)
+    end
+  end
+end

data/lib/dictionary.rb ADDED

@@ -0,0 +1,62 @@
+require 'jdict'
+require 'index'
+module JDict
+  class Dictionary
+    attr_reader :entries_cache, :lazy_index_loading
+    def initialize(index_path = JDict.configuration.index_path, dictionary_path = nil, lazy_index_loading = JDict.configuration.lazy_index_loading)
+      path_specified = dictionary_path.nil? ? false : true
+      if path_specified and not File.exists? dictionary_path
+        raise "Dictionary not found at path #{dictionary_path}"
+      end
+      #store some args for future reference
+      @dictionary_path    = dictionary_path
+      @lazy_index_loading = lazy_index_loading
+      @entries       = []
+      @entries_cache = []
+      #instantiate and load the full-text search index
+      @index = DictIndex.new(index_path, dictionary_path, lazy_index_loading)
+    end
+    def size
+      @entries.size
+    end
+    def loaded?
+      @index.built?
+    end
+    # Search this dictionary's index for the given string.
+    # @param query [String] the search query
+    # @return [Array(Entry)] the results of the search
+    def search(query)
+      results = []
+      return results if query.empty?
+      load_index if lazy_index_loading and not loaded?
+      results = @index.search(query)
+    end
+    # Retrieves the definition of a part-of-speech from its abbreviation
+    # @param pos [String] the abbreviation for the part-of-speech
+    # @return [String] the full description of the part-of-speech
+    def get_pos(pos)
+      @index.get_pos(pos)
+    end
+    private
+    def load_index
+      if loaded?
+        Exception.new("Dictionary index is already loaded")
+      else
+        @index.build
+      end
+    end
+  end
+end

data/lib/entry.rb ADDED

@@ -0,0 +1,79 @@
+#include Constants #XML constants from the dictionary file
+# Entries consist of kanji elements, kana elements,
+# general information and sense elements. Each entry must have at
+# least one kana element and one sense element. Others are optional.
+module JDict
+  class Entry
+    attr_accessor :kanji, :kana, :senses
+    # Create a new Entry
+    #  entry = initialize(kanji, kana, senses)
+    def initialize(kanji, kana, senses)
+      @kanji, @kana, @senses = kanji, kana, senses
+    end
+    KANA_RE = /^kana/
+    SENSE_RE = /^sense/
+    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
+    MEANING_SENTINEL = '**'
+    PART_OF_SPEECH_SENTINEL = '$$'
+    SENSE_SENTINEL = '%%'
+    LANGUAGE_SENTINEL = '&&'
+    GLOSS_SENTINEL = '@@'
+    # Converts an SQLite row from the index to the Entry format
+    def self.from_sql(row)
+      kanji = row["kanji"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
+      kana = row["kana"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
+      senses = []
+      row["senses"].split(SENSE_SENTINEL).sort.each do |txt|
+        ary = txt.scan(PART_OF_SPEECH_RE)
+        if ary.size == 1
+          parts_of_speech = ary[0][0].split(PART_OF_SPEECH_SENTINEL)
+          gloss_strings = txt[(ary.to_s.length-1)..-1]
+        else
+          parts_of_speech = nil
+          gloss_strings = txt[5..-1]
+        end
+        gloss_strings = gloss_strings.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL)
+        glosses = {}
+        gloss_strings.each do |str|
+          lang, meaning_string = str.split(LANGUAGE_SENTINEL)
+          lang = lang.to_sym
+          meanings = meaning_string.split(MEANING_SENTINEL)
+          (glosses[lang] ||= []) << meanings
+        end
+        glosses_for_lang = glosses[JDict.configuration.language] || glosses[JDict::JMDictConstants::Languages::ENGLISH]
+        senses << Sense.new(parts_of_speech, glosses_for_lang) # ** is the sentinel sequence
+      end
+      self.new(kanji, kana, senses)
+    end
+    # Converts an Entry to a string to be indexed into the SQLite database
+    # @return [String] the serialized string for this Entry
+    def to_sql
+      sense_strings = senses.map do |s|
+        sense = ''
+        sense << "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " if s.parts_of_speech
+        sense << s.glosses.collect { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }.compact.join(GLOSS_SENTINEL)
+      end
+      insert_data  = {
+        ':kanji'   => kanji.join(", "),
+        ':kana' => kana.join(", "),
+        ':senses' => sense_strings.join(SENSE_SENTINEL)
+      }
+      return insert_data
+    end
+    # Get an array of +Senses+ for the specified language
+    def senses_by_language(l)
+      senses.select { |s| s.language == l }
+    end
+  end
+end

data/lib/index.rb ADDED

@@ -0,0 +1,346 @@
+# encoding: utf-8
+require 'rubygems'      #use gems
+require 'bundler/setup' #load up the bundled environment
+require 'amalgalite'
+require 'libxml'    #XML parsing
+require 'fileutils'
+require_relative 'constants' #XML constants from the dictionary file
+require_relative 'entry'     #dictionary elements
+require_relative 'kanji'     #...
+require_relative 'kana'      #...
+require_relative 'sense'
+require 'amalgalite'
+include LibXML
+module JDict
+  class DictIndex
+    LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
+    NUM_ENTRIES_TO_INDEX = 50
+    ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
+    attr_reader :path
+    # Initialize a full-text search index backend for JMdict
+    # @param index_path [String] desired filesystem path where you'd like the *search index* stored
+    # @param dictionary_path [String] desired filesystem path where you'd like the *dictionary* stored
+    # @param lazy_loading [Boolean] lazily load the index just when it's needed, instead of building it ahead of time
+    def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
+      raise "Index path was nil" if index_path.nil?
+      path_specified = dictionary_path.nil? ? false : true
+      if path_specified and not File.exists? dictionary_path
+        raise "Dictionary not found at path #{dictionary_path}"
+      end
+      @path = index_path
+      @dictionary_path = dictionary_path
+      @pos_hash = {}
+      # create path if nonexistent
+      FileUtils.mkdir_p(@path)
+      db_file = File.join(@path, "fts5.db")
+      File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)
+      @index = Amalgalite::Database.new(db_file)
+      create_schema
+      #check if the index has already been built before Ferret creates it
+      already_built = built?
+      #build the index right now if "lazy loading" isn't on and the index is empty
+      build unless lazy_loading or (already_built && !JDict.configuration.debug)
+      #make the hash from abbreviated parts of speech to full definitions
+      build_pos_hash
+    end
+    # Creates the SQL schema for the Amalgalite database
+    def create_schema
+      schema = @index.schema
+      unless schema.tables['search']
+        @index.execute_batch <<-SQL
+        CREATE VIRTUAL TABLE search USING fts5(
+            kanji,
+            kana,
+            senses
+        );
+        SQL
+        @index.reload_schema!
+      end
+    end
+    # Returns the search results as an array of +Entry+
+    # @param term [String] the search string
+    # @param language [Symbol] the language to return results in
+    # @return [Array(Entry)] the results of the search
+    def search(term, language=LANGUAGE_DEFAULT, exact=false)
+      raise "Index not found at path #{@path}" unless File.exists? @path
+      # no results yet...
+      results = []
+      @entries_cache = []
+      # search for:
+      #   kanji... one field
+      #   kana ... up to 10 fields
+      #   sense... up to 10 fields
+      # query = 'kanji OR ' + (0..10).map { |x| "kana_#{x} OR sense_#{x}" }.join(' OR ') + ":\"#{term}\""
+      query = "{kanji kana senses} : \"#{term}\""
+      query += "*" unless exact
+      @index.execute("SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH '#{query}' LIMIT #{JDict.configuration.num_results}") do |row|
+        entry = Entry.from_sql(row)
+        score = 0.0
+        # load entry from the index. from cache, if it's available
+        # load from cache if it's available
+        # if entry = @entries_cache[docid]
+        #   entry = Entry.from_index_doc(@ferret_index[docid].load)
+        #   @entries_cache[docid] = entry
+        # end
+        # # load entry from the index
+        # if entry.nil?
+        #   entry = Entry.from_index_doc(@ferret_index[docid].load)
+        #   @entries_cache[docid] = entry
+        # end
+        is_exact_match = false
+        is_exact_match = entry.kanji == term ||
+          entry.kana.any? { |k| k == term }
+        re = Regexp.new("#{term}", Regexp::IGNORECASE) # match the search term, ignoring case
+        # entry.senses.each do |s|
+        #   s.glosses.each { |g| is_exact_match = is_exact_match || g.force_encoding("UTF-8").match(re) }
+        # end
+        # score = 1.0 if is_exact_match
+        # add the result
+        results << [score, entry]
+      end
+      @entries_cache = []
+      results.sort { |x, y| y[0] <=> x[0] }.map { |x| x[1] }
+    end
+    def built?; @index.first_value_from( "SELECT count(*) from search" ) != 0; end
+    # Builds the full-text search index by streaming the dictionary XML and
+    # inserting one row per entry inside a single transaction.
+    # @param overwrite [Boolean] accepted for compatibility; not consulted by this method
+    # @param dictionary_path [String] path to the dictionary file (defaults to the one given at construction)
+    # @return [Integer] the number of indexed entries
+    # @raise [RuntimeError] if no dictionary path is known, the file is missing, or an entry has no kana
+    def build(overwrite=false, dictionary_path=nil)
+      @dictionary_path = dictionary_path unless dictionary_path.nil?
+      raise "No dictionary path was provided" if @dictionary_path.nil?
+      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
+      raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)
+      reader = open_reader(@dictionary_path)
+      puts "Building index..."
+      # whenever there is a reader error, print its block parameters
+      XML::Error.set_handler { |*args| p args }
+      # components of the entry / sense currently being read
+      kanji, kana, senses = [], [], []
+      glosses = {}
+      parts_of_speech = []
+      entries_added = 0
+      begin
+        @index.transaction do |db_transaction|
+          # read until the end
+          while reader.read
+            # check what type of node we're currently on
+            case reader.node_type
+              # start-of-element node
+            when XML::Reader::TYPE_ELEMENT
+              case reader.name
+              when JDict::JMDictConstants::Elements::SEQUENCE
+                # The sequence number is read past but not stored yet.
+                # TODO: add the entry sequence number to the entry
+                reader.next_text
+                # TODO: Raise an exception if reader.next_text.empty? inside the when's
+                #       JMdict shouldn't have any empty elements, I believe.
+              when JDict::JMDictConstants::Elements::KANJI
+                text = reader.next_text
+                kanji << text unless text.empty?
+              when JDict::JMDictConstants::Elements::KANA
+                text = reader.next_text
+                kana << text unless text.empty?
+              when JDict::JMDictConstants::Elements::GLOSS
+                # to_sym works for both the String from xml:lang and the
+                # Symbol default; the default has no #intern.
+                language = (reader.node.lang || LANGUAGE_DEFAULT).to_sym
+                text = reader.next_text
+                (glosses[language] ||= []) << text unless text.empty?
+              when JDict::JMDictConstants::Elements::CROSSREFERENCE
+                # cross-references are skipped over, not indexed
+                reader.next_text
+              end
+              # XML entity references are treated as a different node type
+              # the parent node of the entity reference itself has the actual tag name
+            when XML::Reader::TYPE_ENTITY_REFERENCE
+              if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
+                text = reader.name
+                parts_of_speech << text unless text.empty?
+              end
+              # end-of-element node
+            when XML::Reader::TYPE_END_ELEMENT
+              case reader.name
+              when JDict::JMDictConstants::Elements::SENSE
+                # build sense, then clear data for the next sense
+                senses << Sense.new(parts_of_speech, glosses)
+                glosses = {}
+                parts_of_speech = []
+                # we're at the end of the entry element, so index it
+              when JDict::JMDictConstants::Elements::ENTRY
+                raise "No kana found for this entry!" if kana.empty?
+                insert_data = Entry.new(kanji, kana, senses).to_sql
+                db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
+                  stmt.execute( insert_data )
+                end
+                # clear data for the next entry
+                kanji, kana, senses = [], [], []
+                entries_added += 1
+                # debug mode indexes only the first NUM_ENTRIES_TO_INDEX entries
+                break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
+              end
+            end
+          end
+        end
+      ensure
+        # Done reading & indexing; close the reader even when indexing failed.
+        reader.close
+      end
+      entries_added
+    end
+    def rebuild
+      raise "Index already exists at path #{@path}" if File.exists? @path
+      build
+    end
+    # Creates an XML::Reader object for the given path
+    # @param dictionary_path [String] path to the dictionary file
+    # @return [XML::Reader] the reader for the given dictionary
+    def open_reader(dictionary_path)
+      # open reader
+      reader = nil
+      Dir.chdir(Dir.pwd) do
+        jmdict_path = File.join(dictionary_path)
+        reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
+        raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
+      end
+      reader
+    end
+    # Creates the hash of part-of-speech symbols to full definitions from the dictionary
+    def build_pos_hash
+      @pos_hash ||= begin
+        pos_hash = {}
+        reader = open_reader(@dictionary_path)
+        done = false
+        while done == false
+            reader.read
+            case reader.node_type
+            when XML::Reader::TYPE_DOCUMENT_TYPE
+                # random segfault when attempting this
+                # cs.each do |child|
+                #   p child.to_s
+                # end
+                doctype_string = reader.node.to_s
+                entities = doctype_string.scan(ENTITY_REGEX)
+                entities.map do |entity|
+                  abbrev = entity[0]
+                  full = entity[1]
+                  sym = pos_to_sym(abbrev)
+                  pos_hash[sym] = full
+                end
+                done = true
+            when XML::Reader::TYPE_ELEMENT
+                done = true
+            end
+        end
+        pos_hash
+      end
+    end
+    # Converts a part-of-speech entity reference string into a symbol
+    # @param entity [String] the entity reference string
+    # @return [Symbol] the part-of-speech symbol
+    def pos_to_sym(entity)
+      entity.gsub('-', '_').to_sym
+    end
+    # Retrieves the definition of a part-of-speech from its abbreviation
+    # @param pos [String] the abbreviation for the part-of-speech
+    # @return [String] the full description of the part-of-speech
+    def get_pos(pos)
+      build_pos_hash if @pos_hash.empty?
+      @pos_hash[pos_to_sym(pos)]
+    end
+  end
+  # Add custom parsing methods to XML::Reader
+  class XML::Reader
+    public
+    # Get the next text node
+    def next_text
+      # read until a text node
+      while (self.node_type != XML::Reader::TYPE_TEXT and self.read); end
+      self.value
+    end
+    # Get the next entity node
+    def next_entity
+      # read until an entity node
+      while (self.node_type != XML::Reader::TYPE_ENTITY and
+        self.node_type != XML::Reader::TYPE_ENTITY_REFERENCE and
+        self.read); end
+      self.value
+    end
+  end
+end

data/lib/jdict.rb ADDED

@@ -0,0 +1,20 @@
+require 'configuration'
+require 'dictionaries/jmdict'
+module JDict
+  class << self
+    attr_accessor :configuration
+  end
+  def self.configuration
+    @configuration ||= Configuration.new
+  end
+  def self.reset
+    @configuration = Configuration.new
+  end
+  def self.configure
+    yield(configuration)
+  end
+end

data/lib/kana.rb ADDED

@@ -0,0 +1,4 @@
+module JDict
+  class Kana # empty placeholder: no behaviour is defined for kana elements here
+  end
+end

data/lib/kanji.rb ADDED

@@ -0,0 +1,4 @@
+module JDict
+  class Kanji # empty placeholder: no behaviour is defined for kanji elements here
+  end
+end

data/lib/ruby-jdict/version.rb ADDED

@@ -0,0 +1,3 @@
+module JDict
+  Version = '0.0.1'
+end

data/lib/sense.rb ADDED

@@ -0,0 +1,14 @@
+# The sense element will record the translational equivalent
+# of the Japanese word, plus other related information. Where there
+# are several distinctly different meanings of the word, multiple
+# sense elements will be employed.
+module JDict
+  class Sense
+    attr_reader :parts_of_speech, :glosses
+    #
+    # Create a new +Sense+
+    def initialize(parts_of_speech, glosses)
+      @parts_of_speech, @glosses = parts_of_speech, glosses
+    end
+  end
+end

data/lib/unicode.rb ADDED

@@ -0,0 +1,63 @@
+module JDict
+  module Unicode
+    # Codepoint ranges for japanese unicode characters (in decimal)
+    # from: http://unicode.org/charts/
+    module CodepointRanges
+      HIRAGANA           = 12352..12447
+      KATAKANA           = 12448..12543
+      KATAKANA_PHONETIC  = 12784..12799
+      HALFWIDTH_KATAKANA = 65280..65519
+      UNIFIED_CJK        = 19968..40911
+      UNIFIED_CJK_EXT_A  = 13312..19903
+      UNIFIED_CJK_EXT_B  = 131072..173791
+      PUNCTUATION        = 12288..12351
+    end
+    # Get Unicode hex codepoint from a Unicode character
+    def hex_codepoint(unicode_char)
+      unicode_char.unpack("U0U*")[0]
+    end
+    # TODO: write unit test with a variety of strings to ensure this method
+    #       returns the expected output
+    # Determine the script of the specified string:
+    #   :kanji
+    #   :kana
+    #   :english
+    def script_type?(unicode_string)
+      type = ''
+      unicode_string.each_char do |c|
+        code = hex_codepoint(c)
+        #kana
+        if CodepointRanges::HIRAGANA.include?(code)           ||
+           CodepointRanges::KATAKANA.include?(code)           ||
+           CodepointRanges::KATAKANA_PHONETIC.include?(code)  ||
+           CodepointRanges::HALFWIDTH_KATAKANA.include?(code) ||
+           CodepointRanges::PUNCTUATION.include?(code) then
+          type = :kana
+          break
+        #kanji
+        elsif CodepointRanges::UNIFIED_CJK.include?(code)        ||
+              CodepointRanges::UNIFIED_CJK_EXT_A.include?(code)  ||
+              CodepointRanges::UNIFIED_CJK_EXT_B.include?(code) then
+          type = :kanji
+        #english
+        else
+          type = :english
+        end
+      end
+      type
+    end
+    def japanese?(unicode_string)
+      type = script_type?(unicode_string)
+      type == :kanji || type == :kana
+    end
+    def english?(unicode_string)
+      type = script_type?(unicode_string)
+      type == :english
+    end
+  end
+end

data/spec/configuration_spec.rb ADDED

@@ -0,0 +1,20 @@
+require "spec_helper"
+require 'configuration'
+module JDict
+  describe Configuration do
+    describe "#debug" do
+      it "default value is false" do
+        Configuration.new.debug = false
+      end
+    end
+    describe "#debug=" do
+      it "can set value" do
+        config = Configuration.new
+        config.debug = true
+        expect(config.debug).to eq(true)
+      end
+    end
+  end
+end

data/spec/dictionary_spec.rb ADDED

@@ -0,0 +1,117 @@
+require 'spec_helper'
+require BASE_PATH + '/lib/dictionary'
+#require BASE_PATH + '/lib/jmdict'
+module DictionarySpecHelper
+  JMDICT_PATH = File.join(BASE_PATH+'/dictionaries/JMdict')
+  INDEX_PATH  = File.join(BASE_PATH+'/index')
+  def mock_index
+  end
+  class Increase
+    def initialize(&measure_proc) # + args
+      @measure_proc = measure_proc
+    end
+    def matches?(target)
+      @target = target
+      @original_value = @measure_proc.call
+      target.call
+      @new_value = @measure_proc.call
+      return @new_value.to_i > @original_value.to_i
+    end
+    def failure_message
+      "expected #{@new_value} to be greater than #{@original_value}"
+    end
+    def negative_failure_message
+      "expected #{@new_value} to not be greater than #{@original_value}"
+    end
+    def description
+      "increase #{@original_value}"
+    end
+  end
+  def increase(&measure_proc) # + args
+    Increase.new(&measure_proc)
+  end
+end
+module DictionarySpec
+include DictionarySpecHelper
+describe JDict::Dictionary do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "is searchable" do
+    @dictionary.should respond_to(:search)
+  end
+  it "can tell you whether or not it's loaded" do
+    @dictionary.should respond_to(:loaded?)
+  end
+  it "should generate fixtures" do
+    pending
+    @dictionary.should respond_to(:generate_fixtures)
+  end
+end
+describe JDict::Dictionary, "after initialization" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has no entries" do
+    @dictionary.size.should == 0
+  end
+  it "has an empty entries cache" do
+    @dictionary.entries_cache.empty?
+  end
+end
+describe JDict::Dictionary, "when loading from a dictionary file" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has at least 1 entry" do
+    pending("implement loading from index")
+    @dictionary.load(JMDICT_PATH)
+    @dictionary.size.should > 0
+  end
+  it "says it's loaded" do
+    pending("implement loading from index")
+    @dictionary.load(JMDICT_PATH)
+    # @dictionary.loaded?.should == true
+    @dictionary.loaded?.should equal(true)
+  end
+end
+describe JDict::Dictionary, "when loading from a dictionary file (already loaded)" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "has the same size as it did before being loaded"
+end
+describe JDict::Dictionary, "when searching" do
+  before do
+    @dictionary = JDict::Dictionary.new(INDEX_PATH)
+  end
+  it "should raise an error if an index isn't built yet"
+  it "should give no results if the search phrase is empty" do
+    @dictionary.search('').should be_empty
+  end
+end
+end

data/spec/fixtures/feeds/sample_entry.xml ADDED

@@ -0,0 +1,33 @@
+<JMdict>
+<entry>
+<ent_seq>1171270</ent_seq>
+<k_ele>
+<keb>右翼</keb>
+<ke_pri>ichi1</ke_pri>
+<ke_pri>news1</ke_pri>
+<ke_pri>nf04</ke_pri>
+</k_ele>
+<r_ele>
+<reb>うよく</reb>
+<re_pri>ichi1</re_pri>
+<re_pri>news1</re_pri>
+<re_pri>nf04</re_pri>
+</r_ele>
+<sense>
+<pos>&n;</pos>
+<gloss>right-wing</gloss>
+<gloss g_lang="fr">aile droite (oiseau, armée, parti politique, base-ball)</gloss>
+<gloss g_lang="ru">пра́вое крыло́</gloss>
+<gloss g_lang="ru">пра́вый фланг</gloss>
+<gloss g_lang="de">rechter Flügel</gloss>
+</sense>
+<sense>
+<gloss g_lang="de">{Sport}</gloss>
+<gloss g_lang="de">rechte Flanke</gloss>
+<gloss g_lang="de">rechter Flügel</gloss>
+</sense>
+<sense>
+<gloss g_lang="de">die Rechte</gloss>
+</sense>
+</entry>
+</JMdict>

data/spec/index_spec.rb ADDED

@@ -0,0 +1,84 @@
+require 'rubygems'
+require File.dirname(__FILE__) + '/spec_helper'
+require BASE_PATH + '/lib/dictionary'
+require BASE_PATH + '/lib/jmdict'
+require BASE_PATH + '/lib/index'
+require 'fileutils'
+# Mixin reserved for helpers shared by the DictIndex specs.
+# It currently defines nothing; including it has no effect.
+module IndexSpecHelper
+end
+# Interface checks for JDict::DictIndex: every example only verifies that the
+# index responds to the expected message, except the last one, which checks
+# that construction with a bad dictionary path raises.
+describe JDict::DictIndex do
+  include IndexSpecHelper
+  before do
+    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
+  end
+  # Searching
+  it "is searchable" do
+    expect(@index).to respond_to(:search)
+  end
+  # Building
+  it "is buildable" do
+    expect(@index).to respond_to(:build) # and return an index
+  end
+  it "is rebuildable" do
+    expect(@index).to respond_to(:rebuild)
+  end
+  it "tells whether it's built or not" do
+    expect(@index).to respond_to(:built?)
+  end
+  # Destroying
+  it "is destroyable" do
+    expect(@index).to respond_to(:destroy)
+  end
+  it "raises an error if an invalid dictionary path is specified" do
+    expect { JDict::DictIndex.new(INDEX_PATH, 'bad_dictionary_path') }.to raise_error
+  end
+end
+describe JDict::DictIndex, "after initialization" do
+  it "the path should be set" do
+    @index = JDict::DictIndex.new(INDEX_PATH)
+    @index.path.should_not be(nil)
+    @index.path.should_not be('')
+  end
+end
+describe JDict::DictIndex, "when building," do
+  it "it is created on the file system" do
+    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
+    @index.build
+    File.exists?(INDEX_PATH).should == true
+  end
+  it "its directory on the file system shouldn't be empty" do
+    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH,
+                           false) #no lazy loading
+    @index.build
+    # .
+    # ..
+    # ^-------- an empty directory has only these 2 entries
+    expect(Dir.entries(INDEX_PATH).size).to be >= 3
+  end
+  it "loads from a dictionary file"
+end
+# Rebuilding is only valid for an index that already exists on disk.
+describe JDict::DictIndex, "when rebuilding" do
+  include FileUtils
+  it "raises an error if it doesn't already exist" do
+    # Start from a clean slate so the index is guaranteed to be missing.
+    rm_rf(INDEX_PATH)
+    # File.exists? is deprecated and was removed in Ruby 3.2; use File.exist?.
+    File.exist?(INDEX_PATH).should == false
+    lambda {
+      JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH).rebuild
+      }.should raise_error
+  end
+end

data/spec/jdict_spec.rb ADDED

@@ -0,0 +1,17 @@
+require 'spec_helper'
+describe JDict do
+  describe "#configure" do
+    before do
+      JDict.configure do |config|
+        config.dictionary_path = DICT_PATH
+        config.debug = true
+      end
+    end
+    it "uses the configured path" do
+      expect(JDICT.dictionary_path).to eq(DICT_PATH)
+    end
+  end
+end

data/spec/jmdict_spec.rb ADDED

@@ -0,0 +1,19 @@
+require 'spec_helper'
+require BASE_PATH + '/lib/dictionary'
+require BASE_PATH + '/lib/jmdict'
+module JMDictSpecHelper
+  INDEX_PATH  = File.join(BASE_PATH+'/index')
+end
+describe JDict::JMDict do
+  include JMDictSpecHelper
+  before do
+    @jmdict = JDict::JMDict.new(JMDictSpecHelper::INDEX_PATH)
+  end
+  it do
+    @jmdict.should be_a_kind_of(JDict::Dictionary)
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,50 @@
+# Shared setup for every spec file: loads the bundled environment and
+# defines the paths the specs read from and write to.
+require 'rubygems'      # use gems
+require 'bundler/setup' # load up the bundled environment
+# require 'spec'          # test framework (disabled)
+# Switch on Ruby's global debug flag for the whole spec run.
+$DEBUG = true
+# Project root, expressed relative to this file's directory (spec/..).
+BASE_PATH   = File.dirname(__FILE__) + '/..'
+# Location of the index used by the specs (the index specs build and delete it).
+INDEX_PATH  = BASE_PATH + '/test_index'
+# Location of the JMdict dictionary file passed to Dictionary#load and DictIndex.new.
+JMDICT_PATH = BASE_PATH + '/dictionaries/JMdict'
+##
+# rSpec Hash additions.
+#
+# From
+#   * http://wincent.com/knowledge-base/Fixtures_considered_harmful%3F
+#   * Neil Rahilly
+class Hash
+  ##
+  # Filter keys out of a Hash.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.except(:a)
+  #   => { :b => 2, :c => 3 }
+  def except(*keys)
+    self.reject { |k,v| keys.include?(k || k.to_sym) }
+  end
+  ##
+  # Override some keys.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.with(:a => 4)
+  #   => { :a => 4, :b => 2, :c => 3 }
+  def with(overrides = {})
+    self.merge overrides
+  end
+  ##
+  # Returns a Hash with only the pairs identified by +keys+.
+  #
+  #   { :a => 1, :b => 2, :c => 3 }.only(:a)
+  #   => { :a => 1 }
+  def only(*keys)
+    self.reject { |k,v| !keys.include?(k || k.to_sym) }
+  end
+end

metadata ADDED

@@ -0,0 +1,124 @@
+--- !ruby/object:Gem::Specification
+name: ruby-jdict
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Ian Pickering
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-22 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: libxml-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+- !ruby/object:Gem::Dependency
+  name: amalgalite
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+- !ruby/object:Gem::Dependency
+  name: autotest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+description:
+email:
+- ipickering2@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSING
+- README.md
+- Rakefile
+- examples/query.rb
+- lib/#sense.rb#
+- lib/configuration.rb
+- lib/constants.rb
+- lib/dictionaries/jmdict.rb
+- lib/dictionary.rb
+- lib/entry.rb
+- lib/index.rb
+- lib/jdict.rb
+- lib/kana.rb
+- lib/kanji.rb
+- lib/ruby-jdict/version.rb
+- lib/sense.rb
+- lib/unicode.rb
+- spec/configuration_spec.rb
+- spec/dictionary_spec.rb
+- spec/fixtures/feeds/sample_entry.xml
+- spec/index_spec.rb
+- spec/jdict_spec.rb
+- spec/jmdict_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/Ruin0x11/ruby-jdict
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5.1
+signing_key:
+specification_version: 4
+summary: Ruby gem for accessing Jim Breen's Japanese dictionaries
+test_files: []
+has_rdoc: