ruby-jdict 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSING +28 -0
- data/README.md +10 -0
- data/Rakefile +30 -0
- data/examples/query.rb +29 -0
- data/lib/#sense.rb# +14 -0
- data/lib/configuration.rb +20 -0
- data/lib/constants.rb +64 -0
- data/lib/dictionaries/jmdict.rb +14 -0
- data/lib/dictionary.rb +62 -0
- data/lib/entry.rb +79 -0
- data/lib/index.rb +346 -0
- data/lib/jdict.rb +20 -0
- data/lib/kana.rb +4 -0
- data/lib/kanji.rb +4 -0
- data/lib/ruby-jdict/version.rb +3 -0
- data/lib/sense.rb +14 -0
- data/lib/unicode.rb +63 -0
- data/spec/configuration_spec.rb +20 -0
- data/spec/dictionary_spec.rb +117 -0
- data/spec/fixtures/feeds/sample_entry.xml +33 -0
- data/spec/index_spec.rb +84 -0
- data/spec/jdict_spec.rb +17 -0
- data/spec/jmdict_spec.rb +19 -0
- data/spec/spec_helper.rb +50 -0
- metadata +124 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 48dbfb86f9f72639eae7cecdde05da6953afdc8c
|
4
|
+
data.tar.gz: 01bae383b6df3ae0e9a524e094d7f1f1890663cd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4253b05fc65786103431707d298711170b8cc4cd426919b1dee06ba37a767766021ac80d1207a31a61bf8ea9a15466cf5e173f3b155328986dc674978793b5cd
|
7
|
+
data.tar.gz: 47a4b27fe519e1284bfd5311404f18489d701ddedc754d9203eaec07101bb4485378760de4f51e56f4fbe7aa235925e01505b18943e81d22af1cbb5464eca801
|
data/LICENSING
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
Copyright (C) 2015 Ian Pickering
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions
|
6
|
+
are met:
|
7
|
+
|
8
|
+
1. Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
2. Redistributions in binary form must reproduce the above copyright
|
11
|
+
notice, this list of conditions and the following disclaimer in
|
12
|
+
the documentation and/or other materials provided with the
|
13
|
+
distribution.
|
14
|
+
3. The name of the author may not be used to endorse or promote
|
15
|
+
products derived from this software without specific prior
|
16
|
+
written permission.
|
17
|
+
|
18
|
+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS
|
19
|
+
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
20
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
21
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
22
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
23
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
24
|
+
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
25
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
26
|
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
27
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
28
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Ruby-JDict
|
2
|
+
Ruby gem for accessing Jim Breen's Japanese dictionaries. Can currently access the following:
|
3
|
+
* JMdict (Japanese-English dictionary)
|
4
|
+
|
5
|
+
Dictionary files are located [here](http://www.csse.monash.edu.au/~jwb/wwwjdicinf.html#dicfil_tag).
|
6
|
+
|
7
|
+
## Install
|
8
|
+
```
|
9
|
+
gem install ruby-jdict
|
10
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'rubygems'
require 'rake' # task runner
require 'fileutils'

# Directory (relative to the working directory) that holds the search index.
INDEX_PATH  = 'index'
# Location of the JMdict XML dictionary file.
JMDICT_PATH = 'dictionaries/JMdict'

namespace :index do
  desc "Build the dictionary's search index"
  task :build do
    raise "Index already exists at path #{File.expand_path(INDEX_PATH)}" if File.exist?(INDEX_PATH)

    # Load the library only when the task actually runs, so that listing or
    # running other tasks does not require the XML/SQLite dependencies.
    lib_dir = File.expand_path('lib', __dir__)
    $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
    require 'jdict'

    # Third argument is lazy loading: false builds the index immediately.
    index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH, false)

    puts "Index created at path #{File.expand_path(INDEX_PATH)}" if File.exist?(INDEX_PATH)
    # DictIndex does not define #size in this version; only report when available.
    puts "Index with #{index.size} entries." if index.respond_to?(:size)
  end

  desc "Destroy the dictionary's search index"
  task :destroy do
    # FileUtils removes the directory tree with the current user's
    # permissions; no shelling out to sudo is needed.
    if File.exist?(INDEX_PATH)
      FileUtils.rm_rf(INDEX_PATH)
      puts "Index destroyed at path #{File.expand_path(INDEX_PATH)}"
    else
      puts "No index found at path #{File.expand_path(INDEX_PATH)}"
    end
  end
end
|
data/examples/query.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
# Example: configure JDict, search JMdict for a word and print every result.
require 'jdict'

BASE_PATH  = ENV["HOME"] || Dir.home
DICT_PATH  = File.join(BASE_PATH, '.dicts')
INDEX_PATH = DICT_PATH

JDict.configure do |config|
  config.dictionary_path = DICT_PATH   # directory containing dictionary files
  config.index_path      = INDEX_PATH  # directory containing the full text search index
  config.language        = JDict::JMDictConstants::Languages::ENGLISH # language for search results
  config.num_results     = 50          # maximum results to return from searching
end

dict = JDict::JMDict.new

query = "日本語"

dict.search(query).each do |entry|
  puts entry.kanji.join(", ")
  puts entry.kana.join(", ")
  entry.senses.each do |sense|
    # Entry.from_sql leaves parts_of_speech (and possibly glosses) nil for
    # senses that carry none, so coerce to an array before joining.
    glosses         = Array(sense.glosses).join(", ")
    parts_of_speech = Array(sense.parts_of_speech).join(", ")
    puts "(" + parts_of_speech + ") " + glosses
  end
  puts
end
|
data/lib/#sense.rb#
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# The sense element will record the translational equivalent
# of the Japanese word, plus other related information. Where there
# are several distinctly different meanings of the word, multiple
# sense elements will be employed.
module JDict
  class Sense
    attr_reader :parts_of_speech, :glosses, :language

    # Create a new +Sense+
    # @param parts_of_speech [Array<String>, nil] part-of-speech codes for this sense
    # @param glosses [Hash, Array, nil] the translations; the indexer passes a
    #   Hash of language => texts, Entry.from_sql passes an Array
    # @param language [Symbol, nil] optional language of the glosses. It is
    #   optional so the two-argument calls used by the indexer and by
    #   Entry.from_sql keep working.
    def initialize(parts_of_speech, glosses, language = nil)
      @parts_of_speech = parts_of_speech
      @glosses = glosses
      @language = language
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'constants'
|
2
|
+
|
3
|
+
module JDict
  # Holds the user-adjustable settings read by the rest of the library
  # through JDict.configuration.
  class Configuration
    attr_accessor :dictionary_path, :index_path, :num_results, :language, :lazy_index_loading, :debug

    # Fall back to Dir.home so that loading this file does not fail with a
    # TypeError in File.join when the HOME environment variable is unset.
    BASE_PATH  = ENV["HOME"] || Dir.home
    DICT_PATH  = File.join(BASE_PATH, '.dicts')
    INDEX_PATH = DICT_PATH

    # Populate every setting with its default value.
    def initialize
      @dictionary_path    = DICT_PATH   # directory containing dictionary files
      @index_path         = INDEX_PATH  # directory containing the full text search index
      @num_results        = 50          # maximum results to return from searching
      @language           = JDict::JMDictConstants::Languages::ENGLISH # language to return search results in
      @lazy_index_loading = false       # load the index only on attempting to access it
      @debug              = false       # limit number of entries indexed, rebuild index on instantiation
    end
  end
end
|
data/lib/constants.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Constants and descriptions for important elements/attributes
|
2
|
+
# of the JMdict XML dictionary.
|
3
|
+
# Descriptions come from JMdict.dtd (document type definition)
|
4
|
+
module JDict
  module JMDictConstants

    # TODO: change these strings to symbols ?
    # XML elements of the JMDict file.
    # The strings are frozen so the shared constants cannot be mutated.
    module Elements
      # Entries consist of kanji elements, kana elements,
      # general information and sense elements. Each entry must have at
      # least one kana element and one sense element. Others are optional.
      ENTRY    = 'entry'.freeze
      SEQUENCE = 'ent_seq'.freeze

      # This element will contain a word or short phrase in Japanese
      # which is written using at least one kanji. The valid characters are
      # kanji, kana, related characters such as chouon and kurikaeshi, and
      # in exceptional cases, letters from other alphabets.
      KANJI = 'keb'.freeze

      # This element content is restricted to kana and related
      # characters such as chouon and kurikaeshi. Kana usage will be
      # consistent between the keb and reb elements; e.g. if the keb
      # contains katakana, so too will the reb.
      KANA = 'reb'.freeze

      # The sense element will record the translational equivalent
      # of the Japanese word, plus other related information. Where there
      # are several distinctly different meanings of the word, multiple
      # sense elements will be employed.
      SENSE = 'sense'.freeze

      # Part-of-speech information about the entry/sense. Should use
      # appropriate entity codes.
      PART_OF_SPEECH = 'pos'.freeze

      # Within each sense will be one or more "glosses", i.e.
      # target-language words or phrases which are equivalents to the
      # Japanese word. This element would normally be present, however it
      # may be omitted in entries which are purely for a cross-reference.
      GLOSS = 'gloss'.freeze

      CROSSREFERENCE = 'xref'.freeze
    end

    # Constants for selecting the search language.
    # Used in the "gloss" element's xml:lang attribute.
    # :eng never appears as a xml:lang constant because gloss is assumed to be English when not specified
    # :jpn never appears as a xml:lang because the dictionary itself pivots around Japanese
    module Languages
      JAPANESE  = :jpn
      ENGLISH   = :eng
      DUTCH     = :dut
      FRENCH    = :fre
      GERMAN    = :ger
      RUSSIAN   = :rus
      SPANISH   = :spa
      SLOVENIAN = :slv
      SWEDISH   = :swe
      HUNGARIAN = :hun
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'jdict'
|
2
|
+
require 'dictionary'
|
3
|
+
|
4
|
+
module JDict
  # Dictionary whose source file is the "JMdict" file inside the configured
  # dictionary directory.
  class JMDict < Dictionary
    # @param index_path [String] directory of the search index
    #   (defaults to the configured index path)
    # @param lazy_index_loading [Boolean] passed through to Dictionary
    #   (defaults to the configured value)
    def initialize(index_path = JDict.configuration.index_path, lazy_index_loading = JDict.configuration.lazy_index_loading)
      # File.join avoids a doubled separator when the configured directory
      # already ends in "/".
      path = File.join(JDict.configuration.dictionary_path, 'JMdict')
      super(index_path, path, lazy_index_loading)
    end
  end
end
|
data/lib/dictionary.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'jdict'
|
2
|
+
require 'index'
|
3
|
+
|
4
|
+
module JDict
  # A searchable dictionary that delegates queries to a DictIndex.
  class Dictionary
    attr_reader :entries_cache, :lazy_index_loading

    # @param index_path [String] directory of the search index
    # @param dictionary_path [String, nil] path of the dictionary file; when
    #   given it must exist
    # @param lazy_index_loading [Boolean] when true the index is built on the
    #   first search instead of at construction time
    # @raise [RuntimeError] if +dictionary_path+ is given but does not exist
    def initialize(index_path = JDict.configuration.index_path, dictionary_path = nil, lazy_index_loading = JDict.configuration.lazy_index_loading)
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      if !dictionary_path.nil? && !File.exist?(dictionary_path)
        raise "Dictionary not found at path #{dictionary_path}"
      end

      # store some args for future reference
      @dictionary_path = dictionary_path
      @lazy_index_loading = lazy_index_loading

      @entries = []
      @entries_cache = []

      # instantiate and load the full-text search index
      @index = DictIndex.new(index_path, dictionary_path, lazy_index_loading)
    end

    # @return [Integer] size of the in-memory entries list (nothing in this
    #   class adds to it, so it stays 0)
    def size
      @entries.size
    end

    # @return [Boolean] whether the underlying index reports itself as built
    def loaded?
      @index.built?
    end

    # Search this dictionary's index for the given string.
    # @param query [String] the search query
    # @return [Array(Entry)] the results of the search
    def search(query)
      return [] if query.empty?

      load_index if lazy_index_loading && !loaded?

      @index.search(query)
    end

    # Retrieves the definition of a part-of-speech from its abbreviation
    # @param pos [String] the abbreviation for the part-of-speech
    # @return [String] the full description of the part-of-speech
    def get_pos(pos)
      @index.get_pos(pos)
    end

    private

    # Build the index on demand.
    # @raise [RuntimeError] if the index is already loaded (the original
    #   constructed an Exception here but never raised it)
    def load_index
      raise "Dictionary index is already loaded" if loaded?

      @index.build
    end
  end
end
|
data/lib/entry.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#include Constants #XML constants from the dictionary file
|
2
|
+
|
3
|
+
# Entries consist of kanji elements, kana elements,
# general information and sense elements. Each entry must have at
# least one kana element and one sense element. Others are optional.
module JDict
  class Entry

    attr_accessor :kanji, :kana, :senses

    # Create a new Entry
    # entry = initialize(kanji, kana, senses)
    def initialize(kanji, kana, senses)
      @kanji, @kana, @senses = kanji, kana, senses
    end

    KANA_RE  = /^kana/
    SENSE_RE = /^sense/
    # Matches the "[[pos$$pos]] " prefix that #to_sql writes before the glosses.
    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
    # Matches the "[[]] " prefix #to_sql writes for an empty part-of-speech list.
    EMPTY_PART_OF_SPEECH_RE = /\A\[\[\]\]\s*/

    MEANING_SENTINEL        = '**'.freeze
    PART_OF_SPEECH_SENTINEL = '$$'.freeze
    SENSE_SENTINEL          = '%%'.freeze
    LANGUAGE_SENTINEL       = '&&'.freeze
    GLOSS_SENTINEL          = '@@'.freeze

    # Converts an SQLite row from the index to the Entry format.
    # Senses are returned in the order they were stored (the original sorted
    # the serialized strings alphabetically, which reordered them).
    # @param row [#[]] row with "kanji", "kana" and "senses" string columns
    # @return [Entry]
    def self.from_sql(row)
      kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
      kana  = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }

      senses = row["senses"].split(SENSE_SENTINEL).map do |txt|
        parts_of_speech, gloss_text = split_parts_of_speech(txt)
        glosses = parse_glosses(gloss_text)
        # Prefer the configured language, fall back to English.
        glosses_for_lang = glosses[JDict.configuration.language] ||
                           glosses[JDict::JMDictConstants::Languages::ENGLISH]
        Sense.new(parts_of_speech, glosses_for_lang)
      end

      new(kanji, kana, senses)
    end

    # Separates the part-of-speech prefix from the gloss text of one
    # serialized sense.
    # @return [Array] [parts_of_speech (Array<String> or nil), gloss text]
    def self.split_parts_of_speech(txt)
      match = PART_OF_SPEECH_RE.match(txt)
      if match
        [match[1].split(PART_OF_SPEECH_SENTINEL), match.post_match]
      else
        # Either an empty "[[]] " prefix or no prefix at all; only strip the
        # prefix when it is really there instead of cutting a fixed 5 chars.
        [nil, txt.sub(EMPTY_PART_OF_SPEECH_RE, '')]
      end
    end
    private_class_method :split_parts_of_speech

    # Parses "lang&&a**b@@lang&&c" into { lang: [["a", "b"]], ... }.
    # Each language maps to an array holding one meanings array per gloss
    # group, the same nesting the original produced.
    def self.parse_glosses(gloss_text)
      gloss_strings = gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL)
      gloss_strings.each_with_object({}) do |str, glosses|
        # Limit 2 keeps any later "&&" inside the meaning text; to_s guards
        # against a language tag with no text after it.
        lang, meaning_string = str.split(LANGUAGE_SENTINEL, 2)
        next if lang.nil? || lang.empty?
        (glosses[lang.to_sym] ||= []) << meaning_string.to_s.split(MEANING_SENTINEL)
      end
    end
    private_class_method :parse_glosses

    # Converts an Entry to a string to be indexed into the SQLite database.
    # Expects each sense's glosses to be a Hash of language => texts.
    # @return [Hash] bind parameters (":kanji", ":kana", ":senses") for the insert
    def to_sql
      sense_strings = senses.map do |s|
        prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''
        gloss_part = s.glosses.map do |lang, texts|
          lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL)
        end.join(GLOSS_SENTINEL)
        prefix + gloss_part
      end

      {
        ':kanji'  => kanji.join(", "),
        ':kana'   => kana.join(", "),
        ':senses' => sense_strings.join(SENSE_SENTINEL)
      }
    end

    # Get an array of +Senses+ for the specified language.
    # A sense matches when its glosses Hash has the language as a key, or,
    # if the sense exposes a #language reader, when that equals +l+.
    # (Sense did not define #language, so the original always raised.)
    def senses_by_language(l)
      senses.select do |s|
        (s.glosses.respond_to?(:key?) && s.glosses.key?(l)) ||
          (s.respond_to?(:language) && s.language == l)
      end
    end
  end
end
|
data/lib/index.rb
ADDED
@@ -0,0 +1,346 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'rubygems' #use gems
|
3
|
+
require 'bundler/setup' #load up the bundled environment
|
4
|
+
|
5
|
+
require 'amalgalite'
|
6
|
+
require 'libxml' #XML parsing
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
require_relative 'constants' #XML constants from the dictionary file
|
10
|
+
|
11
|
+
require_relative 'entry' #dictionary elements
|
12
|
+
require_relative 'kanji' #...
|
13
|
+
require_relative 'kana' #...
|
14
|
+
require_relative 'sense'
|
15
|
+
|
16
|
+
require 'amalgalite'
|
17
|
+
|
18
|
+
include LibXML
|
19
|
+
|
20
|
+
module JDict
|
21
|
+
class DictIndex
|
22
|
+
|
23
|
+
LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
|
24
|
+
NUM_ENTRIES_TO_INDEX = 50
|
25
|
+
ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
|
26
|
+
|
27
|
+
attr_reader :path
|
28
|
+
# Initialize a full-text search index backend for JMdict
# @param index_path [String] desired filesystem path where you'd like the *search index* stored
# @param dictionary_path [String] filesystem path of the *dictionary* file; must exist when given
# @param lazy_loading [Boolean] lazily load the index just when it's needed, instead of building it ahead of time
# @raise [RuntimeError] if +index_path+ is nil or +dictionary_path+ does not exist
def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
  raise "Index path was nil" if index_path.nil?

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  if !dictionary_path.nil? && !File.exist?(dictionary_path)
    raise "Dictionary not found at path #{dictionary_path}"
  end

  @path = index_path
  @dictionary_path = dictionary_path
  @pos_hash = {}

  # create path if nonexistent
  FileUtils.mkdir_p(@path)
  db_file = File.join(@path, "fts5.db")

  # in debug mode always start from a fresh database file
  File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  # check whether the search table already holds entries
  already_built = built?

  # build the index right now unless "lazy loading" is on, or the index is
  # already populated and we are not in debug mode
  build unless lazy_loading || (already_built && !JDict.configuration.debug)

  # make the hash from abbreviated parts of speech to full definitions
  build_pos_hash
end
|
63
|
+
|
64
|
+
# Creates the SQL schema for the Amalgalite database.
# Does nothing when the "search" table is already present; otherwise
# creates it as an FTS5 virtual table and reloads the cached schema.
def create_schema
  return if @index.schema.tables['search']

  @index.execute_batch <<-SQL
    CREATE VIRTUAL TABLE search USING fts5(
      kanji,
      kana,
      senses
    );
  SQL
  @index.reload_schema!
end
|
78
|
+
|
79
|
+
# Returns the search results as an array of +Entry+
|
80
|
+
# @param term [String] the search string
|
81
|
+
# @param language [Symbol] the language to return results in
|
82
|
+
# @return [Array(Entry)] the results of the search
|
83
|
+
def search(term, language=LANGUAGE_DEFAULT, exact=false)
|
84
|
+
raise "Index not found at path #{@path}" unless File.exists? @path
|
85
|
+
|
86
|
+
# no results yet...
|
87
|
+
results = []
|
88
|
+
|
89
|
+
@entries_cache = []
|
90
|
+
|
91
|
+
# search for:
|
92
|
+
# kanji... one field
|
93
|
+
# kana ... up to 10 fields
|
94
|
+
# sense... up to 10 fields
|
95
|
+
# query = 'kanji OR ' + (0..10).map { |x| "kana_#{x} OR sense_#{x}" }.join(' OR ') + ":\"#{term}\""
|
96
|
+
query = "{kanji kana senses} : \"#{term}\""
|
97
|
+
query += "*" unless exact
|
98
|
+
|
99
|
+
@index.execute("SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH '#{query}' LIMIT #{JDict.configuration.num_results}") do |row|
|
100
|
+
entry = Entry.from_sql(row)
|
101
|
+
score = 0.0
|
102
|
+
|
103
|
+
# load entry from the index. from cache, if it's available
|
104
|
+
# load from cache if it's available
|
105
|
+
# if entry = @entries_cache[docid]
|
106
|
+
# entry = Entry.from_index_doc(@ferret_index[docid].load)
|
107
|
+
# @entries_cache[docid] = entry
|
108
|
+
# end
|
109
|
+
|
110
|
+
# # load entry from the index
|
111
|
+
# if entry.nil?
|
112
|
+
# entry = Entry.from_index_doc(@ferret_index[docid].load)
|
113
|
+
# @entries_cache[docid] = entry
|
114
|
+
# end
|
115
|
+
|
116
|
+
is_exact_match = false
|
117
|
+
is_exact_match = entry.kanji == term ||
|
118
|
+
entry.kana.any? { |k| k == term }
|
119
|
+
|
120
|
+
re = Regexp.new("#{term}", Regexp::IGNORECASE) # match the search term, ignoring case
|
121
|
+
# entry.senses.each do |s|
|
122
|
+
# s.glosses.each { |g| is_exact_match = is_exact_match || g.force_encoding("UTF-8").match(re) }
|
123
|
+
# end
|
124
|
+
|
125
|
+
# score = 1.0 if is_exact_match
|
126
|
+
|
127
|
+
# add the result
|
128
|
+
results << [score, entry]
|
129
|
+
end
|
130
|
+
|
131
|
+
@entries_cache = []
|
132
|
+
|
133
|
+
results.sort { |x, y| y[0] <=> x[0] }.map { |x| x[1] }
|
134
|
+
end
|
135
|
+
|
136
|
+
# Reports whether the search table holds at least one row.
# @return [Boolean] true when the row count of "search" is not zero
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  row_count != 0
end
|
137
|
+
|
138
|
+
# Builds the full-text search index by streaming the dictionary XML and
# inserting one row per <entry> inside a single database transaction.
# @param overwrite [Boolean] accepted for compatibility; not used by this method
# @param dictionary_path [String] path to the dictionary file (replaces the
#   path given to the constructor when not nil)
# @return [Integer] the number of indexed entries
# @raise [RuntimeError] if no dictionary path is known, the file is missing,
#   or an entry has no kana element
def build(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)

  puts "Building index..."

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  elements = JDict::JMDictConstants::Elements

  # components of the entry / sense currently being read
  kanji, kana, senses = [], [], []
  glosses = {}
  parts_of_speech = []

  entries_added = 0

  begin
    @index.transaction do |db_transaction|
      # read until the end
      while reader.read
        # check what type of node we're currently on
        case reader.node_type

        # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when elements::SEQUENCE
            # TODO: add the sequence number to the entry; for now just
            # advance past its text node
            reader.next_text

          # TODO: Raise an exception if reader.next_text.empty? inside the when's
          # JMdict shouldn't have any empty elements, I believe.
          when elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when elements::GLOSS
            # Read the language before advancing to the text node. to_sym
            # works for both a String attribute and the Symbol default
            # (Symbol has no #intern).
            language = (reader.node.lang || LANGUAGE_DEFAULT).to_sym
            text = reader.next_text
            (glosses[language] ||= []) << text unless text.empty?

          when elements::CROSSREFERENCE
            # cross references are skipped, but their text node is consumed
            reader.next_text
          end

        # XML entity references are treated as a different node type
        # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

        # end-of-element node
        when XML::Reader::TYPE_END_ELEMENT
          case reader.name
          when elements::SENSE
            senses << Sense.new(parts_of_speech, glosses)

            # clear data for the next sense
            glosses = {}
            parts_of_speech = []

          # we're at the end of the entry element, so index it
          when elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            insert_data = Entry.new(kanji, kana, senses).to_sql

            db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
              stmt.execute( insert_data )
            end

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1

            # debug mode indexes only the first few entries
            break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
          end
        end
      end
    end
  ensure
    # Done reading & indexing; close the reader even when indexing failed
    reader.close
  end

  entries_added
end
|
260
|
+
|
261
|
+
# Empties the search table and builds the index again.
# The original raised whenever the index path existed; since the
# constructor always creates that path, it could never rebuild anything.
# @return whatever #build returns
# @raise [RuntimeError] if the index path no longer exists
def rebuild
  raise "Index not found at path #{@path}" unless File.exist?(@path)

  # Remove the existing rows so the rebuild does not duplicate entries.
  @index.execute("DELETE FROM search")
  build
end
|
265
|
+
|
266
|
+
# Creates an XML::Reader object for the given path
# @param dictionary_path [String] path to the dictionary file
# @return [XML::Reader] the reader for the given dictionary
# @raise [RuntimeError] if no reader could be created
def open_reader(dictionary_path)
  # The original wrapped this in Dir.chdir(Dir.pwd) and a single-argument
  # File.join; both were no-ops, and a chdir block warns when nested
  # inside another chdir block.
  reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
  reader
end
|
279
|
+
|
280
|
+
# Creates the hash of part-of-speech symbols to full definitions from the
# entity declarations in the dictionary's document type node.
# The hash is built once; later calls return the populated hash. (The
# original used `@pos_hash ||=`, which never ran because the constructor
# presets @pos_hash to an empty hash.)
# @return [Hash{Symbol => String}] empty when no dictionary path is known
def build_pos_hash
  return @pos_hash unless @pos_hash.nil? || @pos_hash.empty?

  @pos_hash = {}
  # Without a dictionary file there is nothing to read the entities from.
  return @pos_hash if @dictionary_path.nil?

  reader = open_reader(@dictionary_path)
  begin
    # Stop at end of input too, so a file without a doctype or element
    # cannot loop forever.
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # Iterating the node's children was noted to segfault at random, so
        # the entities are extracted from the doctype's text instead.
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          @pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # the doctype precedes the first element; nothing more to find
        break
      end
    end
  ensure
    reader.close
  end

  @pos_hash
end
|
310
|
+
|
311
|
+
# Converts a part-of-speech entity reference string into a symbol,
# replacing every "-" with "_" (e.g. "adj-na" becomes :adj_na)
# @param entity [String] the entity reference string
# @return [Symbol] the part-of-speech symbol
def pos_to_sym(entity)
  entity.tr('-', '_').to_sym
end
|
317
|
+
|
318
|
+
# Retrieves the definition of a part-of-speech from its abbreviation
# @param pos [String] the abbreviation for the part-of-speech
# @return [String, nil] the full description of the part-of-speech, or nil
#   when the abbreviation is unknown
def get_pos(pos)
  # Populate the lookup table on first use; the nil check keeps this from
  # failing if the table has not been initialised yet.
  build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
  @pos_hash[pos_to_sym(pos)]
end
|
325
|
+
end
|
326
|
+
|
327
|
+
# Add custom parsing methods to XML::Reader
class XML::Reader
  # Get the next text node.
  # Advances the reader until it sits on a text node or the input ends
  # (if already on a text node it does not move), then returns the value
  # of the node the reader stopped on.
  def next_text
    until node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Get the next entity node.
  # Advances the reader until it sits on an entity or entity-reference node
  # or the input ends, then returns the value of the node it stopped on.
  def next_entity
    until node_type == XML::Reader::TYPE_ENTITY ||
          node_type == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
|
346
|
+
end
|
data/lib/jdict.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'configuration'
|
2
|
+
require 'dictionaries/jmdict'
|
3
|
+
|
4
|
+
module JDict
  class << self
    # Only the writer is generated; the reader below supplies the lazy
    # default. (attr_accessor followed by a redefinition of the reader
    # triggers a "method redefined" warning.)
    attr_writer :configuration

    # @return [Configuration] the current configuration, created with
    #   default values on first access
    def configuration
      @configuration ||= Configuration.new
    end

    # Replace the current configuration with a fresh default one.
    # @return [Configuration] the new configuration
    def reset
      @configuration = Configuration.new
    end

    # Yields the current configuration so the caller can adjust it.
    # @yieldparam config [Configuration]
    def configure
      yield(configuration)
    end
  end
end
|
data/lib/kana.rb
ADDED
data/lib/kanji.rb
ADDED
data/lib/sense.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# The sense element will record the translational equivalent
# of the Japanese word, plus other related information. Where there
# are several distinctly different meanings of the word, multiple
# sense elements will be employed.
module JDict
  class Sense
    attr_reader :parts_of_speech, :glosses, :language

    # Create a new +Sense+
    # @param parts_of_speech [Array<String>, nil] part-of-speech codes for this sense
    # @param glosses [Hash, Array, nil] the translations; the indexer passes a
    #   Hash of language => texts, Entry.from_sql passes an Array
    # @param language [Symbol, nil] optional language of the glosses; exposed
    #   through #language, which Entry#senses_by_language reads
    def initialize(parts_of_speech, glosses, language = nil)
      @parts_of_speech = parts_of_speech
      @glosses = glosses
      @language = language
    end
  end
end
|
data/lib/unicode.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
module JDict
  module Unicode
    # Make the helpers callable directly on the module
    # (JDict::Unicode.japanese?("...")) as well as through `include`.
    extend self

    # Codepoint ranges for japanese unicode characters (in decimal)
    # from: http://unicode.org/charts/
    module CodepointRanges
      HIRAGANA           = 12352..12447
      KATAKANA           = 12448..12543
      KATAKANA_PHONETIC  = 12784..12799
      HALFWIDTH_KATAKANA = 65280..65519
      UNIFIED_CJK        = 19968..40911
      UNIFIED_CJK_EXT_A  = 13312..19903
      UNIFIED_CJK_EXT_B  = 131072..173791
      PUNCTUATION        = 12288..12351
    end

    # Ranges whose characters are classified as :kana by #script_type?
    KANA_RANGES = [
      CodepointRanges::HIRAGANA,
      CodepointRanges::KATAKANA,
      CodepointRanges::KATAKANA_PHONETIC,
      CodepointRanges::HALFWIDTH_KATAKANA,
      CodepointRanges::PUNCTUATION
    ].freeze

    # Ranges whose characters are classified as :kanji by #script_type?
    KANJI_RANGES = [
      CodepointRanges::UNIFIED_CJK,
      CodepointRanges::UNIFIED_CJK_EXT_A,
      CodepointRanges::UNIFIED_CJK_EXT_B
    ].freeze

    # Get the Unicode codepoint of the first character of a string.
    # Despite the name, the result is an Integer (decimal), which is what
    # the CodepointRanges are expressed in.
    # @param unicode_char [String]
    # @return [Integer, nil] nil for an empty string
    def hex_codepoint(unicode_char)
      unicode_char.unpack("U0U*")[0]
    end

    # TODO: write unit test with a variety of strings to ensure this method
    # returns the expected output
    # Determine the script of the specified string:
    # :kanji
    # :kana
    # :english
    # Scanning stops at the first kana character, which makes the result
    # :kana; otherwise the classification of the last character wins.
    # An empty string yields ''.
    def script_type?(unicode_string)
      type = ''

      unicode_string.each_char do |c|
        code = hex_codepoint(c)
        if KANA_RANGES.any? { |range| range.include?(code) }
          type = :kana
          break
        elsif KANJI_RANGES.any? { |range| range.include?(code) }
          type = :kanji
        else
          type = :english
        end
      end

      type
    end

    # @return [Boolean] true when #script_type? reports :kanji or :kana
    def japanese?(unicode_string)
      type = script_type?(unicode_string)
      type == :kanji || type == :kana
    end

    # @return [Boolean] true when #script_type? reports :english
    def english?(unicode_string)
      script_type?(unicode_string) == :english
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require 'configuration'
|
3
|
+
|
4
|
+
module JDict
  describe Configuration do
    describe "#debug" do
      it "default value is false" do
        # The original assigned `debug = false` here, which can never fail;
        # assert on the default instead.
        expect(Configuration.new.debug).to eq(false)
      end
    end

    describe "#debug=" do
      it "can set value" do
        config = Configuration.new
        config.debug = true
        expect(config.debug).to eq(true)
      end
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require BASE_PATH + '/lib/dictionary'
|
3
|
+
#require BASE_PATH + '/lib/jmdict'
|
4
|
+
|
5
|
+
# Helpers shared by the dictionary specs: fixture paths and a custom
# "increase" matcher.
module DictionarySpecHelper
  JMDICT_PATH = File.join(BASE_PATH+'/dictionaries/JMdict')
  INDEX_PATH  = File.join(BASE_PATH+'/index')

  # Placeholder; currently does nothing and returns nil.
  def mock_index
  end

  # Custom matcher that passes when running the target callable makes the
  # measured value grow.
  #
  # The measurement block is called once before and once after the target
  # is invoked; both readings are compared after +to_i+ conversion.
  class Increase
    # @param measure_proc [Proc] block returning the value to measure
    def initialize(&measure_proc) # + args
      @measure_proc = measure_proc
    end

    # @param target [#call] the code under test
    # @return [Boolean] true when the measurement is larger after the call
    def matches?(target)
      @target = target
      @original_value = @measure_proc.call
      target.call
      @new_value = @measure_proc.call
      @new_value.to_i > @original_value.to_i
    end

    def failure_message
      "expected #{@new_value} to be greater than #{@original_value}"
    end

    # RSpec 3 name for the message shown when a negated expectation fails.
    def failure_message_when_negated
      "expected #{@new_value} to not be greater than #{@original_value}"
    end

    # Keep the legacy (RSpec 1/2) name so existing callers still work.
    alias_method :negative_failure_message, :failure_message_when_negated

    # The target is a callable, so this matcher must be usable with
    # `expect { ... }.to increase { ... }`.
    def supports_block_expectations?
      true
    end

    def description
      "increase #{@original_value}"
    end
  end

  # Build an Increase matcher around the given measurement block.
  def increase(&measure_proc) # + args
    Increase.new(&measure_proc)
  end
end
|
42
|
+
|
43
|
+
# Specs for JDict::Dictionary: interface, initial state, loading, searching.
module DictionarySpec
  include DictionarySpecHelper

  describe JDict::Dictionary do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "is searchable" do
      @dictionary.should respond_to(:search)
    end

    it "can tell you whether or not it's loaded" do
      @dictionary.should respond_to(:loaded?)
    end

    it "should generate fixtures" do
      pending
      @dictionary.should respond_to(:generate_fixtures)
    end
  end

  describe JDict::Dictionary, "after initialization" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "has no entries" do
      @dictionary.size.should == 0
    end

    it "has an empty entries cache" do
      # Actually assert emptiness; the bare `empty?` call used before
      # discarded its result, so the example passed unconditionally.
      @dictionary.entries_cache.should be_empty
    end
  end

  describe JDict::Dictionary, "when loading from a dictionary file" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "has at least 1 entry" do
      pending("implement loading from index")
      @dictionary.load(JMDICT_PATH)
      @dictionary.size.should > 0
    end

    it "says it's loaded" do
      pending("implement loading from index")
      @dictionary.load(JMDICT_PATH)
      # @dictionary.loaded?.should == true
      @dictionary.loaded?.should equal(true)
    end
  end

  describe JDict::Dictionary, "when loading from a dictionary file (already loaded)" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    # No body yet: RSpec reports this example as pending.
    it "has the same size as it did before being loaded"
  end

  describe JDict::Dictionary, "when searching" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    # No body yet: RSpec reports this example as pending.
    it "should raise an error if an index isn't built yet"
    it "should give no results if the search phrase is empty" do
      @dictionary.search('').should be_empty
    end
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
<JMdict>
|
2
|
+
<entry>
|
3
|
+
<ent_seq>1171270</ent_seq>
|
4
|
+
<k_ele>
|
5
|
+
<keb>右翼</keb>
|
6
|
+
<ke_pri>ichi1</ke_pri>
|
7
|
+
<ke_pri>news1</ke_pri>
|
8
|
+
<ke_pri>nf04</ke_pri>
|
9
|
+
</k_ele>
|
10
|
+
<r_ele>
|
11
|
+
<reb>うよく</reb>
|
12
|
+
<re_pri>ichi1</re_pri>
|
13
|
+
<re_pri>news1</re_pri>
|
14
|
+
<re_pri>nf04</re_pri>
|
15
|
+
</r_ele>
|
16
|
+
<sense>
|
17
|
+
<pos>&n;</pos>
|
18
|
+
<gloss>right-wing</gloss>
|
19
|
+
<gloss g_lang="fr">aile droite (oiseau, armée, parti politique, base-ball)</gloss>
|
20
|
+
<gloss g_lang="ru">пра́вое крыло́</gloss>
|
21
|
+
<gloss g_lang="ru">пра́вый фланг</gloss>
|
22
|
+
<gloss g_lang="de">rechter Flügel</gloss>
|
23
|
+
</sense>
|
24
|
+
<sense>
|
25
|
+
<gloss g_lang="de">{Sport}</gloss>
|
26
|
+
<gloss g_lang="de">rechte Flanke</gloss>
|
27
|
+
<gloss g_lang="de">rechter Flügel</gloss>
|
28
|
+
</sense>
|
29
|
+
<sense>
|
30
|
+
<gloss g_lang="de">die Rechte</gloss>
|
31
|
+
</sense>
|
32
|
+
</entry>
|
33
|
+
</JMdict>
|
data/spec/index_spec.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
4
|
+
require BASE_PATH + '/lib/dictionary'
|
5
|
+
require BASE_PATH + '/lib/jmdict'
|
6
|
+
require BASE_PATH + '/lib/index'
|
7
|
+
|
8
|
+
require 'fileutils'
|
9
|
+
|
10
|
+
# Placeholder mixin for index spec helpers; it currently defines nothing.
module IndexSpecHelper
|
11
|
+
end
|
12
|
+
|
13
|
+
# Interface checks for JDict::DictIndex: each example only verifies that
# the index responds to the expected message, plus one constructor
# failure case.
describe JDict::DictIndex do
  include IndexSpecHelper

  before(:each) { @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH) }

  # Searching
  it("is searchable") { @index.should respond_to(:search) }

  # Building
  it("is buildable") { @index.should respond_to(:build) } # and return an index
  it("is rebuildable") { @index.should respond_to(:rebuild) }
  it("tells whether it's built or not") { @index.should respond_to(:built?) }

  # Destroying
  it("is destroyable") { @index.should respond_to(:destroy) }

  it "raises an error if an invalid dictionary path is specified" do
    build_with_bad_path = lambda { JDict::DictIndex.new(INDEX_PATH, 'bad_dictionary_path') }
    build_with_bad_path.should raise_error
  end
end
|
45
|
+
|
46
|
+
# A freshly constructed index must expose a non-blank path.
describe JDict::DictIndex, "after initialization" do
  it "the path should be set" do
    @index = JDict::DictIndex.new(INDEX_PATH)
    @index.path.should_not be_nil
    # `be('')` compares object identity, which a fresh '' literal never
    # satisfies, so the old check passed even for an empty path. `eq`
    # compares by value.
    @index.path.should_not eq('')
  end
end
|
53
|
+
|
54
|
+
# Building the index must leave a non-empty directory at INDEX_PATH.
describe JDict::DictIndex, "when building," do
  it "it is created on the file system" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
    @index.build
    # File.exists? is deprecated and removed in Ruby 3.2; File.exist? is
    # the supported spelling.
    File.exist?(INDEX_PATH).should == true
  end

  it "its directory on the file system shouldn't be empty" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH,
                                  false) #no lazy loading
    @index.build
    # .
    # ..
    # ^-------- an empty directory has only these 2 entries
    expect(Dir.entries(INDEX_PATH).size).to be >= 3
  end

  # No body yet: RSpec reports this example as pending.
  it "loads from a dictionary file"
end
|
73
|
+
|
74
|
+
# Rebuilding requires an index that already exists on disk.
describe JDict::DictIndex, "when rebuilding" do
  include FileUtils

  it "raises an error if it doesn't already exist" do
    # Start from a clean slate so there is nothing to rebuild.
    rm_rf(INDEX_PATH)
    # File.exists? is deprecated and removed in Ruby 3.2; File.exist? is
    # the supported spelling.
    File.exist?(INDEX_PATH).should == false
    lambda {
      JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH).rebuild
    }.should raise_error
  end
end
|
data/spec/jdict_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Spec for the top-level JDict.configure block: set a dictionary path and
# the debug flag, then read the path back.
describe JDict do
|
4
|
+
describe "#configure" do
|
5
|
+
before do
|
6
|
+
JDict.configure do |config|
|
7
|
+
# NOTE(review): DICT_PATH is not among the constants set up in
# spec_helper.rb (BASE_PATH, INDEX_PATH, JMDICT_PATH) - confirm where
# it is defined.
config.dictionary_path = DICT_PATH
|
8
|
+
config.debug = true
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
it "uses the configured path" do
|
13
|
+
# NOTE(review): `JDICT` (all caps) differs from the `JDict` module
# configured above - looks like a typo; confirm the intended accessor
# for the configured dictionary path.
expect(JDICT.dictionary_path).to eq(DICT_PATH)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
data/spec/jmdict_spec.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require BASE_PATH + '/lib/dictionary'
|
3
|
+
require BASE_PATH + '/lib/jmdict'
|
4
|
+
|
5
|
+
# Constants shared by the JMDict specs.
module JMDictSpecHelper
  # Location of the index directory, directly under the project root.
  INDEX_PATH = File.join("#{BASE_PATH}/index")
end
|
8
|
+
|
9
|
+
# JMDict must be usable wherever a generic Dictionary is expected.
describe JDict::JMDict do
  include JMDictSpecHelper

  before(:each) { @jmdict = JDict::JMDict.new(JMDictSpecHelper::INDEX_PATH) }

  it { @jmdict.should be_a_kind_of(JDict::Dictionary) }
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'rubygems' #use gems
|
2
|
+
require 'bundler/setup' #load up the bundled environment
|
3
|
+
# require 'spec' #test framework
|
4
|
+
|
5
|
+
# Turn on Ruby's global debug flag for the whole spec run.
$DEBUG = true
|
6
|
+
|
7
|
+
# Project root: the parent of the directory containing this helper.
BASE_PATH = File.dirname(__FILE__) + '/..'
|
8
|
+
# Index directory path used by the specs.
INDEX_PATH = BASE_PATH + '/test_index'
|
9
|
+
# Path where the specs look for the JMdict dictionary file.
JMDICT_PATH = BASE_PATH + '/dictionaries/JMdict'
|
10
|
+
|
11
|
+
##
|
12
|
+
# rSpec Hash additions.
|
13
|
+
#
|
14
|
+
# From
|
15
|
+
# * http://wincent.com/knowledge-base/Fixtures_considered_harmful%3F
|
16
|
+
# * Neil Rahilly
|
17
|
+
|
18
|
+
class Hash

  ##
  # Filter keys out of a Hash.
  #
  #   { :a => 1, :b => 2, :c => 3 }.except(:a)
  #   => { :b => 2, :c => 3 }
  #
  # A pair is dropped when its key, or the key's Symbol form (for keys
  # that respond to +to_sym+, e.g. Strings), is listed in +keys+. Keys
  # such as nil that cannot be symbolized are compared as-is instead of
  # raising NoMethodError.

  def except(*keys)
    reject do |key, _value|
      keys.include?(key) || (key.respond_to?(:to_sym) && keys.include?(key.to_sym))
    end
  end

  ##
  # Override some keys.
  #
  #   { :a => 1, :b => 2, :c => 3 }.with(:a => 4)
  #   => { :a => 4, :b => 2, :c => 3 }
  #
  # Returns a new Hash; the receiver is not modified.

  def with(overrides = {})
    merge(overrides)
  end

  ##
  # Returns a Hash with only the pairs identified by +keys+.
  #
  #   { :a => 1, :b => 2, :c => 3 }.only(:a)
  #   => { :a => 1 }
  #
  # Uses the same key matching as #except: the key itself, then its
  # Symbol form when it has one.

  def only(*keys)
    reject do |key, _value|
      !(keys.include?(key) || (key.respond_to?(:to_sym) && keys.include?(key.to_sym)))
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby-jdict
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ian Pickering
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: libxml-ruby
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.8.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.8.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: amalgalite
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.5.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.5.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: autotest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 3.4.0
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 3.4.0
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- ipickering2@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- LICENSING
|
77
|
+
- README.md
|
78
|
+
- Rakefile
|
79
|
+
- examples/query.rb
|
80
|
+
- lib/#sense.rb#
|
81
|
+
- lib/configuration.rb
|
82
|
+
- lib/constants.rb
|
83
|
+
- lib/dictionaries/jmdict.rb
|
84
|
+
- lib/dictionary.rb
|
85
|
+
- lib/entry.rb
|
86
|
+
- lib/index.rb
|
87
|
+
- lib/jdict.rb
|
88
|
+
- lib/kana.rb
|
89
|
+
- lib/kanji.rb
|
90
|
+
- lib/ruby-jdict/version.rb
|
91
|
+
- lib/sense.rb
|
92
|
+
- lib/unicode.rb
|
93
|
+
- spec/configuration_spec.rb
|
94
|
+
- spec/dictionary_spec.rb
|
95
|
+
- spec/fixtures/feeds/sample_entry.xml
|
96
|
+
- spec/index_spec.rb
|
97
|
+
- spec/jdict_spec.rb
|
98
|
+
- spec/jmdict_spec.rb
|
99
|
+
- spec/spec_helper.rb
|
100
|
+
homepage: https://github.com/Ruin0x11/ruby-jdict
|
101
|
+
licenses: []
|
102
|
+
metadata: {}
|
103
|
+
post_install_message:
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
requirements: []
|
118
|
+
rubyforge_project:
|
119
|
+
rubygems_version: 2.4.5.1
|
120
|
+
signing_key:
|
121
|
+
specification_version: 4
|
122
|
+
summary: Ruby gem for accessing Jim Breen's Japanese dictionaries
|
123
|
+
test_files: []
|
124
|
+
has_rdoc:
|