ruby-jdict 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSING +28 -28
- data/README.md +18 -20
- data/Rakefile +41 -30
- data/examples/query.rb +19 -22
- data/lib/ruby-jdict.rb +14 -0
- data/lib/{constants.rb → ruby-jdict/constants.rb} +73 -64
- data/lib/ruby-jdict/convert.rb +33 -0
- data/lib/ruby-jdict/dictionary.rb +59 -0
- data/lib/ruby-jdict/index.rb +151 -0
- data/lib/ruby-jdict/indexer/dictionary_indexer.rb +28 -0
- data/lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb +164 -0
- data/lib/ruby-jdict/indexer/nokogiri_dictionary_indexer.rb +60 -0
- data/lib/ruby-jdict/jdict.rb +2 -0
- data/lib/ruby-jdict/models/entry.rb +64 -0
- data/lib/ruby-jdict/models/sense.rb +81 -0
- data/lib/ruby-jdict/version.rb +3 -3
- data/spec/convert_spec.rb +27 -0
- data/spec/dictionary_spec.rb +113 -113
- data/spec/entry_spec.rb +25 -0
- data/spec/fixtures/feeds/sample_entry.xml +32 -32
- data/spec/index_spec.rb +82 -84
- data/spec/spec_helper.rb +49 -49
- metadata +35 -36
- data/examples/lst.txt +0 -4
- data/lib/configuration.rb +0 -34
- data/lib/dictionaries/jmdict.rb +0 -38
- data/lib/dictionary.rb +0 -90
- data/lib/downloader.rb +0 -42
- data/lib/entry.rb +0 -101
- data/lib/index.rb +0 -305
- data/lib/jdict.rb +0 -20
- data/lib/kana.rb +0 -4
- data/lib/kanji.rb +0 -4
- data/lib/sense.rb +0 -28
- data/lib/unicode.rb +0 -63
- data/spec/configuration_spec.rb +0 -20
- data/spec/jmdict_spec.rb +0 -19
@@ -0,0 +1,151 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'amalgalite'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'io/console'
|
5
|
+
|
6
|
+
module JDict
  # Full-text search index backend for a JMdict dictionary file.
  #
  # The index lives in an SQLite FTS5 table named +search+ (accessed through
  # Amalgalite) stored in a "jdict.db" file next to the dictionary file.
  class DictIndex
    # Matches <!ENTITY name "definition"> declarations
    ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

    # @return [String] path to the dictionary file given to the constructor
    attr_reader :path

    # Initialize a full-text search index backend for JMdict.
    # Opens (or creates) the database next to the dictionary and builds the
    # index if it has not been built yet.
    # @param path [String] path to the dictionary
    # @raise [RuntimeError] if no file exists at +path+
    def initialize(path)
      @path = path
      @dictionary_path = path
      @index_path = File.dirname(@dictionary_path)

      # File.exist? is used because File.exists? no longer exists on Ruby 3.2+
      raise "No dictionary found at path #{@dictionary_path}" unless File.exist? @dictionary_path

      @db_file = File.join(@index_path, "jdict.db")
      initialize_db(@db_file)

      build_index!
    end

    # @return [Boolean] whether the +search+ table contains at least one row
    def built?
      @index.first_value_from( "SELECT count(*) from search" ) != 0
    end

    # Closes the database, removes the database file and re-creates an
    # empty database with a fresh schema.
    def delete!
      @index.close
      @index = nil

      File.unlink(@db_file) if File.exist?(@db_file)

      initialize_db(@db_file)
    end

    # Builds the full-text search index
    # @return [Integer, nil] the number of indexed entries, or nil when the
    #   index was already built and nothing was done
    def build_index!(&block)
      entries_added = do_build_index(&block) unless built?

      # make the hash from abbreviated parts of speech to full definitions
      @pos_hash ||= build_pos_hash

      entries_added
    end

    # Returns the search results as an array of +Entry+
    # @param term [String] the search string; a "seq:" prefix searches by
    #   sequence number instead of kanji/kana/senses
    # @param opts [Hash] search options
    # @option opts [Boolean] :exact only return entries whose kanji or kana
    #   list contains +term+ verbatim
    # @option opts [Integer] :max_results maximum number of rows to fetch
    #   (unlimited when omitted)
    # @return [Array(Entry)] the results of the search, exact matches first
    # @raise [RuntimeError] if the index directory does not exist
    def search(term, opts = {})
      raise "Index not found at path #{@index_path}" unless File.exist? @index_path

      exact = opts[:exact]
      # SQLite returns an error for a NULL LIMIT; a negative LIMIT means
      # "no upper bound", so use that when the caller gave no maximum.
      limit = opts[:max_results] || -1
      query = make_query(term, exact)
      results = []

      @index.execute("SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?", query, limit) do |row|
        entry = Entry.from_sql(row)

        is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)
        next if exact && !is_exact_match

        # exact matches score 1.0, everything else 0.0
        results << [is_exact_match ? 1.0 : 0.0, entry]
      end

      # Sort the results by first column (score) and return only the second column (entry)
      results.sort_by { |score, _entry| -score }.map { |_score, entry| entry }
    end

    # Retrieves the definition of a part-of-speech from its abbreviation
    # @param pos [String] the abbreviation for the part-of-speech
    # @return [String, nil] the full description of the part-of-speech, or
    #   nil when the abbreviation is unknown
    def get_pos(pos)
      # @pos_hash is reset to nil whenever the database is re-initialized
      # (e.g. by delete!), so build it lazily and keep the result.
      @pos_hash ||= build_pos_hash
      @pos_hash[pos_to_sym(pos)]
    end

    private

    # Opens the database file, resets the part-of-speech cache and makes
    # sure the schema exists.
    def initialize_db(db_file)
      @index = Amalgalite::Database.new(db_file)
      @pos_hash = nil

      create_schema
    end

    # Creates the SQL schema for the Amalgalite database
    def create_schema
      schema = @index.schema
      unless schema.tables['search']
        @index.execute_batch <<-SQL
        CREATE VIRTUAL TABLE search USING fts5(
            sequence_number,
            kanji,
            kana,
            senses
        );
        SQL
        @index.reload_schema!
      end
    end

    # Builds the FTS5 MATCH expression for a search term.
    # @param term [String] the search string
    # @param exact [Boolean] when falsy, a prefix wildcard is appended
    # @return [String] the FTS5 query
    def make_query(term, exact)
      # convert full-width katakana to hiragana
      # TODO: move to user code
      # term = Convert.kata_to_hira(term)

      if term.start_with?('seq:')
        "sequence_number : \"#{escape_fts(term[4..-1])}\""
      else
        query = "{kanji kana senses} : \"#{escape_fts(term)}\""
        query += "*" unless exact
        query
      end
    end

    # Escapes a value for use inside a double-quoted FTS5 string by
    # doubling embedded double quotes.
    def escape_fts(text)
      text.gsub('"', '""')
    end

    # Indexes the dictionary inside a single database transaction.
    # @return [Integer] the number of entries added
    def do_build_index(&block)
      indexer = NokogiriDictionaryIndexer.new @dictionary_path
      entries_added = 0

      @index.transaction do |db_transaction|
        entries_added = indexer.index(db_transaction, &block)
      end

      entries_added
    end

    # Creates the hash of part-of-speech symbols to full definitions from the dictionary
    def build_pos_hash
      indexer = NokogiriDictionaryIndexer.new @dictionary_path
      indexer.parse_parts_of_speech
    end

    # Converts a part-of-speech entity reference string into a symbol
    # @param entity [String] the entity reference string
    # @return [Symbol] the part-of-speech symbol
    def pos_to_sym(entity)
      entity.tr('-', '_').to_sym
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'amalgalite'
|
2
|
+
|
3
|
+
module JDict
  # Base class for dictionary indexers. It validates the dictionary path
  # and provides +add_entry+; +index+ and +parse_parts_of_speech+ are empty
  # here and are meant to be overridden by subclasses.
  class DictionaryIndexer
    attr_reader :parts_of_speech

    # @param path [String] path to the dictionary file
    # @raise [RuntimeError] if +path+ is nil or no file exists there
    def initialize(path)
      raise "No dictionary path was provided" if path.nil?
      # Interpolate the argument: @path has not been assigned at this point.
      # File.exist? is used because File.exists? no longer exists on Ruby 3.2+
      raise "Dictionary not found at path #{path}" unless File.exist?(path)

      @path = path
    end

    # Indexes the dictionary; no-op in the base class.
    # @param db_transaction the open database transaction to insert into
    def index(db_transaction, &block)
    end

    # Parses the part-of-speech definitions; no-op in the base class.
    def parse_parts_of_speech
    end

    protected

    # Inserts one entry into the +search+ table.
    # @param db_transaction the open database transaction to insert into
    # @param entry an object whose +to_sql+ returns the bind parameters
    #   (:sequence_number, :kanji, :kana, :senses)
    def add_entry(db_transaction, entry)
      db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
        stmt.execute(entry.to_sql)
      end
    end
  end
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
include LibXML
|
3
|
+
|
4
|
+
module JDict

  # Dictionary indexer that streams the dictionary file with
  # LibXML's XML::Reader instead of loading the whole document.
  class LibXMLDictionaryIndexer < DictionaryIndexer
    def initialize(path)
      super
    end

    # Reads the dictionary node by node and adds every entry to the index.
    # Yields (entries_added, 0) after every node read when a block is given.
    # @param db_transaction the open database transaction to insert into
    # @return [Integer] the number of entries added
    # @raise [RuntimeError] if an entry has no kana element
    def index(db_transaction, &block)
      reader = open_reader(@path)

      # whenever there is a reader error, print its block parameters
      LibXML::XML::Error.set_handler { |*args| p args }

      entry_sequence_num = 0
      kanji = []
      kana = []
      senses = []
      language = nil
      # glosses are appended with <<, so this must be an Array
      glosses = []
      parts_of_speech = []

      entries_added = 0

      begin
        while reader.read
          yield entries_added, 0 if block_given?

          case reader.node_type

          # start-of-element node
          when LibXML::XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              # Assume the language of the whole sense is the language
              # of the first gloss (in practice, there is never a gloss
              # with more than one language)
              unless language
                language = (reader.node.lang || JMDictConstants::LANGUAGE_DEFAULT).intern
              end
              text = reader.next_text
              glosses << text unless text.empty?

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              # the text is not stored; the call only advances the reader
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when LibXML::XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          when LibXML::XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              senses << Sense.new(parts_of_speech, glosses, language)

              # clear data for the next sense
              glosses = []
              parts_of_speech = []
              language = nil

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              entry = Entry.new(entry_sequence_num, kanji, kana, senses)
              add_entry(db_transaction, entry)

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1
            end
          end
        end
      ensure
        # release the reader even when indexing fails part-way
        reader.close
      end

      entries_added
    end

    # Builds the hash of part-of-speech symbols to full definitions from
    # the <!ENTITY ...> declarations in the document type node.
    # Stops at the document type node or at the first element, whichever
    # comes first, and also stops when the reader runs out of nodes.
    # @return [Hash{Symbol => String}] abbreviation symbol => definition
    def parse_parts_of_speech
      pos_hash = {}
      reader = open_reader(@path)

      begin
        while reader.read
          case reader.node_type
          when LibXML::XML::Reader::TYPE_DOCUMENT_TYPE
            # iterating the node's children segfaults, so the serialized
            # doctype is scanned with a regex instead
            doctype_string = reader.node.to_s
            doctype_string.scan(DictIndex::ENTITY_REGEX) do |abbrev, full|
              # same conversion as DictIndex#pos_to_sym, which is private there
              pos_hash[abbrev.tr('-', '_').to_sym] = full
            end
            break
          when LibXML::XML::Reader::TYPE_ELEMENT
            break
          end
        end
      ensure
        reader.close
      end

      pos_hash
    end

    private

    # Creates an XML::Reader object for the given path
    # @param dictionary_path [String] path to the dictionary file
    # @return [XML::Reader] the reader for the given dictionary
    # @raise [RuntimeError] if no reader could be created
    def open_reader(dictionary_path)
      # create a reader for JMdict
      reader = LibXML::XML::Reader.file(dictionary_path, :encoding => LibXML::XML::Encoding::UTF_8)
      raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
      reader
    end
  end

  # Add custom parsing methods to XML::Reader
  class LibXML::XML::Reader
    public

    # Advances the reader until it sits on a text node (or runs out of
    # nodes) and returns the value of the node it stopped on.
    def next_text
      until node_type == LibXML::XML::Reader::TYPE_TEXT
        break unless read
      end
      value
    end

    # Advances the reader until it sits on an entity or entity reference
    # node (or runs out of nodes) and returns the value of the node it
    # stopped on.
    def next_entity
      until node_type == LibXML::XML::Reader::TYPE_ENTITY ||
            node_type == LibXML::XML::Reader::TYPE_ENTITY_REFERENCE
        break unless read
      end
      value
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module JDict
  # Dictionary indexer that parses the whole dictionary file into a
  # Nokogiri document up front and walks its entry nodes.
  class NokogiriDictionaryIndexer < JDict::DictionaryIndexer
    # @param path [String] path to the dictionary file
    def initialize(path)
      super

      # strict parsing: malformed XML raises instead of being recovered
      @doc = File.open(path) { |file| Nokogiri::XML(file, &:strict) }
    end

    # Adds every entry node of the document to the index.
    # Yields (entries_added, total) before each entry when a block is given.
    # @param db_transaction the open database transaction to insert into
    # @return [Integer] the number of entries added
    def index(db_transaction, &block)
      entry_nodes = @doc/"./JMdict/entry"
      total = entry_nodes.count
      entries_added = 0

      entry_nodes.each do |node|
        yield entries_added, total if block_given?

        add_entry(db_transaction, build_entry(node))
        entries_added += 1
      end

      printf "\n"

      entries_added
    end

    # @return [Hash] always an empty hash in this indexer
    def parse_parts_of_speech
      {}
    end

    private

    # Converts one entry node into an Entry
    def build_entry(node)
      elements = JDict::JMDictConstants::Elements

      sequence_number = node.at(elements::SEQUENCE).content.to_i
      kanji = (node/elements::KANJI).map(&:content)
      kana = (node/elements::KANA).map(&:content)
      senses = (node/elements::SENSE).map { |sense_node| extract_sense(sense_node) }

      Entry.new(sequence_number, kanji, kana, senses)
    end

    # Converts one sense node into a Sense
    def extract_sense(e)
      elements = JDict::JMDictConstants::Elements

      parts_of_speech = (e/elements::PART_OF_SPEECH).map(&:inner_html)
      glosses = (e/elements::GLOSS).map(&:content)

      # The language of the first gloss stands for the whole sense
      # (the official JMDict never mixes languages within one sense);
      # fall back to "en" when there is no gloss or no xml:lang attribute.
      first_gloss = e.at(elements::GLOSS)
      language = (first_gloss && first_gloss.attr("xml:lang")) || "en"

      Sense.new(parts_of_speech, glosses, language)
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#include Constants #XML constants from the dictionary file
|
2
|
+
|
3
|
+
# Entries consist of kanji elements, kana elements,
|
4
|
+
# general information and sense elements. Each entry must have at
|
5
|
+
# least one kana element and one sense element. Others are optional.
|
6
|
+
module JDict
  # One dictionary entry: a sequence number plus lists of kanji, kana and
  # senses.
  class Entry
    attr_accessor :sequence_number, :kanji, :kana, :senses

    # Create a new Entry
    #   entry = Entry.new(sequence_number, kanji, kana, senses)
    def initialize(sequence_number, kanji, kana, senses)
      @sequence_number = sequence_number
      @kanji = kanji
      @kana = kana
      @senses = senses
    end

    # Converts an SQLite row from the index to the Entry format
    # @param row a row responding to [] with the column names
    #   "sequence_number", "kanji", "kana" and "senses"
    # @return [Entry] the deserialized entry
    def self.from_sql(row)
      # kanji/kana columns hold ", "-joined lists; mark each item as UTF-8
      read_list = lambda do |column|
        row[column].split(", ").map { |item| item.force_encoding("UTF-8") }
      end

      number = row["sequence_number"].to_i
      kanji_list = read_list.call("kanji")
      kana_list = read_list.call("kana")
      sense_texts = row["senses"].split(SerialConstants::SENSE_SENTINEL).sort
      sense_list = sense_texts.map { |text| Sense.from_sql(text) }

      new(number, kanji_list, kana_list, sense_list)
    end

    # Converts an Entry to the bind parameters used to index it into the
    # SQLite database
    # @return [Hash{String => String}] the serialized columns of this Entry
    def to_sql
      {
        ':sequence_number' => sequence_number.to_s,
        ':kanji' => kanji.join(", "),
        ':kana' => kana.join(", "),
        ':senses' => senses.map(&:to_sql).join(SerialConstants::SENSE_SENTINEL)
      }
    end

    # Get an array of +Senses+ for the specified language
    def senses_by_language(l)
      senses.select { |sense| sense.language == l }
    end

    # @return [String] the kanji/kana line followed by the numbered senses
    def to_s
      "#{kanji_to_s}#{kana_to_s}\n#{senses_to_s}\n"
    end

    # @return [String] the kanji joined by ", "
    def kanji_to_s
      @kanji.join(', ')
    end

    # @return [String, nil] the kana in parentheses, or nil without kana
    def kana_to_s
      return if @kana.nil?

      " (#{@kana.join(', ')})"
    end

    # @param delimiter [String] separator placed between the senses
    # @return [String] the senses, numbered from 1
    def senses_to_s(delimiter = "\n")
      numbered = @senses.each_with_index.map do |sense, index|
        "#{index + 1}. #{sense}"
      end
      numbered.join(delimiter)
    end
  end
end
|