ruby-jdict 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSING +28 -28
- data/README.md +18 -20
- data/Rakefile +41 -30
- data/examples/query.rb +19 -22
- data/lib/ruby-jdict.rb +14 -0
- data/lib/{constants.rb → ruby-jdict/constants.rb} +73 -64
- data/lib/ruby-jdict/convert.rb +33 -0
- data/lib/ruby-jdict/dictionary.rb +59 -0
- data/lib/ruby-jdict/index.rb +151 -0
- data/lib/ruby-jdict/indexer/dictionary_indexer.rb +28 -0
- data/lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb +164 -0
- data/lib/ruby-jdict/indexer/nokogiri_dictionary_indexer.rb +60 -0
- data/lib/ruby-jdict/jdict.rb +2 -0
- data/lib/ruby-jdict/models/entry.rb +64 -0
- data/lib/ruby-jdict/models/sense.rb +81 -0
- data/lib/ruby-jdict/version.rb +3 -3
- data/spec/convert_spec.rb +27 -0
- data/spec/dictionary_spec.rb +113 -113
- data/spec/entry_spec.rb +25 -0
- data/spec/fixtures/feeds/sample_entry.xml +32 -32
- data/spec/index_spec.rb +82 -84
- data/spec/spec_helper.rb +49 -49
- metadata +35 -36
- data/examples/lst.txt +0 -4
- data/lib/configuration.rb +0 -34
- data/lib/dictionaries/jmdict.rb +0 -38
- data/lib/dictionary.rb +0 -90
- data/lib/downloader.rb +0 -42
- data/lib/entry.rb +0 -101
- data/lib/index.rb +0 -305
- data/lib/jdict.rb +0 -20
- data/lib/kana.rb +0 -4
- data/lib/kanji.rb +0 -4
- data/lib/sense.rb +0 -28
- data/lib/unicode.rb +0 -63
- data/spec/configuration_spec.rb +0 -20
- data/spec/jmdict_spec.rb +0 -19
@@ -0,0 +1,151 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'amalgalite'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'io/console'
|
5
|
+
|
6
|
+
module JDict
|
7
|
+
# Full-text search index over a JMdict dictionary file. Entries are stored in
# an SQLite FTS5 virtual table named +search+, accessed through Amalgalite. The
# database file (jdict.db) lives in the same directory as the dictionary.
class DictIndex
  # Matches one <!ENTITY name "expansion"> declaration, capturing the entity
  # name and its expansion text.
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  # Query used by #search. bm25(search) is selected but #search does its own
  # exact-match scoring and does not read that column.
  SEARCH_SQL = "SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?".freeze

  # @return [String] path to the dictionary file this index was created for
  attr_reader :path

  # Initialize a full-text search index backend for JMdict. Opens (or creates)
  # the database next to the dictionary and builds the index if it is empty.
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if no file exists at +path+
  def initialize(path)
    @path = path
    @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)

    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    @db_file = File.join(@index_path, "jdict.db")
    initialize_db(@db_file)

    build_index!
  end

  # @return [Boolean] true when the +search+ table contains at least one row
  def built?
    @index.first_value_from("SELECT count(*) from search") != 0
  end

  # Closes the database, removes the database file and re-creates an empty
  # database with a fresh schema.
  def delete!
    @index.close
    @index = nil

    File.unlink(@db_file) if File.exist?(@db_file)

    initialize_db(@db_file)
  end

  # Builds the full-text search index
  # @yield forwarded to the indexer's +index+ method
  # @return [Integer, nil] the number of indexed entries, or nil when the
  #   index was already built and nothing was indexed
  def build_index!(&block)
    entries_added = do_build_index(&block) unless built?

    # make the hash from abbreviated parts of speech to full definitions
    @pos_hash ||= build_pos_hash

    entries_added
  end

  # Returns the search results as an array of +Entry+
  # @param term [String] the search string; a term starting with "seq:" is
  #   matched against the sequence_number column instead of kanji/kana/senses
  # @param opts [Hash] search options
  # @option opts [Boolean] :exact only return entries whose kanji or kana
  #   list contains +term+ verbatim
  # @option opts [Integer] :max_results maximum number of rows fetched from
  #   the index; when omitted no limit is applied
  # @return [Array(Entry)] the results of the search, exact matches first
  # @raise [RuntimeError] if the index directory does not exist
  def search(term, opts = {})
    raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

    exact = opts[:exact]
    # SQLite rejects a NULL LIMIT expression; a negative value means "no limit".
    limit = opts[:max_results] || -1
    query = make_query(term, exact)
    results = []

    @index.execute(SEARCH_SQL, query, limit) do |row|
      entry = Entry.from_sql(row)
      is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)

      next if exact && !is_exact_match

      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # Sort by score (descending) and return only the entries
    results.sort_by { |score, _entry| -score }.map { |_score, entry| entry }
  end

  # Retrieves the definition of a part-of-speech from its abbreviation
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech, or nil
  #   when the abbreviation is unknown
  def get_pos(pos)
    # Memoise: the hash is reset to nil by initialize_db (e.g. after delete!),
    # and rebuilding it requires constructing a new indexer.
    @pos_hash ||= build_pos_hash
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # Opens the Amalgalite database at +db_file+, resets the cached
  # part-of-speech hash and makes sure the schema exists.
  def initialize_db(db_file)
    @index = Amalgalite::Database.new(db_file)
    @pos_hash = nil

    create_schema
  end

  # Creates the SQL schema for the Amalgalite database
  def create_schema
    return if @index.schema.tables['search']

    @index.execute_batch <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
        sequence_number,
        kanji,
        kana,
        senses
      );
    SQL
    @index.reload_schema!
  end

  # Builds the FTS5 MATCH expression for +term+.
  # @param term [String] the search string (optionally prefixed with "seq:")
  # @param exact [Boolean] when falsy, a trailing "*" turns the phrase into a
  #   prefix query
  # @return [String] the FTS5 query string
  def make_query(term, exact)
    # convert full-width katakana to hiragana
    # TODO: move to user code
    # term = Convert.kata_to_hira(term)

    if term.start_with?('seq:')
      "sequence_number : \"#{escape_fts(term[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts(term)}\""
      query += "*" unless exact
      query
    end
  end

  # Escapes +text+ for use inside a double-quoted FTS5 string by doubling any
  # embedded double-quote characters.
  def escape_fts(text)
    text.gsub('"', '""')
  end

  # Indexes the dictionary inside a single database transaction.
  # @return the value returned by the indexer's +index+ method
  def do_build_index(&block)
    indexer = NokogiriDictionaryIndexer.new @dictionary_path
    entries_added = 0

    @index.transaction do |db_transaction|
      entries_added = indexer.index(db_transaction, &block)
    end

    entries_added
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary
  def build_pos_hash
    NokogiriDictionaryIndexer.new(@dictionary_path).parse_parts_of_speech
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol (dashes become underscores)
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end
end
|
151
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'amalgalite'
|
2
|
+
|
3
|
+
module JDict
|
4
|
+
# Base class for dictionary indexers. It validates the dictionary path and
# provides #add_entry for inserting a single entry into the +search+ table.
# #index and #parse_parts_of_speech are empty here; the subclasses in this
# library override them.
class DictionaryIndexer
  attr_reader :parts_of_speech

  # @param path [String] path to the dictionary file
  # @raise [RuntimeError] if +path+ is nil or no file exists there
  def initialize(path)
    raise "No dictionary path was provided" if path.nil?
    # Interpolate the argument: @path is not assigned yet at this point.
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    raise "Dictionary not found at path #{path}" unless File.exist?(path)

    @path = path
  end

  # No-op in the base class (returns nil).
  # @param db_transaction the transaction entries should be written to
  def index(db_transaction, &block)
  end

  # No-op in the base class (returns nil).
  def parse_parts_of_speech
  end

  protected

  # Inserts one entry into the +search+ table.
  # @param db_transaction object responding to +prepare+ with a block
  # @param entry object whose +to_sql+ returns the named bind parameters
  #   (:sequence_number, :kanji, :kana, :senses)
  def add_entry(db_transaction, entry)
    db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
      stmt.execute(entry.to_sql)
    end
  end
end
|
28
|
+
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
include LibXML
|
3
|
+
|
4
|
+
module JDict
|
5
|
+
|
6
|
+
# Dictionary indexer that streams the dictionary through LibXML's XML::Reader
# instead of loading the whole document.
class LibXMLDictionaryIndexer < DictionaryIndexer
  def initialize(path)
    super
  end

  # Reads the dictionary node by node and adds one row per entry element.
  # @param db_transaction the transaction passed on to #add_entry
  # @yield [entries_added, total] called before every node is processed;
  #   +total+ is always 0 here
  # @return [Integer] the number of entries added
  # @raise [RuntimeError] if an entry ends without any kana reading
  def index(db_transaction, &block)
    reader = open_reader(@path)

    begin
      # whenever there is a reader error, print its block parameters
      XML::Error.set_handler { |*args| p args }

      entry_sequence_num = 0
      kanji, kana, senses = [], [], []
      language = nil
      # glosses are collected with <<, so this must be an Array
      glosses = []
      parts_of_speech = []

      entries_added = 0

      while reader.read
        yield entries_added, 0 if block_given?

        case reader.node_type

        # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SEQUENCE
            entry_sequence_num = reader.next_text.to_i

          # TODO: Raise an exception if reader.next_text.empty? inside the when's
          #       JMdict shouldn't have any empty elements, I believe.
          when JDict::JMDictConstants::Elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when JDict::JMDictConstants::Elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when JDict::JMDictConstants::Elements::GLOSS
            # Assume the language of the whole sense is the language
            # of the first gloss (in practice, there is never a gloss
            # with more than one language)
            unless language
              language = reader.node.lang || JMDictConstants::LANGUAGE_DEFAULT
              language = language.intern
            end
            text = reader.next_text
            glosses << text unless text.empty?

          when JDict::JMDictConstants::Elements::CROSSREFERENCE
            # advance past the cross-reference text; its value is not indexed
            reader.next_text
          end

        # XML entity references are treated as a different node type
        # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

        when XML::Reader::TYPE_END_ELEMENT
          case reader.name

          when JDict::JMDictConstants::Elements::SENSE
            senses << Sense.new(parts_of_speech, glosses, language)

            # clear data for the next sense
            glosses = []
            parts_of_speech = []
            language = nil

          # we're at the end of the entry element, so index it
          when JDict::JMDictConstants::Elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            entry = Entry.new(entry_sequence_num, kanji, kana, senses)
            # add_entry takes the transaction as its first argument
            add_entry(db_transaction, entry)

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1
          end
        end
      end

      entries_added
    ensure
      # release the reader even when parsing raised
      reader.close
    end
  end

  # Builds the hash of part-of-speech symbols to full definitions from the
  # <!ENTITY ...> declarations found in the document type node.
  # @return [Hash{Symbol => String}] empty when the first element is reached
  #   (or the input ends) before a document type node is seen
  def parse_parts_of_speech
    pos_hash = {}
    reader = open_reader(@path)

    begin
      # stop at end of input instead of looping forever
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # iterating the doctype node's children segfaults, so the
          # declarations are scanned out of the serialized doctype instead
          doctype_string = reader.node.to_s
          doctype_string.scan(JDict::DictIndex::ENTITY_REGEX) do |abbrev, full|
            # same conversion as DictIndex#pos_to_sym, which is private there
            pos_hash[abbrev.tr('-', '_').to_sym] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          # the entity declarations precede the first element; nothing left to find
          break
        end
      end
    ensure
      reader.close
    end

    pos_hash
  end

  private

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    # create a reader for JMdict
    reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8)
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

    reader
  end
end
|
147
|
+
|
148
|
+
# Convenience scanning helpers added to XML::Reader
class XML::Reader
  # Advances the reader until the current node is a text node or no more
  # nodes can be read.
  # @return the +value+ of the node the reader stopped on
  def next_text
    until node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Advances the reader until the current node is an entity or entity
  # reference node, or no more nodes can be read.
  # @return the +value+ of the node the reader stopped on
  def next_entity
    loop do
      kind = node_type
      break if kind == XML::Reader::TYPE_ENTITY
      break if kind == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
|
164
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module JDict
|
4
|
+
# Dictionary indexer that parses the whole dictionary into a Nokogiri
# document when constructed and indexes entries from that document.
class NokogiriDictionaryIndexer < JDict::DictionaryIndexer
  # @param path [String] path to the dictionary file (validated by the superclass)
  def initialize(path)
    super

    @doc = File.open(path) do |file|
      Nokogiri::XML(file) { |config| config.strict }
    end
  end

  # Adds one row per entry node found under ./JMdict/entry.
  # @param db_transaction the transaction passed on to add_entry
  # @yield [entries_added, total] called before each entry is processed
  # @return [Integer] the number of entries added
  def index(db_transaction, &block)
    nodes = @doc.search("./JMdict/entry")
    total = nodes.count
    entries_added = 0

    nodes.each do |node|
      yield entries_added, total if block_given?

      seq_node = node.at(JDict::JMDictConstants::Elements::SEQUENCE)
      sequence_number = seq_node.content.to_i
      kanji = node.search(JDict::JMDictConstants::Elements::KANJI).map { |k| k.content }
      kana = node.search(JDict::JMDictConstants::Elements::KANA).map { |k| k.content }
      senses = node.search(JDict::JMDictConstants::Elements::SENSE).map { |s| extract_sense(s) }

      add_entry(db_transaction, Entry.new(sequence_number, kanji, kana, senses))
      entries_added += 1
    end

    printf "\n"

    entries_added
  end

  # @return [Hash] always an empty hash in this indexer
  def parse_parts_of_speech
    {}
  end

  private

  # Builds a Sense from one sense node.
  # @param e the sense node
  # @return [Sense]
  def extract_sense(e)
    gloss_nodes = e.search(JDict::JMDictConstants::Elements::GLOSS)
    pos_nodes = e.search(JDict::JMDictConstants::Elements::PART_OF_SPEECH)

    parts_of_speech = pos_nodes.map { |pos| pos.inner_html }
    glosses = gloss_nodes.map { |gloss| gloss.content }

    # Assume the language of the whole sense is the language
    # of the first gloss (in practice, there is never a gloss
    # with more than one language in the official JMDict)
    first_gloss = e.at(JDict::JMDictConstants::Elements::GLOSS)
    language = (first_gloss && first_gloss.attr("xml:lang")) || "en"

    Sense.new(parts_of_speech, glosses, language)
  end
end
|
60
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#include Constants #XML constants from the dictionary file
|
2
|
+
|
3
|
+
# Entries consist of kanji elements, kana elements,
|
4
|
+
# general information and sense elements. Each entry must have at
|
5
|
+
# least one kana element and one sense element. Others are optional.
|
6
|
+
module JDict
|
7
|
+
# A single dictionary entry: a sequence number, kanji spellings, kana
# readings and senses.
class Entry
  attr_accessor :sequence_number, :kanji, :kana, :senses

  # Create a new Entry
  # @param sequence_number [Integer] the entry's sequence number
  # @param kanji [Array<String>] kanji spellings
  # @param kana [Array<String>] kana readings
  # @param senses [Array] the senses of the entry
  def initialize(sequence_number, kanji, kana, senses)
    @sequence_number = sequence_number
    @kanji = kanji
    @kana = kana
    @senses = senses
  end

  # Converts an SQLite row from the index to the Entry format
  # @param row an object indexable by the column names "sequence_number",
  #   "kanji", "kana" and "senses"
  # @return [Entry]
  def self.from_sql(row)
    sequence_number = row["sequence_number"].to_i
    # kanji and kana are stored joined with ", " (see #to_sql)
    kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
    kana = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }
    # the serialized senses are sorted as strings before being deserialized
    senses = row["senses"].split(SerialConstants::SENSE_SENTINEL).sort.map do |txt|
      Sense.from_sql(txt)
    end

    new(sequence_number, kanji, kana, senses)
  end

  # Converts an Entry to the named bind parameters used to index it into
  # the SQLite database
  # @return [Hash{String => String}] keys are ":sequence_number", ":kanji",
  #   ":kana" and ":senses"
  def to_sql
    sense_strings = senses.map(&:to_sql).join(SerialConstants::SENSE_SENTINEL)

    { ':sequence_number' => sequence_number.to_s,
      ':kanji' => kanji.join(", "),
      ':kana' => kana.join(", "),
      ':senses' => sense_strings }
  end

  # Get an array of +Senses+ for the specified language
  # @param l the language to compare each sense's +language+ against
  def senses_by_language(l)
    senses.select { |s| s.language == l }
  end

  # @return [String] the kanji and kana on the first line followed by the
  #   numbered senses, each part terminated by a newline
  def to_s
    "#{kanji_to_s}#{kana_to_s}\n#{senses_to_s}\n"
  end

  # @return [String] the kanji spellings joined with ", "
  def kanji_to_s
    @kanji.join(', ')
  end

  # @return [String, nil] the kana readings in parentheses with a leading
  #   space, or nil when kana is nil
  def kana_to_s
    " (#{@kana.join(', ')})" unless @kana.nil?
  end

  # @param delimiter [String] separator placed between the numbered senses
  # @return [String] the senses numbered from 1
  def senses_to_s(delimiter = "\n")
    @senses.each_with_index.map { |sense, i| "#{i + 1}. #{sense}" }.join(delimiter)
  end
end
|
64
|
+
end
|