ruby-jdict 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSING +28 -28
- data/README.md +18 -20
- data/Rakefile +41 -30
- data/examples/query.rb +19 -22
- data/lib/ruby-jdict.rb +14 -0
- data/lib/{constants.rb → ruby-jdict/constants.rb} +73 -64
- data/lib/ruby-jdict/convert.rb +33 -0
- data/lib/ruby-jdict/dictionary.rb +59 -0
- data/lib/ruby-jdict/index.rb +151 -0
- data/lib/ruby-jdict/indexer/dictionary_indexer.rb +28 -0
- data/lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb +164 -0
- data/lib/ruby-jdict/indexer/nokogiri_dictionary_indexer.rb +60 -0
- data/lib/ruby-jdict/jdict.rb +2 -0
- data/lib/ruby-jdict/models/entry.rb +64 -0
- data/lib/ruby-jdict/models/sense.rb +81 -0
- data/lib/ruby-jdict/version.rb +3 -3
- data/spec/convert_spec.rb +27 -0
- data/spec/dictionary_spec.rb +113 -113
- data/spec/entry_spec.rb +25 -0
- data/spec/fixtures/feeds/sample_entry.xml +32 -32
- data/spec/index_spec.rb +82 -84
- data/spec/spec_helper.rb +49 -49
- metadata +35 -36
- data/examples/lst.txt +0 -4
- data/lib/configuration.rb +0 -34
- data/lib/dictionaries/jmdict.rb +0 -38
- data/lib/dictionary.rb +0 -90
- data/lib/downloader.rb +0 -42
- data/lib/entry.rb +0 -101
- data/lib/index.rb +0 -305
- data/lib/jdict.rb +0 -20
- data/lib/kana.rb +0 -4
- data/lib/kanji.rb +0 -4
- data/lib/sense.rb +0 -28
- data/lib/unicode.rb +0 -63
- data/spec/configuration_spec.rb +0 -20
- data/spec/jmdict_spec.rb +0 -19
data/lib/entry.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
#include Constants #XML constants from the dictionary file
|
2
|
-
|
3
|
-
# Entries consist of kanji elements, kana elements,
|
4
|
-
# general information and sense elements. Each entry must have at
|
5
|
-
# least one kana element and one sense element. Others are optional.
|
6
|
-
module JDict
|
7
|
-
# A single dictionary entry: a sequence number, its kanji and kana
# readings, and its senses.
class Entry
  attr_accessor :sequence_number, :kanji, :kana, :senses

  KANA_RE = /^kana/
  SENSE_RE = /^sense/
  PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
  # Matches the "[[]] " prefix that #to_sql writes for an empty
  # parts-of-speech list (PART_OF_SPEECH_RE requires at least one character).
  EMPTY_PART_OF_SPEECH_RE = /\A\[\[\]\]\s+/

  # Separators used by the serialized form of the senses column.
  MEANING_SENTINEL = '**'
  PART_OF_SPEECH_SENTINEL = '$$'
  SENSE_SENTINEL = '%%'
  LANGUAGE_SENTINEL = '&&'
  GLOSS_SENTINEL = '@@'

  # Create a new Entry
  #   entry = Entry.new(sequence_number, kanji, kana, senses)
  # @param sequence_number [Integer] the entry's sequence number
  # @param kanji [Array<String>] the kanji readings
  # @param kana [Array<String>] the kana readings
  # @param senses [Array] the senses of this entry
  def initialize(sequence_number, kanji, kana, senses)
    @sequence_number = sequence_number
    @kanji = kanji
    @kana = kana
    @senses = senses
  end

  # Converts an SQLite row from the index to the Entry format
  # @param row [#[]] a row exposing "sequence_number", "kanji", "kana" and "senses"
  # @return [Entry] the deserialized entry
  def self.from_sql(row)
    sequence_number = row["sequence_number"].to_i
    kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
    kana = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }

    # NOTE(review): the serialized senses are sorted as plain strings before
    # parsing (kept from the original); confirm this ordering is intended.
    senses = row["senses"].split(SENSE_SENTINEL).sort.map { |txt| sense_from_sql(txt) }

    new(sequence_number, kanji, kana, senses)
  end

  # Parses one serialized sense ("[[pos$$pos]] lang&&m1**m2@@lang&&m3").
  # @param txt [String] the serialized sense
  # @return [Sense] the sense, holding only the glosses of the configured
  #   language (falling back to English)
  def self.sense_from_sql(txt)
    match = PART_OF_SPEECH_RE.match(txt)
    if match && match.pre_match.empty?
      parts_of_speech = match[1].split(PART_OF_SPEECH_SENTINEL)
      # Everything after the prefix is gloss text; using the match itself
      # avoids guessing the prefix length.
      gloss_text = match.post_match
    else
      parts_of_speech = nil
      # Only drop a prefix that is really there, so that a sense serialized
      # without parts of speech does not lose its first characters.
      gloss_text = txt.sub(EMPTY_PART_OF_SPEECH_RE, '')
    end

    glosses = {}
    gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL).each do |str|
      lang, meaning_string = str.split(LANGUAGE_SENTINEL, 2)
      # A fragment without a language or without meanings carries no gloss.
      next if lang.nil? || meaning_string.nil?

      (glosses[lang.to_sym] ||= []) << meaning_string.split(MEANING_SENTINEL)
    end

    glosses_for_lang = glosses[JDict.config.language] || glosses[JDict::JMDictConstants::Languages::ENGLISH]
    Sense.new(parts_of_speech, glosses_for_lang)
  end
  private_class_method :sense_from_sql

  # Converts an Entry to a string to be indexed into the SQLite database
  # @return [Hash{String => String}] the bind parameters for this Entry
  def to_sql
    sense_strings = senses.map do |s|
      prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''

      # FIXME: it fails when retrieving entries from an existing index, because only one language is retrieved and the 'lang' field is nil
      glosses = s.glosses.map { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }
      prefix + glosses.join(GLOSS_SENTINEL)
    end

    { ':sequence_number' => sequence_number.to_s,
      ':kanji' => kanji.join(", "),
      ':kana' => kana.join(", "),
      ':senses' => sense_strings.join(SENSE_SENTINEL) }
  end

  # Get an array of +Senses+ for the specified language
  # @param l [Symbol] the language to filter by
  # @return [Array] senses that report language +l+, or whose glosses are
  #   keyed by language and contain +l+
  def senses_by_language(l)
    senses.select do |s|
      if s.respond_to?(:language)
        s.language == l
      else
        s.glosses.respond_to?(:key?) && s.glosses.key?(l)
      end
    end
  end

  # @return [String] "kanji (kana)" followed by the numbered senses
  def to_s
    "#{kanji_to_s} (#{kana_to_s})\n#{senses_to_s}\n"
  end

  # @return [String, nil] the kanji readings joined by ", " (nil when unset)
  def kanji_to_s
    @kanji.join(', ') unless @kanji.nil?
  end

  # @return [String, nil] the kana readings joined by ", " (nil when unset)
  def kana_to_s
    @kana.join(', ') unless @kana.nil?
  end

  # @param delimiter [String] text placed between the numbered senses
  # @return [String] the senses, numbered from 1
  def senses_to_s(delimiter = "\n")
    @senses.map.with_index(1) { |sense, i| "#{i}. #{sense.to_s}" }.join(delimiter)
  end
end
|
101
|
-
end
|
data/lib/index.rb
DELETED
@@ -1,305 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'amalgalite'
|
3
|
-
require 'libxml'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'io/console'
|
6
|
-
|
7
|
-
require_relative 'constants' #XML constants from the dictionary file
|
8
|
-
|
9
|
-
require_relative 'entry' #dictionary elements
|
10
|
-
require_relative 'kanji' #...
|
11
|
-
require_relative 'kana' #...
|
12
|
-
require_relative 'sense'
|
13
|
-
|
14
|
-
include LibXML
|
15
|
-
|
16
|
-
module JDict
|
17
|
-
# Full-text search index over a JMdict dictionary file, stored in an
# Amalgalite (SQLite) FTS5 table next to the dictionary.
class DictIndex

  LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
  NUM_ENTRIES_TO_INDEX = 50
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  # @return [String] path to the dictionary file given to #initialize
  attr_reader :path

  # Initialize a full-text search index backend for JMdict
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if no file exists at +path+
  def initialize(path)
    @path = path
    @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)
    # Filled lazily by #get_pos (abbreviated part of speech => full definition).
    @pos_hash = {}

    raise "No dictionary found at path #{@dictionary_path}" unless File.exist? @dictionary_path

    db_file = File.join(@index_path, "fts5.db")

    # In debug mode always start from a fresh database file.
    File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

    @index = Amalgalite::Database.new(db_file)

    create_schema

    build_index unless built?
  end

  # Creates the SQL schema for the Amalgalite database
  def create_schema
    return if @index.schema.tables['search']

    @index.execute_batch <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
        sequence_number,
        kanji,
        kana,
        senses
      );
    SQL
    @index.reload_schema!
  end

  # @return [Boolean] whether the search table contains at least one row
  def built?
    @index.first_value_from( "SELECT count(*) from search" ) != 0
  end

  # Builds the FTS5 MATCH expression for a search term.
  # A term starting with "seq:" is looked up by sequence number; any other
  # term is matched against the kanji, kana and senses columns.
  # The given string is not modified.
  # @param term [String] the search string
  # @param exact [Boolean] when false, the term is matched as a prefix
  # @return [String] the FTS5 query
  def make_query(term, exact)
    normalized = normalize_term(term)

    if normalized.start_with?('seq:')
      "sequence_number : \"#{escape_fts_string(normalized[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts_string(normalized)}\""
      query += "*" unless exact
      query
    end
  end

  # Returns the search results as an array of +Entry+
  # @param term [String] the search string
  # @param exact [Boolean] only return entries whose kanji or kana equal the term
  # @param language [Symbol] the language to return results in (currently unused)
  # @return [Array(Entry)] the results of the search
  def search(term, exact=false, language=LANGUAGE_DEFAULT)
    raise "Index not found at path #{@index_path}" unless File.exist? @index_path

    # Entries are compared against the normalized term, exactly as the query is.
    normalized = normalize_term(term)
    query = make_query(normalized, exact)

    results = []
    @index.execute("SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?", query, JDict.config.num_results) do |row|
      entry = Entry.from_sql(row)

      is_exact_match = entry.kanji.include?(normalized) || entry.kana.include?(normalized)
      next if exact && !is_exact_match

      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # Exact matches (score 1.0) first; return only the entries.
    results.sort_by { |score, _entry| -score }.map { |_score, entry| entry }
  end

  # Builds the full-text search index
  # @param overwrite [Boolean] force a build even if the index path already exists
  #   (NOTE(review): currently unused)
  # @param dictionary_path [String] path to the dictionary file
  # @return [Integer] the number of indexed entries
  def build_index(overwrite=false, dictionary_path=nil)
    @dictionary_path = dictionary_path unless dictionary_path.nil?
    raise "No dictionary path was provided" if @dictionary_path.nil?
    raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    reader = open_reader(@dictionary_path)

    puts "Building index..."

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    # components of an entry
    entry_sequence_num, kanji, kana, senses = 0, [], [], []
    glosses = {}
    parts_of_speech = []

    entries_added = 0

    begin
      @index.transaction do |db_transaction|
        # read until the end
        while reader.read
          # check what type of node we're currently on
          case reader.node_type

          # start-of-element node
          when XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              language = reader.node.lang || LANGUAGE_DEFAULT
              language = language.intern
              text = reader.next_text
              (glosses[language] ||= []) << text unless text.empty?

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              # The text is not stored; the reader is only advanced past it.
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          # end-of-element node
          when XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              # build sense
              senses << Sense.new(parts_of_speech, glosses)

              # clear data for the next sense
              glosses = {}
              parts_of_speech = []

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql

              db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
                stmt.execute( insert_data )
              end

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1
            end
          end
        end
      end
    ensure
      # Done reading & indexing (also releases the reader when indexing fails)
      reader.close
    end

    entries_added
  end

  # Builds the index unless something already exists at the index path.
  # NOTE(review): @index_path is set to the directory containing the
  # dictionary, and #initialize requires the dictionary to exist, so this
  # guard looks like it always raises. Confirm the intended behaviour
  # (e.g. clearing the search table first) before relying on this method.
  # @raise [RuntimeError] if the index path exists
  def rebuild_index
    raise "Index already exists at path #{@index_path}" if File.exist? @index_path
    build_index
  end

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
    reader
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary
  # @return [Hash{Symbol => String}] entity name (as a symbol) => definition,
  #   taken from the <!ENTITY ...> declarations of the document type node;
  #   empty when the first element is reached before any document type node
  def build_pos_hash
    pos_hash = {}
    reader = open_reader(@dictionary_path)
    begin
      # Stop at the end of the input instead of spinning when neither a
      # document type nor an element node is ever reached.
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # The declarations are scraped from the node's text; the original
          # author noted that iterating over the node's children segfaults.
          reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
            pos_hash[pos_to_sym(abbrev)] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          break
        end
      end
    ensure
      reader.close
    end
    pos_hash
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end

  # Retrieves the definition of a part-of-speech from its abbreviation
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech
  def get_pos(pos)
    # Keep the parsed table; building it without storing it left every lookup nil.
    @pos_hash = build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # Returns a copy of +term+ with full-width katakana converted to hiragana.
  # TODO: convert half-width katakana to hiragana
  def normalize_term(term)
    term.tr('ァ-ン','ぁ-ん')
  end

  # Escapes +text+ for use inside a double-quoted FTS5 string, where an
  # embedded double quote is written as two double quotes.
  def escape_fts_string(text)
    text.gsub('"', '""')
  end
end
|
285
|
-
|
286
|
-
# Add custom parsing methods to XML::Reader
|
287
|
-
class XML::Reader
  # Get the next text node
  # Reads forward, starting with the current node, until the reader is on a
  # text node or #read returns a falsy value, then returns the current value.
  def next_text
    loop do
      break if node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Get the next entity node
  # Reads forward, starting with the current node, until the reader is on an
  # entity or entity-reference node or #read returns a falsy value, then
  # returns the current value.
  def next_entity
    entity_types = [XML::Reader::TYPE_ENTITY, XML::Reader::TYPE_ENTITY_REFERENCE]
    loop do
      break if entity_types.include?(node_type)
      break unless read
    end
    value
  end
end
|
305
|
-
end
|
data/lib/jdict.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'configuration'
|
2
|
-
require 'dictionaries/jmdict'
|
3
|
-
|
4
|
-
# Entry point for library-wide configuration.
module JDict
  class << self
    # Only the writer is generated here: the reader is defined below with
    # lazy initialization, and generating it twice triggers a
    # method-redefinition warning.
    attr_writer :config
  end

  # @return [Configuration] the current configuration, created on first use
  def self.config
    @config ||= Configuration.new
  end

  # Replaces the current configuration with a fresh one.
  # @return [Configuration] the new configuration
  def self.reset
    @config = Configuration.new
  end

  # Yields the current configuration so that it can be modified in place.
  # @yieldparam config [Configuration] the current configuration
  def self.configure
    yield(config)
  end
end
|
data/lib/kana.rb
DELETED
data/lib/kanji.rb
DELETED
data/lib/sense.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# The sense element will record the translational equivalent
|
2
|
-
# of the Japanese word, plus other related information. Where there
|
3
|
-
# are several distinctly different meanings of the word, multiple
|
4
|
-
# sense elements will be employed.
|
5
|
-
module JDict
|
6
|
-
# One meaning of a dictionary entry: its parts of speech and its glosses.
class Sense
  attr_reader :parts_of_speech, :glosses

  #
  # Create a new +Sense+
  # @param parts_of_speech [Array<String>, nil] part-of-speech abbreviations
  # @param glosses [Array, Hash, nil] the glosses, either as a (possibly
  #   nested) list of meanings or as a hash of language => meanings
  def initialize(parts_of_speech, glosses)
    @parts_of_speech, @glosses = parts_of_speech, glosses
  end

  # @return [String] "[pos,pos] gloss; gloss"
  def to_s
    parts_of_speech_to_s(@parts_of_speech) + glosses_to_s(@glosses)
  end

  private

  # Joins every meaning with "; ".
  # Accepts the language-keyed hash as well as the list form, and treats a
  # missing gloss list as empty instead of raising.
  def glosses_to_s(glosses)
    meanings = glosses.respond_to?(:values) ? glosses.values : Array(glosses)
    meanings.flatten.join('; ')
  end

  # Renders the parts of speech as "[a,b] ", or as an empty string when
  # there are none.
  def parts_of_speech_to_s(parts_of_speech)
    return '' if parts_of_speech.nil? || parts_of_speech.empty?

    '[' + parts_of_speech.join(',') + '] '
  end
end
|
28
|
-
end
|