ruby-jdict 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSING +28 -28
- data/README.md +18 -20
- data/Rakefile +41 -30
- data/examples/query.rb +19 -22
- data/lib/ruby-jdict.rb +14 -0
- data/lib/{constants.rb → ruby-jdict/constants.rb} +73 -64
- data/lib/ruby-jdict/convert.rb +33 -0
- data/lib/ruby-jdict/dictionary.rb +59 -0
- data/lib/ruby-jdict/index.rb +151 -0
- data/lib/ruby-jdict/indexer/dictionary_indexer.rb +28 -0
- data/lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb +164 -0
- data/lib/ruby-jdict/indexer/nokogiri_dictionary_indexer.rb +60 -0
- data/lib/ruby-jdict/jdict.rb +2 -0
- data/lib/ruby-jdict/models/entry.rb +64 -0
- data/lib/ruby-jdict/models/sense.rb +81 -0
- data/lib/ruby-jdict/version.rb +3 -3
- data/spec/convert_spec.rb +27 -0
- data/spec/dictionary_spec.rb +113 -113
- data/spec/entry_spec.rb +25 -0
- data/spec/fixtures/feeds/sample_entry.xml +32 -32
- data/spec/index_spec.rb +82 -84
- data/spec/spec_helper.rb +49 -49
- metadata +35 -36
- data/examples/lst.txt +0 -4
- data/lib/configuration.rb +0 -34
- data/lib/dictionaries/jmdict.rb +0 -38
- data/lib/dictionary.rb +0 -90
- data/lib/downloader.rb +0 -42
- data/lib/entry.rb +0 -101
- data/lib/index.rb +0 -305
- data/lib/jdict.rb +0 -20
- data/lib/kana.rb +0 -4
- data/lib/kanji.rb +0 -4
- data/lib/sense.rb +0 -28
- data/lib/unicode.rb +0 -63
- data/spec/configuration_spec.rb +0 -20
- data/spec/jmdict_spec.rb +0 -19
data/lib/entry.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
#include Constants #XML constants from the dictionary file
|
2
|
-
|
3
|
-
# Entries consist of kanji elements, kana elements,
|
4
|
-
# general information and sense elements. Each entry must have at
|
5
|
-
# least one kana element and one sense element. Others are optional.
|
6
|
-
module JDict
|
7
|
-
class Entry

  attr_accessor :sequence_number, :kanji, :kana, :senses

  KANA_RE = /^kana/
  SENSE_RE = /^sense/
  # Matches the "[[pos$$pos]] " prefix that #to_sql writes in front of a sense
  PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/

  # Separators used by the serialized sense format (see #to_sql)
  MEANING_SENTINEL = '**'
  PART_OF_SPEECH_SENTINEL = '$$'
  SENSE_SENTINEL = '%%'
  LANGUAGE_SENTINEL = '&&'
  GLOSS_SENTINEL = '@@'

  # Prefix #to_sql produces for a sense whose parts-of-speech list is empty.
  # PART_OF_SPEECH_RE does not match it because it requires at least one character.
  EMPTY_PART_OF_SPEECH_PREFIX = '[[]] '

  # Create a new Entry
  # @param sequence_number [Integer] the sequence number of the entry
  # @param kanji [Array<String>] the kanji readings
  # @param kana [Array<String>] the kana readings
  # @param senses [Array<Sense>] the senses of the entry
  def initialize(sequence_number, kanji, kana, senses)
    @sequence_number, @kanji, @kana, @senses = sequence_number, kanji, kana, senses
  end

  # Converts an SQLite row from the index to the Entry format
  # @param row [#[]] row with "sequence_number", "kanji", "kana" and "senses" string columns
  # @return [Entry] the deserialized entry; each sense carries only the glosses
  #   for JDict.config.language, falling back to English
  def self.from_sql(row)
    sequence_number = row["sequence_number"].to_i
    kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
    kana = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }

    # NOTE: senses are ordered by their serialized text, not by dictionary order
    senses = row["senses"].split(SENSE_SENTINEL).sort.map do |txt|
      parts_of_speech, gloss_text = split_parts_of_speech(txt)
      glosses = parse_glosses(gloss_text)
      glosses_for_lang = glosses[JDict.config.language] || glosses[JDict::JMDictConstants::Languages::ENGLISH]
      Sense.new(parts_of_speech, glosses_for_lang)
    end

    new(sequence_number, kanji, kana, senses)
  end

  # Splits a serialized sense into its parts of speech and the remaining gloss text.
  # The prefix is removed by matching it, not by assuming a fixed width, so a
  # sense serialized without any prefix (nil parts of speech) keeps its language tag.
  # @param txt [String] one serialized sense
  # @return [Array(Array<String>, String), Array(nil, String)] parts of speech (nil when absent) and gloss text
  def self.split_parts_of_speech(txt)
    match = PART_OF_SPEECH_RE.match(txt)
    if match && match.begin(0).zero?
      [match[1].split(PART_OF_SPEECH_SENTINEL), match.post_match]
    elsif txt.start_with?(EMPTY_PART_OF_SPEECH_PREFIX)
      [nil, txt[EMPTY_PART_OF_SPEECH_PREFIX.length..-1]]
    else
      [nil, txt]
    end
  end

  # Parses "lang&&meaning**meaning@@lang&&meaning" into a hash of language => list of meaning lists
  # @param gloss_text [String] the gloss part of a serialized sense
  # @return [Hash{Symbol => Array<Array<String>>}] the meanings grouped by language
  def self.parse_glosses(gloss_text)
    glosses = {}
    gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL).each do |str|
      # limit of 2 keeps a meaning intact even if it contains the language separator
      lang, meaning_string = str.split(LANGUAGE_SENTINEL, 2)
      next if lang.nil? || meaning_string.nil? # malformed gloss, nothing to record

      (glosses[lang.to_sym] ||= []) << meaning_string.split(MEANING_SENTINEL)
    end
    glosses
  end

  private_class_method :split_parts_of_speech, :parse_glosses

  # Converts an Entry to a string to be indexed into the SQLite database
  # @return [Hash{String => String}] the bind parameters (':sequence_number', ':kanji', ':kana', ':senses')
  def to_sql
    sense_strings = senses.map do |s|
      prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''

      # FIXME: this expects s.glosses to be a Hash of language => texts. Senses built by
      # .from_sql carry the glosses of a single language without the language key, so
      # entries retrieved from an existing index cannot be serialized again.
      gloss_strings = s.glosses.map do |lang, texts|
        "#{lang}#{LANGUAGE_SENTINEL}#{texts.join(MEANING_SENTINEL)}"
      end

      prefix + gloss_strings.join(GLOSS_SENTINEL)
    end

    { ':sequence_number' => sequence_number.to_s,
      ':kanji' => kanji.join(", "),
      ':kana' => kana.join(", "),
      ':senses' => sense_strings.join(SENSE_SENTINEL) }
  end

  # Get an array of +Senses+ for the specified language
  # @param l [Symbol] the language to look for
  # @return [Array<Sense>] senses that expose a matching +language+, or whose
  #   glosses are keyed by language and include +l+
  def senses_by_language(l)
    senses.select do |s|
      if s.respond_to?(:language)
        s.language == l
      else
        # glosses is a language-keyed Hash for freshly parsed senses; senses read
        # back from the index hold a plain Array and cannot be matched by language
        s.glosses.respond_to?(:key?) && s.glosses.key?(l)
      end
    end
  end

  # @return [String] "kanji (kana)" on the first line, followed by the numbered senses
  def to_s
    "#{kanji_to_s} (#{kana_to_s})\n#{senses_to_s}\n"
  end

  # @return [String] the kanji readings separated by ", "
  def kanji_to_s
    @kanji.join(', ')
  end

  # @return [String, nil] the kana readings separated by ", ", or nil when there is no kana list
  def kana_to_s
    @kana.join(', ') unless @kana.nil?
  end

  # @param delimiter [String] the text placed between two senses
  # @return [String] the senses, numbered from 1
  def senses_to_s(delimiter = "\n")
    list = @senses.map.with_index(1) do |sense, i|
      "#{i}. #{sense.to_s}"
    end
    list.join(delimiter)
  end
end
|
101
|
-
end
|
data/lib/index.rb
DELETED
@@ -1,305 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'amalgalite'
|
3
|
-
require 'libxml'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'io/console'
|
6
|
-
|
7
|
-
require_relative 'constants' #XML constants from the dictionary file
|
8
|
-
|
9
|
-
require_relative 'entry' #dictionary elements
|
10
|
-
require_relative 'kanji' #...
|
11
|
-
require_relative 'kana' #...
|
12
|
-
require_relative 'sense'
|
13
|
-
|
14
|
-
include LibXML
|
15
|
-
|
16
|
-
module JDict
|
17
|
-
class DictIndex

  LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
  NUM_ENTRIES_TO_INDEX = 50
  # Captures the name and the quoted value of an <!ENTITY name "value"> declaration
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  # @return [String] the dictionary path this index was created with
  attr_reader :path

  # Initialize a full-text search index backend for JMdict.
  # The database file "fts5.db" is kept in the same directory as the dictionary;
  # the index is built right away when the search table is empty.
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if there is no file at +path+
  def initialize(path)
    @path = @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)
    # abbreviated parts of speech => full definitions, filled lazily by #get_pos
    @pos_hash = {}

    raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    db_file = File.join(@index_path, "fts5.db")

    # in debug mode always start from an empty database
    File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

    @index = Amalgalite::Database.new(db_file)

    create_schema

    build_index unless built?
  end

  # Creates the SQL schema for the Amalgalite database
  # (a single FTS5 virtual table named "search"), unless it already exists
  def create_schema
    schema = @index.schema
    unless schema.tables['search']
      @index.execute_batch <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
      sequence_number,
      kanji,
      kana,
      senses
      );
      SQL
      @index.reload_schema!
    end
  end

  # @return [Boolean] true if the search table contains at least one row
  def built?
    @index.first_value_from( "SELECT count(*) from search" ) != 0
  end

  # Builds the FTS5 MATCH expression for a search term.
  # A term starting with "seq:" is looked up in the sequence_number column only;
  # any other term is matched against kanji, kana and senses.
  # The given string is not modified.
  # @param term [String] the search string
  # @param exact [Boolean] when false, the term is used as a prefix
  # @return [String] the MATCH expression
  def make_query(term, exact)
    # convert full-width katakana to hiragana
    # TODO: convert half-width katakana to hiragana
    term = katakana_to_hiragana(term)

    if term.start_with?('seq:')
      "sequence_number : \"#{escape_fts_string(term[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts_string(term)}\""
      exact ? query : "#{query}*"
    end
  end

  # Returns the search results as an array of +Entry+
  # @param term [String] the search string
  # @param exact [Boolean] only return entries whose kanji or kana equal the term
  # @param language [Symbol] the language to return results in (currently unused here;
  #   Entry.from_sql selects the language from JDict.config)
  # @return [Array(Entry)] the results of the search, exact matches first
  def search(term, exact=false, language=LANGUAGE_DEFAULT)
    raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

    # compare against the same normalized form that is sent to the index,
    # without touching the caller's string
    term = katakana_to_hiragana(term)
    query = make_query(term, exact)

    results = []

    @index.execute("SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?", query, JDict.config.num_results) do |row|
      entry = Entry.from_sql(row)

      is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)
      next if exact && !is_exact_match

      # exact matches score 1.0, everything else 0.0
      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # Sort the results by first column (score) and return only the second column (entry)
    results.sort_by { |score, _entry| -score }.map { |_score, entry| entry }
  end

  # Builds the full-text search index
  # @param overwrite [Boolean] force a build even if the index path already exists (currently unused)
  # @param dictionary_path [String] path to the dictionary file
  # @return [Integer] the number of indexed entries
  # @raise [RuntimeError] if no dictionary path is known, the file is missing,
  #   or an entry without kana is encountered
  def build_index(overwrite=false, dictionary_path=nil)
    @dictionary_path = dictionary_path unless dictionary_path.nil?
    raise "No dictionary path was provided" if @dictionary_path.nil?
    raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    reader = open_reader(@dictionary_path)

    puts "Building index..."

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    # components of an entry
    entry_sequence_num, kanji, kana, senses = 0, [], [], []
    glosses = {}
    parts_of_speech = []

    entries_added = 0

    begin
      @index.transaction do |db_transaction|

        # read until the end
        while reader.read

          # check what type of node we're currently on
          case reader.node_type

          # start-of-element node
          when XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            # JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              # the language has to be read before next_text advances the reader
              language = reader.node.lang || LANGUAGE_DEFAULT
              language = language.intern
              text = reader.next_text
              unless text.empty?
                (glosses[language] ||= []) << text
              end

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              # the text is not indexed; the call only advances the reader past it
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          # end-of-element node
          when XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              # build sense
              senses << Sense.new(parts_of_speech, glosses)

              # clear data for the next sense
              glosses = {}
              parts_of_speech = []

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              #index
              insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql

              db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
                stmt.execute( insert_data )
              end

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1
            end
          end
        end
      end
    ensure
      # Done reading & indexing (or aborted): release the reader in both cases
      reader.close
    end

    entries_added
  end

  # Builds the index, refusing to do so when the index path exists.
  # NOTE(review): @index_path is the directory that contains the dictionary, which
  # #initialize has already required to exist, so this looks like it always raises.
  # Confirm the intended check (the "fts5.db" file?) before relying on this method.
  def rebuild_index
    raise "Index already exists at path #{@index_path}" if File.exist?(@index_path)
    build_index
  end

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    # File.join with a single argument leaves a String path as it is
    jmdict_path = File.join(dictionary_path)
    reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
    reader
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary.
  # Only the document type declaration is inspected; reading stops at the first
  # doctype or element node, or when the reader has nothing left to read.
  # @return [Hash{Symbol => String}] part-of-speech symbol => full definition
  def build_pos_hash
    pos_hash = {}
    reader = open_reader(@dictionary_path)
    begin
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # iterating over the children of the doctype node segfaulted,
          # so the entities are scraped from its string form instead
          reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
            pos_hash[pos_to_sym(abbrev)] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          # the doctype comes before the first element; nothing more to find
          break
        end
      end
    ensure
      reader.close
    end
    pos_hash
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end

  # Retrieves the definition of a part-of-speech from its abbreviation
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech, nil if unknown
  def get_pos(pos)
    # keep the parsed hash so the dictionary is only scanned while it is still empty
    @pos_hash = build_pos_hash if @pos_hash.empty?
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # Returns a copy of +term+ with full-width katakana replaced by hiragana
  def katakana_to_hiragana(term)
    term.tr('ァ-ン','ぁ-ん')
  end

  # Doubles embedded double quotes so +text+ can be placed inside an FTS5 string
  def escape_fts_string(text)
    text.gsub('"', '""')
  end
end
|
285
|
-
|
286
|
-
# Add custom parsing methods to XML::Reader
|
287
|
-
class XML::Reader

  public

  # Advance to the next text node.
  # The current node is checked first; reading stops as soon as the reader is
  # on a text node or #read reports that nothing is left.
  # @return the value of the node the reader stopped on
  def next_text
    loop do
      break if node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Advance to the next entity or entity-reference node.
  # The current node is checked first; reading stops as soon as the reader is
  # on such a node or #read reports that nothing is left.
  # @return the value of the node the reader stopped on
  def next_entity
    loop do
      current = node_type
      break if current == XML::Reader::TYPE_ENTITY || current == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
|
305
|
-
end
|
data/lib/jdict.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'configuration'
|
2
|
-
require 'dictionaries/jmdict'
|
3
|
-
|
4
|
-
module JDict
  class << self
    # Only the writer is generated: the reader is defined below with a lazy
    # default, and generating it here as well would redefine the method.
    attr_writer :config
  end

  # @return [Configuration] the current configuration, created on first use
  def self.config
    @config ||= Configuration.new
  end

  # Replaces the current configuration with a fresh one
  # @return [Configuration] the new configuration
  def self.reset
    @config = Configuration.new
  end

  # Yields the current configuration so it can be modified in a block
  # @yieldparam config [Configuration] the current configuration
  def self.configure
    yield(config)
  end
end
|
data/lib/kana.rb
DELETED
data/lib/kanji.rb
DELETED
data/lib/sense.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# The sense element will record the translational equivalent
|
2
|
-
# of the Japanese word, plus other related information. Where there
|
3
|
-
# are several distinctly different meanings of the word, multiple
|
4
|
-
# sense elements will be employed.
|
5
|
-
module JDict
|
6
|
-
class Sense
  attr_reader :parts_of_speech, :glosses

  # Create a new +Sense+
  # @param parts_of_speech [Array<String>, nil] the parts of speech, nil if there are none
  # @param glosses [Array, Hash, nil] the glosses, either as a list of texts or
  #   as a hash of language => texts
  def initialize(parts_of_speech, glosses)
    @parts_of_speech, @glosses = parts_of_speech, glosses
  end

  # @return [String] the parts of speech in brackets (if any) followed by the glosses
  def to_s
    parts_of_speech_to_s(@parts_of_speech) + glosses_to_s(@glosses)
  end

  private

  # Joins all gloss texts with "; ".
  # A language-keyed hash contributes the texts of every language; nil yields ''.
  def glosses_to_s(glosses)
    texts = glosses.is_a?(Hash) ? glosses.values : Array(glosses)
    # Array#join also flattens nested lists of meanings
    texts.join('; ')
  end

  # Formats the parts of speech as "[a,b] ", or '' when there are none
  def parts_of_speech_to_s(parts_of_speech)
    parts_of_speech.nil? ? '' : '[' + parts_of_speech.join(',') + '] '
  end
end
|
28
|
-
end
|