ruby-jdict 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/lib/entry.rb DELETED
@@ -1,101 +0,0 @@
1
- #include Constants #XML constants from the dictionary file
2
-
3
# Entries consist of kanji elements, kana elements,
# general information and sense elements. Each entry must have at
# least one kana element and one sense element. Others are optional.
module JDict
  class Entry
    attr_accessor :sequence_number, :kanji, :kana, :senses

    KANA_RE = /^kana/
    SENSE_RE = /^sense/
    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
    # Matches the "[[pos$$pos]] " prefix written by #to_sql, including the
    # empty "[[]] " form produced for a sense with an empty part-of-speech list.
    POS_PREFIX_RE = /\A\[\[([^\]]*)\]\]\s+/

    MEANING_SENTINEL = '**'.freeze
    PART_OF_SPEECH_SENTINEL = '$$'.freeze
    SENSE_SENTINEL = '%%'.freeze
    LANGUAGE_SENTINEL = '&&'.freeze
    GLOSS_SENTINEL = '@@'.freeze

    # Create a new Entry
    # @param sequence_number [Integer] the entry's sequence number
    # @param kanji [Array<String>] kanji readings
    # @param kana [Array<String>] kana readings
    # @param senses [Array<Sense>] the senses of the entry
    def initialize(sequence_number, kanji, kana, senses)
      @sequence_number = sequence_number
      @kanji = kanji
      @kana = kana
      @senses = senses
    end

    # Converts an SQLite row from the index to the Entry format
    # @param row [#[]] a row with "sequence_number", "kanji", "kana" and "senses" string columns
    # @return [Entry] the deserialized entry
    def self.from_sql(row)
      sequence_number = row["sequence_number"].to_i
      kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
      kana = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }
      # NOTE(review): the serialized senses are sorted as strings here, so the
      # order they were stored in is not preserved - confirm this is intended.
      senses = row["senses"].split(SENSE_SENTINEL).sort.map { |txt| sense_from_sql(txt) }
      new(sequence_number, kanji, kana, senses)
    end

    # Parses one serialized sense ("[[pos$$pos]] lang&&meaning**meaning@@lang&&...").
    # The prefix is located with a regexp match instead of a length computed
    # from the inspected scan result, and a sense stored without any prefix is
    # parsed in full rather than losing its first five characters.
    # @param txt [String] the serialized sense
    # @return [Sense] the sense restricted to the configured (or English) glosses
    def self.sense_from_sql(txt)
      prefix = POS_PREFIX_RE.match(txt)
      if prefix
        tags = prefix[1].split(PART_OF_SPEECH_SENTINEL)
        parts_of_speech = tags.empty? ? nil : tags
        gloss_text = prefix.post_match
      else
        parts_of_speech = nil
        gloss_text = txt
      end

      glosses = {}
      gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL).each do |str|
        # limit of 2: a meaning may itself contain the language sentinel
        lang, meaning_string = str.split(LANGUAGE_SENTINEL, 2)
        next if lang.nil? || lang.empty? # skip malformed gloss strings

        # each gloss string contributes one list of meanings
        (glosses[lang.to_sym] ||= []) << meaning_string.to_s.split(MEANING_SENTINEL)
      end

      glosses_for_lang = glosses[JDict.config.language] ||
                         glosses[JDict::JMDictConstants::Languages::ENGLISH] ||
                         []
      Sense.new(parts_of_speech, glosses_for_lang)
    end
    private_class_method :sense_from_sql

    # Converts an Entry to a string to be indexed into the SQLite database
    # @return [Hash{String => String}] the bind parameters for this Entry
    def to_sql
      sense_strings = senses.map do |s|
        prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''

        # FIXME: this fails for entries loaded through .from_sql, because
        # from_sql stores only one language's glosses as an Array, while this
        # code expects a Hash of language => texts.
        prefix + s.glosses.map { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }.join(GLOSS_SENTINEL)
      end

      { ':sequence_number' => sequence_number.to_s,
        ':kanji' => kanji.join(", "),
        ':kana' => kana.join(", "),
        ':senses' => sense_strings.join(SENSE_SENTINEL) }
    end

    # Get an array of +Senses+ for the specified language
    # @param l [Symbol] the language key
    # @return [Array<Sense>] senses that carry glosses for +l+
    def senses_by_language(l)
      senses.select do |s|
        if s.respond_to?(:language)
          s.language == l
        else
          # Sense as defined in this project has no #language; fall back to
          # the keys of a language => texts Hash when glosses is one.
          s.glosses.respond_to?(:key?) && s.glosses.key?(l)
        end
      end
    end

    def to_s
      "#{kanji_to_s} (#{kana_to_s})\n#{senses_to_s}\n"
    end

    def kanji_to_s
      @kanji.join(', ') unless @kanji.nil?
    end

    def kana_to_s
      @kana.join(', ') unless @kana.nil?
    end

    # @param delimiter [String] separator placed between the numbered senses
    # @return [String] the senses as a numbered list starting at 1
    def senses_to_s(delimiter = "\n")
      @senses.map.with_index(1) { |sense, i| "#{i}. #{sense}" }.join(delimiter)
    end
  end
end
data/lib/index.rb DELETED
@@ -1,305 +0,0 @@
1
- # encoding: utf-8
2
- require 'amalgalite'
3
- require 'libxml'
4
- require 'fileutils'
5
- require 'io/console'
6
-
7
- require_relative 'constants' #XML constants from the dictionary file
8
-
9
- require_relative 'entry' #dictionary elements
10
- require_relative 'kanji' #...
11
- require_relative 'kana' #...
12
- require_relative 'sense'
13
-
14
- include LibXML
15
-
16
- module JDict
17
- class DictIndex
18
-
19
- LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
20
- NUM_ENTRIES_TO_INDEX = 50
21
- ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
22
-
23
- attr_reader :path
24
-
25
# Initialize a full-text search index backend for JMdict.
# The SQLite database "fts5.db" is kept in the same directory as the
# dictionary; it is deleted first when JDict.config.debug is set, and the
# index is built whenever the search table is empty.
# @param path [String] path to the dictionary
# @raise [RuntimeError] if no file exists at +path+
def initialize(path)
  @path = path # backs the class's attr_reader :path
  @dictionary_path = path
  @index_path = File.dirname(@dictionary_path)

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  db_file = File.join(@index_path, "fts5.db")

  File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  build_index unless built?

  # make the hash from abbreviated parts of speech to full definitions
  # (previously guarded by ||= after assigning {}, so it was never built)
  @pos_hash = build_pos_hash
end
47
-
48
# Creates the SQL schema for the Amalgalite database.
# Does nothing when the 'search' table is already present; otherwise creates
# the FTS5 virtual table and reloads the cached schema.
def create_schema
  return if @index.schema.tables['search']

  @index.execute_batch <<-SQL
    CREATE VIRTUAL TABLE search USING fts5(
      sequence_number,
      kanji,
      kana,
      senses
    );
  SQL
  @index.reload_schema!
end
63
-
64
# @return [Boolean] true when the row count of the search table is not 0
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  row_count != 0
end
67
-
68
# Builds the FTS5 MATCH expression for a search term.
# A term starting with "seq:" searches the sequence_number column only;
# any other term searches kanji, kana and senses, as a prefix query unless
# +exact+ is set.
# @param term [String] the search string; normalised in place unless frozen
# @param exact [Boolean] when false, "*" is appended to make a prefix query
# @return [String] the MATCH expression
def make_query(term, exact)
  # convert full-width katakana to hiragana
  # TODO: convert half-width katakana to hiragana
  if term.frozen?
    # tr! would raise FrozenError here, so normalise a copy instead
    term = term.tr('ァ-ン', 'ぁ-ん')
  else
    # kept in place: #search compares results against the normalised term
    term.tr!('ァ-ン', 'ぁ-ん')
  end

  # Inside an FTS5 string a double quote is escaped by doubling it; an
  # unescaped one would end the string early and break the query.
  if term.start_with?('seq:')
    "sequence_number : \"#{term[4..-1].gsub('"', '""')}\""
  else
    query = "{kanji kana senses} : \"#{term.gsub('"', '""')}\""
    query += "*" unless exact
    query
  end
end
82
-
83
# Returns the search results as an array of +Entry+.
# Entries whose kanji or kana contain the (kana-normalised) term exactly are
# listed first; otherwise the order returned by the database is kept.
# @param term [String] the search string
# @param exact [Boolean] when true, only exact kanji/kana matches are returned
# @param language [Symbol] the language to return results in (not used by this method)
# @return [Array(Entry)] the results of the search
# @raise [RuntimeError] if the index directory does not exist
def search(term, exact=false, language=LANGUAGE_DEFAULT)
  # File.exists? was removed in Ruby 3.2
  raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

  query = make_query(term, exact)

  # Compare against the same katakana->hiragana normalisation make_query
  # applies, without depending on make_query having mutated +term+.
  needle = term.tr('ァ-ン', 'ぁ-ん')

  results = []
  sql = "SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?"
  @index.execute(sql, query, JDict.config.num_results) do |row|
    entry = Entry.from_sql(row)
    is_exact_match = entry.kanji.include?(needle) || entry.kana.include?(needle)
    next if exact && !is_exact_match

    results << [is_exact_match ? 1.0 : 0.0, entry]
  end

  # sort_by is not stable, so the row position is used as a tie-breaker to
  # keep the database order among entries with the same score
  results.each_with_index
         .sort_by { |(score, _entry), position| [-score, position] }
         .map { |(_score, entry), _position| entry }
end
110
-
111
# Builds the full-text search index by streaming the dictionary XML and
# inserting one row per entry inside a single database transaction.
# @param overwrite [Boolean] force a build even if the index path already exists
#   (currently not consulted by this method)
# @param dictionary_path [String] path to the dictionary file
# @return [Integer] the number of indexed entries
# @raise [RuntimeError] if no dictionary path is known, the file is missing,
#   or an entry has no kana element
def build_index(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exists? was removed in Ruby 3.2
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)

  puts "Building index..."

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  # components of an entry
  entry_sequence_num, kanji, kana, senses = 0, [], [], []
  glosses = {}
  parts_of_speech = []

  entries_added = 0

  begin
    @index.transaction do |db_transaction|
      # read until the end
      while reader.read
        # check what type of node we're currently on
        case reader.node_type

        # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SEQUENCE
            entry_sequence_num = reader.next_text.to_i

          # TODO: Raise an exception if reader.next_text.empty? inside the when's
          # JMdict shouldn't have any empty elements, I believe.
          when JDict::JMDictConstants::Elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when JDict::JMDictConstants::Elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when JDict::JMDictConstants::Elements::GLOSS
            language = (reader.node.lang || LANGUAGE_DEFAULT).intern
            text = reader.next_text
            (glosses[language] ||= []) << text unless text.empty?

          when JDict::JMDictConstants::Elements::CROSSREFERENCE
            # the text is not stored; the call still advances the reader
            reader.next_text
          end

        # XML entity references are treated as a different node type
        # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

        # end-of-element node
        when XML::Reader::TYPE_END_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SENSE
            # build sense, then clear data for the next sense
            senses << Sense.new(parts_of_speech, glosses)
            glosses = {}
            parts_of_speech = []

          # we're at the end of the entry element, so index it
          when JDict::JMDictConstants::Elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql

            db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
              stmt.execute( insert_data )
            end

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1
          end
        end
      end
    end
  ensure
    # Done reading & indexing; also release the reader when indexing fails
    reader.close
  end

  entries_added
end
221
-
222
# Builds the index, refusing to do so when it already holds entries.
# The guard asks #built? rather than testing whether @index_path exists:
# @index_path is the directory containing the dictionary, so the previous
# existence test could not distinguish a populated index from an empty one.
# @return [Object] whatever #build_index returns
# @raise [RuntimeError] if the index is already built
def rebuild_index
  raise "Index already exists at path #{@index_path}" if built?

  build_index
end
226
-
227
# Creates an XML::Reader object for the given path
# @param dictionary_path [String] path to the dictionary file
# @return [XML::Reader] the reader for the given dictionary
# @raise [RuntimeError] if XML::Reader.file returns nil
def open_reader(dictionary_path)
  # The former Dir.chdir(Dir.pwd) wrapper changed to the directory the process
  # was already in, and File.join with one argument returned the path as-is;
  # both are dropped so this no longer conflicts with other chdir blocks.
  reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

  reader
end
240
-
241
# Creates the hash of part-of-speech symbols to full definitions from the
# <!ENTITY ...> declarations in the dictionary's DOCTYPE node.
# Reading stops at the DOCTYPE node, at the first element, or at the end of
# the document, whichever comes first.
# @return [Hash{Symbol => String}] abbreviation symbol => full description
def build_pos_hash
  pos_hash = {}
  reader = open_reader(@dictionary_path)
  begin
    # looping on read's result ends the scan when the document is exhausted
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # the doctype is scanned as a string; iterating over the node's
        # children segfaulted when that was attempted
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # past the prolog: there is no doctype left to find
        break
      end
    end
  ensure
    reader.close
  end
  pos_hash
end
269
-
270
# Converts a part-of-speech entity reference string into a symbol,
# replacing every hyphen with an underscore first.
# @param entity [String] the entity reference string
# @return [Symbol] the part-of-speech symbol
def pos_to_sym(entity)
  underscored = entity.tr('-', '_')
  underscored.to_sym
end
276
-
277
# Retrieves the definition of a part-of-speech from its abbreviation.
# The lookup table is built on first use and kept for later calls.
# @param pos [String] the abbreviation for the part-of-speech
# @return [String, nil] the full description, or nil when the abbreviation is unknown
def get_pos(pos)
  # store the result: the table was previously built and then thrown away
  @pos_hash = build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
  @pos_hash[pos_to_sym(pos)]
end
284
- end
285
-
286
# Add custom parsing methods to XML::Reader
class XML::Reader
  # Get the next text node.
  # Advances until the current node is a text node or #read reports that
  # nothing more can be read, then returns the current node's value.
  def next_text
    until node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Get the next entity node.
  # Advances until the current node is an entity or entity-reference node or
  # #read reports that nothing more can be read, then returns the current
  # node's value.
  def next_entity
    until node_type == XML::Reader::TYPE_ENTITY ||
          node_type == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
305
- end
data/lib/jdict.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'configuration'
2
- require 'dictionaries/jmdict'
3
-
4
# Top-level namespace; holds the configuration object shared by the library.
module JDict
  class << self
    # Only the writer is generated: the reader is defined below with lazy
    # initialisation, so generating it as well would just be redefined.
    attr_writer :config

    # @return [Configuration] the current configuration, created on first use
    def config
      @config ||= Configuration.new
    end

    # Replaces the current configuration with a fresh one.
    # @return [Configuration] the new configuration
    def reset
      @config = Configuration.new
    end

    # Yields the current configuration so the caller can modify it.
    # @yieldparam config [Configuration]
    def configure
      yield(config)
    end
  end
end
data/lib/kana.rb DELETED
@@ -1,4 +0,0 @@
1
module JDict
  # Empty class: no behaviour is defined for Kana in this file.
  class Kana; end
end
data/lib/kanji.rb DELETED
@@ -1,4 +0,0 @@
1
module JDict
  # Empty class: no behaviour is defined for Kanji in this file.
  class Kanji; end
end
data/lib/sense.rb DELETED
@@ -1,28 +0,0 @@
1
# The sense element will record the translational equivalent
# of the Japanese word, plus other related information. Where there
# are several distinctly different meanings of the word, multiple
# sense elements will be employed.
module JDict
  class Sense
    attr_reader :parts_of_speech, :glosses

    # Create a new +Sense+
    # @param parts_of_speech [Array<String>, nil] part-of-speech tags, if any
    # @param glosses [Array, Hash, nil] gloss texts, either as a (possibly
    #   nested) Array or as a Hash of language => texts
    def initialize(parts_of_speech, glosses)
      @parts_of_speech = parts_of_speech
      @glosses = glosses
    end

    # @return [String] "[pos,pos] " (when tags are present) followed by the
    #   gloss texts joined with "; "
    def to_s
      parts_of_speech_to_s(@parts_of_speech) + glosses_to_s(@glosses)
    end

    private

    # Hash has no #join, so a language => texts Hash is rendered from its
    # values; nil renders as an empty string instead of raising.
    def glosses_to_s(glosses)
      texts = glosses.respond_to?(:values) ? glosses.values : Array(glosses)
      texts.join('; ')
    end

    def parts_of_speech_to_s(parts_of_speech)
      parts_of_speech.nil? ? '' : '[' + parts_of_speech.join(',') + '] '
    end
  end
end