ruby-jdict 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/entry.rb DELETED
@@ -1,101 +0,0 @@
1
- #include Constants #XML constants from the dictionary file
2
-
3
# Entries consist of kanji elements, kana elements,
# general information and sense elements. Each entry must have at
# least one kana element and one sense element. Others are optional.
module JDict
  class Entry
    attr_accessor :sequence_number, :kanji, :kana, :senses

    KANA_RE = /^kana/
    SENSE_RE = /^sense/
    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
    # Same prefix as PART_OF_SPEECH_RE, but anchored to the start of the
    # string and also matching the empty "[[]] " prefix that #to_sql writes
    # for a sense whose part-of-speech list is empty.
    PART_OF_SPEECH_PREFIX_RE = /\A\[\[([^\]]*)\]\]\s+/

    MEANING_SENTINEL = '**'
    PART_OF_SPEECH_SENTINEL = '$$'
    SENSE_SENTINEL = '%%'
    LANGUAGE_SENTINEL = '&&'
    GLOSS_SENTINEL = '@@'

    # Create a new Entry
    # @param sequence_number [Integer] the entry's sequence number
    # @param kanji [Array<String>] kanji spellings
    # @param kana [Array<String>] kana readings
    # @param senses [Array<Sense>] the senses of the entry
    def initialize(sequence_number, kanji, kana, senses)
      @sequence_number, @kanji, @kana, @senses = sequence_number, kanji, kana, senses
    end

    # Converts an SQLite row from the index to the Entry format
    # @param row [#[]] row with "sequence_number", "kanji", "kana" and
    #   "senses" columns, as produced by #to_sql
    # @return [Entry] the deserialized entry
    def self.from_sql(row)
      sequence_number = row["sequence_number"].to_i
      kanji = split_list(row["kanji"])
      kana = split_list(row["kana"])
      # NOTE(review): the serialized senses are sorted as strings, so the
      # resulting order is alphabetical rather than the stored order. Kept
      # as in the original; confirm whether this is intended.
      senses = row["senses"].to_s.split(SENSE_SENTINEL).sort.map { |txt| sense_from_sql(txt) }
      new(sequence_number, kanji, kana, senses)
    end

    # Splits a ", "-separated column into UTF-8 strings (nil yields []).
    def self.split_list(column)
      column.to_s.split(", ").map { |item| item.force_encoding("UTF-8") }
    end
    private_class_method :split_list

    # Parses one serialized sense: an optional "[[pos$$pos]] " prefix followed
    # by "lang&&meaning**meaning" groups separated by GLOSS_SENTINEL.
    def self.sense_from_sql(txt)
      prefix = PART_OF_SPEECH_PREFIX_RE.match(txt)
      if prefix
        tags = prefix[1].split(PART_OF_SPEECH_SENTINEL)
        # an empty "[[]]" prefix means "no parts of speech"
        parts_of_speech = tags.empty? ? nil : tags
        gloss_text = prefix.post_match
      else
        # no prefix at all: the whole string is gloss data
        parts_of_speech = nil
        gloss_text = txt.dup
      end

      glosses = {}
      gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL).each do |str|
        lang, meaning_string = str.split(LANGUAGE_SENTINEL, 2)
        # skip malformed groups instead of crashing on nil
        next if lang.nil? || lang.empty? || meaning_string.nil?
        (glosses[lang.to_sym] ||= []) << meaning_string.split(MEANING_SENTINEL)
      end

      Sense.new(parts_of_speech, glosses_for_configured_language(glosses))
    end
    private_class_method :sense_from_sql

    # Picks the glosses for the configured language, falling back to English.
    # Gloss keys are always symbols, so the looked-up language is converted
    # to a symbol when possible. Returns nil when neither language is present.
    def self.glosses_for_configured_language(glosses)
      [JDict.config.language, JDict::JMDictConstants::Languages::ENGLISH].each do |language|
        key = language.respond_to?(:to_sym) ? language.to_sym : language
        return glosses[key] if glosses[key]
      end
      nil
    end
    private_class_method :glosses_for_configured_language

    # Converts an Entry to a string to be indexed into the SQLite database
    # @return [Hash{String => String}] bind parameters (":sequence_number",
    #   ":kanji", ":kana", ":senses") holding the serialized Entry
    def to_sql
      sense_strings = senses.map do |s|
        prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''

        # FIXME: it fails when retrieving entries from an existing index, because only one language is retrieved and the 'lang' field is nil
        prefix + s.glosses.map { |lang, texts| "#{lang}#{LANGUAGE_SENTINEL}#{texts.join(MEANING_SENTINEL)}" }.join(GLOSS_SENTINEL)
      end

      { ':sequence_number' => sequence_number.to_s,
        ':kanji' => kanji.join(", "),
        ':kana' => kana.join(", "),
        ':senses' => sense_strings.join(SENSE_SENTINEL) }
    end

    # Get an array of +Senses+ for the specified language
    # @param l [Symbol] the language key
    # @return [Array<Sense>] senses that carry that language
    def senses_by_language(l)
      senses.select do |s|
        if s.respond_to?(:language)
          s.language == l
        else
          # senses keyed by language hold their glosses in a Hash
          s.glosses.respond_to?(:key?) && s.glosses.key?(l)
        end
      end
    end

    # @return [String] "kanji (kana)" on the first line, numbered senses below
    def to_s
      "#{kanji_to_s} (#{kana_to_s})\n#{senses_to_s}\n"
    end

    # @return [String, nil] comma-separated kanji, or nil when there is none
    def kanji_to_s
      @kanji.join(', ') unless @kanji.nil?
    end

    # @return [String, nil] comma-separated kana, or nil when there is none
    def kana_to_s
      @kana.join(', ') unless @kana.nil?
    end

    # @param delimiter [String] separator placed between the numbered senses
    # @return [String] the senses numbered from 1
    def senses_to_s(delimiter = "\n")
      @senses.map.with_index(1) { |sense, i| "#{i}. #{sense}" }.join(delimiter)
    end
  end
end
data/lib/index.rb DELETED
@@ -1,305 +0,0 @@
1
- # encoding: utf-8
2
- require 'amalgalite'
3
- require 'libxml'
4
- require 'fileutils'
5
- require 'io/console'
6
-
7
- require_relative 'constants' #XML constants from the dictionary file
8
-
9
- require_relative 'entry' #dictionary elements
10
- require_relative 'kanji' #...
11
- require_relative 'kana' #...
12
- require_relative 'sense'
13
-
14
- include LibXML
15
-
16
- module JDict
17
# Full-text search index over a JMdict XML dictionary. Entries are stored in
# an SQLite FTS5 table (through Amalgalite) in a "fts5.db" file located in the
# same directory as the dictionary file.
class DictIndex

  LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
  NUM_ENTRIES_TO_INDEX = 50
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  SEARCH_SQL = "SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?"
  INSERT_SQL = "INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );"

  # @return [String] the dictionary path given to the constructor
  attr_reader :path

  # Initialize a full-text search index backend for JMdict
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if no file exists at +path+
  def initialize(path)
    @path = path
    @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)
    # part-of-speech abbreviations => full definitions; filled lazily by #get_pos
    @pos_hash = {}

    raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    db_file = File.join(@index_path, "fts5.db")

    # in debug mode always start from a fresh database
    File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

    @index = Amalgalite::Database.new(db_file)

    create_schema

    build_index unless built?
  end

  # Creates the SQL schema for the Amalgalite database
  def create_schema
    schema = @index.schema
    unless schema.tables['search']
      @index.execute_batch <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
      sequence_number,
      kanji,
      kana,
      senses
      );
      SQL
      @index.reload_schema!
    end
  end

  # @return [Boolean] whether the search table contains at least one row
  def built?
    @index.first_value_from( "SELECT count(*) from search" ) != 0
  end

  # Builds the FTS5 MATCH expression for a search term.
  # A term starting with "seq:" is looked up in the sequence_number column;
  # anything else is matched against kanji, kana and senses, as a prefix
  # query unless +exact+ is set. The caller's string is not modified.
  # @param term [String] the search string
  # @param exact [Boolean] when true, no trailing "*" prefix marker is added
  # @return [String] the FTS5 query
  def make_query(term, exact)
    term = katakana_to_hiragana(term)

    if term.start_with?('seq:')
      query = "sequence_number : \"#{escape_fts_string(term[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts_string(term)}\""
      query += "*" unless exact
    end

    query
  end

  # Returns the search results as an array of +Entry+
  # @param term [String] the search string
  # @param exact [Boolean] only return entries whose kanji or kana equal the term
  # @param language [Symbol] the language to return results in (currently unused)
  # @return [Array(Entry)] the results of the search, exact matches first
  def search(term, exact=false, language=LANGUAGE_DEFAULT)
    raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

    # exact-match detection compares against the converted term
    term = katakana_to_hiragana(term)
    query = make_query(term, exact)

    results = []

    @index.execute(SEARCH_SQL, query, JDict.config.num_results) do |row|
      entry = Entry.from_sql(row)

      is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)
      next if exact && !is_exact_match

      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # Sort the results by score (descending) and return only the entries
    results.sort_by { |score, _entry| -score }.map { |_score, entry| entry }
  end

  # Builds the full-text search index
  # @param overwrite [Boolean] currently unused
  # @param dictionary_path [String] path to the dictionary file
  # @return [Integer] the number of indexed entries
  # @raise [RuntimeError] if the dictionary is missing or an entry has no kana
  def build_index(overwrite=false, dictionary_path=nil)
    @dictionary_path = dictionary_path unless dictionary_path.nil?
    raise "No dictionary path was provided" if @dictionary_path.nil?
    raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    reader = open_reader(@dictionary_path)

    puts "Building index..."

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    # components of an entry
    entry_sequence_num, kanji, kana, senses = 0, [], [], []
    glosses = {}
    parts_of_speech = []

    entries_added = 0

    begin
      @index.transaction do |db_transaction|

        # read until the end
        while reader.read

          # check what type of node we're currently on
          case reader.node_type

          # start-of-element node
          when XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              # the language must be read before next_text advances the reader
              language = (reader.node.lang || LANGUAGE_DEFAULT).intern
              text = reader.next_text
              (glosses[language] ||= []) << text unless text.empty?

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              # advance past the cross-reference text; its value is not indexed
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          # end-of-element node
          when XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              # build sense
              senses << Sense.new(parts_of_speech, glosses)

              # clear data for the next sense
              glosses = {}
              parts_of_speech = []

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql

              db_transaction.prepare(INSERT_SQL) do |stmt|
                stmt.execute( insert_data )
              end

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1
            end
          end
        end
      end
    ensure
      # Done reading & indexing (also release the reader when indexing fails)
      reader.close
    end

    entries_added
  end

  # NOTE(review): the guard tests @index_path, which is the directory that
  # contains the dictionary, so it looks like this raises whenever the
  # dictionary exists. Behaviour kept as is; confirm the intended semantics.
  def rebuild_index
    raise "Index already exists at path #{@index_path}" if File.exist?(@index_path)
    build_index
  end

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    jmdict_path = File.join(dictionary_path)
    reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
    reader
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary.
  # Only the document type declaration is inspected; reading stops at the
  # doctype node, at the first element, or at the end of the document.
  # @return [Hash{Symbol => String}] abbreviation symbol => full definition
  def build_pos_hash
    pos_hash = {}
    reader = open_reader(@dictionary_path)
    begin
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # iterating over the doctype node's children segfaults, so the
          # entity declarations are extracted from its string form instead
          reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
            pos_hash[pos_to_sym(abbrev)] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          break
        end
      end
    ensure
      reader.close
    end
    pos_hash
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end

  # Retrieves the definition of a part-of-speech from its abbreviation
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech, or nil if unknown
  def get_pos(pos)
    @pos_hash = build_pos_hash if @pos_hash.empty?
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # convert full-width katakana to hiragana (returns a new string)
  # TODO: convert half-width katakana to hiragana
  def katakana_to_hiragana(term)
    term.tr('ァ-ン','ぁ-ん')
  end

  # Doubles embedded double quotes so the text stays inside one FTS5 string
  def escape_fts_string(text)
    text.gsub('"', '""')
  end
end
285
-
286
# Custom parsing helpers added to XML::Reader
class XML::Reader
  # Get the next text node.
  # Stays on the current node if it already is a text node; otherwise reads
  # forward until a text node is reached or #read reports nothing more to
  # read. Returns #value of the node the reader ends up on.
  def next_text
    until node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Get the next entity node.
  # Reads forward until the reader is on an entity or entity-reference node,
  # or until #read reports nothing more to read. Returns #value of the node
  # the reader ends up on.
  def next_entity
    loop do
      current = node_type
      break if current == XML::Reader::TYPE_ENTITY
      break if current == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
305
- end
data/lib/jdict.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'configuration'
2
- require 'dictionaries/jmdict'
3
-
4
# Top-level namespace of the gem; holds the module-level configuration object.
module JDict
  class << self
    # Replaces the configuration object wholesale.
    # Only the writer is generated: the reader is defined below with lazy
    # initialization, and generating both would redefine the method.
    attr_writer :config

    # @return [Configuration] the current configuration, created on first use
    def config
      @config ||= Configuration.new
    end

    # Discards the current configuration and installs a fresh one.
    # @return [Configuration] the new configuration
    def reset
      @config = Configuration.new
    end

    # Yields the current configuration so it can be modified in a block.
    # @yieldparam config [Configuration] the current configuration
    def configure
      yield(config)
    end
  end
end
data/lib/kana.rb DELETED
@@ -1,4 +0,0 @@
1
module JDict
  # Placeholder class for kana elements; it defines no behaviour of its own.
  class Kana; end
end
data/lib/kanji.rb DELETED
@@ -1,4 +0,0 @@
1
module JDict
  # Placeholder class for kanji elements; it defines no behaviour of its own.
  class Kanji; end
end
data/lib/sense.rb DELETED
@@ -1,28 +0,0 @@
1
# The sense element will record the translational equivalent
# of the Japanese word, plus other related information. Where there
# are several distinctly different meanings of the word, multiple
# sense elements will be employed.
module JDict
  class Sense
    attr_reader :parts_of_speech, :glosses

    # Create a new +Sense+
    # @param parts_of_speech [Array<String>, nil] part-of-speech tags
    # @param glosses [Array, Hash, nil] the gloss texts, either as a
    #   (possibly nested) list or as a hash of language => list of texts
    def initialize(parts_of_speech, glosses)
      @parts_of_speech, @glosses = parts_of_speech, glosses
    end

    # @return [String] "[pos,pos] gloss; gloss"; the bracketed prefix is
    #   omitted when there are no parts of speech
    def to_s
      parts_of_speech_to_s(@parts_of_speech) + glosses_to_s(@glosses)
    end

    private

    # Joins all gloss texts with "; ". A hash contributes the texts of every
    # language in insertion order; nil yields an empty string.
    def glosses_to_s(glosses)
      texts = glosses.respond_to?(:values) ? glosses.values : Array(glosses)
      texts.flatten.join('; ')
    end

    # Renders the tags as "[a,b] ", or "" when the list is nil or empty.
    def parts_of_speech_to_s(parts_of_speech)
      return '' if parts_of_speech.nil? || parts_of_speech.empty?

      "[#{parts_of_speech.join(',')}] "
    end
  end
end