ruby-jdict 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ # encoding: utf-8
2
+ require 'amalgalite'
3
+ require 'fileutils'
4
+ require 'io/console'
5
+
6
+ module JDict
7
# Full-text search index backend for a JMdict dictionary file.
#
# The index is an SQLite database named "jdict.db", opened through
# Amalgalite in the same directory as the dictionary file. It is populated
# on construction when its +search+ table is still empty.
class DictIndex
  # Matches one DTD declaration of the form <!ENTITY name "expansion">;
  # capture 1 is the entity name, capture 2 is the expansion text.
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  # @return [String] path to the dictionary file given to the constructor
  attr_reader :path

  # Initialize a full-text search index backend for JMdict
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if no file exists at +path+
  def initialize(path)
    # @path backs the public +path+ reader.
    @path = @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)

    raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    @db_file = File.join(@index_path, "jdict.db")
    # Opens the database, creates the schema and resets @pos_hash to nil.
    initialize_db(@db_file)

    build_index!
  end

  # @return [Boolean] true when the +search+ table holds at least one row
  def built?
    @index.first_value_from( "SELECT count(*) from search" ) != 0
  end

  # Closes the database, removes the database file from disk and opens a
  # fresh, empty database with the schema recreated.
  def delete!
    @index.close
    @index = nil

    File.unlink(@db_file) if File.exist?(@db_file)

    initialize_db(@db_file)
  end

  # Builds the full-text search index unless it is already populated, then
  # makes sure the part-of-speech table has been loaded.
  # A given block is forwarded to the indexer's +index+ method.
  # @return [Integer, nil] the number of indexed entries, or nil when the
  #   index was already built and nothing was added
  def build_index!(&block)
    entries_added = do_build_index(&block) unless built?

    # make the hash from abbreviated parts of speech to full definitions
    @pos_hash ||= build_pos_hash

    entries_added
  end

  # Returns the search results as an array of +Entry+
  # @param term [String] the search string; a "seq:NNN" term searches the
  #   sequence_number column instead of the text columns
  # @param opts [Hash] search options
  # @option opts [Boolean] :exact only return entries whose kanji or kana
  #   list contains +term+ verbatim
  # @option opts [Integer] :max_results maximum number of rows fetched from
  #   the index; unlimited when omitted
  # @return [Array(Entry)] the results of the search, exact matches first
  # @raise [RuntimeError] if the index directory no longer exists
  def search(term, opts = {})
    raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

    exact = opts[:exact]
    # SQLite rejects a NULL LIMIT, while a negative LIMIT means "no upper
    # bound", so fall back to -1 when the caller gives no maximum.
    limit = opts[:max_results] || -1
    query = make_query(term, exact)
    results = []

    @index.execute("SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?", query, limit) do |row|
      entry = Entry.from_sql(row)
      is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)

      # In exact mode, rows that only matched by prefix or gloss are dropped.
      next if exact && !is_exact_match

      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # Highest score first. sort_by is not stable, so the original position
    # is used as a tie-breaker to keep the database order among equal scores.
    results.each_with_index
           .sort_by { |(score, _entry), position| [-score, position] }
           .map { |(_score, entry), _position| entry }
  end

  # Retrieves the definition of a part-of-speech from its abbreviation
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech, or
  #   nil when the abbreviation is unknown
  def get_pos(pos)
    # @pos_hash is nil right after delete!, and may be empty if the last
    # parse found nothing; in both cases parse again and keep the result.
    @pos_hash = build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # Opens (or creates) the database file, clears the cached part-of-speech
  # hash and ensures the schema exists.
  def initialize_db(db_file)
    @index = Amalgalite::Database.new(db_file)
    @pos_hash = nil

    create_schema
  end

  # Creates the SQL schema for the Amalgalite database
  def create_schema
    schema = @index.schema
    return if schema.tables['search']

    @index.execute_batch <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
        sequence_number,
        kanji,
        kana,
        senses
      );
    SQL
    @index.reload_schema!
  end

  # Builds the FTS5 MATCH expression for a search term.
  # @param term [String] the search string ("seq:NNN" targets sequence_number)
  # @param exact [Boolean] when falsy, a trailing "*" makes it a prefix query
  # @return [String] the MATCH expression
  def make_query(term, exact)
    # convert full-width katakana to hiragana
    # TODO: move to user code
    # term = Convert.kata_to_hira(term)

    if term.start_with?('seq:')
      "sequence_number : \"#{escape_fts_string(term[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts_string(term)}\""
      exact ? query : "#{query}*"
    end
  end

  # Escapes text for use inside a double-quoted FTS5 string, where an
  # embedded double quote is written as two double quotes.
  def escape_fts_string(text)
    text.gsub('"', '""')
  end

  # Runs the dictionary indexer inside a single database transaction.
  # @return [Integer] the number of entries the indexer reports as added
  def do_build_index(&block)
    indexer = NokogiriDictionaryIndexer.new @dictionary_path
    entries_added = 0

    @index.transaction do |db_transaction|
      entries_added = indexer.index(db_transaction, &block)
    end

    entries_added
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary
  def build_pos_hash
    indexer = NokogiriDictionaryIndexer.new @dictionary_path
    indexer.parse_parts_of_speech
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol, with "-" replaced by "_"
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end
end
151
+ end
@@ -0,0 +1,28 @@
1
+ require 'amalgalite'
2
+
3
+ module JDict
4
# Base class for dictionary indexers. Subclasses override +index+ and
# +parse_parts_of_speech+; this class only validates the dictionary path
# and provides the shared row-insertion helper.
class DictionaryIndexer
  attr_reader :parts_of_speech

  # @param path [String] path to the dictionary file
  # @raise [RuntimeError] if +path+ is nil or no file exists there
  def initialize(path)
    raise "No dictionary path was provided" if path.nil?
    # Interpolate the argument: @path has not been assigned at this point.
    raise "Dictionary not found at path #{path}" unless File.exist?(path)

    @path = path
  end

  # Indexes the dictionary into the database. No-op here; subclasses
  # provide the implementation.
  # @param db_transaction the open database transaction to insert into
  def index(db_transaction, &block)
  end

  # Parses the part-of-speech definitions. No-op here; subclasses provide
  # the implementation.
  def parse_parts_of_speech
  end

  protected

  # Inserts one entry into the +search+ table.
  # @param db_transaction the transaction used to prepare the statement
  # @param entry an object whose +to_sql+ returns the bind values for
  #   :sequence_number, :kanji, :kana and :senses
  def add_entry(db_transaction, entry)
    db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
      stmt.execute(entry.to_sql)
    end
  end
end
28
+ end
@@ -0,0 +1,164 @@
1
+ require 'libxml'
2
+ include LibXML
3
+
4
+ module JDict
5
+
6
# Dictionary indexer that streams the JMdict file with LibXML's pull
# parser (XML::Reader) instead of loading the whole document.
# The constructor is inherited unchanged from DictionaryIndexer.
class LibXMLDictionaryIndexer < DictionaryIndexer
  # Reads the dictionary node by node and inserts one row per entry.
  # @param db_transaction the open database transaction to insert into
  # @yield [entries_added, total] called after every node read; +total+ is
  #   always 0 because the entry count is not known while streaming
  # @return [Integer] the number of entries added
  # @raise [RuntimeError] if an entry has no kana element
  def index(db_transaction, &block)
    reader = open_reader(@path)

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    entry_sequence_num, kanji, kana, senses = 0, [], [], []
    language = nil
    # Glosses are collected with <<, so this must be an Array.
    glosses = []
    parts_of_speech = []

    entries_added = 0

    while reader.read
      yield entries_added, 0 if block_given?

      case reader.node_type

      # start-of-element node
      when XML::Reader::TYPE_ELEMENT
        case reader.name
        when JDict::JMDictConstants::Elements::SEQUENCE
          entry_sequence_num = reader.next_text.to_i

        # TODO: Raise an exception if reader.next_text.empty? inside the when's
        # JMdict shouldn't have any empty elements, I believe.
        when JDict::JMDictConstants::Elements::KANJI
          text = reader.next_text
          kanji << text unless text.empty?

        when JDict::JMDictConstants::Elements::KANA
          text = reader.next_text
          kana << text unless text.empty?

        when JDict::JMDictConstants::Elements::GLOSS
          # Assume the language of the whole sense is the language
          # of the first gloss (in practice, there is never a gloss
          # with more than one language)
          unless language
            language = reader.node.lang || JMDictConstants::LANGUAGE_DEFAULT
            language = language.intern
          end
          text = reader.next_text
          glosses << text unless text.empty?

        when JDict::JMDictConstants::Elements::CROSSREFERENCE
          # The text is not stored; reading it only advances the reader.
          reader.next_text
        end

      # XML entity references are treated as a different node type
      # the parent node of the entity reference itself has the actual tag name
      when XML::Reader::TYPE_ENTITY_REFERENCE
        if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
          text = reader.name
          parts_of_speech << text unless text.empty?
        end

      when XML::Reader::TYPE_END_ELEMENT
        case reader.name

        when JDict::JMDictConstants::Elements::SENSE
          senses << Sense.new(parts_of_speech, glosses, language)

          # clear data for the next sense
          glosses = []
          parts_of_speech = []
          language = nil

        # we're at the end of the entry element, so index it
        when JDict::JMDictConstants::Elements::ENTRY
          raise "No kana found for this entry!" if kana.empty?

          entry = Entry.new(entry_sequence_num, kanji, kana, senses)
          add_entry(db_transaction, entry)

          # clear data for the next entry
          kanji, kana, senses = [], [], []

          entries_added += 1
        end
      end
    end

    entries_added
  ensure
    # Release the reader even when parsing or insertion raised.
    reader.close if reader
  end

  # Builds the part-of-speech table from the <!ENTITY ...> declarations in
  # the dictionary's document type definition.
  # @return [Hash{Symbol => String}] entity name (with "-" replaced by "_")
  #   mapped to its expansion text; empty if no doctype node is found
  #   before the first element
  def parse_parts_of_speech
    pos_hash = {}
    reader = open_reader(@path)

    begin
      # Stop at end of input as well, so a file without a doctype or
      # element cannot loop forever.
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # The original author noted a segfault when iterating the doctype
          # node's children, so the serialized doctype is scanned instead.
          doctype_string = reader.node.to_s
          doctype_string.scan(JDict::DictIndex::ENTITY_REGEX) do |abbrev, full|
            pos_hash[abbrev.gsub('-', '_').to_sym] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          # The document body has started; there is no doctype to read.
          break
        end
      end
    ensure
      reader.close
    end

    printf "\n"

    pos_hash
  end

  private

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

    reader
  end
end
147
+
148
+ # Add custom parsing methods to XML::Reader
149
class XML::Reader
  public

  # Advances the reader until it is positioned on a text node (or the
  # input is exhausted) and returns the current node's value.
  # Does not advance if the reader is already on a text node.
  def next_text
    until node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Advances the reader until it is positioned on an entity or entity
  # reference node (or the input is exhausted) and returns the current
  # node's value.
  def next_entity
    entity_types = [XML::Reader::TYPE_ENTITY, XML::Reader::TYPE_ENTITY_REFERENCE]
    until entity_types.include?(node_type)
      break unless read
    end
    value
  end
end
164
+ end
@@ -0,0 +1,60 @@
1
+ require 'nokogiri'
2
+
3
+ module JDict
4
# Dictionary indexer that parses the whole JMdict file into a Nokogiri
# document up front and then walks its entry nodes.
class NokogiriDictionaryIndexer < JDict::DictionaryIndexer
  # @param path [String] path to the dictionary file
  def initialize(path)
    super

    # Parse in strict mode; the file handle is closed when the block returns.
    @doc = File.open(path) { |file| Nokogiri::XML(file) { |config| config.strict } }
  end

  # Inserts one row per entry node found under ./JMdict/entry.
  # @param db_transaction the open database transaction to insert into
  # @yield [entries_added, total] called before each entry is processed
  # @return [Integer] the number of entries added
  def index(db_transaction, &block)
    entry_nodes = @doc.search("./JMdict/entry")
    total = entry_nodes.size
    entries_added = 0

    entry_nodes.each do |node|
      yield entries_added, total if block_given?

      sequence_number = node.at(JDict::JMDictConstants::Elements::SEQUENCE).content.to_i
      kanji = node.search(JDict::JMDictConstants::Elements::KANJI).map(&:content)
      kana = node.search(JDict::JMDictConstants::Elements::KANA).map(&:content)
      senses = node.search(JDict::JMDictConstants::Elements::SENSE).map { |sense_node| extract_sense(sense_node) }

      add_entry(db_transaction, Entry.new(sequence_number, kanji, kana, senses))
      entries_added += 1
    end

    printf "\n"

    entries_added
  end

  # @return [Hash] always an empty hash; this indexer does not parse the
  #   part-of-speech declarations
  def parse_parts_of_speech
    {}
  end

  private

  # Builds a Sense from one sense node.
  # @param e the sense node
  # @return [Sense] the parts of speech, glosses and language of the sense
  def extract_sense(e)
    parts_of_speech = e.search(JDict::JMDictConstants::Elements::PART_OF_SPEECH).map(&:inner_html)
    glosses = e.search(JDict::JMDictConstants::Elements::GLOSS).map(&:content)

    # Assume the language of the whole sense is the language
    # of the first gloss (in practice, there is never a gloss
    # with more than one language in the official JMDict)
    first_gloss = e.at(JDict::JMDictConstants::Elements::GLOSS)

    # Fall back to "en" when there is no gloss or it has no xml:lang.
    language = (first_gloss && first_gloss.attr("xml:lang")) || "en"

    Sense.new(parts_of_speech, glosses, language)
  end
end
60
+ end
@@ -0,0 +1,2 @@
1
+ module JDict
2
+ end
@@ -0,0 +1,64 @@
1
+ #include Constants #XML constants from the dictionary file
2
+
3
+ # Entries consist of kanji elements, kana elements,
4
+ # general information and sense elements. Each entry must have at
5
+ # least one kana element and one sense element. Others are optional.
6
+ module JDict
7
# One dictionary entry: a sequence number plus its kanji readings, kana
# readings and senses.
class Entry
  attr_accessor :sequence_number, :kanji, :kana, :senses

  # Create a new Entry
  # @param sequence_number [Integer] the entry's sequence number
  # @param kanji [Array<String>] kanji forms
  # @param kana [Array<String>] kana forms
  # @param senses [Array] the entry's senses
  def initialize(sequence_number, kanji, kana, senses)
    @sequence_number = sequence_number
    @kanji = kanji
    @kana = kana
    @senses = senses
  end

  # Converts an SQLite row from the index to the Entry format
  # @param row a row responding to [] with the column names
  #   "sequence_number", "kanji", "kana" and "senses"
  # @return [Entry] the deserialized entry
  def self.from_sql(row)
    number = row["sequence_number"].to_i
    kanji_forms = row["kanji"].split(", ").map { |form| form.force_encoding("UTF-8") }
    kana_forms = row["kana"].split(", ").map { |form| form.force_encoding("UTF-8") }

    # The serialized senses are sorted as strings before being parsed.
    sense_texts = row["senses"].split(SerialConstants::SENSE_SENTINEL).sort
    sense_list = sense_texts.map { |text| Sense.from_sql(text) }

    new(number, kanji_forms, kana_forms, sense_list)
  end

  # Converts an Entry to the bind values to be indexed into the SQLite database
  # @return [Hash{String => String}] the serialized columns for this Entry,
  #   keyed by bind-parameter name
  def to_sql
    {
      ':sequence_number' => sequence_number.to_s,
      ':kanji' => kanji.join(", "),
      ':kana' => kana.join(", "),
      ':senses' => senses.map(&:to_sql).join(SerialConstants::SENSE_SENTINEL)
    }
  end

  # Get an array of +Senses+ for the specified language
  def senses_by_language(l)
    senses.select { |sense| sense.language == l }
  end

  # @return [String] the kanji and kana on one line, followed by the
  #   numbered senses, each part ending in a newline
  def to_s
    "#{kanji_to_s}#{kana_to_s}\n#{senses_to_s}\n"
  end

  # @return [String] the kanji forms joined with ", "
  def kanji_to_s
    @kanji.join(', ')
  end

  # @return [String, nil] the kana forms in parentheses with a leading
  #   space, or nil when there is no kana list
  def kana_to_s
    return if @kana.nil?

    " (#{@kana.join(', ')})"
  end

  # @param delimiter [String] separator placed between the numbered senses
  # @return [String] the senses numbered from 1
  def senses_to_s(delimiter = "\n")
    @senses.each_with_index.map { |sense, idx| "#{idx + 1}. #{sense}" }.join(delimiter)
  end
end
64
+ end