ruby-jdict 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,151 @@
1
+ # encoding: utf-8
2
+ require 'amalgalite'
3
+ require 'fileutils'
4
+ require 'io/console'
5
+
6
+ module JDict
7
class DictIndex
  # Matches `<!ENTITY name "definition">` declarations; capture 1 is the
  # entity name, capture 2 is the quoted definition text.
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  # @return [String] the dictionary path this index was created with
  attr_reader :path

  # Initialize a full-text search index backend for JMdict.
  # The database file "jdict.db" is placed in the same directory as the
  # dictionary, and the index is built immediately if it is still empty.
  # @param path [String] path to the dictionary
  # @raise [RuntimeError] if no file exists at +path+
  def initialize(path)
    # Keep @path in sync so the public +path+ reader returns the real value.
    @path = @dictionary_path = path
    @index_path = File.dirname(@dictionary_path)

    raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

    @db_file = File.join(@index_path, "jdict.db")
    initialize_db(@db_file)

    build_index!
  end

  # @return [Boolean] true when the search table contains at least one row
  def built?
    @index.first_value_from( "SELECT count(*) from search" ) != 0
  end

  # Closes the database, removes the database file from disk (if present)
  # and re-creates an empty database with the schema in place.
  def delete!
    @index.close
    @index = nil

    File.unlink(@db_file) if File.exist?(@db_file)

    initialize_db(@db_file)
  end

  # Builds the full-text search index unless it is already built.
  # @yield forwarded to the indexer's +index+ method
  # @return [Integer, nil] the number of indexed entries, or nil when the
  #   index was already built and nothing was done
  def build_index!(&block)
    entries_added = do_build_index(&block) unless built?

    # make the hash from abbreviated parts of speech to full definitions
    @pos_hash ||= build_pos_hash

    entries_added
  end

  # Returns the search results as an array of +Entry+.
  # Entries whose kanji or kana contain +term+ verbatim are listed first;
  # within each group the order in which the database returned the rows is
  # preserved.
  # @param term [String] the search string
  # @param opts [Hash] search options
  # @option opts [Boolean] :exact only return entries whose kanji or kana
  #   contain +term+ verbatim
  # @option opts [Integer] :max_results maximum number of rows fetched from
  #   the database (no limit when omitted)
  # @return [Array(Entry)] the results of the search
  # @raise [RuntimeError] if the index directory does not exist
  def search(term, opts = {})
    raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

    exact = opts[:exact]
    # SQLite rejects a NULL LIMIT; a negative LIMIT means "no upper bound".
    limit = opts[:max_results] || -1
    query = make_query(term, exact)
    sql = "SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?"

    results = []
    @index.execute(sql, query, limit) do |row|
      entry = Entry.from_sql(row)
      is_exact_match = entry.kanji.include?(term) || entry.kana.include?(term)
      next if exact && !is_exact_match

      results << [is_exact_match ? 1.0 : 0.0, entry]
    end

    # sort_by alone is not stable, so the row position is used as a
    # tie-breaker to keep the database order within equal scores.
    results.each_with_index
           .sort_by { |(score, _entry), position| [-score, position] }
           .map { |(_score, entry), _position| entry }
  end

  # Retrieves the definition of a part-of-speech from its abbreviation.
  # The lookup table is (re)built on demand when it is missing or empty.
  # @param pos [String] the abbreviation for the part-of-speech
  # @return [String, nil] the full description of the part-of-speech, or
  #   nil when the abbreviation is unknown
  def get_pos(pos)
    # @pos_hash is reset to nil by initialize_db (e.g. after delete!), and
    # the freshly built table must be stored, not discarded.
    @pos_hash = build_pos_hash || {} if @pos_hash.nil? || @pos_hash.empty?
    @pos_hash[pos_to_sym(pos)]
  end

  private

  # Opens (or creates) the database at +db_file+, resets the cached
  # part-of-speech table and makes sure the schema exists.
  def initialize_db(db_file)
    @index = Amalgalite::Database.new(db_file)
    @pos_hash = nil

    create_schema
  end

  # Creates the SQL schema for the Amalgalite database
  # (a single FTS5 virtual table named "search") unless it already exists.
  def create_schema
    schema = @index.schema
    unless schema.tables['search']
      @index.execute_batch <<-SQL
        CREATE VIRTUAL TABLE search USING fts5(
          sequence_number,
          kanji,
          kana,
          senses
        );
      SQL
      @index.reload_schema!
    end
  end

  # Builds the FTS5 MATCH expression for a search term.
  # A term starting with "seq:" is looked up in the sequence_number column;
  # anything else is matched against kanji, kana and senses, as a prefix
  # query unless +exact+ is set.
  # @param term [String] the search string
  # @param exact [Boolean] when truthy, no trailing "*" is appended
  # @return [String] the MATCH expression
  def make_query(term, exact)
    # convert full-width katakana to hiragana
    # TODO: move to user code
    # term = Convert.kata_to_hira(term)

    if term.start_with?('seq:')
      "sequence_number : \"#{escape_fts_string(term[4..-1])}\""
    else
      query = "{kanji kana senses} : \"#{escape_fts_string(term)}\""
      query += "*" unless exact
      query
    end
  end

  # Escapes a value for use inside a double-quoted FTS5 string by doubling
  # embedded double quotes, so the term cannot terminate the string early.
  def escape_fts_string(text)
    text.gsub('"', '""')
  end

  # Runs the dictionary indexer inside a single database transaction.
  # @return [Integer] the number of entries reported by the indexer
  def do_build_index(&block)
    indexer = NokogiriDictionaryIndexer.new @dictionary_path
    entries_added = 0

    @index.transaction do |db_transaction|
      entries_added = indexer.index(db_transaction, &block)
    end

    entries_added
  end

  # Creates the hash of part-of-speech symbols to full definitions from the dictionary
  def build_pos_hash
    indexer = NokogiriDictionaryIndexer.new @dictionary_path
    indexer.parse_parts_of_speech
  end

  # Converts a part-of-speech entity reference string into a symbol
  # @param entity [String] the entity reference string
  # @return [Symbol] the part-of-speech symbol (dashes become underscores)
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end
end
151
+ end
@@ -0,0 +1,28 @@
1
+ require 'amalgalite'
2
+
3
+ module JDict
4
# Base class for dictionary indexers. It validates the dictionary path and
# provides the shared INSERT used to add an entry to the search table;
# +index+ and +parse_parts_of_speech+ are empty here and are overridden by
# the concrete indexers.
class DictionaryIndexer
  # NOTE(review): nothing in this class assigns @parts_of_speech, so this
  # reader returns nil unless a subclass sets it.
  attr_reader :parts_of_speech

  # @param path [String] path to the dictionary file
  # @raise [RuntimeError] if +path+ is nil or no file exists there
  def initialize(path)
    raise "No dictionary path was provided" if path.nil?
    # Interpolate the argument: @path is not assigned yet at this point.
    raise "Dictionary not found at path #{path}" unless File.exist?(path)

    @path = path
  end

  # Indexes the dictionary into the given transaction.
  # Empty in the base class (returns nil).
  # @param db_transaction the open database transaction to insert into
  def index(db_transaction, &block)
  end

  # Parses the part-of-speech definitions of the dictionary.
  # Empty in the base class (returns nil).
  def parse_parts_of_speech
  end

  protected

  # Inserts one entry into the "search" table.
  # @param db_transaction the open database transaction; must respond to
  #   +prepare+ and yield a statement responding to +execute+
  # @param entry an object whose +to_sql+ returns the named bind values
  #   (:sequence_number, :kanji, :kana, :senses)
  def add_entry(db_transaction, entry)
    db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
      stmt.execute(entry.to_sql)
    end
  end
end
28
+ end
@@ -0,0 +1,164 @@
1
+ require 'libxml'
2
+ include LibXML
3
+
4
+ module JDict
5
+
6
# Dictionary indexer that streams the JMdict file with LibXML's XML::Reader.
class LibXMLDictionaryIndexer < DictionaryIndexer
  # Matches `<!ENTITY name "definition">` declarations in the serialized
  # DOCTYPE; capture 1 is the entity name, capture 2 the definition.
  # Declared here because constants of other classes are not in scope.
  ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/

  def initialize(path)
    super
  end

  # Reads the dictionary node by node and adds one row per entry element.
  # @param db_transaction the open database transaction to insert into
  # @yield [entries_added, total] called after every node that is read;
  #   the total is not known while streaming and is always passed as 0
  # @return [Integer] the number of entries added
  # @raise [RuntimeError] if an entry has no kana element
  def index(db_transaction, &block)
    reader = open_reader(@path)

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    elements = JDict::JMDictConstants::Elements

    entry_sequence_num = 0
    kanji, kana, senses = [], [], []
    language = nil
    # glosses are appended with <<, so this must be an Array
    glosses = []
    parts_of_speech = []

    entries_added = 0

    begin
      while reader.read
        yield entries_added, 0 if block_given?

        case reader.node_type

        # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when elements::SEQUENCE
            entry_sequence_num = reader.next_text.to_i

          # TODO: Raise an exception if reader.next_text.empty? inside the when's
          # JMdict shouldn't have any empty elements, I believe.
          when elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when elements::GLOSS
            # Assume the language of the whole sense is the language
            # of the first gloss (in practice, there is never a gloss
            # with more than one language)
            unless language
              language = reader.node.lang || JMDictConstants::LANGUAGE_DEFAULT
              language = language.intern
            end
            text = reader.next_text
            glosses << text unless text.empty?

          when elements::CROSSREFERENCE
            # the text is not stored, but reading it advances the reader
            reader.next_text
          end

        # XML entity references are treated as a different node type
        # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

        when XML::Reader::TYPE_END_ELEMENT
          case reader.name

          when elements::SENSE
            senses << Sense.new(parts_of_speech, glosses, language)

            # clear data for the next sense
            glosses = []
            parts_of_speech = []
            language = nil

          # we're at the end of the entry element, so index it
          when elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            entry = Entry.new(entry_sequence_num, kanji, kana, senses)
            add_entry(db_transaction, entry)

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1
          end
        end
      end
    ensure
      # release the reader even when parsing or inserting fails
      reader.close
    end

    entries_added
  end

  # Builds the table of part-of-speech abbreviations from the ENTITY
  # declarations of the dictionary's DOCTYPE. Reading stops at the DOCTYPE
  # node, at the first element, or at the end of the input.
  # @return [Hash{Symbol => String}] abbreviation symbol => full definition
  def parse_parts_of_speech
    pos_hash = {}
    reader = open_reader(@path)

    begin
      # stop at end of input instead of looping forever
      while reader.read
        case reader.node_type
        when XML::Reader::TYPE_DOCUMENT_TYPE
          # The DOCTYPE is scanned as text; per the original author's note,
          # iterating over the node's children segfaults.
          reader.node.to_s.scan(ENTITY_REGEX) do |abbrev, full|
            pos_hash[pos_to_sym(abbrev)] = full
          end
          break
        when XML::Reader::TYPE_ELEMENT
          break
        end
      end
    ensure
      reader.close
    end

    printf "\n"

    pos_hash
  end

  private

  # Creates an XML::Reader object for the given path
  # @param dictionary_path [String] path to the dictionary file
  # @return [XML::Reader] the reader for the given dictionary
  # @raise [RuntimeError] if no reader could be created
  def open_reader(dictionary_path)
    reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
    raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

    reader
  end

  # Converts a part-of-speech entity name into a symbol
  # @param entity [String] the entity name
  # @return [Symbol] the name with dashes replaced by underscores
  def pos_to_sym(entity)
    entity.tr('-', '_').to_sym
  end
end
147
+
148
# Reopens XML::Reader to add helpers that advance the reader to the next
# text or entity node.
class XML::Reader
  public

  # Advances the reader until it sits on a text node or reading fails.
  # The current node is checked first, so no read happens when the reader
  # is already on a text node.
  # @return the +value+ of the node the reader stopped on
  def next_text
    loop do
      break if node_type == XML::Reader::TYPE_TEXT
      break unless read
    end
    value
  end

  # Advances the reader until it sits on an entity or entity-reference
  # node, or reading fails. The current node is checked first.
  # @return the +value+ of the node the reader stopped on
  def next_entity
    loop do
      break if node_type == XML::Reader::TYPE_ENTITY
      break if node_type == XML::Reader::TYPE_ENTITY_REFERENCE
      break unless read
    end
    value
  end
end
164
+ end
@@ -0,0 +1,60 @@
1
+ require 'nokogiri'
2
+
3
+ module JDict
4
# Dictionary indexer that loads the whole JMdict file into a Nokogiri
# document and walks its entry elements.
class NokogiriDictionaryIndexer < JDict::DictionaryIndexer
  # Validates the path (via the superclass) and parses the file in strict mode.
  # @param path [String] path to the dictionary file
  def initialize(path)
    super

    @doc = File.open(path) do |file|
      Nokogiri::XML(file) { |config| config.strict }
    end
  end

  # Adds one row to the search table for every entry element.
  # @param db_transaction the open database transaction to insert into
  # @yield [entries_added, total] called before each entry is processed
  # @return [Integer] the number of entries added
  def index(db_transaction, &block)
    entry_nodes = @doc / "./JMdict/entry"
    total = entry_nodes.count
    entries_added = 0

    entry_nodes.each do |node|
      yield entries_added, total if block_given?

      sequence_number = node.at(JDict::JMDictConstants::Elements::SEQUENCE).content.to_i
      kanji = (node / JDict::JMDictConstants::Elements::KANJI).map { |k| k.content }
      kana = (node / JDict::JMDictConstants::Elements::KANA).map { |k| k.content }
      senses = (node / JDict::JMDictConstants::Elements::SENSE).map { |s| extract_sense(s) }

      add_entry(db_transaction, Entry.new(sequence_number, kanji, kana, senses))
      entries_added += 1
    end

    printf "\n"

    entries_added
  end

  # @return [Hash] always an empty hash in this indexer
  def parse_parts_of_speech
    {}
  end

  private

  # Builds a Sense from a sense element.
  # @param e the sense element
  # @return [Sense] the sense with its parts of speech, glosses and language
  def extract_sense(e)
    parts_of_speech = (e / JDict::JMDictConstants::Elements::PART_OF_SPEECH).map { |pos| pos.inner_html }
    glosses = (e / JDict::JMDictConstants::Elements::GLOSS).map { |gloss| gloss.content }

    # The first gloss decides the language of the whole sense (the
    # official JMDict never mixes languages inside one sense); "en" is
    # used when there is no gloss or it carries no xml:lang attribute.
    first_gloss = e.at(JDict::JMDictConstants::Elements::GLOSS)
    language = (first_gloss && first_gloss.attr("xml:lang")) || "en"

    Sense.new(parts_of_speech, glosses, language)
  end
end
60
+ end
@@ -0,0 +1,2 @@
1
+ # Declares the JDict namespace module; this file adds no members to it.
+ module JDict
2
+ end
@@ -0,0 +1,64 @@
1
+ #include Constants #XML constants from the dictionary file
2
+
3
+ # Entries consist of kanji elements, kana elements,
4
+ # general information and sense elements. Each entry must have at
5
+ # least one kana element and one sense element. Others are optional.
6
+ module JDict
7
# A dictionary entry: a sequence number plus its kanji, kana and senses.
class Entry
  attr_accessor :sequence_number, :kanji, :kana, :senses

  # Create a new Entry
  # @param sequence_number [Integer] the entry's sequence number
  # @param kanji [Array<String>] the kanji readings
  # @param kana [Array<String>] the kana readings
  # @param senses [Array<Sense>] the senses of the entry
  def initialize(sequence_number, kanji, kana, senses)
    @sequence_number = sequence_number
    @kanji = kanji
    @kana = kana
    @senses = senses
  end

  # Converts an SQLite row from the index to the Entry format.
  # Kanji and kana are split on ", " and tagged as UTF-8; the serialized
  # senses are split on the sense sentinel and sorted before being parsed.
  # @param row a row responding to [] with the column names
  # @return [Entry] the deserialized entry
  def self.from_sql(row)
    number = row["sequence_number"].to_i
    kanji_list = row["kanji"].split(", ").map { |text| text.force_encoding("UTF-8") }
    kana_list = row["kana"].split(", ").map { |text| text.force_encoding("UTF-8") }
    sense_list = row["senses"]
                 .split(SerialConstants::SENSE_SENTINEL)
                 .sort
                 .map { |text| Sense.from_sql(text) }

    new(number, kanji_list, kana_list, sense_list)
  end

  # Converts an Entry to the named bind values indexed into the SQLite database
  # @return [Hash{String => String}] bind name => serialized column value
  def to_sql
    serialized_senses = senses.map { |sense| sense.to_sql }.join(SerialConstants::SENSE_SENTINEL)

    {
      ':sequence_number' => sequence_number.to_s,
      ':kanji' => kanji.join(", "),
      ':kana' => kana.join(", "),
      ':senses' => serialized_senses
    }
  end

  # Get an array of +Senses+ for the specified language
  def senses_by_language(l)
    senses.select { |sense| sense.language == l }
  end

  # @return [String] the kanji and kana on one line, then the numbered senses
  def to_s
    "#{kanji_to_s}#{kana_to_s}\n#{senses_to_s}\n"
  end

  # @return [String] the kanji joined with ", "
  def kanji_to_s
    @kanji.join(', ')
  end

  # @return [String, nil] the kana in parentheses with a leading space,
  #   or nil when there is no kana list
  def kana_to_s
    return if @kana.nil?

    " (#{@kana.join(', ')})"
  end

  # @param delimiter [String] separator placed between the numbered senses
  # @return [String] the senses numbered from 1
  def senses_to_s(delimiter = "\n")
    @senses.each_with_index.map { |sense, index| "#{index + 1}. #{sense.to_s}" }.join(delimiter)
  end
end
64
+ end