ruby-jdict 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 48dbfb86f9f72639eae7cecdde05da6953afdc8c
4
+ data.tar.gz: 01bae383b6df3ae0e9a524e094d7f1f1890663cd
5
+ SHA512:
6
+ metadata.gz: 4253b05fc65786103431707d298711170b8cc4cd426919b1dee06ba37a767766021ac80d1207a31a61bf8ea9a15466cf5e173f3b155328986dc674978793b5cd
7
+ data.tar.gz: 47a4b27fe519e1284bfd5311404f18489d701ddedc754d9203eaec07101bb4485378760de4f51e56f4fbe7aa235925e01505b18943e81d22af1cbb5464eca801
@@ -0,0 +1,28 @@
1
+ Copyright (C) 2015 Ian Pickering
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+ 2. Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in
12
+ the documentation and/or other materials provided with the
13
+ distribution.
14
+ 3. The name of the author may not be used to endorse or promote
15
+ products derived from this software without specific prior
16
+ written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS
19
+ OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
+ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24
+ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,10 @@
1
+ # Ruby-JDict
2
+ Ruby gem for accessing Jim Breen's Japanese dictionaries. Can currently access the following:
3
+ * JMdict (Japanese-English dictionary)
4
+
5
+ Dictionary files are located [here](http://www.csse.monash.edu.au/~jwb/wwwjdicinf.html#dicfil_tag).
6
+
7
+ ## Install
8
+ ```
9
+ gem install ruby-jdict
10
+ ```
@@ -0,0 +1,30 @@
1
+ require 'rubygems'
2
+ require 'rake' #task runner
3
+
4
+ INDEX_PATH = 'index'
5
+ JMDICT_PATH = 'dictionaries/JMdict'
6
+
7
+ namespace :index do
8
+
9
+ desc "Build the dictionary's search index"
10
+ task :build do
11
+ raise "Index already exists at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
12
+ @index = DictIndex.new(INDEX_PATH,
13
+ JMDICT_PATH,
14
+ false) # lazy_loadind? no. don't lazy load
15
+ puts "Index created at path #{File.expand_path(INDEX_PATH)}" if File.exists? INDEX_PATH
16
+ puts "Index with #{@index.size} entries."
17
+ end
18
+
19
+ desc "Destroy the dictionary's search index"
20
+ task :destroy do
21
+ puts 'TODO: destory the index'
22
+ `sudo rm -R index`
23
+ # This will not work, because we don't have sudooooo.
24
+ # How do you delete folders in Ruby without sudo? Probably
25
+ # can't... that'd be more consistent actually.
26
+ # if File.exists? INDEX_PATH
27
+ # File.delete INDEX_PATH
28
+ # end
29
+ end
30
+ end
@@ -0,0 +1,29 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'jdict'
3
+
4
+ BASE_PATH = ENV["HOME"]
5
+ DICT_PATH = File.join(BASE_PATH, '.dicts')
6
+ INDEX_PATH = DICT_PATH
7
+
8
+ JDict.configure do |config|
9
+ config.dictionary_path = DICT_PATH # directory containing dictionary files
10
+ config.index_path = INDEX_PATH # directory containing the full text search index
11
+ config.language = JDict::JMDictConstants::Languages::ENGLISH # language for search results
12
+ config.num_results = 50 # maximum results to return from searching
13
+ end
14
+
15
+ dict = JDict::JMDict.new
16
+
17
+ query = "日本語"
18
+
19
+ results = dict.search(query)
20
+ results.each do |entry|
21
+ puts entry.kanji.join(", ")
22
+ puts entry.kana.join(", ")
23
+ entry.senses.each do |sense|
24
+ glosses = sense.glosses.join(", ")
25
+ parts_of_speech = sense.parts_of_speech.join(", ")
26
+ puts "(" + parts_of_speech + ") " + glosses
27
+ end
28
+ puts
29
+ end
@@ -0,0 +1,14 @@
1
+ # The sense element will record the translational equivalent
2
+ # of the Japanese word, plus other related information. Where there
3
+ # are several distinctly different meanings of the word, multiple
4
+ # sense elements will be employed.
5
+ module JDict
6
+ class Sense
7
+ attr_reader :parts_of_speech, :glosses
8
+ #
9
+ # Create a new +Sense+
10
+ def initialize(parts_of_speech, glosses, language)
11
+ @parts_of_speech, @glosses = parts_of_speech, glosses
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,20 @@
1
+ require 'constants'
2
+
3
+ module JDict
4
+ class Configuration
5
+ attr_accessor :dictionary_path, :index_path, :num_results, :language, :lazy_index_loading, :debug
6
+
7
+ BASE_PATH = ENV["HOME"]
8
+ DICT_PATH = File.join(BASE_PATH, '.dicts')
9
+ INDEX_PATH = DICT_PATH
10
+
11
+ def initialize
12
+ @dictionary_path = DICT_PATH # directory containing dictionary files
13
+ @index_path = INDEX_PATH # directory containing the full text search index
14
+ @num_results = 50 # maximum results to return from searching
15
+ @language = JDict::JMDictConstants::Languages::ENGLISH # language to return search results in
16
+ @lazy_index_loading = false # load the index only on attempting to access it
17
+ @debug = false # limit number of entries indexed, rebuild index on instantiation
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,64 @@
1
+ # Constants and descriptions for important elements/attributes
2
+ # of the JMdict XML dictionary.
3
+ # Descriptions come from JMdict.dtd (document type definition)
4
+ module JDict
5
+ module JMDictConstants
6
+
7
+ # TODO: change these strings to symbols ?
8
+ # XML elements of the JMDict file
9
+ module Elements
10
+ # Entries consist of kanji elements, kana elements,
11
+ # general information and sense elements. Each entry must have at
12
+ # least one kana element and one sense element. Others are optional.
13
+ ENTRY = 'entry'
14
+ SEQUENCE = 'ent_seq'
15
+
16
+ # This element will contain a word or short phrase in Japanese
17
+ # which is written using at least one kanji. The valid characters are
18
+ # kanji, kana, related characters such as chouon and kurikaeshi, and
19
+ # in exceptional cases, letters from other alphabets.
20
+ KANJI = 'keb'
21
+
22
+ # This element content is restricted to kana and related
23
+ # characters such as chouon and kurikaeshi. Kana usage will be
24
+ # consistent between the keb and reb elements; e.g. if the keb
25
+ # contains katakana, so too will the reb.
26
+ KANA = 'reb'
27
+
28
+ # The sense element will record the translational equivalent
29
+ # of the Japanese word, plus other related information. Where there
30
+ # are several distinctly different meanings of the word, multiple
31
+ # sense elements will be employed.
32
+ SENSE = 'sense'
33
+
34
+ # Part-of-speech information about the entry/sense. Should use
35
+ # appropriate entity codes.
36
+ PART_OF_SPEECH = 'pos'
37
+
38
+ # Within each sense will be one or more "glosses", i.e.
39
+ # target-language words or phrases which are equivalents to the
40
+ # Japanese word. This element would normally be present, however it
41
+ # may be omitted in entries which are purely for a cross-reference.
42
+ GLOSS = 'gloss'
43
+
44
+ CROSSREFERENCE = 'xref'
45
+ end
46
+
47
+ # Constants for selecting the search language.
48
+ # Used in the "gloss" element's xml:lang attribute.
49
+ # :eng never appears as a xml:lang constant because gloss is assumed to be English when not specified
50
+ # :jpn never appears as a xml:lang because the dictionary itself pivots around Japanese
51
+ module Languages
52
+ JAPANESE = :jpn
53
+ ENGLISH = :eng
54
+ DUTCH = :dut
55
+ FRENCH = :fre
56
+ GERMAN = :ger
57
+ RUSSIAN = :rus
58
+ SPANISH = :spa
59
+ SLOVENIAN = :slv
60
+ SWEDISH = :swe
61
+ HUNGARIAN = :hun
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,14 @@
1
+ require 'jdict'
2
+ require 'dictionary'
3
+
4
+ module JDict
5
+ class JMDict < Dictionary
6
+ private
7
+ # DICT_PATH = JDict.configuration.dictionary_path + '/JMdict'
8
+
9
+ def initialize(index_path = JDict.configuration.index_path, lazy_index_loading=JDict.configuration.lazy_index_loading)
10
+ path = JDict.configuration.dictionary_path + '/JMdict'
11
+ super(index_path, path, lazy_index_loading)
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,62 @@
1
+ require 'jdict'
2
+ require 'index'
3
+
4
+ module JDict
5
+ class Dictionary
6
+ attr_reader :entries_cache, :lazy_index_loading
7
+
8
+ def initialize(index_path = JDict.configuration.index_path, dictionary_path = nil, lazy_index_loading = JDict.configuration.lazy_index_loading)
9
+ path_specified = dictionary_path.nil? ? false : true
10
+ if path_specified and not File.exists? dictionary_path
11
+ raise "Dictionary not found at path #{dictionary_path}"
12
+ end
13
+
14
+ #store some args for future reference
15
+ @dictionary_path = dictionary_path
16
+ @lazy_index_loading = lazy_index_loading
17
+
18
+ @entries = []
19
+ @entries_cache = []
20
+
21
+ #instantiate and load the full-text search index
22
+ @index = DictIndex.new(index_path, dictionary_path, lazy_index_loading)
23
+ end
24
+
25
+ def size
26
+ @entries.size
27
+ end
28
+
29
+ def loaded?
30
+ @index.built?
31
+ end
32
+
33
+ # Search this dictionary's index for the given string.
34
+ # @param query [String] the search query
35
+ # @return [Array(Entry)] the results of the search
36
+ def search(query)
37
+ results = []
38
+ return results if query.empty?
39
+
40
+ load_index if lazy_index_loading and not loaded?
41
+
42
+ results = @index.search(query)
43
+ end
44
+
45
+ # Retrieves the definition of a part-of-speech from its abbreviation
46
+ # @param pos [String] the abbreviation for the part-of-speech
47
+ # @return [String] the full description of the part-of-speech
48
+ def get_pos(pos)
49
+ @index.get_pos(pos)
50
+ end
51
+
52
+ private
53
+
54
+ def load_index
55
+ if loaded?
56
+ Exception.new("Dictionary index is already loaded")
57
+ else
58
+ @index.build
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,79 @@
1
+ #include Constants #XML constants from the dictionary file
2
+
3
+ # Entries consist of kanji elements, kana elements,
4
+ # general information and sense elements. Each entry must have at
5
+ # least one kana element and one sense element. Others are optional.
6
+ module JDict
7
+ class Entry
8
+
9
+ attr_accessor :kanji, :kana, :senses
10
+ # Create a new Entry
11
+ # entry = initialize(kanji, kana, senses)
12
+ def initialize(kanji, kana, senses)
13
+ @kanji, @kana, @senses = kanji, kana, senses
14
+ end
15
+
16
+ KANA_RE = /^kana/
17
+ SENSE_RE = /^sense/
18
+ PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/
19
+
20
+ MEANING_SENTINEL = '**'
21
+ PART_OF_SPEECH_SENTINEL = '$$'
22
+ SENSE_SENTINEL = '%%'
23
+ LANGUAGE_SENTINEL = '&&'
24
+ GLOSS_SENTINEL = '@@'
25
+
26
+ # Converts an SQLite row from the index to the Entry format
27
+ def self.from_sql(row)
28
+ kanji = row["kanji"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
29
+ kana = row["kana"].split(", ").map { |k| k = k.force_encoding("UTF-8") }
30
+ senses = []
31
+ row["senses"].split(SENSE_SENTINEL).sort.each do |txt|
32
+ ary = txt.scan(PART_OF_SPEECH_RE)
33
+ if ary.size == 1
34
+ parts_of_speech = ary[0][0].split(PART_OF_SPEECH_SENTINEL)
35
+ gloss_strings = txt[(ary.to_s.length-1)..-1]
36
+ else
37
+ parts_of_speech = nil
38
+ gloss_strings = txt[5..-1]
39
+ end
40
+
41
+ gloss_strings = gloss_strings.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL)
42
+
43
+ glosses = {}
44
+ gloss_strings.each do |str|
45
+ lang, meaning_string = str.split(LANGUAGE_SENTINEL)
46
+ lang = lang.to_sym
47
+ meanings = meaning_string.split(MEANING_SENTINEL)
48
+ (glosses[lang] ||= []) << meanings
49
+ end
50
+ glosses_for_lang = glosses[JDict.configuration.language] || glosses[JDict::JMDictConstants::Languages::ENGLISH]
51
+ senses << Sense.new(parts_of_speech, glosses_for_lang) # ** is the sentinel sequence
52
+ end
53
+ self.new(kanji, kana, senses)
54
+ end
55
+
56
+ # Converts an Entry to a string to be indexed into the SQLite database
57
+ # @return [String] the serialized string for this Entry
58
+ def to_sql
59
+ sense_strings = senses.map do |s|
60
+ sense = ''
61
+ sense << "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " if s.parts_of_speech
62
+ sense << s.glosses.collect { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }.compact.join(GLOSS_SENTINEL)
63
+ end
64
+
65
+ insert_data = {
66
+ ':kanji' => kanji.join(", "),
67
+ ':kana' => kana.join(", "),
68
+ ':senses' => sense_strings.join(SENSE_SENTINEL)
69
+ }
70
+
71
+ return insert_data
72
+ end
73
+
74
+ # Get an array of +Senses+ for the specified language
75
+ def senses_by_language(l)
76
+ senses.select { |s| s.language == l }
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,346 @@
1
+ # encoding: utf-8
2
+ require 'rubygems' #use gems
3
+ require 'bundler/setup' #load up the bundled environment
4
+
5
+ require 'amalgalite'
6
+ require 'libxml' #XML parsing
7
+ require 'fileutils'
8
+
9
+ require_relative 'constants' #XML constants from the dictionary file
10
+
11
+ require_relative 'entry' #dictionary elements
12
+ require_relative 'kanji' #...
13
+ require_relative 'kana' #...
14
+ require_relative 'sense'
15
+
16
+ require 'amalgalite'
17
+
18
+ include LibXML
19
+
20
+ module JDict
21
+ class DictIndex
22
+
23
+ LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
24
+ NUM_ENTRIES_TO_INDEX = 50
25
+ ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
26
+
27
+ attr_reader :path
28
+ # Initialize a full-text search index backend for JMdict
29
+ # @param index_path [String] desired filesystem path where you'd like the *search index* stored
30
+ # @param dictionary_path [String] filesystem path of the *dictionary* file to index; when given it must already exist
31
+ # @param lazy_loading [Boolean] lazily load the index just when it's needed, instead of building it ahead of time
32
+ def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
33
+ raise "Index path was nil" if index_path.nil?
34
+
35
+ path_specified = dictionary_path.nil? ? false : true
36
+ if path_specified and not File.exists? dictionary_path
37
+ raise "Dictionary not found at path #{dictionary_path}"
38
+ end
39
+
40
+ @path = index_path
41
+ @dictionary_path = dictionary_path
42
+ @pos_hash = {}
43
+
44
+ # create path if nonexistent
45
+ FileUtils.mkdir_p(@path)
46
+ db_file = File.join(@path, "fts5.db")
47
+
48
+ File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)
49
+
50
+ @index = Amalgalite::Database.new(db_file)
51
+
52
+ create_schema
53
+
54
+ #check if the index has already been built before Ferret creates it
55
+ already_built = built?
56
+
57
+ #build the index right now if "lazy loading" isn't on and the index is empty
58
+ build unless lazy_loading or (already_built && !JDict.configuration.debug)
59
+
60
+ #make the hash from abbreviated parts of speech to full definitions
61
+ build_pos_hash
62
+ end
63
+
64
+ # Creates the SQL schema for the Amalgalite database
65
+ def create_schema
66
+ schema = @index.schema
67
+ unless schema.tables['search']
68
+ @index.execute_batch <<-SQL
69
+ CREATE VIRTUAL TABLE search USING fts5(
70
+ kanji,
71
+ kana,
72
+ senses
73
+ );
74
+ SQL
75
+ @index.reload_schema!
76
+ end
77
+ end
78
+
79
+ # Returns the search results as an array of +Entry+
80
+ # @param term [String] the search string
81
+ # @param language [Symbol] accepted but not read by this method; Entry.from_sql selects glosses via JDict.configuration.language
82
+ # @return [Array(Entry)] the results of the search
83
+ def search(term, language=LANGUAGE_DEFAULT, exact=false)
84
+ raise "Index not found at path #{@path}" unless File.exists? @path
85
+
86
+ # no results yet...
87
+ results = []
88
+
89
+ @entries_cache = []
90
+
91
+ # search for:
92
+ # kanji... one field
93
+ # kana ... up to 10 fields
94
+ # sense... up to 10 fields
95
+ # query = 'kanji OR ' + (0..10).map { |x| "kana_#{x} OR sense_#{x}" }.join(' OR ') + ":\"#{term}\""
96
+ query = "{kanji kana senses} : \"#{term}\""
97
+ query += "*" unless exact
98
+
99
+ @index.execute("SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH '#{query}' LIMIT #{JDict.configuration.num_results}") do |row|
100
+ entry = Entry.from_sql(row)
101
+ score = 0.0
102
+
103
+ # load entry from the index. from cache, if it's available
104
+ # load from cache if it's available
105
+ # if entry = @entries_cache[docid]
106
+ # entry = Entry.from_index_doc(@ferret_index[docid].load)
107
+ # @entries_cache[docid] = entry
108
+ # end
109
+
110
+ # # load entry from the index
111
+ # if entry.nil?
112
+ # entry = Entry.from_index_doc(@ferret_index[docid].load)
113
+ # @entries_cache[docid] = entry
114
+ # end
115
+
116
+ is_exact_match = false
117
+ is_exact_match = entry.kanji == term ||
118
+ entry.kana.any? { |k| k == term }
119
+
120
+ re = Regexp.new("#{term}", Regexp::IGNORECASE) # match the search term, ignoring case
121
+ # entry.senses.each do |s|
122
+ # s.glosses.each { |g| is_exact_match = is_exact_match || g.force_encoding("UTF-8").match(re) }
123
+ # end
124
+
125
+ # score = 1.0 if is_exact_match
126
+
127
+ # add the result
128
+ results << [score, entry]
129
+ end
130
+
131
+ @entries_cache = []
132
+
133
+ results.sort { |x, y| y[0] <=> x[0] }.map { |x| x[1] }
134
+ end
135
+
136
+ def built?; @index.first_value_from( "SELECT count(*) from search" ) != 0; end
137
+
138
+ # Builds the full-text search index
139
+ # @param overwrite [Boolean] accepted but currently not read by the implementation
140
+ # @param dictionary_path [String] path to the dictionary file
141
+ # @return [Integer] the number of indexed entries
142
+ def build(overwrite=false, dictionary_path=nil)
143
+ @dictionary_path = dictionary_path unless dictionary_path.nil?
144
+ raise "No dictionary path was provided" if @dictionary_path.nil?
145
+ raise "Dictionary not found at path #{@dictionary_path}" unless File.exists?(@dictionary_path)
146
+
147
+ reader = open_reader(@dictionary_path)
148
+
149
+ puts "Building index..."
150
+
151
+ # whenever there is a reader error, print its block parameters
152
+ XML::Error.set_handler { |*args| p args }
153
+
154
+ # components of an entry
155
+ kanji, kana, senses = [], [], []
156
+ glosses = {}
157
+ parts_of_speech = []
158
+
159
+ entries_added = 0
160
+
161
+ @index.transaction do |db_transaction|
162
+
163
+ # read until the end
164
+ while reader.read
165
+
166
+ # check what type of node we're currently on
167
+ case reader.node_type
168
+
169
+ # start-of-element node
170
+ when XML::Reader::TYPE_ELEMENT
171
+ case reader.name
172
+ when JDict::JMDictConstants::Elements::SEQUENCE
173
+ entry_sequence_num = reader.next_text
174
+
175
+ # TODO: Raise an exception if reader.next_text.empty? inside the when's
176
+ # JMdict shouldn't have any empty elements, I believe.
177
+ when JDict::JMDictConstants::Elements::KANJI
178
+ text = reader.next_text
179
+ kanji << text unless text.empty?
180
+
181
+ when JDict::JMDictConstants::Elements::KANA
182
+ text = reader.next_text
183
+ kana << text unless text.empty?
184
+
185
+ when JDict::JMDictConstants::Elements::GLOSS
186
+ language = reader.node.lang || LANGUAGE_DEFAULT
187
+ language = language.intern
188
+ text = reader.next_text
189
+ unless text.empty?
190
+ (glosses[language] ||= []) << text
191
+ end
192
+
193
+ when JDict::JMDictConstants::Elements::CROSSREFERENCE
194
+ text = reader.next_text
195
+ end
196
+
197
+ # XML entity references are treated as a different node type
198
+ # the parent node of the entity reference itself has the actual tag name
199
+ when XML::Reader::TYPE_ENTITY_REFERENCE
200
+ if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
201
+ text = reader.name
202
+ parts_of_speech << text unless text.empty?
203
+ end
204
+
205
+ # end-of-element node
206
+ when XML::Reader::TYPE_END_ELEMENT
207
+ case reader.name
208
+
209
+ when JDict::JMDictConstants::Elements::SENSE
210
+ # build sense
211
+ senses << Sense.new(parts_of_speech, glosses)
212
+ # glosses.each do |language, texts|
213
+ # senses << Sense.new(parts_of_speech,
214
+ # texts.join(', ').strip,
215
+ # language)
216
+ # end
217
+
218
+ # clear data for the next sense
219
+ glosses = {}
220
+ parts_of_speech = []
221
+
222
+ # we're at the end of the entry element, so index it
223
+ when JDict::JMDictConstants::Elements::ENTRY
224
+ raise "No kana found for this entry!" if kana.empty?
225
+
226
+ #index
227
+ # @index.add_entry(i, Entry.new(kanji, kana, senses))
228
+ insert_data = Entry.new(kanji, kana, senses).to_sql
229
+
230
+ db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
231
+ stmt.execute( insert_data )
232
+ end
233
+
234
+ # TODO: add entry_sequence_num to the entry
235
+
236
+ # clear data for the next entry
237
+ kanji, kana, senses = [], [], []
238
+
239
+ entries_added += 1
240
+ #debug
241
+ if JDict.configuration.debug
242
+ break if entries_added >= NUM_ENTRIES_TO_INDEX
243
+ # # if @index.size.modulo(1000) == 0
244
+ # if @index.size.modulo(100) == 0
245
+ # # puts "#{@index.size/1000} thousand"
246
+ # puts "\r#{@index.size/100} hundred"
247
+ # end
248
+ end
249
+ end
250
+ end
251
+ end
252
+ end
253
+
254
+ # puts "#{@index.size} entries indexed"
255
+
256
+ # Done reading & indexing
257
+ reader.close
258
+ # @index.close
259
+ end
260
+
261
+ def rebuild
262
+ raise "Index already exists at path #{@path}" if File.exists? @path
263
+ build
264
+ end
265
+
266
+ # Creates an XML::Reader object for the given path
267
+ # @param dictionary_path [String] path to the dictionary file
268
+ # @return [XML::Reader] the reader for the given dictionary
269
+ def open_reader(dictionary_path)
270
+ # open reader
271
+ reader = nil
272
+ Dir.chdir(Dir.pwd) do
273
+ jmdict_path = File.join(dictionary_path)
274
+ reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
275
+ raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
276
+ end
277
+ reader
278
+ end
279
+
280
+ # Creates the hash of part-of-speech symbols to full definitions from the dictionary
281
+ def build_pos_hash
282
+ @pos_hash ||= begin
283
+ pos_hash = {}
284
+ reader = open_reader(@dictionary_path)
285
+ done = false
286
+ while done == false
287
+ reader.read
288
+ case reader.node_type
289
+ when XML::Reader::TYPE_DOCUMENT_TYPE
290
+ # random segfault when attempting this
291
+ # cs.each do |child|
292
+ # p child.to_s
293
+ # end
294
+ doctype_string = reader.node.to_s
295
+ entities = doctype_string.scan(ENTITY_REGEX)
296
+ entities.map do |entity|
297
+ abbrev = entity[0]
298
+ full = entity[1]
299
+ sym = pos_to_sym(abbrev)
300
+ pos_hash[sym] = full
301
+ end
302
+ done = true
303
+ when XML::Reader::TYPE_ELEMENT
304
+ done = true
305
+ end
306
+ end
307
+ pos_hash
308
+ end
309
+ end
310
+
311
+ # Converts a part-of-speech entity reference string into a symbol
312
+ # @param entity [String] the entity reference string
313
+ # @return [Symbol] the part-of-speech symbol
314
+ def pos_to_sym(entity)
315
+ entity.gsub('-', '_').to_sym
316
+ end
317
+
318
+ # Retrieves the definition of a part-of-speech from its abbreviation
319
+ # @param pos [String] the abbreviation for the part-of-speech
320
+ # @return [String] the full description of the part-of-speech
321
+ def get_pos(pos)
322
+ build_pos_hash if @pos_hash.empty?
323
+ @pos_hash[pos_to_sym(pos)]
324
+ end
325
+ end
326
+
327
+ # Add custom parsing methods to XML::Reader
328
+ class XML::Reader
329
+
330
+ public
331
+ # Get the next text node
332
+ def next_text
333
+ # read until a text node
334
+ while (self.node_type != XML::Reader::TYPE_TEXT and self.read); end
335
+ self.value
336
+ end
337
+ # Get the next entity node
338
+ def next_entity
339
+ # read until an entity node
340
+ while (self.node_type != XML::Reader::TYPE_ENTITY and
341
+ self.node_type != XML::Reader::TYPE_ENTITY_REFERENCE and
342
+ self.read); end
343
+ self.value
344
+ end
345
+ end
346
+ end
@@ -0,0 +1,20 @@
1
+ require 'configuration'
2
+ require 'dictionaries/jmdict'
3
+
4
+ module JDict
5
+ class << self
6
+ attr_accessor :configuration
7
+ end
8
+
9
+ def self.configuration
10
+ @configuration ||= Configuration.new
11
+ end
12
+
13
+ def self.reset
14
+ @configuration = Configuration.new
15
+ end
16
+
17
+ def self.configure
18
+ yield(configuration)
19
+ end
20
+ end
@@ -0,0 +1,4 @@
1
+ module JDict
2
+ class Kana
3
+ end
4
+ end
@@ -0,0 +1,4 @@
1
+ module JDict
2
+ class Kanji
3
+ end
4
+ end
@@ -0,0 +1,3 @@
1
+ module JDict
2
+ Version = '0.0.1'
3
+ end
@@ -0,0 +1,14 @@
1
+ # The sense element will record the translational equivalent
2
+ # of the Japanese word, plus other related information. Where there
3
+ # are several distinctly different meanings of the word, multiple
4
+ # sense elements will be employed.
5
+ module JDict
6
+ class Sense
7
+ attr_reader :parts_of_speech, :glosses
8
+ #
9
+ # Create a new +Sense+
10
+ def initialize(parts_of_speech, glosses)
11
+ @parts_of_speech, @glosses = parts_of_speech, glosses
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,63 @@
1
+ module JDict
2
+ module Unicode
3
+ # Codepoint ranges for japanese unicode characters (in decimal)
4
+ # from: http://unicode.org/charts/
5
+ module CodepointRanges
6
+ HIRAGANA = 12352..12447
7
+ KATAKANA = 12448..12543
8
+ KATAKANA_PHONETIC = 12784..12799
9
+ HALFWIDTH_KATAKANA = 65280..65519
10
+ UNIFIED_CJK = 19968..40911
11
+ UNIFIED_CJK_EXT_A = 13312..19903
12
+ UNIFIED_CJK_EXT_B = 131072..173791
13
+ PUNCTUATION = 12288..12351
14
+ end
15
+
16
+ # Get the Unicode codepoint of the first character as an Integer (comparable with the decimal ranges above)
17
+ def hex_codepoint(unicode_char)
18
+ unicode_char.unpack("U0U*")[0]
19
+ end
20
+
21
+ # TODO: write unit test with a variety of strings to ensure this method
22
+ # returns the expected output
23
+ # Determine the script of the specified string:
24
+ # :kanji
25
+ # :kana
26
+ # :english
27
+ def script_type?(unicode_string)
28
+ type = ''
29
+
30
+ unicode_string.each_char do |c|
31
+ code = hex_codepoint(c)
32
+ #kana
33
+ if CodepointRanges::HIRAGANA.include?(code) ||
34
+ CodepointRanges::KATAKANA.include?(code) ||
35
+ CodepointRanges::KATAKANA_PHONETIC.include?(code) ||
36
+ CodepointRanges::HALFWIDTH_KATAKANA.include?(code) ||
37
+ CodepointRanges::PUNCTUATION.include?(code) then
38
+ type = :kana
39
+ break
40
+ #kanji
41
+ elsif CodepointRanges::UNIFIED_CJK.include?(code) ||
42
+ CodepointRanges::UNIFIED_CJK_EXT_A.include?(code) ||
43
+ CodepointRanges::UNIFIED_CJK_EXT_B.include?(code) then
44
+ type = :kanji
45
+ #english
46
+ else
47
+ type = :english
48
+ end
49
+ end
50
+
51
+ type
52
+ end
53
+
54
+ def japanese?(unicode_string)
55
+ type = script_type?(unicode_string)
56
+ type == :kanji || type == :kana
57
+ end
58
+ def english?(unicode_string)
59
+ type = script_type?(unicode_string)
60
+ type == :english
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,20 @@
1
+ require "spec_helper"
2
+ require 'configuration'
3
+
4
+ module JDict
5
+ describe Configuration do
6
+ describe "#debug" do
7
+ it "default value is false" do
8
+ Configuration.new.debug = false
9
+ end
10
+ end
11
+
12
+ describe "#debug=" do
13
+ it "can set value" do
14
+ config = Configuration.new
15
+ config.debug = true
16
+ expect(config.debug).to eq(true)
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,117 @@
1
+ require 'spec_helper'
2
+ require BASE_PATH + '/lib/dictionary'
3
+ #require BASE_PATH + '/lib/jmdict'
4
+
5
+ module DictionarySpecHelper
6
+ JMDICT_PATH = File.join(BASE_PATH+'/dictionaries/JMdict')
7
+ INDEX_PATH = File.join(BASE_PATH+'/index')
8
+
9
+ def mock_index
10
+ end
11
+
12
+ class Increase
13
+ def initialize(&measure_proc) # + args
14
+ @measure_proc = measure_proc
15
+ end
16
+
17
+ def matches?(target)
18
+ @target = target
19
+ @original_value = @measure_proc.call
20
+ target.call
21
+ @new_value = @measure_proc.call
22
+ return @new_value.to_i > @original_value.to_i
23
+ end
24
+
25
+ def failure_message
26
+ "expected #{@new_value} to be greater than #{@original_value}"
27
+ end
28
+
29
+ def negative_failure_message
30
+ "expected #{@new_value} to not be greater than #{@original_value}"
31
+ end
32
+
33
+ def description
34
+ "increase #{@original_value}"
35
+ end
36
+ end
37
+
38
+ def increase(&measure_proc) # + args
39
+ Increase.new(&measure_proc)
40
+ end
41
+ end
42
+
43
+ module DictionarySpec
44
+ include DictionarySpecHelper
45
+
46
+ describe JDict::Dictionary do
47
+ before do
48
+ @dictionary = JDict::Dictionary.new(INDEX_PATH)
49
+ end
50
+
51
+ it "is searchable" do
52
+ @dictionary.should respond_to(:search)
53
+ end
54
+
55
+ it "can tell you whether or not it's loaded" do
56
+ @dictionary.should respond_to(:loaded?)
57
+ end
58
+
59
+ it "should generate fixtures" do
60
+ pending
61
+ @dictionary.should respond_to(:generate_fixtures)
62
+ end
63
+ end
64
+
65
+ describe JDict::Dictionary, "after initialization" do
66
+ before do
67
+ @dictionary = JDict::Dictionary.new(INDEX_PATH)
68
+ end
69
+
70
+ it "has no entries" do
71
+ @dictionary.size.should == 0
72
+ end
73
+
74
+ it "has an empty entries cache" do
75
+ @dictionary.entries_cache.empty?
76
+ end
77
+ end
78
+
79
+ describe JDict::Dictionary, "when loading from a dictionary file" do
80
+ before do
81
+ @dictionary = JDict::Dictionary.new(INDEX_PATH)
82
+ end
83
+
84
+ it "has at least 1 entry" do
85
+ pending("implement loading from index")
86
+ @dictionary.load(JMDICT_PATH)
87
+ @dictionary.size.should > 0
88
+ end
89
+
90
+ it "says it's loaded" do
91
+ pending("implement loading from index")
92
+ @dictionary.load(JMDICT_PATH)
93
+ # @dictionary.loaded?.should == true
94
+ @dictionary.loaded?.should equal(true)
95
+ end
96
+ end
97
+
98
+ describe JDict::Dictionary, "when loading from a dictionary file (already loaded)" do
99
+ before do
100
+ @dictionary = JDict::Dictionary.new(INDEX_PATH)
101
+ end
102
+
103
+ it "has the same size as it did before being loaded"
104
+ end
105
+
106
+ describe JDict::Dictionary, "when searching" do
107
+ before do
108
+ @dictionary = JDict::Dictionary.new(INDEX_PATH)
109
+ end
110
+
111
+ it "should raise an error if an index isn't built yet"
112
+ it "should give no results if the search phrase is empty" do
113
+ @dictionary.search('').should be_empty
114
+ end
115
+ end
116
+
117
+ end
@@ -0,0 +1,33 @@
1
+ <JMdict>
2
+ <entry>
3
+ <ent_seq>1171270</ent_seq>
4
+ <k_ele>
5
+ <keb>右翼</keb>
6
+ <ke_pri>ichi1</ke_pri>
7
+ <ke_pri>news1</ke_pri>
8
+ <ke_pri>nf04</ke_pri>
9
+ </k_ele>
10
+ <r_ele>
11
+ <reb>うよく</reb>
12
+ <re_pri>ichi1</re_pri>
13
+ <re_pri>news1</re_pri>
14
+ <re_pri>nf04</re_pri>
15
+ </r_ele>
16
+ <sense>
17
+ <pos>&n;</pos>
18
+ <gloss>right-wing</gloss>
19
+ <gloss g_lang="fr">aile droite (oiseau, armée, parti politique, base-ball)</gloss>
20
+ <gloss g_lang="ru">пра́вое крыло́</gloss>
21
+ <gloss g_lang="ru">пра́вый фланг</gloss>
22
+ <gloss g_lang="de">rechter Flügel</gloss>
23
+ </sense>
24
+ <sense>
25
+ <gloss g_lang="de">{Sport}</gloss>
26
+ <gloss g_lang="de">rechte Flanke</gloss>
27
+ <gloss g_lang="de">rechter Flügel</gloss>
28
+ </sense>
29
+ <sense>
30
+ <gloss g_lang="de">die Rechte</gloss>
31
+ </sense>
32
+ </entry>
33
+ </JMdict>
@@ -0,0 +1,84 @@
1
+ require 'rubygems'
2
+
3
+ require File.dirname(__FILE__) + '/spec_helper'
4
+ require BASE_PATH + '/lib/dictionary'
5
+ require BASE_PATH + '/lib/jmdict'
6
+ require BASE_PATH + '/lib/index'
7
+
8
+ require 'fileutils'
9
+
10
# Placeholder mixin for the DictIndex specs; it currently defines nothing.
module IndexSpecHelper; end
12
+
13
# Public interface checks for DictIndex: each example only verifies that the
# object responds to the expected method, not what the method does.
describe JDict::DictIndex do
  include IndexSpecHelper

  before do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
  end

  # Searching
  it "is searchable" do
    expect(@index).to respond_to(:search)
  end

  # Building
  it "is buildable" do
    expect(@index).to respond_to(:build) # and return an index
  end
  it "is rebuildable" do
    expect(@index).to respond_to(:rebuild)
  end
  it "tells whether it's built or not" do
    expect(@index).to respond_to(:built?)
  end

  # Destroying
  it "is destroyable" do
    expect(@index).to respond_to(:destroy)
  end

  it "raises an error if an invalid dictionary path is specified" do
    # A bare `raise_error` accepts any exception class.
    expect { JDict::DictIndex.new(INDEX_PATH, 'bad_dictionary_path') }.to raise_error
  end
end
45
+
46
# Checks that a freshly constructed DictIndex exposes a usable path.
describe JDict::DictIndex, "after initialization" do
  it "the path should be set" do
    @index = JDict::DictIndex.new(INDEX_PATH)
    expect(@index.path).not_to be_nil
    # Compare by value: `be('')` tests object identity, which a fresh ''
    # literal never satisfies, so the negated identity check could not fail.
    expect(@index.path).not_to eq('')
  end
end
53
+
54
# Building a DictIndex should leave a non-empty entry at INDEX_PATH.
describe JDict::DictIndex, "when building," do
  it "it is created on the file system" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
    @index.build
    # File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2.
    expect(File.exist?(INDEX_PATH)).to eq(true)
  end

  it "its directory on the file system shouldn't be empty" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH,
                                  false) # no lazy loading
    @index.build
    # An empty directory lists only "." and "..", so real content means >= 3.
    expect(Dir.entries(INDEX_PATH).size).to be >= 3
  end

  # No block is given, so RSpec reports this example as pending.
  it "loads from a dictionary file"
end
73
+
74
# Rebuilding requires an existing index; the example removes it first.
describe JDict::DictIndex, "when rebuilding" do
  include FileUtils

  it "raises an error if it doesn't already exist" do
    rm_rf(INDEX_PATH)
    # File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2.
    expect(File.exist?(INDEX_PATH)).to eq(false)
    # A bare `raise_error` accepts any exception class.
    expect {
      JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH).rebuild
    }.to raise_error
  end
end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+
3
# Verifies that a path assigned inside JDict.configure can be read back.
describe JDict do
  describe "#configure" do
    before(:each) do
      JDict.configure do |settings|
        settings.dictionary_path = DICT_PATH
        settings.debug = true
      end
    end

    it "uses the configured path" do
      # NOTE(review): `JDICT` (all caps) and `DICT_PATH` are not defined in the
      # spec files visible here; `JDICT` looks like a typo for `JDict`, and the
      # reader may live on a configuration object -- confirm against lib/.
      expect(JDICT.dictionary_path).to eq(DICT_PATH)
    end
  end
end
17
+
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+ require BASE_PATH + '/lib/dictionary'
3
+ require BASE_PATH + '/lib/jmdict'
4
+
5
# Constants shared by the JMDict specs.
module JMDictSpecHelper
  # Index location passed to JDict::JMDict.new in this spec file.
  INDEX_PATH = "#{BASE_PATH}/index"
end
8
+
9
# JMDict is expected to be a kind of Dictionary.
describe JDict::JMDict do
  include JMDictSpecHelper

  before do
    @jmdict = JDict::JMDict.new(JMDictSpecHelper::INDEX_PATH)
  end

  it do
    expect(@jmdict).to be_a_kind_of(JDict::Dictionary)
  end
end
@@ -0,0 +1,50 @@
1
+ require 'rubygems' #use gems
2
+ require 'bundler/setup' #load up the bundled environment
3
+ # require 'spec' #test framework
4
+
5
# Turn on Ruby's global debug flag for the whole spec run.
$DEBUG = true

# Paths used by the specs, all derived from the directory holding this file.
BASE_PATH   = "#{File.dirname(__FILE__)}/.."
INDEX_PATH  = "#{BASE_PATH}/test_index"
JMDICT_PATH = "#{BASE_PATH}/dictionaries/JMdict"
10
+
11
+ ##
12
+ # rSpec Hash additions.
13
+ #
14
+ # From
15
+ # * http://wincent.com/knowledge-base/Fixtures_considered_harmful%3F
16
+ # * Neil Rahilly
17
+
18
class Hash

  ##
  # Filter keys out of a Hash. A pair is dropped when its key is listed
  # directly, or when the key's symbol form is listed (so a "a" key is
  # removed by +:a+). Keys that cannot be converted with +to_sym+, such as
  # +nil+, are compared as they are.
  #
  #   { :a => 1, :b => 2, :c => 3 }.except(:a)
  #   => { :b => 2, :c => 3 }

  def except(*keys)
    reject do |k, _v|
      keys.include?(k) || (k.respond_to?(:to_sym) && keys.include?(k.to_sym))
    end
  end

  ##
  # Override some keys. Returns a new Hash; the receiver is not modified.
  #
  #   { :a => 1, :b => 2, :c => 3 }.with(:a => 4)
  #   => { :a => 4, :b => 2, :c => 3 }

  def with(overrides = {})
    merge(overrides)
  end

  ##
  # Returns a Hash with only the pairs identified by +keys+. Keys are
  # matched the same way as in #except: directly, or by their symbol form.
  #
  #   { :a => 1, :b => 2, :c => 3 }.only(:a)
  #   => { :a => 1 }

  def only(*keys)
    select do |k, _v|
      keys.include?(k) || (k.respond_to?(:to_sym) && keys.include?(k.to_sym))
    end
  end

end
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-jdict
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ian Pickering
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: libxml-ruby
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.8.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: amalgalite
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.5.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.5.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: autotest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 3.4.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 3.4.0
69
+ description:
70
+ email:
71
+ - ipickering2@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - LICENSING
77
+ - README.md
78
+ - Rakefile
79
+ - examples/query.rb
80
+ - lib/#sense.rb#
81
+ - lib/configuration.rb
82
+ - lib/constants.rb
83
+ - lib/dictionaries/jmdict.rb
84
+ - lib/dictionary.rb
85
+ - lib/entry.rb
86
+ - lib/index.rb
87
+ - lib/jdict.rb
88
+ - lib/kana.rb
89
+ - lib/kanji.rb
90
+ - lib/ruby-jdict/version.rb
91
+ - lib/sense.rb
92
+ - lib/unicode.rb
93
+ - spec/configuration_spec.rb
94
+ - spec/dictionary_spec.rb
95
+ - spec/fixtures/feeds/sample_entry.xml
96
+ - spec/index_spec.rb
97
+ - spec/jdict_spec.rb
98
+ - spec/jmdict_spec.rb
99
+ - spec/spec_helper.rb
100
+ homepage: https://github.com/Ruin0x11/ruby-jdict
101
+ licenses: []
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.4.5.1
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Ruby gem for accessing Jim Breen's Japanese dictionaries
123
+ test_files: []
124
+ has_rdoc: