ruby-jdict 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 48dbfb86f9f72639eae7cecdde05da6953afdc8c
4
+ data.tar.gz: 01bae383b6df3ae0e9a524e094d7f1f1890663cd
5
+ SHA512:
6
+ metadata.gz: 4253b05fc65786103431707d298711170b8cc4cd426919b1dee06ba37a767766021ac80d1207a31a61bf8ea9a15466cf5e173f3b155328986dc674978793b5cd
7
+ data.tar.gz: 47a4b27fe519e1284bfd5311404f18489d701ddedc754d9203eaec07101bb4485378760de4f51e56f4fbe7aa235925e01505b18943e81d22af1cbb5464eca801
@@ -0,0 +1,28 @@
1
+ Copyright (C) 2015 Ian Pickering
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+ 2. Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in
12
+ the documentation and/or other materials provided with the
13
+ distribution.
14
+ 3. The name of the author may not be used to endorse or promote
15
+ products derived from this software without specific prior
16
+ written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS
19
+ OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
+ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24
+ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,10 @@
1
+ # Ruby-JDict
2
+ A Ruby gem for accessing Jim Breen's Japanese dictionaries. It can currently access the following dictionaries:
3
+ * JMdict (Japanese-English dictionary)
4
+
5
+ The dictionary files can be downloaded from [the WWWJDIC dictionary files page](http://www.csse.monash.edu.au/~jwb/wwwjdicinf.html#dicfil_tag).
6
+
7
+ ## Install
8
+ ```
9
+ gem install ruby-jdict
10
+ ```
@@ -0,0 +1,30 @@
1
require 'rubygems'
require 'rake' # task runner
require 'fileutils'

INDEX_PATH = 'index'
JMDICT_PATH = 'dictionaries/JMdict'

# Tasks for managing the dictionary's full-text search index.
namespace :index do

  desc "Build the dictionary's search index"
  task :build do
    raise "Index already exists at path #{File.expand_path(INDEX_PATH)}" if File.exist?(INDEX_PATH)

    # The index class is loaded lazily so that other tasks do not pay for it.
    # NOTE(review): assumes the library sources live in ./lib next to this
    # Rakefile (the specs require BASE_PATH + '/lib/index') -- confirm.
    $LOAD_PATH.unshift(File.expand_path('lib', __dir__))
    require 'jdict'
    require 'index'

    index = JDict::DictIndex.new(INDEX_PATH,
                                 JMDICT_PATH,
                                 false) # lazy loading? no, build right away
    puts "Index created at path #{File.expand_path(INDEX_PATH)}" if File.exist?(INDEX_PATH)
    # Only report a size when the index object actually exposes one.
    puts "Index with #{index.size} entries." if index.respond_to?(:size)
  end

  desc "Destroy the dictionary's search index"
  task :destroy do
    # FileUtils removes the directory tree without shelling out or needing sudo;
    # rm_rf is a no-op when the path does not exist.
    FileUtils.rm_rf(INDEX_PATH)
    puts "Index removed from path #{File.expand_path(INDEX_PATH)}"
  end
end
@@ -0,0 +1,29 @@
1
# -*- coding: utf-8 -*-
# Example: configure JDict, search JMdict for a word and print every result.
require 'jdict'

# Fall back to Dir.home when the HOME environment variable is not set.
BASE_PATH = ENV["HOME"] || Dir.home
DICT_PATH = File.join(BASE_PATH, '.dicts')
INDEX_PATH = DICT_PATH

JDict.configure do |config|
  config.dictionary_path = DICT_PATH   # directory containing dictionary files
  config.index_path      = INDEX_PATH  # directory containing the full text search index
  config.language        = JDict::JMDictConstants::Languages::ENGLISH # language for search results
  config.num_results     = 50          # maximum results to return from searching
end

dict = JDict::JMDict.new

query = "日本語"

results = dict.search(query)
results.each do |entry|
  puts entry.kanji.join(", ")
  puts entry.kana.join(", ")
  entry.senses.each do |sense|
    # A sense may carry no parts of speech and no glosses for the configured
    # language (both are nil in that case), so coerce to arrays before joining.
    glosses = Array(sense.glosses).join(", ")
    parts_of_speech = Array(sense.parts_of_speech).join(", ")
    puts "(" + parts_of_speech + ") " + glosses
  end
  puts
end
@@ -0,0 +1,14 @@
1
+ # The sense element will record the translational equivalent
2
+ # of the Japanese word, plus other related information. Where there
3
+ # are several distinctly different meanings of the word, multiple
4
+ # sense elements will be employed.
5
module JDict
  # One meaning of a dictionary entry: its parts of speech plus the glosses
  # (translations) recorded for it.
  class Sense
    attr_reader :parts_of_speech, :glosses, :language

    # Create a new +Sense+
    # @param parts_of_speech [Array, nil] part-of-speech tags for this sense
    # @param glosses [Object] the glosses for this sense, stored as given
    # @param language [Symbol, nil] optional language of the glosses; it is
    #   optional because callers in this gem construct senses with two arguments
    def initialize(parts_of_speech, glosses, language = nil)
      @parts_of_speech = parts_of_speech
      @glosses = glosses
      @language = language
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'constants'
2
+
3
module JDict
  # Holds the user-adjustable settings of the library together with their defaults.
  class Configuration
    attr_accessor :dictionary_path, :index_path, :num_results, :language, :lazy_index_loading, :debug

    # Fall back to Dir.home so File.join does not fail when HOME is unset.
    BASE_PATH = ENV["HOME"] || Dir.home
    DICT_PATH = File.join(BASE_PATH, '.dicts')
    INDEX_PATH = DICT_PATH

    # Populate every setting with its default value.
    def initialize
      @dictionary_path    = DICT_PATH   # directory containing dictionary files
      @index_path         = INDEX_PATH  # directory containing the full text search index
      @num_results        = 50          # maximum results to return from searching
      @language           = JDict::JMDictConstants::Languages::ENGLISH # language to return search results in
      @lazy_index_loading = false       # load the index only on attempting to access it
      @debug              = false       # limit number of entries indexed, rebuild index on instantiation
    end
  end
end
@@ -0,0 +1,64 @@
1
+ # Constants and descriptions for important elements/attributes
2
+ # of the JMdict XML dictionary.
3
+ # Descriptions come from JMdict.dtd (document type definition)
4
module JDict
  # Names of the JMdict XML elements and language codes used by this library.
  # Element descriptions come from JMdict.dtd (document type definition).
  module JMDictConstants

    # TODO: change these strings to symbols ?
    # XML elements of the JMDict file. The strings are frozen so that the
    # shared constants cannot be mutated by accident.
    module Elements
      # Entries consist of kanji elements, kana elements,
      # general information and sense elements. Each entry must have at
      # least one kana element and one sense element. Others are optional.
      ENTRY = 'entry'.freeze
      SEQUENCE = 'ent_seq'.freeze

      # This element will contain a word or short phrase in Japanese
      # which is written using at least one kanji. The valid characters are
      # kanji, kana, related characters such as chouon and kurikaeshi, and
      # in exceptional cases, letters from other alphabets.
      KANJI = 'keb'.freeze

      # This element content is restricted to kana and related
      # characters such as chouon and kurikaeshi. Kana usage will be
      # consistent between the keb and reb elements; e.g. if the keb
      # contains katakana, so too will the reb.
      KANA = 'reb'.freeze

      # The sense element will record the translational equivalent
      # of the Japanese word, plus other related information. Where there
      # are several distinctly different meanings of the word, multiple
      # sense elements will be employed.
      SENSE = 'sense'.freeze

      # Part-of-speech information about the entry/sense. Should use
      # appropriate entity codes.
      PART_OF_SPEECH = 'pos'.freeze

      # Within each sense will be one or more "glosses", i.e.
      # target-language words or phrases which are equivalents to the
      # Japanese word. This element would normally be present, however it
      # may be omitted in entries which are purely for a cross-reference.
      GLOSS = 'gloss'.freeze

      CROSSREFERENCE = 'xref'.freeze
    end

    # Constants for selecting the search language.
    # Used in the "gloss" element's xml:lang attribute.
    # :eng never appears as a xml:lang constant because gloss is assumed to be English when not specified
    # :jpn never appears as a xml:lang because the dictionary itself pivots around Japanese
    module Languages
      JAPANESE  = :jpn
      ENGLISH   = :eng
      DUTCH     = :dut
      FRENCH    = :fre
      GERMAN    = :ger
      RUSSIAN   = :rus
      SPANISH   = :spa
      SLOVENIAN = :slv
      SWEDISH   = :swe
      HUNGARIAN = :hun
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'jdict'
2
+ require 'dictionary'
3
+
4
module JDict
  # Dictionary whose source file is the 'JMdict' file inside the configured
  # dictionary directory.
  class JMDict < Dictionary
    # @param index_path [String] directory holding the search index
    # @param lazy_index_loading [Boolean] passed through to Dictionary#initialize
    def initialize(index_path = JDict.configuration.index_path, lazy_index_loading=JDict.configuration.lazy_index_loading)
      # File.join avoids doubled or missing separators in the configured path.
      path = File.join(JDict.configuration.dictionary_path, 'JMdict')
      super(index_path, path, lazy_index_loading)
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'jdict'
2
+ require 'index'
3
+
4
module JDict
  # A searchable dictionary backed by a DictIndex full-text search index.
  class Dictionary
    attr_reader :entries_cache, :lazy_index_loading

    # @param index_path [String] directory holding the search index
    # @param dictionary_path [String, nil] path to the dictionary file, if any
    # @param lazy_index_loading [Boolean] build the index on first search
    #   instead of at construction time
    # @raise [RuntimeError] if a dictionary path is given but does not exist
    def initialize(index_path = JDict.configuration.index_path, dictionary_path = nil, lazy_index_loading = JDict.configuration.lazy_index_loading)
      if dictionary_path && !File.exist?(dictionary_path)
        raise "Dictionary not found at path #{dictionary_path}"
      end

      # store some args for future reference
      @dictionary_path = dictionary_path
      @lazy_index_loading = lazy_index_loading

      @entries = []
      @entries_cache = []

      # instantiate and load the full-text search index
      @index = DictIndex.new(index_path, dictionary_path, lazy_index_loading)
    end

    # @return [Integer] number of entries held in @entries
    def size
      @entries.size
    end

    # @return [Boolean] whether the underlying index reports itself as built
    def loaded?
      @index.built?
    end

    # Search this dictionary's index for the given string.
    # @param query [String] the search query
    # @return [Array(Entry)] the results of the search; empty for a nil or
    #   empty query
    def search(query)
      return [] if query.nil? || query.empty?

      load_index if lazy_index_loading && !loaded?

      @index.search(query)
    end

    # Retrieves the definition of a part-of-speech from its abbreviation
    # @param pos [String] the abbreviation for the part-of-speech
    # @return [String] the full description of the part-of-speech
    def get_pos(pos)
      @index.get_pos(pos)
    end

    private

    # Builds the index.
    # @raise [RuntimeError] if the index is already loaded (the original code
    #   created this exception but never raised it)
    def load_index
      raise "Dictionary index is already loaded" if loaded?

      @index.build
    end
  end
end
@@ -0,0 +1,79 @@
1
+ #include Constants #XML constants from the dictionary file
2
+
3
+ # Entries consist of kanji elements, kana elements,
4
+ # general information and sense elements. Each entry must have at
5
+ # least one kana element and one sense element. Others are optional.
6
module JDict
  # A dictionary entry: its kanji writings, kana readings and senses, plus the
  # conversion to and from the flat strings stored in the SQLite index.
  class Entry

    attr_accessor :kanji, :kana, :senses

    KANA_RE = /^kana/
    SENSE_RE = /^sense/
    PART_OF_SPEECH_RE = /^\[\[([^\]]+)\]\]\s+/

    MEANING_SENTINEL = '**'
    PART_OF_SPEECH_SENTINEL = '$$'
    SENSE_SENTINEL = '%%'
    LANGUAGE_SENTINEL = '&&'
    GLOSS_SENTINEL = '@@'

    # Matches the "[[pos$$pos]] " prefix written by #to_sql, including the
    # empty form "[[]] " produced for a sense with an empty part-of-speech list.
    SENSE_PREFIX_RE = /\A\[\[([^\]]*)\]\]\s+/
    private_constant :SENSE_PREFIX_RE

    # Create a new Entry
    # @param kanji [Array<String>] kanji writings
    # @param kana [Array<String>] kana readings
    # @param senses [Array<Sense>] senses of the entry
    def initialize(kanji, kana, senses)
      @kanji, @kana, @senses = kanji, kana, senses
    end

    # Converts an SQLite row from the index to the Entry format
    # @param row [#[]] row with "kanji", "kana" and "senses" string columns
    # @return [Entry] the entry; each sense carries the glosses of the
    #   configured language, falling back to English (nil when neither exists)
    def self.from_sql(row)
      kanji = row["kanji"].split(", ").map { |k| k.force_encoding("UTF-8") }
      kana = row["kana"].split(", ").map { |k| k.force_encoding("UTF-8") }

      senses = row["senses"].split(SENSE_SENTINEL).sort.map do |txt|
        parts_of_speech, gloss_text = split_sense_text(txt)
        glosses = parse_glosses(gloss_text)
        glosses_for_lang = glosses[JDict.configuration.language] ||
                           glosses[JDict::JMDictConstants::Languages::ENGLISH]
        Sense.new(parts_of_speech, glosses_for_lang)
      end

      new(kanji, kana, senses)
    end

    # Splits one serialized sense into its part-of-speech list (nil when there
    # is none) and the remaining gloss text. Text without a "[[...]] " prefix
    # is returned whole instead of having its first characters cut off.
    def self.split_sense_text(txt)
      prefix = txt.match(SENSE_PREFIX_RE)
      return [nil, txt] unless prefix

      tags = prefix[1]
      [tags.empty? ? nil : tags.split(PART_OF_SPEECH_SENTINEL), prefix.post_match]
    end
    private_class_method :split_sense_text

    # Parses "lang&&m1**m2@@lang&&m3" into { lang: [[m1, m2]], ... }.
    # Malformed chunks without a language or meaning part are skipped.
    def self.parse_glosses(gloss_text)
      chunks = gloss_text.force_encoding("UTF-8").strip.split(GLOSS_SENTINEL)
      chunks.each_with_object({}) do |chunk, glosses|
        lang, meaning_string = chunk.split(LANGUAGE_SENTINEL, 2)
        next if lang.nil? || lang.empty? || meaning_string.nil?

        (glosses[lang.to_sym] ||= []) << meaning_string.split(MEANING_SENTINEL)
      end
    end
    private_class_method :parse_glosses

    # Converts an Entry to the bind-parameter hash inserted into the SQLite database
    # @return [Hash] ':kanji', ':kana' and ':senses' mapped to their serialized strings
    def to_sql
      sense_strings = senses.map do |s|
        prefix = s.parts_of_speech ? "[[#{s.parts_of_speech.join(PART_OF_SPEECH_SENTINEL)}]] " : ''
        body = s.glosses.map { |lang, texts| lang.to_s + LANGUAGE_SENTINEL + texts.join(MEANING_SENTINEL) }.join(GLOSS_SENTINEL)
        prefix + body
      end

      {
        ':kanji' => kanji.join(", "),
        ':kana' => kana.join(", "),
        ':senses' => sense_strings.join(SENSE_SENTINEL)
      }
    end

    # Get an array of +Senses+ for the specified language
    # @param l [Symbol] language code
    # @return [Array<Sense>] senses tagged with that language, or, for senses
    #   without a language tag, those whose gloss hash has that language as a key
    def senses_by_language(l)
      senses.select do |s|
        if s.respond_to?(:language) && s.language
          s.language == l
        else
          s.glosses.respond_to?(:key?) && s.glosses.key?(l)
        end
      end
    end
  end
end
@@ -0,0 +1,346 @@
1
+ # encoding: utf-8
2
+ require 'rubygems' #use gems
3
+ require 'bundler/setup' #load up the bundled environment
4
+
5
+ require 'amalgalite'
6
+ require 'libxml' #XML parsing
7
+ require 'fileutils'
8
+
9
+ require_relative 'constants' #XML constants from the dictionary file
10
+
11
+ require_relative 'entry' #dictionary elements
12
+ require_relative 'kanji' #...
13
+ require_relative 'kana' #...
14
+ require_relative 'sense'
15
+
16
+ require 'amalgalite'
17
+
18
+ include LibXML
19
+
20
+ module JDict
21
+ class DictIndex
22
+
23
+ LANGUAGE_DEFAULT = JDict::JMDictConstants::Languages::ENGLISH
24
+ NUM_ENTRIES_TO_INDEX = 50
25
+ ENTITY_REGEX = /<!ENTITY\s([^ ]*)\s\"(.*)">/
26
+
27
+ attr_reader :path
28
+ # Initialize a full-text search index backend for JMdict
29
+ # @param index_path [String] desired filesystem path where you'd like the *search index* stored
30
+ # @param dictionary_path [String] desired filesystem path where you'd like the *dictionary* stored
31
+ # @param lazy_loading [Boolean] lazily load the index just when it's needed, instead of building it ahead of time
32
# Opens (creating it if needed) the SQLite database "fts5.db" inside
# +index_path+, makes sure the search table exists and, unless lazy loading
# is requested, builds the index when it is empty or when debug mode is on.
# @param index_path [String] directory of the search index
# @param dictionary_path [String, nil] path of the dictionary file
# @param lazy_loading [Boolean] skip building the index in the constructor
# @raise [RuntimeError] if index_path is nil or dictionary_path does not exist
def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
  raise "Index path was nil" if index_path.nil?

  if dictionary_path && !File.exist?(dictionary_path)
    raise "Dictionary not found at path #{dictionary_path}"
  end

  @path = index_path
  @dictionary_path = dictionary_path
  @pos_hash = {}

  # create path if nonexistent
  FileUtils.mkdir_p(@path)
  db_file = File.join(@path, "fts5.db")

  # debug mode always starts from a fresh database file
  File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  # check whether the search table already holds rows
  already_built = built?

  # build the index right now if "lazy loading" isn't on and the index is
  # empty (or debug mode forces a rebuild)
  build unless lazy_loading || (already_built && !JDict.configuration.debug)

  # make the hash from abbreviated parts of speech to full definitions
  build_pos_hash
end
63
+
64
+ # Creates the SQL schema for the Amalgalite database
65
# Ensures the FTS5 virtual table "search" exists in the database.
# Does nothing when the schema already lists the table; otherwise creates it
# and reloads the cached schema.
def create_schema
  return if @index.schema.tables['search']

  ddl = <<-SQL
      CREATE VIRTUAL TABLE search USING fts5(
        kanji,
        kana,
        senses
      );
  SQL
  @index.execute_batch(ddl)
  @index.reload_schema!
end
78
+
79
+ # Returns the search results as an array of +Entry+
80
+ # @param term [String] the search string
81
+ # @param language [Symbol] the language to return results in
82
+ # @return [Array(Entry)] the results of the search
83
# Returns the search results as an array of +Entry+
# @param term [String] the search string
# @param language [Symbol] accepted for interface compatibility; not used by
#   the query
# @param exact [Boolean] when false the term is matched as a prefix
# @return [Array(Entry)] the matching entries, in the order the database
#   returned them (every result used to get the same score, so no re-sorting
#   is needed)
# @raise [RuntimeError] if the index path does not exist
def search(term, language=LANGUAGE_DEFAULT, exact=false)
  raise "Index not found at path #{@path}" unless File.exist?(@path)

  @entries_cache = []

  # The term goes into an FTS5 string literal: embedded double quotes are
  # escaped by doubling them so they cannot terminate the phrase.
  phrase = "\"#{term.to_s.gsub('"', '""')}\""
  query = "{kanji kana senses} : #{phrase}"
  query += "*" unless exact

  # Bind the MATCH expression and the limit instead of interpolating them
  # into the SQL text, so quotes in the term cannot break or alter the statement.
  sql = "SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?"

  results = []
  @index.execute(sql, query, JDict.configuration.num_results) do |row|
    results << Entry.from_sql(row)
  end

  @entries_cache = []

  results
end
135
+
136
# Whether the index holds any rows.
# @return [Boolean] true when the row count of the search table is not zero
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  row_count != 0
end
137
+
138
+ # Builds the full-text search index
139
+ # @param overwrite [Boolean] force a build even if the index path already exists
140
+ # @param dictionary_path [String] path to the dictionary file
141
+ # @return [Integer] the number of indexed entries
142
# Builds the full-text search index by streaming the dictionary XML and
# inserting one row per entry inside a single transaction.
# @param overwrite [Boolean] accepted for interface compatibility; not used
# @param dictionary_path [String] path to the dictionary file; replaces the
#   path stored on the index when given
# @return [Integer] the number of indexed entries
# @raise [RuntimeError] if no dictionary path is known, the file is missing,
#   or an entry has no kana reading
def build(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)

  puts "Building index..."

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  # components of the entry / sense currently being read
  kanji, kana, senses = [], [], []
  glosses = {}
  parts_of_speech = []

  entries_added = 0

  begin
    @index.transaction do |db_transaction|
      # prepare the insert once and reuse it for every entry
      db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|

        # read until the end
        while reader.read

          # check what type of node we're currently on
          case reader.node_type

          # start-of-element node
          when XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              # TODO: add the entry sequence number to the entry
              reader.next_text

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            # JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.to_s.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.to_s.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              # glosses without a language attribute use the default language
              language = (reader.node.lang || LANGUAGE_DEFAULT).to_sym
              text = reader.next_text
              (glosses[language] ||= []) << text unless text.to_s.empty?

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          # end-of-element node
          when XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              # build sense, then clear data for the next sense
              senses << Sense.new(parts_of_speech, glosses)
              glosses = {}
              parts_of_speech = []

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              stmt.execute(Entry.new(kanji, kana, senses).to_sql)

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1

              # debug mode only indexes the first few entries
              break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
            end
          end
        end
      end
    end
  ensure
    # close the reader even when parsing or indexing failed
    reader.close
  end

  entries_added
end
260
+
261
# Rebuilds the index from scratch: removes every row from the search table
# and builds it again. The previous implementation raised whenever the index
# path existed, which is always the case once #initialize has created it.
# @return [Object] whatever #build returns
def rebuild
  @index.execute("DELETE FROM search")
  build
end
265
+
266
+ # Creates an XML::Reader object for the given path
267
+ # @param dictionary_path [String] path to the dictionary file
268
+ # @return [XML::Reader] the reader for the given dictionary
269
# Creates an XML::Reader object for the given path
# @param dictionary_path [String] path to the dictionary file
# @return [XML::Reader] the reader for the given dictionary
# @raise [RuntimeError] if no reader could be created
def open_reader(dictionary_path)
  # The former Dir.chdir(Dir.pwd) wrapper changed into the directory the
  # process was already in, so the reader is opened directly.
  reader = XML::Reader.file(dictionary_path, :encoding => XML::Encoding::UTF_8)
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

  reader
end
279
+
280
+ # Creates the hash of part-of-speech symbols to full definitions from the dictionary
281
# Creates the hash of part-of-speech symbols to full definitions from the
# <!ENTITY ...> declarations in the dictionary's document type node.
#
# The hash is only read once: a non-empty @pos_hash is returned as is. The
# previous `@pos_hash ||=` guard never ran because @pos_hash starts out as an
# empty (truthy) hash.
# @return [Hash{Symbol => String}] abbreviation symbol to description; empty
#   when no dictionary path is known or the file has no entity declarations
def build_pos_hash
  return @pos_hash unless @pos_hash.nil? || @pos_hash.empty?

  @pos_hash = {}
  return @pos_hash if @dictionary_path.nil?

  reader = open_reader(@dictionary_path)
  begin
    # stop at end of input instead of looping forever
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # the entities are scanned out of the doctype's string form
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          @pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # first element reached without a doctype: nothing to read
        break
      end
    end
  ensure
    reader.close
  end

  @pos_hash
end
310
+
311
+ # Converts a part-of-speech entity reference string into a symbol
312
+ # @param entity [String] the entity reference string
313
+ # @return [Symbol] the part-of-speech symbol
314
# Turns a part-of-speech entity name into a symbol, replacing every hyphen
# with an underscore.
# @param entity [String] the entity reference string
# @return [Symbol] the part-of-speech symbol
def pos_to_sym(entity)
  underscored = entity.tr('-', '_')
  underscored.to_sym
end
317
+
318
+ # Retrieves the definition of a part-of-speech from its abbreviation
319
+ # @param pos [String] the abbreviation for the part-of-speech
320
+ # @return [String] the full description of the part-of-speech
321
# Looks up the full description of a part-of-speech abbreviation.
# Calls build_pos_hash first when the hash is still empty.
# @param pos [String] the abbreviation for the part-of-speech
# @return [String] the full description of the part-of-speech
def get_pos(pos)
  build_pos_hash if @pos_hash.empty?
  key = pos_to_sym(pos)
  @pos_hash[key]
end
325
+ end
326
+
327
+ # Add custom parsing methods to XML::Reader
328
+ class XML::Reader
329
+
330
+ public
331
+ # Get the next text node
332
+ def next_text
333
+ # read until a text node
334
+ while (self.node_type != XML::Reader::TYPE_TEXT and self.read); end
335
+ self.value
336
+ end
337
+ # Get the next entity node
338
+ def next_entity
339
+ # read until an entity node
340
+ while (self.node_type != XML::Reader::TYPE_ENTITY and
341
+ self.node_type != XML::Reader::TYPE_ENTITY_REFERENCE and
342
+ self.read); end
343
+ self.value
344
+ end
345
+ end
346
+ end
@@ -0,0 +1,20 @@
1
+ require 'configuration'
2
+ require 'dictionaries/jmdict'
3
+
4
# Library-wide configuration access.
module JDict
  class << self
    # Only the writer is generated: the reader below lazily builds the default
    # configuration, and generating both would redefine the reader.
    attr_writer :configuration

    # @return [Configuration] the current configuration, created on first use
    def configuration
      @configuration ||= Configuration.new
    end

    # Replaces the current configuration with a fresh default one.
    # @return [Configuration] the new configuration
    def reset
      @configuration = Configuration.new
    end

    # Yields the current configuration so the caller can change settings.
    # @yieldparam configuration [Configuration]
    def configure
      yield(configuration)
    end
  end
end
@@ -0,0 +1,4 @@
1
module JDict
  # Placeholder class for kana elements; it defines no behavior yet.
  class Kana; end
end
@@ -0,0 +1,4 @@
1
module JDict
  # Placeholder class for kanji elements; it defines no behavior yet.
  class Kanji; end
end
@@ -0,0 +1,3 @@
1
module JDict
  # Gem version, under the conventional constant name.
  VERSION = '0.0.1'.freeze
  # Original constant name, kept for code that already references it.
  Version = VERSION
end
@@ -0,0 +1,14 @@
1
+ # The sense element will record the translational equivalent
2
+ # of the Japanese word, plus other related information. Where there
3
+ # are several distinctly different meanings of the word, multiple
4
+ # sense elements will be employed.
5
module JDict
  # One meaning of a dictionary entry: its parts of speech plus the glosses
  # (translations) recorded for it.
  class Sense
    attr_reader :parts_of_speech, :glosses, :language

    # Create a new +Sense+
    # @param parts_of_speech [Array, nil] part-of-speech tags for this sense
    # @param glosses [Object] the glosses for this sense, stored as given
    # @param language [Symbol, nil] optional language tag; exposed through
    #   #language so that Entry#senses_by_language has something to compare
    def initialize(parts_of_speech, glosses, language = nil)
      @parts_of_speech = parts_of_speech
      @glosses = glosses
      @language = language
    end
  end
end
@@ -0,0 +1,63 @@
1
module JDict
  # Helpers for classifying strings by the Unicode ranges of their characters.
  module Unicode
    # Codepoint ranges for japanese unicode characters (in decimal)
    # from: http://unicode.org/charts/
    module CodepointRanges
      HIRAGANA           = 12352..12447
      KATAKANA           = 12448..12543
      KATAKANA_PHONETIC  = 12784..12799
      HALFWIDTH_KATAKANA = 65280..65519
      UNIFIED_CJK        = 19968..40911
      UNIFIED_CJK_EXT_A  = 13312..19903
      UNIFIED_CJK_EXT_B  = 131072..173791
      PUNCTUATION        = 12288..12351
    end

    # Returns the codepoint of the first character of the given string as an
    # Integer.
    def hex_codepoint(unicode_char)
      codepoints = unicode_char.unpack("U0U*")
      codepoints.first
    end

    # TODO: write unit test with a variety of strings to ensure this method
    # returns the expected output
    #
    # Determine the script of the specified string: :kanji, :kana or :english.
    # Scanning stops at the first kana (or Japanese punctuation) character;
    # otherwise the classification of the last character wins. An empty
    # string yields ''.
    def script_type?(unicode_string)
      kana_ranges = [
        CodepointRanges::HIRAGANA,
        CodepointRanges::KATAKANA,
        CodepointRanges::KATAKANA_PHONETIC,
        CodepointRanges::HALFWIDTH_KATAKANA,
        CodepointRanges::PUNCTUATION
      ]
      kanji_ranges = [
        CodepointRanges::UNIFIED_CJK,
        CodepointRanges::UNIFIED_CJK_EXT_A,
        CodepointRanges::UNIFIED_CJK_EXT_B
      ]

      detected = ''
      unicode_string.each_char do |char|
        codepoint = hex_codepoint(char)

        if kana_ranges.any? { |range| range.include?(codepoint) }
          detected = :kana
          break
        end

        detected = kanji_ranges.any? { |range| range.include?(codepoint) } ? :kanji : :english
      end
      detected
    end

    # True when script_type? classifies the string as :kanji or :kana.
    def japanese?(unicode_string)
      %i[kanji kana].include?(script_type?(unicode_string))
    end

    # True when script_type? classifies the string as :english.
    def english?(unicode_string)
      script_type?(unicode_string) == :english
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require "spec_helper"
2
+ require 'configuration'
3
+
4
module JDict
  describe Configuration do
    describe "#debug" do
      it "default value is false" do
        # assert the default instead of assigning to it
        expect(Configuration.new.debug).to eq(false)
      end
    end

    describe "#debug=" do
      it "can set value" do
        config = Configuration.new
        config.debug = true
        expect(config.debug).to eq(true)
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
+ require 'spec_helper'
2
+ require BASE_PATH + '/lib/dictionary'
3
+ #require BASE_PATH + '/lib/jmdict'
4
+
5
# Shared paths and a custom "increase" matcher for the dictionary specs.
module DictionarySpecHelper
  JMDICT_PATH = File.join(BASE_PATH, 'dictionaries', 'JMdict')
  INDEX_PATH  = File.join(BASE_PATH, 'index')

  def mock_index
  end

  # Matcher that passes when the measured value (converted with to_i) is
  # greater after calling the target than it was before.
  class Increase
    # @param measure_proc [Proc] block returning the value to compare
    def initialize(&measure_proc) # + args
      @measure_proc = measure_proc
    end

    # Measures, calls +target+, measures again and compares the two values.
    # @param target [#call] the action expected to increase the value
    # @return [Boolean]
    def matches?(target)
      @target = target
      @original_value = @measure_proc.call
      target.call
      @new_value = @measure_proc.call
      @new_value.to_i > @original_value.to_i
    end

    def failure_message
      "expected #{@new_value} to be greater than #{@original_value}"
    end

    def negative_failure_message
      "expected #{@new_value} to not be greater than #{@original_value}"
    end
    # RSpec 3 asks matchers for this name; keep the old one as well.
    alias_method :failure_message_when_negated, :negative_failure_message

    def description
      "increase #{@original_value}"
    end
  end

  # Builds an Increase matcher around the given measuring block.
  def increase(&measure_proc) # + args
    Increase.new(&measure_proc)
  end
end
42
+
43
# Specs for JDict::Dictionary; INDEX_PATH and JMDICT_PATH come from
# DictionarySpecHelper.
module DictionarySpec
  include DictionarySpecHelper

  describe JDict::Dictionary do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "is searchable" do
      @dictionary.should respond_to(:search)
    end

    it "can tell you whether or not it's loaded" do
      @dictionary.should respond_to(:loaded?)
    end

    it "should generate fixtures" do
      pending
      @dictionary.should respond_to(:generate_fixtures)
    end
  end

  describe JDict::Dictionary, "after initialization" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "has no entries" do
      @dictionary.size.should == 0
    end

    it "has an empty entries cache" do
      # assert on the cache; the bare empty? call checked nothing
      @dictionary.entries_cache.should be_empty
    end
  end

  describe JDict::Dictionary, "when loading from a dictionary file" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "has at least 1 entry" do
      pending("implement loading from index")
      @dictionary.load(JMDICT_PATH)
      @dictionary.size.should > 0
    end

    it "says it's loaded" do
      pending("implement loading from index")
      @dictionary.load(JMDICT_PATH)
      @dictionary.loaded?.should equal(true)
    end
  end

  describe JDict::Dictionary, "when loading from a dictionary file (already loaded)" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "has the same size as it did before being loaded"
  end

  describe JDict::Dictionary, "when searching" do
    before do
      @dictionary = JDict::Dictionary.new(INDEX_PATH)
    end

    it "should raise an error if an index isn't built yet"
    it "should give no results if the search phrase is empty" do
      @dictionary.search('').should be_empty
    end
  end

end
@@ -0,0 +1,33 @@
1
+ <JMdict>
2
+ <entry>
3
+ <ent_seq>1171270</ent_seq>
4
+ <k_ele>
5
+ <keb>右翼</keb>
6
+ <ke_pri>ichi1</ke_pri>
7
+ <ke_pri>news1</ke_pri>
8
+ <ke_pri>nf04</ke_pri>
9
+ </k_ele>
10
+ <r_ele>
11
+ <reb>うよく</reb>
12
+ <re_pri>ichi1</re_pri>
13
+ <re_pri>news1</re_pri>
14
+ <re_pri>nf04</re_pri>
15
+ </r_ele>
16
+ <sense>
17
+ <pos>&n;</pos>
18
+ <gloss>right-wing</gloss>
19
+ <gloss g_lang="fr">aile droite (oiseau, armée, parti politique, base-ball)</gloss>
20
+ <gloss g_lang="ru">пра́вое крыло́</gloss>
21
+ <gloss g_lang="ru">пра́вый фланг</gloss>
22
+ <gloss g_lang="de">rechter Flügel</gloss>
23
+ </sense>
24
+ <sense>
25
+ <gloss g_lang="de">{Sport}</gloss>
26
+ <gloss g_lang="de">rechte Flanke</gloss>
27
+ <gloss g_lang="de">rechter Flügel</gloss>
28
+ </sense>
29
+ <sense>
30
+ <gloss g_lang="de">die Rechte</gloss>
31
+ </sense>
32
+ </entry>
33
+ </JMdict>
@@ -0,0 +1,84 @@
1
+ require 'rubygems'
2
+
3
+ require File.dirname(__FILE__) + '/spec_helper'
4
+ require BASE_PATH + '/lib/dictionary'
5
+ require BASE_PATH + '/lib/jmdict'
6
+ require BASE_PATH + '/lib/index'
7
+
8
+ require 'fileutils'
9
+
10
# Placeholder mixin for the index specs; it currently defines nothing.
module IndexSpecHelper; end
12
+
13
# Interface checks for DictIndex: each example only verifies that the
# index responds to the named method, plus one constructor failure case.
describe JDict::DictIndex do
  include IndexSpecHelper

  before(:each) do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
  end

  # Searching
  it "is searchable" do
    expect(@index).to respond_to(:search)
  end

  # Building
  it "is buildable" do
    expect(@index).to respond_to(:build) # and return an index
  end

  it "is rebuildable" do
    expect(@index).to respond_to(:rebuild)
  end

  it "tells whether it's built or not" do
    expect(@index).to respond_to(:built?)
  end

  # Destroying
  it "is destroyable" do
    expect(@index).to respond_to(:destroy)
  end

  it "raises an error if an invalid dictionary path is specified" do
    expect { JDict::DictIndex.new(INDEX_PATH, 'bad_dictionary_path') }.to raise_error
  end
end
45
+
46
# A DictIndex built from only an index path must expose a usable path.
describe JDict::DictIndex, "after initialization" do
  it "the path should be set" do
    @index = JDict::DictIndex.new(INDEX_PATH)
    expect(@index.path).not_to be_nil
    # `be('')` compares object identity, which a fresh '' literal never
    # satisfies, so the previous assertion could not fail. Compare by value.
    expect(@index.path).not_to eq('')
  end
end
53
+
54
# Building the index must leave something on disk at INDEX_PATH.
describe JDict::DictIndex, "when building," do
  it "it is created on the file system" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH)
    @index.build
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    expect(File.exist?(INDEX_PATH)).to eq(true)
  end

  it "its directory on the file system shouldn't be empty" do
    @index = JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH,
                                  false) # no lazy loading
    @index.build
    # .
    # ..
    # ^-------- an empty directory has only these 2 entries
    expect(Dir.entries(INDEX_PATH).size).to be >= 3
  end

  # No block is given, so RSpec reports this example as pending.
  it "loads from a dictionary file"
end
73
+
74
# Rebuilding requires an existing index; without one it must raise.
describe JDict::DictIndex, "when rebuilding" do
  include FileUtils

  it "raises an error if it doesn't already exist" do
    # Remove any index left behind by earlier examples.
    rm_rf(INDEX_PATH)
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    expect(File.exist?(INDEX_PATH)).to eq(false)
    expect {
      JDict::DictIndex.new(INDEX_PATH, JMDICT_PATH).rebuild
    }.to raise_error
  end
end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+
3
# Checks that a path assigned through JDict.configure can be read back.
describe JDict do
  describe "#configure" do
    before(:each) do
      JDict.configure do |config|
        config.dictionary_path = DICT_PATH
        config.debug = true
      end
    end

    # NOTE(review): `JDICT` (all caps) and `DICT_PATH` are not defined in
    # any spec file visible here; `JDICT` looks like a typo for the `JDict`
    # module. Confirm the intended configuration reader before relying on
    # this example.
    it "uses the configured path" do
      configured_path = JDICT.dictionary_path
      expect(configured_path).to eq(DICT_PATH)
    end
  end
end
17
+
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+ require BASE_PATH + '/lib/dictionary'
3
+ require BASE_PATH + '/lib/jmdict'
4
+
5
# Constants shared by the JMDict specs.
module JMDictSpecHelper
  # Index location used by these specs: the "index" entry under BASE_PATH.
  INDEX_PATH = "#{BASE_PATH}/index"
end
8
+
9
# JMDict must be usable wherever a generic Dictionary is expected.
describe JDict::JMDict do
  include JMDictSpecHelper

  before(:each) { @jmdict = JDict::JMDict.new(JMDictSpecHelper::INDEX_PATH) }

  it do
    expect(@jmdict).to be_a_kind_of(JDict::Dictionary)
  end
end
@@ -0,0 +1,50 @@
1
+ require 'rubygems' #use gems
2
+ require 'bundler/setup' #load up the bundled environment
3
+ # require 'spec' #test framework
4
+
5
# Switch on Ruby's global debug flag for the spec run.
$DEBUG = true

# Paths shared by the specs, expressed relative to this file's parent directory.
BASE_PATH = "#{File.dirname(__FILE__)}/.."
INDEX_PATH = "#{BASE_PATH}/test_index"
JMDICT_PATH = "#{BASE_PATH}/dictionaries/JMdict"
10
+
11
+ ##
12
+ # rSpec Hash additions.
13
+ #
14
+ # From
15
+ # * http://wincent.com/knowledge-base/Fixtures_considered_harmful%3F
16
+ # * Neil Rahilly
17
+
18
class Hash

  ##
  # Filter keys out of a Hash. A pair is dropped when its key, or the
  # Symbol form of its key, appears in +keys+.
  #
  #   { :a => 1, :b => 2, :c => 3 }.except(:a)
  #   => { :b => 2, :c => 3 }
  #
  #   { 'a' => 1, :b => 2 }.except(:a)
  #   => { :b => 2 }
  #
  # Note: Ruby 3.0+ ships its own Hash#except; this definition replaces it.

  def except(*keys)
    reject do |k, _v|
      # Keys without a Symbol form (nil, Integer, ...) are compared as-is.
      keys.include?(k) || (k.respond_to?(:to_sym) && keys.include?(k.to_sym))
    end
  end

  ##
  # Override some keys. Returns a new Hash; the receiver is not modified.
  #
  #   { :a => 1, :b => 2, :c => 3 }.with(:a => 4)
  #   => { :a => 4, :b => 2, :c => 3 }

  def with(overrides = {})
    merge(overrides)
  end

  ##
  # Returns a Hash with only the pairs identified by +keys+. A pair is kept
  # when its key, or the Symbol form of its key, appears in +keys+.
  #
  #   { :a => 1, :b => 2, :c => 3 }.only(:a)
  #   => { :a => 1 }
  #
  #   { 'a' => 1, :b => 2 }.only(:a)
  #   => { 'a' => 1 }

  def only(*keys)
    reject do |k, _v|
      # Keys without a Symbol form (nil, Integer, ...) are compared as-is.
      !(keys.include?(k) || (k.respond_to?(:to_sym) && keys.include?(k.to_sym)))
    end
  end

end
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-jdict
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ian Pickering
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: libxml-ruby
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.8.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: amalgalite
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.5.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.5.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: autotest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 3.4.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 3.4.0
69
+ description:
70
+ email:
71
+ - ipickering2@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - LICENSING
77
+ - README.md
78
+ - Rakefile
79
+ - examples/query.rb
80
+ - lib/#sense.rb#
81
+ - lib/configuration.rb
82
+ - lib/constants.rb
83
+ - lib/dictionaries/jmdict.rb
84
+ - lib/dictionary.rb
85
+ - lib/entry.rb
86
+ - lib/index.rb
87
+ - lib/jdict.rb
88
+ - lib/kana.rb
89
+ - lib/kanji.rb
90
+ - lib/ruby-jdict/version.rb
91
+ - lib/sense.rb
92
+ - lib/unicode.rb
93
+ - spec/configuration_spec.rb
94
+ - spec/dictionary_spec.rb
95
+ - spec/fixtures/feeds/sample_entry.xml
96
+ - spec/index_spec.rb
97
+ - spec/jdict_spec.rb
98
+ - spec/jmdict_spec.rb
99
+ - spec/spec_helper.rb
100
+ homepage: https://github.com/Ruin0x11/ruby-jdict
101
+ licenses: []
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.4.5.1
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Ruby gem for accessing Jim Breen's Japanese dictionaries
123
+ test_files: []
124
+ has_rdoc: