kanjidic 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/kanjidic.rb +388 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d84cedfc8f473322799717d3323f921467f10a87
4
+ data.tar.gz: 330e0ec676b954de1173e782009ff2da206bc976
5
+ SHA512:
6
+ metadata.gz: 7e36868efafa9aa48252192ae3e3de1dc2fedc4329acf6297dd7b60e6ad49be7a83202380fdc25f5139fe07ca4e94927ceb3e995485f6fc8e9b7d6ceb7453288
7
+ data.tar.gz: fb481fcbde65c81a239bc86e5d259ed8b99fc519cc542b084af5e3dc2e9017f1f8e9e1c1278895dfa4d76c4f33b9e592fccfcf8de425449da9dc64156f1784e1
@@ -0,0 +1,388 @@
1
+ require "forwardable.rb"
2
+
3
# Parser and in-memory store for dictionary files in the KANJIDIC text format.
#
# Each parsed entry is a Hash extended with this module (so that +to_s+ gives a
# readable dump). Once a file is loaded with Kanjidic::open, any method that is
# not defined here is forwarded to the Array of entries (see method_missing).
module Kanjidic

  # Loaded entries (Array of Hash), or nil while no dictionary is loaded
  @@dic ||= nil
  # Cached Regexp built by Kanjidic::parser
  @@parser ||= nil

  # sym => [code, name, additional information]
  @@dictionaries ||= {
    :halpern => ['H', 'New Japanese-English Character Dictionary', '(1990), edited by Jack Halpern'],
    :nelson => ['N', 'Modern Reader\'s Japanese-English Character Dictionary', 'edited by Andrew Nelson'],
    :new_nelson => ['V', 'The New Nelson Japanese-English Character Dictionary', 'edited by John Haig'],
    :spahn_hadaminsky => ['DA', 'Kanji & Kana', '(2011), by Spahn & Hadamitzky'],
    :spahn_hadaminsky_2 => ['I', 'The Kanji Dictionary', "(1996), by Spahn and Hadaminsky"],
    :AJLT => ['DB', 'Japanese For Busy People', 'vols I-III, published by the AJLT'],
    :crowley => ['DC', 'The Kanji Way to Japanese Language Power', 'by Dale Crowley'],
    :hodges_okazaki => ['DF', 'Japanese Kanji Flashcards', 'by Max Hodges and Tomoko Okazaki (White Rabbit Press)'],
    :kodansha => ['DG', 'Kodansha Compact Kanji Guide'],
    :hensall => ['DH', 'A Guide To Reading and Writing Japanese', '3rd edition, edited by Ken Hensall et al'],
    :nishiguchi_kono => ['DJ', 'Kanji in Context', 'by Nishiguchi and Kono'],
    :halpern_2 => ['DK', 'Kanji Learners Dictionary (1999)', 'edited by Jack Halpern (Kodansha)'],
    :halpern_3 => ['DL', 'Kanji Learners Dictionary (2013)', 'edited by Jack Halpern (Kodansha)'],
    :maniette => ['DM', 'Les Kanji dans la tête', 'by Yves Maniette'],
    :heisig_6th => ['DN', 'Remembering The Kanji, 6th Edition', 'by James Heisig'],
    :oneil_2 => ['DO', 'Essential Kanji', 'by P.G. O\'Neill'],
    :halpern_4 => ['DP', 'Kodansha Kanji Dictionary', '(2013), by Jack Halpern'],
    :deroo => ['DR', '2001 Kanji', '(Bonjinsha), by Father Joseph De Roo'],
    :sakade => ['DS', 'A Guide To Reading and Writing Japanese', 'edited by Florence Sakade'],
    :kask => ['DT', 'Tuttle Kanji Cards', 'compiled by Alexander Kask. '],
    :henshall => ['E', 'A Guide To Remembering Japanese Characters', 'by Kenneth G. Henshall'],
    :gakken => ['K', 'A New Dictionary of Kanji Usage', 'by Nao\'omi Kuratani, Akemi Kobayashi'],
    :heisig => ['L', 'Remembering The Kanji', 'by James Heisig'],
    :oneil => ['O', 'Japanese Names', '(1972), by P.G. O\'Neill. (Weatherhill)'],
    :morohasidaikanwajiten => ['M', '大漢和辞典', "13 volumes, by Morohashi Tetsuji" ]
  }

  # sym => [code, description] for the codes that are not dictionary indexes
  @@additional_codes ||= {
    classification_radical: ['B', "Nelson classification radical (部首)"],
    classical_radical: ['C', "Classical radical (部首)"],
    frequency: ['F', "Frequency in newspapers"],
    grade: ['G', "Grade taught"],
    jlpt: ['J', "JLPT level"],
    pinyin: ['Y', "Pinyin"],
    hangul: ['W', "Hangul"],
    skip_code: ['P', "SKIP"],
    strokes: ['S', "Stroke count"],
    unicode: ['U', "Unicode value"],
    four_corner_index: ['Q', '"Four Corner" index'],
    crossreference: ['X', "Cross-reference code"],
    misclassification: ['Z', "Mis-classification code"]
  }

  # sym => description for the symbols that have no letter code of their own
  @@uncoded ||= {
    reading: "Reading",
    name_reading: "Name reading (名乗り)",
    radical_name: "Radical name",
    character: "Character",
    jis_code: "JIS code",
    meanings: "Meaning",
    kokuji: "Original Japanese character (国字)",
    dictionaries: "Dictionaries",
    number: 'Number',
    page: 'Page',
    position: "Position",
    both: "Stroke count and position",
    disagreement: "Disagreement over the number of strokes",
    undefined: "undefined"
  }

  # Cached code => handler table, see Kanjidic::codes
  @@codes ||= nil
  # Cached symbol => description table, see Kanjidic::all_symbols
  @@all_symbols ||= nil

  # code => handler for the codes needing more than "store subcode + value".
  #
  # Every handler receives (subcode, value, reading_switch) and returns the Hash
  # that Kanjidic::parse merges into the entry. These entries override the
  # generic handlers built in Kanjidic::codes for the same code letter.
  @@special_codes ||= {
    # Stores nothing itself: only tells the parser under which key the
    # readings that follow must be filed.
    'T' => ->(_, value, sup) {
      case value.to_i
      when 1 then sup.call(:name_reading)
      when 2 then sup.call(:radical_name)
      end
      {}
    },
    # Subcode 'N' is filed as :number and 'P' as :page; anything else goes to
    # :undefined (like the 'Z' handler) instead of under a nil key.
    'M' => ->(subcode, value, _) {
      field = case subcode
              when 'N' then :number
              when 'P' then :page
              else :undefined
              end
      { dictionaries: { morohasidaikanwajiten: { field => value } } }
    },
    # Cross-reference: subcode 'J' is a JIS code, any other subcode is looked
    # up as a regular code and delegated to its handler.
    'X' => ->(subcode, value, _) {
      target =
        if subcode == "J"
          { jis_code: value }
        elsif (handler = Kanjidic.codes[subcode])
          handler.call("", value, proc {})
        else
          { undefined: value }
        end
      { crossreference: target }
    },
    # Only the first letter of the subcode selects the key.
    'Z' => ->(subcode, value, _) {
      kind = case subcode[0]
             when 'S' then :strokes
             when 'P' then :position
             when 'B' then :both
             when 'R' then :disagreement
             else :undefined
             end
      { misclassification: { kind => value } }
    },
    'IN' => ->(_, value, _) {
      { dictionaries: { spahn_hadaminsky: value } }
    }
  }

  # Load the Kanji dictionary
  #
  # Load a file at the location given in argument in the KANJIDIC format and parse it into a data structure in memory.
  # The second argument is converted to a String and prefixed to the JIS code of every entry.
  #
  # Raise an exception if a file has already been loaded. See also Kanjidic::close, Kanjidic::expand
  def self.open filename, jis
    raise "Kanjidic already open (use Kanjidic::close first if you want to reload it, or Kanjidic::expand if you want to extend it)" if @@dic
    @@dic = build(filename, jis)
  end

  # Expand the Kanji dictionary
  #
  # Load a file, parse it and add its entries to the in-memory dictionary.
  # When nothing is loaded yet the file simply becomes the dictionary (this
  # used to fail with a NoMethodError on nil).
  def self.expand filename, jis
    # Parse first, so that a failure leaves the current state untouched
    entries = build(filename, jis)
    @@dic = (@@dic || []).concat(entries)
  end

  # Close the Kanji dictionary
  #
  # The Kanjidic is a big file, resulting in a big structure in memory.
  #
  # Use this function if you need to close it
  def self.close
    @@dic = nil
    GC.start
  end

  # Checks whether the Kanjidic is loaded
  #
  # Returns true if a Kanjidic is available to use through the Kanjidic module interface, false otherwise.
  def self.open?
    !!@@dic
  end

  # Parse a Kanjidic file
  #
  # Parse the file at the location given in argument and return an Array
  # holding one entry per line accepted by Kanjidic::parse (lines for which it
  # returns nil are skipped).
  def self.build filename, jis
    File.open(filename) do |f|
      f.each_line.map { |l| parse(l, jis) }.compact
    end
  end

  # Parse a string in Kanjidic format
  #
  # Returns nil if the string is empty or doesn't start with a kanji, otherwise
  #
  # Returns a Hash containing the Kanji informations found in the String given in argument.
  # The Hash is extended with the Kanjidic module.
  #
  # Refer to the Kanjidic homepage for details about the accepted structure of the string.
  def self.parse line, jis
    # Anything that doesn't start with a (supposedly) kanji is treated as a comment.
    # \A rather than ^ so that only the very first character is inspected.
    return nil if line.empty? || line =~ /\A[[:ascii:]]/
    # An element is either a {brace delimited text} or a run of non blank characters
    elements = line.scan(/\{[^}]+\}|\S+/)
    # Interpolation keeps a line without JIS field from raising a TypeError
    kanji = { character: elements.shift, jis_code: "#{jis}#{elements.shift}", dictionaries: {} }
    kanji.extend self
    # Key under which readings are filed, switched by the 'T' code handler
    kana = :reading
    switch_reading = ->(n) { kana = n }
    elements.each do |e|
      # We'll only consider the first match, because reasons
      # (namely a well formed file should never yield more than 1 match array)
      matches = e.scan(parser)[0]
      if matches.nil?
        _insert kanji, { undefined: e }
        next
      end
      # The number of groups left tells which alternative matched, see Kanjidic::parser
      matches.compact!
      case matches.length
      when 1 # It's a reading
        _insert kanji, { kana => matches[0] }
      when 2 # It's a meaning
        m = matches[1]
        if m == "(kokuji)"
          kanji[:kokuji] = true
        else
          _insert kanji, { meanings: m }
        end
      when 3 # It's a code
        code, subcode, value = *matches
        _insert kanji, codes[code].call(subcode, value, switch_reading)
      else raise "Unhandled case"
      end
    end
    kanji
  end

  # Builds a Regexp for line parsing
  #
  # Builds a Regexp based on the keys of the Hash returned by Kanjidic::codes.
  #
  # Takes a boolean parameter to indicate whether the regexp should be constructed from
  # scratch as opposed to retrieved from a cached value, false by default (returns the cache).
  #
  # The resulting regexp will return matches as follow:
  #
  # 3 groups (code, sub code, value) if the element is code based,
  #
  # 2 groups ("{", content) if it is a bracket delimited string,
  #
  # 1 group (content) if it is a string of japanese characters
  def self.parser reload = false
    return @@parser if @@parser and !reload
    # Assemble the codes in a DA|DB|...|IN|H|N|... fashion. The longest codes
    # must come first: alternatives are tried in order, so with 'I' ahead of
    # 'IN' an element such as IN1616 was read as code I with sub code N.
    dic_codes = codes.keys.sort_by { |c| -c.length }.map { |c| Regexp.escape(c) }.join("|")
    # The format is dic_code + optionaly 1 or 2 uppercase letters + kanji_code
    # OR {text with spaces} OR <japanese characters>
    @@parser = /(#{dic_codes})([A-Z]{0,2})(.+)|(\{)(.*)\}|(\W+)/
  end

  # Return a hash of all the informations that will be used when building the dictionary
  #
  # The Hash maps every code to a lambda taking (sub code, value, reading switch)
  # and returning the Hash to merge into the entry. It is build from the values
  # returned by Kanjidic::dictionaries, Kanjidic::additional_codes and
  # Kanjidic::special_codes (the later ones taking precedence) and cached for further use.
  #
  # The parameter in a boolean indicating whether the value should be
  # fetched from the cache or rebuild (default to false: from cache)
  def self.codes reload = false
    return @@codes if @@codes and !reload
    table = {}
    dictionaries.each do |sym, info|
      table[info[0]] = ->(s, v, _) { { dictionaries: { sym => s + v } } }
    end
    additional_codes.each do |sym, info|
      table[info[0]] = ->(s, v, _) { { sym => s + v } }
    end
    @@codes = table.merge(special_codes)
  end

  # Return a hash containing all the informations about dictionary codes
  #
  # Modifying the return value will change the behaviour of the module. See
  # implementation for details
  def self.dictionaries
    @@dictionaries
  end

  # Return a hash containing the informations about non dictionary codes
  #
  # Modifying the return value will change the behaviour of the module. See
  # implementation for details
  def self.additional_codes
    @@additional_codes
  end

  # Return a hash of all symbols used in the datastructure, associated with a description string
  #
  # The hash is build from the values returned by Kanjidic::dictionaries,
  # Kanjidic::additional_codes and Kanjidic::uncoded_symbols. A copy is
  # returned: modifying it will not affect the behaviour of the module.
  #
  # The hash is cached, reload
  # can be forced by passing true to the function.
  def self.all_symbols reload = false
    _symbols(reload).dup
  end

  # Returns a hash of all coded symbols and their String representations
  def self.coded_symboles
    names = {}
    dictionaries.each { |sym, info| names[sym] = info[1] }
    additional_codes.each { |sym, info| names[sym] = info[1] }
    names
  end

  # Return a hash of all symboles not associated with a letter code
  #
  # The values are the description strings
  def self.uncoded_symbols
    @@uncoded
  end

  # Return a hash of all the special codes and associated Procs
  def self.special_codes
    @@special_codes
  end


  # Forward anything not specificaly defined to the dictionary array if it is
  # loaded
  #
  # Raises NoMethodError while no dictionary is loaded.
  def self.method_missing sym, *args, &blck
    unless @@dic
      hint = [].respond_to?(sym) ? " (try loading the dictionary with Kanjidic::open first)" : ""
      raise NoMethodError, "No method named #{sym} for Kanjidic#{hint}"
    end
    @@dic.send sym, *args, &blck
  end

  # Readable representation of an entry (a Hash extended with this module):
  # character, readings and meanings first, dictionary indexes left out.
  def to_s
    Kanjidic::format self,
      character: 0,
      reading: 1,
      name_reading: 2,
      radical_name: 3,
      meanings: 4,
      dictionaries: false
  end

  # Turns a Kanjidic entry into an easy to read string
  #
  # The first argument is an entry (Hash) or an Array of entries, formatted one
  # after the other and joined with a newline. The second one maps keys to
  # either a rank (keys with the lowest rank are printed first) or a falsy value
  # (the key is not printed). Keys of the entry absent from it are printed
  # last. Raises ArgumentError for anything that is neither Hash nor Array.
  def self.format e, opt = {}
    if e.is_a? Array
      e.map { |el| format el, opt }.join("\n")
    elsif e.is_a? Hash
      opt = { character: 0 }.merge(opt)
      ret = "".dup
      # Anything that is not a number (false, nil, true...) is ranked 0 so
      # that the comparison never fails
      ranked = opt.sort_by { |_, rank| rank.is_a?(Numeric) ? rank : 0 }
      ranked.each { |key, visible| ret << _to_s(key, e[key]) if visible and e[key] }
      e.each { |k, v| ret << _to_s(k, v) unless opt.has_key?(k) }
      ret
    else
      raise ArgumentError, "Invalid parameter #{e}"
    end
  end

  # Cached symbol => description table shared by all_symbols and _to_s
  def self._symbols reload = false
    @@all_symbols = nil if reload
    @@all_symbols ||= coded_symboles.merge(uncoded_symbols)
  end

  # Insert values in a hash depending on the previous content of the hash
  #
  # Essentially a deep_merge implementation..
  def self._insert hash, dic
    dic.each do |key, value|
      current = hash[key]
      case current
      when nil
        # If the key doesn't exist, insert
        hash[key] = value
      when Array
        # If the key exist and its value is an array, add to it
        current << value
      when Hash
        # If the key exist and its value is a hash, merge them following the rules of this function
        _insert current, value
      else
        # If the key exists and its value is anything else, build an array to contain the previous value and
        # the new one
        hash[key] = [current, value]
      end
    end
  end

  # Format one key/value pair, nested hashes being indented by two more spaces
  # per level. From a :crossreference key downwards, values are followed by the
  # character they designate (see _resolve).
  def self._to_s key, value, nesting = 1, resolve = false
    resolve ||= (key == :crossreference)
    ret = "#{_symbols[key] || key}:"
    case value
    when Hash
      ret << "\n"
      value.each { |k, v| ret << " " * 2 * nesting << _to_s(k, v, nesting + 1, resolve) }
    when Array
      ret << " " << value.map { |e| e.to_s + _resolve(key, e, resolve) }.join(", ") << "\n"
    else
      ret << " #{value}#{_resolve(key, value, resolve)}\n"
    end
    ret
  end

  # Return " (character)" for the first loaded entry holding the given value
  # under the given key, either directly or in its dictionaries. Return an
  # empty String if there is none, no dictionary is loaded or resolve is falsy.
  def self._resolve key, value, resolve
    return "" unless open? and resolve
    found = @@dic.find { |e|
      (e[key] == value) || (e[:dictionaries][key] == value)
    }
    found ? " (#{found[:character]})" : ""
  end

  private_class_method :_symbols, :_insert, :_to_s, :_resolve
end
388
+
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kanjidic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Sylvain Leclercq
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-10-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A gem to extract and explore the KANJIDIC (http://ftp.monash.edu.au/pub/nihongo/kanjidic.html)
14
+ email: maisbiensurqueoui@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/kanjidic.rb
20
+ homepage: http://www.github.com/de-passage/kanjidic
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.5
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Extract and explore the KANJIDIC
44
+ test_files: []