wenlin_db_scanner 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
+ module WenlinDbScanner
2
+
3
# Wraps record data returned by WenlinDbScanner::Db methods.
#
# A record carries either binary data or text. The payload handed to the
# constructor is exposed through #data for binary records and through #text
# for text records; the other reader returns nil.
class DbRecord
  # @return [Boolean] true if the record has non-text data, such as pictures
  attr_reader :binary
  alias_method :binary?, :binary

  # @return [String, nil] UTF-8 encoded text data; nil for binary records
  attr_reader :text

  # @return [String, nil] ASCII-8BIT encoded binary data; nil for text records
  attr_reader :data

  # @return [Integer] the record's tag byte (a number in 0...256)
  attr_reader :tag

  # Raw data size, in bytes.
  #
  # For text records, this is the size of the compressed data.
  attr_reader :size

  # The record's position in the database.
  attr_reader :offset

  # Used by the WenlinDbScanner::Db methods.
  #
  # @param offset the record's position in the database
  # @param tag the record's tag byte
  # @param size the raw data size, in bytes
  # @param binary truthy if the payload is binary data, falsy if it is text
  # @param data the record's payload; stored as #data or #text depending on
  #   the binary flag
  def initialize(offset, tag, size, binary, data)
    @offset = offset
    @tag = tag
    @size = size
    @binary = binary
    # Exactly one of the two payload readers is populated.
    @data, @text = binary ? [data, nil] : [nil, data]
  end
end # class WenlinDbScanner::DbRecord
41
+
42
+ end # namespace WenlinDbScanner
43
+
@@ -0,0 +1,373 @@
1
+ # coding: utf-8
2
+
3
+ module WenlinDbScanner
4
+
5
# Parses the data in the dictionary databases.
module Dicts
  # Splits a tag such as "31x" into its digit prefix, its name, and an
  # optional "@..." suffix (the suffix is matched but not used).
  TAG_FORMAT = /^(\d*)(\w+)(\@.*)?$/

  # Matches one "[...]" annotation inside a tag's data.
  BRACKETED = /\[([^\]]*)\]/

  # Tags whose data is stored verbatim in a single entry property.
  SIMPLE_TAGS = {
    'ipa' => :ipa,
    'a' => :abbreviates,
    'd' => :defn,
    'gr' => :grade,
    'hz' => :example_zh,
    'infl' => :inflection,
    'j' => :see_serial, # NOTE: jump?
    'k' => :see_term,
    'note' => :note,
    'o' => :construction,
    'p' => :speech_part,
    'q' => :usage,
    's' => :serial,
    't' => :example_translation,
    'w' => :reference,
    'x' => :example,
    'y' => :years
  }.freeze

  # Tags whose data is split into the text outside the brackets and the text
  # inside them; both parts are accumulated into arrays.
  PAIRED_LIST_TAGS = {
    'b' => [:used_in_terms, :used_in_serials], # NOTE: base of?
    'e' => [:linked_terms, :linked_serials], # NOTE: equivalent?
    'sub' => [:extend, :extend_serial],
    'subof' => [:extended_from, :extended_from_serial]
  }.freeze

  # Tags that are recognized but skipped.
  #
  # 'h' is a guess at "shows up in the application's help" (it seems to only
  # be set for technical terms); 'r' / 'rem' are remarks / revisions, which
  # might be interesting for research; 'v' is unknown and only shows up once.
  IGNORED_TAGS = %w(h r rem v).freeze

  # The entries in the English->Chinese dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<DictEntry>]
  def self.en_zh(db_root)
    entries File.join(db_root, 'yinghan.db')
  end

  # The entries in the Chinese->English dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<DictEntry>]
  def self.zh_en(db_root)
    entries File.join(db_root, 'cidian.db')
  end

  # Generic decoder for a database of dictionary entries.
  #
  # Binary records and text records without any non-blank line are skipped.
  # The database is closed when the enumeration finishes or is aborted.
  #
  # @param [String] db_file path to the .db file containing dictionary data
  # @return [Enumerator<DictEntry>]
  # @raise [RuntimeError] if a record has a malformed or unknown tag
  def self.entries(db_file)
    Enumerator.new do |yielder|
      db = Db.new db_file
      begin
        db.records.each do |record|
          next if record.binary?
          entry = parse_entry record.text
          yielder << entry if entry
        end
      ensure
        db.close
      end
    end
  end

  # The term defined by a dictionary key.
  #
  # @param [String] key a dictionary key
  # @return [String] the term inside the key; everything that is not a
  #   Unicode letter is removed
  def self.key_term(key)
    key.gsub(/[^\p{L}]/, '')
  end

  # The term defined by a dictionary key, spelled using Latin characters.
  #
  # @param [String] key a dictionary key
  # @return [String] the term inside the key, spelled using Latin characters
  def self.key_latin_term(key)
    Chars.pinyin_to_latin key_term(key)
  end

  # The frequency information expressed in a dictionary key.
  #
  # This shows the relative frequency of the term, among all the terms with
  # the same exact spelling. For Chinese terms, the spelling is pinyin.
  #
  # The rank is read from the digits (plain or superscript) in the run of
  # non-letter characters at the start of the key.
  #
  # @param [String] key a dictionary key
  # @return [Integer, nil] nil if the key does not have frequency information
  def self.key_frequency(key)
    prefix = key[/\A[^\p{L}]+/]
    return nil unless prefix
    # A prefix made only of punctuation carries no rank, so report nil
    # instead of letting to_i turn it into 0.
    digits = prefix.tr('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789')[/\d+/]
    digits && digits.to_i
  end

  # The latin frequency information expressed in a dictionary key.
  #
  # This is true if the term is the most frequent, among all terms with the
  # same latin spelling. For Chinese terms, the latin spelling is pinyin with
  # tone information removed.
  #
  # @param [String] key a dictionary key
  # @return [Boolean] true if the key ends with '*'
  def self.key_latin_frequency(key)
    key.end_with?('*')
  end

  # Decodes the text of one dictionary record.
  #
  # The first non-blank line is the entry's key; each following line is a
  # tag, optionally followed by a space and the tag's data.
  #
  # @param [String] text the record's text
  # @return [DictEntry, nil] nil if the text has no non-blank lines
  # @raise [RuntimeError] if a tag is malformed or unknown
  def self.parse_entry(text)
    lines = text.split("\n").map(&:strip).reject(&:empty?)
    key = lines.first
    return nil unless key

    entry = DictEntry.new
    entry.key = key
    entry.term = key_term key
    entry.latin_term = key_latin_term key
    entry.term_frequency = key_frequency key
    entry.latin_frequency_boost = key_latin_frequency key

    lines.drop(1).each do |line|
      tag, data = line.split(' ', 2)
      tag_parts = TAG_FORMAT.match tag
      unless tag_parts
        raise "Unknown tag format #{tag} in dictionary entry!\n#{text}"
      end
      pairs = field_pairs tag_parts[2], data
      unless pairs
        raise "Unknown tag #{tag} in dictionary entry!\n#{text}"
      end

      # Accumulation is decided per line, so a list tag does not turn the
      # tags that follow it into lists as well.
      collect = PAIRED_LIST_TAGS.key? tag_parts[2]
      pairs.each do |prop, value|
        store_value entry, tag_parts[1], prop, value, collect
      end
    end
    entry
  end

  # Translates one tag line into the entry properties it sets.
  #
  # @param [String] name the tag's name, without digit prefix or "@" suffix
  # @param [String, nil] data the text following the tag, if any
  # @return [Array<Array>, nil] [property, value] pairs; empty for tags that
  #   are deliberately skipped; nil for unknown tags
  def self.field_pairs(name, data)
    return [] if IGNORED_TAGS.include? name
    return [[SIMPLE_TAGS[name], data]] if SIMPLE_TAGS.key? name
    if PAIRED_LIST_TAGS.key? name
      plain_prop, bracket_prop = PAIRED_LIST_TAGS[name]
      return [[plain_prop, outside_brackets(data)],
              [bracket_prop, inside_brackets(data)]]
    end

    case name
    when 'c'
      simplified = outside_brackets data
      traditional = inside_brackets data
      if traditional.empty?
        traditional = simplified
      elsif traditional.index '-'
        # A '-' in the bracketed form stands for the character at the same
        # position in the unbracketed form.
        traditional = traditional.chars.map.with_index { |char, index|
          (char == '-') ? simplified[index] : char
        }.join('')
      end
      [[:zh, simplified], [:zh_tw, traditional]]
    when 'f' # e.g. 2.2 [XHPC:4]
      [[:freq, data.split('[', 2).first.strip]]
    when 'm'
      # NOTE: stripping the complex hanzi, as it can be found by
      #       cross-referencing the measure word's key
      [[:measure_word, outside_brackets(data).split('/').map(&:strip)]]
    when 'n'
      # NOTE: the field of reference sometimes looks like "mus.[音]"
      field_zh = inside_brackets data
      if field_zh.empty?
        [[:field, data]]
      else
        [[:field, outside_brackets(data)], [:field_zh, field_zh]]
      end
    when 'u'
      [[:unverified, true]]
    end
  end

  # Writes one value into an entry property.
  #
  # A digit prefix addresses a slot in a (possibly nested) array, one digit
  # per nesting level: digits 1-9 select slots 0-8 and digit 0 selects slot
  # 9. For example, 31x stores into example[2][0].
  #
  # @param [DictEntry] entry the entry being built
  # @param [String] digits the tag's digit prefix; may be empty
  # @param [Symbol] prop the entry property to write
  # @param value the value to store
  # @param [Boolean] collect if true, the value is appended to an array
  #   stored at the target instead of replacing the target
  def self.store_value(entry, digits, prop, value, collect)
    if digits.empty?
      if collect
        entry[prop] ||= []
        entry[prop] << value
      else
        entry[prop] = value
      end
      return
    end

    indexes = digits.chars.map do |char|
      ((char == '0') ? 10 : char.to_i) - 1
    end
    entry[prop] ||= []
    # Fix entries listing props x and 2x instead of 1x, 2x.
    entry[prop] = [entry[prop]] unless entry[prop].kind_of?(Array)
    slot = entry[prop]
    indexes[0...-1].each do |i|
      slot[i] ||= []
      # Fix entries listing props 1x and 12x instead of 11x, 12x.
      slot[i] = [slot[i]] unless slot[i].kind_of?(Array)
      slot = slot[i]
    end
    if collect
      slot[indexes.last] ||= []
      slot[indexes.last] << value
    else
      slot[indexes.last] = value
    end
  end

  # The data with every "[...]" annotation removed.
  #
  # @param [String] data a tag's data
  # @return [String]
  def self.outside_brackets(data)
    data.gsub(BRACKETED, '').strip
  end

  # The contents of every "[...]" annotation in the data, joined by "; ".
  #
  # @param [String] data a tag's data
  # @return [String] empty if the data has no annotations
  def self.inside_brackets(data)
    data.scan(BRACKETED).map(&:first).join('; ').strip
  end

  private_class_method :parse_entry, :field_pairs, :store_value,
                       :outside_brackets, :inside_brackets
end # module WenlinDbScanner::Dicts
260
+
261
# Wraps a record in a dictionary database.
#
# Properties are filled in by WenlinDbScanner::Dicts.entries; properties that
# do not appear in a record are left nil. When a record numbers its tags
# (e.g. 1x, 2x), the corresponding property holds a (possibly nested) Array
# of values instead of the single value documented below.
class DictEntry < Struct.new(:key, :term, :ipa, :zh, :zh_tw, :defn, :freq,
                             :grade, :example_zh, :measure_word, :speech_part,
                             :serial, :example, :example_translation,
                             :reference, :term_frequency,
                             :latin_frequency_boost, :field, :field_zh,
                             :unverified, :inflection, :extend, :extend_serial,
                             :extended_from, :extended_from_serial,
                             :abbreviates, :see_serial, :usage, :latin_term,
                             :linked_terms, :linked_serials, :note, :see_term,
                             :used_in_terms, :used_in_serials,
                             :construction, :years)
  # @!attribute [rw] key
  #   @return [String] the term's sort key in the dictionary
  # @!attribute [rw] ipa
  #   @return [String] pronunciation for English terms, in IPA
  # @!attribute [rw] term
  #   @return [String] the defined term
  # @!attribute [rw] latin_term
  #   @return [String] the defined term, spelled using Latin characters
  # @!attribute [rw] zh
  #   @return [String] the defined term, in simplified Hanzi
  # @!attribute [rw] zh_tw
  #   @return [String] the defined term, in complex Hanzi
  # @!attribute [rw] defn
  #   @return [String] the term's definition, in English or pinyin
  # @!attribute [rw] example
  #   @return [String] text that shows how the term is used
  # @!attribute [rw] grade
  #   @return [String] the level at which this term is taught; 'A' to 'E'
  # @!attribute [rw] freq
  #   @return [String] floating-point number indicating the term's popularity;
  #       lower numbers, such as "1", are associated with more popular terms;
  #       unpopular terms don't have this property set
  # @!attribute [rw] measure_word
  #   @return [Array<String>] the keys of the measure words that should be
  #       used with this noun
  # @!attribute [rw] speech_part
  #   @return [String] abbreviation of the term's part of speech; cross
  #       reference this in the appropriate parts-of-speech database
  # @!attribute [rw] serial
  #   @return [String] uniquely identifies a definition in the dictionary
  # @!attribute [rw] example_translation
  #   @return [String] translation of the example usage text; if the text is in
  #       English, the translation uses pinyin
  # @!attribute [rw] example_zh
  #   @return [String] the example usage text, in simplified hanzi
  # @!attribute [rw] reference
  #   @return [String] code that indicates where the definition was obtained
  #       from; might be useful to researchers
  # @!attribute [rw] term_frequency
  #   @return [Integer, nil] the definition rank among all terms that are
  #       spelled the same; for Chinese terms, the spelling is pinyin; the most
  #       frequent term has rank 1
  # @!attribute [rw] latin_frequency_boost
  #   @return [Boolean] if true, the term is the most frequent among all terms
  #       with the same Latin alphabet spelling; useful for ordering
  #       suggestions when the user is typing raw Latin characters; use the
  #       latin_term property to get a term's Latin spelling
  # @!attribute [rw] field
  #   @return [String] the term's field of reference, abbreviated in English
  # @!attribute [rw] field_zh
  #   @return [String] the term's field of reference, in simplified hanzi
  # @!attribute [rw] unverified
  #   @return [Boolean] set on some auto-generated entries in en->zh
  # @!attribute [rw] inflection
  #   @return [String] instructions on how to inflect an English word; not yet
  #       parsed, but in the future it should be an array of String words
  # @!attribute [rw] extend
  #   @return [Array<String>] methods for extending the term with a suffix to
  #       obtain a related meaning
  # @!attribute [rw] extend_serial
  #   @return [Array<String>] the serials of the terms obtained by extending
  #       this term with a suffix
  # @!attribute [rw] extended_from
  #   @return [Array<String>] the terms that this term was derived from
  # @!attribute [rw] extended_from_serial
  #   @return [Array<String>] the serials of the terms that this term was
  #       derived from
  # @!attribute [rw] abbreviates
  #   @return [String] the term that this term is an abbreviation for
  # @!attribute [rw] see_serial
  #   @return [String] the serial of the term that this term should be
  #       considered to be identical with
  # @!attribute [rw] see_term
  #   @return [String] a term that is closely related to this term; unlike
  #       see_serial, this is rendered as a "See also " link
  # @!attribute [rw] usage
  #   @return [String] short clarification about how the term is used
  # @!attribute [rw] note
  #   @return [String] the data of the entry's "note" tag
  # @!attribute [rw] linked_terms
  #   @return [Array<String>] terms that have the same spelling in some system
  #       and are closely related
  # @!attribute [rw] linked_serials
  #   @return [Array<String>] serials for terms that have the same spelling in
  #       some system and are closely related
  # @!attribute [rw] used_in_terms
  #   @return [Array<String>] terms that contain this term
  # @!attribute [rw] used_in_serials
  #   @return [Array<String>] serials for terms that contain this term
  # @!attribute [rw] construction
  #   @return [String] a grammatical construction that the term can be used in;
  #       the associated definition, examples, etc. all apply to the term's use
  #       in the construction
  # @!attribute [rw] years
  #   @return [String] year / range of years that a person lived

  # The properties that are set on this entry.
  #
  # @return [Hash] maps property names (Symbols) to values; properties whose
  #   value is nil are left out
  def to_hash
    Hash[each_pair.reject { |_, value| value.nil? }]
  end
end # class WenlinDbScanner::DictEntry
372
+
373
+ end # namespace WenlinDbScanner
@@ -0,0 +1,68 @@
1
+ # coding: utf-8
2
+
3
+ module WenlinDbScanner
4
+
5
# Parses the databases that describe the parts of speech.
module SpeechParts
  # The English parts of speech used in the en->zh dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<SpeechPart>]
  def self.en(db_root)
    parts File.join(db_root, 'ab_ec.db')
  end

  # The Chinese parts of speech used in the zh->en dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<SpeechPart>]
  def self.zh(db_root)
    parts File.join(db_root, 'ab_ce.db')
  end

  # Generic decoder for a database of parts of speech.
  #
  # Each text record yields one SpeechPart. The record's first non-blank line
  # holds the '='-separated abbreviation, English name, pinyin name and Hanzi
  # name, in that order; the second non-blank line is the description.
  # Binary records and records without any non-blank line are skipped.
  # The database is closed when the enumeration finishes or is aborted.
  #
  # @param [String] db_file path to the .db file containing part-of-speech data
  # @return [Enumerator<SpeechPart>]
  def self.parts(db_file)
    Enumerator.new do |yielder|
      db = Db.new db_file
      begin
        db.records.each do |record|
          next if record.binary?
          lines = record.text.split("\n").map(&:strip).reject(&:empty?)
          next if lines.empty?

          part = SpeechPart.new
          part.abbrev, part.en, part.pinyin, part.zh =
              lines.first.split('=').map(&:strip)
          part.description = lines[1]
          yielder << part
        end
      ensure
        # Release the database even if decoding raises or the consumer stops
        # iterating early.
        db.close
      end
    end
  end
end # module WenlinDbScanner::SpeechParts
44
+
45
# Wraps a record in the parts-of-speech database.
class SpeechPart < Struct.new(:abbrev, :en, :zh, :pinyin, :description)
  # @!attribute [rw] abbrev
  #   @return [String] e.g., 'v.'; used by entries in the other databases

  # @!attribute [rw] en
  #   @return [String] English name, e.g. 'Verb'

  # @!attribute [rw] zh
  #   @return [String] Chinese name, in Hanzi, e.g. '动词'

  # @!attribute [rw] pinyin
  #   @return [String] Chinese name, in pinyin, e.g. 'Dòngcí'

  # @!attribute [rw] description
  #   @return [String] English-language explanation of what the part of speech is

  # All the properties of this part of speech, including unset (nil) ones.
  #
  # @return [Hash] maps property names (Symbols) to values
  def to_hash
    Hash[each_pair.to_a]
  end
end # class WenlinDbScanner::SpeechPart
67
+
68
+ end # namespace WenlinDbScanner