wenlin_db_scanner 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ module WenlinDbScanner
2
+
3
+ # Wraps record data returned by WenlinDbScanner::Db methods.
4
class DbRecord
  # @return [Boolean] true if the record has non-text data, such as pictures
  attr_reader :binary
  alias_method :binary?, :binary

  # @return [String, nil] UTF-8 encoded text data; nil for binary records
  attr_reader :text

  # @return [String, nil] ASCII-8BIT encoded binary data; nil for text records
  attr_reader :data

  # @return [Integer] the record's tag byte (a number in 0...256)
  attr_reader :tag

  # Raw data size, in bytes.
  #
  # For text records, this is the size of the compressed data.
  attr_reader :size

  # The record's position in the database.
  attr_reader :offset

  # Used by the WenlinDbScanner::Db methods.
  #
  # The payload is stored in exactly one of #data / #text, chosen by the
  # binary flag; the other reader returns nil.
  def initialize(offset, tag, size, binary, data)
    @offset, @tag, @size, @binary = offset, tag, size, binary
    if binary
      @data, @text = data, nil
    else
      @text, @data = data, nil
    end
  end
end # class WenlinDbScanner::DbRecord
41
+
42
+ end # namespace WenlinDbScanner
43
+
@@ -0,0 +1,373 @@
1
+ # coding: utf-8
2
+
3
+ module WenlinDbScanner
4
+
5
+ # Parses the data in the dictionary databases.
6
+ module Dicts
7
+ # The entries in the English->Chinese dictionary.
8
+ #
9
+ # @param [String] db_root the directory containing the .db files
10
+ # @return [Enumerator<DictEntry>]
11
def self.en_zh(db_root)
  # yinghan.db is the file this module reads the English->Chinese entries from.
  yinghan_path = File.join(db_root, 'yinghan.db')
  entries(yinghan_path)
end
14
+
15
+ # The entries in the Chinese->English dictionary.
16
+ #
17
+ # @param [String] db_root the directory containing the .db files
18
+ # @return [Enumerator<DictEntry>]
19
def self.zh_en(db_root)
  # cidian.db is the file this module reads the Chinese->English entries from.
  cidian_path = File.join(db_root, 'cidian.db')
  entries(cidian_path)
end
22
+
23
+ # Generic decoder for a database of dictionary entries.
24
+ #
25
+ # @param [String] db_file path to the .db file containing dictionary data
26
+ # @return [Enumerator<DictEntry>]
27
def self.entries(db_file)
  # Many tag values look like "outside[inside]". These two helpers return
  # the text with every [...] group removed, and the contents of all [...]
  # groups joined by '; ', respectively.
  strip_bracketed = lambda { |text| text.gsub(/\[[^\]]*\]/, '').strip }
  join_bracketed = lambda { |text|
    text.scan(/\[([^\]]*)\]/).map(&:first).join('; ').strip
  }

  Enumerator.new do |yielder|
    db = Db.new db_file
    begin
      db.records.each do |record|
        next if record.binary?
        lines = record.text.split("\n").map(&:strip).reject(&:empty?)
        # A text record without any non-blank line has no key, so there is
        # nothing to build an entry from.
        next if lines.empty?

        # The first line is the entry's key; every following line is
        # "<tag> <data>".
        key = lines[0]

        entry = DictEntry.new
        entry.key = key
        entry.term = key_term key
        entry.latin_term = key_latin_term key
        entry.term_frequency = key_frequency key
        entry.latin_frequency_boost = key_latin_frequency key

        lines[1..-1].each do |line|
          # Only the b / e / sub / subof tags accumulate their values into
          # arrays. The flag is reset for every line so that it does not leak
          # into the tags that follow one of those in the same entry.
          collect_values = false

          tag, data = *line.split(' ', 2)
          # A tag is an optional run of digits (positions inside nested
          # arrays), the tag name, and an optional "@..." suffix.
          tag_parts = /^(\d*)(\w+)(\@.*)?$/.match tag
          unless tag_parts
            raise "Unknown tag format #{tag} in dictionary entry!\n#{record.text}"
          end

          # After this case statement:
          #   prop == false -> the line is skipped
          #   prop is set   -> entry[prop] receives data
          #   prop is nil   -> entry[prop1] / entry[prop2] receive data1 / data2
          case tag_parts[2]
          when 'ipa'
            prop = :ipa
          when 'a'
            prop = :abbreviates
          when 'c'
            prop = nil
            prop1 = :zh
            data1 = strip_bracketed.call(data)
            prop2 = :zh_tw
            data2 = join_bracketed.call(data)
            if data2.empty?
              data2 = data1
            elsif data2.index '-'
              # A '-' in the bracketed form is replaced by the character at
              # the same position in the un-bracketed form.
              data2 = data2.chars.map.with_index { |char, index|
                (char == '-') ? data1[index] : char
              }.join ''
            end
          when 'd'
            prop = :defn
          when 'b'  # NOTE: base of?
            prop = nil
            prop1 = :used_in_terms
            prop2 = :used_in_serials
            data1 = strip_bracketed.call(data)
            data2 = join_bracketed.call(data)
            collect_values = true
          when 'e'  # NOTE: equivalent?
            prop = nil
            prop1 = :linked_terms
            prop2 = :linked_serials
            data1 = strip_bracketed.call(data)
            data2 = join_bracketed.call(data)
            collect_values = true
          when 'f'  # e.g. 2.2 [XHPC:4]
            prop = :freq
            data = data.split('[', 2).first.strip
          when 'gr'
            prop = :grade
          when 'h'
            # NOTE: guessing this means it shows up in the application's help.
            #       it seems to only be set for technical terms
            prop = false
          when 'hz'
            prop = :example_zh
          when 'infl'
            prop = :inflection
          when 'j'  # NOTE: jump?
            prop = :see_serial
          when 'k'
            prop = :see_term
          when 'm'
            prop = :measure_word
            # NOTE: stripping the complex hanzi, as it can be found by
            #       cross-referencing the measure word's key
            data = strip_bracketed.call(data)
            data = data.split('/').map(&:strip)
          when 'n'
            # NOTE: the field of reference sometimes looks like "mus.[音]"
            data2 = join_bracketed.call(data)
            if data2.empty?
              prop = :field
            else
              prop = nil
              prop1 = :field
              prop2 = :field_zh
              data1 = strip_bracketed.call(data)
            end
          when 'note'
            prop = :note
          when 'o'
            prop = :construction
          when 'p'
            prop = :speech_part
          when 'q'
            prop = :usage
          when 'r', 'rem'
            # NOTE: skipping remarks / revisions for now; they might be
            #       interesting for research
            prop = false
          when 's'
            prop = :serial
          when 'sub'
            prop = nil
            prop1 = :extend
            prop2 = :extend_serial
            data1 = strip_bracketed.call(data)
            data2 = join_bracketed.call(data)
            collect_values = true
          when 'subof'
            prop = nil
            prop1 = :extended_from
            prop2 = :extended_from_serial
            data1 = strip_bracketed.call(data)
            data2 = join_bracketed.call(data)
            collect_values = true
          when 't'
            prop = :example_translation
          when 'u'
            prop = :unverified
            data = true
          when 'v'
            # NOTE: no idea what this is, only shows up once
            prop = false
          when 'w'
            prop = :reference
          when 'x'
            prop = :example
          when 'y'
            prop = :years
          else
            raise "Unknown tag #{tag} in dictionary entry!\n#{record.text}"
          end
          next if prop == false

          ops = if prop
            [[prop, data]]
          else
            [[prop1, data1], [prop2, data2]]
          end
          ops.each do |k, v|
            if tag_parts[1].empty?
              if collect_values
                entry[k] ||= []
                entry[k] << v
              else
                entry[k] = v
              end
            else
              # Example: 31x means example: [blah, blah, [value]]
              # Each digit is a 1-based position; 0 stands for position 10.
              indexes = tag_parts[1].chars.map do |char|
                ((char == '0') ? 10 : char.to_i) - 1
              end
              # Defensive check: the tag regex only lets digits through, and
              # every digit maps to 0..9 above.
              if indexes.any? { |i| i < 0 }
                puts "Broken tag #{tag} #{tag_parts[1]} #{indexes.inspect}\n#{record.text}"
              end
              entry[k] ||= []
              unless entry[k].kind_of?(Array)
                # Fix entries listing props x and 2x instead of 1x, 2x.
                entry[k] = [entry[k]]
              end
              array = entry[k]
              indexes[0...-1].each do |i|
                array[i] ||= []
                unless array[i].kind_of?(Array)
                  # Fix entries listing props 1x and 12x instead of 11x, 12x.
                  array[i] = [array[i]]
                end
                array = array[i]
              end
              if collect_values
                array[indexes.last] ||= []
                array[indexes.last] << v
              else
                array[indexes.last] = v
              end
            end
          end
        end

        yielder << entry
      end
    ensure
      # Release the database even when the consumer stops early or a record
      # fails to parse.
      db.close
    end
  end
end
218
+
219
+ # The term defined by a dictionary key.
220
+ #
221
+ # @param [String] key a dictionary key
222
+ # @return [String] the term inside the key
223
def self.key_term(key)
  # Drop every character that is not a Unicode letter.
  non_letters = /[^\p{L}]/
  key.gsub(non_letters, '')
end
226
+
227
+ # The term defined by a dictionary key, spelled using Latin characters.
228
+ #
229
+ # @param [String] key a dictionary key
230
+ # @return [String] the term inside the key, spelled using Latin characters
231
def self.key_latin_term(key)
  # Reduce the key to its term first, then hand that to the Chars converter.
  term = key_term(key)
  Chars.pinyin_to_latin(term)
end
234
+
235
+ # The frequency information expressed in a dictionary key.
236
+ #
237
+ # This shows the relative frequency of the term, among all the terms with
238
+ # the same exact spelling. For Chinese terms, the spelling is pinyin.
239
+ #
240
+ # @param [String] key a dictionary key
241
+ # @return [Integer, nil] nil if the key does not have frequency information
242
def self.key_frequency(key)
  # The frequency is whatever precedes the first letter of the key; keys that
  # start with a letter carry no frequency information.
  leading = /^[^\p{L}]+/.match(key)
  # Superscript digits are mapped to ASCII digits before the integer parse.
  leading && leading[0].tr('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789').to_i
end
247
+
248
+ # The latin frequency information expressed in a dictionary key.
249
+ #
250
+ # This is true if the term is the most frequent, among all terms with the
251
+ # same latin spelling. For Chinese terms, the latin spelling is pinyin with
252
+ # tone information removed.
253
+ #
254
+ # @param [String] key a dictionary key
255
+ # @return [Boolean]
256
def self.key_latin_frequency(key)
  # A trailing asterisk on the key is the marker for the boost.
  key.end_with?('*')
end
259
+ end # module WenlinDbScanner::Dicts
260
+
261
+ # Wraps a record in a dictionary database
262
class DictEntry < Struct.new(:key, :term, :ipa, :zh, :zh_tw, :defn, :freq,
                             :grade, :example_zh, :measure_word, :speech_part,
                             :serial, :example, :example_translation,
                             :reference, :term_frequency,
                             :latin_frequency_boost, :field, :field_zh,
                             :unverified, :inflection, :extend, :extend_serial,
                             :extended_from, :extended_from_serial,
                             :abbreviates, :see_serial, :usage, :latin_term,
                             :linked_terms, :linked_serials, :note, :see_term,
                             :used_in_terms, :used_in_serials,
                             :construction, :years)
  # NOTE: the types below describe values coming from un-numbered tags. When
  #     the database uses numbered tags (e.g. 1x, 2x), Dicts.entries stores
  #     the values in (possibly nested) arrays indexed by those numbers.

  # @!attribute [r] key
  #   @return [String] the term's sort key in the dictionary
  # @!attribute [r] ipa
  #   @return [String] pronunciation for English terms, in IPA
  # @!attribute [r] term
  #   @return [String] the defined term
  # @!attribute [r] latin_term
  #   @return [String] the defined term, spelled using Latin characters
  # @!attribute [r] zh
  #   @return [String] the defined term, in simplified Hanzi
  # @!attribute [r] zh_tw
  #   @return [String] the defined term, in complex Hanzi
  # @!attribute [r] defn
  #   @return [String] the term's definition, in English or pinyin
  # @!attribute [r] example
  #   @return [String] text that shows how the term is used
  # @!attribute [r] grade
  #   @return [String] the level at which this term is taught; 'A' to 'E'
  # @!attribute [r] freq
  #   @return [String] floating-point number indicating the term's popularity;
  #       lower strings, such as "1" are associated with more popular terms;
  #       unpopular terms don't have this property set
  # @!attribute [r] measure_word
  #   @return [Array<String>] the keys of the measure words that should be
  #       used with this noun
  # @!attribute [r] speech_part
  #   @return [String] abbreviation of the term's part of speech; cross
  #       reference this in the appropriate parts-of-speech database
  # @!attribute [r] serial
  #   @return [String] uniquely identifies a definition in the dictionary
  # @!attribute [r] example_translation
  #   @return [String] translation of the example usage text; if the text is in
  #       English, the translation uses pinyin
  # @!attribute [r] example_zh
  #   @return [String] the example usage text, in simplified hanzi
  # @!attribute [r] reference
  #   @return [String] code that indicates where the definition was obtained
  #       from; might be useful to researchers
  # @!attribute [r] term_frequency
  #   @return [Integer, nil] the definition rank among all terms that are
  #       spelled the same; for Chinese terms, the spelling is pinyin; the
  #       most frequent term has rank 1; nil when the key carries no
  #       frequency information
  # @!attribute [r] latin_frequency_boost
  #   @return [Boolean] if true, the term is the most frequent among all terms
  #       with the same Latin alphabet spelling; useful for ordering
  #       suggestions when the user is typing raw Latin characters; use the
  #       latin_term property to get a term's Latin spelling
  # @!attribute [r] field
  #   @return [String] the term's field of reference, abbreviated in English
  # @!attribute [r] field_zh
  #   @return [String] the term's field of reference, in simplified hanzi
  # @!attribute [r] unverified
  #   @return [Boolean] set on some auto-generated entries in en->zh
  # @!attribute [r] inflection
  #   @return [String] instructions on how to inflect an English word; not yet
  #       parsed, but in the future it should be an array of String words
  # @!attribute [r] extend
  #   @return [Array<String>] methods for extending the term with a suffix to
  #       obtain a related meaning
  # @!attribute [r] extend_serial
  #   @return [Array<String>] the serials of the terms obtained by extending
  #       this term with a suffix
  # @!attribute [r] extended_from
  #   @return [Array<String>] the terms that this term was derived from
  # @!attribute [r] extended_from_serial
  #   @return [Array<String>] the serials of the terms that this term was
  #       derived from
  # @!attribute [r] abbreviates
  #   @return [String] the term that this term is an abbreviation for
  # @!attribute [r] see_serial
  #   @return [String] the serial of the term that this term should be
  #       considered to be identical with
  # @!attribute [r] see_term
  #   @return [String] a term that is closely related to this term; unlike
  #       see_serial, this is rendered as a "See also " link
  # @!attribute [r] usage
  #   @return [String] short clarification about how the term is used
  # @!attribute [r] note
  #   @return [String] the text of the entry's "note" tag
  # @!attribute [r] linked_terms
  #   @return [Array<String>] terms that have the same spelling in some system
  #       and are closely related
  # @!attribute [r] linked_serials
  #   @return [Array<String>] serials for terms that have the same spelling in
  #       some system and are closely related
  # @!attribute [r] used_in_terms
  #   @return [Array<String>] terms that contain this term
  # @!attribute [r] used_in_serials
  #   @return [Array<String>] serials for terms that contain this term
  # @!attribute [r] construction
  #   @return [String] a grammatical construction that the term can be used in;
  #       the associated definition, examples, etc. all apply to the term's use
  #       in the construction
  # @!attribute [r] years
  #   @return [String] year / range of years that a person lived

  # The entry's attributes that have been set, keyed by attribute name.
  #
  # Attributes whose value is nil are left out; false values are kept.
  #
  # @return [Hash]
  def to_hash
    Hash[each_pair.reject { |_name, value| value.nil? }]
  end
end # class WenlinDbScanner::DictEntry
372
+
373
+ end # namespace WenlinDbScanner
@@ -0,0 +1,68 @@
1
+ # coding: utf-8
2
+
3
+ module WenlinDbScanner
4
+
5
+ # Parses the databases that describe the parts of speech.
6
module SpeechParts
  # The English parts of speech used in the en->zh dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<SpeechPart>]
  def self.en(db_root)
    parts File.join(db_root, 'ab_ec.db')
  end

  # The Chinese parts of speech used in the zh->en dictionary.
  #
  # @param [String] db_root the directory containing the .db files
  # @return [Enumerator<SpeechPart>]
  def self.zh(db_root)
    parts File.join(db_root, 'ab_ce.db')
  end

  # Generic decoder for a database of parts of speech.
  #
  # Binary records and text records without any non-blank line are skipped.
  # The database is opened when enumeration starts and is closed when it
  # ends, including when the consumer stops early or a record fails to parse.
  #
  # @param [String] db_file path to the .db file containing part-of-speech data
  # @return [Enumerator<SpeechPart>]
  def self.parts(db_file)
    Enumerator.new do |yielder|
      db = Db.new db_file
      begin
        db.records.each do |record|
          next if record.binary?
          lines = record.text.split("\n").map(&:strip).reject(&:empty?)
          # Nothing to decode in a record that only has blank lines.
          next if lines.empty?

          part = SpeechPart.new
          # The first line holds '='-separated fields, in this order.
          part.abbrev, part.en, part.pinyin, part.zh =
              *lines[0].split('=').map(&:strip)
          # The second line, when present, is the description.
          part.description = lines[1]
          yielder << part
        end
      ensure
        db.close
      end
    end
  end
end  # module WenlinDbScanner::SpeechParts
44
+
45
+ # Wraps a record in the parts-of-speech database.
46
class SpeechPart < Struct.new(:abbrev, :en, :zh, :pinyin, :description)
  # @!attribute [r] abbrev
  #   @return [String] e.g., 'v.'; used by entries in the other databases

  # @!attribute [r] en
  #   @return [String] English name, e.g. 'Verb'

  # @!attribute [r] zh
  #   @return [String] Chinese name, in Hanzi, e.g. '动词'

  # @!attribute [r] pinyin
  #   @return [String] Chinese name, in pinyin, e.g. 'Dòngcí'

  # @!attribute [r] description
  #   @return [String] English-language explanation of what the part of speech is

  # All the attributes, keyed by attribute name; nil values are included.
  #
  # @return [Hash]
  def to_hash
    each_pair.each_with_object({}) do |(name, value), hash|
      hash[name] = value
    end
  end
end  # class WenlinDbScanner::SpeechPart
67
+
68
+ end # namespace WenlinDbScanner