camdict 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,641 +1,110 @@
1
- require 'camdict/explanation'
1
+ # frozen_string_literal: true
2
+ require 'camdict/entry'
3
+ require 'camdict/ipa'
4
+ require 'camdict/pronunciation'
2
5
 
3
6
  module Camdict
4
-
5
- # Parse an html definition to get explanations, word, IPA, prounciation,
7
+ # Parse an html definition to get explanations, word, IPA, pronunciation,
6
8
  # part of speech, etc.
7
-
8
9
  class Definition
9
- # Struct IPA is the written pronunciations for UK/US.
10
- # +uk+: the UK IPA; +k+: the superscript index in UK IPA.
11
- # +us+: the US IPA; +s+: the superscript index in US IPA.
12
- IPA = Struct.new(:uk, :k, :us, :s)
13
- # Struct Pronunciation has two memebers.
14
- # Each +uk+/+us+ has its own mp3/ogg links.
15
- Pronunciation = Struct.new(:uk, :us)
16
- # Struct Link has two memembers +mp3+ and +ogg+, which are the http links.
17
- Link = Struct.new(:mp3, :ogg)
10
+ # Get senses for this definition.
11
+ attr_reader :senses
18
12
 
19
- # Simple Past, Past Participle, PRsent participle of a verb. Only irregular
20
- # verbs have these values. It struct memebers are +sp+, +pp+, +pr+.
21
- Irregular = Struct.new(:sp, :pp, :pr)
22
- # Get part of speech of a word or phrase.
23
- attr_reader :part_of_speech
24
- # Get explanations for this definition.
25
- attr_reader :explanations
26
- # Is the queried word/phrase an idiom?
27
- attr_reader :is_idiom
28
- # Get the IPA
29
- attr_reader :ipa
30
- # Get the pronunciation
31
- attr_reader :pronunciation
32
- # Get the region: UK or US
33
- attr_reader :region
34
- # Get the short usage
35
- attr_reader :usage
36
- # Grammar code. Like U, means uncountable noun.
37
- attr_reader :gc
38
- # Get the guided word for this definition entry, which is usually just one
39
- # word or a phrase. This does not exist when there is only one definition.
40
- # It is useful when there are many definitions for one word to distinguish
41
- # them.
42
- attr_reader :guided
43
- # Get the verb irregular form word. +word.verb.sp+ gets the simple past
44
- # tense of this verb.
45
- attr_reader :verb
46
-
47
- # Input +word+ and +entry_html+ are
48
- # { entry ID => its html definition source }
49
- def initialize(word, entry_html)
13
+ def initialize(word)
50
14
  @word = word
51
- @entry_id, @html = entry_html.flatten
52
- @html = Nokogiri::HTML(@html)
53
- @title_word = title_word # String
54
- @derived_words = derived_words # String or [String]
55
- @spelling_variant = spell_variant # String
56
- @head_variant = get_head_variant # [String]
57
- @body_variant = get_body_variant # [String]
58
- @inflection = get_inflection # [String]
59
- @phrase = get_phrase # [String]
60
- @is_idiom = is_idiom? # True or False
61
- @part_of_speech = pos # String or [String] or []
62
- @explanations = get_explanations # [Camdict::Explanation]
63
- @ipa = get_ipa # Struct uk:String,us:String,k:[],s:[]
64
- @pronunciation = get_pronunciation # Struct uk:Link, us:Link
65
- @region = get_region # String
66
- @usage = get_usage # String
67
- @gc = get_gc # String
68
- @plural = get_plural # String or [String]
69
- @guided = get_guided_word # String
70
- @verb = get_irregular # Struct Irregular
15
+ end
16
+
17
+ def parse(html)
18
+ get_ipa(html)
19
+ get_pronunciation(html)
20
+ entry(html)
21
+ self
71
22
  end
72
23
 
73
24
  private
74
- # Get the definition page title word, which is either a word or phrase.
25
+
26
+ def entry(html)
27
+ @senses ||= html.css('.entry-body__el').map { |e| get_senses(e) }.flatten
28
+ end
29
+
30
+ # Get the definition page title word, which is either a word or phrase.
75
31
  # This is necessary because it doesn't always get the searched
76
- # word exactly. For instance, searching baldness gets bald. This is
77
- # how the online dictionary is organised -- when words having
32
+ # word exactly. For instance, searching bald also gets baldness. This is
33
+ # how the online dictionary is organised -- when words have
78
34
  # the same root they often share the same explanations.
79
- # <h2 class="di-title cdo-section-title-hw">look at sth</h2>
80
- def title_word
81
- css_text ".di-title.cdo-section-title-hw"
35
+ # <h2 class="di-title cdo-section-title-hw"><span class="headword">
36
+ # look at sth</span></h2>
37
+ def title_word(html)
38
+ @title_word ||=
39
+ html.css('.di-title.cdo-section-title-hw .headword').first.text
82
40
  end
83
41
 
84
42
  # Some words have more than one derived word, like plagiarize has two.
85
43
  # Return an Array of derived words or nil when no derived word found
86
44
  # <span class="runon-title" title="Derived word">
87
45
  # <span class="w">plagiarism
88
- def derived_words
89
- node = @html.css('[title="Derived word"]')
90
- node.map { |e| e.content } unless node.empty?
46
+ def derived_words(html)
47
+ @derived_words ||= parse_derived_words(html)
48
+ end
49
+
50
+ def parse_derived_words(html)
51
+ node = html.css('[title="Derived word"]')
52
+ node.map(&:content) unless node.empty?
91
53
  end
92
54
 
93
- # Get the variant word or phrase inside di-info block but exclude those
94
- # inside phrase-block or spelling variant, from where is part of the
55
+ # Get the variant word or phrase inside di-info block but exclude those
56
+ # inside phrase-block or spelling variant, from where is part of the
95
57
  # definition header.
96
58
  # Such as, US/UK variant, or having the same meaning, but
97
59
  # different pronunciation.
98
- # There are more than one variant for one entry, such as ruby, aluminium
99
- def get_head_variant
100
- # aluminium: aluminum, Al
101
- node = @html.css(".di-info .var .v[title='Variant form']")
102
- node.map { |n| n.text } unless node.empty?
60
+ def get_head_variant(html)
61
+ node = html.css(".pos-header .var .v[title='Variant form']")
62
+ node.map(&:text) unless node.empty?
103
63
  end
104
64
 
105
- # Body variant is inside the di-body block. This is useful to get their
106
- # part of speech, such as e-book.
107
- def get_body_variant
108
- css_text ".di-body .v[title='Variant form']"
65
+ def head_variant?(html)
66
+ hv = get_head_variant(html)
67
+ hv && hv.include?(@word)
109
68
  end
110
69
 
111
70
  # Get spelling variants, which have same pronunciations.
112
- def spell_variant
113
- # plagiarize: plagiarise
114
- css_text(".spellvar .v[title='Variant form']")
115
- end
116
-
117
- # Irregular plural, like criteria
118
- def get_inflection
119
- css_text ".di-info .inf"
120
- end
121
-
122
- # Get phrase and its variant which are not flattened yet
123
- def get_phrase
124
- node = @html.css(".phrase, .phrase-info .v[title='Variant form']")
125
- node.map { |n| n.text } unless node.empty?
71
+ # plagiarize: plagiarise
72
+ def spell_variant(html)
73
+ css_text(html, ".spellvar .v[title='Variant form']")
126
74
  end
127
75
 
128
76
  # Where are the searched word's part of speech, IPAs, pronunciations
129
77
  # It could be found either at the position of "title" or "derived",
130
- # or "head_variant", "spellvar", "phrase", "idiom".
78
+ # "spellvar"
131
79
  # Other places are still "unknown".
132
- def where?
133
- location = "title" if @word == @title_word
134
- unless @title_word.nil?
135
- location = "title" if @title_word.include?("/") &&
136
- @title_word.flatten.include?(@word)
137
- end
138
- location = "idiom" if @is_idiom && @title_word.include?(@word)
139
- unless @spelling_variant.nil?
140
- # spelling variant is treated as "title word"
141
- location = "spellvar" if @spelling_variant == @word
142
- end
143
- unless @head_variant.nil?
144
- location = "head_variant" if @head_variant.include? @word
145
- end
146
- location ="body_variant" if @body_variant && @body_variant == @word
147
- location = "inflection" if @inflection && @inflection == @word
148
- unless @derived_words.nil?
149
- if @derived_words.include? @word
150
- unless location.nil?
151
- #'ruby' has two locations title and derived
152
- location = [location, "derived"]
153
- else
154
- location = "derived"
155
- end
156
- end
157
- end
158
- unless @phrase.nil?
159
- location = "phrase" if @phrase.has?(@word) && @word.include?(" ")
160
- # rubbers has no space, but it's treated as a phrase.
161
- location = "phrase" if @phrase.include? @word
162
- end
163
- location ||= "unknown"
164
- end
165
-
166
- # * When the searched word is a title word
167
- # <span class="di-info">
168
- # For noun, verb, adj, adv, pronoun, prep, conj, exclamation:
169
- # <span class="posgram">
170
- # <span class="pos" title="A word that ...">noun</span>
171
- # For phrasal verb: reach out to sb
172
- # <span class="anc-info-head">
173
- # <span class="pos" title="Verb with an adverb ...">phrasal verb</span>
174
- # ... same as above line ... verb ...
175
- # For idiom:
176
- # "curiosity killed the cat"
177
- # <span class="lab" title="A short, well-know ...">
178
- # <span class="usage" title="A short ...">saying</span>
179
- # or "can't get your head around sth"
180
- # <span class="usage" title="A short ...">informal</span>
181
- # or "set/put the seal on sth" and many other idioms have no di-info, but
182
- # all should have di-body idiom-block idiom-body
183
- # * When the searched word is a derived word
184
- # <span class="runon">...<span class="runon-info">
185
- # <span class="posgram"><span class="pos">noun
186
- # * When there are more than one part of speech on the same page, like,
187
- # 'ruby': adjective and noun are both returned.
188
- # * When the dictionary has no direct answer - unknown
189
- def pos
190
- pos_ret = []
191
- loc = where?
192
- loc = [loc] if loc.is_a? String
193
- loc.each { |loca|
194
- case loca
195
- when 'title', 'head_variant', 'body_variant', 'spellvar', 'inflection'
196
- # for phrasal verb
197
- node = @html.css(".anc-info-head > .pos")
198
- # center has two pos, noun,verb; centre: noun, adj.
199
- node = @html.css(".di-info .pos") if node.empty?
200
- pos_ret += node.map {|n| n.text} unless node.empty?
201
- when 'idiom'
202
- pos_ret << "idiom"
203
- when 'derived'
204
- derived_css(".runon-info .posgram .pos") { |node|
205
- pos_ret << node.text
206
- }
207
- when 'unknown'
208
- #"Unknown or don't have a part of speech"
209
- end
210
- }
211
- return pos_ret.pop if pos_ret.length == 1
212
- pos_ret
213
- end
214
-
215
- # Get explanations inside a definition block
216
- def get_explanations
217
- defblocks = @html.css(".sense-body > .def-block")
218
- exps = defblocks.map { |db|
219
- Camdict::Explanation.new(db)
220
- }
221
- loc = where?
222
- loc = [loc] if loc.is_a? String
223
- loc.each { |loca|
224
- case loca
225
- when 'title', 'head_variant', 'spellvar', 'inflection'
226
- # Got it already
227
- when 'derived'
228
- derived_css(".def-block") { |node|
229
- exps << Camdict::Explanation.new(node)
230
- }
231
- when 'phrase'
232
- phrase_css(".def-block") { |node|
233
- exps << Camdict::Explanation.new(node)
234
- }
235
- when 'idiom'
236
- node = @html.css(".idiom-block .def-block")
237
- exps << Camdict::Explanation.new(node)
238
- end
239
- }
240
- exps
241
- end
242
-
243
- # Parse html and check whether there is idiom related block.
244
- def is_idiom?
245
- node = @html.css(".idiom-block .idiom-body")
246
- true unless node.empty?
247
- end
248
-
249
- # A word may has uk and us written pronouncation. Superscripts in an IPA
250
- # are stored in an array, k for UK, s for US. The returned IPA Struct likes,
251
- # uk: String, us:String, k:[position1, length1, position2, length2],
252
- # s: [position, length]
253
- # Position is the superscript index in the IPA, and the next number length
254
- # is the length of this superscript.
255
- def get_ipa
256
- # UK is always the first one
257
- uknode = @html.at_css ".di-info .ipa"
258
- # phrase or idiom has no IPA
259
- return IPA.new if uknode.nil?
260
- ukbase = parse_ipa(uknode)
261
- # in most cases they are same
262
- usbase = ukbase
263
- loc = where?
264
- loc = [loc] if loc.is_a? String
265
- loc.each { |loca|
266
- case loca
267
- when 'title', 'spellvar'
268
- # US IPA is always followed by a symbol US
269
- # favorite: UK/US ipa (spellvar US s:favorite) => normal title word
270
- usnode = @html.css ".di-info img.ussymbol + .pron .ipa"
271
- usnode = usnode.first
272
- usbase = parse_ipa(usnode) unless usnode.nil?
273
- when 'inflection'
274
- usnode = @html.css ".info-group img.ussymbol + .pron .ipa"
275
- usbase = parse_ipa(usnode) unless usnode.nil?
276
- ukinfnode = @html.css ".info-group .pron .ipa"
277
- ukinf = parse_ipa(ukinfnode) unless ukinfnode.nil?
278
- if usbase[:baseipa] && usbase[:baseipa].include?('-')
279
- usbase = join_ipa(ukbase, usbase)
280
- end
281
- if ukinf[:baseipa] && ukinf[:baseipa].include?('-')
282
- ukbase = join_ipa(ukbase, ukinf)
283
- end
284
- when 'head_variant'
285
- # variant word's IPA can be got from its definition page when it is a
286
- # title word, or from the bracket. Like,
287
- # aluminium: UK ipa, (variant s:aluminum: US ipa) => in bracket
288
- # behove: UK ipa, US ipa (variant US s:behoove ipa) => in bracket
289
- # Many other variants have no IPA inside the bracket and title word's
290
- # IPA are not theirs.
291
- # eraser: UK ipa, US ipa US (variant UK s:rubber) => no IPA
292
- # plane: UK/US ipa (variant UK s:aeroplane, US s:airplane) => no IPA
293
- # aeroplane: UK ipa,US ipa (variant US s:airplane) => no IPA
294
- # ass: UK/US ipa, | variant UK s:arse => no IPA
295
- # sledge: UK ipa, (variant US s:sled) => no IPA
296
- # titbit: UK/US ipa, (variant US s:tidbit) => no IPA
297
- node = @html.css ".di-info .var .ipa"
298
- node.empty? ? (return IPA.new) : ukbase = usbase = parse_ipa(node)
299
- return IPA.new unless ukbase[:baseipa]
300
- when 'derived'
301
- derived_uk = nil
302
- derived_css('.ipa') { |node|
303
- derived_uk = parse_ipa(node.first) unless node.first.nil?
304
- }
305
- derived_css("img.ussymbol + .pron .ipa") { |node|
306
- usbase = parse_ipa(node.first) unless node.first.nil?
307
- }
308
- if derived_uk && derived_uk[:baseipa].include?('-')
309
- ukbase = join_ipa(ukbase, derived_uk)
310
- elsif derived_uk
311
- # uk base may come from the derived word, such as fermentation.
312
- ukbase = derived_uk
313
- end
314
- end
315
- }
316
- if usbase[:baseipa] && usbase[:baseipa].include?('-')
317
- usbase = join_ipa(ukbase, usbase)
318
- end
319
- uk, k = ukbase[:baseipa], ukbase[:sindex]
320
- us, s = usbase[:baseipa], usbase[:sindex]
321
- IPA.new(uk, k, us, s)
322
- end
323
-
324
- # Parse an ipa node to get the ipa string and its superscript index
325
- def parse_ipa(node)
326
- position = 0
327
- pindex = []
328
- node.children.each { |c|
329
- len = c.text.length
330
- pindex += [position,len] if c["class"] == "sp"
331
- position += len
332
- }
333
- pindex = nil if pindex.empty?
334
- { baseipa: node.text, sindex: pindex }
335
- end
336
-
337
- # A short IPA begins with a hyphen, which shares a common beginning with the
338
- # full IPA. Return the joined result for the short one. The superscripts
339
- # are added when the common parts have that or removed if the non common
340
- # parts override them.
341
- def join_ipa(full_sp, short_sp)
342
- # understand -sd-; preparation -Sddss-; imaginary -dssds-
343
- # plagiarise -ssdddsss; dictionary -dsss; painting -sdss
344
- # harmfully -d
345
- # toxic ssddd-; privacy sssd-; formally sssd-; harmful ssssds-
346
- full, basesp = full_sp[:baseipa], full_sp[:sindex]
347
- short, ussp = short_sp[:baseipa], short_sp[:sindex]
348
- slen = short.length
349
- flen = full.length
350
- if short[0] == '-'
351
- # head-tail hyphen
352
- if short[-1] == '-'
353
- center = short[1, slen-2]
354
- position = full.index(center[0])
355
- # match left and right
356
- if full.index(center[-1])
357
- left_matched_index = position
358
- right_matched_index = flen-1 - full.index(center[-1])
359
- rev_number = center.length - (right_matched_index -
360
- left_matched_index + 1)
361
- if left_matched_index && rev_number <= 0
362
- right_index = mix_spi(basesp, right_matched_index+1..flen-1)
363
- rev_right_index = revise_index(right_index, rev_number)
364
- findex = mix_spi(basesp, 0..left_matched_index-1,
365
- ussp, left_matched_index+1,
366
- rev_right_index, 0)
367
- ret = full[0..left_matched_index-1] + center +
368
- full[right_matched_index+1..flen-1]
369
- return {baseipa: ret, sindex: findex}
370
- end
371
- end
372
- # match left only
373
- if position && (slen - 2 < flen - 1 - position)
374
- findex = mix_spi(basesp, 0..position-1, ussp, position-1,
375
- basesp, position+slen-2..flen-1)
376
- ret = full[0..position-1] + center + full[position+slen-2..flen-1]
377
- return {baseipa: ret, sindex: findex}
378
- end
379
- position = full.index(center[-1])
380
- # match right only
381
- if position && (position + 1 > slen - 2)
382
- findex = mix_spi(basesp, 0..position-slen+2, ussp, position-slen+2,
383
- basesp, position+1..flen-1)
384
- ret = full[0..position-slen+2] + center + full[position+1..flen-1]
385
- return {baseipa: ret, sindex: findex}
386
- end
387
- # this is a simple solution to workaround the issue since no common
388
- # chars are found between the full and short ipa. Such as the word
389
- # 'difference', so just assign full to short
390
- begin
391
- raise "head-tail hyphen IPA #{short} for the word #{@word}" +
392
- "unmatched with #{full}."
393
- rescue RuntimeError
394
- return full_sp
395
- end
80
+ def where(html)
81
+ @location ||=
82
+ if on_title?(html) || spell_variant?(html) || head_variant?(html)
83
+ 'title'
84
+ elsif derived_word?(html)
85
+ 'derived'
396
86
  else
397
- # head hyphen
398
- right = short[1, slen-1]
399
- position = full.index(right[0])
400
- # match left #&& plagiarism fails this test
401
- if position #&& (flen-position >= slen-1)
402
- findex = mix_spi( basesp, 0..position-1, ussp, position-1)
403
- ret = full[0..position-1] + right
404
- return {baseipa: ret, sindex: findex}
405
- end
406
- position = full.index(right[-1])
407
- # match right
408
- if position && (position+1 >= slen-1)
409
- findex = mix_spi(basesp, 0..position-slen+1, ussp, position-slen+1)
410
- ret = full[0..position-slen+1] + right
411
- return {baseipa: ret, sindex: findex}
412
- end
413
- # unmatched case, like harmfulness
414
- findex = mix_spi(basesp, 0..flen-1, ussp, flen-1)
415
- ret = full + right
416
- return {baseipa: ret, sindex: findex}
87
+ 'unknown'
417
88
  end
418
- # tail hyphen
419
- elsif short[-1] == '-'
420
- left = short[0, slen-1]
421
- ret = left + full[slen-1..flen-1]
422
- findex = mix_spi( ussp, 0, basesp, slen-1..flen-1)
423
- return {baseipa: ret, sindex: findex}
424
- # begin with a primary or secondary stress mark like reunion
425
- elsif ["\u{2cc}", "\u{2c8}"].include? short[0]
426
- return full_sp # for simple, use uk ipa instead
427
- else
428
- raise ArgumentError,
429
- "IPA doesn't begin with a hyphen or stress, nor end with a hyphen. " +
430
- "Nothing is done."
431
- end
432
89
  end
433
90
 
434
- # +superscript_index+ is the superscript index for an IPA
435
- # +rev_number+ is the number that is used to revise the superscript index
436
- # after the common part of a us shorten ipa is joined with uk ipa, the
437
- # remainding part requires to be revised as it becomes longer or shorter.
438
- # return the revised superscript_index or nil if the passed
439
- # +superscript_index+ is nil.
440
- def revise_index(superscript_index, rev_number)
441
- return nil if superscript_index.nil?
442
- ret = []
443
- superscript_index.each_pair { |position, len|
444
- ret += [position+rev_number, len]
445
- }
446
- return nil if ret.empty?
447
- ret
91
+ def derived_word?(html)
92
+ return false unless derived_words(html) && @derived_words.include?(@word)
93
+ true
448
94
  end
449
95
 
450
- # Determine whether or not the range is included by the superscript index.
451
- # Return the pair of index array when it is included by that. Or return nil.
452
- def at_range(spindex, range)
453
- return if spindex.nil?
454
- ret = []
455
- spindex.each_pair { |position, len|
456
- ret += [position, len] if range.include? position
457
- }
458
- return nil if ret.empty?
459
- ret
460
- end
461
-
462
- # Mix the superscript index. Return mixed result or nil if no superscript.
463
- # Each pair of array element is superscript index and a Range/Fixnum.
464
- # All of them are part of two superscripts that need joining. Only the
465
- # superscripts in range are kept, and the index of the superscript with
466
- # a number is increased by this number. Finally, the joined superscript is
467
- # returned.
468
- def mix_spi(*p)
469
- findex = []
470
- p.each_pair { |spindex, r_or_n|
471
- if spindex and r_or_n.kind_of? Range
472
- aindex = at_range(spindex, r_or_n)
473
- findex += aindex if aindex
474
- elsif spindex and r_or_n.is_a? Fixnum
475
- bindex = []
476
- spindex.each_pair { |p, i|
477
- bindex += [p + r_or_n, i]
478
- }
479
- findex += bindex unless bindex.empty?
480
- end
481
- }
482
- return nil if findex.empty?
483
- findex
96
+ def on_title?(html)
97
+ @word == title_word(html)
484
98
  end
485
99
 
486
- # Get the UK/US pronunciation mp3/ogg links
487
- def get_pronunciation
488
- # parameter pron is a Nokigiri::Node
489
- links = lambda { |pron|
490
- unless pron.empty?
491
- pron.each { |a|
492
- return Link.new a['data-src-mp3'], a['data-src-ogg']
493
- }
494
- else
495
- return Link.new
496
- end
497
- }
498
- ukpron = uspron = []
499
- loc = where?
500
- loc = [loc] if loc.is_a? String
501
- loc.each { |loca|
502
- case loca
503
- when 'title', 'spellvar'
504
- ukpron = @html.css(".di-info a.pron-uk")
505
- uspron = @html.css(".di-info a.pron-us")
506
- when 'derived'
507
- derived_css("a.pron-uk") { |node|
508
- ukpron = node
509
- }
510
- derived_css("a.pron-us") { |node|
511
- uspron = node
512
- }
513
- end
514
- }
515
- uklinks = links.call(ukpron)
516
- uslinks = links.call(uspron)
517
- Pronunciation.new(uklinks, uslinks)
518
- end
519
-
520
- # Get a word or phrase's region. Possible values: UK, US.
521
- def get_region
522
- ret = nil
523
- loc = where?
524
- loc = [loc] if loc.is_a? String
525
- loc.each { |loca|
526
- case loca
527
- when 'title', 'idiom'
528
- ret = css_text(".di-info > .lab .region")
529
- ret = css_text(".di-info > .lab") unless ret && !ret.empty?
530
- when 'spellvar'
531
- ret = css_text(".spellvar .region")
532
- when 'head_variant'
533
- ret = css_text(".di-info .var .region")
534
- when 'derived'
535
- derived_css(".region") { |node|
536
- ret = node.text unless node.empty?
537
- }
538
- when 'phrase'
539
- phrase_css(".region") { |node|
540
- ret = node.text unless node.empty?
541
- }
542
- end
543
- }
544
- ret
545
- end
546
-
547
- # Parse and get the usage
548
- def get_usage
549
- ret = nil
550
- loc = where?
551
- loc = [loc] if loc.is_a? String
552
- loc.each { |loca|
553
- case loca
554
- when 'title', 'idiom', 'spellvar'
555
- ret = css_text(".di-info > .lab .usage")
556
- when 'head_variant'
557
- ret = css_text(".di-info .var .usage")
558
- when 'derived'
559
- derived_css(".usage") { |node|
560
- ret = node.text unless node.empty?
561
- }
562
- when 'phrase'
563
- phrase_css(".usage") { |node|
564
- ret = node.text unless node.empty?
565
- }
566
- end
567
- }
568
- ret
569
- end
570
-
571
- # Get grammar code
572
- def get_gc
573
- ret = nil
574
- loc = where?
575
- loc = [loc] if loc.is_a? String
576
- loc.each { |loca|
577
- case loca
578
- when 'title', 'idiom', 'spellvar', 'head_variant'
579
- ret = css_text(".di-info .gcs")
580
- when 'derived'
581
- derived_css(".gcs") { |node|
582
- ret = node.text unless node.empty?
583
- }
584
- when 'phrase'
585
- phrase_css(".gcs") { |node|
586
- ret = node.text unless node.empty?
587
- }
588
- end
589
- }
590
- ret
591
- end
592
-
593
- # Return values: String, [String], nil
594
- def get_plural
595
- return unless @part_of_speech.include? 'noun'
596
- ret = nil
597
- node = @html.css(".di-info .inf-group[type='plural'] .inf")
598
- unless node.empty?
599
- # fish has two
600
- if node.size > 1
601
- ret = node.map { |n| n.text }
602
- elsif node.size == 1
603
- ret = node.text
604
- end
605
- end
606
- ret
607
- end
608
-
609
- # Parse and get the guided word
610
- def get_guided_word
611
- gw = css_text(".di-info .gw")
612
- gw.delete "()" if gw
613
- end
614
-
615
- # Return nil or Irregular struct
616
- def get_irregular
617
- return unless @part_of_speech.include? 'verb'
618
- present = css_text(".di-info .inf-group[type='pres_part'] .inf")
619
- past = css_text(".di-info .inf-group[type='past'] .inf")
620
- sp = pp = past
621
- if past.nil? || past.empty?
622
- node = @html.css(".di-info span[class='inf']")
623
- unless node.empty?
624
- past = node.map { |n| n.text }
625
- sp, pp = past
626
- end
627
- end
628
- sp = css_text(".di-info .inf-group[type='past-tense'] .inf") if sp.nil?
629
- pp = css_text(".di-info .inf-group[type='past-part'] .inf") if pp.nil?
630
- if sp || pp || present
631
- return Irregular.new(sp, pp, present)
632
- end
100
+ # spelling variant is treated as "title word"
101
+ def spell_variant?(html)
102
+ spell_variant(html) == @word
633
103
  end
634
104
 
635
105
  include Camdict::Common
636
- # Limitation: some irregular words are not reachable(phenomena, arisen)
637
- # because they are not shown on the search result page. They can be got
638
- # by their original forms - phenomenon, arise.
639
-
106
+ include Camdict::IPA
107
+ include Camdict::Pronunciation
108
+ include Camdict::Entry
640
109
  end
641
110
  end