camdict 1.0.3 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,641 +1,110 @@
1
- require 'camdict/explanation'
1
+ # frozen_string_literal: true
2
+ require 'camdict/entry'
3
+ require 'camdict/ipa'
4
+ require 'camdict/pronunciation'
2
5
 
3
6
  module Camdict
4
-
5
- # Parse an html definition to get explanations, word, IPA, prounciation,
7
+ # Parse an html definition to get explanations, word, IPA, pronunciation,
6
8
  # part of speech, etc.
7
-
8
9
  class Definition
9
- # Struct IPA is the written pronunciations for UK/US.
10
- # +uk+: the UK IPA; +k+: the superscript index in UK IPA.
11
- # +us+: the US IPA; +s+: the superscript index in US IPA.
12
- IPA = Struct.new(:uk, :k, :us, :s)
13
- # Struct Pronunciation has two memebers.
14
- # Each +uk+/+us+ has its own mp3/ogg links.
15
- Pronunciation = Struct.new(:uk, :us)
16
- # Struct Link has two memembers +mp3+ and +ogg+, which are the http links.
17
- Link = Struct.new(:mp3, :ogg)
10
+ # Get senses for this definition.
11
+ attr_reader :senses
18
12
 
19
- # Simple Past, Past Participle, PRsent participle of a verb. Only irregular
20
- # verbs have these values. It struct memebers are +sp+, +pp+, +pr+.
21
- Irregular = Struct.new(:sp, :pp, :pr)
22
- # Get part of speech of a word or phrase.
23
- attr_reader :part_of_speech
24
- # Get explanations for this definition.
25
- attr_reader :explanations
26
- # Is the queried word/phrase an idiom?
27
- attr_reader :is_idiom
28
- # Get the IPA
29
- attr_reader :ipa
30
- # Get the pronunciation
31
- attr_reader :pronunciation
32
- # Get the region: UK or US
33
- attr_reader :region
34
- # Get the short usage
35
- attr_reader :usage
36
- # Grammar code. Like U, means uncountable noun.
37
- attr_reader :gc
38
- # Get the guided word for this definition entry, which is usually just one
39
- # word or a phrase. This does not exist when there is only one definition.
40
- # It is useful when there are many definitions for one word to distinguish
41
- # them.
42
- attr_reader :guided
43
- # Get the verb irregular form word. +word.verb.sp+ gets the simple past
44
- # tense of this verb.
45
- attr_reader :verb
46
-
47
- # Input +word+ and +entry_html+ are
48
- # { entry ID => its html definition source }
49
- def initialize(word, entry_html)
13
+ def initialize(word)
50
14
  @word = word
51
- @entry_id, @html = entry_html.flatten
52
- @html = Nokogiri::HTML(@html)
53
- @title_word = title_word # String
54
- @derived_words = derived_words # String or [String]
55
- @spelling_variant = spell_variant # String
56
- @head_variant = get_head_variant # [String]
57
- @body_variant = get_body_variant # [String]
58
- @inflection = get_inflection # [String]
59
- @phrase = get_phrase # [String]
60
- @is_idiom = is_idiom? # True or False
61
- @part_of_speech = pos # String or [String] or []
62
- @explanations = get_explanations # [Camdict::Explanation]
63
- @ipa = get_ipa # Struct uk:String,us:String,k:[],s:[]
64
- @pronunciation = get_pronunciation # Struct uk:Link, us:Link
65
- @region = get_region # String
66
- @usage = get_usage # String
67
- @gc = get_gc # String
68
- @plural = get_plural # String or [String]
69
- @guided = get_guided_word # String
70
- @verb = get_irregular # Struct Irregular
15
+ end
16
+
17
+ def parse(html)
18
+ get_ipa(html)
19
+ get_pronunciation(html)
20
+ entry(html)
21
+ self
71
22
  end
72
23
 
73
24
  private
74
- # Get the definition page title word, which is either a word or phrase.
25
+
26
+ def entry(html)
27
+ @senses ||= html.css('.entry-body__el').map { |e| get_senses(e) }.flatten
28
+ end
29
+
30
+ # Get the definition page title word, which is either a word or phrase.
75
31
  # This is necessary because it doesn't always get the searched
76
- # word exactly. For instance, searching baldness gets bald. This is
77
- # how the online dictionary is organised -- when words having
32
+ # word exactly. For instance, searching bald also gets baldness. This is
33
+ # how the online dictionary is organised -- when words have
78
34
  # the same root they often share the same explanations.
79
- # <h2 class="di-title cdo-section-title-hw">look at sth</h2>
80
- def title_word
81
- css_text ".di-title.cdo-section-title-hw"
35
+ # <h2 class="di-title cdo-section-title-hw"><span class="headword">
36
+ # look at sth</span></h2>
37
+ def title_word(html)
38
+ @title_word ||=
39
+ html.css('.di-title.cdo-section-title-hw .headword').first.text
82
40
  end
83
41
 
84
42
  # Some words have more than one derived word; plagiarize, for example, has two.
85
43
  # Return an Array of derived words or nil when no derived word found
86
44
  # <span class="runon-title" title="Derived word">
87
45
  # <span class="w">plagiarism
88
- def derived_words
89
- node = @html.css('[title="Derived word"]')
90
- node.map { |e| e.content } unless node.empty?
46
+ def derived_words(html)
47
+ @derived_words ||= parse_derived_words(html)
48
+ end
49
+
50
+ def parse_derived_words(html)
51
+ node = html.css('[title="Derived word"]')
52
+ node.map(&:content) unless node.empty?
91
53
  end
92
54
 
93
- # Get the variant word or phrase inside di-info block but exclude those
94
- # inside phrase-block or spelling variant, from where is part of the
55
+ # Get the variant word or phrase inside di-info block but exclude those
56
+ # inside phrase-block or spelling variant, from where is part of the
95
57
  # definition header.
96
58
  # Such as, US/UK variant, or having the same meaning, but
97
59
  # different pronunciation.
98
- # There are more than one variant for one entry, such as ruby, aluminium
99
- def get_head_variant
100
- # aluminium: aluminum, Al
101
- node = @html.css(".di-info .var .v[title='Variant form']")
102
- node.map { |n| n.text } unless node.empty?
60
+ def get_head_variant(html)
61
+ node = html.css(".pos-header .var .v[title='Variant form']")
62
+ node.map(&:text) unless node.empty?
103
63
  end
104
64
 
105
- # Body variant is inside the di-body block. This is useful to get their
106
- # part of speech, such as e-book.
107
- def get_body_variant
108
- css_text ".di-body .v[title='Variant form']"
65
+ def head_variant?(html)
66
+ hv = get_head_variant(html)
67
+ hv && hv.include?(@word)
109
68
  end
110
69
 
111
70
  # Get spelling variants, which have same pronunciations.
112
- def spell_variant
113
- # plagiarize: plagiarise
114
- css_text(".spellvar .v[title='Variant form']")
115
- end
116
-
117
- # Irregular plural, like criteria
118
- def get_inflection
119
- css_text ".di-info .inf"
120
- end
121
-
122
- # Get phrase and its variant which are not flattened yet
123
- def get_phrase
124
- node = @html.css(".phrase, .phrase-info .v[title='Variant form']")
125
- node.map { |n| n.text } unless node.empty?
71
+ # plagiarize: plagiarise
72
+ def spell_variant(html)
73
+ css_text(html, ".spellvar .v[title='Variant form']")
126
74
  end
127
75
 
128
76
  # Where are the searched word's part of speech, IPAs, pronunciations
129
77
  # It could be found either at the position of "title" or "derived",
130
- # or "head_variant", "spellvar", "phrase", "idiom".
78
+ # "spellvar"
131
79
  # Other places are still "unknown".
132
- def where?
133
- location = "title" if @word == @title_word
134
- unless @title_word.nil?
135
- location = "title" if @title_word.include?("/") &&
136
- @title_word.flatten.include?(@word)
137
- end
138
- location = "idiom" if @is_idiom && @title_word.include?(@word)
139
- unless @spelling_variant.nil?
140
- # spelling variant is treated as "title word"
141
- location = "spellvar" if @spelling_variant == @word
142
- end
143
- unless @head_variant.nil?
144
- location = "head_variant" if @head_variant.include? @word
145
- end
146
- location ="body_variant" if @body_variant && @body_variant == @word
147
- location = "inflection" if @inflection && @inflection == @word
148
- unless @derived_words.nil?
149
- if @derived_words.include? @word
150
- unless location.nil?
151
- #'ruby' has two locations title and derived
152
- location = [location, "derived"]
153
- else
154
- location = "derived"
155
- end
156
- end
157
- end
158
- unless @phrase.nil?
159
- location = "phrase" if @phrase.has?(@word) && @word.include?(" ")
160
- # rubbers has no space, but it's treated as a phrase.
161
- location = "phrase" if @phrase.include? @word
162
- end
163
- location ||= "unknown"
164
- end
165
-
166
- # * When the searched word is a title word
167
- # <span class="di-info">
168
- # For noun, verb, adj, adv, pronoun, prep, conj, exclamation:
169
- # <span class="posgram">
170
- # <span class="pos" title="A word that ...">noun</span>
171
- # For phrasal verb: reach out to sb
172
- # <span class="anc-info-head">
173
- # <span class="pos" title="Verb with an adverb ...">phrasal verb</span>
174
- # ... same as above line ... verb ...
175
- # For idiom:
176
- # "curiosity killed the cat"
177
- # <span class="lab" title="A short, well-know ...">
178
- # <span class="usage" title="A short ...">saying</span>
179
- # or "can't get your head around sth"
180
- # <span class="usage" title="A short ...">informal</span>
181
- # or "set/put the seal on sth" and many other idioms have no di-info, but
182
- # all should have di-body idiom-block idiom-body
183
- # * When the searched word is a derived word
184
- # <span class="runon">...<span class="runon-info">
185
- # <span class="posgram"><span class="pos">noun
186
- # * When there are more than one part of speech on the same page, like,
187
- # 'ruby': adjective and noun are both returned.
188
- # * When the dictionary has no direct answer - unknown
189
- def pos
190
- pos_ret = []
191
- loc = where?
192
- loc = [loc] if loc.is_a? String
193
- loc.each { |loca|
194
- case loca
195
- when 'title', 'head_variant', 'body_variant', 'spellvar', 'inflection'
196
- # for phrasal verb
197
- node = @html.css(".anc-info-head > .pos")
198
- # center has two pos, noun,verb; centre: noun, adj.
199
- node = @html.css(".di-info .pos") if node.empty?
200
- pos_ret += node.map {|n| n.text} unless node.empty?
201
- when 'idiom'
202
- pos_ret << "idiom"
203
- when 'derived'
204
- derived_css(".runon-info .posgram .pos") { |node|
205
- pos_ret << node.text
206
- }
207
- when 'unknown'
208
- #"Unknown or don't have a part of speech"
209
- end
210
- }
211
- return pos_ret.pop if pos_ret.length == 1
212
- pos_ret
213
- end
214
-
215
- # Get explanations inside a definition block
216
- def get_explanations
217
- defblocks = @html.css(".sense-body > .def-block")
218
- exps = defblocks.map { |db|
219
- Camdict::Explanation.new(db)
220
- }
221
- loc = where?
222
- loc = [loc] if loc.is_a? String
223
- loc.each { |loca|
224
- case loca
225
- when 'title', 'head_variant', 'spellvar', 'inflection'
226
- # Got it already
227
- when 'derived'
228
- derived_css(".def-block") { |node|
229
- exps << Camdict::Explanation.new(node)
230
- }
231
- when 'phrase'
232
- phrase_css(".def-block") { |node|
233
- exps << Camdict::Explanation.new(node)
234
- }
235
- when 'idiom'
236
- node = @html.css(".idiom-block .def-block")
237
- exps << Camdict::Explanation.new(node)
238
- end
239
- }
240
- exps
241
- end
242
-
243
- # Parse html and check whether there is idiom related block.
244
- def is_idiom?
245
- node = @html.css(".idiom-block .idiom-body")
246
- true unless node.empty?
247
- end
248
-
249
- # A word may has uk and us written pronouncation. Superscripts in an IPA
250
- # are stored in an array, k for UK, s for US. The returned IPA Struct likes,
251
- # uk: String, us:String, k:[position1, length1, position2, length2],
252
- # s: [position, length]
253
- # Position is the superscript index in the IPA, and the next number length
254
- # is the length of this superscript.
255
- def get_ipa
256
- # UK is always the first one
257
- uknode = @html.at_css ".di-info .ipa"
258
- # phrase or idiom has no IPA
259
- return IPA.new if uknode.nil?
260
- ukbase = parse_ipa(uknode)
261
- # in most cases they are same
262
- usbase = ukbase
263
- loc = where?
264
- loc = [loc] if loc.is_a? String
265
- loc.each { |loca|
266
- case loca
267
- when 'title', 'spellvar'
268
- # US IPA is always followed by a symbol US
269
- # favorite: UK/US ipa (spellvar US s:favorite) => normal title word
270
- usnode = @html.css ".di-info img.ussymbol + .pron .ipa"
271
- usnode = usnode.first
272
- usbase = parse_ipa(usnode) unless usnode.nil?
273
- when 'inflection'
274
- usnode = @html.css ".info-group img.ussymbol + .pron .ipa"
275
- usbase = parse_ipa(usnode) unless usnode.nil?
276
- ukinfnode = @html.css ".info-group .pron .ipa"
277
- ukinf = parse_ipa(ukinfnode) unless ukinfnode.nil?
278
- if usbase[:baseipa] && usbase[:baseipa].include?('-')
279
- usbase = join_ipa(ukbase, usbase)
280
- end
281
- if ukinf[:baseipa] && ukinf[:baseipa].include?('-')
282
- ukbase = join_ipa(ukbase, ukinf)
283
- end
284
- when 'head_variant'
285
- # variant word's IPA can be got from its definition page when it is a
286
- # title word, or from the bracket. Like,
287
- # aluminium: UK ipa, (variant s:aluminum: US ipa) => in bracket
288
- # behove: UK ipa, US ipa (variant US s:behoove ipa) => in bracket
289
- # Many other variants have no IPA inside the bracket and title word's
290
- # IPA are not theirs.
291
- # eraser: UK ipa, US ipa US (variant UK s:rubber) => no IPA
292
- # plane: UK/US ipa (variant UK s:aeroplane, US s:airplane) => no IPA
293
- # aeroplane: UK ipa,US ipa (variant US s:airplane) => no IPA
294
- # ass: UK/US ipa, | variant UK s:arse => no IPA
295
- # sledge: UK ipa, (variant US s:sled) => no IPA
296
- # titbit: UK/US ipa, (variant US s:tidbit) => no IPA
297
- node = @html.css ".di-info .var .ipa"
298
- node.empty? ? (return IPA.new) : ukbase = usbase = parse_ipa(node)
299
- return IPA.new unless ukbase[:baseipa]
300
- when 'derived'
301
- derived_uk = nil
302
- derived_css('.ipa') { |node|
303
- derived_uk = parse_ipa(node.first) unless node.first.nil?
304
- }
305
- derived_css("img.ussymbol + .pron .ipa") { |node|
306
- usbase = parse_ipa(node.first) unless node.first.nil?
307
- }
308
- if derived_uk && derived_uk[:baseipa].include?('-')
309
- ukbase = join_ipa(ukbase, derived_uk)
310
- elsif derived_uk
311
- # uk base may come from the derived word, such as fermentation.
312
- ukbase = derived_uk
313
- end
314
- end
315
- }
316
- if usbase[:baseipa] && usbase[:baseipa].include?('-')
317
- usbase = join_ipa(ukbase, usbase)
318
- end
319
- uk, k = ukbase[:baseipa], ukbase[:sindex]
320
- us, s = usbase[:baseipa], usbase[:sindex]
321
- IPA.new(uk, k, us, s)
322
- end
323
-
324
- # Parse an ipa node to get the ipa string and its superscript index
325
- def parse_ipa(node)
326
- position = 0
327
- pindex = []
328
- node.children.each { |c|
329
- len = c.text.length
330
- pindex += [position,len] if c["class"] == "sp"
331
- position += len
332
- }
333
- pindex = nil if pindex.empty?
334
- { baseipa: node.text, sindex: pindex }
335
- end
336
-
337
- # A short IPA begins with a hyphen, which shares a common beginning with the
338
- # full IPA. Return the joined result for the short one. The superscripts
339
- # are added when the common parts have that or removed if the non common
340
- # parts override them.
341
- def join_ipa(full_sp, short_sp)
342
- # understand -sd-; preparation -Sddss-; imaginary -dssds-
343
- # plagiarise -ssdddsss; dictionary -dsss; painting -sdss
344
- # harmfully -d
345
- # toxic ssddd-; privacy sssd-; formally sssd-; harmful ssssds-
346
- full, basesp = full_sp[:baseipa], full_sp[:sindex]
347
- short, ussp = short_sp[:baseipa], short_sp[:sindex]
348
- slen = short.length
349
- flen = full.length
350
- if short[0] == '-'
351
- # head-tail hyphen
352
- if short[-1] == '-'
353
- center = short[1, slen-2]
354
- position = full.index(center[0])
355
- # match left and right
356
- if full.index(center[-1])
357
- left_matched_index = position
358
- right_matched_index = flen-1 - full.index(center[-1])
359
- rev_number = center.length - (right_matched_index -
360
- left_matched_index + 1)
361
- if left_matched_index && rev_number <= 0
362
- right_index = mix_spi(basesp, right_matched_index+1..flen-1)
363
- rev_right_index = revise_index(right_index, rev_number)
364
- findex = mix_spi(basesp, 0..left_matched_index-1,
365
- ussp, left_matched_index+1,
366
- rev_right_index, 0)
367
- ret = full[0..left_matched_index-1] + center +
368
- full[right_matched_index+1..flen-1]
369
- return {baseipa: ret, sindex: findex}
370
- end
371
- end
372
- # match left only
373
- if position && (slen - 2 < flen - 1 - position)
374
- findex = mix_spi(basesp, 0..position-1, ussp, position-1,
375
- basesp, position+slen-2..flen-1)
376
- ret = full[0..position-1] + center + full[position+slen-2..flen-1]
377
- return {baseipa: ret, sindex: findex}
378
- end
379
- position = full.index(center[-1])
380
- # match right only
381
- if position && (position + 1 > slen - 2)
382
- findex = mix_spi(basesp, 0..position-slen+2, ussp, position-slen+2,
383
- basesp, position+1..flen-1)
384
- ret = full[0..position-slen+2] + center + full[position+1..flen-1]
385
- return {baseipa: ret, sindex: findex}
386
- end
387
- # this is a simple solution to workaround the issue since no common
388
- # chars are found between the full and short ipa. Such as the word
389
- # 'difference', so just assign full to short
390
- begin
391
- raise "head-tail hyphen IPA #{short} for the word #{@word}" +
392
- "unmatched with #{full}."
393
- rescue RuntimeError
394
- return full_sp
395
- end
80
+ def where(html)
81
+ @location ||=
82
+ if on_title?(html) || spell_variant?(html) || head_variant?(html)
83
+ 'title'
84
+ elsif derived_word?(html)
85
+ 'derived'
396
86
  else
397
- # head hyphen
398
- right = short[1, slen-1]
399
- position = full.index(right[0])
400
- # match left #&& plagiarism fails this test
401
- if position #&& (flen-position >= slen-1)
402
- findex = mix_spi( basesp, 0..position-1, ussp, position-1)
403
- ret = full[0..position-1] + right
404
- return {baseipa: ret, sindex: findex}
405
- end
406
- position = full.index(right[-1])
407
- # match right
408
- if position && (position+1 >= slen-1)
409
- findex = mix_spi(basesp, 0..position-slen+1, ussp, position-slen+1)
410
- ret = full[0..position-slen+1] + right
411
- return {baseipa: ret, sindex: findex}
412
- end
413
- # unmatched case, like harmfulness
414
- findex = mix_spi(basesp, 0..flen-1, ussp, flen-1)
415
- ret = full + right
416
- return {baseipa: ret, sindex: findex}
87
+ 'unknown'
417
88
  end
418
- # tail hyphen
419
- elsif short[-1] == '-'
420
- left = short[0, slen-1]
421
- ret = left + full[slen-1..flen-1]
422
- findex = mix_spi( ussp, 0, basesp, slen-1..flen-1)
423
- return {baseipa: ret, sindex: findex}
424
- # begin with a primary or secondary stress mark like reunion
425
- elsif ["\u{2cc}", "\u{2c8}"].include? short[0]
426
- return full_sp # for simple, use uk ipa instead
427
- else
428
- raise ArgumentError,
429
- "IPA doesn't begin with a hyphen or stress, nor end with a hyphen. " +
430
- "Nothing is done."
431
- end
432
89
  end
433
90
 
434
- # +superscript_index+ is the superscript index for an IPA
435
- # +rev_number+ is the number that is used to revise the superscript index
436
- # after the common part of a us shorten ipa is joined with uk ipa, the
437
- # remainding part requires to be revised as it becomes longer or shorter.
438
- # return the revised superscript_index or nil if the passed
439
- # +superscript_index+ is nil.
440
- def revise_index(superscript_index, rev_number)
441
- return nil if superscript_index.nil?
442
- ret = []
443
- superscript_index.each_pair { |position, len|
444
- ret += [position+rev_number, len]
445
- }
446
- return nil if ret.empty?
447
- ret
91
+ def derived_word?(html)
92
+ return false unless derived_words(html) && @derived_words.include?(@word)
93
+ true
448
94
  end
449
95
 
450
- # Determine whether or not the range is included by the superscript index.
451
- # Return the pair of index array when it is included by that. Or return nil.
452
- def at_range(spindex, range)
453
- return if spindex.nil?
454
- ret = []
455
- spindex.each_pair { |position, len|
456
- ret += [position, len] if range.include? position
457
- }
458
- return nil if ret.empty?
459
- ret
460
- end
461
-
462
- # Mix the superscript index. Return mixed result or nil if no superscript.
463
- # Each pair of array element is superscript index and a Range/Fixnum.
464
- # All of them are part of two superscripts that need joining. Only the
465
- # superscripts in range are kept, and the index of the superscript with
466
- # a number is increased by this number. Finally, the joined superscript is
467
- # returned.
468
- def mix_spi(*p)
469
- findex = []
470
- p.each_pair { |spindex, r_or_n|
471
- if spindex and r_or_n.kind_of? Range
472
- aindex = at_range(spindex, r_or_n)
473
- findex += aindex if aindex
474
- elsif spindex and r_or_n.is_a? Fixnum
475
- bindex = []
476
- spindex.each_pair { |p, i|
477
- bindex += [p + r_or_n, i]
478
- }
479
- findex += bindex unless bindex.empty?
480
- end
481
- }
482
- return nil if findex.empty?
483
- findex
96
+ def on_title?(html)
97
+ @word == title_word(html)
484
98
  end
485
99
 
486
- # Get the UK/US pronunciation mp3/ogg links
487
- def get_pronunciation
488
- # parameter pron is a Nokigiri::Node
489
- links = lambda { |pron|
490
- unless pron.empty?
491
- pron.each { |a|
492
- return Link.new a['data-src-mp3'], a['data-src-ogg']
493
- }
494
- else
495
- return Link.new
496
- end
497
- }
498
- ukpron = uspron = []
499
- loc = where?
500
- loc = [loc] if loc.is_a? String
501
- loc.each { |loca|
502
- case loca
503
- when 'title', 'spellvar'
504
- ukpron = @html.css(".di-info a.pron-uk")
505
- uspron = @html.css(".di-info a.pron-us")
506
- when 'derived'
507
- derived_css("a.pron-uk") { |node|
508
- ukpron = node
509
- }
510
- derived_css("a.pron-us") { |node|
511
- uspron = node
512
- }
513
- end
514
- }
515
- uklinks = links.call(ukpron)
516
- uslinks = links.call(uspron)
517
- Pronunciation.new(uklinks, uslinks)
518
- end
519
-
520
- # Get a word or phrase's region. Possible values: UK, US.
521
- def get_region
522
- ret = nil
523
- loc = where?
524
- loc = [loc] if loc.is_a? String
525
- loc.each { |loca|
526
- case loca
527
- when 'title', 'idiom'
528
- ret = css_text(".di-info > .lab .region")
529
- ret = css_text(".di-info > .lab") unless ret && !ret.empty?
530
- when 'spellvar'
531
- ret = css_text(".spellvar .region")
532
- when 'head_variant'
533
- ret = css_text(".di-info .var .region")
534
- when 'derived'
535
- derived_css(".region") { |node|
536
- ret = node.text unless node.empty?
537
- }
538
- when 'phrase'
539
- phrase_css(".region") { |node|
540
- ret = node.text unless node.empty?
541
- }
542
- end
543
- }
544
- ret
545
- end
546
-
547
- # Parse and get the usage
548
- def get_usage
549
- ret = nil
550
- loc = where?
551
- loc = [loc] if loc.is_a? String
552
- loc.each { |loca|
553
- case loca
554
- when 'title', 'idiom', 'spellvar'
555
- ret = css_text(".di-info > .lab .usage")
556
- when 'head_variant'
557
- ret = css_text(".di-info .var .usage")
558
- when 'derived'
559
- derived_css(".usage") { |node|
560
- ret = node.text unless node.empty?
561
- }
562
- when 'phrase'
563
- phrase_css(".usage") { |node|
564
- ret = node.text unless node.empty?
565
- }
566
- end
567
- }
568
- ret
569
- end
570
-
571
- # Get grammar code
572
- def get_gc
573
- ret = nil
574
- loc = where?
575
- loc = [loc] if loc.is_a? String
576
- loc.each { |loca|
577
- case loca
578
- when 'title', 'idiom', 'spellvar', 'head_variant'
579
- ret = css_text(".di-info .gcs")
580
- when 'derived'
581
- derived_css(".gcs") { |node|
582
- ret = node.text unless node.empty?
583
- }
584
- when 'phrase'
585
- phrase_css(".gcs") { |node|
586
- ret = node.text unless node.empty?
587
- }
588
- end
589
- }
590
- ret
591
- end
592
-
593
- # Return values: String, [String], nil
594
- def get_plural
595
- return unless @part_of_speech.include? 'noun'
596
- ret = nil
597
- node = @html.css(".di-info .inf-group[type='plural'] .inf")
598
- unless node.empty?
599
- # fish has two
600
- if node.size > 1
601
- ret = node.map { |n| n.text }
602
- elsif node.size == 1
603
- ret = node.text
604
- end
605
- end
606
- ret
607
- end
608
-
609
- # Parse and get the guided word
610
- def get_guided_word
611
- gw = css_text(".di-info .gw")
612
- gw.delete "()" if gw
613
- end
614
-
615
- # Return nil or Irregular struct
616
- def get_irregular
617
- return unless @part_of_speech.include? 'verb'
618
- present = css_text(".di-info .inf-group[type='pres_part'] .inf")
619
- past = css_text(".di-info .inf-group[type='past'] .inf")
620
- sp = pp = past
621
- if past.nil? || past.empty?
622
- node = @html.css(".di-info span[class='inf']")
623
- unless node.empty?
624
- past = node.map { |n| n.text }
625
- sp, pp = past
626
- end
627
- end
628
- sp = css_text(".di-info .inf-group[type='past-tense'] .inf") if sp.nil?
629
- pp = css_text(".di-info .inf-group[type='past-part'] .inf") if pp.nil?
630
- if sp || pp || present
631
- return Irregular.new(sp, pp, present)
632
- end
100
+ # spelling variant is treated as "title word"
101
+ def spell_variant?(html)
102
+ spell_variant(html) == @word
633
103
  end
634
104
 
635
105
  include Camdict::Common
636
- # Limitation: some irregular words are not reachable(phenomena, arisen)
637
- # because they are not shown on the search result page. They can be got
638
- # by their original forms - phenomenon, arise.
639
-
106
+ include Camdict::IPA
107
+ include Camdict::Pronunciation
108
+ include Camdict::Entry
640
109
  end
641
110
  end