camdict 1.0.3 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +28 -33
- data/lib/camdict/array_ext.rb +37 -0
- data/lib/camdict/client.rb +133 -97
- data/lib/camdict/common.rb +25 -143
- data/lib/camdict/definition.rb +65 -596
- data/lib/camdict/entry.rb +76 -0
- data/lib/camdict/exception.rb +5 -0
- data/lib/camdict/explanation.rb +29 -66
- data/lib/camdict/http_client.rb +14 -10
- data/lib/camdict/ipa.rb +52 -0
- data/lib/camdict/pronunciation.rb +53 -0
- data/lib/camdict/sentence.rb +38 -0
- data/lib/camdict/string_ext.rb +141 -0
- data/lib/camdict/word.rb +83 -17
- data/test/debug.rb +60 -0
- data/test/helper.rb +2 -0
- data/test/itest_client.rb +39 -8
- data/test/itest_definition.rb +24 -75
- data/test/itest_entry.rb +37 -0
- data/test/itest_explanation.rb +41 -20
- data/test/itest_ipa.rb +105 -0
- data/test/itest_pronunciation.rb +74 -0
- data/test/itest_word.rb +49 -0
- data/test/test_array_ext.rb +23 -0
- data/test/test_client.rb +35 -42
- data/test/test_common.rb +22 -78
- data/test/test_explanation.rb +21 -25
- data/test/test_http_client.rb +27 -13
- data/test/test_string_ext.rb +95 -0
- metadata +42 -7
- data/test/test_definition.rb +0 -345
data/lib/camdict/definition.rb
CHANGED
@@ -1,641 +1,110 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require 'camdict/entry'
|
3
|
+
require 'camdict/ipa'
|
4
|
+
require 'camdict/pronunciation'
|
2
5
|
|
3
6
|
module Camdict
|
4
|
-
|
5
|
-
# Parse an html definition to get explanations, word, IPA, prounciation,
|
7
|
+
# Parse an html definition to get explanations, word, IPA, pronunciation,
|
6
8
|
# part of speech, etc.
|
7
|
-
|
8
9
|
class Definition
|
9
|
-
#
|
10
|
-
|
11
|
-
# +us+: the US IPA; +s+: the superscript index in US IPA.
|
12
|
-
IPA = Struct.new(:uk, :k, :us, :s)
|
13
|
-
# Struct Pronunciation has two memebers.
|
14
|
-
# Each +uk+/+us+ has its own mp3/ogg links.
|
15
|
-
Pronunciation = Struct.new(:uk, :us)
|
16
|
-
# Struct Link has two memembers +mp3+ and +ogg+, which are the http links.
|
17
|
-
Link = Struct.new(:mp3, :ogg)
|
10
|
+
# Get senses for this definition.
|
11
|
+
attr_reader :senses
|
18
12
|
|
19
|
-
|
20
|
-
# verbs have these values. It struct memebers are +sp+, +pp+, +pr+.
|
21
|
-
Irregular = Struct.new(:sp, :pp, :pr)
|
22
|
-
# Get part of speech of a word or phrase.
|
23
|
-
attr_reader :part_of_speech
|
24
|
-
# Get explanations for this definition.
|
25
|
-
attr_reader :explanations
|
26
|
-
# Is the queried word/phrase an idiom?
|
27
|
-
attr_reader :is_idiom
|
28
|
-
# Get the IPA
|
29
|
-
attr_reader :ipa
|
30
|
-
# Get the pronunciation
|
31
|
-
attr_reader :pronunciation
|
32
|
-
# Get the region: UK or US
|
33
|
-
attr_reader :region
|
34
|
-
# Get the short usage
|
35
|
-
attr_reader :usage
|
36
|
-
# Grammar code. Like U, means uncountable noun.
|
37
|
-
attr_reader :gc
|
38
|
-
# Get the guided word for this definition entry, which is usually just one
|
39
|
-
# word or a phrase. This does not exist when there is only one definition.
|
40
|
-
# It is useful when there are many definitions for one word to distinguish
|
41
|
-
# them.
|
42
|
-
attr_reader :guided
|
43
|
-
# Get the verb irregular form word. +word.verb.sp+ gets the simple past
|
44
|
-
# tense of this verb.
|
45
|
-
attr_reader :verb
|
46
|
-
|
47
|
-
# Input +word+ and +entry_html+ are
|
48
|
-
# { entry ID => its html definition source }
|
49
|
-
def initialize(word, entry_html)
|
13
|
+
def initialize(word)
|
50
14
|
@word = word
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
@inflection = get_inflection # [String]
|
59
|
-
@phrase = get_phrase # [String]
|
60
|
-
@is_idiom = is_idiom? # True or False
|
61
|
-
@part_of_speech = pos # String or [String] or []
|
62
|
-
@explanations = get_explanations # [Camdict::Explanation]
|
63
|
-
@ipa = get_ipa # Struct uk:String,us:String,k:[],s:[]
|
64
|
-
@pronunciation = get_pronunciation # Struct uk:Link, us:Link
|
65
|
-
@region = get_region # String
|
66
|
-
@usage = get_usage # String
|
67
|
-
@gc = get_gc # String
|
68
|
-
@plural = get_plural # String or [String]
|
69
|
-
@guided = get_guided_word # String
|
70
|
-
@verb = get_irregular # Struct Irregular
|
15
|
+
end
|
16
|
+
|
17
|
+
def parse(html)
|
18
|
+
get_ipa(html)
|
19
|
+
get_pronunciation(html)
|
20
|
+
entry(html)
|
21
|
+
self
|
71
22
|
end
|
72
23
|
|
73
24
|
private
|
74
|
-
|
25
|
+
|
26
|
+
def entry(html)
|
27
|
+
@senses ||= html.css('.entry-body__el').map { |e| get_senses(e) }.flatten
|
28
|
+
end
|
29
|
+
|
30
|
+
# Get the definition page title word, which is either a word or phrase.
|
75
31
|
# This is necessary because it doesn't always get the searched
|
76
|
-
# word exactly. For instance, searching
|
77
|
-
# how the online dictionary is organised -- when words having
|
32
|
+
# word exactly. For instance, searching bald also gets baldness. This is
|
33
|
+
# how the online dictionary is organised -- when words have
|
78
34
|
# the same root they often share the same explanations.
|
79
|
-
# <h2 class="di-title cdo-section-title-hw"
|
80
|
-
|
81
|
-
|
35
|
+
# <h2 class="di-title cdo-section-title-hw"><span class="headword">
|
36
|
+
# look at sth</span></h2>
|
37
|
+
def title_word(html)
|
38
|
+
@title_word ||=
|
39
|
+
html.css('.di-title.cdo-section-title-hw .headword').first.text
|
82
40
|
end
|
83
41
|
|
84
42
|
# Some words have more than one derived word, e.g. plagiarize has two.
|
85
43
|
# Return an Array of derived words or nil when no derived word found
|
86
44
|
# <span class="runon-title" title="Derived word">
|
87
45
|
# <span class="w">plagiarism
|
88
|
-
def derived_words
|
89
|
-
|
90
|
-
|
46
|
+
def derived_words(html)
|
47
|
+
@derived_words ||= parse_derived_words(html)
|
48
|
+
end
|
49
|
+
|
50
|
+
def parse_derived_words(html)
|
51
|
+
node = html.css('[title="Derived word"]')
|
52
|
+
node.map(&:content) unless node.empty?
|
91
53
|
end
|
92
54
|
|
93
|
-
# Get the variant word or phrase inside di-info block but exclude those
|
94
|
-
# inside phrase-block or spelling variant, from where is part of the
|
55
|
+
# Get the variant word or phrase inside di-info block but exclude those
|
56
|
+
# inside phrase-block or spelling variant, from where is part of the
|
95
57
|
# definition header.
|
96
58
|
# Such as, US/UK variant, or having the same meaning, but
|
97
59
|
# different pronunciation.
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
node = @html.css(".di-info .var .v[title='Variant form']")
|
102
|
-
node.map { |n| n.text } unless node.empty?
|
60
|
+
def get_head_variant(html)
|
61
|
+
node = html.css(".pos-header .var .v[title='Variant form']")
|
62
|
+
node.map(&:text) unless node.empty?
|
103
63
|
end
|
104
64
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
css_text ".di-body .v[title='Variant form']"
|
65
|
+
def head_variant?(html)
|
66
|
+
hv = get_head_variant(html)
|
67
|
+
hv && hv.include?(@word)
|
109
68
|
end
|
110
69
|
|
111
70
|
# Get spelling variants, which have same pronunciations.
|
112
|
-
|
113
|
-
|
114
|
-
css_text(".spellvar .v[title='Variant form']")
|
115
|
-
end
|
116
|
-
|
117
|
-
# Irregular plural, like criteria
|
118
|
-
def get_inflection
|
119
|
-
css_text ".di-info .inf"
|
120
|
-
end
|
121
|
-
|
122
|
-
# Get phrase and its variant which are not flattened yet
|
123
|
-
def get_phrase
|
124
|
-
node = @html.css(".phrase, .phrase-info .v[title='Variant form']")
|
125
|
-
node.map { |n| n.text } unless node.empty?
|
71
|
+
# plagiarize: plagiarise
|
72
|
+
def spell_variant(html)
|
73
|
+
css_text(html, ".spellvar .v[title='Variant form']")
|
126
74
|
end
|
127
75
|
|
128
76
|
# Where are the searched word's part of speech, IPAs, pronunciations
|
129
77
|
# It could be found either at the position of "title" or "derived",
|
130
|
-
#
|
78
|
+
# "spellvar"
|
131
79
|
# Other places are still "unknown".
|
132
|
-
def where
|
133
|
-
location
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
location = "idiom" if @is_idiom && @title_word.include?(@word)
|
139
|
-
unless @spelling_variant.nil?
|
140
|
-
# spelling variant is treated as "title word"
|
141
|
-
location = "spellvar" if @spelling_variant == @word
|
142
|
-
end
|
143
|
-
unless @head_variant.nil?
|
144
|
-
location = "head_variant" if @head_variant.include? @word
|
145
|
-
end
|
146
|
-
location ="body_variant" if @body_variant && @body_variant == @word
|
147
|
-
location = "inflection" if @inflection && @inflection == @word
|
148
|
-
unless @derived_words.nil?
|
149
|
-
if @derived_words.include? @word
|
150
|
-
unless location.nil?
|
151
|
-
#'ruby' has two locations title and derived
|
152
|
-
location = [location, "derived"]
|
153
|
-
else
|
154
|
-
location = "derived"
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
158
|
-
unless @phrase.nil?
|
159
|
-
location = "phrase" if @phrase.has?(@word) && @word.include?(" ")
|
160
|
-
# rubbers has no space, but it's treated as a phrase.
|
161
|
-
location = "phrase" if @phrase.include? @word
|
162
|
-
end
|
163
|
-
location ||= "unknown"
|
164
|
-
end
|
165
|
-
|
166
|
-
# * When the searched word is a title word
|
167
|
-
# <span class="di-info">
|
168
|
-
# For noun, verb, adj, adv, pronoun, prep, conj, exclamation:
|
169
|
-
# <span class="posgram">
|
170
|
-
# <span class="pos" title="A word that ...">noun</span>
|
171
|
-
# For phrasal verb: reach out to sb
|
172
|
-
# <span class="anc-info-head">
|
173
|
-
# <span class="pos" title="Verb with an adverb ...">phrasal verb</span>
|
174
|
-
# ... same as above line ... verb ...
|
175
|
-
# For idiom:
|
176
|
-
# "curiosity killed the cat"
|
177
|
-
# <span class="lab" title="A short, well-know ...">
|
178
|
-
# <span class="usage" title="A short ...">saying</span>
|
179
|
-
# or "can't get your head around sth"
|
180
|
-
# <span class="usage" title="A short ...">informal</span>
|
181
|
-
# or "set/put the seal on sth" and many other idioms have no di-info, but
|
182
|
-
# all should have di-body idiom-block idiom-body
|
183
|
-
# * When the searched word is a derived word
|
184
|
-
# <span class="runon">...<span class="runon-info">
|
185
|
-
# <span class="posgram"><span class="pos">noun
|
186
|
-
# * When there are more than one part of speech on the same page, like,
|
187
|
-
# 'ruby': adjective and noun are both returned.
|
188
|
-
# * When the dictionary has no direct answer - unknown
|
189
|
-
def pos
|
190
|
-
pos_ret = []
|
191
|
-
loc = where?
|
192
|
-
loc = [loc] if loc.is_a? String
|
193
|
-
loc.each { |loca|
|
194
|
-
case loca
|
195
|
-
when 'title', 'head_variant', 'body_variant', 'spellvar', 'inflection'
|
196
|
-
# for phrasal verb
|
197
|
-
node = @html.css(".anc-info-head > .pos")
|
198
|
-
# center has two pos, noun,verb; centre: noun, adj.
|
199
|
-
node = @html.css(".di-info .pos") if node.empty?
|
200
|
-
pos_ret += node.map {|n| n.text} unless node.empty?
|
201
|
-
when 'idiom'
|
202
|
-
pos_ret << "idiom"
|
203
|
-
when 'derived'
|
204
|
-
derived_css(".runon-info .posgram .pos") { |node|
|
205
|
-
pos_ret << node.text
|
206
|
-
}
|
207
|
-
when 'unknown'
|
208
|
-
#"Unknown or don't have a part of speech"
|
209
|
-
end
|
210
|
-
}
|
211
|
-
return pos_ret.pop if pos_ret.length == 1
|
212
|
-
pos_ret
|
213
|
-
end
|
214
|
-
|
215
|
-
# Get explanations inside a definition block
|
216
|
-
def get_explanations
|
217
|
-
defblocks = @html.css(".sense-body > .def-block")
|
218
|
-
exps = defblocks.map { |db|
|
219
|
-
Camdict::Explanation.new(db)
|
220
|
-
}
|
221
|
-
loc = where?
|
222
|
-
loc = [loc] if loc.is_a? String
|
223
|
-
loc.each { |loca|
|
224
|
-
case loca
|
225
|
-
when 'title', 'head_variant', 'spellvar', 'inflection'
|
226
|
-
# Got it already
|
227
|
-
when 'derived'
|
228
|
-
derived_css(".def-block") { |node|
|
229
|
-
exps << Camdict::Explanation.new(node)
|
230
|
-
}
|
231
|
-
when 'phrase'
|
232
|
-
phrase_css(".def-block") { |node|
|
233
|
-
exps << Camdict::Explanation.new(node)
|
234
|
-
}
|
235
|
-
when 'idiom'
|
236
|
-
node = @html.css(".idiom-block .def-block")
|
237
|
-
exps << Camdict::Explanation.new(node)
|
238
|
-
end
|
239
|
-
}
|
240
|
-
exps
|
241
|
-
end
|
242
|
-
|
243
|
-
# Parse html and check whether there is idiom related block.
|
244
|
-
def is_idiom?
|
245
|
-
node = @html.css(".idiom-block .idiom-body")
|
246
|
-
true unless node.empty?
|
247
|
-
end
|
248
|
-
|
249
|
-
# A word may has uk and us written pronouncation. Superscripts in an IPA
|
250
|
-
# are stored in an array, k for UK, s for US. The returned IPA Struct likes,
|
251
|
-
# uk: String, us:String, k:[position1, length1, position2, length2],
|
252
|
-
# s: [position, length]
|
253
|
-
# Position is the superscript index in the IPA, and the next number length
|
254
|
-
# is the length of this superscript.
|
255
|
-
def get_ipa
|
256
|
-
# UK is always the first one
|
257
|
-
uknode = @html.at_css ".di-info .ipa"
|
258
|
-
# phrase or idiom has no IPA
|
259
|
-
return IPA.new if uknode.nil?
|
260
|
-
ukbase = parse_ipa(uknode)
|
261
|
-
# in most cases they are same
|
262
|
-
usbase = ukbase
|
263
|
-
loc = where?
|
264
|
-
loc = [loc] if loc.is_a? String
|
265
|
-
loc.each { |loca|
|
266
|
-
case loca
|
267
|
-
when 'title', 'spellvar'
|
268
|
-
# US IPA is always followed by a symbol US
|
269
|
-
# favorite: UK/US ipa (spellvar US s:favorite) => normal title word
|
270
|
-
usnode = @html.css ".di-info img.ussymbol + .pron .ipa"
|
271
|
-
usnode = usnode.first
|
272
|
-
usbase = parse_ipa(usnode) unless usnode.nil?
|
273
|
-
when 'inflection'
|
274
|
-
usnode = @html.css ".info-group img.ussymbol + .pron .ipa"
|
275
|
-
usbase = parse_ipa(usnode) unless usnode.nil?
|
276
|
-
ukinfnode = @html.css ".info-group .pron .ipa"
|
277
|
-
ukinf = parse_ipa(ukinfnode) unless ukinfnode.nil?
|
278
|
-
if usbase[:baseipa] && usbase[:baseipa].include?('-')
|
279
|
-
usbase = join_ipa(ukbase, usbase)
|
280
|
-
end
|
281
|
-
if ukinf[:baseipa] && ukinf[:baseipa].include?('-')
|
282
|
-
ukbase = join_ipa(ukbase, ukinf)
|
283
|
-
end
|
284
|
-
when 'head_variant'
|
285
|
-
# variant word's IPA can be got from its definition page when it is a
|
286
|
-
# title word, or from the bracket. Like,
|
287
|
-
# aluminium: UK ipa, (variant s:aluminum: US ipa) => in bracket
|
288
|
-
# behove: UK ipa, US ipa (variant US s:behoove ipa) => in bracket
|
289
|
-
# Many other variants have no IPA inside the bracket and title word's
|
290
|
-
# IPA are not theirs.
|
291
|
-
# eraser: UK ipa, US ipa US (variant UK s:rubber) => no IPA
|
292
|
-
# plane: UK/US ipa (variant UK s:aeroplane, US s:airplane) => no IPA
|
293
|
-
# aeroplane: UK ipa,US ipa (variant US s:airplane) => no IPA
|
294
|
-
# ass: UK/US ipa, | variant UK s:arse => no IPA
|
295
|
-
# sledge: UK ipa, (variant US s:sled) => no IPA
|
296
|
-
# titbit: UK/US ipa, (variant US s:tidbit) => no IPA
|
297
|
-
node = @html.css ".di-info .var .ipa"
|
298
|
-
node.empty? ? (return IPA.new) : ukbase = usbase = parse_ipa(node)
|
299
|
-
return IPA.new unless ukbase[:baseipa]
|
300
|
-
when 'derived'
|
301
|
-
derived_uk = nil
|
302
|
-
derived_css('.ipa') { |node|
|
303
|
-
derived_uk = parse_ipa(node.first) unless node.first.nil?
|
304
|
-
}
|
305
|
-
derived_css("img.ussymbol + .pron .ipa") { |node|
|
306
|
-
usbase = parse_ipa(node.first) unless node.first.nil?
|
307
|
-
}
|
308
|
-
if derived_uk && derived_uk[:baseipa].include?('-')
|
309
|
-
ukbase = join_ipa(ukbase, derived_uk)
|
310
|
-
elsif derived_uk
|
311
|
-
# uk base may come from the derived word, such as fermentation.
|
312
|
-
ukbase = derived_uk
|
313
|
-
end
|
314
|
-
end
|
315
|
-
}
|
316
|
-
if usbase[:baseipa] && usbase[:baseipa].include?('-')
|
317
|
-
usbase = join_ipa(ukbase, usbase)
|
318
|
-
end
|
319
|
-
uk, k = ukbase[:baseipa], ukbase[:sindex]
|
320
|
-
us, s = usbase[:baseipa], usbase[:sindex]
|
321
|
-
IPA.new(uk, k, us, s)
|
322
|
-
end
|
323
|
-
|
324
|
-
# Parse an ipa node to get the ipa string and its superscript index
|
325
|
-
def parse_ipa(node)
|
326
|
-
position = 0
|
327
|
-
pindex = []
|
328
|
-
node.children.each { |c|
|
329
|
-
len = c.text.length
|
330
|
-
pindex += [position,len] if c["class"] == "sp"
|
331
|
-
position += len
|
332
|
-
}
|
333
|
-
pindex = nil if pindex.empty?
|
334
|
-
{ baseipa: node.text, sindex: pindex }
|
335
|
-
end
|
336
|
-
|
337
|
-
# A short IPA begins with a hyphen, which shares a common beginning with the
|
338
|
-
# full IPA. Return the joined result for the short one. The superscripts
|
339
|
-
# are added when the common parts have that or removed if the non common
|
340
|
-
# parts override them.
|
341
|
-
def join_ipa(full_sp, short_sp)
|
342
|
-
# understand -sd-; preparation -Sddss-; imaginary -dssds-
|
343
|
-
# plagiarise -ssdddsss; dictionary -dsss; painting -sdss
|
344
|
-
# harmfully -d
|
345
|
-
# toxic ssddd-; privacy sssd-; formally sssd-; harmful ssssds-
|
346
|
-
full, basesp = full_sp[:baseipa], full_sp[:sindex]
|
347
|
-
short, ussp = short_sp[:baseipa], short_sp[:sindex]
|
348
|
-
slen = short.length
|
349
|
-
flen = full.length
|
350
|
-
if short[0] == '-'
|
351
|
-
# head-tail hyphen
|
352
|
-
if short[-1] == '-'
|
353
|
-
center = short[1, slen-2]
|
354
|
-
position = full.index(center[0])
|
355
|
-
# match left and right
|
356
|
-
if full.index(center[-1])
|
357
|
-
left_matched_index = position
|
358
|
-
right_matched_index = flen-1 - full.index(center[-1])
|
359
|
-
rev_number = center.length - (right_matched_index -
|
360
|
-
left_matched_index + 1)
|
361
|
-
if left_matched_index && rev_number <= 0
|
362
|
-
right_index = mix_spi(basesp, right_matched_index+1..flen-1)
|
363
|
-
rev_right_index = revise_index(right_index, rev_number)
|
364
|
-
findex = mix_spi(basesp, 0..left_matched_index-1,
|
365
|
-
ussp, left_matched_index+1,
|
366
|
-
rev_right_index, 0)
|
367
|
-
ret = full[0..left_matched_index-1] + center +
|
368
|
-
full[right_matched_index+1..flen-1]
|
369
|
-
return {baseipa: ret, sindex: findex}
|
370
|
-
end
|
371
|
-
end
|
372
|
-
# match left only
|
373
|
-
if position && (slen - 2 < flen - 1 - position)
|
374
|
-
findex = mix_spi(basesp, 0..position-1, ussp, position-1,
|
375
|
-
basesp, position+slen-2..flen-1)
|
376
|
-
ret = full[0..position-1] + center + full[position+slen-2..flen-1]
|
377
|
-
return {baseipa: ret, sindex: findex}
|
378
|
-
end
|
379
|
-
position = full.index(center[-1])
|
380
|
-
# match right only
|
381
|
-
if position && (position + 1 > slen - 2)
|
382
|
-
findex = mix_spi(basesp, 0..position-slen+2, ussp, position-slen+2,
|
383
|
-
basesp, position+1..flen-1)
|
384
|
-
ret = full[0..position-slen+2] + center + full[position+1..flen-1]
|
385
|
-
return {baseipa: ret, sindex: findex}
|
386
|
-
end
|
387
|
-
# this is a simple solution to workaround the issue since no common
|
388
|
-
# chars are found between the full and short ipa. Such as the word
|
389
|
-
# 'difference', so just assign full to short
|
390
|
-
begin
|
391
|
-
raise "head-tail hyphen IPA #{short} for the word #{@word}" +
|
392
|
-
"unmatched with #{full}."
|
393
|
-
rescue RuntimeError
|
394
|
-
return full_sp
|
395
|
-
end
|
80
|
+
def where(html)
|
81
|
+
@location ||=
|
82
|
+
if on_title?(html) || spell_variant?(html) || head_variant?(html)
|
83
|
+
'title'
|
84
|
+
elsif derived_word?(html)
|
85
|
+
'derived'
|
396
86
|
else
|
397
|
-
|
398
|
-
right = short[1, slen-1]
|
399
|
-
position = full.index(right[0])
|
400
|
-
# match left #&& plagiarism fails this test
|
401
|
-
if position #&& (flen-position >= slen-1)
|
402
|
-
findex = mix_spi( basesp, 0..position-1, ussp, position-1)
|
403
|
-
ret = full[0..position-1] + right
|
404
|
-
return {baseipa: ret, sindex: findex}
|
405
|
-
end
|
406
|
-
position = full.index(right[-1])
|
407
|
-
# match right
|
408
|
-
if position && (position+1 >= slen-1)
|
409
|
-
findex = mix_spi(basesp, 0..position-slen+1, ussp, position-slen+1)
|
410
|
-
ret = full[0..position-slen+1] + right
|
411
|
-
return {baseipa: ret, sindex: findex}
|
412
|
-
end
|
413
|
-
# unmatched case, like harmfulness
|
414
|
-
findex = mix_spi(basesp, 0..flen-1, ussp, flen-1)
|
415
|
-
ret = full + right
|
416
|
-
return {baseipa: ret, sindex: findex}
|
87
|
+
'unknown'
|
417
88
|
end
|
418
|
-
# tail hyphen
|
419
|
-
elsif short[-1] == '-'
|
420
|
-
left = short[0, slen-1]
|
421
|
-
ret = left + full[slen-1..flen-1]
|
422
|
-
findex = mix_spi( ussp, 0, basesp, slen-1..flen-1)
|
423
|
-
return {baseipa: ret, sindex: findex}
|
424
|
-
# begin with a primary or secondary stress mark like reunion
|
425
|
-
elsif ["\u{2cc}", "\u{2c8}"].include? short[0]
|
426
|
-
return full_sp # for simple, use uk ipa instead
|
427
|
-
else
|
428
|
-
raise ArgumentError,
|
429
|
-
"IPA doesn't begin with a hyphen or stress, nor end with a hyphen. " +
|
430
|
-
"Nothing is done."
|
431
|
-
end
|
432
89
|
end
|
433
90
|
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
# remainding part requires to be revised as it becomes longer or shorter.
|
438
|
-
# return the revised superscript_index or nil if the passed
|
439
|
-
# +superscript_index+ is nil.
|
440
|
-
def revise_index(superscript_index, rev_number)
|
441
|
-
return nil if superscript_index.nil?
|
442
|
-
ret = []
|
443
|
-
superscript_index.each_pair { |position, len|
|
444
|
-
ret += [position+rev_number, len]
|
445
|
-
}
|
446
|
-
return nil if ret.empty?
|
447
|
-
ret
|
91
|
+
def derived_word?(html)
|
92
|
+
return false unless derived_words(html) && @derived_words.include?(@word)
|
93
|
+
true
|
448
94
|
end
|
449
95
|
|
450
|
-
|
451
|
-
|
452
|
-
def at_range(spindex, range)
|
453
|
-
return if spindex.nil?
|
454
|
-
ret = []
|
455
|
-
spindex.each_pair { |position, len|
|
456
|
-
ret += [position, len] if range.include? position
|
457
|
-
}
|
458
|
-
return nil if ret.empty?
|
459
|
-
ret
|
460
|
-
end
|
461
|
-
|
462
|
-
# Mix the superscript index. Return mixed result or nil if no superscript.
|
463
|
-
# Each pair of array element is superscript index and a Range/Fixnum.
|
464
|
-
# All of them are part of two superscripts that need joining. Only the
|
465
|
-
# superscripts in range are kept, and the index of the superscript with
|
466
|
-
# a number is increased by this number. Finally, the joined superscript is
|
467
|
-
# returned.
|
468
|
-
def mix_spi(*p)
|
469
|
-
findex = []
|
470
|
-
p.each_pair { |spindex, r_or_n|
|
471
|
-
if spindex and r_or_n.kind_of? Range
|
472
|
-
aindex = at_range(spindex, r_or_n)
|
473
|
-
findex += aindex if aindex
|
474
|
-
elsif spindex and r_or_n.is_a? Fixnum
|
475
|
-
bindex = []
|
476
|
-
spindex.each_pair { |p, i|
|
477
|
-
bindex += [p + r_or_n, i]
|
478
|
-
}
|
479
|
-
findex += bindex unless bindex.empty?
|
480
|
-
end
|
481
|
-
}
|
482
|
-
return nil if findex.empty?
|
483
|
-
findex
|
96
|
+
def on_title?(html)
|
97
|
+
@word == title_word(html)
|
484
98
|
end
|
485
99
|
|
486
|
-
#
|
487
|
-
def
|
488
|
-
|
489
|
-
links = lambda { |pron|
|
490
|
-
unless pron.empty?
|
491
|
-
pron.each { |a|
|
492
|
-
return Link.new a['data-src-mp3'], a['data-src-ogg']
|
493
|
-
}
|
494
|
-
else
|
495
|
-
return Link.new
|
496
|
-
end
|
497
|
-
}
|
498
|
-
ukpron = uspron = []
|
499
|
-
loc = where?
|
500
|
-
loc = [loc] if loc.is_a? String
|
501
|
-
loc.each { |loca|
|
502
|
-
case loca
|
503
|
-
when 'title', 'spellvar'
|
504
|
-
ukpron = @html.css(".di-info a.pron-uk")
|
505
|
-
uspron = @html.css(".di-info a.pron-us")
|
506
|
-
when 'derived'
|
507
|
-
derived_css("a.pron-uk") { |node|
|
508
|
-
ukpron = node
|
509
|
-
}
|
510
|
-
derived_css("a.pron-us") { |node|
|
511
|
-
uspron = node
|
512
|
-
}
|
513
|
-
end
|
514
|
-
}
|
515
|
-
uklinks = links.call(ukpron)
|
516
|
-
uslinks = links.call(uspron)
|
517
|
-
Pronunciation.new(uklinks, uslinks)
|
518
|
-
end
|
519
|
-
|
520
|
-
# Get a word or phrase's region. Possible values: UK, US.
|
521
|
-
def get_region
|
522
|
-
ret = nil
|
523
|
-
loc = where?
|
524
|
-
loc = [loc] if loc.is_a? String
|
525
|
-
loc.each { |loca|
|
526
|
-
case loca
|
527
|
-
when 'title', 'idiom'
|
528
|
-
ret = css_text(".di-info > .lab .region")
|
529
|
-
ret = css_text(".di-info > .lab") unless ret && !ret.empty?
|
530
|
-
when 'spellvar'
|
531
|
-
ret = css_text(".spellvar .region")
|
532
|
-
when 'head_variant'
|
533
|
-
ret = css_text(".di-info .var .region")
|
534
|
-
when 'derived'
|
535
|
-
derived_css(".region") { |node|
|
536
|
-
ret = node.text unless node.empty?
|
537
|
-
}
|
538
|
-
when 'phrase'
|
539
|
-
phrase_css(".region") { |node|
|
540
|
-
ret = node.text unless node.empty?
|
541
|
-
}
|
542
|
-
end
|
543
|
-
}
|
544
|
-
ret
|
545
|
-
end
|
546
|
-
|
547
|
-
# Parse and get the usage
|
548
|
-
def get_usage
|
549
|
-
ret = nil
|
550
|
-
loc = where?
|
551
|
-
loc = [loc] if loc.is_a? String
|
552
|
-
loc.each { |loca|
|
553
|
-
case loca
|
554
|
-
when 'title', 'idiom', 'spellvar'
|
555
|
-
ret = css_text(".di-info > .lab .usage")
|
556
|
-
when 'head_variant'
|
557
|
-
ret = css_text(".di-info .var .usage")
|
558
|
-
when 'derived'
|
559
|
-
derived_css(".usage") { |node|
|
560
|
-
ret = node.text unless node.empty?
|
561
|
-
}
|
562
|
-
when 'phrase'
|
563
|
-
phrase_css(".usage") { |node|
|
564
|
-
ret = node.text unless node.empty?
|
565
|
-
}
|
566
|
-
end
|
567
|
-
}
|
568
|
-
ret
|
569
|
-
end
|
570
|
-
|
571
|
-
# Get grammar code
|
572
|
-
def get_gc
|
573
|
-
ret = nil
|
574
|
-
loc = where?
|
575
|
-
loc = [loc] if loc.is_a? String
|
576
|
-
loc.each { |loca|
|
577
|
-
case loca
|
578
|
-
when 'title', 'idiom', 'spellvar', 'head_variant'
|
579
|
-
ret = css_text(".di-info .gcs")
|
580
|
-
when 'derived'
|
581
|
-
derived_css(".gcs") { |node|
|
582
|
-
ret = node.text unless node.empty?
|
583
|
-
}
|
584
|
-
when 'phrase'
|
585
|
-
phrase_css(".gcs") { |node|
|
586
|
-
ret = node.text unless node.empty?
|
587
|
-
}
|
588
|
-
end
|
589
|
-
}
|
590
|
-
ret
|
591
|
-
end
|
592
|
-
|
593
|
-
# Return values: String, [String], nil
|
594
|
-
def get_plural
|
595
|
-
return unless @part_of_speech.include? 'noun'
|
596
|
-
ret = nil
|
597
|
-
node = @html.css(".di-info .inf-group[type='plural'] .inf")
|
598
|
-
unless node.empty?
|
599
|
-
# fish has two
|
600
|
-
if node.size > 1
|
601
|
-
ret = node.map { |n| n.text }
|
602
|
-
elsif node.size == 1
|
603
|
-
ret = node.text
|
604
|
-
end
|
605
|
-
end
|
606
|
-
ret
|
607
|
-
end
|
608
|
-
|
609
|
-
# Parse and get the guided word
|
610
|
-
def get_guided_word
|
611
|
-
gw = css_text(".di-info .gw")
|
612
|
-
gw.delete "()" if gw
|
613
|
-
end
|
614
|
-
|
615
|
-
# Return nil or Irregular struct
|
616
|
-
def get_irregular
|
617
|
-
return unless @part_of_speech.include? 'verb'
|
618
|
-
present = css_text(".di-info .inf-group[type='pres_part'] .inf")
|
619
|
-
past = css_text(".di-info .inf-group[type='past'] .inf")
|
620
|
-
sp = pp = past
|
621
|
-
if past.nil? || past.empty?
|
622
|
-
node = @html.css(".di-info span[class='inf']")
|
623
|
-
unless node.empty?
|
624
|
-
past = node.map { |n| n.text }
|
625
|
-
sp, pp = past
|
626
|
-
end
|
627
|
-
end
|
628
|
-
sp = css_text(".di-info .inf-group[type='past-tense'] .inf") if sp.nil?
|
629
|
-
pp = css_text(".di-info .inf-group[type='past-part'] .inf") if pp.nil?
|
630
|
-
if sp || pp || present
|
631
|
-
return Irregular.new(sp, pp, present)
|
632
|
-
end
|
100
|
+
# spelling variant is treated as "title word"
|
101
|
+
def spell_variant?(html)
|
102
|
+
spell_variant(html) == @word
|
633
103
|
end
|
634
104
|
|
635
105
|
include Camdict::Common
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
106
|
+
include Camdict::IPA
|
107
|
+
include Camdict::Pronunciation
|
108
|
+
include Camdict::Entry
|
640
109
|
end
|
641
110
|
end
|