camdict 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +59 -0
- data/Rakefile +20 -0
- data/lib/camdict/client.rb +153 -0
- data/lib/camdict/common.rb +136 -0
- data/lib/camdict/definition.rb +606 -0
- data/lib/camdict/explanation.rb +128 -0
- data/lib/camdict/http_client.rb +22 -0
- data/lib/camdict/word.rb +36 -0
- data/lib/camdict.rb +2 -0
- data/license +21 -0
- data/test/itest_client.rb +20 -0
- data/test/itest_definition.rb +89 -0
- data/test/itest_explanation.rb +34 -0
- data/test/test_client.rb +72 -0
- data/test/test_common.rb +59 -0
- data/test/test_definition.rb +345 -0
- data/test/test_explanation.rb +71 -0
- data/test/test_http_client.rb +28 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 86357de26d1ad3547925075d9592facaadcb39f4
|
4
|
+
data.tar.gz: e16be0c399d94659828ad176a9c7557d2289dfda
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b35d9c52c329f91c84cd644520d440d17268f52b29c1eb4328d2ea5b90bedf2cbbcd7308eeb37a919ed0be4fbed16a1ff31d6a2909c72786ec42feaaee92e224
|
7
|
+
data.tar.gz: ac6ce320b885258a6108365e056cfd4b8bb8a6601d950862fcfe5806ce77525807b6894f564063e6b949970843c936f8193cc12d157ed89cf0725becfc24c969
|
data/README.md
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# A ruby gem - camdict
|
2
|
+
|
3
|
+
## Introduction
|
4
|
+
|
5
|
+
The ruby gem camdict is a [Cambridge online dictionary][1] client.
|
6
|
+
You could use this excellent dictionary with a browser, but now it is possible
|
7
|
+
to use it with this ruby API in your code.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
`gem install camdict`
|
11
|
+
|
12
|
+
## Verification
|
13
|
+
The gem can be tested with the commands below, run from the directory where it is installed.
|
14
|
+
`rake` - run all the testcases which don't need internet connection.
|
15
|
+
`rake itest` - run all the testcases that need internet connection.
|
16
|
+
`rake testall` - run all above tests.
|
17
|
+
|
18
|
+
One test may fail if the gem nokogiri hasn't pulled in the fix [here][2]. But
|
19
|
+
it is safe to apply the patch to your nokogiri copy.
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
require 'camdict'
|
23
|
+
|
24
|
+
# Look up a new word
|
25
|
+
word = Camdict::Word.new "health"
|
26
|
+
|
27
|
+
# get all definitions for this word from remote dictionary and select the
|
28
|
+
# first one. A word usually has many definitions.
|
29
|
+
health = word.definitions.first
|
30
|
+
|
31
|
+
# Print the part of speech
|
32
|
+
puts health.part_of_speech #=> noun
|
33
|
+
|
34
|
+
# One definition may have more than one explanation.
|
35
|
+
# Just look at the details of the first one.
|
36
|
+
explanation1 = health.explanations.first
|
37
|
+
|
38
|
+
# What's the meaning
|
39
|
+
puts explanation1.meaning #=>
|
40
|
+
# the condition of the body and the degree to which it is free from
|
41
|
+
# illness, or the state of being well:
|
42
|
+
|
43
|
+
# And it may have some useful example sentences.
|
44
|
+
explanation1.examples.each { |e|
|
45
|
+
puts e.sentence #=>
|
46
|
+
# to be in good/poor health
|
47
|
+
# Regular exercise is good for your health.
|
48
|
+
# I had to give up drinking for health reasons.
|
49
|
+
# He gave up work because of ill health.
|
50
|
+
}
|
51
|
+
|
52
|
+
|
53
|
+
There are some useful testing examples in test directory of this gem.
|
54
|
+
|
55
|
+
## Licence MIT
|
56
|
+
Copyright (c) 2014 Pan Gaoyong
|
57
|
+
|
58
|
+
[1]: http://dictionary.cambridge.org "Cambridge"
|
59
|
+
[2]: https://github.com/sparklemotion/nokogiri/pull/1020 "My Nokogiri Bug Fix"
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'rake/testtask'

# Offline unit tests (test/test_*.rb); this is what a bare `rake` runs.
Rake::TestTask.new(:default) do |t|
  t.test_files = FileList['test/test_*.rb']
end

# Integration tests (test/itest_*.rb).
Rake::TestTask.new(:itest) do |t|
  t.test_files = FileList['test/itest_*.rb']
end

# The two tasks above are re-opened only to attach a description. They used
# to declare a prerequisite on a :test task that is never defined in this
# Rakefile; that resolved only because Rake turns the existing test/
# directory into an implicit file task, so the prerequisite is dropped.
desc "No internet connection required"
task :default

desc "Needs internet connection"
task :itest

# Runs the offline suite first, then the online one.
desc "Run all tests"
task :testall => [:default, :itest]
|
20
|
+
|
@@ -0,0 +1,153 @@
|
|
1
|
+
require "camdict/http_client"
|
2
|
+
|
3
|
+
module Camdict

  # The client downloads all the useful data about a word or phrase from the
  # remote Cambridge dictionaries, but does not include the extended data.
  # For example, when the word "mind" is searched, all its four exactly
  # matched entries are downloaded. However, separate entries like
  # "turn of mind" & "open mind" are not included.
  class Client

    # Default dictionary is british. Other possible +dict+ values:
    # american-english, business-english, learner-english.
    def initialize(dict=nil)
      @dictionary = dict || "british"
    end

    # Get a word's html definition(s) by searching it from the web dictionary.
    # The returned result is
    # * an empty array when nothing is found, or
    # * an array with one hash, [{ word => html definition }], when the
    #   search lands directly on a definition page, or
    # * an array with one hash per exactly matched entry,
    #   [{ entry_id => html definition }, ...].
    # Normally, when a +word+ has more than one meaning, its entry ID format
    # is like word_nn. Otherwise it's just the word itself.
    def html_definition(word)
      html = fetch(word)
      return [] if html.nil?

      # Some words return their only definition directly, such as aluminium.
      # The entry id is then just the word.
      if definition_page?(html)
        return [{ word => di_head(html) + di_body(html) }]
      end

      # Otherwise the page is either a spelling suggestion page (no matched
      # links, so the result is empty) or a result list of entries.
      mentry_links(word, html).map { |url| { entry_id(url) => get_htmldef(url) } }
    end

    # Get a word html page source by its entry +url+: the definition head
    # followed by the definition body.
    def get_htmldef(url)
      html = Camdict::HTTP::Client.get_html(url)
      di_head(html) + di_body(html)
    end

    private

    # Fetch word searching result page.
    # Returned result is either just a single definition page if there is
    # only one entry, or a result page listing all possible entries, or a
    # spelling check result. Returns nil when the request fails with
    # OpenURI::HTTPError (a word that is not found answers 404 Not Found
    # together with a spelling suggestions page).
    # NOTE(review): +w+ is appended to the query string verbatim; whether
    # get_html escapes it is not visible from this file - confirm.
    def fetch(w)
      search_url = "http://dictionary.cambridge.org/search/#{@dictionary}/?q="
      Camdict::HTTP::Client.get_html(search_url + w)
    rescue OpenURI::HTTPError
      nil
    end

    # To determine whether or not the input +html+ page is a page of a word
    # definition. Return true if it has a source structure like this,
    #   <div class="di-head">
    #     <div class="di-title">
    #       <h1 class="hw">
    # This works for the translation page too, like English-Spanish.
    def single_def?(html)
      !html.css(".di-head .di-title .hw").empty?
    end

    # Find out matched entry links from search result page
    #   <ul class="result-list">
    #     <li><a href="entry_link1">
    #     <li><a href="entry_link2">
    # The extended links are filtered out and the matched word or phrase's
    # links are kept. An array of them is returned (empty when none match).
    # For example, when the searched word is "related", entry links are like,
    #   http://dictionary.cambridge.org/dictionary/british/related_1
    #   http://dictionary.cambridge.org/dictionary/british/related_2
    #   http://dictionary.cambridge.org/dictionary/british/stress-related
    #   ...
    # Returned result should only contain the first two.
    def mentry_links(word, html)
      html.css(".result-list a")
          .select { |a| matched_word?(word, a) }
          .map { |a| a['href'] }
    end

    # Return true if the searched word matches the one on result page.
    # +node+ is one result link, with a structure like
    #   <li>
    #     <span class="base">
    #       <b class="phrase">out of mind, or
    #       <b class="hw">turn of mind, or
    #       <b class="w">mind-numbingly
    # Match criterion: the queried word should equal the result word; a
    # result phrase holding alternatives ("to/at") is flattened first and
    # must then contain the queried phrase as one of its variants.
    # A link without any ".base" element never matches.
    def matched_word?(word, node)
      base = node.css(".base")
      # Without this guard base[0] is nil and the whole lookup would abort.
      return false if base.empty?

      resword = base.first.text
      if resword.include?('/')
        # String#flatten is this gem's extension from camdict/common.
        resword.flatten.include?(word)
      else
        word == resword
      end
    end

    # Return definition head in html source
    def di_head(html)
      html.css(".cdo-section-title-hw").to_html(:save_with=>0) +
        html.css(".di-info").to_html(:save_with=>0)
    end

    # Return definition body in html source
    def di_body(html)
      html.css(".di-body").to_html(:save_with=>0)
    end

    # get the last part of http://dictionary.cambridge.org/british/related_1
    def entry_id(url)
      url.split('/').last
    end

    alias :definition_page? :single_def?

  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module Camdict
  # Shared helpers: small extensions to String and Array for handling
  # phrases that contain alternatives ("to/at"), plus private CSS lookup
  # helpers for includers that keep a parsed page in @html and the searched
  # word in @word.
  module Common

    # Extend String class.
    String.class_eval do
      # 'blow a kiss to/at sb'.flatten =>
      #   %q(blow a kiss to sb, blow a kiss at sb)
      # If the string doesn't include a slash, it is returned unchanged
      # (a String, not an Array).
      #
      # Only the first slash-separated group is expanded; everything after
      # the space that closes that group is copied verbatim. Letters, '-',
      # '(' and ')' are treated as word characters.
      #
      # Raises RuntimeError when a character other than a word character,
      # space or slash is met while scanning, or when one of the
      # alternatives is empty (e.g. 'to/' or 'to /at').
      def flatten
        return self unless include?('/')

        starts = [] # starts[k]/stops[k]: first/last index of alternative k
        stops = []
        alt = 0     # alternative being scanned; bumped on every slash
        each_char.with_index do |c, i|
          case c
          when /[[:alpha:]\-\(\)]/
            starts[alt] ||= i
            stops[alt] = i
          when " "
            # The first space after a slash closes the alternatives.
            break if alt > 0
            # Still before the slash: the word just passed was not an
            # alternative, so forget it.
            starts[alt] = nil
            stops[alt] = nil
          when "/"
            alt += 1
          else
            raise "Invalid char '#{c}' found in a string."
          end
        end

        # An empty alternative used to surface as a NoMethodError on nil.
        if (0..alt).any? { |k| starts[k].nil? }
          raise "Empty alternative found in '#{self}'."
        end

        head = self[0...starts[0]]        # text before the first alternative
        tail = self[(stops[alt] + 1)..-1] # text after the last alternative
        (0..alt).map { |k| head + self[starts[k]..stops[k]] + tail }
      end

      # Test whether a String includes the +word+. It's useful while testing
      # a variable which might be an array of phrases or just a single phrase.
      def has?(word)
        include?(word)
      end
    end

    # Extend Array class.
    Array.class_eval do
      # Expand a phrase array into a flattened one. Example,
      # ['blow your nose', 'blow a kiss to/at sb'] #=>
      # ['blow your nose', 'blow a kiss to sb', 'blow a kiss at sb']
      # Elements that are not Strings are skipped; they used to be replaced
      # by nil, which made #has? fail on them.
      def expand
        grep(String).flat_map { |phrase| phrase.flatten }
      end

      # Test if a phrase array includes a +word+.
      # ['blow your nose', 'blow a kiss to/at sb'].has?("a kiss at") #=>true
      def has?(word)
        expand.any? { |phrase| phrase.include?(word) }
      end

      # Iterate an array and yield two elements +a+ +b+ each time for
      # handling. With an odd number of elements the last +b+ is nil.
      # Returns nil.
      def each_pair
        each_slice(2) { |a, b| yield(a, b) }
        nil
      end
    end

    private

    # Get the text selected by the css +selector+ from @html, or nil when
    # nothing is selected.
    def css_text(selector)
      found = @html.css(selector)
      found.text unless found.empty?
    end

    # Get sth by the css +selector+ for the derived word inside its runon
    # node: for every ".runon" node of @html whose "Derived word" title text
    # equals @word, yield what +selector+ selects inside that node.
    def derived_css(selector)
      @html.css(".runon").each { |runon|
        derived = runon.css('[title="Derived word"]')
        yield(runon.css(selector)) if derived.text == @word
      }
    end

    # Get sth by the css +selector+ for the phrase inside the node
    # phrase-block: for every ".phrase-block" node of @html in which a
    # phrase or a variant form, once flattened, has @word, yield (once per
    # block) what +selector+ selects inside that block.
    def phrase_css(selector)
      @html.css(".phrase-block").each { |block|
        titles = block.css('.phrase, .v[title="Variant form"]')
        # any? stops at the first matching title, like the original break.
        yield(block.css(selector)) if titles.any? { |t| t.text.flatten.has?(@word) }
      }
    end

  end
end
|