gscraper 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
@@ -1,12 +1,32 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/search/query'
|
2
|
-
require 'gscraper/
|
24
|
+
require 'gscraper/gscraper'
|
3
25
|
|
4
26
|
module GScraper
|
5
27
|
module Search
|
6
28
|
class Result
|
7
29
|
|
8
|
-
include WebAgent
|
9
|
-
|
10
30
|
# Rank of the result page
|
11
31
|
attr_reader :rank
|
12
32
|
|
@@ -30,6 +50,8 @@ module GScraper
|
|
30
50
|
# _summary_, _url_, _size_, _cache_url_ and _similar_url_.
|
31
51
|
#
|
32
52
|
def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
|
53
|
+
@agent = GScraper.web_agent
|
54
|
+
|
33
55
|
@rank = rank
|
34
56
|
@title = title
|
35
57
|
@url = url
|
@@ -39,40 +61,21 @@ module GScraper
|
|
39
61
|
end
|
40
62
|
|
41
63
|
#
|
42
|
-
# Fetches the page of the result.
|
43
|
-
# passed the page.
|
64
|
+
# Fetches the page of the result.
|
44
65
|
#
|
45
|
-
def page
|
46
|
-
|
66
|
+
def page
|
67
|
+
@agent.get(@url)
|
47
68
|
end
|
48
69
|
|
49
70
|
#
|
50
|
-
#
|
51
|
-
# a _block_ is given, it will be passed the newly created Query
|
52
|
-
# object.
|
53
|
-
#
|
54
|
-
# result.similar_query # => Query
|
55
|
-
#
|
56
|
-
# result.similar_query do |q|
|
57
|
-
# q.first_page.each_url do |url|
|
58
|
-
# puts url
|
59
|
-
# end
|
60
|
-
# end
|
71
|
+
# Fetches the cached page of the result.
|
61
72
|
#
|
62
|
-
def
|
63
|
-
if @
|
64
|
-
return
|
73
|
+
def cached_page
|
74
|
+
if @cached_url
|
75
|
+
return @agent.get(@cached_url)
|
65
76
|
end
|
66
77
|
end
|
67
78
|
|
68
|
-
#
|
69
|
-
# Fetches the cached page of the result. If a _block_ is given it will
|
70
|
-
# be passed the cached page.
|
71
|
-
#
|
72
|
-
def cached_page(&block)
|
73
|
-
get_page(@cached_url,&block)
|
74
|
-
end
|
75
|
-
|
76
79
|
#
|
77
80
|
# Returns a string containing the result's title.
|
78
81
|
#
|
@@ -1,4 +1,27 @@
|
|
1
|
-
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/search/web_query'
|
24
|
+
require 'gscraper/search/ajax_query'
|
2
25
|
|
3
26
|
module GScraper
|
4
27
|
module Search
|
@@ -12,7 +35,7 @@ module GScraper
|
|
12
35
|
# end
|
13
36
|
#
|
14
37
|
def Search.query(options={},&block)
|
15
|
-
|
38
|
+
WebQuery.new(options,&block)
|
16
39
|
end
|
17
40
|
|
18
41
|
#
|
@@ -27,7 +50,25 @@ module GScraper
|
|
27
50
|
# end
|
28
51
|
#
|
29
52
|
def Search.query_from_url(url,&block)
|
30
|
-
|
53
|
+
WebQuery.from_url(url,&block)
|
54
|
+
end
|
55
|
+
|
56
|
+
#
|
57
|
+
# Returns a new AJAXQuery object with the given _options_. See
|
58
|
+
# AJAXQuery.new.
|
59
|
+
#
|
60
|
+
# Search.ajax_query(:query => 'ruby')
|
61
|
+
#
|
62
|
+
def Search.ajax_query(options={},&block)
|
63
|
+
AJAXQuery.new(options,&block)
|
64
|
+
end
|
65
|
+
|
66
|
+
#
|
67
|
+
# Returns the AJAXQuery object that represents the specified _url_.
|
68
|
+
# See AJAXQuery.from_url.
|
69
|
+
#
|
70
|
+
def Search.ajax_query_from_url(url,&block)
|
71
|
+
AJAXQuery.from_url(url,&block)
|
31
72
|
end
|
32
73
|
end
|
33
74
|
end
|
@@ -0,0 +1,472 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
23
|
+
require 'gscraper/search/result'
|
24
|
+
require 'gscraper/search/page'
|
25
|
+
require 'gscraper/search/query'
|
26
|
+
require 'gscraper/sponsored_ad'
|
27
|
+
require 'gscraper/sponsored_links'
|
28
|
+
require 'gscraper/extensions/uri'
|
29
|
+
require 'gscraper/has_pages'
|
30
|
+
require 'gscraper/licenses'
|
31
|
+
require 'gscraper/gscraper'
|
32
|
+
|
33
|
+
require 'hpricot'
|
34
|
+
|
35
|
+
module GScraper
|
36
|
+
module Search
|
37
|
+
class WebQuery < Query
|
38
|
+
|
39
|
+
include HasPages
|
40
|
+
|
41
|
+
# Search host
|
42
|
+
SEARCH_HOST = 'www.google.com'
|
43
|
+
|
44
|
+
# Search URL
|
45
|
+
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
46
|
+
|
47
|
+
# Default results per-page
|
48
|
+
RESULTS_PER_PAGE = 10
|
49
|
+
|
50
|
+
# Results per-page
|
51
|
+
attr_accessor :results_per_page
|
52
|
+
|
53
|
+
# Search query
|
54
|
+
attr_accessor :query
|
55
|
+
|
56
|
+
# Search 'link' modifier
|
57
|
+
attr_accessor :link
|
58
|
+
|
59
|
+
# Search 'related' modifier
|
60
|
+
attr_accessor :related
|
61
|
+
|
62
|
+
# Search 'info' modifier
|
63
|
+
attr_accessor :info
|
64
|
+
|
65
|
+
# Search 'site' modifier
|
66
|
+
attr_accessor :site
|
67
|
+
|
68
|
+
# Search 'filetype' modifier
|
69
|
+
attr_accessor :filetype
|
70
|
+
|
71
|
+
# Search 'allintitle' modifier
|
72
|
+
attr_accessor :allintitle
|
73
|
+
|
74
|
+
# Search 'intitle' modifier
|
75
|
+
attr_accessor :intitle
|
76
|
+
|
77
|
+
# Search 'allinurl' modifier
|
78
|
+
attr_accessor :allinurl
|
79
|
+
|
80
|
+
# Search 'inurl' modifier
|
81
|
+
attr_accessor :inurl
|
82
|
+
|
83
|
+
# Search 'allintext' modifier
|
84
|
+
attr_accessor :allintext
|
85
|
+
|
86
|
+
# Search 'intext' modifier
|
87
|
+
attr_accessor :intext
|
88
|
+
|
89
|
+
# Search for results containing the exact phrase
|
90
|
+
attr_accessor :exact_phrase
|
91
|
+
|
92
|
+
# Search for results with the words
|
93
|
+
attr_accessor :with_words
|
94
|
+
|
95
|
+
# Search for results without the words
|
96
|
+
attr_accessor :without_words
|
97
|
+
|
98
|
+
# Search for results written in the language
|
99
|
+
attr_accessor :language
|
100
|
+
|
101
|
+
# Search for results from the region
|
102
|
+
attr_accessor :region
|
103
|
+
|
104
|
+
# Search for results in the format
|
105
|
+
attr_accessor :in_format
|
106
|
+
|
107
|
+
# Search for results not in the format
|
108
|
+
attr_accessor :not_in_format
|
109
|
+
|
110
|
+
# Search for results within the past day
|
111
|
+
attr_accessor :within_past_day
|
112
|
+
|
113
|
+
# Search for results within the past week
|
114
|
+
attr_accessor :within_past_week
|
115
|
+
|
116
|
+
# Search for results within the past months
|
117
|
+
attr_accessor :within_past_months
|
118
|
+
|
119
|
+
# Search for results within the past year
|
120
|
+
attr_accessor :within_past_year
|
121
|
+
|
122
|
+
# Search for results containing numbers between the range
|
123
|
+
attr_accessor :numeric_range
|
124
|
+
|
125
|
+
# Search for results where the query occurs within the area
|
126
|
+
attr_accessor :occurrs_within
|
127
|
+
|
128
|
+
# Search for results inside the domain
|
129
|
+
attr_accessor :inside_domain
|
130
|
+
|
131
|
+
# Search for results outside the domain
|
132
|
+
attr_accessor :outside_domain
|
133
|
+
|
134
|
+
# Search for results which have the rights
|
135
|
+
attr_accessor :rights
|
136
|
+
|
137
|
+
# Filter the search results
|
138
|
+
attr_accessor :filtered
|
139
|
+
|
140
|
+
# Search for results similar to the page
|
141
|
+
attr_accessor :similar_to
|
142
|
+
|
143
|
+
# Search for results linking to the page
|
144
|
+
attr_accessor :links_to
|
145
|
+
|
146
|
+
#
|
147
|
+
# Creates a new WebQuery object from the given search options. If a
|
148
|
+
# block is given, it will be passed the newly created query object.
|
149
|
+
#
|
150
|
+
# WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
|
151
|
+
#
|
152
|
+
# WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
|
153
|
+
# q.within_past_week = true
|
154
|
+
# end
|
155
|
+
#
|
156
|
+
def initialize(options={},&block)
|
157
|
+
@agent = GScraper.web_agent(options)
|
158
|
+
|
159
|
+
@results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
|
160
|
+
|
161
|
+
@language = options[:language]
|
162
|
+
@region = options[:region]
|
163
|
+
|
164
|
+
if options[:within_past_day]
|
165
|
+
@within_past_day = options[:within_past_day]
|
166
|
+
@within_past_week = false
|
167
|
+
@within_past_months = false
|
168
|
+
@within_past_year = false
|
169
|
+
elsif options[:within_past_week]
|
170
|
+
@within_past_day = false
|
171
|
+
@within_past_week = options[:within_past_week]
|
172
|
+
@within_past_months = false
|
173
|
+
@within_past_year = false
|
174
|
+
elsif options[:within_past_months]
|
175
|
+
@within_past_day = false
|
176
|
+
@within_past_week = false
|
177
|
+
@within_past_months = options[:within_past_months]
|
178
|
+
@within_past_year = false
|
179
|
+
elsif options[:within_past_year]
|
180
|
+
@within_past_day = false
|
181
|
+
@within_past_week = false
|
182
|
+
@within_past_months = false
|
183
|
+
@within_past_year = options[:within_past_year]
|
184
|
+
else
|
185
|
+
@within_past_day = false
|
186
|
+
@within_past_week = false
|
187
|
+
@within_past_months = false
|
188
|
+
@within_past_year = false
|
189
|
+
end
|
190
|
+
|
191
|
+
@occurrs_within = options[:occurrs_within]
|
192
|
+
@rights = options[:rights]
|
193
|
+
@filtered = options[:filtered]
|
194
|
+
|
195
|
+
@similar_to = options[:similar_to]
|
196
|
+
@links_to = options[:links_to]
|
197
|
+
|
198
|
+
super(options,&block)
|
199
|
+
end
|
200
|
+
|
201
|
+
#
|
202
|
+
# Creates a new WebQuery object from the specified URL. If a block is
|
203
|
+
# given, it will be passed the newly created WebQuery object.
|
204
|
+
#
|
205
|
+
# WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
|
206
|
+
#
|
207
|
+
# WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
|
208
|
+
# q.within_past_months = 1
|
209
|
+
# q.occurrs_within = :title
|
210
|
+
# end
|
211
|
+
#
|
212
|
+
def self.from_url(url,options={},&block)
|
213
|
+
url = URI(url.to_s)
|
214
|
+
|
215
|
+
options[:results_per_page] = url.query_params['num'].to_i
|
216
|
+
|
217
|
+
options[:query] = url.query_params['q']
|
218
|
+
options[:exact_phrase] = url.query_params['as_epq']
|
219
|
+
options[:with_words] = url.query_params['as_oq']
|
220
|
+
options[:without_words] = url.query_params['as_eq']
|
221
|
+
|
222
|
+
options[:language] = url.query_params['lr']
|
223
|
+
options[:region] = url.query_params['cr']
|
224
|
+
|
225
|
+
if url.query_params['as_filetype']
|
226
|
+
options[:filetype] = url.query_params['as_filetype']
|
227
|
+
end
|
228
|
+
|
229
|
+
case url.query_params['as_qdr']
|
230
|
+
when 'd'
|
231
|
+
options[:within_past_day] = true
|
232
|
+
when 'w'
|
233
|
+
options[:within_past_week] = true
|
234
|
+
when 'm'
|
235
|
+
options[:within_past_months] = 1
|
236
|
+
when 'm2'
|
237
|
+
options[:within_past_months] = 2
|
238
|
+
when 'm3'
|
239
|
+
options[:within_past_months] = 3
|
240
|
+
when 'm6'
|
241
|
+
options[:within_past_months] = 6
|
242
|
+
when 'y'
|
243
|
+
options[:within_past_year] = true
|
244
|
+
end
|
245
|
+
|
246
|
+
if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
|
247
|
+
options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,
|
248
|
+
url.query_params['as_nhi'].to_i)
|
249
|
+
end
|
250
|
+
|
251
|
+
case url.query_params['as_occt']
|
252
|
+
when 'title'
|
253
|
+
options[:occurrs_within] = :title
|
254
|
+
when 'body'
|
255
|
+
options[:occurrs_within] = :body
|
256
|
+
when 'url'
|
257
|
+
options[:occurrs_within] = :url
|
258
|
+
when 'links'
|
259
|
+
options[:occurrs_within] = :links
|
260
|
+
end
|
261
|
+
|
262
|
+
options[:site] = url.query_params['as_sitesearch']
|
263
|
+
|
264
|
+
case url.query_params['as_rights']
|
265
|
+
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
266
|
+
options[:rights] = Licenses::CC_BY_NC_ND
|
267
|
+
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
268
|
+
options[:rights] = Licenses::CC_BY_SA
|
269
|
+
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
270
|
+
options[:rights] = Licenses::CC_BY_NC
|
271
|
+
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
272
|
+
options[:rights] = Licenses::CC_BY
|
273
|
+
end
|
274
|
+
|
275
|
+
if url.query_params[:safe] == 'active'
|
276
|
+
options[:filtered] = true
|
277
|
+
end
|
278
|
+
|
279
|
+
if url.query_params['as_rq']
|
280
|
+
options[:similar_to] = url.query_params['as_rq']
|
281
|
+
elsif url.query_params['as_lq']
|
282
|
+
options[:links_to] = url.query_params['as_lq']
|
283
|
+
end
|
284
|
+
|
285
|
+
return self.new(options,&block)
|
286
|
+
end
|
287
|
+
|
288
|
+
#
|
289
|
+
# Returns the URL that represents the query.
|
290
|
+
#
|
291
|
+
def search_url
|
292
|
+
url = URI(SEARCH_URL)
|
293
|
+
query_expr = []
|
294
|
+
|
295
|
+
set_param = lambda { |param,value|
|
296
|
+
url.query_params[param.to_s] = value if value
|
297
|
+
}
|
298
|
+
|
299
|
+
set_param.call('num',@results_per_page)
|
300
|
+
set_param.call('q',expression)
|
301
|
+
set_param.call('as_epq',@exact_phrase)
|
302
|
+
set_param.call('as_oq',@with_words)
|
303
|
+
set_param.call('as_eq',@without_words)
|
304
|
+
|
305
|
+
set_param.call('lr',@language)
|
306
|
+
set_param.call('cr',@region)
|
307
|
+
|
308
|
+
set_param.call('as_filetype',@filetype)
|
309
|
+
|
310
|
+
if @within_past_day
|
311
|
+
url.query_params['as_qdr'] = 'd'
|
312
|
+
elsif @within_past_week
|
313
|
+
url.query_params['as_qdr'] = 'w'
|
314
|
+
elsif @within_past_months
|
315
|
+
case @within_past_months
|
316
|
+
when 1
|
317
|
+
url.query_params['as_qdr'] = 'm'
|
318
|
+
when 2
|
319
|
+
url.query_params['as_qdr'] = 'm2'
|
320
|
+
when 3
|
321
|
+
url.query_params['as_qdr'] = 'm3'
|
322
|
+
when 6
|
323
|
+
url.query_params['as_qdr'] = 'm6'
|
324
|
+
end
|
325
|
+
elsif @within_past_year
|
326
|
+
url.query_params['as_qdr'] = 'y'
|
327
|
+
end
|
328
|
+
|
329
|
+
if @numeric_range.kind_of?(Range)
|
330
|
+
url.query_params['as_nlo'] = @numeric_range.begin
|
331
|
+
url.query_params['as_nhi'] = @numeric_range.end
|
332
|
+
end
|
333
|
+
|
334
|
+
case @occurrs_within
|
335
|
+
when :title, 'title'
|
336
|
+
url.query_params['as_occt'] = 'title'
|
337
|
+
when :body, 'body'
|
338
|
+
url.query_params['as_occt'] = 'body'
|
339
|
+
when :url, 'url'
|
340
|
+
url.query_params['as_occt'] = 'url'
|
341
|
+
when :links, 'links'
|
342
|
+
url.query_params['as_occt'] = 'links'
|
343
|
+
end
|
344
|
+
|
345
|
+
set_param.call('as_sitesearch',@site)
|
346
|
+
|
347
|
+
case @rights
|
348
|
+
when Licenses::CC_BY_NC_ND
|
349
|
+
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
350
|
+
when Licenses::CC_BY_SA
|
351
|
+
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
352
|
+
when Licenses::CC_BY_ND
|
353
|
+
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
354
|
+
when Licenses::CC_BY
|
355
|
+
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
356
|
+
end
|
357
|
+
|
358
|
+
url.query_params['safe'] = 'active' if @filtered
|
359
|
+
|
360
|
+
if @similar_to
|
361
|
+
url.query_params['as_rq'] = @similar_to
|
362
|
+
elsif @links_to
|
363
|
+
url.query_params['as_lq'] = @links_to
|
364
|
+
end
|
365
|
+
|
366
|
+
return url
|
367
|
+
end
|
368
|
+
|
369
|
+
#
|
370
|
+
# Returns the URL that represents the query at the specific
|
371
|
+
# _page_index_.
|
372
|
+
#
|
373
|
+
def page_url(page_index)
|
374
|
+
url = search_url
|
375
|
+
|
376
|
+
url.query_params['start'] = result_offset_of(page_index)
|
377
|
+
url.query_params['sa'] = 'N'
|
378
|
+
|
379
|
+
return url
|
380
|
+
end
|
381
|
+
|
382
|
+
#
|
383
|
+
# Returns a Page object containing Result objects at the specified
|
384
|
+
# _page_index_.
|
385
|
+
#
|
386
|
+
def page(page_index)
|
387
|
+
Page.new do |new_page|
|
388
|
+
doc = @agent.get(page_url(page_index))
|
389
|
+
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
390
|
+
|
391
|
+
rank_offset = result_offset_of(page_index)
|
392
|
+
|
393
|
+
results.each_with_index do |result,index|
|
394
|
+
rank = rank_offset + (index + 1)
|
395
|
+
link = result.at('//a.l')
|
396
|
+
title = link.inner_text
|
397
|
+
url = link.get_attribute('href')
|
398
|
+
summary_text = ''
|
399
|
+
cached_url = nil
|
400
|
+
similar_url = nil
|
401
|
+
|
402
|
+
if (content = (result.at('//td.j//font|//td.j/div')))
|
403
|
+
content.children.each do |elem|
|
404
|
+
break if (!(elem.text?) && elem.name=='br')
|
405
|
+
|
406
|
+
summary_text << elem.inner_text
|
407
|
+
end
|
408
|
+
|
409
|
+
if (cached_link = result.at('nobr/a:first'))
|
410
|
+
cached_url = cached_link.get_attribute('href')
|
411
|
+
end
|
412
|
+
|
413
|
+
if (similar_link = result.at('nobr/a:last'))
|
414
|
+
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
415
|
+
end
|
416
|
+
end
|
417
|
+
|
418
|
+
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
419
|
+
end
|
420
|
+
end
|
421
|
+
end
|
422
|
+
|
423
|
+
#
|
424
|
+
# Returns the first Result on the first_page.
|
425
|
+
#
|
426
|
+
def top_result
|
427
|
+
first_page.first
|
428
|
+
end
|
429
|
+
|
430
|
+
#
|
431
|
+
# Returns the Result at the specified _index_.
|
432
|
+
#
|
433
|
+
def result_at(index)
|
434
|
+
page(page_index_of(index))[result_index_of(index)]
|
435
|
+
end
|
436
|
+
|
437
|
+
#
|
438
|
+
# Returns a SponsoredLinks object containing SponsoredAd objects of
|
439
|
+
# the query.
|
440
|
+
#
|
441
|
+
def sponsored_links
|
442
|
+
SponsoredLinks.new do |links|
|
443
|
+
doc = @agent.get(search_url)
|
444
|
+
|
445
|
+
# top and side ads
|
446
|
+
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
447
|
+
title = link.inner_text
|
448
|
+
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
449
|
+
|
450
|
+
links << SponsoredAd.new(title,url)
|
451
|
+
end
|
452
|
+
end
|
453
|
+
end
|
454
|
+
|
455
|
+
#
|
456
|
+
# Returns the first sponsored link on the first page of results.
|
457
|
+
#
|
458
|
+
def top_sponsored_link
|
459
|
+
top_sponsored_links.first
|
460
|
+
end
|
461
|
+
|
462
|
+
#
|
463
|
+
# Iterates over the sponsored links on the first page of
|
464
|
+
# results passing each to the specified _block_.
|
465
|
+
#
|
466
|
+
def each_sponsored_link(&block)
|
467
|
+
sponsored_links.each(&block)
|
468
|
+
end
|
469
|
+
|
470
|
+
end
|
471
|
+
end
|
472
|
+
end
|