gscraper 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
data/lib/gscraper/search/page.rb
CHANGED
@@ -165,13 +165,12 @@ module GScraper
|
|
165
165
|
|
166
166
|
#
|
167
167
|
# Returns an Array containing the cached pages of the results within
|
168
|
-
# the Page.
|
169
|
-
# cached page.
|
168
|
+
# the Page.
|
170
169
|
#
|
171
170
|
# page.cached_pages # => [...]
|
172
171
|
#
|
173
|
-
def cached_pages
|
174
|
-
map { |result| result.cached_page
|
172
|
+
def cached_pages
|
173
|
+
map { |result| result.cached_page }
|
175
174
|
end
|
176
175
|
|
177
176
|
#
|
@@ -246,13 +245,12 @@ module GScraper
|
|
246
245
|
|
247
246
|
#
|
248
247
|
# Iterates over each result's cached pages within the Page, passing
|
249
|
-
# each to the given _block_.
|
250
|
-
# in accessing the cached pages.
|
248
|
+
# each to the given _block_.
|
251
249
|
#
|
252
250
|
# each_cached_page { |page| puts page.readlines }
|
253
251
|
#
|
254
|
-
def each_cached_page(
|
255
|
-
cached_pages
|
252
|
+
def each_cached_page(&block)
|
253
|
+
cached_pages.each(&block)
|
256
254
|
end
|
257
255
|
|
258
256
|
#
|
@@ -328,13 +326,13 @@ module GScraper
|
|
328
326
|
|
329
327
|
#
|
330
328
|
# Returns the cached pages of the results that match the specified
|
331
|
-
# _block_. If
|
329
|
+
# _block_. If _options_ are given, they will be used in accessing
|
332
330
|
# the cached pages.
|
333
331
|
#
|
334
332
|
# page.cached_pages_of { |result| result.title =~ /dude/ }
|
335
333
|
#
|
336
|
-
def cached_pages_of(
|
337
|
-
results_with(&block).cached_pages(
|
334
|
+
def cached_pages_of(options={},&block)
|
335
|
+
results_with(&block).cached_pages(options)
|
338
336
|
end
|
339
337
|
|
340
338
|
#
|
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'gscraper/search/result'
|
2
2
|
require 'gscraper/search/page'
|
3
|
+
require 'gscraper/sponsored_ad'
|
4
|
+
require 'gscraper/sponsored_links'
|
3
5
|
require 'gscraper/extensions/uri'
|
4
6
|
require 'gscraper/licenses'
|
5
|
-
require 'gscraper/
|
7
|
+
require 'gscraper/web_agent'
|
6
8
|
|
7
9
|
require 'hpricot'
|
8
10
|
|
@@ -10,9 +12,15 @@ module GScraper
|
|
10
12
|
module Search
|
11
13
|
class Query
|
12
14
|
|
15
|
+
include WebAgent
|
16
|
+
|
17
|
+
# Search host
|
13
18
|
SEARCH_HOST = 'www.google.com'
|
19
|
+
|
20
|
+
# Search URL
|
14
21
|
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
15
22
|
|
23
|
+
# Default results per-page
|
16
24
|
RESULTS_PER_PAGE = 10
|
17
25
|
|
18
26
|
# Results per-page
|
@@ -88,41 +96,39 @@ module GScraper
|
|
88
96
|
# q.within_past_week = true
|
89
97
|
# end
|
90
98
|
#
|
91
|
-
def initialize(
|
92
|
-
|
99
|
+
def initialize(options={},&block)
|
100
|
+
@results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
|
93
101
|
|
94
|
-
@
|
102
|
+
@query = options[:query]
|
103
|
+
@exact_phrase = options[:exact_phrase]
|
104
|
+
@with_words = options[:with_words]
|
105
|
+
@without_words = options[:without_words]
|
95
106
|
|
96
|
-
@
|
97
|
-
@
|
98
|
-
@
|
99
|
-
@
|
107
|
+
@language = options[:language]
|
108
|
+
@region = options[:region]
|
109
|
+
@in_format = options[:in_format]
|
110
|
+
@not_in_format = options[:not_in_format]
|
100
111
|
|
101
|
-
|
102
|
-
|
103
|
-
@in_format = opts[:in_format]
|
104
|
-
@not_in_format = opts[:not_in_format]
|
105
|
-
|
106
|
-
if opts[:within_past_day]
|
107
|
-
@within_past_day = opts[:within_past_day]
|
112
|
+
if options[:within_past_day]
|
113
|
+
@within_past_day = options[:within_past_day]
|
108
114
|
@within_past_week = false
|
109
115
|
@within_past_months = false
|
110
116
|
@within_past_year = false
|
111
|
-
elsif
|
117
|
+
elsif options[:within_past_week]
|
112
118
|
@within_past_day = false
|
113
|
-
@within_past_week =
|
119
|
+
@within_past_week = options[:within_past_week]
|
114
120
|
@within_past_months = false
|
115
121
|
@within_past_year = false
|
116
|
-
elsif
|
122
|
+
elsif options[:within_past_months]
|
117
123
|
@within_past_day = false
|
118
124
|
@within_past_week = false
|
119
|
-
@within_past_months =
|
125
|
+
@within_past_months = options[:within_past_months]
|
120
126
|
@within_past_year = false
|
121
|
-
elsif
|
127
|
+
elsif options[:within_past_year]
|
122
128
|
@within_past_day = false
|
123
129
|
@within_past_week = false
|
124
130
|
@within_past_months = false
|
125
|
-
@within_past_year =
|
131
|
+
@within_past_year = options[:within_past_year]
|
126
132
|
else
|
127
133
|
@within_past_day = false
|
128
134
|
@within_past_week = false
|
@@ -130,15 +136,15 @@ module GScraper
|
|
130
136
|
@within_past_year = false
|
131
137
|
end
|
132
138
|
|
133
|
-
@numeric_range =
|
134
|
-
@occurrs_within =
|
135
|
-
@inside_domain =
|
136
|
-
@outside_domain =
|
137
|
-
@rights =
|
138
|
-
@filtered =
|
139
|
+
@numeric_range = options[:numeric_range]
|
140
|
+
@occurrs_within = options[:occurrs_within]
|
141
|
+
@inside_domain = options[:inside_domain]
|
142
|
+
@outside_domain = options[:outside_domain]
|
143
|
+
@rights = options[:rights]
|
144
|
+
@filtered = options[:filtered]
|
139
145
|
|
140
|
-
@similar_to =
|
141
|
-
@links_to =
|
146
|
+
@similar_to = options[:similar_to]
|
147
|
+
@links_to = options[:links_to]
|
142
148
|
|
143
149
|
block.call(self) if block
|
144
150
|
end
|
@@ -154,95 +160,94 @@ module GScraper
|
|
154
160
|
# q.occurrs_within = :title
|
155
161
|
# end
|
156
162
|
#
|
157
|
-
def self.from_url(url,&block)
|
163
|
+
def self.from_url(url,options={},&block)
|
158
164
|
url = URI.parse(url)
|
159
|
-
opts = {}
|
160
165
|
|
161
|
-
|
166
|
+
options[:results_per_page] = url.query_params['num']
|
162
167
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
168
|
+
options[:query] = url.query_params['as_q']
|
169
|
+
options[:exact_phrase] = url.query_params['as_epq']
|
170
|
+
options[:with_words] = url.query_params['as_oq']
|
171
|
+
options[:without_words] = url.query_params['as_eq']
|
167
172
|
|
168
|
-
|
169
|
-
|
173
|
+
options[:language] = url.query_params['lr']
|
174
|
+
options[:region] = url.query_params['cr']
|
170
175
|
|
171
176
|
case url.query_params['as_ft']
|
172
177
|
when 'i'
|
173
|
-
|
178
|
+
options[:in_format] = url.query_params['as_filetype']
|
174
179
|
when 'e'
|
175
|
-
|
180
|
+
options[:not_in_format] = url.query_params['as_filetype']
|
176
181
|
end
|
177
182
|
|
178
183
|
case url.query_params['as_qdr']
|
179
184
|
when 'd'
|
180
|
-
|
185
|
+
options[:within_past_day] = true
|
181
186
|
when 'w'
|
182
|
-
|
187
|
+
options[:within_past_week] = true
|
183
188
|
when 'm'
|
184
|
-
|
189
|
+
options[:within_past_months] = 1
|
185
190
|
when 'm2'
|
186
|
-
|
191
|
+
options[:within_past_months] = 2
|
187
192
|
when 'm3'
|
188
|
-
|
193
|
+
options[:within_past_months] = 3
|
189
194
|
when 'm6'
|
190
|
-
|
195
|
+
options[:within_past_months] = 6
|
191
196
|
when 'y'
|
192
|
-
|
197
|
+
options[:within_past_year] = true
|
193
198
|
end
|
194
199
|
|
195
200
|
if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
|
196
|
-
|
201
|
+
options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
|
197
202
|
end
|
198
203
|
|
199
204
|
case url.query_params['as_occt']
|
200
205
|
when 'title'
|
201
|
-
|
206
|
+
options[:occurrs_within] = :title
|
202
207
|
when 'body'
|
203
|
-
|
208
|
+
options[:occurrs_within] = :body
|
204
209
|
when 'url'
|
205
|
-
|
210
|
+
options[:occurrs_within] = :url
|
206
211
|
when 'links'
|
207
|
-
|
212
|
+
options[:occurrs_within] = :links
|
208
213
|
end
|
209
214
|
|
210
215
|
case url.query_params['as_dt']
|
211
216
|
when 'i'
|
212
|
-
|
217
|
+
options[:inside_domain] = url.query_params['as_sitesearch']
|
213
218
|
when 'e'
|
214
|
-
|
219
|
+
options[:outside_domain] = url.query_params['as_sitesearch']
|
215
220
|
end
|
216
221
|
|
217
222
|
case url.query_params['as_rights']
|
218
223
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
219
|
-
|
224
|
+
options[:rights] = Licenses::CC_BY_NC_ND
|
220
225
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
221
|
-
|
226
|
+
options[:rights] = Licenses::CC_BY_SA
|
222
227
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
223
|
-
|
228
|
+
options[:rights] = Licenses::CC_BY_NC
|
224
229
|
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
225
|
-
|
230
|
+
options[:rights] = Licenses::CC_BY
|
226
231
|
end
|
227
232
|
|
228
233
|
if url.query_params[:safe]=='active'
|
229
|
-
|
234
|
+
options[:filtered] = true
|
230
235
|
end
|
231
236
|
|
232
237
|
if url.query_params['as_rq']
|
233
|
-
|
238
|
+
options[:similar_to] = url.query_params['as_rq']
|
234
239
|
elsif url.query_params['as_lq']
|
235
|
-
|
240
|
+
options[:links_to] = url.query_params['as_lq']
|
236
241
|
end
|
237
242
|
|
238
|
-
return self.new(
|
243
|
+
return self.new(options,&block)
|
239
244
|
end
|
240
245
|
|
241
246
|
#
|
242
247
|
# Returns the URL that represents the query.
|
243
248
|
#
|
244
249
|
def search_url
|
245
|
-
url = URI
|
250
|
+
url = URI(SEARCH_URL)
|
246
251
|
|
247
252
|
if @results_per_page
|
248
253
|
url.query_params['num'] = @results_per_page
|
@@ -344,37 +349,41 @@ module GScraper
|
|
344
349
|
|
345
350
|
#
|
346
351
|
# Returns a Page object containing Result objects at the specified
|
347
|
-
# _page_index_. If
|
348
|
-
# the SEARCH_URL. If a _block_ is given, it will be passed the newly
|
352
|
+
# _page_index_. If a _block_ is given, it will be passed the newly
|
349
353
|
# created Page.
|
350
354
|
#
|
351
|
-
def page(page_index
|
352
|
-
doc =
|
355
|
+
def page(page_index,&block)
|
356
|
+
doc = get_page(page_url(page_index))
|
353
357
|
|
354
358
|
new_page = Page.new
|
355
359
|
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
356
360
|
|
357
361
|
results.each_with_index do |result,index|
|
358
362
|
rank = page_result_offset(page_index) + (index + 1)
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
accum + elem.inner_text
|
364
|
-
end
|
365
|
-
|
363
|
+
link = result.at('//a.l')
|
364
|
+
title = link.inner_text
|
365
|
+
url = link.get_attribute('href')
|
366
|
+
summary_text = ''
|
366
367
|
cached_url = nil
|
367
368
|
similar_url = nil
|
368
369
|
|
369
|
-
if (
|
370
|
-
|
371
|
-
|
370
|
+
if (content = (result.at('//td.j//font|//td.j/div.sml')))
|
371
|
+
content.children.each do |elem|
|
372
|
+
break if (!(elem.text?) && elem.name=='br')
|
372
373
|
|
373
|
-
|
374
|
-
|
374
|
+
summary_text << elem.inner_text
|
375
|
+
end
|
376
|
+
|
377
|
+
if (cached_link = result.at('nobr/a:first'))
|
378
|
+
cached_url = cached_link.get_attribute('href')
|
379
|
+
end
|
380
|
+
|
381
|
+
if (similar_link = result.at('nobr/a:last'))
|
382
|
+
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
383
|
+
end
|
375
384
|
end
|
376
385
|
|
377
|
-
new_page << Result.new(rank,title,url,
|
386
|
+
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
378
387
|
end
|
379
388
|
|
380
389
|
block.call(new_page) if block
|
@@ -382,56 +391,85 @@ module GScraper
|
|
382
391
|
end
|
383
392
|
|
384
393
|
#
|
385
|
-
# Returns the Results on the first page. If
|
386
|
-
# will be
|
387
|
-
# it will be passed the newly created Page.
|
394
|
+
# Returns the Results on the first page. If a _block_ is given it
|
395
|
+
# will be passed the newly created Page.
|
388
396
|
#
|
389
|
-
def first_page(
|
390
|
-
page(1
|
397
|
+
def first_page(&block)
|
398
|
+
page(1,&block)
|
391
399
|
end
|
392
400
|
|
393
401
|
#
|
394
|
-
# Returns the Result at the specified _index_.
|
395
|
-
# they will be used in accessing the Page containing the requested
|
396
|
-
# Result.
|
402
|
+
# Returns the Result at the specified _index_.
|
397
403
|
#
|
398
|
-
def result_at(index
|
399
|
-
page(result_page_index(index)
|
404
|
+
def result_at(index)
|
405
|
+
page(result_page_index(index))[page_result_index(index)]
|
400
406
|
end
|
401
407
|
|
402
408
|
#
|
403
|
-
# Returns the first Result
|
404
|
-
# given, they will be used in accessing the Page containing the
|
405
|
-
# requested Result.
|
409
|
+
# Returns the first Result on the first_page.
|
406
410
|
#
|
407
|
-
def
|
408
|
-
result_at(1
|
411
|
+
def top_result
|
412
|
+
result_at(1)
|
409
413
|
end
|
410
414
|
|
411
415
|
#
|
412
416
|
# Iterates over the results at the specified _page_index_, passing
|
413
|
-
# each to the given _block_.
|
414
|
-
# in accessing the SEARCH_URL.
|
417
|
+
# each to the given _block_.
|
415
418
|
#
|
416
419
|
# query.each_on_page(2) do |result|
|
417
420
|
# puts result.title
|
418
421
|
# end
|
419
422
|
#
|
420
|
-
def each_on_page(page_index
|
421
|
-
page(page_index
|
423
|
+
def each_on_page(page_index,&block)
|
424
|
+
page(page_index).each(&block)
|
422
425
|
end
|
423
426
|
|
424
427
|
#
|
425
|
-
# Iterates over the results on the first page, passing
|
426
|
-
#
|
427
|
-
# in accessing the SEARCH_URL.
|
428
|
+
# Iterates over the results on the first page, passing each to the
|
429
|
+
# given _block_.
|
428
430
|
#
|
429
431
|
# query.each_on_first_page do |result|
|
430
432
|
# puts result.url
|
431
433
|
# end
|
432
434
|
#
|
433
|
-
def each_on_first_page(
|
434
|
-
each_on_page(1
|
435
|
+
def each_on_first_page(&block)
|
436
|
+
each_on_page(1,&block)
|
437
|
+
end
|
438
|
+
|
439
|
+
#
|
440
|
+
# Returns a SponsoredLinks object containing SponsoredAd objects of
|
441
|
+
# the query. If a _block_ is given, it will be passed the newly
|
442
|
+
# created SponsoredLinks object.
|
443
|
+
#
|
444
|
+
def sponsored_links(&block)
|
445
|
+
doc = get_page(search_url)
|
446
|
+
new_links = SponsoredLinks.new
|
447
|
+
|
448
|
+
# top and side ads
|
449
|
+
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
450
|
+
title = link.inner_text
|
451
|
+
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
452
|
+
|
453
|
+
new_links << SponsoredAd.new(title,url)
|
454
|
+
end
|
455
|
+
|
456
|
+
block.call(new_links) if block
|
457
|
+
return new_links
|
458
|
+
end
|
459
|
+
|
460
|
+
#
|
461
|
+
# Returns the first sponsored link on the first page of results.
|
462
|
+
#
|
463
|
+
def top_sponsored_link
|
464
|
+
top_sponsored_links.first
|
465
|
+
end
|
466
|
+
|
467
|
+
#
|
468
|
+
# Iterates over the sponsored links on the first page of
|
469
|
+
# results, passing each to the specified _block_.
|
470
|
+
#
|
471
|
+
def each_sponsored_link(&block)
|
472
|
+
sponsored_links.each(&block)
|
435
473
|
end
|
436
474
|
|
437
475
|
protected
|