gscraper 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
data/lib/gscraper/search/page.rb
CHANGED
@@ -165,13 +165,12 @@ module GScraper
|
|
165
165
|
|
166
166
|
#
|
167
167
|
# Returns an Array containing the cached pages of the results within
|
168
|
-
# the Page.
|
169
|
-
# cached page.
|
168
|
+
# the Page.
|
170
169
|
#
|
171
170
|
# page.cached_pages # => [...]
|
172
171
|
#
|
173
|
-
def cached_pages
|
174
|
-
map { |result| result.cached_page
|
172
|
+
def cached_pages
|
173
|
+
map { |result| result.cached_page }
|
175
174
|
end
|
176
175
|
|
177
176
|
#
|
@@ -246,13 +245,12 @@ module GScraper
|
|
246
245
|
|
247
246
|
#
|
248
247
|
# Iterates over each result's cached pages within the Page, passing
|
249
|
-
# each to the given _block_.
|
250
|
-
# in accessing the cached pages.
|
248
|
+
# each to the given _block_.
|
251
249
|
#
|
252
250
|
# each_cached_page { |page| puts page.readlines }
|
253
251
|
#
|
254
|
-
def each_cached_page(
|
255
|
-
cached_pages
|
252
|
+
def each_cached_page(&block)
|
253
|
+
cached_pages.each(&block)
|
256
254
|
end
|
257
255
|
|
258
256
|
#
|
@@ -328,13 +326,13 @@ module GScraper
|
|
328
326
|
|
329
327
|
#
|
330
328
|
# Returns the cached pages of the results that match the specified
|
331
|
-
# _block_. If
|
329
|
+
# _block_. If _options_ are given, they will be used in accessing
|
332
330
|
# the cached pages.
|
333
331
|
#
|
334
332
|
# page.cached_pages_of { |result| result.title =~ /dude/ }
|
335
333
|
#
|
336
|
-
def cached_pages_of(
|
337
|
-
results_with(&block).cached_pages(
|
334
|
+
def cached_pages_of(options={},&block)
|
335
|
+
results_with(&block).cached_pages(options)
|
338
336
|
end
|
339
337
|
|
340
338
|
#
|
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'gscraper/search/result'
|
2
2
|
require 'gscraper/search/page'
|
3
|
+
require 'gscraper/sponsored_ad'
|
4
|
+
require 'gscraper/sponsored_links'
|
3
5
|
require 'gscraper/extensions/uri'
|
4
6
|
require 'gscraper/licenses'
|
5
|
-
require 'gscraper/
|
7
|
+
require 'gscraper/web_agent'
|
6
8
|
|
7
9
|
require 'hpricot'
|
8
10
|
|
@@ -10,9 +12,15 @@ module GScraper
|
|
10
12
|
module Search
|
11
13
|
class Query
|
12
14
|
|
15
|
+
include WebAgent
|
16
|
+
|
17
|
+
# Search host
|
13
18
|
SEARCH_HOST = 'www.google.com'
|
19
|
+
|
20
|
+
# Search URL
|
14
21
|
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
15
22
|
|
23
|
+
# Default results per-page
|
16
24
|
RESULTS_PER_PAGE = 10
|
17
25
|
|
18
26
|
# Results per-page
|
@@ -88,41 +96,39 @@ module GScraper
|
|
88
96
|
# q.within_past_week = true
|
89
97
|
# end
|
90
98
|
#
|
91
|
-
def initialize(
|
92
|
-
|
99
|
+
def initialize(options={},&block)
|
100
|
+
@results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
|
93
101
|
|
94
|
-
@
|
102
|
+
@query = options[:query]
|
103
|
+
@exact_phrase = options[:exact_phrase]
|
104
|
+
@with_words = options[:with_words]
|
105
|
+
@without_words = options[:without_words]
|
95
106
|
|
96
|
-
@
|
97
|
-
@
|
98
|
-
@
|
99
|
-
@
|
107
|
+
@language = options[:language]
|
108
|
+
@region = options[:region]
|
109
|
+
@in_format = options[:in_format]
|
110
|
+
@not_in_format = options[:not_in_format]
|
100
111
|
|
101
|
-
|
102
|
-
|
103
|
-
@in_format = opts[:in_format]
|
104
|
-
@not_in_format = opts[:not_in_format]
|
105
|
-
|
106
|
-
if opts[:within_past_day]
|
107
|
-
@within_past_day = opts[:within_past_day]
|
112
|
+
if options[:within_past_day]
|
113
|
+
@within_past_day = options[:within_past_day]
|
108
114
|
@within_past_week = false
|
109
115
|
@within_past_months = false
|
110
116
|
@within_past_year = false
|
111
|
-
elsif
|
117
|
+
elsif options[:within_past_week]
|
112
118
|
@within_past_day = false
|
113
|
-
@within_past_week =
|
119
|
+
@within_past_week = options[:within_past_week]
|
114
120
|
@within_past_months = false
|
115
121
|
@within_past_year = false
|
116
|
-
elsif
|
122
|
+
elsif options[:within_past_months]
|
117
123
|
@within_past_day = false
|
118
124
|
@within_past_week = false
|
119
|
-
@within_past_months =
|
125
|
+
@within_past_months = options[:within_past_months]
|
120
126
|
@within_past_year = false
|
121
|
-
elsif
|
127
|
+
elsif options[:within_past_year]
|
122
128
|
@within_past_day = false
|
123
129
|
@within_past_week = false
|
124
130
|
@within_past_months = false
|
125
|
-
@within_past_year =
|
131
|
+
@within_past_year = options[:within_past_year]
|
126
132
|
else
|
127
133
|
@within_past_day = false
|
128
134
|
@within_past_week = false
|
@@ -130,15 +136,15 @@ module GScraper
|
|
130
136
|
@within_past_year = false
|
131
137
|
end
|
132
138
|
|
133
|
-
@numeric_range =
|
134
|
-
@occurrs_within =
|
135
|
-
@inside_domain =
|
136
|
-
@outside_domain =
|
137
|
-
@rights =
|
138
|
-
@filtered =
|
139
|
+
@numeric_range = options[:numeric_range]
|
140
|
+
@occurrs_within = options[:occurrs_within]
|
141
|
+
@inside_domain = options[:inside_domain]
|
142
|
+
@outside_domain = options[:outside_domain]
|
143
|
+
@rights = options[:rights]
|
144
|
+
@filtered = options[:filtered]
|
139
145
|
|
140
|
-
@similar_to =
|
141
|
-
@links_to =
|
146
|
+
@similar_to = options[:similar_to]
|
147
|
+
@links_to = options[:links_to]
|
142
148
|
|
143
149
|
block.call(self) if block
|
144
150
|
end
|
@@ -154,95 +160,94 @@ module GScraper
|
|
154
160
|
# q.occurrs_within = :title
|
155
161
|
# end
|
156
162
|
#
|
157
|
-
def self.from_url(url,&block)
|
163
|
+
def self.from_url(url,options={},&block)
|
158
164
|
url = URI.parse(url)
|
159
|
-
opts = {}
|
160
165
|
|
161
|
-
|
166
|
+
options[:results_per_page] = url.query_params['num']
|
162
167
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
168
|
+
options[:query] = url.query_params['as_q']
|
169
|
+
options[:exact_phrase] = url.query_params['as_epq']
|
170
|
+
options[:with_words] = url.query_params['as_oq']
|
171
|
+
options[:without_words] = url.query_params['as_eq']
|
167
172
|
|
168
|
-
|
169
|
-
|
173
|
+
options[:language] = url.query_params['lr']
|
174
|
+
options[:region] = url.query_params['cr']
|
170
175
|
|
171
176
|
case url.query_params['as_ft']
|
172
177
|
when 'i'
|
173
|
-
|
178
|
+
options[:in_format] = url.query_params['as_filetype']
|
174
179
|
when 'e'
|
175
|
-
|
180
|
+
options[:not_in_format] = url.query_params['as_filetype']
|
176
181
|
end
|
177
182
|
|
178
183
|
case url.query_params['as_qdr']
|
179
184
|
when 'd'
|
180
|
-
|
185
|
+
options[:within_past_day] = true
|
181
186
|
when 'w'
|
182
|
-
|
187
|
+
options[:within_past_week] = true
|
183
188
|
when 'm'
|
184
|
-
|
189
|
+
options[:within_past_months] = 1
|
185
190
|
when 'm2'
|
186
|
-
|
191
|
+
options[:within_past_months] = 2
|
187
192
|
when 'm3'
|
188
|
-
|
193
|
+
options[:within_past_months] = 3
|
189
194
|
when 'm6'
|
190
|
-
|
195
|
+
options[:within_past_months] = 6
|
191
196
|
when 'y'
|
192
|
-
|
197
|
+
options[:within_past_year] = true
|
193
198
|
end
|
194
199
|
|
195
200
|
if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
|
196
|
-
|
201
|
+
options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
|
197
202
|
end
|
198
203
|
|
199
204
|
case url.query_params['as_occt']
|
200
205
|
when 'title'
|
201
|
-
|
206
|
+
options[:occurrs_within] = :title
|
202
207
|
when 'body'
|
203
|
-
|
208
|
+
options[:occurrs_within] = :body
|
204
209
|
when 'url'
|
205
|
-
|
210
|
+
options[:occurrs_within] = :url
|
206
211
|
when 'links'
|
207
|
-
|
212
|
+
options[:occurrs_within] = :links
|
208
213
|
end
|
209
214
|
|
210
215
|
case url.query_params['as_dt']
|
211
216
|
when 'i'
|
212
|
-
|
217
|
+
options[:inside_domain] = url.query_params['as_sitesearch']
|
213
218
|
when 'e'
|
214
|
-
|
219
|
+
options[:outside_domain] = url.query_params['as_sitesearch']
|
215
220
|
end
|
216
221
|
|
217
222
|
case url.query_params['as_rights']
|
218
223
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
219
|
-
|
224
|
+
options[:rights] = Licenses::CC_BY_NC_ND
|
220
225
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
221
|
-
|
226
|
+
options[:rights] = Licenses::CC_BY_SA
|
222
227
|
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
223
|
-
|
228
|
+
options[:rights] = Licenses::CC_BY_NC
|
224
229
|
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
225
|
-
|
230
|
+
options[:rights] = Licenses::CC_BY
|
226
231
|
end
|
227
232
|
|
228
233
|
if url.query_params[:safe]=='active'
|
229
|
-
|
234
|
+
options[:filtered] = true
|
230
235
|
end
|
231
236
|
|
232
237
|
if url.query_params['as_rq']
|
233
|
-
|
238
|
+
options[:similar_to] = url.query_params['as_rq']
|
234
239
|
elsif url.query_params['as_lq']
|
235
|
-
|
240
|
+
options[:links_to] = url.query_params['as_lq']
|
236
241
|
end
|
237
242
|
|
238
|
-
return self.new(
|
243
|
+
return self.new(options,&block)
|
239
244
|
end
|
240
245
|
|
241
246
|
#
|
242
247
|
# Returns the URL that represents the query.
|
243
248
|
#
|
244
249
|
def search_url
|
245
|
-
url = URI
|
250
|
+
url = URI(SEARCH_URL)
|
246
251
|
|
247
252
|
if @results_per_page
|
248
253
|
url.query_params['num'] = @results_per_page
|
@@ -344,37 +349,41 @@ module GScraper
|
|
344
349
|
|
345
350
|
#
|
346
351
|
# Returns a Page object containing Result objects at the specified
|
347
|
-
# _page_index_. If
|
348
|
-
# the SEARCH_URL. If a _block_ is given, it will be passed the newly
|
352
|
+
# _page_index_. If a _block_ is given, it will be passed the newly
|
349
353
|
# created Page.
|
350
354
|
#
|
351
|
-
def page(page_index
|
352
|
-
doc =
|
355
|
+
def page(page_index,&block)
|
356
|
+
doc = get_page(page_url(page_index))
|
353
357
|
|
354
358
|
new_page = Page.new
|
355
359
|
results = doc.search('//div.g')[0...@results_per_page.to_i]
|
356
360
|
|
357
361
|
results.each_with_index do |result,index|
|
358
362
|
rank = page_result_offset(page_index) + (index + 1)
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
accum + elem.inner_text
|
364
|
-
end
|
365
|
-
|
363
|
+
link = result.at('//a.l')
|
364
|
+
title = link.inner_text
|
365
|
+
url = link.get_attribute('href')
|
366
|
+
summary_text = ''
|
366
367
|
cached_url = nil
|
367
368
|
similar_url = nil
|
368
369
|
|
369
|
-
if (
|
370
|
-
|
371
|
-
|
370
|
+
if (content = (result.at('//td.j//font|//td.j/div.sml')))
|
371
|
+
content.children.each do |elem|
|
372
|
+
break if (!(elem.text?) && elem.name=='br')
|
372
373
|
|
373
|
-
|
374
|
-
|
374
|
+
summary_text << elem.inner_text
|
375
|
+
end
|
376
|
+
|
377
|
+
if (cached_link = result.at('nobr/a:first'))
|
378
|
+
cached_url = cached_link.get_attribute('href')
|
379
|
+
end
|
380
|
+
|
381
|
+
if (similar_link = result.at('nobr/a:last'))
|
382
|
+
similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
|
383
|
+
end
|
375
384
|
end
|
376
385
|
|
377
|
-
new_page << Result.new(rank,title,url,
|
386
|
+
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
378
387
|
end
|
379
388
|
|
380
389
|
block.call(new_page) if block
|
@@ -382,56 +391,85 @@ module GScraper
|
|
382
391
|
end
|
383
392
|
|
384
393
|
#
|
385
|
-
# Returns the Results on the first page. If
|
386
|
-
# will be
|
387
|
-
# it will be passed the newly created Page.
|
394
|
+
# Returns the Results on the first page. If a _block_ is given it
|
395
|
+
# will be passed the newly created Page.
|
388
396
|
#
|
389
|
-
def first_page(
|
390
|
-
page(1
|
397
|
+
def first_page(&block)
|
398
|
+
page(1,&block)
|
391
399
|
end
|
392
400
|
|
393
401
|
#
|
394
|
-
# Returns the Result at the specified _index_.
|
395
|
-
# they will be used in accessing the Page containing the requested
|
396
|
-
# Result.
|
402
|
+
# Returns the Result at the specified _index_.
|
397
403
|
#
|
398
|
-
def result_at(index
|
399
|
-
page(result_page_index(index)
|
404
|
+
def result_at(index)
|
405
|
+
page(result_page_index(index))[page_result_index(index)]
|
400
406
|
end
|
401
407
|
|
402
408
|
#
|
403
|
-
# Returns the first Result
|
404
|
-
# given, they will be used in accessing the Page containing the
|
405
|
-
# requested Result.
|
409
|
+
# Returns the first Result on the first_page.
|
406
410
|
#
|
407
|
-
def
|
408
|
-
result_at(1
|
411
|
+
def top_result
|
412
|
+
result_at(1)
|
409
413
|
end
|
410
414
|
|
411
415
|
#
|
412
416
|
# Iterates over the results at the specified _page_index_, passing
|
413
|
-
# each to the given _block_.
|
414
|
-
# in accessing the SEARCH_URL.
|
417
|
+
# each to the given _block_.
|
415
418
|
#
|
416
419
|
# query.each_on_page(2) do |result|
|
417
420
|
# puts result.title
|
418
421
|
# end
|
419
422
|
#
|
420
|
-
def each_on_page(page_index
|
421
|
-
page(page_index
|
423
|
+
def each_on_page(page_index,&block)
|
424
|
+
page(page_index).each(&block)
|
422
425
|
end
|
423
426
|
|
424
427
|
#
|
425
|
-
# Iterates over the results on the first page, passing
|
426
|
-
#
|
427
|
-
# in accessing the SEARCH_URL.
|
428
|
+
# Iterates over the results on the first page, passing each to the
|
429
|
+
# given _block_.
|
428
430
|
#
|
429
431
|
# query.each_on_first_page do |result|
|
430
432
|
# puts result.url
|
431
433
|
# end
|
432
434
|
#
|
433
|
-
def each_on_first_page(
|
434
|
-
each_on_page(1
|
435
|
+
def each_on_first_page(&block)
|
436
|
+
each_on_page(1,&block)
|
437
|
+
end
|
438
|
+
|
439
|
+
#
|
440
|
+
# Returns a SponsoredLinks object containing SponsoredAd objects of
|
441
|
+
# the query. If a _block_ is given, it will be passed the newly
|
442
|
+
# created Page.
|
443
|
+
#
|
444
|
+
def sponsored_links(&block)
|
445
|
+
doc = get_page(search_url)
|
446
|
+
new_links = SponsoredLinks.new
|
447
|
+
|
448
|
+
# top and side ads
|
449
|
+
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
|
450
|
+
title = link.inner_text
|
451
|
+
url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
|
452
|
+
|
453
|
+
new_links << SponsoredAd.new(title,url)
|
454
|
+
end
|
455
|
+
|
456
|
+
block.call(new_links) if block
|
457
|
+
return new_links
|
458
|
+
end
|
459
|
+
|
460
|
+
#
|
461
|
+
# Returns the first sponsored link on the first page of results.
|
462
|
+
#
|
463
|
+
def top_sponsored_link
|
464
|
+
top_sponsored_links.first
|
465
|
+
end
|
466
|
+
|
467
|
+
#
|
468
|
+
# Iterates over the sponsored links on the first page of
|
469
|
+
# results passing each to the specified _block_.
|
470
|
+
#
|
471
|
+
def each_sponsored_link(&block)
|
472
|
+
sponsored_links.each(&block)
|
435
473
|
end
|
436
474
|
|
437
475
|
protected
|