gscraper 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -165,13 +165,12 @@ module GScraper
165
165
 
166
166
  #
167
167
  # Returns an Array containing the cached pages of the results within
168
- # the Page. If _opts_ are given, they will be used in accessing the
169
- # cached page.
168
+ # the Page.
170
169
  #
171
170
  # page.cached_pages # => [...]
172
171
  #
173
- def cached_pages(opts={})
174
- map { |result| result.cached_page(opts) }
172
+ def cached_pages
173
+ map { |result| result.cached_page }
175
174
  end
176
175
 
177
176
  #
@@ -246,13 +245,12 @@ module GScraper
246
245
 
247
246
  #
248
247
  # Iterates over each result's cached pages within the Page, passing
249
- # each to the given _block_. If _opts_ are given, they will be used
250
- # in accessing the cached pages.
248
+ # each to the given _block_.
251
249
  #
252
250
  # each_cached_page { |page| puts page.readlines }
253
251
  #
254
- def each_cached_page(opts={},&block)
255
- cached_pages(opts).each(&block)
252
+ def each_cached_page(&block)
253
+ cached_pages.each(&block)
256
254
  end
257
255
 
258
256
  #
@@ -328,13 +326,13 @@ module GScraper
328
326
 
329
327
  #
330
328
  # Returns the cached pages of the results that match the specified
331
- # _block_. If _opts_ are given, they will be used in accessing
329
+ # _block_. If _options_ are given, they will be used in accessing
332
330
  # the cached pages.
333
331
  #
334
332
  # page.cached_pages_of { |result| result.title =~ /dude/ }
335
333
  #
336
- def cached_pages_of(opts={},&block)
337
- results_with(&block).cached_pages(opts)
334
+ def cached_pages_of(options={},&block)
335
+ results_with(&block).cached_pages(options)
338
336
  end
339
337
 
340
338
  #
@@ -1,8 +1,10 @@
1
1
  require 'gscraper/search/result'
2
2
  require 'gscraper/search/page'
3
+ require 'gscraper/sponsored_ad'
4
+ require 'gscraper/sponsored_links'
3
5
  require 'gscraper/extensions/uri'
4
6
  require 'gscraper/licenses'
5
- require 'gscraper/gscraper'
7
+ require 'gscraper/web_agent'
6
8
 
7
9
  require 'hpricot'
8
10
 
@@ -10,9 +12,15 @@ module GScraper
10
12
  module Search
11
13
  class Query
12
14
 
15
+ include WebAgent
16
+
17
+ # Search host
13
18
  SEARCH_HOST = 'www.google.com'
19
+
20
+ # Search URL
14
21
  SEARCH_URL = "http://#{SEARCH_HOST}/search"
15
22
 
23
+ # Default results per-page
16
24
  RESULTS_PER_PAGE = 10
17
25
 
18
26
  # Results per-page
@@ -88,41 +96,39 @@ module GScraper
88
96
  # q.within_past_week = true
89
97
  # end
90
98
  #
91
- def initialize(opts={},&block)
92
- super()
99
+ def initialize(options={},&block)
100
+ @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
93
101
 
94
- @results_per_page = (opts[:results_per_page] || RESULTS_PER_PAGE)
102
+ @query = options[:query]
103
+ @exact_phrase = options[:exact_phrase]
104
+ @with_words = options[:with_words]
105
+ @without_words = options[:without_words]
95
106
 
96
- @query = opts[:query]
97
- @exact_phrase = opts[:exact_phrase]
98
- @with_words = opts[:with_words]
99
- @without_words = opts[:without_words]
107
+ @language = options[:language]
108
+ @region = options[:region]
109
+ @in_format = options[:in_format]
110
+ @not_in_format = options[:not_in_format]
100
111
 
101
- @language = opts[:language]
102
- @region = opts[:region]
103
- @in_format = opts[:in_format]
104
- @not_in_format = opts[:not_in_format]
105
-
106
- if opts[:within_past_day]
107
- @within_past_day = opts[:within_past_day]
112
+ if options[:within_past_day]
113
+ @within_past_day = options[:within_past_day]
108
114
  @within_past_week = false
109
115
  @within_past_months = false
110
116
  @within_past_year = false
111
- elsif opts[:within_past_week]
117
+ elsif options[:within_past_week]
112
118
  @within_past_day = false
113
- @within_past_week = opts[:within_past_week]
119
+ @within_past_week = options[:within_past_week]
114
120
  @within_past_months = false
115
121
  @within_past_year = false
116
- elsif opts[:within_past_months]
122
+ elsif options[:within_past_months]
117
123
  @within_past_day = false
118
124
  @within_past_week = false
119
- @within_past_months = opts[:within_past_months]
125
+ @within_past_months = options[:within_past_months]
120
126
  @within_past_year = false
121
- elsif opts[:within_past_year]
127
+ elsif options[:within_past_year]
122
128
  @within_past_day = false
123
129
  @within_past_week = false
124
130
  @within_past_months = false
125
- @within_past_year = opts[:within_past_year]
131
+ @within_past_year = options[:within_past_year]
126
132
  else
127
133
  @within_past_day = false
128
134
  @within_past_week = false
@@ -130,15 +136,15 @@ module GScraper
130
136
  @within_past_year = false
131
137
  end
132
138
 
133
- @numeric_range = opts[:numeric_range]
134
- @occurrs_within = opts[:occurrs_within]
135
- @inside_domain = opts[:inside_domain]
136
- @outside_domain = opts[:outside_domain]
137
- @rights = opts[:rights]
138
- @filtered = opts[:filtered]
139
+ @numeric_range = options[:numeric_range]
140
+ @occurrs_within = options[:occurrs_within]
141
+ @inside_domain = options[:inside_domain]
142
+ @outside_domain = options[:outside_domain]
143
+ @rights = options[:rights]
144
+ @filtered = options[:filtered]
139
145
 
140
- @similar_to = opts[:similar_to]
141
- @links_to = opts[:links_to]
146
+ @similar_to = options[:similar_to]
147
+ @links_to = options[:links_to]
142
148
 
143
149
  block.call(self) if block
144
150
  end
@@ -154,95 +160,94 @@ module GScraper
154
160
  # q.occurrs_within = :title
155
161
  # end
156
162
  #
157
- def self.from_url(url,&block)
163
+ def self.from_url(url,options={},&block)
158
164
  url = URI.parse(url)
159
- opts = {}
160
165
 
161
- opts[:results_per_page] = url.query_params['num']
166
+ options[:results_per_page] = url.query_params['num']
162
167
 
163
- opts[:query] = url.query_params['as_q']
164
- opts[:exact_phrase] = url.query_params['as_epq']
165
- opts[:with_words] = url.query_params['as_oq']
166
- opts[:without_words] = url.query_params['as_eq']
168
+ options[:query] = url.query_params['as_q']
169
+ options[:exact_phrase] = url.query_params['as_epq']
170
+ options[:with_words] = url.query_params['as_oq']
171
+ options[:without_words] = url.query_params['as_eq']
167
172
 
168
- opts[:language] = url.query_params['lr']
169
- opts[:region] = url.query_params['cr']
173
+ options[:language] = url.query_params['lr']
174
+ options[:region] = url.query_params['cr']
170
175
 
171
176
  case url.query_params['as_ft']
172
177
  when 'i'
173
- opts[:in_format] = url.query_params['as_filetype']
178
+ options[:in_format] = url.query_params['as_filetype']
174
179
  when 'e'
175
- opts[:not_in_format] = url.query_params['as_filetype']
180
+ options[:not_in_format] = url.query_params['as_filetype']
176
181
  end
177
182
 
178
183
  case url.query_params['as_qdr']
179
184
  when 'd'
180
- opts[:within_past_day] = true
185
+ options[:within_past_day] = true
181
186
  when 'w'
182
- opts[:within_past_week] = true
187
+ options[:within_past_week] = true
183
188
  when 'm'
184
- opts[:within_past_months] = 1
189
+ options[:within_past_months] = 1
185
190
  when 'm2'
186
- opts[:within_past_months] = 2
191
+ options[:within_past_months] = 2
187
192
  when 'm3'
188
- opts[:within_past_months] = 3
193
+ options[:within_past_months] = 3
189
194
  when 'm6'
190
- opts[:within_past_months] = 6
195
+ options[:within_past_months] = 6
191
196
  when 'y'
192
- opts[:within_past_year] = true
197
+ options[:within_past_year] = true
193
198
  end
194
199
 
195
200
  if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
196
- opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
201
+ options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
197
202
  end
198
203
 
199
204
  case url.query_params['as_occt']
200
205
  when 'title'
201
- opts[:occurrs_within] = :title
206
+ options[:occurrs_within] = :title
202
207
  when 'body'
203
- opts[:occurrs_within] = :body
208
+ options[:occurrs_within] = :body
204
209
  when 'url'
205
- opts[:occurrs_within] = :url
210
+ options[:occurrs_within] = :url
206
211
  when 'links'
207
- opts[:occurrs_within] = :links
212
+ options[:occurrs_within] = :links
208
213
  end
209
214
 
210
215
  case url.query_params['as_dt']
211
216
  when 'i'
212
- opts[:inside_domain] = url.query_params['as_sitesearch']
217
+ options[:inside_domain] = url.query_params['as_sitesearch']
213
218
  when 'e'
214
- opts[:outside_domain] = url.query_params['as_sitesearch']
219
+ options[:outside_domain] = url.query_params['as_sitesearch']
215
220
  end
216
221
 
217
222
  case url.query_params['as_rights']
218
223
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
219
- opts[:rights] = Licenses::CC_BY_NC_ND
224
+ options[:rights] = Licenses::CC_BY_NC_ND
220
225
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
221
- opts[:rights] = Licenses::CC_BY_SA
226
+ options[:rights] = Licenses::CC_BY_SA
222
227
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
223
- opts[:rights] = Licenses::CC_BY_NC
228
+ options[:rights] = Licenses::CC_BY_NC
224
229
  when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
225
- opts[:rights] = Licenses::CC_BY
230
+ options[:rights] = Licenses::CC_BY
226
231
  end
227
232
 
228
233
  if url.query_params[:safe]=='active'
229
- opts[:filtered] = true
234
+ options[:filtered] = true
230
235
  end
231
236
 
232
237
  if url.query_params['as_rq']
233
- opts[:similar_to] = url.query_params['as_rq']
238
+ options[:similar_to] = url.query_params['as_rq']
234
239
  elsif url.query_params['as_lq']
235
- opts[:links_to] = url.query_params['as_lq']
240
+ options[:links_to] = url.query_params['as_lq']
236
241
  end
237
242
 
238
- return self.new(opts,&block)
243
+ return self.new(options,&block)
239
244
  end
240
245
 
241
246
  #
242
247
  # Returns the URL that represents the query.
243
248
  #
244
249
  def search_url
245
- url = URI.parse(SEARCH_URL)
250
+ url = URI(SEARCH_URL)
246
251
 
247
252
  if @results_per_page
248
253
  url.query_params['num'] = @results_per_page
@@ -344,37 +349,41 @@ module GScraper
344
349
 
345
350
  #
346
351
  # Returns a Page object containing Result objects at the specified
347
- # _page_index_. If _opts_ are given, they will be used in accessing
348
- # the SEARCH_URL. If a _block_ is given, it will be passed the newly
352
+ # _page_index_. If a _block_ is given, it will be passed the newly
349
353
  # created Page.
350
354
  #
351
- def page(page_index,opts={},&block)
352
- doc = Hpricot(GScraper.open(page_url(page_index),opts))
355
+ def page(page_index,&block)
356
+ doc = get_page(page_url(page_index))
353
357
 
354
358
  new_page = Page.new
355
359
  results = doc.search('//div.g')[0...@results_per_page.to_i]
356
360
 
357
361
  results.each_with_index do |result,index|
358
362
  rank = page_result_offset(page_index) + (index + 1)
359
- title = result.at('//h2.r').inner_text
360
- url = result.at('//h2.r/a').get_attribute('href')
361
-
362
- summary = result.at('//td.j//font').children[0...-3].inject('') do |accum,elem|
363
- accum + elem.inner_text
364
- end
365
-
363
+ link = result.at('//a.l')
364
+ title = link.inner_text
365
+ url = link.get_attribute('href')
366
+ summary_text = ''
366
367
  cached_url = nil
367
368
  similar_url = nil
368
369
 
369
- if (cached_link = result.at('//td.j//font/nobr/a:first'))
370
- cached_url = cached_link.get_attribute('href')
371
- end
370
+ if (content = (result.at('//td.j//font|//td.j/div.sml')))
371
+ content.children.each do |elem|
372
+ break if (!(elem.text?) && elem.name=='br')
372
373
 
373
- if (similar_link = result.at('//td.j//font/nobr/a:last'))
374
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
374
+ summary_text << elem.inner_text
375
+ end
376
+
377
+ if (cached_link = result.at('nobr/a:first'))
378
+ cached_url = cached_link.get_attribute('href')
379
+ end
380
+
381
+ if (similar_link = result.at('nobr/a:last'))
382
+ similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
383
+ end
375
384
  end
376
385
 
377
- new_page << Result.new(rank,title,url,summary,cached_url,similar_url)
386
+ new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
378
387
  end
379
388
 
380
389
  block.call(new_page) if block
@@ -382,56 +391,85 @@ module GScraper
382
391
  end
383
392
 
384
393
  #
385
- # Returns the Results on the first page. If _opts_ are given, they
386
- # will be used in accessing the SEARCH_URL. If a _block_ is given
387
- # it will be passed the newly created Page.
394
+ # Returns the Results on the first page. If a _block_ is given, it
395
+ # will be passed the newly created Page.
388
396
  #
389
- def first_page(opts={},&block)
390
- page(1,opts,&block)
397
+ def first_page(&block)
398
+ page(1,&block)
391
399
  end
392
400
 
393
401
  #
394
- # Returns the Result at the specified _index_. If _opts_ are given,
395
- # they will be used in accessing the Page containing the requested
396
- # Result.
402
+ # Returns the Result at the specified _index_.
397
403
  #
398
- def result_at(index,opts={})
399
- page(result_page_index(index),opts)[page_result_index(index)]
404
+ def result_at(index)
405
+ page(result_page_index(index))[page_result_index(index)]
400
406
  end
401
407
 
402
408
  #
403
- # Returns the first Result at the specified _index_. If _opts_ are
404
- # given, they will be used in accessing the Page containing the
405
- # requested Result.
409
+ # Returns the first Result on the first_page.
406
410
  #
407
- def first_result(opts={})
408
- result_at(1,opts)
411
+ def top_result
412
+ result_at(1)
409
413
  end
410
414
 
411
415
  #
412
416
  # Iterates over the results at the specified _page_index_, passing
413
- # each to the given _block_. If _opts_ are given they will be used
414
- # in accessing the SEARCH_URL.
417
+ # each to the given _block_.
415
418
  #
416
419
  # query.each_on_page(2) do |result|
417
420
  # puts result.title
418
421
  # end
419
422
  #
420
- def each_on_page(page_index,opts={},&block)
421
- page(page_index,opts).each(&block)
423
+ def each_on_page(page_index,&block)
424
+ page(page_index).each(&block)
422
425
  end
423
426
 
424
427
  #
425
- # Iterates over the results on the first page, passing
426
- # each to the given _block_. If _opts_ are given, they will be used
427
- # in accessing the SEARCH_URL.
428
+ # Iterates over the results on the first page, passing each to the
429
+ # given _block_.
428
430
  #
429
431
  # query.each_on_first_page do |result|
430
432
  # puts result.url
431
433
  # end
432
434
  #
433
- def each_on_first_page(opts={},&block)
434
- each_on_page(1,opts,&block)
435
+ def each_on_first_page(&block)
436
+ each_on_page(1,&block)
437
+ end
438
+
439
+ #
440
+ # Returns a SponsoredLinks object containing SponsoredAd objects of
441
+ # the query. If a _block_ is given, it will be passed the newly
442
+ # created SponsoredLinks object.
443
+ #
444
+ def sponsored_links(&block)
445
+ doc = get_page(search_url)
446
+ new_links = SponsoredLinks.new
447
+
448
+ # top and side ads
449
+ doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
450
+ title = link.inner_text
451
+ url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
452
+
453
+ new_links << SponsoredAd.new(title,url)
454
+ end
455
+
456
+ block.call(new_links) if block
457
+ return new_links
458
+ end
459
+
460
+ #
461
+ # Returns the first sponsored link on the first page of results.
462
+ #
463
+ def top_sponsored_link
464
+ top_sponsored_links.first
465
+ end
466
+
467
+ #
468
+ # Iterates over the sponsored links on the first page of
469
+ # results, passing each to the specified _block_.
470
+ #
471
+ def each_sponsored_link(&block)
472
+ sponsored_links.each(&block)
435
473
  end
436
474
 
437
475
  protected