gscraper 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -165,13 +165,12 @@ module GScraper
165
165
 
166
166
  #
167
167
  # Returns an Array containing the cached pages of the results within
168
- # the Page. If _opts_ are given, they will be used in accessing the
169
- # cached page.
168
+ # the Page.
170
169
  #
171
170
  # page.cached_pages # => [...]
172
171
  #
173
- def cached_pages(opts={})
174
- map { |result| result.cached_page(opts) }
172
+ def cached_pages
173
+ map { |result| result.cached_page }
175
174
  end
176
175
 
177
176
  #
@@ -246,13 +245,12 @@ module GScraper
246
245
 
247
246
  #
248
247
  # Iterates over each result's cached pages within the Page, passing
249
- # each to the given _block_. If _opts_ are given, they will be used
250
- # in accessing the cached pages.
248
+ # each to the given _block_.
251
249
  #
252
250
  # each_cached_page { |page| puts page.readlines }
253
251
  #
254
- def each_cached_page(opts={},&block)
255
- cached_pages(opts).each(&block)
252
+ def each_cached_page(&block)
253
+ cached_pages.each(&block)
256
254
  end
257
255
 
258
256
  #
@@ -328,13 +326,13 @@ module GScraper
328
326
 
329
327
  #
330
328
  # Returns the cached pages of the results that match the specified
331
- # _block_. If _opts_ are given, they will be used in accessing
329
+ # _block_. If _options_ are given, they will be used in accessing
332
330
  # the cached pages.
333
331
  #
334
332
  # page.cached_pages_of { |result| result.title =~ /dude/ }
335
333
  #
336
- def cached_pages_of(opts={},&block)
337
- results_with(&block).cached_pages(opts)
334
+ def cached_pages_of(options={},&block)
335
+ results_with(&block).cached_pages(options)
338
336
  end
339
337
 
340
338
  #
@@ -1,8 +1,10 @@
1
1
  require 'gscraper/search/result'
2
2
  require 'gscraper/search/page'
3
+ require 'gscraper/sponsored_ad'
4
+ require 'gscraper/sponsored_links'
3
5
  require 'gscraper/extensions/uri'
4
6
  require 'gscraper/licenses'
5
- require 'gscraper/gscraper'
7
+ require 'gscraper/web_agent'
6
8
 
7
9
  require 'hpricot'
8
10
 
@@ -10,9 +12,15 @@ module GScraper
10
12
  module Search
11
13
  class Query
12
14
 
15
+ include WebAgent
16
+
17
+ # Search host
13
18
  SEARCH_HOST = 'www.google.com'
19
+
20
+ # Search URL
14
21
  SEARCH_URL = "http://#{SEARCH_HOST}/search"
15
22
 
23
+ # Default results per-page
16
24
  RESULTS_PER_PAGE = 10
17
25
 
18
26
  # Results per-page
@@ -88,41 +96,39 @@ module GScraper
88
96
  # q.within_past_week = true
89
97
  # end
90
98
  #
91
- def initialize(opts={},&block)
92
- super()
99
+ def initialize(options={},&block)
100
+ @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
93
101
 
94
- @results_per_page = (opts[:results_per_page] || RESULTS_PER_PAGE)
102
+ @query = options[:query]
103
+ @exact_phrase = options[:exact_phrase]
104
+ @with_words = options[:with_words]
105
+ @without_words = options[:without_words]
95
106
 
96
- @query = opts[:query]
97
- @exact_phrase = opts[:exact_phrase]
98
- @with_words = opts[:with_words]
99
- @without_words = opts[:without_words]
107
+ @language = options[:language]
108
+ @region = options[:region]
109
+ @in_format = options[:in_format]
110
+ @not_in_format = options[:not_in_format]
100
111
 
101
- @language = opts[:language]
102
- @region = opts[:region]
103
- @in_format = opts[:in_format]
104
- @not_in_format = opts[:not_in_format]
105
-
106
- if opts[:within_past_day]
107
- @within_past_day = opts[:within_past_day]
112
+ if options[:within_past_day]
113
+ @within_past_day = options[:within_past_day]
108
114
  @within_past_week = false
109
115
  @within_past_months = false
110
116
  @within_past_year = false
111
- elsif opts[:within_past_week]
117
+ elsif options[:within_past_week]
112
118
  @within_past_day = false
113
- @within_past_week = opts[:within_past_week]
119
+ @within_past_week = options[:within_past_week]
114
120
  @within_past_months = false
115
121
  @within_past_year = false
116
- elsif opts[:within_past_months]
122
+ elsif options[:within_past_months]
117
123
  @within_past_day = false
118
124
  @within_past_week = false
119
- @within_past_months = opts[:within_past_months]
125
+ @within_past_months = options[:within_past_months]
120
126
  @within_past_year = false
121
- elsif opts[:within_past_year]
127
+ elsif options[:within_past_year]
122
128
  @within_past_day = false
123
129
  @within_past_week = false
124
130
  @within_past_months = false
125
- @within_past_year = opts[:within_past_year]
131
+ @within_past_year = options[:within_past_year]
126
132
  else
127
133
  @within_past_day = false
128
134
  @within_past_week = false
@@ -130,15 +136,15 @@ module GScraper
130
136
  @within_past_year = false
131
137
  end
132
138
 
133
- @numeric_range = opts[:numeric_range]
134
- @occurrs_within = opts[:occurrs_within]
135
- @inside_domain = opts[:inside_domain]
136
- @outside_domain = opts[:outside_domain]
137
- @rights = opts[:rights]
138
- @filtered = opts[:filtered]
139
+ @numeric_range = options[:numeric_range]
140
+ @occurrs_within = options[:occurrs_within]
141
+ @inside_domain = options[:inside_domain]
142
+ @outside_domain = options[:outside_domain]
143
+ @rights = options[:rights]
144
+ @filtered = options[:filtered]
139
145
 
140
- @similar_to = opts[:similar_to]
141
- @links_to = opts[:links_to]
146
+ @similar_to = options[:similar_to]
147
+ @links_to = options[:links_to]
142
148
 
143
149
  block.call(self) if block
144
150
  end
@@ -154,95 +160,94 @@ module GScraper
154
160
  # q.occurrs_within = :title
155
161
  # end
156
162
  #
157
- def self.from_url(url,&block)
163
+ def self.from_url(url,options={},&block)
158
164
  url = URI.parse(url)
159
- opts = {}
160
165
 
161
- opts[:results_per_page] = url.query_params['num']
166
+ options[:results_per_page] = url.query_params['num']
162
167
 
163
- opts[:query] = url.query_params['as_q']
164
- opts[:exact_phrase] = url.query_params['as_epq']
165
- opts[:with_words] = url.query_params['as_oq']
166
- opts[:without_words] = url.query_params['as_eq']
168
+ options[:query] = url.query_params['as_q']
169
+ options[:exact_phrase] = url.query_params['as_epq']
170
+ options[:with_words] = url.query_params['as_oq']
171
+ options[:without_words] = url.query_params['as_eq']
167
172
 
168
- opts[:language] = url.query_params['lr']
169
- opts[:region] = url.query_params['cr']
173
+ options[:language] = url.query_params['lr']
174
+ options[:region] = url.query_params['cr']
170
175
 
171
176
  case url.query_params['as_ft']
172
177
  when 'i'
173
- opts[:in_format] = url.query_params['as_filetype']
178
+ options[:in_format] = url.query_params['as_filetype']
174
179
  when 'e'
175
- opts[:not_in_format] = url.query_params['as_filetype']
180
+ options[:not_in_format] = url.query_params['as_filetype']
176
181
  end
177
182
 
178
183
  case url.query_params['as_qdr']
179
184
  when 'd'
180
- opts[:within_past_day] = true
185
+ options[:within_past_day] = true
181
186
  when 'w'
182
- opts[:within_past_week] = true
187
+ options[:within_past_week] = true
183
188
  when 'm'
184
- opts[:within_past_months] = 1
189
+ options[:within_past_months] = 1
185
190
  when 'm2'
186
- opts[:within_past_months] = 2
191
+ options[:within_past_months] = 2
187
192
  when 'm3'
188
- opts[:within_past_months] = 3
193
+ options[:within_past_months] = 3
189
194
  when 'm6'
190
- opts[:within_past_months] = 6
195
+ options[:within_past_months] = 6
191
196
  when 'y'
192
- opts[:within_past_year] = true
197
+ options[:within_past_year] = true
193
198
  end
194
199
 
195
200
  if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
196
- opts[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
201
+ options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,url.query_params['as_nhi'].to_i)
197
202
  end
198
203
 
199
204
  case url.query_params['as_occt']
200
205
  when 'title'
201
- opts[:occurrs_within] = :title
206
+ options[:occurrs_within] = :title
202
207
  when 'body'
203
- opts[:occurrs_within] = :body
208
+ options[:occurrs_within] = :body
204
209
  when 'url'
205
- opts[:occurrs_within] = :url
210
+ options[:occurrs_within] = :url
206
211
  when 'links'
207
- opts[:occurrs_within] = :links
212
+ options[:occurrs_within] = :links
208
213
  end
209
214
 
210
215
  case url.query_params['as_dt']
211
216
  when 'i'
212
- opts[:inside_domain] = url.query_params['as_sitesearch']
217
+ options[:inside_domain] = url.query_params['as_sitesearch']
213
218
  when 'e'
214
- opts[:outside_domain] = url.query_params['as_sitesearch']
219
+ options[:outside_domain] = url.query_params['as_sitesearch']
215
220
  end
216
221
 
217
222
  case url.query_params['as_rights']
218
223
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
219
- opts[:rights] = Licenses::CC_BY_NC_ND
224
+ options[:rights] = Licenses::CC_BY_NC_ND
220
225
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
221
- opts[:rights] = Licenses::CC_BY_SA
226
+ options[:rights] = Licenses::CC_BY_SA
222
227
  when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
223
- opts[:rights] = Licenses::CC_BY_NC
228
+ options[:rights] = Licenses::CC_BY_NC
224
229
  when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
225
- opts[:rights] = Licenses::CC_BY
230
+ options[:rights] = Licenses::CC_BY
226
231
  end
227
232
 
228
233
  if url.query_params[:safe]=='active'
229
- opts[:filtered] = true
234
+ options[:filtered] = true
230
235
  end
231
236
 
232
237
  if url.query_params['as_rq']
233
- opts[:similar_to] = url.query_params['as_rq']
238
+ options[:similar_to] = url.query_params['as_rq']
234
239
  elsif url.query_params['as_lq']
235
- opts[:links_to] = url.query_params['as_lq']
240
+ options[:links_to] = url.query_params['as_lq']
236
241
  end
237
242
 
238
- return self.new(opts,&block)
243
+ return self.new(options,&block)
239
244
  end
240
245
 
241
246
  #
242
247
  # Returns the URL that represents the query.
243
248
  #
244
249
  def search_url
245
- url = URI.parse(SEARCH_URL)
250
+ url = URI(SEARCH_URL)
246
251
 
247
252
  if @results_per_page
248
253
  url.query_params['num'] = @results_per_page
@@ -344,37 +349,41 @@ module GScraper
344
349
 
345
350
  #
346
351
  # Returns a Page object containing Result objects at the specified
347
- # _page_index_. If _opts_ are given, they will be used in accessing
348
- # the SEARCH_URL. If a _block_ is given, it will be passed the newly
352
+ # _page_index_. If a _block_ is given, it will be passed the newly
349
353
  # created Page.
350
354
  #
351
- def page(page_index,opts={},&block)
352
- doc = Hpricot(GScraper.open(page_url(page_index),opts))
355
+ def page(page_index,&block)
356
+ doc = get_page(page_url(page_index))
353
357
 
354
358
  new_page = Page.new
355
359
  results = doc.search('//div.g')[0...@results_per_page.to_i]
356
360
 
357
361
  results.each_with_index do |result,index|
358
362
  rank = page_result_offset(page_index) + (index + 1)
359
- title = result.at('//h2.r').inner_text
360
- url = result.at('//h2.r/a').get_attribute('href')
361
-
362
- summary = result.at('//td.j//font').children[0...-3].inject('') do |accum,elem|
363
- accum + elem.inner_text
364
- end
365
-
363
+ link = result.at('//a.l')
364
+ title = link.inner_text
365
+ url = link.get_attribute('href')
366
+ summary_text = ''
366
367
  cached_url = nil
367
368
  similar_url = nil
368
369
 
369
- if (cached_link = result.at('//td.j//font/nobr/a:first'))
370
- cached_url = cached_link.get_attribute('href')
371
- end
370
+ if (content = (result.at('//td.j//font|//td.j/div.sml')))
371
+ content.children.each do |elem|
372
+ break if (!(elem.text?) && elem.name=='br')
372
373
 
373
- if (similar_link = result.at('//td.j//font/nobr/a:last'))
374
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
374
+ summary_text << elem.inner_text
375
+ end
376
+
377
+ if (cached_link = result.at('nobr/a:first'))
378
+ cached_url = cached_link.get_attribute('href')
379
+ end
380
+
381
+ if (similar_link = result.at('nobr/a:last'))
382
+ similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
383
+ end
375
384
  end
376
385
 
377
- new_page << Result.new(rank,title,url,summary,cached_url,similar_url)
386
+ new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
378
387
  end
379
388
 
380
389
  block.call(new_page) if block
@@ -382,56 +391,85 @@ module GScraper
382
391
  end
383
392
 
384
393
  #
385
- # Returns the Results on the first page. If _opts_ are given, they
386
- # will be used in accessing the SEARCH_URL. If a _block_ is given
387
- # it will be passed the newly created Page.
394
+ # Returns the Results on the first page. If a _block_ is given it
395
+ # will be passed the newly created Page.
388
396
  #
389
- def first_page(opts={},&block)
390
- page(1,opts,&block)
397
+ def first_page(&block)
398
+ page(1,&block)
391
399
  end
392
400
 
393
401
  #
394
- # Returns the Result at the specified _index_. If _opts_ are given,
395
- # they will be used in accessing the Page containing the requested
396
- # Result.
402
+ # Returns the Result at the specified _index_.
397
403
  #
398
- def result_at(index,opts={})
399
- page(result_page_index(index),opts)[page_result_index(index)]
404
+ def result_at(index)
405
+ page(result_page_index(index))[page_result_index(index)]
400
406
  end
401
407
 
402
408
  #
403
- # Returns the first Result at the specified _index_. If _opts_ are
404
- # given, they will be used in accessing the Page containing the
405
- # requested Result.
409
+ # Returns the first Result on the first_page.
406
410
  #
407
- def first_result(opts={})
408
- result_at(1,opts)
411
+ def top_result
412
+ result_at(1)
409
413
  end
410
414
 
411
415
  #
412
416
  # Iterates over the results at the specified _page_index_, passing
413
- # each to the given _block_. If _opts_ are given they will be used
414
- # in accessing the SEARCH_URL.
417
+ # each to the given _block_.
415
418
  #
416
419
  # query.each_on_page(2) do |result|
417
420
  # puts result.title
418
421
  # end
419
422
  #
420
- def each_on_page(page_index,opts={},&block)
421
- page(page_index,opts).each(&block)
423
+ def each_on_page(page_index,&block)
424
+ page(page_index).each(&block)
422
425
  end
423
426
 
424
427
  #
425
- # Iterates over the results on the first page, passing
426
- # each to the given _block_. If _opts_ are given, they will be used
427
- # in accessing the SEARCH_URL.
428
+ # Iterates over the results on the first page, passing each to the
429
+ # given _block_.
428
430
  #
429
431
  # query.each_on_first_page do |result|
430
432
  # puts result.url
431
433
  # end
432
434
  #
433
- def each_on_first_page(opts={},&block)
434
- each_on_page(1,opts,&block)
435
+ def each_on_first_page(&block)
436
+ each_on_page(1,&block)
437
+ end
438
+
439
+ #
440
+ # Returns a SponsoredLinks object containing SponsoredAd objects of
441
+ # the query. If a _block_ is given, it will be passed the newly
442
+ # created SponsoredLinks object.
443
+ #
444
+ def sponsored_links(&block)
445
+ doc = get_page(search_url)
446
+ new_links = SponsoredLinks.new
447
+
448
+ # top and side ads
449
+ doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
450
+ title = link.inner_text
451
+ url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
452
+
453
+ new_links << SponsoredAd.new(title,url)
454
+ end
455
+
456
+ block.call(new_links) if block
457
+ return new_links
458
+ end
459
+
460
+ #
461
+ # Returns the first sponsored link on the first page of results.
462
+ #
463
+ def top_sponsored_link
464
+ top_sponsored_links.first
465
+ end
466
+
467
+ #
468
+ # Iterates over the sponsored links on the first page of
469
+ # results passing each to the specified _block_.
470
+ #
471
+ def each_sponsored_link(&block)
472
+ sponsored_links.each(&block)
435
473
  end
436
474
 
437
475
  protected