gscraper 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -64,11 +64,11 @@ module GScraper
64
64
  def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
65
65
  @agent = GScraper.web_agent
66
66
 
67
- @rank = rank
68
- @title = title
69
- @url = url
70
- @summary = summary
71
- @cached_url = cached_url
67
+ @rank = rank
68
+ @title = title
69
+ @url = url
70
+ @summary = summary
71
+ @cached_url = cached_url
72
72
  @similar_url = similar_url
73
73
  end
74
74
 
@@ -89,9 +89,7 @@ module GScraper
89
89
  # The Cached Page for the result.
90
90
  #
91
91
  def cached_page
92
- if @cached_url
93
- return @agent.get(@cached_url)
94
- end
92
+ @agent.get(@cached_url) if @cached_url
95
93
  end
96
94
 
97
95
  #
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -73,7 +73,7 @@ module GScraper
73
73
  # @example
74
74
  # Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
75
75
  # q.within_last_month = true
76
- # q.occurrs_within = :title
76
+ # q.occurs_within = :title
77
77
  # end
78
78
  #
79
79
  # @see WebQuery.from_url.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,37 +18,41 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
+ require 'gscraper/search/exceptions/blocked'
21
22
  require 'gscraper/search/result'
22
23
  require 'gscraper/search/page'
23
24
  require 'gscraper/search/query'
24
25
  require 'gscraper/sponsored_ad'
25
26
  require 'gscraper/sponsored_links'
26
- require 'gscraper/extensions/uri'
27
27
  require 'gscraper/has_pages'
28
28
  require 'gscraper/licenses'
29
29
  require 'gscraper/gscraper'
30
30
 
31
+ require 'uri/query_params'
32
+
31
33
  module GScraper
32
34
  module Search
33
35
  class WebQuery < Query
34
36
 
35
37
  include HasPages
36
38
 
37
- # Search host
38
- SEARCH_HOST = 'www.google.com'
39
-
40
- # Search URL
41
- SEARCH_URL = "http://#{SEARCH_HOST}/search"
39
+ # Web Search path
40
+ PATH = '/search'
42
41
 
43
42
  # Default results per-page
44
43
  RESULTS_PER_PAGE = 10
45
44
 
45
+ # Web Search licenses
46
+ LICENSES = {
47
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
48
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
49
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
50
+ '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
51
+ }
52
+
46
53
  # Results per-page
47
54
  attr_accessor :results_per_page
48
55
 
49
- # Search for results written in the language
50
- attr_accessor :language
51
-
52
56
  # Search for results from the region
53
57
  attr_accessor :region
54
58
 
@@ -70,8 +74,8 @@ module GScraper
70
74
  # Search for results within the past year
71
75
  attr_accessor :within_past_year
72
76
 
73
- # Search for results where the query ocurrs within the area
74
- attr_accessor :occurrs_within
77
+ # Search for results where the query occurs within the area
78
+ attr_accessor :occurs_within
75
79
 
76
80
  # Search for results inside the domain
77
81
  attr_accessor :inside_domain
@@ -91,10 +95,13 @@ module GScraper
91
95
  # @param [Hash] options
92
96
  # Additional options.
93
97
  #
98
+ # @option options [String] :search_host (www.google.com)
99
+ # The host to submit queries to.
100
+ #
94
101
  # @option options [Integer] :results_per_page
95
102
  # Specifies the number of results for each page.
96
103
  #
97
- # @option options [String] :language
104
+ # @option options [String, Symbol] :language (Languages.native)
98
105
  # Search for results in the specified language.
99
106
  #
100
107
  # @option options [String] :region
@@ -112,7 +119,7 @@ module GScraper
112
119
  # @option options [Boolean] :within_past_year
113
120
  # Search for results that were created within the past year.
114
121
  #
115
- # @option options [:title, :body, :url] :occurrs_within
122
+ # @option options [:title, :body, :url] :occurs_within
116
123
  # Searches for results where the keywords occur within a specific
117
124
  # part of the result page.
118
125
  #
@@ -142,41 +149,40 @@ module GScraper
142
149
  def initialize(options={},&block)
143
150
  @agent = GScraper.web_agent(options)
144
151
 
145
- @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
152
+ @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
146
153
 
147
- @language = options[:language]
148
154
  @region = options[:region]
149
155
 
150
156
  if options[:within_past_day]
151
- @within_past_day = options[:within_past_day]
152
- @within_past_week = false
157
+ @within_past_day = options[:within_past_day]
158
+ @within_past_week = false
153
159
  @within_past_months = false
154
- @within_past_year = false
160
+ @within_past_year = false
155
161
  elsif options[:within_past_week]
156
- @within_past_day = false
157
- @within_past_week = options[:within_past_week]
162
+ @within_past_day = false
163
+ @within_past_week = options[:within_past_week]
158
164
  @within_past_months = false
159
- @within_past_year = false
165
+ @within_past_year = false
160
166
  elsif options[:within_past_months]
161
- @within_past_day = false
162
- @within_past_week = false
167
+ @within_past_day = false
168
+ @within_past_week = false
163
169
  @within_past_months = options[:within_past_months]
164
- @within_past_year = false
170
+ @within_past_year = false
165
171
  elsif options[:within_past_year]
166
- @within_past_day = false
167
- @within_past_week = false
172
+ @within_past_day = false
173
+ @within_past_week = false
168
174
  @within_past_months = false
169
- @within_past_year = options[:within_past_year]
175
+ @within_past_year = options[:within_past_year]
170
176
  else
171
- @within_past_day = false
172
- @within_past_week = false
177
+ @within_past_day = false
178
+ @within_past_week = false
173
179
  @within_past_months = false
174
- @within_past_year = false
180
+ @within_past_year = false
175
181
  end
176
182
 
177
- @occurrs_within = options[:occurrs_within]
178
- @rights = options[:rights]
179
- @filtered = options[:filtered]
183
+ @occurs_within = options[:occurs_within]
184
+ @rights = options[:rights]
185
+ @filtered = options[:filtered]
180
186
 
181
187
  super(options,&block)
182
188
  end
@@ -211,25 +217,27 @@ module GScraper
211
217
  # @example
212
218
  # WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
213
219
  # q.within_last_month = true
214
- # q.occurrs_within = :title
220
+ # q.occurs_within = :title
215
221
  # end
216
222
  #
217
- def self.from_url(url,options={},&block)
223
+ def WebQuery.from_url(url,options={},&block)
218
224
  url = URI(url.to_s)
219
225
 
220
- if url.query_params['num']
221
- options[:results_per_page] = url.query_params['num'].to_i
222
- else
223
- options[:results_per_page] = RESULTS_PER_PAGE
224
- end
226
+ options[:search_host] = url.host
227
+
228
+ options[:results_per_page] = if url.query_params['num']
229
+ url.query_params['num'].to_i
230
+ else
231
+ RESULTS_PER_PAGE
232
+ end
225
233
 
226
- options[:query] = url.query_params['q']
227
- options[:exact_phrase] = url.query_params['as_epq']
228
- options[:with_words] = url.query_params['as_oq']
234
+ options[:query] = url.query_params['q']
235
+ options[:exact_phrase] = url.query_params['as_epq']
236
+ options[:with_words] = url.query_params['as_oq']
229
237
  options[:without_words] = url.query_params['as_eq']
230
238
 
231
239
  options[:language] = url.query_params['lr']
232
- options[:region] = url.query_params['cr']
240
+ options[:region] = url.query_params['cr']
233
241
 
234
242
  if url.query_params['as_filetype']
235
243
  options[:filetype] = url.query_params['as_filetype']
@@ -259,33 +267,14 @@ module GScraper
259
267
  )
260
268
  end
261
269
 
262
- case url.query_params['as_occt']
263
- when 'title'
264
- options[:occurrs_within] = :title
265
- when 'body'
266
- options[:occurrs_within] = :body
267
- when 'url'
268
- options[:occurrs_within] = :url
269
- when 'links'
270
- options[:occurrs_within] = :links
270
+ if url.query_params['as_occt']
271
+ options[:occurs_within] = url.query_params['as_occt'].to_sym
271
272
  end
272
273
 
273
274
  options[:site] = url.query_params['as_sitesearch']
274
275
 
275
- case url.query_params['as_rights']
276
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
277
- options[:rights] = Licenses::CC_BY_NC_ND
278
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
279
- options[:rights] = Licenses::CC_BY_SA
280
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
281
- options[:rights] = Licenses::CC_BY_NC
282
- when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
283
- options[:rights] = Licenses::CC_BY
284
- end
285
-
286
- if url.query_params[:safe] == 'active'
287
- options[:filtered] = true
288
- end
276
+ options[:rights] = LICENSES[url.query_params['as_rights']]
277
+ options[:filtered] = (url.query_params[:safe] == 'active')
289
278
 
290
279
  if url.query_params['as_rq']
291
280
  options[:related] = url.query_params['as_rq']
@@ -293,7 +282,7 @@ module GScraper
293
282
  options[:link] = url.query_params['as_lq']
294
283
  end
295
284
 
296
- return self.new(options,&block)
285
+ return WebQuery.new(options,&block)
297
286
  end
298
287
 
299
288
  #
@@ -303,8 +292,7 @@ module GScraper
303
292
  # The URL for the query.
304
293
  #
305
294
  def search_url
306
- url = URI(SEARCH_URL)
307
- query_expr = []
295
+ url = URI::HTTP.build(:host => search_host, :path => PATH)
308
296
 
309
297
  set_param = lambda { |param,value|
310
298
  url.query_params[param.to_s] = value if value
@@ -345,7 +333,7 @@ module GScraper
345
333
  url.query_params['as_nhi'] = @numeric_range.end
346
334
  end
347
335
 
348
- case @occurrs_within
336
+ case @occurs_within
349
337
  when :title, 'title'
350
338
  url.query_params['as_occt'] = 'title'
351
339
  when :body, 'body'
@@ -358,18 +346,13 @@ module GScraper
358
346
 
359
347
  set_param.call('as_sitesearch',@site)
360
348
 
361
- case @rights
362
- when Licenses::CC_BY_NC_ND
363
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
364
- when Licenses::CC_BY_SA
365
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
366
- when Licenses::CC_BY_ND
367
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
368
- when Licenses::CC_BY
369
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
349
+ if @rights
350
+ url.query_params['as_rights'] = LICENSES.reverse[@rights]
370
351
  end
371
352
 
372
- url.query_params['safe'] = 'active' if @filtered
353
+ if @filtered
354
+ url.query_params['safe'] = 'active'
355
+ end
373
356
 
374
357
  return url
375
358
  end
@@ -387,7 +370,7 @@ module GScraper
387
370
  url = search_url
388
371
 
389
372
  url.query_params['start'] = result_offset_of(page_index)
390
- url.query_params['sa'] = 'N'
373
+ url.query_params['sa'] = 'N'
391
374
 
392
375
  return url
393
376
  end
@@ -404,23 +387,27 @@ module GScraper
404
387
  def page(page_index)
405
388
  Page.new do |new_page|
406
389
  doc = @agent.get(page_url(page_index))
407
- results = doc.search('li.g','li/div.g')
390
+
391
+ if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
392
+ raise(Blocked,"Google has temporarily blocked our IP Address",caller)
393
+ end
394
+
395
+ results = doc.search('//li[@class="g"]')
408
396
  results_length = [@results_per_page, results.length].min
409
397
 
410
398
  rank_offset = result_offset_of(page_index)
411
399
 
412
- (0...results_length).each do |index|
413
- result = results[index]
414
-
415
- rank = rank_offset + (index + 1)
416
- link = result.at('h3.r/a')
417
- title = link.inner_text
418
- url = URI(link.get_attribute('href'))
400
+ results_length.times do |index|
401
+ result = results[index]
402
+ rank = rank_offset + (index + 1)
403
+ link = result.at('.//h3/a')
404
+ title = link.inner_text
405
+ link_url = URI(link.get_attribute('href')).query_params['q']
406
+ url = URI(link_url)
407
+
419
408
  summary_text = ''
420
- cached_url = nil
421
- similar_url = nil
422
409
 
423
- if (content = (result.at('div.s','td.j//font')))
410
+ if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
424
411
  content.children.each do |elem|
425
412
  break if (!(elem.text?) && elem.name=='br')
426
413
 
@@ -429,12 +416,17 @@ module GScraper
429
416
 
430
417
  end
431
418
 
432
- if (cached_link = result.at('span.gl/a:first'))
433
- cached_url = URI(cached_link.get_attribute('href'))
434
- end
419
+ cached_url = nil
420
+ similar_url = nil
435
421
 
436
- if (similar_link = result.at('span.gl/a:last'))
437
- similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
422
+ if (gl = result.at('.//div[@class="s"]'))
423
+ if (cached_link = gl.at('.//a[1]'))
424
+ cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
425
+ end
426
+
427
+ if (similar_link = gl.at('.//a[2]'))
428
+ similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
429
+ end
438
430
  end
439
431
 
440
432
  new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -473,9 +465,9 @@ module GScraper
473
465
  doc = @agent.get(search_url)
474
466
 
475
467
  # top and side ads
476
- doc.search('#pa1', 'a[@id^="an"]').each do |link|
468
+ doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
477
469
  title = link.inner_text
478
- url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
470
+ url = URI("http://#{search_host}" + link.get_attribute('href'))
479
471
 
480
472
  links << SponsoredAd.new(title,url)
481
473
  end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri'
21
+ require 'uri/query_params'
22
22
 
23
23
  module GScraper
24
24
  class SponsoredAd
@@ -40,7 +40,7 @@ module GScraper
40
40
  #
41
41
  def initialize(title,url)
42
42
  @title = title
43
- @url = url
43
+ @url = url
44
44
  end
45
45
 
46
46
  #
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -20,8 +20,6 @@
20
20
 
21
21
  require 'gscraper/sponsored_ad'
22
22
 
23
- require 'enumerator'
24
-
25
23
  module GScraper
26
24
  class SponsoredLinks < Array
27
25