gscraper 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -64,11 +64,11 @@ module GScraper
64
64
  def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
65
65
  @agent = GScraper.web_agent
66
66
 
67
- @rank = rank
68
- @title = title
69
- @url = url
70
- @summary = summary
71
- @cached_url = cached_url
67
+ @rank = rank
68
+ @title = title
69
+ @url = url
70
+ @summary = summary
71
+ @cached_url = cached_url
72
72
  @similar_url = similar_url
73
73
  end
74
74
 
@@ -89,9 +89,7 @@ module GScraper
89
89
  # The Cached Page for the result.
90
90
  #
91
91
  def cached_page
92
- if @cached_url
93
- return @agent.get(@cached_url)
94
- end
92
+ @agent.get(@cached_url) if @cached_url
95
93
  end
96
94
 
97
95
  #
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -73,7 +73,7 @@ module GScraper
73
73
  # @example
74
74
  # Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
75
75
  # q.within_last_month = true
76
- # q.occurrs_within = :title
76
+ # q.occurs_within = :title
77
77
  # end
78
78
  #
79
79
  # @see WebQuery.from_url
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,37 +18,41 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
+ require 'gscraper/search/exceptions/blocked'
21
22
  require 'gscraper/search/result'
22
23
  require 'gscraper/search/page'
23
24
  require 'gscraper/search/query'
24
25
  require 'gscraper/sponsored_ad'
25
26
  require 'gscraper/sponsored_links'
26
- require 'gscraper/extensions/uri'
27
27
  require 'gscraper/has_pages'
28
28
  require 'gscraper/licenses'
29
29
  require 'gscraper/gscraper'
30
30
 
31
+ require 'uri/query_params'
32
+
31
33
  module GScraper
32
34
  module Search
33
35
  class WebQuery < Query
34
36
 
35
37
  include HasPages
36
38
 
37
- # Search host
38
- SEARCH_HOST = 'www.google.com'
39
-
40
- # Search URL
41
- SEARCH_URL = "http://#{SEARCH_HOST}/search"
39
+ # Web Search path
40
+ PATH = '/search'
42
41
 
43
42
  # Default results per-page
44
43
  RESULTS_PER_PAGE = 10
45
44
 
45
+ # Web Search licenses
46
+ LICENSES = {
47
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
48
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
49
+ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
50
+ '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
51
+ }
52
+
46
53
  # Results per-page
47
54
  attr_accessor :results_per_page
48
55
 
49
- # Search for results written in the language
50
- attr_accessor :language
51
-
52
56
  # Search for results from the region
53
57
  attr_accessor :region
54
58
 
@@ -70,8 +74,8 @@ module GScraper
70
74
  # Search for results within the past year
71
75
  attr_accessor :within_past_year
72
76
 
73
- # Search for results where the query ocurrs within the area
74
- attr_accessor :occurrs_within
77
+ # Search for results where the query occurs within the area
78
+ attr_accessor :occurs_within
75
79
 
76
80
  # Search for results inside the domain
77
81
  attr_accessor :inside_domain
@@ -91,10 +95,13 @@ module GScraper
91
95
  # @param [Hash] options
92
96
  # Additional options.
93
97
  #
98
+ # @option options [String] :search_host (www.google.com)
99
+ # The host to submit queries to.
100
+ #
94
101
  # @option options [Integer] :results_per_page
95
102
  # Specifies the number of results for each page.
96
103
  #
97
- # @option options [String] :language
104
+ # @option options [String, Symbol] :language (Languages.native)
98
105
  # Search for results in the specified language.
99
106
  #
100
107
  # @option options [String] :region
@@ -112,7 +119,7 @@ module GScraper
112
119
  # @option options [Boolean] :within_past_year
113
120
  # Search for results that were created within the past year.
114
121
  #
115
- # @option options [:title, :body, :url] :occurrs_within
122
+ # @option options [:title, :body, :url] :occurs_within
116
123
  # Searches for results where the keywords occur within a specific
117
124
  # part of the result page.
118
125
  #
@@ -142,41 +149,40 @@ module GScraper
142
149
  def initialize(options={},&block)
143
150
  @agent = GScraper.web_agent(options)
144
151
 
145
- @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
152
+ @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
146
153
 
147
- @language = options[:language]
148
154
  @region = options[:region]
149
155
 
150
156
  if options[:within_past_day]
151
- @within_past_day = options[:within_past_day]
152
- @within_past_week = false
157
+ @within_past_day = options[:within_past_day]
158
+ @within_past_week = false
153
159
  @within_past_months = false
154
- @within_past_year = false
160
+ @within_past_year = false
155
161
  elsif options[:within_past_week]
156
- @within_past_day = false
157
- @within_past_week = options[:within_past_week]
162
+ @within_past_day = false
163
+ @within_past_week = options[:within_past_week]
158
164
  @within_past_months = false
159
- @within_past_year = false
165
+ @within_past_year = false
160
166
  elsif options[:within_past_months]
161
- @within_past_day = false
162
- @within_past_week = false
167
+ @within_past_day = false
168
+ @within_past_week = false
163
169
  @within_past_months = options[:within_past_months]
164
- @within_past_year = false
170
+ @within_past_year = false
165
171
  elsif options[:within_past_year]
166
- @within_past_day = false
167
- @within_past_week = false
172
+ @within_past_day = false
173
+ @within_past_week = false
168
174
  @within_past_months = false
169
- @within_past_year = options[:within_past_year]
175
+ @within_past_year = options[:within_past_year]
170
176
  else
171
- @within_past_day = false
172
- @within_past_week = false
177
+ @within_past_day = false
178
+ @within_past_week = false
173
179
  @within_past_months = false
174
- @within_past_year = false
180
+ @within_past_year = false
175
181
  end
176
182
 
177
- @occurrs_within = options[:occurrs_within]
178
- @rights = options[:rights]
179
- @filtered = options[:filtered]
183
+ @occurs_within = options[:occurs_within]
184
+ @rights = options[:rights]
185
+ @filtered = options[:filtered]
180
186
 
181
187
  super(options,&block)
182
188
  end
@@ -211,25 +217,27 @@ module GScraper
211
217
  # @example
212
218
  # WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
213
219
  # q.within_last_month = true
214
- # q.occurrs_within = :title
220
+ # q.occurs_within = :title
215
221
  # end
216
222
  #
217
- def self.from_url(url,options={},&block)
223
+ def WebQuery.from_url(url,options={},&block)
218
224
  url = URI(url.to_s)
219
225
 
220
- if url.query_params['num']
221
- options[:results_per_page] = url.query_params['num'].to_i
222
- else
223
- options[:results_per_page] = RESULTS_PER_PAGE
224
- end
226
+ options[:search_host] = url.host
227
+
228
+ options[:results_per_page] = if url.query_params['num']
229
+ url.query_params['num'].to_i
230
+ else
231
+ RESULTS_PER_PAGE
232
+ end
225
233
 
226
- options[:query] = url.query_params['q']
227
- options[:exact_phrase] = url.query_params['as_epq']
228
- options[:with_words] = url.query_params['as_oq']
234
+ options[:query] = url.query_params['q']
235
+ options[:exact_phrase] = url.query_params['as_epq']
236
+ options[:with_words] = url.query_params['as_oq']
229
237
  options[:without_words] = url.query_params['as_eq']
230
238
 
231
239
  options[:language] = url.query_params['lr']
232
- options[:region] = url.query_params['cr']
240
+ options[:region] = url.query_params['cr']
233
241
 
234
242
  if url.query_params['as_filetype']
235
243
  options[:filetype] = url.query_params['as_filetype']
@@ -259,33 +267,14 @@ module GScraper
259
267
  )
260
268
  end
261
269
 
262
- case url.query_params['as_occt']
263
- when 'title'
264
- options[:occurrs_within] = :title
265
- when 'body'
266
- options[:occurrs_within] = :body
267
- when 'url'
268
- options[:occurrs_within] = :url
269
- when 'links'
270
- options[:occurrs_within] = :links
270
+ if url.query_params['as_occt']
271
+ options[:occurs_within] = url.query_params['as_occt'].to_sym
271
272
  end
272
273
 
273
274
  options[:site] = url.query_params['as_sitesearch']
274
275
 
275
- case url.query_params['as_rights']
276
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
277
- options[:rights] = Licenses::CC_BY_NC_ND
278
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
279
- options[:rights] = Licenses::CC_BY_SA
280
- when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
281
- options[:rights] = Licenses::CC_BY_NC
282
- when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
283
- options[:rights] = Licenses::CC_BY
284
- end
285
-
286
- if url.query_params[:safe] == 'active'
287
- options[:filtered] = true
288
- end
276
+ options[:rights] = LICENSES[url.query_params['as_rights']]
277
+ options[:filtered] = (url.query_params[:safe] == 'active')
289
278
 
290
279
  if url.query_params['as_rq']
291
280
  options[:related] = url.query_params['as_rq']
@@ -293,7 +282,7 @@ module GScraper
293
282
  options[:link] = url.query_params['as_lq']
294
283
  end
295
284
 
296
- return self.new(options,&block)
285
+ return WebQuery.new(options,&block)
297
286
  end
298
287
 
299
288
  #
@@ -303,8 +292,7 @@ module GScraper
303
292
  # The URL for the query.
304
293
  #
305
294
  def search_url
306
- url = URI(SEARCH_URL)
307
- query_expr = []
295
+ url = URI::HTTP.build(:host => search_host, :path => PATH)
308
296
 
309
297
  set_param = lambda { |param,value|
310
298
  url.query_params[param.to_s] = value if value
@@ -345,7 +333,7 @@ module GScraper
345
333
  url.query_params['as_nhi'] = @numeric_range.end
346
334
  end
347
335
 
348
- case @occurrs_within
336
+ case @occurs_within
349
337
  when :title, 'title'
350
338
  url.query_params['as_occt'] = 'title'
351
339
  when :body, 'body'
@@ -358,18 +346,13 @@ module GScraper
358
346
 
359
347
  set_param.call('as_sitesearch',@site)
360
348
 
361
- case @rights
362
- when Licenses::CC_BY_NC_ND
363
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
364
- when Licenses::CC_BY_SA
365
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
366
- when Licenses::CC_BY_ND
367
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
368
- when Licenses::CC_BY
369
- url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
349
+ if @rights
350
+ url.query_params['as_rights'] = LICENSES.reverse[@rights]
370
351
  end
371
352
 
372
- url.query_params['safe'] = 'active' if @filtered
353
+ if @filtered
354
+ url.query_params['safe'] = 'active'
355
+ end
373
356
 
374
357
  return url
375
358
  end
@@ -387,7 +370,7 @@ module GScraper
387
370
  url = search_url
388
371
 
389
372
  url.query_params['start'] = result_offset_of(page_index)
390
- url.query_params['sa'] = 'N'
373
+ url.query_params['sa'] = 'N'
391
374
 
392
375
  return url
393
376
  end
@@ -404,23 +387,27 @@ module GScraper
404
387
  def page(page_index)
405
388
  Page.new do |new_page|
406
389
  doc = @agent.get(page_url(page_index))
407
- results = doc.search('li.g','li/div.g')
390
+
391
+ if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
392
+ raise(Blocked,"Google has temporarily blocked our IP Address",caller)
393
+ end
394
+
395
+ results = doc.search('//li[@class="g"]')
408
396
  results_length = [@results_per_page, results.length].min
409
397
 
410
398
  rank_offset = result_offset_of(page_index)
411
399
 
412
- (0...results_length).each do |index|
413
- result = results[index]
414
-
415
- rank = rank_offset + (index + 1)
416
- link = result.at('h3.r/a')
417
- title = link.inner_text
418
- url = URI(link.get_attribute('href'))
400
+ results_length.times do |index|
401
+ result = results[index]
402
+ rank = rank_offset + (index + 1)
403
+ link = result.at('.//h3/a')
404
+ title = link.inner_text
405
+ link_url = URI(link.get_attribute('href')).query_params['q']
406
+ url = URI(link_url)
407
+
419
408
  summary_text = ''
420
- cached_url = nil
421
- similar_url = nil
422
409
 
423
- if (content = (result.at('div.s','td.j//font')))
410
+ if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
424
411
  content.children.each do |elem|
425
412
  break if (!(elem.text?) && elem.name=='br')
426
413
 
@@ -429,12 +416,17 @@ module GScraper
429
416
 
430
417
  end
431
418
 
432
- if (cached_link = result.at('span.gl/a:first'))
433
- cached_url = URI(cached_link.get_attribute('href'))
434
- end
419
+ cached_url = nil
420
+ similar_url = nil
435
421
 
436
- if (similar_link = result.at('span.gl/a:last'))
437
- similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
422
+ if (gl = result.at('.//div[@class="s"]'))
423
+ if (cached_link = gl.at('.//a[1]'))
424
+ cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
425
+ end
426
+
427
+ if (similar_link = gl.at('.//a[2]'))
428
+ similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
429
+ end
438
430
  end
439
431
 
440
432
  new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -473,9 +465,9 @@ module GScraper
473
465
  doc = @agent.get(search_url)
474
466
 
475
467
  # top and side ads
476
- doc.search('#pa1', 'a[@id^="an"]').each do |link|
468
+ doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
477
469
  title = link.inner_text
478
- url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
470
+ url = URI("http://#{search_host}" + link.get_attribute('href'))
479
471
 
480
472
  links << SponsoredAd.new(title,url)
481
473
  end
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@
18
18
  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
19
  #
20
20
 
21
- require 'gscraper/extensions/uri'
21
+ require 'uri/query_params'
22
22
 
23
23
  module GScraper
24
24
  class SponsoredAd
@@ -40,7 +40,7 @@ module GScraper
40
40
  #
41
41
  def initialize(title,url)
42
42
  @title = title
43
- @url = url
43
+ @url = url
44
44
  end
45
45
 
46
46
  #
@@ -1,7 +1,7 @@
1
1
  #
2
2
  # GScraper - A web-scraping interface to various Google Services.
3
3
  #
4
- # Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
4
+ # Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
5
5
  #
6
6
  # This program is free software; you can redistribute it and/or modify
7
7
  # it under the terms of the GNU General Public License as published by
@@ -20,8 +20,6 @@
20
20
 
21
21
  require 'gscraper/sponsored_ad'
22
22
 
23
- require 'enumerator'
24
-
25
23
  module GScraper
26
24
  class SponsoredLinks < Array
27
25