gscraper 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -64,11 +64,11 @@ module GScraper
|
|
64
64
|
def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
|
65
65
|
@agent = GScraper.web_agent
|
66
66
|
|
67
|
-
@rank
|
68
|
-
@title
|
69
|
-
@url
|
70
|
-
@summary
|
71
|
-
@cached_url
|
67
|
+
@rank = rank
|
68
|
+
@title = title
|
69
|
+
@url = url
|
70
|
+
@summary = summary
|
71
|
+
@cached_url = cached_url
|
72
72
|
@similar_url = similar_url
|
73
73
|
end
|
74
74
|
|
@@ -89,9 +89,7 @@ module GScraper
|
|
89
89
|
# The Cached Page for the result.
|
90
90
|
#
|
91
91
|
def cached_page
|
92
|
-
if @cached_url
|
93
|
-
return @agent.get(@cached_url)
|
94
|
-
end
|
92
|
+
@agent.get(@cached_url) if @cached_url
|
95
93
|
end
|
96
94
|
|
97
95
|
#
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -73,7 +73,7 @@ module GScraper
|
|
73
73
|
# @example
|
74
74
|
# Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
|
75
75
|
# q.within_last_month = true
|
76
|
-
# q.
|
76
|
+
# q.occurs_within = :title
|
77
77
|
# end
|
78
78
|
#
|
79
79
|
# @see WebQuery.from_url.
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,37 +18,41 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
+
require 'gscraper/search/exceptions/blocked'
|
21
22
|
require 'gscraper/search/result'
|
22
23
|
require 'gscraper/search/page'
|
23
24
|
require 'gscraper/search/query'
|
24
25
|
require 'gscraper/sponsored_ad'
|
25
26
|
require 'gscraper/sponsored_links'
|
26
|
-
require 'gscraper/extensions/uri'
|
27
27
|
require 'gscraper/has_pages'
|
28
28
|
require 'gscraper/licenses'
|
29
29
|
require 'gscraper/gscraper'
|
30
30
|
|
31
|
+
require 'uri/query_params'
|
32
|
+
|
31
33
|
module GScraper
|
32
34
|
module Search
|
33
35
|
class WebQuery < Query
|
34
36
|
|
35
37
|
include HasPages
|
36
38
|
|
37
|
-
# Search
|
38
|
-
|
39
|
-
|
40
|
-
# Search URL
|
41
|
-
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
39
|
+
# Web Search path
|
40
|
+
PATH = '/search'
|
42
41
|
|
43
42
|
# Default results per-page
|
44
43
|
RESULTS_PER_PAGE = 10
|
45
44
|
|
45
|
+
# Web Search licenses
|
46
|
+
LICENSES = {
|
47
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
|
48
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
|
49
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
|
50
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
|
51
|
+
}
|
52
|
+
|
46
53
|
# Results per-page
|
47
54
|
attr_accessor :results_per_page
|
48
55
|
|
49
|
-
# Search for results written in the language
|
50
|
-
attr_accessor :language
|
51
|
-
|
52
56
|
# Search for results from the region
|
53
57
|
attr_accessor :region
|
54
58
|
|
@@ -70,8 +74,8 @@ module GScraper
|
|
70
74
|
# Search for results within the past year
|
71
75
|
attr_accessor :within_past_year
|
72
76
|
|
73
|
-
# Search for results where the query
|
74
|
-
attr_accessor :
|
77
|
+
# Search for results where the query occurs within the area
|
78
|
+
attr_accessor :occurs_within
|
75
79
|
|
76
80
|
# Search for results inside the domain
|
77
81
|
attr_accessor :inside_domain
|
@@ -91,10 +95,13 @@ module GScraper
|
|
91
95
|
# @param [Hash] options
|
92
96
|
# Additional options.
|
93
97
|
#
|
98
|
+
# @option options [String] :search_host (www.google.com)
|
99
|
+
# The host to submit queries to.
|
100
|
+
#
|
94
101
|
# @option options [Integer] :results_per_page
|
95
102
|
# Specifies the number of results for each page.
|
96
103
|
#
|
97
|
-
# @option options [String] :language
|
104
|
+
# @option options [String, Symbol] :language (Languages.native)
|
98
105
|
# Search for results in the specified language.
|
99
106
|
#
|
100
107
|
# @option options [String] :region
|
@@ -112,7 +119,7 @@ module GScraper
|
|
112
119
|
# @option options [Boolean] :within_past_year
|
113
120
|
# Search for results that were created within the past year.
|
114
121
|
#
|
115
|
-
# @option options [:title, :body, :url] :
|
122
|
+
# @option options [:title, :body, :url] :occurs_within
|
116
123
|
# Searches for results where the keywords occur within a specific
|
117
124
|
# part of the result page.
|
118
125
|
#
|
@@ -142,41 +149,40 @@ module GScraper
|
|
142
149
|
def initialize(options={},&block)
|
143
150
|
@agent = GScraper.web_agent(options)
|
144
151
|
|
145
|
-
@results_per_page = (
|
152
|
+
@results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
|
146
153
|
|
147
|
-
@language = options[:language]
|
148
154
|
@region = options[:region]
|
149
155
|
|
150
156
|
if options[:within_past_day]
|
151
|
-
@within_past_day
|
152
|
-
@within_past_week
|
157
|
+
@within_past_day = options[:within_past_day]
|
158
|
+
@within_past_week = false
|
153
159
|
@within_past_months = false
|
154
|
-
@within_past_year
|
160
|
+
@within_past_year = false
|
155
161
|
elsif options[:within_past_week]
|
156
|
-
@within_past_day
|
157
|
-
@within_past_week
|
162
|
+
@within_past_day = false
|
163
|
+
@within_past_week = options[:within_past_week]
|
158
164
|
@within_past_months = false
|
159
|
-
@within_past_year
|
165
|
+
@within_past_year = false
|
160
166
|
elsif options[:within_past_months]
|
161
|
-
@within_past_day
|
162
|
-
@within_past_week
|
167
|
+
@within_past_day = false
|
168
|
+
@within_past_week = false
|
163
169
|
@within_past_months = options[:within_past_months]
|
164
|
-
@within_past_year
|
170
|
+
@within_past_year = false
|
165
171
|
elsif options[:within_past_year]
|
166
|
-
@within_past_day
|
167
|
-
@within_past_week
|
172
|
+
@within_past_day = false
|
173
|
+
@within_past_week = false
|
168
174
|
@within_past_months = false
|
169
|
-
@within_past_year
|
175
|
+
@within_past_year = options[:within_past_year]
|
170
176
|
else
|
171
|
-
@within_past_day
|
172
|
-
@within_past_week
|
177
|
+
@within_past_day = false
|
178
|
+
@within_past_week = false
|
173
179
|
@within_past_months = false
|
174
|
-
@within_past_year
|
180
|
+
@within_past_year = false
|
175
181
|
end
|
176
182
|
|
177
|
-
@
|
178
|
-
@rights
|
179
|
-
@filtered
|
183
|
+
@occurs_within = options[:occurs_within]
|
184
|
+
@rights = options[:rights]
|
185
|
+
@filtered = options[:filtered]
|
180
186
|
|
181
187
|
super(options,&block)
|
182
188
|
end
|
@@ -211,25 +217,27 @@ module GScraper
|
|
211
217
|
# @example
|
212
218
|
# WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
|
213
219
|
# q.within_last_month = true
|
214
|
-
# q.
|
220
|
+
# q.occurs_within = :title
|
215
221
|
# end
|
216
222
|
#
|
217
|
-
def
|
223
|
+
def WebQuery.from_url(url,options={},&block)
|
218
224
|
url = URI(url.to_s)
|
219
225
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
226
|
+
options[:search_host] = url.host
|
227
|
+
|
228
|
+
options[:results_per_page] = if url.query_params['num']
|
229
|
+
url.query_params['num'].to_i
|
230
|
+
else
|
231
|
+
RESULTS_PER_PAGE
|
232
|
+
end
|
225
233
|
|
226
|
-
options[:query]
|
227
|
-
options[:exact_phrase]
|
228
|
-
options[:with_words]
|
234
|
+
options[:query] = url.query_params['q']
|
235
|
+
options[:exact_phrase] = url.query_params['as_epq']
|
236
|
+
options[:with_words] = url.query_params['as_oq']
|
229
237
|
options[:without_words] = url.query_params['as_eq']
|
230
238
|
|
231
239
|
options[:language] = url.query_params['lr']
|
232
|
-
options[:region]
|
240
|
+
options[:region] = url.query_params['cr']
|
233
241
|
|
234
242
|
if url.query_params['as_filetype']
|
235
243
|
options[:filetype] = url.query_params['as_filetype']
|
@@ -259,33 +267,14 @@ module GScraper
|
|
259
267
|
)
|
260
268
|
end
|
261
269
|
|
262
|
-
|
263
|
-
|
264
|
-
options[:occurrs_within] = :title
|
265
|
-
when 'body'
|
266
|
-
options[:occurrs_within] = :body
|
267
|
-
when 'url'
|
268
|
-
options[:occurrs_within] = :url
|
269
|
-
when 'links'
|
270
|
-
options[:occurrs_within] = :links
|
270
|
+
if url.query_params['as_occt']
|
271
|
+
options[:occurs_within] = url.query_params['as_occt'].to_sym
|
271
272
|
end
|
272
273
|
|
273
274
|
options[:site] = url.query_params['as_sitesearch']
|
274
275
|
|
275
|
-
|
276
|
-
|
277
|
-
options[:rights] = Licenses::CC_BY_NC_ND
|
278
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
279
|
-
options[:rights] = Licenses::CC_BY_SA
|
280
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
281
|
-
options[:rights] = Licenses::CC_BY_NC
|
282
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
283
|
-
options[:rights] = Licenses::CC_BY
|
284
|
-
end
|
285
|
-
|
286
|
-
if url.query_params[:safe] == 'active'
|
287
|
-
options[:filtered] = true
|
288
|
-
end
|
276
|
+
options[:rights] = LICENSES[url.query_params['as_rights']]
|
277
|
+
options[:filtered] = (url.query_params[:safe] == 'active')
|
289
278
|
|
290
279
|
if url.query_params['as_rq']
|
291
280
|
options[:related] = url.query_params['as_rq']
|
@@ -293,7 +282,7 @@ module GScraper
|
|
293
282
|
options[:link] = url.query_params['as_lq']
|
294
283
|
end
|
295
284
|
|
296
|
-
return
|
285
|
+
return WebQuery.new(options,&block)
|
297
286
|
end
|
298
287
|
|
299
288
|
#
|
@@ -303,8 +292,7 @@ module GScraper
|
|
303
292
|
# The URL for the query.
|
304
293
|
#
|
305
294
|
def search_url
|
306
|
-
url = URI(
|
307
|
-
query_expr = []
|
295
|
+
url = URI::HTTP.build(:host => search_host, :path => PATH)
|
308
296
|
|
309
297
|
set_param = lambda { |param,value|
|
310
298
|
url.query_params[param.to_s] = value if value
|
@@ -345,7 +333,7 @@ module GScraper
|
|
345
333
|
url.query_params['as_nhi'] = @numeric_range.end
|
346
334
|
end
|
347
335
|
|
348
|
-
case @
|
336
|
+
case @occurs_within
|
349
337
|
when :title, 'title'
|
350
338
|
url.query_params['as_occt'] = 'title'
|
351
339
|
when :body, 'body'
|
@@ -358,18 +346,13 @@ module GScraper
|
|
358
346
|
|
359
347
|
set_param.call('as_sitesearch',@site)
|
360
348
|
|
361
|
-
|
362
|
-
|
363
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
364
|
-
when Licenses::CC_BY_SA
|
365
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
366
|
-
when Licenses::CC_BY_ND
|
367
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
368
|
-
when Licenses::CC_BY
|
369
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
349
|
+
if @rights
|
350
|
+
url.query_params['as_rights'] = LICENSES.reverse[@rights]
|
370
351
|
end
|
371
352
|
|
372
|
-
|
353
|
+
if @filtered
|
354
|
+
url.query_params['safe'] = 'active'
|
355
|
+
end
|
373
356
|
|
374
357
|
return url
|
375
358
|
end
|
@@ -387,7 +370,7 @@ module GScraper
|
|
387
370
|
url = search_url
|
388
371
|
|
389
372
|
url.query_params['start'] = result_offset_of(page_index)
|
390
|
-
url.query_params['sa']
|
373
|
+
url.query_params['sa'] = 'N'
|
391
374
|
|
392
375
|
return url
|
393
376
|
end
|
@@ -404,23 +387,27 @@ module GScraper
|
|
404
387
|
def page(page_index)
|
405
388
|
Page.new do |new_page|
|
406
389
|
doc = @agent.get(page_url(page_index))
|
407
|
-
|
390
|
+
|
391
|
+
if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
|
392
|
+
raise(Blocked,"Google has temporarily blocked our IP Address",caller)
|
393
|
+
end
|
394
|
+
|
395
|
+
results = doc.search('//li[@class="g"]')
|
408
396
|
results_length = [@results_per_page, results.length].min
|
409
397
|
|
410
398
|
rank_offset = result_offset_of(page_index)
|
411
399
|
|
412
|
-
|
413
|
-
result
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
url
|
400
|
+
results_length.times do |index|
|
401
|
+
result = results[index]
|
402
|
+
rank = rank_offset + (index + 1)
|
403
|
+
link = result.at('.//h3/a')
|
404
|
+
title = link.inner_text
|
405
|
+
link_url = URI(link.get_attribute('href')).query_params['q']
|
406
|
+
url = URI(link_url)
|
407
|
+
|
419
408
|
summary_text = ''
|
420
|
-
cached_url = nil
|
421
|
-
similar_url = nil
|
422
409
|
|
423
|
-
if (content = (result.at('div
|
410
|
+
if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
|
424
411
|
content.children.each do |elem|
|
425
412
|
break if (!(elem.text?) && elem.name=='br')
|
426
413
|
|
@@ -429,12 +416,17 @@ module GScraper
|
|
429
416
|
|
430
417
|
end
|
431
418
|
|
432
|
-
|
433
|
-
|
434
|
-
end
|
419
|
+
cached_url = nil
|
420
|
+
similar_url = nil
|
435
421
|
|
436
|
-
if (
|
437
|
-
|
422
|
+
if (gl = result.at('.//div[@class="s"]'))
|
423
|
+
if (cached_link = gl.at('.//a[1]'))
|
424
|
+
cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
|
425
|
+
end
|
426
|
+
|
427
|
+
if (similar_link = gl.at('.//a[2]'))
|
428
|
+
similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
|
429
|
+
end
|
438
430
|
end
|
439
431
|
|
440
432
|
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
@@ -473,9 +465,9 @@ module GScraper
|
|
473
465
|
doc = @agent.get(search_url)
|
474
466
|
|
475
467
|
# top and side ads
|
476
|
-
doc.search('
|
468
|
+
doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
|
477
469
|
title = link.inner_text
|
478
|
-
url
|
470
|
+
url = URI("http://#{search_host}" + link.get_attribute('href'))
|
479
471
|
|
480
472
|
links << SponsoredAd.new(title,url)
|
481
473
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,7 +18,7 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require '
|
21
|
+
require 'uri/query_params'
|
22
22
|
|
23
23
|
module GScraper
|
24
24
|
class SponsoredAd
|
@@ -40,7 +40,7 @@ module GScraper
|
|
40
40
|
#
|
41
41
|
def initialize(title,url)
|
42
42
|
@title = title
|
43
|
-
@url
|
43
|
+
@url = url
|
44
44
|
end
|
45
45
|
|
46
46
|
#
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -20,8 +20,6 @@
|
|
20
20
|
|
21
21
|
require 'gscraper/sponsored_ad'
|
22
22
|
|
23
|
-
require 'enumerator'
|
24
|
-
|
25
23
|
module GScraper
|
26
24
|
class SponsoredLinks < Array
|
27
25
|
|