gscraper 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +24 -2
- data/README.md +12 -7
- data/Rakefile +26 -29
- data/gemspec.yml +20 -0
- data/gscraper.gemspec +124 -109
- data/lib/gscraper.rb +1 -1
- data/lib/gscraper/gscraper.rb +24 -20
- data/lib/gscraper/has_pages.rb +1 -3
- data/lib/gscraper/hosts.rb +158 -0
- data/lib/gscraper/languages.rb +110 -0
- data/lib/gscraper/licenses.rb +4 -1
- data/lib/gscraper/page.rb +1 -3
- data/lib/gscraper/search.rb +1 -1
- data/lib/gscraper/search/ajax_query.rb +33 -34
- data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
- data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
- data/lib/gscraper/search/page.rb +47 -67
- data/lib/gscraper/search/query.rb +90 -44
- data/lib/gscraper/search/result.rb +7 -9
- data/lib/gscraper/search/search.rb +2 -2
- data/lib/gscraper/search/web_query.rb +93 -101
- data/lib/gscraper/sponsored_ad.rb +3 -3
- data/lib/gscraper/sponsored_links.rb +1 -3
- data/lib/gscraper/version.rb +2 -2
- data/spec/languages_spec.rb +28 -0
- data/spec/search/ajax_query_spec.rb +2 -1
- data/spec/search/query_spec.rb +29 -0
- data/spec/search/web_query_spec.rb +21 -1
- data/spec/spec_helper.rb +2 -12
- metadata +107 -125
- data/.specopts +0 -1
- data/Gemfile +0 -25
- data/lib/gscraper/extensions/uri/http.rb +0 -31
- data/lib/gscraper/extensions/uri/query_params.rb +0 -109
- data/spec/extensions/uri/http_spec.rb +0 -9
- data/spec/extensions/uri/query_params_spec.rb +0 -46
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -64,11 +64,11 @@ module GScraper
|
|
64
64
|
def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
|
65
65
|
@agent = GScraper.web_agent
|
66
66
|
|
67
|
-
@rank
|
68
|
-
@title
|
69
|
-
@url
|
70
|
-
@summary
|
71
|
-
@cached_url
|
67
|
+
@rank = rank
|
68
|
+
@title = title
|
69
|
+
@url = url
|
70
|
+
@summary = summary
|
71
|
+
@cached_url = cached_url
|
72
72
|
@similar_url = similar_url
|
73
73
|
end
|
74
74
|
|
@@ -89,9 +89,7 @@ module GScraper
|
|
89
89
|
# The Cached Page for the result.
|
90
90
|
#
|
91
91
|
def cached_page
|
92
|
-
if @cached_url
|
93
|
-
return @agent.get(@cached_url)
|
94
|
-
end
|
92
|
+
@agent.get(@cached_url) if @cached_url
|
95
93
|
end
|
96
94
|
|
97
95
|
#
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -73,7 +73,7 @@ module GScraper
|
|
73
73
|
# @example
|
74
74
|
# Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
|
75
75
|
# q.within_last_month = true
|
76
|
-
# q.
|
76
|
+
# q.occurs_within = :title
|
77
77
|
# end
|
78
78
|
#
|
79
79
|
# @see WebQuery.from_url.
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,37 +18,41 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
+
require 'gscraper/search/exceptions/blocked'
|
21
22
|
require 'gscraper/search/result'
|
22
23
|
require 'gscraper/search/page'
|
23
24
|
require 'gscraper/search/query'
|
24
25
|
require 'gscraper/sponsored_ad'
|
25
26
|
require 'gscraper/sponsored_links'
|
26
|
-
require 'gscraper/extensions/uri'
|
27
27
|
require 'gscraper/has_pages'
|
28
28
|
require 'gscraper/licenses'
|
29
29
|
require 'gscraper/gscraper'
|
30
30
|
|
31
|
+
require 'uri/query_params'
|
32
|
+
|
31
33
|
module GScraper
|
32
34
|
module Search
|
33
35
|
class WebQuery < Query
|
34
36
|
|
35
37
|
include HasPages
|
36
38
|
|
37
|
-
# Search
|
38
|
-
|
39
|
-
|
40
|
-
# Search URL
|
41
|
-
SEARCH_URL = "http://#{SEARCH_HOST}/search"
|
39
|
+
# Web Search path
|
40
|
+
PATH = '/search'
|
42
41
|
|
43
42
|
# Default results per-page
|
44
43
|
RESULTS_PER_PAGE = 10
|
45
44
|
|
45
|
+
# Web Search licenses
|
46
|
+
LICENSES = {
|
47
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
|
48
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
|
49
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
|
50
|
+
'(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
|
51
|
+
}
|
52
|
+
|
46
53
|
# Results per-page
|
47
54
|
attr_accessor :results_per_page
|
48
55
|
|
49
|
-
# Search for results written in the language
|
50
|
-
attr_accessor :language
|
51
|
-
|
52
56
|
# Search for results from the region
|
53
57
|
attr_accessor :region
|
54
58
|
|
@@ -70,8 +74,8 @@ module GScraper
|
|
70
74
|
# Search for results within the past year
|
71
75
|
attr_accessor :within_past_year
|
72
76
|
|
73
|
-
# Search for results where the query
|
74
|
-
attr_accessor :
|
77
|
+
# Search for results where the query occurs within the area
|
78
|
+
attr_accessor :occurs_within
|
75
79
|
|
76
80
|
# Search for results inside the domain
|
77
81
|
attr_accessor :inside_domain
|
@@ -91,10 +95,13 @@ module GScraper
|
|
91
95
|
# @param [Hash] options
|
92
96
|
# Additional options.
|
93
97
|
#
|
98
|
+
# @option options [String] :search_host (www.google.com)
|
99
|
+
# The host to submit queries to.
|
100
|
+
#
|
94
101
|
# @option options [Integer] :results_per_page
|
95
102
|
# Specifies the number of results for each page.
|
96
103
|
#
|
97
|
-
# @option options [String] :language
|
104
|
+
# @option options [String, Symbol] :language (Languages.native)
|
98
105
|
# Search for results in the specified language.
|
99
106
|
#
|
100
107
|
# @option options [String] :region
|
@@ -112,7 +119,7 @@ module GScraper
|
|
112
119
|
# @option options [Boolean] :within_past_year
|
113
120
|
# Search for results that were created within the past year.
|
114
121
|
#
|
115
|
-
# @option options [:title, :body, :url] :
|
122
|
+
# @option options [:title, :body, :url] :occurs_within
|
116
123
|
# Searches for results where the keywords occurr within a specific
|
117
124
|
# part of the result page.
|
118
125
|
#
|
@@ -142,41 +149,40 @@ module GScraper
|
|
142
149
|
def initialize(options={},&block)
|
143
150
|
@agent = GScraper.web_agent(options)
|
144
151
|
|
145
|
-
@results_per_page = (
|
152
|
+
@results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
|
146
153
|
|
147
|
-
@language = options[:language]
|
148
154
|
@region = options[:region]
|
149
155
|
|
150
156
|
if options[:within_past_day]
|
151
|
-
@within_past_day
|
152
|
-
@within_past_week
|
157
|
+
@within_past_day = options[:within_past_day]
|
158
|
+
@within_past_week = false
|
153
159
|
@within_past_months = false
|
154
|
-
@within_past_year
|
160
|
+
@within_past_year = false
|
155
161
|
elsif options[:within_past_week]
|
156
|
-
@within_past_day
|
157
|
-
@within_past_week
|
162
|
+
@within_past_day = false
|
163
|
+
@within_past_week = options[:within_past_week]
|
158
164
|
@within_past_months = false
|
159
|
-
@within_past_year
|
165
|
+
@within_past_year = false
|
160
166
|
elsif options[:within_past_months]
|
161
|
-
@within_past_day
|
162
|
-
@within_past_week
|
167
|
+
@within_past_day = false
|
168
|
+
@within_past_week = false
|
163
169
|
@within_past_months = options[:within_past_months]
|
164
|
-
@within_past_year
|
170
|
+
@within_past_year = false
|
165
171
|
elsif options[:within_past_year]
|
166
|
-
@within_past_day
|
167
|
-
@within_past_week
|
172
|
+
@within_past_day = false
|
173
|
+
@within_past_week = false
|
168
174
|
@within_past_months = false
|
169
|
-
@within_past_year
|
175
|
+
@within_past_year = options[:within_past_year]
|
170
176
|
else
|
171
|
-
@within_past_day
|
172
|
-
@within_past_week
|
177
|
+
@within_past_day = false
|
178
|
+
@within_past_week = false
|
173
179
|
@within_past_months = false
|
174
|
-
@within_past_year
|
180
|
+
@within_past_year = false
|
175
181
|
end
|
176
182
|
|
177
|
-
@
|
178
|
-
@rights
|
179
|
-
@filtered
|
183
|
+
@occurs_within = options[:occurs_within]
|
184
|
+
@rights = options[:rights]
|
185
|
+
@filtered = options[:filtered]
|
180
186
|
|
181
187
|
super(options,&block)
|
182
188
|
end
|
@@ -211,25 +217,27 @@ module GScraper
|
|
211
217
|
# @example
|
212
218
|
# WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
|
213
219
|
# q.within_last_month = true
|
214
|
-
# q.
|
220
|
+
# q.occurs_within = :title
|
215
221
|
# end
|
216
222
|
#
|
217
|
-
def
|
223
|
+
def WebQuery.from_url(url,options={},&block)
|
218
224
|
url = URI(url.to_s)
|
219
225
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
226
|
+
options[:search_host] = url.host
|
227
|
+
|
228
|
+
options[:results_per_page] = if url.query_params['num']
|
229
|
+
url.query_params['num'].to_i
|
230
|
+
else
|
231
|
+
RESULTS_PER_PAGE
|
232
|
+
end
|
225
233
|
|
226
|
-
options[:query]
|
227
|
-
options[:exact_phrase]
|
228
|
-
options[:with_words]
|
234
|
+
options[:query] = url.query_params['q']
|
235
|
+
options[:exact_phrase] = url.query_params['as_epq']
|
236
|
+
options[:with_words] = url.query_params['as_oq']
|
229
237
|
options[:without_words] = url.query_params['as_eq']
|
230
238
|
|
231
239
|
options[:language] = url.query_params['lr']
|
232
|
-
options[:region]
|
240
|
+
options[:region] = url.query_params['cr']
|
233
241
|
|
234
242
|
if url.query_params['as_filetype']
|
235
243
|
options[:filetype] = url.query_params['as_filetype']
|
@@ -259,33 +267,14 @@ module GScraper
|
|
259
267
|
)
|
260
268
|
end
|
261
269
|
|
262
|
-
|
263
|
-
|
264
|
-
options[:occurrs_within] = :title
|
265
|
-
when 'body'
|
266
|
-
options[:occurrs_within] = :body
|
267
|
-
when 'url'
|
268
|
-
options[:occurrs_within] = :url
|
269
|
-
when 'links'
|
270
|
-
options[:occurrs_within] = :links
|
270
|
+
if url.query_params['as_occt']
|
271
|
+
options[:occurs_within] = url.query_params['as_occt'].to_sym
|
271
272
|
end
|
272
273
|
|
273
274
|
options[:site] = url.query_params['as_sitesearch']
|
274
275
|
|
275
|
-
|
276
|
-
|
277
|
-
options[:rights] = Licenses::CC_BY_NC_ND
|
278
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
279
|
-
options[:rights] = Licenses::CC_BY_SA
|
280
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
281
|
-
options[:rights] = Licenses::CC_BY_NC
|
282
|
-
when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
283
|
-
options[:rights] = Licenses::CC_BY
|
284
|
-
end
|
285
|
-
|
286
|
-
if url.query_params[:safe] == 'active'
|
287
|
-
options[:filtered] = true
|
288
|
-
end
|
276
|
+
options[:rights] = LICENSES[url.query_params['as_rights']]
|
277
|
+
options[:filtered] = (url.query_params[:safe] == 'active')
|
289
278
|
|
290
279
|
if url.query_params['as_rq']
|
291
280
|
options[:related] = url.query_params['as_rq']
|
@@ -293,7 +282,7 @@ module GScraper
|
|
293
282
|
options[:link] = url.query_params['as_lq']
|
294
283
|
end
|
295
284
|
|
296
|
-
return
|
285
|
+
return WebQuery.new(options,&block)
|
297
286
|
end
|
298
287
|
|
299
288
|
#
|
@@ -303,8 +292,7 @@ module GScraper
|
|
303
292
|
# The URL for the query.
|
304
293
|
#
|
305
294
|
def search_url
|
306
|
-
url = URI(
|
307
|
-
query_expr = []
|
295
|
+
url = URI::HTTP.build(:host => search_host, :path => PATH)
|
308
296
|
|
309
297
|
set_param = lambda { |param,value|
|
310
298
|
url.query_params[param.to_s] = value if value
|
@@ -345,7 +333,7 @@ module GScraper
|
|
345
333
|
url.query_params['as_nhi'] = @numeric_range.end
|
346
334
|
end
|
347
335
|
|
348
|
-
case @
|
336
|
+
case @occurs_within
|
349
337
|
when :title, 'title'
|
350
338
|
url.query_params['as_occt'] = 'title'
|
351
339
|
when :body, 'body'
|
@@ -358,18 +346,13 @@ module GScraper
|
|
358
346
|
|
359
347
|
set_param.call('as_sitesearch',@site)
|
360
348
|
|
361
|
-
|
362
|
-
|
363
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
|
364
|
-
when Licenses::CC_BY_SA
|
365
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
|
366
|
-
when Licenses::CC_BY_ND
|
367
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
|
368
|
-
when Licenses::CC_BY
|
369
|
-
url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
|
349
|
+
if @rights
|
350
|
+
url.query_params['as_rights'] = LICENSES.reverse[@rights]
|
370
351
|
end
|
371
352
|
|
372
|
-
|
353
|
+
if @filtered
|
354
|
+
url.query_params['safe'] = 'active'
|
355
|
+
end
|
373
356
|
|
374
357
|
return url
|
375
358
|
end
|
@@ -387,7 +370,7 @@ module GScraper
|
|
387
370
|
url = search_url
|
388
371
|
|
389
372
|
url.query_params['start'] = result_offset_of(page_index)
|
390
|
-
url.query_params['sa']
|
373
|
+
url.query_params['sa'] = 'N'
|
391
374
|
|
392
375
|
return url
|
393
376
|
end
|
@@ -404,23 +387,27 @@ module GScraper
|
|
404
387
|
def page(page_index)
|
405
388
|
Page.new do |new_page|
|
406
389
|
doc = @agent.get(page_url(page_index))
|
407
|
-
|
390
|
+
|
391
|
+
if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
|
392
|
+
raise(Blocked,"Google has temporarily blocked our IP Address",caller)
|
393
|
+
end
|
394
|
+
|
395
|
+
results = doc.search('//li[@class="g"]')
|
408
396
|
results_length = [@results_per_page, results.length].min
|
409
397
|
|
410
398
|
rank_offset = result_offset_of(page_index)
|
411
399
|
|
412
|
-
|
413
|
-
result
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
url
|
400
|
+
results_length.times do |index|
|
401
|
+
result = results[index]
|
402
|
+
rank = rank_offset + (index + 1)
|
403
|
+
link = result.at('.//h3/a')
|
404
|
+
title = link.inner_text
|
405
|
+
link_url = URI(link.get_attribute('href')).query_params['q']
|
406
|
+
url = URI(link_url)
|
407
|
+
|
419
408
|
summary_text = ''
|
420
|
-
cached_url = nil
|
421
|
-
similar_url = nil
|
422
409
|
|
423
|
-
if (content = (result.at('div
|
410
|
+
if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
|
424
411
|
content.children.each do |elem|
|
425
412
|
break if (!(elem.text?) && elem.name=='br')
|
426
413
|
|
@@ -429,12 +416,17 @@ module GScraper
|
|
429
416
|
|
430
417
|
end
|
431
418
|
|
432
|
-
|
433
|
-
|
434
|
-
end
|
419
|
+
cached_url = nil
|
420
|
+
similar_url = nil
|
435
421
|
|
436
|
-
if (
|
437
|
-
|
422
|
+
if (gl = result.at('.//div[@class="s"]'))
|
423
|
+
if (cached_link = gl.at('.//a[1]'))
|
424
|
+
cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
|
425
|
+
end
|
426
|
+
|
427
|
+
if (similar_link = gl.at('.//a[2]'))
|
428
|
+
similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
|
429
|
+
end
|
438
430
|
end
|
439
431
|
|
440
432
|
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
|
@@ -473,9 +465,9 @@ module GScraper
|
|
473
465
|
doc = @agent.get(search_url)
|
474
466
|
|
475
467
|
# top and side ads
|
476
|
-
doc.search('
|
468
|
+
doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
|
477
469
|
title = link.inner_text
|
478
|
-
url
|
470
|
+
url = URI("http://#{search_host}" + link.get_attribute('href'))
|
479
471
|
|
480
472
|
links << SponsoredAd.new(title,url)
|
481
473
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -18,7 +18,7 @@
|
|
18
18
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
19
19
|
#
|
20
20
|
|
21
|
-
require '
|
21
|
+
require 'uri/query_params'
|
22
22
|
|
23
23
|
module GScraper
|
24
24
|
class SponsoredAd
|
@@ -40,7 +40,7 @@ module GScraper
|
|
40
40
|
#
|
41
41
|
def initialize(title,url)
|
42
42
|
@title = title
|
43
|
-
@url
|
43
|
+
@url = url
|
44
44
|
end
|
45
45
|
|
46
46
|
#
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#
|
2
2
|
# GScraper - A web-scraping interface to various Google Services.
|
3
3
|
#
|
4
|
-
# Copyright (c) 2007-
|
4
|
+
# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
|
5
5
|
#
|
6
6
|
# This program is free software; you can redistribute it and/or modify
|
7
7
|
# it under the terms of the GNU General Public License as published by
|
@@ -20,8 +20,6 @@
|
|
20
20
|
|
21
21
|
require 'gscraper/sponsored_ad'
|
22
22
|
|
23
|
-
require 'enumerator'
|
24
|
-
|
25
23
|
module GScraper
|
26
24
|
class SponsoredLinks < Array
|
27
25
|
|