gscraper 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri'
|
2
24
|
|
3
25
|
module GScraper
|
@@ -14,14 +36,16 @@ module GScraper
|
|
14
36
|
#
|
15
37
|
def initialize(title,url)
|
16
38
|
@title = title
|
17
|
-
@url =
|
39
|
+
@url = url
|
18
40
|
end
|
19
41
|
|
20
42
|
#
|
21
43
|
# Returns the direct URL of the ad.
|
22
44
|
#
|
23
45
|
def direct_url
|
24
|
-
|
46
|
+
uri = URI(@url)
|
47
|
+
|
48
|
+
return (uri.query_params['adurl'] || uri.query_params['q'])
|
25
49
|
end
|
26
50
|
|
27
51
|
#
|
@@ -1,12 +1,38 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/sponsored_ad'
|
2
24
|
|
3
25
|
module GScraper
|
4
26
|
class SponsoredLinks < Array
|
5
27
|
#
|
6
|
-
# Creates a new SponsoredLinks object with the given _ads_.
|
28
|
+
# Creates a new SponsoredLinks object with the given _ads_. If a
|
29
|
+
# _block_ is given, it will be passed the newly created SponsoredLinks
|
30
|
+
# object.
|
7
31
|
#
|
8
|
-
def initialize(ads=[])
|
32
|
+
def initialize(ads=[],&block)
|
9
33
|
super(ads)
|
34
|
+
|
35
|
+
block.call(self) if block
|
10
36
|
end
|
11
37
|
|
12
38
|
#
|
@@ -74,10 +100,6 @@ module GScraper
|
|
74
100
|
#
|
75
101
|
# sponsored.ads_with_url(/\.com/) # => SponsoredLinks
|
76
102
|
#
|
77
|
-
# sponsored.ads_with_url(/^https:\/\//) do |ad|
|
78
|
-
# puts ad.title
|
79
|
-
# end
|
80
|
-
#
|
81
103
|
def ads_with_url(url,&block)
|
82
104
|
if url.kind_of?(Regexp)
|
83
105
|
ads = ads_with { |ad| ad.url =~ url }
|
@@ -89,6 +111,24 @@ module GScraper
|
|
89
111
|
return ads
|
90
112
|
end
|
91
113
|
|
114
|
+
#
|
115
|
+
# Selects the ads with the matching _direct_url_. The _direct_url_ may
|
116
|
+
# be either a String or a Regexp. If _block_ is given, each matching
|
117
|
+
# ad will be passed to the _block_.
|
118
|
+
#
|
119
|
+
# sponsored.ads_with_direct_url(/\.com/) # => SponsoredLinks
|
120
|
+
#
|
121
|
+
def ads_with_direct_url(direct_url,&block)
|
122
|
+
if direct_url.kind_of?(Regexp)
|
123
|
+
ads = ads_with { |ad| ad.direct_url =~ direct_url }
|
124
|
+
else
|
125
|
+
ads = ads_with { |ad| ad.direct_url == direct_url }
|
126
|
+
end
|
127
|
+
|
128
|
+
ads.each(&block) if block
|
129
|
+
return ads
|
130
|
+
end
|
131
|
+
|
92
132
|
#
|
93
133
|
# Returns an Array containing the titles of the ads within the
|
94
134
|
# SponsoredLinks.
|
@@ -109,6 +149,16 @@ module GScraper
|
|
109
149
|
map { |ad| ad.url }
|
110
150
|
end
|
111
151
|
|
152
|
+
#
|
153
|
+
# Returns an Array containing the direct URLs of the ads within the
|
154
|
+
# SponsoredLinks.
|
155
|
+
#
|
156
|
+
# sponsored.direct_urls # => [...]
|
157
|
+
#
|
158
|
+
def direct_urls
|
159
|
+
map { |ad| ad.direct_url }
|
160
|
+
end
|
161
|
+
|
112
162
|
#
|
113
163
|
# Iterates over each ad's title within the SponsoredLinks, passing each to
|
114
164
|
# the given _block_.
|
@@ -120,7 +170,7 @@ module GScraper
|
|
120
170
|
end
|
121
171
|
|
122
172
|
#
|
123
|
-
# Iterates over each ad's
|
173
|
+
# Iterates over each ad's URL within the SponsoredLinks, passing each to
|
124
174
|
# the given _block_.
|
125
175
|
#
|
126
176
|
# each_url { |url| puts url }
|
@@ -129,6 +179,16 @@ module GScraper
|
|
129
179
|
urls.each(&block)
|
130
180
|
end
|
131
181
|
|
182
|
+
#
|
183
|
+
# Iterates over each ad's direct URL within the SponsoredLinks, passing
|
184
|
+
# each to the given _block_.
|
185
|
+
#
|
186
|
+
# each_direct_url { |url| puts url }
|
187
|
+
#
|
188
|
+
def each_direct_url(&block)
|
189
|
+
direct_urls.each(&block)
|
190
|
+
end
|
191
|
+
|
132
192
|
#
|
133
193
|
# Returns the titles of the ads that match the specified _block_.
|
134
194
|
#
|
@@ -139,7 +199,7 @@ module GScraper
|
|
139
199
|
end
|
140
200
|
|
141
201
|
#
|
142
|
-
# Returns the
|
202
|
+
# Returns the URLs of the ads that match the specified _block_.
|
143
203
|
#
|
144
204
|
# sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
|
145
205
|
#
|
@@ -147,5 +207,14 @@ module GScraper
|
|
147
207
|
ads_with(&block).urls
|
148
208
|
end
|
149
209
|
|
210
|
+
#
|
211
|
+
# Returns the direct URLs of the ads that match the specified _block_.
|
212
|
+
#
|
213
|
+
# sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
|
214
|
+
#
|
215
|
+
def direct_urls_of(&block)
|
216
|
+
ads_with(&block).direct_urls
|
217
|
+
end
|
218
|
+
|
150
219
|
end
|
151
220
|
end
|
data/lib/gscraper/version.rb
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
module GScraper
|
2
|
-
VERSION = '0.1.7'
|
24
|
+
VERSION = '0.2.0'
|
3
25
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'gscraper/extensions/uri'
|
4
|
+
|
5
|
+
describe "URI::QueryParams" do
|
6
|
+
before(:each) do
|
7
|
+
@uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should provide #query_params" do
|
11
|
+
@uri.should respond_to(:query_params)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "#query_params should be a Hash" do
|
15
|
+
@uri.query_params.class.should == Hash
|
16
|
+
end
|
17
|
+
|
18
|
+
it "#query_params should contain params" do
|
19
|
+
@uri.query_params.empty?.should == false
|
20
|
+
end
|
21
|
+
|
22
|
+
it "#query_params can contain single-word params" do
|
23
|
+
@uri.query_params['x'].should == '1'
|
24
|
+
end
|
25
|
+
|
26
|
+
it "#query_params can contain multi-word params" do
|
27
|
+
@uri.query_params['y'].should == 'one two'
|
28
|
+
end
|
29
|
+
|
30
|
+
it "#query_params can contain empty params" do
|
31
|
+
@uri.query_params['z'].should be_nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should update #query_params along with #query=" do
|
35
|
+
@uri.query = 'u=3'
|
36
|
+
@uri.query_params['u'].should == '3'
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'gscraper/gscraper'
|
4
|
+
|
5
|
+
describe "GScraper" do
|
6
|
+
describe "User-Agent support" do
|
7
|
+
it "should have a default User-Agent string" do
|
8
|
+
GScraper.user_agent.should_not be_nil
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "Proxy support" do
|
13
|
+
it "should provide a :host key" do
|
14
|
+
GScraper.proxy.has_key?(:host).should == true
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should provide a :port key" do
|
18
|
+
GScraper.proxy.has_key?(:port).should == true
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should provide a :user key" do
|
22
|
+
GScraper.proxy.has_key?(:user).should == true
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should provide a :password key" do
|
26
|
+
GScraper.proxy.has_key?(:password).should == true
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "has Pages" do
|
4
|
+
|
5
|
+
it "should have a first page" do
|
6
|
+
@query.first_page.should_not be_nil
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should allow indexed access" do
|
10
|
+
@query[1].should_not be_nil
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should allow accessing multiple pages" do
|
14
|
+
pages = @query.pages(1..2)
|
15
|
+
pages.should_not be_nil
|
16
|
+
pages.length.should == 2
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "has Sponsored Links" do
|
4
|
+
|
5
|
+
it "should have ads" do
|
6
|
+
@links.length.should_not == 0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have titles" do
|
10
|
+
@links.each_title do |title|
|
11
|
+
title.should_not be_nil
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should have non-empty titles" do
|
16
|
+
@links.each_title do |title|
|
17
|
+
title.length.should_not == 0
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should have URLs" do
|
22
|
+
@links.each_url do |url|
|
23
|
+
url.should_not be_nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should have non-empty URLs" do
|
28
|
+
@links.each_url do |url|
|
29
|
+
url.length.should_not == 0
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have valid URLs" do
|
34
|
+
@links.each_url do |url|
|
35
|
+
url_should_be_valid(url)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should have direct URLs" do
|
40
|
+
@links.each_direct_url do |url|
|
41
|
+
url.should_not be_nil
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should have non-empty direct URLs" do
|
46
|
+
@links.each_direct_url do |url|
|
47
|
+
url.length.should_not == 0
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should have valid direct URLs" do
|
52
|
+
@links.each_direct_url do |url|
|
53
|
+
url_should_be_valid(url)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
DEFAULT_QUERY = 'Ruby'
|
data/spec/helpers/uri.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "Page has Results" do
|
4
|
+
|
5
|
+
it "should have results" do
|
6
|
+
@page.length.should_not == 0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have the maximum amount of results per page" do
|
10
|
+
@page.length.should == @query.results_per_page
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'has_pages_examples'
|
3
|
+
require 'page_has_results_examples'
|
4
|
+
require 'search/page_has_results_examples'
|
5
|
+
|
6
|
+
require 'gscraper/search/ajax_query'
|
7
|
+
|
8
|
+
describe GScraper::Search::AJAXQuery do
|
9
|
+
|
10
|
+
before(:all) do
|
11
|
+
@query = GScraper::Search::AJAXQuery.new(:query => DEFAULT_QUERY)
|
12
|
+
@page = @query.first_page
|
13
|
+
end
|
14
|
+
|
15
|
+
it_should_behave_like "has Pages"
|
16
|
+
it_should_behave_like "Page has Results"
|
17
|
+
it_should_behave_like "Page has Search Results"
|
18
|
+
|
19
|
+
describe "Search URL" do
|
20
|
+
|
21
|
+
before(:all) do
|
22
|
+
@uri = @query.search_url
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be a valid HTTP URI" do
|
26
|
+
@uri.class.should == URI::HTTP
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should be a RESTful AJAX Search URL" do
|
30
|
+
@uri.path.should == '/uds/GwebSearch'
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have the default 'callback' query-param" do
|
34
|
+
callback = @uri.query_params['callback']
|
35
|
+
callback.should == 'google.search.WebSearch.RawCompletion'
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should have the default 'context' query-param" do
|
39
|
+
@uri.query_params['context'].should == '0'
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should have a default 'lstkp' query-param" do
|
43
|
+
@uri.query_params['lstkp'].should == '0'
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should have a default 'rsz' query-param of 'large'" do
|
47
|
+
@uri.query_params['rsz'].should == 'large'
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should have a default 'hl' query-param" do
|
51
|
+
hl = @uri.query_params['hl']
|
52
|
+
hl.should == GScraper::Search::AJAXQuery::DEFAULT_LANGUAGE
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should have a default 'gss' query-param of '.com'" do
|
56
|
+
@uri.query_params['gss'].should == '.com'
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should have a 'q' query-param" do
|
60
|
+
@uri.query_params['q'].should == DEFAULT_QUERY
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should have a default 'sig' query-param" do
|
64
|
+
sig = @uri.query_params['sig']
|
65
|
+
sig.should == GScraper::Search::AJAXQuery::DEFAULT_SIG
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should have a default 'key' query-param" do
|
69
|
+
key = @uri.query_params['key']
|
70
|
+
key.should == GScraper::Search::AJAXQuery::DEFAULT_KEY
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should have a default 'v' query-param" do
|
74
|
+
v = @uri.query_params['v']
|
75
|
+
v.should == GScraper::Search::AJAXQuery::DEFAULT_VERSION
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
describe "page specific URLs" do
|
81
|
+
|
82
|
+
before(:all) do
|
83
|
+
@uri = @query.page_url(2)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should have a 'start' query-param" do
|
87
|
+
@uri.query_params['start'].should == @query.results_per_page
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
describe "queries from AJAX search URLs" do
|
93
|
+
|
94
|
+
before(:all) do
|
95
|
+
@version = '1.0'
|
96
|
+
@language = 'en'
|
97
|
+
@sig = '582c1116317355adf613a6a843f19ece'
|
98
|
+
@key = 'notsupplied'
|
99
|
+
@query = GScraper::Search::AJAXQuery.from_url("http://www.google.com/uds/GwebSearch?v=#{@version}&lstkp=0&rsz=large&hl=#{@language}&callback=google.search.WebSearch.RawCompletion&sig=#{@sig}&q=#{DEFAULT_QUERY}&gss=.com&context=0&key=#{@key}")
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should have a version" do
|
103
|
+
@query.version.should == @version
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should have a language" do
|
107
|
+
@query.language.should == @language
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should have a sig" do
|
111
|
+
@query.sig.should == @sig
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should have a key" do
|
115
|
+
@query.key.should == @key
|
116
|
+
end
|
117
|
+
|
118
|
+
it "should have a query" do
|
119
|
+
@query.query.should == DEFAULT_QUERY
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
|
124
|
+
end
|