gscraper 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING.txt +339 -0
- data/History.txt +21 -0
- data/Manifest.txt +23 -10
- data/README.txt +17 -21
- data/Rakefile +3 -6
- data/lib/gscraper.rb +22 -0
- data/lib/gscraper/extensions.rb +22 -0
- data/lib/gscraper/extensions/uri.rb +22 -0
- data/lib/gscraper/extensions/uri/http.rb +25 -71
- data/lib/gscraper/extensions/uri/query_params.rb +96 -0
- data/lib/gscraper/gscraper.rb +30 -0
- data/lib/gscraper/has_pages.rb +114 -0
- data/lib/gscraper/licenses.rb +22 -0
- data/lib/gscraper/page.rb +64 -0
- data/lib/gscraper/search.rb +24 -0
- data/lib/gscraper/search/ajax_query.rb +176 -0
- data/lib/gscraper/search/page.rb +27 -72
- data/lib/gscraper/search/query.rb +46 -457
- data/lib/gscraper/search/result.rb +32 -29
- data/lib/gscraper/search/search.rb +44 -3
- data/lib/gscraper/search/web_query.rb +472 -0
- data/lib/gscraper/sponsored_ad.rb +26 -2
- data/lib/gscraper/sponsored_links.rb +77 -8
- data/lib/gscraper/version.rb +23 -1
- data/spec/extensions/uri/http_spec.rb +9 -0
- data/spec/extensions/uri/query_params_spec.rb +38 -0
- data/spec/gscraper_spec.rb +29 -0
- data/spec/has_pages_examples.rb +19 -0
- data/spec/has_sponsored_links_examples.rb +57 -0
- data/spec/helpers/query.rb +1 -0
- data/spec/helpers/uri.rb +8 -0
- data/spec/page_has_results_examples.rb +13 -0
- data/spec/search/ajax_query_spec.rb +124 -0
- data/spec/search/page_has_results_examples.rb +51 -0
- data/spec/search/query_spec.rb +103 -0
- data/spec/search/web_query_spec.rb +74 -0
- data/spec/spec_helper.rb +6 -0
- data/tasks/spec.rb +7 -0
- metadata +34 -20
- data/LICENSE.txt +0 -23
- data/lib/gscraper/web_agent.rb +0 -38
- data/test/search/page_results.rb +0 -103
- data/test/search/query_from_url.rb +0 -50
- data/test/search/query_pages.rb +0 -32
- data/test/search/query_result.rb +0 -30
- data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/extensions/uri'
|
2
24
|
|
3
25
|
module GScraper
|
@@ -14,14 +36,16 @@ module GScraper
|
|
14
36
|
#
|
15
37
|
def initialize(title,url)
|
16
38
|
@title = title
|
17
|
-
@url =
|
39
|
+
@url = url
|
18
40
|
end
|
19
41
|
|
20
42
|
#
|
21
43
|
# Returns the direct URL of the ad.
|
22
44
|
#
|
23
45
|
def direct_url
|
24
|
-
|
46
|
+
uri = URI(@url)
|
47
|
+
|
48
|
+
return (uri.query_params['adurl'] || uri.query_params['q'])
|
25
49
|
end
|
26
50
|
|
27
51
|
#
|
@@ -1,12 +1,38 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
require 'gscraper/sponsored_ad'
|
2
24
|
|
3
25
|
module GScraper
|
4
26
|
class SponsoredLinks < Array
|
5
27
|
#
|
6
|
-
# Creates a new SponsoredLinks object with the given _ads_.
|
28
|
+
# Creates a new SponsoredLinks object with the given _ads_. If a
|
29
|
+
# _block_ is given, it will be passed the newly created SponsoredLinks
|
30
|
+
# object.
|
7
31
|
#
|
8
|
-
def initialize(ads=[])
|
32
|
+
def initialize(ads=[],&block)
|
9
33
|
super(ads)
|
34
|
+
|
35
|
+
block.call(self) if block
|
10
36
|
end
|
11
37
|
|
12
38
|
#
|
@@ -74,10 +100,6 @@ module GScraper
|
|
74
100
|
#
|
75
101
|
# sponsored.ads_with_url(/\.com/) # => SponsoredLinks
|
76
102
|
#
|
77
|
-
# sponsored.ads_with_url(/^https:\/\//) do |ad|
|
78
|
-
# puts ad.title
|
79
|
-
# end
|
80
|
-
#
|
81
103
|
def ads_with_url(url,&block)
|
82
104
|
if url.kind_of?(Regexp)
|
83
105
|
ads = ads_with { |ad| ad.url =~ url }
|
@@ -89,6 +111,24 @@ module GScraper
|
|
89
111
|
return ads
|
90
112
|
end
|
91
113
|
|
114
|
+
#
|
115
|
+
# Selects the ads with the matching _direct_url_. The _direct_url_ may
|
116
|
+
# be either a String or a Regexp. If _block_ is given, each matching
|
117
|
+
# ad will be passed to the _block_.
|
118
|
+
#
|
119
|
+
# sponsored.ads_with_direct_url(/\.com/) # => SponsoredLinks
|
120
|
+
#
|
121
|
+
def ads_with_direct_url(direct_url,&block)
|
122
|
+
if direct_url.kind_of?(Regexp)
|
123
|
+
ads = ads_with { |ad| ad.direct_url =~ direct_url }
|
124
|
+
else
|
125
|
+
ads = ads_with { |ad| ad.direct_url == direct_url }
|
126
|
+
end
|
127
|
+
|
128
|
+
ads.each(&block) if block
|
129
|
+
return ads
|
130
|
+
end
|
131
|
+
|
92
132
|
#
|
93
133
|
# Returns an Array containing the titles of the ads within the
|
94
134
|
# SponsoredLinks.
|
@@ -109,6 +149,16 @@ module GScraper
|
|
109
149
|
map { |ad| ad.url }
|
110
150
|
end
|
111
151
|
|
152
|
+
#
|
153
|
+
# Returns an Array containing the direct URLs of the ads within the
|
154
|
+
# SponsoredLinks.
|
155
|
+
#
|
156
|
+
# sponsored.direct_urls # => [...]
|
157
|
+
#
|
158
|
+
def direct_urls
|
159
|
+
map { |ad| ad.direct_url }
|
160
|
+
end
|
161
|
+
|
112
162
|
#
|
113
163
|
# Iterates over each ad's title within the SponsoredLinks, passing each to
|
114
164
|
# the given _block_.
|
@@ -120,7 +170,7 @@ module GScraper
|
|
120
170
|
end
|
121
171
|
|
122
172
|
#
|
123
|
-
# Iterates over each ad's
|
173
|
+
# Iterates over each ad's URL within the SponsoredLinks, passing each to
|
124
174
|
# the given _block_.
|
125
175
|
#
|
126
176
|
# each_url { |url| puts url }
|
@@ -129,6 +179,16 @@ module GScraper
|
|
129
179
|
urls.each(&block)
|
130
180
|
end
|
131
181
|
|
182
|
+
#
|
183
|
+
# Iterates over each ad's direct URL within the SponsoredLinks, passing
|
184
|
+
# each to the given _block_.
|
185
|
+
#
|
186
|
+
# each_direct_url { |url| puts url }
|
187
|
+
#
|
188
|
+
def each_direct_url(&block)
|
189
|
+
direct_urls.each(&block)
|
190
|
+
end
|
191
|
+
|
132
192
|
#
|
133
193
|
# Returns the titles of the ads that match the specified _block_.
|
134
194
|
#
|
@@ -139,7 +199,7 @@ module GScraper
|
|
139
199
|
end
|
140
200
|
|
141
201
|
#
|
142
|
-
# Returns the
|
202
|
+
# Returns the URLs of the ads that match the specified _block_.
|
143
203
|
#
|
144
204
|
# sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
|
145
205
|
#
|
@@ -147,5 +207,14 @@ module GScraper
|
|
147
207
|
ads_with(&block).urls
|
148
208
|
end
|
149
209
|
|
210
|
+
#
|
211
|
+
# Returns the direct URLs of the ads that match the specified _block_.
|
212
|
+
#
|
213
|
+
# sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
|
214
|
+
#
|
215
|
+
def direct_urls_of(&block)
|
216
|
+
ads_with(&block).direct_urls
|
217
|
+
end
|
218
|
+
|
150
219
|
end
|
151
220
|
end
|
data/lib/gscraper/version.rb
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
#
|
2
|
+
#--
|
3
|
+
# GScraper - A web-scraping interface to various Google Services.
|
4
|
+
#
|
5
|
+
# Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
|
6
|
+
#
|
7
|
+
# This program is free software; you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation; either version 2 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with this program; if not, write to the Free Software
|
19
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
20
|
+
#++
|
21
|
+
#
|
22
|
+
|
1
23
|
module GScraper
|
2
|
-
VERSION = '0.1.7'
|
24
|
+
VERSION = '0.2.0'
|
3
25
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'gscraper/extensions/uri'
|
4
|
+
|
5
|
+
describe "URI::QueryParams" do
|
6
|
+
before(:each) do
|
7
|
+
@uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should provide #query_params" do
|
11
|
+
@uri.should respond_to(:query_params)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "#query_params should be a Hash" do
|
15
|
+
@uri.query_params.class.should == Hash
|
16
|
+
end
|
17
|
+
|
18
|
+
it "#query_params should contain params" do
|
19
|
+
@uri.query_params.empty?.should == false
|
20
|
+
end
|
21
|
+
|
22
|
+
it "#query_params can contain single-word params" do
|
23
|
+
@uri.query_params['x'].should == '1'
|
24
|
+
end
|
25
|
+
|
26
|
+
it "#query_params can contain multi-word params" do
|
27
|
+
@uri.query_params['y'].should == 'one two'
|
28
|
+
end
|
29
|
+
|
30
|
+
it "#query_params can contain empty params" do
|
31
|
+
@uri.query_params['z'].should be_nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should update #query_params along with #query=" do
|
35
|
+
@uri.query = 'u=3'
|
36
|
+
@uri.query_params['u'].should == '3'
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'gscraper/gscraper'
|
4
|
+
|
5
|
+
describe "GScraper" do
|
6
|
+
describe "User-Agent support" do
|
7
|
+
it "should have a default User-Agent string" do
|
8
|
+
GScraper.user_agent.should_not be_nil
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "Proxy support" do
|
13
|
+
it "should provide a :host key" do
|
14
|
+
GScraper.proxy.has_key?(:host).should == true
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should provide a :port key" do
|
18
|
+
GScraper.proxy.has_key?(:port).should == true
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should provide a :user key" do
|
22
|
+
GScraper.proxy.has_key?(:user).should == true
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should provide a :password key" do
|
26
|
+
GScraper.proxy.has_key?(:password).should == true
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "has Pages" do
|
4
|
+
|
5
|
+
it "should have a first page" do
|
6
|
+
@query.first_page.should_not be_nil
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should allow indexed access" do
|
10
|
+
@query[1].should_not be_nil
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should allow accessing multiple pages" do
|
14
|
+
pages = @query.pages(1..2)
|
15
|
+
pages.should_not be_nil
|
16
|
+
pages.length.should == 2
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "has Sponsored Links" do
|
4
|
+
|
5
|
+
it "should have ads" do
|
6
|
+
@links.length.should_not == 0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have titles" do
|
10
|
+
@links.each_title do |title|
|
11
|
+
title.should_not be_nil
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should have non-empty titles" do
|
16
|
+
@links.each_title do |title|
|
17
|
+
title.length.should_not == 0
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should have URLs" do
|
22
|
+
@links.each_url do |url|
|
23
|
+
url.should_not be_nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should have non-empty URLs" do
|
28
|
+
@links.each_url do |url|
|
29
|
+
url.length.should_not == 0
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have valid URLs" do
|
34
|
+
@links.each_url do |url|
|
35
|
+
url_should_be_valid(url)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should have direct URLs" do
|
40
|
+
@links.each_direct_url do |url|
|
41
|
+
url.should_not be_nil
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should have non-empty direct URLs" do
|
46
|
+
@links.each_direct_url do |url|
|
47
|
+
url.length.should_not == 0
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should have valid direct URLs" do
|
52
|
+
@links.each_direct_url do |url|
|
53
|
+
url_should_be_valid(url)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
DEFAULT_QUERY = 'Ruby'
|
data/spec/helpers/uri.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
shared_examples_for "Page has Results" do
|
4
|
+
|
5
|
+
it "should have results" do
|
6
|
+
@page.length.should_not == 0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should have the maximum amount of results per page" do
|
10
|
+
@page.length.should == @query.results_per_page
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'has_pages_examples'
|
3
|
+
require 'page_has_results_examples'
|
4
|
+
require 'search/page_has_results_examples'
|
5
|
+
|
6
|
+
require 'gscraper/search/ajax_query'
|
7
|
+
|
8
|
+
describe GScraper::Search::AJAXQuery do
|
9
|
+
|
10
|
+
before(:all) do
|
11
|
+
@query = GScraper::Search::AJAXQuery.new(:query => DEFAULT_QUERY)
|
12
|
+
@page = @query.first_page
|
13
|
+
end
|
14
|
+
|
15
|
+
it_should_behave_like "has Pages"
|
16
|
+
it_should_behave_like "Page has Results"
|
17
|
+
it_should_behave_like "Page has Search Results"
|
18
|
+
|
19
|
+
describe "Search URL" do
|
20
|
+
|
21
|
+
before(:all) do
|
22
|
+
@uri = @query.search_url
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should be a valid HTTP URI" do
|
26
|
+
@uri.class.should == URI::HTTP
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should be a RESTful AJAX Search URL" do
|
30
|
+
@uri.path.should == '/uds/GwebSearch'
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have the default 'callback' query-param" do
|
34
|
+
callback = @uri.query_params['callback']
|
35
|
+
callback.should == 'google.search.WebSearch.RawCompletion'
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should have the default 'context' query-param" do
|
39
|
+
@uri.query_params['context'].should == '0'
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should have a default 'lstkp' query-param" do
|
43
|
+
@uri.query_params['lstkp'].should == '0'
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should have a default 'rsz' query-param of 'large'" do
|
47
|
+
@uri.query_params['rsz'].should == 'large'
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should have a default 'hl' query-param" do
|
51
|
+
hl = @uri.query_params['hl']
|
52
|
+
hl.should == GScraper::Search::AJAXQuery::DEFAULT_LANGUAGE
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should have a default 'gss' query-param of '.com'" do
|
56
|
+
@uri.query_params['gss'].should == '.com'
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should have a 'q' query-param" do
|
60
|
+
@uri.query_params['q'].should == DEFAULT_QUERY
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should have a default 'sig' query-param" do
|
64
|
+
sig = @uri.query_params['sig']
|
65
|
+
sig.should == GScraper::Search::AJAXQuery::DEFAULT_SIG
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should have a default 'key' query-param" do
|
69
|
+
key = @uri.query_params['key']
|
70
|
+
key.should == GScraper::Search::AJAXQuery::DEFAULT_KEY
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should have a default 'v' query-param" do
|
74
|
+
v = @uri.query_params['v']
|
75
|
+
v.should == GScraper::Search::AJAXQuery::DEFAULT_VERSION
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
describe "page specific URLs" do
|
81
|
+
|
82
|
+
before(:all) do
|
83
|
+
@uri = @query.page_url(2)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should have a 'start' query-param" do
|
87
|
+
@uri.query_params['start'].should == @query.results_per_page
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
describe "queries from AJAX search URLs" do
|
93
|
+
|
94
|
+
before(:all) do
|
95
|
+
@version = '1.0'
|
96
|
+
@language = 'en'
|
97
|
+
@sig = '582c1116317355adf613a6a843f19ece'
|
98
|
+
@key = 'notsupplied'
|
99
|
+
@query = GScraper::Search::AJAXQuery.from_url("http://www.google.com/uds/GwebSearch?v=#{@version}&lstkp=0&rsz=large&hl=#{@language}&callback=google.search.WebSearch.RawCompletion&sig=#{@sig}&q=#{DEFAULT_QUERY}&gss=.com&context=0&key=#{@key}")
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should have a version" do
|
103
|
+
@query.version.should == @version
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should have a language" do
|
107
|
+
@query.language.should == @language
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should have a sig" do
|
111
|
+
@query.sig.should == @sig
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should have a key" do
|
115
|
+
@query.key.should == @key
|
116
|
+
end
|
117
|
+
|
118
|
+
it "should have a query" do
|
119
|
+
@query.query.should == DEFAULT_QUERY
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
|
124
|
+
end
|