gscraper 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri'
2
24
 
3
25
  module GScraper
@@ -14,14 +36,16 @@ module GScraper
14
36
  #
15
37
  def initialize(title,url)
16
38
  @title = title
17
- @url = URI.parse(url)
39
+ @url = url
18
40
  end
19
41
 
20
42
  #
21
43
  # Returns the direct URL of the ad.
22
44
  #
23
45
  def direct_url
24
- @url.query_params['adurl'] || @url.query_params['q']
46
+ uri = URI(@url)
47
+
48
+ return (uri.query_params['adurl'] || uri.query_params['q'])
25
49
  end
26
50
 
27
51
  #
@@ -1,12 +1,38 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/sponsored_ad'
2
24
 
3
25
  module GScraper
4
26
  class SponsoredLinks < Array
5
27
  #
6
- # Creates a new SponsoredLinks object with the given _ads_.
28
+ # Creates a new SponsoredLinks object with the given _ads_. If a
29
+ # _block_ is given, it will be passed the newly created SponsoredLinks
30
+ # object.
7
31
  #
8
- def initialize(ads=[])
32
+ def initialize(ads=[],&block)
9
33
  super(ads)
34
+
35
+ block.call(self) if block
10
36
  end
11
37
 
12
38
  #
@@ -74,10 +100,6 @@ module GScraper
74
100
  #
75
101
  # sponsored.ads_with_url(/\.com/) # => SponsoredLinks
76
102
  #
77
- # sponsored.ads_with_url(/^https:\/\//) do |ad|
78
- # puts ad.title
79
- # end
80
- #
81
103
  def ads_with_url(url,&block)
82
104
  if url.kind_of?(Regexp)
83
105
  ads = ads_with { |ad| ad.url =~ url }
@@ -89,6 +111,24 @@ module GScraper
89
111
  return ads
90
112
  end
91
113
 
114
+ #
115
+ # Selects the ads with the matching _direct_url_. The _direct_url_ may
116
+ # be either a String or a Regexp. If _block_ is given, each matching
117
+ # ad will be passed to the _block_.
118
+ #
119
+ # sponsored.ads_with_direct_url(/\.com/) # => SponsoredLinks
120
+ #
121
+ def ads_with_direct_url(direct_url,&block)
122
+ if direct_url.kind_of?(Regexp)
123
+ ads = ads_with { |ad| ad.direct_url =~ direct_url }
124
+ else
125
+ ads = ads_with { |ad| ad.direct_url == direct_url }
126
+ end
127
+
128
+ ads.each(&block) if block
129
+ return ads
130
+ end
131
+
92
132
  #
93
133
  # Returns an Array containing the titles of the ads within the
94
134
  # SponsoredLinks.
@@ -109,6 +149,16 @@ module GScraper
109
149
  map { |ad| ad.url }
110
150
  end
111
151
 
152
+ #
153
+ # Returns an Array containing the direct URLs of the ads within the
154
+ # SponsoredLinks.
155
+ #
156
+ # sponsored.direct_urls # => [...]
157
+ #
158
+ def direct_urls
159
+ map { |ad| ad.direct_url }
160
+ end
161
+
112
162
  #
113
163
  # Iterates over each ad's title within the SponsoredLinks, passing each to
114
164
  # the given _block_.
@@ -120,7 +170,7 @@ module GScraper
120
170
  end
121
171
 
122
172
  #
123
- # Iterates over each ad's url within the SponsoredLinks, passing each to
173
+ # Iterates over each ad's URL within the SponsoredLinks, passing each to
124
174
  # the given _block_.
125
175
  #
126
176
  # each_url { |url| puts url }
@@ -129,6 +179,16 @@ module GScraper
129
179
  urls.each(&block)
130
180
  end
131
181
 
182
+ #
183
+ # Iterates over each ad's direct URL within the SponsoredLinks, passing
184
+ # each to the given _block_.
185
+ #
186
+ # each_direct_url { |url| puts url }
187
+ #
188
+ def each_direct_url(&block)
189
+ direct_urls.each(&block)
190
+ end
191
+
132
192
  #
133
193
  # Returns the titles of the ads that match the specified _block_.
134
194
  #
@@ -139,7 +199,7 @@ module GScraper
139
199
  end
140
200
 
141
201
  #
142
- # Returns the urls of the ads that match the specified _block_.
202
+ # Returns the URLs of the ads that match the specified _block_.
143
203
  #
144
204
  # sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
145
205
  #
@@ -147,5 +207,14 @@ module GScraper
147
207
  ads_with(&block).urls
148
208
  end
149
209
 
210
+ #
211
+ # Returns the direct URLs of the ads that match the specified _block_.
212
+ #
213
+ # sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
214
+ #
215
+ def direct_urls_of(&block)
216
+ ads_with(&block).direct_urls
217
+ end
218
+
150
219
  end
151
220
  end
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  module GScraper
2
- VERSION = '0.1.7'
24
+ VERSION = '0.2.0'
3
25
  end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/extensions/uri'
4
+
5
+ describe URI::HTTP do
6
+ it "should include QueryParams" do
7
+ URI::HTTP.include?(URI::QueryParams).should == true
8
+ end
9
+ end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/extensions/uri'
4
+
5
+ describe "URI::QueryParams" do
6
+ before(:each) do
7
+ @uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
8
+ end
9
+
10
+ it "should provide #query_params" do
11
+ @uri.should respond_to(:query_params)
12
+ end
13
+
14
+ it "#query_params should be a Hash" do
15
+ @uri.query_params.class.should == Hash
16
+ end
17
+
18
+ it "#query_params should contain params" do
19
+ @uri.query_params.empty?.should == false
20
+ end
21
+
22
+ it "#query_params can contain single-word params" do
23
+ @uri.query_params['x'].should == '1'
24
+ end
25
+
26
+ it "#query_params can contain multi-word params" do
27
+ @uri.query_params['y'].should == 'one two'
28
+ end
29
+
30
+ it "#query_params can contain empty params" do
31
+ @uri.query_params['z'].should be_nil
32
+ end
33
+
34
+ it "should update #query_params along with #query=" do
35
+ @uri.query = 'u=3'
36
+ @uri.query_params['u'].should == '3'
37
+ end
38
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/gscraper'
4
+
5
+ describe "GScraper" do
6
+ describe "User-Agent support" do
7
+ it "should have a default User-Agent string" do
8
+ GScraper.user_agent.should_not be_nil
9
+ end
10
+ end
11
+
12
+ describe "Proxy support" do
13
+ it "should provide a :host key" do
14
+ GScraper.proxy.has_key?(:host).should == true
15
+ end
16
+
17
+ it "should provide a :port key" do
18
+ GScraper.proxy.has_key?(:port).should == true
19
+ end
20
+
21
+ it "should provide a :user key" do
22
+ GScraper.proxy.has_key?(:user).should == true
23
+ end
24
+
25
+ it "should provide a :password key" do
26
+ GScraper.proxy.has_key?(:password).should == true
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "has Pages" do
4
+
5
+ it "should have a first page" do
6
+ @query.first_page.should_not be_nil
7
+ end
8
+
9
+ it "should allow indexed access" do
10
+ @query[1].should_not be_nil
11
+ end
12
+
13
+ it "should allow accessing multiple pages" do
14
+ pages = @query.pages(1..2)
15
+ pages.should_not be_nil
16
+ pages.length.should == 2
17
+ end
18
+
19
+ end
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "has Sponsored Links" do
4
+
5
+ it "should have ads" do
6
+ @links.length.should_not == 0
7
+ end
8
+
9
+ it "should have titles" do
10
+ @links.each_title do |title|
11
+ title.should_not be_nil
12
+ end
13
+ end
14
+
15
+ it "should have non-empty titles" do
16
+ @links.each_title do |title|
17
+ title.length.should_not == 0
18
+ end
19
+ end
20
+
21
+ it "should have URLs" do
22
+ @links.each_url do |url|
23
+ url.should_not be_nil
24
+ end
25
+ end
26
+
27
+ it "should have non-empty URLs" do
28
+ @links.each_url do |url|
29
+ url.length.should_not == 0
30
+ end
31
+ end
32
+
33
+ it "should have valid URLs" do
34
+ @links.each_url do |url|
35
+ url_should_be_valid(url)
36
+ end
37
+ end
38
+
39
+ it "should have direct URLs" do
40
+ @links.each_direct_url do |url|
41
+ url.should_not be_nil
42
+ end
43
+ end
44
+
45
+ it "should have non-empty direct URLs" do
46
+ @links.each_direct_url do |url|
47
+ url.length.should_not == 0
48
+ end
49
+ end
50
+
51
+ it "should have valid direct URLs" do
52
+ @links.each_direct_url do |url|
53
+ url_should_be_valid(url)
54
+ end
55
+ end
56
+
57
+ end
@@ -0,0 +1 @@
1
+ DEFAULT_QUERY = 'Ruby'
@@ -0,0 +1,8 @@
1
+ require 'uri'
2
+
3
+ def url_should_be_valid(url)
4
+ uri = URI(url)
5
+ uri.scheme.should_not be_nil
6
+ uri.host.should_not be_nil
7
+ uri.path.should_not be_nil
8
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "Page has Results" do
4
+
5
+ it "should have results" do
6
+ @page.length.should_not == 0
7
+ end
8
+
9
+ it "should have the maximum amount of results per page" do
10
+ @page.length.should == @query.results_per_page
11
+ end
12
+
13
+ end
@@ -0,0 +1,124 @@
1
+ require 'spec_helper'
2
+ require 'has_pages_examples'
3
+ require 'page_has_results_examples'
4
+ require 'search/page_has_results_examples'
5
+
6
+ require 'gscraper/search/ajax_query'
7
+
8
+ describe GScraper::Search::AJAXQuery do
9
+
10
+ before(:all) do
11
+ @query = GScraper::Search::AJAXQuery.new(:query => DEFAULT_QUERY)
12
+ @page = @query.first_page
13
+ end
14
+
15
+ it_should_behave_like "has Pages"
16
+ it_should_behave_like "Page has Results"
17
+ it_should_behave_like "Page has Search Results"
18
+
19
+ describe "Search URL" do
20
+
21
+ before(:all) do
22
+ @uri = @query.search_url
23
+ end
24
+
25
+ it "should be a valid HTTP URI" do
26
+ @uri.class.should == URI::HTTP
27
+ end
28
+
29
+ it "should be a RESTful AJAX Search URL" do
30
+ @uri.path.should == '/uds/GwebSearch'
31
+ end
32
+
33
+ it "should have the default 'callback' query-param" do
34
+ callback = @uri.query_params['callback']
35
+ callback.should == 'google.search.WebSearch.RawCompletion'
36
+ end
37
+
38
+ it "should have the default 'context' query-param" do
39
+ @uri.query_params['context'].should == '0'
40
+ end
41
+
42
+ it "should have a default 'lstkp' query-param" do
43
+ @uri.query_params['lstkp'].should == '0'
44
+ end
45
+
46
+ it "should have a default 'rsz' query-param of 'large'" do
47
+ @uri.query_params['rsz'].should == 'large'
48
+ end
49
+
50
+ it "should have a default 'hl' query-param" do
51
+ hl = @uri.query_params['hl']
52
+ hl.should == GScraper::Search::AJAXQuery::DEFAULT_LANGUAGE
53
+ end
54
+
55
+ it "should have a default 'gss' query-param of '.com'" do
56
+ @uri.query_params['gss'].should == '.com'
57
+ end
58
+
59
+ it "should have a 'q' query-param" do
60
+ @uri.query_params['q'].should == DEFAULT_QUERY
61
+ end
62
+
63
+ it "should have a default 'sig' query-param" do
64
+ sig = @uri.query_params['sig']
65
+ sig.should == GScraper::Search::AJAXQuery::DEFAULT_SIG
66
+ end
67
+
68
+ it "should have a default 'key' query-param" do
69
+ key = @uri.query_params['key']
70
+ key.should == GScraper::Search::AJAXQuery::DEFAULT_KEY
71
+ end
72
+
73
+ it "should have a default 'v' query-param" do
74
+ v = @uri.query_params['v']
75
+ v.should == GScraper::Search::AJAXQuery::DEFAULT_VERSION
76
+ end
77
+
78
+ end
79
+
80
+ describe "page specific URLs" do
81
+
82
+ before(:all) do
83
+ @uri = @query.page_url(2)
84
+ end
85
+
86
+ it "should have a 'start' query-param" do
87
+ @uri.query_params['start'].should == @query.results_per_page
88
+ end
89
+
90
+ end
91
+
92
+ describe "queries from AJAX search URLs" do
93
+
94
+ before(:all) do
95
+ @version = '1.0'
96
+ @language = 'en'
97
+ @sig = '582c1116317355adf613a6a843f19ece'
98
+ @key = 'notsupplied'
99
+ @query = GScraper::Search::AJAXQuery.from_url("http://www.google.com/uds/GwebSearch?v=#{@version}&lstkp=0&rsz=large&hl=#{@language}&callback=google.search.WebSearch.RawCompletion&sig=#{@sig}&q=#{DEFAULT_QUERY}&gss=.com&context=0&key=#{@key}")
100
+ end
101
+
102
+ it "should have a version" do
103
+ @query.version.should == @version
104
+ end
105
+
106
+ it "should have a language" do
107
+ @query.language.should == @language
108
+ end
109
+
110
+ it "should have a sig" do
111
+ @query.sig.should == @sig
112
+ end
113
+
114
+ it "should have a key" do
115
+ @query.key.should == @key
116
+ end
117
+
118
+ it "should have a query" do
119
+ @query.query.should == DEFAULT_QUERY
120
+ end
121
+
122
+ end
123
+
124
+ end