gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/extensions/uri'
2
24
 
3
25
  module GScraper
@@ -14,14 +36,16 @@ module GScraper
14
36
  #
15
37
  def initialize(title,url)
16
38
  @title = title
17
- @url = URI.parse(url)
39
+ @url = url
18
40
  end
19
41
 
20
42
  #
21
43
  # Returns the direct URL of the ad.
22
44
  #
23
45
  def direct_url
24
- @url.query_params['adurl'] || @url.query_params['q']
46
+ uri = URI(@url)
47
+
48
+ return (uri.query_params['adurl'] || uri.query_params['q'])
25
49
  end
26
50
 
27
51
  #
@@ -1,12 +1,38 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  require 'gscraper/sponsored_ad'
2
24
 
3
25
  module GScraper
4
26
  class SponsoredLinks < Array
5
27
  #
6
- # Creates a new SponsoredLinks object with the given _ads_.
28
+ # Creates a new SponsoredLinks object with the given _ads_. If a
29
+ # _block_ is given, it will be passed the newly created SponsoredLinks
30
+ # object.
7
31
  #
8
- def initialize(ads=[])
32
+ def initialize(ads=[],&block)
9
33
  super(ads)
34
+
35
+ block.call(self) if block
10
36
  end
11
37
 
12
38
  #
@@ -74,10 +100,6 @@ module GScraper
74
100
  #
75
101
  # sponsored.ads_with_url(/\.com/) # => SponsoredLinks
76
102
  #
77
- # sponsored.ads_with_url(/^https:\/\//) do |ad|
78
- # puts ad.title
79
- # end
80
- #
81
103
  def ads_with_url(url,&block)
82
104
  if url.kind_of?(Regexp)
83
105
  ads = ads_with { |ad| ad.url =~ url }
@@ -89,6 +111,24 @@ module GScraper
89
111
  return ads
90
112
  end
91
113
 
114
+ #
115
+ # Selects the ads with the matching _direct_url_. The _direct_url_ may
116
+ # be either a String or a Regexp. If _block_ is given, each matching
117
+ # ad will be passed to the _block_.
118
+ #
119
+ # sponsored.ads_with_direct_url(/\.com/) # => SponsoredLinks
120
+ #
121
+ def ads_with_direct_url(direct_url,&block)
122
+ if direct_url.kind_of?(Regexp)
123
+ ads = ads_with { |ad| ad.direct_url =~ direct_url }
124
+ else
125
+ ads = ads_with { |ad| ad.direct_url == direct_url }
126
+ end
127
+
128
+ ads.each(&block) if block
129
+ return ads
130
+ end
131
+
92
132
  #
93
133
  # Returns an Array containing the titles of the ads within the
94
134
  # SponsoredLinks.
@@ -109,6 +149,16 @@ module GScraper
109
149
  map { |ad| ad.url }
110
150
  end
111
151
 
152
+ #
153
+ # Returns an Array containing the direct URLs of the ads within the
154
+ # SponsoredLinks.
155
+ #
156
+ # sponsored.direct_urls # => [...]
157
+ #
158
+ def direct_urls
159
+ map { |ad| ad.direct_url }
160
+ end
161
+
112
162
  #
113
163
  # Iterates over each ad's title within the SponsoredLinks, passing each to
114
164
  # the given _block_.
@@ -120,7 +170,7 @@ module GScraper
120
170
  end
121
171
 
122
172
  #
123
- # Iterates over each ad's url within the SponsoredLinks, passing each to
173
+ # Iterates over each ad's URL within the SponsoredLinks, passing each to
124
174
  # the given _block_.
125
175
  #
126
176
  # each_url { |url| puts url }
@@ -129,6 +179,16 @@ module GScraper
129
179
  urls.each(&block)
130
180
  end
131
181
 
182
+ #
183
+ # Iterates over each ad's direct URL within the SponsoredLinks, passing
184
+ # each to the given _block_.
185
+ #
186
+ # each_direct_url { |url| puts url }
187
+ #
188
+ def each_direct_url(&block)
189
+ direct_urls.each(&block)
190
+ end
191
+
132
192
  #
133
193
  # Returns the titles of the ads that match the specified _block_.
134
194
  #
@@ -139,7 +199,7 @@ module GScraper
139
199
  end
140
200
 
141
201
  #
142
- # Returns the urls of the ads that match the specified _block_.
202
+ # Returns the URLs of the ads that match the specified _block_.
143
203
  #
144
204
  # sponsored.urls_of { |ad| ad.title =~ /buy these pants/ }
145
205
  #
@@ -147,5 +207,14 @@ module GScraper
147
207
  ads_with(&block).urls
148
208
  end
149
209
 
210
+ #
211
+ # Returns the direct URLs of the ads that match the specified _block_.
212
+ #
213
+ # sponsored.direct_urls_of { |ad| ad.title =~ /buy these pants/ }
214
+ #
215
+ def direct_urls_of(&block)
216
+ ads_with(&block).direct_urls
217
+ end
218
+
150
219
  end
151
220
  end
@@ -1,3 +1,25 @@
1
+ #
2
+ #--
3
+ # GScraper - A web-scraping interface to various Google Services.
4
+ #
5
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
6
+ #
7
+ # This program is free software; you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation; either version 2 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with this program; if not, write to the Free Software
19
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
+ #++
21
+ #
22
+
1
23
  module GScraper
2
- VERSION = '0.1.7'
24
+ VERSION = '0.2.0'
3
25
  end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/extensions/uri'
4
+
5
+ describe URI::HTTP do
6
+ it "should include QueryParams" do
7
+ URI::HTTP.include?(URI::QueryParams).should == true
8
+ end
9
+ end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/extensions/uri'
4
+
5
+ describe "URI::QueryParams" do
6
+ before(:each) do
7
+ @uri = URI('http://www.test.com/page.php?x=1&y=one%20two&z')
8
+ end
9
+
10
+ it "should provide #query_params" do
11
+ @uri.should respond_to(:query_params)
12
+ end
13
+
14
+ it "#query_params should be a Hash" do
15
+ @uri.query_params.class.should == Hash
16
+ end
17
+
18
+ it "#query_params should contain params" do
19
+ @uri.query_params.empty?.should == false
20
+ end
21
+
22
+ it "#query_params can contain single-word params" do
23
+ @uri.query_params['x'].should == '1'
24
+ end
25
+
26
+ it "#query_params can contain multi-word params" do
27
+ @uri.query_params['y'].should == 'one two'
28
+ end
29
+
30
+ it "#query_params can contain empty params" do
31
+ @uri.query_params['z'].should be_nil
32
+ end
33
+
34
+ it "should update #query_params along with #query=" do
35
+ @uri.query = 'u=3'
36
+ @uri.query_params['u'].should == '3'
37
+ end
38
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/gscraper'
4
+
5
+ describe "GScraper" do
6
+ describe "User-Agent support" do
7
+ it "should have a default User-Agent string" do
8
+ GScraper.user_agent.should_not be_nil
9
+ end
10
+ end
11
+
12
+ describe "Proxy support" do
13
+ it "should provide a :host key" do
14
+ GScraper.proxy.has_key?(:host).should == true
15
+ end
16
+
17
+ it "should provide a :port key" do
18
+ GScraper.proxy.has_key?(:port).should == true
19
+ end
20
+
21
+ it "should provide a :user key" do
22
+ GScraper.proxy.has_key?(:user).should == true
23
+ end
24
+
25
+ it "should provide a :password key" do
26
+ GScraper.proxy.has_key?(:password).should == true
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "has Pages" do
4
+
5
+ it "should have a first page" do
6
+ @query.first_page.should_not be_nil
7
+ end
8
+
9
+ it "should allow indexed access" do
10
+ @query[1].should_not be_nil
11
+ end
12
+
13
+ it "should allow accessing multiple pages" do
14
+ pages = @query.pages(1..2)
15
+ pages.should_not be_nil
16
+ pages.length.should == 2
17
+ end
18
+
19
+ end
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "has Sponsored Links" do
4
+
5
+ it "should have ads" do
6
+ @links.length.should_not == 0
7
+ end
8
+
9
+ it "should have titles" do
10
+ @links.each_title do |title|
11
+ title.should_not be_nil
12
+ end
13
+ end
14
+
15
+ it "should have non-empty titles" do
16
+ @links.each_title do |title|
17
+ title.length.should_not == 0
18
+ end
19
+ end
20
+
21
+ it "should have URLs" do
22
+ @links.each_url do |url|
23
+ url.should_not be_nil
24
+ end
25
+ end
26
+
27
+ it "should have non-empty URLs" do
28
+ @links.each_url do |url|
29
+ url.length.should_not == 0
30
+ end
31
+ end
32
+
33
+ it "should have valid URLs" do
34
+ @links.each_url do |url|
35
+ url_should_be_valid(url)
36
+ end
37
+ end
38
+
39
+ it "should have direct URLs" do
40
+ @links.each_direct_url do |url|
41
+ url.should_not be_nil
42
+ end
43
+ end
44
+
45
+ it "should have non-empty direct URLs" do
46
+ @links.each_direct_url do |url|
47
+ url.length.should_not == 0
48
+ end
49
+ end
50
+
51
+ it "should have valid direct URLs" do
52
+ @links.each_direct_url do |url|
53
+ url_should_be_valid(url)
54
+ end
55
+ end
56
+
57
+ end
@@ -0,0 +1 @@
1
+ DEFAULT_QUERY = 'Ruby'
@@ -0,0 +1,8 @@
1
+ require 'uri'
2
+
3
+ def url_should_be_valid(url)
4
+ uri = URI(url)
5
+ uri.scheme.should_not be_nil
6
+ uri.host.should_not be_nil
7
+ uri.path.should_not be_nil
8
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "Page has Results" do
4
+
5
+ it "should have results" do
6
+ @page.length.should_not == 0
7
+ end
8
+
9
+ it "should have the maximum amount of results per page" do
10
+ @page.length.should == @query.results_per_page
11
+ end
12
+
13
+ end
@@ -0,0 +1,124 @@
1
+ require 'spec_helper'
2
+ require 'has_pages_examples'
3
+ require 'page_has_results_examples'
4
+ require 'search/page_has_results_examples'
5
+
6
+ require 'gscraper/search/ajax_query'
7
+
8
+ describe GScraper::Search::AJAXQuery do
9
+
10
+ before(:all) do
11
+ @query = GScraper::Search::AJAXQuery.new(:query => DEFAULT_QUERY)
12
+ @page = @query.first_page
13
+ end
14
+
15
+ it_should_behave_like "has Pages"
16
+ it_should_behave_like "Page has Results"
17
+ it_should_behave_like "Page has Search Results"
18
+
19
+ describe "Search URL" do
20
+
21
+ before(:all) do
22
+ @uri = @query.search_url
23
+ end
24
+
25
+ it "should be a valid HTTP URI" do
26
+ @uri.class.should == URI::HTTP
27
+ end
28
+
29
+ it "should be a RESTful AJAX Search URL" do
30
+ @uri.path.should == '/uds/GwebSearch'
31
+ end
32
+
33
+ it "should have the default 'callback' query-param" do
34
+ callback = @uri.query_params['callback']
35
+ callback.should == 'google.search.WebSearch.RawCompletion'
36
+ end
37
+
38
+ it "should have the default 'context' query-param" do
39
+ @uri.query_params['context'].should == '0'
40
+ end
41
+
42
+ it "should have a default 'lstkp' query-param" do
43
+ @uri.query_params['lstkp'].should == '0'
44
+ end
45
+
46
+ it "should have a default 'rsz' query-param of 'large'" do
47
+ @uri.query_params['rsz'].should == 'large'
48
+ end
49
+
50
+ it "should have a default 'hl' query-param" do
51
+ hl = @uri.query_params['hl']
52
+ hl.should == GScraper::Search::AJAXQuery::DEFAULT_LANGUAGE
53
+ end
54
+
55
+ it "should have a default 'gss' query-param of '.com'" do
56
+ @uri.query_params['gss'].should == '.com'
57
+ end
58
+
59
+ it "should have a 'q' query-param" do
60
+ @uri.query_params['q'].should == DEFAULT_QUERY
61
+ end
62
+
63
+ it "should have a default 'sig' query-param" do
64
+ sig = @uri.query_params['sig']
65
+ sig.should == GScraper::Search::AJAXQuery::DEFAULT_SIG
66
+ end
67
+
68
+ it "should have a default 'key' query-param" do
69
+ key = @uri.query_params['key']
70
+ key.should == GScraper::Search::AJAXQuery::DEFAULT_KEY
71
+ end
72
+
73
+ it "should have a default 'v' query-param" do
74
+ v = @uri.query_params['v']
75
+ v.should == GScraper::Search::AJAXQuery::DEFAULT_VERSION
76
+ end
77
+
78
+ end
79
+
80
+ describe "page specific URLs" do
81
+
82
+ before(:all) do
83
+ @uri = @query.page_url(2)
84
+ end
85
+
86
+ it "should have a 'start' query-param" do
87
+ @uri.query_params['start'].should == @query.results_per_page
88
+ end
89
+
90
+ end
91
+
92
+ describe "queries from AJAX search URLs" do
93
+
94
+ before(:all) do
95
+ @version = '1.0'
96
+ @language = 'en'
97
+ @sig = '582c1116317355adf613a6a843f19ece'
98
+ @key = 'notsupplied'
99
+ @query = GScraper::Search::AJAXQuery.from_url("http://www.google.com/uds/GwebSearch?v=#{@version}&lstkp=0&rsz=large&hl=#{@language}&callback=google.search.WebSearch.RawCompletion&sig=#{@sig}&q=#{DEFAULT_QUERY}&gss=.com&context=0&key=#{@key}")
100
+ end
101
+
102
+ it "should have a version" do
103
+ @query.version.should == @version
104
+ end
105
+
106
+ it "should have a language" do
107
+ @query.language.should == @language
108
+ end
109
+
110
+ it "should have a sig" do
111
+ @query.sig.should == @sig
112
+ end
113
+
114
+ it "should have a key" do
115
+ @query.key.should == @key
116
+ end
117
+
118
+ it "should have a query" do
119
+ @query.query.should == DEFAULT_QUERY
120
+ end
121
+
122
+ end
123
+
124
+ end