gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "Page has Search Results" do
4
+
5
+ it "should have incremental ranks" do
6
+ ranks = @page.ranks
7
+
8
+ (0..(ranks.length - 2)).each do |index|
9
+ ranks[index].should < ranks[index + 1]
10
+ end
11
+ end
12
+
13
+ it "should have titles" do
14
+ @page.each_title do |title|
15
+ title.should_not be_nil
16
+ end
17
+ end
18
+
19
+ it "should have non-empty titles" do
20
+ @page.each_title do |title|
21
+ title.length.should_not == 0
22
+ end
23
+ end
24
+
25
+ it "should have URLs" do
26
+ @page.each_url do |url|
27
+ url.should_not be_nil
28
+ end
29
+ end
30
+
31
+ it "should have non-empty URLs" do
32
+ @page.each_url do |url|
33
+ url.length.should_not == 0
34
+ end
35
+ end
36
+
37
+ it "should have valid URLs" do
38
+ @page.each_url do |url|
39
+ url_should_be_valid(url)
40
+ end
41
+ end
42
+
43
+ it "should have atleast one cached URL" do
44
+ @page.cached_urls.should_not == 0
45
+ end
46
+
47
+ it "should have atleast one similar query URL" do
48
+ @page.similar_urls.should_not == 0
49
+ end
50
+
51
+ end
@@ -0,0 +1,103 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/search/query'
4
+
5
+ describe GScraper::Search::Query do
6
+
7
+ it "should support basic queries" do
8
+ expr = 'ruby -blog'
9
+ query = GScraper::Search::Query.new(:query => expr)
10
+ query.expression.should == expr
11
+ end
12
+
13
+ it "should support the 'link' modifier" do
14
+ url = 'www.wired.com/'
15
+ query = GScraper::Search::Query.new(:link => url)
16
+ query.expression.should == "link:#{url}"
17
+ end
18
+
19
+ it "should support the 'related' modifier" do
20
+ url = 'www.rubyinside.com'
21
+ query = GScraper::Search::Query.new(:related => url)
22
+ query.expression.should == "related:#{url}"
23
+ end
24
+
25
+ it "should support the 'info' modifier" do
26
+ url = "www.rspec.info"
27
+ query = GScraper::Search::Query.new(:info => url)
28
+ query.expression.should == "info:#{url}"
29
+ end
30
+
31
+ it "should support the 'site' modifier" do
32
+ url = "www.ruby-lang.net"
33
+ query = GScraper::Search::Query.new(:site => url)
34
+ query.expression.should == "site:#{url}"
35
+ end
36
+
37
+ it "should support the 'filetype' modifier" do
38
+ file_type = 'rss'
39
+ query = GScraper::Search::Query.new(:filetype => file_type)
40
+ query.expression.should == "filetype:#{file_type}"
41
+ end
42
+
43
+ it "should support 'allintitle' options" do
44
+ words = ['one', 'two', 'three']
45
+ query = GScraper::Search::Query.new(:allintitle => words)
46
+ query.expression.should == "allintitle:#{words.join(' ')}"
47
+ end
48
+
49
+ it "should support the 'intitle' modifier" do
50
+ word = 'coffee'
51
+ query = GScraper::Search::Query.new(:intitle => word)
52
+ query.expression.should == "intitle:#{word}"
53
+ end
54
+
55
+ it "should support 'allinurl' options" do
56
+ params = ['search', 'id', 'page']
57
+ query = GScraper::Search::Query.new(:allinurl => params)
58
+ query.expression.should == "allinurl:#{params.join(' ')}"
59
+ end
60
+
61
+ it "should support the 'inurl' modifier" do
62
+ param = 'id'
63
+ query = GScraper::Search::Query.new(:inurl => param)
64
+ query.expression.should == "inurl:#{param}"
65
+ end
66
+
67
+ it "should support 'allintext' options" do
68
+ words = ['dog', 'blog', 'log']
69
+ query = GScraper::Search::Query.new(:allintext => words)
70
+ query.expression.should == "allintext:#{words.join(' ')}"
71
+ end
72
+
73
+ it "should support the 'intext' modifier" do
74
+ word = 'word'
75
+ query = GScraper::Search::Query.new(:intext => word)
76
+ query.expression.should == "intext:#{word}"
77
+ end
78
+
79
+ it "should support 'exact phrases'" do
80
+ phrase = 'how do you do?'
81
+ query = GScraper::Search::Query.new(:exact_phrase => phrase)
82
+ query.expression.should == "\"#{phrase}\""
83
+ end
84
+
85
+ it "should support 'with words'" do
86
+ words = ['one', 'two', 'three']
87
+ query = GScraper::Search::Query.new(:with_words => words)
88
+ query.expression.should == words.join(' OR ')
89
+ end
90
+
91
+ it "should support 'without words'" do
92
+ words = ['bla', 'haha', 'spam']
93
+ query = GScraper::Search::Query.new(:without_words => words)
94
+ query.expression.should == words.map { |word| "-#{word}" }.join(' ')
95
+ end
96
+
97
+ it "should support 'numeric range'" do
98
+ range = (3..8)
99
+ query = GScraper::Search::Query.new(:numeric_range => range)
100
+ query.expression.should == "#{range.begin}..#{range.end}"
101
+ end
102
+
103
+ end
@@ -0,0 +1,74 @@
1
+ require 'spec_helper'
2
+ require 'has_pages_examples'
3
+ require 'page_has_results_examples'
4
+ require 'has_sponsored_links_examples'
5
+ require 'search/page_has_results_examples'
6
+
7
+ require 'gscraper/search/web_query'
8
+
9
+ describe GScraper::Search::WebQuery do
10
+
11
+ before(:all) do
12
+ @query = GScraper::Search::WebQuery.new(:query => DEFAULT_QUERY)
13
+ @page = @query.first_page
14
+ @links = @query.sponsored_links
15
+ end
16
+
17
+ it_should_behave_like "has Pages"
18
+ it_should_behave_like "Page has Results"
19
+ it_should_behave_like "Page has Search Results"
20
+ it_should_behave_like "has Sponsored Links"
21
+
22
+ describe "Search URL" do
23
+
24
+ before(:all) do
25
+ @uri = @query.search_url
26
+ end
27
+
28
+ it "should be a valid HTTP URI" do
29
+ @uri.class.should == URI::HTTP
30
+ end
31
+
32
+ it "should have a 'q' query-param" do
33
+ @uri.query_params['q'].should == DEFAULT_QUERY
34
+ end
35
+
36
+ it "should have a 'num' query-param" do
37
+ @uri.query_params['num'].should == @query.results_per_page
38
+ end
39
+
40
+ end
41
+
42
+ describe "page specific URLs" do
43
+
44
+ before(:all) do
45
+ @uri = @query.page_url(2)
46
+ end
47
+
48
+ it "should have a 'start' query-param" do
49
+ @uri.query_params['start'].should == @query.results_per_page
50
+ end
51
+
52
+ it "should have a 'sa' query-param" do
53
+ @uri.query_params['sa'].should == 'N'
54
+ end
55
+
56
+ end
57
+
58
+ describe "queries from Web search URLs" do
59
+
60
+ before(:all) do
61
+ @query = GScraper::Search::WebQuery.from_url("http://www.google.com/search?sa=N&start=0&q=#{DEFAULT_QUERY}&num=20")
62
+ end
63
+
64
+ it "should have a results-per-page" do
65
+ @query.results_per_page.should == 20
66
+ end
67
+
68
+ it "should have a query" do
69
+ @query.query.should == DEFAULT_QUERY
70
+ end
71
+
72
+ end
73
+
74
+ end
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ gem 'rspec', '>=1.1.3'
3
+ require 'spec'
4
+
5
+ require 'helpers/query'
6
+ require 'helpers/uri'
data/tasks/spec.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'spec/rake/spectask'
2
+
3
+ desc "Run all specifications"
4
+ Spec::Rake::SpecTask.new(:spec) do |t|
5
+ t.libs += ['lib', 'spec']
6
+ t.spec_opts = ['--colour', '--format', 'specdoc']
7
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-04-28 00:00:00 -07:00
12
+ date: 2008-06-21 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -37,47 +37,61 @@ dependencies:
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: 1.5.1
40
+ version: 1.6.0
41
41
  version:
42
- description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides access to search results and ranks. * Provides access to the Sponsored Links. * Provides HTTP access with custom User-Agent strings. * Provides proxy settings for HTTP access. == REQUIREMENTS: * Hpricot * WWW::Mechanize == INSTALL:"
43
- email: postmodern.mod3@gmail.com
42
+ description: GScraper is a web-scraping interface to various Google Services.
43
+ email:
44
+ - postmodern.mod3@gmail.com
44
45
  executables: []
45
46
 
46
47
  extensions: []
47
48
 
48
49
  extra_rdoc_files:
49
50
  - History.txt
50
- - LICENSE.txt
51
+ - COPYING.txt
51
52
  - Manifest.txt
52
53
  - README.txt
53
54
  files:
54
55
  - History.txt
55
- - LICENSE.txt
56
+ - COPYING.txt
56
57
  - Manifest.txt
57
58
  - README.txt
58
59
  - Rakefile
59
- - lib/gscraper.rb
60
- - lib/gscraper/version.rb
61
- - lib/gscraper/gscraper.rb
62
- - lib/gscraper/web_agent.rb
60
+ - lib/gscraper/extensions/uri/query_params.rb
63
61
  - lib/gscraper/extensions/uri/http.rb
64
62
  - lib/gscraper/extensions/uri.rb
65
63
  - lib/gscraper/extensions.rb
66
64
  - lib/gscraper/licenses.rb
65
+ - lib/gscraper/page.rb
66
+ - lib/gscraper/has_pages.rb
67
67
  - lib/gscraper/sponsored_ad.rb
68
68
  - lib/gscraper/sponsored_links.rb
69
69
  - lib/gscraper/search/result.rb
70
70
  - lib/gscraper/search/page.rb
71
71
  - lib/gscraper/search/query.rb
72
+ - lib/gscraper/search/web_query.rb
73
+ - lib/gscraper/search/ajax_query.rb
72
74
  - lib/gscraper/search/search.rb
73
75
  - lib/gscraper/search.rb
74
- - test/test_gscraper.rb
75
- - test/search/query_from_url.rb
76
- - test/search/query_result.rb
77
- - test/search/query_pages.rb
78
- - test/search/page_results.rb
76
+ - lib/gscraper/gscraper.rb
77
+ - lib/gscraper/version.rb
78
+ - lib/gscraper.rb
79
+ - tasks/spec.rb
80
+ - spec/spec_helper.rb
81
+ - spec/helpers/uri.rb
82
+ - spec/helpers/query.rb
83
+ - spec/extensions/uri/query_params_spec.rb
84
+ - spec/extensions/uri/http_spec.rb
85
+ - spec/has_pages_examples.rb
86
+ - spec/page_has_results_examples.rb
87
+ - spec/has_sponsored_links_examples.rb
88
+ - spec/search/page_has_results_examples.rb
89
+ - spec/search/query_spec.rb
90
+ - spec/search/ajax_query_spec.rb
91
+ - spec/search/web_query_spec.rb
92
+ - spec/gscraper_spec.rb
79
93
  has_rdoc: true
80
- homepage: " by Postmodern Modulus III"
94
+ homepage: http://rubyforge.org/projects/gscraper/
81
95
  post_install_message:
82
96
  rdoc_options:
83
97
  - --main
@@ -102,6 +116,6 @@ rubyforge_project: gscraper
102
116
  rubygems_version: 1.1.1
103
117
  signing_key:
104
118
  specification_version: 2
105
- summary: A ruby web-scraping interface to various Google Services
106
- test_files:
107
- - test/test_gscraper.rb
119
+ summary: GScraper is a web-scraping interface to various Google Services.
120
+ test_files: []
121
+
data/LICENSE.txt DELETED
@@ -1,23 +0,0 @@
1
-
2
-
3
- The MIT License
4
-
5
- Copyright (c) 2007 Hal Brodigan
6
-
7
- Permission is hereby granted, free of charge, to any person obtaining a copy
8
- of this software and associated documentation files (the "Software"), to deal
9
- in the Software without restriction, including without limitation the rights
10
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- copies of the Software, and to permit persons to whom the Software is
12
- furnished to do so, subject to the following conditions:
13
-
14
- The above copyright notice and this permission notice shall be included in
15
- all copies or substantial portions of the Software.
16
-
17
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
- THE SOFTWARE.
@@ -1,38 +0,0 @@
1
- require 'gscraper/gscraper'
2
-
3
- module GScraper
4
- module WebAgent
5
- protected
6
-
7
- #
8
- # Returns the WWW::Mechanize agent.
9
- #
10
- def web_agent(&block)
11
- @web_agent ||= GScraper.web_agent
12
-
13
- block.call(@web_agent) if block
14
- return @web_agent
15
- end
16
-
17
- #
18
- # Fetches the specified _url_, with the given _referer_ using the
19
- # web_agent.
20
- #
21
- # get_page('http://www.hackety.org/')
22
- #
23
- def get_page(url,referer=nil,&block)
24
- web_agent.get(url,referer,&block)
25
- end
26
-
27
- #
28
- # Posts the specified _url_ and the given _query_ parameters using the
29
- # web_agent.
30
- #
31
- # post_page('http://www.wired.com/', :q => 'the future')
32
- #
33
- def post_page(url,query={})
34
- web_agent.post(url,query)
35
- end
36
-
37
- end
38
- end
@@ -1,103 +0,0 @@
1
- require 'test/unit'
2
- require 'gscraper/search/page'
3
- require 'gscraper/search/query'
4
-
5
- class PageResults < Test::Unit::TestCase
6
-
7
- include GScraper
8
-
9
- def setup
10
- @query = Search::Query.new(:query => 'ruby')
11
- @page = @query.first_page
12
- end
13
-
14
- def test_results_per_page
15
- assert_equal @page.length, @query.results_per_page
16
- end
17
-
18
- def test_first_result
19
- assert_not_nil @page[0], "First Page for Query 'ruby' does not have a first Result"
20
- end
21
-
22
- def test_last_result
23
- assert_not_nil @page[-1], "First Page for Query 'ruby' does not have a last Result"
24
- end
25
-
26
- def test_ranks
27
- ranks = @page.ranks
28
-
29
- assert_not_nil ranks, "First Page for Query 'ruby' does not have any ranks"
30
-
31
- assert_equal ranks.class, Array, "The ranks of a Page must be an Array"
32
-
33
- assert_equal ranks.empty?, false, "The ranks of the First Page are empty"
34
-
35
- assert_equal ranks.length, @page.length
36
- end
37
-
38
- def test_titles
39
- titles = @page.titles
40
-
41
- assert_not_nil titles, "First Page for Query 'ruby' does not have any titles"
42
-
43
- assert_equal titles.class, Array, "The titles of a Page must be an Array"
44
-
45
- assert_equal titles.empty?, false, "The titles of the First Page are empty"
46
-
47
- assert_equal titles.length, @page.length
48
- end
49
-
50
- def test_urls
51
- urls = @page.urls
52
-
53
- assert_not_nil urls, "First Page for Query 'ruby' does not have any urls"
54
-
55
- assert_equal urls.class, Array, "The urls of a Page must be an Array"
56
-
57
- assert_equal urls.empty?, false, "The urls of the First Page are empty"
58
-
59
- assert_equal urls.length, @page.length
60
- end
61
-
62
- def test_summaries
63
- summaries = @page.summaries
64
-
65
- assert_not_nil summaries, "First Page for Query 'ruby' does not have any summaries"
66
-
67
- assert_equal summaries.class, Array, "The summaries of a Page must be an Array"
68
-
69
- assert_equal summaries.empty?, false, "The summaries of the First Page are empty"
70
-
71
- assert_equal summaries.length, @page.length
72
- end
73
-
74
- def test_cached_urls
75
- cached_urls = @page.cached_urls
76
-
77
- assert_not_nil cached_urls, "First Page for Query 'ruby' does not have any cached_urls"
78
-
79
- assert_equal cached_urls.class, Array, "The cached_urls of a Page must be an Array"
80
-
81
- assert_equal cached_urls.empty?, false, "The cached_urls of the First Page are empty"
82
-
83
- assert_equal cached_urls.length, @page.length
84
- end
85
-
86
- def test_similar_urls
87
- similar_urls = @page.similar_urls
88
-
89
- assert_not_nil similar_urls, "First Page for Query 'ruby' does not have any similar URLs"
90
-
91
- assert_equal similar_urls.class, Array, "The similar URLs of a Page must be an Array"
92
-
93
- assert_equal similar_urls.empty?, false, "The similar URLs of the First Page are empty"
94
-
95
- assert_equal similar_urls.length, @page.length
96
- end
97
-
98
- def teardown
99
- @page = nil
100
- @query = nil
101
- end
102
-
103
- end