gscraper 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
+ shared_examples_for "Page has Search Results" do
4
+
5
+ it "should have incremental ranks" do
6
+ ranks = @page.ranks
7
+
8
+ (0..(ranks.length - 2)).each do |index|
9
+ ranks[index].should < ranks[index + 1]
10
+ end
11
+ end
12
+
13
+ it "should have titles" do
14
+ @page.each_title do |title|
15
+ title.should_not be_nil
16
+ end
17
+ end
18
+
19
+ it "should have non-empty titles" do
20
+ @page.each_title do |title|
21
+ title.length.should_not == 0
22
+ end
23
+ end
24
+
25
+ it "should have URLs" do
26
+ @page.each_url do |url|
27
+ url.should_not be_nil
28
+ end
29
+ end
30
+
31
+ it "should have non-empty URLs" do
32
+ @page.each_url do |url|
33
+ url.length.should_not == 0
34
+ end
35
+ end
36
+
37
+ it "should have valid URLs" do
38
+ @page.each_url do |url|
39
+ url_should_be_valid(url)
40
+ end
41
+ end
42
+
43
+ it "should have atleast one cached URL" do
44
+ @page.cached_urls.should_not == 0
45
+ end
46
+
47
+ it "should have atleast one similar query URL" do
48
+ @page.similar_urls.should_not == 0
49
+ end
50
+
51
+ end
@@ -0,0 +1,103 @@
1
+ require 'spec_helper'
2
+
3
+ require 'gscraper/search/query'
4
+
5
+ describe GScraper::Search::Query do
6
+
7
+ it "should support basic queries" do
8
+ expr = 'ruby -blog'
9
+ query = GScraper::Search::Query.new(:query => expr)
10
+ query.expression.should == expr
11
+ end
12
+
13
+ it "should support the 'link' modifier" do
14
+ url = 'www.wired.com/'
15
+ query = GScraper::Search::Query.new(:link => url)
16
+ query.expression.should == "link:#{url}"
17
+ end
18
+
19
+ it "should support the 'related' modifier" do
20
+ url = 'www.rubyinside.com'
21
+ query = GScraper::Search::Query.new(:related => url)
22
+ query.expression.should == "related:#{url}"
23
+ end
24
+
25
+ it "should support the 'info' modifier" do
26
+ url = "www.rspec.info"
27
+ query = GScraper::Search::Query.new(:info => url)
28
+ query.expression.should == "info:#{url}"
29
+ end
30
+
31
+ it "should support the 'site' modifier" do
32
+ url = "www.ruby-lang.net"
33
+ query = GScraper::Search::Query.new(:site => url)
34
+ query.expression.should == "site:#{url}"
35
+ end
36
+
37
+ it "should support the 'filetype' modifier" do
38
+ file_type = 'rss'
39
+ query = GScraper::Search::Query.new(:filetype => file_type)
40
+ query.expression.should == "filetype:#{file_type}"
41
+ end
42
+
43
+ it "should support 'allintitle' options" do
44
+ words = ['one', 'two', 'three']
45
+ query = GScraper::Search::Query.new(:allintitle => words)
46
+ query.expression.should == "allintitle:#{words.join(' ')}"
47
+ end
48
+
49
+ it "should support the 'intitle' modifier" do
50
+ word = 'coffee'
51
+ query = GScraper::Search::Query.new(:intitle => word)
52
+ query.expression.should == "intitle:#{word}"
53
+ end
54
+
55
+ it "should support 'allinurl' options" do
56
+ params = ['search', 'id', 'page']
57
+ query = GScraper::Search::Query.new(:allinurl => params)
58
+ query.expression.should == "allinurl:#{params.join(' ')}"
59
+ end
60
+
61
+ it "should support the 'inurl' modifier" do
62
+ param = 'id'
63
+ query = GScraper::Search::Query.new(:inurl => param)
64
+ query.expression.should == "inurl:#{param}"
65
+ end
66
+
67
+ it "should support 'allintext' options" do
68
+ words = ['dog', 'blog', 'log']
69
+ query = GScraper::Search::Query.new(:allintext => words)
70
+ query.expression.should == "allintext:#{words.join(' ')}"
71
+ end
72
+
73
+ it "should support the 'intext' modifier" do
74
+ word = 'word'
75
+ query = GScraper::Search::Query.new(:intext => word)
76
+ query.expression.should == "intext:#{word}"
77
+ end
78
+
79
+ it "should support 'exact phrases'" do
80
+ phrase = 'how do you do?'
81
+ query = GScraper::Search::Query.new(:exact_phrase => phrase)
82
+ query.expression.should == "\"#{phrase}\""
83
+ end
84
+
85
+ it "should support 'with words'" do
86
+ words = ['one', 'two', 'three']
87
+ query = GScraper::Search::Query.new(:with_words => words)
88
+ query.expression.should == words.join(' OR ')
89
+ end
90
+
91
+ it "should support 'without words'" do
92
+ words = ['bla', 'haha', 'spam']
93
+ query = GScraper::Search::Query.new(:without_words => words)
94
+ query.expression.should == words.map { |word| "-#{word}" }.join(' ')
95
+ end
96
+
97
+ it "should support 'numeric range'" do
98
+ range = (3..8)
99
+ query = GScraper::Search::Query.new(:numeric_range => range)
100
+ query.expression.should == "#{range.begin}..#{range.end}"
101
+ end
102
+
103
+ end
@@ -0,0 +1,74 @@
1
+ require 'spec_helper'
2
+ require 'has_pages_examples'
3
+ require 'page_has_results_examples'
4
+ require 'has_sponsored_links_examples'
5
+ require 'search/page_has_results_examples'
6
+
7
+ require 'gscraper/search/web_query'
8
+
9
+ describe GScraper::Search::WebQuery do
10
+
11
+ before(:all) do
12
+ @query = GScraper::Search::WebQuery.new(:query => DEFAULT_QUERY)
13
+ @page = @query.first_page
14
+ @links = @query.sponsored_links
15
+ end
16
+
17
+ it_should_behave_like "has Pages"
18
+ it_should_behave_like "Page has Results"
19
+ it_should_behave_like "Page has Search Results"
20
+ it_should_behave_like "has Sponsored Links"
21
+
22
+ describe "Search URL" do
23
+
24
+ before(:all) do
25
+ @uri = @query.search_url
26
+ end
27
+
28
+ it "should be a valid HTTP URI" do
29
+ @uri.class.should == URI::HTTP
30
+ end
31
+
32
+ it "should have a 'q' query-param" do
33
+ @uri.query_params['q'].should == DEFAULT_QUERY
34
+ end
35
+
36
+ it "should have a 'num' query-param" do
37
+ @uri.query_params['num'].should == @query.results_per_page
38
+ end
39
+
40
+ end
41
+
42
+ describe "page specific URLs" do
43
+
44
+ before(:all) do
45
+ @uri = @query.page_url(2)
46
+ end
47
+
48
+ it "should have a 'start' query-param" do
49
+ @uri.query_params['start'].should == @query.results_per_page
50
+ end
51
+
52
+ it "should have a 'sa' query-param" do
53
+ @uri.query_params['sa'].should == 'N'
54
+ end
55
+
56
+ end
57
+
58
+ describe "queries from Web search URLs" do
59
+
60
+ before(:all) do
61
+ @query = GScraper::Search::WebQuery.from_url("http://www.google.com/search?sa=N&start=0&q=#{DEFAULT_QUERY}&num=20")
62
+ end
63
+
64
+ it "should have a results-per-page" do
65
+ @query.results_per_page.should == 20
66
+ end
67
+
68
+ it "should have a query" do
69
+ @query.query.should == DEFAULT_QUERY
70
+ end
71
+
72
+ end
73
+
74
+ end
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ gem 'rspec', '>=1.1.3'
3
+ require 'spec'
4
+
5
+ require 'helpers/query'
6
+ require 'helpers/uri'
data/tasks/spec.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'spec/rake/spectask'
2
+
3
+ desc "Run all specifications"
4
+ Spec::Rake::SpecTask.new(:spec) do |t|
5
+ t.libs += ['lib', 'spec']
6
+ t.spec_opts = ['--colour', '--format', 'specdoc']
7
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern Modulus III
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-04-28 00:00:00 -07:00
12
+ date: 2008-06-21 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -37,47 +37,61 @@ dependencies:
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: 1.5.1
40
+ version: 1.6.0
41
41
  version:
42
- description: "== FEATURES/PROBLEMS: * Supports the Google Search service. * Provides access to search results and ranks. * Provides access to the Sponsored Links. * Provides HTTP access with custom User-Agent strings. * Provides proxy settings for HTTP access. == REQUIREMENTS: * Hpricot * WWW::Mechanize == INSTALL:"
43
- email: postmodern.mod3@gmail.com
42
+ description: GScraper is a web-scraping interface to various Google Services.
43
+ email:
44
+ - postmodern.mod3@gmail.com
44
45
  executables: []
45
46
 
46
47
  extensions: []
47
48
 
48
49
  extra_rdoc_files:
49
50
  - History.txt
50
- - LICENSE.txt
51
+ - COPYING.txt
51
52
  - Manifest.txt
52
53
  - README.txt
53
54
  files:
54
55
  - History.txt
55
- - LICENSE.txt
56
+ - COPYING.txt
56
57
  - Manifest.txt
57
58
  - README.txt
58
59
  - Rakefile
59
- - lib/gscraper.rb
60
- - lib/gscraper/version.rb
61
- - lib/gscraper/gscraper.rb
62
- - lib/gscraper/web_agent.rb
60
+ - lib/gscraper/extensions/uri/query_params.rb
63
61
  - lib/gscraper/extensions/uri/http.rb
64
62
  - lib/gscraper/extensions/uri.rb
65
63
  - lib/gscraper/extensions.rb
66
64
  - lib/gscraper/licenses.rb
65
+ - lib/gscraper/page.rb
66
+ - lib/gscraper/has_pages.rb
67
67
  - lib/gscraper/sponsored_ad.rb
68
68
  - lib/gscraper/sponsored_links.rb
69
69
  - lib/gscraper/search/result.rb
70
70
  - lib/gscraper/search/page.rb
71
71
  - lib/gscraper/search/query.rb
72
+ - lib/gscraper/search/web_query.rb
73
+ - lib/gscraper/search/ajax_query.rb
72
74
  - lib/gscraper/search/search.rb
73
75
  - lib/gscraper/search.rb
74
- - test/test_gscraper.rb
75
- - test/search/query_from_url.rb
76
- - test/search/query_result.rb
77
- - test/search/query_pages.rb
78
- - test/search/page_results.rb
76
+ - lib/gscraper/gscraper.rb
77
+ - lib/gscraper/version.rb
78
+ - lib/gscraper.rb
79
+ - tasks/spec.rb
80
+ - spec/spec_helper.rb
81
+ - spec/helpers/uri.rb
82
+ - spec/helpers/query.rb
83
+ - spec/extensions/uri/query_params_spec.rb
84
+ - spec/extensions/uri/http_spec.rb
85
+ - spec/has_pages_examples.rb
86
+ - spec/page_has_results_examples.rb
87
+ - spec/has_sponsored_links_examples.rb
88
+ - spec/search/page_has_results_examples.rb
89
+ - spec/search/query_spec.rb
90
+ - spec/search/ajax_query_spec.rb
91
+ - spec/search/web_query_spec.rb
92
+ - spec/gscraper_spec.rb
79
93
  has_rdoc: true
80
- homepage: " by Postmodern Modulus III"
94
+ homepage: http://rubyforge.org/projects/gscraper/
81
95
  post_install_message:
82
96
  rdoc_options:
83
97
  - --main
@@ -102,6 +116,6 @@ rubyforge_project: gscraper
102
116
  rubygems_version: 1.1.1
103
117
  signing_key:
104
118
  specification_version: 2
105
- summary: A ruby web-scraping interface to various Google Services
106
- test_files:
107
- - test/test_gscraper.rb
119
+ summary: GScraper is a web-scraping interface to various Google Services.
120
+ test_files: []
121
+
data/LICENSE.txt DELETED
@@ -1,23 +0,0 @@
1
-
2
-
3
- The MIT License
4
-
5
- Copyright (c) 2007 Hal Brodigan
6
-
7
- Permission is hereby granted, free of charge, to any person obtaining a copy
8
- of this software and associated documentation files (the "Software"), to deal
9
- in the Software without restriction, including without limitation the rights
10
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- copies of the Software, and to permit persons to whom the Software is
12
- furnished to do so, subject to the following conditions:
13
-
14
- The above copyright notice and this permission notice shall be included in
15
- all copies or substantial portions of the Software.
16
-
17
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
- THE SOFTWARE.
@@ -1,38 +0,0 @@
1
- require 'gscraper/gscraper'
2
-
3
- module GScraper
4
- module WebAgent
5
- protected
6
-
7
- #
8
- # Returns the WWW::Mechanize agent.
9
- #
10
- def web_agent(&block)
11
- @web_agent ||= GScraper.web_agent
12
-
13
- block.call(@web_agent) if block
14
- return @web_agent
15
- end
16
-
17
- #
18
- # Fetches the specified _url_, with the given _referer_ using the
19
- # web_agent.
20
- #
21
- # get_page('http://www.hackety.org/')
22
- #
23
- def get_page(url,referer=nil,&block)
24
- web_agent.get(url,referer,&block)
25
- end
26
-
27
- #
28
- # Posts the specified _url_ and the given _query_ parameters using the
29
- # web_agent.
30
- #
31
- # post_page('http://www.wired.com/', :q => 'the future')
32
- #
33
- def post_page(url,query={})
34
- web_agent.post(url,query)
35
- end
36
-
37
- end
38
- end
@@ -1,103 +0,0 @@
1
- require 'test/unit'
2
- require 'gscraper/search/page'
3
- require 'gscraper/search/query'
4
-
5
- class PageResults < Test::Unit::TestCase
6
-
7
- include GScraper
8
-
9
- def setup
10
- @query = Search::Query.new(:query => 'ruby')
11
- @page = @query.first_page
12
- end
13
-
14
- def test_results_per_page
15
- assert_equal @page.length, @query.results_per_page
16
- end
17
-
18
- def test_first_result
19
- assert_not_nil @page[0], "First Page for Query 'ruby' does not have a first Result"
20
- end
21
-
22
- def test_last_result
23
- assert_not_nil @page[-1], "First Page for Query 'ruby' does not have a last Result"
24
- end
25
-
26
- def test_ranks
27
- ranks = @page.ranks
28
-
29
- assert_not_nil ranks, "First Page for Query 'ruby' does not have any ranks"
30
-
31
- assert_equal ranks.class, Array, "The ranks of a Page must be an Array"
32
-
33
- assert_equal ranks.empty?, false, "The ranks of the First Page are empty"
34
-
35
- assert_equal ranks.length, @page.length
36
- end
37
-
38
- def test_titles
39
- titles = @page.titles
40
-
41
- assert_not_nil titles, "First Page for Query 'ruby' does not have any titles"
42
-
43
- assert_equal titles.class, Array, "The titles of a Page must be an Array"
44
-
45
- assert_equal titles.empty?, false, "The titles of the First Page are empty"
46
-
47
- assert_equal titles.length, @page.length
48
- end
49
-
50
- def test_urls
51
- urls = @page.urls
52
-
53
- assert_not_nil urls, "First Page for Query 'ruby' does not have any urls"
54
-
55
- assert_equal urls.class, Array, "The urls of a Page must be an Array"
56
-
57
- assert_equal urls.empty?, false, "The urls of the First Page are empty"
58
-
59
- assert_equal urls.length, @page.length
60
- end
61
-
62
- def test_summaries
63
- summaries = @page.summaries
64
-
65
- assert_not_nil summaries, "First Page for Query 'ruby' does not have any summaries"
66
-
67
- assert_equal summaries.class, Array, "The summaries of a Page must be an Array"
68
-
69
- assert_equal summaries.empty?, false, "The summaries of the First Page are empty"
70
-
71
- assert_equal summaries.length, @page.length
72
- end
73
-
74
- def test_cached_urls
75
- cached_urls = @page.cached_urls
76
-
77
- assert_not_nil cached_urls, "First Page for Query 'ruby' does not have any cached_urls"
78
-
79
- assert_equal cached_urls.class, Array, "The cached_urls of a Page must be an Array"
80
-
81
- assert_equal cached_urls.empty?, false, "The cached_urls of the First Page are empty"
82
-
83
- assert_equal cached_urls.length, @page.length
84
- end
85
-
86
- def test_similar_urls
87
- similar_urls = @page.similar_urls
88
-
89
- assert_not_nil similar_urls, "First Page for Query 'ruby' does not have any similar URLs"
90
-
91
- assert_equal similar_urls.class, Array, "The similar URLs of a Page must be an Array"
92
-
93
- assert_equal similar_urls.empty?, false, "The similar URLs of the First Page are empty"
94
-
95
- assert_equal similar_urls.length, @page.length
96
- end
97
-
98
- def teardown
99
- @page = nil
100
- @query = nil
101
- end
102
-
103
- end