RubyGems - gscraper - Versions diffs - 0.1.7 → 0.2.0 - Mend

gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

data/COPYING.txt +339 -0
data/History.txt +21 -0
data/Manifest.txt +23 -10
data/README.txt +17 -21
data/Rakefile +3 -6
data/lib/gscraper.rb +22 -0
data/lib/gscraper/extensions.rb +22 -0
data/lib/gscraper/extensions/uri.rb +22 -0
data/lib/gscraper/extensions/uri/http.rb +25 -71
data/lib/gscraper/extensions/uri/query_params.rb +96 -0
data/lib/gscraper/gscraper.rb +30 -0
data/lib/gscraper/has_pages.rb +114 -0
data/lib/gscraper/licenses.rb +22 -0
data/lib/gscraper/page.rb +64 -0
data/lib/gscraper/search.rb +24 -0
data/lib/gscraper/search/ajax_query.rb +176 -0
data/lib/gscraper/search/page.rb +27 -72
data/lib/gscraper/search/query.rb +46 -457
data/lib/gscraper/search/result.rb +32 -29
data/lib/gscraper/search/search.rb +44 -3
data/lib/gscraper/search/web_query.rb +472 -0
data/lib/gscraper/sponsored_ad.rb +26 -2
data/lib/gscraper/sponsored_links.rb +77 -8
data/lib/gscraper/version.rb +23 -1
data/spec/extensions/uri/http_spec.rb +9 -0
data/spec/extensions/uri/query_params_spec.rb +38 -0
data/spec/gscraper_spec.rb +29 -0
data/spec/has_pages_examples.rb +19 -0
data/spec/has_sponsored_links_examples.rb +57 -0
data/spec/helpers/query.rb +1 -0
data/spec/helpers/uri.rb +8 -0
data/spec/page_has_results_examples.rb +13 -0
data/spec/search/ajax_query_spec.rb +124 -0
data/spec/search/page_has_results_examples.rb +51 -0
data/spec/search/query_spec.rb +103 -0
data/spec/search/web_query_spec.rb +74 -0
data/spec/spec_helper.rb +6 -0
data/tasks/spec.rb +7 -0
metadata +34 -20
data/LICENSE.txt +0 -23
data/lib/gscraper/web_agent.rb +0 -38
data/test/search/page_results.rb +0 -103
data/test/search/query_from_url.rb +0 -50
data/test/search/query_pages.rb +0 -32
data/test/search/query_result.rb +0 -30
data/test/test_gscraper.rb +0 -4

data/spec/search/page_has_results_examples.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require 'spec_helper'
+shared_examples_for "Page has Search Results" do
+  it "should have incremental ranks" do
+    ranks = @page.ranks
+    (0..(ranks.length - 2)).each do |index|
+      ranks[index].should < ranks[index + 1]
+    end
+  end
+  it "should have titles" do
+    @page.each_title do |title|
+      title.should_not be_nil
+    end
+  end
+  it "should have non-empty titles" do
+    @page.each_title do |title|
+      title.length.should_not == 0
+    end
+  end
+  it "should have URLs" do
+    @page.each_url do |url|
+      url.should_not be_nil
+    end
+  end
+  it "should have non-empty URLs" do
+    @page.each_url do |url|
+      url.length.should_not == 0
+    end
+  end
+  it "should have valid URLs" do
+    @page.each_url do |url|
+      url_should_be_valid(url)
+    end
+  end
+  it "should have atleast one cached URL" do
+    @page.cached_urls.should_not == 0
+  end
+  it "should have atleast one similar query URL" do
+    @page.similar_urls.should_not == 0
+  end
+end

data/spec/search/query_spec.rb ADDED Viewed

@@ -0,0 +1,103 @@
+require 'spec_helper'
+require 'gscraper/search/query'
+describe GScraper::Search::Query do # each example builds a Query from a single option and checks the resulting #expression string
+  it "should support basic queries" do
+    expr = 'ruby -blog'
+    query = GScraper::Search::Query.new(:query => expr)
+    query.expression.should == expr # a plain :query is expected to pass through unchanged
+  end
+  it "should support the 'link' modifier" do
+    url = 'www.wired.com/'
+    query = GScraper::Search::Query.new(:link => url)
+    query.expression.should == "link:#{url}"
+  end
+  it "should support the 'related' modifier" do
+    url = 'www.rubyinside.com'
+    query = GScraper::Search::Query.new(:related => url)
+    query.expression.should == "related:#{url}"
+  end
+  it "should support the 'info' modifier" do
+    url = "www.rspec.info"
+    query = GScraper::Search::Query.new(:info => url)
+    query.expression.should == "info:#{url}"
+  end
+  it "should support the 'site' modifier" do
+    url = "www.ruby-lang.net"
+    query = GScraper::Search::Query.new(:site => url)
+    query.expression.should == "site:#{url}"
+  end
+  it "should support the 'filetype' modifier" do
+    file_type = 'rss'
+    query = GScraper::Search::Query.new(:filetype => file_type)
+    query.expression.should == "filetype:#{file_type}"
+  end
+  it "should support 'allintitle' options" do
+    words = ['one', 'two', 'three']
+    query = GScraper::Search::Query.new(:allintitle => words)
+    query.expression.should == "allintitle:#{words.join(' ')}" # the word list is expected space-separated after the modifier
+  end
+  it "should support the 'intitle' modifier" do
+    word = 'coffee'
+    query = GScraper::Search::Query.new(:intitle => word)
+    query.expression.should == "intitle:#{word}"
+  end
+  it "should support 'allinurl' options" do
+    params = ['search', 'id', 'page']
+    query = GScraper::Search::Query.new(:allinurl => params)
+    query.expression.should == "allinurl:#{params.join(' ')}"
+  end
+  it "should support the 'inurl' modifier" do
+    param = 'id'
+    query = GScraper::Search::Query.new(:inurl => param)
+    query.expression.should == "inurl:#{param}"
+  end
+  it "should support 'allintext' options" do
+    words = ['dog', 'blog', 'log']
+    query = GScraper::Search::Query.new(:allintext => words)
+    query.expression.should == "allintext:#{words.join(' ')}"
+  end
+  it "should support the 'intext' modifier" do
+    word = 'word'
+    query = GScraper::Search::Query.new(:intext => word)
+    query.expression.should == "intext:#{word}"
+  end
+  it "should support 'exact phrases'" do
+    phrase = 'how do you do?'
+    query = GScraper::Search::Query.new(:exact_phrase => phrase)
+    query.expression.should == "\"#{phrase}\"" # the phrase is expected wrapped in double quotes
+  end
+  it "should support 'with words'" do
+    words = ['one', 'two', 'three']
+    query = GScraper::Search::Query.new(:with_words => words)
+    query.expression.should == words.join(' OR ') # alternatives are expected joined with ' OR '
+  end
+  it "should support 'without words'" do
+    words = ['bla', 'haha', 'spam']
+    query = GScraper::Search::Query.new(:without_words => words)
+    query.expression.should == words.map { |word| "-#{word}" }.join(' ') # each excluded word is expected prefixed with '-'
+  end
+  it "should support 'numeric range'" do
+    range = (3..8)
+    query = GScraper::Search::Query.new(:numeric_range => range)
+    query.expression.should == "#{range.begin}..#{range.end}" # a Range is expected rendered as "begin..end"
+  end
+end

data/spec/search/web_query_spec.rb ADDED Viewed

@@ -0,0 +1,74 @@
+require 'spec_helper'
+require 'has_pages_examples'
+require 'page_has_results_examples'
+require 'has_sponsored_links_examples'
+require 'search/page_has_results_examples'
+require 'gscraper/search/web_query'
+describe GScraper::Search::WebQuery do # exercises a WebQuery built from DEFAULT_QUERY, a constant defined outside this file
+  before(:all) do # runs once for the group; the instance variables set here are read by the examples below
+    @query = GScraper::Search::WebQuery.new(:query => DEFAULT_QUERY)
+    @page = @query.first_page # presumably performs a live search request -- confirm before running offline
+    @links = @query.sponsored_links
+  end
+  it_should_behave_like "has Pages"
+  it_should_behave_like "Page has Results"
+  it_should_behave_like "Page has Search Results"
+  it_should_behave_like "has Sponsored Links"
+  describe "Search URL" do
+    before(:all) do
+      @uri = @query.search_url
+    end
+    it "should be a valid HTTP URI" do
+      @uri.class.should == URI::HTTP # exact class match, so a subclass of URI::HTTP would fail here
+    end
+    it "should have a 'q' query-param" do
+      @uri.query_params['q'].should == DEFAULT_QUERY
+    end
+    it "should have a 'num' query-param" do
+      @uri.query_params['num'].should == @query.results_per_page
+    end
+  end
+  describe "page specific URLs" do
+    before(:all) do
+      @uri = @query.page_url(2)
+    end
+    it "should have a 'start' query-param" do
+      @uri.query_params['start'].should == @query.results_per_page # page 2 is expected to start at an offset of one full page
+    end
+    it "should have a 'sa' query-param" do
+      @uri.query_params['sa'].should == 'N'
+    end
+  end
+  describe "queries from Web search URLs" do
+    before(:all) do
+      @query = GScraper::Search::WebQuery.from_url("http://www.google.com/search?sa=N&start=0&q=#{DEFAULT_QUERY}&num=20") # NOTE(review): DEFAULT_QUERY is interpolated unescaped; assumes it holds no characters needing URL-encoding -- confirm
+    end
+    it "should have a results-per-page" do
+      @query.results_per_page.should == 20 # matches num=20 in the URL above
+    end
+    it "should have a query" do
+      @query.query.should == DEFAULT_QUERY
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'rubygems'
+gem 'rspec', '>=1.1.3'
+require 'spec'
+require 'helpers/query'
+require 'helpers/uri'

data/tasks/spec.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require 'spec/rake/spectask'
+desc "Run all specifications"
+Spec::Rake::SpecTask.new(:spec) do |t| # defines the :spec Rake task
+  t.libs += ['lib', 'spec'] # appends both directories to the task's libs list; presumably the load path, so specs can require 'spec_helper' and 'gscraper/...' -- confirm
+  t.spec_opts = ['--colour', '--format', 'specdoc'] # options handed to the spec runner
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gscraper
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.2.0
 platform: ruby
 authors:
 - Postmodern Modulus III
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-04-28 00:00:00 -07:00
+date: 2008-06-21 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -37,47 +37,61 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.5.1
+        version: 1.6.0
     version:
-description: "== FEATURES/PROBLEMS:  * Supports the Google Search service. * Provides access to search results and ranks. * Provides access to the Sponsored Links. * Provides HTTP access with custom User-Agent strings. * Provides proxy settings for HTTP access.  == REQUIREMENTS:  * Hpricot * WWW::Mechanize  == INSTALL:"
-email: postmodern.mod3@gmail.com
+description: GScraper is a web-scraping interface to various Google Services.
+email:
+- postmodern.mod3@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files:
 - History.txt
-- LICENSE.txt
+- COPYING.txt
 - Manifest.txt
 - README.txt
 files:
 - History.txt
-- LICENSE.txt
+- COPYING.txt
 - Manifest.txt
 - README.txt
 - Rakefile
-- lib/gscraper.rb
-- lib/gscraper/version.rb
-- lib/gscraper/gscraper.rb
-- lib/gscraper/web_agent.rb
+- lib/gscraper/extensions/uri/query_params.rb
 - lib/gscraper/extensions/uri/http.rb
 - lib/gscraper/extensions/uri.rb
 - lib/gscraper/extensions.rb
 - lib/gscraper/licenses.rb
+- lib/gscraper/page.rb
+- lib/gscraper/has_pages.rb
 - lib/gscraper/sponsored_ad.rb
 - lib/gscraper/sponsored_links.rb
 - lib/gscraper/search/result.rb
 - lib/gscraper/search/page.rb
 - lib/gscraper/search/query.rb
+- lib/gscraper/search/web_query.rb
+- lib/gscraper/search/ajax_query.rb
 - lib/gscraper/search/search.rb
 - lib/gscraper/search.rb
-- test/test_gscraper.rb
-- test/search/query_from_url.rb
-- test/search/query_result.rb
-- test/search/query_pages.rb
-- test/search/page_results.rb
+- lib/gscraper/gscraper.rb
+- lib/gscraper/version.rb
+- lib/gscraper.rb
+- tasks/spec.rb
+- spec/spec_helper.rb
+- spec/helpers/uri.rb
+- spec/helpers/query.rb
+- spec/extensions/uri/query_params_spec.rb
+- spec/extensions/uri/http_spec.rb
+- spec/has_pages_examples.rb
+- spec/page_has_results_examples.rb
+- spec/has_sponsored_links_examples.rb
+- spec/search/page_has_results_examples.rb
+- spec/search/query_spec.rb
+- spec/search/ajax_query_spec.rb
+- spec/search/web_query_spec.rb
+- spec/gscraper_spec.rb
 has_rdoc: true
-homepage: "    by Postmodern Modulus III"
+homepage: http://rubyforge.org/projects/gscraper/
 post_install_message:
 rdoc_options:
 - --main
@@ -102,6 +116,6 @@ rubyforge_project: gscraper
 rubygems_version: 1.1.1
 signing_key:
 specification_version: 2
-summary: A ruby web-scraping interface to various Google Services
-test_files:
-- test/test_gscraper.rb
+summary: GScraper is a web-scraping interface to various Google Services.
+test_files: []

data/LICENSE.txt DELETED Viewed

@@ -1,23 +0,0 @@
-The MIT License
-Copyright (c) 2007 Hal Brodigan
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.

data/lib/gscraper/web_agent.rb DELETED Viewed

@@ -1,38 +0,0 @@
-require 'gscraper/gscraper'
-module GScraper
-  module WebAgent
-    protected
-    #
-    # Returns the WWW::Mechanize agent.
-    #
-    def web_agent(&block)
-      @web_agent ||= GScraper.web_agent
-      block.call(@web_agent) if block
-      return @web_agent
-    end
-    #
-    # Fetches the specified _url_, with the given _referer_ using the
-    # web_agent.
-    #
-    #   get_page('http://www.hackety.org/')
-    #
-    def get_page(url,referer=nil,&block)
-      web_agent.get(url,referer,&block)
-    end
-    #
-    # Posts the specified _url_ and the given _query_ parameters using the
-    # web_agent.
-    #
-    #   post_page('http://www.wired.com/', :q => 'the future')
-    #
-    def post_page(url,query={})
-      web_agent.post(url,query)
-    end
-  end
-end

data/test/search/page_results.rb DELETED Viewed

@@ -1,103 +0,0 @@
-require 'test/unit'
-require 'gscraper/search/page'
-require 'gscraper/search/query'
-class PageResults < Test::Unit::TestCase
-  include GScraper
-  def setup
-    @query = Search::Query.new(:query => 'ruby')
-    @page = @query.first_page
-  end
-  def test_results_per_page
-    assert_equal @page.length, @query.results_per_page
-  end
-  def test_first_result
-    assert_not_nil @page[0], "First Page for Query 'ruby' does not have a first Result"
-  end
-  def test_last_result
-    assert_not_nil @page[-1], "First Page for Query 'ruby' does not have a last Result"
-  end
-  def test_ranks
-    ranks = @page.ranks
-    assert_not_nil ranks, "First Page for Query 'ruby' does not have any ranks"
-    assert_equal ranks.class, Array, "The ranks of a Page must be an Array"
-    assert_equal ranks.empty?, false, "The ranks of the First Page are empty"
-    assert_equal ranks.length, @page.length
-  end
-  def test_titles
-    titles = @page.titles
-    assert_not_nil titles, "First Page for Query 'ruby' does not have any titles"
-    assert_equal titles.class, Array, "The titles of a Page must be an Array"
-    assert_equal titles.empty?, false, "The titles of the First Page are empty"
-    assert_equal titles.length, @page.length
-  end
-  def test_urls
-    urls = @page.urls
-    assert_not_nil urls, "First Page for Query 'ruby' does not have any urls"
-    assert_equal urls.class, Array, "The urls of a Page must be an Array"
-    assert_equal urls.empty?, false, "The urls of the First Page are empty"
-    assert_equal urls.length, @page.length
-  end
-  def test_summaries
-    summaries = @page.summaries
-    assert_not_nil summaries, "First Page for Query 'ruby' does not have any summaries"
-    assert_equal summaries.class, Array, "The summaries of a Page must be an Array"
-    assert_equal summaries.empty?, false, "The summaries of the First Page are empty"
-    assert_equal summaries.length, @page.length
-  end
-  def test_cached_urls
-    cached_urls = @page.cached_urls
-    assert_not_nil cached_urls, "First Page for Query 'ruby' does not have any cached_urls"
-    assert_equal cached_urls.class, Array, "The cached_urls of a Page must be an Array"
-    assert_equal cached_urls.empty?, false, "The cached_urls of the First Page are empty"
-    assert_equal cached_urls.length, @page.length
-  end
-  def test_similar_urls
-    similar_urls = @page.similar_urls
-    assert_not_nil similar_urls, "First Page for Query 'ruby' does not have any similar URLs"
-    assert_equal similar_urls.class, Array, "The similar URLs of a Page must be an Array"
-    assert_equal similar_urls.empty?, false, "The similar URLs of the First Page are empty"
-    assert_equal similar_urls.length, @page.length
-  end
-  def teardown
-    @page = nil
-    @query = nil
-  end
-end