RubyGems - webrat-scraper - Versions diffs - 0.1.4 - Mend

webrat-scraper 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/.document +5 -0
data/.gitignore +6 -0
data/LICENSE +20 -0
data/README.rdoc +59 -0
data/Rakefile +63 -0
data/VERSION +1 -0
data/lib/webrat_scraper.rb +30 -0
data/spec/fakeweb_fixtures/fake_form.html +47 -0
data/spec/fakeweb_fixtures/fake_form_action.html +14 -0
data/spec/spec_helper.rb +12 -0
data/spec/webrat_scraper_spec.rb +139 -0
metadata +104 -0

data/.document ADDED

@@ -0,0 +1,5 @@
+README.rdoc
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED

@@ -0,0 +1,6 @@
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg
+webrat.log

data/LICENSE ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2009 JT Zemp
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,59 @@
+= webrat-scraper
+A web scraper built on Webrat::Mechanize that traverses the web and allows
+access to the webrat session through Mechanize and Nokogiri objects
+== How to use
+=== Install
+    gem sources -a http://gems.github.com
+    gem install jtzemp-webrat-scraper
+=== Use
+    require 'webrat_scraper'
+    class MyScraper < WebratScraper
+      def initialize
+        @url = "http://www.google.com"
+      end
+      def first_result_for(search_term)
+        visit @url
+        fill_in "Google Search", :with => search_term
+        click_button
+        link = {}
+        link[:html] = (doc/"li.g a.l").first
+        link[:text] = link[:html].inner_text
+        link[:url]  = link[:html].attributes["href"].to_s
+        link
+      end
+    end
+    m = MyScraper.new
+    result = m.first_result_for("webrat-mechanize")
+    puts result.inspect
+For more info check out documentation for:
+* Webrat
+* Mechanize
+* Nokogiri
+  * CSS Selectors
+  * XPath Selectors
+== Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add specs for it. This is important so I don't break it in a
+  future version unintentionally.
+* Commit, do not mess with rakefile, version, or history.
+  (if you want to have your own version, that is fine but
+  bump version in a commit by itself I can ignore when I pull)
+* Send me a pull request. Bonus points for topic branches.
+== Copyright
+Copyright (c) 2009 JT Zemp. See LICENSE for details.

data/Rakefile ADDED

@@ -0,0 +1,63 @@
+require 'rubygems'
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "webrat-scraper"
+    gem.summary = %Q{A web scraper using Webrat & Mechanize - does acceptance-based web scraping relying on the great webrat behavior description language.}
+    gem.description = %Q{A web scraper using Webrat & Mechanize}
+    gem.email = "jtzemp@gmail.com"
+    gem.homepage = "http://github.com/jtzemp/webrat-scraper"
+    gem.authors = ["JT Zemp"]
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+    gem.add_dependency("mechanize", ">= 0.9.3")
+    gem.add_dependency("webrat", "= 0.4.5")
+    gem.add_development_dependency('rspec')
+    gem.add_development_dependency('fakeweb')
+  end
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  if File.exist?('VERSION.yml')
+    config = YAML.load(File.read('VERSION.yml'))
+    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+  else
+    version = ""
+  end
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "webrat-scraper #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

@@ -0,0 +1 @@
+0.1.4

data/lib/webrat_scraper.rb ADDED

@@ -0,0 +1,30 @@
+require 'rubygems'
+gem 'webrat', '=0.4.5'
+require 'webrat'
+require 'webrat/mechanize'
+require 'test/unit/assertions'
+class WebratScraper < Webrat::MechanizeSession
+  include Webrat::Matchers
+  include Test::Unit::Assertions
+  def initialize(context=nil)
+    super(context)
+  end
+  def user_agent
+    @user_agent ||= "webrat-scraper " + mechanize.user_agent
+  end
+  def user_agent=(new_user_agent)
+    @user_agent = mechanize.user_agent = new_user_agent
+  end
+  # the Nokogiri object for the response body for the session's current state.
+  alias :doc :dom
+  # def post(url, data={})
+  #   visit(url, :post, data)
+  # end
+end

data/spec/fakeweb_fixtures/fake_form.html ADDED

@@ -0,0 +1,47 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+   "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+	<title>Fake Page</title>
+	<meta name="generator" content="TextMate http://macromates.com/">
+	<meta name="author" content="JT Zemp">
+</head>
+<body>
+  <a href="#oblivion">Go to oblivion</a>
+  <form action="/action" method="POST" accept-charset="utf-8">
+    <label for="username">Username</label><input type="text" name="username" value="" id="username">
+    <label for="password">Password</label><input type="password" name="password" value="" id="password">
+    <textarea name="Notes" rows="8" cols="40"></textarea>
+    <select name="roles" id="roles" multiple onchange="" size="4">
+      <option value="admin">Administrator</option>
+      <option value="normal">Normal User</option>
+    </select>
+    <label for="on_fire">On fire?</label><input type="checkbox" name="on_fire" value="" id="on_fire">
+    <label for="tapatio">Tapatio</label><input type="radio" name="sauces" value="Tapatio" id="tapatio">
+    <label for="franks">Frank's Red Hot</label><input type="radio" name="sauces" value="Franks" id="franks">
+    <label for="tabasco">Tabasco</label><input type="radio" name="sauces" value="Tabasco" id="tabasco">
+    <p><input type="submit" value="Save" name="Save"></p>
+  </form>
+  <br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br />
+  <a name="oblivion" id="oblivion">Oblivion</a>
+  <div id="lorem">
+    <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, <span id="sed-do">sed do eiusmod tempor</span> incididunt ut <span class="labore">labore</span> et dolore magna aliqua.</p> Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+  </div>
+  <div id="table">
+    <table border="0" cellspacing="5" cellpadding="5">
+      <tr><th>Header</th></tr>
+      <tr><td>data 1</td></tr>
+      <tr><td>data 2</td></tr>
+    </table>
+  </div>
+  <a href="/2">Page two</a>
+</body>
+</html>

data/spec/fakeweb_fixtures/fake_form_action.html ADDED

@@ -0,0 +1,14 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+   "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+	<title>Fake Page</title>
+	<meta name="generator" content="TextMate http://macromates.com/">
+	<meta name="author" content="JT Zemp">
+</head>
+<body>
+  Form Post!
+</body>
+</html>

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,12 @@
+require 'rubygems'
+require 'spec'
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'webrat_scraper'
+require 'fakeweb'
+def open_fixture(fakeweb_fixture_name)
+  open(File.join(File.dirname(__FILE__), 'fakeweb_fixtures', fakeweb_fixture_name.to_s)).read
+end

data/spec/webrat_scraper_spec.rb ADDED

@@ -0,0 +1,139 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe WebratScraper do
+  describe "instance methods" do
+    before(:all) do
+      FakeWeb.register_uri(:get, "http://example.com/", :body => "Hello World!")
+    end
+    before(:each) do
+      @session = WebratScraper.new
+    end
+    describe "#doc" do
+      it "is a Nokogiri object of of the response body" do
+        @session.visit "http://www.example.com/"
+        @session.doc.class.should == Nokogiri::HTML::Document
+      end
+    end
+    describe "#user_agent=" do
+      it "sets the user_agent" do
+        @session.user_agent = user_agent = "Test User Agent 1.0"
+        @session.visit "http://www.example.com/"
+        @session.user_agent.should == user_agent
+        @session.mechanize.user_agent.should == user_agent
+      end
+    end
+  end
+  describe "webrat's methods" do
+    before(:all) do
+      FakeWeb.register_uri(:get,  "http://example.com/", :body => "Hello World!")
+      FakeWeb.register_uri(:get,  "http://example.com/1", :body => "Hello World 1!")
+      FakeWeb.register_uri(:get,  "http://example.com/2", :body => "Hello World 2!")
+      FakeWeb.register_uri(:get,  "http://example.com/form", :body => open_fixture("fake_form.html"))
+      FakeWeb.register_uri(:post, "http://example.com/action", :body => open_fixture("fake_form_action.html"))
+    end
+    before(:each) do
+      @session = WebratScraper.new
+    end
+    describe "#visit" do
+      it "visits a webpage and updates the context" do
+        @session.visit("http://example.com/")
+        @session.doc.inner_text.should == "Hello World!"
+        @session.visit("http://example.com/1")
+        @session.doc.inner_text.should == "Hello World 1!"
+      end
+    end
+    describe "#fill_in" do
+      it "fills in a form field with a value" do
+        @session.visit "http://example.com/form"
+        lambda {@session.fill_in "Username", :with => "who"}.should_not raise_error
+      end
+    end
+    describe "#click_link" do
+      it "clicks a link" do
+        @session.visit "http://example.com/form"
+        @session.click_link "Page two"
+        @session.doc.inner_text.should == "Hello World 2!"
+      end
+    end
+    describe "#click_button" do
+      it "clicks a button" do
+        @session.visit "http://example.com/form"
+        @session.click_button "Save"
+        (@session.doc/"body").inner_text.strip.should == "Form Post!"
+      end
+    end
+    describe "#assert_contain" do
+      it "asserts that the page contains the string given" do
+        @session.visit "http://example.com/form"
+        lambda { @session.assert_contain("Lorem ipsum dolor sit amet") }.should_not raise_error
+      end
+    end
+  end
+end
+# Spec these out
+# assert_contain (Webrat::Matchers)
+# assert_have_no_selector (Webrat::Matchers)
+# assert_have_no_tag (Webrat::HaveTagMatcher)
+# assert_have_no_xpath (Webrat::Matchers)
+# assert_have_selector (Webrat::Matchers)
+# assert_have_tag (Webrat::HaveTagMatcher)
+# assert_have_xpath (Webrat::Matchers)
+# assert_not_contain (Webrat::Matchers)
+# attach_file (Webrat::Scope)
+# automate (Webrat::Session)
+# basic_auth (Webrat::Session)
+# check (Webrat::Scope)
+# check_for_infinite_redirects (Webrat::Session)
+# choose (Webrat::Scope)
+# click_area (Webrat::Scope)
+# click_button (Webrat::Scope)
+# click_link (Webrat::Scope)
+# click_link_within (Webrat::Session)
+# contain (Webrat::Matchers)
+# dom (Webrat::Session)
+# field_by_xpath (Webrat::Locators)
+# field_labeled (Webrat::Locators)
+# field_named (Webrat::Locators)
+# field_with_id (Webrat::Locators)
+# fill_in (Webrat::Scope)
+# have_selector (Webrat::Matchers)
+# have_tag (Webrat::HaveTagMatcher)
+# have_xpath (Webrat::Matchers)
+# header (Webrat::Session)
+# http_accept (Webrat::Session)
+# infinite_redirect_limit_exceeded? (Webrat::Session)
+# internal_redirect? (Webrat::Session)
+# match_selector (Webrat::Matchers)
+# match_tag (Webrat::HaveTagMatcher)
+# match_xpath (Webrat::Matchers)
+# mode= (Webrat::Configuration)
+# open_in_browser (Webrat::SaveAndOpenPage)
+# redirected_to (Webrat::Session)
+# reload (Webrat::Session)
+# save_and_open_page (Webrat::SaveAndOpenPage)
+# scoped_dom (Webrat::Scope)
+# select (Webrat::Scope)
+# select_date (Webrat::Scope)
+# select_datetime (Webrat::Scope)
+# select_time (Webrat::Scope)
+# set_hidden_field (Webrat::Scope)
+# simulate (Webrat::Session)
+# submit_form (Webrat::Scope)
+# uncheck (Webrat::Scope)
+# visit (Webrat::Session)
+# within (Webrat::Session)
+# xml_content_type? (Webrat::Session)

metadata ADDED

@@ -0,0 +1,104 @@
+--- !ruby/object:Gem::Specification
+name: webrat-scraper
+version: !ruby/object:Gem::Version
+  version: 0.1.4
+platform: ruby
+authors:
+- JT Zemp
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-10-10 00:00:00 -06:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+    version:
+- !ruby/object:Gem::Dependency
+  name: webrat
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "="
+      - !ruby/object:Gem::Version
+        version: 0.4.5
+    version:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: fakeweb
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: A web scraper using Webrat & Mechanize
+email: jtzemp@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- .document
+- .gitignore
+- LICENSE
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/webrat_scraper.rb
+- spec/fakeweb_fixtures/fake_form.html
+- spec/fakeweb_fixtures/fake_form_action.html
+- spec/spec_helper.rb
+- spec/webrat_scraper_spec.rb
+has_rdoc: true
+homepage: http://github.com/jtzemp/webrat-scraper
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: A web scraper using Webrat & Mechanize - does acceptance-based web scraping relying on the great webrat behavior description language.
+test_files:
+- spec/spec_helper.rb
+- spec/webrat_scraper_spec.rb