webrat-scraper 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ webrat.log
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 JT Zemp
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,59 @@
1
+ = webrat-scraper
2
+
3
+ A web scraper built on Webrat::Mechanize that traverses the web and allows
4
+ access to the webrat session through Mechanize and Nokogiri objects
5
+
6
+ == How to use
7
+
8
+ === Install
9
+ gem sources -a http://gems.github.com
10
+ gem install jtzemp-webrat-scraper
11
+
12
+ === Use
13
+
14
+ require 'webrat_scraper'
15
+
16
+ class MyScraper < WebratScraper
17
+ def initialize
18
+ super
+ @url = "http://www.google.com"
19
+ end
20
+
21
+ def first_result_for(search_term)
22
+ visit @url
23
+
24
+ fill_in "Google Search", :with => search_term
25
+ click_button
26
+
27
+ link = {}
28
+ link[:html] = (doc/"li.g a.l").first
29
+ link[:text] = link[:html].inner_text
30
+ link[:url] = link[:html].attributes["href"].to_s
31
+ link
32
+ end
33
+ end
34
+
35
+ m = MyScraper.new
36
+ result = m.first_result_for("webrat-mechanize")
37
+ puts result.inspect
38
+
39
+ For more info check out documentation for:
40
+ * Webrat
41
+ * Mechanize
42
+ * Nokogiri
43
+ * CSS Selectors
44
+ * XPath Selectors
45
+
46
+ == Note on Patches/Pull Requests
47
+
48
+ * Fork the project.
49
+ * Make your feature addition or bug fix.
50
+ * Add specs for it. This is important so I don't break it in a
51
+ future version unintentionally.
52
+ * Commit, do not mess with rakefile, version, or history.
53
+ (if you want to have your own version, that is fine but
54
+ bump version in a commit by itself I can ignore when I pull)
55
+ * Send me a pull request. Bonus points for topic branches.
56
+
57
+ == Copyright
58
+
59
+ Copyright (c) 2009 JT Zemp. See LICENSE for details.
@@ -0,0 +1,63 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Describe the gem to Jeweler.  Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded, a hint is printed and the rest of the
# Rakefile still loads.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    # `spec` is a Gem::Specification... see
    # http://www.rubygems.org/read/chapter/20 for additional settings.
    spec.name        = "webrat-scraper"
    spec.authors     = ["JT Zemp"]
    spec.email       = "jtzemp@gmail.com"
    spec.homepage    = "http://github.com/jtzemp/webrat-scraper"
    spec.summary     = "A web scraper using Webrat & Mechanize - does acceptance-based web scraping relying on the great webrat behavior description language."
    spec.description = "A web scraper using Webrat & Mechanize"

    # Runtime dependencies; webrat is pinned to one exact release.
    spec.add_dependency("mechanize", ">= 0.9.3")
    spec.add_dependency("webrat", "= 0.4.5")

    # Only needed to run the specs.
    %w[rspec fakeweb].each do |dev_gem|
      spec.add_development_dependency(dev_gem)
    end
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end
23
+
24
# The project's tests are RSpec 1.x specs under spec/ (spec_helper loads the
# `spec` library), so they are run through Spec::Rake::SpecTask.  A
# Rake::TestTask / Rcov::RcovTask globbing test/**/*_test.rb matches no file
# shipped with this gem and would silently run nothing.
begin
  require 'spec/rake/spectask'

  # `rake spec` - run every spec with lib/ and spec/ on the load path.
  Spec::Rake::SpecTask.new(:spec) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.spec_files = FileList['spec/**/*_spec.rb']
  end

  # `rake rcov` - the same specs, executed under rcov for a coverage report.
  Spec::Rake::SpecTask.new(:rcov) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.pattern = 'spec/**/*_spec.rb'
    spec.rcov = true
  end
rescue LoadError
  # RSpec is only a development dependency; keep the task names defined so
  # invoking them explains what is missing instead of "don't know how to build".
  %w[spec rcov].each do |task_name|
    task task_name do
      abort "RSpec is not available. In order to run #{task_name}, you must: sudo gem install rspec"
    end
  end
end

# `rake test` is kept as an alias so existing invocations keep working.
task :test => :spec

task :default => :spec
48
+
49
require 'rake/rdoctask'

# `rake rdoc` - generate API documentation into rdoc/ from the README and
# everything under lib/.
Rake::RDocTask.new do |rdoc|
  # The gem ships its release number in a plain-text VERSION file.  The
  # VERSION.yml layout (major/minor/patch keys) is still honoured as a
  # fallback; without either file the title simply carries no version.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml' # only this branch needs YAML, so load it on demand
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "webrat-scraper #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
63
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.4
@@ -0,0 +1,30 @@
1
+ require 'rubygems'
2
+ gem 'webrat', '=0.4.5'
3
+ require 'webrat'
4
+ require 'webrat/mechanize'
5
+ require 'test/unit/assertions'
6
+
7
# A Webrat::MechanizeSession subclass intended for scraping.  Webrat's
# matchers and Test::Unit's assertions are mixed in so they can be called
# directly on a scraper instance.
class WebratScraper < Webrat::MechanizeSession
  include Webrat::Matchers
  include Test::Unit::Assertions

  # context - passed through unchanged to the superclass constructor.
  def initialize(context = nil)
    super
  end

  # The user agent string for this session.  Until one is assigned through
  # #user_agent=, it is "webrat-scraper " followed by mechanize's own user
  # agent, built on first call and cached.
  #
  # NOTE(review): the default is only cached here and is not assigned to
  # mechanize, so requests presumably keep mechanize's own agent string until
  # #user_agent= is called -- confirm this is intended.
  def user_agent
    return @user_agent if @user_agent

    @user_agent = "webrat-scraper " + mechanize.user_agent
  end

  # Assigns new_user_agent to mechanize and remembers it for #user_agent.
  def user_agent=(new_user_agent)
    mechanize.user_agent = new_user_agent
    @user_agent = new_user_agent
  end

  # the Nokogiri object for the response body for the session's current state.
  alias_method :doc, :dom

  # Disabled convenience wrapper, kept for reference:
  #
  #   def post(url, data={})
  #     visit(url, :post, data)
  #   end
end
@@ -0,0 +1,47 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2
+ "http://www.w3.org/TR/html4/loose.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>Fake Page</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="JT Zemp">
10
+ </head>
11
+ <body>
12
+ <a href="#oblivion">Go to oblivion</a>
13
+ <form action="/action" method="POST" accept-charset="utf-8">
14
+ <label for="username">Username</label><input type="text" name="username" value="" id="username">
15
+ <label for="password">Password</label><input type="password" name="password" value="" id="password">
16
+
17
+ <textarea name="Notes" rows="8" cols="40"></textarea>
18
+
19
+ <select name="roles" id="roles" multiple onchange="" size="4">
20
+ <option value="admin">Administrator</option>
21
+ <option value="normal">Normal User</option>
22
+ </select>
23
+
24
+ <label for="on_fire">On fire?</label><input type="checkbox" name="on_fire" value="" id="on_fire">
25
+
26
+ <label for="tapatio">Tapatio</label><input type="radio" name="sauces" value="Tapatio" id="tapatio">
27
+ <label for="franks">Frank's Red Hot</label><input type="radio" name="sauces" value="Franks" id="franks">
28
+ <label for="tabasco">Tabasco</label><input type="radio" name="sauces" value="Tabasco" id="tabasco">
29
+
30
+ <p><input type="submit" value="Save" name="Save"></p>
31
+ </form>
32
+
33
+ <br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br />
34
+ <a name="oblivion" id="oblivion">Oblivion</a>
35
+ <div id="lorem">
36
+ <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, <span id="sed-do">sed do eiusmod tempor</span> incididunt ut <span class="labore">labore</span> et dolore magna aliqua.</p> Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
37
+ </div>
38
+ <div id="table">
39
+ <table border="0" cellspacing="5" cellpadding="5">
40
+ <tr><th>Header</th></tr>
41
+ <tr><td>data 1</td></tr>
42
+ <tr><td>data 2</td></tr>
43
+ </table>
44
+ </div>
45
+ <a href="/2">Page two</a>
46
+ </body>
47
+ </html>
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2
+ "http://www.w3.org/TR/html4/loose.dtd">
3
+
4
+ <html lang="en">
5
+ <head>
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
7
+ <title>Fake Page</title>
8
+ <meta name="generator" content="TextMate http://macromates.com/">
9
+ <meta name="author" content="JT Zemp">
10
+ </head>
11
+ <body>
12
+ Form Post!
13
+ </body>
14
+ </html>
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+
4
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
5
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
6
+ require 'webrat_scraper'
7
+
8
+ require 'fakeweb'
9
+
10
# Returns the contents of a canned HTTP response body stored in the
# fakeweb_fixtures directory that sits next to this file.
#
# fakeweb_fixture_name - file name inside fakeweb_fixtures; converted with
#                        #to_s, so a String or a Symbol both work.
#
# Raises Errno::ENOENT when no such fixture file exists.
#
# File.read is used instead of Kernel#open(...).read so the file handle is
# closed as soon as the contents are read, and so the path is always treated
# as a plain file name.
def open_fixture(fakeweb_fixture_name)
  fixture_path = File.join(File.dirname(__FILE__), 'fakeweb_fixtures', fakeweb_fixture_name.to_s)
  File.read(fixture_path)
end
@@ -0,0 +1,139 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
# Specs for WebratScraper.  Every HTTP request is answered by FakeWeb, so each
# example must visit exactly the URLs registered in its before(:all) block;
# any other host (for instance www.example.com) is not stubbed and would be
# fetched from the live network.
describe WebratScraper do
  describe "instance methods" do
    before(:all) do
      FakeWeb.register_uri(:get, "http://example.com/", :body => "Hello World!")
    end

    before(:each) do
      @session = WebratScraper.new
    end

    describe "#doc" do
      it "is a Nokogiri object of the response body" do
        @session.visit "http://example.com/"
        @session.doc.class.should == Nokogiri::HTML::Document
      end
    end

    describe "#user_agent=" do
      it "sets the user_agent" do
        @session.user_agent = user_agent = "Test User Agent 1.0"
        @session.visit "http://example.com/"
        # Both the scraper and the underlying mechanize agent must agree.
        @session.user_agent.should == user_agent
        @session.mechanize.user_agent.should == user_agent
      end
    end
  end

  describe "webrat's methods" do
    before(:all) do
      FakeWeb.register_uri(:get, "http://example.com/", :body => "Hello World!")
      FakeWeb.register_uri(:get, "http://example.com/1", :body => "Hello World 1!")
      FakeWeb.register_uri(:get, "http://example.com/2", :body => "Hello World 2!")
      FakeWeb.register_uri(:get, "http://example.com/form", :body => open_fixture("fake_form.html"))
      FakeWeb.register_uri(:post, "http://example.com/action", :body => open_fixture("fake_form_action.html"))
    end

    before(:each) do
      @session = WebratScraper.new
    end

    describe "#visit" do
      it "visits a webpage and updates the context" do
        @session.visit("http://example.com/")
        @session.doc.inner_text.should == "Hello World!"

        @session.visit("http://example.com/1")
        @session.doc.inner_text.should == "Hello World 1!"
      end
    end

    describe "#fill_in" do
      it "fills in a form field with a value" do
        @session.visit "http://example.com/form"
        lambda {@session.fill_in "Username", :with => "who"}.should_not raise_error
      end
    end

    describe "#click_link" do
      it "clicks a link" do
        # The fixture's "Page two" link points at /2, registered above.
        @session.visit "http://example.com/form"
        @session.click_link "Page two"
        @session.doc.inner_text.should == "Hello World 2!"
      end
    end

    describe "#click_button" do
      it "clicks a button" do
        # The fixture's form posts to /action, registered above.
        @session.visit "http://example.com/form"
        @session.click_button "Save"
        (@session.doc/"body").inner_text.strip.should == "Form Post!"
      end
    end

    describe "#assert_contain" do
      it "asserts that the page contains the string given" do
        @session.visit "http://example.com/form"
        lambda { @session.assert_contain("Lorem ipsum dolor sit amet") }.should_not raise_error
      end
    end
  end
end
84
+
85
+ # Spec these out
86
+ # assert_contain (Webrat::Matchers)
87
+ # assert_have_no_selector (Webrat::Matchers)
88
+ # assert_have_no_tag (Webrat::HaveTagMatcher)
89
+ # assert_have_no_xpath (Webrat::Matchers)
90
+ # assert_have_selector (Webrat::Matchers)
91
+ # assert_have_tag (Webrat::HaveTagMatcher)
92
+ # assert_have_xpath (Webrat::Matchers)
93
+ # assert_not_contain (Webrat::Matchers)
94
+ # attach_file (Webrat::Scope)
95
+ # automate (Webrat::Session)
96
+ # basic_auth (Webrat::Session)
97
+ # check (Webrat::Scope)
98
+ # check_for_infinite_redirects (Webrat::Session)
99
+ # choose (Webrat::Scope)
100
+ # click_area (Webrat::Scope)
101
+ # click_button (Webrat::Scope)
102
+ # click_link (Webrat::Scope)
103
+ # click_link_within (Webrat::Session)
104
+ # contain (Webrat::Matchers)
105
+ # dom (Webrat::Session)
106
+ # field_by_xpath (Webrat::Locators)
107
+ # field_labeled (Webrat::Locators)
108
+ # field_named (Webrat::Locators)
109
+ # field_with_id (Webrat::Locators)
110
+ # fill_in (Webrat::Scope)
111
+ # have_selector (Webrat::Matchers)
112
+ # have_tag (Webrat::HaveTagMatcher)
113
+ # have_xpath (Webrat::Matchers)
114
+ # header (Webrat::Session)
115
+ # http_accept (Webrat::Session)
116
+ # infinite_redirect_limit_exceeded? (Webrat::Session)
117
+ # internal_redirect? (Webrat::Session)
118
+ # match_selector (Webrat::Matchers)
119
+ # match_tag (Webrat::HaveTagMatcher)
120
+ # match_xpath (Webrat::Matchers)
121
+ # mode= (Webrat::Configuration)
122
+ # open_in_browser (Webrat::SaveAndOpenPage)
123
+ # redirected_to (Webrat::Session)
124
+ # reload (Webrat::Session)
125
+ # save_and_open_page (Webrat::SaveAndOpenPage)
126
+ # scoped_dom (Webrat::Scope)
127
+ # select (Webrat::Scope)
128
+ # select_date (Webrat::Scope)
129
+ # select_datetime (Webrat::Scope)
130
+ # select_time (Webrat::Scope)
131
+ # set_hidden_field (Webrat::Scope)
132
+ # simulate (Webrat::Session)
133
+ # submit_form (Webrat::Scope)
134
+ # uncheck (Webrat::Scope)
135
+ # visit (Webrat::Session)
136
+ # within (Webrat::Session)
137
+ # xml_content_type? (Webrat::Session)
138
+
139
+
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webrat-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.4
5
+ platform: ruby
6
+ authors:
7
+ - JT Zemp
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-10-10 00:00:00 -06:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.9.3
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: webrat
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.5
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rspec
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: fakeweb
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ description: A web scraper using Webrat & Mechanize
56
+ email: jtzemp@gmail.com
57
+ executables: []
58
+
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.rdoc
64
+ files:
65
+ - .document
66
+ - .gitignore
67
+ - LICENSE
68
+ - README.rdoc
69
+ - Rakefile
70
+ - VERSION
71
+ - lib/webrat_scraper.rb
72
+ - spec/fakeweb_fixtures/fake_form.html
73
+ - spec/fakeweb_fixtures/fake_form_action.html
74
+ - spec/spec_helper.rb
75
+ - spec/webrat_scraper_spec.rb
76
+ has_rdoc: true
77
+ homepage: http://github.com/jtzemp/webrat-scraper
78
+ post_install_message:
79
+ rdoc_options:
80
+ - --charset=UTF-8
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: "0"
88
+ version:
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: "0"
94
+ version:
95
+ requirements: []
96
+
97
+ rubyforge_project:
98
+ rubygems_version: 1.3.1
99
+ signing_key:
100
+ specification_version: 2
101
+ summary: A web scraper using Webrat & Mechanize - does acceptance-based web scraping relying on the great webrat behavior description language.
102
+ test_files:
103
+ - spec/spec_helper.rb
104
+ - spec/webrat_scraper_spec.rb