horseman 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Bundler manifest used while developing the horseman gem.

# `source :rubygems` is a deprecated Bundler alias that resolves to the plain
# HTTP index; name the HTTPS endpoint explicitly instead.
source 'https://rubygems.org'

# Take the gem's own dependencies from horseman.gemspec.
gemspec

# Tooling required by the Rakefile and the specs.
gem "echoe", "~>4.6.3"
gem "rspec", "~>2.7"
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ horseman (0.0.1)
5
+ nokogiri (>= 1.5.0)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ allison (2.0.3)
11
+ diff-lcs (1.1.3)
12
+ echoe (4.6.3)
13
+ allison (>= 2.0.3)
14
+ gemcutter (>= 0.7.0)
15
+ rake (>= 0.9.2)
16
+ rdoc (>= 3.6.1)
17
+ rubyforge (>= 2.0.4)
18
+ gemcutter (0.7.0)
19
+ json (1.6.2)
20
+ json_pure (1.6.2)
21
+ nokogiri (1.5.0)
22
+ rake (0.9.2.2)
23
+ rdoc (3.11)
24
+ json (~> 1.4)
25
+ rspec (2.7.0)
26
+ rspec-core (~> 2.7.0)
27
+ rspec-expectations (~> 2.7.0)
28
+ rspec-mocks (~> 2.7.0)
29
+ rspec-core (2.7.1)
30
+ rspec-expectations (2.7.0)
31
+ diff-lcs (~> 1.1.2)
32
+ rspec-mocks (2.7.0)
33
+ rubyforge (2.0.4)
34
+ json_pure (>= 1.1.7)
35
+
36
+ PLATFORMS
37
+ ruby
38
+
39
+ DEPENDENCIES
40
+ echoe (~> 4.6.3)
41
+ horseman!
42
+ rspec (~> 2.7)
data/Manifest ADDED
@@ -0,0 +1,20 @@
1
+ Gemfile
2
+ Gemfile.lock
3
+ Manifest
4
+ README.rdoc
5
+ Rakefile
6
+ horseman.gemspec
7
+ lib/horseman.rb
8
+ lib/horseman/browser.rb
9
+ lib/horseman/connection.rb
10
+ lib/horseman/cookies.rb
11
+ lib/horseman/hidden_fields.rb
12
+ lib/horseman/response.rb
13
+ lib/horseman/version.rb
14
+ spec/horseman/browser_spec.rb
15
+ spec/horseman/connection_spec.rb
16
+ spec/horseman/cookies_spec.rb
17
+ spec/horseman/hidden_fields_spec.rb
18
+ spec/horseman/response_spec.rb
19
+ spec/mocks.rb
20
+ spec/spec_helper.rb
data/README.rdoc ADDED
@@ -0,0 +1,3 @@
1
+ = Horseman
2
+
3
+ Headless HTTP crawler/scraper for web applications built with ASP.NET WebForms.
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ require 'rake'
2
+ require 'rspec/core/rake_task'
3
+ require 'echoe'
4
+
5
# Describe the gem to Echoe.
Echoe.new("horseman", "0.0.1") do |gem|
  gem.description = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"
  gem.url = "http://jarrodpeace.com"
  gem.author = "Jarrod Peace"
  gem.email = "peace.jarrod@gmail.com"
  gem.ignore_pattern = FileList[".gitignore"]
  gem.development_dependencies = []
  gem.runtime_dependencies = ["nokogiri >=1.5.0"]
end

# Load every tasks/*.rake file that sits next to this Rakefile, in name order.
Dir[File.join(File.dirname(__FILE__), "tasks", "*.rake")].sort.each do |rake_file|
  load rake_file
end


desc "Default task - runs specs"
task :default => :spec

desc "Run specs"
RSpec::Core::RakeTask.new do |spec_task|
  spec_task.rspec_opts = '-cfd'
end
data/horseman.gemspec ADDED
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for horseman 0.0.1.
Gem::Specification.new do |s|
  s.name = "horseman"
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jarrod Peace"]
  s.date = "2012-01-06"
  s.description = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"
  s.email = "peace.jarrod@gmail.com"
  s.extra_rdoc_files = [
    "README.rdoc",
    "lib/horseman.rb",
    "lib/horseman/browser.rb",
    "lib/horseman/connection.rb",
    "lib/horseman/cookies.rb",
    "lib/horseman/hidden_fields.rb",
    "lib/horseman/response.rb",
    "lib/horseman/version.rb"
  ]
  s.files = [
    "Gemfile",
    "Gemfile.lock",
    "Manifest",
    "README.rdoc",
    "Rakefile",
    "horseman.gemspec",
    "lib/horseman.rb",
    "lib/horseman/browser.rb",
    "lib/horseman/connection.rb",
    "lib/horseman/cookies.rb",
    "lib/horseman/hidden_fields.rb",
    "lib/horseman/response.rb",
    "lib/horseman/version.rb",
    "spec/horseman/browser_spec.rb",
    "spec/horseman/connection_spec.rb",
    "spec/horseman/cookies_spec.rb",
    "spec/horseman/hidden_fields_spec.rb",
    "spec/horseman/response_spec.rb",
    "spec/mocks.rb",
    "spec/spec_helper.rb"
  ]
  s.homepage = "http://jarrodpeace.com"
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Horseman", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  # The rubyforge_project setter is deprecated in newer RubyGems releases, so
  # guard it the same way the other version-dependent setters here are guarded.
  s.rubyforge_project = "horseman" if s.respond_to? :rubyforge_project=
  s.rubygems_version = "1.8.10"
  s.summary = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"

  # Very old RubyGems releases lack specification_version and
  # add_runtime_dependency; fall back to add_dependency for them.
  versioned = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned

  if versioned && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
  else
    s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
  end
end
@@ -0,0 +1,32 @@
1
require 'horseman/cookies'
require 'horseman/response'
2
+
3
module Horseman
  # Issues requests through a connection object while carrying cookies from
  # one response over to the next request.
  class Browser
    attr_accessor :base_url
    attr_reader :cookies, :connection, :last_response

    # connection - object responding to build_request(options) and
    #              exec_request(request).
    # base_url   - String prepended to every path given to #get!.
    def initialize(connection, base_url='')
      @connection = connection
      @base_url = base_url
      @cookies = Horseman::Cookies.new
      @last_response = nil
    end

    # Forgets every stored cookie.
    def clear_session
      @cookies.clear
    end

    # Performs a GET for base_url + path and returns the wrapped
    # Horseman::Response (also available afterwards as #last_response).
    def get!(path = '/')
      request = @connection.build_request(:url => "#{@base_url}#{path}", :verb => :get)
      exec(request)
    end

    private

    # Attaches the stored cookies, runs the request, absorbs any Set-Cookie
    # headers from the reply and records the response body.
    def exec(request)
      cookie_header = @cookies.to_s
      # Do not send a blank "Cookie:" header when nothing is stored yet.
      request['cookie'] = cookie_header unless cookie_header.empty?
      response = @connection.exec_request(request)
      @cookies.update(response.get_fields('set-cookie'))
      @last_response = Horseman::Response.new(response.body)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'uri'
2
+ require 'net/http'
3
+ require 'net/https'
4
+
5
module Horseman
  # Small wrapper around Net::HTTP that remembers the current URL and builds
  # GET/POST request objects for it.
  class Connection
    attr_reader :http

    # Parses +url+ and prepares a matching Net::HTTP object (see #http).
    def url=(url)
      @uri = URI.parse(url)
      build_http
    end

    # Sends +request+ over the current Net::HTTP object and returns its reply.
    def exec_request(request)
      @http.request(request)
    end

    # Builds a request for the current URL.
    #
    # options[:url]  - optional String; becomes the current URL first.
    # options[:verb] - :get or omitted for a GET; anything else builds a POST.
    # options[:form] - optional Hash of form fields for a POST body.
    #
    # Raises ArgumentError when no URL has been set yet.
    def build_request(options={})
      self.url = options[:url] unless options[:url].nil?
      raise ArgumentError, 'no URL has been set for this connection' if @uri.nil?

      verb = options[:verb]
      # The previous `verb == (:get || nil)` reduced to `verb == :get`, so a
      # missing verb silently produced a POST.
      if verb.nil? || verb == :get
        build_get_request
      else
        build_post_request(options[:form])
      end
    end

    private

    def build_http
      @http = Net::HTTP.new(@uri.host, @uri.port)
      # Honour the scheme as well as the port so https on a non-default port
      # is not sent in cleartext; port 443 keeps its previous meaning.
      @http.use_ssl = true if @uri.scheme == 'https' || @uri.port == 443
    end

    def build_get_request
      Net::HTTP::Get.new(@uri.request_uri)
    end

    def build_post_request(form)
      post = Net::HTTP::Post.new(@uri.request_uri)
      post.form_data = form unless form.nil?
      post
    end
  end
end
@@ -0,0 +1,80 @@
1
require 'date' # DateTime is used below for cookie expiry

module Horseman
  # One cookie value plus the attributes recognised from its Set-Cookie header.
  class Cookie
    attr_reader :value, :domain, :path, :expiration

    SECONDS_PER_DAY = 60 * 60 * 24

    # value      - String cookie value.
    # attributes - Array of "Name=value" attribute strings; entries that are
    #              not name/value pairs or are not recognised are ignored.
    def initialize(value, attributes)
      @value = value
      attributes.each {|a| parse_attribute(a)}
    end

    private

    def parse_attribute(attribute)
      # Attribute names may contain a hyphen ("Max-Age"), which \w+ alone
      # cannot match.
      md = /\A\s*([\w-]+)\s*=\s*(.*?)\s*\z/.match(attribute)
      return if md.nil?

      name, raw = md.captures
      case name.downcase
      when 'domain'
        @domain = raw
      when 'path'
        @path = raw
      when 'expires'
        @expiration = DateTime.parse(raw)
      when 'max-age'
        # Max-Age is a number of seconds; DateTime arithmetic works in days.
        if raw =~ /\A-?\d+\z/
          @expiration = DateTime.now + Rational(raw.to_i, SECONDS_PER_DAY)
        end
      end
    end
  end

  # Name-indexed collection of Cookie objects built from Set-Cookie headers.
  class Cookies
    def initialize
      clear
    end

    # Returns the value of the named cookie, or nil when it is not stored.
    def [](cookie_name)
      cookie = @dict[cookie_name]
      cookie.value unless cookie.nil?
    end

    # Returns the Cookie object for the name, or nil when it is not stored.
    def get(cookie_name)
      @dict[cookie_name]
    end

    def clear
      @dict = {}
    end

    def count
      @dict.count
    end

    def empty?
      @dict.empty?
    end

    # Renders the stored cookies as "name=value; name=value".
    def to_s
      @dict.map {|k,v| "#{k}=#{v.value}"}.join('; ')
    end

    # Merges one header String, an Array of header Strings, or nil (a no-op)
    # into the collection; later values replace earlier ones with the same
    # name. Returns self. Raises ArgumentError for a header that has no
    # leading name=value pair.
    def update(header)
      if header.is_a?(Array)
        header.each {|h| parse_header(h)}
      else
        parse_header(header) unless header.nil?
      end
      self
    end

    private

    def parse_header(header)
      nvp, *attributes = header.split(';')
      raise ArgumentError, 'empty cookie header' if nvp.nil?

      # Cookie names are not limited to word characters: with \w+ the name
      # "ASP.NET_SessionId" was stored as "NET_SessionId".
      md = /([^=;\s]+)=(.*)/.match(nvp)
      raise ArgumentError, "unrecognized cookie header: #{header.inspect}" if md.nil?

      @dict[md[1]] = Horseman::Cookie.new(md[2], attributes)
    end
  end
end
@@ -0,0 +1,13 @@
1
module Horseman
  # Collects the name/value pairs of every <input type="hidden"> in a piece
  # of HTML into #tokens (a Hash of name => value).
  class HiddenFields
    attr_reader :tokens

    # One <input ...> tag; quoted attribute values may contain ">".
    INPUT_TAG = /<input\b(?:[^>"']|"[^"]*"|'[^']*')*>/im
    # One quoted attribute: name, then the double- or single-quoted value.
    ATTRIBUTE = /([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/m

    # html - String of markup to search.
    #
    # Each tag is examined on its own, so attribute order, several inputs on
    # one line, values containing spaces and tags without a trailing " />"
    # are all handled. Hidden inputs without a name are skipped; a missing
    # value attribute is recorded as an empty string.
    def initialize(html)
      @tokens = {}
      html.scan(INPUT_TAG) do |tag|
        attrs = {}
        tag.scan(ATTRIBUTE) do |name, double_quoted, single_quoted|
          attrs[name.downcase] ||= (double_quoted || single_quoted)
        end
        next unless attrs['type'].to_s.downcase == 'hidden'
        next if attrs['name'].to_s.empty?
        @tokens[attrs['name']] = attrs['value'].to_s
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Horseman
  # Base class for parsed page elements.
  class Element
    attr_accessor :id, :name
  end

  # A <form> found in a response body.
  class Form < Element
  end

  # A field belonging to a form.
  class FormField < Element
    attr_accessor :type, :value
  end

  # Wraps a response body and exposes the forms found in it.
  class Response
    attr_reader :body, :forms

    # Opening <form ...> tag; the capture holds its raw attribute text.
    FORM_TAG = /<form\b((?:[^>"']|"[^"]*"|'[^']*')*)>/im
    # One quoted attribute: name, then the double- or single-quoted value.
    ATTRIBUTE = /([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/m

    # body - String of HTML (nil is treated as an empty document).
    def initialize(body)
      @body = body
      @forms = []
      parse
    end

    private

    # Fills @forms with one Form per opening <form> tag, in document order,
    # copying the tag's id and name attributes (nil when absent). This was
    # previously an empty stub, so #forms never contained anything.
    def parse
      @body.to_s.scan(FORM_TAG) do |captures|
        attrs = {}
        captures.first.scan(ATTRIBUTE) do |name, double_quoted, single_quoted|
          attrs[name.downcase] ||= (double_quoted || single_quoted)
        end

        form = Form.new
        form.id = attrs['id']
        form.name = attrs['name']
        @forms << form
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the horseman gem.
module Horseman
  # Version string of this release.
  VERSION = '0.0.1'
end
data/lib/horseman.rb ADDED
@@ -0,0 +1 @@
1
+ require 'horseman/version'
@@ -0,0 +1,35 @@
1
+ require 'horseman/browser'
2
+
3
# Specs for Horseman::Browser, run against the stubbed connection from Mocks.
describe Horseman::Browser do
  include Mocks

  subject {described_class.new(connection, 'http://www.example.com')}

  it "saves cookies" do
    subject.cookies.should be_empty

    subject.get!
    subject.cookies.count.should eq 2
    subject.cookies['name1'].should eq 'value1'
    subject.cookies['name2'].should eq 'value2'

    # The follow-up request has to carry the cookies stored above.
    subject.connection.should_receive(:exec_request) do |request|
      request['cookie'].should match(/\w+=\w+; \w+=\w+/)
      request['cookie'].should match(/name1=value1/)
      request['cookie'].should match(/name2=value2/)
      # The block's value is what exec_request returns to the browser, which
      # then calls get_fields and body on it; ending on an expectation would
      # hand back a match result instead of a response.
      response
    end
    subject.get!
  end

  it "empties the cookies when the session is cleared" do
    subject.get!
    subject.cookies.should_not be_empty
    subject.clear_session
    subject.cookies.should be_empty
  end

  it "stores information about the last response" do
    subject.get!
    subject.last_response.body.should eq html
  end
end
@@ -0,0 +1,64 @@
1
+ require 'horseman/connection'
2
+ require 'net/http'
3
+
4
# Specs for Horseman::Connection: request construction and SSL selection.
describe Horseman::Connection do
  subject do
    plain = described_class.new
    plain.url = 'http://www.example.com/some/path'
    plain
  end

  context "when building requests" do
    let(:request) { subject.build_request(:verb => :get) }

    it "uses the proper path" do
      request.path.should eq('/some/path')
    end

    context "using GET" do
      it "uses the proper request type" do
        request.class.should be(Net::HTTP::Get)
      end
    end

    context "using POST" do
      # Overrides the GET request defined by the enclosing context.
      let(:request) { subject.build_request(:verb => :post) }

      it "uses the proper request type" do
        request.class.should be(Net::HTTP::Post)
      end

      context "with form data" do
        let(:request) do
          fields = {:field1=>'value1', :field2=>'value2'}
          subject.build_request(:verb => :post, :form => fields)
        end

        it "properly sets request body" do
          request.body.should eq('field1=value1&field2=value2')
        end
      end

      context "without form data" do
        it "properly sets request body" do
          request.body.should be_nil
        end
      end
    end
  end

  context "when accessed using http" do
    it "does not use SSL" do
      subject.http.use_ssl?.should be_false
    end
  end

  context "when accessed using https" do
    subject do
      secure = described_class.new
      secure.url = 'https://www.example.com'
      secure
    end

    it "uses SSL" do
      subject.http.use_ssl?.should be_true
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'horseman/cookies'
2
+
3
# NOTE(review): looks like leftover scratch code; nothing visible in this spec
# file references Yo. Confirm before relying on it.
class Yo
  # Pretty-prints the string "yo".
  def test
    pp("yo")
  end
end
8
+
9
# Specs for Horseman::Cookies: header parsing, merging and serialisation.
describe Horseman::Cookies do
  let(:simple_header) { 'name1=value1' }
  let(:complex_header) do
    'name2=value2; Domain=www.example.com; Path=/path; Expires=Sun, 1-Jan-2012 00:00:00 GMT'
  end

  it "starts empty" do
    subject.should be_empty
  end

  it "accepts a single header" do
    jar = subject.update(simple_header)
    jar['name1'].should eq('value1')
  end

  it "accepts multiple headers" do
    subject.update([simple_header, complex_header])
    subject['name1'].should eq('value1')
    subject['name2'].should eq('value2')
  end

  it "captures attributes" do
    subject.update(complex_header)
    subject.get('name2').domain.should eq('www.example.com')
    subject.get('name2').path.should eq('/path')
    subject.get('name2').expiration.should eq(DateTime.new(2012, 1, 1, 0, 0, 0, 0))
  end

  it "accepts an empty array" do
    subject.update([]).should be_empty
  end

  it "accepts nil" do
    subject.update(nil).should be_empty
  end

  it "raises an exception on an unrecognized header" do
    expect { subject.update('bad header') }.to raise_error(ArgumentError)
  end

  it "generates a correct header" do
    header = subject.update([simple_header, complex_header]).to_s
    header.should match(/\w+=\w+; \w+=\w+/)
    header.should match(/name1=value1/)
    header.should match(/name2=value2/)
  end

  context "with prexisting values" do
    # Start each example with name1 already stored under a different value.
    subject { described_class.new.update('name1=other_value') }

    it "returns nil for uninitialized values" do
      subject['doesnt_exist'].should be_nil
    end

    it "merges new values" do
      subject.update(complex_header)
      subject['name1'].should eq('other_value')
      subject['name2'].should eq('value2')
    end

    it "overwrites existing values" do
      subject.update(simple_header)
      subject['name1'].should eq('value1')
    end
  end

end
@@ -0,0 +1,44 @@
1
+ require 'horseman/hidden_fields'
2
+
3
# Specs for Horseman::HiddenFields: extracting hidden inputs from markup.
describe Horseman::HiddenFields do

  it "parses a single simple hidden input field" do
    markup = %{<input type="hidden" name="test" value="test_data" />}
    fields = described_class.new(markup)

    fields.tokens.size.should == 1
    fields.tokens['test'].should == 'test_data'
  end

  it "parses a single complex hidden input field" do
    # Unrelated attributes sit before, between and after the relevant ones.
    markup = %{<input attr0="value0" type="hidden" attr1="value1" name="test" attr2="value2" value="test_data" attr3="value3" />}
    fields = described_class.new(markup)

    fields.tokens.size.should == 1
    fields.tokens['test'].should == 'test_data'
  end

  it "parses multiple hidden input fields" do
    markup = %{
      <input type="hidden" name="test" value="test_data" />
      <input type="hidden" name="foo" value="bar" />
      <some other="tag"></some>
      <input type="hidden" name="dee" value="dum" />
    }
    fields = described_class.new(markup)

    fields.tokens.size.should == 3
    fields.tokens['test'].should == 'test_data'
    fields.tokens['foo'].should == 'bar'
    fields.tokens['dee'].should == 'dum'
  end

  it "handles single quotes, too" do
    markup = %{<input type='hidden' name='test' value='test_data' />}
    fields = described_class.new(markup)

    fields.tokens.size.should == 1
    fields.tokens['test'].should == 'test_data'
  end

end
@@ -0,0 +1,13 @@
1
+ require 'horseman/response'
2
+
3
# Specs for Horseman::Response, fed with the canned page from Mocks#html.
describe Horseman::Response do
  include Mocks

  subject { described_class.new(html) }

  it "parses forms" do
    parsed = subject.forms

    parsed.count.should eq(2)
    parsed[0].id.should eq('form1')
    parsed[1].id.should eq('form2')
  end
end
data/spec/mocks.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'horseman/connection'
2
+
3
# Shared fixtures for the specs: a canned page, canned Set-Cookie headers, a
# stubbed HTTP response and a connection whose exec_request returns it.
module Mocks

  # Canned HTML page containing two forms.
  def html
    %{
      <html>
        <head></head>
        <body>
          <form id="form1">
            <input type="text" name="name1" value="value1" />
            <input type="submit" value="OK" />
          </form>
          <form id="form2">
            <input type="text" name="name2" value="value2" />
            <input type="submit" value="OK" />
          </form>
        </body>
      </html>
    }
  end

  # Two Set-Cookie header values that differ only in their name=value pair.
  def cookies
    %w[name1=value1 name2=value2].map do |pair|
      "#{pair}; Domain=www.example.com; Path=/path; Expires=Sun, 1-Jan-2012 00:00:00 GMT"
    end
  end

  # Double standing in for an HTTP response: [] gives the joined header,
  # get_fields gives the header list, body gives the canned page. Any key
  # other than 'set-cookie' yields nil.
  def response
    fake = double("HttpResponse")
    fake.stub(:[]) { |key| cookies.join(', ') if key == 'set-cookie' }
    fake.stub(:get_fields) { |key| cookies if key == 'set-cookie' }
    fake.stub(:body) { html }
    fake
  end

  # Connection whose exec_request is stubbed to return the canned response.
  def connection
    Horseman::Connection.any_instance.stub(:exec_request) { response }
    Horseman::Connection.new
  end
end
@@ -0,0 +1 @@
1
+ require 'mocks'
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: horseman
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jarrod Peace
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &70095638639800 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70095638639800
25
+ description: Headless HTTP crawler/scraper for ASP.NET WebForms applications
26
+ email: peace.jarrod@gmail.com
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files:
30
+ - README.rdoc
31
+ - lib/horseman.rb
32
+ - lib/horseman/browser.rb
33
+ - lib/horseman/connection.rb
34
+ - lib/horseman/cookies.rb
35
+ - lib/horseman/hidden_fields.rb
36
+ - lib/horseman/response.rb
37
+ - lib/horseman/version.rb
38
+ files:
39
+ - Gemfile
40
+ - Gemfile.lock
41
+ - Manifest
42
+ - README.rdoc
43
+ - Rakefile
44
+ - horseman.gemspec
45
+ - lib/horseman.rb
46
+ - lib/horseman/browser.rb
47
+ - lib/horseman/connection.rb
48
+ - lib/horseman/cookies.rb
49
+ - lib/horseman/hidden_fields.rb
50
+ - lib/horseman/response.rb
51
+ - lib/horseman/version.rb
52
+ - spec/horseman/browser_spec.rb
53
+ - spec/horseman/connection_spec.rb
54
+ - spec/horseman/cookies_spec.rb
55
+ - spec/horseman/hidden_fields_spec.rb
56
+ - spec/horseman/response_spec.rb
57
+ - spec/mocks.rb
58
+ - spec/spec_helper.rb
59
+ homepage: http://jarrodpeace.com
60
+ licenses: []
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --line-numbers
64
+ - --inline-source
65
+ - --title
66
+ - Horseman
67
+ - --main
68
+ - README.rdoc
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '1.2'
83
+ requirements: []
84
+ rubyforge_project: horseman
85
+ rubygems_version: 1.8.10
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: Headless HTTP crawler/scraper for ASP.NET WebForms applications
89
+ test_files: []