horseman 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over HTTPS. The `:rubygems` symbol shorthand is deprecated by
# Bundler because it resolves to an insecure plain-HTTP source.
source "https://rubygems.org"

# Pull the gem's own dependencies from horseman.gemspec.
gemspec

# Development-only tooling: echoe drives packaging, rspec runs the specs.
gem "echoe", "~>4.6.3"
gem "rspec", "~>2.7"
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ horseman (0.0.1)
5
+ nokogiri (>= 1.5.0)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ allison (2.0.3)
11
+ diff-lcs (1.1.3)
12
+ echoe (4.6.3)
13
+ allison (>= 2.0.3)
14
+ gemcutter (>= 0.7.0)
15
+ rake (>= 0.9.2)
16
+ rdoc (>= 3.6.1)
17
+ rubyforge (>= 2.0.4)
18
+ gemcutter (0.7.0)
19
+ json (1.6.2)
20
+ json_pure (1.6.2)
21
+ nokogiri (1.5.0)
22
+ rake (0.9.2.2)
23
+ rdoc (3.11)
24
+ json (~> 1.4)
25
+ rspec (2.7.0)
26
+ rspec-core (~> 2.7.0)
27
+ rspec-expectations (~> 2.7.0)
28
+ rspec-mocks (~> 2.7.0)
29
+ rspec-core (2.7.1)
30
+ rspec-expectations (2.7.0)
31
+ diff-lcs (~> 1.1.2)
32
+ rspec-mocks (2.7.0)
33
+ rubyforge (2.0.4)
34
+ json_pure (>= 1.1.7)
35
+
36
+ PLATFORMS
37
+ ruby
38
+
39
+ DEPENDENCIES
40
+ echoe (~> 4.6.3)
41
+ horseman!
42
+ rspec (~> 2.7)
data/Manifest ADDED
@@ -0,0 +1,20 @@
1
+ Gemfile
2
+ Gemfile.lock
3
+ Manifest
4
+ README.rdoc
5
+ Rakefile
6
+ horseman.gemspec
7
+ lib/horseman.rb
8
+ lib/horseman/browser.rb
9
+ lib/horseman/connection.rb
10
+ lib/horseman/cookies.rb
11
+ lib/horseman/hidden_fields.rb
12
+ lib/horseman/response.rb
13
+ lib/horseman/version.rb
14
+ spec/horseman/browser_spec.rb
15
+ spec/horseman/connection_spec.rb
16
+ spec/horseman/cookies_spec.rb
17
+ spec/horseman/hidden_fields_spec.rb
18
+ spec/horseman/response_spec.rb
19
+ spec/mocks.rb
20
+ spec/spec_helper.rb
data/README.rdoc ADDED
@@ -0,0 +1,3 @@
1
+ = Horseman
2
+
3
+ Headless HTTP crawler/scraper for web applications built with ASP.NET WebForms.
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ require 'rake'
2
+ require 'rspec/core/rake_task'
3
+ require 'echoe'
4
+
5
+ Echoe.new("horseman", "0.0.1") do |p|
6
+ p.description = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"
7
+ p.url = "http://jarrodpeace.com"
8
+ p.author = "Jarrod Peace"
9
+ p.email = "peace.jarrod@gmail.com"
10
+ p.ignore_pattern = FileList[".gitignore"]
11
+ p.development_dependencies = []
12
+ p.runtime_dependencies = ["nokogiri >=1.5.0"]
13
+ end
14
+
15
+ Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
16
+
17
+
18
+ desc "Default task - runs specs"
19
+ task :default => :spec
20
+
21
+ desc "Run specs"
22
+ RSpec::Core::RakeTask.new do |t|
23
+ t.rspec_opts = '-cfd'
24
+ end
data/horseman.gemspec ADDED
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "horseman"
5
+ s.version = "0.0.1"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["Jarrod Peace"]
9
+ s.date = "2012-01-06"
10
+ s.description = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"
11
+ s.email = "peace.jarrod@gmail.com"
12
+ s.extra_rdoc_files = ["README.rdoc", "lib/horseman.rb", "lib/horseman/browser.rb", "lib/horseman/connection.rb", "lib/horseman/cookies.rb", "lib/horseman/hidden_fields.rb", "lib/horseman/response.rb", "lib/horseman/version.rb"]
13
+ s.files = ["Gemfile", "Gemfile.lock", "Manifest", "README.rdoc", "Rakefile", "horseman.gemspec", "lib/horseman.rb", "lib/horseman/browser.rb", "lib/horseman/connection.rb", "lib/horseman/cookies.rb", "lib/horseman/hidden_fields.rb", "lib/horseman/response.rb", "lib/horseman/version.rb", "spec/horseman/browser_spec.rb", "spec/horseman/connection_spec.rb", "spec/horseman/cookies_spec.rb", "spec/horseman/hidden_fields_spec.rb", "spec/horseman/response_spec.rb", "spec/mocks.rb", "spec/spec_helper.rb"]
14
+ s.homepage = "http://jarrodpeace.com"
15
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Horseman", "--main", "README.rdoc"]
16
+ s.require_paths = ["lib"]
17
+ s.rubyforge_project = "horseman"
18
+ s.rubygems_version = "1.8.10"
19
+ s.summary = "Headless HTTP crawler/scraper for ASP.NET WebForms applications"
20
+
21
+ if s.respond_to? :specification_version then
22
+ s.specification_version = 3
23
+
24
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
25
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
26
+ else
27
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
28
+ end
29
+ else
30
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
31
+ end
32
+ end
@@ -0,0 +1,32 @@
1
require 'horseman/cookies'
require 'horseman/response'
2
+
3
module Horseman
  # Stateful HTTP client. Requests are built and executed by the supplied
  # connection object; cookies received in responses are stored and replayed
  # on later requests, and the most recent response is kept in #last_response.
  class Browser
    # Prefix prepended to every path handed to #get!.
    attr_accessor :base_url
    attr_reader :cookies, :connection, :last_response

    # @param connection [#build_request, #exec_request] builds and executes requests
    # @param base_url [String] prefix prepended to request paths
    def initialize(connection, base_url='')
      @connection = connection
      @base_url = base_url
      @cookies = Horseman::Cookies.new
    end

    # Forgets every stored cookie.
    def clear_session
      @cookies.clear
    end

    # Issues a GET for base_url + path.
    #
    # @param path [String] path appended to #base_url
    # @return [Horseman::Response] the parsed response, also stored in #last_response
    def get!(path = '/')
      request = @connection.build_request(:url => "#{@base_url}#{path}", :verb => :get)
      exec(request)
    end

    private

    # Attaches stored cookies to the request, executes it, merges any
    # Set-Cookie headers into the jar and records the response.
    def exec(request)
      cookie_header = @cookies.to_s
      # An empty jar must not produce an empty "Cookie:" header.
      request['cookie'] = cookie_header unless cookie_header.empty?

      response = @connection.exec_request(request)
      @cookies.update(response.get_fields('set-cookie'))
      @last_response = Horseman::Response.new(response.body)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'uri'
2
+ require 'net/http'
3
+ require 'net/https'
4
+
5
module Horseman
  # Thin wrapper around Net::HTTP that remembers the target URL, builds
  # GET/POST request objects for it and executes them.
  class Connection
    # The Net::HTTP instance created for the most recently assigned URL.
    attr_reader :http

    # Points the connection at a new URL and rebuilds the Net::HTTP object.
    #
    # @param url [String] absolute URL
    def url=(url)
      @uri = URI.parse(url)
      build_http
    end

    # Sends a previously built request and returns the Net::HTTP response.
    def exec_request(request)
      @http.request(request)
    end

    # Builds a request for the current URL.
    #
    # @param options [Hash]
    #   :url  => String, optional; re-targets the connection first
    #   :verb => :get (default when omitted); any other value builds a POST
    #   :form => Hash of form fields, used for POST requests only
    # @return [Net::HTTP::Get, Net::HTTP::Post]
    def build_request(options={})
      self.url = options[:url] unless options[:url].nil?

      verb = options[:verb]
      # A missing verb means GET; the previous `== (:get || nil)` test only
      # ever compared against :get and so fell through to POST.
      if verb.nil? || verb == :get
        build_get_request
      else
        build_post_request(options[:form])
      end
    end

    private

    # Creates the Net::HTTP object, enabling TLS according to the URL scheme
    # rather than the port so https on a non-standard port is still encrypted.
    def build_http
      @http = Net::HTTP.new(@uri.host, @uri.port)
      @http.use_ssl = (@uri.scheme.to_s.downcase == 'https')
    end

    def build_get_request
      Net::HTTP::Get.new(@uri.request_uri)
    end

    def build_post_request(form)
      request = Net::HTTP::Post.new(@uri.request_uri)
      request.form_data = form unless form.nil?
      request
    end
  end
end
@@ -0,0 +1,80 @@
1
require 'date' # DateTime is used for cookie expiry

module Horseman
  # A single cookie value together with the attributes that accompanied it in
  # the Set-Cookie header.
  class Cookie
    # Matches "Name=value" attributes; the name may contain hyphens (Max-Age).
    ATTRIBUTE_PATTERN = /\A\s*([\w-]+)\s*=\s*(.*)/
    SECONDS_PER_DAY = 60 * 60 * 24

    attr_reader :value, :domain, :path, :expiration

    # @param value [String] the cookie value
    # @param attributes [Array<String>] raw attribute strings such as " Path=/"
    def initialize(value, attributes)
      @value = value
      attributes.each {|a| parse_attribute(a)}
    end

    private

    # Records the attribute if it is one of domain, path, expires or max-age;
    # anything else (including flag attributes without "=") is ignored.
    def parse_attribute(attribute)
      md = ATTRIBUTE_PATTERN.match(attribute)
      return if md.nil?

      name, raw = md.captures
      case name.downcase
      when 'domain'
        @domain = raw
      when 'path'
        @path = raw
      when 'expires'
        @expiration = DateTime.parse(raw)
      when 'max-age'
        # Max-Age is in seconds; DateTime arithmetic is in days.
        @expiration = DateTime.now + Rational(raw.to_i, SECONDS_PER_DAY)
      end
    end
  end

  # Cookie jar keyed by cookie name.
  class Cookies
    # Matches the leading "name=value" pair. The name is everything up to the
    # first "=", so names containing dots (ASP.NET_SessionId) are kept intact.
    PAIR_PATTERN = /\A\s*([^=\s]+)=(.*)/

    def initialize
      clear
    end

    # @return [String, nil] the value of the named cookie, or nil if absent
    def [](cookie_name)
      cookie = @dict[cookie_name]
      cookie.value unless cookie.nil?
    end

    # @return [Horseman::Cookie, nil] the full cookie object, or nil if absent
    def get(cookie_name)
      @dict[cookie_name]
    end

    def clear
      @dict = {}
    end

    def count
      @dict.count
    end

    def empty?
      @dict.empty?
    end

    # Serializes the jar in Cookie request-header form: "a=1; b=2".
    def to_s
      @dict.map {|k,v| "#{k}=#{v.value}"}.join('; ')
    end

    # Merges one Set-Cookie header (String), several (Array) or none (nil)
    # into the jar, overwriting cookies that share a name.
    #
    # @return [Horseman::Cookies] self, to allow chaining
    # @raise [ArgumentError] if a header has no leading name=value pair
    def update(header)
      if header.is_a?(Array)
        header.each {|h| parse_header(h)}
      else
        parse_header(header) unless header.nil?
      end
      self
    end

    private

    def parse_header(header)
      nvp, *attributes = header.split(';')
      raise ArgumentError, "malformed Set-Cookie header: #{header.inspect}" if nvp.nil?

      md = PAIR_PATTERN.match(nvp)
      raise ArgumentError, "malformed Set-Cookie header: #{header.inspect}" if md.nil?

      name, value = md.captures
      @dict[name] = Horseman::Cookie.new(value, attributes)
    end
  end
end
@@ -0,0 +1,13 @@
1
module Horseman
  # Extracts <input type="hidden"> fields from an HTML string into a
  # name => value hash.
  class HiddenFields
    # One <input ...> tag. Assumes attribute values contain no ">".
    INPUT_TAG = /<input\b[^>]*>/i
    # One quoted attribute: name="value" or name='value'.
    ATTRIBUTE = /([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/

    # @return [Hash{String => String}] hidden field names mapped to their values
    attr_reader :tokens

    # Each tag is examined on its own, so attribute order does not matter,
    # several inputs may share one line, and values may contain spaces.
    # A hidden input without a value attribute is recorded with "".
    #
    # @param html [String] markup to scan
    def initialize(html)
      @tokens = {}
      html.scan(INPUT_TAG) do |tag|
        attrs = {}
        tag.scan(ATTRIBUTE) { |name, dq, sq| attrs[name.downcase] = dq || sq }

        next unless attrs['type'].to_s.downcase == 'hidden'
        next if attrs['name'].nil?

        @tokens[attrs['name']] = attrs.fetch('value', '')
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Horseman
  # Base class for parsed HTML elements.
  class Element
    attr_accessor :id, :name
  end

  # A <form> element found in a response body.
  class Form < Element
  end

  # A form field, carrying its type and value in addition to id/name.
  class FormField < Element
    attr_accessor :type, :value
  end

  # Wraps an HTTP response body and exposes the forms found in it.
  class Response
    # One opening <form ...> tag; the capture is the raw attribute text.
    FORM_TAG = /<form\b([^>]*)>/i
    # One quoted attribute: name="value" or name='value'.
    ATTRIBUTE = /([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/

    attr_reader :body, :forms

    # @param body [String] raw HTML of the response
    def initialize(body)
      @body = body
      @forms = []
      parse
    end

    private

    # Populates @forms with one Form per opening <form> tag, in document
    # order, copying the tag's id and name attributes (nil when absent).
    def parse
      @forms = @body.to_s.scan(FORM_TAG).map do |match|
        attrs = {}
        match.first.scan(ATTRIBUTE) { |name, dq, sq| attrs[name.downcase] = dq || sq }

        form = Form.new
        form.id = attrs['id']
        form.name = attrs['name']
        form
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Horseman
  # Release version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/horseman.rb ADDED
@@ -0,0 +1 @@
1
# Entry point for the gem: loads the version constant and every library
# component so that `require 'horseman'` makes the whole API available.
require 'horseman/version'
require 'horseman/cookies'
require 'horseman/hidden_fields'
require 'horseman/response'
require 'horseman/connection'
require 'horseman/browser'
@@ -0,0 +1,35 @@
1
+ require 'horseman/browser'
2
+
3
+ describe Horseman::Browser do
4
+ include Mocks
5
+
6
+ subject {described_class.new(connection, 'http://www.example.com')}
7
+
8
+ it "saves cookies" do
9
+ subject.cookies.should be_empty
10
+
11
+ subject.get!
12
+ subject.cookies.count.should eq 2
13
+ subject.cookies['name1'].should eq 'value1'
14
+ subject.cookies['name2'].should eq 'value2'
15
+
16
+ subject.connection.should_receive(:exec_request) do |request|
17
+ request['cookie'].should match /\w+=\w+; \w+=\w+/
18
+ request['cookie'].should match /name1=value1/
19
+ request['cookie'].should match /name2=value2/
20
+ end
21
+ subject.get!
22
+ end
23
+
24
+ it "empties the cookies when the session is cleared" do
25
+ subject.get!
26
+ subject.cookies.should_not be_empty
27
+ subject.clear_session
28
+ subject.cookies.should be_empty
29
+ end
30
+
31
+ it "stores information about the last response" do
32
+ subject.get!
33
+ subject.last_response.body.should eq html
34
+ end
35
+ end
@@ -0,0 +1,64 @@
1
+ require 'horseman/connection'
2
+ require 'net/http'
3
+
4
+ describe Horseman::Connection do
5
+ subject do
6
+ c = described_class.new
7
+ c.url = 'http://www.example.com/some/path'
8
+ c
9
+ end
10
+
11
+ context "when building requests" do
12
+ let(:request) {subject.build_request(:verb => :get)}
13
+
14
+ it "uses the proper path" do
15
+ request.path.should eq '/some/path'
16
+ end
17
+
18
+ context "using GET" do
19
+ it "uses the proper request type" do
20
+ request.class.should be Net::HTTP::Get
21
+ end
22
+ end
23
+
24
+ context "using POST" do
25
+ let(:request) {subject.build_request(:verb => :post)}
26
+
27
+ it "uses the proper request type" do
28
+ request.class.should be Net::HTTP::Post
29
+ end
30
+
31
+ context "with form data" do
32
+ let(:request) {subject.build_request(:verb => :post, :form => {:field1=>'value1', :field2=>'value2'})}
33
+
34
+ it "properly sets request body" do
35
+ request.body.should eq 'field1=value1&field2=value2'
36
+ end
37
+ end
38
+
39
+ context "without form data" do
40
+ it "properly sets request body" do
41
+ request.body.should be_nil
42
+ end
43
+ end
44
+ end
45
+ end
46
+
47
+ context "when accessed using http" do
48
+ it "does not use SSL" do
49
+ subject.http.use_ssl?.should be_false
50
+ end
51
+ end
52
+
53
+ context "when accessed using https" do
54
+ subject do
55
+ c = described_class.new
56
+ c.url = 'https://www.example.com'
57
+ c
58
+ end
59
+
60
+ it "uses SSL" do
61
+ subject.http.use_ssl?.should be_true
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,74 @@
1
+ require 'horseman/cookies'
2
+
3
+ class Yo
4
+ def test
5
+ pp "yo"
6
+ end
7
+ end
8
+
9
+ describe Horseman::Cookies do
10
+ let(:simple_header) {'name1=value1'}
11
+ let(:complex_header) {'name2=value2; Domain=www.example.com; Path=/path; Expires=Sun, 1-Jan-2012 00:00:00 GMT'}
12
+
13
+ it "starts empty" do
14
+ subject.should be_empty
15
+ end
16
+
17
+ it "accepts a single header" do
18
+ subject.update(simple_header)['name1'].should eq 'value1'
19
+ end
20
+
21
+ it "accepts multiple headers" do
22
+ subject.update([simple_header, complex_header])
23
+ subject['name1'].should eq 'value1'
24
+ subject['name2'].should eq 'value2'
25
+ end
26
+
27
+ it "captures attributes" do
28
+ subject.update(complex_header)
29
+ subject.get('name2').domain.should eq 'www.example.com'
30
+ subject.get('name2').path.should eq '/path'
31
+ subject.get('name2').expiration.should eq DateTime.new(2012, 1, 1, 0, 0, 0, 0)
32
+ end
33
+
34
+ it "accepts an empty array" do
35
+ subject.update([]).should be_empty
36
+ end
37
+
38
+ it "accepts nil" do
39
+ subject.update(nil).should be_empty
40
+ end
41
+
42
+ it "raises an exception on an unrecognized header" do
43
+ expect {subject.update('bad header')}.to raise_error(ArgumentError)
44
+ end
45
+
46
+ it "generates a correct header" do
47
+ header = subject.update([simple_header, complex_header]).to_s
48
+ header.should match /\w+=\w+; \w+=\w+/
49
+ header.should match /name1=value1/
50
+ header.should match /name2=value2/
51
+ end
52
+
53
+ context "with prexisting values" do
54
+ subject do
55
+ described_class.new.update('name1=other_value')
56
+ end
57
+
58
+ it "returns nil for uninitialized values" do
59
+ subject['doesnt_exist'].should be_nil
60
+ end
61
+
62
+ it "merges new values" do
63
+ subject.update(complex_header)
64
+ subject['name1'].should eq 'other_value'
65
+ subject['name2'].should eq 'value2'
66
+ end
67
+
68
+ it "overwrites existing values" do
69
+ subject.update(simple_header)
70
+ subject['name1'].should eq 'value1'
71
+ end
72
+ end
73
+
74
+ end
@@ -0,0 +1,44 @@
1
+ require 'horseman/hidden_fields'
2
+
3
+ describe Horseman::HiddenFields do
4
+
5
+ it "parses a single simple hidden input field" do
6
+ html = %{<input type="hidden" name="test" value="test_data" />}
7
+ cut = described_class.new(html)
8
+
9
+ cut.tokens.size.should == 1
10
+ cut.tokens['test'].should == 'test_data'
11
+ end
12
+
13
+ it "parses a single complex hidden input field" do
14
+ html = %{<input attr0="value0" type="hidden" attr1="value1" name="test" attr2="value2" value="test_data" attr3="value3" />}
15
+ cut = described_class.new(html)
16
+
17
+ cut.tokens.size.should == 1
18
+ cut.tokens['test'].should == 'test_data'
19
+ end
20
+
21
+ it "parses multiple hidden input fields" do
22
+ html = %{
23
+ <input type="hidden" name="test" value="test_data" />
24
+ <input type="hidden" name="foo" value="bar" />
25
+ <some other="tag"></some>
26
+ <input type="hidden" name="dee" value="dum" />
27
+ }
28
+ cut = described_class.new(html)
29
+
30
+ cut.tokens.size.should == 3
31
+ cut.tokens['test'].should == 'test_data'
32
+ cut.tokens['foo'].should == 'bar'
33
+ cut.tokens['dee'].should == 'dum'
34
+ end
35
+
36
+ it "handles single quotes, too" do
37
+ html = %{<input type='hidden' name='test' value='test_data' />}
38
+ cut = described_class.new(html)
39
+
40
+ cut.tokens.size.should == 1
41
+ cut.tokens['test'].should == 'test_data'
42
+ end
43
+
44
+ end
@@ -0,0 +1,13 @@
1
+ require 'horseman/response'
2
+
3
+ describe Horseman::Response do
4
+ include Mocks
5
+
6
+ subject { described_class.new(html) }
7
+
8
+ it "parses forms" do
9
+ subject.forms.count.should eq 2
10
+ subject.forms[0].id.should eq 'form1'
11
+ subject.forms[1].id.should eq 'form2'
12
+ end
13
+ end
data/spec/mocks.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'horseman/connection'
2
+
3
+ module Mocks
4
+
5
+ def html
6
+ %{
7
+ <html>
8
+ <head></head>
9
+ <body>
10
+ <form id="form1">
11
+ <input type="text" name="name1" value="value1" />
12
+ <input type="submit" value="OK" />
13
+ </form>
14
+ <form id="form2">
15
+ <input type="text" name="name2" value="value2" />
16
+ <input type="submit" value="OK" />
17
+ </form>
18
+ </body>
19
+ </html>
20
+ }
21
+ end
22
+
23
+ def cookies
24
+ ['name1=value1; Domain=www.example.com; Path=/path; Expires=Sun, 1-Jan-2012 00:00:00 GMT',
25
+ 'name2=value2; Domain=www.example.com; Path=/path; Expires=Sun, 1-Jan-2012 00:00:00 GMT']
26
+ end
27
+
28
+ def response
29
+ m = double("HttpResponse")
30
+ m.stub(:[]) do |key|
31
+ case key
32
+ when 'set-cookie'
33
+ cookies.join(', ')
34
+ end
35
+ end
36
+ m.stub(:get_fields) do |key|
37
+ case key
38
+ when 'set-cookie'
39
+ cookies
40
+ end
41
+ end
42
+ m.stub(:body) { html }
43
+ m
44
+ end
45
+
46
+ def connection
47
+ Horseman::Connection.any_instance.stub(:exec_request) { response }
48
+ Horseman::Connection.new
49
+ end
50
+ end
@@ -0,0 +1 @@
1
+ require 'mocks'
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: horseman
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jarrod Peace
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &70095638639800 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70095638639800
25
+ description: Headless HTTP crawler/scraper for ASP.NET WebForms applications
26
+ email: peace.jarrod@gmail.com
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files:
30
+ - README.rdoc
31
+ - lib/horseman.rb
32
+ - lib/horseman/browser.rb
33
+ - lib/horseman/connection.rb
34
+ - lib/horseman/cookies.rb
35
+ - lib/horseman/hidden_fields.rb
36
+ - lib/horseman/response.rb
37
+ - lib/horseman/version.rb
38
+ files:
39
+ - Gemfile
40
+ - Gemfile.lock
41
+ - Manifest
42
+ - README.rdoc
43
+ - Rakefile
44
+ - horseman.gemspec
45
+ - lib/horseman.rb
46
+ - lib/horseman/browser.rb
47
+ - lib/horseman/connection.rb
48
+ - lib/horseman/cookies.rb
49
+ - lib/horseman/hidden_fields.rb
50
+ - lib/horseman/response.rb
51
+ - lib/horseman/version.rb
52
+ - spec/horseman/browser_spec.rb
53
+ - spec/horseman/connection_spec.rb
54
+ - spec/horseman/cookies_spec.rb
55
+ - spec/horseman/hidden_fields_spec.rb
56
+ - spec/horseman/response_spec.rb
57
+ - spec/mocks.rb
58
+ - spec/spec_helper.rb
59
+ homepage: http://jarrodpeace.com
60
+ licenses: []
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --line-numbers
64
+ - --inline-source
65
+ - --title
66
+ - Horseman
67
+ - --main
68
+ - README.rdoc
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '1.2'
83
+ requirements: []
84
+ rubyforge_project: horseman
85
+ rubygems_version: 1.8.10
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: Headless HTTP crawler/scraper for ASP.NET WebForms applications
89
+ test_files: []