spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
describe "#code" do
|
10
|
+
it "should return the Integer version of the response status code" do
|
11
|
+
expect(subject.code).to be code
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
shared_examples "status code method" do |method,status_codes|
|
16
|
+
status_codes.each do |code,expected|
|
17
|
+
context "when status code is #{code}" do
|
18
|
+
let(:code) { code }
|
19
|
+
|
20
|
+
it { expect(subject.send(method)).to be expected }
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#is_ok?" do
|
26
|
+
include_examples "status code method", :is_ok?, {200 => true, 500 => false}
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#timedout?" do
|
30
|
+
include_examples "status code method", :timedout?, {308 => true, 200 => false}
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "#bad_request?" do
|
34
|
+
include_examples "status code method", :bad_request?, {400 => true, 200 => false}
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#is_unauthorized?" do
|
38
|
+
include_examples "status code method", :is_unauthorized?, {401 => true, 200 => false}
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "#is_forbidden?" do
|
42
|
+
include_examples "status code method", :is_forbidden?, {403 => true, 200 => false}
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "#is_missing?" do
|
46
|
+
include_examples "status code method", :is_missing?, {404 => true, 200 => false}
|
47
|
+
end
|
48
|
+
|
49
|
+
describe "#had_internal_server_error?" do
|
50
|
+
include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
|
51
|
+
end
|
52
|
+
|
53
|
+
describe "#is_redirect?" do
|
54
|
+
include_examples "status code method", :is_redirect?, {
|
55
|
+
300 => true,
|
56
|
+
301 => true,
|
57
|
+
302 => true,
|
58
|
+
303 => true,
|
59
|
+
304 => false,
|
60
|
+
305 => false,
|
61
|
+
306 => false,
|
62
|
+
307 => true
|
63
|
+
}
|
64
|
+
|
65
|
+
context "when code is 200" do
|
66
|
+
context "and there is a meta refresh redirect" do
|
67
|
+
let(:body) do
|
68
|
+
%{<html><head><meta http-equiv="refresh" content="0; url=/other" /></head><body>redirecting...</body></html>}
|
69
|
+
end
|
70
|
+
|
71
|
+
it { expect(subject.is_redirect?).to be true }
|
72
|
+
end
|
73
|
+
|
74
|
+
context "and there is no meta refresh redirect" do
|
75
|
+
let(:body) { %{<html><body>foo</body></html>} }
|
76
|
+
|
77
|
+
it { expect(subject.is_redirect?).to be false }
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "when that status code is not 30x or 200" do
|
82
|
+
let(:code) { 404 }
|
83
|
+
|
84
|
+
it { expect(subject.is_redirect?).to be false }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/spec/page_spec.rb
CHANGED
@@ -1,128 +1,164 @@
|
|
1
|
-
require 'spidr/page'
|
2
|
-
|
3
1
|
require 'spec_helper'
|
4
|
-
require '
|
5
|
-
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
6
5
|
|
7
6
|
describe Page do
|
8
|
-
|
9
|
-
before(:all) do
|
10
|
-
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
11
|
-
end
|
7
|
+
include_context "example Page"
|
12
8
|
|
13
|
-
|
9
|
+
describe "#initialize" do
|
10
|
+
let(:headers) { {'X-Foo' => 'bar'} }
|
14
11
|
|
15
|
-
it "should
|
16
|
-
expect(
|
12
|
+
it "should set #url" do
|
13
|
+
expect(subject.url).to be url
|
17
14
|
end
|
18
15
|
|
19
|
-
it "should
|
20
|
-
expect(
|
16
|
+
it "should set #headers" do
|
17
|
+
expect(subject.headers).to be == {
|
18
|
+
'content-type' => [content_type],
|
19
|
+
'x-foo' => ['bar']
|
20
|
+
}
|
21
21
|
end
|
22
|
+
end
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
end
|
24
|
+
describe "method_missing" do
|
25
|
+
let(:headers) { {'X-Foo' => 'bar'} }
|
26
26
|
|
27
|
-
it "should
|
28
|
-
expect(
|
27
|
+
it "should provide transparent access to headers" do
|
28
|
+
expect(subject.x_foo).to be == 'bar'
|
29
29
|
end
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
context "when the requested header does not exist" do
|
32
|
+
it do
|
33
|
+
expect { subject.x_bar }.to raise_error(NoMethodError)
|
34
|
+
end
|
34
35
|
end
|
35
36
|
|
36
|
-
|
37
|
-
|
37
|
+
context "when method arguments are also given" do
|
38
|
+
it do
|
39
|
+
expect { subject.x_foo(1) }.to raise_error(NoMethodError)
|
40
|
+
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
|
43
|
+
context "when a block is also given" do
|
44
|
+
it do
|
45
|
+
expect { subject.x_foo { } }.to raise_error(NoMethodError)
|
46
|
+
end
|
42
47
|
end
|
43
48
|
end
|
44
49
|
|
45
|
-
describe "
|
46
|
-
|
47
|
-
|
48
|
-
end
|
50
|
+
describe "#body" do
|
51
|
+
context "when there is a body" do
|
52
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
49
53
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
expect(@page).to be_ok
|
54
|
+
it "should return the body text" do
|
55
|
+
expect(subject.body).to be body
|
56
|
+
end
|
54
57
|
end
|
55
58
|
|
56
|
-
|
57
|
-
|
59
|
+
context "when there is no body" do
|
60
|
+
it "should return an empty String" do
|
61
|
+
expect(subject.body).to be == ''
|
62
|
+
end
|
58
63
|
end
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
62
|
-
|
66
|
+
describe "#doc" do
|
67
|
+
context "when the Content-Type is text/html" do
|
68
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
63
69
|
|
64
|
-
|
65
|
-
|
70
|
+
it "should parse the body as HTML" do
|
71
|
+
expect(subject.doc).to be_kind_of(Nokogiri::HTML::Document)
|
72
|
+
expect(subject.doc.at('//p').inner_text).to be == 'hello'
|
73
|
+
end
|
66
74
|
end
|
67
75
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
76
|
+
context "when the document is application/rss+xml" do
|
77
|
+
let(:content_type) { 'application/rss+xml' }
|
78
|
+
let(:body) do
|
79
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"></rss>}
|
80
|
+
end
|
72
81
|
|
73
|
-
|
74
|
-
|
82
|
+
it "should parse the body as XML" do
|
83
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
84
|
+
end
|
75
85
|
end
|
76
86
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
87
|
+
context "when the document is application/atom+xml" do
|
88
|
+
let(:content_type) { 'application/atom+xml' }
|
89
|
+
let(:body) do
|
90
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><feed xmlns="http://www.w3.org/2005/Atom"></feed>}
|
91
|
+
end
|
81
92
|
|
82
|
-
|
83
|
-
|
84
|
-
|
93
|
+
it "should parse the body as XML" do
|
94
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
95
|
+
end
|
85
96
|
end
|
86
97
|
|
87
|
-
|
88
|
-
|
98
|
+
context "when the document is text/xml" do
|
99
|
+
let(:content_type) { 'text/xml' }
|
100
|
+
let(:body) do
|
101
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><foo />}
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should parse the body as XML" do
|
105
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
106
|
+
end
|
89
107
|
end
|
90
108
|
|
91
|
-
|
92
|
-
|
93
|
-
|
109
|
+
context "when the document is text/xsl" do
|
110
|
+
let(:content_type) { 'text/xsl' }
|
111
|
+
let(:body) do
|
112
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"></xsl:stylesheet>}
|
113
|
+
end
|
94
114
|
|
95
|
-
|
96
|
-
|
115
|
+
it "should parse the body as XML" do
|
116
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
117
|
+
end
|
97
118
|
end
|
98
|
-
end
|
99
119
|
|
100
|
-
|
101
|
-
|
102
|
-
|
120
|
+
context "when there is no body" do
|
121
|
+
it "should return nil" do
|
122
|
+
expect(subject.doc).to be nil
|
123
|
+
end
|
103
124
|
end
|
125
|
+
end
|
104
126
|
|
105
|
-
|
106
|
-
|
127
|
+
describe "#search" do
|
128
|
+
context "when there is a document" do
|
129
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
107
130
|
|
108
|
-
|
109
|
-
|
131
|
+
it "should search the document" do
|
132
|
+
expect(subject.search('//p').inner_text).to be == 'hello'
|
133
|
+
end
|
110
134
|
end
|
111
135
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
136
|
+
context "when there is no document" do
|
137
|
+
it "should return an empty Array" do
|
138
|
+
expect(subject.search('//p')).to be == []
|
139
|
+
end
|
116
140
|
end
|
141
|
+
end
|
142
|
+
|
143
|
+
describe "#at" do
|
144
|
+
context "when there is a document" do
|
145
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
117
146
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
147
|
+
it "should search the document for the first matching node" do
|
148
|
+
expect(subject.at('//p').inner_text).to be == 'hello'
|
149
|
+
end
|
150
|
+
end
|
122
151
|
|
123
|
-
|
124
|
-
|
152
|
+
context "when there is no document" do
|
153
|
+
it "should return nil" do
|
154
|
+
expect(subject.at('//p')).to be nil
|
125
155
|
end
|
126
156
|
end
|
127
157
|
end
|
158
|
+
|
159
|
+
describe "#to_s" do
|
160
|
+
it "should return the body" do
|
161
|
+
expect(subject.to_s).to be body
|
162
|
+
end
|
163
|
+
end
|
128
164
|
end
|
data/spec/proxy_spec.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'spidr/proxy'
|
3
|
+
|
4
|
+
describe Spidr::Proxy do
|
5
|
+
let(:proxy_host) { 'proxy.example.com' }
|
6
|
+
let(:proxy_port) { 9999 }
|
7
|
+
let(:proxy_user) { 'bob' }
|
8
|
+
let(:proxy_password) { 'secret' }
|
9
|
+
|
10
|
+
describe "DEFAULT_PORT" do
|
11
|
+
subject { described_class::DEFAULT_PORT }
|
12
|
+
|
13
|
+
it { expect(subject).to be 8080 }
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "#initialize" do
|
17
|
+
it "should default port to 8080" do
|
18
|
+
expect(subject.port).to be 8080
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "#enabled?" do
|
23
|
+
context "when host is set" do
|
24
|
+
subject { described_class.new(host: proxy_host) }
|
25
|
+
|
26
|
+
it { expect(subject.enabled?).to be true }
|
27
|
+
end
|
28
|
+
|
29
|
+
context "when host is not set" do
|
30
|
+
it { expect(subject.enabled?).to be false }
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "#disabled?" do
|
35
|
+
context "when host is not set" do
|
36
|
+
it { expect(subject.disabled?).to be true }
|
37
|
+
end
|
38
|
+
|
39
|
+
context "when host is set" do
|
40
|
+
subject { described_class.new(host: proxy_host) }
|
41
|
+
|
42
|
+
it { expect(subject.disabled?).to be false }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/spec/session_cache.rb
CHANGED
@@ -1,9 +1,110 @@
|
|
1
1
|
require 'spidr/session_cache'
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'settings/proxy_examples'
|
5
|
+
require 'settings/timeouts_examples'
|
4
6
|
|
5
7
|
describe SessionCache do
|
6
|
-
describe "
|
8
|
+
describe "#initialize" do
|
9
|
+
let(:proxy_host) { 'proxy.example.com' }
|
10
|
+
let(:proxy_port) { 9999 }
|
11
|
+
|
12
|
+
let(:open_timeout) { 1 }
|
13
|
+
let(:ssl_timeout) { 2 }
|
14
|
+
let(:read_timeout) { 3 }
|
15
|
+
let(:continue_timeout) { 4 }
|
16
|
+
let(:keep_alive_timeout) { 5 }
|
17
|
+
|
18
|
+
subject do
|
19
|
+
described_class.new(
|
20
|
+
proxy: {host: proxy_host, port: proxy_port},
|
21
|
+
|
22
|
+
open_timeout: open_timeout,
|
23
|
+
ssl_timeout: ssl_timeout,
|
24
|
+
read_timeout: read_timeout,
|
25
|
+
continue_timeout: continue_timeout,
|
26
|
+
keep_alive_timeout: keep_alive_timeout,
|
27
|
+
)
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should set proxy" do
|
31
|
+
expect(subject.proxy[:host]).to be == proxy_host
|
32
|
+
expect(subject.proxy[:port]).to be == proxy_port
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should set open_timeout" do
|
36
|
+
expect(subject.open_timeout).to be open_timeout
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should set ssl_timeout" do
|
40
|
+
expect(subject.ssl_timeout).to be ssl_timeout
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should set read_timeout" do
|
44
|
+
expect(subject.read_timeout).to be read_timeout
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should set continue_timeout" do
|
48
|
+
expect(subject.continue_timeout).to be continue_timeout
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should set keep_alive_timeout" do
|
52
|
+
expect(subject.keep_alive_timeout).to be keep_alive_timeout
|
53
|
+
end
|
54
|
+
|
55
|
+
context "with no arguments" do
|
56
|
+
before(:all) do
|
57
|
+
Spidr.proxy = {host: 'proxy.example.com', port: 9999}
|
58
|
+
|
59
|
+
Spidr.open_timeout = 1
|
60
|
+
Spidr.ssl_timeout = 2
|
61
|
+
Spidr.read_timeout = 3
|
62
|
+
Spidr.continue_timeout = 4
|
63
|
+
Spidr.keep_alive_timeout = 5
|
64
|
+
end
|
65
|
+
|
66
|
+
subject { described_class.new }
|
67
|
+
|
68
|
+
it "should use the global proxy settings" do
|
69
|
+
expect(subject.proxy).to be Spidr.proxy
|
70
|
+
end
|
71
|
+
|
72
|
+
it "should use the global open_timeout" do
|
73
|
+
expect(subject.open_timeout).to be == Spidr.open_timeout
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should use the global ssl_timeout" do
|
77
|
+
expect(subject.ssl_timeout).to be == Spidr.ssl_timeout
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should use the global read_timeout" do
|
81
|
+
expect(subject.read_timeout).to be == Spidr.read_timeout
|
82
|
+
end
|
83
|
+
|
84
|
+
it "should use the global continue_timeout" do
|
85
|
+
expect(subject.continue_timeout).to be == Spidr.continue_timeout
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should use the global keep_alive_timeout" do
|
89
|
+
expect(subject.keep_alive_timeout).to be == Spidr.keep_alive_timeout
|
90
|
+
end
|
91
|
+
|
92
|
+
before(:all) do
|
93
|
+
Spidr.proxy = nil
|
94
|
+
|
95
|
+
Spidr.open_timeout = nil
|
96
|
+
Spidr.ssl_timeout = nil
|
97
|
+
Spidr.read_timeout = nil
|
98
|
+
Spidr.continue_timeout = nil
|
99
|
+
Spidr.keep_alive_timeout = nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
it_should_behave_like "includes Spidr::Settings::Proxy"
|
105
|
+
it_should_behave_like "includes Spidr::Settings::Timeouts"
|
106
|
+
|
107
|
+
context "when empty" do
|
7
108
|
before(:all) do
|
8
109
|
@sessions = SessionCache.new
|
9
110
|
end
|
@@ -21,7 +122,7 @@ describe SessionCache do
|
|
21
122
|
end
|
22
123
|
end
|
23
124
|
|
24
|
-
|
125
|
+
context "when not-empty" do
|
25
126
|
before(:all) do
|
26
127
|
@url = URI('http://example.com/')
|
27
128
|
|