spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
describe "#code" do
|
10
|
+
it "should return the Integer version of the response status code" do
|
11
|
+
expect(subject.code).to be code
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
shared_examples "status code method" do |method,status_codes|
|
16
|
+
status_codes.each do |code,expected|
|
17
|
+
context "when status code is #{code}" do
|
18
|
+
let(:code) { code }
|
19
|
+
|
20
|
+
it { expect(subject.send(method)).to be expected }
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#is_ok?" do
|
26
|
+
include_examples "status code method", :is_ok?, {200 => true, 500 => false}
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#timedout?" do
|
30
|
+
include_examples "status code method", :timedout?, {308 => true, 200 => false}
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "#bad_request?" do
|
34
|
+
include_examples "status code method", :bad_request?, {400 => true, 200 => false}
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#is_unauthorized?" do
|
38
|
+
include_examples "status code method", :is_unauthorized?, {401 => true, 200 => false}
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "#is_forbidden?" do
|
42
|
+
include_examples "status code method", :is_forbidden?, {403 => true, 200 => false}
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "#is_missing?" do
|
46
|
+
include_examples "status code method", :is_missing?, {404 => true, 200 => false}
|
47
|
+
end
|
48
|
+
|
49
|
+
describe "#had_internal_server_error?" do
|
50
|
+
include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
|
51
|
+
end
|
52
|
+
|
53
|
+
describe "#is_redirect?" do
|
54
|
+
include_examples "status code method", :is_redirect?, {
|
55
|
+
300 => true,
|
56
|
+
301 => true,
|
57
|
+
302 => true,
|
58
|
+
303 => true,
|
59
|
+
304 => false,
|
60
|
+
305 => false,
|
61
|
+
306 => false,
|
62
|
+
307 => true
|
63
|
+
}
|
64
|
+
|
65
|
+
context "when code is 200" do
|
66
|
+
context "and there is a meta refresh redirect" do
|
67
|
+
let(:body) do
|
68
|
+
%{<html><head><meta http-equiv="refresh" content="0; url=/other" /></head><body>redirecting...</body></html>}
|
69
|
+
end
|
70
|
+
|
71
|
+
it { expect(subject.is_redirect?).to be true }
|
72
|
+
end
|
73
|
+
|
74
|
+
context "and there is no meta refresh redirect" do
|
75
|
+
let(:body) { %{<html><body>foo</body></html>} }
|
76
|
+
|
77
|
+
it { expect(subject.is_redirect?).to be false }
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context "when the status code is not 30x or 200" do
|
82
|
+
let(:code) { 404 }
|
83
|
+
|
84
|
+
it { expect(subject.is_redirect?).to be false }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/spec/page_spec.rb
CHANGED
@@ -1,128 +1,164 @@
|
|
1
|
-
require 'spidr/page'
|
2
|
-
|
3
1
|
require 'spec_helper'
|
4
|
-
require '
|
5
|
-
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
6
5
|
|
7
6
|
describe Page do
|
8
|
-
|
9
|
-
before(:all) do
|
10
|
-
@page = get_page('http://spidr.rubyforge.org/course/start.html')
|
11
|
-
end
|
7
|
+
include_context "example Page"
|
12
8
|
|
13
|
-
|
9
|
+
describe "#initialize" do
|
10
|
+
let(:headers) { {'X-Foo' => 'bar'} }
|
14
11
|
|
15
|
-
it "should
|
16
|
-
expect(
|
12
|
+
it "should set #url" do
|
13
|
+
expect(subject.url).to be url
|
17
14
|
end
|
18
15
|
|
19
|
-
it "should
|
20
|
-
expect(
|
16
|
+
it "should set #headers" do
|
17
|
+
expect(subject.headers).to be == {
|
18
|
+
'content-type' => [content_type],
|
19
|
+
'x-foo' => ['bar']
|
20
|
+
}
|
21
21
|
end
|
22
|
+
end
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
end
|
24
|
+
describe "method_missing" do
|
25
|
+
let(:headers) { {'X-Foo' => 'bar'} }
|
26
26
|
|
27
|
-
it "should
|
28
|
-
expect(
|
27
|
+
it "should provide transparent access to headers" do
|
28
|
+
expect(subject.x_foo).to be == 'bar'
|
29
29
|
end
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
context "when the requested header does not exist" do
|
32
|
+
it do
|
33
|
+
expect { subject.x_bar }.to raise_error(NoMethodError)
|
34
|
+
end
|
34
35
|
end
|
35
36
|
|
36
|
-
|
37
|
-
|
37
|
+
context "when method arguments are also given" do
|
38
|
+
it do
|
39
|
+
expect { subject.x_foo(1) }.to raise_error(NoMethodError)
|
40
|
+
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
|
43
|
+
context "when a block is also given" do
|
44
|
+
it do
|
45
|
+
expect { subject.x_foo { } }.to raise_error(NoMethodError)
|
46
|
+
end
|
42
47
|
end
|
43
48
|
end
|
44
49
|
|
45
|
-
describe "
|
46
|
-
|
47
|
-
|
48
|
-
end
|
50
|
+
describe "#body" do
|
51
|
+
context "when there is a body" do
|
52
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
49
53
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
expect(@page).to be_ok
|
54
|
+
it "should return the body text" do
|
55
|
+
expect(subject.body).to be body
|
56
|
+
end
|
54
57
|
end
|
55
58
|
|
56
|
-
|
57
|
-
|
59
|
+
context "when there is no body" do
|
60
|
+
it "should return an empty String" do
|
61
|
+
expect(subject.body).to be == ''
|
62
|
+
end
|
58
63
|
end
|
64
|
+
end
|
59
65
|
|
60
|
-
|
61
|
-
|
62
|
-
|
66
|
+
describe "#doc" do
|
67
|
+
context "when the Content-Type is text/html" do
|
68
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
63
69
|
|
64
|
-
|
65
|
-
|
70
|
+
it "should parse the body as HTML" do
|
71
|
+
expect(subject.doc).to be_kind_of(Nokogiri::HTML::Document)
|
72
|
+
expect(subject.doc.at('//p').inner_text).to be == 'hello'
|
73
|
+
end
|
66
74
|
end
|
67
75
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
76
|
+
context "when the document is application/rss+xml" do
|
77
|
+
let(:content_type) { 'application/rss+xml' }
|
78
|
+
let(:body) do
|
79
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"></rss>}
|
80
|
+
end
|
72
81
|
|
73
|
-
|
74
|
-
|
82
|
+
it "should parse the body as XML" do
|
83
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
84
|
+
end
|
75
85
|
end
|
76
86
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
87
|
+
context "when the document is application/atom+xml" do
|
88
|
+
let(:content_type) { 'application/atom+xml' }
|
89
|
+
let(:body) do
|
90
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><feed xmlns="http://www.w3.org/2005/Atom"></feed>}
|
91
|
+
end
|
81
92
|
|
82
|
-
|
83
|
-
|
84
|
-
|
93
|
+
it "should parse the body as XML" do
|
94
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
95
|
+
end
|
85
96
|
end
|
86
97
|
|
87
|
-
|
88
|
-
|
98
|
+
context "when the document is text/xml" do
|
99
|
+
let(:content_type) { 'text/xml' }
|
100
|
+
let(:body) do
|
101
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><foo />}
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should parse the body as XML" do
|
105
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
106
|
+
end
|
89
107
|
end
|
90
108
|
|
91
|
-
|
92
|
-
|
93
|
-
|
109
|
+
context "when the document is text/xsl" do
|
110
|
+
let(:content_type) { 'text/xsl' }
|
111
|
+
let(:body) do
|
112
|
+
%{<?xml version="1.0" encoding="UTF-8" ?><xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"></xsl:stylesheet>}
|
113
|
+
end
|
94
114
|
|
95
|
-
|
96
|
-
|
115
|
+
it "should parse the body as XML" do
|
116
|
+
expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
|
117
|
+
end
|
97
118
|
end
|
98
|
-
end
|
99
119
|
|
100
|
-
|
101
|
-
|
102
|
-
|
120
|
+
context "when there is no body" do
|
121
|
+
it "should return nil" do
|
122
|
+
expect(subject.doc).to be nil
|
123
|
+
end
|
103
124
|
end
|
125
|
+
end
|
104
126
|
|
105
|
-
|
106
|
-
|
127
|
+
describe "#search" do
|
128
|
+
context "when there is a document" do
|
129
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
107
130
|
|
108
|
-
|
109
|
-
|
131
|
+
it "should search the document" do
|
132
|
+
expect(subject.search('//p').inner_text).to be == 'hello'
|
133
|
+
end
|
110
134
|
end
|
111
135
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
136
|
+
context "when there is no document" do
|
137
|
+
it "should return an empty Array" do
|
138
|
+
expect(subject.search('//p')).to be == []
|
139
|
+
end
|
116
140
|
end
|
141
|
+
end
|
142
|
+
|
143
|
+
describe "#at" do
|
144
|
+
context "when there is a document" do
|
145
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
117
146
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
147
|
+
it "should search the document for the first matching node" do
|
148
|
+
expect(subject.at('//p').inner_text).to be == 'hello'
|
149
|
+
end
|
150
|
+
end
|
122
151
|
|
123
|
-
|
124
|
-
|
152
|
+
context "when there is no document" do
|
153
|
+
it "should return nil" do
|
154
|
+
expect(subject.at('//p')).to be nil
|
125
155
|
end
|
126
156
|
end
|
127
157
|
end
|
158
|
+
|
159
|
+
describe "#to_s" do
|
160
|
+
it "should return the body" do
|
161
|
+
expect(subject.to_s).to be body
|
162
|
+
end
|
163
|
+
end
|
128
164
|
end
|
data/spec/proxy_spec.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'spidr/proxy'
|
3
|
+
|
4
|
+
describe Spidr::Proxy do
|
5
|
+
let(:proxy_host) { 'proxy.example.com' }
|
6
|
+
let(:proxy_port) { 9999 }
|
7
|
+
let(:proxy_user) { 'bob' }
|
8
|
+
let(:proxy_password) { 'secret' }
|
9
|
+
|
10
|
+
describe "DEFAULT_PORT" do
|
11
|
+
subject { described_class::DEFAULT_PORT }
|
12
|
+
|
13
|
+
it { expect(subject).to be 8080 }
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "#initialize" do
|
17
|
+
it "should default port to 8080" do
|
18
|
+
expect(subject.port).to be 8080
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "#enabled?" do
|
23
|
+
context "when host is set" do
|
24
|
+
subject { described_class.new(host: proxy_host) }
|
25
|
+
|
26
|
+
it { expect(subject.enabled?).to be true }
|
27
|
+
end
|
28
|
+
|
29
|
+
context "when host is not set" do
|
30
|
+
it { expect(subject.enabled?).to be false }
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "#disabled?" do
|
35
|
+
context "when host is not set" do
|
36
|
+
it { expect(subject.disabled?).to be true }
|
37
|
+
end
|
38
|
+
|
39
|
+
context "when host is set" do
|
40
|
+
subject { described_class.new(host: proxy_host) }
|
41
|
+
|
42
|
+
it { expect(subject.disabled?).to be false }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/spec/session_cache.rb
CHANGED
@@ -1,9 +1,110 @@
|
|
1
1
|
require 'spidr/session_cache'
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
+
require 'settings/proxy_examples'
|
5
|
+
require 'settings/timeouts_examples'
|
4
6
|
|
5
7
|
describe SessionCache do
|
6
|
-
describe "
|
8
|
+
describe "#initialize" do
|
9
|
+
let(:proxy_host) { 'proxy.example.com' }
|
10
|
+
let(:proxy_port) { 9999 }
|
11
|
+
|
12
|
+
let(:open_timeout) { 1 }
|
13
|
+
let(:ssl_timeout) { 2 }
|
14
|
+
let(:read_timeout) { 3 }
|
15
|
+
let(:continue_timeout) { 4 }
|
16
|
+
let(:keep_alive_timeout) { 5 }
|
17
|
+
|
18
|
+
subject do
|
19
|
+
described_class.new(
|
20
|
+
proxy: {host: proxy_host, port: proxy_port},
|
21
|
+
|
22
|
+
open_timeout: open_timeout,
|
23
|
+
ssl_timeout: ssl_timeout,
|
24
|
+
read_timeout: read_timeout,
|
25
|
+
continue_timeout: continue_timeout,
|
26
|
+
keep_alive_timeout: keep_alive_timeout,
|
27
|
+
)
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should set proxy" do
|
31
|
+
expect(subject.proxy[:host]).to be == proxy_host
|
32
|
+
expect(subject.proxy[:port]).to be == proxy_port
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should set open_timeout" do
|
36
|
+
expect(subject.open_timeout).to be open_timeout
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should set ssl_timeout" do
|
40
|
+
expect(subject.ssl_timeout).to be ssl_timeout
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should set read_timeout" do
|
44
|
+
expect(subject.read_timeout).to be read_timeout
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should set continue_timeout" do
|
48
|
+
expect(subject.continue_timeout).to be continue_timeout
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should set keep_alive_timeout" do
|
52
|
+
expect(subject.keep_alive_timeout).to be keep_alive_timeout
|
53
|
+
end
|
54
|
+
|
55
|
+
context "with no arguments" do
|
56
|
+
before(:all) do
|
57
|
+
Spidr.proxy = {host: 'proxy.example.com', port: 9999}
|
58
|
+
|
59
|
+
Spidr.open_timeout = 1
|
60
|
+
Spidr.ssl_timeout = 2
|
61
|
+
Spidr.read_timeout = 3
|
62
|
+
Spidr.continue_timeout = 4
|
63
|
+
Spidr.keep_alive_timeout = 5
|
64
|
+
end
|
65
|
+
|
66
|
+
subject { described_class.new }
|
67
|
+
|
68
|
+
it "should use the global proxy settings" do
|
69
|
+
expect(subject.proxy).to be Spidr.proxy
|
70
|
+
end
|
71
|
+
|
72
|
+
it "should use the global open_timeout" do
|
73
|
+
expect(subject.open_timeout).to be == Spidr.open_timeout
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should use the global ssl_timeout" do
|
77
|
+
expect(subject.ssl_timeout).to be == Spidr.ssl_timeout
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should use the global read_timeout" do
|
81
|
+
expect(subject.read_timeout).to be == Spidr.read_timeout
|
82
|
+
end
|
83
|
+
|
84
|
+
it "should use the global continue_timeout" do
|
85
|
+
expect(subject.continue_timeout).to be == Spidr.continue_timeout
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should use the global keep_alive_timeout" do
|
89
|
+
expect(subject.keep_alive_timeout).to be == Spidr.keep_alive_timeout
|
90
|
+
end
|
91
|
+
|
92
|
+
before(:all) do
|
93
|
+
Spidr.proxy = nil
|
94
|
+
|
95
|
+
Spidr.open_timeout = nil
|
96
|
+
Spidr.ssl_timeout = nil
|
97
|
+
Spidr.read_timeout = nil
|
98
|
+
Spidr.continue_timeout = nil
|
99
|
+
Spidr.keep_alive_timeout = nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
it_should_behave_like "includes Spidr::Settings::Proxy"
|
105
|
+
it_should_behave_like "includes Spidr::Settings::Timeouts"
|
106
|
+
|
107
|
+
context "when empty" do
|
7
108
|
before(:all) do
|
8
109
|
@sessions = SessionCache.new
|
9
110
|
end
|
@@ -21,7 +122,7 @@ describe SessionCache do
|
|
21
122
|
end
|
22
123
|
end
|
23
124
|
|
24
|
-
|
125
|
+
context "when not-empty" do
|
25
126
|
before(:all) do
|
26
127
|
@url = URI('http://example.com/')
|
27
128
|
|