spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,87 @@
1
+ require 'spec_helper'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
5
+
6
+ describe Page do
7
+ include_context "example Page"
8
+
9
+ describe "#code" do
10
+ it "should return the Integer version of the response status code" do
11
+ expect(subject.code).to be code
12
+ end
13
+ end
14
+
15
+ shared_examples "status code method" do |method,status_codes|
16
+ status_codes.each do |code,expected|
17
+ context "when status code is #{code}" do
18
+ let(:code) { code }
19
+
20
+ it { expect(subject.send(method)).to be expected }
21
+ end
22
+ end
23
+ end
24
+
25
+ describe "#is_ok?" do
26
+ include_examples "status code method", :is_ok?, {200 => true, 500 => false}
27
+ end
28
+
29
+ describe "#timedout?" do
30
+ include_examples "status code method", :timedout?, {308 => true, 200 => false}
31
+ end
32
+
33
+ describe "#bad_request?" do
34
+ include_examples "status code method", :bad_request?, {400 => true, 200 => false}
35
+ end
36
+
37
+ describe "#is_unauthorized?" do
38
+ include_examples "status code method", :is_unauthorized?, {401 => true, 200 => false}
39
+ end
40
+
41
+ describe "#is_forbidden?" do
42
+ include_examples "status code method", :is_forbidden?, {403 => true, 200 => false}
43
+ end
44
+
45
+ describe "#is_missing?" do
46
+ include_examples "status code method", :is_missing?, {404 => true, 200 => false}
47
+ end
48
+
49
+ describe "#had_internal_server_error?" do
50
+ include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
51
+ end
52
+
53
+ describe "#is_redirect?" do
54
+ include_examples "status code method", :is_redirect?, {
55
+ 300 => true,
56
+ 301 => true,
57
+ 302 => true,
58
+ 303 => true,
59
+ 304 => false,
60
+ 305 => false,
61
+ 306 => false,
62
+ 307 => true
63
+ }
64
+
65
+ context "when code is 200" do
66
+ context "and there is a meta refresh redirect" do
67
+ let(:body) do
68
+ %{<html><head><meta http-equiv="refresh" content="0; url=/other" /></head><body>redirecting...</body></html>}
69
+ end
70
+
71
+ it { expect(subject.is_redirect?).to be true }
72
+ end
73
+
74
+ context "and there is no meta refresh redirect" do
75
+ let(:body) { %{<html><body>foo</body></html>} }
76
+
77
+ it { expect(subject.is_redirect?).to be false }
78
+ end
79
+ end
80
+
81
+ context "when the status code is not 30x or 200" do
82
+ let(:code) { 404 }
83
+
84
+ it { expect(subject.is_redirect?).to be false }
85
+ end
86
+ end
87
+ end
@@ -1,128 +1,164 @@
1
- require 'spidr/page'
2
-
3
1
  require 'spec_helper'
4
- require 'page_examples'
5
- require 'helpers/page'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
6
5
 
7
6
  describe Page do
8
- describe "html" do
9
- before(:all) do
10
- @page = get_page('http://spidr.rubyforge.org/course/start.html')
11
- end
7
+ include_context "example Page"
12
8
 
13
- it_should_behave_like "Page"
9
+ describe "#initialize" do
10
+ let(:headers) { {'X-Foo' => 'bar'} }
14
11
 
15
- it "should be OK" do
16
- expect(@page).to be_ok
12
+ it "should set #url" do
13
+ expect(subject.url).to be url
17
14
  end
18
15
 
19
- it "should have a content-type" do
20
- expect(@page.content_type).to include('text/html')
16
+ it "should set #headers" do
17
+ expect(subject.headers).to be == {
18
+ 'content-type' => [content_type],
19
+ 'x-foo' => ['bar']
20
+ }
21
21
  end
22
+ end
22
23
 
23
- it "should be a html page" do
24
- expect(@page).to be_html
25
- end
24
+ describe "method_missing" do
25
+ let(:headers) { {'X-Foo' => 'bar'} }
26
26
 
27
- it "should have provide a document" do
28
- expect(@page.doc.class).to eq(Nokogiri::HTML::Document)
27
+ it "should provide transparent access to headers" do
28
+ expect(subject.x_foo).to be == 'bar'
29
29
  end
30
30
 
31
- it "should allow searching the document" do
32
- expect(@page.doc.search('//p').length).to eq(2)
33
- expect(@page.doc.at('//p[2]').inner_text).to eq('Ready! Set! Go!')
31
+ context "when the requested header does not exist" do
32
+ it do
33
+ expect { subject.x_bar }.to raise_error(NoMethodError)
34
+ end
34
35
  end
35
36
 
36
- it "should have a title" do
37
- expect(@page.title).to eq('Spidr :: Web-Spider Obstacle Course :: Start')
37
+ context "when method arguments are also given" do
38
+ it do
39
+ expect { subject.x_foo(1) }.to raise_error(NoMethodError)
40
+ end
38
41
  end
39
42
 
40
- it "should have links" do
41
- expect(@page.links).not_to be_empty
43
+ context "when a block is also given" do
44
+ it do
45
+ expect { subject.x_foo { } }.to raise_error(NoMethodError)
46
+ end
42
47
  end
43
48
  end
44
49
 
45
- describe "txt" do
46
- before(:all) do
47
- @page = get_page('https://www.ruby-lang.org/en/about/license.txt')
48
- end
50
+ describe "#body" do
51
+ context "when there is a body" do
52
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
49
53
 
50
- it_should_behave_like "Page"
51
-
52
- it "should be OK" do
53
- expect(@page).to be_ok
54
+ it "should return the body text" do
55
+ expect(subject.body).to be body
56
+ end
54
57
  end
55
58
 
56
- it "should have a content-type" do
57
- expect(@page.content_type).to include('text/plain')
59
+ context "when there is no body" do
60
+ it "should return an empty String" do
61
+ expect(subject.body).to be == ''
62
+ end
58
63
  end
64
+ end
59
65
 
60
- it "should be a txt page" do
61
- expect(@page).to be_txt
62
- end
66
+ describe "#doc" do
67
+ context "when the Content-Type is text/html" do
68
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
63
69
 
64
- it "should not have provide a document" do
65
- expect(@page.doc).to be_nil
70
+ it "should parse the body as HTML" do
71
+ expect(subject.doc).to be_kind_of(Nokogiri::HTML::Document)
72
+ expect(subject.doc.at('//p').inner_text).to be == 'hello'
73
+ end
66
74
  end
67
75
 
68
- it "should not allow searching the document" do
69
- expect(@page.search('//p')).to be_empty
70
- expect(@page.at('//p')).to be_nil
71
- end
76
+ context "when the document is application/rss+xml" do
77
+ let(:content_type) { 'application/rss+xml' }
78
+ let(:body) do
79
+ %{<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"></rss>}
80
+ end
72
81
 
73
- it "should not have links" do
74
- expect(@page.links).to be_empty
82
+ it "should parse the body as XML" do
83
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
84
+ end
75
85
  end
76
86
 
77
- it "should not have a title" do
78
- expect(@page.title).to be_nil
79
- end
80
- end
87
+ context "when the document is application/atom+xml" do
88
+ let(:content_type) { 'application/atom+xml' }
89
+ let(:body) do
90
+ %{<?xml version="1.0" encoding="UTF-8" ?><feed xmlns="http://www.w3.org/2005/Atom"></feed>}
91
+ end
81
92
 
82
- describe "redirects" do
83
- before(:all) do
84
- @page = get_page('http://spidr.rubyforge.org/course/start.html')
93
+ it "should parse the body as XML" do
94
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
95
+ end
85
96
  end
86
97
 
87
- before do
88
- allow(@page).to receive(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
98
+ context "when the document is text/xml" do
99
+ let(:content_type) { 'text/xml' }
100
+ let(:body) do
101
+ %{<?xml version="1.0" encoding="UTF-8" ?><foo />}
102
+ end
103
+
104
+ it "should parse the body as XML" do
105
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
106
+ end
89
107
  end
90
108
 
91
- it "should provide access to page-level redirects" do
92
- expect(@page.redirects_to).to eq(['http://spidr.rubyforge.org/redirected'])
93
- end
109
+ context "when the document is text/xsl" do
110
+ let(:content_type) { 'text/xsl' }
111
+ let(:body) do
112
+ %{<?xml version="1.0" encoding="UTF-8" ?><xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"></xsl:stylesheet>}
113
+ end
94
114
 
95
- it "should include meta refresh redirects in the list of links" do
96
- expect(@page.links).to include('http://spidr.rubyforge.org/redirected')
115
+ it "should parse the body as XML" do
116
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
117
+ end
97
118
  end
98
- end
99
119
 
100
- describe "cookies" do
101
- before(:all) do
102
- @page = get_page('http://twitter.com/login')
120
+ context "when there is no body" do
121
+ it "should return nil" do
122
+ expect(subject.doc).to be nil
123
+ end
103
124
  end
125
+ end
104
126
 
105
- it "should provide access to the raw Cookie" do
106
- cookie = @page.cookie
127
+ describe "#search" do
128
+ context "when there is a document" do
129
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
107
130
 
108
- expect(cookie).not_to be_nil
109
- expect(cookie).not_to be_empty
131
+ it "should search the document" do
132
+ expect(subject.search('//p').inner_text).to be == 'hello'
133
+ end
110
134
  end
111
135
 
112
- it "should provide access to the Cookies" do
113
- cookies = @page.cookies
114
-
115
- expect(cookies).not_to be_empty
136
+ context "when there is no document" do
137
+ it "should return an empty Array" do
138
+ expect(subject.search('//p')).to be == []
139
+ end
116
140
  end
141
+ end
142
+
143
+ describe "#at" do
144
+ context "when there is a document" do
145
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
117
146
 
118
- it "should provide access to the key->value pairs within the Cookie" do
119
- params = @page.cookie_params
120
-
121
- expect(params).not_to be_empty
147
+ it "should search the document for the first matching node" do
148
+ expect(subject.at('//p').inner_text).to be == 'hello'
149
+ end
150
+ end
122
151
 
123
- params.each do |key,value|
124
- expect(key).not_to be_empty
152
+ context "when there is no document" do
153
+ it "should return nil" do
154
+ expect(subject.at('//p')).to be nil
125
155
  end
126
156
  end
127
157
  end
158
+
159
+ describe "#to_s" do
160
+ it "should return the body" do
161
+ expect(subject.to_s).to be body
162
+ end
163
+ end
128
164
  end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+ require 'spidr/proxy'
3
+
4
+ describe Spidr::Proxy do
5
+ let(:proxy_host) { 'proxy.example.com' }
6
+ let(:proxy_port) { 9999 }
7
+ let(:proxy_user) { 'bob' }
8
+ let(:proxy_password) { 'secret' }
9
+
10
+ describe "DEFAULT_PORT" do
11
+ subject { described_class::DEFAULT_PORT }
12
+
13
+ it { expect(subject).to be 8080 }
14
+ end
15
+
16
+ describe "#initialize" do
17
+ it "should default port to 8080" do
18
+ expect(subject.port).to be 8080
19
+ end
20
+ end
21
+
22
+ describe "#enabled?" do
23
+ context "when host is set" do
24
+ subject { described_class.new(host: proxy_host) }
25
+
26
+ it { expect(subject.enabled?).to be true }
27
+ end
28
+
29
+ context "when host is not set" do
30
+ it { expect(subject.enabled?).to be false }
31
+ end
32
+ end
33
+
34
+ describe "#disabled?" do
35
+ context "when host is not set" do
36
+ it { expect(subject.disabled?).to be true }
37
+ end
38
+
39
+ context "when host is set" do
40
+ subject { described_class.new(host: proxy_host) }
41
+
42
+ it { expect(subject.disabled?).to be false }
43
+ end
44
+ end
45
+ end
@@ -1,9 +1,110 @@
1
1
  require 'spidr/session_cache'
2
2
 
3
3
  require 'spec_helper'
4
+ require 'settings/proxy_examples'
5
+ require 'settings/timeouts_examples'
4
6
 
5
7
  describe SessionCache do
6
- describe "empty" do
8
+ describe "#initialize" do
9
+ let(:proxy_host) { 'proxy.example.com' }
10
+ let(:proxy_port) { 9999 }
11
+
12
+ let(:open_timeout) { 1 }
13
+ let(:ssl_timeout) { 2 }
14
+ let(:read_timeout) { 3 }
15
+ let(:continue_timeout) { 4 }
16
+ let(:keep_alive_timeout) { 5 }
17
+
18
+ subject do
19
+ described_class.new(
20
+ proxy: {host: proxy_host, port: proxy_port},
21
+
22
+ open_timeout: open_timeout,
23
+ ssl_timeout: ssl_timeout,
24
+ read_timeout: read_timeout,
25
+ continue_timeout: continue_timeout,
26
+ keep_alive_timeout: keep_alive_timeout,
27
+ )
28
+ end
29
+
30
+ it "should set proxy" do
31
+ expect(subject.proxy[:host]).to be == proxy_host
32
+ expect(subject.proxy[:port]).to be == proxy_port
33
+ end
34
+
35
+ it "should set open_timeout" do
36
+ expect(subject.open_timeout).to be open_timeout
37
+ end
38
+
39
+ it "should set ssl_timeout" do
40
+ expect(subject.ssl_timeout).to be ssl_timeout
41
+ end
42
+
43
+ it "should set read_timeout" do
44
+ expect(subject.read_timeout).to be read_timeout
45
+ end
46
+
47
+ it "should set continue_timeout" do
48
+ expect(subject.continue_timeout).to be continue_timeout
49
+ end
50
+
51
+ it "should set keep_alive_timeout" do
52
+ expect(subject.keep_alive_timeout).to be keep_alive_timeout
53
+ end
54
+
55
+ context "with no arguments" do
56
+ before(:all) do
57
+ Spidr.proxy = {host: 'proxy.example.com', port: 9999}
58
+
59
+ Spidr.open_timeout = 1
60
+ Spidr.ssl_timeout = 2
61
+ Spidr.read_timeout = 3
62
+ Spidr.continue_timeout = 4
63
+ Spidr.keep_alive_timeout = 5
64
+ end
65
+
66
+ subject { described_class.new }
67
+
68
+ it "should use the global proxy settings" do
69
+ expect(subject.proxy).to be Spidr.proxy
70
+ end
71
+
72
+ it "should use the global open_timeout" do
73
+ expect(subject.open_timeout).to be == Spidr.open_timeout
74
+ end
75
+
76
+ it "should use the global ssl_timeout" do
77
+ expect(subject.ssl_timeout).to be == Spidr.ssl_timeout
78
+ end
79
+
80
+ it "should use the global read_timeout" do
81
+ expect(subject.read_timeout).to be == Spidr.read_timeout
82
+ end
83
+
84
+ it "should use the global continue_timeout" do
85
+ expect(subject.continue_timeout).to be == Spidr.continue_timeout
86
+ end
87
+
88
+ it "should use the global keep_alive_timeout" do
89
+ expect(subject.keep_alive_timeout).to be == Spidr.keep_alive_timeout
90
+ end
91
+
92
+ after(:all) do
93
+ Spidr.proxy = nil
94
+
95
+ Spidr.open_timeout = nil
96
+ Spidr.ssl_timeout = nil
97
+ Spidr.read_timeout = nil
98
+ Spidr.continue_timeout = nil
99
+ Spidr.keep_alive_timeout = nil
100
+ end
101
+ end
102
+ end
103
+
104
+ it_should_behave_like "includes Spidr::Settings::Proxy"
105
+ it_should_behave_like "includes Spidr::Settings::Timeouts"
106
+
107
+ context "when empty" do
7
108
  before(:all) do
8
109
  @sessions = SessionCache.new
9
110
  end
@@ -21,7 +122,7 @@ describe SessionCache do
21
122
  end
22
123
  end
23
124
 
24
- describe "not-empty" do
125
+ context "when not-empty" do
25
126
  before(:all) do
26
127
  @url = URI('http://example.com/')
27
128