spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,87 @@
1
+ require 'spec_helper'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
5
+
6
+ describe Page do
7
+ include_context "example Page"
8
+
9
+ describe "#code" do
10
+ it "should return the Integer version of the response status code" do
11
+ expect(subject.code).to be code
12
+ end
13
+ end
14
+
15
+ shared_examples "status code method" do |method,status_codes|
16
+ status_codes.each do |code,expected|
17
+ context "when status code is #{code}" do
18
+ let(:code) { code }
19
+
20
+ it { expect(subject.send(method)).to be expected }
21
+ end
22
+ end
23
+ end
24
+
25
+ describe "#is_ok?" do
26
+ include_examples "status code method", :is_ok?, {200 => true, 500 => false}
27
+ end
28
+
29
+ describe "#timedout?" do
30
+ include_examples "status code method", :timedout?, {308 => true, 200 => false}
31
+ end
32
+
33
+ describe "#bad_request?" do
34
+ include_examples "status code method", :bad_request?, {400 => true, 200 => false}
35
+ end
36
+
37
+ describe "#is_unauthorized?" do
38
+ include_examples "status code method", :is_unauthorized?, {401 => true, 200 => false}
39
+ end
40
+
41
+ describe "#is_forbidden?" do
42
+ include_examples "status code method", :is_forbidden?, {403 => true, 200 => false}
43
+ end
44
+
45
+ describe "#is_missing?" do
46
+ include_examples "status code method", :is_missing?, {404 => true, 200 => false}
47
+ end
48
+
49
+ describe "#had_internal_server_error?" do
50
+ include_examples "status code method", :had_internal_server_error?, {500 => true, 200 => false}
51
+ end
52
+
53
+ describe "#is_redirect?" do
54
+ include_examples "status code method", :is_redirect?, {
55
+ 300 => true,
56
+ 301 => true,
57
+ 302 => true,
58
+ 303 => true,
59
+ 304 => false,
60
+ 305 => false,
61
+ 306 => false,
62
+ 307 => true
63
+ }
64
+
65
+ context "when code is 200" do
66
+ context "and there is a meta refresh redirect" do
67
+ let(:body) do
68
+ %{<html><head><meta http-equiv="refresh" content="0; url=/other" /></head><body>redirecting...</body></html>}
69
+ end
70
+
71
+ it { expect(subject.is_redirect?).to be true }
72
+ end
73
+
74
+ context "and there is no meta refresh redirect" do
75
+ let(:body) { %{<html><body>foo</body></html>} }
76
+
77
+ it { expect(subject.is_redirect?).to be false }
78
+ end
79
+ end
80
+
81
+ context "when that status code is not 30x or 200" do
82
+ let(:code) { 404 }
83
+
84
+ it { expect(subject.is_redirect?).to be false }
85
+ end
86
+ end
87
+ end
@@ -1,128 +1,164 @@
1
- require 'spidr/page'
2
-
3
1
  require 'spec_helper'
4
- require 'page_examples'
5
- require 'helpers/page'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
6
5
 
7
6
  describe Page do
8
- describe "html" do
9
- before(:all) do
10
- @page = get_page('http://spidr.rubyforge.org/course/start.html')
11
- end
7
+ include_context "example Page"
12
8
 
13
- it_should_behave_like "Page"
9
+ describe "#initialize" do
10
+ let(:headers) { {'X-Foo' => 'bar'} }
14
11
 
15
- it "should be OK" do
16
- expect(@page).to be_ok
12
+ it "should set #url" do
13
+ expect(subject.url).to be url
17
14
  end
18
15
 
19
- it "should have a content-type" do
20
- expect(@page.content_type).to include('text/html')
16
+ it "should set #headers" do
17
+ expect(subject.headers).to be == {
18
+ 'content-type' => [content_type],
19
+ 'x-foo' => ['bar']
20
+ }
21
21
  end
22
+ end
22
23
 
23
- it "should be a html page" do
24
- expect(@page).to be_html
25
- end
24
+ describe "method_missing" do
25
+ let(:headers) { {'X-Foo' => 'bar'} }
26
26
 
27
- it "should have provide a document" do
28
- expect(@page.doc.class).to eq(Nokogiri::HTML::Document)
27
+ it "should provide transparent access to headers" do
28
+ expect(subject.x_foo).to be == 'bar'
29
29
  end
30
30
 
31
- it "should allow searching the document" do
32
- expect(@page.doc.search('//p').length).to eq(2)
33
- expect(@page.doc.at('//p[2]').inner_text).to eq('Ready! Set! Go!')
31
+ context "when the requested header does not exist" do
32
+ it do
33
+ expect { subject.x_bar }.to raise_error(NoMethodError)
34
+ end
34
35
  end
35
36
 
36
- it "should have a title" do
37
- expect(@page.title).to eq('Spidr :: Web-Spider Obstacle Course :: Start')
37
+ context "when method arguments are also given" do
38
+ it do
39
+ expect { subject.x_foo(1) }.to raise_error(NoMethodError)
40
+ end
38
41
  end
39
42
 
40
- it "should have links" do
41
- expect(@page.links).not_to be_empty
43
+ context "when a block is also given" do
44
+ it do
45
+ expect { subject.x_foo { } }.to raise_error(NoMethodError)
46
+ end
42
47
  end
43
48
  end
44
49
 
45
- describe "txt" do
46
- before(:all) do
47
- @page = get_page('https://www.ruby-lang.org/en/about/license.txt')
48
- end
50
+ describe "#body" do
51
+ context "when there is a body" do
52
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
49
53
 
50
- it_should_behave_like "Page"
51
-
52
- it "should be OK" do
53
- expect(@page).to be_ok
54
+ it "should return the body text" do
55
+ expect(subject.body).to be body
56
+ end
54
57
  end
55
58
 
56
- it "should have a content-type" do
57
- expect(@page.content_type).to include('text/plain')
59
+ context "when there is no body" do
60
+ it "should return an empty String" do
61
+ expect(subject.body).to be == ''
62
+ end
58
63
  end
64
+ end
59
65
 
60
- it "should be a txt page" do
61
- expect(@page).to be_txt
62
- end
66
+ describe "#doc" do
67
+ context "when the Content-Type is text/html" do
68
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
63
69
 
64
- it "should not have provide a document" do
65
- expect(@page.doc).to be_nil
70
+ it "should parse the body as HTML" do
71
+ expect(subject.doc).to be_kind_of(Nokogiri::HTML::Document)
72
+ expect(subject.doc.at('//p').inner_text).to be == 'hello'
73
+ end
66
74
  end
67
75
 
68
- it "should not allow searching the document" do
69
- expect(@page.search('//p')).to be_empty
70
- expect(@page.at('//p')).to be_nil
71
- end
76
+ context "when the document is application/rss+xml" do
77
+ let(:content_type) { 'application/rss+xml' }
78
+ let(:body) do
79
+ %{<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"></rss>}
80
+ end
72
81
 
73
- it "should not have links" do
74
- expect(@page.links).to be_empty
82
+ it "should parse the body as XML" do
83
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
84
+ end
75
85
  end
76
86
 
77
- it "should not have a title" do
78
- expect(@page.title).to be_nil
79
- end
80
- end
87
+ context "when the document is application/atom+xml" do
88
+ let(:content_type) { 'application/atom+xml' }
89
+ let(:body) do
90
+ %{<?xml version="1.0" encoding="UTF-8" ?><feed xmlns="http://www.w3.org/2005/Atom"></feed>}
91
+ end
81
92
 
82
- describe "redirects" do
83
- before(:all) do
84
- @page = get_page('http://spidr.rubyforge.org/course/start.html')
93
+ it "should parse the body as XML" do
94
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
95
+ end
85
96
  end
86
97
 
87
- before do
88
- allow(@page).to receive(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
98
+ context "when the document is text/xml" do
99
+ let(:content_type) { 'text/xml' }
100
+ let(:body) do
101
+ %{<?xml version="1.0" encoding="UTF-8" ?><foo />}
102
+ end
103
+
104
+ it "should parse the body as XML" do
105
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
106
+ end
89
107
  end
90
108
 
91
- it "should provide access to page-level redirects" do
92
- expect(@page.redirects_to).to eq(['http://spidr.rubyforge.org/redirected'])
93
- end
109
+ context "when the document is text/xsl" do
110
+ let(:content_type) { 'text/xsl' }
111
+ let(:body) do
112
+ %{<?xml version="1.0" encoding="UTF-8" ?><xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"></xsl:stylesheet>}
113
+ end
94
114
 
95
- it "should include meta refresh redirects in the list of links" do
96
- expect(@page.links).to include('http://spidr.rubyforge.org/redirected')
115
+ it "should parse the body as XML" do
116
+ expect(subject.doc).to be_kind_of(Nokogiri::XML::Document)
117
+ end
97
118
  end
98
- end
99
119
 
100
- describe "cookies" do
101
- before(:all) do
102
- @page = get_page('http://twitter.com/login')
120
+ context "when there is no body" do
121
+ it "should return nil" do
122
+ expect(subject.doc).to be nil
123
+ end
103
124
  end
125
+ end
104
126
 
105
- it "should provide access to the raw Cookie" do
106
- cookie = @page.cookie
127
+ describe "#search" do
128
+ context "when there is a document" do
129
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
107
130
 
108
- expect(cookie).not_to be_nil
109
- expect(cookie).not_to be_empty
131
+ it "should search the document" do
132
+ expect(subject.search('//p').inner_text).to be == 'hello'
133
+ end
110
134
  end
111
135
 
112
- it "should provide access to the Cookies" do
113
- cookies = @page.cookies
114
-
115
- expect(cookies).not_to be_empty
136
+ context "when there is no document" do
137
+ it "should return an empty Array" do
138
+ expect(subject.search('//p')).to be == []
139
+ end
116
140
  end
141
+ end
142
+
143
+ describe "#at" do
144
+ context "when there is a document" do
145
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
117
146
 
118
- it "should provide access to the key->value pairs within the Cookie" do
119
- params = @page.cookie_params
120
-
121
- expect(params).not_to be_empty
147
+ it "should search the document for the first matching node" do
148
+ expect(subject.at('//p').inner_text).to be == 'hello'
149
+ end
150
+ end
122
151
 
123
- params.each do |key,value|
124
- expect(key).not_to be_empty
152
+ context "when there is no document" do
153
+ it "should return nil" do
154
+ expect(subject.at('//p')).to be nil
125
155
  end
126
156
  end
127
157
  end
158
+
159
+ describe "#to_s" do
160
+ it "should return the body" do
161
+ expect(subject.to_s).to be body
162
+ end
163
+ end
128
164
  end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+ require 'spidr/proxy'
3
+
4
+ describe Spidr::Proxy do
5
+ let(:proxy_host) { 'proxy.example.com' }
6
+ let(:proxy_port) { 9999 }
7
+ let(:proxy_user) { 'bob' }
8
+ let(:proxy_password) { 'secret' }
9
+
10
+ describe "DEFAULT_PORT" do
11
+ subject { described_class::DEFAULT_PORT }
12
+
13
+ it { expect(subject).to be 8080 }
14
+ end
15
+
16
+ describe "#initialize" do
17
+ it "should default port to 8080" do
18
+ expect(subject.port).to be 8080
19
+ end
20
+ end
21
+
22
+ describe "#enabled?" do
23
+ context "when host is set" do
24
+ subject { described_class.new(host: proxy_host) }
25
+
26
+ it { expect(subject.enabled?).to be true }
27
+ end
28
+
29
+ context "when host is not set" do
30
+ it { expect(subject.enabled?).to be false }
31
+ end
32
+ end
33
+
34
+ describe "#disabled?" do
35
+ context "when host is not set" do
36
+ it { expect(subject.disabled?).to be true }
37
+ end
38
+
39
+ context "when host is set" do
40
+ subject { described_class.new(host: proxy_host) }
41
+
42
+ it { expect(subject.disabled?).to be false }
43
+ end
44
+ end
45
+ end
@@ -1,9 +1,110 @@
1
1
  require 'spidr/session_cache'
2
2
 
3
3
  require 'spec_helper'
4
+ require 'settings/proxy_examples'
5
+ require 'settings/timeouts_examples'
4
6
 
5
7
  describe SessionCache do
6
- describe "empty" do
8
+ describe "#initialize" do
9
+ let(:proxy_host) { 'proxy.example.com' }
10
+ let(:proxy_port) { 9999 }
11
+
12
+ let(:open_timeout) { 1 }
13
+ let(:ssl_timeout) { 2 }
14
+ let(:read_timeout) { 3 }
15
+ let(:continue_timeout) { 4 }
16
+ let(:keep_alive_timeout) { 5 }
17
+
18
+ subject do
19
+ described_class.new(
20
+ proxy: {host: proxy_host, port: proxy_port},
21
+
22
+ open_timeout: open_timeout,
23
+ ssl_timeout: ssl_timeout,
24
+ read_timeout: read_timeout,
25
+ continue_timeout: continue_timeout,
26
+ keep_alive_timeout: keep_alive_timeout,
27
+ )
28
+ end
29
+
30
+ it "should set proxy" do
31
+ expect(subject.proxy[:host]).to be == proxy_host
32
+ expect(subject.proxy[:port]).to be == proxy_port
33
+ end
34
+
35
+ it "should set open_timeout" do
36
+ expect(subject.open_timeout).to be open_timeout
37
+ end
38
+
39
+ it "should set ssl_timeout" do
40
+ expect(subject.ssl_timeout).to be ssl_timeout
41
+ end
42
+
43
+ it "should set read_timeout" do
44
+ expect(subject.read_timeout).to be read_timeout
45
+ end
46
+
47
+ it "should set continue_timeout" do
48
+ expect(subject.continue_timeout).to be continue_timeout
49
+ end
50
+
51
+ it "should set keep_alive_timeout" do
52
+ expect(subject.keep_alive_timeout).to be keep_alive_timeout
53
+ end
54
+
55
+ context "with no arguments" do
56
+ before(:all) do
57
+ Spidr.proxy = {host: 'proxy.example.com', port: 9999}
58
+
59
+ Spidr.open_timeout = 1
60
+ Spidr.ssl_timeout = 2
61
+ Spidr.read_timeout = 3
62
+ Spidr.continue_timeout = 4
63
+ Spidr.keep_alive_timeout = 5
64
+ end
65
+
66
+ subject { described_class.new }
67
+
68
+ it "should use the global proxy settings" do
69
+ expect(subject.proxy).to be Spidr.proxy
70
+ end
71
+
72
+ it "should use the global open_timeout" do
73
+ expect(subject.open_timeout).to be == Spidr.open_timeout
74
+ end
75
+
76
+ it "should use the global ssl_timeout" do
77
+ expect(subject.ssl_timeout).to be == Spidr.ssl_timeout
78
+ end
79
+
80
+ it "should use the global read_timeout" do
81
+ expect(subject.read_timeout).to be == Spidr.read_timeout
82
+ end
83
+
84
+ it "should use the global continue_timeout" do
85
+ expect(subject.continue_timeout).to be == Spidr.continue_timeout
86
+ end
87
+
88
+ it "should use the global keep_alive_timeout" do
89
+ expect(subject.keep_alive_timeout).to be == Spidr.keep_alive_timeout
90
+ end
91
+
92
+ after(:all) do
93
+ Spidr.proxy = nil
94
+
95
+ Spidr.open_timeout = nil
96
+ Spidr.ssl_timeout = nil
97
+ Spidr.read_timeout = nil
98
+ Spidr.continue_timeout = nil
99
+ Spidr.keep_alive_timeout = nil
100
+ end
101
+ end
102
+ end
103
+
104
+ it_should_behave_like "includes Spidr::Settings::Proxy"
105
+ it_should_behave_like "includes Spidr::Settings::Timeouts"
106
+
107
+ context "when empty" do
7
108
  before(:all) do
8
109
  @sessions = SessionCache.new
9
110
  end
@@ -21,7 +122,7 @@ describe SessionCache do
21
122
  end
22
123
  end
23
124
 
24
- describe "not-empty" do
125
+ context "when not-empty" do
25
126
  before(:all) do
26
127
  @url = URI('http://example.com/')
27
128