aquanaut 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +20 -0
- data/Guardfile +24 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +1 -0
- data/aquanaut.gemspec +27 -0
- data/bin/aquanaut +18 -0
- data/lib/aquanaut/asset_node.rb +23 -0
- data/lib/aquanaut/graph.rb +74 -0
- data/lib/aquanaut/node.rb +22 -0
- data/lib/aquanaut/page_node.rb +25 -0
- data/lib/aquanaut/sitemap.rb +55 -0
- data/lib/aquanaut/templates/assets/css/custom.css +27 -0
- data/lib/aquanaut/templates/assets/js/graph.js +46 -0
- data/lib/aquanaut/templates/index.html.slim +29 -0
- data/lib/aquanaut/version.rb +4 -0
- data/lib/aquanaut/worker.rb +111 -0
- data/lib/aquanaut.rb +41 -0
- data/spec/aquanaut/aquanaut_spec.rb +48 -0
- data/spec/aquanaut/asset_node_spec.rb +16 -0
- data/spec/aquanaut/graph_spec.rb +89 -0
- data/spec/aquanaut/node_spec.rb +26 -0
- data/spec/aquanaut/page_node_spec.rb +14 -0
- data/spec/aquanaut/sitemap_spec.rb +60 -0
- data/spec/aquanaut/worker_spec.rb +308 -0
- data/spec/spec_helper.rb +17 -0
- data/vendor/assets/css/bootstrap-theme.css +347 -0
- data/vendor/assets/css/bootstrap-theme.css.map +1 -0
- data/vendor/assets/css/bootstrap-theme.min.css +7 -0
- data/vendor/assets/css/bootstrap.css +5785 -0
- data/vendor/assets/css/bootstrap.css.map +1 -0
- data/vendor/assets/css/bootstrap.min.css +7 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.eot +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.svg +229 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.woff +0 -0
- data/vendor/assets/js/bootstrap.js +1951 -0
- data/vendor/assets/js/bootstrap.min.js +6 -0
- data/vendor/assets/js/d3.v3.min.js +5 -0
- data/vendor/assets/js/jquery-2.1.0.min.js +4 -0
- metadata +205 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
# Integration spec for the top-level Aquanaut entry point. All HTTP traffic is
# replaced by WebMock stubs, so no real network access takes place.
describe Aquanaut do

  describe ".process_domain" do
    it "builds a graph with pages as nodes and interlinks as edges" do
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/about.html">About us</a>
      BODY

      uri = URI.parse('http://www.example.com')
      response = { body: body, headers: { 'Content-Type' => 'text/html' } }

      # The root page only needs a single GET stub (the original registered
      # the identical stub twice).
      stub_request(:get, 'www.example.com').to_return(response)

      # Both sub-pages answer HEAD and GET with the very same document, so
      # every page in this fixture links to home.html and about.html.
      %w[home about].each do |page|
        stub_request(:head, "www.example.com/#{page}.html").to_return(response)
        stub_request(:get, "www.example.com/#{page}.html").to_return(response)
      end

      graph = Aquanaut.process_domain('http://www.example.com')

      uris = ['http://www.example.com/home.html',
              'http://www.example.com/about.html'].map { |u| URI.parse(u) }

      root_node = graph[uri]
      page_1_node = graph[uris[0]]
      page_2_node = graph[uris[1]]

      expect(root_node).to be_an_instance_of(Aquanaut::PageNode)
      expect(page_1_node).to be_an_instance_of(Aquanaut::PageNode)
      expect(page_2_node).to be_an_instance_of(Aquanaut::PageNode)

      # Since all three pages share one body, all three adjacency lists are
      # expected to contain exactly the two sub-pages, in document order.
      adjacency_list = [page_1_node, page_2_node]
      expect(root_node.adjacency_list).to eq(adjacency_list)
      expect(page_1_node.adjacency_list).to eq(adjacency_list)
      expect(page_2_node.adjacency_list).to eq(adjacency_list)
    end
  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Spec for AssetNode: checks that the constructor arguments are exposed
# through the #uri and #type readers.
describe Aquanaut::AssetNode do

  describe "#initialize" do
    let(:picture_uri) { URI.parse('http://www.example.com/picture.jpg') }
    let(:asset)       { Aquanaut::AssetNode.new(picture_uri, :image) }

    it "stores the URI and type" do
      expect(asset.uri).to eq(picture_uri)
      expect(asset.type).to eq(:image)
    end
  end

end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Spec for Graph: node registration keyed by URI, edge creation between
# registered nodes, lookup through #[] and iteration through #each.
describe Aquanaut::Graph do
  # Fixtures shared by the examples below. `let` is lazy and memoized per
  # example, so every example starts with a fresh graph and fresh nodes.
  let(:uri)           { URI.parse('http://www.example.com') }
  let(:node)          { Aquanaut::PageNode.new(uri) }
  let(:adjacent_uri)  { URI.parse('http://www.example.com/home.html') }
  let(:adjacent_node) { Aquanaut::PageNode.new(adjacent_uri) }
  let(:graph)         { Aquanaut::Graph.new }

  describe "#initialize" do
    it "initializes an empty nodes hash" do
      expect(graph.instance_variable_get('@nodes')).to be_empty
    end
  end

  describe "#add_node" do
    it "adds the node and hashes the node based on its URI attribute" do
      graph.add_node(node)

      expect(graph[node.uri]).to eq(node)
    end

    it "does not add a node if it already exists under the given URI" do
      same_node = Aquanaut::PageNode.new(uri)

      graph.add_node(node)
      # Actually attempt the duplicate insertion. Without this call the
      # example passes even if #add_node were to overwrite existing entries,
      # because same_node would never have reached the graph at all.
      graph.add_node(same_node)

      expect(graph[uri]).to be(node)
      expect(graph[uri]).to_not be(same_node)
    end
  end

  describe "#add_edge" do
    it "looks up the predecessor node and delegates to the node's method" do
      graph.add_node(node)
      graph.add_node(adjacent_node)
      graph.add_edge(uri, adjacent_uri)

      expect(graph[uri].adjacency_list.first).to eq(adjacent_node)
    end
  end

  describe "#[]" do
    it "looks up a node based on its URI" do
      graph.add_node(node)

      expect(graph[uri]).to be(node)
    end
  end

  describe "#each" do
    it "returns the node object and its adjacency list in an iteration" do
      graph.add_node(node)
      graph.add_node(adjacent_node)
      graph.add_edge(uri, adjacent_uri)

      # Track whether the block ran at all: expectations placed only inside
      # the block would silently pass if #each never yielded.
      yielded = false

      graph.each do |page_node, adjacency_list|
        yielded = true
        expect(page_node).to be_an_instance_of(Aquanaut::PageNode)
        expect(page_node).to be(node)
        expect(adjacency_list).to be(node.adjacency_list)
        break # only the first yielded pair is inspected
      end

      expect(yielded).to eq(true)
    end
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Spec for the Node base class: a new node has an empty adjacency list and
# #add_edge appends a successor to it.
describe Aquanaut::Node do
  let(:source) { Aquanaut::Node.new }

  describe "#initialize" do
    it "initializes an empty adjacency list" do
      expect(source.adjacency_list).to be_empty
    end
  end

  describe "#add_edge" do
    let(:successor) { Aquanaut::Node.new }

    it "adds a successor to the adjacency list" do
      expect { source.add_edge(successor) }
        .to change { source.adjacency_list.count }.by(1)

      expect(source.adjacency_list.first).to eq(successor)
    end
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Spec for PageNode: the URI handed to the constructor is exposed via #uri.
describe Aquanaut::PageNode do

  describe "#initialize" do
    let(:page_uri) { URI.parse('http://www.example.com') }

    it "stores the URI" do
      page = Aquanaut::PageNode.new(page_uri)

      expect(page.uri).to eq(page_uri)
    end
  end

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Spec for Sitemap: constructor state, preparation of the target directory
# and rendering of the result page. Examples that write to disk use the
# relative directory spec/sitemap and remove it again afterwards.
describe Aquanaut::Sitemap do
  let(:graph)      { Aquanaut::Graph.new }
  let(:domain)     { 'http://www.example.com' }
  let(:target_dir) { 'spec/sitemap' }

  describe "#initialize" do
    it "stores the given graph and domain" do
      sitemap = Aquanaut::Sitemap.new(graph, domain)

      expect(sitemap.instance_variable_get('@graph')).to be(graph)
      expect(sitemap.instance_variable_get('@domain')).to be(domain)
    end

    it "expands the path of the given target directory" do
      sitemap = Aquanaut::Sitemap.new(graph, domain, target_dir)
      expanded_dir = sitemap.instance_variable_get('@target_dir')

      expect(Pathname.new(expanded_dir).absolute?).to be_true
      # NOTE(review): this presumably only holds when the suite is run from a
      # checkout directory named "aquanaut" -- confirm before relying on it.
      expect(expanded_dir.end_with?("/aquanaut/#{target_dir}")).to be_true
    end
  end

  describe "#initialize_target_directory" do
    # Clean up in a hook so the directory is removed even when an
    # expectation fails; rm_rf does not raise if nothing was created.
    after { FileUtils.rm_rf(target_dir) }

    it "creates the directory and copies assets file if necessary" do
      sitemap = Aquanaut::Sitemap.new(graph, domain, target_dir)
      # The method is invoked through #send, as in the original spec.
      sitemap.send(:initialize_target_directory)

      expect(Dir.exist?(target_dir)).to be_true
      expect(Dir.exist?("#{target_dir}/assets")).to be_true
    end
  end

  describe "#render_results" do
    after { FileUtils.rm_rf(target_dir) }

    it "the result rendering works for an empty graph" do
      sitemap = Aquanaut::Sitemap.new(graph, domain, target_dir)
      sitemap.render_results

      # The original called expect(...) without a matcher, which asserts
      # nothing; the matcher makes the check effective.
      expect(File.exist?("#{target_dir}/index.html")).to be_true
    end
  end

end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
require 'aquanaut'
|
2
|
+
require 'spec_helper'
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
# Spec for Worker, the crawler: queue initialisation, the internal/external
# domain check, link extraction and filtering, and the exploration loop.
# All HTTP traffic is served by WebMock stubs.
describe Aquanaut::Worker do
  let(:target)     { 'http://www.example.com' }
  let(:root_uri)   { URI.parse(target) }
  let(:worker)     { Aquanaut::Worker.new(target) }
  let(:page_paths) { %w[/home.html /about.html /contact.html] }

  # Builds the response hash handed to WebMock's `to_return`: an HTML content
  # type plus the given body. The :body key is omitted when no body is given.
  def html_response(body = nil)
    response = { headers: { 'Content-Type' => 'text/html' } }
    response[:body] = body if body
    response
  end

  # Registers the same canned response for both the HEAD and the GET request
  # of every given path on www.example.com.
  def stub_pages(response, *paths)
    paths.each do |path|
      stub_request(:head, "www.example.com#{path}").to_return(response)
      stub_request(:get, "www.example.com#{path}").to_return(response)
    end
  end

  # Turns the given paths into absolute URI objects on the target host,
  # preserving their order.
  def page_uris(*paths)
    paths.map { |path| URI.parse("#{target}#{path}") }
  end

  describe "#initialize" do
    it "initializes the queue with the target address" do
      queue = worker.instance_variable_get('@queue')

      expect(queue).to eq([root_uri])
    end

    it "stores the target address in its different components" do
      domain = worker.instance_variable_get('@domain')

      expect(domain.tld).to eq('com')
      expect(domain.sld).to eq('example')
      expect(domain.trd).to eq('www')
    end
  end

  describe "#internal?" do
    it "compares second-level and top-level domain" do
      uri = URI.parse('http://www.example.com')
      expect(worker.internal?(uri)).to be_true

      uri = URI.parse('http://blog.example.com')
      expect(worker.internal?(uri)).to be_true

      uri = URI.parse('http://www.not-example.com')
      expect(worker.internal?(uri)).to be_false
    end

    it "guards against invalid domains" do
      # A relative reference carries no host at all.
      uri = URI.parse('/internal.html')

      expect(worker.internal?(uri)).to be_true
    end
  end

  describe "#links" do
    it "retrieves no links from a page with no body" do
      stub_request(:get, 'www.example.com').to_return(html_response)

      expect(worker.links(root_uri).first).to be_empty
    end

    it "returns a list of URIs for a page with anchor elements" do
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/about.html">About us</a>
        <a href="/contact.html">Contact</a>
      BODY

      response = html_response(body)
      stub_request(:get, 'www.example.com').to_return(response)
      stub_pages(response, *page_paths)

      expect(worker.links(root_uri).first).to eq(page_uris(*page_paths))
    end

    # TODO: this example only prepares a stub and never states an
    # expectation, so it used to pass vacuously. It is disabled with `xit`
    # (reported as pending) until the redirect behaviour is specified.
    xit "returns the final location when encountering HTTP 3xx" do
      body = '<a href="http://follow-me.com">Follow me</a>'
      stub_request(:get, 'www.example.com').to_return(html_response(body))
    end

    it "filters links that reference an external domain directly" do
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/about.html">About us</a>
        <a href="/contact.html">Contact</a>
        <a href="http://www.not-example.com">Not Example</a>
      BODY

      response = html_response(body)
      stub_request(:get, 'www.example.com').to_return(response)
      stub_pages(response, *page_paths)
      stub_request(:head, 'www.not-example.com').to_return(response)

      expect(worker.links(root_uri).first).to eq(page_uris(*page_paths))
    end

    it "filters links that reference an external domain indirectly" do
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/about.html">About us</a>
        <a href="/contact.html">Contact</a>
        <a href="/moved.html">Moved</a>
      BODY

      other_domain = 'http://www.not-example.com'
      response = html_response(body)
      # /moved.html is an internal path that redirects to the other domain.
      forward = { status: 301, headers: { 'Location' => other_domain } }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_pages(response, *page_paths)
      stub_request(:head, 'www.example.com/moved.html').to_return(forward)
      stub_request(:head, other_domain).to_return(response)

      expect(worker.links(root_uri).first).to eq(page_uris(*page_paths))
    end

    it "rejects errors raised by Mechanize when retrieving the page" do
      stub_request(:get, 'www.example.com').to_return(status: 500)

      expect(worker.links(root_uri).first).to be_empty
    end

    it "rejects errors raised by Mechanize when checking the links" do
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/about.html">About us</a>
      BODY

      response = html_response(body)

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)
      # Only about.html fails, so only home.html is expected to survive.
      stub_request(:head, 'www.example.com/about.html').to_return(status: 500)

      expect(worker.links(root_uri).first).to eq(page_uris('/home.html'))
    end

    it "rejects invalid URIs" do
      body = '<a href="http:invalid.com">Invalid</a>'
      stub_request(:get, 'www.example.com').to_return(html_response(body))

      expect(worker.links(root_uri).first).to be_empty
    end

    it "rejects anchors with no href attribute" do
      body = '<a>Empty</a>'
      stub_request(:get, 'www.example.com').to_return(html_response(body))

      expect(worker.links(root_uri).first).to be_empty
    end

    it "rejects links that lead to a timeout" do
      body = '<a href="/timeout.html">Timeout</a>'

      stub_request(:get, 'www.example.com').to_return(html_response(body))
      stub_request(:head, 'www.example.com/timeout.html').to_timeout

      expect(worker.links(root_uri).first).to be_empty
    end

    it "rejects links that have already been grabbed" do
      # The same link appears twice but must be reported only once.
      body = <<-BODY
        <a href="/home.html">Home</a>
        <a href="/home.html">Home</a>
      BODY

      response = html_response(body)
      stub_request(:get, 'www.example.com').to_return(response)
      stub_pages(response, '/home.html')

      expect(worker.links(root_uri).first).to eq(page_uris('/home.html'))
    end
  end

  describe "#explore" do
    it "starts the crawling by processing the first queue element" do
      stub_request(:get, 'www.example.com').to_return(html_response)

      worker.explore

      queue = worker.instance_variable_get('@queue')
      expect(queue).to be_empty
    end

    it "marks visited sites" do
      stub_request(:get, 'www.example.com').to_return(html_response)

      visited = worker.instance_variable_get('@visited')
      expect { worker.explore }.to change { visited.size }.by(1)
    end

    # Declared without a block so RSpec reports it as pending instead of
    # counting an empty example as a pass.
    it "skips already visited sites"
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |rspec|
  # Allow bare symbols (such as :focus) to be used as metadata keys whose
  # value is true. Kept ahead of the filter below, which passes a bare symbol.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true

  # When the inclusion filter matches nothing, run the complete suite.
  rspec.run_all_when_everything_filtered = true

  # Restrict the run to examples tagged with :focus.
  rspec.filter_run :focus

  # Execute specs in random order so that hidden order dependencies show up.
  # To reproduce a particular ordering while debugging, pass the seed that is
  # printed after each run:
  #     --seed 1234
  rspec.order = 'random'
end
|