aquanaut 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +20 -0
- data/Guardfile +24 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +1 -0
- data/aquanaut.gemspec +27 -0
- data/bin/aquanaut +18 -0
- data/lib/aquanaut/asset_node.rb +23 -0
- data/lib/aquanaut/graph.rb +74 -0
- data/lib/aquanaut/node.rb +22 -0
- data/lib/aquanaut/page_node.rb +25 -0
- data/lib/aquanaut/sitemap.rb +55 -0
- data/lib/aquanaut/templates/assets/css/custom.css +27 -0
- data/lib/aquanaut/templates/assets/js/graph.js +46 -0
- data/lib/aquanaut/templates/index.html.slim +29 -0
- data/lib/aquanaut/version.rb +4 -0
- data/lib/aquanaut/worker.rb +111 -0
- data/lib/aquanaut.rb +41 -0
- data/spec/aquanaut/aquanaut_spec.rb +48 -0
- data/spec/aquanaut/asset_node_spec.rb +16 -0
- data/spec/aquanaut/graph_spec.rb +89 -0
- data/spec/aquanaut/node_spec.rb +26 -0
- data/spec/aquanaut/page_node_spec.rb +14 -0
- data/spec/aquanaut/sitemap_spec.rb +60 -0
- data/spec/aquanaut/worker_spec.rb +308 -0
- data/spec/spec_helper.rb +17 -0
- data/vendor/assets/css/bootstrap-theme.css +347 -0
- data/vendor/assets/css/bootstrap-theme.css.map +1 -0
- data/vendor/assets/css/bootstrap-theme.min.css +7 -0
- data/vendor/assets/css/bootstrap.css +5785 -0
- data/vendor/assets/css/bootstrap.css.map +1 -0
- data/vendor/assets/css/bootstrap.min.css +7 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.eot +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.svg +229 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.woff +0 -0
- data/vendor/assets/js/bootstrap.js +1951 -0
- data/vendor/assets/js/bootstrap.min.js +6 -0
- data/vendor/assets/js/d3.v3.min.js +5 -0
- data/vendor/assets/js/jquery-2.1.0.min.js +4 -0
- metadata +205 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4173052655cbf46e9072ff1d04cfd6611f33f88b
|
4
|
+
data.tar.gz: 10a5195c0f2c45d6178a8eac71737247d88449e6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f6b6e7ac135045f018e9c6ab3ac8df090dc3ad7db6840412196bcccb284f82147b0a3257bbe3aa596c66cde770b0df34d00ce7b2dac66c9e01a9dca48acaf9f0
|
7
|
+
data.tar.gz: 1d4e01f0b10794e2d10be888d0e6aba0e8d2d7563369603bf637928fa51af6b0183c7a976967f51aaa73fb56a408eab77f942beebeea78e602f1d96925dfb5cf
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in aquanaut.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'public_suffix'
|
7
|
+
gem 'mechanize'
|
8
|
+
gem 'slim'
|
9
|
+
gem 'webmock'
|
10
|
+
|
11
|
+
group :test do
|
12
|
+
gem 'rspec'
|
13
|
+
gem 'rspec-core'
|
14
|
+
gem 'guard-rspec', require: false
|
15
|
+
end
|
16
|
+
|
17
|
+
group :development, :test do
|
18
|
+
gem 'pry'
|
19
|
+
gem 'pry-byebug'
|
20
|
+
end
|
data/Guardfile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
guard :rspec do
|
5
|
+
watch(%r{^spec/.+_spec\.rb$})
|
6
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
7
|
+
watch('spec/spec_helper.rb') { "spec" }
|
8
|
+
|
9
|
+
# Rails example
|
10
|
+
watch(%r{^app/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
11
|
+
watch(%r{^app/(.*)(\.erb|\.haml|\.slim)$}) { |m| "spec/#{m[1]}#{m[2]}_spec.rb" }
|
12
|
+
watch(%r{^app/controllers/(.+)_(controller)\.rb$}) { |m| ["spec/routing/#{m[1]}_routing_spec.rb", "spec/#{m[2]}s/#{m[1]}_#{m[2]}_spec.rb", "spec/acceptance/#{m[1]}_spec.rb"] }
|
13
|
+
watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
|
14
|
+
watch('config/routes.rb') { "spec/routing" }
|
15
|
+
watch('app/controllers/application_controller.rb') { "spec/controllers" }
|
16
|
+
|
17
|
+
# Capybara features specs
|
18
|
+
watch(%r{^app/views/(.+)/.*\.(erb|haml|slim)$}) { |m| "spec/features/#{m[1]}_spec.rb" }
|
19
|
+
|
20
|
+
# Turnip features and steps
|
21
|
+
watch(%r{^spec/acceptance/(.+)\.feature$})
|
22
|
+
watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'spec/acceptance' }
|
23
|
+
end
|
24
|
+
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Konrad Reiche
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Aquanaut
|
2
|
+
|
3
|
+
[](http://travis-ci.org/platzhirsch/aquanaut)
|
4
|
+
|
5
|
+
A web crawler that stays on a given domain and creates a graph representing the different pages, static assets and how they are interlinked.
|
6
|
+
|
7
|
+
<img src="http://konrad-reiche.com/images/aquanaut.png">
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'aquanaut'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install aquanaut
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Execute `aquanaut` and specify the domain on which it should be executed.
|
26
|
+
|
27
|
+
$ aquanaut 'http://www.konrad-reiche.com'
|
28
|
+
|
29
|
+
The results are written into the directory `sitemap`.
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
1. Fork it ( http://github.com/<my-github-username>/aquanaut/fork )
|
34
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
35
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
36
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
37
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/aquanaut.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'aquanaut/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  spec.name          = "aquanaut"
  spec.version       = Aquanaut::VERSION
  spec.authors       = ["Konrad Reiche"]
  spec.email         = ["konrad.reiche@gmail.com"]
  spec.summary       = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.description   = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.homepage      = "https://github.com/platzhirsch/aquanaut"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # public_suffix, mechanize and slim are required at runtime by
  # lib/aquanaut/worker.rb and lib/aquanaut/sitemap.rb, so they must be
  # runtime dependencies -- as development dependencies the installed gem
  # would fail to load for end users.
  spec.add_dependency "public_suffix", "~> 1.4", ">= 1.4.0"
  spec.add_dependency "mechanize", "~> 2.7", ">= 2.7.3"
  spec.add_dependency "slim", "~> 2.0", ">= 2.0.1"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake", "~> 0.9"
  spec.add_development_dependency "webmock", "~> 1.15", ">= 1.15.2"
end
|
data/bin/aquanaut
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
# encoding: utf-8

require 'aquanaut'

# Command-line entry point: crawls the domain passed as the first argument
# and renders the resulting sitemap into the `sitemap` directory.

raise ArgumentError, "Specify a target domain in the first argument" if ARGV.empty?

target_domain = ARGV.first

unless target_domain =~ URI::regexp
  raise ArgumentError, "#{target_domain} is not a valid URI"
end

graph = Aquanaut.process_domain(target_domain)
Aquanaut::Sitemap.new(graph, target_domain).render_results
|
18
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'aquanaut/page_node'
|
2
|
+
|
3
|
+
module Aquanaut

  # Node representing a static asset (for instance an image or a
  # stylesheet) rather than a crawlable page. The type distinguishes the
  # different kinds of assets.
  class AssetNode < PageNode

    # @return [String] the kind of asset, e.g. 'image' or 'stylesheet'
    attr_reader :type

    # @param uri [URI] unique address of the static asset.
    # @param type [String] kind of static asset this node stands for.
    #
    def initialize(uri, type)
      super(uri)
      @type = type
    end

  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# A graph representing the sitemap in terms of a data structure. A hash is
|
4
|
+
# used internally to make the nodes accessible through the URIs.
|
5
|
+
#
|
6
|
+
# A graph representing the sitemap in terms of a data structure. Nodes are
# kept in a hash keyed by their URI so they can be looked up directly.
#
class Aquanaut::Graph
  include Enumerable

  def initialize
    @nodes = {}
  end

  # Registers a node in the graph. A URI that is already present keeps its
  # original node; duplicates are silently ignored.
  #
  # @param [Node] node the node to add to the graph.
  #
  def add_node(node)
    @nodes[node.uri] ||= node
  end

  # Connects two already-registered nodes by their URIs. The actual edge
  # creation is delegated to the source node.
  #
  # @param [URI] predecessor_uri source node for the edge
  # @param [URI] successor_uri target node for the edge
  #
  def add_edge(predecessor_uri, successor_uri)
    source = @nodes[predecessor_uri]
    source.add_edge(@nodes[successor_uri])
  end

  # Looks a node up by its URI.
  #
  # @param [URI] uri the URI representing the node.
  #
  def [](uri)
    @nodes[uri]
  end

  # Yields every node together with its adjacency list.
  #
  def each
    @nodes.each_value do |node|
      yield node, node.adjacency_list
    end
  end

  # Serializes the graph for the D3 visualization on the front-end.
  # Nodes become {name, group} entries, edges become index-based
  # {source, target} links.
  #
  def to_json
    asset_groups = { 'image' => 2, 'stylesheet' => 3 }
    model = { 'nodes' => [], 'links' => [] }

    each do |node, adjacency|
      group = node.instance_of?(Aquanaut::PageNode) ? 1 : asset_groups[node.type]

      model['nodes'] << { 'name' => node.uri, 'group' => group }
      source = @nodes.values.index(node)

      adjacency.each do |neighbour|
        target = @nodes.values.index(neighbour)
        model['links'] << { 'source' => source, 'target' => target }
      end
    end

    model.to_json
  end

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Aquanaut

  # Abstract base class for all sitemap nodes. Keeps track of outgoing
  # edges through a plain adjacency list, so subclasses only need to
  # supply their own identifying state.
  #
  # @abstract
  #
  class Node

    # @return [Array] the nodes this node links to
    attr_reader :adjacency_list

    def initialize
      @adjacency_list = []
    end

    # Records a directed edge from this node to the given successor.
    #
    # @param successor [Node] the node the new edge points at
    #
    def add_edge(successor)
      adjacency_list.push(successor)
    end

  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'aquanaut/node'
|
2
|
+
|
3
|
+
module Aquanaut

  # A page node represents an actual page in the specified domain.
  #
  class PageNode < Node

    # @return [URI] address identifying this page
    attr_reader :uri

    # @param uri [URI] the page address this node stands for.
    #
    def initialize(uri)
      @uri = uri
      super()
    end

    # Display method used on the front-end for the sitemap in list format.
    #
    # Returns the path with query string and fragment re-attached using
    # their proper '?' and '#' delimiters (the plain concatenation used
    # before produced e.g. "/pagea=1" for "/page?a=1"). Falls back to the
    # full URI when path, query and fragment are all empty.
    #
    def display
      part = @uri.path.to_s
      part += "?#{@uri.query}" unless @uri.query.nil?
      part += "##{@uri.fragment}" unless @uri.fragment.nil?
      part = @uri.to_s if part.empty?
      part
    end

  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'slim'
|
3
|
+
|
4
|
+
# The sitemap class is used to render the results in HTML and JavaScript.
|
5
|
+
#
|
6
|
+
# Uses SLIM as a template engine.
|
7
|
+
#
|
8
|
+
# The sitemap class renders the crawl results as HTML and JavaScript.
#
# Uses Slim as a template engine.
#
class Aquanaut::Sitemap

  # @param graph [Aquanaut::Graph] crawl result to render.
  # @param domain [String] the domain the crawl was started on.
  # @param target_dir [String] directory the sitemap is written to; a
  #   relative path is resolved against the gem root.
  #
  def initialize(graph, domain, target_dir="#{Dir.pwd}/sitemap")
    @graph = graph
    @domain = domain
    @target_dir = target_dir

    if Pathname.new(target_dir).relative?
      @target_dir = File.expand_path("../../../#{target_dir}", __FILE__)
    end
  end

  # Renders the results by initializing the target directory and processing
  # the template into `index.html`.
  #
  def render_results
    initialize_target_directory

    # disable_escape: node URIs and the JSON graph are interpolated verbatim
    options = { disable_escape: true }
    template_path = File.expand_path('../templates/index.html.slim', __FILE__)
    rendered_template = Slim::Template.new(template_path, options).render(self)

    File.open("#{@target_dir}/index.html", 'w') do |file|
      file.write rendered_template
    end
  end

  private

  # There are several asset files required. Vendor asset files, but also
  # local asset files. They need to be copied to the target directory in
  # order to work properly.
  #
  # @private
  #
  def initialize_target_directory
    require 'fileutils' # not required elsewhere in this file; load explicitly

    # create result directory
    Dir.mkdir(@target_dir) unless Dir.exist?(@target_dir)

    # copy vendor assets
    vendor_dir = File.expand_path('../../../vendor/assets', __FILE__)
    FileUtils.cp_r(vendor_dir, @target_dir, remove_destination: true)

    # copy local assets
    assets_dir = File.expand_path('../templates/assets', __FILE__)
    FileUtils.cp_r(assets_dir, @target_dir)
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
h1 {
|
2
|
+
margin-bottom: 2em;
|
3
|
+
font-size: 28px;
|
4
|
+
}
|
5
|
+
|
6
|
+
h2 {
|
7
|
+
margin-bottom: 0.5em;
|
8
|
+
font-size: 24px;
|
9
|
+
}
|
10
|
+
|
11
|
+
#graph {
|
12
|
+
width: 960px;
|
13
|
+
height: 500px;
|
14
|
+
border: 1px solid #ccc;
|
15
|
+
margin-bottom: 2em;
|
16
|
+
}
|
17
|
+
|
18
|
+
.node {
|
19
|
+
stroke: #fff;
|
20
|
+
stroke-width: 1.5px;
|
21
|
+
}
|
22
|
+
|
23
|
+
.link {
|
24
|
+
stroke: #999;
|
25
|
+
stroke-opacity: .6;
|
26
|
+
stroke-width: 1px;
|
27
|
+
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
// Renders the sitemap graph (window.graph, injected by the Slim template)
// as a d3 force-directed layout inside the #graph container.
$(document).ready(function() {
  var width = 960;
  var height = 500;

  var color = d3.scale.category10();

  // Force simulation driving the node positions.
  var force = d3.layout.force()
      .charge(-120)
      .linkDistance(30)
      .size([width, height]);

  var svg = d3.select("#graph").append("svg")
      .attr("width", width)
      .attr("height", height);

  force
      .nodes(graph.nodes)
      .links(graph.links)
      .start();

  // One line per edge.
  var link = svg.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link");

  // One circle per page/asset, colored by its group.
  var node = svg.selectAll(".node")
      .data(graph.nodes)
      .enter().append("circle")
      .attr("class", "node")
      .attr("r", 5)
      .style("fill", function(d) { return color(d.group); })
      .call(force.drag);

  // Hovering a circle reveals its URI.
  node.append("title")
      .text(function(d) { return d.name; });

  // Re-position lines and circles on every simulation tick.
  force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node.attr("cx", function(d) { return d.x; })
        .attr("cy", function(d) { return d.y; });
  });
});
|
@@ -0,0 +1,29 @@
|
|
1
|
+
doctype html
html lang="en"
  head
    / fixed: charset was "UTF -8" (stray space), which browsers reject
    meta charset="UTF-8"
    title Sitemap #{@domain}
    script type="text/javascript" src="assets/js/jquery-2.1.0.min.js"
    script type="text/javascript" src="assets/js/bootstrap.min.js"
    script type="text/javascript" src="assets/js/d3.v3.min.js"
    script type="text/javascript" src="assets/js/graph.js"

    / expose the graph model to graph.js
    script type="text/javascript"
      | window.graph = #{@graph.to_json}

    link rel="stylesheet" href="assets/css/bootstrap.min.css"
    link rel="stylesheet" href="assets/css/custom.css"
  body
    div class="container"
      h1 Sitemap for #{@domain}
      h2 Visualized
      div class="text-center" id="graph"
      h2 Overview
      ul
        - @graph.select { |node, _| node.instance_of?(Aquanaut::PageNode) }.each do |node, adjacency|
          li
            a href="#{node.uri}" #{node.display}
            ul
              - adjacency.select { |node| node.instance_of?(Aquanaut::PageNode) }.each do |node|
                li
                  a href="#{node.uri}" #{node.display}
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'public_suffix'
|
3
|
+
|
4
|
+
# The worker contains the actual crawling procedure.
|
5
|
+
#
|
6
|
+
# The worker contains the actual crawling procedure.
#
class Aquanaut::Worker

  # @param target [String] address of the page the crawl starts on.
  #
  def initialize(target)
    uri = URI.parse(target)
    @queue = [uri]
    @domain = PublicSuffix.parse(uri.host)

    # tracks URIs that were already crawled; defaults to false
    @visited = Hash.new(false)

    @agent = Mechanize.new do |agent|
      agent.open_timeout = 5
      agent.read_timeout = 5
    end
  end

  # Triggers the crawling process.
  #
  # Performs a breadth-first traversal over the pages of the domain. For
  # every visited page its URI, links and static assets are yielded.
  #
  def explore
    until @queue.empty?
      uri = @queue.shift # dequeue
      next if @visited[uri]

      @visited[uri] = true
      puts "Visit #{uri}"

      links, assets = links(uri)
      links.each do |link|
        @queue.push(link) unless @visited[link] # enqueue
      end

      yield uri, links, assets if block_given?
    end
  end

  # Retrieves all links to pages and static assets from a given page. The
  # decision whether a link points to an internal or external domain cannot be
  # done by just examining the link's URL. Due to possible HTTP 3xx responses
  # the link needs to be resolved. Hence, each link is processed through a HTTP
  # HEAD request to retrieve the final location.
  #
  # @param uri [URI] the URI from which the page is retrieved.
  #
  # @return [Array<URI>, Array<Hash>] list of links and static assets found on
  #   the given page.
  #
  def links(uri)
    page = @agent.get(uri)
    grabbed = Hash.new(false)
    # Non-HTML responses carry no links or assets. Must return a pair:
    # callers destructure the result (`links, assets = links(uri)`), so a
    # bare `[]` here would yield nil for both.
    return [], [] unless page.is_a?(Mechanize::Page)

    assets = page.images.map do |image|
      image_uri = URI.join(page.uri, image.url)
      { 'uri' => image_uri, 'type' => 'image' }
    end

    page.parser.css('link[rel="stylesheet"]').each do |stylesheet|
      stylesheet_uri = URI.join(page.uri, stylesheet['href'])
      # 'stylesheet' (typo fixed) must match the group key in Graph#to_json
      assets << { 'uri' => stylesheet_uri, 'type' => 'stylesheet' }
    end

    links = page.links.map do |link|
      begin
        next if link.uri.nil?
        reference = URI.join(page.uri, link.uri)

        next if grabbed[reference]
        # resolve redirects with a HEAD request to obtain the final location
        header = @agent.head(reference)

        location = header.uri
        next if not internal?(location) or not header.is_a?(Mechanize::Page)

        # remember both the raw reference and its resolved location
        grabbed[reference] = true
        grabbed[location] = true

        location
      rescue Mechanize::Error, URI::InvalidURIError,
        Net::HTTP::Persistent::Error, Net::OpenTimeout, Net::ReadTimeout
        next
      end
    end.compact

    return links, assets
  rescue Mechanize::Error, Net::OpenTimeout, Net::ReadTimeout,
    Net::HTTP::Persistent::Error
    return [], [] # swallow
  end

  # Evaluates if a link stays in the initial domain.
  #
  # Used to keep the crawler inside the initial domain. In order to determinate
  # it uses the second-level and top-level domain. If the public suffix cannot
  # be detected due to possibly invalidity returns true to make sure the link
  # does not go unchecked.
  #
  # @param link [URI] the link to be checked.
  #
  # @return [Boolean] whether the link is internal or not.
  #
  def internal?(link)
    return true unless PublicSuffix.valid?(link.host)
    link_domain = PublicSuffix.parse(link.host)
    @domain.sld == link_domain.sld and @domain.tld == link_domain.tld
  end

end
|
data/lib/aquanaut.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'aquanaut/asset_node'
|
2
|
+
require 'aquanaut/graph'
|
3
|
+
require 'aquanaut/page_node'
|
4
|
+
require 'aquanaut/sitemap'
|
5
|
+
require 'aquanaut/version'
|
6
|
+
require 'aquanaut/worker'
|
7
|
+
|
8
|
+
# Main module of Aquanaut
|
9
|
+
#
|
10
|
+
# Main module of Aquanaut
#
module Aquanaut
  class << self

    # Processes the given target domain and creates a page and asset graph.
    #
    # @param [String] target_address
    #
    # @return [Graph] the sitemap graph with pages and static assets
    #
    def process_domain(target_address)
      graph = Graph.new

      Worker.new(target_address).explore do |page_uri, links, static_assets|
        # the visited page itself
        graph.add_node(PageNode.new(page_uri))

        # outgoing links to other pages
        links.each do |link_uri|
          graph.add_node(PageNode.new(link_uri))
          graph.add_edge(page_uri, link_uri)
        end

        # static assets referenced by the page
        static_assets.each do |asset|
          graph.add_node(AssetNode.new(asset['uri'], asset['type']))
          graph.add_edge(page_uri, asset['uri'])
        end
      end

      graph
    end

  end
end
|