aquanaut 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +20 -0
- data/Guardfile +24 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +1 -0
- data/aquanaut.gemspec +27 -0
- data/bin/aquanaut +18 -0
- data/lib/aquanaut/asset_node.rb +23 -0
- data/lib/aquanaut/graph.rb +74 -0
- data/lib/aquanaut/node.rb +22 -0
- data/lib/aquanaut/page_node.rb +25 -0
- data/lib/aquanaut/sitemap.rb +55 -0
- data/lib/aquanaut/templates/assets/css/custom.css +27 -0
- data/lib/aquanaut/templates/assets/js/graph.js +46 -0
- data/lib/aquanaut/templates/index.html.slim +29 -0
- data/lib/aquanaut/version.rb +4 -0
- data/lib/aquanaut/worker.rb +111 -0
- data/lib/aquanaut.rb +41 -0
- data/spec/aquanaut/aquanaut_spec.rb +48 -0
- data/spec/aquanaut/asset_node_spec.rb +16 -0
- data/spec/aquanaut/graph_spec.rb +89 -0
- data/spec/aquanaut/node_spec.rb +26 -0
- data/spec/aquanaut/page_node_spec.rb +14 -0
- data/spec/aquanaut/sitemap_spec.rb +60 -0
- data/spec/aquanaut/worker_spec.rb +308 -0
- data/spec/spec_helper.rb +17 -0
- data/vendor/assets/css/bootstrap-theme.css +347 -0
- data/vendor/assets/css/bootstrap-theme.css.map +1 -0
- data/vendor/assets/css/bootstrap-theme.min.css +7 -0
- data/vendor/assets/css/bootstrap.css +5785 -0
- data/vendor/assets/css/bootstrap.css.map +1 -0
- data/vendor/assets/css/bootstrap.min.css +7 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.eot +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.svg +229 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/vendor/assets/fonts/glyphicons-halflings-regular.woff +0 -0
- data/vendor/assets/js/bootstrap.js +1951 -0
- data/vendor/assets/js/bootstrap.min.js +6 -0
- data/vendor/assets/js/d3.v3.min.js +5 -0
- data/vendor/assets/js/jquery-2.1.0.min.js +4 -0
- metadata +205 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4173052655cbf46e9072ff1d04cfd6611f33f88b
|
4
|
+
data.tar.gz: 10a5195c0f2c45d6178a8eac71737247d88449e6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f6b6e7ac135045f018e9c6ab3ac8df090dc3ad7db6840412196bcccb284f82147b0a3257bbe3aa596c66cde770b0df34d00ce7b2dac66c9e01a9dca48acaf9f0
|
7
|
+
data.tar.gz: 1d4e01f0b10794e2d10be888d0e6aba0e8d2d7563369603bf637928fa51af6b0183c7a976967f51aaa73fb56a408eab77f942beebeea78e602f1d96925dfb5cf
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
# data/Gemfile
source 'https://rubygems.org'

# Specify your gem's dependencies in aquanaut.gemspec
gemspec

gem 'public_suffix'
gem 'mechanize'
gem 'slim'
gem 'webmock'

group :test do
  gem 'rspec'
  gem 'rspec-core'
  gem 'guard-rspec', require: false
end

group :development, :test do
  gem 'pry'
  gem 'pry-byebug'
end
data/Guardfile
ADDED
# data/Guardfile
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

guard :rspec do
  watch(%r{^spec/.+_spec\.rb$})
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { "spec" }

  # Rails example
  watch(%r{^app/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  watch(%r{^app/(.*)(\.erb|\.haml|\.slim)$}) { |m| "spec/#{m[1]}#{m[2]}_spec.rb" }
  watch(%r{^app/controllers/(.+)_(controller)\.rb$}) { |m| ["spec/routing/#{m[1]}_routing_spec.rb", "spec/#{m[2]}s/#{m[1]}_#{m[2]}_spec.rb", "spec/acceptance/#{m[1]}_spec.rb"] }
  watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
  watch('config/routes.rb') { "spec/routing" }
  watch('app/controllers/application_controller.rb') { "spec/controllers" }

  # Capybara features specs
  watch(%r{^app/views/(.+)/.*\.(erb|haml|slim)$}) { |m| "spec/features/#{m[1]}_spec.rb" }

  # Turnip features and steps
  watch(%r{^spec/acceptance/(.+)\.feature$})
  watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'spec/acceptance' }
end
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Konrad Reiche
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Aquanaut
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/platzhirsch/aquanaut.png)](http://travis-ci.org/platzhirsch/aquanaut)
|
4
|
+
|
5
|
+
A web crawler that stays on a given domain and creates a graph representing the different pages, static assets and how they are interlinked.
|
6
|
+
|
7
|
+
<img src="http://konrad-reiche.com/images/aquanaut.png">
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'aquanaut'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install aquanaut
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Execute `aquanaut` and specify the domain on which it should be executed.
|
26
|
+
|
27
|
+
$ aquanaut 'http://www.konrad-reiche.com'
|
28
|
+
|
29
|
+
The results are written into the directory `sitemap`.
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
1. Fork it ( http://github.com/<my-github-username>/aquanaut/fork )
|
34
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
35
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
36
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
37
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
# data/Rakefile
# Provides the standard gem tasks (build, install, release).
require "bundler/gem_tasks"
data/aquanaut.gemspec
ADDED
# data/aquanaut.gemspec
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'aquanaut/version'

Gem::Specification.new do |spec|
  spec.name          = "aquanaut"
  spec.version       = Aquanaut::VERSION
  spec.authors       = ["Konrad Reiche"]
  spec.email         = ["konrad.reiche@gmail.com"]
  spec.summary       = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.description   = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.homepage      = "https://github.com/platzhirsch/aquanaut"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: lib/aquanaut/worker.rb requires mechanize and
  # public_suffix, lib/aquanaut/sitemap.rb requires slim. Declaring them as
  # development dependencies (as before) meant `gem install aquanaut` did not
  # pull them and the executable failed at require time.
  spec.add_dependency "public_suffix", "~> 1.4", ">= 1.4.0"
  spec.add_dependency "mechanize", "~> 2.7", ">= 2.7.3"
  spec.add_dependency "slim", "~> 2.0", ">= 2.0.1"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake", "~> 0.9"
  # webmock is only used from the specs.
  spec.add_development_dependency "webmock", "~> 1.15", ">= 1.15.2"
end
data/bin/aquanaut
ADDED
#!/usr/bin/env ruby
# encoding: utf-8

# Command-line entry point: crawls the domain given as the first argument
# and renders the resulting sitemap into ./sitemap.

require 'uri'
require 'aquanaut'

if ARGV.empty?
  raise ArgumentError, "Specify a target domain in the first argument"
end

target_domain = ARGV[0]

# Anchor the pattern and restrict it to the schemes the crawler can actually
# fetch. A bare `URI::regexp` match succeeds if a URI appears *anywhere* in
# the string, so arbitrary garbage around a URL would have been accepted.
if target_domain =~ /\A#{URI::regexp(%w[http https])}\z/
  graph = Aquanaut.process_domain(target_domain)
  Aquanaut::Sitemap.new(graph, target_domain).render_results
else
  raise ArgumentError, "#{target_domain} is not a valid URI"
end
# data/lib/aquanaut/asset_node.rb
require 'aquanaut/page_node'

module Aquanaut

  # A node representing a static asset. The type records what kind of
  # asset it is, for instance "image" or "stylesheet".
  class AssetNode < PageNode

    # @return [String] the kind of static asset.
    attr_reader :type

    # @param [URI] uri identifies the static asset uniquely.
    # @param [String] type the kind of static asset.
    def initialize(uri, type)
      super(uri)
      @type = type
    end

  end
end
# data/lib/aquanaut/graph.rb
require 'json'

# A graph representing the sitemap as a data structure. Nodes are kept in a
# hash keyed by URI so they can be looked up directly.
#
class Aquanaut::Graph
  include Enumerable

  def initialize
    @nodes = {}
  end

  # Registers a node in the graph. A URI is only ever assigned once; adding
  # a node for an already-known URI is a no-op.
  #
  # @param [Node] node the node to add to the graph.
  #
  def add_node(node)
    @nodes[node.uri] ||= node
  end

  # Connects two already-registered nodes by their URIs, delegating the edge
  # creation to the source node.
  #
  # @param [URI] predecessor_uri source node for the edge
  # @param [URI] successor_uri target node for the edge
  #
  def add_edge(predecessor_uri, successor_uri)
    @nodes[predecessor_uri].add_edge(@nodes[successor_uri])
  end

  # Looks up a node by its URI.
  #
  # @param [URI] uri the URI representing the node.
  #
  def [](uri)
    @nodes[uri]
  end

  # Yields every node together with its adjacency list.
  #
  def each
    @nodes.each_value do |node|
      yield node, node.adjacency_list
    end
  end

  # Serializes the graph for the front-end visualization: page nodes are
  # group 1, image assets group 2, stylesheet assets group 3; links refer to
  # nodes by their insertion index.
  #
  def to_json
    model = { 'nodes' => [], 'links' => [] }

    # Precompute each node's position instead of an O(n) Array#index per
    # lookup; default object identity makes both equivalent here.
    index_of = {}
    @nodes.each_value.with_index { |node, i| index_of[node] = i }

    each do |node, adjacency|
      group =
        if node.instance_of?(Aquanaut::PageNode)
          1
        else
          { 'image' => 2, 'stylesheet' => 3 }[node.type]
        end

      model['nodes'] << { 'name' => node.uri, 'group' => group }

      adjacency.each do |neighbour|
        model['links'] << { 'source' => index_of[node],
                            'target' => index_of[neighbour] }
      end
    end

    model.to_json
  end

end
# data/lib/aquanaut/node.rb
module Aquanaut

  # Abstract base class for graph nodes; concrete node types inherit from
  # it. Adjacency is implemented with a plain array.
  #
  # @abstract
  #
  class Node

    attr_reader :adjacency_list

    def initialize
      @adjacency_list = []
    end

    # Records a directed edge from this node to the given successor.
    #
    def add_edge(successor)
      @adjacency_list.push(successor)
    end

  end
end
# data/lib/aquanaut/page_node.rb
require 'aquanaut/node'

module Aquanaut

  # A page node represents an actual page in the specified domain.
  #
  class PageNode < Node

    # @return [URI] the URI identifying this page.
    attr_reader :uri

    def initialize(uri)
      super()
      @uri = uri
    end

    # Compact label used by the front-end sitemap list; falls back to the
    # full URI when path, query and fragment are all empty.
    #
    # NOTE(review): path, query and fragment are concatenated without the
    # usual '?' / '#' separators — confirm this is the intended display form.
    #
    def display
      label = "#{@uri.path}#{@uri.query}#{@uri.fragment}"
      label.empty? ? @uri.to_s : label
    end

  end
end
# data/lib/aquanaut/sitemap.rb
require 'fileutils' # was relied on transitively; required explicitly now
require 'pathname'
require 'slim'

# The sitemap class is used to render the results in HTML and JavaScript.
#
# Uses Slim as a template engine.
#
class Aquanaut::Sitemap

  # @param [Graph] graph the crawled sitemap graph.
  # @param [String] domain the crawled domain, shown in the page titles.
  # @param [String] target_dir output directory; relative paths are resolved
  #   against the gem root.
  def initialize(graph, domain, target_dir="#{Dir.pwd}/sitemap")
    @graph = graph
    @domain = domain
    @target_dir = target_dir

    if Pathname.new(target_dir).relative?
      @target_dir = File.expand_path("../../../#{target_dir}", __FILE__)
    end
  end

  # Renders the results by initializing the target directory and processing
  # the template.
  #
  def render_results
    initialize_target_directory

    options = { disable_escape: true }
    template_path = File.expand_path('../templates/index.html.slim', __FILE__)
    rendered_template = Slim::Template.new(template_path, options).render(self)

    File.open("#{@target_dir}/index.html", 'w') do |file|
      file.write rendered_template
    end
  end

  private

  # There are several asset files required. Vendor asset files, but also
  # local asset files. They need to be copied to the target directory in
  # order to work properly.
  #
  # @private
  #
  def initialize_target_directory
    # create result directory (Dir.exists? was removed in Ruby 3.2)
    Dir.mkdir(@target_dir) unless Dir.exist?(@target_dir)

    # copy vendor assets
    vendor_dir = File.expand_path('../../../vendor/assets', __FILE__)
    FileUtils.cp_r(vendor_dir, @target_dir, remove_destination: true)

    # copy local assets
    assets_dir = File.expand_path('../templates/assets', __FILE__)
    FileUtils.cp_r(assets_dir, @target_dir)
  end

end
/* data/lib/aquanaut/templates/assets/css/custom.css
 * Custom styles for the generated sitemap page. */

h1 {
  margin-bottom: 2em;
  font-size: 28px;
}

h2 {
  margin-bottom: 0.5em;
  font-size: 24px;
}

/* Container for the d3 force-directed graph. */
#graph {
  width: 960px;
  height: 500px;
  border: 1px solid #ccc;
  margin-bottom: 2em;
}

.node {
  stroke: #fff;
  stroke-width: 1.5px;
}

.link {
  stroke: #999;
  stroke-opacity: .6;
  stroke-width: 1px;
}
// data/lib/aquanaut/templates/assets/js/graph.js
// Renders `window.graph` (injected by the Slim template) as a d3
// force-directed layout inside the #graph container.
$(document).ready(function() {
  var width = 960,
      height = 500;

  // Ten-color palette keyed on the node's numeric group.
  var color = d3.scale.category10();

  var force = d3.layout.force()
      .charge(-120)
      .linkDistance(30)
      .size([width, height]);

  var svg = d3.select("#graph").append("svg")
      .attr("width", width)
      .attr("height", height);

  force
      .nodes(graph.nodes)
      .links(graph.links)
      .start();

  var link = svg.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link");

  var node = svg.selectAll(".node")
      .data(graph.nodes)
      .enter().append("circle")
      .attr("class", "node")
      .attr("r", 5)
      .style("fill", function(d) { return color(d.group); })
      .call(force.drag);

  // Tooltip showing the node's URI.
  node.append("title")
      .text(function(d) { return d.name; });

  // Reposition links and nodes on every simulation tick.
  force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node.attr("cx", function(d) { return d.x; })
        .attr("cy", function(d) { return d.y; });
  });
});
/ data/lib/aquanaut/templates/index.html.slim
doctype html
html lang="en"
  head
    / was "UTF -8" — the stray space made the charset declaration invalid
    meta charset="UTF-8"
    title Sitemap #{@domain}
    script type="text/javascript" src="assets/js/jquery-2.1.0.min.js"
    script type="text/javascript" src="assets/js/bootstrap.min.js"
    script type="text/javascript" src="assets/js/d3.v3.min.js"
    script type="text/javascript" src="assets/js/graph.js"

    / Expose the graph model to graph.js
    script type="text/javascript"
      | window.graph = #{@graph.to_json}

    link rel="stylesheet" href="assets/css/bootstrap.min.css"
    link rel="stylesheet" href="assets/css/custom.css"
  body
    div class="container"
      h1 Sitemap for #{@domain}
      h2 Visualized
      div class="text-center" id="graph"
      h2 Overview
      ul
        - @graph.select { |node, _| node.instance_of?(Aquanaut::PageNode) }.each do |node, adjacency|
          li
            a href="#{node.uri}" #{node.display}
            ul
              / renamed block variable (was `node`, shadowing the outer one)
              - adjacency.select { |child| child.instance_of?(Aquanaut::PageNode) }.each do |child|
                li
                  a href="#{child.uri}" #{child.display}
# data/lib/aquanaut/worker.rb
require 'mechanize'
require 'public_suffix'

# The worker contains the actual crawling procedure.
#
class Aquanaut::Worker

  # @param [String] target the address the crawl starts from.
  def initialize(target)
    uri = URI.parse(target)
    @queue = [uri]
    @domain = PublicSuffix.parse(uri.host)

    @visited = Hash.new(false)

    @agent = Mechanize.new do |agent|
      agent.open_timeout = 5
      agent.read_timeout = 5
    end
  end

  # Triggers the crawling process: breadth-first traversal over the queue of
  # discovered URIs, yielding each page with its links and assets.
  #
  def explore
    while not @queue.empty?
      uri = @queue.shift # dequeue
      next if @visited[uri]

      @visited[uri] = true
      puts "Visit #{uri}"

      links, assets = links(uri)
      links.each do |link|
        @queue.push(link) unless @visited[link] # enqueue
      end

      yield uri, links, assets if block_given?
    end
  end

  # Retrieves all links to pages and static assets from a given page. The
  # decision whether a link points to an internal or external domain cannot be
  # done by just examining the link's URL. Due to possible HTTP 3xx responses
  # the link needs to be resolved. Hence, each link is processed through a HTTP
  # HEAD request to retrieve the final location.
  #
  # @param uri [URI] the URI from which the page is retrieved.
  #
  # @return [Array<URI>, Array<Hash>] list of links and static assets found on
  #   the given page.
  #
  def links(uri)
    page = @agent.get(uri)
    grabbed = Hash.new(false)
    # Non-HTML responses carry no links or assets. Return a pair so the
    # destructuring call site (`links, assets = links(uri)`) gets two arrays;
    # the previous bare `return []` left both variables nil and crashed
    # `explore` on the subsequent `links.each`.
    return [], [] unless page.is_a?(Mechanize::Page)

    assets = page.images.map do |image|
      uri = URI.join(page.uri, image.url)
      { 'uri' => uri, 'type' => 'image' }
    end

    page.parser.css('link[rel="stylesheet"]').each do |stylesheet|
      uri = URI.join(page.uri, stylesheet['href'])
      # Must be spelled 'stylesheet' to match the group lookup in
      # Graph#to_json (was misspelled 'styleshet', leaving those nodes
      # without a display group).
      asset = { 'uri' => uri, 'type' => 'stylesheet' }
      assets << asset
    end

    links = page.links.map do |link|
      begin
        next if link.uri.nil?
        reference = URI.join(page.uri, link.uri)

        next if grabbed[reference]
        header = @agent.head(reference)

        location = header.uri
        next if not internal?(location) or not header.is_a?(Mechanize::Page)

        grabbed[reference] = true
        grabbed[location] = true

        location
      rescue Mechanize::Error, URI::InvalidURIError,
             Net::HTTP::Persistent::Error, Net::OpenTimeout, Net::ReadTimeout
        next
      end
    end.compact

    return links, assets
  rescue Mechanize::Error, Net::OpenTimeout, Net::ReadTimeout,
         Net::HTTP::Persistent::Error
    return [], [] # swallow
  end

  # Evaluates if a link stays in the initial domain.
  #
  # Used to keep the crawler inside the initial domain. The check compares
  # the second-level and top-level domain. If the public suffix cannot be
  # detected due to possible invalidity, returns true to make sure the link
  # does not go unchecked.
  #
  # @param link [URI] the link to be checked.
  #
  # @return [Boolean] whether the link is internal or not.
  #
  def internal?(link)
    return true unless PublicSuffix.valid?(link.host)
    link_domain = PublicSuffix.parse(link.host)
    @domain.sld == link_domain.sld and @domain.tld == link_domain.tld
  end

end
data/lib/aquanaut.rb
ADDED
# data/lib/aquanaut.rb
require 'aquanaut/asset_node'
require 'aquanaut/graph'
require 'aquanaut/page_node'
require 'aquanaut/sitemap'
require 'aquanaut/version'
require 'aquanaut/worker'

# Main module of Aquanaut
#
module Aquanaut

  # Processes the given target domain and creates a page and asset graph.
  #
  # @param [String] target_address
  #
  # @return [Graph] the sitemap graph with pages and static assets
  #
  def self.process_domain(target_address)
    worker = Worker.new(target_address)
    graph = Graph.new

    worker.explore do |page_uri, links, static_assets|
      graph.add_node(PageNode.new(page_uri))

      # Every outgoing link becomes a page node plus an edge from this page.
      links.each do |link_uri|
        graph.add_node(PageNode.new(link_uri))
        graph.add_edge(page_uri, link_uri)
      end

      # Static assets become typed asset nodes linked from this page.
      static_assets.each do |asset|
        graph.add_node(AssetNode.new(asset['uri'], asset['type']))
        graph.add_edge(page_uri, asset['uri'])
      end
    end

    graph
  end

end