aquanaut 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +20 -0
  6. data/Guardfile +24 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +37 -0
  9. data/Rakefile +1 -0
  10. data/aquanaut.gemspec +27 -0
  11. data/bin/aquanaut +18 -0
  12. data/lib/aquanaut/asset_node.rb +23 -0
  13. data/lib/aquanaut/graph.rb +74 -0
  14. data/lib/aquanaut/node.rb +22 -0
  15. data/lib/aquanaut/page_node.rb +25 -0
  16. data/lib/aquanaut/sitemap.rb +55 -0
  17. data/lib/aquanaut/templates/assets/css/custom.css +27 -0
  18. data/lib/aquanaut/templates/assets/js/graph.js +46 -0
  19. data/lib/aquanaut/templates/index.html.slim +29 -0
  20. data/lib/aquanaut/version.rb +4 -0
  21. data/lib/aquanaut/worker.rb +111 -0
  22. data/lib/aquanaut.rb +41 -0
  23. data/spec/aquanaut/aquanaut_spec.rb +48 -0
  24. data/spec/aquanaut/asset_node_spec.rb +16 -0
  25. data/spec/aquanaut/graph_spec.rb +89 -0
  26. data/spec/aquanaut/node_spec.rb +26 -0
  27. data/spec/aquanaut/page_node_spec.rb +14 -0
  28. data/spec/aquanaut/sitemap_spec.rb +60 -0
  29. data/spec/aquanaut/worker_spec.rb +308 -0
  30. data/spec/spec_helper.rb +17 -0
  31. data/vendor/assets/css/bootstrap-theme.css +347 -0
  32. data/vendor/assets/css/bootstrap-theme.css.map +1 -0
  33. data/vendor/assets/css/bootstrap-theme.min.css +7 -0
  34. data/vendor/assets/css/bootstrap.css +5785 -0
  35. data/vendor/assets/css/bootstrap.css.map +1 -0
  36. data/vendor/assets/css/bootstrap.min.css +7 -0
  37. data/vendor/assets/fonts/glyphicons-halflings-regular.eot +0 -0
  38. data/vendor/assets/fonts/glyphicons-halflings-regular.svg +229 -0
  39. data/vendor/assets/fonts/glyphicons-halflings-regular.ttf +0 -0
  40. data/vendor/assets/fonts/glyphicons-halflings-regular.woff +0 -0
  41. data/vendor/assets/js/bootstrap.js +1951 -0
  42. data/vendor/assets/js/bootstrap.min.js +6 -0
  43. data/vendor/assets/js/d3.v3.min.js +5 -0
  44. data/vendor/assets/js/jquery-2.1.0.min.js +4 -0
  45. metadata +205 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4173052655cbf46e9072ff1d04cfd6611f33f88b
4
+ data.tar.gz: 10a5195c0f2c45d6178a8eac71737247d88449e6
5
+ SHA512:
6
+ metadata.gz: f6b6e7ac135045f018e9c6ab3ac8df090dc3ad7db6840412196bcccb284f82147b0a3257bbe3aa596c66cde770b0df34d00ce7b2dac66c9e01a9dca48acaf9f0
7
+ data.tar.gz: 1d4e01f0b10794e2d10be888d0e6aba0e8d2d7563369603bf637928fa51af6b0183c7a976967f51aaa73fb56a408eab77f942beebeea78e602f1d96925dfb5cf
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.1.0
5
+
6
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,20 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in aquanaut.gemspec
4
+ gemspec
5
+
6
+ gem 'public_suffix'
7
+ gem 'mechanize'
8
+ gem 'slim'
9
+ gem 'webmock'
10
+
11
+ group :test do
12
+ gem 'rspec'
13
+ gem 'rspec-core'
14
+ gem 'guard-rspec', require: false
15
+ end
16
+
17
+ group :development, :test do
18
+ gem 'pry'
19
+ gem 'pry-byebug'
20
+ end
data/Guardfile ADDED
@@ -0,0 +1,24 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Re-runs the matching specs whenever a spec, a library file or the spec
# helper changes.
guard :rspec do
  watch(%r{^spec/.+_spec\.rb$})
  # Specs mirror the lib/ layout directly below spec/ (lib/aquanaut/graph.rb is
  # covered by spec/aquanaut/graph_spec.rb); there is no spec/lib directory.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  # The spec of the top-level module is kept inside spec/aquanaut/.
  watch('lib/aquanaut.rb') { "spec/aquanaut/aquanaut_spec.rb" }
  watch('spec/spec_helper.rb') { "spec" }

  # Rails example
  watch(%r{^app/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  watch(%r{^app/(.*)(\.erb|\.haml|\.slim)$}) { |m| "spec/#{m[1]}#{m[2]}_spec.rb" }
  watch(%r{^app/controllers/(.+)_(controller)\.rb$}) { |m| ["spec/routing/#{m[1]}_routing_spec.rb", "spec/#{m[2]}s/#{m[1]}_#{m[2]}_spec.rb", "spec/acceptance/#{m[1]}_spec.rb"] }
  watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
  watch('config/routes.rb') { "spec/routing" }
  watch('app/controllers/application_controller.rb') { "spec/controllers" }

  # Capybara features specs
  watch(%r{^app/views/(.+)/.*\.(erb|haml|slim)$}) { |m| "spec/features/#{m[1]}_spec.rb" }

  # Turnip features and steps
  watch(%r{^spec/acceptance/(.+)\.feature$})
  watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'spec/acceptance' }
end
24
+
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Konrad Reiche
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # Aquanaut
2
+
3
+ [![Build Status](https://travis-ci.org/platzhirsch/aquanaut.png)](http://travis-ci.org/platzhirsch/aquanaut)
4
+
5
+ A web crawler that stays on a given domain and creates a graph representing the different pages, static assets and how they are interlinked.
6
+
7
+ <img src="http://konrad-reiche.com/images/aquanaut.png">
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'aquanaut'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install aquanaut
22
+
23
+ ## Usage
24
+
25
+ Execute `aquanaut` and specify the domain on which it should be executed.
26
+
27
+ $ aquanaut 'http://www.konrad-reiche.com'
28
+
29
+ The results are written into the directory `sitemap`.
30
+
31
+ ## Contributing
32
+
33
+ 1. Fork it ( http://github.com/<my-github-username>/aquanaut/fork )
34
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
35
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
36
+ 4. Push to the branch (`git push origin my-new-feature`)
37
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/aquanaut.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'aquanaut/version'
5
+
6
# Gem specification for aquanaut. The version is read from
# lib/aquanaut/version.rb and the packaged files from `git ls-files`.
Gem::Specification.new do |spec|
  spec.name          = "aquanaut"
  spec.version       = Aquanaut::VERSION
  spec.authors       = ["Konrad Reiche"]
  spec.email         = ["konrad.reiche@gmail.com"]
  spec.summary       = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.description   = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
  spec.homepage      = "https://github.com/platzhirsch/aquanaut"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # The library itself requires these gems (worker and sitemap), so they have
  # to be installed together with aquanaut and not only for development.
  spec.add_dependency "public_suffix", "~> 1.4", ">= 1.4.0"
  spec.add_dependency "mechanize", "~> 2.7", ">= 2.7.3"
  spec.add_dependency "slim", "~> 2.0", ">= 2.0.1"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake", "~> 0.9"
  spec.add_development_dependency "webmock", "~> 1.15", ">= 1.15.2"
end
data/bin/aquanaut ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'aquanaut'
5
+
6
# Command line entry point: crawls the address given as first argument and
# renders the resulting sitemap.
raise ArgumentError, "Specify a target domain in the first argument" if ARGV.empty?

target_domain = ARGV.first

unless target_domain =~ URI::regexp
  raise ArgumentError, "#{target_domain} is not a valid URI"
end

sitemap_graph = Aquanaut.process_domain(target_domain)
Aquanaut::Sitemap.new(sitemap_graph, target_domain).render_results
18
+
@@ -0,0 +1,23 @@
1
+ require 'aquanaut/page_node'
2
+
3
module Aquanaut

  # Node standing for a static asset. In addition to the URI handled by
  # PageNode it remembers which kind of asset it is, for instance image or
  # stylesheet.
  class AssetNode < PageNode

    attr_reader :type

    # Creates the asset node.
    #
    # @param [URI] uri unique identifier of the static asset.
    #
    # @param [String] type the kind of static asset.
    #
    def initialize(uri, type)
      super(uri)
      @type = type
    end

  end
end
@@ -0,0 +1,74 @@
1
+ require 'json'
2
+
3
# A graph representing the sitemap in terms of a data structure. A hash is
# used internally to make the nodes accessible through the URIs.
#
class Aquanaut::Graph
  include Enumerable

  # Group number emitted for page nodes in the JSON model.
  PAGE_GROUP = 1

  # Group numbers emitted for asset nodes, keyed by asset type. Unknown types
  # yield a nil group.
  ASSET_GROUPS = { 'image' => 2, 'stylesheet' => 3 }.freeze

  def initialize
    @nodes = {}
  end

  # Use this method for making nodes available in the graph. New nodes are
  # only assigned once.
  #
  # @param [Node] node the node to add to the graph.
  #
  # @return [Node] the node stored for the URI, which is the earlier one if
  #   the URI was already known.
  #
  def add_node(node)
    @nodes[node.uri] ||= node
  end

  # Use this method to easily add new edges without the need to pass actual
  # node objects. The method delegates the edge creation to the dedicated node
  # edge method. Both nodes have to be added to the graph beforehand.
  #
  # @param [URI] predecessor_uri source node for the edge
  # @param [URI] successor_uri target node for the edge
  #
  def add_edge(predecessor_uri, successor_uri)
    @nodes[predecessor_uri].add_edge(@nodes[successor_uri])
  end

  # Accessor method to retrieve nodes by their URI.
  #
  # @param [URI] uri the URI representing the node.
  #
  # @return [Node, nil] the node or nil if the URI is unknown.
  #
  def [](uri)
    @nodes[uri]
  end

  # Accessor method to iterate the nodes and their adjacency list.
  #
  # @yield [node, adjacency_list] every node in insertion order together with
  #   its successors.
  #
  # @return [Enumerator] when called without a block.
  #
  def each
    return enum_for(:each) unless block_given?

    @nodes.each_value do |node|
      yield node, node.adjacency_list
    end
  end

  # Used for visualizing the graph on the front-end. Links reference their
  # nodes by position in the 'nodes' list.
  #
  # @param args generator arguments, accepted so the graph can be nested in
  #   other structures that are serialized with JSON.
  #
  # @return [String] JSON document with the keys 'nodes' and 'links'.
  #
  def to_json(*args)
    # Resolve every node position once instead of scanning the node list for
    # each node and each edge. Nodes are compared by identity, like the
    # default equality of node objects.
    positions = {}.compare_by_identity
    @nodes.each_value.with_index { |node, index| positions[node] = index }

    model = { 'nodes' => [], 'links' => [] }

    each do |node, adjacency|
      # Only exact page nodes belong to the page group; asset nodes inherit
      # from PageNode and are grouped by their type.
      group = if node.instance_of?(Aquanaut::PageNode)
                PAGE_GROUP
              else
                ASSET_GROUPS[node.type]
              end

      model['nodes'] << { 'name' => node.uri, 'group' => group }
      source = positions[node]

      adjacency.each do |adjacency_node|
        model['links'] << { 'source' => source, 'target' => positions[adjacency_node] }
      end
    end

    model.to_json(*args)
  end

end
@@ -0,0 +1,22 @@
1
module Aquanaut

  # Base node class which needs to be inherited for special cases.
  #
  # @abstract
  #
  class Node

    # @return [Array<Node>] successors of this node in insertion order.
    attr_reader :adjacency_list

    # Starts with an empty adjacency list.
    def initialize
      @adjacency_list = []
    end

    # Implements adjacency with an adjacency list.
    #
    # @param [Node] successor the node this node points to.
    #
    # @return [Array<Node>] the updated adjacency list.
    #
    def add_edge(successor)
      @adjacency_list << successor
    end

  end
end
@@ -0,0 +1,25 @@
1
+ require 'aquanaut/node'
2
+
3
module Aquanaut

  # A page node represents an actual page in the specified domain.
  #
  class PageNode < Node

    # @return [URI] the URI identifying the page.
    attr_reader :uri

    # @param [URI] uri the URI identifying the page.
    #
    def initialize(uri)
      @uri = uri
      super()
    end

    # Display method used on the front-end for the sitemap in list format.
    #
    # @return [String] path, query and fragment of the URI with their
    #   separators, or the complete URI when all of them are empty.
    #
    def display
      # Copy the path so that appending never modifies the URI itself.
      part = @uri.path.to_s.dup
      # Keep the separators; without them '/a?b=1#c' would read '/ab=1c'.
      part << "?#{@uri.query}" if @uri.query
      part << "##{@uri.fragment}" if @uri.fragment
      part.empty? ? @uri.to_s : part
    end

  end
end
@@ -0,0 +1,55 @@
1
require 'fileutils'
require 'pathname'
require 'slim'
3
+
4
# The sitemap class is used to render the results in HTML and JavaScript.
#
# Uses SLIM as a template engine.
#
class Aquanaut::Sitemap

  # @param [Graph] graph the graph made available to the template.
  #
  # @param [String] domain the crawled address, shown in the page headings.
  #
  # @param [String] target_dir directory receiving index.html and the assets.
  #   A relative path is expanded from this file's location via '../../../'
  #   and not from the working directory.
  #
  def initialize(graph, domain, target_dir="#{Dir.pwd}/sitemap")
    @graph = graph
    @domain = domain
    @target_dir = target_dir

    if Pathname.new(target_dir).relative?
      @target_dir = File.expand_path("../../../#{target_dir}", __FILE__)
    end
  end

  # Renders the results by initializing the dependencies and processing the
  # template. The page is written to index.html in the target directory.
  #
  def render_results
    initialize_target_directory

    options = { disable_escape: true }
    template_path = File.expand_path('../templates/index.html.slim', __FILE__)
    rendered_template = Slim::Template.new(template_path, options).render(self)

    File.open(File.join(@target_dir, 'index.html'), 'w') do |file|
      file.write rendered_template
    end
  end

  private

  # There are several asset files required. Vendor asset files, but also local
  # asset files. They need to be copied to the target directory in order to
  # work properly.
  #
  # @private
  #
  def initialize_target_directory
    # create result directory; mkdir_p accepts an existing directory and
    # replaces the Dir.exists? check, which no longer exists in Ruby 3.2
    FileUtils.mkdir_p(@target_dir)

    # copy vendor assets
    vendor_dir = File.expand_path('../../../vendor/assets', __FILE__)
    FileUtils.cp_r(vendor_dir, @target_dir, remove_destination: true)

    # copy local assets
    assets_dir = File.expand_path('../templates/assets', __FILE__)
    FileUtils.cp_r(assets_dir, @target_dir)
  end

end
@@ -0,0 +1,27 @@
1
+ h1 {
2
+ margin-bottom: 2em;
3
+ font-size: 28px;
4
+ }
5
+
6
+ h2 {
7
+ margin-bottom: 0.5em;
8
+ font-size: 24px;
9
+ }
10
+
11
+ #graph {
12
+ width: 960px;
13
+ height: 500px;
14
+ border: 1px solid #ccc;
15
+ margin-bottom: 2em;
16
+ }
17
+
18
+ .node {
19
+ stroke: #fff;
20
+ stroke-width: 1.5px;
21
+ }
22
+
23
+ .link {
24
+ stroke: #999;
25
+ stroke-opacity: .6;
26
+ stroke-width: 1px;
27
+ }
@@ -0,0 +1,46 @@
1
// Draws the sitemap as a force-directed graph inside the #graph element once
// the document is ready. The data is read from the global `graph` object,
// which provides the `nodes` and `links` arrays.
$(document).ready(function() {
  var size = { width: 960, height: 500 };

  // Ten-color palette; nodes are colored by their group.
  var palette = d3.scale.category10();

  var layout = d3.layout.force()
      .charge(-120)
      .linkDistance(30)
      .size([size.width, size.height]);

  var canvas = d3.select("#graph").append("svg")
      .attr("width", size.width)
      .attr("height", size.height);

  layout.nodes(graph.nodes).links(graph.links).start();

  var edges = canvas.selectAll(".link")
      .data(graph.links)
      .enter().append("line")
      .attr("class", "link");

  var circles = canvas.selectAll(".node")
      .data(graph.nodes)
      .enter().append("circle")
      .attr("class", "node")
      .attr("r", 5)
      .style("fill", function(entry) { return palette(entry.group); })
      .call(layout.drag);

  // Tooltip showing the name of the node.
  circles.append("title").text(function(entry) { return entry.name; });

  // Move lines and circles to the positions computed on every tick.
  layout.on("tick", function() {
    edges
        .attr("x1", function(edge) { return edge.source.x; })
        .attr("y1", function(edge) { return edge.source.y; })
        .attr("x2", function(edge) { return edge.target.x; })
        .attr("y2", function(edge) { return edge.target.y; });

    circles
        .attr("cx", function(entry) { return entry.x; })
        .attr("cy", function(entry) { return entry.y; });
  });
});
@@ -0,0 +1,29 @@
1
+ doctype html
2
+ html lang="en"
3
+ head
4
+ meta charset="UTF-8"
5
+ title Sitemap #{@domain}
6
+ script type="text/javascript" src="assets/js/jquery-2.1.0.min.js"
7
+ script type="text/javascript" src="assets/js/bootstrap.min.js"
8
+ script type="text/javascript" src="assets/js/d3.v3.min.js"
9
+ script type="text/javascript" src="assets/js/graph.js"
10
+
11
+ script type="text/javascript"
12
+ | window.graph = #{@graph.to_json}
13
+
14
+ link rel="stylesheet" href="assets/css/bootstrap.min.css"
15
+ link rel="stylesheet" href="assets/css/custom.css"
16
+ body
17
+ div class="container"
18
+ h1 Sitemap for #{@domain}
19
+ h2 Visualized
20
+ div class="text-center" id="graph"
21
+ h2 Overview
22
+ ul
23
+ - @graph.select { |node, _| node.instance_of?(Aquanaut::PageNode) }.each do |node, adjacency|
24
+ li
25
+ a href="#{node.uri}" #{node.display}
26
+ ul
27
+ - adjacency.select { |node| node.instance_of?(Aquanaut::PageNode) }.each do |node|
28
+ li
29
+ a href="#{node.uri}" #{node.display}
@@ -0,0 +1,4 @@
1
module Aquanaut
  # Version of this gem. Frozen so the constant cannot be modified in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,111 @@
1
+ require 'mechanize'
2
+ require 'public_suffix'
3
+
4
# The worker contains the actual crawling procedure.
#
class Aquanaut::Worker

  # @param target [String] address where the crawl starts. Its host defines
  #   the domain that links have to stay in.
  #
  def initialize(target)
    uri = URI.parse(target)
    @queue = [uri]
    @domain = PublicSuffix.parse(uri.host)

    @visited = Hash.new(false)

    @agent = Mechanize.new do |agent|
      agent.open_timeout = 5
      agent.read_timeout = 5
    end
  end

  # Triggers the crawling process. Pages are visited breadth-first and every
  # URI is visited at most once.
  #
  # @yield [uri, links, assets] the visited URI with the links and static
  #   assets found on it.
  #
  def explore
    until @queue.empty?
      uri = @queue.shift # dequeue
      next if @visited[uri]

      @visited[uri] = true
      puts "Visit #{uri}"

      found, assets = links(uri)
      found.each do |link|
        @queue.push(link) unless @visited[link] # enqueue
      end

      yield uri, found, assets if block_given?
    end
  end

  # Retrieves all links to pages and static assets from a given page. The
  # decision whether a link points to an internal or external domain cannot be
  # done by just examining the link's URL. Due to possible HTTP 3xx responses
  # the link needs to be resolved. Hence, each link is processed through a HTTP
  # HEAD request to retrieve the final location.
  #
  # @param uri [URI] the URI from which the page is retrieved.
  #
  # @return [Array<URI>, Array<Hash>] list of links and static assets found on
  #   the given page. Both lists are empty if the page cannot be retrieved or
  #   is not an HTML page.
  #
  def links(uri)
    page = @agent.get(uri)
    # Always return two lists, callers destructure the result.
    return [], [] unless page.is_a?(Mechanize::Page)

    return extract_links(page), extract_assets(page)
  rescue Mechanize::Error, Net::OpenTimeout, Net::ReadTimeout,
         Net::HTTP::Persistent::Error
    return [], [] # swallow
  end

  # Evaluates if a link stays in the initial domain.
  #
  # Used to keep the crawler inside the initial domain. In order to determine
  # it uses the second-level and top-level domain. If the public suffix cannot
  # be detected because the host is possibly invalid, returns true to make
  # sure the link does not go unchecked.
  #
  # @param link [URI] the link to be checked.
  #
  # @return [Boolean] whether the link is internal or not.
  #
  def internal?(link)
    return true unless PublicSuffix.valid?(link.host)
    link_domain = PublicSuffix.parse(link.host)
    @domain.sld == link_domain.sld && @domain.tld == link_domain.tld
  end

  private

  # Resolves every link of the page with a HEAD request and keeps the final
  # locations that are internal HTML pages. Links that fail to resolve are
  # skipped.
  #
  # @private
  #
  def extract_links(page)
    grabbed = Hash.new(false)

    page.links.map do |link|
      begin
        next if link.uri.nil?
        reference = URI.join(page.uri, link.uri)

        next if grabbed[reference]
        header = @agent.head(reference)

        location = header.uri
        next unless internal?(location) && header.is_a?(Mechanize::Page)

        grabbed[reference] = true
        grabbed[location] = true

        location
      rescue Mechanize::Error, URI::InvalidURIError,
             Net::HTTP::Persistent::Error, Net::OpenTimeout, Net::ReadTimeout
        next
      end
    end.compact
  end

  # Collects the images followed by the stylesheets referenced on the page.
  #
  # @private
  #
  def extract_assets(page)
    images = page.images.map do |image|
      build_asset(page, 'image') { image.url }
    end

    stylesheets = page.parser.css('link[rel="stylesheet"]').map do |stylesheet|
      build_asset(page, 'stylesheet') { stylesheet['href'] }
    end

    (images + stylesheets).compact
  end

  # Builds the asset hash for the reference returned by the block, resolved
  # against the page URI. Returns nil for a missing or unparsable reference so
  # that a single broken asset does not abort the crawl.
  #
  # @private
  #
  def build_asset(page, type)
    reference = yield
    return if reference.nil?

    { 'uri' => URI.join(page.uri, reference), 'type' => type }
  rescue URI::InvalidURIError
    nil
  end

end
data/lib/aquanaut.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'aquanaut/asset_node'
2
+ require 'aquanaut/graph'
3
+ require 'aquanaut/page_node'
4
+ require 'aquanaut/sitemap'
5
+ require 'aquanaut/version'
6
+ require 'aquanaut/worker'
7
+
8
# Main module of Aquanaut
#
module Aquanaut

  # Crawls the given target address and collects every visited page with the
  # links and static assets found on it in a graph.
  #
  # @param [String] target_address
  #
  # @return [Graph] the sitemap graph with pages and static assets
  #
  def self.process_domain(target_address)
    crawler = Worker.new(target_address)
    sitemap = Graph.new

    crawler.explore do |page_uri, links, static_assets|
      sitemap.add_node(PageNode.new(page_uri))

      # Linked pages become nodes of their own, connected to the current page.
      links.each do |link_uri|
        sitemap.add_node(PageNode.new(link_uri))
        sitemap.add_edge(page_uri, link_uri)
      end

      # Static assets are attached to the page that references them.
      static_assets.each do |asset|
        asset_uri = asset['uri']
        sitemap.add_node(AssetNode.new(asset_uri, asset['type']))
        sitemap.add_edge(page_uri, asset_uri)
      end
    end

    sitemap
  end

end