aquanaut 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +20 -0
  6. data/Guardfile +24 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +37 -0
  9. data/Rakefile +1 -0
  10. data/aquanaut.gemspec +27 -0
  11. data/bin/aquanaut +18 -0
  12. data/lib/aquanaut/asset_node.rb +23 -0
  13. data/lib/aquanaut/graph.rb +74 -0
  14. data/lib/aquanaut/node.rb +22 -0
  15. data/lib/aquanaut/page_node.rb +25 -0
  16. data/lib/aquanaut/sitemap.rb +55 -0
  17. data/lib/aquanaut/templates/assets/css/custom.css +27 -0
  18. data/lib/aquanaut/templates/assets/js/graph.js +46 -0
  19. data/lib/aquanaut/templates/index.html.slim +29 -0
  20. data/lib/aquanaut/version.rb +4 -0
  21. data/lib/aquanaut/worker.rb +111 -0
  22. data/lib/aquanaut.rb +41 -0
  23. data/spec/aquanaut/aquanaut_spec.rb +48 -0
  24. data/spec/aquanaut/asset_node_spec.rb +16 -0
  25. data/spec/aquanaut/graph_spec.rb +89 -0
  26. data/spec/aquanaut/node_spec.rb +26 -0
  27. data/spec/aquanaut/page_node_spec.rb +14 -0
  28. data/spec/aquanaut/sitemap_spec.rb +60 -0
  29. data/spec/aquanaut/worker_spec.rb +308 -0
  30. data/spec/spec_helper.rb +17 -0
  31. data/vendor/assets/css/bootstrap-theme.css +347 -0
  32. data/vendor/assets/css/bootstrap-theme.css.map +1 -0
  33. data/vendor/assets/css/bootstrap-theme.min.css +7 -0
  34. data/vendor/assets/css/bootstrap.css +5785 -0
  35. data/vendor/assets/css/bootstrap.css.map +1 -0
  36. data/vendor/assets/css/bootstrap.min.css +7 -0
  37. data/vendor/assets/fonts/glyphicons-halflings-regular.eot +0 -0
  38. data/vendor/assets/fonts/glyphicons-halflings-regular.svg +229 -0
  39. data/vendor/assets/fonts/glyphicons-halflings-regular.ttf +0 -0
  40. data/vendor/assets/fonts/glyphicons-halflings-regular.woff +0 -0
  41. data/vendor/assets/js/bootstrap.js +1951 -0
  42. data/vendor/assets/js/bootstrap.min.js +6 -0
  43. data/vendor/assets/js/d3.v3.min.js +5 -0
  44. data/vendor/assets/js/jquery-2.1.0.min.js +4 -0
  45. metadata +205 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4173052655cbf46e9072ff1d04cfd6611f33f88b
4
+ data.tar.gz: 10a5195c0f2c45d6178a8eac71737247d88449e6
5
+ SHA512:
6
+ metadata.gz: f6b6e7ac135045f018e9c6ab3ac8df090dc3ad7db6840412196bcccb284f82147b0a3257bbe3aa596c66cde770b0df34d00ce7b2dac66c9e01a9dca48acaf9f0
7
+ data.tar.gz: 1d4e01f0b10794e2d10be888d0e6aba0e8d2d7563369603bf637928fa51af6b0183c7a976967f51aaa73fb56a408eab77f942beebeea78e602f1d96925dfb5cf
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.1.0
5
+
6
+ script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,20 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in aquanaut.gemspec
4
+ gemspec
5
+
6
+ gem 'public_suffix'
7
+ gem 'mechanize'
8
+ gem 'slim'
9
+ gem 'webmock'
10
+
11
+ group :test do
12
+ gem 'rspec'
13
+ gem 'rspec-core'
14
+ gem 'guard-rspec', require: false
15
+ end
16
+
17
+ group :development, :test do
18
+ gem 'pry'
19
+ gem 'pry-byebug'
20
+ end
data/Guardfile ADDED
@@ -0,0 +1,24 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :rspec do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+
9
+ # Rails example
10
+ watch(%r{^app/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
11
+ watch(%r{^app/(.*)(\.erb|\.haml|\.slim)$}) { |m| "spec/#{m[1]}#{m[2]}_spec.rb" }
12
+ watch(%r{^app/controllers/(.+)_(controller)\.rb$}) { |m| ["spec/routing/#{m[1]}_routing_spec.rb", "spec/#{m[2]}s/#{m[1]}_#{m[2]}_spec.rb", "spec/acceptance/#{m[1]}_spec.rb"] }
13
+ watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
14
+ watch('config/routes.rb') { "spec/routing" }
15
+ watch('app/controllers/application_controller.rb') { "spec/controllers" }
16
+
17
+ # Capybara features specs
18
+ watch(%r{^app/views/(.+)/.*\.(erb|haml|slim)$}) { |m| "spec/features/#{m[1]}_spec.rb" }
19
+
20
+ # Turnip features and steps
21
+ watch(%r{^spec/acceptance/(.+)\.feature$})
22
+ watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'spec/acceptance' }
23
+ end
24
+
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Konrad Reiche
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # Aquanaut
2
+
3
+ [![Build Status](https://travis-ci.org/platzhirsch/aquanaut.png)](http://travis-ci.org/platzhirsch/aquanaut)
4
+
5
+ A web crawler that stays on a given domain and creates a graph representing the different pages, static assets and how they are interlinked.
6
+
7
+ <img src="http://konrad-reiche.com/images/aquanaut.png">
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'aquanaut'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install aquanaut
22
+
23
+ ## Usage
24
+
25
+ Execute `aquanaut` and specify the domain on which it should be executed.
26
+
27
+ $ aquanaut 'http://www.konrad-reiche.com'
28
+
29
+ The results are written into the directory `sitemap`.
30
+
31
+ ## Contributing
32
+
33
+ 1. Fork it ( http://github.com/<my-github-username>/aquanaut/fork )
34
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
35
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
36
+ 4. Push to the branch (`git push origin my-new-feature`)
37
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/aquanaut.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'aquanaut/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "aquanaut"
8
+ spec.version = Aquanaut::VERSION
9
+ spec.authors = ["Konrad Reiche"]
10
+ spec.email = ["konrad.reiche@gmail.com"]
11
+ spec.summary = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
12
+ spec.description = %q{Aquanaut creates a sitemap dynamically based on a given domain.}
13
+ spec.homepage = "https://github.com/platzhirsch/aquanaut"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.5"
22
+ spec.add_development_dependency "rake", "~> 0.9"
23
+ spec.add_development_dependency "public_suffix", "~> 1.4", ">= 1.4.0"
24
+ spec.add_development_dependency "mechanize", "~> 2.7", ">= 2.7.3"
25
+ spec.add_development_dependency "slim", "~> 2.0", ">= 2.0.1"
26
+ spec.add_development_dependency "webmock", "~> 1.15", ">= 1.15.2"
27
+ end
data/bin/aquanaut ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'aquanaut'
5
+
6
+ if ARGV.empty?
7
+ raise ArgumentError, "Specify a target domain in the first argument"
8
+ end
9
+
10
+ target_domain = ARGV[0]
11
+
12
+ if target_domain =~ URI::regexp
13
+ graph = Aquanaut.process_domain(target_domain)
14
+ Aquanaut::Sitemap.new(graph, target_domain).render_results
15
+ else
16
+ raise ArgumentError, "#{target_domain} is not a valid URI"
17
+ end
18
+
@@ -0,0 +1,23 @@
1
+ require 'aquanaut/page_node'
2
+
3
+ module Aquanaut
4
+
5
+ # An asset node is a node that represents a static asset. The type specifies
6
+ # what kind of static asset it is, for instance image or stylesheet.
7
+ class AssetNode < PageNode
8
+
9
+ attr_reader :type
10
+
11
+ # Constructor
12
+ #
13
+ # @param [URI] uri identifying the static asset uniquely.
14
+ #
15
+ # @param [String] type specifying the kind of static asset.
16
+ #
17
+ def initialize(uri, type)
18
+ @type = type
19
+ super(uri)
20
+ end
21
+
22
+ end
23
+ end
@@ -0,0 +1,74 @@
1
+ require 'json'
2
+
3
+ # A graph representing the sitemap in terms of a data structure. A hash is
4
+ # used internally to make the nodes accessible through the URIs.
5
+ #
6
+ class Aquanaut::Graph
7
+ include Enumerable
8
+
9
+ def initialize
10
+ @nodes = Hash.new
11
+ end
12
+
13
+ # Use this method for making nodes available in the graph. New nodes are
14
+ # only assigned once.
15
+ #
16
+ # @param [Node] node the node to add to the graph.
17
+ #
18
+ def add_node(node)
19
+ @nodes[node.uri] ||= node
20
+ end
21
+
22
+ # Use this method to easily add new edges without the need to pass actual
23
+ # node objects. The method delegates the edge creation to the dedicated node
24
+ # edge method.
25
+ #
26
+ # @param [URI] predecessor_uri source node for the edge
27
+ # @param [URI] successor_uri target node for the edge
28
+ #
29
+ def add_edge(predecessor_uri, successor_uri)
30
+ @nodes[predecessor_uri].add_edge(@nodes[successor_uri])
31
+ end
32
+
33
+ # Accessor method to retrieve nodes by their URI.
34
+ #
35
+ # @param [URI] uri the URI representing the node.
36
+ #
37
+ def [](uri)
38
+ @nodes[uri]
39
+ end
40
+
41
+ # Accessor method to iterate the nodes and their adjacency list.
42
+ #
43
+ def each
44
+ @nodes.values.each do |node|
45
+ yield node, node.adjacency_list
46
+ end
47
+ end
48
+
49
+ # Used for visualizing the graph on the front-end.
50
+ #
51
+ def to_json
52
+ model = { 'nodes' => [], 'links' => [] }
53
+
54
+ self.each do |node, adjacency|
55
+ if node.instance_of?(Aquanaut::PageNode)
56
+ group = 1
57
+ else
58
+ asset_groups = { 'image' => 2, 'stylesheet' => 3 }
59
+ group = asset_groups[node.type]
60
+ end
61
+
62
+ model['nodes'] << { 'name' => node.uri, 'group' => group }
63
+ source = @nodes.values.index(node)
64
+
65
+ adjacency.each do |adjacency_node|
66
+ target = @nodes.values.index(adjacency_node)
67
+ model['links'] << { 'source' => source, 'target' => target }
68
+ end
69
+ end
70
+
71
+ return model.to_json
72
+ end
73
+
74
+ end
@@ -0,0 +1,22 @@
1
+ module Aquanaut
2
+
3
+ # Base node class which needs to be inherited for special cases.
4
+ #
5
+ # @abstract
6
+ #
7
+ class Node
8
+
9
+ attr_reader :adjacency_list
10
+
11
+ def initialize()
12
+ @adjacency_list = []
13
+ end
14
+
15
+ # Implements adjacency with an adjacency list.
16
+ #
17
+ def add_edge(successor)
18
+ @adjacency_list << successor
19
+ end
20
+
21
+ end
22
+ end
@@ -0,0 +1,25 @@
1
+ require 'aquanaut/node'
2
+
3
+ module Aquanaut
4
+
5
+ # A page node represents an actual page in the specified domain.
6
+ #
7
+ class PageNode < Node
8
+
9
+ attr_reader :uri
10
+
11
+ def initialize(uri)
12
+ @uri = uri
13
+ super()
14
+ end
15
+
16
+ # Display method used on the front-end for the sitemap in list format.
17
+ #
18
+ def display
19
+ part = "#{@uri.path}#{@uri.query}#{@uri.fragment}"
20
+ part = @uri.to_s if part.empty?
21
+ return part
22
+ end
23
+
24
+ end
25
+ end
@@ -0,0 +1,55 @@
1
+ require 'pathname'
2
+ require 'slim'
3
+
4
+ # The sitemap class is used to render the results in HTML and JavaScript.
5
+ #
6
+ # Uses SLIM as a template engine.
7
+ #
8
+ class Aquanaut::Sitemap
9
+
10
+ def initialize(graph, domain, target_dir="#{Dir.pwd}/sitemap")
11
+ @graph = graph
12
+ @domain = domain
13
+ @target_dir = target_dir
14
+
15
+ if Pathname.new(target_dir).relative?
16
+ @target_dir = File.expand_path("../../../#{target_dir}", __FILE__)
17
+ end
18
+ end
19
+
20
+ # Renders the results by initializing the dependencies and processing the template.
21
+ #
22
+ def render_results
23
+ initialize_target_directory
24
+
25
+ options = { disable_escape: true }
26
+ template_path = File.expand_path('../templates/index.html.slim', __FILE__)
27
+ rendered_template = Slim::Template.new(template_path, options).render(self)
28
+
29
+ File.open("#{@target_dir}/index.html", 'w') do |file|
30
+ file.write rendered_template
31
+ end
32
+ end
33
+
34
+ private
35
+
36
+ # There are several asset files required. Vendor asset files, but also local
37
+ # asset files. They need to be copied to the target directory in order to
38
+ # work properly.
39
+ #
40
+ # @private
41
+ #
42
+ def initialize_target_directory
43
+ # create result directory
44
+ Dir.mkdir(@target_dir) unless Dir.exists?(@target_dir)
45
+
46
+ # copy vendor assets
47
+ vendor_dir = File.expand_path('../../../vendor/assets', __FILE__)
48
+ FileUtils.cp_r(vendor_dir, @target_dir, remove_destination: true)
49
+
50
+ # copy local assets
51
+ assets_dir = File.expand_path('../templates/assets', __FILE__)
52
+ FileUtils.cp_r(assets_dir, @target_dir)
53
+ end
54
+
55
+ end
@@ -0,0 +1,27 @@
1
+ h1 {
2
+ margin-bottom: 2em;
3
+ font-size: 28px;
4
+ }
5
+
6
+ h2 {
7
+ margin-bottom: 0.5em;
8
+ font-size: 24px;
9
+ }
10
+
11
+ #graph {
12
+ width: 960px;
13
+ height: 500px;
14
+ border: 1px solid #ccc;
15
+ margin-bottom: 2em;
16
+ }
17
+
18
+ .node {
19
+ stroke: #fff;
20
+ stroke-width: 1.5px;
21
+ }
22
+
23
+ .link {
24
+ stroke: #999;
25
+ stroke-opacity: .6;
26
+ stroke-width: 1px;
27
+ }
@@ -0,0 +1,46 @@
1
+ $(document).ready(function() {
2
+ var width = 960,
3
+ height = 500;
4
+
5
+ var color = d3.scale.category10();
6
+
7
+ var force = d3.layout.force()
8
+ .charge(-120)
9
+ .linkDistance(30)
10
+ .size([width, height]);
11
+
12
+ var svg = d3.select("#graph").append("svg")
13
+ .attr("width", width)
14
+ .attr("height", height);
15
+
16
+ force
17
+ .nodes(graph.nodes)
18
+ .links(graph.links)
19
+ .start();
20
+
21
+ var link = svg.selectAll(".link")
22
+ .data(graph.links)
23
+ .enter().append("line")
24
+ .attr("class", "link");
25
+
26
+ var node = svg.selectAll(".node")
27
+ .data(graph.nodes)
28
+ .enter().append("circle")
29
+ .attr("class", "node")
30
+ .attr("r", 5)
31
+ .style("fill", function(d) { return color(d.group); })
32
+ .call(force.drag);
33
+
34
+ node.append("title")
35
+ .text(function(d) { return d.name; });
36
+
37
+ force.on("tick", function() {
38
+ link.attr("x1", function(d) { return d.source.x; })
39
+ .attr("y1", function(d) { return d.source.y; })
40
+ .attr("x2", function(d) { return d.target.x; })
41
+ .attr("y2", function(d) { return d.target.y; });
42
+
43
+ node.attr("cx", function(d) { return d.x; })
44
+ .attr("cy", function(d) { return d.y; });
45
+ });
46
+ });
@@ -0,0 +1,29 @@
1
+ doctype html
2
+ html lang="en"
3
+ head
4
+ meta charset="UTF -8"
5
+ title Sitemap #{@domain}
6
+ script type="text/javascript" src="assets/js/jquery-2.1.0.min.js"
7
+ script type="text/javascript" src="assets/js/bootstrap.min.js"
8
+ script type="text/javascript" src="assets/js/d3.v3.min.js"
9
+ script type="text/javascript" src="assets/js/graph.js"
10
+
11
+ script type="text/javascript"
12
+ | window.graph = #{@graph.to_json}
13
+
14
+ link rel="stylesheet" href="assets/css/bootstrap.min.css"
15
+ link rel="stylesheet" href="assets/css/custom.css"
16
+ body
17
+ div class="container"
18
+ h1 Sitemap for #{@domain}
19
+ h2 Visualized
20
+ div class="text-center" id="graph"
21
+ h2 Overview
22
+ ul
23
+ - @graph.select { |node, _| node.instance_of?(Aquanaut::PageNode) }.each do |node, adjacency|
24
+ li
25
+ a href="#{node.uri}" #{node.display}
26
+ ul
27
+ - adjacency.select { |node| node.instance_of?(Aquanaut::PageNode) }.each do |node|
28
+ li
29
+ a href="#{node.uri}" #{node.display}
@@ -0,0 +1,4 @@
1
+ module Aquanaut
2
+ # Version of this gem
3
+ VERSION = "0.1.1"
4
+ end
@@ -0,0 +1,111 @@
1
+ require 'mechanize'
2
+ require 'public_suffix'
3
+
4
+ # The worker contains the actual crawling procedure.
5
+ #
6
+ class Aquanaut::Worker
7
+
8
+ def initialize(target)
9
+ uri = URI.parse(target)
10
+ @queue = [uri]
11
+ @domain = PublicSuffix.parse(uri.host)
12
+
13
+ @visited = Hash.new(false)
14
+
15
+ @agent = Mechanize.new do |agent|
16
+ agent.open_timeout = 5
17
+ agent.read_timeout = 5
18
+ end
19
+ end
20
+
21
+ # Triggers the crawling process.
22
+ #
23
+ def explore
24
+ while not @queue.empty?
25
+ uri = @queue.shift # dequeue
26
+ next if @visited[uri]
27
+
28
+ @visited[uri] = true
29
+ puts "Visit #{uri}"
30
+
31
+ links, assets = links(uri)
32
+ links.each do |link|
33
+ @queue.push(link) unless @visited[link] # enqueue
34
+ end
35
+
36
+ yield uri, links, assets if block_given?
37
+ end
38
+ end
39
+
40
+ # Retrieves all links to pages and static assets from a given page. The
41
+ # decision whether a link points to an internal or external domain cannot be
42
+ # made by just examining the link's URL. Due to possible HTTP 3xx responses
43
+ # the link needs to be resolved. Hence, each link is processed through an HTTP
44
+ # HEAD request to retrieve the final location.
45
+ #
46
+ # @param uri [URI] the URI from which the page is retrieved.
47
+ #
48
+ # @return [Array<URI>, Array<Hash>] list of links and static assets found on
49
+ # the given page.
50
+ #
51
+ def links(uri)
52
+ page = @agent.get(uri)
53
+ grabbed = Hash.new(false)
54
+ return [] unless page.is_a?(Mechanize::Page)
55
+
56
+ assets = page.images.map do |image|
57
+ uri = URI.join(page.uri, image.url)
58
+ { 'uri' => uri, 'type' => 'image' }
59
+ end
60
+
61
+ page.parser.css('link[rel="stylesheet"]').each do |stylesheet|
62
+ uri = URI.join(page.uri, stylesheet['href'])
63
+ asset = { 'uri' => uri, 'type' => 'styleshet' }
64
+ assets << asset
65
+ end
66
+
67
+ links = page.links.map do |link|
68
+ begin
69
+ next if link.uri.nil?
70
+ reference = URI.join(page.uri, link.uri)
71
+
72
+ next if grabbed[reference]
73
+ header = @agent.head(reference)
74
+
75
+ location = header.uri
76
+ next if not internal?(location) or not header.is_a?(Mechanize::Page)
77
+
78
+ grabbed[reference] = true
79
+ grabbed[location] = true
80
+
81
+ location
82
+ rescue Mechanize::Error, URI::InvalidURIError,
83
+ Net::HTTP::Persistent::Error, Net::OpenTimeout, Net::ReadTimeout
84
+ next
85
+ end
86
+ end.compact
87
+
88
+ return links, assets
89
+ rescue Mechanize::Error, Net::OpenTimeout, Net::ReadTimeout,
90
+ Net::HTTP::Persistent::Error
91
+ return [], [] # swallow
92
+ end
93
+
94
+ # Evaluates if a link stays in the initial domain.
95
+ #
96
+ # Used to keep the crawler inside the initial domain. To determine this, it
97
+ # compares the second-level and top-level domains. If the public suffix cannot
98
+ # be detected because the host may be invalid, it returns true to make sure the
99
+ # link does not go unchecked.
100
+ #
101
+ # @param link [URI] the link to be checked.
102
+ #
103
+ # @return [Boolean] whether the link is internal or not.
104
+ #
105
+ def internal?(link)
106
+ return true unless PublicSuffix.valid?(link.host)
107
+ link_domain = PublicSuffix.parse(link.host)
108
+ @domain.sld == link_domain.sld and @domain.tld == link_domain.tld
109
+ end
110
+
111
+ end
data/lib/aquanaut.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'aquanaut/asset_node'
2
+ require 'aquanaut/graph'
3
+ require 'aquanaut/page_node'
4
+ require 'aquanaut/sitemap'
5
+ require 'aquanaut/version'
6
+ require 'aquanaut/worker'
7
+
8
+ # Main module of Aquanaut
9
+ #
10
+ module Aquanaut
11
+ class << self
12
+
13
+ # Processes the given target domain and creates a page and asset graph.
14
+ #
15
+ # @param [String] target_address
16
+ #
17
+ # @return [Graph] the sitemap graph with pages and static assets
18
+ #
19
+ def process_domain(target_address)
20
+ worker = Worker.new(target_address)
21
+ graph = Graph.new
22
+
23
+ worker.explore do |page_uri, links, static_assets|
24
+ graph.add_node(PageNode.new(page_uri))
25
+
26
+ links.each do |link_uri|
27
+ graph.add_node(PageNode.new(link_uri))
28
+ graph.add_edge(page_uri, link_uri)
29
+ end
30
+
31
+ static_assets.each do |asset|
32
+ graph.add_node(AssetNode.new(asset['uri'], asset['type']))
33
+ graph.add_edge(page_uri, asset['uri'])
34
+ end
35
+ end
36
+
37
+ return graph
38
+ end
39
+
40
+ end
41
+ end