anaximander 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 009aaa84665dcd8497a59b9e47acef29008613ea
4
+ data.tar.gz: 956c8d4a45e84443af161c5b4c78c4c33f8a54fb
5
+ SHA512:
6
+ metadata.gz: af827976b24b5f734391e3229a2c6fbd2a05ea012b7766517367d547beca19964bacfd59e96c3a857d12a2ae18d071c6c34f70fe2a7e6354d639b6baab0aa865
7
+ data.tar.gz: 4e49ba0bb2b67931ac4826f3e2e1913c3ae32885af892226c563488ff069e044c3f36de3c799885381a2581f53f96bb81cb421563874134195f6b9ae328313cd
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Matte Noble
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Anaximander
2
+
3
+ Anaximander is a small library for crawling a website and rendering the
4
+ resulting site map to the console.
5
+
6
+ ## Installation & Usage
7
+
8
+ ```sh
9
+ gem install anaximander
10
+ ```
11
+
12
+ ```sh
13
+ mapgen <url>
14
+ ```
15
+
16
+ ## Running Tests
17
+
18
+ ```sh
19
+ bundle install
20
+ bundle exec rspec spec
21
+ ```
22
+
23
+ ### End to End Tests
24
+
25
+ There are two tests marked with the tag `endtoend`. These tests start up
26
+ a Rack app which serves a simple website and run against that server
27
+ like the library would in "production". Think of them as the Acceptance
28
+ tests for a library.
29
+
30
+ The end to end tests are run by default. To exclude them:
31
+
32
+ ```sh
33
+ bundle exec rspec spec --tag ~endtoend
34
+ ```
35
+
36
+ ## What does Anaximander mean?
37
+
38
+ Anaximander was a Greek cartographer who was the first person to try to
39
+ map the entire world.
40
+
41
+ ## Contributing
42
+
43
+ 1. Fork it ( https://github.com/[my-github-username]/anaximander/fork )
44
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
45
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
46
+ 4. Push to the branch (`git push origin my-new-feature`)
47
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'anaximander/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "anaximander"
8
+ spec.version = Anaximander::VERSION
9
+ spec.authors = ["Matte Noble"]
10
+ spec.email = ["me@mattenoble.com"]
11
+ spec.summary = %q{Web scraper that collects assets and links.}
12
+ spec.description = %q{Web scraper that collects assets and links.}
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_dependency "nokogiri"
21
+ spec.add_dependency "colorize"
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ spec.add_development_dependency "rack"
26
+ spec.add_development_dependency "rspec"
27
+ spec.add_development_dependency "rspec-autotest"
28
+ spec.add_development_dependency "fakeout"
29
+ end
data/bin/mapgen ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ require "anaximander"
3
+ require "anaximander/cli"
4
+ Anaximander.logger = false
5
+ Anaximander::CLI.start
@@ -0,0 +1,17 @@
1
module Anaximander
  # Command line front end: crawls the site rooted at a URL and prints the
  # resulting tree of pages to the console.
  class CLI
    # Builds a CLI from the first command line argument and runs it.
    #
    # Aborts with a usage message (exit status 1) when no URL was supplied,
    # rather than failing later with a NoMethodError on `nil`.
    #
    def self.start
      url = ARGV[0]
      abort "Usage: mapgen <url>" if url.nil? || url.strip.empty?

      new(url).start
    end

    # Parameters
    #
    #   [String] url  URL of the page the crawl starts from.
    #
    def initialize(url)
      @crawler  = Crawler.new(url)
      @renderer = Renderer.new
    end

    # Crawls the site, then draws the tree starting at the crawler's root page.
    #
    def start
      @crawler.crawl
      @renderer.draw(@crawler.root)
    end
  end
end
@@ -0,0 +1,31 @@
1
module Anaximander
  # Walks a website starting at a root URL and arranges the pages it finds
  # into a tree: each page's `children` are the linked pages that had not
  # been visited before.
  class Crawler
    # Root URL with any trailing slash removed.
    #
    attr_reader :url

    # `Page` built for the root URL.
    #
    attr_reader :root

    # Parameters
    #
    #   [String] url  URL the crawl starts from.
    #
    # The root page and the visited set both use the slash-stripped URL.
    # Links are stripped the same way before they are looked up, so a root
    # given as "http://example.com/" must be recorded as "http://example.com"
    # or it would be crawled a second time as its own descendant.
    #
    def initialize(url)
      @url     = url.chomp("/")
      @root    = Page.new(@url)
      @visited = { @url => true }
    end

    # Assigns `children` to the given page (one `Page` per not-yet-visited,
    # accessible link), then repeats the process for each child in order.
    #
    def crawl(page=self.root)
      page.children = page.links.map { |link| visit(link.chomp("/")) }.compact
      page.children.each { |child| crawl(child) }
    end

    # Returns a `Page` for the link, or nil when the link was already visited
    # or the page cannot be fetched. The link is marked as visited before the
    # fetch, so an inaccessible page is not retried.
    #
    def visit(link)
      return if @visited.key?(link)

      logger.debug(link)
      @visited[link] = true

      Page.new(link)
    rescue Anaximander::PageNotAccessibleError
      nil
    end

    def logger
      Anaximander.logger
    end
  end
end
@@ -0,0 +1,39 @@
1
module Anaximander
  module Discovery

    # Collection of the asset references found on a page: the `href` of every
    # `link` element followed by the `src` of every `script` element. Elements
    # without the attribute are skipped.
    #
    # The values are returned exactly as written in the document; they are not
    # expanded to absolute URLs.
    #
    class Assets
      extend Forwardable
      include Enumerable
      include Comparable

      def_delegators :assets, :size, :inspect, :to_a

      # Parameters
      #
      #   page  Parsed document; must respond to `css(selector)`.
      #
      def initialize(page)
        @page = page
      end

      def each(&block)
        assets.each(&block)
      end

      # Collections compare by their list of references.
      #
      def <=>(other)
        to_a <=> other.to_a
      end

      private

      attr_reader :page

      # Stylesheet references first, then script references.
      #
      def assets
        references("link", :href) + references("script", :src)
      end

      # Values of `attribute` for every element matching `selector`,
      # leaving out elements that do not carry the attribute.
      #
      def references(selector, attribute)
        values = page.css(selector).map { |node| node[attribute] }
        values.compact
      end
    end
  end
end
39
+
@@ -0,0 +1,76 @@
1
module Anaximander
  module Discovery

    # Collection of internal links on the given page.
    #
    # == Relative Paths
    #
    # `Anaximander::Discovery::Links` converts all relative paths into absolute
    # URLs by joining them onto the base (scheme, host and port) of the URL of
    # the page being crawled.
    #
    #   # http://example.com
    #   <a href="/contact">Contact</a>
    #
    #   html = Nokogiri::HTML(open("http://example.com"))
    #   Anaximander::Discovery::Links.new(html, "http://example.com")
    #   # => ["http://example.com/contact"]
    #
    # == Exclusions
    #
    # - External links (ones whose base differs from the base of the page).
    # - Anchors without an `href` attribute.
    # - Links whose `href` cannot be parsed as a URI.
    #
    # Fragments are stripped from every link, and duplicates are removed after
    # stripping.
    #
    class Links
      include Enumerable
      include Comparable
      extend Forwardable
      def_delegators :links, :size, :inspect, :to_a

      # Parameters
      #
      #   page [Nokogiri::HTML] Parsed html of the page.
      #   url  [String|URI]     URL of the page to discover.
      #
      def initialize(page, url)
        @page = page
        @url  = Url.new(url)
      end

      def each(&block)
        links.each(&block)
      end

      # Collections compare by their list of link strings.
      #
      def <=>(other)
        to_a <=> other.to_a
      end

      private

      attr_reader :page

      # Internal links as plain strings.
      #
      def links
        internal_links.map(&:to_s)
      end

      def internal_links
        all_links.select { |link| @url.base == link.base }
      end

      # Every distinct, parseable link on the page as an absolute `Url`.
      #
      # Anchors without an `href` (e.g. `<a name="top">`) are dropped first:
      # a nil href would otherwise parse as an empty relative URL and resolve
      # to the site base, producing a link that is not in the document.
      #
      def all_links
        hrefs = page.css("a").map { |a| a[:href] }.compact
        hrefs.map { |href| absolute(href) }.compact.uniq
      end

      # Absolute, fragment-free `Url` for the link, or nil when the link is
      # not a valid URI.
      #
      def absolute(link)
        Url.new(link).absolute(@url.base).without_fragment
      rescue URI::InvalidURIError
        nil
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ module Anaximander
2
+ class Error < StandardError; end
3
+
4
+ # Raised when a page cannot be fetched.
5
+ #
6
+ class PageNotAccessibleError < Error; end
7
+
8
+ # Represents a single page of a website being crawled. Exposes the assets and
9
+ # links on the page.
10
+ #
11
+ # == Errors
12
+ #
13
+ # `Anaximander::Page` will raise a `PageNotAccessibleError` when the page cannot
14
+ # be fetched for some reason. This is often due to it not existing (404), SSL
15
+ # errors or infinite redirect loops.
16
+ #
17
+ # == Example
18
+ #
19
+ # page = Page.new("http://example.com")
20
+ # page.links # => ["http://www.iana.org/domains/example"]
21
+ # page.assets # => ["/main.css", "/default.js"]
22
+ #
23
+ class Page
24
+ include Comparable
25
+
26
+ # Absolute url of the page.
27
+ #
28
+ attr_reader :url
29
+
30
+ # Parsed Nokogiri HTML document.
31
+ #
32
+ attr_reader :html
33
+
34
+ # Collection of `Page` objects that are linked
35
+ # to from the current page.
36
+ #
37
+ attr_accessor :children
38
+
39
+ # Parameters
40
+ #
41
+ # [String] url URL to discover.
42
+ #
43
+ # OpenURI raises a generic RuntimeError when it cannot fetch a
44
+ # page, for a variety of reasons. Some of which are 404s, SSL
45
+ # errors, or redirect loops.
46
+ #
47
+ # raises `PageNotAccessibleError` when OpenURI fails to fetch the
48
+ # page, for any reason.
49
+ #
50
+ def initialize(url)
51
+ @url = url
52
+ @html = Nokogiri::HTML(open(url))
53
+ rescue RuntimeError, OpenURI::HTTPError
54
+ raise PageNotAccessibleError
55
+ end
56
+
57
+ def links
58
+ Discovery::Links.new(html, url)
59
+ end
60
+
61
+ def assets
62
+ Discovery::Assets.new(html)
63
+ end
64
+
65
+ def <=>(other)
66
+ self.url <=> other.url
67
+ end
68
+
69
+ def inspect
70
+ %(#<Anaximander::Page:#{object_id} url="#{url}">)
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,83 @@
1
+ require "colorize"
2
+
3
module Anaximander
  # Draws the crawled tree of URLs and assets, generated by `Anaximander::Crawler`.
  #
  # == Output
  #
  # One line per page: the page URL followed by its list of assets. Each line
  # starts with the pipes connecting it to the tiers above it; the last page
  # of a tier is marked with a tail pipe (└──), the others with a member
  # pipe (├──).
  #
  # == Example
  #
  #   root
  #   # => #<Anaximander::Page url="http://example.com">
  #
  #   root.children
  #   # => [#<Anaximander::Page url="http://example.com/foo">, #<Anaximander::Page url="http://example.com/bar">]
  #
  #   renderer = Anaximander::Renderer.new
  #   renderer.draw(root)
  #
  #   # or, equivalently
  #   Anaximander::Renderer.new(root: root).draw
  #
  class Renderer
    # NOTE(review): the connecting pipes (VERTICAL_PIPE, SPACE_PIPE) are
    # narrower than MEMBER_PIPE/TAIL_PIPE as written here — confirm the
    # intended widths if columns should line up.
    VERTICAL_PIPE = "│ ".freeze
    MEMBER_PIPE = "├── ".freeze
    TAIL_PIPE = "└── ".freeze
    SPACE_PIPE = " ".freeze

    # Page drawn when `draw` is called without a page; nil unless the
    # `:root` option was given.
    #
    attr_reader :root

    # == Parameters
    #
    #   [Hash] options
    #     :color  Colorize the asset list (default: true).
    #     :root   Page to use when `draw` is called without one (default: nil).
    #
    def initialize(options={})
      @color = options.fetch(:color, true)
      @root  = options[:root]
    end

    # Draws a page URL, its assets and recursively does the same for
    # all of its children.
    #
    # == Parameters
    #
    #   [Anaximander::Page] page    The page to render.
    #   [String]            prefix  A string that should precede the actual URL
    #                               and asset information.
    #   [Boolean]           tail    Is this node the last one in the collection.
    #
    # raises `ArgumentError` when no page is given and no `:root` was
    # configured.
    #
    def draw(page=self.root, prefix="", tail=true)
      raise ArgumentError, "no page given and no :root configured" if page.nil?

      pipe   = tail ? TAIL_PIPE : MEMBER_PIPE
      assets = page.assets.to_a.to_s
      assets = assets.colorize(:light_black) if @color

      print "#{prefix}#{pipe}#{page.url} "
      puts assets

      # A page that was never crawled has no children assigned yet.
      children = Array(page.children)

      children[0..-2].each { |child| draw_child(child, prefix, tail) }
      draw_tail(children.last, prefix, tail) unless children.empty?
    end

    # Draws a child node, with the appropriate "connecting pipe".
    #
    # The "connecting pipe" is the character at the beginning of this line,
    # which connects this to the previous tier of the tree.
    #
    def draw_child(page, prefix, parent_is_tail)
      draw(page, "#{prefix}#{connecting_pipe(parent_is_tail)}", false)
    end

    # Draws the last node of a tier, with the appropriate "connecting pipe".
    #
    # See `draw_child` for "connecting pipe" definition.
    #
    def draw_tail(page, prefix, parent_is_tail)
      draw(page, "#{prefix}#{connecting_pipe(parent_is_tail)}", true)
    end

    private

    # Below a tail nothing continues downwards, so the column is left blank;
    # otherwise a vertical pipe carries the parent's tier on.
    #
    def connecting_pipe(parent_is_tail)
      parent_is_tail ? SPACE_PIPE : VERTICAL_PIPE
    end
  end
end
@@ -0,0 +1,38 @@
1
module Anaximander
  # Thin wrapper around a `URI` that adds the helpers the crawler needs.
  # Every method not defined here is delegated to the wrapped URI.
  class Url < SimpleDelegator
    include Comparable

    # The wrapped `URI` object.
    #
    attr_reader :uri

    # Parameters
    #
    #   uri  Anything whose `to_s` is a URI string (String, URI, Url).
    #
    # raises `URI::InvalidURIError` when the string cannot be parsed.
    #
    def initialize(uri)
      @uri = URI(uri.to_s)
      super(@uri)
    end

    # Scheme and host of the URL, plus the port when it is not the default
    # port of the scheme.
    #
    #   Url.new("http://example.com/foo?bar=1").base  # => "http://example.com"
    #   Url.new("https://example.com/foo").base       # => "https://example.com"
    #   Url.new("http://localhost:9292/foo").base     # => "http://localhost:9292"
    #
    def base
      domain = "#{scheme}://#{host}"
      # Compare against the scheme's own default rather than a fixed 80, so
      # https URLs are not rendered with a redundant ":443".
      domain += ":#{port}" unless port == default_port
      domain
    end

    # New `Url` for the given path or URL resolved against this one.
    #
    def join(url)
      self.class.new(URI.join(self.uri, url.to_s))
    end

    # Returns self when already absolute, otherwise a new `Url` resolved
    # against `base`.
    #
    def absolute(base)
      absolute? ? self : Url.new(base).join(self)
    end

    # Copy of this `Url` with the fragment removed; the receiver is not
    # modified.
    #
    def without_fragment
      self.class.new(self).tap { |url| url.fragment = nil }
    end

    # Compares the wrapped URIs when `other` exposes one, otherwise compares
    # the URL string with `other`.
    #
    def <=>(other)
      return self.uri <=> other.uri if other.respond_to?(:uri)

      self.uri.to_s <=> other
    end

    # True when `other` wraps an `eql?` URI. Objects that do not expose a
    # `uri` are never `eql?`.
    #
    def eql?(other)
      other.respond_to?(:uri) && self.uri.eql?(other.uri)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Anaximander
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,24 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+ require "forwardable"
4
+ require "delegate"
5
+ require "uri"
6
+ require "logger"
7
+
8
+ require "anaximander/version"
9
+ require "anaximander/url"
10
+ require "anaximander/page"
11
+ require "anaximander/crawler"
12
+ require "anaximander/renderer"
13
+ require "anaximander/discovery/links"
14
+ require "anaximander/discovery/assets"
15
+
16
module Anaximander
  class << self
    # Replaces the shared logger with a new `Logger` writing to `out`.
    # `out` is handed to `Logger.new` unchanged.
    #
    def logger=(out)
      @logger = Logger.new(out)
    end

    # Shared logger; created on first use, writing to STDOUT.
    #
    def logger
      @logger || (@logger = Logger.new(STDOUT))
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Crawler do
4
+ let(:link) { "http://example.com" }
5
+ subject!(:crawler) { described_class.new(link) }
6
+
7
+ it "does not visit pages that have already been visited" do
8
+ crawler.visit(link)
9
+ expect(crawler.visit(link)).to be_nil
10
+ end
11
+
12
+ it "does not visit pages that do not exist" do
13
+ not_a_page = "http://example.com/definitelynotapage"
14
+ expect(crawler.visit(not_a_page)).to be_nil
15
+ end
16
+
17
+ it "visits links breadth-first" do
18
+ allow_any_instance_of(Anaximander::Page).to receive(:open).and_return("")
19
+
20
+ root = Anaximander::Page.new("http://example.com")
21
+ pricing = Anaximander::Page.new("http://example.com/pricing")
22
+ features = Anaximander::Page.new("http://example.com/features")
23
+
24
+ allow(Anaximander::Page).to receive(:new).with("http://example.com/pricing").and_return(pricing)
25
+ allow(Anaximander::Page).to receive(:new).with("http://example.com/features").and_return(features)
26
+
27
+ allow(root).to receive_messages(links: ["http://example.com/pricing", "http://example.com/features"])
28
+ allow(pricing).to receive_messages(links: ["http://example.com/features"])
29
+
30
+ crawler.crawl(root)
31
+ expect(root.children).to eq [pricing, features]
32
+ end
33
+
34
+ it "crawls a multi-tiered website", :endtoend do
35
+ # See `spec/data/site` for the website hierarchy; it matches
36
+ # the directory structure.
37
+
38
+ crawler = described_class.new("http://localhost:#{@port}/index.html")
39
+ crawler.crawl
40
+
41
+ root = crawler.root
42
+
43
+ pricing = root.children[0]
44
+ features = root.children[1]
45
+
46
+ pricing_low = pricing.children[0]
47
+ pricing_med = pricing.children[1]
48
+ pricing_high = pricing.children[2]
49
+
50
+ sortof_high = pricing_high.children[0]
51
+ super_high = pricing_high.children[1]
52
+
53
+ expect(root.children.size).to eq 2
54
+ expect(root.children).to eq [pricing, features]
55
+
56
+ expect(pricing.children.size).to eq 3
57
+ expect(pricing.children).to eq [pricing_low, pricing_med, pricing_high]
58
+
59
+ expect(pricing_high.children.size).to eq 2
60
+ expect(pricing_high.children).to eq [sortof_high, super_high]
61
+
62
+ expect(features.children.size).to eq 0
63
+ end
64
+ end
@@ -0,0 +1,12 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Discovery::Assets do
4
+ let(:html) { Nokogiri::HTML(open("spec/data/page.html")) }
5
+ let(:assets) { described_class.new(html) }
6
+
7
+ # See `spec/data/page.html` for the HTML used in this test.
8
+
9
+ it "collects all CSS and Javascript assets" do
10
+ expect(assets.to_a).to eq(["main.css", "other.css", "allthethings.js"])
11
+ end
12
+ end
@@ -0,0 +1,33 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Discovery::Links do
4
+ let(:html) { Nokogiri::HTML(open("spec/data/page.html")) }
5
+ let(:links) { described_class.new(html, "http://example.com/") }
6
+
7
+ # See `spec/data/page.html` for the HTML used in this test.
8
+
9
+ it "collects each unique link" do
10
+ expect(links.size).to eq(2)
11
+ end
12
+
13
+ it "only collects links within the same domain" do
14
+ expect(links).to_not include "http://example.net"
15
+ end
16
+
17
+ it "removes trailing slashes from URLs" do
18
+ expect(links).to_not include "http://example.com/"
19
+ end
20
+
21
+ it "disgards hash links" do
22
+ links.each { |link| expect(link).to_not include "#" }
23
+ end
24
+
25
+ it "expands relative paths to absolute paths" do
26
+ expect(links).to include "http://example.com/google"
27
+ end
28
+
29
+ it "expands relative paths to absolute paths using the base url of the page" do
30
+ links = described_class.new(html, "http://example.com/some/nested/page/")
31
+ expect(links).to_not include "http://example.com/some/nested/page/google"
32
+ end
33
+ end
@@ -0,0 +1,24 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Page do
4
+ let(:html) { File.read(File.expand_path("../../data/page.html", __FILE__)) }
5
+ let(:page) { described_class.new("http://example.com") }
6
+
7
+ before do
8
+ expect_any_instance_of(described_class).to receive(:open).and_return(html)
9
+ end
10
+
11
+ it "has unique links" do
12
+ expect(Anaximander::Discovery::Links).to receive(:new).with(an_instance_of(Nokogiri::HTML::Document), "http://example.com")
13
+ page.links
14
+ end
15
+
16
+ it "has assets" do
17
+ expect(Anaximander::Discovery::Assets).to receive(:new).with(an_instance_of(Nokogiri::HTML::Document))
18
+ page.assets
19
+ end
20
+
21
+ it "is comparable by URL" do
22
+ expect(page).to eq(page.clone)
23
+ end
24
+ end
@@ -0,0 +1,64 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Renderer do
4
+ include Fakeout::SpecHelpers
5
+
6
+ let(:child) { double(url: "http://example.com/foo", assets: ["/root.css", "/child.js"], children: []) }
7
+ let(:root) { double(url: "http://example.com", assets: ["/root.css"], children: [child]) }
8
+
9
+ subject(:renderer) { described_class.new(color: false) }
10
+
11
+ before :all do
12
+ Fakeout.activate!
13
+ end
14
+
15
+ after :all do
16
+ Fakeout.deactivate!
17
+ end
18
+
19
+ it "draws a root node" do
20
+ renderer.draw(root)
21
+ expect(stdout).to include %(└── http://example.com ["/root.css"])
22
+ end
23
+
24
+ it "draws a child node whose parent is not a tail" do
25
+ renderer.draw_child(child, "", false)
26
+ expect(stdout).to include %(│ ├── http://example.com/foo ["/root.css", "/child.js"])
27
+ end
28
+
29
+ it "draws a child node whose parent is a tail" do
30
+ renderer.draw_child(child, "", true)
31
+ expect(stdout).to include %( ├── http://example.com/foo ["/root.css", "/child.js"])
32
+ end
33
+
34
+ it "draws a tail node whose parent is not a tail node" do
35
+ renderer.draw_tail(child, "", false)
36
+ expect(stdout).to include %(│ └── http://example.com/foo ["/root.css", "/child.js"])
37
+ end
38
+
39
+ it "draws a tail node whose parent is also a tail" do
40
+ renderer.draw_tail(child, "", true)
41
+ expect(stdout).to include %( └── http://example.com/foo ["/root.css", "/child.js"])
42
+ end
43
+
44
+ it "draws an entire tree", :endtoend do
45
+ tree = Anaximander::Crawler.new("http://localhost:#{@port}/index.html")
46
+ tree.crawl
47
+
48
+ domain = Anaximander::Url.new(tree.root.url).base
49
+
50
+ renderer = Anaximander::Renderer.new(color: false)
51
+ renderer.draw(tree.root)
52
+
53
+ expect(stdout).to eq <<-TREE
54
+ └── #{domain}/index.html ["/main.css", "/application.js"]
55
+ ├── #{domain}/pricing.html ["/main.css", "/application.js"]
56
+ │ ├── #{domain}/pricing/low.html ["/main.css"]
57
+ │ ├── #{domain}/pricing/medium.html ["/main.css"]
58
+ │ └── #{domain}/pricing/high.html ["/main.css"]
59
+ │ ├── #{domain}/pricing/high/sortof_high.html ["/main.css"]
60
+ │ └── #{domain}/pricing/high/super_high.html ["/main.css"]
61
+ └── #{domain}/features.html ["/main.css", "/application.js"]
62
+ TREE
63
+ end
64
+ end
@@ -0,0 +1,45 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Url do
4
+ it "exposes the scheme and host as base" do
5
+ base = described_class.new("http://example.com/foo/bar?baz=1").base
6
+ expect(base).to eq "http://example.com"
7
+ end
8
+
9
+ it "joins other Urls" do
10
+ url = described_class.new("http://example.com").join(described_class.new("/foo"))
11
+ expect(url).to eq described_class.new("http://example.com/foo")
12
+ end
13
+
14
+ it "joins with a String url" do
15
+ url = described_class.new("http://example.com").join("/foo")
16
+ expect(url).to eq described_class.new("http://example.com/foo")
17
+ end
18
+
19
+ it "creates the absolute path when representing a relative path" do
20
+ url = described_class.new("/foo")
21
+ expect(url.absolute("http://example.com")).to eq described_class.new("http://example.com/foo")
22
+ end
23
+
24
+ it "does not create the absolute path if already representing one" do
25
+ url = described_class.new("http://example.com/foo")
26
+ expect(url.absolute("http://example.net")).to eq described_class.new("http://example.com/foo")
27
+ end
28
+
29
+ it "removes the fragment" do
30
+ url = described_class.new("http://example.com/#one-page-app")
31
+ expect(url.without_fragment).to eq described_class.new("http://example.com/")
32
+ end
33
+
34
+ it "is comparable to other Urls" do
35
+ url1 = described_class.new("http://example.com")
36
+ url2 = described_class.new("http://example.com")
37
+ expect(url1).to eq(url2)
38
+ end
39
+
40
+ it "is comparable to a string URL" do
41
+ url1 = described_class.new("http://example.com")
42
+ url2 = "http://example.com"
43
+ expect(url1 == url2).to eq true
44
+ end
45
+ end
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="main.css" type="text/css">
5
+ <link rel="stylesheet" href="other.css" type="text/css">
6
+ <script type="text/javascript" src="allthethings.js"></script>
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com">Example</a>
10
+ <a href="/google">Google</a>
11
+ <blink>OMG A BLINK TAG</blink>
12
+ <a href="http://example.com">Duplicate Example</a>
13
+ <a href="http://example.net">Different Domain</a>
14
+ <a href="#">Some javascript thing</a>
15
+ <marquee>TODAY, IN NEWS!</marquee>
16
+ </body>
17
+ </html>
@@ -0,0 +1,6 @@
1
+ use Rack::Static, root: "public"
2
+ run lambda { |env|
3
+ page = env["PATH_INFO"].empty? ? "index.html" : env["PATH_INFO"]
4
+ file = File.open("public/#{page}", File::RDONLY)
5
+ [200, {"Content-Type" => "text/html"}, file]
6
+ }
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <script type="text/javascript" src="/application.js"></script>
13
+ </body>
14
+ </html>
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <script type="text/javascript" src="/application.js"></script>
13
+ </body>
14
+ </html>
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>SORTOF HIGH</h1>
13
+ </body>
14
+ </html>
15
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>SUPER HIGH</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,19 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>High</h1>
13
+
14
+ <a href="/pricing/high/sortof_high.html">Sortof High</a>
15
+ <a href="/pricing/high/super_high.html">Super High</a>
16
+ </body>
17
+ </html>
18
+
19
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>Low</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>Medium</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,21 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <ul>
13
+ <li><a href="/pricing/low.html">$0</a>
14
+ <li><a href="/pricing/medium.html">$10</a>
15
+ <li><a href="/pricing/high.html">$20</a>
16
+ </ul>
17
+
18
+ <script type="text/javascript" src="/application.js"></script>
19
+ </body>
20
+ </html>
21
+
@@ -0,0 +1,40 @@
1
+ require "socket"
2
+ require "net/http"
3
+ require "fakeout/safe"
4
+ require "anaximander"
5
+
6
+ RSpec.configure do |c|
7
+ c.before do
8
+ Anaximander.logger = false
9
+ end
10
+
11
+ c.before :each, :endtoend do
12
+ @port = obtain_port
13
+ @pid = Process.spawn({}, "rackup -p #{@port}", chdir: File.expand_path("../data/site", __FILE__), out: "/dev/null", err: "/dev/null")
14
+ sleep 0.1 until server_running?
15
+ end
16
+
17
+ c.after :each, :endtoend do
18
+ Process.kill("INT", @pid)
19
+ end
20
+
21
+ # TCPServer, given a port of 0, will ask the OS for a
22
+ # random, available, port.
23
+ #
24
+ def obtain_port
25
+ server = TCPServer.new("127.0.0.1", 0)
26
+ port = server.addr[1]
27
+ server.close
28
+ port
29
+ end
30
+
31
+ # Try to retrieve the homepage of the test site.
32
+ #
33
+ def server_running?
34
+ http = Net::HTTP.new("localhost", @port)
35
+ req = Net::HTTP::Get.new("/index.html")
36
+ http.request(req).is_a?(Net::HTTPOK)
37
+ rescue Errno::ECONNREFUSED
38
+ false
39
+ end
40
+ end
metadata ADDED
@@ -0,0 +1,208 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: anaximander
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Matte Noble
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: colorize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rack
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec-autotest
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: fakeout
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Web scraper that collects assets and links.
126
+ email:
127
+ - me@mattenoble.com
128
+ executables:
129
+ - mapgen
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".rspec"
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - anaximander.gemspec
140
+ - bin/mapgen
141
+ - lib/anaximander.rb
142
+ - lib/anaximander/cli.rb
143
+ - lib/anaximander/crawler.rb
144
+ - lib/anaximander/discovery/assets.rb
145
+ - lib/anaximander/discovery/links.rb
146
+ - lib/anaximander/page.rb
147
+ - lib/anaximander/renderer.rb
148
+ - lib/anaximander/url.rb
149
+ - lib/anaximander/version.rb
150
+ - spec/anaximander/crawler_spec.rb
151
+ - spec/anaximander/discovery/assets_spec.rb
152
+ - spec/anaximander/discovery/links_spec.rb
153
+ - spec/anaximander/page_spec.rb
154
+ - spec/anaximander/renderer_spec.rb
155
+ - spec/anaximander/url_spec.rb
156
+ - spec/data/page.html
157
+ - spec/data/site/config.ru
158
+ - spec/data/site/public/features.html
159
+ - spec/data/site/public/index.html
160
+ - spec/data/site/public/pricing.html
161
+ - spec/data/site/public/pricing/high.html
162
+ - spec/data/site/public/pricing/high/sortof_high.html
163
+ - spec/data/site/public/pricing/high/super_high.html
164
+ - spec/data/site/public/pricing/low.html
165
+ - spec/data/site/public/pricing/medium.html
166
+ - spec/spec_helper.rb
167
+ homepage:
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
171
+ post_install_message:
172
+ rdoc_options: []
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ required_rubygems_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ requirements: []
186
+ rubyforge_project:
187
+ rubygems_version: 2.2.2
188
+ signing_key:
189
+ specification_version: 4
190
+ summary: Web scraper that collects assets and links.
191
+ test_files:
192
+ - spec/anaximander/crawler_spec.rb
193
+ - spec/anaximander/discovery/assets_spec.rb
194
+ - spec/anaximander/discovery/links_spec.rb
195
+ - spec/anaximander/page_spec.rb
196
+ - spec/anaximander/renderer_spec.rb
197
+ - spec/anaximander/url_spec.rb
198
+ - spec/data/page.html
199
+ - spec/data/site/config.ru
200
+ - spec/data/site/public/features.html
201
+ - spec/data/site/public/index.html
202
+ - spec/data/site/public/pricing.html
203
+ - spec/data/site/public/pricing/high.html
204
+ - spec/data/site/public/pricing/high/sortof_high.html
205
+ - spec/data/site/public/pricing/high/super_high.html
206
+ - spec/data/site/public/pricing/low.html
207
+ - spec/data/site/public/pricing/medium.html
208
+ - spec/spec_helper.rb