anaximander 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 009aaa84665dcd8497a59b9e47acef29008613ea
4
+ data.tar.gz: 956c8d4a45e84443af161c5b4c78c4c33f8a54fb
5
+ SHA512:
6
+ metadata.gz: af827976b24b5f734391e3229a2c6fbd2a05ea012b7766517367d547beca19964bacfd59e96c3a857d12a2ae18d071c6c34f70fe2a7e6354d639b6baab0aa865
7
+ data.tar.gz: 4e49ba0bb2b67931ac4826f3e2e1913c3ae32885af892226c563488ff069e044c3f36de3c799885381a2581f53f96bb81cb421563874134195f6b9ae328313cd
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Matte Noble
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Anaximander
2
+
3
+ Anaximander is a small library for crawling a website and rendering the
4
+ resulting site map to the console.
5
+
6
+ ## Installation & Usage
7
+
8
+ ```sh
9
+ gem install anaximander
10
+ ```
11
+
12
+ ```sh
13
+ mapgen <url>
14
+ ```
15
+
16
+ ## Running Tests
17
+
18
+ ```sh
19
+ bundle install
20
+ bundle exec rspec spec
21
+ ```
22
+
23
+ ### End to End Tests
24
+
25
+ There are two tests marked with the tag `endtoend`. These tests start up
26
+ a Rack app which serves a simple website and run against that server
27
+ like the library would in "production". Think of them as the Acceptance
28
+ tests for a library.
29
+
30
+ The end to end tests are run by default. To exclude them:
31
+
32
+ ```sh
33
+ bundle exec rspec spec --tag ~endtoend
34
+ ```
35
+
36
+ ## What does Anaximander mean?
37
+
38
+ Anaximander was a Greek cartographer who was the first person to try to
39
+ map the entire world.
40
+
41
+ ## Contributing
42
+
43
+ 1. Fork it ( https://github.com/[my-github-username]/anaximander/fork )
44
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
45
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
46
+ 4. Push to the branch (`git push origin my-new-feature`)
47
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'anaximander/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "anaximander"
8
+ spec.version = Anaximander::VERSION
9
+ spec.authors = ["Matte Noble"]
10
+ spec.email = ["me@mattenoble.com"]
11
+ spec.summary = %q{Web scraper that collects assets and links.}
12
+ spec.description = %q{Web scraper that collects assets and links.}
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_dependency "nokogiri"
21
+ spec.add_dependency "colorize"
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ spec.add_development_dependency "rack"
26
+ spec.add_development_dependency "rspec"
27
+ spec.add_development_dependency "rspec-autotest"
28
+ spec.add_development_dependency "fakeout"
29
+ end
data/bin/mapgen ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ require "anaximander"
3
+ require "anaximander/cli"
4
+ Anaximander.logger = false
5
+ Anaximander::CLI.start
@@ -0,0 +1,17 @@
1
+ module Anaximander
2
+ class CLI
3
+ def self.start
4
+ new(ARGV[0]).start
5
+ end
6
+
7
+ def initialize(url)
8
+ @crawler = Crawler.new(url)
9
+ @renderer = Renderer.new
10
+ end
11
+
12
+ def start
13
+ @crawler.crawl
14
+ @renderer.draw(@crawler.root)
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,31 @@
1
+ module Anaximander
2
+ class Crawler
3
+ attr_reader :url, :root
4
+
5
+ def initialize(url)
6
+ @url = url.chomp("/")
7
+ @root = Page.new(url)
8
+ @visited = [url]
9
+ end
10
+
11
+ def crawl(page=self.root)
12
+ page.children = page.links.map { |link| visit(link.chomp("/")) }.compact
13
+ page.children.each { |child| crawl(child) }
14
+ end
15
+
16
+ def visit(link)
17
+ return if @visited.include?(link)
18
+
19
+ logger.debug(link)
20
+ @visited << link
21
+
22
+ Page.new(link)
23
+ rescue Anaximander::PageNotAccessibleError
24
+ nil
25
+ end
26
+
27
+ def logger
28
+ Anaximander.logger
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,39 @@
1
+ module Anaximander
2
+ module Discovery
3
+ class Assets
4
+ include Enumerable
5
+ include Comparable
6
+ extend Forwardable
7
+ def_delegators :assets, :size, :inspect, :to_a
8
+
9
+ def initialize(page)
10
+ @page = page
11
+ end
12
+
13
+ def each(&block)
14
+ assets.each(&block)
15
+ end
16
+
17
+ def <=>(other)
18
+ to_a <=> other.to_a
19
+ end
20
+
21
+ private
22
+
23
+ attr_reader :page
24
+
25
+ def assets
26
+ css + javascript
27
+ end
28
+
29
+ def css
30
+ page.css("link").map { |link| link[:href] }.compact
31
+ end
32
+
33
+ def javascript
34
+ page.css("script").map { |script| script[:src] }.compact
35
+ end
36
+ end
37
+ end
38
+ end
39
+
@@ -0,0 +1,76 @@
1
+ module Anaximander
2
+ module Discovery
3
+
4
+ # Collection of internal links on the given page.
5
+ #
6
+ # == Relative Paths
7
+ #
8
+ # `Anaximander::Discovery::Links` converts all relative paths into absolute
9
+ # paths using the base URL of the page being crawled.
10
+ #
11
+ # # http://example.com
12
+ # <a href="/contact">Contact</a>
13
+ #
14
+ # Anaximander::Discovery::Links.new(Nokogiri::HTML(open("http://example.com")), "http://example.com")
15
+ # # => ["http://example.com/contact"]
16
+ #
17
+ # == Exclusions
18
+ #
19
+ # - External links (ones outside the domain of the page)
20
+ # - Hash links (Javascript style links with href of "#")
21
+ #
22
+ # == Example
23
+ #
24
+ # page = Nokogiri::HTML(open("http://example.com"))
25
+ #
26
+ # Anaximander::Discovery::Links.new(page, "http://example.com")
27
+ # # => ["http://www.iana.org/domains/example"]
28
+ #
29
+ class Links
30
+ include Enumerable
31
+ include Comparable
32
+ extend Forwardable
33
+ def_delegators :links, :size, :inspect, :to_a
34
+
35
+ # Parameters
36
+ #
37
+ # page [Nokogiri::HTML] Parsed html of the page.
38
+ # url [String|URI] URL of the page to discover.
39
+ #
40
+ def initialize(page, url)
41
+ @page = page
42
+ @url = Url.new(url)
43
+ end
44
+
45
+ def each(&block)
46
+ links.each(&block)
47
+ end
48
+
49
+ def <=>(other)
50
+ to_a <=> other.to_a
51
+ end
52
+
53
+ private
54
+
55
+ attr_reader :page
56
+
57
+ def links
58
+ internal_links.map(&:to_s)
59
+ end
60
+
61
+ def internal_links
62
+ all_links.select { |link| @url.base == link.base }
63
+ end
64
+
65
+ def all_links
66
+ page.css("a").map { |a| absolute(a[:href]) }.compact.uniq
67
+ end
68
+
69
+ def absolute(link)
70
+ Url.new(link).absolute(@url.base).without_fragment
71
+ rescue URI::InvalidURIError
72
+ nil
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,73 @@
1
+ module Anaximander
2
+ class Error < StandardError; end
3
+
4
+ # Raised when a page cannot be fetched.
5
+ #
6
+ class PageNotAccessibleError < Error; end
7
+
8
+ # Represents a single page of a website being crawled. Exposes the assets and
9
+ # links on the page.
10
+ #
11
+ # == Errors
12
+ #
13
+ # `Anaximander::Page` will raise a `PageNotAccessibleError` when the page cannot
14
+ # be fetched for some reason. This is often due to it not existing (404), SSL
15
+ # errors or infinite redirect loops.
16
+ #
17
+ # == Example
18
+ #
19
+ # page = Page.new("http://example.com")
20
+ # page.links # => ["http://www.iana.org/domains/example"]
21
+ # page.assets # => ["/main.css", "/default.js"]
22
+ #
23
+ class Page
24
+ include Comparable
25
+
26
+ # Absolute url of the page.
27
+ #
28
+ attr_reader :url
29
+
30
+ # Parsed Nokogiri HTML document.
31
+ #
32
+ attr_reader :html
33
+
34
+ # Collection of `Page` objects that are linked
35
+ # to from the current page.
36
+ #
37
+ attr_accessor :children
38
+
39
+ # Parameters
40
+ #
41
+ # [String] url URL to discover.
42
+ #
43
+ # OpenURI raises a generic RuntimeError when it cannot fetch a
44
+ # page. This can happen for a variety of reasons, such as 404s,
45
+ # SSL errors, or redirect loops.
46
+ #
47
+ # raises `PageNotAccessibleError` when OpenURI fails to fetch the
48
+ # page, for any reason.
49
+ #
50
+ def initialize(url)
51
+ @url = url
52
+ @html = Nokogiri::HTML(open(url))
53
+ rescue RuntimeError, OpenURI::HTTPError
54
+ raise PageNotAccessibleError
55
+ end
56
+
57
+ def links
58
+ Discovery::Links.new(html, url)
59
+ end
60
+
61
+ def assets
62
+ Discovery::Assets.new(html)
63
+ end
64
+
65
+ def <=>(other)
66
+ self.url <=> other.url
67
+ end
68
+
69
+ def inspect
70
+ %(#<Anaximander::Page:#{object_id} url="#{url}">)
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,83 @@
1
+ require "colorize"
2
+
3
+ module Anaximander
4
+ # Draws the crawled tree of URLs and assets, generated by `Anaximander::Crawler`.
5
+ #
6
+ # == Output
7
+ #
8
+ # └── <url> [assets]
9
+ # ├── <url> [assets]
10
+ # │ └── <url> [assets]
11
+ # └── <url> [assets]
12
+ #
13
+ # == Example
14
+ #
15
+ # root
16
+ # # => #<Anaximander::Page url="http://example.com"/>
17
+ #
18
+ # root.children
19
+ # # => [#<Anaximander::Page url="http://example.com/foo"/>, #<Anaximander::Page url="http://example.com/bar"/>]
20
+ #
21
+ # renderer = Anaximander::Renderer.new
22
+ # renderer.draw(root)
23
+ #
24
+ # # => └── http://example.com [main.css]
25
+ # # => ├── http://example.com/foo [main.css, foo.js]
26
+ # # => └── http://example.com/bar [main.css, bar.js]
27
+ #
28
+ class Renderer
29
+ VERTICAL_PIPE = "│ "
30
+ MEMBER_PIPE = "├── "
31
+ TAIL_PIPE = "└── "
32
+ SPACE_PIPE = " "
33
+
34
+ attr_reader :root
35
+
36
+ def initialize(options={})
37
+ @color = options.fetch(:color, true)
38
+ end
39
+
40
+ # Draws a page URL, its assets and recursively does the same for
41
+ # all of its children.
42
+ #
43
+ # == Parameters
44
+ #
45
+ # [Anaximander::Page] page The page to render
46
+ # [String] prefix A string that should precede the actual URL
47
+ # and asset information.
48
+ # [Boolean] tail Is this node the last one in the collection.
49
+ #
50
+ def draw(page=self.root, prefix="", tail=true)
51
+ pipe = tail ? TAIL_PIPE : MEMBER_PIPE
52
+
53
+ url = "#{prefix}#{pipe}#{page.url} "
54
+ assets = "#{page.assets.to_a}"
55
+ assets = assets.colorize(:light_black) if @color
56
+
57
+ print url
58
+ puts assets
59
+
60
+ page.children[0..-2].each { |child| draw_child(child, prefix, tail) }
61
+ draw_tail(page.children.last, prefix, tail) if page.children.size >= 1
62
+ end
63
+
64
+ # Draws a child node, with the appropriate "connecting pipe".
65
+ #
66
+ # The "connecting pipe" is the character at the beginning of this line,
67
+ # which connects this to the previous tier of the tree.
68
+ #
69
+ def draw_child(page, prefix, parent_is_tail)
70
+ connecting_pipe = parent_is_tail ? SPACE_PIPE : VERTICAL_PIPE
71
+ draw(page, "#{prefix}#{connecting_pipe}", false)
72
+ end
73
+
74
+ # Draws the last child node, with the appropriate "connecting pipe".
75
+ #
76
+ # See `draw_child` for "connecting pipe" definition.
77
+ #
78
+ def draw_tail(page, prefix, parent_is_tail)
79
+ connecting_pipe = parent_is_tail ? SPACE_PIPE : VERTICAL_PIPE
80
+ draw(page, "#{prefix}#{connecting_pipe}", true)
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,38 @@
1
+ module Anaximander
2
+ class Url < SimpleDelegator
3
+ include Comparable
4
+
5
+ attr_reader :uri
6
+
7
+ def initialize(uri)
8
+ @uri = URI(uri.to_s)
9
+ super(@uri)
10
+ end
11
+
12
+ def base
13
+ domain = "#{scheme}://#{host}"
14
+ domain += ":#{port}" unless port == 80
15
+ domain
16
+ end
17
+
18
+ def join(url)
19
+ self.class.new(URI.join(self.uri, url.to_s))
20
+ end
21
+
22
+ def absolute(base)
23
+ absolute? ? self : Url.new(base).join(self)
24
+ end
25
+
26
+ def without_fragment
27
+ self.class.new(self).tap { |url| url.fragment = nil }
28
+ end
29
+
30
+ def <=>(other)
31
+ other.respond_to?(:uri) ? self.uri <=> other.uri : self.uri.to_s <=> other
32
+ end
33
+
34
+ def eql?(other)
35
+ self.uri.eql?(other.uri)
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,3 @@
1
+ module Anaximander
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,24 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+ require "forwardable"
4
+ require "delegate"
5
+ require "uri"
6
+ require "logger"
7
+
8
+ require "anaximander/version"
9
+ require "anaximander/url"
10
+ require "anaximander/page"
11
+ require "anaximander/crawler"
12
+ require "anaximander/renderer"
13
+ require "anaximander/discovery/links"
14
+ require "anaximander/discovery/assets"
15
+
16
+ module Anaximander
17
+ def self.logger=(out)
18
+ @logger = Logger.new(out)
19
+ end
20
+
21
+ def self.logger
22
+ @logger ||= Logger.new(STDOUT)
23
+ end
24
+ end
@@ -0,0 +1,64 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Crawler do
4
+ let(:link) { "http://example.com" }
5
+ subject!(:crawler) { described_class.new(link) }
6
+
7
+ it "does not visit pages that have already been visited" do
8
+ crawler.visit(link)
9
+ expect(crawler.visit(link)).to be_nil
10
+ end
11
+
12
+ it "does not visit pages that do not exist" do
13
+ not_a_page = "http://example.com/definitelynotapage"
14
+ expect(crawler.visit(not_a_page)).to be_nil
15
+ end
16
+
17
+ it "visits links breadth-first" do
18
+ allow_any_instance_of(Anaximander::Page).to receive(:open).and_return("")
19
+
20
+ root = Anaximander::Page.new("http://example.com")
21
+ pricing = Anaximander::Page.new("http://example.com/pricing")
22
+ features = Anaximander::Page.new("http://example.com/features")
23
+
24
+ allow(Anaximander::Page).to receive(:new).with("http://example.com/pricing").and_return(pricing)
25
+ allow(Anaximander::Page).to receive(:new).with("http://example.com/features").and_return(features)
26
+
27
+ allow(root).to receive_messages(links: ["http://example.com/pricing", "http://example.com/features"])
28
+ allow(pricing).to receive_messages(links: ["http://example.com/features"])
29
+
30
+ crawler.crawl(root)
31
+ expect(root.children).to eq [pricing, features]
32
+ end
33
+
34
+ it "crawls a multi-tiered website", :endtoend do
35
+ # See `spec/data/site` for the website hierarchy; it matches
36
+ # the directory structure.
37
+
38
+ crawler = described_class.new("http://localhost:#{@port}/index.html")
39
+ crawler.crawl
40
+
41
+ root = crawler.root
42
+
43
+ pricing = root.children[0]
44
+ features = root.children[1]
45
+
46
+ pricing_low = pricing.children[0]
47
+ pricing_med = pricing.children[1]
48
+ pricing_high = pricing.children[2]
49
+
50
+ sortof_high = pricing_high.children[0]
51
+ super_high = pricing_high.children[1]
52
+
53
+ expect(root.children.size).to eq 2
54
+ expect(root.children).to eq [pricing, features]
55
+
56
+ expect(pricing.children.size).to eq 3
57
+ expect(pricing.children).to eq [pricing_low, pricing_med, pricing_high]
58
+
59
+ expect(pricing_high.children.size).to eq 2
60
+ expect(pricing_high.children).to eq [sortof_high, super_high]
61
+
62
+ expect(features.children.size).to eq 0
63
+ end
64
+ end
@@ -0,0 +1,12 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Discovery::Assets do
4
+ let(:html) { Nokogiri::HTML(open("spec/data/page.html")) }
5
+ let(:assets) { described_class.new(html) }
6
+
7
+ # See `spec/data/page.html` for the HTML used in this test.
8
+
9
+ it "collects all CSS and Javascript assets" do
10
+ expect(assets.to_a).to eq(["main.css", "other.css", "allthethings.js"])
11
+ end
12
+ end
@@ -0,0 +1,33 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Discovery::Links do
4
+ let(:html) { Nokogiri::HTML(open("spec/data/page.html")) }
5
+ let(:links) { described_class.new(html, "http://example.com/") }
6
+
7
+ # See `spec/data/page.html` for the HTML used in this test.
8
+
9
+ it "collects each unique link" do
10
+ expect(links.size).to eq(2)
11
+ end
12
+
13
+ it "only collects links within the same domain" do
14
+ expect(links).to_not include "http://example.net"
15
+ end
16
+
17
+ it "removes trailing slashes from URLs" do
18
+ expect(links).to_not include "http://example.com/"
19
+ end
20
+
21
+ it "disgards hash links" do
22
+ links.each { |link| expect(link).to_not include "#" }
23
+ end
24
+
25
+ it "expands relative paths to absolute paths" do
26
+ expect(links).to include "http://example.com/google"
27
+ end
28
+
29
+ it "expands relative paths to absolute paths using the base url of the page" do
30
+ links = described_class.new(html, "http://example.com/some/nested/page/")
31
+ expect(links).to_not include "http://example.com/some/nested/page/google"
32
+ end
33
+ end
@@ -0,0 +1,24 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Page do
4
+ let(:html) { File.read(File.expand_path("../../data/page.html", __FILE__)) }
5
+ let(:page) { described_class.new("http://example.com") }
6
+
7
+ before do
8
+ expect_any_instance_of(described_class).to receive(:open).and_return(html)
9
+ end
10
+
11
+ it "has unique links" do
12
+ expect(Anaximander::Discovery::Links).to receive(:new).with(an_instance_of(Nokogiri::HTML::Document), "http://example.com")
13
+ page.links
14
+ end
15
+
16
+ it "has assets" do
17
+ expect(Anaximander::Discovery::Assets).to receive(:new).with(an_instance_of(Nokogiri::HTML::Document))
18
+ page.assets
19
+ end
20
+
21
+ it "is comparable by URL" do
22
+ expect(page).to eq(page.clone)
23
+ end
24
+ end
@@ -0,0 +1,64 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Renderer do
4
+ include Fakeout::SpecHelpers
5
+
6
+ let(:child) { double(url: "http://example.com/foo", assets: ["/root.css", "/child.js"], children: []) }
7
+ let(:root) { double(url: "http://example.com", assets: ["/root.css"], children: [child]) }
8
+
9
+ subject(:renderer) { described_class.new(color: false) }
10
+
11
+ before :all do
12
+ Fakeout.activate!
13
+ end
14
+
15
+ after :all do
16
+ Fakeout.deactivate!
17
+ end
18
+
19
+ it "draws a root node" do
20
+ renderer.draw(root)
21
+ expect(stdout).to include %(└── http://example.com ["/root.css"])
22
+ end
23
+
24
+ it "draws a child node whose parent is not a tail" do
25
+ renderer.draw_child(child, "", false)
26
+ expect(stdout).to include %(│ ├── http://example.com/foo ["/root.css", "/child.js"])
27
+ end
28
+
29
+ it "draws a child node whose parent is a tail" do
30
+ renderer.draw_child(child, "", true)
31
+ expect(stdout).to include %( ├── http://example.com/foo ["/root.css", "/child.js"])
32
+ end
33
+
34
+ it "draws a tail node whose parent is not a tail node" do
35
+ renderer.draw_tail(child, "", false)
36
+ expect(stdout).to include %(│ └── http://example.com/foo ["/root.css", "/child.js"])
37
+ end
38
+
39
+ it "draws a tail node whose parent is also a tail" do
40
+ renderer.draw_tail(child, "", true)
41
+ expect(stdout).to include %( └── http://example.com/foo ["/root.css", "/child.js"])
42
+ end
43
+
44
+ it "draws an entire tree", :endtoend do
45
+ tree = Anaximander::Crawler.new("http://localhost:#{@port}/index.html")
46
+ tree.crawl
47
+
48
+ domain = Anaximander::Url.new(tree.root.url).base
49
+
50
+ renderer = Anaximander::Renderer.new(color: false)
51
+ renderer.draw(tree.root)
52
+
53
+ expect(stdout).to eq <<-TREE
54
+ └── #{domain}/index.html ["/main.css", "/application.js"]
55
+ ├── #{domain}/pricing.html ["/main.css", "/application.js"]
56
+ │ ├── #{domain}/pricing/low.html ["/main.css"]
57
+ │ ├── #{domain}/pricing/medium.html ["/main.css"]
58
+ │ └── #{domain}/pricing/high.html ["/main.css"]
59
+ │ ├── #{domain}/pricing/high/sortof_high.html ["/main.css"]
60
+ │ └── #{domain}/pricing/high/super_high.html ["/main.css"]
61
+ └── #{domain}/features.html ["/main.css", "/application.js"]
62
+ TREE
63
+ end
64
+ end
@@ -0,0 +1,45 @@
1
+ require "spec_helper"
2
+
3
+ describe Anaximander::Url do
4
+ it "exposes the scheme and host as base" do
5
+ base = described_class.new("http://example.com/foo/bar?baz=1").base
6
+ expect(base).to eq "http://example.com"
7
+ end
8
+
9
+ it "joins other Urls" do
10
+ url = described_class.new("http://example.com").join(described_class.new("/foo"))
11
+ expect(url).to eq described_class.new("http://example.com/foo")
12
+ end
13
+
14
+ it "joins with a String url" do
15
+ url = described_class.new("http://example.com").join("/foo")
16
+ expect(url).to eq described_class.new("http://example.com/foo")
17
+ end
18
+
19
+ it "creates the absolute path when representing a relative path" do
20
+ url = described_class.new("/foo")
21
+ expect(url.absolute("http://example.com")).to eq described_class.new("http://example.com/foo")
22
+ end
23
+
24
+ it "does not create the absolute path if already representing one" do
25
+ url = described_class.new("http://example.com/foo")
26
+ expect(url.absolute("http://example.net")).to eq described_class.new("http://example.com/foo")
27
+ end
28
+
29
+ it "removes the fragment" do
30
+ url = described_class.new("http://example.com/#one-page-app")
31
+ expect(url.without_fragment).to eq described_class.new("http://example.com/")
32
+ end
33
+
34
+ it "is comparable to other Urls" do
35
+ url1 = described_class.new("http://example.com")
36
+ url2 = described_class.new("http://example.com")
37
+ expect(url1).to eq(url2)
38
+ end
39
+
40
+ it "is comparable to a string URL" do
41
+ url1 = described_class.new("http://example.com")
42
+ url2 = "http://example.com"
43
+ expect(url1 == url2).to eq true
44
+ end
45
+ end
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="main.css" type="text/css">
5
+ <link rel="stylesheet" href="other.css" type="text/css">
6
+ <script type="text/javascript" src="allthethings.js"></script>
7
+ </head>
8
+ <body>
9
+ <a href="http://example.com">Example</a>
10
+ <a href="/google">Google</a>
11
+ <blink>OMG A BLINK TAG</blink>
12
+ <a href="http://example.com">Duplicate Example</a>
13
+ <a href="http://example.net">Different Domain</a>
14
+ <a href="#">Some javascript thing</a>
15
+ <marquee>TODAY, IN NEWS!</marquee>
16
+ </body>
17
+ </html>
@@ -0,0 +1,6 @@
1
+ use Rack::Static, root: "public"
2
+ run lambda { |env|
3
+ page = env["PATH_INFO"].empty? ? "index.html" : env["PATH_INFO"]
4
+ file = File.open("public/#{page}", File::RDONLY)
5
+ [200, {"Content-Type" => "text/html"}, file]
6
+ }
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <script type="text/javascript" src="/application.js"></script>
13
+ </body>
14
+ </html>
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <script type="text/javascript" src="/application.js"></script>
13
+ </body>
14
+ </html>
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>SORTOF HIGH</h1>
13
+ </body>
14
+ </html>
15
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>SUPER HIGH</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,19 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>High</h1>
13
+
14
+ <a href="/pricing/high/sortof_high.html">Sortof High</a>
15
+ <a href="/pricing/high/super_high.html">Super High</a>
16
+ </body>
17
+ </html>
18
+
19
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>Low</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,16 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <h1>Medium</h1>
13
+ </body>
14
+ </html>
15
+
16
+
@@ -0,0 +1,21 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link rel="stylesheet" href="/main.css" type="text/css" media="screen" charset="utf-8">
5
+ </head>
6
+ <body>
7
+ <nav>
8
+ <a href="/pricing.html">Pricing</a>
9
+ <a href="/features.html">Features</a>
10
+ </nav>
11
+
12
+ <ul>
13
+ <li><a href="/pricing/low.html">$0</a>
14
+ <li><a href="/pricing/medium.html">$10</a>
15
+ <li><a href="/pricing/high.html">$20</a>
16
+ </ul>
17
+
18
+ <script type="text/javascript" src="/application.js"></script>
19
+ </body>
20
+ </html>
21
+
@@ -0,0 +1,40 @@
1
+ require "socket"
2
+ require "net/http"
3
+ require "fakeout/safe"
4
+ require "anaximander"
5
+
6
+ RSpec.configure do |c|
7
+ c.before do
8
+ Anaximander.logger = false
9
+ end
10
+
11
+ c.before :each, :endtoend do
12
+ @port = obtain_port
13
+ @pid = Process.spawn({}, "rackup -p #{@port}", chdir: File.expand_path("../data/site", __FILE__), out: "/dev/null", err: "/dev/null")
14
+ sleep 0.1 until server_running?
15
+ end
16
+
17
+ c.after :each, :endtoend do
18
+ Process.kill("INT", @pid)
19
+ end
20
+
21
+ # TCPServer, given a port of 0, will ask the OS for a
22
+ # random available port.
23
+ #
24
+ def obtain_port
25
+ server = TCPServer.new("127.0.0.1", 0)
26
+ port = server.addr[1]
27
+ server.close
28
+ port
29
+ end
30
+
31
+ # Try to retrieve the homepage of the test site.
32
+ #
33
+ def server_running?
34
+ http = Net::HTTP.new("localhost", @port)
35
+ req = Net::HTTP::Get.new("/index.html")
36
+ http.request(req).is_a?(Net::HTTPOK)
37
+ rescue Errno::ECONNREFUSED
38
+ false
39
+ end
40
+ end
metadata ADDED
@@ -0,0 +1,208 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: anaximander
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Matte Noble
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: colorize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rack
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec-autotest
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: fakeout
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Web scraper that collects assets and links.
126
+ email:
127
+ - me@mattenoble.com
128
+ executables:
129
+ - mapgen
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".rspec"
135
+ - Gemfile
136
+ - LICENSE.txt
137
+ - README.md
138
+ - Rakefile
139
+ - anaximander.gemspec
140
+ - bin/mapgen
141
+ - lib/anaximander.rb
142
+ - lib/anaximander/cli.rb
143
+ - lib/anaximander/crawler.rb
144
+ - lib/anaximander/discovery/assets.rb
145
+ - lib/anaximander/discovery/links.rb
146
+ - lib/anaximander/page.rb
147
+ - lib/anaximander/renderer.rb
148
+ - lib/anaximander/url.rb
149
+ - lib/anaximander/version.rb
150
+ - spec/anaximander/crawler_spec.rb
151
+ - spec/anaximander/discovery/assets_spec.rb
152
+ - spec/anaximander/discovery/links_spec.rb
153
+ - spec/anaximander/page_spec.rb
154
+ - spec/anaximander/renderer_spec.rb
155
+ - spec/anaximander/url_spec.rb
156
+ - spec/data/page.html
157
+ - spec/data/site/config.ru
158
+ - spec/data/site/public/features.html
159
+ - spec/data/site/public/index.html
160
+ - spec/data/site/public/pricing.html
161
+ - spec/data/site/public/pricing/high.html
162
+ - spec/data/site/public/pricing/high/sortof_high.html
163
+ - spec/data/site/public/pricing/high/super_high.html
164
+ - spec/data/site/public/pricing/low.html
165
+ - spec/data/site/public/pricing/medium.html
166
+ - spec/spec_helper.rb
167
+ homepage:
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
171
+ post_install_message:
172
+ rdoc_options: []
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ required_rubygems_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ requirements: []
186
+ rubyforge_project:
187
+ rubygems_version: 2.2.2
188
+ signing_key:
189
+ specification_version: 4
190
+ summary: Web scraper that collects assets and links.
191
+ test_files:
192
+ - spec/anaximander/crawler_spec.rb
193
+ - spec/anaximander/discovery/assets_spec.rb
194
+ - spec/anaximander/discovery/links_spec.rb
195
+ - spec/anaximander/page_spec.rb
196
+ - spec/anaximander/renderer_spec.rb
197
+ - spec/anaximander/url_spec.rb
198
+ - spec/data/page.html
199
+ - spec/data/site/config.ru
200
+ - spec/data/site/public/features.html
201
+ - spec/data/site/public/index.html
202
+ - spec/data/site/public/pricing.html
203
+ - spec/data/site/public/pricing/high.html
204
+ - spec/data/site/public/pricing/high/sortof_high.html
205
+ - spec/data/site/public/pricing/high/super_high.html
206
+ - spec/data/site/public/pricing/low.html
207
+ - spec/data/site/public/pricing/medium.html
208
+ - spec/spec_helper.rb