RubyGems - sitemap-generator - Versions diffs - 0.0.1 - Mend

sitemap-generator 0.0.1

Files changed (17) hide show

checksums.yaml +7 -0
data/Gemfile +4 -0
data/Gemfile.lock +27 -0
data/README.md +36 -0
data/Rakefile +6 -0
data/Vagrantfile +18 -0
data/bin/sitemap +10 -0
data/lib/sitemap/command.rb +61 -0
data/lib/sitemap/commands/sitemap.rb +178 -0
data/lib/sitemap/filters/filters.rb +203 -0
data/lib/sitemap/logging.rb +25 -0
data/lib/sitemap/version.rb +3 -0
data/sitemap-generator.gemspec +27 -0
data/spec/filter_spec.rb +142 -0
data/spec/generator_spec.rb +68 -0
data/spec/spec_helper.rb +1 -0
metadata +147 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
+  data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
+SHA512:
+  metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
+  data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in sitemap-generator.gemspec
+gemspec

data/Gemfile.lock ADDED

@@ -0,0 +1,27 @@
+PATH
+  remote: .
+  specs:
+    sitemap-analyser (0.0.1)
+      clamp
+      json
+      log4r
+      nokogiri
+GEM
+  remote: https://rubygems.org/
+  specs:
+    clamp (0.6.3)
+    json (1.8.1)
+    log4r (1.1.10)
+    mini_portile (0.6.0)
+    nokogiri (1.6.2.1)
+      mini_portile (= 0.6.0)
+    rake (10.3.2)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.3)
+  rake
+  sitemap-analyser!

data/README.md ADDED

@@ -0,0 +1,36 @@
+# Sitemap Generator
+A simple command-line Sitemap generator tool. Useful for quickly auditing a website.
+## Getting started
+    git clone https://github.com/mefellows/sitemap-generator
+    cd sitemap-generator
+### Generate a standard CSV Sitemap file
+The following command generates a basic sitemap by recursively following every link on the site, keeping only URIs on the specified domain name (in this case, onegeek.com.au), and saves it to a file named sitemap.csv.
+    bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
+### Generate a Sitemap in JSON format
+JSON output is printed to standard output rather than written to a file, so redirect it:
+    bin/sitemap generate --format=json http://www.onegeek.com.au/ > sitemap.json
+### Generate a Sitemap restricting to the URI provided
+    bin/sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
+### Generate a Sitemap restricting indexed URLs to only those starting with '/journal' (planned: the `--restrict-path` option is not implemented yet)
+    bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
+## Getting Help
+    bin/sitemap
+    bin/sitemap generate --help
+## Alternatives?
+So of course, after spending an hour writing this, I remembered that wget can already do this for you, well, basically anyway:
+    wget -r --delete-after <todo>

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/Vagrantfile ADDED

@@ -0,0 +1,18 @@
+Vagrant.configure("2") do |config|
+  config.vm.define "centos-64-x64-vbox4210" do |v|
+    v.vm.box = "centos-64-x64-vbox4210"
+    v.vm.hostname = "centos"
+    v.vm.box_url = "http://puppet-vagrant-boxes.puppetlabs.com/centos-64-x64-vbox4210.box"
+    config.vm.network "forwarded_port", guest: 80, host: 8081
+  end
+  #config.vm.synced_folder "vendor/melbourneitdev/libmit", "/mit"
+  config.vm.provider :virtualbox do |vb|
+    vb.customize ["modifyvm", :id, "--memory", "256"]
+  end
+  config.vm.provision :shell, :path => "vagrant/shell/bootstrap.sh"
+end

data/bin/sitemap ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+$: << File.join(File.dirname(__FILE__), "..", "lib")
+require "sitemap/command"
+# Setup the app to work from a base Runner/Command etc.
+# exit(Sitemap::MainCommand.run || 0)
+Sitemap::MainCommand.run

data/lib/sitemap/command.rb ADDED

@@ -0,0 +1,61 @@
+require 'sitemap/version'
+require 'sitemap/logging'
+require 'sitemap/commands/sitemap'
+require 'clamp'
+module Sitemap
+  class AbstractCommand < Clamp::Command
+    include Logging
+    option ["-v", "--verbose"], :flag, "be verbose"
+    option "--version", :flag, "show version" do
+      puts "Sitemap Analyser " + Sitemap::VERSION
+      exit(0)
+    end
+  end
+  class SitemapCommand < AbstractCommand
+      option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
+      option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
+      option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
+        Integer(s)
+      end
+      # option "--restrict-path", "restrict-path", "Restrict links not on supplied path", :attribute_name => :restrict_path, :multivalued => true
+      # --follow-redirects, "follow", "Ignore redirects?"
+      # --include-resources, "include resources", "Follows links to static resources such as images, videos etc."
+      parameter "uri", "URI base to fetch URLs from", :attribute_name => :uri do |u|
+        begin
+          parsed_uri = URI::parse(u)
+          parsed_uri
+        rescue
+          puts "Invalid URI provided"
+          exit(0)
+        end
+      end
+      parameter "[output_file]", "Output file", :attribute_name => :output_file
+    def execute
+      if !format.eql?('json') && output_file.nil?
+        signal_usage_error "'output_file' parameter must be provided if format is not JSON."
+        exit(0)
+      end
+      real_depth = depth
+      if no_recursion?
+        log.debug("Recursion disabled, setting depth to 1")
+        real_depth = 1
+      end
+      log.info('Running sitemap generator')
+      generator = SitemapGenerator.new()
+      generator.generate(uri, output_file, format, real_depth)
+    end
+  end
+  class MainCommand < AbstractCommand
+    subcommand "generate", "Generate a sitemap", Sitemap::SitemapCommand
+  end
+end

data/lib/sitemap/commands/sitemap.rb ADDED

@@ -0,0 +1,178 @@
+require 'sitemap/logging'
+require 'sitemap/filters/filters'
+require 'csv'
+require 'json'
+require 'nokogiri'
+require 'open-uri'
+require 'net/http'
+class SitemapGenerator
+  include Logging
+  def initialize()
+    log.debug('Initialising generator')
+  end
+  #
+  # Public: Output the index to JSON
+  #
+  def write_index_to_json(index)
+    puts JSON::generate(index)
+  end
+  #
+  # Public: Write a Sitemap index to file
+  #
+  def write_index_to_file(index, output_file)
+    csv = CSV.open(output_file, 'wb')
+    csv << ['URI', 'Title']
+    # Flush Sitemap to CSV
+    index.each do |key, value|
+      csv << [key, value['title']]
+    end
+  end
+  #
+  # Public: Create the index recursively.
+  #
+  # link       - The URI to build the index from recursively.
+  # base_uri   - The base URI (Host) to restrict which links are indexed
+  # restrict   - An array of URIs used to restrict which URIs are indexed.
+  #              all indexed URIs will include one of these paths.
+  # link_index - Any index to start the build from.
+  # depth      - The depth of recursion. 1 for no recursion, -1 for infinite. > 1 for specific depth
+  #
+  # Returns an index containing URIs as keys and an object representing the page.
+  #
+  def create_index(link, base_uri, filters, link_index = nil, depth = -1)
+    if link_index.nil?
+      log.debug('Creating new Index')
+      link_index = Hash.new
+    end
+    if link.nil? || base_uri.nil?
+      return
+    end
+    ### TODO: replace with generic filter method
+    if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
+      log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
+      # Only continue in this part if page NOT in index and is indexable
+      # Only fetch the document if it's not yet been indexed
+      doc = get_document(link)
+      ## All docs must be indexed, even if blacklisted...
+      if !doc.nil?
+        log.debug("New document found at #{link}, exploring links")
+        depth = depth - 1
+        # Set page title and add to index
+        link_index[link.to_s] = {'title' => doc.title}
+        log.info("Adding link to index: #{link.to_s}")
+        # Find all links on the page
+        links = []
+        doc.css('a').each do |l|
+           links << l.attributes["href"].to_s
+        end
+        # Filter out in-eligible links
+        a = Filters::Util.apply_filters(links, link_index, base_uri, filters)
+        links.each do |l|
+          l = Filters::Util.remove_fragment_from_uri(l)
+          if l && !l.empty?
+            if depth != -1
+              create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
+            end
+          end
+        end
+      end
+    end
+    link_index
+  end
+  #
+  # Public: Fetch a document the Internet.
+  #
+  def fetch(uri, domain = nil, limit = 10)
+    uri = Filters::Util.make_URI(uri)
+    if domain.nil?
+      domain = uri
+    end
+    domain = Filters::Util.make_URI(domain)
+    # You should choose a better exception.
+    raise ArgumentError, 'too many HTTP redirects' if limit == 0
+    response = Net::HTTP.get_response(uri)
+    case response
+      when Net::HTTPSuccess then
+        response.body
+      when Net::HTTPRedirection then
+        location = response['location']
+        location = Filters::Util.create_absolute_uri(location, uri)
+        log.warn("Redirecting #{uri} to new location: #{location}")
+        # Check new location belongs to current domain
+        if location.host == domain.host
+          fetch(location, uri, limit - 1)
+        elsif
+          log.warn("Redirecting from #{uri} to #{location} rejected due to cross-domain restrictions")
+        end
+        nil
+      else
+        nil
+    end
+  end
+  #
+  # Public: Fetch a document and parse it as HTML.
+  #
+  # uri - String or URI of the page.
+  #
+  # Returns the parsed Nokogiri::HTML::Document, or nil when fetching or
+  # parsing raises a StandardError (the error is logged).
+  #
+  def get_document(uri)
+    log.debug("Fetching document at #{uri}")
+    begin
+      body = fetch(uri.to_s)
+      parsed = Nokogiri::HTML(body)
+      return parsed if parsed.instance_of? Nokogiri::HTML::Document
+    rescue StandardError => bang
+      log.error("Error reading document #{uri}: #{bang.message}")
+      nil
+    end
+  end
+  #
+  # Create the Sitemap
+  #
+  def generate(uri, output_file, format = 'csv', depth = -1)
+    log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
+    # Setup filters. Ideally, have some outsider give me these
+    # Really, these are just options to the index
+    filters = Filters::Util.get_all_filters
+    index = create_index(uri, uri, filters, nil, depth)
+    case format
+      when 'json'
+        write_index_to_json(index)
+      when 'csv'
+        write_index_to_file(index, output_file)
+      else
+        puts "Please specify a valid output format, you gave #{format} Options are ['csv', 'json']"
+        exit(1)
+    end
+  end
+end

data/lib/sitemap/filters/filters.rb ADDED

@@ -0,0 +1,203 @@
+require 'sitemap/logging'
+require 'open-uri'
+require 'net/http'
+# Public: Various index filtering operations and classes.
+module Filters
+  class Util
+    #
+    # Idempotently make a string a URI
+    #
+    def self.make_URI(uri)
+      begin
+        if !uri.is_a? URI
+          uri = URI::parse(uri)
+        end
+        uri
+      rescue
+        nil
+      end
+    end
+    #
+    # Public: Remove fragments from a URI
+    #
+    def self.remove_fragment_from_uri(uri)
+      parsed_href = Filters::Util.make_URI(uri)
+      if parsed_href.nil?
+        return nil
+      end
+      parsed_href.fragment = nil
+      parsed_href.to_s
+    end
+    #
+    # Public: Create an absolute link provided a link and base URI.
+    #
+    def self.create_absolute_uri(link, base_uri)
+      link = Filters::Util.make_URI(link)
+      base_uri = Filters::Util.make_URI(base_uri)
+      # Remove path from base
+      base_uri.path = ''
+      # Append Path to base_uri if relative
+      if !link.path.nil? && link.path.start_with?('/')
+        return base_uri + link
+      end
+      return link
+    end
+    #
+    # Public: Get all known filters
+    #
+    def self.get_all_filters
+      return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
+    end
+    # Public: Apply URI filters to a Hash.
+    #
+    # uris      - Set (Array|Hash) of URIs to be filtered.
+    # index     - Current index
+    # base_uri  - Base URI to test against
+    # filters   - Filters to reduce set of uris
+    #
+    # Returns a filtered uris Hash
+    def self.apply_filters(uris, index, base_uri, filters)
+      # Clone filters so we retain the 'functional' style of no side-effects
+      filters_clone = filters.clone
+      # Check for terminating case
+      if (!uris.nil? && uris.length > 0)
+        if !filters_clone.nil? && filters_clone.length > 0
+          # Pop a filter and apply it recursively to the result of the next filter
+          f = filters_clone.shift
+          uris = apply_filters(uris, index, base_uri, filters_clone)
+          uris = uris.select do |k,v|
+            f.filter(index, k, base_uri)
+          end
+        end
+      end
+      uris
+    end
+  end
+  #
+  # Public: Filters out non-local URIs
+  #
+  class LocalFilter
+    include Logging
+    #
+    # Public: Determines if a link is on the local domain + path or not
+    #
+    def is_link_local?(link, local)
+      begin
+        link = Filters::Util.make_URI(link)
+        local = Filters::Util.make_URI(local)
+        # Remove Absolute URLs that don't refer to local domain
+        if !link.host.nil? && !link.host.eql?(local.host)
+          log.debug("Rejecting host #{link.host} as it doesn't match #{local.host}")
+          return false
+        end
+        # Ensure path starts with a '/' (filters out junk URLs)
+        if !link.path.nil? && !link.path.eql?('') && !link.path.start_with?('/')
+          log.debug("Rejecting link #{link} as it's path (#{link.path}) doesn't start with '/'")
+          return false
+        end
+      rescue StandardError => bang
+        log.debug("Exception looking for local links: " + bang.message)
+        return false
+      end
+      return true
+    end
+    #
+    # Public: Determines if a link should be indexed.
+    #
+    # Returns boolean true iff the link is local and not indexed.
+    #
+    def should_index_local_link?(link, index, base_uri)
+      return !index.has_key?(link.to_s) && is_link_local?(link, base_uri)
+    end
+    #
+    # Public: Filter out resources that are not local.
+    #
+    # Returns the link if it should be indexed else nil.
+    #
+    def filter(index, link, base_uri)
+      return true unless !should_index_local_link?(link, index, base_uri)
+      false
+    end
+  end
+  # Public: URI Fragment filter.
+  #
+  #
+  class URIFragmentFilter
+    include Logging
+    #
+    # Public: Filters out static resources.
+    #
+    # Returns the link if it doesn't contain a URI fragment
+    #
+    def filter(index, link, base_uri)
+      link = Filters::Util.make_URI(link)
+      return false unless  (link.nil? || !link.fragment.nil?)
+      true
+    end
+  end
+  # Public: Valid URI filter.
+  #
+  #
+  class ValidURIFilter
+    include Logging
+    #
+    # Public: Filters out invalid URIs.
+    #
+    # Returns the link if it should be indexed else nil.
+    #
+    def filter(index, link, base_uri)
+      return true unless link.nil? || link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
+      false
+    end
+  end
+  # Public: Static resource filter.
+  #
+  #
+  class ResourcesFilter
+    include Logging
+    #
+    # Public: Filters out static resources.
+    #
+    # Returns the link if it should be indexed else nil.
+    #
+    def filter(index, link, base_uri)
+      link = Filters::Util.make_URI(link)
+      if link.nil? || link.path.nil? || link.path.to_s.empty?
+        return true
+      end
+      return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
+      false
+    end
+  end
+end

data/lib/sitemap/logging.rb ADDED

@@ -0,0 +1,25 @@
+require 'log4r'
+module Logging
+  def log
+    @log ||= Logging.logger_for(self.class.name)
+  end
+  # Use a hash class-ivar to cache a unique Logger per class:
+  @loggers = {}
+  class << self
+    include Log4r
+    def logger_for(classname)
+      @loggers[classname] ||= configure_logger_for(classname)
+    end
+    def configure_logger_for(classname)
+      logger = Logger.new classname.to_s.gsub(/[^a-zA-Z0-9]/, '.').downcase.gsub(/\.+/, '.')
+      logger.outputters << Log4r::FileOutputter.new('sitemaplog', :filename =>  'sitemap.log')
+      logger
+    end
+  end
+end

data/lib/sitemap/version.rb ADDED

@@ -0,0 +1,3 @@
+module Sitemap
+  VERSION = "0.0.1"
+end

data/sitemap-generator.gemspec ADDED

@@ -0,0 +1,27 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'sitemap/version'
+Gem::Specification.new do |spec|
+  spec.name          = "sitemap-generator"
+  spec.version       = Sitemap::VERSION
+  spec.authors       = ["mefellows"]
+  spec.email         = ["matt.fellows@onegeek.com.au"]
+  spec.description   = "Sitemap Generator"
+  spec.summary       = "A basic, human readable sitemap generator"
+  spec.homepage      = "https://github.com/mefellows/sitemap-generator"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+  spec.add_runtime_dependency "clamp"
+  spec.add_runtime_dependency "json"
+  spec.add_runtime_dependency "log4r"
+  spec.add_runtime_dependency "nokogiri"
+end

data/spec/filter_spec.rb ADDED

@@ -0,0 +1,142 @@
+require 'rspec'
+require 'sitemap/commands/sitemap'
+require 'sitemap/filters/filters'
+require 'spec_helper'
+url = URI::parse('http://foo.com/foo/bar')
+describe Filters::LocalFilter do
+  it 'Should exclude non-local URIs' do
+    filter = Filters::LocalFilter.new
+    filter.is_link_local?('http://www.foo.com/something', url).should eq(false)
+    filter.is_link_local?('https://www.foo.com/something', url).should eq(false)
+    filter.is_link_local?('http://www.somethingelse.com/something', url).should eq(false)
+    filter.is_link_local?('https://www.somethingelse.com/something', url).should eq(false)
+    expect(filter.filter(Hash.new, 'https://www.somethingelse.com/something', 'https://foo.com/')).to eq(false)
+  end
+  it 'Should exclude javascript links' do
+    filter = Filters::LocalFilter.new
+    # This is a valid case for THIS method, may choose to exclude elsewhere in the program
+    # filter.is_link_local?('#thisisareallylonganchorname', url).should eq(false)
+    filter.is_link_local?('alert(\'true\')', url).should eq(false)
+    expect(filter.filter(Hash.new, 'alert(\'true\')', 'https://foo.com/')).to eq(false)
+  end
+  it 'Should include relative URIs' do
+    filter = Filters::LocalFilter.new
+    filter.is_link_local?('/something', url).should eq(true)
+    filter.is_link_local?('/', url).should eq(true)
+    expect(filter.filter(Hash.new, '/something', url)).to eq(true)
+  end
+  it 'Should include absolute local URIs' do
+    filter = Filters::LocalFilter.new
+    filter.is_link_local?('http://foo.com', url).should eq(true)
+    filter.is_link_local?('http://foo.com/foo/bar', url).should eq(true)
+    filter.is_link_local?('http://foo.com/something', url).should eq(true)
+    filter.is_link_local?('https://foo.com/something', url).should eq(true)
+    filter.filter(Hash.new, 'http://foo.com', url).should eq(true)
+    filter.filter(Hash.new, 'http://foo.com/foo/bar', url).should eq(true)
+    filter.filter(Hash.new, 'http://foo.com/something', url).should eq(true)
+    filter.filter(Hash.new, 'https://foo.com/something', url).should eq(true)
+    filter.filter(Hash.new, URI::parse('http://foo.com'), url).should eq(true)
+    filter.filter(Hash.new, URI::parse('http://foo.com/foo/bar'), url).should eq(true)
+    filter.filter(Hash.new, URI::parse('http://foo.com/something'), url).should eq(true)
+    filter.filter(Hash.new, URI::parse('https://foo.com/something'), url).should eq(true)
+  end
+end
+describe Filters::ResourcesFilter do
+  it 'Should exclude static resources' do
+    filter = Filters::ResourcesFilter.new
+    filter.filter(Hash.new, 'http://www.foo.com/something.pdf', 'http://www.foo.com/').should eq(false)
+    filter.filter(Hash.new, 'http://www.foo.com/something.txt', 'http://www.foo.com/').should eq(false)
+    filter.filter(Hash.new, 'http://www.foo.com/something./', 'http://www.foo.com/').should eq(true)
+    filter.filter(Hash.new, 'http://www.foo.com/something-/', 'http://www.foo.com/').should eq(true)
+    filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-', 'http://www.foo.com/').should eq(true)
+    filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-/', 'http://www.foo.com/').should eq(true)
+    filter.filter(Hash.new, 'http://www.foo.com', 'http://www.foo.com/').should eq(true)
+  end
+  it 'Should not allow links to be indexed more than once' do
+    filter = Filters::LocalFilter.new
+    index = Hash.new
+    index['http://www.webcentral.com.au'] = {"title" => "cheese"}
+    expect(filter.should_index_local_link?(Filters::Util.create_absolute_uri('http://www.webcentral.com.au', 'http://www.webcentral.com.au'), index, 'http://www.webcentral.com.au')).to eq false
+  end
+  it 'Should return a filtered Hash' do
+    filters = Filters::Util.get_all_filters
+    # filters = [Filters::ResourcesFilter.new]
+    index = Hash.new
+    index['http://foo.com'] = ""
+    index['http://foo.com/foo'] = ""
+    index['http://foo.com/foo.pdf'] = ""
+    index['http://foo.com/bar'] = ""
+    index['http://foo.com/bar.tar.gz'] = ""
+    index['http://bar.com/foo'] = ""
+    index['http://www.mootools.net/'] = ""
+    index['http://www.wordpress.org'] = ""
+    index['http://www.blueprintcss.com'] = ""
+    index['http://www.php.net'] = ""
+    index['/contact'] = ""
+    index['http://www.onegeek.com.au'] = ""
+    index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
+    index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
+    index['http://www.twitter.com/matthewfellows'] = ""
+    index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
+    index['http://www.flickr.com/photos/mattfellows'] = ""
+    index['http://www.delicious.com/mefellows'] = ""
+    index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
+    i = Filters::Util.apply_filters(index, Hash.new, url, filters)
+    puts i
+    expect(i.length).to eq 3
+  end
+  it 'Should return an empty filtered Hash' do
+    filters = Filters::Util.get_all_filters
+    index = Hash.new
+    index['http://bar.com/foo'] = ""
+    i = Filters::Util.apply_filters(index, Hash.new, url, filters)
+    puts i
+    # Need to prevent mutation in filtering
+    expect(filters.length).to eq 3
+    expect(i.length).to eq 0
+  end
+  it 'Should return the a Hash containing the initial URI' do
+    filters = Filters::Util.get_all_filters
+    i = Filters::Util.apply_filters([url], Hash.new, url, filters)
+    puts i
+    expect(i.length).to eq 1
+  end
+end
+describe Filters::Util do
+  it 'Should return an absolute URI' do
+    expect(Filters::Util.create_absolute_uri('/', url).to_s).to eq 'http://foo.com/'
+  end
+end

data/spec/generator_spec.rb ADDED

@@ -0,0 +1,68 @@
+require 'rspec'
+require 'sitemap/commands/sitemap'
+require 'sitemap/filters/filters'
+require 'spec_helper'
+describe SitemapGenerator do
+  url = URI::parse('http://foo.com/foo/bar')
+  it 'Should return an index from a single page' do
+    generator = SitemapGenerator.new
+    filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
+    # onegeek.com.au source as at 23/05/2014
+    # Note no trailing slash -> need to find why lack of trailing slash is an issue
+    link = URI::parse("http://www.onegeek.com.au")
+    index = generator.create_index(link, link, filters, nil, 1)
+    expect(index.length).to be 18
+    puts "Here's the index:"
+    index.each do |key, value|
+      puts key
+    end
+  end
+  # Should not index an XML document
+  # should follow redirects to the same domain
+  # should treat trailing slashes the same as without???
+  # Test for blacklisted objects
+  # Should not index files (PDFs, images etc.)
+  it 'Should not index static files (PDFs, images etc.)' do
+    # generator = SitemapGenerator.new
+  end
+  # it 'Should return an index from an entire site' do
+  #   generator = SitemapGenerator.new
+  #   # onegeek.com.au source as at 23/05/2014
+  #   doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
+  #   index = generator.create_index(URI::parse("http://www.onegeek.com.au"), URI::parse("http://www.onegeek.com.au"), [], nil, 1)
+  #   index.each do |key, value|
+  #     puts key
+  #   end
+  # end
+  # it 'Let me hack stuff' do
+  #   generator = SitemapGenerator.new
+  #   print generator.fetch('http://www.webcentral.com.au/order')
+  #   doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
+  #   # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
+  #   expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
+  # end
+end

data/spec/spec_helper.rb ADDED

	@@ -0,0 +1 @@
1	+ require 'rspec'

metadata ADDED

@@ -0,0 +1,147 @@
+--- !ruby/object:Gem::Specification
+name: sitemap-generator
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- mefellows
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-24 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: clamp
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: log4r
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Sitemap Generator
+email:
+- matt.fellows@onegeek.com.au
+executables:
+- sitemap
+extensions: []
+extra_rdoc_files: []
+files:
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- Vagrantfile
+- bin/sitemap
+- lib/sitemap/command.rb
+- lib/sitemap/commands/sitemap.rb
+- lib/sitemap/filters/filters.rb
+- lib/sitemap/logging.rb
+- lib/sitemap/version.rb
+- sitemap-generator.gemspec
+- spec/filter_spec.rb
+- spec/generator_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/mefellows/sitemap-generator
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.14
+signing_key:
+specification_version: 4
+summary: A basic, human readable sitemap generator
+test_files:
+- spec/filter_spec.rb
+- spec/generator_spec.rb
+- spec/spec_helper.rb