sitemap-generator 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
4
- data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
3
+ metadata.gz: 33e7b15a7651826488cdd2e3ebdefd3e34a6e1ae
4
+ data.tar.gz: eb219514a5b0f7dd9aa1a6d45e4ab813bd9ed5a4
5
5
  SHA512:
6
- metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
7
- data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390
6
+ metadata.gz: 620b5c8bd6d5e3591e2e9230c3849a67c176897543b52ee20fbde7bc262c62880ec32b32c8e8061749d3149ea30bf2367ede90c356da329432ce4c0bc456e11a
7
+ data.tar.gz: d2439232358bbdcf186665243e6e448a47def81ab68030997089de7aeb05be990130f94e296ec60d799d3ddcfb9c9db01fd4464cca66bdb7fe9c58ef7d461409
@@ -0,0 +1 @@
1
+ *.gem
data/README.md CHANGED
@@ -3,34 +3,53 @@ A simple command-line Sitemap generator tool. Useful for quickly auditing a webs
3
3
 
4
4
  ## Getting started
5
5
 
6
+ gem install sitemap-generator
7
+
8
+ *Getting started with code*
9
+
10
+ If you want to get your hands dirty in code:
11
+
6
12
  git clone https://github.com/mefellows/sitemap-generator
7
13
  cd sitemap-generator
14
+ bundle install
15
+ bin/sitemap
16
+
17
+ ## Examples
8
18
 
9
19
  ### Generate a standard CSV Sitemap file
20
+
10
21
  The following command will generate a basic sitemap, listing all links recursively from the site, containing only URIs from the specified domain name (in this case, onegeek.com.au) and will save to a file named sitemap.csv
11
22
 
12
- bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
23
+ sitemap generate http://www.onegeek.com.au/ sitemap.csv
13
24
 
14
25
  ### Generate a standard Sitemap JSON format
15
26
 
16
- bin/sitemap generate --format=json http://www.onegeek.com.au/ sitemap.json
27
+ This command deliberately doesn't write to file in order to allow unix-style pipelining
28
+
29
+ sitemap generate --format=json http://www.onegeek.com.au/
17
30
 
18
- ### Generate a Sitemap restricting to the URI provided
31
+ ### Generate a Sitemap 3 levels deep
32
+
33
+ sitemap generate --depth=3 http://www.onegeek.com.au/ sitemap.csv
34
+
35
+ ### Generate a Sitemap containing links only on the specified URI
19
36
 
20
- bin/sitemap generate --recursive=false http://www.onegeek.com.au/ sitemap.csv
37
+ sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
21
38
 
22
- ### Generate a Sitemap restricting indexed URLs to only those starting with '/journal'
39
+ ### Generate a Sitemap that contains URI fragments and query strings
23
40
 
24
- bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
41
+ By default, URI fragments like ```foo.com/#!/some-page``` and query strings like ```foo.com/?bar=baz``` are ignored - they are generally duplicative so sitemap-generator strips them off entirely. This lets them back in:
42
+
43
+ sitemap generate --query-strings --fragments http://www.onegeek.com.au/ sitemap.csv
25
44
 
26
-
27
45
  ## Getting Help
28
46
 
29
- bin/sitemap
30
- bin/sitemap generate --help
47
+ sitemap
48
+ sitemap generate --help
31
49
 
32
50
  ## Alternatives?
33
51
 
34
- So of course, after spending an hour writing this I forgot that wget can do this for you, well basically anyway:
52
+ So of course, after spending a few hours writing this I forgot that wget can do this for you, well basically anyway:
35
53
 
36
54
  wget -r --delete-after <todo>
55
+
@@ -1,6 +1,7 @@
1
1
  require 'sitemap/version'
2
2
  require 'sitemap/logging'
3
3
  require 'sitemap/commands/sitemap'
4
+ require 'sitemap/filters/transformers'
4
5
  require 'clamp'
5
6
 
6
7
  module Sitemap
@@ -18,6 +19,8 @@ module Sitemap
18
19
  class SitemapCommand < AbstractCommand
19
20
  option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
20
21
  option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
22
+ option "--query-strings", :flag, "Allow query strings in URIs", :default => false
23
+ option "--fragments", :flag, "Allow fragments in URIs", :default => false
21
24
  option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
22
25
  Integer(s)
23
26
  end
@@ -51,7 +54,29 @@ module Sitemap
51
54
 
52
55
  log.info('Running sitemap generator')
53
56
  generator = SitemapGenerator.new()
54
- generator.generate(uri, output_file, format, real_depth)
57
+
58
+ # Setup filters and transformers
59
+ filters = Filters::Util.get_all_filters
60
+ transformers = Transformers::Util.get_all_transformers
61
+
62
+ # If query strings enabled, remove QueryString transformer
63
+ if query_strings?
64
+ transformers = transformers.select do |t|
65
+ next true unless t.instance_of? Transformers::URIQueryStringTransformer
66
+ false
67
+ end
68
+ end
69
+
70
+ # If fragments enabled, remove Fragment filter
71
+ if fragments?
72
+ filters = filters.select do |t|
73
+ next true unless t.instance_of? Filters::URIFragmentFilter
74
+ false
75
+ end
76
+ end
77
+
78
+ # Create the sitemap!
79
+ generator.generate(uri, output_file, filters, transformers, format, real_depth)
55
80
  end
56
81
  end
57
82
 
@@ -1,5 +1,6 @@
1
1
  require 'sitemap/logging'
2
2
  require 'sitemap/filters/filters'
3
+ require 'sitemap/filters/transformers'
3
4
  require 'csv'
4
5
  require 'json'
5
6
  require 'nokogiri'
@@ -37,16 +38,16 @@ class SitemapGenerator
37
38
  #
38
39
  # Public: Create the index recursively.
39
40
  #
40
- # link - The URI to build the index from recursively.
41
- # base_uri - The base URI (Host) to restrict which links are indexed
42
- # restrict - An array of URIs used to restrict which URIs are indexed.
43
- # all indexed URIs will include one of these paths.
44
- # link_index - Any index to start the build from.
45
- # depth - The depth of recursion. 1 for no recursion, -1 for infinite. > 1 for specific depth
41
+ # link - The URI to build the index from recursively.
42
+ # base_uri - The base URI (Host) to restrict which links are indexed
43
+ # filters - An array of Filters to be applied before indexing
44
+ # transformers - An array of Transformers to be applied before indexing
45
+ # link_index - Any index to start the build from.
46
+ # depth - The depth of recursion. 1 for no recursion, -1 for infinite, > 1 for specific depth
46
47
  #
47
48
  # Returns an index containing URIs as keys and an object representing the page.
48
49
  #
49
- def create_index(link, base_uri, filters, link_index = nil, depth = -1)
50
+ def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
50
51
  if link_index.nil?
51
52
  log.debug('Creating new Index')
52
53
  link_index = Hash.new
@@ -56,8 +57,6 @@ class SitemapGenerator
56
57
  return
57
58
  end
58
59
 
59
- ### TODO: replace with generic filter method
60
-
61
60
  if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
62
61
 
63
62
  log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
@@ -82,14 +81,17 @@ class SitemapGenerator
82
81
  links << l.attributes["href"].to_s
83
82
  end
84
83
 
84
+ # Transform URLs before indexing
85
+ Transformers::Util.apply_transformers(links, transformers)
86
+
85
87
  # Filter out in-eligible links
86
- a = Filters::Util.apply_filters(links, link_index, base_uri, filters)
88
+ Filters::Util.apply_filters(links, link_index, base_uri, filters)
87
89
 
88
90
  links.each do |l|
89
91
  l = Filters::Util.remove_fragment_from_uri(l)
90
92
  if l && !l.empty?
91
93
  if depth != -1
92
- create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
94
+ create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, transformers, link_index, depth)
93
95
  end
94
96
  end
95
97
  end
@@ -155,14 +157,10 @@ class SitemapGenerator
155
157
  #
156
158
  # Create the Sitemap
157
159
  #
158
- def generate(uri, output_file, format = 'csv', depth = -1)
160
+ def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
159
161
 
160
162
  log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
161
-
162
- # Setup filters. Ideally, have some outsider give me these
163
- # Really, these are just options to the index
164
- filters = Filters::Util.get_all_filters
165
- index = create_index(uri, uri, filters, nil, depth)
163
+ index = create_index(uri, uri, filters, transformers, nil, depth)
166
164
 
167
165
  case format
168
166
  when 'json'
@@ -54,10 +54,10 @@ module Filters
54
54
  # Public: Get all known filters
55
55
  #
56
56
  def self.get_all_filters
57
- return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
57
+ return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new, Filters::URIFragmentFilter.new]
58
58
  end
59
59
 
60
- # Public: Apply URI filters to a Hash.
60
+ # Public: Apply URI Filters to a Hash.
61
61
  #
62
62
  # uris - Set (Array|Hash) of URIs to be filtered.
63
63
  # index - Current index
@@ -79,7 +79,7 @@ module Filters
79
79
  f = filters_clone.shift
80
80
  uris = apply_filters(uris, index, base_uri, filters_clone)
81
81
 
82
- uris = uris.select do |k,v|
82
+ uris = uris.select do |k|
83
83
  f.filter(index, k, base_uri)
84
84
  end
85
85
  end
@@ -157,8 +157,9 @@ module Filters
157
157
  #
158
158
  def filter(index, link, base_uri)
159
159
  link = Filters::Util.make_URI(link)
160
- return false unless (link.nil? || !link.fragment.nil?)
161
- true
160
+ return true unless (link != nil && link.fragment != nil)
161
+ log.debug("Rejecting link #{link} as it contains fragments #{link.fragment}")
162
+ false
162
163
  end
163
164
  end
164
165
 
@@ -174,7 +175,9 @@ module Filters
174
175
  # Returns the link if it should be indexed else nil.
175
176
  #
176
177
  def filter(index, link, base_uri)
177
- return true unless link.nil? || link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
178
+ link = Filters::Util.make_URI(link)
179
+ return true unless link == nil || link.eql?('')
180
+ log.debug("Rejecting link #{link} as it is not deemed to be a valid URI")
178
181
  false
179
182
  end
180
183
  end
@@ -196,6 +199,7 @@ module Filters
196
199
  return true
197
200
  end
198
201
  return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
202
+ log.debug("Rejecting link #{link} as it is a static resource #{link.path}")
199
203
  false
200
204
  end
201
205
  end
@@ -0,0 +1,70 @@
1
+ require 'sitemap/logging'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+
5
+ # Public: Transformers are objects that modify a provided link.
6
+ #
7
+ # For example, a Transformer might be used to strip out query string URLs
8
+ # before indexing.
9
+ module Transformers
10
+ class Util
11
+ # Public: Apply URI transformers to a Hash.
12
+ #
13
+ # uris - Set (Array|Hash) of URIs to be transformed.
14
+ # transformers - Transformers to modify the set of uris;
15
+ # applied recursively, one transformer at a time
16
+ # (note: unlike Filters, no index/base_uri is needed)
17
+ #
18
+ # Returns the transformed uris
19
+ def self.apply_transformers(uris, transformers)
20
+
21
+ # Clone transformers so we retain the 'functional' style of no side-effects
22
+ transformers_clone = transformers.clone
23
+
24
+ # Check for terminating case
25
+ if (!uris.nil? && uris.length > 0)
26
+
27
+ if !transformers_clone.nil? && transformers_clone.length > 0
28
+
29
+ # Pop a transformer and apply it recursively to the result of the next transformer
30
+ t = transformers_clone.shift
31
+ uris = apply_transformers(uris, transformers_clone)
32
+
33
+ uris = uris.map do |k,v|
34
+ t.transform(k)
35
+ end
36
+ end
37
+ end
38
+
39
+ uris
40
+ end
41
+
42
+ #
43
+ # Public: Get all known transformers
44
+ #
45
+ def self.get_all_transformers
46
+ return [Transformers::URIQueryStringTransformer.new]
47
+ end
48
+
49
+ end
50
+
51
+ # Public: URI Query String Transformer.
52
+ #
53
+ #
54
+ class URIQueryStringTransformer
55
+ include Logging
56
+
57
+ #
58
+ # Public: Filters out URLs with query string resources.
59
+ #
60
+ # Returns the link without the query string component
61
+ #
62
+ def transform(link)
63
+ link = Filters::Util.make_URI(link)
64
+ return link unless (link != nil && link.query != nil)
65
+ link.query = nil
66
+ link
67
+ end
68
+ end
69
+
70
+ end
@@ -1,3 +1,3 @@
1
1
  module Sitemap
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["mefellows"]
10
10
  spec.email = ["matt.fellows@onegeek.com.au"]
11
11
  spec.description = "Sitemap Generator"
12
- spec.summary = "A basic, human readable sitemap generator"
12
+ spec.summary = "A basic, human readable (CSV, JSON) sitemap generator. "
13
13
  spec.homepage = "https://github.com/mefellows/sitemap-generator"
14
14
  spec.license = "MIT"
15
15
 
@@ -85,6 +85,10 @@ describe Filters::ResourcesFilter do
85
85
  index['http://foo.com/foo'] = ""
86
86
  index['http://foo.com/foo.pdf'] = ""
87
87
  index['http://foo.com/bar'] = ""
88
+ index['http://foo.com/bar#'] = ""
89
+ index['http://foo.com/bar/#'] = ""
90
+ index['http://foo.com/bar/#foo'] = ""
91
+ index['http://foo.com/bar/#!/hashbang/foo'] = ""
88
92
  index['http://foo.com/bar.tar.gz'] = ""
89
93
  index['http://bar.com/foo'] = ""
90
94
  index['http://www.mootools.net/'] = ""
@@ -118,7 +122,7 @@ describe Filters::ResourcesFilter do
118
122
  puts i
119
123
 
120
124
  # Need to prevent mutation in filtering
121
- expect(filters.length).to eq 3
125
+ expect(filters.length).to eq 4
122
126
 
123
127
  expect(i.length).to eq 0
124
128
 
@@ -8,19 +8,21 @@ describe SitemapGenerator do
8
8
 
9
9
  it 'Should return an index from a single page' do
10
10
  generator = SitemapGenerator.new
11
- filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
11
+ filters = Filters::Util.get_all_filters
12
+ transformers = Transformers::Util.get_all_transformers
12
13
 
13
14
  # onegeek.com.au source as at 23/05/2014
14
15
 
15
16
  # Note no trailing slash -> need to find why lack of trailing slash is an issue
16
17
  link = URI::parse("http://www.onegeek.com.au")
17
- index = generator.create_index(link, link, filters, nil, 1)
18
+ index = generator.create_index(link, link, filters, transformers, nil, 1)
18
19
 
19
- expect(index.length).to be 18
20
20
  puts "Here's the index:"
21
21
  index.each do |key, value|
22
22
  puts key
23
23
  end
24
+
25
+ expect(index.length).to be 18
24
26
  end
25
27
 
26
28
 
@@ -32,7 +34,8 @@ describe SitemapGenerator do
32
34
 
33
35
  # should treat trailing slashes the same as without???
34
36
 
35
- # Test for blacklisted objects
37
+
38
+ # Test for blacklisted URIs
36
39
 
37
40
 
38
41
  # Should not index files (PDFs, images etc.)
@@ -41,8 +44,6 @@ describe SitemapGenerator do
41
44
 
42
45
  end
43
46
 
44
-
45
-
46
47
  # it 'Should return an index from an entire site' do
47
48
  # generator = SitemapGenerator.new
48
49
 
@@ -55,14 +56,4 @@ describe SitemapGenerator do
55
56
  # end
56
57
  # end
57
58
 
58
- # it 'Let me hack stuff' do
59
- # generator = SitemapGenerator.new
60
-
61
- # print generator.fetch('http://www.webcentral.com.au/order')
62
- # doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
63
- # # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
64
- # expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
65
- # end
66
-
67
-
68
59
  end
@@ -0,0 +1,47 @@
1
+ require 'rspec'
2
+ require 'sitemap/commands/sitemap'
3
+ require 'sitemap/filters/transformers'
4
+ require 'spec_helper'
5
+
6
+ url = URI::parse('http://foo.com/foo/bar')
7
+
8
+ describe Transformers::Util do
9
+ it 'Should remove #fragments URIs' do
10
+ transformers = Transformers::Util.get_all_transformers
11
+
12
+ index = Hash.new
13
+ index['http://foo.com'] = ""
14
+ index['http://foo.com/#fragment'] = ""
15
+ index['http://foo.com/#!/hash-bang/fragment'] = ""
16
+ index['http://foo.com/#'] = ""
17
+ index['http://foo.com/foo'] = ""
18
+ index['http://foo.com/foo?'] = ""
19
+ index['http://foo.com/foo?foo'] = ""
20
+ index['http://foo.com/foo?foo=bar'] = ""
21
+ index['http://foo.com/foo.pdf'] = ""
22
+ index['http://foo.com/bar'] = ""
23
+ index['http://foo.com/bar.tar.gz'] = ""
24
+ index['http://bar.com/foo'] = ""
25
+ index['http://www.mootools.net/'] = ""
26
+ index['http://www.wordpress.org'] = ""
27
+ index['http://www.blueprintcss.com'] = ""
28
+ index['http://www.php.net'] = ""
29
+ index['/contact'] = ""
30
+ index['http://www.onegeek.com.au'] = ""
31
+ index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
32
+ index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
33
+ index['http://www.twitter.com/matthewfellows'] = ""
34
+ index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
35
+ index['http://www.flickr.com/photos/mattfellows'] = ""
36
+ index['http://www.delicious.com/mefellows'] = ""
37
+ index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
38
+
39
+ i = Transformers::Util.apply_transformers(index, transformers)
40
+
41
+ expect(i.length).to eq 25
42
+ expect(i[4].to_s).to eq 'http://foo.com/foo'
43
+ expect(i[5].to_s).to eq 'http://foo.com/foo'
44
+ expect(i[6].to_s).to eq 'http://foo.com/foo'
45
+ expect(i[7].to_s).to eq 'http://foo.com/foo'
46
+ end
47
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-generator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - mefellows
@@ -102,6 +102,7 @@ executables:
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
+ - .gitignore
105
106
  - Gemfile
106
107
  - Gemfile.lock
107
108
  - README.md
@@ -111,12 +112,14 @@ files:
111
112
  - lib/sitemap/command.rb
112
113
  - lib/sitemap/commands/sitemap.rb
113
114
  - lib/sitemap/filters/filters.rb
115
+ - lib/sitemap/filters/transformers.rb
114
116
  - lib/sitemap/logging.rb
115
117
  - lib/sitemap/version.rb
116
118
  - sitemap-generator.gemspec
117
119
  - spec/filter_spec.rb
118
120
  - spec/generator_spec.rb
119
121
  - spec/spec_helper.rb
122
+ - spec/transform_spec.rb
120
123
  homepage: https://github.com/mefellows/sitemap-generator
121
124
  licenses:
122
125
  - MIT
@@ -140,8 +143,9 @@ rubyforge_project:
140
143
  rubygems_version: 2.0.14
141
144
  signing_key:
142
145
  specification_version: 4
143
- summary: A basic, human readable sitemap generator
146
+ summary: A basic, human readable (CSV, JSON) sitemap generator.
144
147
  test_files:
145
148
  - spec/filter_spec.rb
146
149
  - spec/generator_spec.rb
147
150
  - spec/spec_helper.rb
151
+ - spec/transform_spec.rb