sitemap-generator 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
4
- data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
3
+ metadata.gz: 33e7b15a7651826488cdd2e3ebdefd3e34a6e1ae
4
+ data.tar.gz: eb219514a5b0f7dd9aa1a6d45e4ab813bd9ed5a4
5
5
  SHA512:
6
- metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
7
- data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390
6
+ metadata.gz: 620b5c8bd6d5e3591e2e9230c3849a67c176897543b52ee20fbde7bc262c62880ec32b32c8e8061749d3149ea30bf2367ede90c356da329432ce4c0bc456e11a
7
+ data.tar.gz: d2439232358bbdcf186665243e6e448a47def81ab68030997089de7aeb05be990130f94e296ec60d799d3ddcfb9c9db01fd4464cca66bdb7fe9c58ef7d461409
@@ -0,0 +1 @@
1
+ *.gem
data/README.md CHANGED
@@ -3,34 +3,53 @@ A simple command-line Sitemap generator tool. Useful for quickly auditing a webs
3
3
 
4
4
  ## Getting started
5
5
 
6
+ gem install sitemap-generator
7
+
8
+ *Getting started with code*
9
+
10
+ If you want to get your hands dirty in code:
11
+
6
12
  git clone https://github.com/mefellows/sitemap-generator
7
13
  cd sitemap-generator
14
+ bundle install
15
+ bin/sitemap
16
+
17
+ ## Examples
8
18
 
9
19
  ### Generate a standard CSV Sitemap file
20
+
10
21
  The following command will generate a basic sitemap, listing all links recursively from the site, containing only URIs from the specified domain name (in this case, onegeek.com.au) and will save to a file named sitemap.csv
11
22
 
12
- bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
23
+ sitemap generate http://www.onegeek.com.au/ sitemap.csv
13
24
 
14
25
  ### Generate a standard Sitemap JSON format
15
26
 
16
- bin/sitemap generate --format=json http://www.onegeek.com.au/ sitemap.json
27
+ This command deliberately doesn't write to file in order to allow unix-style pipelining
28
+
29
+ sitemap generate --format=json http://www.onegeek.com.au/
17
30
 
18
- ### Generate a Sitemap restricting to the URI provided
31
+ ### Generate a Sitemap 3 levels deep
32
+
33
+ sitemap generate --depth=3 http://www.onegeek.com.au/ sitemap.csv
34
+
35
+ ### Generate a Sitemap containing links only on the specified URI
19
36
 
20
- bin/sitemap generate --recursive=false http://www.onegeek.com.au/ sitemap.csv
37
+ sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
21
38
 
22
- ### Generate a Sitemap restricting indexed URLs to only those starting with '/journal'
39
+ ### Generate a Sitemap that contains URI fragments and query strings
23
40
 
24
- bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
41
+ By default, URI fragments like ```foo.com/#!/some-page``` and query strings like ```foo.com/?bar=baz``` are ignored - they are generally duplicative so sitemap-generator strips them off entirely. This lets them back in:
42
+
43
+ sitemap generate --query-strings --fragments http://www.onegeek.com.au/ sitemap.csv
25
44
 
26
-
27
45
  ## Getting Help
28
46
 
29
- bin/sitemap
30
- bin/sitemap generate --help
47
+ sitemap
48
+ sitemap generate --help
31
49
 
32
50
  ## Alternatives?
33
51
 
34
- So of course, after spending an hour writing this I forgot that wget can do this for you, well basically anyway:
52
+ So of course, after spending a few hours writing this I forgot that wget can do this for you, well basically anyway:
35
53
 
36
54
  wget -r --delete-after <todo>
55
+
@@ -1,6 +1,7 @@
1
1
  require 'sitemap/version'
2
2
  require 'sitemap/logging'
3
3
  require 'sitemap/commands/sitemap'
4
+ require 'sitemap/filters/transformers'
4
5
  require 'clamp'
5
6
 
6
7
  module Sitemap
@@ -18,6 +19,8 @@ module Sitemap
18
19
  class SitemapCommand < AbstractCommand
19
20
  option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
20
21
  option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
22
+ option "--query-strings", :flag, "Allow query strings in URIs", :default => false
23
+ option "--fragments", :flag, "Allow fragments in URIs", :default => false
21
24
  option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
22
25
  Integer(s)
23
26
  end
@@ -51,7 +54,29 @@ module Sitemap
51
54
 
52
55
  log.info('Running sitemap generator')
53
56
  generator = SitemapGenerator.new()
54
- generator.generate(uri, output_file, format, real_depth)
57
+
58
+ # Setup filters and transformers
59
+ filters = Filters::Util.get_all_filters
60
+ transformers = Transformers::Util.get_all_transformers
61
+
62
+ # If query strings enabled, remove QueryString transformer
63
+ if query_strings?
64
+ transformers = transformers.select do |t|
65
+ next true unless t.instance_of? Transformers::URIQueryStringTransformer
66
+ false
67
+ end
68
+ end
69
+
70
+ # If fragments enabled, remove Fragment filter
71
+ if fragments?
72
+ filters = filters.select do |t|
73
+ next true unless t.instance_of? Filters::URIFragmentFilter
74
+ false
75
+ end
76
+ end
77
+
78
+ # Create the sitemap!
79
+ generator.generate(uri, output_file, filters, transformers, format, real_depth)
55
80
  end
56
81
  end
57
82
 
@@ -1,5 +1,6 @@
1
1
  require 'sitemap/logging'
2
2
  require 'sitemap/filters/filters'
3
+ require 'sitemap/filters/transformers'
3
4
  require 'csv'
4
5
  require 'json'
5
6
  require 'nokogiri'
@@ -37,16 +38,16 @@ class SitemapGenerator
37
38
  #
38
39
  # Public: Create the index recursively.
39
40
  #
40
- # link - The URI to build the index from recursively.
41
- # base_uri - The base URI (Host) to restrict which links are indexed
42
- # restrict - An array of URIs used to restrict which URIs are indexed.
43
- # all indexed URIs will include one of these paths.
44
- # link_index - Any index to start the build from.
45
- # depth - The depth of recursion. 1 for no recursion, -1 for infinite. > 1 for specific depth
41
+ # link - The URI to build the index from recursively.
42
+ # base_uri - The base URI (Host) to restrict which links are indexed
43
+ # filters - An array of Filters to be applied before indexing
44
+ # transformers - An array of Transformers to be applied before indexing
45
+ # link_index - Any index to start the build from.
46
+ # depth - The depth of recursion. 1 for no recursion, -1 for infinite, > 1 for specific depth
46
47
  #
47
48
  # Returns an index containing URIs as keys and an object representing the page.
48
49
  #
49
- def create_index(link, base_uri, filters, link_index = nil, depth = -1)
50
+ def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
50
51
  if link_index.nil?
51
52
  log.debug('Creating new Index')
52
53
  link_index = Hash.new
@@ -56,8 +57,6 @@ class SitemapGenerator
56
57
  return
57
58
  end
58
59
 
59
- ### TODO: replace with generic filter method
60
-
61
60
  if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
62
61
 
63
62
  log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
@@ -82,14 +81,17 @@ class SitemapGenerator
82
81
  links << l.attributes["href"].to_s
83
82
  end
84
83
 
84
+ # Transform URLs before indexing
85
+ Transformers::Util.apply_transformers(links, transformers)
86
+
85
87
  # Filter out in-eligible links
86
- a = Filters::Util.apply_filters(links, link_index, base_uri, filters)
88
+ Filters::Util.apply_filters(links, link_index, base_uri, filters)
87
89
 
88
90
  links.each do |l|
89
91
  l = Filters::Util.remove_fragment_from_uri(l)
90
92
  if l && !l.empty?
91
93
  if depth != -1
92
- create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
94
+ create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, transformers, link_index, depth)
93
95
  end
94
96
  end
95
97
  end
@@ -155,14 +157,10 @@ class SitemapGenerator
155
157
  #
156
158
  # Create the Sitemap
157
159
  #
158
- def generate(uri, output_file, format = 'csv', depth = -1)
160
+ def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
159
161
 
160
162
  log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
161
-
162
- # Setup filters. Ideally, have some outsider give me these
163
- # Really, these are just options to the index
164
- filters = Filters::Util.get_all_filters
165
- index = create_index(uri, uri, filters, nil, depth)
163
+ index = create_index(uri, uri, filters, transformers, nil, depth)
166
164
 
167
165
  case format
168
166
  when 'json'
@@ -54,10 +54,10 @@ module Filters
54
54
  # Public: Get all known filters
55
55
  #
56
56
  def self.get_all_filters
57
- return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
57
+ return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new, Filters::URIFragmentFilter.new]
58
58
  end
59
59
 
60
- # Public: Apply URI filters to a Hash.
60
+ # Public: Apply URI Filters to a Hash.
61
61
  #
62
62
  # uris - Set (Array|Hash) of URIs to be filtered.
63
63
  # index - Current index
@@ -79,7 +79,7 @@ module Filters
79
79
  f = filters_clone.shift
80
80
  uris = apply_filters(uris, index, base_uri, filters_clone)
81
81
 
82
- uris = uris.select do |k,v|
82
+ uris = uris.select do |k|
83
83
  f.filter(index, k, base_uri)
84
84
  end
85
85
  end
@@ -157,8 +157,9 @@ module Filters
157
157
  #
158
158
  def filter(index, link, base_uri)
159
159
  link = Filters::Util.make_URI(link)
160
- return false unless (link.nil? || !link.fragment.nil?)
161
- true
160
+ return true unless (link != nil && link.fragment != nil)
161
+ log.debug("Rejecting link #{link} as it contains fragments #{link.fragment}")
162
+ false
162
163
  end
163
164
  end
164
165
 
@@ -174,7 +175,9 @@ module Filters
174
175
  # Returns the link if it should be indexed else nil.
175
176
  #
176
177
  def filter(index, link, base_uri)
177
- return true unless link.nil? || link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
178
+ link = Filters::Util.make_URI(link)
179
+ return true unless link == nil || link.eql?('')
180
+ log.debug("Rejecting link #{link} as it is not deemed to be a valid URI")
178
181
  false
179
182
  end
180
183
  end
@@ -196,6 +199,7 @@ module Filters
196
199
  return true
197
200
  end
198
201
  return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
202
+ log.debug("Rejecting link #{link} as it is a static resource #{link.path}")
199
203
  false
200
204
  end
201
205
  end
@@ -0,0 +1,70 @@
1
+ require 'sitemap/logging'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+
5
+ # Public: Transformers are objects that modify a provided link.
6
+ #
7
+ # For example, a Transformer might be used to strip out query string URLs
8
+ # before indexing.
9
+ module Transformers
10
+ class Util
11
+ # Public: Apply URI transformers to a Hash.
12
+ #
13
+ # uris - Set (Array|Hash) of URIs to be transformed.
14
+ # transformers - Transformers to modify the set of uris;
15
+ # applied recursively, one transformer at a time
16
+ # (note: unlike Filters, no index/base_uri is needed)
17
+ #
18
+ # Returns the transformed uris
19
+ def self.apply_transformers(uris, transformers)
20
+
21
+ # Clone transformers so we retain the 'functional' style of no side-effects
22
+ transformers_clone = transformers.clone
23
+
24
+ # Check for terminating case
25
+ if (!uris.nil? && uris.length > 0)
26
+
27
+ if !transformers_clone.nil? && transformers_clone.length > 0
28
+
29
+ # Pop a transformer and apply it recursively to the result of the next transformer
30
+ t = transformers_clone.shift
31
+ uris = apply_transformers(uris, transformers_clone)
32
+
33
+ uris = uris.map do |k,v|
34
+ t.transform(k)
35
+ end
36
+ end
37
+ end
38
+
39
+ uris
40
+ end
41
+
42
+ #
43
+ # Public: Get all known transformers
44
+ #
45
+ def self.get_all_transformers
46
+ return [Transformers::URIQueryStringTransformer.new]
47
+ end
48
+
49
+ end
50
+
51
+ # Public: URI Query String Transformer.
52
+ #
53
+ #
54
+ class URIQueryStringTransformer
55
+ include Logging
56
+
57
+ #
58
+ # Public: Filters out URLs with query string resources.
59
+ #
60
+ # Returns the link without the query string component
61
+ #
62
+ def transform(link)
63
+ link = Filters::Util.make_URI(link)
64
+ return link unless (link != nil && link.query != nil)
65
+ link.query = nil
66
+ link
67
+ end
68
+ end
69
+
70
+ end
@@ -1,3 +1,3 @@
1
1
  module Sitemap
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["mefellows"]
10
10
  spec.email = ["matt.fellows@onegeek.com.au"]
11
11
  spec.description = "Sitemap Generator"
12
- spec.summary = "A basic, human readable sitemap generator"
12
+ spec.summary = "A basic, human readable (CSV, JSON) sitemap generator. "
13
13
  spec.homepage = "https://github.com/mefellows/sitemap-generator"
14
14
  spec.license = "MIT"
15
15
 
@@ -85,6 +85,10 @@ describe Filters::ResourcesFilter do
85
85
  index['http://foo.com/foo'] = ""
86
86
  index['http://foo.com/foo.pdf'] = ""
87
87
  index['http://foo.com/bar'] = ""
88
+ index['http://foo.com/bar#'] = ""
89
+ index['http://foo.com/bar/#'] = ""
90
+ index['http://foo.com/bar/#foo'] = ""
91
+ index['http://foo.com/bar/#!/hashbang/foo'] = ""
88
92
  index['http://foo.com/bar.tar.gz'] = ""
89
93
  index['http://bar.com/foo'] = ""
90
94
  index['http://www.mootools.net/'] = ""
@@ -118,7 +122,7 @@ describe Filters::ResourcesFilter do
118
122
  puts i
119
123
 
120
124
  # Need to prevent mutation in filtering
121
- expect(filters.length).to eq 3
125
+ expect(filters.length).to eq 4
122
126
 
123
127
  expect(i.length).to eq 0
124
128
 
@@ -8,19 +8,21 @@ describe SitemapGenerator do
8
8
 
9
9
  it 'Should return an index from a single page' do
10
10
  generator = SitemapGenerator.new
11
- filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
11
+ filters = Filters::Util.get_all_filters
12
+ transformers = Transformers::Util.get_all_transformers
12
13
 
13
14
  # onegeek.com.au source as at 23/05/2014
14
15
 
15
16
  # Note no trailing slash -> need to find why lack of trailing slash is an issue
16
17
  link = URI::parse("http://www.onegeek.com.au")
17
- index = generator.create_index(link, link, filters, nil, 1)
18
+ index = generator.create_index(link, link, filters, transformers, nil, 1)
18
19
 
19
- expect(index.length).to be 18
20
20
  puts "Here's the index:"
21
21
  index.each do |key, value|
22
22
  puts key
23
23
  end
24
+
25
+ expect(index.length).to be 18
24
26
  end
25
27
 
26
28
 
@@ -32,7 +34,8 @@ describe SitemapGenerator do
32
34
 
33
35
  # should treat trailing slashes the same as without???
34
36
 
35
- # Test for blacklisted objects
37
+
38
+ # Test for blacklisted URIs
36
39
 
37
40
 
38
41
  # Should not index files (PDFs, images etc.)
@@ -41,8 +44,6 @@ describe SitemapGenerator do
41
44
 
42
45
  end
43
46
 
44
-
45
-
46
47
  # it 'Should return an index from an entire site' do
47
48
  # generator = SitemapGenerator.new
48
49
 
@@ -55,14 +56,4 @@ describe SitemapGenerator do
55
56
  # end
56
57
  # end
57
58
 
58
- # it 'Let me hack stuff' do
59
- # generator = SitemapGenerator.new
60
-
61
- # print generator.fetch('http://www.webcentral.com.au/order')
62
- # doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
63
- # # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
64
- # expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
65
- # end
66
-
67
-
68
59
  end
@@ -0,0 +1,47 @@
1
+ require 'rspec'
2
+ require 'sitemap/commands/sitemap'
3
+ require 'sitemap/filters/transformers'
4
+ require 'spec_helper'
5
+
6
+ url = URI::parse('http://foo.com/foo/bar')
7
+
8
+ describe Transformers::Util do
9
+ it 'Should remove #fragments URIs' do
10
+ transformers = Transformers::Util.get_all_transformers
11
+
12
+ index = Hash.new
13
+ index['http://foo.com'] = ""
14
+ index['http://foo.com/#fragment'] = ""
15
+ index['http://foo.com/#!/hash-bang/fragment'] = ""
16
+ index['http://foo.com/#'] = ""
17
+ index['http://foo.com/foo'] = ""
18
+ index['http://foo.com/foo?'] = ""
19
+ index['http://foo.com/foo?foo'] = ""
20
+ index['http://foo.com/foo?foo=bar'] = ""
21
+ index['http://foo.com/foo.pdf'] = ""
22
+ index['http://foo.com/bar'] = ""
23
+ index['http://foo.com/bar.tar.gz'] = ""
24
+ index['http://bar.com/foo'] = ""
25
+ index['http://www.mootools.net/'] = ""
26
+ index['http://www.wordpress.org'] = ""
27
+ index['http://www.blueprintcss.com'] = ""
28
+ index['http://www.php.net'] = ""
29
+ index['/contact'] = ""
30
+ index['http://www.onegeek.com.au'] = ""
31
+ index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
32
+ index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
33
+ index['http://www.twitter.com/matthewfellows'] = ""
34
+ index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
35
+ index['http://www.flickr.com/photos/mattfellows'] = ""
36
+ index['http://www.delicious.com/mefellows'] = ""
37
+ index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
38
+
39
+ i = Transformers::Util.apply_transformers(index, transformers)
40
+
41
+ expect(i.length).to eq 25
42
+ expect(i[4].to_s).to eq 'http://foo.com/foo'
43
+ expect(i[5].to_s).to eq 'http://foo.com/foo'
44
+ expect(i[6].to_s).to eq 'http://foo.com/foo'
45
+ expect(i[7].to_s).to eq 'http://foo.com/foo'
46
+ end
47
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-generator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - mefellows
@@ -102,6 +102,7 @@ executables:
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
+ - .gitignore
105
106
  - Gemfile
106
107
  - Gemfile.lock
107
108
  - README.md
@@ -111,12 +112,14 @@ files:
111
112
  - lib/sitemap/command.rb
112
113
  - lib/sitemap/commands/sitemap.rb
113
114
  - lib/sitemap/filters/filters.rb
115
+ - lib/sitemap/filters/transformers.rb
114
116
  - lib/sitemap/logging.rb
115
117
  - lib/sitemap/version.rb
116
118
  - sitemap-generator.gemspec
117
119
  - spec/filter_spec.rb
118
120
  - spec/generator_spec.rb
119
121
  - spec/spec_helper.rb
122
+ - spec/transform_spec.rb
120
123
  homepage: https://github.com/mefellows/sitemap-generator
121
124
  licenses:
122
125
  - MIT
@@ -140,8 +143,9 @@ rubyforge_project:
140
143
  rubygems_version: 2.0.14
141
144
  signing_key:
142
145
  specification_version: 4
143
- summary: A basic, human readable sitemap generator
146
+ summary: A basic, human readable (CSV, JSON) sitemap generator.
144
147
  test_files:
145
148
  - spec/filter_spec.rb
146
149
  - spec/generator_spec.rb
147
150
  - spec/spec_helper.rb
151
+ - spec/transform_spec.rb