sitemap-generator 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
- data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
+ metadata.gz: 33e7b15a7651826488cdd2e3ebdefd3e34a6e1ae
+ data.tar.gz: eb219514a5b0f7dd9aa1a6d45e4ab813bd9ed5a4
  SHA512:
- metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
- data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390
+ metadata.gz: 620b5c8bd6d5e3591e2e9230c3849a67c176897543b52ee20fbde7bc262c62880ec32b32c8e8061749d3149ea30bf2367ede90c356da329432ce4c0bc456e11a
+ data.tar.gz: d2439232358bbdcf186665243e6e448a47def81ab68030997089de7aeb05be990130f94e296ec60d799d3ddcfb9c9db01fd4464cca66bdb7fe9c58ef7d461409
data/.gitignore ADDED
@@ -0,0 +1 @@
+ *.gem
data/README.md CHANGED
@@ -3,34 +3,53 @@ A simple command-line Sitemap generator tool. Useful for quickly auditing a webs
 
  ## Getting started
 
+ gem install sitemap-generator
+
+ *Getting started with code*
+
+ If you want to get your hands dirty in code:
+
  git clone https://github.com/mefellows/sitemap-generator
  cd sitemap-generator
+ bundle install
+ bin/sitemap
+
+ ## Examples
 
  ### Generate a standard CSV Sitemap file
+
  The following command will generate a basic sitemap, listing all links recursively from the site, containing only URIs from the specified domain name (in this case, onegeek.com.au) and will save to a file named sitemap.csv
 
- bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
+ sitemap generate http://www.onegeek.com.au/ sitemap.csv
 
  ### Generate a standard Sitemap JSON format
 
- bin/sitemap generate --format=json http://www.onegeek.com.au/ sitemap.json
+ This command deliberately doesn't write to file in order to allow unix-style pipelining
+
+ sitemap generate --format=json http://www.onegeek.com.au/
 
- ### Generate a Sitemap restricting to the URI provided
+ ### Generate a Sitemap 3 levels deep
+
+ sitemap generate --depth=3 http://www.onegeek.com.au/ sitemap.csv
+
+ ### Generate a Sitemap containing links only on the specified URI
 
- bin/sitemap generate --recursive=false http://www.onegeek.com.au/ sitemap.csv
+ sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
 
- ### Generate a Sitemap restricting indexed URLs to only those starting with '/journal'
+ ### Generate a Sitemap that contains URI fragments and query strings
 
- bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
+ By default, URI fragments like ```foo.com/#!/some-page``` and query strings like ```foo.com/?bar=baz``` are ignored - they generally duplicate pages that are already indexed, so sitemap-generator strips them off entirely. This lets them back in:
+
+ sitemap generate --query-strings --fragments http://www.onegeek.com.au/ sitemap.csv
 
-
  ## Getting Help
 
- bin/sitemap
- bin/sitemap generate --help
+ sitemap
+ sitemap generate --help
 
  ## Alternatives?
 
- So of course, after spending an hour writing this I forgot that wget can do this for you, well basically anyway:
+ So of course, after spending a few hours writing this I forgot that wget can do this for you, well basically anyway:
 
  wget -r --delete-after <todo>
+
@@ -1,6 +1,7 @@
  require 'sitemap/version'
  require 'sitemap/logging'
  require 'sitemap/commands/sitemap'
+ require 'sitemap/filters/transformers'
  require 'clamp'
 
  module Sitemap
@@ -18,6 +19,8 @@ module Sitemap
  class SitemapCommand < AbstractCommand
  option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
  option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
+ option "--query-strings", :flag, "Allow query strings in URIs", :default => false
+ option "--fragments", :flag, "Allow fragments in URIs", :default => false
  option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
  Integer(s)
  end
@@ -51,7 +54,29 @@ module Sitemap
 
  log.info('Running sitemap generator')
  generator = SitemapGenerator.new()
- generator.generate(uri, output_file, format, real_depth)
+
+ # Setup filters and transformers
+ filters = Filters::Util.get_all_filters
+ transformers = Transformers::Util.get_all_transformers
+
+ # If query strings enabled, remove QueryString transformer
+ if query_strings?
+ transformers = transformers.select do |t|
+ next true unless t.instance_of? Transformers::URIQueryStringTransformer
+ false
+ end
+ end
+
+ # If fragments enabled, remove the URIFragmentFilter
+ if fragments?
+ filters = filters.select do |t|
+ next true unless t.instance_of? Filters::URIFragmentFilter
+ false
+ end
+ end
+
+ # Create the sitemap!
+ generator.generate(uri, output_file, filters, transformers, format, real_depth)
  end
  end
 
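The two new flags above are opt-in: every filter and transformer is applied by default, and passing --query-strings or --fragments removes the corresponding transformer or filter before generator.generate is called. A minimal sketch of the same opt-out using Array#reject instead of the select/next pattern; class names are taken from this diff, and the query_strings?/fragments? predicates are the Clamp-generated flag readers, so this only makes sense inside SitemapCommand#execute:

    # Illustrative rewrite of the select blocks above (not part of the gem)
    filters      = Filters::Util.get_all_filters
    transformers = Transformers::Util.get_all_transformers

    # --query-strings keeps query strings by dropping the transformer that strips them
    transformers = transformers.reject { |t| t.instance_of?(Transformers::URIQueryStringTransformer) } if query_strings?

    # --fragments keeps fragments by dropping the filter that rejects fragment URIs
    filters = filters.reject { |f| f.instance_of?(Filters::URIFragmentFilter) } if fragments?

    generator.generate(uri, output_file, filters, transformers, format, real_depth)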
@@ -1,5 +1,6 @@
  require 'sitemap/logging'
  require 'sitemap/filters/filters'
+ require 'sitemap/filters/transformers'
  require 'csv'
  require 'json'
  require 'nokogiri'
@@ -37,16 +38,16 @@ class SitemapGenerator
  #
  # Public: Create the index recursively.
  #
- # link - The URI to build the index from recursively.
- # base_uri - The base URI (Host) to restrict which links are indexed
- # restrict - An array of URIs used to restrict which URIs are indexed.
- # all indexed URIs will include one of these paths.
- # link_index - Any index to start the build from.
- # depth - The depth of recursion. 1 for no recursion, -1 for infinite. > 1 for specific depth
+ # link - The URI to build the index from recursively.
+ # base_uri - The base URI (Host) to restrict which links are indexed
+ # filters - An array of Filters to be applied before indexing
+ # transformers - An array of Transformers to be applied before indexing
+ # link_index - Any index to start the build from.
+ # depth - The depth of recursion. 1 for no recursion, -1 for infinite, > 1 for specific depth
  #
  # Returns an index containing URIs as keys and an object representing the page.
  #
- def create_index(link, base_uri, filters, link_index = nil, depth = -1)
+ def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
  if link_index.nil?
  log.debug('Creating new Index')
  link_index = Hash.new
@@ -56,8 +57,6 @@ class SitemapGenerator
  return
  end
 
- ### TODO: replace with generic filter method
-
  if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
 
  log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
@@ -82,14 +81,17 @@ class SitemapGenerator
  links << l.attributes["href"].to_s
  end
 
+ # Transform URLs before indexing
+ Transformers::Util.apply_transformers(links, transformers)
+
  # Filter out in-eligible links
- a = Filters::Util.apply_filters(links, link_index, base_uri, filters)
+ Filters::Util.apply_filters(links, link_index, base_uri, filters)
 
  links.each do |l|
  l = Filters::Util.remove_fragment_from_uri(l)
  if l && !l.empty?
  if depth != -1
- create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
+ create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, transformers, link_index, depth)
  end
  end
  end
@@ -155,14 +157,10 @@ class SitemapGenerator
  #
  # Create the Sitemap
  #
- def generate(uri, output_file, format = 'csv', depth = -1)
+ def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
 
  log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
-
- # Setup filters. Ideally, have some outsider give me these
- # Really, these are just options to the index
- filters = Filters::Util.get_all_filters
- index = create_index(uri, uri, filters, nil, depth)
+ index = create_index(uri, uri, filters, transformers, nil, depth)
 
  case format
  when 'json'
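With the new signature, callers construct the filter and transformer chains themselves and pass them in. A hypothetical call against the API above; the require paths are assumed from the file list in the metadata below, and the URI and output file name are examples only:

    require 'uri'
    require 'sitemap/commands/sitemap'       # assumed to define SitemapGenerator
    require 'sitemap/filters/filters'
    require 'sitemap/filters/transformers'

    generator    = SitemapGenerator.new
    filters      = Filters::Util.get_all_filters
    transformers = Transformers::Util.get_all_transformers

    # uri, output file, filters, transformers, format ('csv' or 'json'), depth (-1 = unlimited)
    generator.generate(URI::parse('http://www.onegeek.com.au/'), 'sitemap.csv',
                       filters, transformers, 'csv', -1)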
data/lib/sitemap/filters/filters.rb CHANGED
@@ -54,10 +54,10 @@ module Filters
  # Public: Get all known filters
  #
  def self.get_all_filters
- return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
+ return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new, Filters::URIFragmentFilter.new]
  end
 
- # Public: Apply URI filters to a Hash.
+ # Public: Apply URI Filters to a Hash.
  #
  # uris - Set (Array|Hash) of URIs to be filtered.
  # index - Current index
@@ -79,7 +79,7 @@ module Filters
  f = filters_clone.shift
  uris = apply_filters(uris, index, base_uri, filters_clone)
 
- uris = uris.select do |k,v|
+ uris = uris.select do |k|
  f.filter(index, k, base_uri)
  end
  end
@@ -157,8 +157,9 @@ module Filters
  #
  def filter(index, link, base_uri)
  link = Filters::Util.make_URI(link)
- return false unless (link.nil? || !link.fragment.nil?)
- true
+ return true unless (link != nil && link.fragment != nil)
+ log.debug("Rejecting link #{link} as it contains fragments #{link.fragment}")
+ false
  end
  end
 
@@ -174,7 +175,9 @@ module Filters
  # Returns the link if it should be indexed else nil.
  #
  def filter(index, link, base_uri)
- return true unless link.nil? || link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
+ link = Filters::Util.make_URI(link)
+ return true unless link == nil || link.eql?('')
+ log.debug("Rejecting link #{link} as it is not deemed to be a valid URI")
  false
  end
  end
@@ -196,6 +199,7 @@ module Filters
  return true
  end
  return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
+ log.debug("Rejecting link #{link} as it is a static resource #{link.path}")
  false
  end
  end
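The net effect of the filter changes is that URIs carrying a fragment are now rejected outright by the new URIFragmentFilter (unless --fragments re-admits them). An illustrative check, assuming Filters::Util.make_URI parses plain strings into URI objects; the empty hash stands in for the index, which this particular filter does not consult:

    require 'sitemap/filters/filters'

    f = Filters::URIFragmentFilter.new
    f.filter({}, 'http://foo.com/#!/hashbang/foo', 'http://foo.com/')  # => false (fragment present, rejected)
    f.filter({}, 'http://foo.com/foo',             'http://foo.com/')  # => true  (no fragment, kept)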
data/lib/sitemap/filters/transformers.rb ADDED
@@ -0,0 +1,70 @@
+ require 'sitemap/logging'
+ require 'open-uri'
+ require 'net/http'
+
+ # Public: Transformers are objects that modify a provided link.
+ #
+ # For example, a Transformer might be used to strip out query string URLS
+ # before indexing.
+ module Transformers
+ class Util
+ # Public: Apply URI transformers to a Hash.
+ #
+ # uris - Set (Array|Hash) of URIs to be filtered.
+ # index - Current index
+ # base_uri - Base URI to test against
+ # transformers - transformers to reduce set of uris
+ #
+ # Returns a filtered uris Hash
+ def self.apply_transformers(uris, transformers)
+
+ # Clone transformers so we retain the 'functional' style of no side-effects
+ transformers_clone = transformers.clone
+
+ # Check for terminating case
+ if (!uris.nil? && uris.length > 0)
+
+ if !transformers_clone.nil? && transformers_clone.length > 0
+
+ # Pop a transformer and apply it recursively to the result of the next transformer
+ t = transformers_clone.shift
+ uris = apply_transformers(uris, transformers_clone)
+
+ uris = uris.map do |k,v|
+ t.transform(k)
+ end
+ end
+ end
+
+ uris
+ end
+
+ #
+ # Public: Get all known transformers
+ #
+ def self.get_all_transformers
+ return [Transformers::URIQueryStringTransformer.new]
+ end
+
+ end
+
+ # Public: URI Query String Transformer.
+ #
+ #
+ class URIQueryStringTransformer
+ include Logging
+
+ #
+ # Public: Filters out URLs with query string resources.
+ #
+ # Returns the link without the query string component
+ #
+ def transform(link)
+ link = Filters::Util.make_URI(link)
+ return link unless (link != nil && link.query != nil)
+ link.query = nil
+ link
+ end
+ end
+
+ end
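A short usage sketch of the new transformer, mirroring the expectations in spec/transform_spec.rb further down (illustrative only; transformers.rb relies on Filters::Util.make_URI, so filters.rb must be loaded as well):

    require 'sitemap/filters/filters'        # provides Filters::Util.make_URI, used by the transformer
    require 'sitemap/filters/transformers'

    t = Transformers::URIQueryStringTransformer.new
    t.transform('http://foo.com/foo?bar=baz').to_s  # => "http://foo.com/foo"
    t.transform('http://foo.com/foo').to_s          # => "http://foo.com/foo" (no query string, returned as-is)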
data/lib/sitemap/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Sitemap
- VERSION = "0.0.1"
+ VERSION = "0.0.2"
  end
data/sitemap-generator.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
  spec.authors = ["mefellows"]
  spec.email = ["matt.fellows@onegeek.com.au"]
  spec.description = "Sitemap Generator"
- spec.summary = "A basic, human readable sitemap generator"
+ spec.summary = "A basic, human readable (CSV, JSON) sitemap generator. "
  spec.homepage = "https://github.com/mefellows/sitemap-generator"
  spec.license = "MIT"
 
data/spec/filter_spec.rb CHANGED
@@ -85,6 +85,10 @@ describe Filters::ResourcesFilter do
  index['http://foo.com/foo'] = ""
  index['http://foo.com/foo.pdf'] = ""
  index['http://foo.com/bar'] = ""
+ index['http://foo.com/bar#'] = ""
+ index['http://foo.com/bar/#'] = ""
+ index['http://foo.com/bar/#foo'] = ""
+ index['http://foo.com/bar/#!/hashbang/foo'] = ""
  index['http://foo.com/bar.tar.gz'] = ""
  index['http://bar.com/foo'] = ""
  index['http://www.mootools.net/'] = ""
@@ -118,7 +122,7 @@ describe Filters::ResourcesFilter do
  puts i
 
  # Need to prevent mutation in filtering
- expect(filters.length).to eq 3
+ expect(filters.length).to eq 4
  expect(i.length).to eq 0
 
 
data/spec/generator_spec.rb CHANGED
@@ -8,19 +8,21 @@ describe SitemapGenerator do
 
  it 'Should return an index from a single page' do
  generator = SitemapGenerator.new
- filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
+ filters = Filters::Util.get_all_filters
+ transformers = Transformers::Util.get_all_transformers
 
  # onegeek.com.au source as at 23/05/2014
 
  # Note no trailing slash -> need to find why lack of trailing slash is an issue
  link = URI::parse("http://www.onegeek.com.au")
- index = generator.create_index(link, link, filters, nil, 1)
+ index = generator.create_index(link, link, filters, transformers, nil, 1)
 
- expect(index.length).to be 18
  puts "Here's the index:"
  index.each do |key, value|
  puts key
  end
+
+ expect(index.length).to be 18
  end
 
 
@@ -32,7 +34,8 @@ describe SitemapGenerator do
 
  # should treat trailing slashes the same as without???
 
- # Test for blacklisted objects
+
+ # Test for blacklisted URIs
 
 
  # Should not index files (PDFs, images etc.)
@@ -41,8 +44,6 @@ describe SitemapGenerator do
 
  end
 
-
-
  # it 'Should return an index from an entire site' do
  # generator = SitemapGenerator.new
 
@@ -55,14 +56,4 @@ describe SitemapGenerator do
55
56
  # end
56
57
  # end
57
58
 
58
- # it 'Let me hack stuff' do
59
- # generator = SitemapGenerator.new
60
-
61
- # print generator.fetch('http://www.webcentral.com.au/order')
62
- # doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give 
back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> <p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url = nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. 
Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> <h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
63
- # # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
64
- # expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
65
- # end
66
-
67
-
68
59
  end
data/spec/transform_spec.rb ADDED
@@ -0,0 +1,47 @@
+ require 'rspec'
+ require 'sitemap/commands/sitemap'
+ require 'sitemap/filters/transformers'
+ require 'spec_helper'
+
+ url = URI::parse('http://foo.com/foo/bar')
+
+ describe Transformers::Util do
+ it 'Should remove #fragments URIs' do
+ transformers = Transformers::Util.get_all_transformers
+
+ index = Hash.new
+ index['http://foo.com'] = ""
+ index['http://foo.com/#fragment'] = ""
+ index['http://foo.com/#!/hash-bang/fragment'] = ""
+ index['http://foo.com/#'] = ""
+ index['http://foo.com/foo'] = ""
+ index['http://foo.com/foo?'] = ""
+ index['http://foo.com/foo?foo'] = ""
+ index['http://foo.com/foo?foo=bar'] = ""
+ index['http://foo.com/foo.pdf'] = ""
+ index['http://foo.com/bar'] = ""
+ index['http://foo.com/bar.tar.gz'] = ""
+ index['http://bar.com/foo'] = ""
+ index['http://www.mootools.net/'] = ""
+ index['http://www.wordpress.org'] = ""
+ index['http://www.blueprintcss.com'] = ""
+ index['http://www.php.net'] = ""
+ index['/contact'] = ""
+ index['http://www.onegeek.com.au'] = ""
+ index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
+ index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
+ index['http://www.twitter.com/matthewfellows'] = ""
+ index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
+ index['http://www.flickr.com/photos/mattfellows'] = ""
+ index['http://www.delicious.com/mefellows'] = ""
+ index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
+
+ i = Transformers::Util.apply_transformers(index, transformers)
+
+ expect(i.length).to eq 25
+ expect(i[4].to_s).to eq 'http://foo.com/foo'
+ expect(i[5].to_s).to eq 'http://foo.com/foo'
+ expect(i[6].to_s).to eq 'http://foo.com/foo'
+ expect(i[7].to_s).to eq 'http://foo.com/foo'
+ end
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: sitemap-generator
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.0.2
  platform: ruby
  authors:
  - mefellows
@@ -102,6 +102,7 @@ executables:
  extensions: []
  extra_rdoc_files: []
  files:
+ - .gitignore
  - Gemfile
  - Gemfile.lock
  - README.md
@@ -111,12 +112,14 @@ files:
  - lib/sitemap/command.rb
  - lib/sitemap/commands/sitemap.rb
  - lib/sitemap/filters/filters.rb
+ - lib/sitemap/filters/transformers.rb
  - lib/sitemap/logging.rb
  - lib/sitemap/version.rb
  - sitemap-generator.gemspec
  - spec/filter_spec.rb
  - spec/generator_spec.rb
  - spec/spec_helper.rb
+ - spec/transform_spec.rb
  homepage: https://github.com/mefellows/sitemap-generator
  licenses:
  - MIT
@@ -140,8 +143,9 @@ rubyforge_project:
  rubygems_version: 2.0.14
  signing_key:
  specification_version: 4
- summary: A basic, human readable sitemap generator
+ summary: A basic, human readable (CSV, JSON) sitemap generator.
  test_files:
  - spec/filter_spec.rb
  - spec/generator_spec.rb
  - spec/spec_helper.rb
+ - spec/transform_spec.rb