sitemap-generator 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +29 -10
- data/lib/sitemap/command.rb +26 -1
- data/lib/sitemap/commands/sitemap.rb +15 -17
- data/lib/sitemap/filters/filters.rb +10 -6
- data/lib/sitemap/filters/transformers.rb +70 -0
- data/lib/sitemap/version.rb +1 -1
- data/sitemap-generator.gemspec +1 -1
- data/spec/filter_spec.rb +5 -1
- data/spec/generator_spec.rb +7 -16
- data/spec/transform_spec.rb +47 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33e7b15a7651826488cdd2e3ebdefd3e34a6e1ae
|
4
|
+
data.tar.gz: eb219514a5b0f7dd9aa1a6d45e4ab813bd9ed5a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 620b5c8bd6d5e3591e2e9230c3849a67c176897543b52ee20fbde7bc262c62880ec32b32c8e8061749d3149ea30bf2367ede90c356da329432ce4c0bc456e11a
|
7
|
+
data.tar.gz: d2439232358bbdcf186665243e6e448a47def81ab68030997089de7aeb05be990130f94e296ec60d799d3ddcfb9c9db01fd4464cca66bdb7fe9c58ef7d461409
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gem
|
data/README.md
CHANGED
@@ -3,34 +3,53 @@ A simple command-line Sitemap generator tool. Useful for quickly auditing a webs
|
|
3
3
|
|
4
4
|
## Getting started
|
5
5
|
|
6
|
+
gem install sitemap-generator
|
7
|
+
|
8
|
+
*Getting started with code*
|
9
|
+
|
10
|
+
If you want to get your hands dirty in code:
|
11
|
+
|
6
12
|
git clone https://github.com/mefellows/sitemap-generator
|
7
13
|
cd sitemap-generator
|
14
|
+
bundle install
|
15
|
+
bin/sitemap
|
16
|
+
|
17
|
+
## Examples
|
8
18
|
|
9
19
|
### Generate a standard CSV Sitemap file
|
20
|
+
|
10
21
|
The following command will generate a basic sitemap, listing all links recursively from the site, containing only URIs from the specified domain name (in this case, onegeek.com.au) and will save to a file named sitemap.csv
|
11
22
|
|
12
|
-
|
23
|
+
sitemap generate http://www.onegeek.com.au/ sitemap.csv
|
13
24
|
|
14
25
|
### Generate a standard Sitemap JSON format
|
15
26
|
|
16
|
-
|
27
|
+
This command deliberately doesn't write to file in order to allow unix-style pipelining
|
28
|
+
|
29
|
+
sitemap generate --format=json http://www.onegeek.com.au/
|
17
30
|
|
18
|
-
### Generate a Sitemap
|
31
|
+
### Generate a Sitemap 3 levels deep
|
32
|
+
|
33
|
+
sitemap generate --depth=3 http://www.onegeek.com.au/ sitemap.csv
|
34
|
+
|
35
|
+
### Generate a Sitemap containing links only on the specified URI
|
19
36
|
|
20
|
-
|
37
|
+
sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
|
21
38
|
|
22
|
-
### Generate a Sitemap
|
39
|
+
### Generate a Sitemap that contains URI fragments and query strings
|
23
40
|
|
24
|
-
|
41
|
+
By default, URI fragments like ```foo.com/#!/some-page``` and query strings like ```foo.com/?bar=baz``` are ignored - they are generally duplicates of pages already indexed, so sitemap-generator strips them off entirely. This lets them back in:
|
42
|
+
|
43
|
+
sitemap generate --query-strings --fragments http://www.onegeek.com.au/ sitemap.csv
|
25
44
|
|
26
|
-
|
27
45
|
## Getting Help
|
28
46
|
|
29
|
-
|
30
|
-
|
47
|
+
sitemap
|
48
|
+
sitemap generate --help
|
31
49
|
|
32
50
|
## Alternatives?
|
33
51
|
|
34
|
-
So of course, after spending
|
52
|
+
So of course, after spending a few hours writing this I forgot that wget can do this for you, well basically anyway:
|
35
53
|
|
36
54
|
wget -r --delete-after <todo>
|
55
|
+
|
data/lib/sitemap/command.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'sitemap/version'
|
2
2
|
require 'sitemap/logging'
|
3
3
|
require 'sitemap/commands/sitemap'
|
4
|
+
require 'sitemap/filters/transformers'
|
4
5
|
require 'clamp'
|
5
6
|
|
6
7
|
module Sitemap
|
@@ -18,6 +19,8 @@ module Sitemap
|
|
18
19
|
class SitemapCommand < AbstractCommand
|
19
20
|
option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
|
20
21
|
option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
|
22
|
+
option "--query-strings", :flag, "Allow query strings in URIs", :default => false
|
23
|
+
option "--fragments", :flag, "Allow fragments in URIs", :default => false
|
21
24
|
option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
|
22
25
|
Integer(s)
|
23
26
|
end
|
@@ -51,7 +54,29 @@ module Sitemap
|
|
51
54
|
|
52
55
|
log.info('Running sitemap generator')
|
53
56
|
generator = SitemapGenerator.new()
|
54
|
-
|
57
|
+
|
58
|
+
# Setup filters and transformers
|
59
|
+
filters = Filters::Util.get_all_filters
|
60
|
+
transformers = Transformers::Util.get_all_transformers
|
61
|
+
|
62
|
+
# If query strings enabled, remove QueryString transformer
|
63
|
+
if query_strings?
|
64
|
+
transformers = transformers.select do |t|
|
65
|
+
next true unless t.instance_of? Transformers::URIQueryStringTransformer
|
66
|
+
false
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# If fragments enabled, remove URIFragment filter
|
71
|
+
if fragments?
|
72
|
+
filters = filters.select do |t|
|
73
|
+
next true unless t.instance_of? Filters::URIFragmentFilter
|
74
|
+
false
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Create the sitemap!
|
79
|
+
generator.generate(uri, output_file, filters, transformers, format, real_depth)
|
55
80
|
end
|
56
81
|
end
|
57
82
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'sitemap/logging'
|
2
2
|
require 'sitemap/filters/filters'
|
3
|
+
require 'sitemap/filters/transformers'
|
3
4
|
require 'csv'
|
4
5
|
require 'json'
|
5
6
|
require 'nokogiri'
|
@@ -37,16 +38,16 @@ class SitemapGenerator
|
|
37
38
|
#
|
38
39
|
# Public: Create the index recursively.
|
39
40
|
#
|
40
|
-
# link
|
41
|
-
# base_uri
|
42
|
-
#
|
43
|
-
#
|
44
|
-
# link_index
|
45
|
-
# depth
|
41
|
+
# link - The URI to build the index from recursively.
|
42
|
+
# base_uri - The base URI (Host) to restrict which links are indexed
|
43
|
+
# filters - An array of Filters to be applied before indexing
|
44
|
+
# transformers - An array of Transformers to be applied before indexing
|
45
|
+
# link_index - Any index to start the build from.
|
46
|
+
# depth - The depth of recursion. 1 for no recursion, -1 for infinite, > 1 for specific depth
|
46
47
|
#
|
47
48
|
# Returns an index containing URIs as keys and an object representing the page.
|
48
49
|
#
|
49
|
-
def create_index(link, base_uri, filters, link_index = nil, depth = -1)
|
50
|
+
def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
|
50
51
|
if link_index.nil?
|
51
52
|
log.debug('Creating new Index')
|
52
53
|
link_index = Hash.new
|
@@ -56,8 +57,6 @@ class SitemapGenerator
|
|
56
57
|
return
|
57
58
|
end
|
58
59
|
|
59
|
-
### TODO: replace with generic filter method
|
60
|
-
|
61
60
|
if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
|
62
61
|
|
63
62
|
log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
|
@@ -82,14 +81,17 @@ class SitemapGenerator
|
|
82
81
|
links << l.attributes["href"].to_s
|
83
82
|
end
|
84
83
|
|
84
|
+
# Transform URLs before indexing
|
85
|
+
Transformers::Util.apply_transformers(links, transformers)
|
86
|
+
|
85
87
|
# Filter out ineligible links
|
86
|
-
|
88
|
+
Filters::Util.apply_filters(links, link_index, base_uri, filters)
|
87
89
|
|
88
90
|
links.each do |l|
|
89
91
|
l = Filters::Util.remove_fragment_from_uri(l)
|
90
92
|
if l && !l.empty?
|
91
93
|
if depth != -1
|
92
|
-
create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
|
94
|
+
create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, transformers, link_index, depth)
|
93
95
|
end
|
94
96
|
end
|
95
97
|
end
|
@@ -155,14 +157,10 @@ class SitemapGenerator
|
|
155
157
|
#
|
156
158
|
# Create the Sitemap
|
157
159
|
#
|
158
|
-
def generate(uri, output_file, format = 'csv', depth = -1)
|
160
|
+
def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
|
159
161
|
|
160
162
|
log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
|
161
|
-
|
162
|
-
# Setup filters. Ideally, have some outsider give me these
|
163
|
-
# Really, these are just options to the index
|
164
|
-
filters = Filters::Util.get_all_filters
|
165
|
-
index = create_index(uri, uri, filters, nil, depth)
|
163
|
+
index = create_index(uri, uri, filters, transformers, nil, depth)
|
166
164
|
|
167
165
|
case format
|
168
166
|
when 'json'
|
@@ -54,10 +54,10 @@ module Filters
|
|
54
54
|
# Public: Get all known filters
|
55
55
|
#
|
56
56
|
def self.get_all_filters
|
57
|
-
return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
|
57
|
+
return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new, Filters::URIFragmentFilter.new]
|
58
58
|
end
|
59
59
|
|
60
|
-
# Public: Apply URI
|
60
|
+
# Public: Apply URI Filters to a Hash.
|
61
61
|
#
|
62
62
|
# uris - Set (Array|Hash) of URIs to be filtered.
|
63
63
|
# index - Current index
|
@@ -79,7 +79,7 @@ module Filters
|
|
79
79
|
f = filters_clone.shift
|
80
80
|
uris = apply_filters(uris, index, base_uri, filters_clone)
|
81
81
|
|
82
|
-
uris = uris.select do |k
|
82
|
+
uris = uris.select do |k|
|
83
83
|
f.filter(index, k, base_uri)
|
84
84
|
end
|
85
85
|
end
|
@@ -157,8 +157,9 @@ module Filters
|
|
157
157
|
#
|
158
158
|
def filter(index, link, base_uri)
|
159
159
|
link = Filters::Util.make_URI(link)
|
160
|
-
return
|
161
|
-
|
160
|
+
return true unless (link != nil && link.fragment != nil)
|
161
|
+
log.debug("Rejecting link #{link} as it contains fragments #{link.fragment}")
|
162
|
+
false
|
162
163
|
end
|
163
164
|
end
|
164
165
|
|
@@ -174,7 +175,9 @@ module Filters
|
|
174
175
|
# Returns true if the link should be indexed, else false.
|
175
176
|
#
|
176
177
|
def filter(index, link, base_uri)
|
177
|
-
|
178
|
+
link = Filters::Util.make_URI(link)
|
179
|
+
return true unless link == nil || link.eql?('')
|
180
|
+
log.debug("Rejecting link #{link} as it is not deemed to be a valid URI")
|
178
181
|
false
|
179
182
|
end
|
180
183
|
end
|
@@ -196,6 +199,7 @@ module Filters
|
|
196
199
|
return true
|
197
200
|
end
|
198
201
|
return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
|
202
|
+
log.debug("Rejecting link #{link} as it is a static resource #{link.path}")
|
199
203
|
false
|
200
204
|
end
|
201
205
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'sitemap/logging'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
# Public: Transformers are objects that modify a provided link.
|
6
|
+
#
|
7
|
+
# For example, a Transformer might be used to strip query strings from URLs
|
8
|
+
# before indexing.
|
9
|
+
module Transformers
|
10
|
+
class Util
|
11
|
+
# Public: Apply URI transformers to a Hash.
|
12
|
+
#
|
13
|
+
# uris - Set (Array|Hash) of URIs to be transformed.
|
14
|
+
# index - Current index
|
15
|
+
# base_uri - Base URI to test against
|
16
|
+
# transformers - transformers to apply to each URI
|
17
|
+
#
|
18
|
+
# Returns the transformed URIs (an Array once any transformer has been applied)
|
19
|
+
def self.apply_transformers(uris, transformers)
|
20
|
+
|
21
|
+
# Clone transformers so we retain the 'functional' style of no side-effects
|
22
|
+
transformers_clone = transformers.clone
|
23
|
+
|
24
|
+
# Check for terminating case
|
25
|
+
if (!uris.nil? && uris.length > 0)
|
26
|
+
|
27
|
+
if !transformers_clone.nil? && transformers_clone.length > 0
|
28
|
+
|
29
|
+
# Pop a transformer and apply it recursively to the result of the next transformer
|
30
|
+
t = transformers_clone.shift
|
31
|
+
uris = apply_transformers(uris, transformers_clone)
|
32
|
+
|
33
|
+
uris = uris.map do |k,v|
|
34
|
+
t.transform(k)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
uris
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Public: Get all known transformers
|
44
|
+
#
|
45
|
+
def self.get_all_transformers
|
46
|
+
return [Transformers::URIQueryStringTransformer.new]
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
# Public: URI Query String Transformer.
|
52
|
+
#
|
53
|
+
#
|
54
|
+
class URIQueryStringTransformer
|
55
|
+
include Logging
|
56
|
+
|
57
|
+
#
|
58
|
+
# Public: Strips the query string component from a URL.
|
59
|
+
#
|
60
|
+
# Returns the link without the query string component
|
61
|
+
#
|
62
|
+
def transform(link)
|
63
|
+
link = Filters::Util.make_URI(link)
|
64
|
+
return link unless (link != nil && link.query != nil)
|
65
|
+
link.query = nil
|
66
|
+
link
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
data/lib/sitemap/version.rb
CHANGED
data/sitemap-generator.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["mefellows"]
|
10
10
|
spec.email = ["matt.fellows@onegeek.com.au"]
|
11
11
|
spec.description = "Sitemap Generator"
|
12
|
-
spec.summary = "A basic, human readable sitemap generator"
|
12
|
+
spec.summary = "A basic, human readable (CSV, JSON) sitemap generator. "
|
13
13
|
spec.homepage = "https://github.com/mefellows/sitemap-generator"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
data/spec/filter_spec.rb
CHANGED
@@ -85,6 +85,10 @@ describe Filters::ResourcesFilter do
|
|
85
85
|
index['http://foo.com/foo'] = ""
|
86
86
|
index['http://foo.com/foo.pdf'] = ""
|
87
87
|
index['http://foo.com/bar'] = ""
|
88
|
+
index['http://foo.com/bar#'] = ""
|
89
|
+
index['http://foo.com/bar/#'] = ""
|
90
|
+
index['http://foo.com/bar/#foo'] = ""
|
91
|
+
index['http://foo.com/bar/#!/hashbang/foo'] = ""
|
88
92
|
index['http://foo.com/bar.tar.gz'] = ""
|
89
93
|
index['http://bar.com/foo'] = ""
|
90
94
|
index['http://www.mootools.net/'] = ""
|
@@ -118,7 +122,7 @@ describe Filters::ResourcesFilter do
|
|
118
122
|
puts i
|
119
123
|
|
120
124
|
# Need to prevent mutation in filtering
|
121
|
-
expect(filters.length).to eq
|
125
|
+
expect(filters.length).to eq 4
|
122
126
|
|
123
127
|
expect(i.length).to eq 0
|
124
128
|
|
data/spec/generator_spec.rb
CHANGED
@@ -8,19 +8,21 @@ describe SitemapGenerator do
|
|
8
8
|
|
9
9
|
it 'Should return an index from a single page' do
|
10
10
|
generator = SitemapGenerator.new
|
11
|
-
filters =
|
11
|
+
filters = Filters::Util.get_all_filters
|
12
|
+
transformers = Transformers::Util.get_all_transformers
|
12
13
|
|
13
14
|
# onegeek.com.au source as at 23/05/2014
|
14
15
|
|
15
16
|
# Note no trailing slash -> need to find why lack of trailing slash is an issue
|
16
17
|
link = URI::parse("http://www.onegeek.com.au")
|
17
|
-
index = generator.create_index(link, link, filters, nil, 1)
|
18
|
+
index = generator.create_index(link, link, filters, transformers, nil, 1)
|
18
19
|
|
19
|
-
expect(index.length).to be 18
|
20
20
|
puts "Here's the index:"
|
21
21
|
index.each do |key, value|
|
22
22
|
puts key
|
23
23
|
end
|
24
|
+
|
25
|
+
expect(index.length).to be 18
|
24
26
|
end
|
25
27
|
|
26
28
|
|
@@ -32,7 +34,8 @@ describe SitemapGenerator do
|
|
32
34
|
|
33
35
|
# should treat trailing slashes the same as without???
|
34
36
|
|
35
|
-
|
37
|
+
|
38
|
+
# Test for blacklisted URIs
|
36
39
|
|
37
40
|
|
38
41
|
# Should not index files (PDFs, images etc.)
|
@@ -41,8 +44,6 @@ describe SitemapGenerator do
|
|
41
44
|
|
42
45
|
end
|
43
46
|
|
44
|
-
|
45
|
-
|
46
47
|
# it 'Should return an index from an entire site' do
|
47
48
|
# generator = SitemapGenerator.new
|
48
49
|
|
@@ -55,14 +56,4 @@ describe SitemapGenerator do
|
|
55
56
|
# end
|
56
57
|
# end
|
57
58
|
|
58
|
-
# it 'Let me hack stuff' do
|
59
|
-
# generator = SitemapGenerator.new
|
60
|
-
|
61
|
-
# print generator.fetch('http://www.webcentral.com.au/order')
|
62
|
-
# doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards & Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> — <span class="title">Professional Web Developer</span></h1> <p>I\'m a 
University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url = 
nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks & Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> <h5>8 
ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"<").replace(/>/g,">");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
|
63
|
-
# # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
|
64
|
-
# expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
|
65
|
-
# end
|
66
|
-
|
67
|
-
|
68
59
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sitemap/commands/sitemap'
|
3
|
+
require 'sitemap/filters/transformers'
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
url = URI::parse('http://foo.com/foo/bar')
|
7
|
+
|
8
|
+
describe Transformers::Util do
|
9
|
+
it 'Should remove #fragments URIs' do
|
10
|
+
transformers = Transformers::Util.get_all_transformers
|
11
|
+
|
12
|
+
index = Hash.new
|
13
|
+
index['http://foo.com'] = ""
|
14
|
+
index['http://foo.com/#fragment'] = ""
|
15
|
+
index['http://foo.com/#!/hash-bang/fragment'] = ""
|
16
|
+
index['http://foo.com/#'] = ""
|
17
|
+
index['http://foo.com/foo'] = ""
|
18
|
+
index['http://foo.com/foo?'] = ""
|
19
|
+
index['http://foo.com/foo?foo'] = ""
|
20
|
+
index['http://foo.com/foo?foo=bar'] = ""
|
21
|
+
index['http://foo.com/foo.pdf'] = ""
|
22
|
+
index['http://foo.com/bar'] = ""
|
23
|
+
index['http://foo.com/bar.tar.gz'] = ""
|
24
|
+
index['http://bar.com/foo'] = ""
|
25
|
+
index['http://www.mootools.net/'] = ""
|
26
|
+
index['http://www.wordpress.org'] = ""
|
27
|
+
index['http://www.blueprintcss.com'] = ""
|
28
|
+
index['http://www.php.net'] = ""
|
29
|
+
index['/contact'] = ""
|
30
|
+
index['http://www.onegeek.com.au'] = ""
|
31
|
+
index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
|
32
|
+
index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
|
33
|
+
index['http://www.twitter.com/matthewfellows'] = ""
|
34
|
+
index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
|
35
|
+
index['http://www.flickr.com/photos/mattfellows'] = ""
|
36
|
+
index['http://www.delicious.com/mefellows'] = ""
|
37
|
+
index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
|
38
|
+
|
39
|
+
i = Transformers::Util.apply_transformers(index, transformers)
|
40
|
+
|
41
|
+
expect(i.length).to eq 25
|
42
|
+
expect(i[4].to_s).to eq 'http://foo.com/foo'
|
43
|
+
expect(i[5].to_s).to eq 'http://foo.com/foo'
|
44
|
+
expect(i[6].to_s).to eq 'http://foo.com/foo'
|
45
|
+
expect(i[7].to_s).to eq 'http://foo.com/foo'
|
46
|
+
end
|
47
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap-generator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- mefellows
|
@@ -102,6 +102,7 @@ executables:
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
+
- .gitignore
|
105
106
|
- Gemfile
|
106
107
|
- Gemfile.lock
|
107
108
|
- README.md
|
@@ -111,12 +112,14 @@ files:
|
|
111
112
|
- lib/sitemap/command.rb
|
112
113
|
- lib/sitemap/commands/sitemap.rb
|
113
114
|
- lib/sitemap/filters/filters.rb
|
115
|
+
- lib/sitemap/filters/transformers.rb
|
114
116
|
- lib/sitemap/logging.rb
|
115
117
|
- lib/sitemap/version.rb
|
116
118
|
- sitemap-generator.gemspec
|
117
119
|
- spec/filter_spec.rb
|
118
120
|
- spec/generator_spec.rb
|
119
121
|
- spec/spec_helper.rb
|
122
|
+
- spec/transform_spec.rb
|
120
123
|
homepage: https://github.com/mefellows/sitemap-generator
|
121
124
|
licenses:
|
122
125
|
- MIT
|
@@ -140,8 +143,9 @@ rubyforge_project:
|
|
140
143
|
rubygems_version: 2.0.14
|
141
144
|
signing_key:
|
142
145
|
specification_version: 4
|
143
|
-
summary: A basic, human readable sitemap generator
|
146
|
+
summary: A basic, human readable (CSV, JSON) sitemap generator.
|
144
147
|
test_files:
|
145
148
|
- spec/filter_spec.rb
|
146
149
|
- spec/generator_spec.rb
|
147
150
|
- spec/spec_helper.rb
|
151
|
+
- spec/transform_spec.rb
|