sitemap-generator 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +29 -10
- data/lib/sitemap/command.rb +26 -1
- data/lib/sitemap/commands/sitemap.rb +15 -17
- data/lib/sitemap/filters/filters.rb +10 -6
- data/lib/sitemap/filters/transformers.rb +70 -0
- data/lib/sitemap/version.rb +1 -1
- data/sitemap-generator.gemspec +1 -1
- data/spec/filter_spec.rb +5 -1
- data/spec/generator_spec.rb +7 -16
- data/spec/transform_spec.rb +47 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33e7b15a7651826488cdd2e3ebdefd3e34a6e1ae
|
4
|
+
data.tar.gz: eb219514a5b0f7dd9aa1a6d45e4ab813bd9ed5a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 620b5c8bd6d5e3591e2e9230c3849a67c176897543b52ee20fbde7bc262c62880ec32b32c8e8061749d3149ea30bf2367ede90c356da329432ce4c0bc456e11a
|
7
|
+
data.tar.gz: d2439232358bbdcf186665243e6e448a47def81ab68030997089de7aeb05be990130f94e296ec60d799d3ddcfb9c9db01fd4464cca66bdb7fe9c58ef7d461409
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gem
|
data/README.md
CHANGED
@@ -3,34 +3,53 @@ A simple command-line Sitemap generator tool. Useful for quickly auditing a webs
|
|
3
3
|
|
4
4
|
## Getting started
|
5
5
|
|
6
|
+
gem install sitemap-generator
|
7
|
+
|
8
|
+
*Getting started with code*
|
9
|
+
|
10
|
+
If you want to get your hands dirty in code:
|
11
|
+
|
6
12
|
git clone https://github.com/mefellows/sitemap-generator
|
7
13
|
cd sitemap-generator
|
14
|
+
bundle install
|
15
|
+
bin/sitemap
|
16
|
+
|
17
|
+
## Examples
|
8
18
|
|
9
19
|
### Generate a standard CSV Sitemap file
|
20
|
+
|
10
21
|
The following command will generate a basic sitemap, listing all links recursively from the site, containing only URIs from the specified domain name (in this case, onegeek.com.au) and will save to a file named sitemap.csv
|
11
22
|
|
12
|
-
|
23
|
+
sitemap generate http://www.onegeek.com.au/ sitemap.csv
|
13
24
|
|
14
25
|
### Generate a standard Sitemap JSON format
|
15
26
|
|
16
|
-
|
27
|
+
This command deliberately doesn't write to file in order to allow unix-style pipelining
|
28
|
+
|
29
|
+
sitemap generate --format=json http://www.onegeek.com.au/
|
17
30
|
|
18
|
-
### Generate a Sitemap
|
31
|
+
### Generate a Sitemap 3 levels deep
|
32
|
+
|
33
|
+
sitemap generate --depth=3 http://www.onegeek.com.au/ sitemap.csv
|
34
|
+
|
35
|
+
### Generate a Sitemap containing links only on the specified URI
|
19
36
|
|
20
|
-
|
37
|
+
sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
|
21
38
|
|
22
|
-
### Generate a Sitemap
|
39
|
+
### Generate a Sitemap that contains URI fragments and query strings
|
23
40
|
|
24
|
-
|
41
|
+
By default, URI fragments like ```foo.com/#!/some-page``` and query strings like ```foo.com/?bar=baz``` are ignored - they generally just produce duplicate entries, so sitemap-generator strips them off entirely. These flags let them back in:
|
42
|
+
|
43
|
+
sitemap generate --query-strings --fragments http://www.onegeek.com.au/ sitemap.csv
|
25
44
|
|
26
|
-
|
27
45
|
## Getting Help
|
28
46
|
|
29
|
-
|
30
|
-
|
47
|
+
sitemap
|
48
|
+
sitemap generate --help
|
31
49
|
|
32
50
|
## Alternatives?
|
33
51
|
|
34
|
-
So of course, after spending
|
52
|
+
So of course, after spending a few hours writing this I forgot that wget can do this for you, well basically anyway:
|
35
53
|
|
36
54
|
wget -r --delete-after <todo>
|
55
|
+
|
data/lib/sitemap/command.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'sitemap/version'
|
2
2
|
require 'sitemap/logging'
|
3
3
|
require 'sitemap/commands/sitemap'
|
4
|
+
require 'sitemap/filters/transformers'
|
4
5
|
require 'clamp'
|
5
6
|
|
6
7
|
module Sitemap
|
@@ -18,6 +19,8 @@ module Sitemap
|
|
18
19
|
class SitemapCommand < AbstractCommand
|
19
20
|
option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
|
20
21
|
option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
|
22
|
+
option "--query-strings", :flag, "Allow query strings in URIs", :default => false
|
23
|
+
option "--fragments", :flag, "Allow fragments in URIs", :default => false
|
21
24
|
option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
|
22
25
|
Integer(s)
|
23
26
|
end
|
@@ -51,7 +54,29 @@ module Sitemap
|
|
51
54
|
|
52
55
|
log.info('Running sitemap generator')
|
53
56
|
generator = SitemapGenerator.new()
|
54
|
-
|
57
|
+
|
58
|
+
# Setup filters and transformers
|
59
|
+
filters = Filters::Util.get_all_filters
|
60
|
+
transformers = Transformers::Util.get_all_transformers
|
61
|
+
|
62
|
+
# If query strings enabled, remove QueryString transformer
|
63
|
+
if query_strings?
|
64
|
+
transformers = transformers.select do |t|
|
65
|
+
next true unless t.instance_of? Transformers::URIQueryStringTransformer
|
66
|
+
false
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
# If fragments enabled, remove the URIFragment filter
|
71
|
+
if fragments?
|
72
|
+
filters = filters.select do |t|
|
73
|
+
next true unless t.instance_of? Filters::URIFragmentFilter
|
74
|
+
false
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Create the sitemap!
|
79
|
+
generator.generate(uri, output_file, filters, transformers, format, real_depth)
|
55
80
|
end
|
56
81
|
end
|
57
82
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'sitemap/logging'
|
2
2
|
require 'sitemap/filters/filters'
|
3
|
+
require 'sitemap/filters/transformers'
|
3
4
|
require 'csv'
|
4
5
|
require 'json'
|
5
6
|
require 'nokogiri'
|
@@ -37,16 +38,16 @@ class SitemapGenerator
|
|
37
38
|
#
|
38
39
|
# Public: Create the index recursively.
|
39
40
|
#
|
40
|
-
# link
|
41
|
-
# base_uri
|
42
|
-
#
|
43
|
-
#
|
44
|
-
# link_index
|
45
|
-
# depth
|
41
|
+
# link - The URI to build the index from recursively.
|
42
|
+
# base_uri - The base URI (Host) to restrict which links are indexed
|
43
|
+
# filters - An array of Filters to be applied before indexing
|
44
|
+
# transformers - An array of Transformers to be applied before indexing
|
45
|
+
# link_index - Any index to start the build from.
|
46
|
+
# depth - The depth of recursion. 1 for no recursion, -1 for infinite, > 1 for specific depth
|
46
47
|
#
|
47
48
|
# Returns an index containing URIs as keys and an object representing the page.
|
48
49
|
#
|
49
|
-
def create_index(link, base_uri, filters, link_index = nil, depth = -1)
|
50
|
+
def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
|
50
51
|
if link_index.nil?
|
51
52
|
log.debug('Creating new Index')
|
52
53
|
link_index = Hash.new
|
@@ -56,8 +57,6 @@ class SitemapGenerator
|
|
56
57
|
return
|
57
58
|
end
|
58
59
|
|
59
|
-
### TODO: replace with generic filter method
|
60
|
-
|
61
60
|
if (Filters::Util.apply_filters([link], link_index, base_uri, filters).length > 0)
|
62
61
|
|
63
62
|
log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")
|
@@ -82,14 +81,17 @@ class SitemapGenerator
|
|
82
81
|
links << l.attributes["href"].to_s
|
83
82
|
end
|
84
83
|
|
84
|
+
# Transform URLs before indexing
|
85
|
+
Transformers::Util.apply_transformers(links, transformers)
|
86
|
+
|
85
87
|
# Filter out ineligible links
|
86
|
-
|
88
|
+
Filters::Util.apply_filters(links, link_index, base_uri, filters)
|
87
89
|
|
88
90
|
links.each do |l|
|
89
91
|
l = Filters::Util.remove_fragment_from_uri(l)
|
90
92
|
if l && !l.empty?
|
91
93
|
if depth != -1
|
92
|
-
create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, link_index, depth)
|
94
|
+
create_index(Filters::Util.create_absolute_uri(l, base_uri), base_uri, filters, transformers, link_index, depth)
|
93
95
|
end
|
94
96
|
end
|
95
97
|
end
|
@@ -155,14 +157,10 @@ class SitemapGenerator
|
|
155
157
|
#
|
156
158
|
# Create the Sitemap
|
157
159
|
#
|
158
|
-
def generate(uri, output_file, format = 'csv', depth = -1)
|
160
|
+
def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
|
159
161
|
|
160
162
|
log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")
|
161
|
-
|
162
|
-
# Setup filters. Ideally, have some outsider give me these
|
163
|
-
# Really, these are just options to the index
|
164
|
-
filters = Filters::Util.get_all_filters
|
165
|
-
index = create_index(uri, uri, filters, nil, depth)
|
163
|
+
index = create_index(uri, uri, filters, transformers, nil, depth)
|
166
164
|
|
167
165
|
case format
|
168
166
|
when 'json'
|
@@ -54,10 +54,10 @@ module Filters
|
|
54
54
|
# Public: Get all known filters
|
55
55
|
#
|
56
56
|
def self.get_all_filters
|
57
|
-
return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
|
57
|
+
return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new, Filters::URIFragmentFilter.new]
|
58
58
|
end
|
59
59
|
|
60
|
-
# Public: Apply URI
|
60
|
+
# Public: Apply URI Filters to a Hash.
|
61
61
|
#
|
62
62
|
# uris - Set (Array|Hash) of URIs to be filtered.
|
63
63
|
# index - Current index
|
@@ -79,7 +79,7 @@ module Filters
|
|
79
79
|
f = filters_clone.shift
|
80
80
|
uris = apply_filters(uris, index, base_uri, filters_clone)
|
81
81
|
|
82
|
-
uris = uris.select do |k
|
82
|
+
uris = uris.select do |k|
|
83
83
|
f.filter(index, k, base_uri)
|
84
84
|
end
|
85
85
|
end
|
@@ -157,8 +157,9 @@ module Filters
|
|
157
157
|
#
|
158
158
|
def filter(index, link, base_uri)
|
159
159
|
link = Filters::Util.make_URI(link)
|
160
|
-
return
|
161
|
-
|
160
|
+
return true unless (link != nil && link.fragment != nil)
|
161
|
+
log.debug("Rejecting link #{link} as it contains fragments #{link.fragment}")
|
162
|
+
false
|
162
163
|
end
|
163
164
|
end
|
164
165
|
|
@@ -174,7 +175,9 @@ module Filters
|
|
174
175
|
# Returns true if the link should be indexed, else false.
|
175
176
|
#
|
176
177
|
def filter(index, link, base_uri)
|
177
|
-
|
178
|
+
link = Filters::Util.make_URI(link)
|
179
|
+
return true unless link == nil || link.eql?('')
|
180
|
+
log.debug("Rejecting link #{link} as it is not deemed to be a valid URI")
|
178
181
|
false
|
179
182
|
end
|
180
183
|
end
|
@@ -196,6 +199,7 @@ module Filters
|
|
196
199
|
return true
|
197
200
|
end
|
198
201
|
return true unless link.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
|
202
|
+
log.debug("Rejecting link #{link} as it is a static resource #{link.path}")
|
199
203
|
false
|
200
204
|
end
|
201
205
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'sitemap/logging'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
# Public: Transformers are objects that modify a provided link.
|
6
|
+
#
|
7
|
+
# For example, a Transformer might be used to strip out query string URLS
|
8
|
+
# before indexing.
|
9
|
+
module Transformers
|
10
|
+
class Util
|
11
|
+
# Public: Apply URI transformers to a Hash.
|
12
|
+
#
|
13
|
+
# uris - Set (Array|Hash) of URIs to be transformed.
|
14
|
+
# index - Current index
|
15
|
+
# base_uri - Base URI to test against
|
16
|
+
# transformers - transformers to reduce set of uris
|
17
|
+
#
|
18
|
+
# Returns the transformed URIs (an Array once any transformer has been applied; otherwise the input unchanged)
|
19
|
+
def self.apply_transformers(uris, transformers)
|
20
|
+
|
21
|
+
# Clone transformers so we retain the 'functional' style of no side-effects
|
22
|
+
transformers_clone = transformers.clone
|
23
|
+
|
24
|
+
# Check for terminating case
|
25
|
+
if (!uris.nil? && uris.length > 0)
|
26
|
+
|
27
|
+
if !transformers_clone.nil? && transformers_clone.length > 0
|
28
|
+
|
29
|
+
# Pop a transformer and apply it recursively to the result of the next transformer
|
30
|
+
t = transformers_clone.shift
|
31
|
+
uris = apply_transformers(uris, transformers_clone)
|
32
|
+
|
33
|
+
uris = uris.map do |k,v|
|
34
|
+
t.transform(k)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
uris
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# Public: Get all known transformers
|
44
|
+
#
|
45
|
+
def self.get_all_transformers
|
46
|
+
return [Transformers::URIQueryStringTransformer.new]
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
# Public: URI Query String Transformer.
|
52
|
+
#
|
53
|
+
#
|
54
|
+
class URIQueryStringTransformer
|
55
|
+
include Logging
|
56
|
+
|
57
|
+
#
|
58
|
+
# Public: Strips the query string component from a URL.
|
59
|
+
#
|
60
|
+
# Returns the link without the query string component
|
61
|
+
#
|
62
|
+
def transform(link)
|
63
|
+
link = Filters::Util.make_URI(link)
|
64
|
+
return link unless (link != nil && link.query != nil)
|
65
|
+
link.query = nil
|
66
|
+
link
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
data/lib/sitemap/version.rb
CHANGED
data/sitemap-generator.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["mefellows"]
|
10
10
|
spec.email = ["matt.fellows@onegeek.com.au"]
|
11
11
|
spec.description = "Sitemap Generator"
|
12
|
-
spec.summary = "A basic, human readable sitemap generator"
|
12
|
+
spec.summary = "A basic, human readable (CSV, JSON) sitemap generator. "
|
13
13
|
spec.homepage = "https://github.com/mefellows/sitemap-generator"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
data/spec/filter_spec.rb
CHANGED
@@ -85,6 +85,10 @@ describe Filters::ResourcesFilter do
|
|
85
85
|
index['http://foo.com/foo'] = ""
|
86
86
|
index['http://foo.com/foo.pdf'] = ""
|
87
87
|
index['http://foo.com/bar'] = ""
|
88
|
+
index['http://foo.com/bar#'] = ""
|
89
|
+
index['http://foo.com/bar/#'] = ""
|
90
|
+
index['http://foo.com/bar/#foo'] = ""
|
91
|
+
index['http://foo.com/bar/#!/hashbang/foo'] = ""
|
88
92
|
index['http://foo.com/bar.tar.gz'] = ""
|
89
93
|
index['http://bar.com/foo'] = ""
|
90
94
|
index['http://www.mootools.net/'] = ""
|
@@ -118,7 +122,7 @@ describe Filters::ResourcesFilter do
|
|
118
122
|
puts i
|
119
123
|
|
120
124
|
# Need to prevent mutation in filtering
|
121
|
-
expect(filters.length).to eq
|
125
|
+
expect(filters.length).to eq 4
|
122
126
|
|
123
127
|
expect(i.length).to eq 0
|
124
128
|
|
data/spec/generator_spec.rb
CHANGED
@@ -8,19 +8,21 @@ describe SitemapGenerator do
|
|
8
8
|
|
9
9
|
it 'Should return an index from a single page' do
|
10
10
|
generator = SitemapGenerator.new
|
11
|
-
filters =
|
11
|
+
filters = Filters::Util.get_all_filters
|
12
|
+
transformers = Transformers::Util.get_all_transformers
|
12
13
|
|
13
14
|
# onegeek.com.au source as at 23/05/2014
|
14
15
|
|
15
16
|
# Note no trailing slash -> need to find why lack of trailing slash is an issue
|
16
17
|
link = URI::parse("http://www.onegeek.com.au")
|
17
|
-
index = generator.create_index(link, link, filters, nil, 1)
|
18
|
+
index = generator.create_index(link, link, filters, transformers, nil, 1)
|
18
19
|
|
19
|
-
expect(index.length).to be 18
|
20
20
|
puts "Here's the index:"
|
21
21
|
index.each do |key, value|
|
22
22
|
puts key
|
23
23
|
end
|
24
|
+
|
25
|
+
expect(index.length).to be 18
|
24
26
|
end
|
25
27
|
|
26
28
|
|
@@ -32,7 +34,8 @@ describe SitemapGenerator do
|
|
32
34
|
|
33
35
|
# should treat trailing slashes the same as without???
|
34
36
|
|
35
|
-
|
37
|
+
|
38
|
+
# Test for blacklisted URIs
|
36
39
|
|
37
40
|
|
38
41
|
# Should not index files (PDFs, images etc.)
|
@@ -41,8 +44,6 @@ describe SitemapGenerator do
|
|
41
44
|
|
42
45
|
end
|
43
46
|
|
44
|
-
|
45
|
-
|
46
47
|
# it 'Should return an index from an entire site' do
|
47
48
|
# generator = SitemapGenerator.new
|
48
49
|
|
@@ -55,14 +56,4 @@ describe SitemapGenerator do
|
|
55
56
|
# end
|
56
57
|
# end
|
57
58
|
|
58
|
-
# it 'Let me hack stuff' do
|
59
|
-
# generator = SitemapGenerator.new
|
60
|
-
|
61
|
-
# print generator.fetch('http://www.webcentral.com.au/order')
|
62
|
-
# doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards & Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> — <span class="title">Professional Web Developer</span></h1> <p>I\'m a 
University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url = 
nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks & Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> <h5>8 
ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"<").replace(/>/g,">");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
|
63
|
-
# # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
|
64
|
-
# expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
|
65
|
-
# end
|
66
|
-
|
67
|
-
|
68
59
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sitemap/commands/sitemap'
|
3
|
+
require 'sitemap/filters/transformers'
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
url = URI::parse('http://foo.com/foo/bar')
|
7
|
+
|
8
|
+
describe Transformers::Util do
|
9
|
+
it 'Should remove #fragments URIs' do
|
10
|
+
transformers = Transformers::Util.get_all_transformers
|
11
|
+
|
12
|
+
index = Hash.new
|
13
|
+
index['http://foo.com'] = ""
|
14
|
+
index['http://foo.com/#fragment'] = ""
|
15
|
+
index['http://foo.com/#!/hash-bang/fragment'] = ""
|
16
|
+
index['http://foo.com/#'] = ""
|
17
|
+
index['http://foo.com/foo'] = ""
|
18
|
+
index['http://foo.com/foo?'] = ""
|
19
|
+
index['http://foo.com/foo?foo'] = ""
|
20
|
+
index['http://foo.com/foo?foo=bar'] = ""
|
21
|
+
index['http://foo.com/foo.pdf'] = ""
|
22
|
+
index['http://foo.com/bar'] = ""
|
23
|
+
index['http://foo.com/bar.tar.gz'] = ""
|
24
|
+
index['http://bar.com/foo'] = ""
|
25
|
+
index['http://www.mootools.net/'] = ""
|
26
|
+
index['http://www.wordpress.org'] = ""
|
27
|
+
index['http://www.blueprintcss.com'] = ""
|
28
|
+
index['http://www.php.net'] = ""
|
29
|
+
index['/contact'] = ""
|
30
|
+
index['http://www.onegeek.com.au'] = ""
|
31
|
+
index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
|
32
|
+
index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
|
33
|
+
index['http://www.twitter.com/matthewfellows'] = ""
|
34
|
+
index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
|
35
|
+
index['http://www.flickr.com/photos/mattfellows'] = ""
|
36
|
+
index['http://www.delicious.com/mefellows'] = ""
|
37
|
+
index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""
|
38
|
+
|
39
|
+
i = Transformers::Util.apply_transformers(index, transformers)
|
40
|
+
|
41
|
+
expect(i.length).to eq 25
|
42
|
+
expect(i[4].to_s).to eq 'http://foo.com/foo'
|
43
|
+
expect(i[5].to_s).to eq 'http://foo.com/foo'
|
44
|
+
expect(i[6].to_s).to eq 'http://foo.com/foo'
|
45
|
+
expect(i[7].to_s).to eq 'http://foo.com/foo'
|
46
|
+
end
|
47
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap-generator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- mefellows
|
@@ -102,6 +102,7 @@ executables:
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
+
- .gitignore
|
105
106
|
- Gemfile
|
106
107
|
- Gemfile.lock
|
107
108
|
- README.md
|
@@ -111,12 +112,14 @@ files:
|
|
111
112
|
- lib/sitemap/command.rb
|
112
113
|
- lib/sitemap/commands/sitemap.rb
|
113
114
|
- lib/sitemap/filters/filters.rb
|
115
|
+
- lib/sitemap/filters/transformers.rb
|
114
116
|
- lib/sitemap/logging.rb
|
115
117
|
- lib/sitemap/version.rb
|
116
118
|
- sitemap-generator.gemspec
|
117
119
|
- spec/filter_spec.rb
|
118
120
|
- spec/generator_spec.rb
|
119
121
|
- spec/spec_helper.rb
|
122
|
+
- spec/transform_spec.rb
|
120
123
|
homepage: https://github.com/mefellows/sitemap-generator
|
121
124
|
licenses:
|
122
125
|
- MIT
|
@@ -140,8 +143,9 @@ rubyforge_project:
|
|
140
143
|
rubygems_version: 2.0.14
|
141
144
|
signing_key:
|
142
145
|
specification_version: 4
|
143
|
-
summary: A basic, human readable sitemap generator
|
146
|
+
summary: A basic, human readable (CSV, JSON) sitemap generator.
|
144
147
|
test_files:
|
145
148
|
- spec/filter_spec.rb
|
146
149
|
- spec/generator_spec.rb
|
147
150
|
- spec/spec_helper.rb
|
151
|
+
- spec/transform_spec.rb
|