sitemap-generator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +27 -0
- data/README.md +36 -0
- data/Rakefile +6 -0
- data/Vagrantfile +18 -0
- data/bin/sitemap +10 -0
- data/lib/sitemap/command.rb +61 -0
- data/lib/sitemap/commands/sitemap.rb +178 -0
- data/lib/sitemap/filters/filters.rb +203 -0
- data/lib/sitemap/logging.rb +25 -0
- data/lib/sitemap/version.rb +3 -0
- data/sitemap-generator.gemspec +27 -0
- data/spec/filter_spec.rb +142 -0
- data/spec/generator_spec.rb +68 -0
- data/spec/spec_helper.rb +1 -0
- metadata +147 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
|
4
|
+
data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
|
7
|
+
data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sitemap-analyser (0.0.1)
|
5
|
+
clamp
|
6
|
+
json
|
7
|
+
log4r
|
8
|
+
nokogiri
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: https://rubygems.org/
|
12
|
+
specs:
|
13
|
+
clamp (0.6.3)
|
14
|
+
json (1.8.1)
|
15
|
+
log4r (1.1.10)
|
16
|
+
mini_portile (0.6.0)
|
17
|
+
nokogiri (1.6.2.1)
|
18
|
+
mini_portile (= 0.6.0)
|
19
|
+
rake (10.3.2)
|
20
|
+
|
21
|
+
PLATFORMS
|
22
|
+
ruby
|
23
|
+
|
24
|
+
DEPENDENCIES
|
25
|
+
bundler (~> 1.3)
|
26
|
+
rake
|
27
|
+
sitemap-analyser!
|
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Sitemap Generator
|
2
|
+
A simple command-line Sitemap generator tool. Useful for quickly auditing a website.
|
3
|
+
|
4
|
+
## Getting started
|
5
|
+
|
6
|
+
git clone https://github.com/mefellows/sitemap-generator
|
7
|
+
cd sitemap-generator
|
8
|
+
|
9
|
+
### Generate a standard CSV Sitemap file
|
10
|
+
The following command generates a basic sitemap by recursively listing all links on the site. Only URIs from the specified domain name (in this case, onegeek.com.au) are included, and the result is saved to a file named sitemap.csv.
|
11
|
+
|
12
|
+
bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
|
13
|
+
|
14
|
+
### Generate a standard Sitemap JSON format
|
15
|
+
|
16
|
+
bin/sitemap generate --format=json http://www.onegeek.com.au/ sitemap.json
|
17
|
+
|
18
|
+
### Generate a Sitemap restricting to the URI provided
|
19
|
+
|
20
|
+
bin/sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
|
21
|
+
|
22
|
+
### Generate a Sitemap restricting indexed URLs to only those starting with '/journal'
|
23
|
+
|
24
|
+
bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
|
25
|
+
|
26
|
+
|
27
|
+
## Getting Help
|
28
|
+
|
29
|
+
bin/sitemap
|
30
|
+
bin/sitemap generate --help
|
31
|
+
|
32
|
+
## Alternatives?
|
33
|
+
|
34
|
+
Of course, only after spending an hour writing this did I remember that wget can do basically the same thing for you:
|
35
|
+
|
36
|
+
wget -r --delete-after <todo>
|
data/Rakefile
ADDED
data/Vagrantfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Vagrant environment with a single VirtualBox guest built from the
# "centos-64-x64-vbox4210" box.
Vagrant.configure("2") do |config|

  config.vm.define "centos-64-x64-vbox4210" do |v|
    v.vm.box = "centos-64-x64-vbox4210"
    v.vm.hostname = "centos"
    v.vm.box_url = "http://puppet-vagrant-boxes.puppetlabs.com/centos-64-x64-vbox4210.box"
    # Declare the port forward (guest 80 -> host 8081) on the machine-scoped
    # config object, like the other settings in this block, rather than on
    # the global config.
    v.vm.network "forwarded_port", guest: 80, host: 8081
  end

  #config.vm.synced_folder "vendor/melbourneitdev/libmit", "/mit"

  # Cap the guest at 256 MB of memory.
  config.vm.provider :virtualbox do |vb|
    vb.customize ["modifyvm", :id, "--memory", "256"]
  end

  # NOTE(review): this script is not in the package's file list - confirm it exists.
  config.vm.provision :shell, :path => "vagrant/shell/bootstrap.sh"

end
|
data/bin/sitemap
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'sitemap/version'
|
2
|
+
require 'sitemap/logging'
|
3
|
+
require 'sitemap/commands/sitemap'
|
4
|
+
require 'clamp'
|
5
|
+
|
6
|
+
module Sitemap
  #
  # Public: Base class for every CLI command. Mixes in logging and declares
  # the options shared by all (sub)commands.
  #
  class AbstractCommand < Clamp::Command
    include Logging

    option ["-v", "--verbose"], :flag, "be verbose"
    option "--version", :flag, "show version" do
      puts "Sitemap Analyser " + Sitemap::VERSION
      exit(0)
    end

  end

  #
  # Public: The 'generate' subcommand. Crawls a site starting at the given
  # URI and writes the resulting sitemap as CSV or JSON.
  #
  class SitemapCommand < AbstractCommand
    option "--no-recursion", :flag, "Prevents sitemap recursion", :default => false
    option "--format", "format", "Specify the output format. Options are [csv, json]", :attribute_name => :format, :default => 'csv'
    option "--depth", "depth", "Level of depth to recurse", :attribute_name => :depth, :default => -1 do |s|
      Integer(s)
    end

    # option "--restrict-path", "restrict-path", "Restrict links not on supplied path", :attribute_name => :restrict_path, :multivalued => true
    # --follow-redirects, "follow", "Ignore redirects?"
    # --include-resources, "include resources", "Follows links to static resources such as images, videos etc."

    # Validate the start URI up front. Raising ArgumentError (the same
    # mechanism Integer() relies on for --depth) lets Clamp report the
    # problem as a usage error with a non-zero exit status, instead of
    # printing a message and exiting with status 0.
    parameter "uri", "URI base to fetch URLs from", :attribute_name => :uri do |u|
      begin
        parsed_uri = URI::parse(u)
      rescue URI::InvalidURIError
        raise ArgumentError, "Invalid URI provided"
      end

      # The crawler fetches over HTTP(S), so a relative or non-HTTP URI
      # cannot be used as a starting point.
      unless parsed_uri.is_a?(URI::HTTP) && !parsed_uri.host.nil?
        raise ArgumentError, "Invalid URI provided"
      end

      parsed_uri
    end
    parameter "[output_file]", "Output file", :attribute_name => :output_file

    #
    # Public: Run the generator with the parsed command-line arguments.
    #
    # Signals a usage error when no output file is given for a non-JSON
    # format (JSON output is printed rather than written to a file).
    #
    def execute
      # signal_usage_error raises, so nothing after it would run.
      if !format.eql?('json') && output_file.nil?
        signal_usage_error "'output_file' parameter must be provided if format is not JSON."
      end

      real_depth = depth
      if no_recursion?
        log.debug("Recursion disabled, setting depth to 1")
        real_depth = 1
      end

      log.info('Running sitemap generator')
      generator = SitemapGenerator.new()
      generator.generate(uri, output_file, format, real_depth)
    end
  end

  #
  # Public: Top-level command that dispatches to the subcommands.
  #
  class MainCommand < AbstractCommand
    subcommand "generate", "Generate a sitemap", Sitemap::SitemapCommand
  end
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'sitemap/logging'
|
2
|
+
require 'sitemap/filters/filters'
|
3
|
+
require 'csv'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'net/http'
|
8
|
+
|
9
|
+
#
# Public: Crawls a website starting from a URI and builds an index of the
# pages found (URI => {'title' => page title}), which can be written out as
# CSV or JSON.
#
class SitemapGenerator
  include Logging

  def initialize()
    log.debug('Initialising generator')
  end

  #
  # Public: Output the index to JSON
  #
  # index - The Hash index to serialise; it is printed to standard output.
  #
  def write_index_to_json(index)
    puts JSON::generate(index)
  end

  #
  # Public: Write a Sitemap index to file
  #
  # index       - The Hash index (URI => {'title' => ...}).
  # output_file - Path of the CSV file to (over)write.
  #
  def write_index_to_file(index, output_file)
    # Block form so the file is flushed and closed even if a row fails.
    CSV.open(output_file, 'wb') do |csv|
      csv << ['URI', 'Title']

      # Flush Sitemap to CSV
      index.each do |key, value|
        csv << [key, value['title']]
      end
    end
  end

  #
  # Public: Create the index recursively.
  #
  # link       - The URI to build the index from recursively.
  # base_uri   - The base URI (Host) to restrict which links are indexed
  # filters    - The filter objects a link must pass in order to be indexed.
  # link_index - Any index to start the build from.
  # depth      - The depth of recursion. 1 for no recursion, -1 for infinite. > 1 for specific depth
  #
  # Returns an index containing URIs as keys and an object representing the page.
  #
  def create_index(link, base_uri, filters, link_index = nil, depth = -1)
    if link_index.nil?
      log.debug('Creating new Index')
      link_index = Hash.new
    end

    # Always hand back the index, so callers never receive nil.
    return link_index if link.nil? || base_uri.nil?

    ### TODO: replace with generic filter method

    # Only continue if the page is NOT yet in the index and is indexable.
    return link_index if Filters::Util.apply_filters([link], link_index, base_uri, filters).length == 0

    log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")

    doc = get_document(link)
    return link_index if doc.nil?

    log.debug("New document found at #{link}, exploring links")
    depth = depth - 1

    # Set page title and add to index
    link_index[link.to_s] = {'title' => doc.title}
    log.info("Adding link to index: #{link.to_s}")

    # Once the decremented depth reaches -1 the links on this page are not
    # followed. An initial depth of -1 becomes -2 here and therefore never
    # hits that stop value.
    return link_index if depth == -1

    # Find all links on the page. Each one is filtered individually at the
    # top of the recursive call, so no separate pre-filtering pass is needed.
    hrefs = doc.css('a').map { |anchor| anchor.attributes["href"].to_s }

    hrefs.each do |href|
      target = Filters::Util.remove_fragment_from_uri(href)
      next if target.nil? || target.empty?

      create_index(Filters::Util.create_absolute_uri(target, base_uri), base_uri, filters, link_index, depth)
    end

    link_index
  end

  #
  # Public: Fetch a document from the Internet.
  #
  # uri    - The URI (String or URI) to fetch.
  # domain - The URI whose host redirects must stay on (defaults to uri).
  # limit  - Maximum number of redirects left to follow.
  #
  # Returns the response body on success, the body of the redirect target
  # for same-host redirects, and nil for cross-host redirects or any other
  # response.
  # Raises ArgumentError once the redirect limit is exhausted.
  #
  def fetch(uri, domain = nil, limit = 10)
    uri = Filters::Util.make_URI(uri)
    domain = Filters::Util.make_URI(domain.nil? ? uri : domain)

    # You should choose a better exception.
    raise ArgumentError, 'too many HTTP redirects' if limit == 0

    response = Net::HTTP.get_response(uri)

    case response
    when Net::HTTPSuccess
      response.body
    when Net::HTTPRedirection
      location = Filters::Util.create_absolute_uri(response['location'], uri)
      log.warn("Redirecting #{uri} to new location: #{location}")

      # Check new location belongs to current domain
      if location.host == domain.host
        # Return the redirected document rather than discarding it.
        fetch(location, domain, limit - 1)
      else
        log.warn("Redirecting from #{uri} to #{location} rejected due to cross-domain restrictions")
        nil
      end
    else
      nil
    end
  end

  #
  # Public: Fetch a document
  #
  # uri - The URI of the page to download and parse.
  #
  # Returns the parsed Nokogiri::HTML::Document, or nil when fetching or
  # parsing raised an error (the error is logged).
  #
  def get_document(uri)
    log.debug("Fetching document at #{uri}")
    begin
      # NOTE(review): fetch returns nil for non-success responses and that
      # nil is handed to Nokogiri as-is - confirm whether such pages are
      # meant to end up in the index.
      response = fetch(uri.to_s)
      doc = Nokogiri::HTML(response)
      doc if doc.instance_of? Nokogiri::HTML::Document
    rescue StandardError => bang
      log.error("Error reading document #{uri}: #{bang.message}")
      nil
    end
  end

  #
  # Create the Sitemap
  #
  # uri         - The URI to start crawling from (also used as the base URI).
  # output_file - Path of the file to write when the format is 'csv'.
  # format      - 'csv' or 'json'.
  # depth       - Depth of recursion, passed through to create_index.
  #
  def generate(uri, output_file, format = 'csv', depth = -1)

    log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")

    # Reject an unknown format before crawling, not after the whole site
    # has already been fetched.
    unless ['csv', 'json'].include?(format)
      puts "Please specify a valid output format, you gave #{format} Options are ['csv', 'json']"
      exit(1)
    end

    # Setup filters. Ideally, have some outsider give me these
    # Really, these are just options to the index
    filters = Filters::Util.get_all_filters
    index = create_index(uri, uri, filters, nil, depth)

    if format == 'json'
      write_index_to_json(index)
    else
      write_index_to_file(index, output_file)
    end
  end

end
|
@@ -0,0 +1,203 @@
|
|
1
|
+
require 'sitemap/logging'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
# Public: Various index filtering operations and classes.
|
6
|
+
module Filters
|
7
|
+
|
8
|
+
#
# Public: Stateless URI helpers shared by the filters and the generator.
#
class Util
  #
  # Idempotently make a string a URI
  #
  # uri - A String or an already-parsed URI.
  #
  # Returns the URI (the very same object when one was passed in), or nil
  # when the value cannot be parsed.
  #
  def self.make_URI(uri)
    return uri if uri.is_a? URI

    URI::parse(uri)
  rescue StandardError
    nil
  end

  #
  # Public: Remove fragments from a URI
  #
  # uri - A String or URI.
  #
  # Returns the URI as a String without its fragment, or nil if it cannot
  # be parsed. The argument itself is never modified.
  #
  def self.remove_fragment_from_uri(uri)
    parsed_href = make_URI(uri)
    return nil if parsed_href.nil?

    # Work on a copy: make_URI returns the caller's own object when it is
    # already a URI, and clearing the fragment in place would alter it.
    stripped = parsed_href.dup
    stripped.fragment = nil
    stripped.to_s
  end

  #
  # Public: Create an absolute link provided a link and base URI.
  #
  # link     - The (possibly host-relative) link, String or URI.
  # base_uri - The URI supplying scheme and host, String or URI.
  #
  # Returns a URI: links whose path starts with '/' are resolved against
  # the root of base_uri, any other link is returned as parsed. Returns nil
  # when either argument cannot be parsed. Neither argument is modified.
  #
  def self.create_absolute_uri(link, base_uri)
    link = make_URI(link)
    base_uri = make_URI(base_uri)
    return nil if link.nil? || base_uri.nil?

    # Append Path to base_uri if relative
    if !link.path.nil? && link.path.start_with?('/')
      # Remove path from a copy of the base, so the caller's URI keeps its path.
      root = base_uri.dup
      root.path = ''
      return root + link
    end

    link
  end

  #
  # Public: Get all known filters
  #
  def self.get_all_filters
    return [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
  end

  # Public: Apply URI filters to a Hash.
  #
  # uris     - Set (Array|Hash) of URIs to be filtered.
  # index    - Current index
  # base_uri - Base URI to test against
  # filters  - Filters to reduce set of uris
  #
  # Returns the filtered uris (same kind of collection as given); uris is
  # returned untouched when it is nil/empty or when there are no filters.
  # The filters collection is not modified.
  def self.apply_filters(uris, index, base_uri, filters)
    return uris if uris.nil? || uris.length == 0
    return uris if filters.nil? || filters.length == 0

    # Last filter first, then back towards the first one.
    filters.reverse.inject(uris) do |remaining, f|
      remaining.select { |k, _v| f.filter(index, k, base_uri) }
    end
  end
end
|
91
|
+
|
92
|
+
#
|
93
|
+
# Public: Filters out non-local URIs
|
94
|
+
#
|
95
|
+
class LocalFilter
  include Logging

  #
  # Public: Determines if a link is on the local domain + path or not
  #
  # link  - The link to test (String or URI).
  # local - A URI on the local site (String or URI); only its host is compared.
  #
  # Returns true when the link has no host or the same host as local, uses
  # no scheme other than http/https, and has an empty path or one starting
  # with '/'. Returns false otherwise, including for unparseable input.
  #
  def is_link_local?(link, local)
    link = Filters::Util.make_URI(link)
    local = Filters::Util.make_URI(local)

    # Unparseable input can never be a local link; say so explicitly
    # instead of relying on a rescued NoMethodError.
    if link.nil? || local.nil?
      log.debug("Rejecting link as it (or the local URI) could not be parsed")
      return false
    end

    # Links such as mailto: or javascript: carry no host and no path and
    # would otherwise slip through the two checks below.
    if !link.scheme.nil? && !['http', 'https'].include?(link.scheme.downcase)
      log.debug("Rejecting link #{link} as its scheme (#{link.scheme}) is not http(s)")
      return false
    end

    # Remove Absolute URLs that don't refer to local domain
    if !link.host.nil? && !link.host.eql?(local.host)
      log.debug("Rejecting host #{link.host} as it doesn't match #{local.host}")
      return false
    end

    # Ensure path starts with a '/' (filters out junk URLs)
    if !link.path.nil? && !link.path.eql?('') && !link.path.start_with?('/')
      log.debug("Rejecting link #{link} as it's path (#{link.path}) doesn't start with '/'")
      return false
    end

    true
  rescue StandardError => bang
    log.debug("Exception looking for local links: " + bang.message)
    false
  end

  #
  # Public: Determines if a link should be indexed.
  #
  # Returns boolean true iff the link is local and not indexed.
  #
  def should_index_local_link?(link, index, base_uri)
    !index.has_key?(link.to_s) && is_link_local?(link, base_uri)
  end

  #
  # Public: Filter out resources that are not local.
  #
  # index    - The current index (URIs already indexed are rejected).
  # link     - The link to test.
  # base_uri - A URI on the local site.
  #
  # Returns true if the link should be indexed, false otherwise.
  #
  def filter(index, link, base_uri)
    should_index_local_link?(link, index, base_uri)
  end
end
|
146
|
+
|
147
|
+
# Public: URI Fragment filter.
|
148
|
+
#
|
149
|
+
#
|
150
|
+
class URIFragmentFilter
  include Logging

  #
  # Public: Filters out links that contain a URI fragment.
  #
  # index    - The current index (unused).
  # link     - The link to test (String or URI).
  # base_uri - The base URI (unused).
  #
  # Returns true if the link has no fragment (or cannot be parsed, which is
  # left for the other filters to judge), false if it carries a fragment.
  #
  def filter(index, link, base_uri)
    link = Filters::Util.make_URI(link)
    # Previously inverted: links WITH a fragment were kept and all others dropped.
    link.nil? || link.fragment.nil?
  end
end
|
164
|
+
|
165
|
+
# Public: Valid URI filter.
|
166
|
+
#
|
167
|
+
#
|
168
|
+
class ValidURIFilter
  include Logging

  #
  # Public: Rejects nil links and links whose string form ends in a
  # '.suffix' (a dot followed by letters, digits, '_', '-' or whitespace).
  #
  # index    - The current index (unused).
  # link     - The link to test; compared via its to_s form.
  # base_uri - The base URI (unused).
  #
  # NOTE(review): the pattern runs over the whole URI string, so a bare host
  # such as 'http://foo.com' also ends in a '.suffix' - confirm this is intended.
  #
  # Returns true if the link passes, false otherwise.
  #
  def filter(index, link, base_uri)
    return false if link.nil?

    link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/).nil?
  end
end
|
181
|
+
|
182
|
+
# Public: Static resource filter.
|
183
|
+
#
|
184
|
+
#
|
185
|
+
class ResourcesFilter
  include Logging

  #
  # Public: Rejects links whose path ends in a '.suffix' (e.g. '.pdf').
  #
  # index    - The current index (unused).
  # link     - The link to test (String or URI).
  # base_uri - The base URI (unused).
  #
  # Returns false when the path ends in a '.suffix'; true otherwise,
  # including when the link cannot be parsed or has no path.
  #
  def filter(index, link, base_uri)
    parsed = Filters::Util.make_URI(link)
    path = parsed.nil? ? nil : parsed.path

    # Nothing to inspect: let the link through.
    return true if path.nil? || path.to_s.empty?

    path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/).nil?
  end
end
|
202
|
+
|
203
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'log4r'
|
2
|
+
|
3
|
+
#
# Public: Mixin that gives the including class a `log` method backed by a
# Log4r logger named after that class. All loggers write to 'sitemap.log'.
#
module Logging

  # Returns the logger for the including object's class, looked up once and
  # then remembered on the instance.
  def log
    @log ||= Logging.logger_for(self.class.name)
  end

  # Use a hash class-ivar to cache a unique Logger per class:
  @loggers = {}

  class << self
    include Log4r

    # Returns the cached logger for classname, creating it on first use.
    def logger_for(classname)
      @loggers[classname] ||= configure_logger_for(classname)
    end

    # Builds a logger whose name is the class name lower-cased, with every
    # run of non-alphanumeric characters collapsed into a single '.'.
    def configure_logger_for(classname)
      logger = Log4r::Logger.new classname.to_s.gsub(/[^a-zA-Z0-9]/, '.').downcase.gsub(/\.+/, '.')
      logger.outputters << file_outputter
      logger
    end

    # One outputter shared by every logger, instead of creating a new
    # identically-named FileOutputter on the same file per class.
    def file_outputter
      @file_outputter ||= Log4r::FileOutputter.new('sitemaplog', :filename => 'sitemap.log')
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'sitemap/version'
|
5
|
+
|
6
|
+
# Gem metadata for sitemap-generator; the version comes from Sitemap::VERSION.
Gem::Specification.new do |spec|
  spec.name          = "sitemap-generator"
  spec.version       = Sitemap::VERSION
  spec.authors       = ["mefellows"]
  spec.email         = ["matt.fellows@onegeek.com.au"]
  spec.description   = "Sitemap Generator"
  spec.summary       = "A basic, human readable sitemap generator"
  spec.homepage      = "https://github.com/mefellows/sitemap-generator"
  spec.license       = "MIT"

  # Package whatever git tracks; executables and tests are derived from that list.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  # The files under spec/ require 'rspec', so declare it for development.
  spec.add_development_dependency "rspec"

  spec.add_runtime_dependency "clamp"
  spec.add_runtime_dependency "json"
  spec.add_runtime_dependency "log4r"
  spec.add_runtime_dependency "nokogiri"
end
|
data/spec/filter_spec.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sitemap/commands/sitemap'
|
3
|
+
require 'sitemap/filters/filters'
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
url = URI::parse('http://foo.com/foo/bar')
|
7
|
+
|
8
|
+
# Specs for LocalFilter: which links count as local to the site under test.
# Uses the expect syntax throughout instead of mixing it with `should`.
describe Filters::LocalFilter do

  it 'Should exclude non-local URIs' do
    filter = Filters::LocalFilter.new
    expect(filter.is_link_local?('http://www.foo.com/something', url)).to eq(false)
    expect(filter.is_link_local?('https://www.foo.com/something', url)).to eq(false)
    expect(filter.is_link_local?('http://www.somethingelse.com/something', url)).to eq(false)
    expect(filter.is_link_local?('https://www.somethingelse.com/something', url)).to eq(false)

    expect(filter.filter(Hash.new, 'https://www.somethingelse.com/something', 'https://foo.com/')).to eq(false)
  end

  it 'Should exclude javascript links' do
    filter = Filters::LocalFilter.new
    # This is a valid case for THIS method, may choose to exclude elsewhere in the program
    # expect(filter.is_link_local?('#thisisareallylonganchorname', url)).to eq(false)
    expect(filter.is_link_local?('alert(\'true\')', url)).to eq(false)

    expect(filter.filter(Hash.new, 'alert(\'true\')', 'https://foo.com/')).to eq(false)
  end

  it 'Should include relative URIs' do
    filter = Filters::LocalFilter.new
    expect(filter.is_link_local?('/something', url)).to eq(true)
    expect(filter.is_link_local?('/', url)).to eq(true)

    expect(filter.filter(Hash.new, '/something', url)).to eq(true)
  end

  it 'Should include absolute local URIs' do
    filter = Filters::LocalFilter.new
    local_links = [
      'http://foo.com',
      'http://foo.com/foo/bar',
      'http://foo.com/something',
      'https://foo.com/something'
    ]

    local_links.each do |local_link|
      expect(filter.is_link_local?(local_link, url)).to eq(true)

      # Both String and pre-parsed URI links must be accepted.
      expect(filter.filter(Hash.new, local_link, url)).to eq(true)
      expect(filter.filter(Hash.new, URI::parse(local_link), url)).to eq(true)
    end
  end
end
|
58
|
+
|
59
|
+
# Specs for ResourcesFilter plus the filter-chain helpers exercised with it.
# Uses the expect syntax throughout and prints nothing while running.
describe Filters::ResourcesFilter do
  it 'Should exclude static resources' do
    filter = Filters::ResourcesFilter.new
    expect(filter.filter(Hash.new, 'http://www.foo.com/something.pdf', 'http://www.foo.com/')).to eq(false)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something.txt', 'http://www.foo.com/')).to eq(false)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something./', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-/', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-/', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com', 'http://www.foo.com/')).to eq(true)
  end

  it 'Should not allow links to be indexed more than once' do
    filter = Filters::LocalFilter.new

    index = Hash.new
    index['http://www.webcentral.com.au'] = {"title" => "cheese"}
    expect(filter.should_index_local_link?(Filters::Util.create_absolute_uri('http://www.webcentral.com.au', 'http://www.webcentral.com.au'), index, 'http://www.webcentral.com.au')).to eq false
  end

  it 'Should return a filtered Hash' do
    filters = Filters::Util.get_all_filters
    # filters = [Filters::ResourcesFilter.new]

    index = Hash.new
    index['http://foo.com'] = ""
    index['http://foo.com/foo'] = ""
    index['http://foo.com/foo.pdf'] = ""
    index['http://foo.com/bar'] = ""
    index['http://foo.com/bar.tar.gz'] = ""
    index['http://bar.com/foo'] = ""
    index['http://www.mootools.net/'] = ""
    index['http://www.wordpress.org'] = ""
    index['http://www.blueprintcss.com'] = ""
    index['http://www.php.net'] = ""
    index['/contact'] = ""
    index['http://www.onegeek.com.au'] = ""
    index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
    index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
    index['http://www.twitter.com/matthewfellows'] = ""
    index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
    index['http://www.flickr.com/photos/mattfellows'] = ""
    index['http://www.delicious.com/mefellows'] = ""
    index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""

    i = Filters::Util.apply_filters(index, Hash.new, url, filters)

    expect(i.length).to eq 3
  end

  it 'Should return an empty filtered Hash' do
    filters = Filters::Util.get_all_filters

    index = Hash.new
    index['http://bar.com/foo'] = ""

    i = Filters::Util.apply_filters(index, Hash.new, url, filters)

    # Need to prevent mutation in filtering
    expect(filters.length).to eq 3

    expect(i.length).to eq 0
  end

  it 'Should return the a Hash containing the initial URI' do
    filters = Filters::Util.get_all_filters

    i = Filters::Util.apply_filters([url], Hash.new, url, filters)

    expect(i.length).to eq 1
  end
end
|
137
|
+
|
138
|
+
# Specs for the URI helper functions.
describe Filters::Util do
  it 'Should return an absolute URI' do
    # A root-relative link resolves against the host of the shared test URL.
    absolute = Filters::Util.create_absolute_uri('/', url)

    expect(absolute.to_s).to eq 'http://foo.com/'
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sitemap/commands/sitemap'
|
3
|
+
require 'sitemap/filters/filters'
|
4
|
+
require 'spec_helper'
|
5
|
+
|
6
|
+
describe SitemapGenerator do
|
7
|
+
url = URI::parse('http://foo.com/foo/bar')
|
8
|
+
|
9
|
+
it 'Should return an index from a single page' do
|
10
|
+
generator = SitemapGenerator.new
|
11
|
+
filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
|
12
|
+
|
13
|
+
# onegeek.com.au source as at 23/05/2014
|
14
|
+
|
15
|
+
# Note no trailing slash -> need to find why lack of trailing slash is an issue
|
16
|
+
link = URI::parse("http://www.onegeek.com.au")
|
17
|
+
index = generator.create_index(link, link, filters, nil, 1)
|
18
|
+
|
19
|
+
expect(index.length).to be 18
|
20
|
+
puts "Here's the index:"
|
21
|
+
index.each do |key, value|
|
22
|
+
puts key
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
# Should not index an XML document
|
28
|
+
|
29
|
+
|
30
|
+
# should follow redirects to the same domain
|
31
|
+
|
32
|
+
|
33
|
+
# should treat trailing slashes the same as without???
|
34
|
+
|
35
|
+
# Test for blacklisted objects
|
36
|
+
|
37
|
+
|
38
|
+
# Should not index files (PDFs, images etc.)
|
39
|
+
it 'Should not index static files (PDFs, images etc.)' do
|
40
|
+
# generator = SitemapGenerator.new
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
# it 'Should return an index from an entire site' do
|
47
|
+
# generator = SitemapGenerator.new
|
48
|
+
|
49
|
+
# # onegeek.com.au source as at 23/05/2014
|
50
|
+
# doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards & Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> — <span class="title">Professional Web Developer</span></h1> <p>I\'m a 
University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url = 
nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks & Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> <h5>8 
ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"<").replace(/>/g,">");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
|
51
|
+
# index = generator.create_index(URI::parse("http://www.onegeek.com.au"), URI::parse("http://www.onegeek.com.au"), [], nil, 1)
|
52
|
+
|
53
|
+
# index.each do |key, value|
|
54
|
+
# puts key
|
55
|
+
# end
|
56
|
+
# end
|
57
|
+
|
58
|
+
# it 'Let me hack stuff' do
|
59
|
+
# generator = SitemapGenerator.new
|
60
|
+
|
61
|
+
# print generator.fetch('http://www.webcentral.com.au/order')
|
62
|
+
# doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards & Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> — <span class="title">Professional Web Developer</span></h1> <p>I\'m a 
University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url = 
nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks & Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> <h5>8 
ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"<").replace(/>/g,">");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
|
63
|
+
# # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
|
64
|
+
# expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
|
65
|
+
# end
|
66
|
+
|
67
|
+
|
68
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'rspec'
|
metadata
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sitemap-generator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- mefellows
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: clamp
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: json
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: log4r
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: nokogiri
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Sitemap Generator
|
98
|
+
email:
|
99
|
+
- matt.fellows@onegeek.com.au
|
100
|
+
executables:
|
101
|
+
- sitemap
|
102
|
+
extensions: []
|
103
|
+
extra_rdoc_files: []
|
104
|
+
files:
|
105
|
+
- Gemfile
|
106
|
+
- Gemfile.lock
|
107
|
+
- README.md
|
108
|
+
- Rakefile
|
109
|
+
- Vagrantfile
|
110
|
+
- bin/sitemap
|
111
|
+
- lib/sitemap/command.rb
|
112
|
+
- lib/sitemap/commands/sitemap.rb
|
113
|
+
- lib/sitemap/filters/filters.rb
|
114
|
+
- lib/sitemap/logging.rb
|
115
|
+
- lib/sitemap/version.rb
|
116
|
+
- sitemap-generator.gemspec
|
117
|
+
- spec/filter_spec.rb
|
118
|
+
- spec/generator_spec.rb
|
119
|
+
- spec/spec_helper.rb
|
120
|
+
homepage: https://github.com/mefellows/sitemap-generator
|
121
|
+
licenses:
|
122
|
+
- MIT
|
123
|
+
metadata: {}
|
124
|
+
post_install_message:
|
125
|
+
rdoc_options: []
|
126
|
+
require_paths:
|
127
|
+
- lib
|
128
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
129
|
+
requirements:
|
130
|
+
- - '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
133
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - '>='
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '0'
|
138
|
+
requirements: []
|
139
|
+
rubyforge_project:
|
140
|
+
rubygems_version: 2.0.14
|
141
|
+
signing_key:
|
142
|
+
specification_version: 4
|
143
|
+
summary: A basic, human readable sitemap generator
|
144
|
+
test_files:
|
145
|
+
- spec/filter_spec.rb
|
146
|
+
- spec/generator_spec.rb
|
147
|
+
- spec/spec_helper.rb
|