sitemap-generator 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c32ff5b34a3ebe292414774325cdf8ab87ad3783
4
+ data.tar.gz: 0db3ed2033ba0cc0ca67b7b2d2eeb929aced25d3
5
+ SHA512:
6
+ metadata.gz: 6eac90d3d869c01ec173d577a4d5c94af5059b9f380574b8f3a1ea1f0f992a0764542eb69235e375b50d58c61587180ec876c779121bf5fe5272ee32cba3a5b8
7
+ data.tar.gz: f9863d7d3effdac0d0f6257128f267cc78c38411963d571e325fe2fd0f22d51eea0c5c93fa6e72adaf660e05de6c2f93f77346688277c9ebf16f596e76d94390
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in the sitemap-generator gemspec
4
+ gemspec
@@ -0,0 +1,27 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sitemap-analyser (0.0.1)
5
+ clamp
6
+ json
7
+ log4r
8
+ nokogiri
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ clamp (0.6.3)
14
+ json (1.8.1)
15
+ log4r (1.1.10)
16
+ mini_portile (0.6.0)
17
+ nokogiri (1.6.2.1)
18
+ mini_portile (= 0.6.0)
19
+ rake (10.3.2)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ bundler (~> 1.3)
26
+ rake
27
+ sitemap-analyser!
@@ -0,0 +1,36 @@
1
+ # Sitemap Generator
2
+ A simple command-line Sitemap generator tool. Useful for quickly auditing a website.
3
+
4
+ ## Getting started
5
+
6
+ git clone https://github.com/mefellows/sitemap-generator
7
+ cd sitemap-generator
8
+
9
+ ### Generate a standard CSV Sitemap file
10
+ The following command generates a basic sitemap by recursively following the links on the site. It includes only URIs from the specified domain name (in this case, onegeek.com.au) and saves the result to a file named sitemap.csv.
11
+
12
+ bin/sitemap generate http://www.onegeek.com.au/ sitemap.csv
13
+
14
+ ### Generate a standard Sitemap in JSON format
15
+
16
+ bin/sitemap generate --format=json http://www.onegeek.com.au/ sitemap.json
17
+
18
+ ### Generate a Sitemap restricting to the URI provided
19
+
20
+ bin/sitemap generate --no-recursion http://www.onegeek.com.au/ sitemap.csv
21
+
22
+ ### Generate a Sitemap restricting indexed URLs to only those starting with '/journal' (planned: `--restrict-path` is not yet enabled in this version)
23
+
24
+ bin/sitemap generate --restrict-path=/journal http://www.onegeek.com.au/ sitemap.csv
25
+
26
+
27
+ ## Getting Help
28
+
29
+ bin/sitemap
30
+ bin/sitemap generate --help
31
+
32
+ ## Alternatives?
33
+
34
+ Of course, only after spending an hour writing this did I remember that wget can do most of this for you:
35
+
36
+ wget -r --delete-after <todo>
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
# Register the :spec task through RSpec's Rake integration.
RSpec::Core::RakeTask.new(:spec)

# Make :spec the task that runs when `rake` is invoked without arguments.
task default: :spec
@@ -0,0 +1,18 @@
1
# Vagrant environment defining one VirtualBox machine built from the
# centos-64-x64-vbox4210 box and provisioned by a shell script.
Vagrant.configure('2') do |config|
  config.vm.define 'centos-64-x64-vbox4210' do |v|
    v.vm.box = 'centos-64-x64-vbox4210'
    v.vm.hostname = 'centos'
    v.vm.box_url = 'http://puppet-vagrant-boxes.puppetlabs.com/centos-64-x64-vbox4210.box'
    # Configure the port forward on the machine-scoped handle (v) like the
    # other settings in this block, rather than on the outer config object.
    v.vm.network 'forwarded_port', guest: 80, host: 8081
  end

  # config.vm.synced_folder "vendor/melbourneitdev/libmit", "/mit"

  # Cap the VM's memory at 256 MB.
  config.vm.provider :virtualbox do |vb|
    vb.customize ['modifyvm', :id, '--memory', '256']
  end

  config.vm.provision :shell, :path => 'vagrant/shell/bootstrap.sh'
end
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+
5
# Put the sibling lib/ directory on the load path, then load the CLI.
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH << lib_dir
require 'sitemap/command'

# Hand control to the top-level command.
# exit(Sitemap::MainCommand.run || 0)
Sitemap::MainCommand.run
@@ -0,0 +1,61 @@
1
+ require 'sitemap/version'
2
+ require 'sitemap/logging'
3
+ require 'sitemap/commands/sitemap'
4
+ require 'clamp'
5
+
6
# Command-line interface for the sitemap tool, built on Clamp.
module Sitemap
  # Base class for every command: mixes in logging and declares the
  # options shared by all commands.
  class AbstractCommand < Clamp::Command
    include Logging

    option ['-v', '--verbose'], :flag, 'be verbose'
    option '--version', :flag, 'show version' do
      puts "Sitemap Analyser " + Sitemap::VERSION
      exit(0)
    end
  end

  # The `generate` subcommand: crawls a URI and emits a sitemap.
  class SitemapCommand < AbstractCommand
    option '--no-recursion', :flag, 'Prevents sitemap recursion', :default => false
    option '--format', 'format', 'Specify the output format. Options are [csv, json]', :attribute_name => :format, :default => 'csv'
    option '--depth', 'depth', 'Level of depth to recurse', :attribute_name => :depth, :default => -1 do |s|
      Integer(s)
    end

    # option "--restrict-path", "restrict-path", "Restrict links not on supplied path", :attribute_name => :restrict_path, :multivalued => true
    # --follow-redirects, "follow", "Ignore redirects?"
    # --include-resources, "include resources", "Follows links to static resources such as images, videos etc."

    parameter 'uri', 'URI base to fetch URLs from', :attribute_name => :uri do |u|
      begin
        URI.parse(u)
      rescue StandardError
        puts 'Invalid URI provided'
        # An unusable URI is a failure, so report it with a non-zero status.
        exit(1)
      end
    end
    parameter '[output_file]', 'Output file', :attribute_name => :output_file

    # Validate the arguments, work out the crawl depth and run the generator.
    def execute
      # signal_usage_error raises a usage error, so no explicit exit is
      # needed (or reachable) after it.
      if !format.eql?('json') && output_file.nil?
        signal_usage_error "'output_file' parameter must be provided if format is not JSON."
      end

      real_depth = depth
      if no_recursion?
        log.debug('Recursion disabled, setting depth to 1')
        real_depth = 1
      end

      log.info('Running sitemap generator')
      generator = SitemapGenerator.new
      generator.generate(uri, output_file, format, real_depth)
    end
  end

  # Top-level command that dispatches to the subcommands.
  class MainCommand < AbstractCommand
    subcommand 'generate', 'Generate a sitemap', Sitemap::SitemapCommand
  end
end
@@ -0,0 +1,178 @@
1
+ require 'sitemap/logging'
2
+ require 'sitemap/filters/filters'
3
+ require 'csv'
4
+ require 'json'
5
+ require 'nokogiri'
6
+ require 'open-uri'
7
+ require 'net/http'
8
+
9
+ class SitemapGenerator
10
+ include Logging
11
+
12
# Public: Build a generator. Construction only writes a debug log entry.
def initialize
  log.debug('Initialising generator')
end
15
+
16
#
# Public: Print the index to standard output as a JSON document.
#
# index - The sitemap index to serialise.
#
def write_index_to_json(index)
  serialised = JSON.generate(index)
  puts serialised
end
22
+
23
#
# Public: Write a Sitemap index to a CSV file with a 'URI,Title' header.
#
# index       - Hash mapping each URI to a Hash whose 'title' entry is written
#               in the second column.
# output_file - Path of the CSV file to write (opened in 'wb' mode, so an
#               existing file is truncated).
#
# Returns the index that was written.
#
def write_index_to_file(index, output_file)
  # The block form closes the file when the block exits, which guarantees
  # the rows are flushed to disk and the handle is not leaked.
  CSV.open(output_file, 'wb') do |csv|
    csv << ['URI', 'Title']

    index.each do |uri, page|
      csv << [uri, page['title']]
    end
  end
end
36
+
37
#
# Public: Create the index recursively.
#
# link       - The URI to index and, depth permitting, to follow links from.
# base_uri   - The base URI (host) used to resolve and restrict links.
# filters    - Filter objects; a link is only fetched if it survives all of
#              them (see Filters::Util.apply_filters).
# link_index - An existing index to extend, or nil to start a new one.
# depth      - Remaining recursion budget. It is decremented once per fetched
#              page and links are followed unless the decremented value is
#              exactly -1. So 0 indexes only this page, 1 also indexes the
#              pages it links to, and a negative value never stops.
#
# Returns the index: a Hash of URI string => {'title' => page title}.
#
def create_index(link, base_uri, filters, link_index = nil, depth = -1)
  if link_index.nil?
    log.debug('Creating new Index')
    link_index = {}
  end

  # Nothing to crawl; hand back the index instead of nil so callers can
  # always iterate or serialise the result.
  return link_index if link.nil? || base_uri.nil?

  # Skip links the filters reject.
  return link_index if Filters::Util.apply_filters([link], link_index, base_uri, filters).empty?

  log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")

  doc = get_document(link)
  return link_index if doc.nil?

  log.debug("New document found at #{link}, exploring links")
  depth -= 1

  # Record the page before following its links so that pages linking back
  # here see it as already indexed.
  link_index[link.to_s] = { 'title' => doc.title }
  log.info("Adding link to index: #{link}")

  # Recursion budget exhausted: do not follow this page's links.
  return link_index if depth == -1

  # Every candidate is filtered at the top of the recursive call, so no
  # separate pre-filtering pass over the hrefs is needed here.
  doc.css('a').each do |anchor|
    href = Filters::Util.remove_fragment_from_uri(anchor.attributes['href'].to_s)
    next if href.nil? || href.empty?

    create_index(Filters::Util.create_absolute_uri(href, base_uri), base_uri, filters, link_index, depth)
  end

  link_index
end
102
+
103
#
# Public: Fetch a document from the Internet, following same-host redirects.
#
# uri    - The URI (String or URI) to fetch.
# domain - URI whose host redirects must stay on; defaults to uri.
# limit  - Maximum number of further fetches allowed in a redirect chain.
#
# Returns the response body for a successful response, the body of the
# redirect target for a same-host redirect, or nil for a cross-host
# redirect or any other response.
# Raises ArgumentError when the redirect limit is exhausted.
#
def fetch(uri, domain = nil, limit = 10)
  uri = Filters::Util.make_URI(uri)
  domain = Filters::Util.make_URI(domain.nil? ? uri : domain)

  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP redirects' if limit == 0

  response = Net::HTTP.get_response(uri)

  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPRedirection
    # Resolve against a copy so the URI being fetched is never modified.
    location = Filters::Util.create_absolute_uri(response['location'], uri.dup)
    log.warn("Redirecting #{uri} to new location: #{location}")

    # Only follow redirects that stay on the current host, and return
    # what the redirect target produced.
    if location.host == domain.host
      fetch(location, domain, limit - 1)
    else
      log.warn("Redirecting from #{uri} to #{location} rejected due to cross-domain restrictions")
      nil
    end
  end
end
137
+
138
#
# Public: Fetch a URI and parse the response as an HTML document.
#
# uri - The URI of the page to load.
#
# Returns the parsed Nokogiri::HTML::Document, or nil when the parse result
# is not exactly that class or when fetching/parsing raises a StandardError
# (the error is logged).
#
def get_document(uri)
  log.debug("Fetching document at #{uri}")
  begin
    body = fetch(uri.to_s)
    parsed = Nokogiri::HTML(body)
    parsed.instance_of?(Nokogiri::HTML::Document) ? parsed : nil
  rescue StandardError => bang
    log.error("Error reading document #{uri}: #{bang.message}")
    nil
  end
end
154
+
155
#
# Public: Create the Sitemap for a site and emit it in the requested format.
#
# uri         - The URI to crawl; also used as the base URI for the index.
# output_file - Path of the CSV file to write (used by the 'csv' format).
# format      - Output format, 'csv' or 'json'.
# depth       - Recursion depth passed to create_index.
#
# Prints a message and exits with status 1 when the format is not supported.
#
def generate(uri, output_file, format = 'csv', depth = -1)
  log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")

  # Reject an unsupported format before crawling, so a typo does not cost
  # a full crawl of the site.
  unless %w[csv json].include?(format)
    puts "Please specify a valid output format, you gave #{format} Options are ['csv', 'json']"
    exit(1)
  end

  # Setup filters. Ideally, have some outsider give me these
  # Really, these are just options to the index
  filters = Filters::Util.get_all_filters
  index = create_index(uri, uri, filters, nil, depth)

  if format == 'json'
    write_index_to_json(index)
  else
    write_index_to_file(index, output_file)
  end
end
177
+
178
+ end
@@ -0,0 +1,203 @@
1
+ require 'sitemap/logging'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+
5
+ # Public: Various index filtering operations and classes.
6
+ module Filters
7
+
8
# Public: Stateless URI helpers shared by the filters and the generator.
class Util
  #
  # Idempotently make a string a URI.
  #
  # uri - A String or an existing URI.
  #
  # Returns the URI (the same object when one was passed in), or nil when
  # the value cannot be parsed.
  #
  def self.make_URI(uri)
    uri.is_a?(URI) ? uri : URI.parse(uri)
  rescue StandardError
    nil
  end

  #
  # Public: Remove fragments from a URI.
  #
  # uri - A String or URI.
  #
  # Returns the URI as a String without its fragment, or nil when the value
  # cannot be parsed. The argument itself is never modified.
  #
  def self.remove_fragment_from_uri(uri)
    parsed_href = make_URI(uri)
    return nil if parsed_href.nil?

    # Work on a copy: make_URI returns the caller's own object for URIs.
    stripped = parsed_href.dup
    stripped.fragment = nil
    stripped.to_s
  end

  #
  # Public: Create an absolute link provided a link and base URI.
  #
  # link     - The link (String or URI) to resolve.
  # base_uri - The URI (String or URI) whose scheme and host are used.
  #
  # Returns a URI. Links whose path starts with '/' are resolved against
  # the root of base_uri; any other link is returned as parsed. Returns nil
  # when either argument cannot be parsed. base_uri is never modified.
  #
  def self.create_absolute_uri(link, base_uri)
    link = make_URI(link)
    base = make_URI(base_uri)
    return nil if link.nil? || base.nil?

    # Remove path from a copy of the base, leaving the caller's URI intact.
    root = base.dup
    root.path = ''

    # Append Path to the base if the link is root-relative
    return root + link if !link.path.nil? && link.path.start_with?('/')

    link
  end

  #
  # Public: Get all known filters.
  #
  # Returns a new Array holding one instance of each default filter.
  #
  def self.get_all_filters
    [Filters::ValidURIFilter.new, Filters::LocalFilter.new, Filters::ResourcesFilter.new]
  end

  # Public: Apply URI filters to a set of URIs.
  #
  # uris     - Set (Array|Hash) of URIs to be filtered.
  # index    - Current index
  # base_uri - Base URI to test against
  # filters  - Filters to reduce set of uris; the Array is not modified.
  #
  # Returns the uris (same collection type) that every filter accepted.
  def self.apply_filters(uris, index, base_uri, filters)
    return uris if uris.nil? || filters.nil?

    # Filters run last-to-first, and stop early once nothing is left.
    filters.reverse_each do |f|
      break if uris.empty?

      uris = uris.select { |k, _v| f.filter(index, k, base_uri) }
    end

    uris
  end
end
91
+
92
#
# Public: Filter that keeps only links on the local site that are not yet
# in the index.
#
class LocalFilter
  include Logging

  #
  # Public: Determines if a link is on the local domain or not.
  #
  # link  - The link (String or URI) to test.
  # local - The URI (String or URI) of the local site.
  #
  # Returns false when the link names a different host, when its non-empty
  # path does not start with '/', or when inspecting it raises; else true.
  #
  def is_link_local?(link, local)
    begin
      candidate = Filters::Util.make_URI(link)
      site = Filters::Util.make_URI(local)

      # Absolute URLs must refer to the local host
      foreign_host = !candidate.host.nil? && !candidate.host.eql?(site.host)
      if foreign_host
        log.debug("Rejecting host #{candidate.host} as it doesn't match #{site.host}")
        return false
      end

      # A non-empty path must be rooted at '/' (filters out junk URLs)
      path = candidate.path
      unless path.nil? || path.eql?('') || path.start_with?('/')
        log.debug("Rejecting link #{candidate} as it's path (#{candidate.path}) doesn't start with '/'")
        return false
      end
    rescue StandardError => bang
      log.debug("Exception looking for local links: " + bang.message)
      return false
    end

    true
  end

  #
  # Public: Determines if a link should be indexed.
  #
  # Returns boolean true iff the link is local and not indexed.
  #
  def should_index_local_link?(link, index, base_uri)
    return false if index.has_key?(link.to_s)

    is_link_local?(link, base_uri)
  end

  #
  # Public: Filter out resources that are not local.
  #
  # Returns true if the link should be indexed, else false.
  #
  def filter(index, link, base_uri)
    should_index_local_link?(link, index, base_uri) ? true : false
  end
end
146
+
147
# Public: URI Fragment filter.
#
# Keeps only links that do not carry a URI fragment (the '#...' part).
#
class URIFragmentFilter
  include Logging

  #
  # Public: Filters out links that contain a URI fragment.
  #
  # index    - Current index (unused by this filter).
  # link     - The link (String or URI) to test.
  # base_uri - Base URI (unused by this filter).
  #
  # Returns true if the link parses and has no fragment, else false.
  #
  def filter(index, link, base_uri)
    link = Filters::Util.make_URI(link)
    # Previously the test was inverted: fragment-free links were rejected
    # and links with a fragment (or unparseable links) were kept.
    return false if link.nil?

    link.fragment.nil?
  end
end
164
+
165
# Public: Valid URI filter.
#
# Rejects nil links and links whose string form ends in a dot followed
# only by letters, digits, '_', '-' or whitespace (an extension-like tail).
#
class ValidURIFilter
  include Logging

  #
  # Public: Filters out invalid URIs.
  #
  # Returns true if the link should be indexed, else false.
  #
  def filter(index, link, base_uri)
    return false if link.nil?

    extension_like = link.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
    extension_like ? false : true
  end
end
181
+
182
# Public: Static resource filter.
#
# Rejects links whose path ends in an extension-like suffix such as '.pdf'.
#
class ResourcesFilter
  include Logging

  #
  # Public: Filters out static resources.
  #
  # Returns true if the link should be indexed, else false. Links that
  # cannot be parsed, or that have no path, are accepted.
  #
  def filter(index, link, base_uri)
    parsed = Filters::Util.make_URI(link)
    no_path = parsed.nil? || parsed.path.nil? || parsed.path.to_s.empty?
    return true if no_path

    resource_like = parsed.path.to_s.match(/.*\.[a-zA-Z0-9_\-\s]+(?!\/)$/)
    resource_like ? false : true
  end
end
202
+
203
+ end
@@ -0,0 +1,25 @@
1
+ require 'log4r'
2
+
3
# Mixin giving including classes a `log` method backed by a Log4r logger
# that is created once per class name and cached on the module.
module Logging
  # Returns this object's logger, looking it up by class name on first use.
  def log
    @log ||= Logging.logger_for(self.class.name)
  end

  # Cache of loggers keyed by class name (a module-level instance variable).
  @loggers = {}

  class << self
    include Log4r

    # Returns the cached logger for classname, configuring it on first use.
    def logger_for(classname)
      @loggers[classname] ||= configure_logger_for(classname)
    end

    # Builds a logger named after classname (non-alphanumerics become dots,
    # lower-cased, repeated dots collapsed) that writes to 'sitemap.log'.
    def configure_logger_for(classname)
      dotted_name = classname.to_s.gsub(/[^a-zA-Z0-9]/, '.').downcase.gsub(/\.+/, '.')
      Logger.new(dotted_name).tap do |logger|
        logger.outputters << Log4r::FileOutputter.new('sitemaplog', :filename => 'sitemap.log')
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Sitemap
  # Version string of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'sitemap/version'
5
+
6
# Gem metadata, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name        = 'sitemap-generator'
  spec.version     = Sitemap::VERSION
  spec.authors     = ['mefellows']
  spec.email       = ['matt.fellows@onegeek.com.au']
  spec.description = 'Sitemap Generator'
  spec.summary     = 'A basic, human readable sitemap generator'
  spec.homepage    = 'https://github.com/mefellows/sitemap-generator'
  spec.license     = 'MIT'

  # Package every file tracked by git; executables and tests are picked
  # out of that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'

  %w[clamp json log4r nokogiri].each do |runtime_gem|
    spec.add_runtime_dependency runtime_gem
  end
end
@@ -0,0 +1,142 @@
1
+ require 'rspec'
2
+ require 'sitemap/commands/sitemap'
3
+ require 'sitemap/filters/filters'
4
+ require 'spec_helper'
5
+
6
+ url = URI::parse('http://foo.com/foo/bar')
7
+
8
# Specs for Filters::LocalFilter, written with the `expect` syntax
# throughout instead of mixing it with the older `should` syntax.
describe Filters::LocalFilter do
  let(:filter) { Filters::LocalFilter.new }

  it 'Should exclude non-local URIs' do
    expect(filter.is_link_local?('http://www.foo.com/something', url)).to eq(false)
    expect(filter.is_link_local?('https://www.foo.com/something', url)).to eq(false)
    expect(filter.is_link_local?('http://www.somethingelse.com/something', url)).to eq(false)
    expect(filter.is_link_local?('https://www.somethingelse.com/something', url)).to eq(false)

    expect(filter.filter(Hash.new, 'https://www.somethingelse.com/something', 'https://foo.com/')).to eq(false)
  end

  it 'Should exclude javascript links' do
    # This is a valid case for THIS method, may choose to exclude elsewhere in the program
    # expect(filter.is_link_local?('#thisisareallylonganchorname', url)).to eq(false)
    expect(filter.is_link_local?('alert(\'true\')', url)).to eq(false)

    expect(filter.filter(Hash.new, 'alert(\'true\')', 'https://foo.com/')).to eq(false)
  end

  it 'Should include relative URIs' do
    expect(filter.is_link_local?('/something', url)).to eq(true)
    expect(filter.is_link_local?('/', url)).to eq(true)

    expect(filter.filter(Hash.new, '/something', url)).to eq(true)
  end

  it 'Should include absolute local URIs' do
    local_links = [
      'http://foo.com',
      'http://foo.com/foo/bar',
      'http://foo.com/something',
      'https://foo.com/something'
    ]

    local_links.each do |local_link|
      expect(filter.is_link_local?(local_link, url)).to eq(true)
    end

    # The filter must accept both String and URI links.
    local_links.each do |local_link|
      expect(filter.filter(Hash.new, local_link, url)).to eq(true)
    end

    local_links.each do |local_link|
      expect(filter.filter(Hash.new, URI::parse(local_link), url)).to eq(true)
    end
  end
end
58
+
59
# Specs for Filters::ResourcesFilter.
describe Filters::ResourcesFilter do
  it 'Should exclude static resources' do
    filter = Filters::ResourcesFilter.new
    expect(filter.filter(Hash.new, 'http://www.foo.com/something.pdf', 'http://www.foo.com/')).to eq(false)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something.txt', 'http://www.foo.com/')).to eq(false)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something./', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-/', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com/something-bar/-cake-/', 'http://www.foo.com/')).to eq(true)
    expect(filter.filter(Hash.new, 'http://www.foo.com', 'http://www.foo.com/')).to eq(true)
  end
end

# This example exercises LocalFilter, so it is grouped under that class.
describe Filters::LocalFilter do
  it 'Should not allow links to be indexed more than once' do
    filter = Filters::LocalFilter.new

    index = Hash.new
    index['http://www.webcentral.com.au'] = {"title" => "cheese"}
    expect(filter.should_index_local_link?(Filters::Util.create_absolute_uri('http://www.webcentral.com.au', 'http://www.webcentral.com.au'), index, 'http://www.webcentral.com.au')).to eq false
  end
end

# These examples exercise Filters::Util.apply_filters with the default
# filter set, so they are grouped under Util.
describe Filters::Util do
  it 'Should return a filtered Hash' do
    filters = Filters::Util.get_all_filters
    # filters = [Filters::ResourcesFilter.new]

    index = Hash.new
    index['http://foo.com'] = ""
    index['http://foo.com/foo'] = ""
    index['http://foo.com/foo.pdf'] = ""
    index['http://foo.com/bar'] = ""
    index['http://foo.com/bar.tar.gz'] = ""
    index['http://bar.com/foo'] = ""
    index['http://www.mootools.net/'] = ""
    index['http://www.wordpress.org'] = ""
    index['http://www.blueprintcss.com'] = ""
    index['http://www.php.net'] = ""
    index['/contact'] = ""
    index['http://www.onegeek.com.au'] = ""
    index['http://h2vx.com/vcf/http://development.onegeek.com.au/contact/'] = ""
    index['http://www.cloudflare.com/email-protection#d4b9b5a0a0fab2b1b8b8bba3a794bbbab1b3b1b1bffab7bbb9fab5a1'] = ""
    index['http://www.twitter.com/matthewfellows'] = ""
    index['http://au.linkedin.com/pub/matt-fellows/4/153/656'] = ""
    index['http://www.flickr.com/photos/mattfellows'] = ""
    index['http://www.delicious.com/mefellows'] = ""
    index['/_assets/faqs/pdf/managed-exchange/Exchange - Recovering Deleted Items.pdf'] = ""

    i = Filters::Util.apply_filters(index, Hash.new, url, filters)

    expect(i.length).to eq 3
  end

  it 'Should return an empty filtered Hash' do
    filters = Filters::Util.get_all_filters

    index = Hash.new
    index['http://bar.com/foo'] = ""

    i = Filters::Util.apply_filters(index, Hash.new, url, filters)

    # Need to prevent mutation in filtering
    expect(filters.length).to eq 3

    expect(i.length).to eq 0
  end

  it 'Should return the a Hash containing the initial URI' do
    filters = Filters::Util.get_all_filters

    i = Filters::Util.apply_filters([url], Hash.new, url, filters)

    expect(i.length).to eq 1
  end
end
137
+
138
# Specs for the URI helpers in Filters::Util.
describe Filters::Util do
  it 'Should return an absolute URI' do
    absolute = Filters::Util.create_absolute_uri('/', url)
    expect(absolute.to_s).to eq 'http://foo.com/'
  end
end
@@ -0,0 +1,68 @@
1
+ require 'rspec'
2
+ require 'sitemap/commands/sitemap'
3
+ require 'sitemap/filters/filters'
4
+ require 'spec_helper'
5
+
6
+ describe SitemapGenerator do
7
+ url = URI::parse('http://foo.com/foo/bar')
8
+
9
+ it 'Should return an index from a single page' do
10
+ generator = SitemapGenerator.new
11
+ filters = [Filters::LocalFilter.new, Filters::ResourcesFilter.new]
12
+
13
+ # onegeek.com.au source as at 23/05/2014
14
+
15
+ # Note no trailing slash -> need to find why lack of trailing slash is an issue
16
+ link = URI::parse("http://www.onegeek.com.au")
17
+ index = generator.create_index(link, link, filters, nil, 1)
18
+
19
+ expect(index.length).to be 18
20
+ puts "Here's the index:"
21
+ index.each do |key, value|
22
+ puts key
23
+ end
24
+ end
25
+
26
+
27
+ # Should not index an XML document
28
+
29
+
30
+ # should follow redirects to the same domain
31
+
32
+
33
+ # should treat trailing slashes the same as without???
34
+
35
+ # Test for blacklisted objects
36
+
37
+
38
+ # Should not index files (PDFs, images etc.)
39
+ it 'Should not index static files (PDFs, images etc.)' do
40
+ # generator = SitemapGenerator.new
41
+
42
+ end
43
+
44
+
45
+
46
+ # it 'Should return an index from an entire site' do
47
+ # generator = SitemapGenerator.new
48
+
49
+ # # onegeek.com.au source as at 23/05/2014
50
+ # doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
51
+ # index = generator.create_index(URI::parse("http://www.onegeek.com.au"), URI::parse("http://www.onegeek.com.au"), [], nil, 1)
52
+
53
+ # index.each do |key, value|
54
+ # puts key
55
+ # end
56
+ # end
57
+
58
+ # it 'Let me hack stuff' do
59
+ # generator = SitemapGenerator.new
60
+
61
+ # print generator.fetch('http://www.webcentral.com.au/order')
62
+ # doc = Nokogiri::HTML('<!DOCTYPE HTML> <html lang="en"> <head> <meta charset="utf-8"> <!--[if lte IE 8]> <script src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/ie-html5.js" type="text/javascript"></script> <![endif]--> <!--[if lt IE 9]> <script src="http://css3-mediaqueries-js.googlecode.com/svn/trunk/css3-mediaqueries.js"></script> <![endif]--> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Usability, Web Standards &amp; Design | Matthew Fellows</title> <script type="text/javascript"> //<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok9v=e9cb4febb4/"},atok:"8407449c08a29cd8a6c8a3bd5f55d64f",petok:"c98ca1db99b9a96d907e5221878e535c8620bc66-1400845495-1800",zone:"onegeek.com.au",rocket:"a",apps:{}}];document.write( <script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok9v=97fb4d042e/cloudflare.min.js"><\'+\'\/script>\');}}catch(e){}; //]]> </script> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/blueprint.css" type="text/css" media="screen"/> <link rel="stylesheet" href="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/css/style.css" type="text/css" media="screen"/> <link rel="pingback" href="http://www.onegeek.com.au/xmlrpc.php"/> <link rel="alternate" type="application/rss+xml" title="OneGeek | Usability, Web Standards and Design (RSS 2.0)" href="/feed/"/> <link rel="alternate" type="text/xml" title="OneGeek | Usability, Web Standards and Design (RSS .92)" href="/feed/rss/"/> <link rel="alternate" type="application/atom+xml" title="OneGeek | Usability, Web Standards and Design (ATOM .30)" href="/feed/atom/"/> <link rel="profile" href="http://microformats.org/profile/hcard"/> <link rel="profile" href="http://gmpg.org/xfn/11"/> <link rel="stylesheet" 
id="wpt-twitter-feed-css" href="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-to-twitter/css/twitter-feed.css?ver=3.5.2" type="text/css" media="all"/> <link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://www.onegeek.com.au/xmlrpc.php?rsd"/> <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://d2pk187t5c7952.cloudfront.net/wp-includes/wlwmanifest.xml"/> <meta name="keywords" content="Matthew Fellows, OneGeek, usability, web standards, articles, HCI, programming, javascript, php, java"/> <link rel="canonical" href="http://www.onegeek.com.au/"/> <!--[if IE]><script src="http://d2pk187t5c7952.cloudfront.net/wp-content/plugins/wp-gbcf/wp-gbcf_focus.js" type="text/javascript"></script><![endif]--><meta id="syntaxhighlighteranchor" name="syntaxhighlighter-version" content="3.1.1"/> </head> <body class="home blog"> <div class="container"> <div id="page" class="span-24"> <header id="header"> <div id="logo" class="span-11"> <a href="/" class="url"><img class="logo" src="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/images/logo.gif" alt="Matthew Fellows - OneGeek" title="Matthew Fellows - OneGeek "/></a> <div id="logo_text"> <h4 class="fn nickname">OneGeek</h4> <p>Web, Standards <em>&amp;</em> Usability</p> </div> </div> <nav id="menu" class="span-12 prepend-1 last"> <ul> <li class="current"> <a href="#">Menu</a> </li> <li><a href="/category/articles/">Articles</a> <ul> <li>Ramblings on the Web and stuff</li> </ul> </li> <li><a href="/projects/">Projects</a> <ul> <li>My attempt to give back to the community</li> </ul> </li> <li><a href="/journal/">Journal</a> <ul> <li>Personal blog posts</li> </ul> </li> <li class="last-child"><a href="/about/">About</a> <ul> <li>Who is this guy anyway?</li> </ul> </li> </ul> </nav> </header> <div id="content" class="narrowcolumn"> <div class="blurb vcard"> <h1><span class="fn name">Matthew Fellows</span> &mdash; <span class="title">Professional Web Developer</span></h1> 
<p>I\'m a University trained <span class="degree">Cognitive \ Computer Scientist</span> living in <span class="adr"><span class="locality">Melbourne</span></span> who enjoys building products on the <strong>web</strong> with a focus on <strong>usability</strong>, <strong>web standards</strong> and <strong>business outcomes</strong>.</p> <p class="read-more">See what makes me <a href="/about">tick</a> and what I\'m currently <a href="/about#where">doing</a> at <span class="org"><span class="organization-name">Melbourne IT</span></span>, tweet my <a href="/category/articles/">articles</a> or use my open-source <a href="/category/projects">software</a>.</p> </div> <div class="span-8" id="projects"> <div class="sidebar_item"><h4>Recent Posts</h4><ul> <li> <a href="http://www.onegeek.com.au/blog/570" title="Taking back content">Taking back content</a> </li> <li> <a href="http://www.onegeek.com.au/journal/scrum-my-life" title="Scrum my life">Scrum my life</a> </li> <li> <a href="http://www.onegeek.com.au/articles/2014-thoughtworks-tech-radar" title="2014 Thoughtworks Tech Radar">2014 Thoughtworks Tech Radar</a> </li> <li> <a href="http://www.onegeek.com.au/articles/development-articles/load-time-weaving-in-fuse-esb-equinox-aspect" title="Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects">Load Time Weaving in Fuse ESB (Apache ServiceMix) with Equinox Aspects</a> </li> <li> <a href="http://www.onegeek.com.au/rest-api/polymorphic-payloads-in-restful-api-using-apache-cxfjax-rs" title="Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS">Polymorphic Payloads in RESTful API using Apache CXF/JAX-RS</a> </li> <script type="text/rocketscript"> // <![CDATA[var disqus_shortname = \'onegeek\'; (function () {var nodes = document.getElementsByTagName(\'span\'); for (var i = 0, url; i < nodes.length; i++) {if (nodes[i].className.indexOf(\'dsq-postid\') != -1) {nodes[i].parentNode.setAttribute(\'data-disqus-identifier\', nodes[i].getAttribute(\'rel\')); url 
= nodes[i].parentNode.href.split(\'#\', 1); if (url.length == 1) { url = url[0]; } else { url = url[1]; } nodes[i].parentNode.href = url + \'#disqus_thread\'; } } var s = document.createElement(\'script\'); s.async = true; s.type = \'text/javascript\'; s.src = \'//\' + \'disqus.com/forums/\' + disqus_shortname + \'/count.js\'; (document.getElementsByTagName(\'HEAD\')[0] || document.getElementsByTagName(\'BODY\')[0]).appendChild(s); }()); //]]> </script> </ul> </div> </div> <div class="span-8" id="latest"> <div class="sidebar_item"><h2><a href="http://twitter.com/matthewfellows">In a Twitter</a></h2><p>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</p></div> </div> <div class="span-8 last search"> <h2>Search OneGeek</h2> <form method="get" id="searchform" action="/"> <div><label class="screen-reader-text hidden" for="s">Search</label> <input type="text" value="" name="s" id="s"/> <input type="submit" id="searchsubmit" value="Search"/></div> </form> </div> </div> </div> </div> <footer id="footer"> <div class="container"> <div class="span-8"> <h4>Downloads</h4> <h5>Contributions to the Community</h5> <h5>GSuite products</h5> <ul> <li><a href="/javascript-form-validation">GValidator</a></li> <li><a href="/javascript-serializer">GSerializer</a></li> <li class="new"><a href="/javascript-form-state-recovery">GRememberMe</a></li> </ul> </div> <div class="span-8"> <h4>Under the Hood</h4> <h5>The house that Standards built</h5> <h5>Standards</h5> <ul> <li><a href="http://www.w3c.org">HTML 5</a></li> <li><a href="http://www.w3c.org">CSS 3.0</a></li> <li><a href="http://www.microformats.org">Microformats</a></li> </ul> <h5>Frameworks &amp; Platforms</h5> <ul> <li><a href="http://www.mootools.net/">Mootools</a></li> <li><a href="http://www.wordpress.org">Wordpress</a></li> <li><a href="http://www.blueprintcss.com">Blueprint CSS</a></li> <li><a href="http://www.php.net">PHP</a></li> </ul> </div> <div class="span-8 last"> <h4>Get in touch</h4> 
<h5>8 ways to stalk me</h5> <ul> <li class="gicon web"><a href="/contact">Contact</a> me on <a class="url fn org" href="http://www.onegeek.com.au">OneGeek</a></li> <li class="gicon iconemail">Get my <a href="http://h2vx.com/vcf/http://development.onegeek.com.au/contact/">vcard</a> or <a class="email" href="http://www.cloudflare.com/email-protection#f69b978282d890939a9a998185b69998939193939dd895999bd89783">Email</a> me</li> <li class="gicon twitter"><a class="fn url" href="http://www.twitter.com/matthewfellows">Follow</a> me on Twitter</li> <li class="gicon linkedin">View my LinkedIn <a class="fn url" href="http://au.linkedin.com/pub/matt-fellows/4/153/656">profile</a></li> <li class="gicon flickr"><a class="url" href="http://www.flickr.com/photos/mattfellows">Spy</a> on me at Flickr</li> <li class="gicon delicious">Steal my Delicous <a class="url" href="http://www.delicious.com/mefellows">links</a></li> <li class="gicon rss">Subscribe to the OneGeek <a href="/feed">RSS Feed</a></li> </ul> </div> </div> </footer> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/mootools-1.3.js"></script> <script type="text/rocketscript" data-rocketsrc="http://d2pk187t5c7952.cloudfront.net/wp-content/themes/onegeek/js/home.js"></script> <script type="text/rocketscript"> var _gaq = _gaq || []; _gaq.push([\'_setAccount\', \'UA-1274481-1\']); _gaq.push([\'_trackPageview\']); (function() {var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true; ga.src = (\'https:\' == document.location.protocol ? 
\'https://ssl\' : \'http://www\') + \'.google-analytics.com/ga.js\'; var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <script type="text/javascript"> /* <![CDATA[ */ (function(){try{var s,a,i,j,r,c,l=document.getElementsByTagName("a"),t=document.createElement("textarea");for(i=0;l.length-i;i++){try{a=l[i].getAttribute("href");if(a&&"www.cloudflare.com/email-protection"==a.substr(7 ,35)){s=\'\';j=43;r=parseInt(a.substr(j,2),16);for(j+=2;a.length-j&&a.substr(j,1)!=\'X\';j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}j+=1;s+=a.substr(j,a.length-j);t.innerHTML=s.replace(/</g,"&lt;").replace(/>/g,"&gt;");l[i].setAttribute("href","mailto:"+t.value);}}catch(e){}}}catch(e){}})(); /* ]]> */ </script> </body> </html>')
63
+ # # doc = Nokogiri::XML(open('http://www.onegeek.com.au/feed'))
64
+ # expect(doc.instance_of? Nokogiri::HTML::Document).to eq true
65
+ # end
66
+
67
+
68
+ end
@@ -0,0 +1 @@
1
+ require 'rspec'
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemap-generator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - mefellows
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: clamp
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: log4r
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: nokogiri
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Sitemap Generator
98
+ email:
99
+ - matt.fellows@onegeek.com.au
100
+ executables:
101
+ - sitemap
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - Gemfile
106
+ - Gemfile.lock
107
+ - README.md
108
+ - Rakefile
109
+ - Vagrantfile
110
+ - bin/sitemap
111
+ - lib/sitemap/command.rb
112
+ - lib/sitemap/commands/sitemap.rb
113
+ - lib/sitemap/filters/filters.rb
114
+ - lib/sitemap/logging.rb
115
+ - lib/sitemap/version.rb
116
+ - sitemap-generator.gemspec
117
+ - spec/filter_spec.rb
118
+ - spec/generator_spec.rb
119
+ - spec/spec_helper.rb
120
+ homepage: https://github.com/mefellows/sitemap-generator
121
+ licenses:
122
+ - MIT
123
+ metadata: {}
124
+ post_install_message:
125
+ rdoc_options: []
126
+ require_paths:
127
+ - lib
128
+ required_ruby_version: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
133
+ required_rubygems_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - '>='
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ requirements: []
139
+ rubyforge_project:
140
+ rubygems_version: 2.0.14
141
+ signing_key:
142
+ specification_version: 4
143
+ summary: A basic, human readable sitemap generator
144
+ test_files:
145
+ - spec/filter_spec.rb
146
+ - spec/generator_spec.rb
147
+ - spec/spec_helper.rb