static_sitemap_tasks 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +9 -0
- data/README.md +24 -48
- data/lib/static_sitemap_tasks.rb +84 -32
- data/static_sitemap_tasks.gemspec +1 -1
- metadata +7 -6
data/ChangeLog
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
v0.2
|
2
|
+
* Namespace tasks with :sitemap
|
3
|
+
* Add ping task from http://github.com/adamsalter/sitemap_generator
|
4
|
+
* Add lastmod tags ability
|
5
|
+
|
6
|
+
v0.1
|
7
|
+
* Initial fork from http://github.com/tcocca/generate_sitemap
|
8
|
+
* Refactor to scan local directory
|
9
|
+
* Refactor to set config items on rake task install
|
data/README.md
CHANGED
@@ -1,59 +1,35 @@
|
|
1
|
-
|
1
|
+
## Generate Sitemap Rake Task ##
|
2
2
|
|
3
|
-
|
4
|
-
Updates by Tom Cocca
|
5
|
-
Updates include:
|
6
|
-
- Adding a YAML config file to set URL and Change Frequency
|
7
|
-
|
8
|
-
|
9
|
-
== Generate Sitemap Rake Task
|
10
|
-
|
11
|
-
This is a Rails plugin that consists of a rake task to generate a sitemap.xml file.
|
12
|
-
The task crawls a domain (specified in a YAML config per environment) for all it's url's,
|
13
|
-
then builds the sitemap.xml file in public.
|
14
|
-
|
15
|
-
Since this simply crawls a domain for url's, this could be used to generate
|
16
|
-
sitemaps for any site, not just a Rails application.
|
17
|
-
|
18
|
-
|
19
|
-
== Requirements
|
20
|
-
|
21
|
-
Both of these are available via RubyGems
|
22
|
-
Hpricot - http://code.whytheluckystiff.net/hpricot/
|
23
|
-
Builder - http://rubyforge.org/projects/builder/
|
3
|
+
This is a small rake task that will crawl a static site locally in the specified directory and generate a sitemap.xml file with a list of links, optionally compressing it.
|
24
4
|
|
5
|
+
## Installation ##
|
25
6
|
|
26
|
-
|
7
|
+
gem install static_sitemap_tasks
|
27
8
|
|
28
|
-
|
29
|
-
|
9
|
+
## Configuration/Usage ##
|
10
|
+
To use, require the gem in your Rakefile and install the task with configuration
|
30
11
|
|
31
|
-
|
32
|
-
your site.
|
12
|
+
require 'static_sitemap_tasks'
|
33
13
|
|
34
|
-
|
35
|
-
|
14
|
+
SitemapGenerator::Tasks.install(
|
15
|
+
:base_url => 'http://www.mysite.com', # Required
|
16
|
+
:change_frequency => 'daily', # Optional, see http://www.sitemaps.org/protocol.php#changefreqdef
|
17
|
+
:date_mode => 'git', # Optional, one of 'git' or 'mtime'. default: nil (disabled)
|
18
|
+
:gzip_output => true, # Optional, default: true
|
19
|
+
:index_files => [ 'index.html' ], # Optional, default: [ 'index.html', 'index.htm' ]
|
20
|
+
:public_root => 'public' # Optional, default: Dir.pwd
|
21
|
+
)
|
36
22
|
|
37
|
-
|
38
|
-
|
23
|
+
To execute,
|
24
|
+
rake sitemap:generate
|
39
25
|
|
40
|
-
|
41
|
-
rake plugin:generate_sitemap
|
42
|
-
|
43
|
-
Go to http://localhost:3000/sitemap.xml (or open public/sitemap.xml)
|
44
|
-
|
45
|
-
|
46
|
-
== TODO
|
47
|
-
|
48
|
-
- set changefreq, lastmod, priority dynamically during generation
|
49
|
-
- allow generation of sitemap index files
|
50
|
-
- write tests
|
51
|
-
- allow for exclusions to be specified in an array
|
52
|
-
|
53
|
-
|
54
|
-
== More Info
|
26
|
+
## More Info ##
|
55
27
|
|
56
28
|
http://www.sitemaps.org/protocol.php
|
57
29
|
|
58
|
-
|
59
|
-
|
30
|
+
## Credits ##
|
31
|
+
|
32
|
+
Originally Authored by Chris Marting (http://chriscodes.com/articles/view/54)
|
33
|
+
Updates by Tom Cocca
|
34
|
+
Rewrite for static sites by Michael Leinartas
|
35
|
+
ping_search_engines() lifted from http://github.com/adamsalter/sitemap_generator by Adam Salter
|
data/lib/static_sitemap_tasks.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'builder'
|
3
3
|
require 'hpricot'
|
4
|
+
require 'time'
|
5
|
+
require 'cgi'
|
4
6
|
require 'uri'
|
5
7
|
|
6
8
|
|
@@ -14,22 +16,74 @@ module SitemapGenerator
|
|
14
16
|
end
|
15
17
|
|
16
18
|
def initialize(options = {})
|
17
|
-
# Root of files to crawl
|
18
|
-
@public_root = options[:public_root] || Dir.pwd
|
19
|
-
# Change frequency - see: http://www.sitemaps.org/protocol.php#changefreqdef
|
20
|
-
@change_frequency = options[:change_frequency]
|
21
19
|
# Canonical domain of published site
|
22
20
|
@base_url = options[:base_url]
|
23
|
-
#
|
24
|
-
@
|
21
|
+
# Change frequency - see: http://www.sitemaps.org/protocol.php#changefreqdef
|
22
|
+
@change_frequency = options[:change_frequency]
|
23
|
+
# Date mode - one of [ 'git', 'mtime' ]
|
24
|
+
@date_mode = options[:date_mode]
|
25
25
|
# Compress output to sitemap.xml.gz
|
26
26
|
@gzip_output = options[:gzip_output] || true
|
27
|
+
# Index pages
|
28
|
+
@index_files = options[:index_files] || [ 'index.html', 'index.htm' ]
|
29
|
+
# Root of files to crawl
|
30
|
+
@public_root = options[:public_root] || Dir.pwd
|
27
31
|
end
|
28
32
|
|
29
33
|
def install
|
30
|
-
|
31
|
-
|
32
|
-
|
34
|
+
namespace :sitemap do
|
35
|
+
desc "Generate a sitemap based on the contents of #{@public_root}"
|
36
|
+
task :generate do
|
37
|
+
generate_sitemap
|
38
|
+
end
|
39
|
+
|
40
|
+
desc "Ping providers to notify them that a new sitemap.xml is available"
|
41
|
+
task :ping do
|
42
|
+
ping_search_engines
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# uses Hpricot to grab links from a URI
|
48
|
+
# adds uri to @pages_crawled
|
49
|
+
# loops each link found
|
50
|
+
# adds link to pages array if it should be included, unless it already exists
|
51
|
+
def crawl_for_links(link)
|
52
|
+
if link.include?('http')
|
53
|
+
return unless link_path.include?(@base_url)
|
54
|
+
link_path = link.sub!(@base_url,'')
|
55
|
+
else
|
56
|
+
link_path = link
|
57
|
+
end
|
58
|
+
file_path = resolve_file_path(File.join(@public_root, link_path))
|
59
|
+
|
60
|
+
if file_path.nil?
|
61
|
+
puts "Warning: Unable to resolve #{link_path} to a local file"
|
62
|
+
return
|
63
|
+
end
|
64
|
+
|
65
|
+
puts "Inspecting #{file_path}...\n"
|
66
|
+
doc = Hpricot(open(file_path)) rescue nil
|
67
|
+
return unless doc
|
68
|
+
@pages_crawled << link
|
69
|
+
last_updated = find_date(file_path)
|
70
|
+
@page_times[link] = last_updated if last_updated
|
71
|
+
|
72
|
+
(doc/"a").each do |a|
|
73
|
+
if a['href'] && should_be_included?(a['href'])
|
74
|
+
@pages << a['href'] unless(link_exists?(a['href'],@pages))
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_date(file)
|
80
|
+
case @date_mode
|
81
|
+
when 'git'
|
82
|
+
date = %x[git log -n 1 --date=iso --format="%ad" #{file}]
|
83
|
+
date.strip()
|
84
|
+
when 'mtime'
|
85
|
+
mtime = File.mtime(file) rescue nil
|
86
|
+
mtime.iso8601 if mtime
|
33
87
|
end
|
34
88
|
end
|
35
89
|
|
@@ -37,6 +91,7 @@ module SitemapGenerator
|
|
37
91
|
# holds pages to go into map, and pages crawled
|
38
92
|
@pages = []
|
39
93
|
@pages_crawled = []
|
94
|
+
@page_times = {}
|
40
95
|
|
41
96
|
# start with index pages
|
42
97
|
crawl_for_links('/')
|
@@ -54,9 +109,10 @@ module SitemapGenerator
|
|
54
109
|
# loop through array of pages, and build sitemap.xml
|
55
110
|
@pages.sort.each {|link|
|
56
111
|
xml.url {
|
57
|
-
xml.loc URI.join(@base_url, link)
|
112
|
+
xml.loc URI.join(@base_url, link).to_s
|
58
113
|
# TODO - set changefreq dynamically per page
|
59
114
|
xml.changefreq @change_frequency unless @change_frequency.nil?
|
115
|
+
xml.lastmod @page_times[link] unless @page_times[link].nil?
|
60
116
|
}
|
61
117
|
}
|
62
118
|
}
|
@@ -77,29 +133,25 @@ module SitemapGenerator
|
|
77
133
|
xml_file.close
|
78
134
|
end
|
79
135
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
return unless link_path.include?(@base_url)
|
87
|
-
link_path.sub!(@base_url,'')
|
88
|
-
end
|
89
|
-
file_path = resolve_file_path(File.join(@public_root, link_path))
|
90
|
-
|
91
|
-
if file_path.nil?
|
92
|
-
puts "Warning: Unable to resolve #{link_path} to a local file"
|
93
|
-
return
|
136
|
+
def ping_search_engines
|
137
|
+
require 'open-uri'
|
138
|
+
if @gzip_output
|
139
|
+
url = URI.join(@base_url,'sitemap.xml.gz').to_s
|
140
|
+
else
|
141
|
+
url = URI.join(@base_url,'sitemap.xml').to_s
|
94
142
|
end
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
143
|
+
index_location = CGI.escape(url)
|
144
|
+
|
145
|
+
# engines list from http://en.wikipedia.org/wiki/Sitemap_index
|
146
|
+
{:google => "http://www.google.com/webmasters/sitemaps/ping?sitemap=#{index_location}",
|
147
|
+
:ask => "http://submissions.ask.com/ping?sitemap=#{index_location}",
|
148
|
+
:bing => "http://www.bing.com/webmaster/ping.aspx?siteMap=#{index_location}",
|
149
|
+
:sitemap_writer => "http://www.sitemapwriter.com/notify.php?crawler=all&url=#{index_location}"}.each do |engine, link|
|
150
|
+
begin
|
151
|
+
open(link)
|
152
|
+
puts "Successful ping of #{engine.to_s}" if verbose
|
153
|
+
rescue Timeout::Error, StandardError => e
|
154
|
+
puts "Ping failed for #{engine.to_s}: #{e.inspect}" if verbose
|
103
155
|
end
|
104
156
|
end
|
105
157
|
end
|
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "static_sitemap_tasks"
|
6
|
-
s.version = "0.1"
|
6
|
+
s.version = "0.2"
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
8
|
s.summary = 'Rake tasks to manage sitemap.xml generation for static sites'
|
9
9
|
s.description = 'Rake tasks to manage sitemap.xml generation for static sites'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: static_sitemap_tasks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,11 +11,11 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2011-10-
|
14
|
+
date: 2011-10-14 00:00:00.000000000Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rake
|
18
|
-
requirement: &
|
18
|
+
requirement: &70099743651700 !ruby/object:Gem::Requirement
|
19
19
|
none: false
|
20
20
|
requirements:
|
21
21
|
- - ! '>='
|
@@ -23,10 +23,10 @@ dependencies:
|
|
23
23
|
version: 0.8.7
|
24
24
|
type: :development
|
25
25
|
prerelease: false
|
26
|
-
version_requirements: *
|
26
|
+
version_requirements: *70099743651700
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
|
-
requirement: &
|
29
|
+
requirement: &70099743644920 !ruby/object:Gem::Requirement
|
30
30
|
none: false
|
31
31
|
requirements:
|
32
32
|
- - ! '>='
|
@@ -34,7 +34,7 @@ dependencies:
|
|
34
34
|
version: '1.0'
|
35
35
|
type: :development
|
36
36
|
prerelease: false
|
37
|
-
version_requirements: *
|
37
|
+
version_requirements: *70099743644920
|
38
38
|
description: Rake tasks to manage sitemap.xml generation for static sites
|
39
39
|
email:
|
40
40
|
- mleinartas@gmail.com
|
@@ -43,6 +43,7 @@ extensions: []
|
|
43
43
|
extra_rdoc_files: []
|
44
44
|
files:
|
45
45
|
- .gitignore
|
46
|
+
- ChangeLog
|
46
47
|
- Gemfile
|
47
48
|
- MIT-LICENSE
|
48
49
|
- README.md
|