sitemap_checker 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -2
- data/lib/sitemap_checker.rb +24 -15
- data/lib/sitemap_checker/version.rb +1 -1
- data/spec/fixtures/siteindex.xml +11 -0
- data/spec/fixtures/siteindex.xml.gz +0 -0
- data/spec/fixtures/siteindex.xsd +73 -0
- data/spec/fixtures/{valid_sitemap.xml → sitemap.xml} +0 -0
- data/spec/fixtures/{valid_sitemap.xml.gz → sitemap.xml.gz} +0 -0
- data/spec/fixtures/{sitemap_schema.xml → sitemap.xsd} +0 -0
- data/spec/sitemap_checker_spec.rb +24 -11
- metadata +61 -39
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# SitemapChecker
|
2
2
|
|
3
|
-
|
3
|
+
Takes a URL pointing to an xml or xml.gz sitemap or siteindex file and returns an array of status messages for the URLs contained within.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,7 +18,8 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
SitemapChecker::Checker.new(url)
|
22
|
+
|
22
23
|
|
23
24
|
## Contributing
|
24
25
|
|
data/lib/sitemap_checker.rb
CHANGED
@@ -7,35 +7,44 @@ module SitemapChecker
|
|
7
7
|
class Checker
|
8
8
|
attr_reader :status_list
|
9
9
|
|
10
|
-
def initialize(url,schema='
|
11
|
-
@schema = schema
|
10
|
+
def initialize(url,schema='')
|
12
11
|
@url = url
|
13
|
-
@
|
14
|
-
|
15
|
-
@status_list = get_status_list
|
12
|
+
@status_list = Array.new
|
13
|
+
process_xml
|
16
14
|
end
|
17
15
|
|
18
16
|
private
|
19
17
|
|
20
|
-
def get_xml_from_url
|
18
|
+
def get_xml_from_url(url)
|
21
19
|
begin
|
22
|
-
Nokogiri::XML(Zlib::GzipReader.new(open(
|
20
|
+
Nokogiri::XML(Zlib::GzipReader.new(open(url)))
|
23
21
|
rescue
|
24
|
-
Nokogiri::XML(open(
|
22
|
+
Nokogiri::XML(open(url))
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
26
|
+
def process_xml
|
27
|
+
mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
|
28
|
+
ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
|
29
|
+
xml = get_xml_from_url(@url)
|
30
|
+
if mxsd.valid?(xml)
|
31
|
+
@status_list = get_status_list(urls(xml))
|
32
|
+
elsif ixsd.valid?(xml)
|
33
|
+
maps = urls(xml)
|
34
|
+
maps.each do |map|
|
35
|
+
xml = get_xml_from_url(map)
|
36
|
+
@status_list += get_status_list(urls(xml))
|
37
|
+
end
|
38
|
+
else raise 'Invalid Schema'
|
39
|
+
false
|
40
|
+
end
|
32
41
|
end
|
33
42
|
|
34
|
-
def urls
|
35
|
-
|
43
|
+
def urls(xml)
|
44
|
+
xml.xpath("//xmlns:loc")
|
36
45
|
end
|
37
46
|
|
38
|
-
def get_status_list
|
47
|
+
def get_status_list(urls)
|
39
48
|
statuses = []
|
40
49
|
urls.each do |url|
|
41
50
|
begin
|
@@ -0,0 +1,11 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<sitemapindex xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
3
|
+
<sitemap>
|
4
|
+
<loc>http://www.github.com/sitemap.xml</loc>
|
5
|
+
<lastmod>2012-07-30T17:14:40-07:00</lastmod>
|
6
|
+
</sitemap>
|
7
|
+
<sitemap>
|
8
|
+
<loc>http://www.github.com/sitemap.xml.gz</loc>
|
9
|
+
<lastmod>2012-07-30T17:14:40-07:00</lastmod>
|
10
|
+
</sitemap>
|
11
|
+
</sitemapindex>
|
Binary file
|
@@ -0,0 +1,73 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
3
|
+
targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
|
4
|
+
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
|
5
|
+
elementFormDefault="qualified">
|
6
|
+
<xsd:annotation>
|
7
|
+
<xsd:documentation>
|
8
|
+
XML Schema for Sitemap index files.
|
9
|
+
Last Modified 2009-04-08
|
10
|
+
</xsd:documentation>
|
11
|
+
</xsd:annotation>
|
12
|
+
|
13
|
+
<xsd:element name="sitemapindex">
|
14
|
+
<xsd:annotation>
|
15
|
+
<xsd:documentation>
|
16
|
+
Container for a set of up to 50,000 sitemap URLs.
|
17
|
+
This is the root element of the XML file.
|
18
|
+
</xsd:documentation>
|
19
|
+
</xsd:annotation>
|
20
|
+
<xsd:complexType>
|
21
|
+
<xsd:sequence>
|
22
|
+
<xsd:element name="sitemap" type="tSitemap" maxOccurs="unbounded"/>
|
23
|
+
</xsd:sequence>
|
24
|
+
</xsd:complexType>
|
25
|
+
</xsd:element>
|
26
|
+
|
27
|
+
<xsd:complexType name="tSitemap">
|
28
|
+
<xsd:annotation>
|
29
|
+
<xsd:documentation>
|
30
|
+
Container for the data needed to describe a sitemap.
|
31
|
+
</xsd:documentation>
|
32
|
+
</xsd:annotation>
|
33
|
+
<xsd:all>
|
34
|
+
<xsd:element name="loc" type="tLocSitemap"/>
|
35
|
+
<xsd:element name="lastmod" type="tLastmodSitemap" minOccurs="0"/>
|
36
|
+
</xsd:all>
|
37
|
+
</xsd:complexType>
|
38
|
+
|
39
|
+
<xsd:simpleType name="tLocSitemap">
|
40
|
+
<xsd:annotation>
|
41
|
+
<xsd:documentation>
|
42
|
+
REQUIRED: The location URI of a sitemap.
|
43
|
+
The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
|
44
|
+
</xsd:documentation>
|
45
|
+
</xsd:annotation>
|
46
|
+
<xsd:restriction base="xsd:anyURI">
|
47
|
+
<xsd:minLength value="12"/>
|
48
|
+
<xsd:maxLength value="2048"/>
|
49
|
+
</xsd:restriction>
|
50
|
+
</xsd:simpleType>
|
51
|
+
|
52
|
+
<xsd:simpleType name="tLastmodSitemap">
|
53
|
+
<xsd:annotation>
|
54
|
+
<xsd:documentation>
|
55
|
+
OPTIONAL: The date the document was last modified. The date must conform
|
56
|
+
to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
|
57
|
+
Example: 2005-05-10
|
58
|
+
Lastmod may also contain a timestamp.
|
59
|
+
Example: 2005-05-10T17:33:30+08:00
|
60
|
+
</xsd:documentation>
|
61
|
+
</xsd:annotation>
|
62
|
+
<xsd:union>
|
63
|
+
<xsd:simpleType>
|
64
|
+
<xsd:restriction base="xsd:date"/>
|
65
|
+
</xsd:simpleType>
|
66
|
+
<xsd:simpleType>
|
67
|
+
<xsd:restriction base="xsd:dateTime"/>
|
68
|
+
</xsd:simpleType>
|
69
|
+
</xsd:union>
|
70
|
+
</xsd:simpleType>
|
71
|
+
|
72
|
+
|
73
|
+
</xsd:schema>
|
File without changes
|
File without changes
|
File without changes
|
@@ -8,16 +8,24 @@ describe SitemapChecker do
|
|
8
8
|
@dir = Pathname.new(File.dirname(__FILE__))
|
9
9
|
stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
|
10
10
|
stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
|
11
|
-
stub_request(:any, "http://www.github.com/
|
12
|
-
stub_request(:any, "http://www.github.com/
|
13
|
-
stub_request(:
|
14
|
-
|
15
|
-
|
11
|
+
stub_request(:any, "http://www.github.com/sitemap.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml'))
|
12
|
+
stub_request(:any, "http://www.github.com/sitemap.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml.gz'))
|
13
|
+
stub_request(:any, "http://www.github.com/siteindex.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml'))
|
14
|
+
stub_request(:any, "http://www.github.com/siteindex.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml.gz'))
|
15
|
+
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xsd'), :headers => {})
|
16
|
+
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
|
16
17
|
end
|
17
18
|
|
18
|
-
it "accepts xml and gzipped
|
19
|
-
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/
|
20
|
-
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/
|
19
|
+
it "accepts xml and gzipped siteindexes" do
|
20
|
+
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
|
21
|
+
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
|
22
|
+
@xml_sitemap.status_list.size.should eq(4)
|
23
|
+
@gz_sitemap.status_list.size.should eq(4)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "accepts xml and gzipped sitemaps" do
|
27
|
+
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
|
28
|
+
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
|
21
29
|
@xml_sitemap.status_list.size.should eq(2)
|
22
30
|
@gz_sitemap.status_list.size.should eq(2)
|
23
31
|
end
|
@@ -26,9 +34,14 @@ describe SitemapChecker do
|
|
26
34
|
lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
|
27
35
|
end
|
28
36
|
|
29
|
-
it "returns list of urls with responses" do
|
30
|
-
@
|
31
|
-
@
|
37
|
+
it "returns list of urls with responses from sitemap" do
|
38
|
+
@sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
|
39
|
+
@sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
|
40
|
+
end
|
41
|
+
|
42
|
+
it "returns list of urls with responses from siteindex" do
|
43
|
+
@siteindex = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
|
44
|
+
@siteindex.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404'], ['http://www.github.com','200'], ['http://www.github.com/404','404']])
|
32
45
|
end
|
33
46
|
|
34
47
|
end
|
metadata
CHANGED
@@ -1,39 +1,46 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_checker
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 1317335842608397244
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Gerlando Piro
|
9
14
|
autorequire:
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
17
|
+
|
18
|
+
date: 2012-07-31 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
15
21
|
name: nokogiri
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ! '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
22
|
-
type: :runtime
|
23
22
|
prerelease: false
|
24
|
-
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
26
|
-
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 2002549777813010636
|
29
|
+
segments:
|
30
|
+
- 0
|
31
|
+
version: "0"
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
30
34
|
description: SiteMap Checker
|
31
|
-
email:
|
35
|
+
email:
|
32
36
|
- gerlando@gmail.com
|
33
37
|
executables: []
|
38
|
+
|
34
39
|
extensions: []
|
40
|
+
|
35
41
|
extra_rdoc_files: []
|
36
|
-
|
42
|
+
|
43
|
+
files:
|
37
44
|
- .gitignore
|
38
45
|
- Gemfile
|
39
46
|
- LICENSE
|
@@ -42,38 +49,53 @@ files:
|
|
42
49
|
- lib/sitemap_checker.rb
|
43
50
|
- lib/sitemap_checker/version.rb
|
44
51
|
- sitemap_checker.gemspec
|
45
|
-
- spec/fixtures/
|
46
|
-
- spec/fixtures/
|
47
|
-
- spec/fixtures/
|
52
|
+
- spec/fixtures/siteindex.xml
|
53
|
+
- spec/fixtures/siteindex.xml.gz
|
54
|
+
- spec/fixtures/siteindex.xsd
|
55
|
+
- spec/fixtures/sitemap.xml
|
56
|
+
- spec/fixtures/sitemap.xml.gz
|
57
|
+
- spec/fixtures/sitemap.xsd
|
48
58
|
- spec/sitemap_checker_spec.rb
|
49
59
|
- spec/spec_helper.rb
|
50
60
|
homepage: https://github.com/gerlandop/sitemap_checker
|
51
61
|
licenses: []
|
62
|
+
|
52
63
|
post_install_message:
|
53
64
|
rdoc_options: []
|
54
|
-
|
65
|
+
|
66
|
+
require_paths:
|
55
67
|
- lib
|
56
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
69
|
none: false
|
58
|
-
requirements:
|
59
|
-
- -
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
|
62
|
-
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 2002549777813010636
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
78
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 2002549777813010636
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
68
86
|
requirements: []
|
87
|
+
|
69
88
|
rubyforge_project:
|
70
89
|
rubygems_version: 1.8.24
|
71
90
|
signing_key:
|
72
91
|
specification_version: 3
|
73
92
|
summary: Gets status of Urls in SiteMap
|
74
|
-
test_files:
|
75
|
-
- spec/fixtures/
|
76
|
-
- spec/fixtures/
|
77
|
-
- spec/fixtures/
|
93
|
+
test_files:
|
94
|
+
- spec/fixtures/siteindex.xml
|
95
|
+
- spec/fixtures/siteindex.xml.gz
|
96
|
+
- spec/fixtures/siteindex.xsd
|
97
|
+
- spec/fixtures/sitemap.xml
|
98
|
+
- spec/fixtures/sitemap.xml.gz
|
99
|
+
- spec/fixtures/sitemap.xsd
|
78
100
|
- spec/sitemap_checker_spec.rb
|
79
101
|
- spec/spec_helper.rb
|