sitemap_checker 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -2
- data/lib/sitemap_checker.rb +24 -15
- data/lib/sitemap_checker/version.rb +1 -1
- data/spec/fixtures/siteindex.xml +11 -0
- data/spec/fixtures/siteindex.xml.gz +0 -0
- data/spec/fixtures/siteindex.xsd +73 -0
- data/spec/fixtures/{valid_sitemap.xml → sitemap.xml} +0 -0
- data/spec/fixtures/{valid_sitemap.xml.gz → sitemap.xml.gz} +0 -0
- data/spec/fixtures/{sitemap_schema.xml → sitemap.xsd} +0 -0
- data/spec/sitemap_checker_spec.rb +24 -11
- metadata +61 -39
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# SitemapChecker
|
2
2
|
|
3
|
-
|
3
|
+
Takes a URL pointing to an xml or xml.gz sitemap or siteindex file and returns an array of status messages for the URLs contained within.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,7 +18,8 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
SitemapChecker::Checker.new(url)
|
22
|
+
|
22
23
|
|
23
24
|
## Contributing
|
24
25
|
|
data/lib/sitemap_checker.rb
CHANGED
@@ -7,35 +7,44 @@ module SitemapChecker
|
|
7
7
|
class Checker
|
8
8
|
attr_reader :status_list
|
9
9
|
|
10
|
-
def initialize(url,schema='
|
11
|
-
@schema = schema
|
10
|
+
def initialize(url,schema='')
|
12
11
|
@url = url
|
13
|
-
@
|
14
|
-
|
15
|
-
@status_list = get_status_list
|
12
|
+
@status_list = Array.new
|
13
|
+
process_xml
|
16
14
|
end
|
17
15
|
|
18
16
|
private
|
19
17
|
|
20
|
-
def get_xml_from_url
|
18
|
+
def get_xml_from_url(url)
|
21
19
|
begin
|
22
|
-
Nokogiri::XML(Zlib::GzipReader.new(open(
|
20
|
+
Nokogiri::XML(Zlib::GzipReader.new(open(url)))
|
23
21
|
rescue
|
24
|
-
Nokogiri::XML(open(
|
22
|
+
Nokogiri::XML(open(url))
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
26
|
+
def process_xml
|
27
|
+
mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
|
28
|
+
ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
|
29
|
+
xml = get_xml_from_url(@url)
|
30
|
+
if mxsd.valid?(xml)
|
31
|
+
@status_list = get_status_list(urls(xml))
|
32
|
+
elsif ixsd.valid?(xml)
|
33
|
+
maps = urls(xml)
|
34
|
+
maps.each do |map|
|
35
|
+
xml = get_xml_from_url(map)
|
36
|
+
@status_list += get_status_list(urls(xml))
|
37
|
+
end
|
38
|
+
else raise 'Invalid Schema'
|
39
|
+
false
|
40
|
+
end
|
32
41
|
end
|
33
42
|
|
34
|
-
def urls
|
35
|
-
|
43
|
+
def urls(xml)
|
44
|
+
xml.xpath("//xmlns:loc")
|
36
45
|
end
|
37
46
|
|
38
|
-
def get_status_list
|
47
|
+
def get_status_list(urls)
|
39
48
|
statuses = []
|
40
49
|
urls.each do |url|
|
41
50
|
begin
|
@@ -0,0 +1,11 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<sitemapindex xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
3
|
+
<sitemap>
|
4
|
+
<loc>http://www.github.com/sitemap.xml</loc>
|
5
|
+
<lastmod>2012-07-30T17:14:40-07:00</lastmod>
|
6
|
+
</sitemap>
|
7
|
+
<sitemap>
|
8
|
+
<loc>http://www.github.com/sitemap.xml.gz</loc>
|
9
|
+
<lastmod>2012-07-30T17:14:40-07:00</lastmod>
|
10
|
+
</sitemap>
|
11
|
+
</sitemapindex>
|
Binary file
|
@@ -0,0 +1,73 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
3
|
+
targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
|
4
|
+
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
|
5
|
+
elementFormDefault="qualified">
|
6
|
+
<xsd:annotation>
|
7
|
+
<xsd:documentation>
|
8
|
+
XML Schema for Sitemap index files.
|
9
|
+
Last Modified 2009-04-08
|
10
|
+
</xsd:documentation>
|
11
|
+
</xsd:annotation>
|
12
|
+
|
13
|
+
<xsd:element name="sitemapindex">
|
14
|
+
<xsd:annotation>
|
15
|
+
<xsd:documentation>
|
16
|
+
Container for a set of up to 50,000 sitemap URLs.
|
17
|
+
This is the root element of the XML file.
|
18
|
+
</xsd:documentation>
|
19
|
+
</xsd:annotation>
|
20
|
+
<xsd:complexType>
|
21
|
+
<xsd:sequence>
|
22
|
+
<xsd:element name="sitemap" type="tSitemap" maxOccurs="unbounded"/>
|
23
|
+
</xsd:sequence>
|
24
|
+
</xsd:complexType>
|
25
|
+
</xsd:element>
|
26
|
+
|
27
|
+
<xsd:complexType name="tSitemap">
|
28
|
+
<xsd:annotation>
|
29
|
+
<xsd:documentation>
|
30
|
+
Container for the data needed to describe a sitemap.
|
31
|
+
</xsd:documentation>
|
32
|
+
</xsd:annotation>
|
33
|
+
<xsd:all>
|
34
|
+
<xsd:element name="loc" type="tLocSitemap"/>
|
35
|
+
<xsd:element name="lastmod" type="tLastmodSitemap" minOccurs="0"/>
|
36
|
+
</xsd:all>
|
37
|
+
</xsd:complexType>
|
38
|
+
|
39
|
+
<xsd:simpleType name="tLocSitemap">
|
40
|
+
<xsd:annotation>
|
41
|
+
<xsd:documentation>
|
42
|
+
REQUIRED: The location URI of a sitemap.
|
43
|
+
The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
|
44
|
+
</xsd:documentation>
|
45
|
+
</xsd:annotation>
|
46
|
+
<xsd:restriction base="xsd:anyURI">
|
47
|
+
<xsd:minLength value="12"/>
|
48
|
+
<xsd:maxLength value="2048"/>
|
49
|
+
</xsd:restriction>
|
50
|
+
</xsd:simpleType>
|
51
|
+
|
52
|
+
<xsd:simpleType name="tLastmodSitemap">
|
53
|
+
<xsd:annotation>
|
54
|
+
<xsd:documentation>
|
55
|
+
OPTIONAL: The date the document was last modified. The date must conform
|
56
|
+
to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
|
57
|
+
Example: 2005-05-10
|
58
|
+
Lastmod may also contain a timestamp.
|
59
|
+
Example: 2005-05-10T17:33:30+08:00
|
60
|
+
</xsd:documentation>
|
61
|
+
</xsd:annotation>
|
62
|
+
<xsd:union>
|
63
|
+
<xsd:simpleType>
|
64
|
+
<xsd:restriction base="xsd:date"/>
|
65
|
+
</xsd:simpleType>
|
66
|
+
<xsd:simpleType>
|
67
|
+
<xsd:restriction base="xsd:dateTime"/>
|
68
|
+
</xsd:simpleType>
|
69
|
+
</xsd:union>
|
70
|
+
</xsd:simpleType>
|
71
|
+
|
72
|
+
|
73
|
+
</xsd:schema>
|
File without changes
|
File without changes
|
File without changes
|
@@ -8,16 +8,24 @@ describe SitemapChecker do
|
|
8
8
|
@dir = Pathname.new(File.dirname(__FILE__))
|
9
9
|
stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
|
10
10
|
stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
|
11
|
-
stub_request(:any, "http://www.github.com/
|
12
|
-
stub_request(:any, "http://www.github.com/
|
13
|
-
stub_request(:
|
14
|
-
|
15
|
-
|
11
|
+
stub_request(:any, "http://www.github.com/sitemap.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml'))
|
12
|
+
stub_request(:any, "http://www.github.com/sitemap.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml.gz'))
|
13
|
+
stub_request(:any, "http://www.github.com/siteindex.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml'))
|
14
|
+
stub_request(:any, "http://www.github.com/siteindex.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml.gz'))
|
15
|
+
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xsd'), :headers => {})
|
16
|
+
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
|
16
17
|
end
|
17
18
|
|
18
|
-
it "accepts xml and gzipped
|
19
|
-
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/
|
20
|
-
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/
|
19
|
+
it "accepts xml and gzipped siteindexes" do
|
20
|
+
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
|
21
|
+
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
|
22
|
+
@xml_sitemap.status_list.size.should eq(4)
|
23
|
+
@gz_sitemap.status_list.size.should eq(4)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "accepts xml and gzipped sitemaps" do
|
27
|
+
@xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
|
28
|
+
@gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
|
21
29
|
@xml_sitemap.status_list.size.should eq(2)
|
22
30
|
@gz_sitemap.status_list.size.should eq(2)
|
23
31
|
end
|
@@ -26,9 +34,14 @@ describe SitemapChecker do
|
|
26
34
|
lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
|
27
35
|
end
|
28
36
|
|
29
|
-
it "returns list of urls with responses" do
|
30
|
-
@
|
31
|
-
@
|
37
|
+
it "returns list of urls with responses from sitemap" do
|
38
|
+
@sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
|
39
|
+
@sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
|
40
|
+
end
|
41
|
+
|
42
|
+
it "returns list of urls with responses from siteindex" do
|
43
|
+
@siteindex = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
|
44
|
+
@siteindex.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404'], ['http://www.github.com','200'], ['http://www.github.com/404','404']])
|
32
45
|
end
|
33
46
|
|
34
47
|
end
|
metadata
CHANGED
@@ -1,39 +1,46 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_checker
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 1317335842608397244
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Gerlando Piro
|
9
14
|
autorequire:
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
17
|
+
|
18
|
+
date: 2012-07-31 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
15
21
|
name: nokogiri
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ! '>='
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
22
|
-
type: :runtime
|
23
22
|
prerelease: false
|
24
|
-
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
26
|
-
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 2002549777813010636
|
29
|
+
segments:
|
30
|
+
- 0
|
31
|
+
version: "0"
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
30
34
|
description: SiteMap Checker
|
31
|
-
email:
|
35
|
+
email:
|
32
36
|
- gerlando@gmail.com
|
33
37
|
executables: []
|
38
|
+
|
34
39
|
extensions: []
|
40
|
+
|
35
41
|
extra_rdoc_files: []
|
36
|
-
|
42
|
+
|
43
|
+
files:
|
37
44
|
- .gitignore
|
38
45
|
- Gemfile
|
39
46
|
- LICENSE
|
@@ -42,38 +49,53 @@ files:
|
|
42
49
|
- lib/sitemap_checker.rb
|
43
50
|
- lib/sitemap_checker/version.rb
|
44
51
|
- sitemap_checker.gemspec
|
45
|
-
- spec/fixtures/
|
46
|
-
- spec/fixtures/
|
47
|
-
- spec/fixtures/
|
52
|
+
- spec/fixtures/siteindex.xml
|
53
|
+
- spec/fixtures/siteindex.xml.gz
|
54
|
+
- spec/fixtures/siteindex.xsd
|
55
|
+
- spec/fixtures/sitemap.xml
|
56
|
+
- spec/fixtures/sitemap.xml.gz
|
57
|
+
- spec/fixtures/sitemap.xsd
|
48
58
|
- spec/sitemap_checker_spec.rb
|
49
59
|
- spec/spec_helper.rb
|
50
60
|
homepage: https://github.com/gerlandop/sitemap_checker
|
51
61
|
licenses: []
|
62
|
+
|
52
63
|
post_install_message:
|
53
64
|
rdoc_options: []
|
54
|
-
|
65
|
+
|
66
|
+
require_paths:
|
55
67
|
- lib
|
56
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
69
|
none: false
|
58
|
-
requirements:
|
59
|
-
- -
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
|
62
|
-
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 2002549777813010636
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
78
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 2002549777813010636
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
68
86
|
requirements: []
|
87
|
+
|
69
88
|
rubyforge_project:
|
70
89
|
rubygems_version: 1.8.24
|
71
90
|
signing_key:
|
72
91
|
specification_version: 3
|
73
92
|
summary: Gets status of Urls in SiteMap
|
74
|
-
test_files:
|
75
|
-
- spec/fixtures/
|
76
|
-
- spec/fixtures/
|
77
|
-
- spec/fixtures/
|
93
|
+
test_files:
|
94
|
+
- spec/fixtures/siteindex.xml
|
95
|
+
- spec/fixtures/siteindex.xml.gz
|
96
|
+
- spec/fixtures/siteindex.xsd
|
97
|
+
- spec/fixtures/sitemap.xml
|
98
|
+
- spec/fixtures/sitemap.xml.gz
|
99
|
+
- spec/fixtures/sitemap.xsd
|
78
100
|
- spec/sitemap_checker_spec.rb
|
79
101
|
- spec/spec_helper.rb
|