sitemap_checker 0.0.1 → 0.0.2

This diff shows the changes between two publicly released versions of the package, as they appear in one of the supported public registries. It is provided for informational purposes only.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # SitemapChecker
2
2
 
3
- Checks sitemap urls for valid response
3
+ Takes a URL pointing to an XML (`.xml`) or gzipped XML (`.xml.gz`) sitemap or sitemap index file and returns an array of status messages for the URLs contained within.
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,8 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ SitemapChecker::Checker.new(url)
22
+
22
23
 
23
24
  ## Contributing
24
25
 
@@ -7,35 +7,44 @@ module SitemapChecker
7
7
  class Checker
8
8
  attr_reader :status_list
9
9
 
10
- def initialize(url,schema='http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
11
- @schema = schema
10
+ def initialize(url,schema='')
12
11
  @url = url
13
- @sitemap = get_xml_from_url
14
- sitemap_is_valid?
15
- @status_list = get_status_list
12
+ @status_list = Array.new
13
+ process_xml
16
14
  end
17
15
 
18
16
  private
19
17
 
20
- def get_xml_from_url
18
+ def get_xml_from_url(url)
21
19
  begin
22
- Nokogiri::XML(Zlib::GzipReader.new(open(@url)))
20
+ Nokogiri::XML(Zlib::GzipReader.new(open(url)))
23
21
  rescue
24
- Nokogiri::XML(open(@url))
22
+ Nokogiri::XML(open(url))
25
23
  end
26
24
  end
27
25
 
28
- def sitemap_is_valid?
29
- xsd = Nokogiri::XML::Schema(open(@schema))
30
- raise 'Invalid Schema' unless xsd.valid?(@sitemap)
31
- true
26
+ def process_xml
27
+ mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
28
+ ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
29
+ xml = get_xml_from_url(@url)
30
+ if mxsd.valid?(xml)
31
+ @status_list = get_status_list(urls(xml))
32
+ elsif ixsd.valid?(xml)
33
+ maps = urls(xml)
34
+ maps.each do |map|
35
+ xml = get_xml_from_url(map)
36
+ @status_list += get_status_list(urls(xml))
37
+ end
38
+ else raise 'Invalid Schema'
39
+ false
40
+ end
32
41
  end
33
42
 
34
- def urls
35
- @sitemap.xpath("//xmlns:loc")
43
+ def urls(xml)
44
+ xml.xpath("//xmlns:loc")
36
45
  end
37
46
 
38
- def get_status_list
47
+ def get_status_list(urls)
39
48
  statuses = []
40
49
  urls.each do |url|
41
50
  begin
@@ -1,3 +1,3 @@
1
1
  module SitemapChecker
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <sitemapindex xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3
+ <sitemap>
4
+ <loc>http://www.github.com/sitemap.xml</loc>
5
+ <lastmod>2012-07-30T17:14:40-07:00</lastmod>
6
+ </sitemap>
7
+ <sitemap>
8
+ <loc>http://www.github.com/sitemap.xml.gz</loc>
9
+ <lastmod>2012-07-30T17:14:40-07:00</lastmod>
10
+ </sitemap>
11
+ </sitemapindex>
Binary file
@@ -0,0 +1,73 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
3
+ targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
4
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
5
+ elementFormDefault="qualified">
6
+ <xsd:annotation>
7
+ <xsd:documentation>
8
+ XML Schema for Sitemap index files.
9
+ Last Modified 2009-04-08
10
+ </xsd:documentation>
11
+ </xsd:annotation>
12
+
13
+ <xsd:element name="sitemapindex">
14
+ <xsd:annotation>
15
+ <xsd:documentation>
16
+ Container for a set of up to 50,000 sitemap URLs.
17
+ This is the root element of the XML file.
18
+ </xsd:documentation>
19
+ </xsd:annotation>
20
+ <xsd:complexType>
21
+ <xsd:sequence>
22
+ <xsd:element name="sitemap" type="tSitemap" maxOccurs="unbounded"/>
23
+ </xsd:sequence>
24
+ </xsd:complexType>
25
+ </xsd:element>
26
+
27
+ <xsd:complexType name="tSitemap">
28
+ <xsd:annotation>
29
+ <xsd:documentation>
30
+ Container for the data needed to describe a sitemap.
31
+ </xsd:documentation>
32
+ </xsd:annotation>
33
+ <xsd:all>
34
+ <xsd:element name="loc" type="tLocSitemap"/>
35
+ <xsd:element name="lastmod" type="tLastmodSitemap" minOccurs="0"/>
36
+ </xsd:all>
37
+ </xsd:complexType>
38
+
39
+ <xsd:simpleType name="tLocSitemap">
40
+ <xsd:annotation>
41
+ <xsd:documentation>
42
+ REQUIRED: The location URI of a sitemap.
43
+ The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
44
+ </xsd:documentation>
45
+ </xsd:annotation>
46
+ <xsd:restriction base="xsd:anyURI">
47
+ <xsd:minLength value="12"/>
48
+ <xsd:maxLength value="2048"/>
49
+ </xsd:restriction>
50
+ </xsd:simpleType>
51
+
52
+ <xsd:simpleType name="tLastmodSitemap">
53
+ <xsd:annotation>
54
+ <xsd:documentation>
55
+ OPTIONAL: The date the document was last modified. The date must conform
56
+ to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
57
+ Example: 2005-05-10
58
+ Lastmod may also contain a timestamp.
59
+ Example: 2005-05-10T17:33:30+08:00
60
+ </xsd:documentation>
61
+ </xsd:annotation>
62
+ <xsd:union>
63
+ <xsd:simpleType>
64
+ <xsd:restriction base="xsd:date"/>
65
+ </xsd:simpleType>
66
+ <xsd:simpleType>
67
+ <xsd:restriction base="xsd:dateTime"/>
68
+ </xsd:simpleType>
69
+ </xsd:union>
70
+ </xsd:simpleType>
71
+
72
+
73
+ </xsd:schema>
File without changes
File without changes
@@ -8,16 +8,24 @@ describe SitemapChecker do
8
8
  @dir = Pathname.new(File.dirname(__FILE__))
9
9
  stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
10
10
  stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
11
- stub_request(:any, "http://www.github.com/index.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml'))
12
- stub_request(:any, "http://www.github.com/index.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml.gz'))
13
- stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").
14
- with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
15
- to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap_schema.xml'), :headers => {})
11
+ stub_request(:any, "http://www.github.com/sitemap.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml'))
12
+ stub_request(:any, "http://www.github.com/sitemap.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml.gz'))
13
+ stub_request(:any, "http://www.github.com/siteindex.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml'))
14
+ stub_request(:any, "http://www.github.com/siteindex.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml.gz'))
15
+ stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xsd'), :headers => {})
16
+ stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
16
17
  end
17
18
 
18
- it "accepts xml and gzipped files" do
19
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
20
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml.gz')
19
+ it "accepts xml and gzipped siteindexes" do
20
+ @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
21
+ @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
22
+ @xml_sitemap.status_list.size.should eq(4)
23
+ @gz_sitemap.status_list.size.should eq(4)
24
+ end
25
+
26
+ it "accepts xml and gzipped sitemaps" do
27
+ @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
28
+ @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
21
29
  @xml_sitemap.status_list.size.should eq(2)
22
30
  @gz_sitemap.status_list.size.should eq(2)
23
31
  end
@@ -26,9 +34,14 @@ describe SitemapChecker do
26
34
  lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
27
35
  end
28
36
 
29
- it "returns list of urls with responses" do
30
- @valid_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
31
- @valid_sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
37
+ it "returns list of urls with responses from sitemap" do
38
+ @sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
39
+ @sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
40
+ end
41
+
42
+ it "returns list of urls with responses from siteindex" do
43
+ @siteindex = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
44
+ @siteindex.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404'], ['http://www.github.com','200'], ['http://www.github.com/404','404']])
32
45
  end
33
46
 
34
47
  end
metadata CHANGED
@@ -1,39 +1,46 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: sitemap_checker
3
- version: !ruby/object:Gem::Version
4
- version: 0.0.1
3
+ version: !ruby/object:Gem::Version
4
+ hash: 1317335842608397244
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 2
10
+ version: 0.0.2
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Gerlando Piro
9
14
  autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
- date: 2012-07-29 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
17
+
18
+ date: 2012-07-31 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
15
21
  name: nokogiri
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :runtime
23
22
  prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
23
+ requirement: &id001 !ruby/object:Gem::Requirement
25
24
  none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 2002549777813010636
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ version_requirements: *id001
30
34
  description: SiteMap Checker
31
- email:
35
+ email:
32
36
  - gerlando@gmail.com
33
37
  executables: []
38
+
34
39
  extensions: []
40
+
35
41
  extra_rdoc_files: []
36
- files:
42
+
43
+ files:
37
44
  - .gitignore
38
45
  - Gemfile
39
46
  - LICENSE
@@ -42,38 +49,53 @@ files:
42
49
  - lib/sitemap_checker.rb
43
50
  - lib/sitemap_checker/version.rb
44
51
  - sitemap_checker.gemspec
45
- - spec/fixtures/sitemap_schema.xml
46
- - spec/fixtures/valid_sitemap.xml
47
- - spec/fixtures/valid_sitemap.xml.gz
52
+ - spec/fixtures/siteindex.xml
53
+ - spec/fixtures/siteindex.xml.gz
54
+ - spec/fixtures/siteindex.xsd
55
+ - spec/fixtures/sitemap.xml
56
+ - spec/fixtures/sitemap.xml.gz
57
+ - spec/fixtures/sitemap.xsd
48
58
  - spec/sitemap_checker_spec.rb
49
59
  - spec/spec_helper.rb
50
60
  homepage: https://github.com/gerlandop/sitemap_checker
51
61
  licenses: []
62
+
52
63
  post_install_message:
53
64
  rdoc_options: []
54
- require_paths:
65
+
66
+ require_paths:
55
67
  - lib
56
- required_ruby_version: !ruby/object:Gem::Requirement
68
+ required_ruby_version: !ruby/object:Gem::Requirement
57
69
  none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ hash: 2002549777813010636
74
+ segments:
75
+ - 0
76
+ version: "0"
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
78
  none: false
64
- requirements:
65
- - - ! '>='
66
- - !ruby/object:Gem::Version
67
- version: '0'
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 2002549777813010636
83
+ segments:
84
+ - 0
85
+ version: "0"
68
86
  requirements: []
87
+
69
88
  rubyforge_project:
70
89
  rubygems_version: 1.8.24
71
90
  signing_key:
72
91
  specification_version: 3
73
92
  summary: Gets status of Urls in SiteMap
74
- test_files:
75
- - spec/fixtures/sitemap_schema.xml
76
- - spec/fixtures/valid_sitemap.xml
77
- - spec/fixtures/valid_sitemap.xml.gz
93
+ test_files:
94
+ - spec/fixtures/siteindex.xml
95
+ - spec/fixtures/siteindex.xml.gz
96
+ - spec/fixtures/siteindex.xsd
97
+ - spec/fixtures/sitemap.xml
98
+ - spec/fixtures/sitemap.xml.gz
99
+ - spec/fixtures/sitemap.xsd
78
100
  - spec/sitemap_checker_spec.rb
79
101
  - spec/spec_helper.rb