sitemap_checker 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # SitemapChecker
2
2
 
3
- Checks sitemap urls for valid response
3
+ Takes a URL pointing to an XML or gzipped (xml.gz) sitemap or siteindex file and returns an array of status messages for the URLs contained within.
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,8 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ SitemapChecker::Checker.new(url)
22
+
22
23
 
23
24
  ## Contributing
24
25
 
@@ -7,35 +7,44 @@ module SitemapChecker
7
7
  class Checker
8
8
  attr_reader :status_list
9
9
 
10
- def initialize(url,schema='http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
11
- @schema = schema
10
+ def initialize(url,schema='')
12
11
  @url = url
13
- @sitemap = get_xml_from_url
14
- sitemap_is_valid?
15
- @status_list = get_status_list
12
+ @status_list = Array.new
13
+ process_xml
16
14
  end
17
15
 
18
16
  private
19
17
 
20
- def get_xml_from_url
18
+ def get_xml_from_url(url)
21
19
  begin
22
- Nokogiri::XML(Zlib::GzipReader.new(open(@url)))
20
+ Nokogiri::XML(Zlib::GzipReader.new(open(url)))
23
21
  rescue
24
- Nokogiri::XML(open(@url))
22
+ Nokogiri::XML(open(url))
25
23
  end
26
24
  end
27
25
 
28
- def sitemap_is_valid?
29
- xsd = Nokogiri::XML::Schema(open(@schema))
30
- raise 'Invalid Schema' unless xsd.valid?(@sitemap)
31
- true
26
+ def process_xml
27
+ mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
28
+ ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
29
+ xml = get_xml_from_url(@url)
30
+ if mxsd.valid?(xml)
31
+ @status_list = get_status_list(urls(xml))
32
+ elsif ixsd.valid?(xml)
33
+ maps = urls(xml)
34
+ maps.each do |map|
35
+ xml = get_xml_from_url(map)
36
+ @status_list += get_status_list(urls(xml))
37
+ end
38
+ else raise 'Invalid Schema'
39
+ false
40
+ end
32
41
  end
33
42
 
34
- def urls
35
- @sitemap.xpath("//xmlns:loc")
43
+ def urls(xml)
44
+ xml.xpath("//xmlns:loc")
36
45
  end
37
46
 
38
- def get_status_list
47
+ def get_status_list(urls)
39
48
  statuses = []
40
49
  urls.each do |url|
41
50
  begin
@@ -1,3 +1,3 @@
1
1
  module SitemapChecker
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <sitemapindex xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3
+ <sitemap>
4
+ <loc>http://www.github.com/sitemap.xml</loc>
5
+ <lastmod>2012-07-30T17:14:40-07:00</lastmod>
6
+ </sitemap>
7
+ <sitemap>
8
+ <loc>http://www.github.com/sitemap.xml.gz</loc>
9
+ <lastmod>2012-07-30T17:14:40-07:00</lastmod>
10
+ </sitemap>
11
+ </sitemapindex>
Binary file
@@ -0,0 +1,73 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
3
+ targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
4
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
5
+ elementFormDefault="qualified">
6
+ <xsd:annotation>
7
+ <xsd:documentation>
8
+ XML Schema for Sitemap index files.
9
+ Last Modified 2009-04-08
10
+ </xsd:documentation>
11
+ </xsd:annotation>
12
+
13
+ <xsd:element name="sitemapindex">
14
+ <xsd:annotation>
15
+ <xsd:documentation>
16
+ Container for a set of up to 50,000 sitemap URLs.
17
+ This is the root element of the XML file.
18
+ </xsd:documentation>
19
+ </xsd:annotation>
20
+ <xsd:complexType>
21
+ <xsd:sequence>
22
+ <xsd:element name="sitemap" type="tSitemap" maxOccurs="unbounded"/>
23
+ </xsd:sequence>
24
+ </xsd:complexType>
25
+ </xsd:element>
26
+
27
+ <xsd:complexType name="tSitemap">
28
+ <xsd:annotation>
29
+ <xsd:documentation>
30
+ Container for the data needed to describe a sitemap.
31
+ </xsd:documentation>
32
+ </xsd:annotation>
33
+ <xsd:all>
34
+ <xsd:element name="loc" type="tLocSitemap"/>
35
+ <xsd:element name="lastmod" type="tLastmodSitemap" minOccurs="0"/>
36
+ </xsd:all>
37
+ </xsd:complexType>
38
+
39
+ <xsd:simpleType name="tLocSitemap">
40
+ <xsd:annotation>
41
+ <xsd:documentation>
42
+ REQUIRED: The location URI of a sitemap.
43
+ The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
44
+ </xsd:documentation>
45
+ </xsd:annotation>
46
+ <xsd:restriction base="xsd:anyURI">
47
+ <xsd:minLength value="12"/>
48
+ <xsd:maxLength value="2048"/>
49
+ </xsd:restriction>
50
+ </xsd:simpleType>
51
+
52
+ <xsd:simpleType name="tLastmodSitemap">
53
+ <xsd:annotation>
54
+ <xsd:documentation>
55
+ OPTIONAL: The date the document was last modified. The date must conform
56
+ to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
57
+ Example: 2005-05-10
58
+ Lastmod may also contain a timestamp.
59
+ Example: 2005-05-10T17:33:30+08:00
60
+ </xsd:documentation>
61
+ </xsd:annotation>
62
+ <xsd:union>
63
+ <xsd:simpleType>
64
+ <xsd:restriction base="xsd:date"/>
65
+ </xsd:simpleType>
66
+ <xsd:simpleType>
67
+ <xsd:restriction base="xsd:dateTime"/>
68
+ </xsd:simpleType>
69
+ </xsd:union>
70
+ </xsd:simpleType>
71
+
72
+
73
+ </xsd:schema>
File without changes
File without changes
@@ -8,16 +8,24 @@ describe SitemapChecker do
8
8
  @dir = Pathname.new(File.dirname(__FILE__))
9
9
  stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
10
10
  stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
11
- stub_request(:any, "http://www.github.com/index.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml'))
12
- stub_request(:any, "http://www.github.com/index.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml.gz'))
13
- stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").
14
- with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
15
- to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap_schema.xml'), :headers => {})
11
+ stub_request(:any, "http://www.github.com/sitemap.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml'))
12
+ stub_request(:any, "http://www.github.com/sitemap.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xml.gz'))
13
+ stub_request(:any, "http://www.github.com/siteindex.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml'))
14
+ stub_request(:any, "http://www.github.com/siteindex.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xml.gz'))
15
+ stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap.xsd'), :headers => {})
16
+ stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
16
17
  end
17
18
 
18
- it "accepts xml and gzipped files" do
19
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
20
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml.gz')
19
+ it "accepts xml and gzipped siteindexes" do
20
+ @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
21
+ @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
22
+ @xml_sitemap.status_list.size.should eq(4)
23
+ @gz_sitemap.status_list.size.should eq(4)
24
+ end
25
+
26
+ it "accepts xml and gzipped sitemaps" do
27
+ @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
28
+ @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
21
29
  @xml_sitemap.status_list.size.should eq(2)
22
30
  @gz_sitemap.status_list.size.should eq(2)
23
31
  end
@@ -26,9 +34,14 @@ describe SitemapChecker do
26
34
  lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
27
35
  end
28
36
 
29
- it "returns list of urls with responses" do
30
- @valid_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
31
- @valid_sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
37
+ it "returns list of urls with responses from sitemap" do
38
+ @sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
39
+ @sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
40
+ end
41
+
42
+ it "returns list of urls with responses from siteindex" do
43
+ @siteindex = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
44
+ @siteindex.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404'], ['http://www.github.com','200'], ['http://www.github.com/404','404']])
32
45
  end
33
46
 
34
47
  end
metadata CHANGED
@@ -1,39 +1,46 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: sitemap_checker
3
- version: !ruby/object:Gem::Version
4
- version: 0.0.1
3
+ version: !ruby/object:Gem::Version
4
+ hash: 1317335842608397244
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 2
10
+ version: 0.0.2
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Gerlando Piro
9
14
  autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
- date: 2012-07-29 00:00:00.000000000 Z
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
17
+
18
+ date: 2012-07-31 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
15
21
  name: nokogiri
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ! '>='
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :runtime
23
22
  prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
23
+ requirement: &id001 !ruby/object:Gem::Requirement
25
24
  none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 2002549777813010636
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ version_requirements: *id001
30
34
  description: SiteMap Checker
31
- email:
35
+ email:
32
36
  - gerlando@gmail.com
33
37
  executables: []
38
+
34
39
  extensions: []
40
+
35
41
  extra_rdoc_files: []
36
- files:
42
+
43
+ files:
37
44
  - .gitignore
38
45
  - Gemfile
39
46
  - LICENSE
@@ -42,38 +49,53 @@ files:
42
49
  - lib/sitemap_checker.rb
43
50
  - lib/sitemap_checker/version.rb
44
51
  - sitemap_checker.gemspec
45
- - spec/fixtures/sitemap_schema.xml
46
- - spec/fixtures/valid_sitemap.xml
47
- - spec/fixtures/valid_sitemap.xml.gz
52
+ - spec/fixtures/siteindex.xml
53
+ - spec/fixtures/siteindex.xml.gz
54
+ - spec/fixtures/siteindex.xsd
55
+ - spec/fixtures/sitemap.xml
56
+ - spec/fixtures/sitemap.xml.gz
57
+ - spec/fixtures/sitemap.xsd
48
58
  - spec/sitemap_checker_spec.rb
49
59
  - spec/spec_helper.rb
50
60
  homepage: https://github.com/gerlandop/sitemap_checker
51
61
  licenses: []
62
+
52
63
  post_install_message:
53
64
  rdoc_options: []
54
- require_paths:
65
+
66
+ require_paths:
55
67
  - lib
56
- required_ruby_version: !ruby/object:Gem::Requirement
68
+ required_ruby_version: !ruby/object:Gem::Requirement
57
69
  none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ hash: 2002549777813010636
74
+ segments:
75
+ - 0
76
+ version: "0"
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
78
  none: false
64
- requirements:
65
- - - ! '>='
66
- - !ruby/object:Gem::Version
67
- version: '0'
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 2002549777813010636
83
+ segments:
84
+ - 0
85
+ version: "0"
68
86
  requirements: []
87
+
69
88
  rubyforge_project:
70
89
  rubygems_version: 1.8.24
71
90
  signing_key:
72
91
  specification_version: 3
73
92
  summary: Gets status of Urls in SiteMap
74
- test_files:
75
- - spec/fixtures/sitemap_schema.xml
76
- - spec/fixtures/valid_sitemap.xml
77
- - spec/fixtures/valid_sitemap.xml.gz
93
+ test_files:
94
+ - spec/fixtures/siteindex.xml
95
+ - spec/fixtures/siteindex.xml.gz
96
+ - spec/fixtures/siteindex.xsd
97
+ - spec/fixtures/sitemap.xml
98
+ - spec/fixtures/sitemap.xml.gz
99
+ - spec/fixtures/sitemap.xsd
78
100
  - spec/sitemap_checker_spec.rb
79
101
  - spec/spec_helper.rb