sitemap_checker 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# Gems are resolved from the public RubyGems index.
source 'https://rubygems.org'

# XML parser used by the checker at runtime.
gem 'nokogiri'

# Load the remaining dependencies from the gemspec in this directory.
gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 GerlandoP
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,30 @@
1
+ # SitemapChecker
2
+
3
+ Fetches every URL listed in a sitemap and reports the HTTP status code each one returns.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'sitemap_checker'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install sitemap_checker
18
+
19
+ ## Usage
20
+
21
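+ Create a checker with the URL of a sitemap (plain or gzipped XML). The sitemap
+ is validated against the sitemaps.org schema and every listed URL is requested:
+
+     require 'sitemap_checker'
+
+     checker = SitemapChecker::Checker.new('http://www.example.com/sitemap.xml')
+     checker.status_list
+     # => [['http://www.example.com/', '200'], ['http://www.example.com/missing', '404']]
+
+ A sitemap that does not match the schema raises a `RuntimeError` with the
+ message `Invalid Schema`.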
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
30
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+ desc "Run all examples"
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
9
+ task :test => :spec
@@ -0,0 +1,52 @@
1
+ require "sitemap_checker/version"
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'zlib'
5
+
6
+ module SitemapChecker
7
+ class Checker
8
+ attr_reader :status_list
9
+
10
+ def initialize(url,schema='http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
11
+ @schema = schema
12
+ @url = url
13
+ @sitemap = get_xml_from_url
14
+ sitemap_is_valid?
15
+ @status_list = get_status_list
16
+ end
17
+
18
+ private
19
+
20
+ def get_xml_from_url
21
+ begin
22
+ Nokogiri::XML(Zlib::GzipReader.new(open(@url)))
23
+ rescue
24
+ Nokogiri::XML(open(@url))
25
+ end
26
+ end
27
+
28
+ def sitemap_is_valid?
29
+ xsd = Nokogiri::XML::Schema(open(@schema))
30
+ raise 'Invalid Schema' unless xsd.valid?(@sitemap)
31
+ true
32
+ end
33
+
34
+ def urls
35
+ @sitemap.xpath("//xmlns:loc")
36
+ end
37
+
38
+ def get_status_list
39
+ statuses = []
40
+ urls.each do |url|
41
+ begin
42
+ status = [url.content,open(url).status[0]]
43
+ rescue OpenURI::HTTPError => e
44
+ status = [url.content,e.io.status[0]]
45
+ end
46
+ statuses << status
47
+ end
48
+ statuses
49
+ end
50
+ end
51
+
52
+ end
@@ -0,0 +1,3 @@
1
module SitemapChecker
  # Release number of the gem. Frozen so it cannot be mutated at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/sitemap_checker/version', __FILE__)
3
+
4
# Package definition for the sitemap_checker gem.
Gem::Specification.new do |gem|
  gem.name          = "sitemap_checker"
  gem.version       = SitemapChecker::VERSION
  gem.authors       = ["Gerlando Piro"]
  gem.email         = ["gerlando@gmail.com"]
  gem.description   = %q{SiteMap Checker}
  gem.summary       = %q{Gets status of Urls in SiteMap}
  gem.homepage      = "https://github.com/gerlandop/sitemap_checker"
  # Matches the LICENSE file shipped with the gem.
  gem.license       = "MIT"

  # NUL-separated output keeps file names that contain whitespace intact.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency 'nokogiri'

  # Needed by the Rakefile and the spec suite.
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'webmock'
end
@@ -0,0 +1,115 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
3
+ targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
4
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
5
+ elementFormDefault="qualified">
6
+ <xsd:annotation>
7
+ <xsd:documentation>
8
+ XML Schema for Sitemap files.
9
+ Last Modified 2008-03-26
10
+ </xsd:documentation>
11
+ </xsd:annotation>
12
+
13
+ <xsd:element name="urlset">
14
+ <xsd:annotation>
15
+ <xsd:documentation>
16
+ Container for a set of up to 50,000 document elements.
17
+ This is the root element of the XML file.
18
+ </xsd:documentation>
19
+ </xsd:annotation>
20
+ <xsd:complexType>
21
+ <xsd:sequence>
22
+ <xsd:element name="url" type="tUrl" maxOccurs="unbounded"/>
23
+ </xsd:sequence>
24
+ </xsd:complexType>
25
+ </xsd:element>
26
+
27
+ <xsd:complexType name="tUrl">
28
+ <xsd:annotation>
29
+ <xsd:documentation>
30
+ Container for the data needed to describe a document to crawl.
31
+ </xsd:documentation>
32
+ </xsd:annotation>
33
+ <xsd:sequence>
34
+ <xsd:element name="loc" type="tLoc"/>
35
+ <xsd:element name="lastmod" type="tLastmod" minOccurs="0"/>
36
+ <xsd:element name="changefreq" type="tChangeFreq" minOccurs="0"/>
37
+ <xsd:element name="priority" type="tPriority" minOccurs="0"/>
38
+ <xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
39
+ </xsd:sequence>
40
+ </xsd:complexType>
41
+
42
+ <xsd:simpleType name="tLoc">
43
+ <xsd:annotation>
44
+ <xsd:documentation>
45
+ REQUIRED: The location URI of a document.
46
+ The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
47
+ </xsd:documentation>
48
+ </xsd:annotation>
49
+ <xsd:restriction base="xsd:anyURI">
50
+ <xsd:minLength value="12"/>
51
+ <xsd:maxLength value="2048"/>
52
+ </xsd:restriction>
53
+ </xsd:simpleType>
54
+
55
+ <xsd:simpleType name="tLastmod">
56
+ <xsd:annotation>
57
+ <xsd:documentation>
58
+ OPTIONAL: The date the document was last modified. The date must conform
59
+ to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
60
+ Example: 2005-05-10
61
+ Lastmod may also contain a timestamp.
62
+ Example: 2005-05-10T17:33:30+08:00
63
+ </xsd:documentation>
64
+ </xsd:annotation>
65
+ <xsd:union>
66
+ <xsd:simpleType>
67
+ <xsd:restriction base="xsd:date"/>
68
+ </xsd:simpleType>
69
+ <xsd:simpleType>
70
+ <xsd:restriction base="xsd:dateTime"/>
71
+ </xsd:simpleType>
72
+ </xsd:union>
73
+ </xsd:simpleType>
74
+
75
+ <xsd:simpleType name="tChangeFreq">
76
+ <xsd:annotation>
77
+ <xsd:documentation>
78
+ OPTIONAL: Indicates how frequently the content at a particular URL is
79
+ likely to change. The value "always" should be used to describe
80
+ documents that change each time they are accessed. The value "never"
81
+ should be used to describe archived URLs. Please note that web
82
+ crawlers may not necessarily crawl pages marked "always" more often.
83
+ Consider this element as a friendly suggestion and not a command.
84
+ </xsd:documentation>
85
+ </xsd:annotation>
86
+ <xsd:restriction base="xsd:string">
87
+ <xsd:enumeration value="always"/>
88
+ <xsd:enumeration value="hourly"/>
89
+ <xsd:enumeration value="daily"/>
90
+ <xsd:enumeration value="weekly"/>
91
+ <xsd:enumeration value="monthly"/>
92
+ <xsd:enumeration value="yearly"/>
93
+ <xsd:enumeration value="never"/>
94
+ </xsd:restriction>
95
+ </xsd:simpleType>
96
+
97
+ <xsd:simpleType name="tPriority">
98
+ <xsd:annotation>
99
+ <xsd:documentation>
100
+ OPTIONAL: The priority of a particular URL relative to other pages
101
+ on the same site. The value for this element is a number between
102
+ 0.0 and 1.0 where 0.0 identifies the lowest priority page(s).
103
+ The default priority of a page is 0.5. Priority is used to select
104
+ between pages on your site. Setting a priority of 1.0 for all URLs
105
+ will not help you, as the relative priority of pages on your site
106
+ is what will be considered.
107
+ </xsd:documentation>
108
+ </xsd:annotation>
109
+ <xsd:restriction base="xsd:decimal">
110
+ <xsd:minInclusive value="0.0"/>
111
+ <xsd:maxInclusive value="1.0"/>
112
+ </xsd:restriction>
113
+ </xsd:simpleType>
114
+
115
+ </xsd:schema>
@@ -0,0 +1,15 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" xmlns:geo="http://www.google.com/geo/schemas/sitemap/1.0" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9/">
3
+ <url>
4
+ <loc>http://www.github.com</loc>
5
+ <lastmod>2012-07-27T15:27:41-07:00</lastmod>
6
+ <changefreq>always</changefreq>
7
+ <priority>1.0</priority>
8
+ </url>
9
+ <url>
10
+ <loc>http://www.github.com/404</loc>
11
+ <lastmod>2012-07-27T15:27:41-07:00</lastmod>
12
+ <changefreq>always</changefreq>
13
+ <priority>1.0</priority>
14
+ </url>
15
+ </urlset>
Binary file
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'webmock/rspec'
3
+ require './lib/sitemap_checker'
4
+ WebMock.disable_net_connect!(:allow_localhost => true)
5
+
6
# Exercises SitemapChecker::Checker against WebMock-stubbed HTTP responses;
# no request leaves the machine.
describe SitemapChecker do
  # Resolved from this file's location so the suite does not depend on the
  # working directory or on Pathname being loaded by another library.
  fixtures = File.expand_path('../fixtures', __FILE__)

  # Reads a fixture in binary mode so the gzip fixture is not altered by
  # newline or encoding conversion.
  read_fixture = lambda do |name|
    File.open(File.join(fixtures, name), 'rb') { |file| file.read }
  end

  before(:each) do
    stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
    stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
    stub_request(:any, "http://www.github.com/index.xml").
      to_return(:status => 200, :body => read_fixture.call('valid_sitemap.xml'))
    stub_request(:any, "http://www.github.com/index.xml.gz").
      to_return(:status => 200, :body => read_fixture.call('valid_sitemap.xml.gz'))
    stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").
      with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
      to_return(:status => 200, :body => read_fixture.call('sitemap_schema.xml'), :headers => {})
  end

  it "accepts xml and gzipped files" do
    xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
    gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml.gz')
    xml_sitemap.status_list.size.should eq(2)
    gz_sitemap.status_list.size.should eq(2)
  end

  it "Errors if input doc does not match sitemap schema" do
    # The stubbed body 'foo' is not a sitemap, so validation must fail.
    expect { SitemapChecker::Checker.new('http://www.github.com') }.
      to raise_error(RuntimeError, 'Invalid Schema')
  end

  it "returns list of urls with responses" do
    valid_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
    valid_sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
  end
end
@@ -0,0 +1,38 @@
1
# Shared RSpec setup for the sitemap_checker gem.
#
# The gem is a plain Ruby library: it ships no Rails application and does not
# depend on rspec-rails, so this helper loads only the library itself and the
# HTTP stubbing support used by the specs.
require 'rubygems'
require 'webmock/rspec'
require File.expand_path('../../lib/sitemap_checker', __FILE__)

# Specs must never reach the real network; requests are stubbed per example.
WebMock.disable_net_connect!(:allow_localhost => true)

RSpec.configure do |config|
  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = "random"
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemap_checker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gerlando Piro
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-29 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: SiteMap Checker
31
+ email:
32
+ - gerlando@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE
40
+ - README.md
41
+ - Rakefile
42
+ - lib/sitemap_checker.rb
43
+ - lib/sitemap_checker/version.rb
44
+ - sitemap_checker.gemspec
45
+ - spec/fixtures/sitemap_schema.xml
46
+ - spec/fixtures/valid_sitemap.xml
47
+ - spec/fixtures/valid_sitemap.xml.gz
48
+ - spec/sitemap_checker_spec.rb
49
+ - spec/spec_helper.rb
50
+ homepage: https://github.com/gerlandop/sitemap_checker
51
+ licenses: []
52
+ post_install_message:
53
+ rdoc_options: []
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ! '>='
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubyforge_project:
70
+ rubygems_version: 1.8.24
71
+ signing_key:
72
+ specification_version: 3
73
+ summary: Gets status of Urls in SiteMap
74
+ test_files:
75
+ - spec/fixtures/sitemap_schema.xml
76
+ - spec/fixtures/valid_sitemap.xml
77
+ - spec/fixtures/valid_sitemap.xml.gz
78
+ - spec/sitemap_checker_spec.rb
79
+ - spec/spec_helper.rb