sitemap_checker 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'nokogiri'
4
+
5
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 GerlandoP
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,30 @@
1
+ # SitemapChecker
2
+
3
+ Fetches a sitemap, validates it against the sitemap schema, and reports the HTTP response status of every URL it lists.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'sitemap_checker'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install sitemap_checker
18
+
19
+ ## Usage
20
+
21
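+ Create a checker with `SitemapChecker::Checker.new(sitemap_url)` (plain or gzipped XML) and read `status_list`, an array of `[url, status_code]` pairs such as `['http://www.github.com', '200']`.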
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
30
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+ desc "Run all examples"
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
9
+ task :test => :spec
@@ -0,0 +1,52 @@
1
+ require "sitemap_checker/version"
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'zlib'
5
+
6
+ module SitemapChecker
7
+ class Checker
8
+ attr_reader :status_list
9
+
10
+ def initialize(url,schema='http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
11
+ @schema = schema
12
+ @url = url
13
+ @sitemap = get_xml_from_url
14
+ sitemap_is_valid?
15
+ @status_list = get_status_list
16
+ end
17
+
18
+ private
19
+
20
+ def get_xml_from_url
21
+ begin
22
+ Nokogiri::XML(Zlib::GzipReader.new(open(@url)))
23
+ rescue
24
+ Nokogiri::XML(open(@url))
25
+ end
26
+ end
27
+
28
+ def sitemap_is_valid?
29
+ xsd = Nokogiri::XML::Schema(open(@schema))
30
+ raise 'Invalid Schema' unless xsd.valid?(@sitemap)
31
+ true
32
+ end
33
+
34
+ def urls
35
+ @sitemap.xpath("//xmlns:loc")
36
+ end
37
+
38
+ def get_status_list
39
+ statuses = []
40
+ urls.each do |url|
41
+ begin
42
+ status = [url.content,open(url).status[0]]
43
+ rescue OpenURI::HTTPError => e
44
+ status = [url.content,e.io.status[0]]
45
+ end
46
+ statuses << status
47
+ end
48
+ statuses
49
+ end
50
+ end
51
+
52
+ end
@@ -0,0 +1,3 @@
1
+ module SitemapChecker
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/sitemap_checker/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["Gerlando Piro"]
6
+ gem.email = ["gerlando@gmail.com"]
7
+ gem.description = %q{SiteMap Checker}
8
+ gem.summary = %q{Gets status of Urls in SiteMap}
9
+ gem.homepage = "https://github.com/gerlandop/sitemap_checker"
10
+
11
+ gem.files = `git ls-files`.split($\)
12
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
+ gem.name = "sitemap_checker"
15
+ gem.require_paths = ["lib"]
16
+ gem.version = SitemapChecker::VERSION
17
+
18
+ gem.add_dependency 'nokogiri'
19
+ end
@@ -0,0 +1,115 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
3
+ targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
4
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
5
+ elementFormDefault="qualified">
6
+ <xsd:annotation>
7
+ <xsd:documentation>
8
+ XML Schema for Sitemap files.
9
+ Last Modified 2008-03-26
10
+ </xsd:documentation>
11
+ </xsd:annotation>
12
+
13
+ <xsd:element name="urlset">
14
+ <xsd:annotation>
15
+ <xsd:documentation>
16
+ Container for a set of up to 50,000 document elements.
17
+ This is the root element of the XML file.
18
+ </xsd:documentation>
19
+ </xsd:annotation>
20
+ <xsd:complexType>
21
+ <xsd:sequence>
22
+ <xsd:element name="url" type="tUrl" maxOccurs="unbounded"/>
23
+ </xsd:sequence>
24
+ </xsd:complexType>
25
+ </xsd:element>
26
+
27
+ <xsd:complexType name="tUrl">
28
+ <xsd:annotation>
29
+ <xsd:documentation>
30
+ Container for the data needed to describe a document to crawl.
31
+ </xsd:documentation>
32
+ </xsd:annotation>
33
+ <xsd:sequence>
34
+ <xsd:element name="loc" type="tLoc"/>
35
+ <xsd:element name="lastmod" type="tLastmod" minOccurs="0"/>
36
+ <xsd:element name="changefreq" type="tChangeFreq" minOccurs="0"/>
37
+ <xsd:element name="priority" type="tPriority" minOccurs="0"/>
38
+ <xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
39
+ </xsd:sequence>
40
+ </xsd:complexType>
41
+
42
+ <xsd:simpleType name="tLoc">
43
+ <xsd:annotation>
44
+ <xsd:documentation>
45
+ REQUIRED: The location URI of a document.
46
+ The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
47
+ </xsd:documentation>
48
+ </xsd:annotation>
49
+ <xsd:restriction base="xsd:anyURI">
50
+ <xsd:minLength value="12"/>
51
+ <xsd:maxLength value="2048"/>
52
+ </xsd:restriction>
53
+ </xsd:simpleType>
54
+
55
+ <xsd:simpleType name="tLastmod">
56
+ <xsd:annotation>
57
+ <xsd:documentation>
58
+ OPTIONAL: The date the document was last modified. The date must conform
59
+ to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
60
+ Example: 2005-05-10
61
+ Lastmod may also contain a timestamp.
62
+ Example: 2005-05-10T17:33:30+08:00
63
+ </xsd:documentation>
64
+ </xsd:annotation>
65
+ <xsd:union>
66
+ <xsd:simpleType>
67
+ <xsd:restriction base="xsd:date"/>
68
+ </xsd:simpleType>
69
+ <xsd:simpleType>
70
+ <xsd:restriction base="xsd:dateTime"/>
71
+ </xsd:simpleType>
72
+ </xsd:union>
73
+ </xsd:simpleType>
74
+
75
+ <xsd:simpleType name="tChangeFreq">
76
+ <xsd:annotation>
77
+ <xsd:documentation>
78
+ OPTIONAL: Indicates how frequently the content at a particular URL is
79
+ likely to change. The value "always" should be used to describe
80
+ documents that change each time they are accessed. The value "never"
81
+ should be used to describe archived URLs. Please note that web
82
+ crawlers may not necessarily crawl pages marked "always" more often.
83
+ Consider this element as a friendly suggestion and not a command.
84
+ </xsd:documentation>
85
+ </xsd:annotation>
86
+ <xsd:restriction base="xsd:string">
87
+ <xsd:enumeration value="always"/>
88
+ <xsd:enumeration value="hourly"/>
89
+ <xsd:enumeration value="daily"/>
90
+ <xsd:enumeration value="weekly"/>
91
+ <xsd:enumeration value="monthly"/>
92
+ <xsd:enumeration value="yearly"/>
93
+ <xsd:enumeration value="never"/>
94
+ </xsd:restriction>
95
+ </xsd:simpleType>
96
+
97
+ <xsd:simpleType name="tPriority">
98
+ <xsd:annotation>
99
+ <xsd:documentation>
100
+ OPTIONAL: The priority of a particular URL relative to other pages
101
+ on the same site. The value for this element is a number between
102
+ 0.0 and 1.0 where 0.0 identifies the lowest priority page(s).
103
+ The default priority of a page is 0.5. Priority is used to select
104
+ between pages on your site. Setting a priority of 1.0 for all URLs
105
+ will not help you, as the relative priority of pages on your site
106
+ is what will be considered.
107
+ </xsd:documentation>
108
+ </xsd:annotation>
109
+ <xsd:restriction base="xsd:decimal">
110
+ <xsd:minInclusive value="0.0"/>
111
+ <xsd:maxInclusive value="1.0"/>
112
+ </xsd:restriction>
113
+ </xsd:simpleType>
114
+
115
+ </xsd:schema>
@@ -0,0 +1,15 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" xmlns:geo="http://www.google.com/geo/schemas/sitemap/1.0" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9/">
3
+ <url>
4
+ <loc>http://www.github.com</loc>
5
+ <lastmod>2012-07-27T15:27:41-07:00</lastmod>
6
+ <changefreq>always</changefreq>
7
+ <priority>1.0</priority>
8
+ </url>
9
+ <url>
10
+ <loc>http://www.github.com/404</loc>
11
+ <lastmod>2012-07-27T15:27:41-07:00</lastmod>
12
+ <changefreq>always</changefreq>
13
+ <priority>1.0</priority>
14
+ </url>
15
+ </urlset>
Binary file
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'webmock/rspec'
3
+ require './lib/sitemap_checker'
4
+ WebMock.disable_net_connect!(:allow_localhost => true)
5
+
6
+ describe SitemapChecker do
7
+ before(:each) do
8
+ @dir = Pathname.new(File.dirname(__FILE__))
9
+ stub_request(:any, "http://www.github.com").to_return(:status => 200, :body => 'foo')
10
+ stub_request(:any, "http://www.github.com/404").to_return(:status => 404, :body => 'foo')
11
+ stub_request(:any, "http://www.github.com/index.xml").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml'))
12
+ stub_request(:any, "http://www.github.com/index.xml.gz").to_return(:status => 200, :body => File.read(@dir + 'fixtures/valid_sitemap.xml.gz'))
13
+ stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd").
14
+ with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).
15
+ to_return(:status => 200, :body => File.read(@dir + 'fixtures/sitemap_schema.xml'), :headers => {})
16
+ end
17
+
18
+ it "accepts xml and gzipped files" do
19
+ @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
20
+ @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml.gz')
21
+ @xml_sitemap.status_list.size.should eq(2)
22
+ @gz_sitemap.status_list.size.should eq(2)
23
+ end
24
+
25
+ it "Errors if input doc does not match sitemap schema" do
26
+ lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
27
+ end
28
+
29
+ it "returns list of urls with responses" do
30
+ @valid_sitemap = SitemapChecker::Checker.new('http://www.github.com/index.xml')
31
+ @valid_sitemap.status_list.should eq([['http://www.github.com','200'], ['http://www.github.com/404','404']])
32
+ end
33
+
34
+ end
@@ -0,0 +1,38 @@
1
+ # This file is copied to spec/ when you run 'rails generate rspec:install'
2
+ ENV["RAILS_ENV"] ||= 'test'
3
+ require File.expand_path("../../config/environment", __FILE__)
4
+ require 'rspec/rails'
5
+ require 'rspec/autorun'
6
+
7
+ # Requires supporting ruby files with custom matchers and macros, etc,
8
+ # in spec/support/ and its subdirectories.
9
+ Dir[Rails.root.join("spec/support/**/*.rb")].each {|f| require f}
10
+
11
+ RSpec.configure do |config|
12
+ # ## Mock Framework
13
+ #
14
+ # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
15
+ #
16
+ # config.mock_with :mocha
17
+ # config.mock_with :flexmock
18
+ # config.mock_with :rr
19
+
20
+ # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures
21
+ config.fixture_path = "#{::Rails.root}/spec/fixtures"
22
+
23
+ # If you're not using ActiveRecord, or you'd prefer not to run each of your
24
+ # examples within a transaction, remove the following line or assign false
25
+ # instead of true.
26
+ config.use_transactional_fixtures = true
27
+
28
+ # If true, the base class of anonymous controllers will be inferred
29
+ # automatically. This will be the default behavior in future versions of
30
+ # rspec-rails.
31
+ config.infer_base_class_for_anonymous_controllers = false
32
+
33
+ # Run specs in random order to surface order dependencies. If you find an
34
+ # order dependency and want to debug it, you can fix the order by providing
35
+ # the seed, which is printed after each run.
36
+ # --seed 1234
37
+ config.order = "random"
38
+ end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemap_checker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gerlando Piro
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-29 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: SiteMap Checker
31
+ email:
32
+ - gerlando@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE
40
+ - README.md
41
+ - Rakefile
42
+ - lib/sitemap_checker.rb
43
+ - lib/sitemap_checker/version.rb
44
+ - sitemap_checker.gemspec
45
+ - spec/fixtures/sitemap_schema.xml
46
+ - spec/fixtures/valid_sitemap.xml
47
+ - spec/fixtures/valid_sitemap.xml.gz
48
+ - spec/sitemap_checker_spec.rb
49
+ - spec/spec_helper.rb
50
+ homepage: https://github.com/gerlandop/sitemap_checker
51
+ licenses: []
52
+ post_install_message:
53
+ rdoc_options: []
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ! '>='
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ requirements: []
69
+ rubyforge_project:
70
+ rubygems_version: 1.8.24
71
+ signing_key:
72
+ specification_version: 3
73
+ summary: Gets status of Urls in SiteMap
74
+ test_files:
75
+ - spec/fixtures/sitemap_schema.xml
76
+ - spec/fixtures/valid_sitemap.xml
77
+ - spec/fixtures/valid_sitemap.xml.gz
78
+ - spec/sitemap_checker_spec.rb
79
+ - spec/spec_helper.rb