sitemap_checker 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +10 -4
- data/lib/sitemap_checker/path.rb +22 -0
- data/lib/sitemap_checker/sitemap.rb +62 -0
- data/lib/sitemap_checker/version.rb +1 -1
- data/lib/sitemap_checker.rb +3 -67
- data/spec/sitemap_checker_spec.rb +23 -19
- metadata +4 -2
data/README.md
CHANGED
@@ -18,13 +18,19 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
Get list of urls from xml or xml.gz sitemap url
|
21
|
+
Get the list of URLs (locs) from an XML or gzipped (xml.gz) sitemap URL.
|
22
22
|
|
23
|
-
SitemapChecker::
|
23
|
+
@sitemap = SitemapChecker::Sitemap.new(url)
|
24
|
+
@sitemap.locs.size
|
24
25
|
|
25
|
-
Get status of url
|
26
|
+
Get the status of a URL from a Sitemap
|
26
27
|
|
27
|
-
SitemapChecker::
|
28
|
+
@sitemap = SitemapChecker::Sitemap.new(url)
|
29
|
+
@sitemap.locs.first.status
|
30
|
+
|
31
|
+
or directly as a Path
|
32
|
+
|
33
|
+
SitemapChecker::Path.new(url).status
|
28
34
|
|
29
35
|
## Contributing
|
30
36
|
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module SitemapChecker
  # Wraps a single URL and lazily looks up the HTTP status code it answers with.
  #
  #   SitemapChecker::Path.new('http://example.com/').status # => "200"
  #
  # The URL may be a String or any object that converts to one (Sitemap hands
  # in the parsed <loc> node itself); it is stored untouched in +url+ and only
  # converted to a String when the request is made.
  class Path
    attr_accessor :url
    # Only the writer is generated: the reader below performs the lazy lookup.
    attr_writer :status

    # @param url [String, #to_str, #to_s] location whose status will be checked
    def initialize(url)
      @url = url
      @status = nil
    end

    # Looks up the status of the URL held in +url.content+ (e.g. a parsed
    # <loc> node). The receiver's own +url+ and cached +status+ are left alone.
    #
    # @param url [#content] object whose +content+ is the URL to check
    # @return [String] HTTP status code such as "200" or "404"
    def get_status_from_xml(url)
      self.class.new(url.content).status
    end

    # HTTP status code of +url+, fetched on first use and cached afterwards.
    # Error responses (4xx/5xx) are cached as well, so repeated calls never
    # re-issue the request.
    #
    # @return [String] HTTP status code such as "200" or "404"
    # @raise [ArgumentError] when +url+ has no scheme open-uri can fetch
    # @raise [URI::InvalidURIError] when +url+ cannot be parsed as a URI
    def status
      @status ||= fetch_status
    end

    private

    # Performs the request and returns the status code string.
    def fetch_status
      location = String(@url).strip
      uri = URI.parse(location)
      # Going through the parsed URI (instead of Kernel#open) works on every
      # Ruby version and can never run a "| command" string as a subprocess.
      unless uri.respond_to?(:open)
        raise ArgumentError, "not a fetchable URL: #{location.inspect}"
      end
      # Block form closes the response stream as soon as the status is read.
      uri.open { |response| response.status[0] }
    rescue OpenURI::HTTPError => e
      # open-uri raises on 4xx/5xx; the status still travels with the error.
      e.io.status[0]
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module SitemapChecker
  # Loads a sitemap or a sitemap index (plain or gzipped XML) and exposes every
  # <loc> entry it lists as a SitemapChecker::Path in +locs+.
  #
  #   sitemap = SitemapChecker::Sitemap.new('http://example.com/sitemap.xml.gz')
  #   sitemap.locs.size
  #
  # A sitemap index is expanded: each child sitemap is fetched and its entries
  # are concatenated. Documents that match neither sitemaps.org schema raise
  # RuntimeError 'Invalid Schema'.
  class Sitemap
    SITEINDEX_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'
    SITEMAP_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'
    # First two bytes of every gzip stream.
    GZIP_MAGIC = [0x1f, 0x8b].freeze

    attr_accessor :locs, :map

    # @param map [String, #to_str] URL (or local path) of a sitemap or siteindex
    # @raise [RuntimeError] 'Invalid Schema' when the document is neither
    def initialize(map)
      @map = map
      @locs = process_map
    end

    private

    # Fetches +@map+ and dispatches on whether it is an index or a plain sitemap.
    def process_map
      xml = get_xml_from_map(@map)
      is_siteindex?(xml) ? process_siteindex(xml) : process_sitemap(xml)
    end

    # Fetches +map+ once and parses it, gunzipping first when the payload
    # starts with the gzip magic bytes. All opened streams are closed.
    def get_xml_from_map(map)
      io = open_map(map)
      source = gzipped?(io) ? Zlib::GzipReader.new(io) : io
      Nokogiri::XML(source)
    ensure
      # Closing the GzipReader also closes the stream it wraps.
      source.close if source && !source.closed?
      io.close if io && !io.closed?
    end

    # Opens +map+ for reading. Fetchable URLs go through open-uri via the
    # parsed URI; anything else is treated as a local file path. Kernel#open is
    # avoided because it stopped handling URLs in Ruby 3.0 and executes
    # "| command" strings as subprocesses.
    def open_map(map)
      location = String(map).strip
      uri = begin
        URI.parse(location)
      rescue URI::InvalidURIError
        nil
      end
      return uri.open if uri.respond_to?(:open)

      File.open(location, 'rb')
    end

    # Peeks at the first two bytes of +io+ and rewinds it again.
    def gzipped?(io)
      header = io.read(2)
      io.rewind
      !header.nil? && header.bytes.to_a == GZIP_MAGIC
    end

    # Downloads and compiles the XSD at +xsd_url+ once per Sitemap instance so
    # that expanding an index does not re-download it for every child sitemap.
    def schema_for(xsd_url)
      @schemas ||= {}
      @schemas[xsd_url] ||= URI.parse(xsd_url).open { |io| Nokogiri::XML::Schema(io) }
    end

    def is_siteindex?(xml)
      schema_for(SITEINDEX_XSD).valid?(xml)
    end

    def is_sitemap?(xml)
      schema_for(SITEMAP_XSD).valid?(xml)
    end

    # Expands an index: every listed child sitemap is fetched and validated,
    # and the resulting Path lists are concatenated in document order.
    def process_siteindex(xml)
      get_locs(xml).flat_map do |child|
        process_sitemap(get_xml_from_map(child.url))
      end
    end

    # @return [Array<Path>] entries of a schema-valid sitemap
    # @raise [RuntimeError] 'Invalid Schema' when +xml+ is not a valid sitemap
    def process_sitemap(xml)
      raise 'Invalid Schema' unless is_sitemap?(xml)

      get_locs(xml)
    end

    # Wraps every <loc> node in a Path. The node itself is handed over, so
    # Path#url is the parsed node rather than a plain String.
    def get_locs(xml)
      xml.xpath("//xmlns:loc").map { |loc| Path.new(loc) }
    end
  end
end
|
data/lib/sitemap_checker.rb
CHANGED
@@ -3,72 +3,8 @@ require 'open-uri'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'zlib'
|
5
5
|
|
6
|
-
module SitemapChecker
|
7
|
-
class Checker
|
8
|
-
attr_reader :url_list
|
9
|
-
|
10
|
-
def initialize(url,schema='')
|
11
|
-
@url = url
|
12
|
-
@url_list = Array.new
|
13
|
-
@status_list = Array.new
|
14
|
-
process_xml
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.get_status_from_xml(url)
|
18
|
-
get_status(url.content)
|
19
|
-
end
|
20
|
-
|
21
|
-
def self.get_status(url)
|
22
|
-
begin
|
23
|
-
status = [url,open(url).status[0]]
|
24
|
-
rescue OpenURI::HTTPError => e
|
25
|
-
status = [url,e.io.status[0]]
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def get_xml_from_url(url)
|
32
|
-
begin
|
33
|
-
Nokogiri::XML(Zlib::GzipReader.new(open(url)))
|
34
|
-
rescue
|
35
|
-
Nokogiri::XML(open(url))
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def process_xml
|
40
|
-
mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
|
41
|
-
ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
|
42
|
-
xml = get_xml_from_url(@url)
|
43
|
-
if mxsd.valid?(xml)
|
44
|
-
@url_list = urls(xml)
|
45
|
-
elsif ixsd.valid?(xml)
|
46
|
-
maps = urls(xml)
|
47
|
-
maps.each do |map|
|
48
|
-
xml = get_xml_from_url(map)
|
49
|
-
@url_list = urls(xml)
|
50
|
-
end
|
51
|
-
else raise 'Invalid Schema'
|
52
|
-
false
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def urls(xml)
|
57
|
-
xml.xpath("//xmlns:loc")
|
58
|
-
end
|
59
|
-
|
60
|
-
def get_status_list(urls)
|
61
|
-
statuses = []
|
62
|
-
urls.each do |url|
|
63
|
-
begin
|
64
|
-
status = [url.content,open(url).status[0]]
|
65
|
-
rescue OpenURI::HTTPError => e
|
66
|
-
status = [url.content,e.io.status[0]]
|
67
|
-
end
|
68
|
-
statuses << status
|
69
|
-
end
|
70
|
-
statuses
|
71
|
-
end
|
72
|
-
end
|
73
6
|
|
7
|
+
module SitemapChecker
  # Autoload targets are resolved relative to this file rather than the
  # process working directory, so they still load when the gem is installed
  # and required from any directory.
  autoload :Sitemap, File.expand_path('../sitemap_checker/sitemap', __FILE__)
  autoload :Path, File.expand_path('../sitemap_checker/path', __FILE__)
end
|
@@ -16,35 +16,39 @@ describe SitemapChecker do
|
|
16
16
|
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
|
17
17
|
end
|
18
18
|
|
19
|
-
it "accepts xml siteindexes" do
|
20
|
-
@
|
21
|
-
@
|
19
|
+
it "Sitemap accepts xml siteindexes" do
|
20
|
+
@list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml')
|
21
|
+
@list.locs.size.should eq(4)
|
22
22
|
end
|
23
23
|
|
24
|
-
it "accepts gzipped siteindexes" do
|
25
|
-
@
|
26
|
-
@
|
24
|
+
it "Sitemap accepts gzipped siteindexes" do
|
25
|
+
@list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml.gz')
|
26
|
+
@list.locs.size.should eq(4)
|
27
27
|
end
|
28
28
|
|
29
|
-
it "accepts xml sitemaps" do
|
30
|
-
@
|
31
|
-
@
|
29
|
+
it "Sitemap accepts xml sitemaps" do
|
30
|
+
@list = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
|
31
|
+
@list.locs.size.should eq(2)
|
32
32
|
end
|
33
33
|
|
34
|
-
it "accepts xml and gzipped sitemaps" do
|
35
|
-
@xml_sitemap = SitemapChecker::
|
36
|
-
@gz_sitemap = SitemapChecker::
|
37
|
-
@xml_sitemap.
|
38
|
-
@gz_sitemap.
|
34
|
+
it "Sitemap accepts xml and gzipped sitemaps" do
|
35
|
+
@xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
|
36
|
+
@gz_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml.gz')
|
37
|
+
@xml_sitemap.locs.size.should eq(2)
|
38
|
+
@gz_sitemap.locs.size.should eq(2)
|
39
39
|
end
|
40
40
|
|
41
|
-
it "
|
42
|
-
lambda {SitemapChecker::
|
41
|
+
it "Sitemap errors if input doc does not match sitemap schema" do
|
42
|
+
lambda {SitemapChecker::Sitemap.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
|
43
43
|
end
|
44
44
|
|
45
|
-
it "
|
46
|
-
@
|
47
|
-
|
45
|
+
it "Sitemap locs are Path objects" do
|
46
|
+
@xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
|
47
|
+
@xml_sitemap.locs.first.class.should eq(SitemapChecker::Path)
|
48
|
+
end
|
49
|
+
|
50
|
+
it "Path#status returns status code" do
|
51
|
+
SitemapChecker::Path.new('http://www.github.com').status.should eq('200')
|
48
52
|
end
|
49
53
|
|
50
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_checker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -40,6 +40,8 @@ files:
|
|
40
40
|
- README.md
|
41
41
|
- Rakefile
|
42
42
|
- lib/sitemap_checker.rb
|
43
|
+
- lib/sitemap_checker/path.rb
|
44
|
+
- lib/sitemap_checker/sitemap.rb
|
43
45
|
- lib/sitemap_checker/version.rb
|
44
46
|
- sitemap_checker.gemspec
|
45
47
|
- spec/fixtures/siteindex.xml
|