sitemap_checker 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +10 -4
- data/lib/sitemap_checker/path.rb +22 -0
- data/lib/sitemap_checker/sitemap.rb +62 -0
- data/lib/sitemap_checker/version.rb +1 -1
- data/lib/sitemap_checker.rb +3 -67
- data/spec/sitemap_checker_spec.rb +23 -19
- metadata +4 -2
data/README.md
CHANGED
@@ -18,13 +18,19 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
Get list of urls from xml or xml.gz sitemap url
|
21
|
+
Get the list of URLs (locs) from an XML or gzipped (xml.gz) sitemap URL.
|
22
22
|
|
23
|
-
SitemapChecker::
|
23
|
+
@sitemap = SitemapChecker::Sitemap.new(url)
|
24
|
+
@sitemap.locs.size
|
24
25
|
|
25
|
-
Get status of url
|
26
|
+
Get the status of a URL from a Sitemap
|
26
27
|
|
27
|
-
SitemapChecker::
|
28
|
+
@sitemap = SitemapChecker::Sitemap.new(url)
|
29
|
+
@sitemap.locs.first.status
|
30
|
+
|
31
|
+
or directly as a Path
|
32
|
+
|
33
|
+
SitemapChecker::Path.new(url).status
|
28
34
|
|
29
35
|
## Contributing
|
30
36
|
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module SitemapChecker
  # A single URL (for example one <loc> entry of a sitemap) whose HTTP status
  # code is looked up on demand.
  class Path
    attr_accessor :url
    # Only the writer is generated here; the reader is the memoizing #status
    # defined below (an attr_accessor reader would just be redefined by it).
    attr_writer :status

    # @param url [#to_str] the URL to check
    def initialize(url)
      @url = url
      @status = nil
    end

    # Returns the HTTP status code of the URL stored in an XML node.
    #
    # @param url [#content] node whose text content is the URL to check
    # @return [String] the status code, e.g. "200"
    def get_status_from_xml(url)
      # #status takes no arguments, so the node's URL is checked through a
      # fresh Path instead of being passed to #status (which raised
      # ArgumentError on every call).
      self.class.new(url.content).status
    end

    # Returns the HTTP status code of #url, requesting it at most once.
    # Codes reported through OpenURI::HTTPError are cached as well, so a
    # failing URL is not requested again on every call.
    #
    # @return [String] the status code, e.g. "200" or "404"
    def status
      @status ||= fetch_status
    end

    private

    # Performs the request. The URL is opened through URI rather than
    # Kernel#open, which would run a string starting with "|" as a command
    # and no longer opens URLs on Ruby 3. The block form closes the response.
    def fetch_status
      URI.parse(@url).open { |response| response.status[0] }
    rescue OpenURI::HTTPError => e
      # open-uri raises for error responses; the code is on the attached IO.
      e.io.status[0]
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module SitemapChecker
  # Loads a sitemap, or a siteindex that points at several sitemaps, and
  # exposes every <loc> entry found as a Path object.
  class Sitemap
    SITEINDEX_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'
    SITEMAP_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'
    # First two bytes of every gzip stream.
    GZIP_MAGIC = [0x1f, 0x8b].freeze

    attr_accessor :locs, :map

    # @param map [#to_str] URL of a plain or gzipped sitemap / siteindex
    # @raise [RuntimeError] "Invalid Schema" when a document that is not a
    #   siteindex also fails sitemap validation
    def initialize(map)
      @map = map
      @locs = process_map
    end

    private

    # Parses @map and returns its locs, expanding a siteindex into the locs
    # of the sitemaps it lists.
    def process_map
      xml = get_xml_from_map(@map)
      is_siteindex?(xml) ? process_siteindex(xml) : process_sitemap(xml)
    end

    # Downloads +map+ once and parses it, gunzipping when the body starts
    # with the gzip magic bytes. The document is no longer fetched a second
    # time after a failed gunzip attempt, and errors are no longer hidden by
    # a bare rescue.
    def get_xml_from_map(map)
      # Opened through URI rather than Kernel#open: locs come from remote
      # documents, and Kernel#open runs strings starting with "|" as commands.
      URI.parse(map).open do |io|
        leading_bytes = io.read(2).to_s.unpack('C2')
        io.rewind
        gzipped = leading_bytes == GZIP_MAGIC
        Nokogiri::XML(gzipped ? Zlib::GzipReader.new(io) : io)
      end
    end

    def is_siteindex?(xml)
      siteindex_schema.valid?(xml)
    end

    def is_sitemap?(xml)
      sitemap_schema.valid?(xml)
    end

    # Schemas are downloaded once per Sitemap instead of once per validation
    # (previously once for every sitemap listed in a siteindex).
    def siteindex_schema
      @siteindex_schema ||= load_schema(SITEINDEX_XSD)
    end

    def sitemap_schema
      @sitemap_schema ||= load_schema(SITEMAP_XSD)
    end

    def load_schema(xsd_url)
      URI.parse(xsd_url).open { |io| Nokogiri::XML::Schema(io) }
    end

    # Returns the locs of every sitemap referenced by the siteindex +xml+.
    def process_siteindex(xml)
      get_locs(xml).flat_map do |map|
        process_sitemap(get_xml_from_map(map.url))
      end
    end

    # Returns the locs of +xml+, or raises when it is not a valid sitemap.
    def process_sitemap(xml)
      raise 'Invalid Schema' unless is_sitemap?(xml)

      get_locs(xml)
    end

    # Wraps every <loc> node of +xml+ in a Path.
    def get_locs(xml)
      xml.xpath("//xmlns:loc").map { |path| Path.new(path) }
    end
  end
end
|
data/lib/sitemap_checker.rb
CHANGED
@@ -3,72 +3,8 @@ require 'open-uri'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'zlib'
|
5
5
|
|
6
|
-
module SitemapChecker
|
7
|
-
class Checker
|
8
|
-
attr_reader :url_list
|
9
|
-
|
10
|
-
def initialize(url,schema='')
|
11
|
-
@url = url
|
12
|
-
@url_list = Array.new
|
13
|
-
@status_list = Array.new
|
14
|
-
process_xml
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.get_status_from_xml(url)
|
18
|
-
get_status(url.content)
|
19
|
-
end
|
20
|
-
|
21
|
-
def self.get_status(url)
|
22
|
-
begin
|
23
|
-
status = [url,open(url).status[0]]
|
24
|
-
rescue OpenURI::HTTPError => e
|
25
|
-
status = [url,e.io.status[0]]
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def get_xml_from_url(url)
|
32
|
-
begin
|
33
|
-
Nokogiri::XML(Zlib::GzipReader.new(open(url)))
|
34
|
-
rescue
|
35
|
-
Nokogiri::XML(open(url))
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def process_xml
|
40
|
-
mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
|
41
|
-
ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
|
42
|
-
xml = get_xml_from_url(@url)
|
43
|
-
if mxsd.valid?(xml)
|
44
|
-
@url_list = urls(xml)
|
45
|
-
elsif ixsd.valid?(xml)
|
46
|
-
maps = urls(xml)
|
47
|
-
maps.each do |map|
|
48
|
-
xml = get_xml_from_url(map)
|
49
|
-
@url_list = urls(xml)
|
50
|
-
end
|
51
|
-
else raise 'Invalid Schema'
|
52
|
-
false
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def urls(xml)
|
57
|
-
xml.xpath("//xmlns:loc")
|
58
|
-
end
|
59
|
-
|
60
|
-
def get_status_list(urls)
|
61
|
-
statuses = []
|
62
|
-
urls.each do |url|
|
63
|
-
begin
|
64
|
-
status = [url.content,open(url).status[0]]
|
65
|
-
rescue OpenURI::HTTPError => e
|
66
|
-
status = [url.content,e.io.status[0]]
|
67
|
-
end
|
68
|
-
statuses << status
|
69
|
-
end
|
70
|
-
statuses
|
71
|
-
end
|
72
|
-
end
|
73
6
|
|
7
|
+
module SitemapChecker
  # Autoload paths are anchored at this file's directory. The previous
  # "./lib/..." paths were resolved against the process working directory,
  # so the constants could only be loaded when running from the gem's
  # source root.
  autoload :Sitemap, File.expand_path('../sitemap_checker/sitemap', __FILE__)
  autoload :Path, File.expand_path('../sitemap_checker/path', __FILE__)
end
|
@@ -16,35 +16,39 @@ describe SitemapChecker do
|
|
16
16
|
stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
|
17
17
|
end
|
18
18
|
|
19
|
-
it "accepts xml siteindexes" do
|
20
|
-
@
|
21
|
-
@
|
19
|
+
it "Sitemap accepts xml siteindexes" do
  # A plain XML siteindex is expected to yield four locs.
  siteindex = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml')
  loc_count = siteindex.locs.size
  loc_count.should eq(4)
end
|
23
23
|
|
24
|
-
it "accepts gzipped siteindexes" do
|
25
|
-
@
|
26
|
-
@
|
24
|
+
it "Sitemap accepts gzipped siteindexes" do
  # The gzipped siteindex is expected to yield the same four locs.
  siteindex = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml.gz')
  loc_count = siteindex.locs.size
  loc_count.should eq(4)
end
|
28
28
|
|
29
|
-
it "accepts xml sitemaps" do
|
30
|
-
@
|
31
|
-
@
|
29
|
+
it "Sitemap accepts xml sitemaps" do
  # A plain XML sitemap is expected to yield two locs.
  sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
  loc_count = sitemap.locs.size
  loc_count.should eq(2)
end
|
33
33
|
|
34
|
-
it "accepts xml and gzipped sitemaps" do
|
35
|
-
@xml_sitemap = SitemapChecker::
|
36
|
-
@gz_sitemap = SitemapChecker::
|
37
|
-
@xml_sitemap.
|
38
|
-
@gz_sitemap.
|
34
|
+
it "Sitemap accepts xml and gzipped sitemaps" do
  # Both encodings of the sitemap are loaded first, then each must yield two locs.
  plain = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
  gzipped = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml.gz')

  [plain, gzipped].each do |sitemap|
    sitemap.locs.size.should eq(2)
  end
end
|
40
40
|
|
41
|
-
it "
|
42
|
-
lambda {SitemapChecker::
|
41
|
+
it "Sitemap errors if input doc does not match sitemap schema" do
  # Building a Sitemap from a document that is not a sitemap must raise.
  build_sitemap = lambda { SitemapChecker::Sitemap.new('http://www.github.com') }
  build_sitemap.should raise_error(RuntimeError, 'Invalid Schema')
end
|
44
44
|
|
45
|
-
it "
|
46
|
-
@
|
47
|
-
|
45
|
+
it "Sitemap locs are Path objects" do
  # Each loc is wrapped in a Path; checking the first one is enough here.
  sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
  first_loc = sitemap.locs.first
  first_loc.class.should eq(SitemapChecker::Path)
end
|
49
|
+
|
50
|
+
it "Path#status returns status code" do
  # The status code is returned as a String.
  path = SitemapChecker::Path.new('http://www.github.com')
  path.status.should eq('200')
end
|
49
53
|
|
50
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap_checker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -40,6 +40,8 @@ files:
|
|
40
40
|
- README.md
|
41
41
|
- Rakefile
|
42
42
|
- lib/sitemap_checker.rb
|
43
|
+
- lib/sitemap_checker/path.rb
|
44
|
+
- lib/sitemap_checker/sitemap.rb
|
43
45
|
- lib/sitemap_checker/version.rb
|
44
46
|
- sitemap_checker.gemspec
|
45
47
|
- spec/fixtures/siteindex.xml
|