sitemap_checker 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -18,13 +18,19 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- Get list of urls from xml or xml.gz sitemap url
21
+ Get the list of URLs (locs) from an xml or xml.gz sitemap URL:
22
22
 
23
- SitemapChecker::Checker.new(url)
23
+ @sitemap = SitemapChecker::Sitemap.new(url)
24
+ @sitemap.locs.size
24
25
 
25
- Get status of url
26
+ Get the status of a URL from a Sitemap:
26
27
 
27
- SitemapChecker::Checker.get_status(url)
28
+ @sitemap = SitemapChecker::Sitemap.new(url)
29
+ @sitemap.locs.first.status
30
+
31
+ Or check a single URL directly with a Path:
32
+
33
+ SitemapChecker::Path.new(url).status
28
34
 
29
35
  ## Contributing
30
36
 
@@ -0,0 +1,22 @@
1
module SitemapChecker
  # A single location (URL) together with the HTTP status code obtained by
  # requesting it.
  class Path
    # `status` has a hand-written, lazily fetching reader below, so only its
    # writer is generated here (generating both would define the reader twice).
    attr_accessor :url
    attr_writer :status

    # @param url the location to check; stored exactly as given
    def initialize(url)
      @url = url
      @status = nil
    end

    # Returns the HTTP status code of the URL held in an XML node.
    #
    # @param url [#content] an object whose +content+ is the URL to check
    # @return [String] the status code, e.g. "200"
    def get_status_from_xml(url)
      # `status` takes no arguments, so the node's URL is checked through a
      # Path of its own rather than being passed to `status`.
      self.class.new(url.content).status
    end

    # Returns the HTTP status code of +url+.
    #
    # The request is made on first use and the result is remembered, including
    # the code carried by an HTTP error response, so repeated calls do not hit
    # the network again. Assigning +nil+ through +status=+ forces a new request.
    #
    # @return [String] the status code, e.g. "200" or "404"
    def status
      @status ||= begin
        # Block form so the response IO is closed as soon as the code is read.
        open(@url) { |response| response.status[0] }
      rescue OpenURI::HTTPError => e
        e.io.status[0]
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module SitemapChecker
  # Loads a sitemap, or a sitemap index, and exposes every <loc> entry found
  # in it as a Path.
  #
  # The document may be plain XML or gzip-compressed XML. A sitemap index is
  # expanded by loading each sitemap it lists and collecting their locations.
  class Sitemap
    SITEINDEX_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'
    SITEMAP_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'

    attr_accessor :locs, :map

    # @param map the location of the sitemap or sitemap index to load
    # @raise [RuntimeError] 'Invalid Schema' when a document that is not a
    #   sitemap index also fails sitemap schema validation
    def initialize(map)
      @map = map
      @locs = process_map
    end

    private

    # Loads the document at +@map+ and returns its locations, expanding it
    # first when it is a sitemap index.
    def process_map
      xml = get_xml_from_map(@map)
      is_siteindex?(xml) ? process_siteindex(xml) : process_sitemap(xml)
    end

    # Fetches +map+ once and parses it, transparently gunzipping the body.
    #
    # Only a gzip decoding failure triggers the plain-XML fallback; the same
    # response is rewound and re-read instead of being downloaded again.
    def get_xml_from_map(map)
      io = open(map)
      begin
        Nokogiri::XML(Zlib::GzipReader.new(io))
      rescue Zlib::Error
        io.rewind
        Nokogiri::XML(io)
      ensure
        io.close
      end
    end

    # True when +xml+ validates against the sitemap index schema.
    def is_siteindex?(xml)
      schema_for(SITEINDEX_XSD).valid?(xml)
    end

    # True when +xml+ validates against the sitemap schema.
    def is_sitemap?(xml)
      schema_for(SITEMAP_XSD).valid?(xml)
    end

    # Returns the compiled schema for +xsd_url+, downloading it at most once
    # per Sitemap so that an index with many entries does not re-fetch the
    # XSD for every child sitemap.
    def schema_for(xsd_url)
      @schemas ||= {}
      @schemas[xsd_url] ||= Nokogiri::XML::Schema(open(xsd_url))
    end

    # Loads every sitemap listed in the index +xml+ and returns all of their
    # locations as one flat list.
    def process_siteindex(xml)
      get_locs(xml).flat_map do |child|
        process_sitemap(get_xml_from_map(child.url))
      end
    end

    # Returns the locations of the sitemap +xml+.
    #
    # @raise [RuntimeError] 'Invalid Schema' when +xml+ is not a valid sitemap
    def process_sitemap(xml)
      raise 'Invalid Schema' unless is_sitemap?(xml)

      get_locs(xml)
    end

    # Wraps every <loc> node of +xml+ in a Path. The node itself, not its
    # text, is handed to Path, as callers may rely on that.
    def get_locs(xml)
      xml.xpath("//xmlns:loc").map { |path| Path.new(path) }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module SitemapChecker
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -3,72 +3,8 @@ require 'open-uri'
3
3
  require 'nokogiri'
4
4
  require 'zlib'
5
5
 
6
- module SitemapChecker
7
- class Checker
8
- attr_reader :url_list
9
-
10
- def initialize(url,schema='')
11
- @url = url
12
- @url_list = Array.new
13
- @status_list = Array.new
14
- process_xml
15
- end
16
-
17
- def self.get_status_from_xml(url)
18
- get_status(url.content)
19
- end
20
-
21
- def self.get_status(url)
22
- begin
23
- status = [url,open(url).status[0]]
24
- rescue OpenURI::HTTPError => e
25
- status = [url,e.io.status[0]]
26
- end
27
- end
28
-
29
- private
30
-
31
- def get_xml_from_url(url)
32
- begin
33
- Nokogiri::XML(Zlib::GzipReader.new(open(url)))
34
- rescue
35
- Nokogiri::XML(open(url))
36
- end
37
- end
38
-
39
- def process_xml
40
- mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
41
- ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
42
- xml = get_xml_from_url(@url)
43
- if mxsd.valid?(xml)
44
- @url_list = urls(xml)
45
- elsif ixsd.valid?(xml)
46
- maps = urls(xml)
47
- maps.each do |map|
48
- xml = get_xml_from_url(map)
49
- @url_list = urls(xml)
50
- end
51
- else raise 'Invalid Schema'
52
- false
53
- end
54
- end
55
-
56
- def urls(xml)
57
- xml.xpath("//xmlns:loc")
58
- end
59
-
60
- def get_status_list(urls)
61
- statuses = []
62
- urls.each do |url|
63
- begin
64
- status = [url.content,open(url).status[0]]
65
- rescue OpenURI::HTTPError => e
66
- status = [url.content,e.io.status[0]]
67
- end
68
- statuses << status
69
- end
70
- statuses
71
- end
72
- end
73
6
 
7
# Namespace for the gem. Its classes are loaded lazily on first reference.
module SitemapChecker
  # Resolve the files relative to this file rather than with a "./lib/..."
  # path: a path starting with "./" is looked up from the process working
  # directory, so it only works when the program is started from the gem's
  # root directory.
  autoload :Sitemap, File.expand_path('sitemap_checker/sitemap', File.dirname(__FILE__))
  autoload :Path, File.expand_path('sitemap_checker/path', File.dirname(__FILE__))
end
@@ -16,35 +16,39 @@ describe SitemapChecker do
16
16
  stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
17
17
  end
18
18
 
19
- it "accepts xml siteindexes" do
20
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
21
- @xml_sitemap.url_list.size.should eq(2)
19
+ it "Sitemap accepts xml siteindexes" do
20
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml')
21
+ @list.locs.size.should eq(4)
22
22
  end
23
23
 
24
- it "accepts gzipped siteindexes" do
25
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
26
- @gz_sitemap.url_list.size.should eq(2)
24
+ it "Sitemap accepts gzipped siteindexes" do
25
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml.gz')
26
+ @list.locs.size.should eq(4)
27
27
  end
28
28
 
29
- it "accepts xml sitemaps" do
30
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
31
- @xml_sitemap.url_list.size.should eq(2)
29
+ it "Sitemap accepts xml sitemaps" do
30
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
31
+ @list.locs.size.should eq(2)
32
32
  end
33
33
 
34
- it "accepts xml and gzipped sitemaps" do
35
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
36
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
37
- @xml_sitemap.url_list.size.should eq(2)
38
- @gz_sitemap.url_list.size.should eq(2)
34
+ it "Sitemap accepts xml and gzipped sitemaps" do
35
+ @xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
36
+ @gz_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml.gz')
37
+ @xml_sitemap.locs.size.should eq(2)
38
+ @gz_sitemap.locs.size.should eq(2)
39
39
  end
40
40
 
41
- it "Errors if input doc does not match sitemap schema" do
42
- lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
41
+ it "Sitemap errors if input doc does not match sitemap schema" do
42
+ lambda {SitemapChecker::Sitemap.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
43
43
  end
44
44
 
45
- it "returns status if given a url" do
46
- @sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
47
- SitemapChecker::Checker.get_status_from_xml(@sitemap.url_list.first).should eq(['http://www.github.com','200'])
45
+ it "Sitemap locs are Path objects" do
46
+ @xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
47
+ @xml_sitemap.locs.first.class.should eq(SitemapChecker::Path)
48
+ end
49
+
50
+ it "Path#status returns status code" do
51
+ SitemapChecker::Path.new('http://www.github.com').status.should eq('200')
48
52
  end
49
53
 
50
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap_checker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-24 00:00:00.000000000 Z
12
+ date: 2012-08-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -40,6 +40,8 @@ files:
40
40
  - README.md
41
41
  - Rakefile
42
42
  - lib/sitemap_checker.rb
43
+ - lib/sitemap_checker/path.rb
44
+ - lib/sitemap_checker/sitemap.rb
43
45
  - lib/sitemap_checker/version.rb
44
46
  - sitemap_checker.gemspec
45
47
  - spec/fixtures/siteindex.xml