sitemap_checker 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -18,13 +18,19 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- Get list of urls from xml or xml.gz sitemap url
21
+ Get the list of URLs (locs) from an xml or xml.gz sitemap URL.
22
22
 
23
- SitemapChecker::Checker.new(url)
23
+ @sitemap = SitemapChecker::Sitemap.new(url)
24
+ @sitemap.locs.size
24
25
 
25
- Get status of url
26
+ Get the HTTP status of a URL from a Sitemap
26
27
 
27
- SitemapChecker::Checker.get_status(url)
28
+ @sitemap = SitemapChecker::Sitemap.new(url)
29
+ @sitemap.locs.first.status
30
+
31
+ or directly as a Path
32
+
33
+ SitemapChecker::Path.new(url).status
28
34
 
29
35
  ## Contributing
30
36
 
@@ -0,0 +1,22 @@
1
module SitemapChecker
  # A single location (URL) whose HTTP status code is looked up lazily.
  #
  # The status is fetched on the first call to #status and cached afterwards,
  # for successful responses as well as for HTTP error responses.
  class Path
    attr_accessor :url
    # Only the writer is generated here; the reader is the lazy #status below.
    attr_writer :status

    # @param url the location to check; it is handed to open-uri unchanged
    def initialize(url)
      @url = url
      @status = nil
    end

    # Looks up the status of the location stored in an XML node.
    #
    # This does not touch the receiver's own url or cached status.
    #
    # @param url an object responding to #content (e.g. a <loc> XML node)
    # @return [String] the HTTP status code of that location
    def get_status_from_xml(url)
      self.class.new(url.content).status
    end

    # @return [String] the HTTP status code for #url, e.g. "200" or "404";
    #   fetched once, then served from the cached value
    def status
      @status ||= fetch_status
    end

    private

    # Performs the request. open-uri raises OpenURI::HTTPError for error
    # responses; the code is then read from the response attached to the error.
    def fetch_status
      open_url(@url).status[0]
    rescue OpenURI::HTTPError => e
      e.io.status[0]
    end

    # Kernel#open no longer handles URLs from Ruby 3.0 on, so URI.open is used
    # when open-uri provides it; older Rubies fall back to Kernel#open.
    def open_url(target)
      URI.respond_to?(:open) ? URI.open(target) : open(target)
    end
  end
end
@@ -0,0 +1,62 @@
1
module SitemapChecker
  # Loads a sitemap, or a sitemap index, and collects every <loc> entry as a
  # Path in #locs.
  #
  # The document may be plain XML or gzip-compressed XML. It is validated
  # against the sitemaps.org 0.9 schemas; a sitemap index is expanded by
  # loading each sitemap it lists.
  class Sitemap
    SITEINDEX_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'
    SITEMAP_XSD = 'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'

    attr_accessor :locs, :map

    # @param map location of the sitemap or sitemap index (xml or xml.gz)
    # @raise [RuntimeError] 'Invalid Schema' when a document is neither a
    #   valid sitemap index nor a valid sitemap
    def initialize(map)
      @map = map
      @locs = process_map
    end

    private

    # Fetches the top-level document and dispatches on its kind.
    def process_map
      xml = get_xml_from_map(@map)
      is_siteindex?(xml) ? process_siteindex(xml) : process_sitemap(xml)
    end

    # Fetches +map+ once and parses it, trying gzip first.
    #
    # Any failure while gunzipping is taken to mean "not compressed": the same
    # response is rewound and parsed as plain XML instead of being downloaded
    # a second time.
    def get_xml_from_map(map)
      io = open_url(map)
      begin
        Nokogiri::XML(Zlib::GzipReader.new(io))
      rescue StandardError
        io.rewind
        Nokogiri::XML(io)
      end
    end

    # @return [Boolean] whether +xml+ validates as a sitemap index
    def is_siteindex?(xml)
      schema_for(SITEINDEX_XSD).valid?(xml)
    end

    # @return [Boolean] whether +xml+ validates as a sitemap
    def is_sitemap?(xml)
      schema_for(SITEMAP_XSD).valid?(xml)
    end

    # Expands a sitemap index into the locs of every sitemap it references.
    #
    # @return [Array<Path>]
    def process_siteindex(xml)
      get_locs(xml).flat_map do |entry|
        process_sitemap(get_xml_from_map(entry.url))
      end
    end

    # @return [Array<Path>] the locs of a single sitemap
    # @raise [RuntimeError] 'Invalid Schema' when +xml+ is not a valid sitemap
    def process_sitemap(xml)
      raise 'Invalid Schema' unless is_sitemap?(xml)

      get_locs(xml)
    end

    # Wraps every <loc> match in a Path. The matched object itself is passed
    # on, so Path#url holds whatever xpath yielded.
    # NOTE(review): presumably an XML node rather than a String — confirm
    # whether callers rely on that before changing it.
    def get_locs(xml)
      xml.xpath("//xmlns:loc").map { |loc| Path.new(loc) }
    end

    # Downloads and compiles each XSD once per Sitemap instead of once per
    # validated document.
    def schema_for(xsd_url)
      @schemas ||= {}
      @schemas[xsd_url] ||= Nokogiri::XML::Schema(open_url(xsd_url))
    end

    # Kernel#open no longer handles URLs from Ruby 3.0 on, so URI.open is used
    # when open-uri provides it; older Rubies fall back to Kernel#open.
    def open_url(target)
      URI.respond_to?(:open) ? URI.open(target) : open(target)
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module SitemapChecker
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -3,72 +3,8 @@ require 'open-uri'
3
3
  require 'nokogiri'
4
4
  require 'zlib'
5
5
 
6
- module SitemapChecker
7
- class Checker
8
- attr_reader :url_list
9
-
10
- def initialize(url,schema='')
11
- @url = url
12
- @url_list = Array.new
13
- @status_list = Array.new
14
- process_xml
15
- end
16
-
17
- def self.get_status_from_xml(url)
18
- get_status(url.content)
19
- end
20
-
21
- def self.get_status(url)
22
- begin
23
- status = [url,open(url).status[0]]
24
- rescue OpenURI::HTTPError => e
25
- status = [url,e.io.status[0]]
26
- end
27
- end
28
-
29
- private
30
-
31
- def get_xml_from_url(url)
32
- begin
33
- Nokogiri::XML(Zlib::GzipReader.new(open(url)))
34
- rescue
35
- Nokogiri::XML(open(url))
36
- end
37
- end
38
-
39
- def process_xml
40
- mxsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'))
41
- ixsd = Nokogiri::XML::Schema(open('http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd'))
42
- xml = get_xml_from_url(@url)
43
- if mxsd.valid?(xml)
44
- @url_list = urls(xml)
45
- elsif ixsd.valid?(xml)
46
- maps = urls(xml)
47
- maps.each do |map|
48
- xml = get_xml_from_url(map)
49
- @url_list = urls(xml)
50
- end
51
- else raise 'Invalid Schema'
52
- false
53
- end
54
- end
55
-
56
- def urls(xml)
57
- xml.xpath("//xmlns:loc")
58
- end
59
-
60
- def get_status_list(urls)
61
- statuses = []
62
- urls.each do |url|
63
- begin
64
- status = [url.content,open(url).status[0]]
65
- rescue OpenURI::HTTPError => e
66
- status = [url.content,e.io.status[0]]
67
- end
68
- statuses << status
69
- end
70
- statuses
71
- end
72
- end
73
6
 
7
# Top-level namespace. Its classes are loaded lazily on first reference.
module SitemapChecker
  # The paths are resolved against this file's own location. A "./lib/..."
  # path is looked up relative to the process working directory, so it only
  # loads when the program is started from the gem's root.
  autoload :Sitemap, File.expand_path('../sitemap_checker/sitemap', __FILE__)
  autoload :Path, File.expand_path('../sitemap_checker/path', __FILE__)
end
@@ -16,35 +16,39 @@ describe SitemapChecker do
16
16
  stub_request(:get, "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd").to_return(:status => 200, :body => File.read(@dir + 'fixtures/siteindex.xsd'), :headers => {})
17
17
  end
18
18
 
19
- it "accepts xml siteindexes" do
20
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml')
21
- @xml_sitemap.url_list.size.should eq(2)
19
+ it "Sitemap accepts xml siteindexes" do
20
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml')
21
+ @list.locs.size.should eq(4)
22
22
  end
23
23
 
24
- it "accepts gzipped siteindexes" do
25
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/siteindex.xml.gz')
26
- @gz_sitemap.url_list.size.should eq(2)
24
+ it "Sitemap accepts gzipped siteindexes" do
25
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/siteindex.xml.gz')
26
+ @list.locs.size.should eq(4)
27
27
  end
28
28
 
29
- it "accepts xml sitemaps" do
30
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
31
- @xml_sitemap.url_list.size.should eq(2)
29
+ it "Sitemap accepts xml sitemaps" do
30
+ @list = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
31
+ @list.locs.size.should eq(2)
32
32
  end
33
33
 
34
- it "accepts xml and gzipped sitemaps" do
35
- @xml_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
36
- @gz_sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml.gz')
37
- @xml_sitemap.url_list.size.should eq(2)
38
- @gz_sitemap.url_list.size.should eq(2)
34
+ it "Sitemap accepts xml and gzipped sitemaps" do
35
+ @xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
36
+ @gz_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml.gz')
37
+ @xml_sitemap.locs.size.should eq(2)
38
+ @gz_sitemap.locs.size.should eq(2)
39
39
  end
40
40
 
41
- it "Errors if input doc does not match sitemap schema" do
42
- lambda {SitemapChecker::Checker.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
41
+ it "Sitemap errors if input doc does not match sitemap schema" do
42
+ lambda {SitemapChecker::Sitemap.new('http://www.github.com')}.should raise_error(RuntimeError, 'Invalid Schema')
43
43
  end
44
44
 
45
- it "returns status if given a url" do
46
- @sitemap = SitemapChecker::Checker.new('http://www.github.com/sitemap.xml')
47
- SitemapChecker::Checker.get_status_from_xml(@sitemap.url_list.first).should eq(['http://www.github.com','200'])
45
+ it "Sitemap locs are Path objects" do
46
+ @xml_sitemap = SitemapChecker::Sitemap.new('http://www.github.com/sitemap.xml')
47
+ @xml_sitemap.locs.first.class.should eq(SitemapChecker::Path)
48
+ end
49
+
50
+ it "Path#status returns status code" do
51
+ SitemapChecker::Path.new('http://www.github.com').status.should eq('200')
48
52
  end
49
53
 
50
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap_checker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-24 00:00:00.000000000 Z
12
+ date: 2012-08-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -40,6 +40,8 @@ files:
40
40
  - README.md
41
41
  - Rakefile
42
42
  - lib/sitemap_checker.rb
43
+ - lib/sitemap_checker/path.rb
44
+ - lib/sitemap_checker/sitemap.rb
43
45
  - lib/sitemap_checker/version.rb
44
46
  - sitemap_checker.gemspec
45
47
  - spec/fixtures/siteindex.xml