sitemaped 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 71633dcc823c53b58e985df4cd7d704678e3413a
4
+ data.tar.gz: 8f76e93e6ca137fdde8a33274f87331f24afd46d
5
+ SHA512:
6
+ metadata.gz: 6b700f97ed304f4eff6a24ed8c05b19e9f85e8a6b8b1f6211fd461e5d0fcecbf67d47e982653ae801d3b449dbd903a499ffd49ca7212655a160a2054bba1e080
7
+ data.tar.gz: a0ee197e2d965f12aef969b2160617839becc7a4a21487ba3f7c9f1299771e7083336156d034491fb6533e0d849216107b16e726dbf0f35c078044ef3fcab310
@@ -0,0 +1,34 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ # Gemfile.lock
30
+ # .ruby-version
31
+ # .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -0,0 +1,22 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ sitemaped (0.0.1)
5
+ nokogiri (~> 1.6)
6
+ nokogiri (~> 1.6)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ mini_portile (0.6.0)
12
+ nokogiri (1.6.2.1)
13
+ mini_portile (= 0.6.0)
14
+ rake (10.3.2)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ bundler (~> 1.6)
21
+ rake
22
+ sitemaped!
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Railsemechanic, Matthias Kalb
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,44 @@
1
+ # Sitemaped
2
+ Sitemaped is a powerful parser for XML sitemaps that respects all sitemaps listed in robots.txt and also handles gzipped and nested sitemaps.
3
+
4
+ ## Features
5
+
6
+ - Respects sitemaps listed in robots.txt
7
+ - Handles gzipped sitemaps
8
+ - Supports nested sitemaps (sitemap of sitemaps)
9
+
10
+ ## Installation
11
+ ### With Bundler
12
+ Just add to your Gemfile
13
+ ~~~ruby
14
+ gem 'sitemaped'
15
+ ~~~
16
+
17
+ ### Without Bundler
18
+ If you're not using Bundler, just run the following on your command line
19
+ ~~~bash
20
+ $ gem install sitemaped
21
+ ~~~
22
+
23
+ ## Usage
24
+ ### Get a list of all URLs listed in the sitemap
25
+ Sitemaped returns an array with all URLs listed in the sitemap(s)
26
+ ~~~ruby
27
+ require 'sitemaped'
28
+
29
+ website = Sitemaped.new('http://www.example.com')
30
+ sitemap = website.sitemap # => ["http://www.example.com/", "http://www.example.com/contact", ...]
31
+ ~~~
32
+
33
+ ### Check whether a URL is covered by the sitemap
34
+ To check whether a URL is covered by the sitemap, use
35
+ ~~~ruby
36
+ require 'sitemaped'
37
+
38
+ sitemap = Sitemaped.new('http://www.example.com')
39
+ sitemap.include?('http://www.example.com/contact') # => true or false
40
+ ~~~
41
+
42
+ ## ToDo
43
+ - Search for optimizations
44
+ - Add tests
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,74 @@
1
require 'nokogiri'
require 'open-uri'
require 'set'
require 'uri'
require 'zlib'
5
+
6
# Collects the page URLs a website advertises through its XML sitemaps.
#
# Sitemap locations are read from the site's robots.txt. When robots.txt cannot
# be read or lists no sitemap, /sitemap.xml and then /sitemap.xml.gz are tried.
# Gzip-compressed sitemaps and sitemap index files (sitemaps of sitemaps) are
# followed as well.
class Sitemaped

  # Matches a "Sitemap: <location>" entry of robots.txt and captures the location.
  ROBOTS_SITEMAP_PATTERN = /\s*sitemap:\s*([^\r\n]+)\s*$/i
  private_constant :ROBOTS_SITEMAP_PATTERN

  # @param url [String] any URL of the website; only scheme, host and port are used
  # @raise [URI::InvalidURIError] if the URL cannot be parsed or lacks a scheme or host
  def initialize(url)
    # URI.encode was removed in Ruby 3.0; the RFC 2396 parser provides the same escaping.
    escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    @url = URI.parse(escaper.escape(url))
    raise URI::InvalidURIError, 'scheme or host missing' unless @url.scheme && @url.host
    @sitemap = Set.new
  end

  # @return [Array<String>] every URL found in the site's sitemap(s), without duplicates
  def sitemap
    sitemaps.to_a
  end

  # @param path [String] absolute URL to look up
  # @return [Boolean] whether the URL is listed in the site's sitemap(s)
  def include?(path)
    sitemaps.include?(path)
  end

  private

  # Fills the URL set on first use (and again on later calls while it is still empty).
  def sitemaps
    return @sitemap unless @sitemap.empty?

    # Array() guards against a nil result when no sitemap could be loaded or parsed.
    @sitemap.merge(Array(robots_sitemap || default_sitemap))
  end

  # Returns the URLs of all sitemaps listed in robots.txt, or nil when none is
  # listed so that the caller falls back to the default sitemap locations.
  def robots_sitemap
    sitemap_urls = robots_sitemap_urls
    return nil if sitemap_urls.empty?

    @robots_sitemap_data ||= handle_nested_sitemaps(sitemap_urls)
  end

  # Sitemap locations listed in robots.txt; empty when the file is unreadable.
  def robots_sitemap_urls
    @robots_sitemap_urls ||= begin
      # flatten (not flatten!) so that "no entries" yields [] instead of nil.
      read_robots_txt.scan(ROBOTS_SITEMAP_PATTERN).flatten.map(&:strip)
    rescue StandardError
      []
    end
  end

  # Downloads robots.txt and returns its content; the handle is closed by the block form.
  def read_robots_txt
    URI.parse("#{base_url}/robots.txt").open(&:read)
  end

  # Parses the first default sitemap location that can be loaded.
  def default_sitemap
    @default_sitemap_data ||= begin
      sitemap_io = load_sitemap("#{base_url}/sitemap.xml") || load_sitemap("#{base_url}/sitemap.xml.gz")
      sitemap_io && parse_sitemap(sitemap_io)
    end
  end

  # Scheme, host and (when it is not the scheme's default) port of the website.
  def base_url
    port = @url.port == @url.default_port ? '' : ":#{@url.port}"
    "#{@url.scheme}://#{@url.host}#{port}"
  end

  # Loads and parses every sitemap in the list; sitemaps that cannot be loaded
  # or parsed are skipped.
  def handle_nested_sitemaps(sitemap_list = [])
    sitemap_list.flat_map do |sitemap_url|
      sitemap_io = load_sitemap(sitemap_url)
      sitemap_io ? Array(parse_sitemap(sitemap_io)) : []
    end
  end

  # Opens the sitemap at +url+ and returns a readable IO, transparently
  # unwrapping gzip content. Returns nil when the URL cannot be fetched.
  def load_sitemap(url = nil)
    # URI#open only handles HTTP(S)/FTP locations. Kernel#open would treat a
    # value such as "| command" taken from robots.txt as a command to execute.
    sitemap_io = URI.parse(url.to_s).open
    begin
      Zlib::GzipReader.new(sitemap_io)
    rescue StandardError
      # Not gzip data: hand back the plain stream from its beginning.
      sitemap_io.rewind
      sitemap_io
    end
  rescue StandardError
    nil
  end

  # Extracts the page URLs from a sitemap, following sitemap index files.
  # Returns nil when the document cannot be processed. Closes +sitemap_io+.
  def parse_sitemap(sitemap_io)
    # NOTE(review): the HTML parser is kept from the original implementation;
    # presumably it lets the XPath queries ignore the sitemap XML namespace - confirm.
    sitemap_data = Nokogiri::HTML(sitemap_io)
    nested_sitemaps = sitemap_data.xpath("//sitemapindex/sitemap/loc").map { |loc| loc.text.strip }
    return handle_nested_sitemaps(nested_sitemaps) unless nested_sitemaps.empty?

    sitemap_data.xpath("//urlset/url/loc").map { |loc| loc.text.strip }
  rescue StandardError
    nil
  ensure
    sitemap_io.close if sitemap_io.respond_to?(:close)
  end

end
@@ -0,0 +1,3 @@
1
+ class Sitemaped
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Gem specification for sitemaped.
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'sitemaped/version'

Gem::Specification.new do |spec|
  spec.name          = "sitemaped"
  spec.version       = Sitemaped::VERSION
  spec.authors       = ["Matthias Kalb"]
  spec.email         = ["matthias.kalb@railsmechanic.de"]
  spec.summary       = %q{Parser for XML sitemaps}
  spec.description   = %q{Parser for XML sitemaps which respects sitemaps listed in robots.txt and handles gziped and nested sitemaps as well.}
  spec.homepage      = "https://github.com/railsmechanic/sitemaped"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are derived from that list.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # The version requirement is stated once; repeating it produced a duplicate constraint.
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake"
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemaped
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthias Kalb
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.6'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Parser for XML sitemaps which respects sitemaps listed in robots.txt
56
+ and handles gziped and nested sitemaps as well.
57
+ email:
58
+ - matthias.kalb@railsmechanic.de
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE
67
+ - README.md
68
+ - Rakefile
69
+ - lib/sitemaped.rb
70
+ - lib/sitemaped/version.rb
71
+ - sitemaped.gemspec
72
+ homepage: https://github.com/railsmechanic/sitemaped
73
+ licenses:
74
+ - MIT
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.2.2
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Parser for XML sitemaps
96
+ test_files: []