check-sitemap 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e40861b2488ec746e7342b62f35d7dcdee70d9c1
4
+ data.tar.gz: 948236bbae97c569352ad28bae4b935b893dd44f
5
+ SHA512:
6
+ metadata.gz: eee047424f31dcc5dceb83ea4f8881be7f7122b09348ec479fd6a5ef8c60a58da1d56b24dc5a6e7feba5d14f95270da3f34faa48f5fe75edb0afb2975013dfd1
7
+ data.tar.gz: 573b0e64f5cb98da98c194077b26772f177a8c6eccccb898f66a1c22fa7a241cd329d12453cf5dc31164d77f72f0664989906d1e91b9721a1c701a46c20c187b
data/.gitignore ADDED
@@ -0,0 +1,50 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.3
4
+ before_install: gem install bundler -v 1.10.6
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'nokogiri'
4
+ gem 'rainbow'
5
+
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,43 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ check-sitemap (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.3)
10
+ mini_portile2 (2.1.0)
11
+ nokogiri (1.6.8)
12
+ mini_portile2 (~> 2.1.0)
13
+ pkg-config (~> 1.1.7)
14
+ pkg-config (1.1.7)
15
+ rainbow (2.1.0)
16
+ rake (10.5.0)
17
+ rspec (3.7.0)
18
+ rspec-core (~> 3.7.0)
19
+ rspec-expectations (~> 3.7.0)
20
+ rspec-mocks (~> 3.7.0)
21
+ rspec-core (3.7.1)
22
+ rspec-support (~> 3.7.0)
23
+ rspec-expectations (3.7.0)
24
+ diff-lcs (>= 1.2.0, < 2.0)
25
+ rspec-support (~> 3.7.0)
26
+ rspec-mocks (3.7.0)
27
+ diff-lcs (>= 1.2.0, < 2.0)
28
+ rspec-support (~> 3.7.0)
29
+ rspec-support (3.7.1)
30
+
31
+ PLATFORMS
32
+ ruby
33
+
34
+ DEPENDENCIES
35
+ bundler (~> 1.10)
36
+ check-sitemap!
37
+ nokogiri
38
+ rainbow
39
+ rake (~> 10.0)
40
+ rspec (~> 3.7)
41
+
42
+ BUNDLED WITH
43
+ 1.16.0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Marcos G. Zimmermann
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,30 @@
1
+ # Check Sitemap
2
+
3
+ `check-sitemap` is a gem that pulls a sitemap and checks all of the URLs listed in it.
4
+
5
+ ## Installation
6
+
7
+ $ gem install check-sitemap
8
+
9
+ ## Usage
10
+
11
+
12
+ $ checksitemap <location> [options]
13
+
14
+ Remote url:
15
+
16
+ $ checksitemap http://example.com/sitemap.xml
17
+
18
+ Local filename:
19
+
20
+ $ checksitemap path/to/sitemap.xml.gz # Also works with gzipped files (remote or local)
21
+
22
+
23
+ It processes every sitemap listed in a sitemap index. Use `--concurrency=<num threads>` to process those sitemaps in parallel.
24
+
25
+ $ checksitemap http://example.com/sitemap_index.xml --concurrency=5
26
+
27
+
28
+ ## Contributing
29
+
30
+ Bug reports and pull requests are welcome on GitHub at https://github.com/marcosgz/check-sitemap.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "check-sitemap"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,5 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'check_sitemap/version'
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = 'check-sitemap'
9
+ spec.version = CheckSitemap::VERSION
10
+ spec.authors = ['Marcos G. Zimmermann']
11
+ spec.email = ['mgzmaster@gmail.com']
12
+ spec.license = 'MIT'
13
+
14
+ spec.summary = 'Check URLs in a sitemap.xml'
15
+ spec.description = 'Parse a sitemap.xml URLS and check status of each URL'
16
+ spec.homepage = 'https://github.com/marcosgz/check-sitemap'
17
+
18
+ spec.executables << 'checksitemap'
19
+
20
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
21
+ f.match(%r{^(test|spec|features)/})
22
+ end
23
+ spec.bindir = 'exe'
24
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
+ spec.require_paths = ['lib']
26
+
27
+ spec.add_development_dependency 'bundler', '~> 1.10'
28
+ spec.add_development_dependency 'rake', '~> 10.0'
29
+ spec.add_development_dependency 'rspec', '~> 3.7'
30
+ end
data/exe/checksitemap ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+ require 'check-sitemap'
3
+ require 'optparse'
4
+
5
+ url = String(ARGV.shift)
6
+
7
+ parser = OptionParser.new do |opts|
8
+ opts.banner = "Usage: checksitemap <url> [options]"
9
+
10
+ opts.on("-c CONCURRENCY", "--concurrency=CONCURRENCY", "processor threads to use. (Default: 1)") do |v|
11
+ CheckSitemap.config.num_threads = v.to_i if v.to_i > 0
12
+ end
13
+
14
+ opts.on("-t TIMEOUT", "--timeout TIMEOUT", "set HTTP Timeout. (Default: 30)") do |v|
15
+ CheckSitemap.config.timeout = v.to_i if v.to_i > 0
16
+ end
17
+
18
+ opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
19
+ CheckSitemap.config.verbose = v
20
+ end
21
+ end
22
+
23
+ parser.parse!(ARGV)
24
+
25
+ if (url.size > 0) && !(url =~ /^-/)
26
+ sitemap = CheckSitemap::CommandLine.new(url)
27
+ sitemap.process!
28
+ else
29
+ puts parser.display
30
+ end
@@ -0,0 +1,29 @@
1
+ require 'fileutils'
2
+ require 'open-uri'
3
+ require 'zlib'
4
+ require 'nokogiri'
5
+ require 'timeout'
6
+ require 'uri'
7
+ require 'rainbow'
8
+ require 'openssl'
9
+
10
+ require 'check_sitemap/version'
11
+ require 'check_sitemap/errors'
12
+
13
+ module CheckSitemap
14
+ # Components are autoloaded the first time their constant is referenced.
15
+ autoload 'XMLReader', 'check_sitemap/xml_reader'
16
+ autoload 'CommandLine', 'check_sitemap/command_line'
17
+ autoload 'Config', 'check_sitemap/config'
18
+ autoload 'CheckURL', 'check_sitemap/check_url'
19
+ # Memoized Config instance; created with default settings on first access.
20
+ def config
21
+ @config ||= Config.new
22
+ end
23
+ # Prints msg with puts, but only when config.verbose? is true.
24
+ def log(msg)
25
+ puts(msg) if config.verbose?
26
+ end
27
+ # Makes config and log callable on the module itself (CheckSitemap.config, CheckSitemap.log).
28
+ extend self
29
+ end
@@ -0,0 +1,35 @@
1
+ require 'net/http'
2
+
3
+ # Default URL adapter: sends an HTTP HEAD request for a URL and prints a
4
+ # colorized one-line status report.
5
+ class CheckSitemap::CheckURL
6
+
7
+   # Checks +url+ and prints the outcome to stdout.
8
+   #
9
+   # @param url [String] absolute http(s) URL taken from a sitemap
10
+   # @return [nil]
11
+   def self.call(url)
12
+     head = request_head(url)
13
+     # nil means the request failed and the failure has already been printed;
14
+     # there is no response whose status could be inspected.
15
+     return if head.nil?
16
+
17
+     case head.code.to_i
18
+     when 301
19
+       puts "#{Rainbow(url).yellow} => redirect to #{Rainbow(head['location']).blue}"
20
+     when 200
21
+       puts "#{Rainbow(url).yellow} => #{Rainbow('OK').green}"
22
+     when 404
23
+       puts "#{Rainbow(url).yellow} => #{Rainbow('Not found').red}"
24
+     else
25
+       puts "#{url} => #{Rainbow(head.code.to_s).indianred}"
26
+     end
27
+   end
28
+
29
+   # Performs the HEAD request, bounded by the configured timeout.
30
+   #
31
+   # @param url [String]
32
+   # @return [Net::HTTPResponse, nil] nil when the request could not be
33
+   #   completed (unparsable URL, timeout, connection error); the reason is
34
+   #   printed before returning
35
+   def self.request_head(url)
36
+     # Parsing happens inside the rescued body so that one malformed entry is
37
+     # reported like any other failure instead of aborting the whole run.
38
+     uri = URI.parse(url)
39
+     http = Net::HTTP.new(uri.host, uri.port)
40
+     if uri.scheme == 'https'
41
+       http.use_ssl = true
42
+       # NOTE(review): certificate verification is disabled, so invalid or
43
+       # expired certificates are never reported; confirm this is intended.
44
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
45
+     end
46
+
47
+     Timeout.timeout(::CheckSitemap.config.timeout) do
48
+       # request_uri keeps the query string and yields '/' for an empty path.
49
+       http.start { |ua| ua.head(uri.request_uri) }
50
+     end
51
+   rescue Timeout::Error
52
+     puts "#{Rainbow(url).yellow} => #{Rainbow('timeout').red}"
53
+     nil
54
+   rescue => e
55
+     puts "#{Rainbow(url).yellow} => #{Rainbow(e.message).red}"
56
+     nil
57
+   end
58
+   private_class_method :request_head
59
+ end
@@ -0,0 +1,37 @@
1
+ class CheckSitemap::CommandLine
2
+
3
+ def initialize(url_or_filename)
4
+ @url_or_filename = url_or_filename
5
+ end
6
+
7
+ def process!
8
+ reader = ::CheckSitemap::XMLReader.new(@url_or_filename)
9
+ if reader.sitemap_index?
10
+ reader.each do |url|
11
+ queue << url
12
+ end
13
+ [].tap do |threads|
14
+ ::CheckSitemap.config.num_threads.to_i.times do
15
+ threads << Thread.new do
16
+ until queue.empty?
17
+ url = queue.pop(true) rescue nil
18
+ if url
19
+ ::CheckSitemap::CommandLine.new(url).process!
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end.each { |t| t.join }
25
+ elsif reader.urlset?
26
+ reader.each do |url|
27
+ ::CheckSitemap.config.adapter.call(url)
28
+ end
29
+ end
30
+ end
31
+
32
+ protected
33
+
34
+ def queue
35
+ @queue ||= Queue.new
36
+ end
37
+ end
@@ -0,0 +1,20 @@
1
+ class CheckSitemap::Config # Runtime settings; each reader falls back to a default until its writer is called
2
+ attr_writer :verbose, :num_threads, :timeout, :adapter
3
+ # True only when verbose was set to a truthy value; gates CheckSitemap.log output.
4
+ def verbose?
5
+ !!@verbose
6
+ end
7
+ # Number of worker threads used when processing a sitemap index (default 1).
8
+ def num_threads
9
+ @num_threads ||= 1
10
+ end
11
+ # Limit passed to Timeout.timeout for each URL check, in seconds (default 30).
12
+ def timeout
13
+ @timeout ||= 30
14
+ end
15
+ # Object whose #call(url) is invoked for every URL of a URL set (default CheckSitemap::CheckURL).
16
+ def adapter
17
+ @adapter ||= CheckSitemap::CheckURL
18
+ end
19
+
20
+ end
@@ -0,0 +1,12 @@
1
+ module CheckSitemap
2
+ class Error < StandardError # Common ancestor of the error classes below
3
+ end
4
+ class HTTPNotFound < Error # Raised by XMLReader when open-uri reports an HTTP error for the sitemap
5
+ end
6
+ class FileNotFound < Error # Raised by XMLReader when the sitemap file does not exist (Errno::ENOENT)
7
+ end
8
+ class InvalidContentType < Error # Intended for sitemap locations whose extension is neither .xml nor .gz
9
+ end
10
+ class XMLSyntaxError < Error # Wraps Nokogiri::XML::SyntaxError raised while parsing the sitemap
11
+ end
12
+ end
@@ -0,0 +1,3 @@
1
+ module CheckSitemap
2
+ VERSION = '0.2.0'.freeze # Gem version; read by check-sitemap.gemspec as spec.version
3
+ end
@@ -0,0 +1,70 @@
1
+ class CheckSitemap::XMLReader
2
+ include Enumerable
3
+
4
+ def initialize(filename, options={})
5
+ @url_or_filename = filename
6
+ @options = {}
7
+ end
8
+
9
+ def sitemap_index?
10
+ doc.css('sitemapindex > sitemap').any?
11
+ end
12
+
13
+ def urlset?
14
+ doc.css('urlset > url').any?
15
+ end
16
+
17
+ def each(&block)
18
+ return results.to_enum(:each) unless block_given?
19
+
20
+ doc.css('loc').each do |loc|
21
+ block.call loc.content
22
+ end
23
+ end
24
+
25
+ protected
26
+
27
+ def doc
28
+ @doc ||= begin
29
+ CheckSitemap.log("Reading: '#{ @url_or_filename }'")
30
+ Nokogiri::XML(read_xml) { |config| config.strict }
31
+ rescue Nokogiri::XML::SyntaxError => e
32
+ raise(CheckSitemap::XMLSyntaxError.new(e.message))
33
+ end
34
+ end
35
+
36
+ def read_xml
37
+ @xml_file = begin
38
+ if gzip?
39
+ gz=Zlib::GzipReader.new(raw_file)
40
+ gz.read
41
+ elsif xml?
42
+ raw_file.read
43
+ end || raise(CheckSitemap::InvalidContentType.new("File format or MIME type is not supported: '#{mime_type(raw_file)}'"))
44
+ end
45
+ end
46
+
47
+ def raw_file
48
+ @raw_file ||= download!
49
+ end
50
+
51
+ def gzip?
52
+ File.extname(@url_or_filename) == '.gz'
53
+ end
54
+
55
+ def xml?
56
+ File.extname(@url_or_filename) == '.xml'
57
+ end
58
+
59
+ def download!
60
+ begin
61
+ CheckSitemap.log("Opening '#{ @url_or_filename }'")
62
+ open(@url_or_filename)
63
+ rescue Errno::ENOENT => e
64
+ raise CheckSitemap::FileNotFound.new("Missing filename '#{@url_or_filename}'")
65
+ rescue OpenURI::HTTPError => e
66
+ raise CheckSitemap::HTTPNotFound.new("HTTP request to '#{@url_or_filename}' failed")
67
+ end
68
+ end
69
+
70
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: check-sitemap
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Marcos G. Zimmermann
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-02-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.7'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.7'
55
+ description: Parse a sitemap.xml URLS and check status of each URL
56
+ email:
57
+ - mgzmaster@gmail.com
58
+ executables:
59
+ - checksitemap
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - ".rspec"
65
+ - ".travis.yml"
66
+ - Gemfile
67
+ - Gemfile.lock
68
+ - LICENSE
69
+ - README.md
70
+ - Rakefile
71
+ - bin/console
72
+ - bin/setup
73
+ - check-sitemap.gemspec
74
+ - exe/checksitemap
75
+ - lib/check-sitemap.rb
76
+ - lib/check_sitemap/check_url.rb
77
+ - lib/check_sitemap/command_line.rb
78
+ - lib/check_sitemap/config.rb
79
+ - lib/check_sitemap/errors.rb
80
+ - lib/check_sitemap/version.rb
81
+ - lib/check_sitemap/xml_reader.rb
82
+ homepage: https://github.com/marcosgz/check-sitemap
83
+ licenses:
84
+ - MIT
85
+ metadata: {}
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project:
102
+ rubygems_version: 2.5.1
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Check URLs in a sitemap.xml
106
+ test_files: []