sitemaps_parsers 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0dda6b2e07cbdbfde69a03ccd115184338819325da942d7551b5aaef037e3294
4
+ data.tar.gz: fc41c6b4b2af071ea02f407832234ccc7d362c35faa9910b6a0185d3dd838f3a
5
+ SHA512:
6
+ metadata.gz: f8035d70ae575afa224181cfd46a7d87bd9f6f676b3ea32c46b283026c0c7937b238739b0db7449bc81a0a04dde8121899433b2a91b58f5800aaa19daa0b7ed8
7
+ data.tar.gz: 738103fb141ce1b224d9b9c36ae768d32dac1d0ed93f672e0c2425e910cf7cb16e0e0415397ee5fbb4bef95794ea62009690a6767c85f05d4cd595bf1f1dc595
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.3
7
+ before_install: gem install bundler -v 1.17.3
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ gem 'nokogiri'
6
+ gem 'rspec'
7
+ # Specify your gem's dependencies in sitemaps_parsers.gemspec
8
+ gemspec
@@ -0,0 +1,42 @@
1
+ # SitemapsParsers
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/sitemaps_parsers`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'sitemaps_parsers'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install sitemaps_parsers
22
+
23
+ ## Usage
24
+ Create a new parser instance by passing the site's domain name, for example: `sp = SitemapParser.new('test.com')`
25
+
26
+ Extract the URLs of the sitemap:
27
+
28
+ sp.urls # => Array of urls from file sitemap.xml
29
+
30
+ sp.sitemap_path # => URL of the site's sitemap.xml, or a comma-separated list of the sitemaps named in robots.txt
31
+
32
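+ sp.list_nested_sitemap # => List of nested sitemaps found in the sitemap indexes named in robots.txt (nil until robots.txt has been read, e.g. by calling sp.sitemap_path)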
33
+
34
+ ## Development
35
+
36
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
37
+
38
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kmaksimovv/sitemaps_parsers.
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "sitemaps_parsers"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,100 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'uri'
4
+ require 'net/http'
5
+
6
+ require 'sitemaps_parsers/version'
7
+
8
# Discovers the sitemap(s) of a site and extracts the page URLs they list.
#
# The default location (/sitemap.xml over http and https, with and without a
# "www." prefix) is probed first. When none of those answers with HTTP 200,
# the "Sitemap:" entries of the site's robots.txt are used instead.
class SitemapParser
  # Matches one "Sitemap: <location>" line of a robots.txt file.
  ROBOTS_SITEMAP_PATTERN = /\s*sitemap:\s*([^\r\n]+)\s*$/i.freeze

  # domain              - the domain name given to the constructor.
  # robots_sitemap_path - sitemap locations listed in robots.txt; nil until
  #                       robots.txt was read and contained at least one entry.
  # list_nested_sitemap - <loc> entries of the sitemap indexes named in
  #                       robots.txt; nil until robots.txt has been processed.
  attr_reader :domain, :robots_sitemap_path, :list_nested_sitemap

  # @param domain [String] bare domain name, e.g. 'test.com'
  def initialize(domain)
    @domain = domain
    @url = URI.parse("http://#{@domain}")
  end

  # Collects every page URL listed by the site's sitemap(s).
  #
  # @return [Array<String>] page URLs; empty when no sitemap could be read
  def urls
    # Probe once: each probe costs up to four HTTP round trips.
    default_sitemap = check_default_sitemap
    return parse_sitemap(default_sitemap) if default_sitemap

    # robots_sitemap returns nil when robots.txt is missing or names no sitemap.
    parse_nested_sitemaps(Array(robots_sitemap))
  end

  # Location of the sitemap(s) of the site.
  #
  # @return [String, nil] URL of the default sitemap.xml, otherwise the
  #   robots.txt sitemap entries joined with ',', otherwise nil
  def sitemap_path
    # Read robots.txt first so the attribute readers are populated as a side effect.
    robots_sitemap
    check_default_sitemap&.to_s || robots_sitemap_path&.join(',')
  end

  private

  # First default sitemap location that answers with HTTP 200.
  #
  # @return [URI::HTTP, nil]
  def check_default_sitemap
    default_sitemap_candidates.find { |candidate| responds_ok?(candidate) }
  rescue StandardError
    nil
  end

  # Candidate sitemap.xml locations, in the order they are probed.
  def default_sitemap_candidates
    bases = [@url, "https://#{@domain}", "http://www.#{@domain}", "https://www.#{@domain}"]
    bases.map { |base| URI.join(base, 'sitemap.xml') }
  end

  # True when +uri+ answers with HTTP 200. A network failure on one candidate
  # counts as "not available" so the remaining candidates are still tried.
  def responds_ok?(uri)
    Net::HTTP.get_response(uri).code == '200'
  rescue StandardError
    false
  end

  # Reads robots.txt, remembers the sitemap locations it lists and resolves the
  # sitemap indexes among them.
  #
  # @return [Array<String>, nil] nested sitemap locations, or nil when
  #   robots.txt is unavailable, lists no sitemap, or an index cannot be read
  def robots_sitemap
    @robots_sitemap_path ||= extract_sitemap_paths(open_remote(URI.join(@url, 'robots.txt'), &:read))
    return nil unless @robots_sitemap_path

    @list_nested_sitemap ||= nested_sitemaps(@robots_sitemap_path)
  rescue StandardError
    nil
  end

  # Extracts the unique "Sitemap:" locations from the text of a robots.txt file.
  #
  # @param robots_txt [String]
  # @return [Array<String>, nil] nil when the file names no sitemap
  def extract_sitemap_paths(robots_txt)
    # Non-bang flatten: flatten! returns nil when there is nothing to flatten.
    # strip: the capture group swallows trailing blanks on the line.
    paths = robots_txt.scan(ROBOTS_SITEMAP_PATTERN).flatten.map(&:strip).uniq
    paths unless paths.empty?
  end

  # Lists the <loc> entries of every sitemap index in +sitemap_list+.
  # Locations that are plain url sets contribute nothing.
  def nested_sitemaps(sitemap_list = [])
    sitemap_list.flat_map do |path|
      open_remote(path) do |io|
        # The lenient HTML parser is used so the xpath needs no XML namespace.
        Nokogiri::HTML(io).xpath('//sitemapindex/sitemap/loc').map(&:text)
      end
    end
  end

  # Page URLs of every sitemap in +nested_sitemaps+; unreadable ones are skipped.
  def parse_nested_sitemaps(nested_sitemaps = [])
    Array(nested_sitemaps).flat_map do |sitemap|
      source = load_sitemap(sitemap)
      source ? filter_sitemap_urls(Nokogiri::XML(source)) : []
    end
  end

  # Opens a sitemap, transparently decompressing gzip content.
  #
  # @return [IO, Zlib::GzipReader, nil] nil when the location cannot be opened
  def load_sitemap(url = nil)
    gunzip_if_needed(open_remote(url))
  rescue StandardError
    nil
  end

  # Wraps +io+ in a gzip reader when its content is gzip-compressed.
  def gunzip_if_needed(io)
    Zlib::GzipReader.new(io)
  rescue StandardError
    # The failed gzip header check already consumed the start of the stream.
    io.rewind
    io
  end

  # Opens an http(s) location through open-uri.
  #
  # Locations come from remote robots.txt / sitemap files, so they are never
  # handed to Kernel#open, which would treat "|cmd" as a command and any other
  # string as a local file path.
  #
  # @raise [ArgumentError] when +location+ is not an http or https URL
  # @raise [URI::InvalidURIError] when +location+ is not a parseable URI
  def open_remote(location, &block)
    uri = location.is_a?(URI::Generic) ? location : URI.parse(location.to_s.strip)
    raise ArgumentError, "unsupported sitemap location: #{location.inspect}" unless uri.is_a?(URI::HTTP)

    uri.open(&block)
  end

  # Stripped <loc> text of every <url> element; entries without <loc> are skipped.
  def filter_sitemap_urls(sitemap_data)
    sitemap_data.search('url').map { |url| url.at('loc')&.content&.strip }.compact
  end

  # Page URLs of the sitemap at +url+, following one level of sitemap index.
  # Failing to open +url+ itself raises; unreadable child sitemaps are skipped.
  #
  # @return [Array<String>] empty when the document is neither a url set nor an index
  def parse_sitemap(url)
    sitemap_data = Nokogiri::XML(gunzip_if_needed(open_remote(url)))

    if (urlset = sitemap_data.at('urlset'))
      filter_sitemap_urls(urlset)
    elsif (index = sitemap_data.at('sitemapindex'))
      child_locations = index.search('sitemap').map { |sitemap| sitemap.at('loc')&.content&.strip }.compact
      parse_nested_sitemaps(child_locations)
    else
      []
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SitemapsParsers
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,26 @@
1
# Gem specification for sitemaps_parsers.
# lib/ is put on the load path so the version constant can be required below.
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'sitemaps_parsers/version'

Gem::Specification.new do |spec|
  spec.name = 'sitemaps_parsers'
  spec.version = SitemapsParsers::VERSION
  spec.authors = ['Maksimov Konstantin']
  spec.email = ['kmakimovv@gmail.com']
  spec.license = 'MIT'
  spec.summary = 'SitemapsParsers - gem for parsing file sitemap.xml and gets all urls from sitemap.xml '
  spec.description = 'SitemapsParsers - parser for XML sitemaps which respects sitemaps listed in robots.txt and handles gziped and nested sitemaps as well'
  spec.homepage = 'https://github.com/kmaksimovv/sitemaps_parsers'

  # Package every git-tracked file except the test directories.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # nokogiri is required by lib/sitemaps_parsers.rb at load time, so it must be
  # a runtime dependency; as a development dependency it is not installed
  # together with the gem.
  spec.add_dependency 'nokogiri', '~> 1.10', '>= 1.10.3'

  spec.add_development_dependency 'bundler', '~> 1.17'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemaps_parsers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Maksimov Konstantin
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-07-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.10.3
37
+ type: :development
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '1.10'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.10.3
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '10.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.0'
75
+ description: SitemapsParsers - parser for XML sitemaps which respects sitemaps listed
76
+ in robots.txt and handles gziped and nested sitemaps as well
77
+ email:
78
+ - kmakimovv@gmail.com
79
+ executables: []
80
+ extensions: []
81
+ extra_rdoc_files: []
82
+ files:
83
+ - ".gitignore"
84
+ - ".rspec"
85
+ - ".travis.yml"
86
+ - Gemfile
87
+ - README.md
88
+ - Rakefile
89
+ - bin/console
90
+ - bin/setup
91
+ - lib/sitemaps_parsers.rb
92
+ - lib/sitemaps_parsers/version.rb
93
+ - sitemaps_parsers.gemspec
94
+ homepage: https://github.com/kmaksimovv/sitemaps_parsers
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubygems_version: 3.0.3
114
+ signing_key:
115
+ specification_version: 4
116
+ summary: SitemapsParsers - gem for parsing file sitemap.xml and gets all urls from
117
+ sitemap.xml
118
+ test_files: []