sitemaps_parsers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0dda6b2e07cbdbfde69a03ccd115184338819325da942d7551b5aaef037e3294
4
+ data.tar.gz: fc41c6b4b2af071ea02f407832234ccc7d362c35faa9910b6a0185d3dd838f3a
5
+ SHA512:
6
+ metadata.gz: f8035d70ae575afa224181cfd46a7d87bd9f6f676b3ea32c46b283026c0c7937b238739b0db7449bc81a0a04dde8121899433b2a91b58f5800aaa19daa0b7ed8
7
+ data.tar.gz: 738103fb141ce1b224d9b9c36ae768d32dac1d0ed93f672e0c2425e910cf7cb16e0e0415397ee5fbb4bef95794ea62009690a6767c85f05d4cd595bf1f1dc595
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.3
7
+ before_install: gem install bundler -v 1.17.3
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ gem 'nokogiri'
6
+ gem 'rspec'
7
+ # Specify your gem's dependencies in sitemaps_parsers.gemspec
8
+ gemspec
@@ -0,0 +1,42 @@
1
+ # SitemapsParsers
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/sitemaps_parsers`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'sitemaps_parsers'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install sitemaps_parsers
22
+
23
+ ## Usage
24
+ Create a new parser instance by passing the site's domain name (without the scheme), for example: `sp = SitemapParser.new('test.com')`
25
+
26
+ Extract the URLs of the sitemap:
27
+
28
+ sp.urls # => Array of the URLs listed in the site's sitemap(s)
29
+
30
+ sp.sitemap_path # => URL of the site's sitemap.xml or, if there is none, the comma-separated sitemap URLs listed in robots.txt
31
+
32
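+ sp.list_nested_sitemap # => Array of the nested sitemap URLs found in the sitemap indexes listed in robots.txt (nil until robots.txt has been read, e.g. by calling `sp.sitemap_path`)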
33
+
34
+ ## Development
35
+
36
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
37
+
38
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kmaksimovv/sitemaps_parsers.
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "sitemaps_parsers"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,100 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'uri'
4
+ require 'net/http'
5
+
6
+ require 'sitemaps_parsers/version'
7
+
8
# Locates the XML sitemaps of a site and extracts the page URLs they list.
#
# The conventional +/sitemap.xml+ location is probed first (bare and +www.+
# host, over HTTP and HTTPS). When none of those answers with HTTP 200, the
# +Sitemap:+ directives of the site's +robots.txt+ are used instead. Sitemap
# index files are followed one level deep and gzip-compressed sitemaps are
# decompressed transparently.
#
# All network access is best-effort: a sitemap that cannot be fetched simply
# contributes no URLs.
class SitemapParser
  # One +Sitemap:+ directive per robots.txt line. Anchored at the start of the
  # line so that commented-out directives ("# Sitemap: ...") are ignored.
  SITEMAP_DIRECTIVE = /^[ \t]*sitemap:[ \t]*([^\r\n]+?)[ \t]*\r?$/i.freeze

  # First two bytes of every gzip stream.
  GZIP_MAGIC = "\x1F\x8B".b.freeze

  # @return [String] the domain given to the constructor
  # @return [Array<String>, nil] sitemap locations listed in robots.txt;
  #   nil until robots.txt has been read successfully
  # @return [Array<String>, nil] sitemap locations found inside the sitemap
  #   index files listed in robots.txt; nil until robots.txt has been read
  attr_reader :domain, :robots_sitemap_path, :list_nested_sitemap

  # @param domain [String] host name of the site, without a scheme
  def initialize(domain)
    @domain = domain
    @url = URI.parse("http://#{@domain}")
  end

  # Collects every page URL listed in the site's sitemap(s).
  #
  # @return [Array<String>] page URLs; empty when no sitemap could be read
  def urls
    # Probe only once: every probe costs up to four HTTP requests.
    default_sitemap = check_default_sitemap
    return parse_sitemap(default_sitemap) if default_sitemap

    nested = robots_sitemap || []
    # Sitemaps listed in robots.txt may be plain url sets as well as indexes,
    # so both the listed files and their nested sitemaps are parsed.
    parse_nested_sitemaps((Array(robots_sitemap_path) + nested).uniq)
  end

  # Reports where the site's sitemap lives.
  #
  # @return [String, nil] URL of the default sitemap.xml when one exists,
  #   otherwise the comma-separated sitemap locations from robots.txt, or
  #   nil when neither is available
  def sitemap_path
    robots_sitemap
    default_sitemap = check_default_sitemap
    return default_sitemap.to_s if default_sitemap

    listed = Array(robots_sitemap_path)
    listed.join(',') unless listed.empty?
  end

  private

  # Finds the first conventional sitemap.xml location answering with HTTP 200.
  #
  # @return [URI::HTTP, nil]
  def check_default_sitemap
    default_sitemap_candidates.find { |candidate| sitemap_available?(candidate) }
  rescue StandardError
    nil
  end

  # Candidate sitemap.xml locations, in the order they are probed.
  #
  # @return [Array<URI::HTTP>]
  def default_sitemap_candidates
    [
      "http://#{@domain}",
      "https://#{@domain}",
      "http://www.#{@domain}",
      "https://www.#{@domain}"
    ].map { |base| URI.join(base, 'sitemap.xml') }
  end

  # Tells whether +uri+ answers with HTTP 200. A connection failure counts as
  # "not available" so that the remaining candidates are still probed.
  #
  # @param uri [URI::HTTP]
  # @return [Boolean]
  def sitemap_available?(uri)
    Net::HTTP.get_response(uri).code == '200'
  rescue StandardError
    false
  end

  # Reads robots.txt (once) and records both the sitemaps it lists and the
  # sitemaps nested inside any listed sitemap index.
  #
  # @return [Array<String>, nil] the nested sitemap locations, or nil when
  #   robots.txt could not be read
  def robots_sitemap
    @robots_sitemap_path ||= extract_sitemap_paths(fetch_robots_txt)
    @list_nested_sitemap ||= nested_sitemaps(@robots_sitemap_path)
  rescue StandardError
    nil
  end

  # Downloads the body of the site's robots.txt.
  #
  # @return [String]
  def fetch_robots_txt
    resolve_location('robots.txt').open(&:read)
  end

  # Pulls the distinct +Sitemap:+ locations out of a robots.txt body.
  #
  # @param robots_txt [String, nil]
  # @return [Array<String>] empty when there are no directives
  def extract_sitemap_paths(robots_txt)
    robots_txt.to_s.scan(SITEMAP_DIRECTIVE).flatten.uniq
  end

  # Lists the sitemap locations referenced by the given sitemap index files.
  # Entries that cannot be fetched, or are not indexes, contribute nothing.
  #
  # @param sitemap_list [Array<String>]
  # @return [Array<String>]
  def nested_sitemaps(sitemap_list = [])
    Array(sitemap_list).flat_map do |path|
      source = load_sitemap(path)
      source ? sitemap_index_locations(Nokogiri::XML(source)) : []
    end
  end

  # Extracts the page URLs of every sitemap in the list.
  #
  # @param nested_sitemaps [Array<String>] sitemap locations
  # @return [Array<String>]
  def parse_nested_sitemaps(nested_sitemaps = [])
    Array(nested_sitemaps).flat_map do |sitemap|
      source = load_sitemap(sitemap)
      source ? filter_sitemap_urls(Nokogiri::XML(source)) : []
    end
  end

  # Fetches a sitemap and returns a readable stream of its XML, decompressing
  # it when it is gzip-encoded.
  #
  # @param url [String, URI]
  # @return [IO, StringIO, Zlib::GzipReader, nil] nil when fetching fails
  def load_sitemap(url = nil)
    gunzip_if_needed(resolve_location(url).open)
  rescue StandardError
    nil
  end

  # Turns a sitemap location into an absolute HTTP(S) URI.
  #
  # Locations come from robots.txt and from remote XML, i.e. from untrusted
  # input, so they are never handed to Kernel#open (which would execute a
  # "|command" string) and only http/https targets are accepted. Relative
  # locations are resolved against the site's base URL.
  #
  # @param location [String, URI]
  # @return [URI::HTTP]
  # @raise [ArgumentError] when the location is not an http(s) URL
  # @raise [URI::InvalidURIError] when the location is not a valid URI
  def resolve_location(location)
    uri = URI.join(@url, location.to_s.strip)
    return uri if uri.is_a?(URI::HTTP)

    raise ArgumentError, "unsupported sitemap location: #{location}"
  end

  # Wraps +io+ in a gzip reader when its content starts with the gzip magic
  # bytes. The stream is rewound after sniffing so that a plain XML sitemap
  # is still read from its first byte.
  #
  # @param io [#read, #rewind]
  # @return [Zlib::GzipReader, Object] the reader, or +io+ itself
  def gunzip_if_needed(io)
    magic = io.read(2)
    io.rewind
    magic && magic.b == GZIP_MAGIC ? Zlib::GzipReader.new(io) : io
  end

  # Collects the +loc+ of every +url+ entry below the given XML node.
  # Entries without a +loc+ are skipped.
  #
  # @param sitemap_data [Nokogiri::XML::Node]
  # @return [Array<String>]
  def filter_sitemap_urls(sitemap_data)
    sitemap_data.search('url').map { |url| url.at('loc')&.content&.strip }.compact
  end

  # Collects the +loc+ of every +sitemap+ entry of a sitemap index document.
  #
  # @param document [Nokogiri::XML::Node]
  # @return [Array<String>] empty when the document is not a sitemap index
  def sitemap_index_locations(document)
    index = document.at('sitemapindex')
    return [] unless index

    index.search('sitemap').map { |entry| entry.at('loc')&.content&.strip }.compact
  end

  # Extracts the page URLs of one sitemap, which may be either a url set or
  # a sitemap index (whose child sitemaps are then fetched and parsed).
  #
  # @param url [String, URI]
  # @return [Array<String>] empty when the sitemap cannot be read
  def parse_sitemap(url)
    source = load_sitemap(url)
    return [] unless source

    sitemap_data = Nokogiri::XML(source)
    urlset = sitemap_data.at('urlset')
    return filter_sitemap_urls(urlset) if urlset

    parse_nested_sitemaps(sitemap_index_locations(sitemap_data))
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SitemapsParsers
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,26 @@
1
# Gem specification for sitemaps_parsers.
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'sitemaps_parsers/version'

Gem::Specification.new do |spec|
  spec.name = 'sitemaps_parsers'
  spec.version = SitemapsParsers::VERSION
  spec.authors = ['Maksimov Konstantin']
  spec.email = ['kmakimovv@gmail.com']
  spec.license = 'MIT'
  spec.summary = 'SitemapsParsers - gem for parsing file sitemap.xml and gets all urls from sitemap.xml '
  spec.description = 'SitemapsParsers - parser for XML sitemaps which respects sitemaps listed in robots.txt and handles gziped and nested sitemaps as well'
  spec.homepage = 'https://github.com/kmaksimovv/sitemaps_parsers'

  # Package every file tracked by git except tests.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # lib/sitemaps_parsers.rb requires nokogiri at load time, so it has to be
  # installed together with the gem, not only in development.
  spec.add_dependency 'nokogiri', '~> 1.10', '>= 1.10.3'

  spec.add_development_dependency 'bundler', '~> 1.17'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sitemaps_parsers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Maksimov Konstantin
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-07-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.10.3
37
+ type: :development
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '1.10'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.10.3
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '10.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.0'
75
+ description: SitemapsParsers - parser for XML sitemaps which respects sitemaps listed
76
+ in robots.txt and handles gziped and nested sitemaps as well
77
+ email:
78
+ - kmakimovv@gmail.com
79
+ executables: []
80
+ extensions: []
81
+ extra_rdoc_files: []
82
+ files:
83
+ - ".gitignore"
84
+ - ".rspec"
85
+ - ".travis.yml"
86
+ - Gemfile
87
+ - README.md
88
+ - Rakefile
89
+ - bin/console
90
+ - bin/setup
91
+ - lib/sitemaps_parsers.rb
92
+ - lib/sitemaps_parsers/version.rb
93
+ - sitemaps_parsers.gemspec
94
+ homepage: https://github.com/kmaksimovv/sitemaps_parsers
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubygems_version: 3.0.3
114
+ signing_key:
115
+ specification_version: 4
116
+ summary: SitemapsParsers - gem for parsing file sitemap.xml and gets all urls from
117
+ sitemap.xml
118
+ test_files: []