sitemap-parser 0.2.1 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/sitemap-parser/version.rb +5 -0
- data/lib/sitemap-parser.rb +50 -19
- metadata +66 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b98303437a057f9f3cdbe7ef5ef3712ed6534e8d9e228364a89bac75488e27df
|
4
|
+
data.tar.gz: 380dfe60cde2614823adaaa87520306b5689424d882f6134d80920b6947f717e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6afba1d0068955ccf40bb147ddb97349dd8b04499dbe755b04e03b89b8188ab0a37bf618ca96ad49908f4a850fd230d3c00ff24066881044d12e222ef415221
|
7
|
+
data.tar.gz: 43e3baffe31bb3dc672d2c73c8002a2707721551be1efd1314a03818038b4a5afb3350ad8c507dfc884c25501982c84297f4c20a207019dc965e9b27a3b82941
|
data/lib/sitemap-parser.rb
CHANGED
@@ -1,46 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'typhoeus'
|
5
|
+
require 'zlib'
|
6
|
+
require_relative 'sitemap-parser/version'
|
3
7
|
|
4
8
|
class SitemapParser
|
5
|
-
|
6
9
|
def initialize(url, opts = {})
|
7
10
|
@url = url
|
8
|
-
@options = {:
|
11
|
+
@options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
|
9
12
|
end
|
10
13
|
|
11
14
|
def raw_sitemap
|
12
15
|
@raw_sitemap ||= begin
|
13
|
-
if
|
14
|
-
|
16
|
+
if /\Ahttp/i.match?(@url)
|
17
|
+
request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
|
18
|
+
request = Typhoeus::Request.new(@url, request_options)
|
15
19
|
request.on_complete do |response|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
return nil
|
20
|
-
end
|
20
|
+
raise "HTTP request to #{@url} failed" unless response.success?
|
21
|
+
|
22
|
+
return inflate_body_if_needed(response)
|
21
23
|
end
|
22
24
|
request.run
|
23
|
-
elsif File.exist?(@url) && @url =~
|
24
|
-
open(@url
|
25
|
+
elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
|
26
|
+
File.open(@url, &:read)
|
25
27
|
end
|
26
28
|
end
|
27
29
|
end
|
28
30
|
|
29
31
|
def sitemap
|
30
32
|
@sitemap ||= Nokogiri::XML(raw_sitemap)
|
31
|
-
rescue
|
32
|
-
nil
|
33
33
|
end
|
34
34
|
|
35
35
|
def urls
|
36
|
-
sitemap.at(
|
37
|
-
|
38
|
-
|
36
|
+
if sitemap.at('urlset')
|
37
|
+
filter_sitemap_urls(sitemap.at('urlset').search('url'))
|
38
|
+
elsif sitemap.at('sitemapindex')
|
39
|
+
found_urls = []
|
40
|
+
if @options[:recurse]
|
41
|
+
urls = sitemap.at('sitemapindex').search('sitemap')
|
42
|
+
filter_sitemap_urls(urls).each do |sitemap|
|
43
|
+
child_sitemap_location = sitemap.at('loc').content
|
44
|
+
found_urls << self.class.new(child_sitemap_location, recurse: false).urls
|
45
|
+
end
|
46
|
+
end
|
47
|
+
found_urls.flatten
|
48
|
+
else
|
49
|
+
raise 'Malformed sitemap, no urlset'
|
50
|
+
end
|
39
51
|
end
|
40
52
|
|
41
53
|
def to_a
|
42
|
-
urls.map { |url| url.at(
|
43
|
-
rescue
|
44
|
-
|
54
|
+
urls.map { |url| url.at('loc').content }
|
55
|
+
rescue NoMethodError
|
56
|
+
raise 'Malformed sitemap, url without loc'
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def filter_sitemap_urls(urls)
|
62
|
+
return urls if @options[:url_regex].nil?
|
63
|
+
|
64
|
+
urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
|
65
|
+
end
|
66
|
+
|
67
|
+
def inflate_body_if_needed(response)
|
68
|
+
return response.body unless response.headers
|
69
|
+
|
70
|
+
case response.headers['Content-Type']
|
71
|
+
when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
|
72
|
+
Zlib.gunzip(response.body)
|
73
|
+
else
|
74
|
+
response.body
|
75
|
+
end
|
45
76
|
end
|
46
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-12-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -28,16 +28,36 @@ dependencies:
|
|
28
28
|
name: typhoeus
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0.6'
|
34
|
+
- - "<"
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '2.0'
|
34
37
|
type: :runtime
|
35
38
|
prerelease: false
|
36
39
|
version_requirements: !ruby/object:Gem::Requirement
|
37
40
|
requirements:
|
38
|
-
- - "
|
41
|
+
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
40
43
|
version: '0.6'
|
44
|
+
- - "<"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: minitest
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '4.7'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '4.7'
|
41
61
|
- !ruby/object:Gem::Dependency
|
42
62
|
name: rake
|
43
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -53,47 +73,75 @@ dependencies:
|
|
53
73
|
- !ruby/object:Gem::Version
|
54
74
|
version: '10.4'
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
76
|
+
name: rubocop
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
79
|
- - "~>"
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
81
|
+
version: '0.80'
|
62
82
|
type: :development
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
86
|
- - "~>"
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
88
|
+
version: '0.80'
|
69
89
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
90
|
+
name: rubocop-minitest
|
71
91
|
requirement: !ruby/object:Gem::Requirement
|
72
92
|
requirements:
|
73
93
|
- - "~>"
|
74
94
|
- !ruby/object:Gem::Version
|
75
|
-
version: '1
|
95
|
+
version: '0.1'
|
76
96
|
type: :development
|
77
97
|
prerelease: false
|
78
98
|
version_requirements: !ruby/object:Gem::Requirement
|
79
99
|
requirements:
|
80
100
|
- - "~>"
|
81
101
|
- !ruby/object:Gem::Version
|
82
|
-
version: '1
|
102
|
+
version: '0.1'
|
83
103
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
104
|
+
name: rubocop-performance
|
85
105
|
requirement: !ruby/object:Gem::Requirement
|
86
106
|
requirements:
|
87
107
|
- - "~>"
|
88
108
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
109
|
+
version: '1.5'
|
90
110
|
type: :development
|
91
111
|
prerelease: false
|
92
112
|
version_requirements: !ruby/object:Gem::Requirement
|
93
113
|
requirements:
|
94
114
|
- - "~>"
|
95
115
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
116
|
+
version: '1.5'
|
117
|
+
- !ruby/object:Gem::Dependency
|
118
|
+
name: shoulda
|
119
|
+
requirement: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - "~>"
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '3.5'
|
124
|
+
type: :development
|
125
|
+
prerelease: false
|
126
|
+
version_requirements: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - "~>"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '3.5'
|
131
|
+
- !ruby/object:Gem::Dependency
|
132
|
+
name: test-unit
|
133
|
+
requirement: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - "~>"
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '3.1'
|
138
|
+
type: :development
|
139
|
+
prerelease: false
|
140
|
+
version_requirements: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '3.1'
|
97
145
|
description: Ruby Gem to parse sitemaps.org compliant sitemaps.
|
98
146
|
email: ben.balter@github.com
|
99
147
|
executables: []
|
@@ -101,11 +149,12 @@ extensions: []
|
|
101
149
|
extra_rdoc_files: []
|
102
150
|
files:
|
103
151
|
- lib/sitemap-parser.rb
|
152
|
+
- lib/sitemap-parser/version.rb
|
104
153
|
homepage: https://github.com/benbalter/sitemap-parser
|
105
154
|
licenses:
|
106
155
|
- MIT
|
107
156
|
metadata: {}
|
108
|
-
post_install_message:
|
157
|
+
post_install_message:
|
109
158
|
rdoc_options: []
|
110
159
|
require_paths:
|
111
160
|
- lib
|
@@ -120,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
120
169
|
- !ruby/object:Gem::Version
|
121
170
|
version: '0'
|
122
171
|
requirements: []
|
123
|
-
|
124
|
-
|
125
|
-
signing_key:
|
172
|
+
rubygems_version: 3.2.22
|
173
|
+
signing_key:
|
126
174
|
specification_version: 4
|
127
175
|
summary: Ruby Gem to parse sitemaps.org compliant sitemaps
|
128
176
|
test_files: []
|