sitemap-parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/sitemap-parser.rb +37 -16
- metadata +51 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8eddc8204f36a00114c47cbee1a9e1bf884fda408e51383e3af615c7811b1a25
|
4
|
+
data.tar.gz: 10fde984cc652d03edf863ee8c6cc03a531b6cadaa1e37de49b2d2095d3d4f46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec5cad59821bdc90a202fca1ac7b514c34be55c4addd3948ba3b9fe86d68170dbb773836d8eb094f1b7625ee4112ea9903693edabb395b2951fa638a98d3083b
|
7
|
+
data.tar.gz: d2ffb35d82995f2e031afb304350f88a726db081937e9b955b689f774bf3c7e5a4ae8da132c95d6ef2f6edad5326a95a079194c048f01ae4e91d7878d0c51066
|
data/lib/sitemap-parser.rb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'typhoeus'
|
5
|
+
require 'zlib'
|
6
|
+
require_relative 'sitemap-parser/version'
|
3
7
|
|
4
8
|
class SitemapParser
|
5
|
-
|
6
9
|
def initialize(url, opts = {})
|
7
10
|
@url = url
|
8
|
-
@options = {:
|
11
|
+
@options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
|
9
12
|
end
|
10
13
|
|
11
14
|
def raw_sitemap
|
12
15
|
@raw_sitemap ||= begin
|
13
|
-
if
|
14
|
-
request_options = @options.dup.tap { |opts| opts.delete(:recurse) }
|
16
|
+
if /\Ahttp/i.match?(@url)
|
17
|
+
request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
|
15
18
|
request = Typhoeus::Request.new(@url, request_options)
|
16
19
|
request.on_complete do |response|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
raise "HTTP request to #{@url} failed"
|
21
|
-
end
|
20
|
+
raise "HTTP request to #{@url} failed" unless response.success?
|
21
|
+
|
22
|
+
return inflate_body_if_needed(response)
|
22
23
|
end
|
23
24
|
request.run
|
24
|
-
elsif File.exist?(@url) && @url =~
|
25
|
-
open(@url
|
25
|
+
elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
|
26
|
+
File.open(@url, &:read)
|
26
27
|
end
|
27
28
|
end
|
28
29
|
end
|
@@ -33,24 +34,44 @@ class SitemapParser
|
|
33
34
|
|
34
35
|
def urls
|
35
36
|
if sitemap.at('urlset')
|
36
|
-
sitemap.at(
|
37
|
+
filter_sitemap_urls(sitemap.at('urlset').search('url'))
|
37
38
|
elsif sitemap.at('sitemapindex')
|
38
39
|
found_urls = []
|
39
40
|
if @options[:recurse]
|
40
|
-
sitemap.at('sitemapindex').search('sitemap')
|
41
|
+
urls = sitemap.at('sitemapindex').search('sitemap')
|
42
|
+
filter_sitemap_urls(urls).each do |sitemap|
|
41
43
|
child_sitemap_location = sitemap.at('loc').content
|
42
|
-
found_urls << self.class.new(child_sitemap_location, :
|
44
|
+
found_urls << self.class.new(child_sitemap_location, recurse: false).urls
|
43
45
|
end
|
44
46
|
end
|
45
|
-
|
47
|
+
found_urls.flatten
|
46
48
|
else
|
47
49
|
raise 'Malformed sitemap, no urlset'
|
48
50
|
end
|
49
51
|
end
|
50
52
|
|
51
53
|
def to_a
|
52
|
-
urls.map { |url| url.at(
|
54
|
+
urls.map { |url| url.at('loc').content }
|
53
55
|
rescue NoMethodError
|
54
56
|
raise 'Malformed sitemap, url without loc'
|
55
57
|
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def filter_sitemap_urls(urls)
|
62
|
+
return urls if @options[:url_regex].nil?
|
63
|
+
|
64
|
+
urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
|
65
|
+
end
|
66
|
+
|
67
|
+
def inflate_body_if_needed(response)
|
68
|
+
return response.body unless response.headers
|
69
|
+
|
70
|
+
case response.headers['Content-Type']
|
71
|
+
when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
|
72
|
+
Zlib.gunzip(response.body)
|
73
|
+
else
|
74
|
+
response.body
|
75
|
+
end
|
76
|
+
end
|
56
77
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemap-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Balter
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-12-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -28,16 +28,36 @@ dependencies:
|
|
28
28
|
name: typhoeus
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0.6'
|
34
|
+
- - "<"
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '2.0'
|
34
37
|
type: :runtime
|
35
38
|
prerelease: false
|
36
39
|
version_requirements: !ruby/object:Gem::Requirement
|
37
40
|
requirements:
|
38
|
-
- - "
|
41
|
+
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
40
43
|
version: '0.6'
|
44
|
+
- - "<"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.0'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: minitest
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '4.7'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '4.7'
|
41
61
|
- !ruby/object:Gem::Dependency
|
42
62
|
name: rake
|
43
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -53,47 +73,61 @@ dependencies:
|
|
53
73
|
- !ruby/object:Gem::Version
|
54
74
|
version: '10.4'
|
55
75
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
76
|
+
name: rubocop
|
57
77
|
requirement: !ruby/object:Gem::Requirement
|
58
78
|
requirements:
|
59
79
|
- - "~>"
|
60
80
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
81
|
+
version: '0.80'
|
62
82
|
type: :development
|
63
83
|
prerelease: false
|
64
84
|
version_requirements: !ruby/object:Gem::Requirement
|
65
85
|
requirements:
|
66
86
|
- - "~>"
|
67
87
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
88
|
+
version: '0.80'
|
69
89
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
90
|
+
name: rubocop-minitest
|
71
91
|
requirement: !ruby/object:Gem::Requirement
|
72
92
|
requirements:
|
73
93
|
- - "~>"
|
74
94
|
- !ruby/object:Gem::Version
|
75
|
-
version: '1
|
95
|
+
version: '0.1'
|
76
96
|
type: :development
|
77
97
|
prerelease: false
|
78
98
|
version_requirements: !ruby/object:Gem::Requirement
|
79
99
|
requirements:
|
80
100
|
- - "~>"
|
81
101
|
- !ruby/object:Gem::Version
|
82
|
-
version: '1
|
102
|
+
version: '0.1'
|
83
103
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
104
|
+
name: rubocop-performance
|
85
105
|
requirement: !ruby/object:Gem::Requirement
|
86
106
|
requirements:
|
87
107
|
- - "~>"
|
88
108
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
109
|
+
version: '1.5'
|
90
110
|
type: :development
|
91
111
|
prerelease: false
|
92
112
|
version_requirements: !ruby/object:Gem::Requirement
|
93
113
|
requirements:
|
94
114
|
- - "~>"
|
95
115
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
116
|
+
version: '1.5'
|
117
|
+
- !ruby/object:Gem::Dependency
|
118
|
+
name: shoulda
|
119
|
+
requirement: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - "~>"
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '3.5'
|
124
|
+
type: :development
|
125
|
+
prerelease: false
|
126
|
+
version_requirements: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - "~>"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '3.5'
|
97
131
|
- !ruby/object:Gem::Dependency
|
98
132
|
name: test-unit
|
99
133
|
requirement: !ruby/object:Gem::Requirement
|
@@ -119,7 +153,7 @@ homepage: https://github.com/benbalter/sitemap-parser
|
|
119
153
|
licenses:
|
120
154
|
- MIT
|
121
155
|
metadata: {}
|
122
|
-
post_install_message:
|
156
|
+
post_install_message:
|
123
157
|
rdoc_options: []
|
124
158
|
require_paths:
|
125
159
|
- lib
|
@@ -134,9 +168,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
168
|
- !ruby/object:Gem::Version
|
135
169
|
version: '0'
|
136
170
|
requirements: []
|
137
|
-
|
138
|
-
|
139
|
-
signing_key:
|
171
|
+
rubygems_version: 3.1.4
|
172
|
+
signing_key:
|
140
173
|
specification_version: 4
|
141
174
|
summary: Ruby Gem to parse sitemaps.org compliant sitemaps
|
142
175
|
test_files: []
|