sitemap-parser 0.2.1 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d03dd8db51f6b4d2f7fc07f1bf82a8208e09879e
4
- data.tar.gz: 36e811e0184838fdb0332fc5914b5ef232f6ee87
2
+ SHA256:
3
+ metadata.gz: b98303437a057f9f3cdbe7ef5ef3712ed6534e8d9e228364a89bac75488e27df
4
+ data.tar.gz: 380dfe60cde2614823adaaa87520306b5689424d882f6134d80920b6947f717e
5
5
  SHA512:
6
- metadata.gz: 49c136b2f86b00dffbf9c565a7284ea1349a68f752dae6e1cab96b8135ab69955091f9fbb2fbd2d32880d3bc5cde7b8a29bf8eca42c2e954f09a4f4a306d7e76
7
- data.tar.gz: d5241537a66a454a6097a6c88bc3e1e643535c0a085d98f935c13aea902c5b5bf094042fbc2a140c437d548e1e9e9864dda05f21a42113452ce811499b7bd7cc
6
+ metadata.gz: d6afba1d0068955ccf40bb147ddb97349dd8b04499dbe755b04e03b89b8188ab0a37bf618ca96ad49908f4a850fd230d3c00ff24066881044d12e222ef415221
7
+ data.tar.gz: 43e3baffe31bb3dc672d2c73c8002a2707721551be1efd1314a03818038b4a5afb3350ad8c507dfc884c25501982c84297f4c20a207019dc965e9b27a3b82941
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class SitemapParser
4
+ VERSION = '0.5.6'
5
+ end
@@ -1,46 +1,77 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
9
  def initialize(url, opts = {})
7
10
  @url = url
8
- @options = {:followlocation => true}.merge(opts)
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
9
12
  end
10
13
 
11
14
  def raw_sitemap
12
15
  @raw_sitemap ||= begin
13
- if @url =~ /\Ahttp/i
14
- request = Typhoeus::Request.new(@url, followlocation: @options[:followlocation])
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
18
+ request = Typhoeus::Request.new(@url, request_options)
15
19
  request.on_complete do |response|
16
- if response.success?
17
- return response.body
18
- else
19
- return nil
20
- end
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
21
23
  end
22
24
  request.run
23
- elsif File.exist?(@url) && @url =~ /[\\\/]sitemap\.xml\Z/i
24
- open(@url) { |f| f.read }
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
25
27
  end
26
28
  end
27
29
  end
28
30
 
29
31
  def sitemap
30
32
  @sitemap ||= Nokogiri::XML(raw_sitemap)
31
- rescue
32
- nil
33
33
  end
34
34
 
35
35
  def urls
36
- sitemap.at("urlset").search("url")
37
- rescue
38
- nil
36
+ if sitemap.at('urlset')
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
38
+ elsif sitemap.at('sitemapindex')
39
+ found_urls = []
40
+ if @options[:recurse]
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
43
+ child_sitemap_location = sitemap.at('loc').content
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
45
+ end
46
+ end
47
+ found_urls.flatten
48
+ else
49
+ raise 'Malformed sitemap, no urlset'
50
+ end
39
51
  end
40
52
 
41
53
  def to_a
42
- urls.map { |url| url.at("loc").content }
43
- rescue
44
- []
54
+ urls.map { |url| url.at('loc').content }
55
+ rescue NoMethodError
56
+ raise 'Malformed sitemap, url without loc'
57
+ end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
45
76
  end
46
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.5.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-05 00:00:00.000000000 Z
11
+ date: 2021-12-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -28,16 +28,36 @@ dependencies:
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.6'
34
+ - - "<"
35
+ - !ruby/object:Gem::Version
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
- - - "~>"
41
+ - - ">="
39
42
  - !ruby/object:Gem::Version
40
43
  version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
41
61
  - !ruby/object:Gem::Dependency
42
62
  name: rake
43
63
  requirement: !ruby/object:Gem::Requirement
@@ -53,47 +73,75 @@ dependencies:
53
73
  - !ruby/object:Gem::Version
54
74
  version: '10.4'
55
75
  - !ruby/object:Gem::Dependency
56
- name: shoulda
76
+ name: rubocop
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: '3.5'
81
+ version: '0.80'
62
82
  type: :development
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: '3.5'
88
+ version: '0.80'
69
89
  - !ruby/object:Gem::Dependency
70
- name: bundler
90
+ name: rubocop-minitest
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
93
  - - "~>"
74
94
  - !ruby/object:Gem::Version
75
- version: '1.6'
95
+ version: '0.1'
76
96
  type: :development
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
100
  - - "~>"
81
101
  - !ruby/object:Gem::Version
82
- version: '1.6'
102
+ version: '0.1'
83
103
  - !ruby/object:Gem::Dependency
84
- name: minitest
104
+ name: rubocop-performance
85
105
  requirement: !ruby/object:Gem::Requirement
86
106
  requirements:
87
107
  - - "~>"
88
108
  - !ruby/object:Gem::Version
89
- version: '4.7'
109
+ version: '1.5'
90
110
  type: :development
91
111
  prerelease: false
92
112
  version_requirements: !ruby/object:Gem::Requirement
93
113
  requirements:
94
114
  - - "~>"
95
115
  - !ruby/object:Gem::Version
96
- version: '4.7'
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
131
+ - !ruby/object:Gem::Dependency
132
+ name: test-unit
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '3.1'
138
+ type: :development
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: '3.1'
97
145
  description: Ruby Gem to parse sitemaps.org compliant sitemaps.
98
146
  email: ben.balter@github.com
99
147
  executables: []
@@ -101,11 +149,12 @@ extensions: []
101
149
  extra_rdoc_files: []
102
150
  files:
103
151
  - lib/sitemap-parser.rb
152
+ - lib/sitemap-parser/version.rb
104
153
  homepage: https://github.com/benbalter/sitemap-parser
105
154
  licenses:
106
155
  - MIT
107
156
  metadata: {}
108
- post_install_message:
157
+ post_install_message:
109
158
  rdoc_options: []
110
159
  require_paths:
111
160
  - lib
@@ -120,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
169
  - !ruby/object:Gem::Version
121
170
  version: '0'
122
171
  requirements: []
123
- rubyforge_project:
124
- rubygems_version: 2.2.0
125
- signing_key:
172
+ rubygems_version: 3.2.22
173
+ signing_key:
126
174
  specification_version: 4
127
175
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
128
176
  test_files: []