sitemap-parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +5 -5
  2. data/lib/sitemap-parser.rb +37 -16
  3. metadata +51 -18
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5e1f45974fe39269287ae515e63e56d245dd0ff6
4
- data.tar.gz: 99382aa0efafaccedf73189a741c6770ccf683a3
2
+ SHA256:
3
+ metadata.gz: 8eddc8204f36a00114c47cbee1a9e1bf884fda408e51383e3af615c7811b1a25
4
+ data.tar.gz: 10fde984cc652d03edf863ee8c6cc03a531b6cadaa1e37de49b2d2095d3d4f46
5
5
  SHA512:
6
- metadata.gz: 275725d0338f36502a147469b28a154e71e03af94d64d5f4947ef36d2a40dddb8707bdc83a97cbf2f582c9b17083a8160b023271754d67dafb76dc6cb6a83518
7
- data.tar.gz: 1b000477be977c027646d887e7b5dc3595c029c01cf4467fbac8e79e92ceeae947259902b00761d071fbe656305deaeb8c3289063286a265fac44e3c27d32350
6
+ metadata.gz: ec5cad59821bdc90a202fca1ac7b514c34be55c4addd3948ba3b9fe86d68170dbb773836d8eb094f1b7625ee4112ea9903693edabb395b2951fa638a98d3083b
7
+ data.tar.gz: d2ffb35d82995f2e031afb304350f88a726db081937e9b955b689f774bf3c7e5a4ae8da132c95d6ef2f6edad5326a95a079194c048f01ae4e91d7878d0c51066
data/lib/sitemap-parser.rb CHANGED
@@ -1,28 +1,29 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
9
  def initialize(url, opts = {})
7
10
  @url = url
8
- @options = {:followlocation => true, :recurse => false}.merge(opts)
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
9
12
  end
10
13
 
11
14
  def raw_sitemap
12
15
  @raw_sitemap ||= begin
13
- if @url =~ /\Ahttp/i
14
- request_options = @options.dup.tap { |opts| opts.delete(:recurse) }
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
15
18
  request = Typhoeus::Request.new(@url, request_options)
16
19
  request.on_complete do |response|
17
- if response.success?
18
- return response.body
19
- else
20
- raise "HTTP request to #{@url} failed"
21
- end
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
22
23
  end
23
24
  request.run
24
- elsif File.exist?(@url) && @url =~ /[\\\/]sitemap\.xml\Z/i
25
- open(@url) { |f| f.read }
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
26
27
  end
27
28
  end
28
29
  end
@@ -33,24 +34,44 @@ class SitemapParser
33
34
 
34
35
  def urls
35
36
  if sitemap.at('urlset')
36
- sitemap.at("urlset").search("url")
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
37
38
  elsif sitemap.at('sitemapindex')
38
39
  found_urls = []
39
40
  if @options[:recurse]
40
- sitemap.at('sitemapindex').search('sitemap').each do |sitemap|
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
41
43
  child_sitemap_location = sitemap.at('loc').content
42
- found_urls << self.class.new(child_sitemap_location, :recurse => false).urls
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
43
45
  end
44
46
  end
45
- return found_urls.flatten
47
+ found_urls.flatten
46
48
  else
47
49
  raise 'Malformed sitemap, no urlset'
48
50
  end
49
51
  end
50
52
 
51
53
  def to_a
52
- urls.map { |url| url.at("loc").content }
54
+ urls.map { |url| url.at('loc').content }
53
55
  rescue NoMethodError
54
56
  raise 'Malformed sitemap, url without loc'
55
57
  end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
76
+ end
56
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-31 00:00:00.000000000 Z
11
+ date: 2020-12-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -28,16 +28,36 @@ dependencies:
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.6'
34
+ - - "<"
35
+ - !ruby/object:Gem::Version
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
- - - "~>"
41
+ - - ">="
39
42
  - !ruby/object:Gem::Version
40
43
  version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
41
61
  - !ruby/object:Gem::Dependency
42
62
  name: rake
43
63
  requirement: !ruby/object:Gem::Requirement
@@ -53,47 +73,61 @@ dependencies:
53
73
  - !ruby/object:Gem::Version
54
74
  version: '10.4'
55
75
  - !ruby/object:Gem::Dependency
56
- name: shoulda
76
+ name: rubocop
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: '3.5'
81
+ version: '0.80'
62
82
  type: :development
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: '3.5'
88
+ version: '0.80'
69
89
  - !ruby/object:Gem::Dependency
70
- name: bundler
90
+ name: rubocop-minitest
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
93
  - - "~>"
74
94
  - !ruby/object:Gem::Version
75
- version: '1.6'
95
+ version: '0.1'
76
96
  type: :development
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
100
  - - "~>"
81
101
  - !ruby/object:Gem::Version
82
- version: '1.6'
102
+ version: '0.1'
83
103
  - !ruby/object:Gem::Dependency
84
- name: minitest
104
+ name: rubocop-performance
85
105
  requirement: !ruby/object:Gem::Requirement
86
106
  requirements:
87
107
  - - "~>"
88
108
  - !ruby/object:Gem::Version
89
- version: '4.7'
109
+ version: '1.5'
90
110
  type: :development
91
111
  prerelease: false
92
112
  version_requirements: !ruby/object:Gem::Requirement
93
113
  requirements:
94
114
  - - "~>"
95
115
  - !ruby/object:Gem::Version
96
- version: '4.7'
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
97
131
  - !ruby/object:Gem::Dependency
98
132
  name: test-unit
99
133
  requirement: !ruby/object:Gem::Requirement
@@ -119,7 +153,7 @@ homepage: https://github.com/benbalter/sitemap-parser
119
153
  licenses:
120
154
  - MIT
121
155
  metadata: {}
122
- post_install_message:
156
+ post_install_message:
123
157
  rdoc_options: []
124
158
  require_paths:
125
159
  - lib
@@ -134,9 +168,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
168
  - !ruby/object:Gem::Version
135
169
  version: '0'
136
170
  requirements: []
137
- rubyforge_project:
138
- rubygems_version: 2.6.11
139
- signing_key:
171
+ rubygems_version: 3.1.4
172
+ signing_key:
140
173
  specification_version: 4
141
174
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
142
175
  test_files: []