sitemap-parser 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +5 -5
  2. data/lib/sitemap-parser.rb +37 -16
  3. metadata +51 -18
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5e1f45974fe39269287ae515e63e56d245dd0ff6
4
- data.tar.gz: 99382aa0efafaccedf73189a741c6770ccf683a3
2
+ SHA256:
3
+ metadata.gz: 8eddc8204f36a00114c47cbee1a9e1bf884fda408e51383e3af615c7811b1a25
4
+ data.tar.gz: 10fde984cc652d03edf863ee8c6cc03a531b6cadaa1e37de49b2d2095d3d4f46
5
5
  SHA512:
6
- metadata.gz: 275725d0338f36502a147469b28a154e71e03af94d64d5f4947ef36d2a40dddb8707bdc83a97cbf2f582c9b17083a8160b023271754d67dafb76dc6cb6a83518
7
- data.tar.gz: 1b000477be977c027646d887e7b5dc3595c029c01cf4467fbac8e79e92ceeae947259902b00761d071fbe656305deaeb8c3289063286a265fac44e3c27d32350
6
+ metadata.gz: ec5cad59821bdc90a202fca1ac7b514c34be55c4addd3948ba3b9fe86d68170dbb773836d8eb094f1b7625ee4112ea9903693edabb395b2951fa638a98d3083b
7
+ data.tar.gz: d2ffb35d82995f2e031afb304350f88a726db081937e9b955b689f774bf3c7e5a4ae8da132c95d6ef2f6edad5326a95a079194c048f01ae4e91d7878d0c51066
data/lib/sitemap-parser.rb CHANGED
@@ -1,28 +1,29 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
9
  def initialize(url, opts = {})
7
10
  @url = url
8
- @options = {:followlocation => true, :recurse => false}.merge(opts)
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
9
12
  end
10
13
 
11
14
  def raw_sitemap
12
15
  @raw_sitemap ||= begin
13
- if @url =~ /\Ahttp/i
14
- request_options = @options.dup.tap { |opts| opts.delete(:recurse) }
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
15
18
  request = Typhoeus::Request.new(@url, request_options)
16
19
  request.on_complete do |response|
17
- if response.success?
18
- return response.body
19
- else
20
- raise "HTTP request to #{@url} failed"
21
- end
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
22
23
  end
23
24
  request.run
24
- elsif File.exist?(@url) && @url =~ /[\\\/]sitemap\.xml\Z/i
25
- open(@url) { |f| f.read }
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
26
27
  end
27
28
  end
28
29
  end
@@ -33,24 +34,44 @@ class SitemapParser
33
34
 
34
35
  def urls
35
36
  if sitemap.at('urlset')
36
- sitemap.at("urlset").search("url")
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
37
38
  elsif sitemap.at('sitemapindex')
38
39
  found_urls = []
39
40
  if @options[:recurse]
40
- sitemap.at('sitemapindex').search('sitemap').each do |sitemap|
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
41
43
  child_sitemap_location = sitemap.at('loc').content
42
- found_urls << self.class.new(child_sitemap_location, :recurse => false).urls
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
43
45
  end
44
46
  end
45
- return found_urls.flatten
47
+ found_urls.flatten
46
48
  else
47
49
  raise 'Malformed sitemap, no urlset'
48
50
  end
49
51
  end
50
52
 
51
53
  def to_a
52
- urls.map { |url| url.at("loc").content }
54
+ urls.map { |url| url.at('loc').content }
53
55
  rescue NoMethodError
54
56
  raise 'Malformed sitemap, url without loc'
55
57
  end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
76
+ end
56
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-31 00:00:00.000000000 Z
11
+ date: 2020-12-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -28,16 +28,36 @@ dependencies:
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.6'
34
+ - - "<"
35
+ - !ruby/object:Gem::Version
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
- - - "~>"
41
+ - - ">="
39
42
  - !ruby/object:Gem::Version
40
43
  version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
41
61
  - !ruby/object:Gem::Dependency
42
62
  name: rake
43
63
  requirement: !ruby/object:Gem::Requirement
@@ -53,47 +73,61 @@ dependencies:
53
73
  - !ruby/object:Gem::Version
54
74
  version: '10.4'
55
75
  - !ruby/object:Gem::Dependency
56
- name: shoulda
76
+ name: rubocop
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: '3.5'
81
+ version: '0.80'
62
82
  type: :development
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: '3.5'
88
+ version: '0.80'
69
89
  - !ruby/object:Gem::Dependency
70
- name: bundler
90
+ name: rubocop-minitest
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
93
  - - "~>"
74
94
  - !ruby/object:Gem::Version
75
- version: '1.6'
95
+ version: '0.1'
76
96
  type: :development
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
100
  - - "~>"
81
101
  - !ruby/object:Gem::Version
82
- version: '1.6'
102
+ version: '0.1'
83
103
  - !ruby/object:Gem::Dependency
84
- name: minitest
104
+ name: rubocop-performance
85
105
  requirement: !ruby/object:Gem::Requirement
86
106
  requirements:
87
107
  - - "~>"
88
108
  - !ruby/object:Gem::Version
89
- version: '4.7'
109
+ version: '1.5'
90
110
  type: :development
91
111
  prerelease: false
92
112
  version_requirements: !ruby/object:Gem::Requirement
93
113
  requirements:
94
114
  - - "~>"
95
115
  - !ruby/object:Gem::Version
96
- version: '4.7'
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
97
131
  - !ruby/object:Gem::Dependency
98
132
  name: test-unit
99
133
  requirement: !ruby/object:Gem::Requirement
@@ -119,7 +153,7 @@ homepage: https://github.com/benbalter/sitemap-parser
119
153
  licenses:
120
154
  - MIT
121
155
  metadata: {}
122
- post_install_message:
156
+ post_install_message:
123
157
  rdoc_options: []
124
158
  require_paths:
125
159
  - lib
@@ -134,9 +168,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
168
  - !ruby/object:Gem::Version
135
169
  version: '0'
136
170
  requirements: []
137
- rubyforge_project:
138
- rubygems_version: 2.6.11
139
- signing_key:
171
+ rubygems_version: 3.1.4
172
+ signing_key:
140
173
  specification_version: 4
141
174
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
142
175
  test_files: []