sitemap-parser 0.2.1 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d03dd8db51f6b4d2f7fc07f1bf82a8208e09879e
4
- data.tar.gz: 36e811e0184838fdb0332fc5914b5ef232f6ee87
2
+ SHA256:
3
+ metadata.gz: b98303437a057f9f3cdbe7ef5ef3712ed6534e8d9e228364a89bac75488e27df
4
+ data.tar.gz: 380dfe60cde2614823adaaa87520306b5689424d882f6134d80920b6947f717e
5
5
  SHA512:
6
- metadata.gz: 49c136b2f86b00dffbf9c565a7284ea1349a68f752dae6e1cab96b8135ab69955091f9fbb2fbd2d32880d3bc5cde7b8a29bf8eca42c2e954f09a4f4a306d7e76
7
- data.tar.gz: d5241537a66a454a6097a6c88bc3e1e643535c0a085d98f935c13aea902c5b5bf094042fbc2a140c437d548e1e9e9864dda05f21a42113452ce811499b7bd7cc
6
+ metadata.gz: d6afba1d0068955ccf40bb147ddb97349dd8b04499dbe755b04e03b89b8188ab0a37bf618ca96ad49908f4a850fd230d3c00ff24066881044d12e222ef415221
7
+ data.tar.gz: 43e3baffe31bb3dc672d2c73c8002a2707721551be1efd1314a03818038b4a5afb3350ad8c507dfc884c25501982c84297f4c20a207019dc965e9b27a3b82941
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class SitemapParser
4
+ VERSION = '0.5.6'
5
+ end
@@ -1,46 +1,77 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
9
  def initialize(url, opts = {})
7
10
  @url = url
8
- @options = {:followlocation => true}.merge(opts)
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
9
12
  end
10
13
 
11
14
  def raw_sitemap
12
15
  @raw_sitemap ||= begin
13
- if @url =~ /\Ahttp/i
14
- request = Typhoeus::Request.new(@url, followlocation: @options[:followlocation])
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
18
+ request = Typhoeus::Request.new(@url, request_options)
15
19
  request.on_complete do |response|
16
- if response.success?
17
- return response.body
18
- else
19
- return nil
20
- end
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
21
23
  end
22
24
  request.run
23
- elsif File.exist?(@url) && @url =~ /[\\\/]sitemap\.xml\Z/i
24
- open(@url) { |f| f.read }
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
25
27
  end
26
28
  end
27
29
  end
28
30
 
29
31
  def sitemap
30
32
  @sitemap ||= Nokogiri::XML(raw_sitemap)
31
- rescue
32
- nil
33
33
  end
34
34
 
35
35
  def urls
36
- sitemap.at("urlset").search("url")
37
- rescue
38
- nil
36
+ if sitemap.at('urlset')
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
38
+ elsif sitemap.at('sitemapindex')
39
+ found_urls = []
40
+ if @options[:recurse]
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
43
+ child_sitemap_location = sitemap.at('loc').content
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
45
+ end
46
+ end
47
+ found_urls.flatten
48
+ else
49
+ raise 'Malformed sitemap, no urlset'
50
+ end
39
51
  end
40
52
 
41
53
  def to_a
42
- urls.map { |url| url.at("loc").content }
43
- rescue
44
- []
54
+ urls.map { |url| url.at('loc').content }
55
+ rescue NoMethodError
56
+ raise 'Malformed sitemap, url without loc'
57
+ end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
45
76
  end
46
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.5.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-05 00:00:00.000000000 Z
11
+ date: 2021-12-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -28,16 +28,36 @@ dependencies:
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0.6'
34
+ - - "<"
35
+ - !ruby/object:Gem::Version
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
38
- - - "~>"
41
+ - - ">="
39
42
  - !ruby/object:Gem::Version
40
43
  version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
41
61
  - !ruby/object:Gem::Dependency
42
62
  name: rake
43
63
  requirement: !ruby/object:Gem::Requirement
@@ -53,47 +73,75 @@ dependencies:
53
73
  - !ruby/object:Gem::Version
54
74
  version: '10.4'
55
75
  - !ruby/object:Gem::Dependency
56
- name: shoulda
76
+ name: rubocop
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
79
  - - "~>"
60
80
  - !ruby/object:Gem::Version
61
- version: '3.5'
81
+ version: '0.80'
62
82
  type: :development
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
86
  - - "~>"
67
87
  - !ruby/object:Gem::Version
68
- version: '3.5'
88
+ version: '0.80'
69
89
  - !ruby/object:Gem::Dependency
70
- name: bundler
90
+ name: rubocop-minitest
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
93
  - - "~>"
74
94
  - !ruby/object:Gem::Version
75
- version: '1.6'
95
+ version: '0.1'
76
96
  type: :development
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
100
  - - "~>"
81
101
  - !ruby/object:Gem::Version
82
- version: '1.6'
102
+ version: '0.1'
83
103
  - !ruby/object:Gem::Dependency
84
- name: minitest
104
+ name: rubocop-performance
85
105
  requirement: !ruby/object:Gem::Requirement
86
106
  requirements:
87
107
  - - "~>"
88
108
  - !ruby/object:Gem::Version
89
- version: '4.7'
109
+ version: '1.5'
90
110
  type: :development
91
111
  prerelease: false
92
112
  version_requirements: !ruby/object:Gem::Requirement
93
113
  requirements:
94
114
  - - "~>"
95
115
  - !ruby/object:Gem::Version
96
- version: '4.7'
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
131
+ - !ruby/object:Gem::Dependency
132
+ name: test-unit
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '3.1'
138
+ type: :development
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: '3.1'
97
145
  description: Ruby Gem to parse sitemaps.org compliant sitemaps.
98
146
  email: ben.balter@github.com
99
147
  executables: []
@@ -101,11 +149,12 @@ extensions: []
101
149
  extra_rdoc_files: []
102
150
  files:
103
151
  - lib/sitemap-parser.rb
152
+ - lib/sitemap-parser/version.rb
104
153
  homepage: https://github.com/benbalter/sitemap-parser
105
154
  licenses:
106
155
  - MIT
107
156
  metadata: {}
108
- post_install_message:
157
+ post_install_message:
109
158
  rdoc_options: []
110
159
  require_paths:
111
160
  - lib
@@ -120,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
169
  - !ruby/object:Gem::Version
121
170
  version: '0'
122
171
  requirements: []
123
- rubyforge_project:
124
- rubygems_version: 2.2.0
125
- signing_key:
172
+ rubygems_version: 3.2.22
173
+ signing_key:
126
174
  specification_version: 4
127
175
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
128
176
  test_files: []