sitemap-parser 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +5 -5
  2. data/lib/sitemap-parser.rb +53 -17
  3. metadata +115 -12
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5cf7f24d9633d44bd80514db606dafa869de8e8a
4
- data.tar.gz: 5dc60886285798683a43f21e81f44f7afdbafc19
2
+ SHA256:
3
+ metadata.gz: 8eddc8204f36a00114c47cbee1a9e1bf884fda408e51383e3af615c7811b1a25
4
+ data.tar.gz: 10fde984cc652d03edf863ee8c6cc03a531b6cadaa1e37de49b2d2095d3d4f46
5
5
  SHA512:
6
- metadata.gz: 07241173a5e14eb552827d48b75a3caa27c5b40f1e3eaf39db588a7009d1c8f65b7c302072eef7d9e6a9ce6bce0b7ad94ec36130039a42640d159e8ddcea140f
7
- data.tar.gz: c3a5d2fa4b2e182eb0b335cb04ba65839d213690fad73f12d3055b85e2ad10da5133aa88fa6baab304576dd0d04fd870de793c1b73c849f08cfaaf691f90b238
6
+ metadata.gz: ec5cad59821bdc90a202fca1ac7b514c34be55c4addd3948ba3b9fe86d68170dbb773836d8eb094f1b7625ee4112ea9903693edabb395b2951fa638a98d3083b
7
+ data.tar.gz: d2ffb35d82995f2e031afb304350f88a726db081937e9b955b689f774bf3c7e5a4ae8da132c95d6ef2f6edad5326a95a079194c048f01ae4e91d7878d0c51066
data/lib/sitemap-parser.rb CHANGED
@@ -1,41 +1,77 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
- def initialize(url)
9
+ def initialize(url, opts = {})
7
10
  @url = url
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
8
12
  end
9
13
 
10
14
  def raw_sitemap
11
15
  @raw_sitemap ||= begin
12
- request = Typhoeus::Request.new(@url, followlocation: true)
13
- request.on_complete do |response|
14
- if response.success?
15
- return response.body
16
- else
17
- return nil
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
18
+ request = Typhoeus::Request.new(@url, request_options)
19
+ request.on_complete do |response|
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
18
23
  end
24
+ request.run
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
19
27
  end
20
- request.run
21
28
  end
22
29
  end
23
30
 
24
31
  def sitemap
25
32
  @sitemap ||= Nokogiri::XML(raw_sitemap)
26
- rescue
27
- nil
28
33
  end
29
34
 
30
35
  def urls
31
- sitemap.at("urlset").search("url")
32
- rescue
33
- nil
36
+ if sitemap.at('urlset')
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
38
+ elsif sitemap.at('sitemapindex')
39
+ found_urls = []
40
+ if @options[:recurse]
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
43
+ child_sitemap_location = sitemap.at('loc').content
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
45
+ end
46
+ end
47
+ found_urls.flatten
48
+ else
49
+ raise 'Malformed sitemap, no urlset'
50
+ end
34
51
  end
35
52
 
36
53
  def to_a
37
- urls.map { |url| url.at("loc").content }
38
- rescue
39
- []
54
+ urls.map { |url| url.at('loc').content }
55
+ rescue NoMethodError
56
+ raise 'Malformed sitemap, url without loc'
57
+ end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
40
76
  end
41
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-04 00:00:00.000000000 Z
11
+ date: 2020-12-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,28 +16,132 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 1.5.6
19
+ version: '1.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 1.5.6
26
+ version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ - - "<"
32
35
  - !ruby/object:Gem::Version
33
- version: 0.6.7
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rake
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '10.4'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '10.4'
75
+ - !ruby/object:Gem::Dependency
76
+ name: rubocop
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.80'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.80'
89
+ - !ruby/object:Gem::Dependency
90
+ name: rubocop-minitest
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '0.1'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.1'
103
+ - !ruby/object:Gem::Dependency
104
+ name: rubocop-performance
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.5'
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
131
+ - !ruby/object:Gem::Dependency
132
+ name: test-unit
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '3.1'
138
+ type: :development
139
+ prerelease: false
36
140
  version_requirements: !ruby/object:Gem::Requirement
37
141
  requirements:
38
142
  - - "~>"
39
143
  - !ruby/object:Gem::Version
40
- version: 0.6.7
144
+ version: '3.1'
41
145
  description: Ruby Gem to parse sitemaps.org compliant sitemaps.
42
146
  email: ben.balter@github.com
43
147
  executables: []
@@ -49,7 +153,7 @@ homepage: https://github.com/benbalter/sitemap-parser
49
153
  licenses:
50
154
  - MIT
51
155
  metadata: {}
52
- post_install_message:
156
+ post_install_message:
53
157
  rdoc_options: []
54
158
  require_paths:
55
159
  - lib
@@ -64,9 +168,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
64
168
  - !ruby/object:Gem::Version
65
169
  version: '0'
66
170
  requirements: []
67
- rubyforge_project:
68
- rubygems_version: 2.2.0
69
- signing_key:
171
+ rubygems_version: 3.1.4
172
+ signing_key:
70
173
  specification_version: 4
71
174
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
72
175
  test_files: []