sitemap-parser 0.1.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +5 -5
  2. data/lib/sitemap-parser.rb +53 -17
  3. metadata +115 -12
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5cf7f24d9633d44bd80514db606dafa869de8e8a
4
- data.tar.gz: 5dc60886285798683a43f21e81f44f7afdbafc19
2
+ SHA256:
3
+ metadata.gz: 8eddc8204f36a00114c47cbee1a9e1bf884fda408e51383e3af615c7811b1a25
4
+ data.tar.gz: 10fde984cc652d03edf863ee8c6cc03a531b6cadaa1e37de49b2d2095d3d4f46
5
5
  SHA512:
6
- metadata.gz: 07241173a5e14eb552827d48b75a3caa27c5b40f1e3eaf39db588a7009d1c8f65b7c302072eef7d9e6a9ce6bce0b7ad94ec36130039a42640d159e8ddcea140f
7
- data.tar.gz: c3a5d2fa4b2e182eb0b335cb04ba65839d213690fad73f12d3055b85e2ad10da5133aa88fa6baab304576dd0d04fd870de793c1b73c849f08cfaaf691f90b238
6
+ metadata.gz: ec5cad59821bdc90a202fca1ac7b514c34be55c4addd3948ba3b9fe86d68170dbb773836d8eb094f1b7625ee4112ea9903693edabb395b2951fa638a98d3083b
7
+ data.tar.gz: d2ffb35d82995f2e031afb304350f88a726db081937e9b955b689f774bf3c7e5a4ae8da132c95d6ef2f6edad5326a95a079194c048f01ae4e91d7878d0c51066
data/lib/sitemap-parser.rb CHANGED
@@ -1,41 +1,77 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
  require 'typhoeus'
5
+ require 'zlib'
6
+ require_relative 'sitemap-parser/version'
3
7
 
4
8
  class SitemapParser
5
-
6
- def initialize(url)
9
+ def initialize(url, opts = {})
7
10
  @url = url
11
+ @options = { followlocation: true, recurse: false, url_regex: nil }.merge(opts)
8
12
  end
9
13
 
10
14
  def raw_sitemap
11
15
  @raw_sitemap ||= begin
12
- request = Typhoeus::Request.new(@url, followlocation: true)
13
- request.on_complete do |response|
14
- if response.success?
15
- return response.body
16
- else
17
- return nil
16
+ if /\Ahttp/i.match?(@url)
17
+ request_options = @options.dup.tap { |opts| opts.delete(:recurse); opts.delete(:url_regex) }
18
+ request = Typhoeus::Request.new(@url, request_options)
19
+ request.on_complete do |response|
20
+ raise "HTTP request to #{@url} failed" unless response.success?
21
+
22
+ return inflate_body_if_needed(response)
18
23
  end
24
+ request.run
25
+ elsif File.exist?(@url) && @url =~ %r{[\\/]sitemap\.xml\Z}i
26
+ File.open(@url, &:read)
19
27
  end
20
- request.run
21
28
  end
22
29
  end
23
30
 
24
31
  def sitemap
25
32
  @sitemap ||= Nokogiri::XML(raw_sitemap)
26
- rescue
27
- nil
28
33
  end
29
34
 
30
35
  def urls
31
- sitemap.at("urlset").search("url")
32
- rescue
33
- nil
36
+ if sitemap.at('urlset')
37
+ filter_sitemap_urls(sitemap.at('urlset').search('url'))
38
+ elsif sitemap.at('sitemapindex')
39
+ found_urls = []
40
+ if @options[:recurse]
41
+ urls = sitemap.at('sitemapindex').search('sitemap')
42
+ filter_sitemap_urls(urls).each do |sitemap|
43
+ child_sitemap_location = sitemap.at('loc').content
44
+ found_urls << self.class.new(child_sitemap_location, recurse: false).urls
45
+ end
46
+ end
47
+ found_urls.flatten
48
+ else
49
+ raise 'Malformed sitemap, no urlset'
50
+ end
34
51
  end
35
52
 
36
53
  def to_a
37
- urls.map { |url| url.at("loc").content }
38
- rescue
39
- []
54
+ urls.map { |url| url.at('loc').content }
55
+ rescue NoMethodError
56
+ raise 'Malformed sitemap, url without loc'
57
+ end
58
+
59
+ private
60
+
61
+ def filter_sitemap_urls(urls)
62
+ return urls if @options[:url_regex].nil?
63
+
64
+ urls.select { |url| url.at('loc').content.strip =~ @options[:url_regex] }
65
+ end
66
+
67
+ def inflate_body_if_needed(response)
68
+ return response.body unless response.headers
69
+
70
+ case response.headers['Content-Type']
71
+ when %r{application/gzip}, %r{application/x-gzip}, %r{application/octet-stream}
72
+ Zlib.gunzip(response.body)
73
+ else
74
+ response.body
75
+ end
40
76
  end
41
77
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemap-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Balter
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-04 00:00:00.000000000 Z
11
+ date: 2020-12-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -16,28 +16,132 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 1.5.6
19
+ version: '1.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 1.5.6
26
+ version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: typhoeus
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ - - "<"
32
35
  - !ruby/object:Gem::Version
33
- version: 0.6.7
36
+ version: '2.0'
34
37
  type: :runtime
35
38
  prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0.6'
44
+ - - "<"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.0'
47
+ - !ruby/object:Gem::Dependency
48
+ name: minitest
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '4.7'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '4.7'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rake
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '10.4'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '10.4'
75
+ - !ruby/object:Gem::Dependency
76
+ name: rubocop
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.80'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.80'
89
+ - !ruby/object:Gem::Dependency
90
+ name: rubocop-minitest
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '0.1'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.1'
103
+ - !ruby/object:Gem::Dependency
104
+ name: rubocop-performance
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.5'
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '1.5'
117
+ - !ruby/object:Gem::Dependency
118
+ name: shoulda
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '3.5'
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - "~>"
129
+ - !ruby/object:Gem::Version
130
+ version: '3.5'
131
+ - !ruby/object:Gem::Dependency
132
+ name: test-unit
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '3.1'
138
+ type: :development
139
+ prerelease: false
36
140
  version_requirements: !ruby/object:Gem::Requirement
37
141
  requirements:
38
142
  - - "~>"
39
143
  - !ruby/object:Gem::Version
40
- version: 0.6.7
144
+ version: '3.1'
41
145
  description: Ruby Gem to parse sitemaps.org compliant sitemaps.
42
146
  email: ben.balter@github.com
43
147
  executables: []
@@ -49,7 +153,7 @@ homepage: https://github.com/benbalter/sitemap-parser
49
153
  licenses:
50
154
  - MIT
51
155
  metadata: {}
52
- post_install_message:
156
+ post_install_message:
53
157
  rdoc_options: []
54
158
  require_paths:
55
159
  - lib
@@ -64,9 +168,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
64
168
  - !ruby/object:Gem::Version
65
169
  version: '0'
66
170
  requirements: []
67
- rubyforge_project:
68
- rubygems_version: 2.2.0
69
- signing_key:
171
+ rubygems_version: 3.1.4
172
+ signing_key:
70
173
  specification_version: 4
71
174
  summary: Ruby Gem to parse sitemaps.org compliant sitemaps
72
175
  test_files: []