sitemaps_parser 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
4
- data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
3
+ metadata.gz: f6a5cf10a4446ce42e918767ee0279a81178951c
4
+ data.tar.gz: 878301d4d7ffd4429a0d4763d9ecb4ccca719039
5
5
  SHA512:
6
- metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
7
- data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
6
+ metadata.gz: 1a209890b6494fb17fed6c234656854a657dec8dc5d6012d752d6962339b8b5c06f8cd0facb15c96884f18bcabf4bb2a56aea53b84ad8326695ae43c4e57fe2b
7
+ data.tar.gz: 108c423e4969c262bf00959fa6ae4d5855c0d35faf10d21c282d3a05413e2f4e9178767dfce86da7872231dc48ac7a81338c72a6b1f809d73ff2d7e3e11a7c9d
data/README.md CHANGED
@@ -4,18 +4,14 @@
4
4
 
5
5
  Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
6
6
 
7
- ## TODO
8
-
9
- * discovery
10
- * sorting by last modified, or priority
11
- * filtering by last modified
7
+ See the [RDoc documentation](http://lygaret.github.io/sitemaps) for detailed documentation.
12
8
 
13
9
  ## Installation
14
10
 
15
11
  Add this line to your application's Gemfile:
16
12
 
17
13
  ```ruby
18
- gem 'sitemaps'
14
+ gem 'sitemaps_parser', require: 'sitemaps'
19
15
  ```
20
16
 
21
17
  And then execute:
@@ -24,7 +20,7 @@ And then execute:
24
20
 
25
21
  Or install it yourself as:
26
22
 
27
- $ gem install sitemaps
23
+ $ gem install sitemaps_parser
28
24
 
29
25
  ## Usage
30
26
 
@@ -35,15 +31,20 @@ require 'sitemaps'
35
31
  Sitemaps.parse("<xml ns=\"...")
36
32
 
37
33
  # fetch and parse a sitemap from a known url
38
- sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
34
+ sitemap = Sitemaps.fetch("http://termscout.com/sitemap.xml")
39
35
 
40
36
  # fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
41
37
  sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
42
38
  entry.loc.path !~ /blog/i
43
39
  end
44
40
 
41
+ # attempt to discover sitemaps for a site without a known sitemap location. Checks robots.txt and some common locations.
42
+ sitemap = Sitemaps.discover("https://www.digitalocean.com", max_entries: 200) do |entry|
43
+ entry.loc.path !~ /blog/i
44
+ end
45
+
45
46
  # sitemap usage
46
- sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
47
+ sitemap.entries.first #> Sitemaps::Entry(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
47
48
  urls = sitemap.entries.map(&:loc)
48
49
  ```
49
50
 
@@ -55,7 +56,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
55
56
 
56
57
  ## Contributing
57
58
 
58
- Bug reports and pull requests are welcome on GitHub at https://github.com/termscout/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
59
+ Bug reports and pull requests are welcome on GitHub at https://github.com/lygaret/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
59
60
 
60
61
  ## License
61
62
 
data/Rakefile CHANGED
@@ -4,3 +4,30 @@ require "rspec/core/rake_task"
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task default: :spec
7
+
8
+ require 'yard'
9
+ DOC_FILES = ['lib/**/*.rb']
10
+
11
+ YARD::Rake::YardocTask.new(:doc) do |t|
12
+ t.files = DOC_FILES
13
+ end
14
+
15
+ namespace :doc do
16
+ YARD::Rake::YardocTask.new(:pages) do |t|
17
+ t.files = DOC_FILES
18
+ t.options = ['-o', '../sitemaps.doc']
19
+ end
20
+
21
+ namespace :pages do
22
+ desc 'Generate and publish docs to gh-pages'
23
+ task publish: ['doc:pages'] do
24
+ Dir.chdir(File.dirname(__FILE__) + '/../sitemaps.doc') do
25
+ system 'git checkout gh-pages'
26
+ system 'git add .'
27
+ system 'git add -u'
28
+ system "git commit -m 'Generating docs for version #{Sitemaps::VERSION}.'"
29
+ system 'git push origin gh-pages'
30
+ end
31
+ end
32
+ end
33
+ end
data/lib/sitemaps.rb CHANGED
@@ -2,6 +2,7 @@ require "active_support"
2
2
  require "active_support/core_ext/object/try"
3
3
  require "active_support/core_ext/object/blank"
4
4
 
5
+ require "set"
5
6
  require "time"
6
7
  require "rexml/document"
7
8
 
@@ -11,66 +12,176 @@ require "sitemaps/fetcher"
11
12
 
12
13
  # Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
13
14
  module Sitemaps
15
+
16
+ # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
17
+ # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
18
+ # @attr changefreq [:always, :hourly, :daily, :weekly, :monthly, :yearly, :never, nil]
19
+ # the change frequency of this entry, or nil if unspecified.
20
+ # @attr priority [Float] the priority of this entry, a float from 0 to 1. 0.5 if unspecified.
14
21
  Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
22
+
23
+ # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
24
+ # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
15
25
  Submap = Struct.new(:loc, :lastmod)
26
+
27
+ # @attr entries [Enumerable<Entry>] A set of entries that were parsed out of one or more sitemaps, recursively.
28
+ # @attr sitemaps [Enumerable<Sitemap>] A set of sitemaps that were found in a sitemap index.
16
29
  Sitemap = Struct.new(:entries, :sitemaps)
17
30
 
31
+ @default_fetcher = ->(u) { Sitemaps::Fetcher.fetch(u) }
32
+
33
+ # Parse a sitemap from an XML string. Does not fail on invalid documents, but doesn't include
34
+ # invalid entries in the final set. As such, a non-XML file, or non-sitemap XML file will return
35
+ # an empty sitemap.
36
+ #
37
+ # @param source [String] an XML string to parse as a sitemap.
38
+ # @return [Sitemap] the sitemap represented by the given XML string.
18
39
  def self.parse(source)
19
40
  Sitemaps::Parser.parse(source)
20
41
  end
21
42
 
22
- def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
23
- fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
24
- url = parse_url(url)
43
+ # Fetch and parse a sitemap from the given URL.
44
+ #
45
+ # @overload fetch(url, fetcher: nil, max_entries: nil)
46
+ # @param url [String, URI] the url of the sitemap in question.
47
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
48
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
49
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
50
+ # a good idea to include, as many sites have _very_ large sitemaps.
51
+ # @return [Sitemap]
52
+ #
53
+ # @overload fetch(url, fetcher: nil, max_entries: nil)
54
+ # If a block is given, it's used as a filter for entries before they're added to the sitemap.
55
+ #
56
+ # @param url [String, URI] the url of the sitemap in question.
57
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
58
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
59
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
60
+ # a good idea to include, as many sites have _very_ large sitemaps.
61
+ # @return [Sitemap]
62
+ # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
63
+ # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
64
+ def self.fetch(url, fetcher: nil, max_entries: nil, &block)
65
+ fetcher ||= @default_fetcher
66
+ unless url.is_a? URI
67
+ url = "http://#{url}" unless url =~ %r{^https?://}
68
+ url = URI.parse(url)
69
+ end
25
70
 
26
- recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
71
+ _instance.fetch_recursive(url, fetcher, max_entries, &block)
27
72
  end
28
73
 
29
- def self.fetch_single(url, fetch, max_entries, &block)
30
- url = parse_url(url)
31
- source = fetch.call(url)
74
+ # Discover, fetch and parse sitemaps from the given host.
75
+ #
76
+ # Attempts to find and fetch sitemaps at a given host, by examining the `robots.txt` at that
77
+ # host, or if no sitemaps are found via `robots.txt`, checking a small number of common locations,
78
+ # including `sitemap.xml`, `sitemap_index.xml`, and the gzip versions of those same locations.
79
+ #
80
+ # @overload discover(host, fetcher: nil, max_entries: nil)
81
+ # @param host [String, URI] the url of the host to interrogate for sitemaps.
82
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
83
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
84
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
85
+ # a good idea to include, as many sites have _very_ large sitemaps.
86
+ # @return [Sitemap]
87
+ #
88
+ # @overload discover(host, fetcher: nil, max_entries: nil)
89
+ # If a block is given, it's used as a filter for entries before they're added to the sitemap.
90
+ #
91
+ # @param host [String, URI] the url of the host to interrogate for sitemaps.
92
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
93
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
94
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
95
+ # a good idea to include, as many sites have _very_ large sitemaps.
96
+ # @return [Sitemap]
97
+ # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
98
+ # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
99
+ def self.discover(url, fetcher: nil, max_entries: nil, &block)
100
+ fetcher ||= @default_fetcher
101
+ unless url.is_a? URI
102
+ url = "http://#{url}" unless url =~ %r{^https?://}
103
+ url = URI.parse(url)
104
+ end
105
+
106
+ roots = _instance.discover_roots(url, fetcher)
107
+ _instance.fetch_recursive(roots, fetcher, max_entries, &block)
108
+ end
32
109
 
33
- Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
110
+ # @return [Instance]
111
+ # @private
112
+ # @api private
113
+ def self._instance
114
+ @instance ||= Sitemaps::Instance.new
34
115
  end
35
116
 
36
- def self.fetch_recursive(url, fetch, max_entries, &block)
37
- queue = [parse_url(url)]
38
- maps = {}
39
-
40
- # walk the queue, fetching the sitemap requested and adding
41
- # new sitemaps to the queue as found
42
- loop do
43
- begin
44
- url = queue.pop
45
- break if url.nil?
46
- next unless maps[url].nil?
47
-
48
- # fetch this item in the queue, and queue up any sub maps it found
49
- maps[url] = fetch_single(url, fetch, max_entries, &block)
50
- queue.push(*maps[url].sitemaps.map(&:loc))
51
-
52
- # decrement max_entries (since it's max_entries total, not per map)
53
- unless max_entries.nil?
54
- max_entries -= maps[url].entries.length
55
- break if max_entries <= 0
117
+ # Holder for methods that shouldn't be exposed as public API
118
+ # @private
119
+ # @api private
120
+ class Instance
121
+ # recursively fetch sitemaps and sitemap indexes from the given urls.
122
+ # @return [Sitemap]
123
+ def fetch_recursive(urls, fetcher, max_entries, &block)
124
+ queue = urls.is_a?(Array) ? urls : [urls]
125
+ maps = {}
126
+
127
+ # walk the queue, fetching the sitemap requested and adding
128
+ # new sitemaps to the queue as found
129
+ loop do
130
+ begin
131
+ url = queue.pop
132
+ break if url.nil?
133
+ next unless maps[url].nil?
134
+
135
+ # fetch this item in the queue, and queue up any sub maps it found
136
+ source = fetcher.call(url)
137
+ sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
138
+
139
+ # save the results and queue up any submaps it found
140
+ maps[url] = sitemap
141
+ queue.push(*sitemap.sitemaps.map(&:loc))
142
+
143
+ # decrement max_entries (since it's max_entries total, not per map)
144
+ unless max_entries.nil?
145
+ max_entries -= maps[url].entries.length
146
+ break if max_entries <= 0
147
+ end
148
+ rescue => ex
149
+ # otherwise keep on going, because we've got something at least
150
+ $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
151
+ next
56
152
  end
57
- rescue => ex
58
- $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
59
- next
60
153
  end
61
- end
62
154
 
63
- # collapse the recovered maps into a single one with everything
64
- maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
65
- result.sitemaps.concat(map.sitemaps)
66
- result.entries.concat(map.entries)
155
+ # collapse the recovered maps into a single one with everything
156
+ maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
157
+ result.sitemaps.concat(map.sitemaps).uniq! { |e| e.loc.to_s }
158
+ result.entries.concat(map.entries).uniq! { |e| e.loc.to_s }
159
+ end
67
160
  end
68
- end
69
161
 
70
- def self.parse_url(url)
71
- return url if url.is_a? URI
162
+ # interrogate a host for sitemaps from robots.txt, or return some potential locations.
163
+ # @return [Array<URI>]
164
+ def discover_roots(url, fetcher)
165
+ robots = begin
166
+ robotsurl = url.clone
167
+ robotsurl.path = "/robots.txt"
168
+ robotstxt = fetcher.call(robotsurl)
72
169
 
73
- url = "http://#{url}" unless url =~ %r{^https?://}
74
- URI.parse(url)
170
+ discovered = robotstxt.scan(/^Sitemap: (.+)$/).flatten.reverse.map { |u| URI.parse(u) }
171
+ discovered.empty? ? nil : discovered
172
+ rescue
173
+ nil
174
+ end
175
+
176
+ # try for files in a handful of known locations
177
+ known_locations = %w(/sitemap_index.xml.gz /sitemap_index.xml /sitemap.xml.gz /sitemap.xml)
178
+ known_locations = known_locations.lazy.map do |path|
179
+ pathurl = url.clone
180
+ pathurl.path = path
181
+ pathurl
182
+ end
183
+
184
+ robots || known_locations.to_a
185
+ end
75
186
  end
76
187
  end
@@ -1,14 +1,29 @@
1
1
  module Sitemaps
2
- # Simple single purpose HTTP client
2
+ # Simple single purpose HTTP client. Uses `Net::HTTP` directly, so as to not incur dependencies.
3
3
  module Fetcher
4
- class FetchError < StandardError; end
4
+ class FetchError < StandardError; end
5
5
  class MaxRedirectError < StandardError; end
6
6
 
7
7
  @max_attempts = 10
8
8
 
9
+ # Fetch the given URI.
10
+ #
11
+ # Handles redirects (up to 10 times), and additionally will inflate a body delivered without
12
+ # a content-encoding header, but with a `.gz` as the end of the path.
13
+ #
14
+ # @param uri [String, URI] the URI to fetch.
15
+ # @return [String]
16
+ # @raise [FetchError] if the server responds with an HTTP status that's not 2xx.
17
+ # @raise [MaxRedirectError] if more than 10 redirects have occurred while attempting to fetch the resource.
9
18
  def self.fetch(uri)
10
19
  attempts = 0
11
20
 
21
+ # we only work on URI objects
22
+ unless uri.is_a? URI
23
+ uri = "http://#{uri}" unless uri =~ %r{^https?://}
24
+ uri = URI.parse(uri)
25
+ end
26
+
12
27
  until attempts >= @max_attempts
13
28
  resp = Net::HTTP.get_response(uri)
14
29
 
@@ -1,6 +1,16 @@
1
1
  module Sitemaps
2
2
  # Parse XML Sitemaps
3
3
  module Parser
4
+ VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
5
+
6
+ # Given a source string, returns a sitemap containing all valid url entries, or all valid sub-sitemaps.
7
+ # See `http://sitemaps.org` for information on the spec.
8
+ #
9
+ # @param source [String] an XML string to parse.
10
+ # @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
11
+ # @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
12
+ # @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
13
+ # a sitemap will still be returned, but the entries and sitemaps keys will be empty.
4
14
  def self.parse(source, max_entries: nil, filter: nil)
5
15
  document = REXML::Document.new(source)
6
16
  entries = document.elements.to_a("/urlset/url").map do |root|
@@ -12,6 +22,7 @@ module Sitemaps
12
22
  entry = Sitemaps::Entry.new(loc, mod, freq, pri)
13
23
  (!filter || filter.call(entry)) ? entry : nil
14
24
  end.reject(&:nil?)
25
+ entries = entries.uniq(&:loc)
15
26
  entries = entries.take(max_entries) unless max_entries.nil?
16
27
 
17
28
  sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
@@ -24,22 +35,29 @@ module Sitemaps
24
35
  Sitemaps::Sitemap.new(entries, sitemaps)
25
36
  end
26
37
 
38
+ # @api private
39
+ # @private
27
40
  def self.parse_loc(root)
28
41
  loc = root.get_text("loc").try(:value)
29
42
  loc && URI.parse(loc) rescue nil
30
43
  end
31
44
 
45
+ # @api private
46
+ # @private
32
47
  def self.parse_lastmod(root)
33
48
  mod = root.get_text("lastmod").try(:value)
34
49
  mod && Time.parse(mod) rescue nil
35
50
  end
36
51
 
37
- VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
52
+ # @api private
53
+ # @private
38
54
  def self.parse_changefreq(root)
39
55
  freq = root.get_text("changefreq").try(:value)
40
56
  freq && VALID_CHANGEFREQ.include?(freq) ? freq.to_sym : nil
41
57
  end
42
58
 
59
+ # @api private
60
+ # @private
43
61
  def self.parse_priority(root)
44
62
  priority = root.get_text("priority").try(:value) || "0.5"
45
63
  priority && Float(priority) rescue 0.5 # default priority according to spec
@@ -1,3 +1,3 @@
1
1
  module Sitemaps
2
- VERSION = "0.1.1".freeze
2
+ VERSION = "0.2.0".freeze
3
3
  end
data/sitemaps.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "vcr", "~> 3"
27
27
  spec.add_development_dependency "rubocop", "~> 0.38.0"
28
28
  spec.add_development_dependency "byebug", "~> 8.2"
29
+ spec.add_development_dependency "yard", "~> 0.8"
29
30
 
30
31
  spec.add_runtime_dependency "activesupport", "~> 4"
31
32
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemaps_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raphaelson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '8.2'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.8'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.8'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: activesupport
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -170,3 +184,4 @@ signing_key:
170
184
  specification_version: 4
171
185
  summary: Retrieve and parse sitemaps, according to the sitemaps.org spec.
172
186
  test_files: []
187
+ has_rdoc: