sitemaps_parser 0.1.1 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
-  data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
+  metadata.gz: f6a5cf10a4446ce42e918767ee0279a81178951c
+  data.tar.gz: 878301d4d7ffd4429a0d4763d9ecb4ccca719039
 SHA512:
-  metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
-  data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
+  metadata.gz: 1a209890b6494fb17fed6c234656854a657dec8dc5d6012d752d6962339b8b5c06f8cd0facb15c96884f18bcabf4bb2a56aea53b84ad8326695ae43c4e57fe2b
+  data.tar.gz: 108c423e4969c262bf00959fa6ae4d5855c0d35faf10d21c282d3a05413e2f4e9178767dfce86da7872231dc48ac7a81338c72a6b1f809d73ff2d7e3e11a7c9d
data/README.md CHANGED
@@ -4,18 +4,14 @@
 
 Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
 
-## TODO
-
-* discovery
-* sorting by last modified, or priority
-* filtering by last modified
+See [RDOC Documentation](http://lygaret.github.io/sitemaps) for detailed documentation.
 
 ## Installation
 
 Add this line to your application's Gemfile:
 
 ```ruby
-gem 'sitemaps'
+gem 'sitemaps_parser', require: 'sitemaps'
 ```
 
 And then execute:
@@ -24,7 +20,7 @@ And then execute:
 
 Or install it yourself as:
 
-    $ gem install sitemaps
+    $ gem install sitemaps_parser
 
 ## Usage
 
@@ -35,15 +31,20 @@ require 'sitemaps'
 Sitemaps.parse("<xml ns=\"...")
 
 # fetch and parse a sitemap from a known url
-sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
+sitemap = Sitemaps.fetch("http://termscout.com/sitemap.xml")
 
 # fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
 sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
   entry.loc.path !~ /blog/i
 end
 
+# attempt to discover sitemaps for a site without a known sitemap location. Checks robots.txt and some common locations.
+sitemap = Sitemaps.discover("https://www.digitalocean.com", max_entries: 200) do |entry|
+  entry.loc.path !~ /blog/i
+end
+
 # sitemap usage
-sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
+sitemap.entries.first #> Sitemaps::Entry(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
 urls = sitemap.entries.map(&:loc)
 ```
 
@@ -55,7 +56,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
 ## Contributing
 
-Bug reports and pull requests are welcome on GitHub at https://github.com/termscout/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+Bug reports and pull requests are welcome on GitHub at https://github.com/lygaret/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
 
 ## License
 
data/Rakefile CHANGED
@@ -4,3 +4,30 @@ require "rspec/core/rake_task"
 RSpec::Core::RakeTask.new(:spec)
 
 task default: :spec
+
+require 'yard'
+DOC_FILES = ['lib/**/*.rb']
+
+YARD::Rake::YardocTask.new(:doc) do |t|
+  t.files = DOC_FILES
+end
+
+namespace :doc do
+  YARD::Rake::YardocTask.new(:pages) do |t|
+    t.files = DOC_FILES
+    t.options = ['-o', '../sitemaps.doc']
+  end
+
+  namespace :pages do
+    desc 'Generate and publish docs to gh-pages'
+    task publish: ['doc:pages'] do
+      Dir.chdir(File.dirname(__FILE__) + '/../sitemaps.doc') do
+        system 'git checkout gh-pages'
+        system 'git add .'
+        system 'git add -u'
+        system "git commit -m 'Generating docs for version #{Sitemaps::VERSION}.'"
+        system 'git push origin gh-pages'
+      end
+    end
+  end
+end
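
For reference, the new tasks run like any other Rake task; a sketch of typical usage, assuming a sibling `../sitemaps.doc` checkout tracking `gh-pages`, as the `Dir.chdir` above implies:

    $ bundle exec rake doc                 # generate YARD docs locally (./doc by default)
    $ bundle exec rake doc:pages:publish   # regenerate into ../sitemaps.doc, commit, and push gh-pages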
data/lib/sitemaps.rb CHANGED
@@ -2,6 +2,7 @@ require "active_support"
 require "active_support/core_ext/object/try"
 require "active_support/core_ext/object/blank"
 
+require "set"
 require "time"
 require "rexml/document"
 
@@ -11,66 +12,176 @@ require "sitemaps/fetcher"
 
 # Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
 module Sitemaps
+
+  # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
+  # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
+  # @attr changefreq [:always, :hourly, :daily, :weekly, :monthly, :yearly, :never, nil]
+  #   the change frequency of this entry, or nil if unspecified.
+  # @attr priority [Float] the priority of this entry, a float from 0 to 1. 0.5 if unspecified.
   Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
+
+  # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
+  # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
   Submap = Struct.new(:loc, :lastmod)
+
+  # @attr entries [Enumerable<Entry>] A set of entries that were parsed out of one or more sitemaps, recursively.
+  # @attr sitemaps [Enumerable<Sitemap>] A set of sitemaps that were found in a sitemap index.
   Sitemap = Struct.new(:entries, :sitemaps)
 
+  @default_fetcher = ->(u) { Sitemaps::Fetcher.fetch(u) }
+
+  # Parse a sitemap from an XML string. Does not fail on invalid documents, but doesn't include
+  # invalid entries in the final set. As such, a non-XML file, or non-sitemap XML file will return
+  # an empty sitemap.
+  #
+  # @param source [String] an XML string to parse as a sitemap.
+  # @return [Sitemap] the sitemap represented by the given XML string.
   def self.parse(source)
     Sitemaps::Parser.parse(source)
   end
 
-  def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
-    fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
-    url = parse_url(url)
+  # Fetch and parse a sitemap from the given URL.
+  #
+  # @overload fetch(url, fetcher: nil, max_entries: nil)
+  #   @param url [String, URI] the url of the sitemap in question.
+  #   @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
+  #   @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
+  #     sitemap has this many entries, further fetches and parsing will not occur. This is always
+  #     a good idea to include, as many sites have _very_ large sitemaps.
+  #   @return [Sitemap]
+  #
+  # @overload fetch(url, fetcher: nil, max_entries: nil)
+  #   If a block is given, it's used as a filter for entries before they're added to the sitemap.
+  #
+  #   @param url [String, URI] the url of the sitemap in question.
+  #   @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
+  #   @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
+  #     sitemap has this many entries, further fetches and parsing will not occur. This is always
+  #     a good idea to include, as many sites have _very_ large sitemaps.
+  #   @return [Sitemap]
+  #   @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
+  #   @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
+  def self.fetch(url, fetcher: nil, max_entries: nil, &block)
+    fetcher ||= @default_fetcher
+    unless url.is_a? URI
+      url = "http://#{url}" unless url =~ %r{^https?://}
+      url = URI.parse(url)
+    end
 
-    recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
+    _instance.fetch_recursive(url, fetcher, max_entries, &block)
   end
 
-  def self.fetch_single(url, fetch, max_entries, &block)
-    url = parse_url(url)
-    source = fetch.call(url)
+  # Discover, fetch and parse sitemaps from the given host.
+  #
+  # Attempts to find and fetch sitemaps at a given host, by examining the `robots.txt` at that
+  # host, or if no sitemaps are found via `robots.txt`, checking a small number of common locations,
+  # including `sitemap.xml`, `sitemap_index.xml`, and the gzip versions of those same locations.
+  #
+  # @overload discover(host, fetcher: nil, max_entries: nil)
+  #   @param host [String, URI] the url of the host to interrogate for sitemaps.
+  #   @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
+  #   @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
+  #     sitemap has this many entries, further fetches and parsing will not occur. This is always
+  #     a good idea to include, as many sites have _very_ large sitemaps.
+  #   @return [Sitemap]
+  #
+  # @overload discover(host, fetcher: nil, max_entries: nil)
+  #   If a block is given, it's used as a filter for entries before they're added to the sitemap.
+  #
+  #   @param host [String, URI] the url of the host to interrogate for sitemaps.
+  #   @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
+  #   @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
+  #     sitemap has this many entries, further fetches and parsing will not occur. This is always
+  #     a good idea to include, as many sites have _very_ large sitemaps.
+  #   @return [Sitemap]
+  #   @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
+  #   @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
+  def self.discover(url, fetcher: nil, max_entries: nil, &block)
+    fetcher ||= @default_fetcher
+    unless url.is_a? URI
+      url = "http://#{url}" unless url =~ %r{^https?://}
+      url = URI.parse(url)
+    end
+
+    roots = _instance.discover_roots(url, fetcher)
+    _instance.fetch_recursive(roots, fetcher, max_entries, &block)
+  end
 
-    Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
+  # @return [Instance]
+  # @private
+  # @api private
+  def self._instance
+    @instance ||= Sitemaps::Instance.new
   end
 
-  def self.fetch_recursive(url, fetch, max_entries, &block)
-    queue = [parse_url(url)]
-    maps = {}
-
-    # walk the queue, fetching the sitemap requested and adding
-    # new sitemaps to the queue as found
-    loop do
-      begin
-        url = queue.pop
-        break if url.nil?
-        next unless maps[url].nil?
-
-        # fetch this item in the queue, and queue up any sub maps it found
-        maps[url] = fetch_single(url, fetch, max_entries, &block)
-        queue.push(*maps[url].sitemaps.map(&:loc))
-
-        # decrement max_entries (since it's max_entries total, not per map)
-        unless max_entries.nil?
-          max_entries -= maps[url].entries.length
-          break if max_entries <= 0
+  # Holder for methods that shouldn't be exposed as public API
+  # @private
+  # @api private
+  class Instance
+    # recursively fetch sitemaps and sitemap indexes from the given urls.
+    # @return [Sitemap]
+    def fetch_recursive(urls, fetcher, max_entries, &block)
+      queue = urls.is_a?(Array) ? urls : [urls]
+      maps = {}
+
+      # walk the queue, fetching the sitemap requested and adding
+      # new sitemaps to the queue as found
+      loop do
+        begin
+          url = queue.pop
+          break if url.nil?
+          next unless maps[url].nil?
+
+          # fetch this item in the queue, and queue up any sub maps it found
+          source = fetcher.call(url)
+          sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
+
+          # save the results and queue up any submaps it found
+          maps[url] = sitemap
+          queue.push(*sitemap.sitemaps.map(&:loc))
+
+          # decrement max_entries (since it's max_entries total, not per map)
+          unless max_entries.nil?
+            max_entries -= maps[url].entries.length
+            break if max_entries <= 0
+          end
+        rescue => ex
+          # otherwise keep on going, because we've got something at least
+          $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
+          next
         end
-      rescue => ex
-        $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
-        next
       end
-    end
 
-    # collapse the recovered maps into a single one with everything
-    maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
-      result.sitemaps.concat(map.sitemaps)
-      result.entries.concat(map.entries)
+      # collapse the recovered maps into a single one with everything
+      maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
+        result.sitemaps.concat(map.sitemaps).uniq! { |e| e.loc.to_s }
+        result.entries.concat(map.entries).uniq! { |e| e.loc.to_s }
+      end
     end
-  end
 
-  def self.parse_url(url)
-    return url if url.is_a? URI
+    # interrogate a host for sitemaps from robots.txt, or return some potential locations.
+    # @return [Array<URI>]
+    def discover_roots(url, fetcher)
+      robots = begin
+        robotsurl = url.clone
+        robotsurl.path = "/robots.txt"
+        robotstxt = fetcher.call(robotsurl)
 
-    url = "http://#{url}" unless url =~ %r{^https?://}
-    URI.parse(url)
+        discovered = robotstxt.scan(/^Sitemap: (.+)$/).flatten.reverse.map { |u| URI.parse(u) }
+        discovered.empty? ? nil : discovered
+      rescue
+        nil
+      end
+
+      # try for files in a handful of known locations
+      known_locations = %w(/sitemap_index.xml.gz /sitemap_index.xml /sitemap.xml.gz /sitemap.xml)
+      known_locations = known_locations.lazy.map do |path|
+        pathurl = url.clone
+        pathurl.path = path
+        pathurl
+      end
+
+      robots || known_locations.to_a
+    end
   end
 end
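
Taken together, the 0.2.0 surface is `Sitemaps.parse`, `Sitemaps.fetch`, and `Sitemaps.discover`, each taking an optional `fetcher:` callable and `max_entries:` cap. A minimal sketch of injecting a custom fetcher (the lambda and URL here are illustrative only; unlike the default `Fetcher`, this naive version follows no redirects and inflates no gzip):

```ruby
require 'net/http'
require 'sitemaps'

# anything responding to #call, taking a URI and returning a body string, works
naive_fetcher = ->(uri) { Net::HTTP.get_response(uri).body }

sitemap = Sitemaps.fetch("http://example.com/sitemap.xml",
                         fetcher: naive_fetcher, max_entries: 100) do |entry|
  entry.priority >= 0.5 # filter block: keep only higher-priority entries
end

sitemap.entries.map(&:loc) # => array of URI objects
```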
data/lib/sitemaps/fetcher.rb CHANGED
@@ -1,14 +1,29 @@
 module Sitemaps
-  # Simple single purpose HTTP client
+  # Simple single purpose HTTP client. Uses `Net::HTTP` directly, so as to not incur dependencies.
   module Fetcher
-    class FetchError < StandardError; end
+    class FetchError < StandardError; end
     class MaxRedirectError < StandardError; end
 
     @max_attempts = 10
 
+    # Fetch the given URI.
+    #
+    # Handles redirects (up to 10 times), and additionally will inflate a body delivered without
+    # a content-encoding header, but with a `.gz` as the end of the path.
+    #
+    # @param uri [String, URI] the URI to fetch.
+    # @return [String]
+    # @raise [FetchError] if the server responds with an HTTP status that's not 2xx.
+    # @raise [MaxRedirectError] if more than 10 redirects have occurred while attempting to fetch the resource.
     def self.fetch(uri)
       attempts = 0
 
+      # we only work on URI objects
+      unless uri.is_a? URI
+        uri = "http://#{uri}" unless uri =~ %r{^https?://}
+        uri = URI.parse(uri)
+      end
+
       until attempts >= @max_attempts
         resp = Net::HTTP.get_response(uri)
 
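
Because `Fetcher.fetch` now coerces strings to `URI` itself, it can also be called directly; a sketch of handling its two error classes (the URL is illustrative):

```ruby
require 'sitemaps'

begin
  # bare strings are upgraded to http:// URIs before fetching
  body = Sitemaps::Fetcher.fetch("example.com/sitemap.xml.gz")
rescue Sitemaps::Fetcher::FetchError => ex
  warn "non-success HTTP status: #{ex.message}"
rescue Sitemaps::Fetcher::MaxRedirectError
  warn "gave up after 10 redirects"
end
```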
data/lib/sitemaps/parser.rb CHANGED
@@ -1,6 +1,16 @@
 module Sitemaps
   # Parse XML Sitemaps
   module Parser
+    VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
+
+    # Given a source string, returns a sitemap containing all valid url entries, or all valid sub-sitemaps.
+    # See `http://sitemaps.org` for information on the spec.
+    #
+    # @param source [String] an XML string to parse.
+    # @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
+    # @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
+    # @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
+    #   a sitemap will still be returned, but the entries and sitemaps keys will be empty.
     def self.parse(source, max_entries: nil, filter: nil)
       document = REXML::Document.new(source)
       entries = document.elements.to_a("/urlset/url").map do |root|
@@ -12,6 +22,7 @@ module Sitemaps
         entry = Sitemaps::Entry.new(loc, mod, freq, pri)
         (!filter || filter.call(entry)) ? entry : nil
       end.reject(&:nil?)
+      entries = entries.uniq(&:loc)
       entries = entries.take(max_entries) unless max_entries.nil?
 
       sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
@@ -24,22 +35,29 @@ module Sitemaps
       Sitemaps::Sitemap.new(entries, sitemaps)
     end
 
+    # @api private
+    # @private
     def self.parse_loc(root)
       loc = root.get_text("loc").try(:value)
       loc && URI.parse(loc) rescue nil
     end
 
+    # @api private
+    # @private
     def self.parse_lastmod(root)
       mod = root.get_text("lastmod").try(:value)
       mod && Time.parse(mod) rescue nil
     end
 
-    VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
+    # @api private
+    # @private
     def self.parse_changefreq(root)
       freq = root.get_text("changefreq").try(:value)
       freq && VALID_CHANGEFREQ.include?(freq) ? freq.to_sym : nil
     end
 
+    # @api private
+    # @private
     def self.parse_priority(root)
       priority = root.get_text("priority").try(:value) || "0.5"
       priority && Float(priority) rescue 0.5 # default priority according to spec
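
The new `entries.uniq(&:loc)` means duplicate locations are now dropped before the `max_entries` cap is applied. A small sketch of that behavior (input XML is hypothetical; entries without `<priority>` default to 0.5 per the spec):

```ruby
require 'sitemaps'

xml = <<-XML
  <urlset>
    <url><loc>http://example.com/a</loc><priority>0.9</priority></url>
    <url><loc>http://example.com/a</loc></url>
    <url><loc>http://example.com/b</loc></url>
  </urlset>
XML

# the duplicate /a entry is removed, leaving two entries within the cap
sitemap = Sitemaps::Parser.parse(xml, max_entries: 2)
sitemap.entries.map { |e| e.loc.to_s } # => ["http://example.com/a", "http://example.com/b"]
```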
data/lib/sitemaps/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Sitemaps
-  VERSION = "0.1.1".freeze
+  VERSION = "0.2.0".freeze
 end
data/sitemaps.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "vcr", "~> 3"
   spec.add_development_dependency "rubocop", "~> 0.38.0"
   spec.add_development_dependency "byebug", "~> 8.2"
+  spec.add_development_dependency "yard", "~> 0.8"
 
   spec.add_runtime_dependency "activesupport", "~> 4"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sitemaps_parser
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - Jonathan Raphaelson
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-22 00:00:00.000000000 Z
+date: 2016-03-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,6 +108,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '8.2'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
 - !ruby/object:Gem::Dependency
   name: activesupport
   requirement: !ruby/object:Gem::Requirement
@@ -170,3 +184,4 @@ signing_key:
 specification_version: 4
 summary: Retrieve and parse sitemaps, according to the sitemaps.org spec.
 test_files: []
+has_rdoc: