sitemaps_parser 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
4
- data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
3
+ metadata.gz: f6a5cf10a4446ce42e918767ee0279a81178951c
4
+ data.tar.gz: 878301d4d7ffd4429a0d4763d9ecb4ccca719039
5
5
  SHA512:
6
- metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
7
- data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
6
+ metadata.gz: 1a209890b6494fb17fed6c234656854a657dec8dc5d6012d752d6962339b8b5c06f8cd0facb15c96884f18bcabf4bb2a56aea53b84ad8326695ae43c4e57fe2b
7
+ data.tar.gz: 108c423e4969c262bf00959fa6ae4d5855c0d35faf10d21c282d3a05413e2f4e9178767dfce86da7872231dc48ac7a81338c72a6b1f809d73ff2d7e3e11a7c9d
data/README.md CHANGED
@@ -4,18 +4,14 @@
4
4
 
5
5
  Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
6
6
 
7
- ## TODO
8
-
9
- * discovery
10
- * sorting by last modified, or priority
11
- * filtering by last modified
7
+ See the [RDoc documentation](http://lygaret.github.io/sitemaps) for detailed documentation.
12
8
 
13
9
  ## Installation
14
10
 
15
11
  Add this line to your application's Gemfile:
16
12
 
17
13
  ```ruby
18
- gem 'sitemaps'
14
+ gem 'sitemaps_parser', require: 'sitemaps'
19
15
  ```
20
16
 
21
17
  And then execute:
@@ -24,7 +20,7 @@ And then execute:
24
20
 
25
21
  Or install it yourself as:
26
22
 
27
- $ gem install sitemaps
23
+ $ gem install sitemaps_parser
28
24
 
29
25
  ## Usage
30
26
 
@@ -35,15 +31,20 @@ require 'sitemaps'
35
31
  Sitemaps.parse("<xml ns=\"...")
36
32
 
37
33
  # fetch and parse a sitemap from a known url
38
- sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
34
+ sitemap = Sitemaps.fetch("http://termscout.com/sitemap.xml")
39
35
 
40
36
  # fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
41
37
  sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
42
38
  entry.loc.path !~ /blog/i
43
39
  end
44
40
 
41
+ # attempt to discover sitemaps for a site without a known sitemap location. Checks robots.txt and some common locations.
42
+ sitemap = Sitemaps.discover("https://www.digitalocean.com", max_entries: 200) do |entry|
43
+ entry.loc.path !~ /blog/i
44
+ end
45
+
45
46
  # sitemap usage
46
- sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
47
+ sitemap.entries.first #> Sitemaps::Entry(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
47
48
  urls = sitemap.entries.map(&:loc)
48
49
  ```
49
50
 
@@ -55,7 +56,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
55
56
 
56
57
  ## Contributing
57
58
 
58
- Bug reports and pull requests are welcome on GitHub at https://github.com/termscout/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
59
+ Bug reports and pull requests are welcome on GitHub at https://github.com/lygaret/sitemaps. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
59
60
 
60
61
  ## License
61
62
 
data/Rakefile CHANGED
@@ -4,3 +4,30 @@ require "rspec/core/rake_task"
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task default: :spec
7
+
8
+ require 'yard'
9
+ DOC_FILES = ['lib/**/*.rb']
10
+
11
+ YARD::Rake::YardocTask.new(:doc) do |t|
12
+ t.files = DOC_FILES
13
+ end
14
+
15
+ namespace :doc do
16
+ YARD::Rake::YardocTask.new(:pages) do |t|
17
+ t.files = DOC_FILES
18
+ t.options = ['-o', '../sitemaps.doc']
19
+ end
20
+
21
+ namespace :pages do
22
+ desc 'Generate and publish docs to gh-pages'
23
+ task publish: ['doc:pages'] do
24
+ Dir.chdir(File.dirname(__FILE__) + '/../sitemaps.doc') do
25
+ system 'git checkout gh-pages'
26
+ system 'git add .'
27
+ system 'git add -u'
28
+ system "git commit -m 'Generating docs for version #{Sitemaps::VERSION}.'"
29
+ system 'git push origin gh-pages'
30
+ end
31
+ end
32
+ end
33
+ end
data/lib/sitemaps.rb CHANGED
@@ -2,6 +2,7 @@ require "active_support"
2
2
  require "active_support/core_ext/object/try"
3
3
  require "active_support/core_ext/object/blank"
4
4
 
5
+ require "set"
5
6
  require "time"
6
7
  require "rexml/document"
7
8
 
@@ -11,66 +12,176 @@ require "sitemaps/fetcher"
11
12
 
12
13
  # Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
13
14
  module Sitemaps
15
+
16
+ # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
17
+ # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
18
+ # @attr changefreq [:always, :hourly, :daily, :weekly, :monthly, :yearly, :never, nil]
19
+ # the change frequency of this entry, or nil if unspecified.
20
+ # @attr priority [Float] the priority of this entry, a float from 0 to 1. 0.5 if unspecified.
14
21
  Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
22
+
23
+ # @attr loc [URI] the location referred to by this entry. Will never be `nil`.
24
+ # @attr lastmod [Time, nil] the last modification time of this entry, or `nil` if unspecified.
15
25
  Submap = Struct.new(:loc, :lastmod)
26
+
27
+ # @attr entries [Enumerable<Entry>] A set of entries that were parsed out of one or more sitemaps, recursively.
28
+ # @attr sitemaps [Enumerable<Sitemap>] A set of sitemaps that were found in a sitemap index.
16
29
  Sitemap = Struct.new(:entries, :sitemaps)
17
30
 
31
+ @default_fetcher = ->(u) { Sitemaps::Fetcher.fetch(u) }
32
+
33
+ # Parse a sitemap from an XML string. Does not fail on invalid documents, but doesn't include
34
+ # invalid entries in the final set. As such, a non-XML file, or non-sitemap XML file will return
35
+ # an empty sitemap.
36
+ #
37
+ # @param source [String] an XML string to parse as a sitemap.
38
+ # @return [Sitemap] the sitemap represented by the given XML string.
18
39
  def self.parse(source)
19
40
  Sitemaps::Parser.parse(source)
20
41
  end
21
42
 
22
- def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
23
- fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
24
- url = parse_url(url)
43
+ # Fetch and parse a sitemap from the given URL.
44
+ #
45
+ # @overload fetch(url, fetcher: nil, max_entries: nil)
46
+ # @param url [String, URI] the url of the sitemap in question.
47
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
48
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
49
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
50
+ # a good idea to include, as many sites have _very_ large sitemaps.
51
+ # @return [Sitemap]
52
+ #
53
+ # @overload fetch(url, fetcher: nil, max_entries: nil)
54
+ # If a block is given, it's used as a filter for entries before they're added to the sitemap.
55
+ #
56
+ # @param url [String, URI] the url of the sitemap in question.
57
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
58
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
59
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
60
+ # a good idea to include, as many sites have _very_ large sitemaps.
61
+ # @return [Sitemap]
62
+ # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
63
+ # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
64
+ def self.fetch(url, fetcher: nil, max_entries: nil, &block)
65
+ fetcher ||= @default_fetcher
66
+ unless url.is_a? URI
67
+ url = "http://#{url}" unless url =~ %r{^https?://}
68
+ url = URI.parse(url)
69
+ end
25
70
 
26
- recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
71
+ _instance.fetch_recursive(url, fetcher, max_entries, &block)
27
72
  end
28
73
 
29
- def self.fetch_single(url, fetch, max_entries, &block)
30
- url = parse_url(url)
31
- source = fetch.call(url)
74
+ # Discover, fetch and parse sitemaps from the given host.
75
+ #
76
+ # Attempts to find and fetch sitemaps at a given host, by examining the `robots.txt` at that
77
+ # host, or if no sitemaps are found via `robots.txt`, checking a small number of common locations,
78
+ # including `sitemap.xml`, `sitemap_index.xml`, and the gzip versions of those same locations.
79
+ #
80
+ # @overload discover(host, fetcher: nil, max_entries: nil)
81
+ # @param host [String, URI] the url of the host to interrogate for sitemaps.
82
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
83
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
84
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
85
+ # a good idea to include, as many sites have _very_ large sitemaps.
86
+ # @return [Sitemap]
87
+ #
88
+ # @overload discover(host, fetcher: nil, max_entries: nil)
89
+ # If a block is given, it's used as a filter for entries before they're added to the sitemap.
90
+ #
91
+ # @param host [String, URI] the url of the host to interrogate for sitemaps.
92
+ # @param fetcher [#call] given a URI, fetch an HTTP document. Defaults to using `Fetcher`.
93
+ # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
94
+ # sitemap has this many entries, further fetches and parsing will not occur. This is always
95
+ # a good idea to include, as many sites have _very_ large sitemaps.
96
+ # @return [Sitemap]
97
+ # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
98
+ # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
99
+ def self.discover(url, fetcher: nil, max_entries: nil, &block)
100
+ fetcher ||= @default_fetcher
101
+ unless url.is_a? URI
102
+ url = "http://#{url}" unless url =~ %r{^https?://}
103
+ url = URI.parse(url)
104
+ end
105
+
106
+ roots = _instance.discover_roots(url, fetcher)
107
+ _instance.fetch_recursive(roots, fetcher, max_entries, &block)
108
+ end
32
109
 
33
- Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
110
+ # @return [Instance]
111
+ # @private
112
+ # @api private
113
+ def self._instance
114
+ @instance ||= Sitemaps::Instance.new
34
115
  end
35
116
 
36
- def self.fetch_recursive(url, fetch, max_entries, &block)
37
- queue = [parse_url(url)]
38
- maps = {}
39
-
40
- # walk the queue, fetching the sitemap requested and adding
41
- # new sitemaps to the queue as found
42
- loop do
43
- begin
44
- url = queue.pop
45
- break if url.nil?
46
- next unless maps[url].nil?
47
-
48
- # fetch this item in the queue, and queue up any sub maps it found
49
- maps[url] = fetch_single(url, fetch, max_entries, &block)
50
- queue.push(*maps[url].sitemaps.map(&:loc))
51
-
52
- # decrement max_entries (since it's max_entries total, not per map)
53
- unless max_entries.nil?
54
- max_entries -= maps[url].entries.length
55
- break if max_entries <= 0
117
+ # Holder for methods that shouldn't be exposed as public API
118
+ # @private
119
+ # @api private
120
+ class Instance
121
+ # recursively fetch sitemaps and sitemap indexes from the given urls.
122
+ # @return [Sitemap]
123
+ def fetch_recursive(urls, fetcher, max_entries, &block)
124
+ queue = urls.is_a?(Array) ? urls : [urls]
125
+ maps = {}
126
+
127
+ # walk the queue, fetching the sitemap requested and adding
128
+ # new sitemaps to the queue as found
129
+ loop do
130
+ begin
131
+ url = queue.pop
132
+ break if url.nil?
133
+ next unless maps[url].nil?
134
+
135
+ # fetch this item in the queue, and queue up any sub maps it found
136
+ source = fetcher.call(url)
137
+ sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
138
+
139
+ # save the results and queue up any submaps it found
140
+ maps[url] = sitemap
141
+ queue.push(*sitemap.sitemaps.map(&:loc))
142
+
143
+ # decrement max_entries (since it's max_entries total, not per map)
144
+ unless max_entries.nil?
145
+ max_entries -= maps[url].entries.length
146
+ break if max_entries <= 0
147
+ end
148
+ rescue => ex
149
+ # otherwise keep on going, because we've got something at least
150
+ $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
151
+ next
56
152
  end
57
- rescue => ex
58
- $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
59
- next
60
153
  end
61
- end
62
154
 
63
- # collapse the recovered maps into a single one with everything
64
- maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
65
- result.sitemaps.concat(map.sitemaps)
66
- result.entries.concat(map.entries)
155
+ # collapse the recovered maps into a single one with everything
156
+ maps.each_with_object(Sitemap.new([], [])) do |(_, map), result|
157
+ result.sitemaps.concat(map.sitemaps).uniq! { |e| e.loc.to_s }
158
+ result.entries.concat(map.entries).uniq! { |e| e.loc.to_s }
159
+ end
67
160
  end
68
- end
69
161
 
70
- def self.parse_url(url)
71
- return url if url.is_a? URI
162
+ # interrogate a host for sitemaps from robots.txt, or return some potential locations.
163
+ # @return [Array<URI>]
164
+ def discover_roots(url, fetcher)
165
+ robots = begin
166
+ robotsurl = url.clone
167
+ robotsurl.path = "/robots.txt"
168
+ robotstxt = fetcher.call(robotsurl)
72
169
 
73
- url = "http://#{url}" unless url =~ %r{^https?://}
74
- URI.parse(url)
170
+ discovered = robotstxt.scan(/^Sitemap: (.+)$/).flatten.reverse.map { |u| URI.parse(u) }
171
+ discovered.empty? ? nil : discovered
172
+ rescue
173
+ nil
174
+ end
175
+
176
+ # try for files in a handful of known locations
177
+ known_locations = %w(/sitemap_index.xml.gz /sitemap_index.xml /sitemap.xml.gz /sitemap.xml)
178
+ known_locations = known_locations.lazy.map do |path|
179
+ pathurl = url.clone
180
+ pathurl.path = path
181
+ pathurl
182
+ end
183
+
184
+ robots || known_locations.to_a
185
+ end
75
186
  end
76
187
  end
@@ -1,14 +1,29 @@
1
1
  module Sitemaps
2
- # Simple single purpose HTTP client
2
+ # Simple single purpose HTTP client. Uses `Net::HTTP` directly, so as to not incur dependencies.
3
3
  module Fetcher
4
- class FetchError < StandardError; end
4
+ class FetchError < StandardError; end
5
5
  class MaxRedirectError < StandardError; end
6
6
 
7
7
  @max_attempts = 10
8
8
 
9
+ # Fetch the given URI.
10
+ #
11
+ # Handles redirects (up to 10 times), and additionally will inflate a body delivered without
12
+ # a content-encoding header, but with a `.gz` as the end of the path.
13
+ #
14
+ # @param uri [String, URI] the URI to fetch.
15
+ # @return [String]
16
+ # @raise [FetchError] if the server responds with an HTTP status that's not 2xx.
17
+ # @raise [MaxRedirectError] if more than 10 redirects have occurred while attempting to fetch the resource.
9
18
  def self.fetch(uri)
10
19
  attempts = 0
11
20
 
21
+ # we only work on URI objects
22
+ unless uri.is_a? URI
23
+ uri = "http://#{uri}" unless uri =~ %r{^https?://}
24
+ uri = URI.parse(uri)
25
+ end
26
+
12
27
  until attempts >= @max_attempts
13
28
  resp = Net::HTTP.get_response(uri)
14
29
 
@@ -1,6 +1,16 @@
1
1
  module Sitemaps
2
2
  # Parse XML Sitemaps
3
3
  module Parser
4
+ VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
5
+
6
+ # Given a source string, returns a sitemap containing all valid url entries, or all valid sub-sitemaps.
7
+ # See `http://sitemaps.org` for information on the spec.
8
+ #
9
+ # @param source [String] an XML string to parse.
10
+ # @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
11
+ # @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
12
+ # @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
13
+ # a sitemap will still be returned, but the entries and sitemaps keys will be empty.
4
14
  def self.parse(source, max_entries: nil, filter: nil)
5
15
  document = REXML::Document.new(source)
6
16
  entries = document.elements.to_a("/urlset/url").map do |root|
@@ -12,6 +22,7 @@ module Sitemaps
12
22
  entry = Sitemaps::Entry.new(loc, mod, freq, pri)
13
23
  (!filter || filter.call(entry)) ? entry : nil
14
24
  end.reject(&:nil?)
25
+ entries = entries.uniq(&:loc)
15
26
  entries = entries.take(max_entries) unless max_entries.nil?
16
27
 
17
28
  sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
@@ -24,22 +35,29 @@ module Sitemaps
24
35
  Sitemaps::Sitemap.new(entries, sitemaps)
25
36
  end
26
37
 
38
+ # @api private
39
+ # @private
27
40
  def self.parse_loc(root)
28
41
  loc = root.get_text("loc").try(:value)
29
42
  loc && URI.parse(loc) rescue nil
30
43
  end
31
44
 
45
+ # @api private
46
+ # @private
32
47
  def self.parse_lastmod(root)
33
48
  mod = root.get_text("lastmod").try(:value)
34
49
  mod && Time.parse(mod) rescue nil
35
50
  end
36
51
 
37
- VALID_CHANGEFREQ = %w(always hourly daily weekly monthly yearly never).freeze
52
+ # @api private
53
+ # @private
38
54
  def self.parse_changefreq(root)
39
55
  freq = root.get_text("changefreq").try(:value)
40
56
  freq && VALID_CHANGEFREQ.include?(freq) ? freq.to_sym : nil
41
57
  end
42
58
 
59
+ # @api private
60
+ # @private
43
61
  def self.parse_priority(root)
44
62
  priority = root.get_text("priority").try(:value) || "0.5"
45
63
  priority && Float(priority) rescue 0.5 # default priority according to spec
@@ -1,3 +1,3 @@
1
1
  module Sitemaps
2
- VERSION = "0.1.1".freeze
2
+ VERSION = "0.2.0".freeze
3
3
  end
data/sitemaps.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "vcr", "~> 3"
27
27
  spec.add_development_dependency "rubocop", "~> 0.38.0"
28
28
  spec.add_development_dependency "byebug", "~> 8.2"
29
+ spec.add_development_dependency "yard", "~> 0.8"
29
30
 
30
31
  spec.add_runtime_dependency "activesupport", "~> 4"
31
32
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemaps_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raphaelson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-22 00:00:00.000000000 Z
11
+ date: 2016-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '8.2'
111
+ - !ruby/object:Gem::Dependency
112
+ name: yard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.8'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.8'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: activesupport
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -170,3 +184,4 @@ signing_key:
170
184
  specification_version: 4
171
185
  summary: Retrieve and parse sitemaps, according to the sitemaps.org spec.
172
186
  test_files: []
187
+ has_rdoc: