varnisher 1.0.beta.2 → 1.0.beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4a9d710584e6d43f0f925cd4894f536fa1630551
- data.tar.gz: 9d22ce540f64d36683db840c395c03c1096a88cd
+ metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+ data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
  SHA512:
- metadata.gz: 7afaedc98a7557689c4908da0ba13e54f674d70ec3fd48543f0bf4181b016d61d05f532196683f3dfd97ff8e44ddde417afb4e116bfdf941af73068664531327
- data.tar.gz: 55cd0e503a1152418c84e3a1876ee75a38b2c6450ea5e9495b0cd2d2d55bf608d8ef07939e84f170cfc19d2c665f8863cafb5e6aeb21241a71e8fc71caa706f2
+ metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+ data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
  * Purge an entire domain, including optionally re-spidering it
  afterwards to keep the cache warm

+ Full documentation is available [on
+ rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
  ## Installation

  Varnish requires Ruby >1.9.3 to run. If you've got a recent Ruby
- installed, then Varnisher can be installed by running:
+ installed, then Varnisher can be installed easily via RubyGems.
+
+ Varnisher is still in beta; you can install it with:

- gem install varnisher
+ gem install varnisher --pre

  ## Usage

@@ -52,9 +57,9 @@ you want to paste and override them:
  verbose: false
  hostname: localhost
  port: 80
- num_pages: 100
- ignore_hash: true
- ignore_query_string: false
+ num-pages: -1
+ ignore-hashes: true
+ ignore-query-strings: false

  ## Examples

@@ -89,7 +94,7 @@ which is fairly standard:

  (For an explanation of just what `obj.http.x-url` means, and why you
  should use it rather than `req.url`, see [this
- page](http://kristianlyng.wordpress.com/2010/07/28/smart-bans-with-varnish/).)
+ page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)

  ### Purging an entire domain

data/bin/varnisher CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
  require 'varnisher'

  Main {
- examples "varnisher purge http://example.com", "varnisher spider example.com", "varnisher purge --reindex example.com"
+ examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'

- description "Varnisher is a set of tools for working with the Varnish HTTP cache."
+ description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'

  argument 'target'

@@ -19,89 +19,105 @@ Main {
  description "If given, Varnisher will be noisier about what it's up to."
  }

+ option('q', 'quiet') {
+ description 'If given, Varnisher will be silent apart from errors.'
+ }
+
  option('H', 'hostname') {
  argument :required
- description "The hostname/IP address of your Varnish server."
- default "localhost"
+ description 'The hostname/IP address of your Varnish server.'
  }

  option('p', 'port') {
  argument :required
  cast :int
- description "The port Varnish is listening on."
- default 80
+ description 'The port Varnish is listening on.'
+ }
+
+ option('o', 'output-file') {
+ argument :required
+ description 'A file to output log information to. If not given, output will be printed to STDOUT'
  }

  def before_run
  load_config
  end

- mode "purge" do
- argument('target') { description "The URL or hostname to purge" }
+ mode 'purge' do
+ argument('target') { description 'The URL or hostname to purge' }

  option('reindex') {
- description "If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible."
+ description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
  }

  def run
  target = params['target'].value

  # If target is a valid URL, then assume we're purging a page and its contents.
- if target =~ /^[a-z]+:\/\//
- Varnisher::PagePurger.new target
- end
-
+ if target =~ %r(^[a-z]+://)
+ purger = Varnisher::PagePurger.new target
+ purger.purge
  # If target is a hostname, assume we want to purge an entire domain.
- if target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
+ elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
  Varnisher::DomainPurger.new target

  if params['reindex'].given?
- Varnisher::Spider.new "http://#{target}/"
+ spider = Varnisher::Spider.new "http://#{target}/"
+ spider.run
  end
  end
  end
  end

- mode "spider" do
- argument('target') { description "The URL to begin spidering from." }
+ mode 'spider' do
+ argument('target') { description 'The URL to begin spidering from.' }

  option('n', 'num-pages') {
  argument :required
  cast :int
- description "Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit."
- default -1
+ description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
  }

  option('t', 'threads') {
  argument :required
  cast :int
- description "Spidering is done in parallel; this variable controls how many threads will be used."
- default 16
+ description 'Spidering is done in parallel; this variable controls how many threads will be used.'
  }

  option('#', 'ignore-hashes') {
- description "When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource."
+ description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
  }

  option('q', 'ignore-query-strings') {
- description "When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource."
+ description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
  }

  def run
  target = params['target'].value

- Varnisher::Spider.new target
+ spider = Varnisher::Spider.new target
+ spider.run
  end
  end

  def load_config
- $options = params.to_options
+ # Start with our default options.
+ options = Varnisher.options

- rcfile = File.expand_path("~/.varnishrc")
+ # Check the user's RC file -- if it exists -- to see if they've
+ # specified any defaults of their own.
+ rcfile = File.expand_path('~/.varnishrc')
  if FileTest.readable? rcfile
- rc = YAML::load(File.open(rcfile))
- $options.merge!(rc)
+ rc = YAML.load(File.open(rcfile))
+ options.merge!(rc)
  end
+
+ # The highest priority is given to command line arguments, so that
+ # the user can override things that are in their RC file if they
+ # choose to.
+ options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+ Varnisher.options = options
  end
  }

data/lib/varnisher.rb CHANGED
@@ -1,4 +1,65 @@
  require_relative 'varnisher/spider'
+ require_relative 'varnisher/purger'
  require_relative 'varnisher/domainpurger'
  require_relative 'varnisher/pagepurger'

+ require 'logger'
+
+ # This module is a namespace for our main functionality:
+ #
+ # * {Varnisher::Spider}
+ # * {Varnisher::DomainPurger}
+ # * {Varnisher::PagePurger}
+ module Varnisher
+ # Our default options are set here; they can be overriden either by
+ # command-line arguments or by settings in a user's ~/.varnishrc file.
+ @options = {
+ 'verbose' => false,
+ 'quiet' => false,
+ 'hostname' => nil,
+ 'port' => 80,
+ 'num-pages' => -1,
+ 'threads' => 16,
+ 'ignore-hashes' => true,
+ 'ignore-query-strings' => false,
+ 'output-file' => nil
+ }
+
+ def self.options
+ @options
+ end
+
+ def self.options=(options)
+ @options = options
+
+ if options['hostname'].nil? && options['target']
+ uri = URI.parse(options['target'])
+ options['hostname'] = uri.host
+ end
+
+ start_logging
+ end
+
+ # Sets up our Logger object, which will write output either to STDOUT
+ # (the default) or to the specified file.
+ def self.start_logging
+ output = @options['output-file'] || STDOUT
+ @log = Logger.new(output)
+
+ # By default, only display the log message, nothing else.
+ @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+ @log.level = if @options['verbose']
+ Logger::DEBUG
+ elsif @options['quiet']
+ Logger::FATAL
+ else
+ Logger::INFO
+ end
+ end
+
+ def self.log
+ @log
+ end
+ end
+
data/lib/varnisher/domainpurger.rb CHANGED
@@ -1,27 +1,27 @@
  require 'net/http'

- # This requires a special bit of VCL:
- #
- # if ( req.request == "DOMAINPURGE" ) {
- # if ( client.ip ~ auth ) {
- # ban("obj.http.x-host == " + req.http.host);
- # error 200 "Purged.";
- # }
- # }
-
  module Varnisher
+ # Purges an entire domain from the Varnish cache.
+ #
+ # This requires a special bit of VCL in your Varnish configuration:
+ #
+ # if ( req.request == "DOMAINPURGE" ) {
+ # if ( client.ip ~ auth ) {
+ # ban("obj.http.x-host == " + req.http.host);
+ # error 200 "Purged.";
+ # }
+ # }
  class DomainPurger
+ # Executes the purge request.
+ #
+ # @param domain [String] The hostname to purge
  def initialize(domain)
- s = TCPSocket.open($options['hostname'], $options['port'])
- s.print("DOMAINPURGE / HTTP/1.1\r\nHost: #{domain}\r\n\r\n")
-
- if s.read =~ /HTTP\/1\.1 200 Purged\./
- puts "Purged #{domain}"
+ purged = Varnisher.purge(domain, :domain)
+ if purged
+ Varnisher.log.info "Purged #{domain}"
  else
- puts "Failed to purge #{domain}"
+ Varnisher.log.info "Failed to purge #{domain}"
  end
-
- s.close
  end
  end
  end
data/lib/varnisher/pagepurger.rb CHANGED
@@ -1,180 +1,156 @@
  require 'rubygems'
- require 'hpricot'
+ require 'nokogiri'
  require 'net/http'
  require 'parallel'

  module Varnisher
+ # Purges an individual URL from Varnish.
  class PagePurger
-
+ # A bash at an abstract representation of resources. All you need
+ # is an XPath, and what attribute to select from the matched
+ # elements.
+ Resource = Struct.new :name, :selector, :attribute
+ def self.resources
+ [
+ Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+ Resource.new('JavaScript file', 'script[src]', 'src'),
+ Resource.new('image file', 'img[src]', 'src')
+ ]
+ end
+
+ # Purges the given URL from the Varnish cache.
+ #
+ # Will also purge all of the resources it finds on that page (e.g.
+ # images, CSS files, JavaScript files, etc.)
+ #
+ # @param url [String, URI] The URL to purge
  def initialize(url)
  @url = url
  @uri = URI.parse(url)
-
- @urls = []
-
- # First, purge the URL itself; that means we'll get up-to-date references within that page.
- puts "Purging #{@url}...\n\n"
- purge(@url)
-
- # Then, do a fresh GET of the page and queue any resources we find on it.
- puts "Looking for external resources on #{@url}..."
-
- if $options["verbose"]
- puts "\n\n"
- end

- fetch_page(@url)
+ @urls = []
+ end

- if $options["verbose"]
- puts "\n"
+ # Sends a PURGE request to the Varnish server, asking it to purge
+ # the given URL from its cache.
+ #
+ # This presupposes that you have the following VCL in your Varnish
+ # config file:
+ #
+ # if (req.request == "PURGE") {
+ # if ( client.ip ~ auth ) {
+ # ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+ # error 200 "Purged.";
+ # }
+ # }
+ #
+ # More about purging can be found
+ # [in the Varnish documentation][purging-and-banning].
+ #
+ # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+ #
+ # @api private
+ def purge
+ Varnisher.log.info "Purging #{@url}..."
+
+ purged = Varnisher.purge(@url)
+ if purged
+ Varnisher.log.info ''
+ Varnisher.log.debug "Purged #{@url}"
+ else
+ Varnisher.log.info "Failed to purge #{@url}\n"
  end

- puts "#{@urls.length} total resources found.\n\n"
+ purge_resources
+ end

- if @urls.length == 0
- puts "No resources found. Abort!"
- return
- end
-
- # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
- puts "Tidying resources...\n"
- tidy_resources
- puts "#{@urls.length} purgeable resources found.\n\n"
-
- # Now, purge all of the resources we just queued.
- puts "Purging resources..."
+ # Purges all the resources on the given page.
+ def purge_resources
+ fetch_page

- if $options["verbose"]
- puts "\n\n"
- end
+ return if @urls.empty?

+ tidy_resources
  purge_queue
-
- if $options["verbose"]
- puts "\n"
- end
-
- puts "Nothing more to do!\n\n"
  end
-
- # Sends a PURGE request to the Varnish server, asking it to purge the given URL from its cache.
- def purge(url)
+
+ # Fetches a page and parses out any external resources (e.g.
+ # JavaScript files, images, CSS files) it finds on it.
+ #
+ # @api private
+ def fetch_page
+ Varnisher.log.info "Looking for external resources on #{@url}..."
+
  begin
- uri = URI.parse(URI.encode(url.to_s.strip))
+ @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
  rescue
- puts "Couldn't parse URL for purging: #{$!}"
+ Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
  return
  end

- s = TCPSocket.open($options['hostname'], $options['port'])
- s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
- if $options["verbose"]
- if s.read =~ /HTTP\/1\.1 200 Purged\./
- puts "Purged #{url}"
- else
- puts "Failed to purge #{url}"
- end
- end
+ @urls = find_resources

- s.close
+ Varnisher.log.debug ''
+ Varnisher.log.info "#{@urls.length} total resources found.\n"
  end
-
- # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
- def fetch_page(url)
- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- puts "Couldn't parse URL for resource-searching: #{url}"
- return
- end
-
- headers = {
- "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
- "Accept-Charset" => "utf-8",
- "Accept" => "text/html"
- }
-
- begin
- doc = Hpricot(Net::HTTP.get_response(uri).body)
- rescue
- puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
- return
- end

- find_resources(doc) do |resource|
- if $options["verbose"]
- puts "Found #{resource}"
- end
- queue_resource(resource)
+ # Returns an array of resources contained within the current page.
+ #
+ # Resources include things like CSS files, images, and JavaScript
+ # files.
+ #
+ # If a block is given, the block will be executed once for each
+ # resource.
+ #
+ # @return [Array] An array of strings, each representing a URL
+ #
+ # @api private
+ def find_resources
+ found = []
+
+ self.class.resources.each do |res|
+ @doc.css(res.selector).each do |e|
+ attribute = e[res.attribute]
+
+ Varnisher.log.debug("Found resource: #{attribute}")
+
+ yield attribute if block_given?
+ found << attribute
+ end
  end
+
+ found
  end

- def find_resources(doc)
- return unless doc.respond_to? 'search'
+ # Tidies up the resource queue, converting relative URLs to
+ # absolute.
+ #
+ # @return [Array] The new URLs
+ #
+ # @api private
+ def tidy_resources
+ Varnisher.log.info 'Tidying resources...'

- # A bash at an abstract representation of resources. All you need is an XPath, and what attribute to select from the matched elements.
- resource = Struct.new :name, :xpath, :attribute
- resources = [
- resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
- resource.new('JavaScript file', 'script[@src]', 'src'),
- resource.new('image file', 'img[@src]', 'src')
- ]
+ @urls = @urls.map { |url| URI.join(@uri, url) }
+ .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }

- resources.each { |resource|
- doc.search(resource.xpath).each { |e|
- att = e.get_attribute(resource.attribute)
- yield att
- }
- }
- end
-
- # Adds a URL to the processing queue.
- def queue_resource(url)
- @urls << url.to_s
+ Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
  end
-
- def tidy_resources
- valid_urls = []
-
- @urls.each { |url|
- # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
- if url.to_s =~ /^\//
- url = @uri.scheme + "://" + @uri.host + url.to_s
- end

- # If we're dealing with a path-relative URL, make it relative to the current directory.
- unless url.to_s =~ /[a-z]+:\/\//
- # Take everything up to the final / in the path to be the current directory.
- /^(.*)\//.match(@uri.path)
- url = @uri.scheme + "://" + @uri.host + $1 + "/" + url.to_s
- end
-
- begin
- uri = URI.parse(url)
- rescue
- next
- end
-
- # Skip URLs that aren't HTTP, or that are on different domains.
- next if uri.scheme != "http"
- next if uri.host != @uri.host
+ # Processes the queue of URLs, sending a purge request for each of
+ # them.
+ #
+ # @api private
+ def purge_queue
+ Varnisher.log.info 'Purging resources...'

- valid_urls << url
- }
+ Parallel.map(@urls) do |url|
+ Varnisher.log.debug "Purging #{url}..."

- @urls = valid_urls.dup
- end
-
- # Processes the queue of URLs, sending a purge request for each of them.
- def purge_queue()
- Parallel.map(@urls) { |url|
- if $options["verbose"]
- puts "Purging #{url}..."
- end
+ Varnisher.purge(url.to_s)
+ end

- purge(url)
- }
+ Varnisher.log.info 'Done.'
  end

  end
data/lib/varnisher/purger.rb ADDED
@@ -0,0 +1,62 @@
+ module Varnisher
+ # Sends a purge request to the Varnish server
+ #
+ # It does this by sending an HTTP request with a custom method; either
+ # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+ # specified target is a hostname.
+ #
+ # This naturally relies on you having your Varnish config prepared
+ # appropriately, so that the actual purge will take place when we send
+ # these requests.
+ #
+ # @param target [String, URI] The URL or hostname to purge
+ # @param type [:page, :domain] Whether to do a purge of an individual
+ # URL or a whole hostname
+ # @return [true, false] True if we received an acceptable response
+ # from the server; false otherwise
+ def self.purge(target, type = :page)
+ if type == :page
+ purger = Purger.from_url(target)
+ else
+ purger = Purger.new('DOMAINPURGE', '/', target)
+ end
+
+ purger.send if purger
+ end
+
+ # Responsible for sending purge requests to the Varnish server.
+ class Purger
+ # Prepares a new purge request.
+ #
+ # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+ # the server
+ # @param path [String] The path to purge; for a domain purge,
+ # use "/"
+ # @param host [String] The hostname of the URL being purged
+ def initialize(method, path, host)
+ @method = method
+ @path = path
+ @host = host
+ end
+
+ def self.from_url(url)
+ begin
+ uri = URI.parse(URI.encode(url.to_s.strip))
+ rescue
+ return
+ end
+
+ new('PURGE', uri.path, uri.host)
+ end
+
+ def send
+ hostname = Varnisher.options['hostname']
+ port = Varnisher.options['port']
+
+ TCPSocket.open(hostname, port) do |s|
+ s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+ !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+ end
+ end
+ end
+ end
data/lib/varnisher/spider.rb CHANGED
@@ -1,73 +1,85 @@
  require 'rubygems'
- require 'hpricot'
+ require 'nokogiri'
  require 'net/http'
  require 'parallel'

  module Varnisher
+ # Crawls a website, following links that it finds along the way, until
+ # it either runs out of pages to visit or reaches the limit of pages
+ # that you impose on it.
+ #
+ # The spider is multithreaded, which means that one slow request won't
+ # prevent the rest of your requests from happening; this is often the
+ # case when the cached resources are a combination of static or
+ # near-static resources (like CSS and images) and slow, dynamically
+ # generated pages.
+ #
+ # The spider's behaviour can be configured somewhat, so that for
+ # example it ignores query strings (treating /foo?foo=bar and
+ # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+ # and /foo#bar will be treated as different URLs).
+ #
+ #
  class Spider

+ # Starts a new spider instance.
+ #
+ # Once it's done a bit of housekeeping and verified that the URL is
+ # acceptable, it calls {#spider} to do the actual fetching of the
+ # pages.
+ #
+ # @param url [String, URI] The URL to begin the spidering from. This
+ # also restricts the spider to fetching pages only on that
+ # (sub)domain - so, for example, if you specify
+ # http://example.com/foo as your starting page, only URLs that begin
+ # http://example.com will be followed.
  def initialize(url)
- if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
- url = 'http://' + url
- end
+ # If we've been given only a hostname, assume that we want to
+ # start spidering from the homepage
+ url = 'http://' + url unless url =~ %r(^[a-z]+://)

  @uri = URI.parse(url)

- @pages_hit = 0
-
  @visited = []
  @to_visit = []
-
- puts "Beginning spider of #{url}"
- crawl_page(url)
- spider
- puts "Done; #{@pages_hit} pages hit."
  end

+ # Adds a link to the queue of pages to be visited.
+ #
+ # Doesn't perform any duplication-checking; however, {#crawl_page}
+ # will refuse to crawl pages that have already been visited, so you
+ # can safely queue links blindly and trust that {#crawl_page} will do
+ # the de-duping for you.
+ #
+ # @api private
  def queue_link(url)
  @to_visit << url
  end

- def crawl_page(url, limit = 10)
+ # Visits a page, and extracts the links that it finds there.
+ #
+ # Links can be in the href attributes of HTML anchor tags, or they
+ # can just be URLs that are mentioned in the content of the page;
+ # the spider is flexible about what it crawls.
+ #
+ # Each link that it finds will be added to the queue of further
+ # pages to visit.
+ #
+ # @param url [String, URI] The URL of the page to fetch
+ #
+ # @api private
+ def crawl_page(uri)
  # Don't crawl a page twice
- return if @visited.include? url
+ return if @visited.include? uri.to_s

  # Let's not hit this again
- @visited << url
-
- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- return
- end
-
- headers = {
- "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
- "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
- }
-
- begin
- req = Net::HTTP::Get.new(uri.path, headers)
- response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
- case response
- when Net::HTTPRedirection
- return crawl_page(response['location'], limit - 1)
- when Net::HTTPSuccess
- doc = Hpricot(response.body)
- end
- rescue
- return
- end
+ @visited << uri.to_s

- @pages_hit += 1
+ doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)

- if $options["verbose"]
- puts "Fetched #{url}..."
- end
+ Varnisher.log.debug "Fetched #{uri}..."

- find_links(doc, url) do |link|
+ find_links(doc, uri).each do |link|
  next if @visited.include? link
  next if @to_visit.include? link

@@ -75,93 +87,149 @@ module Varnisher
  end
  end

- def find_links(doc, url)
- return unless doc.respond_to? 'search'
+ # Given a Nokogiri document, will return all the links in that
+ # document.
+ #
+ # "Links" are defined, for now, as the contents of the `href`
+ # attributes on HTML `<a>` tags, and URLs that are mentioned in
+ # comments.
+ #
+ # @param doc A Nokogiri document
+ # @param url [String, URI] The URL that the document came from;
+ # this is used to resolve relative URIs
+ #
+ # @return [Array] An array of URIs
+ #
+ # @api private
+ def find_links(doc, uri)
+ hrefs = []

- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- return
- end
+ hrefs = get_anchors(doc)
+ hrefs += get_commented_urls(doc)

- hrefs = []
+ hrefs = valid_urls(hrefs, uri)
+ hrefs = remove_hashes(hrefs)
+ hrefs = remove_query_strings(hrefs)
+
+ hrefs
+ end
+
+ # Given an HTML document, will return all the URLs that exist as
+ # href attributes of anchor tags.
+ #
+ # @return [Array] An array of strings
+ def get_anchors(doc)
+ doc.xpath('//a[@href]').map { |e| e['href'] }
+ end
+
+ # Given an HTML document, will return all the URLs that exist in
+ # HTML comments, e.g.:
+ #
+ # <!-- http://example.com/foo/bar -->
+ def get_commented_urls(doc)
+ doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+ end

- # Looks like a valid document! Let's parse it for links
- doc.search("//a[@href]").each do |e|
- hrefs << e.get_attribute("href")
+ # Given a set of URLs, will return only the ones that are valid for
+ # spidering.
+ #
+ # That means URLs that have the same hostname as the hostname we
+ # started from, and that are on the HTTP scheme rather than HTTPS
+ # (since Varnish doesn't support HTTPS).
+ #
+ # Additionally, some normalisation will be performed, so that the
+ # URLs are absolute (using the page that they were fetched from as
+ # the base, just like a browser would).
+ #
+ # @return [Array] An array of URIs
+ def valid_urls(hrefs, uri)
+ hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+ .select { |u| u.scheme == 'http' && u.host == @uri.host }
+ end
+
+ # Given a set of URLs, will normalise them according to their URL
+ # minus the hash; that is, normalise them so that:
+ #
+ # foo#bar
+ #
+ # and:
+ #
+ # foo#baz
+ #
+ # Are considered the same.
+ #
+ # @return [Array] An array of URIs
+ def remove_hashes(hrefs)
+ return hrefs unless Varnisher.options['ignore-hashes']
+
+ hrefs = hrefs.group_by do |h|
+ URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
  end

- # Let's also look for commented-out URIs
- doc.search("//comment()").each do |e|
- e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
+ hrefs.keys
+ end
+
+ # Given a set of URLs, will normalise them according to their URL
+ # minus the query string; that is, normalise them so that:
+ #
+ # foo?foo=bar
+ #
+ # and:
+ #
+ # foo?foo=baz
+ #
+ # Are considered the same.
+ #
+ # @return [Array] An array of URIs
+ def remove_query_strings(hrefs)
+ return hrefs unless Varnisher.options['ignore-query-strings']
+
+ hrefs = hrefs.group_by do |h|
+ URI.parse(h.scheme + '://' + h.host + h.path.to_s)
  end

- hrefs.each do |href|
- # Skip mailto links
- next if href =~ /^mailto:/
-
- # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
- if href.to_s =~ /^\//
- href = uri.scheme + "://" + uri.host + href.to_s
- end
-
- # If we're dealing with a path-relative URL, make it relative to the current directory.
- unless href.to_s =~ /[a-z]+:\/\//
- # Take everything up to the final / in the path to be the current directory.
- if uri.path =~ /\//
- /^(.*)\//.match(uri.path)
- path = $1
- # If we're on the homepage, then we don't need a path.
- else
- path = ""
- end
-
- href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
- end
-
- # At this point, we should have an absolute URL regardless of
- # its original format.
-
- # Strip hash links
- if ( $options["ignore-hashes"] )
- href.gsub!(/(#.*?)$/, '')
- end
-
- # Strip query strings
- if ( $options["ignore-query-strings"] )
- href.gsub!(/(\?.*?)$/, '')
- end
-
- begin
- href_uri = URI.parse(href)
- rescue
- # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
- next
- end
-
- next if href_uri.host != uri.host
- next unless href_uri.scheme =~ /^https?$/
-
- yield href
+ hrefs.keys
+ end
+
+ # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+ # it's not one that we've visited before.
+ #
+ # @return [URI] A URI object for an unvisited page
+ def pop_url
+ url = ''
+
+ loop do
+ url = @to_visit.pop
+ break unless @visited.include?(url)
  end
+
+ url
  end

- def spider
- threads = $options["threads"] || 16
- num_pages = $options["num-pages"] || -1
+ # Kicks off the spidering process.
+ #
+ # Fires up Parallel in as many threads as have been configured, and
+ # begins to visit the pages in turn.
+ #
+ # This method is also responsible for checking whether the page
+ # limit has been reached and, if it has, ending the spidering.
+ #
+ # @api private
+ def run
+ Varnisher.log.info "Beginning spider of #{@uri}"

- Parallel.in_threads(threads) { |thread_number|
- # We've crawled too many pages
- next if @pages_hit > num_pages && num_pages >= 0
+ crawl_page(@uri)

- while @to_visit.length > 0 do
- begin
- url = @to_visit.pop
- end while ( @visited.include? url )
+ threads = Varnisher.options['threads']
+ num_pages = Varnisher.options['num-pages']
+
+ Parallel.in_threads(threads) do |thread_number|
+ next if @visited.length > num_pages && num_pages >= 0
+
+ crawl_page(pop_url) while @to_visit.length > 0
+ end

- crawl_page(url)
- end
- }
+ Varnisher.log.info "Done; #{@visited.length} pages hit."
  end
  end
  end
data/lib/varnisher/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Varnisher
- VERSION = "1.0.beta.2"
+ VERSION = '1.0.beta.3'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: varnisher
  version: !ruby/object:Gem::Version
- version: 1.0.beta.2
+ version: 1.0.beta.3
  platform: ruby
  authors:
  - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
  - !ruby/object:Gem::Version
  version: 5.2.0
  - !ruby/object:Gem::Dependency
- name: hpricot
+ name: nokogiri
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 0.8.6
+ version: 1.6.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 0.8.6
+ version: 1.6.0
  - !ruby/object:Gem::Dependency
  name: parallel
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: 0.4.1
+ - !ruby/object:Gem::Dependency
+ name: yard
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 0.8.7
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 0.8.7
  description: Some tools that make working with the Varnish HTTP cache easier, including
  things like doing mass purges of entire domains.
  email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
  - bin/varnisher
  - lib/varnisher/domainpurger.rb
  - lib/varnisher/pagepurger.rb
+ - lib/varnisher/purger.rb
  - lib/varnisher/spider.rb
  - lib/varnisher/version.rb
  - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
  specification_version: 4
  summary: Helpful tools for working with Varnish caches
  test_files: []
+ has_rdoc: