varnisher 1.0.beta.2 → 1.0.beta.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4a9d710584e6d43f0f925cd4894f536fa1630551
-  data.tar.gz: 9d22ce540f64d36683db840c395c03c1096a88cd
+  metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+  data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
 SHA512:
-  metadata.gz: 7afaedc98a7557689c4908da0ba13e54f674d70ec3fd48543f0bf4181b016d61d05f532196683f3dfd97ff8e44ddde417afb4e116bfdf941af73068664531327
-  data.tar.gz: 55cd0e503a1152418c84e3a1876ee75a38b2c6450ea5e9495b0cd2d2d55bf608d8ef07939e84f170cfc19d2c665f8863cafb5e6aeb21241a71e8fc71caa706f2
+  metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+  data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
 * Purge an entire domain, including optionally re-spidering it
   afterwards to keep the cache warm

+Full documentation is available [on
+rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
 ## Installation

 Varnisher requires Ruby >1.9.3 to run. If you've got a recent Ruby
-installed, then Varnisher can be installed by running:
+installed, then Varnisher can be installed easily via RubyGems.
+
+Varnisher is still in beta; you can install it with:

-    gem install varnisher
+    gem install varnisher --pre

 ## Usage

@@ -52,9 +57,9 @@ you want to paste and override them:
     verbose: false
     hostname: localhost
     port: 80
-    num_pages: 100
-    ignore_hash: true
-    ignore_query_string: false
+    num-pages: -1
+    ignore-hashes: true
+    ignore-query-strings: false

 ## Examples

@@ -89,7 +94,7 @@ which is fairly standard:

 (For an explanation of just what `obj.http.x-url` means, and why you
 should use it rather than `req.url`, see [this
-page](http://kristianlyng.wordpress.com/2010/07/28/smart-bans-with-varnish/).)
+page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)

 ### Purging an entire domain

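The hunk above also renames the RC-file settings to hyphenated forms (num-pages, ignore-hashes, ignore-query-strings) and makes the page limit unlimited by default. As a quick sanity check, here is a minimal sketch of how such a ~/.varnishrc parses with Ruby's stock YAML library; the file contents are inlined as a string purely for illustration:

    require 'yaml'

    # Hyphenated keys parse as plain string keys, which is what
    # Varnisher's option merging expects.
    rc = YAML.load("num-pages: -1\nignore-hashes: true\nignore-query-strings: false")

    rc['num-pages']     # => -1
    rc['ignore-hashes'] # => true
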
data/bin/varnisher CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
 require 'varnisher'

 Main {
-  examples "varnisher purge http://example.com", "varnisher spider example.com", "varnisher purge --reindex example.com"
+  examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'

-  description "Varnisher is a set of tools for working with the Varnish HTTP cache."
+  description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'

   argument 'target'

@@ -19,89 +19,105 @@ Main {
     description "If given, Varnisher will be noisier about what it's up to."
   }

+  option('q', 'quiet') {
+    description 'If given, Varnisher will be silent apart from errors.'
+  }
+
   option('H', 'hostname') {
     argument :required
-    description "The hostname/IP address of your Varnish server."
-    default "localhost"
+    description 'The hostname/IP address of your Varnish server.'
   }

   option('p', 'port') {
     argument :required
     cast :int
-    description "The port Varnish is listening on."
-    default 80
+    description 'The port Varnish is listening on.'
+  }
+
+  option('o', 'output-file') {
+    argument :required
+    description 'A file to output log information to. If not given, output will be printed to STDOUT'
   }

   def before_run
     load_config
   end

-  mode "purge" do
-    argument('target') { description "The URL or hostname to purge" }
+  mode 'purge' do
+    argument('target') { description 'The URL or hostname to purge' }

     option('reindex') {
-      description "If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible."
+      description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
     }

     def run
       target = params['target'].value

       # If target is a valid URL, then assume we're purging a page and its contents.
-      if target =~ /^[a-z]+:\/\//
-        Varnisher::PagePurger.new target
-      end
-
+      if target =~ %r(^[a-z]+://)
+        purger = Varnisher::PagePurger.new target
+        purger.purge
       # If target is a hostname, assume we want to purge an entire domain.
-      if target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
+      elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
         Varnisher::DomainPurger.new target

         if params['reindex'].given?
-          Varnisher::Spider.new "http://#{target}/"
+          spider = Varnisher::Spider.new "http://#{target}/"
+          spider.run
         end
       end
     end
   end

-  mode "spider" do
-    argument('target') { description "The URL to begin spidering from." }
+  mode 'spider' do
+    argument('target') { description 'The URL to begin spidering from.' }

     option('n', 'num-pages') {
       argument :required
       cast :int
-      description "Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit."
-      default -1
+      description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
     }

     option('t', 'threads') {
       argument :required
       cast :int
-      description "Spidering is done in parallel; this variable controls how many threads will be used."
-      default 16
+      description 'Spidering is done in parallel; this variable controls how many threads will be used.'
     }

     option('#', 'ignore-hashes') {
-      description "When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource."
+      description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
     }

     option('q', 'ignore-query-strings') {
-      description "When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource."
+      description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
     }

     def run
       target = params['target'].value

-      Varnisher::Spider.new target
+      spider = Varnisher::Spider.new target
+      spider.run
     end
   end

   def load_config
-    $options = params.to_options
+    # Start with our default options.
+    options = Varnisher.options

-    rcfile = File.expand_path("~/.varnishrc")
+    # Check the user's RC file -- if it exists -- to see if they've
+    # specified any defaults of their own.
+    rcfile = File.expand_path('~/.varnishrc')
     if FileTest.readable? rcfile
-      rc = YAML::load(File.open(rcfile))
-      $options.merge!(rc)
+      rc = YAML.load(File.open(rcfile))
+      options.merge!(rc)
     end
+
+    # The highest priority is given to command line arguments, so that
+    # the user can override things that are in their RC file if they
+    # choose to.
+    options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+    Varnisher.options = options
   end
 }

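The rewritten load_config gives options three tiers of precedence: built-in defaults, then the RC file, then command-line flags, with nil-valued flags discarded so that unset options don't clobber the earlier tiers. A standalone sketch of that merge order (the hash contents here are illustrative, not Varnisher's full defaults):

    defaults = { 'port' => 80, 'num-pages' => -1, 'threads' => 16 }
    rc       = { 'port' => 6081 }                    # from ~/.varnishrc
    cli      = { 'port' => nil, 'num-pages' => 100 } # unset flags arrive as nil

    # Later merges win; rejecting nils keeps unset CLI flags from
    # erasing RC-file or default values.
    options = defaults.merge(rc).merge(cli.reject { |_, v| v.nil? })
    options # => {"port"=>6081, "num-pages"=>100, "threads"=>16}
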
data/lib/varnisher.rb CHANGED
@@ -1,4 +1,65 @@
 require_relative 'varnisher/spider'
+require_relative 'varnisher/purger'
 require_relative 'varnisher/domainpurger'
 require_relative 'varnisher/pagepurger'

+require 'logger'
+
+# This module is a namespace for our main functionality:
+#
+# * {Varnisher::Spider}
+# * {Varnisher::DomainPurger}
+# * {Varnisher::PagePurger}
+module Varnisher
+  # Our default options are set here; they can be overriden either by
+  # command-line arguments or by settings in a user's ~/.varnishrc file.
+  @options = {
+    'verbose' => false,
+    'quiet' => false,
+    'hostname' => nil,
+    'port' => 80,
+    'num-pages' => -1,
+    'threads' => 16,
+    'ignore-hashes' => true,
+    'ignore-query-strings' => false,
+    'output-file' => nil
+  }
+
+  def self.options
+    @options
+  end
+
+  def self.options=(options)
+    @options = options
+
+    if options['hostname'].nil? && options['target']
+      uri = URI.parse(options['target'])
+      options['hostname'] = uri.host
+    end
+
+    start_logging
+  end
+
+  # Sets up our Logger object, which will write output either to STDOUT
+  # (the default) or to the specified file.
+  def self.start_logging
+    output = @options['output-file'] || STDOUT
+    @log = Logger.new(output)
+
+    # By default, only display the log message, nothing else.
+    @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+    @log.level = if @options['verbose']
+                   Logger::DEBUG
+                 elsif @options['quiet']
+                   Logger::FATAL
+                 else
+                   Logger::INFO
+                 end
+  end
+
+  def self.log
+    @log
+  end
+end
+
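Because options and logging now live on the Varnisher module rather than in a $options global, library users can configure both without going through the CLI. A minimal usage sketch; the verbose/quiet mapping follows start_logging above:

    require 'varnisher'

    # Assigning options triggers start_logging, so the log level tracks
    # the flags: verbose => DEBUG, quiet => FATAL, otherwise INFO.
    Varnisher.options = Varnisher.options.merge('verbose' => true)

    Varnisher.log.debug 'Visible now, because verbose selects Logger::DEBUG'
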
data/lib/varnisher/domainpurger.rb CHANGED
@@ -1,27 +1,27 @@
 require 'net/http'

-# This requires a special bit of VCL:
-#
-#     if ( req.request == "DOMAINPURGE" ) {
-#       if ( client.ip ~ auth ) {
-#         ban("obj.http.x-host == " + req.http.host);
-#         error 200 "Purged.";
-#       }
-#     }
-
 module Varnisher
+  # Purges an entire domain from the Varnish cache.
+  #
+  # This requires a special bit of VCL in your Varnish configuration:
+  #
+  #     if ( req.request == "DOMAINPURGE" ) {
+  #       if ( client.ip ~ auth ) {
+  #         ban("obj.http.x-host == " + req.http.host);
+  #         error 200 "Purged.";
+  #       }
+  #     }
   class DomainPurger
+    # Executes the purge request.
+    #
+    # @param domain [String] The hostname to purge
     def initialize(domain)
-      s = TCPSocket.open($options['hostname'], $options['port'])
-      s.print("DOMAINPURGE / HTTP/1.1\r\nHost: #{domain}\r\n\r\n")
-
-      if s.read =~ /HTTP\/1\.1 200 Purged\./
-        puts "Purged #{domain}"
+      purged = Varnisher.purge(domain, :domain)
+      if purged
+        Varnisher.log.info "Purged #{domain}"
       else
-        puts "Failed to purge #{domain}"
+        Varnisher.log.info "Failed to purge #{domain}"
       end
-
-      s.close
     end
   end
 end
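DomainPurger now delegates to Varnisher.purge(domain, :domain), but the wire format is unchanged from the removed socket code: a request with the custom DOMAINPURGE method whose Host header names the domain to ban. A raw-socket sketch of that exchange, assuming a Varnish instance on localhost:80 carrying the DOMAINPURGE VCL quoted above:

    require 'socket'

    # Send the custom-method request and look for Varnish's
    # "200 Purged." status line, as the old inline code did.
    socket = TCPSocket.open('localhost', 80)
    socket.print "DOMAINPURGE / HTTP/1.1\r\nHost: example.com\r\n\r\n"
    purged = !!(socket.read =~ /HTTP\/1\.1 200 Purged\./)
    socket.close

    puts purged ? 'Purged example.com' : 'Failed to purge example.com'
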
data/lib/varnisher/pagepurger.rb CHANGED
@@ -1,180 +1,156 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'

 module Varnisher
+  # Purges an individual URL from Varnish.
   class PagePurger
-
+    # A bash at an abstract representation of resources. All you need
+    # is an XPath, and what attribute to select from the matched
+    # elements.
+    Resource = Struct.new :name, :selector, :attribute
+    def self.resources
+      [
+        Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+        Resource.new('JavaScript file', 'script[src]', 'src'),
+        Resource.new('image file', 'img[src]', 'src')
+      ]
+    end
+
+    # Purges the given URL from the Varnish cache.
+    #
+    # Will also purge all of the resources it finds on that page (e.g.
+    # images, CSS files, JavaScript files, etc.)
+    #
+    # @param url [String, URI] The URL to purge
     def initialize(url)
       @url = url
       @uri = URI.parse(url)
-
-      @urls = []
-
-      # First, purge the URL itself; that means we'll get up-to-date references within that page.
-      puts "Purging #{@url}...\n\n"
-      purge(@url)
-
-      # Then, do a fresh GET of the page and queue any resources we find on it.
-      puts "Looking for external resources on #{@url}..."
-
-      if $options["verbose"]
-        puts "\n\n"
-      end

-      fetch_page(@url)
+      @urls = []
+    end

-      if $options["verbose"]
-        puts "\n"
+    # Sends a PURGE request to the Varnish server, asking it to purge
+    # the given URL from its cache.
+    #
+    # This presupposes that you have the following VCL in your Varnish
+    # config file:
+    #
+    #     if (req.request == "PURGE") {
+    #       if ( client.ip ~ auth ) {
+    #         ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+    #         error 200 "Purged.";
+    #       }
+    #     }
+    #
+    # More about purging can be found
+    # [in the Varnish documentation][purging-and-banning].
+    #
+    # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+    #
+    # @api private
+    def purge
+      Varnisher.log.info "Purging #{@url}..."
+
+      purged = Varnisher.purge(@url)
+      if purged
+        Varnisher.log.info ''
+        Varnisher.log.debug "Purged #{@url}"
+      else
+        Varnisher.log.info "Failed to purge #{@url}\n"
       end

-      puts "#{@urls.length} total resources found.\n\n"
+      purge_resources
+    end

-      if @urls.length == 0
-        puts "No resources found. Abort!"
-        return
-      end
-
-      # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
-      puts "Tidying resources...\n"
-      tidy_resources
-      puts "#{@urls.length} purgeable resources found.\n\n"
-
-      # Now, purge all of the resources we just queued.
-      puts "Purging resources..."
+    # Purges all the resources on the given page.
+    def purge_resources
+      fetch_page

-      if $options["verbose"]
-        puts "\n\n"
-      end
+      return if @urls.empty?

+      tidy_resources
       purge_queue
-
-      if $options["verbose"]
-        puts "\n"
-      end
-
-      puts "Nothing more to do!\n\n"
     end
-
-    # Sends a PURGE request to the Varnish server, asking it to purge the given URL from its cache.
-    def purge(url)
+
+    # Fetches a page and parses out any external resources (e.g.
+    # JavaScript files, images, CSS files) it finds on it.
+    #
+    # @api private
+    def fetch_page
+      Varnisher.log.info "Looking for external resources on #{@url}..."
+
       begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
+        @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
       rescue
-        puts "Couldn't parse URL for purging: #{$!}"
+        Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
         return
       end

-      s = TCPSocket.open($options['hostname'], $options['port'])
-      s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
-      if $options["verbose"]
-        if s.read =~ /HTTP\/1\.1 200 Purged\./
-          puts "Purged #{url}"
-        else
-          puts "Failed to purge #{url}"
-        end
-      end
+      @urls = find_resources

-      s.close
+      Varnisher.log.debug ''
+      Varnisher.log.info "#{@urls.length} total resources found.\n"
     end
-
-    # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
-    def fetch_page(url)
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        puts "Couldn't parse URL for resource-searching: #{url}"
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
-        "Accept-Charset" => "utf-8",
-        "Accept" => "text/html"
-      }
-
-      begin
-        doc = Hpricot(Net::HTTP.get_response(uri).body)
-      rescue
-        puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
-        return
-      end

-      find_resources(doc) do |resource|
-        if $options["verbose"]
-          puts "Found #{resource}"
-        end
-        queue_resource(resource)
+    # Returns an array of resources contained within the current page.
+    #
+    # Resources include things like CSS files, images, and JavaScript
+    # files.
+    #
+    # If a block is given, the block will be executed once for each
+    # resource.
+    #
+    # @return [Array] An array of strings, each representing a URL
+    #
+    # @api private
+    def find_resources
+      found = []
+
+      self.class.resources.each do |res|
+        @doc.css(res.selector).each do |e|
+          attribute = e[res.attribute]
+
+          Varnisher.log.debug("Found resource: #{attribute}")
+
+          yield attribute if block_given?
+          found << attribute
+        end
       end
+
+      found
     end

-    def find_resources(doc)
-      return unless doc.respond_to? 'search'
+    # Tidies up the resource queue, converting relative URLs to
+    # absolute.
+    #
+    # @return [Array] The new URLs
+    #
+    # @api private
+    def tidy_resources
+      Varnisher.log.info 'Tidying resources...'

-      # A bash at an abstract representation of resources. All you need is an XPath, and what attribute to select from the matched elements.
-      resource = Struct.new :name, :xpath, :attribute
-      resources = [
-        resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
-        resource.new('JavaScript file', 'script[@src]', 'src'),
-        resource.new('image file', 'img[@src]', 'src')
-      ]
+      @urls = @urls.map { |url| URI.join(@uri, url) }
+              .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }

-      resources.each { |resource|
-        doc.search(resource.xpath).each { |e|
-          att = e.get_attribute(resource.attribute)
-          yield att
-        }
-      }
-    end
-
-    # Adds a URL to the processing queue.
-    def queue_resource(url)
-      @urls << url.to_s
+      Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
     end
-
-    def tidy_resources
-      valid_urls = []
-
-      @urls.each { |url|
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if url.to_s =~ /^\//
-          url = @uri.scheme + "://" + @uri.host + url.to_s
-        end

-        # If we're dealing with a path-relative URL, make it relative to the current directory.
-        unless url.to_s =~ /[a-z]+:\/\//
-          # Take everything up to the final / in the path to be the current directory.
-          /^(.*)\//.match(@uri.path)
-          url = @uri.scheme + "://" + @uri.host + $1 + "/" + url.to_s
-        end
-
-        begin
-          uri = URI.parse(url)
-        rescue
-          next
-        end
-
-        # Skip URLs that aren't HTTP, or that are on different domains.
-        next if uri.scheme != "http"
-        next if uri.host != @uri.host
+    # Processes the queue of URLs, sending a purge request for each of
+    # them.
+    #
+    # @api private
+    def purge_queue
+      Varnisher.log.info 'Purging resources...'

-        valid_urls << url
-      }
+      Parallel.map(@urls) do |url|
+        Varnisher.log.debug "Purging #{url}..."

-      @urls = valid_urls.dup
-    end
-
-    # Processes the queue of URLs, sending a purge request for each of them.
-    def purge_queue()
-      Parallel.map(@urls) { |url|
-        if $options["verbose"]
-          puts "Purging #{url}..."
-        end
+        Varnisher.purge(url.to_s)
+      end

-        purge(url)
-      }
+      Varnisher.log.info 'Done.'
     end

   end
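The move from Hpricot to Nokogiri also swaps the XPath-flavoured selectors for CSS ones (link[rel~=stylesheet], script[src], img[src]). A self-contained sketch of the same extraction against a throwaway document:

    require 'nokogiri'

    html = '<html><head><link rel="stylesheet" href="/css/main.css"></head>' \
           '<body><img src="/img/logo.png"><script src="/js/app.js"></script></body></html>'
    doc = Nokogiri::HTML(html)

    # The same selector/attribute pairs as PagePurger.resources.
    doc.css('link[rel~=stylesheet]').map { |e| e['href'] } # => ["/css/main.css"]
    doc.css('script[src]').map { |e| e['src'] }            # => ["/js/app.js"]
    doc.css('img[src]').map { |e| e['src'] }               # => ["/img/logo.png"]
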
data/lib/varnisher/purger.rb ADDED
@@ -0,0 +1,62 @@
+module Varnisher
+  # Sends a purge request to the Varnish server
+  #
+  # It does this by sending an HTTP request with a custom method; either
+  # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+  # specified target is a hostname.
+  #
+  # This naturally relies on you having your Varnish config prepared
+  # appropriately, so that the actual purge will take place when we send
+  # these requests.
+  #
+  # @param target [String, URI] The URL or hostname to purge
+  # @param type [:page, :domain] Whether to do a purge of an individual
+  #   URL or a whole hostname
+  # @return [true, false] True if we received an acceptable response
+  #   from the server; false otherwise
+  def self.purge(target, type = :page)
+    if type == :page
+      purger = Purger.from_url(target)
+    else
+      purger = Purger.new('DOMAINPURGE', '/', target)
+    end
+
+    purger.send if purger
+  end
+
+  # Responsible for sending purge requests to the Varnish server.
+  class Purger
+    # Prepares a new purge request.
+    #
+    # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+    #   the server
+    # @param path [String] The path to purge; for a domain purge,
+    #   use "/"
+    # @param host [String] The hostname of the URL being purged
+    def initialize(method, path, host)
+      @method = method
+      @path = path
+      @host = host
+    end
+
+    def self.from_url(url)
+      begin
+        uri = URI.parse(URI.encode(url.to_s.strip))
+      rescue
+        return
+      end
+
+      new('PURGE', uri.path, uri.host)
+    end
+
+    def send
+      hostname = Varnisher.options['hostname']
+      port = Varnisher.options['port']
+
+      TCPSocket.open(hostname, port) do |s|
+        s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+        !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+      end
+    end
+  end
+end
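With the transport factored out into Varnisher.purge and the Purger class, both purge styles share a single code path. A minimal usage sketch, assuming a reachable Varnish instance configured with the PURGE/DOMAINPURGE VCL shown earlier; the hostname and port are assumptions for the example:

    require 'varnisher'

    # Options must be set first so Purger#send knows where to connect.
    Varnisher.options = Varnisher.options.merge('hostname' => 'localhost',
                                                'port' => 80)

    Varnisher.purge('http://example.com/about/') # => true on "200 Purged."
    Varnisher.purge('example.com', :domain)      # whole-domain ban
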
data/lib/varnisher/spider.rb CHANGED
@@ -1,73 +1,85 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'

 module Varnisher
+  # Crawls a website, following links that it finds along the way, until
+  # it either runs out of pages to visit or reaches the limit of pages
+  # that you impose on it.
+  #
+  # The spider is multithreaded, which means that one slow request won't
+  # prevent the rest of your requests from happening; this is often the
+  # case when the cached resources are a combination of static or
+  # near-static resources (like CSS and images) and slow, dynamically
+  # generated pages.
+  #
+  # The spider's behaviour can be configured somewhat, so that for
+  # example it ignores query strings (treating /foo?foo=bar and
+  # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+  # and /foo#bar will be treated as different URLs).
+  #
+  #
   class Spider

+    # Starts a new spider instance.
+    #
+    # Once it's done a bit of housekeeping and verified that the URL is
+    # acceptable, it calls {#spider} to do the actual fetching of the
+    # pages.
+    #
+    # @param url [String, URI] The URL to begin the spidering from. This
+    #   also restricts the spider to fetching pages only on that
+    #   (sub)domain - so, for example, if you specify
+    #   http://example.com/foo as your starting page, only URLs that begin
+    #   http://example.com will be followed.
     def initialize(url)
-      if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
-        url = 'http://' + url
-      end
+      # If we've been given only a hostname, assume that we want to
+      # start spidering from the homepage
+      url = 'http://' + url unless url =~ %r(^[a-z]+://)

       @uri = URI.parse(url)

-      @pages_hit = 0
-
       @visited = []
       @to_visit = []
-
-      puts "Beginning spider of #{url}"
-      crawl_page(url)
-      spider
-      puts "Done; #{@pages_hit} pages hit."
     end

+    # Adds a link to the queue of pages to be visited.
+    #
+    # Doesn't perform any duplication-checking; however, {#crawl_page}
+    # will refuse to crawl pages that have already been visited, so you
+    # can safely queue links blindly and trust that {#crawl_page} will do
+    # the de-duping for you.
+    #
+    # @api private
     def queue_link(url)
       @to_visit << url
     end

-    def crawl_page(url, limit = 10)
+    # Visits a page, and extracts the links that it finds there.
+    #
+    # Links can be in the href attributes of HTML anchor tags, or they
+    # can just be URLs that are mentioned in the content of the page;
+    # the spider is flexible about what it crawls.
+    #
+    # Each link that it finds will be added to the queue of further
+    # pages to visit.
+    #
+    # @param url [String, URI] The URL of the page to fetch
+    #
+    # @api private
+    def crawl_page(uri)
       # Don't crawl a page twice
-      return if @visited.include? url
+      return if @visited.include? uri.to_s

       # Let's not hit this again
-      @visited << url
-
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
-        "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
-        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-      }
-
-      begin
-        req = Net::HTTP::Get.new(uri.path, headers)
-        response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
-        case response
-        when Net::HTTPRedirection
-          return crawl_page(response['location'], limit - 1)
-        when Net::HTTPSuccess
-          doc = Hpricot(response.body)
-        end
-      rescue
-        return
-      end
+      @visited << uri.to_s

-      @pages_hit += 1
+      doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)

-      if $options["verbose"]
-        puts "Fetched #{url}..."
-      end
+      Varnisher.log.debug "Fetched #{uri}..."

-      find_links(doc, url) do |link|
+      find_links(doc, uri).each do |link|
         next if @visited.include? link
         next if @to_visit.include? link

@@ -75,93 +87,149 @@ module Varnisher
       end
     end

-    def find_links(doc, url)
-      return unless doc.respond_to? 'search'
+    # Given a Nokogiri document, will return all the links in that
+    # document.
+    #
+    # "Links" are defined, for now, as the contents of the `href`
+    # attributes on HTML `<a>` tags, and URLs that are mentioned in
+    # comments.
+    #
+    # @param doc A Nokogiri document
+    # @param url [String, URI] The URL that the document came from;
+    #   this is used to resolve relative URIs
+    #
+    # @return [Array] An array of URIs
+    #
+    # @api private
+    def find_links(doc, uri)
+      hrefs = []

-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
+      hrefs = get_anchors(doc)
+      hrefs += get_commented_urls(doc)

-      hrefs = []
+      hrefs = valid_urls(hrefs, uri)
+      hrefs = remove_hashes(hrefs)
+      hrefs = remove_query_strings(hrefs)
+
+      hrefs
+    end
+
+    # Given an HTML document, will return all the URLs that exist as
+    # href attributes of anchor tags.
+    #
+    # @return [Array] An array of strings
+    def get_anchors(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
+
+    # Given an HTML document, will return all the URLs that exist in
+    # HTML comments, e.g.:
+    #
+    #     <!-- http://example.com/foo/bar -->
+    def get_commented_urls(doc)
+      doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+    end

-      # Looks like a valid document! Let's parse it for links
-      doc.search("//a[@href]").each do |e|
-        hrefs << e.get_attribute("href")
+    # Given a set of URLs, will return only the ones that are valid for
+    # spidering.
+    #
+    # That means URLs that have the same hostname as the hostname we
+    # started from, and that are on the HTTP scheme rather than HTTPS
+    # (since Varnish doesn't support HTTPS).
+    #
+    # Additionally, some normalisation will be performed, so that the
+    # URLs are absolute (using the page that they were fetched from as
+    # the base, just like a browser would).
+    #
+    # @return [Array] An array of URIs
+    def valid_urls(hrefs, uri)
+      hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+           .select { |u| u.scheme == 'http' && u.host == @uri.host }
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the hash; that is, normalise them so that:
+    #
+    #     foo#bar
+    #
+    # and:
+    #
+    #     foo#baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_hashes(hrefs)
+      return hrefs unless Varnisher.options['ignore-hashes']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
       end

-      # Let's also look for commented-out URIs
-      doc.search("//comment()").each do |e|
-        e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
+      hrefs.keys
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the query string; that is, normalise them so that:
+    #
+    #     foo?foo=bar
+    #
+    # and:
+    #
+    #     foo?foo=baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_query_strings(hrefs)
+      return hrefs unless Varnisher.options['ignore-query-strings']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s)
       end

-      hrefs.each do |href|
-        # Skip mailto links
-        next if href =~ /^mailto:/
-
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if href.to_s =~ /^\//
-          href = uri.scheme + "://" + uri.host + href.to_s
-        end
-
-        # If we're dealing with a path-relative URL, make it relative to the current directory.
-        unless href.to_s =~ /[a-z]+:\/\//
-          # Take everything up to the final / in the path to be the current directory.
-          if uri.path =~ /\//
-            /^(.*)\//.match(uri.path)
-            path = $1
-          # If we're on the homepage, then we don't need a path.
-          else
-            path = ""
-          end
-
-          href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
-        end
-
-        # At this point, we should have an absolute URL regardless of
-        # its original format.
-
-        # Strip hash links
-        if ( $options["ignore-hashes"] )
-          href.gsub!(/(#.*?)$/, '')
-        end
-
-        # Strip query strings
-        if ( $options["ignore-query-strings"] )
-          href.gsub!(/(\?.*?)$/, '')
-        end
-
-        begin
-          href_uri = URI.parse(href)
-        rescue
-          # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
-          next
-        end
-
-        next if href_uri.host != uri.host
-        next unless href_uri.scheme =~ /^https?$/
-
-        yield href
+      hrefs.keys
+    end
+
+    # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+    # it's not one that we've visited before.
+    #
+    # @return [URI] A URI object for an unvisited page
+    def pop_url
+      url = ''
+
+      loop do
+        url = @to_visit.pop
+        break unless @visited.include?(url)
       end
+
+      url
     end

-    def spider
-      threads = $options["threads"] || 16
-      num_pages = $options["num-pages"] || -1
+    # Kicks off the spidering process.
+    #
+    # Fires up Parallel in as many threads as have been configured, and
+    # begins to visit the pages in turn.
+    #
+    # This method is also responsible for checking whether the page
+    # limit has been reached and, if it has, ending the spidering.
+    #
+    # @api private
+    def run
+      Varnisher.log.info "Beginning spider of #{@uri}"

-      Parallel.in_threads(threads) { |thread_number|
-        # We've crawled too many pages
-        next if @pages_hit > num_pages && num_pages >= 0
+      crawl_page(@uri)

-        while @to_visit.length > 0 do
-          begin
-            url = @to_visit.pop
-          end while ( @visited.include? url )
+      threads = Varnisher.options['threads']
+      num_pages = Varnisher.options['num-pages']
+
+      Parallel.in_threads(threads) do |thread_number|
+        next if @visited.length > num_pages && num_pages >= 0
+
+        crawl_page(pop_url) while @to_visit.length > 0
+      end

-        crawl_page(url)
-      end
-      }
+      Varnisher.log.info "Done; #{@visited.length} pages hit."
     end
   end
 end
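The new remove_hashes and remove_query_strings helpers de-duplicate by grouping URIs on everything except the fragment or query string and keeping one representative per group. A standalone sketch of the query-string case:

    require 'uri'

    urls = ['http://example.com/foo?a=1',
            'http://example.com/foo?a=2'].map { |u| URI.parse(u) }

    # Group on scheme://host/path, as Spider#remove_query_strings does;
    # the group keys are the de-duplicated URIs.
    urls.group_by { |u| URI.parse(u.scheme + '://' + u.host + u.path.to_s) }
        .keys
    # => [#<URI::HTTP http://example.com/foo>]
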
data/lib/varnisher/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Varnisher
-  VERSION = "1.0.beta.2"
+  VERSION = '1.0.beta.3'
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: varnisher
 version: !ruby/object:Gem::Version
-  version: 1.0.beta.2
+  version: 1.0.beta.3
 platform: ruby
 authors:
 - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
   - !ruby/object:Gem::Version
     version: 5.2.0
 - !ruby/object:Gem::Dependency
-  name: hpricot
+  name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 0.8.6
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 0.8.6
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: parallel
   requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
 description: Some tools that make working with the Varnish HTTP cache easier, including
   things like doing mass purges of entire domains.
 email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
 - bin/varnisher
 - lib/varnisher/domainpurger.rb
 - lib/varnisher/pagepurger.rb
+- lib/varnisher/purger.rb
 - lib/varnisher/spider.rb
 - lib/varnisher/version.rb
 - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
 specification_version: 4
 summary: Helpful tools for working with Varnish caches
 test_files: []
+has_rdoc: