varnisher 1.0.beta.2 → 1.0.beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4a9d710584e6d43f0f925cd4894f536fa1630551
- data.tar.gz: 9d22ce540f64d36683db840c395c03c1096a88cd
+ metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+ data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
  SHA512:
- metadata.gz: 7afaedc98a7557689c4908da0ba13e54f674d70ec3fd48543f0bf4181b016d61d05f532196683f3dfd97ff8e44ddde417afb4e116bfdf941af73068664531327
- data.tar.gz: 55cd0e503a1152418c84e3a1876ee75a38b2c6450ea5e9495b0cd2d2d55bf608d8ef07939e84f170cfc19d2c665f8863cafb5e6aeb21241a71e8fc71caa706f2
+ metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+ data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
  * Purge an entire domain, including optionally re-spidering it
  afterwards to keep the cache warm

+ Full documentation is available [on
+ rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
  ## Installation

  Varnish requires Ruby >1.9.3 to run. If you've got a recent Ruby
- installed, then Varnisher can be installed by running:
+ installed, then Varnisher can be installed easily via RubyGems.
+
+ Varnisher is still in beta; you can install it with:

- gem install varnisher
+ gem install varnisher --pre

  ## Usage

@@ -52,9 +57,9 @@ you want to paste and override them:
  verbose: false
  hostname: localhost
  port: 80
- num_pages: 100
- ignore_hash: true
- ignore_query_string: false
+ num-pages: -1
+ ignore-hashes: true
+ ignore-query-strings: false

  ## Examples

@@ -89,7 +94,7 @@ which is fairly standard:

  (For an explanation of just what `obj.http.x-url` means, and why you
  should use it rather than `req.url`, see [this
- page](http://kristianlyng.wordpress.com/2010/07/28/smart-bans-with-varnish/).)
+ page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)

  ### Purging an entire domain

data/bin/varnisher CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
  require 'varnisher'

  Main {
- examples "varnisher purge http://example.com", "varnisher spider example.com", "varnisher purge --reindex example.com"
+ examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'

- description "Varnisher is a set of tools for working with the Varnish HTTP cache."
+ description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'

  argument 'target'

@@ -19,89 +19,105 @@ Main {
  description "If given, Varnisher will be noisier about what it's up to."
  }

+ option('q', 'quiet') {
+ description 'If given, Varnisher will be silent apart from errors.'
+ }
+
  option('H', 'hostname') {
  argument :required
- description "The hostname/IP address of your Varnish server."
- default "localhost"
+ description 'The hostname/IP address of your Varnish server.'
  }

  option('p', 'port') {
  argument :required
  cast :int
- description "The port Varnish is listening on."
- default 80
+ description 'The port Varnish is listening on.'
+ }
+
+ option('o', 'output-file') {
+ argument :required
+ description 'A file to output log information to. If not given, output will be printed to STDOUT'
  }

  def before_run
  load_config
  end

- mode "purge" do
- argument('target') { description "The URL or hostname to purge" }
+ mode 'purge' do
+ argument('target') { description 'The URL or hostname to purge' }

  option('reindex') {
- description "If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible."
+ description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
  }

  def run
  target = params['target'].value

  # If target is a valid URL, then assume we're purging a page and its contents.
- if target =~ /^[a-z]+:\/\//
- Varnisher::PagePurger.new target
- end
-
+ if target =~ %r(^[a-z]+://)
+ purger = Varnisher::PagePurger.new target
+ purger.purge
  # If target is a hostname, assume we want to purge an entire domain.
- if target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
+ elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
  Varnisher::DomainPurger.new target

  if params['reindex'].given?
- Varnisher::Spider.new "http://#{target}/"
+ spider = Varnisher::Spider.new "http://#{target}/"
+ spider.run
  end
  end
  end
  end

- mode "spider" do
- argument('target') { description "The URL to begin spidering from." }
+ mode 'spider' do
+ argument('target') { description 'The URL to begin spidering from.' }

  option('n', 'num-pages') {
  argument :required
  cast :int
- description "Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit."
- default -1
+ description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
  }

  option('t', 'threads') {
  argument :required
  cast :int
- description "Spidering is done in parallel; this variable controls how many threads will be used."
- default 16
+ description 'Spidering is done in parallel; this variable controls how many threads will be used.'
  }

  option('#', 'ignore-hashes') {
- description "When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource."
+ description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
  }

  option('q', 'ignore-query-strings') {
- description "When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource."
+ description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
  }

  def run
  target = params['target'].value

- Varnisher::Spider.new target
+ spider = Varnisher::Spider.new target
+ spider.run
  end
  end

  def load_config
- $options = params.to_options
+ # Start with our default options.
+ options = Varnisher.options

- rcfile = File.expand_path("~/.varnishrc")
+ # Check the user's RC file -- if it exists -- to see if they've
+ # specified any defaults of their own.
+ rcfile = File.expand_path('~/.varnishrc')
  if FileTest.readable? rcfile
- rc = YAML::load(File.open(rcfile))
- $options.merge!(rc)
+ rc = YAML.load(File.open(rcfile))
+ options.merge!(rc)
  end
+
+ # The highest priority is given to command line arguments, so that
+ # the user can override things that are in their RC file if they
+ # choose to.
+ options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+ Varnisher.options = options
  end
  }

data/lib/varnisher.rb CHANGED
@@ -1,4 +1,65 @@
  require_relative 'varnisher/spider'
+ require_relative 'varnisher/purger'
  require_relative 'varnisher/domainpurger'
  require_relative 'varnisher/pagepurger'

+ require 'logger'
+
+ # This module is a namespace for our main functionality:
+ #
+ # * {Varnisher::Spider}
+ # * {Varnisher::DomainPurger}
+ # * {Varnisher::PagePurger}
+ module Varnisher
+ # Our default options are set here; they can be overriden either by
+ # command-line arguments or by settings in a user's ~/.varnishrc file.
+ @options = {
+ 'verbose' => false,
+ 'quiet' => false,
+ 'hostname' => nil,
+ 'port' => 80,
+ 'num-pages' => -1,
+ 'threads' => 16,
+ 'ignore-hashes' => true,
+ 'ignore-query-strings' => false,
+ 'output-file' => nil
+ }
+
+ def self.options
+ @options
+ end
+
+ def self.options=(options)
+ @options = options
+
+ if options['hostname'].nil? && options['target']
+ uri = URI.parse(options['target'])
+ options['hostname'] = uri.host
+ end
+
+ start_logging
+ end
+
+ # Sets up our Logger object, which will write output either to STDOUT
+ # (the default) or to the specified file.
+ def self.start_logging
+ output = @options['output-file'] || STDOUT
+ @log = Logger.new(output)
+
+ # By default, only display the log message, nothing else.
+ @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+ @log.level = if @options['verbose']
+ Logger::DEBUG
+ elsif @options['quiet']
+ Logger::FATAL
+ else
+ Logger::INFO
+ end
+ end
+
+ def self.log
+ @log
+ end
+ end
+
data/lib/varnisher/domainpurger.rb CHANGED
@@ -1,27 +1,27 @@
  require 'net/http'

- # This requires a special bit of VCL:
- #
- # if ( req.request == "DOMAINPURGE" ) {
- # if ( client.ip ~ auth ) {
- # ban("obj.http.x-host == " + req.http.host);
- # error 200 "Purged.";
- # }
- # }
-
  module Varnisher
+ # Purges an entire domain from the Varnish cache.
+ #
+ # This requires a special bit of VCL in your Varnish configuration:
+ #
+ # if ( req.request == "DOMAINPURGE" ) {
+ # if ( client.ip ~ auth ) {
+ # ban("obj.http.x-host == " + req.http.host);
+ # error 200 "Purged.";
+ # }
+ # }
  class DomainPurger
+ # Executes the purge request.
+ #
+ # @param domain [String] The hostname to purge
  def initialize(domain)
- s = TCPSocket.open($options['hostname'], $options['port'])
- s.print("DOMAINPURGE / HTTP/1.1\r\nHost: #{domain}\r\n\r\n")
-
- if s.read =~ /HTTP\/1\.1 200 Purged\./
- puts "Purged #{domain}"
+ purged = Varnisher.purge(domain, :domain)
+ if purged
+ Varnisher.log.info "Purged #{domain}"
  else
- puts "Failed to purge #{domain}"
+ Varnisher.log.info "Failed to purge #{domain}"
  end
-
- s.close
  end
  end
  end
data/lib/varnisher/pagepurger.rb CHANGED
@@ -1,180 +1,156 @@
  require 'rubygems'
- require 'hpricot'
+ require 'nokogiri'
  require 'net/http'
  require 'parallel'

  module Varnisher
+ # Purges an individual URL from Varnish.
  class PagePurger
-
+ # A bash at an abstract representation of resources. All you need
+ # is an XPath, and what attribute to select from the matched
+ # elements.
+ Resource = Struct.new :name, :selector, :attribute
+ def self.resources
+ [
+ Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+ Resource.new('JavaScript file', 'script[src]', 'src'),
+ Resource.new('image file', 'img[src]', 'src')
+ ]
+ end
+
+ # Purges the given URL from the Varnish cache.
+ #
+ # Will also purge all of the resources it finds on that page (e.g.
+ # images, CSS files, JavaScript files, etc.)
+ #
+ # @param url [String, URI] The URL to purge
  def initialize(url)
  @url = url
  @uri = URI.parse(url)
-
- @urls = []
-
- # First, purge the URL itself; that means we'll get up-to-date references within that page.
- puts "Purging #{@url}...\n\n"
- purge(@url)
-
- # Then, do a fresh GET of the page and queue any resources we find on it.
- puts "Looking for external resources on #{@url}..."
-
- if $options["verbose"]
- puts "\n\n"
- end

- fetch_page(@url)
+ @urls = []
+ end

- if $options["verbose"]
- puts "\n"
+ # Sends a PURGE request to the Varnish server, asking it to purge
+ # the given URL from its cache.
+ #
+ # This presupposes that you have the following VCL in your Varnish
+ # config file:
+ #
+ # if (req.request == "PURGE") {
+ # if ( client.ip ~ auth ) {
+ # ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+ # error 200 "Purged.";
+ # }
+ # }
+ #
+ # More about purging can be found
+ # [in the Varnish documentation][purging-and-banning].
+ #
+ # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+ #
+ # @api private
+ def purge
+ Varnisher.log.info "Purging #{@url}..."
+
+ purged = Varnisher.purge(@url)
+ if purged
+ Varnisher.log.info ''
+ Varnisher.log.debug "Purged #{@url}"
+ else
+ Varnisher.log.info "Failed to purge #{@url}\n"
  end

- puts "#{@urls.length} total resources found.\n\n"
+ purge_resources
+ end

- if @urls.length == 0
- puts "No resources found. Abort!"
- return
- end
-
- # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
- puts "Tidying resources...\n"
- tidy_resources
- puts "#{@urls.length} purgeable resources found.\n\n"
-
- # Now, purge all of the resources we just queued.
- puts "Purging resources..."
+ # Purges all the resources on the given page.
+ def purge_resources
+ fetch_page

- if $options["verbose"]
- puts "\n\n"
- end
+ return if @urls.empty?

+ tidy_resources
  purge_queue
-
- if $options["verbose"]
- puts "\n"
- end
-
- puts "Nothing more to do!\n\n"
  end
-
- # Sends a PURGE request to the Varnish server, asking it to purge the given URL from its cache.
- def purge(url)
+
+ # Fetches a page and parses out any external resources (e.g.
+ # JavaScript files, images, CSS files) it finds on it.
+ #
+ # @api private
+ def fetch_page
+ Varnisher.log.info "Looking for external resources on #{@url}..."
+
  begin
- uri = URI.parse(URI.encode(url.to_s.strip))
+ @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
  rescue
- puts "Couldn't parse URL for purging: #{$!}"
+ Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
  return
  end

- s = TCPSocket.open($options['hostname'], $options['port'])
- s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
- if $options["verbose"]
- if s.read =~ /HTTP\/1\.1 200 Purged\./
- puts "Purged #{url}"
- else
- puts "Failed to purge #{url}"
- end
- end
+ @urls = find_resources

- s.close
+ Varnisher.log.debug ''
+ Varnisher.log.info "#{@urls.length} total resources found.\n"
  end
-
- # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
- def fetch_page(url)
- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- puts "Couldn't parse URL for resource-searching: #{url}"
- return
- end
-
- headers = {
- "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
- "Accept-Charset" => "utf-8",
- "Accept" => "text/html"
- }
-
- begin
- doc = Hpricot(Net::HTTP.get_response(uri).body)
- rescue
- puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
- return
- end

- find_resources(doc) do |resource|
- if $options["verbose"]
- puts "Found #{resource}"
- end
- queue_resource(resource)
+ # Returns an array of resources contained within the current page.
+ #
+ # Resources include things like CSS files, images, and JavaScript
+ # files.
+ #
+ # If a block is given, the block will be executed once for each
+ # resource.
+ #
+ # @return [Array] An array of strings, each representing a URL
+ #
+ # @api private
+ def find_resources
+ found = []
+
+ self.class.resources.each do |res|
+ @doc.css(res.selector).each do |e|
+ attribute = e[res.attribute]
+
+ Varnisher.log.debug("Found resource: #{attribute}")
+
+ yield attribute if block_given?
+ found << attribute
+ end
  end
+
+ found
  end

- def find_resources(doc)
- return unless doc.respond_to? 'search'
+ # Tidies up the resource queue, converting relative URLs to
+ # absolute.
+ #
+ # @return [Array] The new URLs
+ #
+ # @api private
+ def tidy_resources
+ Varnisher.log.info 'Tidying resources...'

- # A bash at an abstract representation of resources. All you need is an XPath, and what attribute to select from the matched elements.
- resource = Struct.new :name, :xpath, :attribute
- resources = [
- resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
- resource.new('JavaScript file', 'script[@src]', 'src'),
- resource.new('image file', 'img[@src]', 'src')
- ]
+ @urls = @urls.map { |url| URI.join(@uri, url) }
+ .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }

- resources.each { |resource|
- doc.search(resource.xpath).each { |e|
- att = e.get_attribute(resource.attribute)
- yield att
- }
- }
- end
-
- # Adds a URL to the processing queue.
- def queue_resource(url)
- @urls << url.to_s
+ Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
  end
-
- def tidy_resources
- valid_urls = []
-
- @urls.each { |url|
- # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
- if url.to_s =~ /^\//
- url = @uri.scheme + "://" + @uri.host + url.to_s
- end

- # If we're dealing with a path-relative URL, make it relative to the current directory.
- unless url.to_s =~ /[a-z]+:\/\//
- # Take everything up to the final / in the path to be the current directory.
- /^(.*)\//.match(@uri.path)
- url = @uri.scheme + "://" + @uri.host + $1 + "/" + url.to_s
- end
-
- begin
- uri = URI.parse(url)
- rescue
- next
- end
-
- # Skip URLs that aren't HTTP, or that are on different domains.
- next if uri.scheme != "http"
- next if uri.host != @uri.host
+ # Processes the queue of URLs, sending a purge request for each of
+ # them.
+ #
+ # @api private
+ def purge_queue
+ Varnisher.log.info 'Purging resources...'

- valid_urls << url
- }
+ Parallel.map(@urls) do |url|
+ Varnisher.log.debug "Purging #{url}..."

- @urls = valid_urls.dup
- end
-
- # Processes the queue of URLs, sending a purge request for each of them.
- def purge_queue()
- Parallel.map(@urls) { |url|
- if $options["verbose"]
- puts "Purging #{url}..."
- end
+ Varnisher.purge(url.to_s)
+ end

- purge(url)
- }
+ Varnisher.log.info 'Done.'
  end

  end
data/lib/varnisher/purger.rb ADDED
@@ -0,0 +1,62 @@
+ module Varnisher
+ # Sends a purge request to the Varnish server
+ #
+ # It does this by sending an HTTP request with a custom method; either
+ # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+ # specified target is a hostname.
+ #
+ # This naturally relies on you having your Varnish config prepared
+ # appropriately, so that the actual purge will take place when we send
+ # these requests.
+ #
+ # @param target [String, URI] The URL or hostname to purge
+ # @param type [:page, :domain] Whether to do a purge of an individual
+ # URL or a whole hostname
+ # @return [true, false] True if we received an acceptable response
+ # from the server; false otherwise
+ def self.purge(target, type = :page)
+ if type == :page
+ purger = Purger.from_url(target)
+ else
+ purger = Purger.new('DOMAINPURGE', '/', target)
+ end
+
+ purger.send if purger
+ end
+
+ # Responsible for sending purge requests to the Varnish server.
+ class Purger
+ # Prepares a new purge request.
+ #
+ # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+ # the server
+ # @param path [String] The path to purge; for a domain purge,
+ # use "/"
+ # @param host [String] The hostname of the URL being purged
+ def initialize(method, path, host)
+ @method = method
+ @path = path
+ @host = host
+ end
+
+ def self.from_url(url)
+ begin
+ uri = URI.parse(URI.encode(url.to_s.strip))
+ rescue
+ return
+ end
+
+ new('PURGE', uri.path, uri.host)
+ end
+
+ def send
+ hostname = Varnisher.options['hostname']
+ port = Varnisher.options['port']
+
+ TCPSocket.open(hostname, port) do |s|
+ s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+ !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+ end
+ end
+ end
+ end
data/lib/varnisher/spider.rb CHANGED
@@ -1,73 +1,85 @@
  require 'rubygems'
- require 'hpricot'
+ require 'nokogiri'
  require 'net/http'
  require 'parallel'

  module Varnisher
+ # Crawls a website, following links that it finds along the way, until
+ # it either runs out of pages to visit or reaches the limit of pages
+ # that you impose on it.
+ #
+ # The spider is multithreaded, which means that one slow request won't
+ # prevent the rest of your requests from happening; this is often the
+ # case when the cached resources are a combination of static or
+ # near-static resources (like CSS and images) and slow, dynamically
+ # generated pages.
+ #
+ # The spider's behaviour can be configured somewhat, so that for
+ # example it ignores query strings (treating /foo?foo=bar and
+ # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+ # and /foo#bar will be treated as different URLs).
+ #
+ #
  class Spider

+ # Starts a new spider instance.
+ #
+ # Once it's done a bit of housekeeping and verified that the URL is
+ # acceptable, it calls {#spider} to do the actual fetching of the
+ # pages.
+ #
+ # @param url [String, URI] The URL to begin the spidering from. This
+ # also restricts the spider to fetching pages only on that
+ # (sub)domain - so, for example, if you specify
+ # http://example.com/foo as your starting page, only URLs that begin
+ # http://example.com will be followed.
  def initialize(url)
- if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
- url = 'http://' + url
- end
+ # If we've been given only a hostname, assume that we want to
+ # start spidering from the homepage
+ url = 'http://' + url unless url =~ %r(^[a-z]+://)

  @uri = URI.parse(url)

- @pages_hit = 0
-
  @visited = []
  @to_visit = []
-
- puts "Beginning spider of #{url}"
- crawl_page(url)
- spider
- puts "Done; #{@pages_hit} pages hit."
  end

+ # Adds a link to the queue of pages to be visited.
+ #
+ # Doesn't perform any duplication-checking; however, {#crawl_page}
+ # will refuse to crawl pages that have already been visited, so you
+ # can safely queue links blindly and trust that {#crawl_page} will do
+ # the de-duping for you.
+ #
+ # @api private
  def queue_link(url)
  @to_visit << url
  end

- def crawl_page(url, limit = 10)
+ # Visits a page, and extracts the links that it finds there.
+ #
+ # Links can be in the href attributes of HTML anchor tags, or they
+ # can just be URLs that are mentioned in the content of the page;
+ # the spider is flexible about what it crawls.
+ #
+ # Each link that it finds will be added to the queue of further
+ # pages to visit.
+ #
+ # @param url [String, URI] The URL of the page to fetch
+ #
+ # @api private
+ def crawl_page(uri)
  # Don't crawl a page twice
- return if @visited.include? url
+ return if @visited.include? uri.to_s

  # Let's not hit this again
- @visited << url
-
- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- return
- end
-
- headers = {
- "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
- "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
- }
-
- begin
- req = Net::HTTP::Get.new(uri.path, headers)
- response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
- case response
- when Net::HTTPRedirection
- return crawl_page(response['location'], limit - 1)
- when Net::HTTPSuccess
- doc = Hpricot(response.body)
- end
- rescue
- return
- end
+ @visited << uri.to_s

- @pages_hit += 1
+ doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)

- if $options["verbose"]
- puts "Fetched #{url}..."
- end
+ Varnisher.log.debug "Fetched #{uri}..."

- find_links(doc, url) do |link|
+ find_links(doc, uri).each do |link|
  next if @visited.include? link
  next if @to_visit.include? link

@@ -75,93 +87,149 @@ module Varnisher
  end
  end

- def find_links(doc, url)
- return unless doc.respond_to? 'search'
+ # Given a Nokogiri document, will return all the links in that
+ # document.
+ #
+ # "Links" are defined, for now, as the contents of the `href`
+ # attributes on HTML `<a>` tags, and URLs that are mentioned in
+ # comments.
+ #
+ # @param doc A Nokogiri document
+ # @param url [String, URI] The URL that the document came from;
+ # this is used to resolve relative URIs
+ #
+ # @return [Array] An array of URIs
+ #
+ # @api private
+ def find_links(doc, uri)
+ hrefs = []

- begin
- uri = URI.parse(URI.encode(url.to_s.strip))
- rescue
- return
- end
+ hrefs = get_anchors(doc)
+ hrefs += get_commented_urls(doc)

- hrefs = []
+ hrefs = valid_urls(hrefs, uri)
+ hrefs = remove_hashes(hrefs)
+ hrefs = remove_query_strings(hrefs)
+
+ hrefs
+ end
+
+ # Given an HTML document, will return all the URLs that exist as
+ # href attributes of anchor tags.
+ #
+ # @return [Array] An array of strings
+ def get_anchors(doc)
+ doc.xpath('//a[@href]').map { |e| e['href'] }
+ end
+
+ # Given an HTML document, will return all the URLs that exist in
+ # HTML comments, e.g.:
+ #
+ # <!-- http://example.com/foo/bar -->
+ def get_commented_urls(doc)
+ doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+ end

- # Looks like a valid document! Let's parse it for links
- doc.search("//a[@href]").each do |e|
- hrefs << e.get_attribute("href")
+ # Given a set of URLs, will return only the ones that are valid for
+ # spidering.
+ #
+ # That means URLs that have the same hostname as the hostname we
+ # started from, and that are on the HTTP scheme rather than HTTPS
+ # (since Varnish doesn't support HTTPS).
+ #
+ # Additionally, some normalisation will be performed, so that the
+ # URLs are absolute (using the page that they were fetched from as
+ # the base, just like a browser would).
+ #
+ # @return [Array] An array of URIs
+ def valid_urls(hrefs, uri)
+ hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+ .select { |u| u.scheme == 'http' && u.host == @uri.host }
+ end
+
+ # Given a set of URLs, will normalise them according to their URL
+ # minus the hash; that is, normalise them so that:
+ #
+ # foo#bar
+ #
+ # and:
+ #
+ # foo#baz
+ #
+ # Are considered the same.
+ #
+ # @return [Array] An array of URIs
+ def remove_hashes(hrefs)
+ return hrefs unless Varnisher.options['ignore-hashes']
+
+ hrefs = hrefs.group_by do |h|
+ URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
  end

- # Let's also look for commented-out URIs
- doc.search("//comment()").each do |e|
- e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
+ hrefs.keys
+ end
+
+ # Given a set of URLs, will normalise them according to their URL
+ # minus the query string; that is, normalise them so that:
+ #
+ # foo?foo=bar
+ #
+ # and:
+ #
+ # foo?foo=baz
+ #
+ # Are considered the same.
+ #
+ # @return [Array] An array of URIs
+ def remove_query_strings(hrefs)
+ return hrefs unless Varnisher.options['ignore-query-strings']
+
+ hrefs = hrefs.group_by do |h|
+ URI.parse(h.scheme + '://' + h.host + h.path.to_s)
  end

- hrefs.each do |href|
- # Skip mailto links
- next if href =~ /^mailto:/
-
- # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
- if href.to_s =~ /^\//
- href = uri.scheme + "://" + uri.host + href.to_s
- end
-
- # If we're dealing with a path-relative URL, make it relative to the current directory.
- unless href.to_s =~ /[a-z]+:\/\//
- # Take everything up to the final / in the path to be the current directory.
- if uri.path =~ /\//
- /^(.*)\//.match(uri.path)
- path = $1
- # If we're on the homepage, then we don't need a path.
- else
- path = ""
- end
-
- href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
- end
-
- # At this point, we should have an absolute URL regardless of
- # its original format.
-
- # Strip hash links
- if ( $options["ignore-hashes"] )
- href.gsub!(/(#.*?)$/, '')
- end
-
- # Strip query strings
- if ( $options["ignore-query-strings"] )
- href.gsub!(/(\?.*?)$/, '')
- end
-
- begin
- href_uri = URI.parse(href)
- rescue
- # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
- next
- end
-
- next if href_uri.host != uri.host
- next unless href_uri.scheme =~ /^https?$/
-
- yield href
+ hrefs.keys
+ end
+
+ # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+ # it's not one that we've visited before.
+ #
+ # @return [URI] A URI object for an unvisited page
+ def pop_url
+ url = ''
+
+ loop do
+ url = @to_visit.pop
+ break unless @visited.include?(url)
  end
+
+ url
  end

- def spider
- threads = $options["threads"] || 16
- num_pages = $options["num-pages"] || -1
+ # Kicks off the spidering process.
+ #
+ # Fires up Parallel in as many threads as have been configured, and
+ # begins to visit the pages in turn.
+ #
+ # This method is also responsible for checking whether the page
+ # limit has been reached and, if it has, ending the spidering.
+ #
+ # @api private
+ def run
+ Varnisher.log.info "Beginning spider of #{@uri}"

- Parallel.in_threads(threads) { |thread_number|
- # We've crawled too many pages
- next if @pages_hit > num_pages && num_pages >= 0
+ crawl_page(@uri)

- while @to_visit.length > 0 do
- begin
- url = @to_visit.pop
- end while ( @visited.include? url )
+ threads = Varnisher.options['threads']
+ num_pages = Varnisher.options['num-pages']
+
+ Parallel.in_threads(threads) do |thread_number|
+ next if @visited.length > num_pages && num_pages >= 0
+
+ crawl_page(pop_url) while @to_visit.length > 0
+ end

- crawl_page(url)
- end
- }
+ Varnisher.log.info "Done; #{@visited.length} pages hit."
  end
  end
  end
data/lib/varnisher/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Varnisher
- VERSION = "1.0.beta.2"
+ VERSION = '1.0.beta.3'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: varnisher
  version: !ruby/object:Gem::Version
- version: 1.0.beta.2
+ version: 1.0.beta.3
  platform: ruby
  authors:
  - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
  - !ruby/object:Gem::Version
  version: 5.2.0
  - !ruby/object:Gem::Dependency
- name: hpricot
+ name: nokogiri
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 0.8.6
+ version: 1.6.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
- version: 0.8.6
+ version: 1.6.0
  - !ruby/object:Gem::Dependency
  name: parallel
  requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: 0.4.1
+ - !ruby/object:Gem::Dependency
+ name: yard
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 0.8.7
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ~>
+ - !ruby/object:Gem::Version
+ version: 0.8.7
  description: Some tools that make working with the Varnish HTTP cache easier, including
  things like doing mass purges of entire domains.
  email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
  - bin/varnisher
  - lib/varnisher/domainpurger.rb
  - lib/varnisher/pagepurger.rb
+ - lib/varnisher/purger.rb
  - lib/varnisher/spider.rb
  - lib/varnisher/version.rb
  - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
  specification_version: 4
  summary: Helpful tools for working with Varnish caches
  test_files: []
+ has_rdoc: