varnisher 1.0.beta.2 → 1.0.beta.3
- checksums.yaml +4 -4
- data/README.md +11 -6
- data/bin/varnisher +44 -28
- data/lib/varnisher.rb +61 -0
- data/lib/varnisher/domainpurger.rb +17 -17
- data/lib/varnisher/pagepurger.rb +117 -141
- data/lib/varnisher/purger.rb +62 -0
- data/lib/varnisher/spider.rb +187 -119
- data/lib/varnisher/version.rb +1 -1
- metadata +20 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+  data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+  data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
 * Purge an entire domain, including optionally re-spidering it
   afterwards to keep the cache warm
 
+Full documentation is available [on
+rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
 ## Installation
 
 Varnish requires Ruby >1.9.3 to run. If you've got a recent Ruby
-installed, then Varnisher can be installed
+installed, then Varnisher can be installed easily via RubyGems.
+
+Varnisher is still in beta; you can install it with:
 
-    gem install varnisher
+    gem install varnisher --pre
 
 ## Usage
 
@@ -52,9 +57,9 @@ you want to paste and override them:
     verbose: false
     hostname: localhost
     port: 80
-
-
-
+    num-pages: -1
+    ignore-hashes: true
+    ignore-query-strings: false
 
 ## Examples
 
@@ -89,7 +94,7 @@ which is fairly standard:
 
 (For an explanation of just what `obj.http.x-url` means, and why you
 should use it rather than `req.url`, see [this
-page](http://
+page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)
 
 ### Purging an entire domain
 
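For a quick orientation, these are the three example invocations that the new data/bin/varnisher (in the diff below) registers with its CLI framework:

    varnisher purge http://example.com
    varnisher spider example.com
    varnisher purge --reindex example.com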
data/bin/varnisher CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
 require 'varnisher'
 
 Main {
-  examples
+  examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'
 
-  description
+  description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'
 
   argument 'target'
 
@@ -19,89 +19,105 @@ Main {
     description "If given, Varnisher will be noisier about what it's up to."
   }
 
+  option('q', 'quiet') {
+    description 'If given, Varnisher will be silent apart from errors.'
+  }
+
   option('H', 'hostname') {
     argument :required
-    description
-    default "localhost"
+    description 'The hostname/IP address of your Varnish server.'
   }
 
   option('p', 'port') {
     argument :required
     cast :int
-    description
-
+    description 'The port Varnish is listening on.'
+  }
+
+  option('o', 'output-file') {
+    argument :required
+    description 'A file to output log information to. If not given, output will be printed to STDOUT'
   }
 
   def before_run
     load_config
   end
 
-  mode
-    argument('target') { description
+  mode 'purge' do
+    argument('target') { description 'The URL or hostname to purge' }
 
     option('reindex') {
-      description
+      description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
     }
 
     def run
      target = params['target'].value
 
      # If target is a valid URL, then assume we're purging a page and its contents.
-      if target =~
-        Varnisher::PagePurger.new target
-
-
+      if target =~ %r(^[a-z]+://)
+        purger = Varnisher::PagePurger.new target
+        purger.purge
      # If target is a hostname, assume we want to purge an entire domain.
-
+      elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
        Varnisher::DomainPurger.new target
 
        if params['reindex'].given?
-          Varnisher::Spider.new "http://#{target}/"
+          spider = Varnisher::Spider.new "http://#{target}/"
+          spider.run
        end
      end
    end
  end
 
-  mode
-    argument('target') { description
+  mode 'spider' do
+    argument('target') { description 'The URL to begin spidering from.' }
 
    option('n', 'num-pages') {
      argument :required
      cast :int
-      description
-      default -1
+      description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
    }
 
    option('t', 'threads') {
      argument :required
      cast :int
-      description
-      default 16
+      description 'Spidering is done in parallel; this variable controls how many threads will be used.'
    }
 
    option('#', 'ignore-hashes') {
-      description
+      description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
    }
 
    option('q', 'ignore-query-strings') {
-      description
+      description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
    }
 
    def run
      target = params['target'].value
 
-      Varnisher::Spider.new target
+      spider = Varnisher::Spider.new target
+      spider.run
    end
  end
 
  def load_config
-
+    # Start with our default options.
+    options = Varnisher.options
 
-
+    # Check the user's RC file -- if it exists -- to see if they've
+    # specified any defaults of their own.
+    rcfile = File.expand_path('~/.varnishrc')
    if FileTest.readable? rcfile
-      rc = YAML
-
+      rc = YAML.load(File.open(rcfile))
+      options.merge!(rc)
    end
+
+    # The highest priority is given to command line arguments, so that
+    # the user can override things that are in their RC file if they
+    # choose to.
+    options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+    Varnisher.options = options
  end
 }
 
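In library terms, the two CLI modes reduce to a handful of calls. A minimal Ruby sketch of the same dispatch, assuming Varnisher.options already holds a usable 'hostname' and 'port' the way load_config arranges above:

    require 'varnisher'

    # What `varnisher purge http://example.com/` does:
    purger = Varnisher::PagePurger.new 'http://example.com/'
    purger.purge

    # What `varnisher purge --reindex example.com` does: the purge
    # happens in DomainPurger's constructor, then the spider re-warms
    # the cache.
    Varnisher::DomainPurger.new 'example.com'
    spider = Varnisher::Spider.new 'http://example.com/'
    spider.run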
data/lib/varnisher.rb CHANGED
@@ -1,4 +1,65 @@
 require_relative 'varnisher/spider'
+require_relative 'varnisher/purger'
 require_relative 'varnisher/domainpurger'
 require_relative 'varnisher/pagepurger'
 
+require 'logger'
+
+# This module is a namespace for our main functionality:
+#
+# * {Varnisher::Spider}
+# * {Varnisher::DomainPurger}
+# * {Varnisher::PagePurger}
+module Varnisher
+  # Our default options are set here; they can be overriden either by
+  # command-line arguments or by settings in a user's ~/.varnishrc file.
+  @options = {
+    'verbose' => false,
+    'quiet' => false,
+    'hostname' => nil,
+    'port' => 80,
+    'num-pages' => -1,
+    'threads' => 16,
+    'ignore-hashes' => true,
+    'ignore-query-strings' => false,
+    'output-file' => nil
+  }
+
+  def self.options
+    @options
+  end
+
+  def self.options=(options)
+    @options = options
+
+    if options['hostname'].nil? && options['target']
+      uri = URI.parse(options['target'])
+      options['hostname'] = uri.host
+    end
+
+    start_logging
+  end
+
+  # Sets up our Logger object, which will write output either to STDOUT
+  # (the default) or to the specified file.
+  def self.start_logging
+    output = @options['output-file'] || STDOUT
+    @log = Logger.new(output)
+
+    # By default, only display the log message, nothing else.
+    @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+    @log.level = if @options['verbose']
+                   Logger::DEBUG
+                 elsif @options['quiet']
+                   Logger::FATAL
+                 else
+                   Logger::INFO
+                 end
+  end
+
+  def self.log
+    @log
+  end
+end
+
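A minimal sketch of how these accessors behave, based only on the code above; the 'target' key is merged in by the CLI's load_config via params.to_options:

    require 'varnisher'

    # Assigning to Varnisher.options infers the hostname from 'target'
    # and (re)starts the logger at the appropriate level.
    Varnisher.options = Varnisher.options.merge(
      'verbose' => true,
      'target'  => 'http://example.com/'
    )

    Varnisher.options['hostname'] # => "example.com"
    Varnisher.log.debug 'Shown, because verbose selects Logger::DEBUG'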
data/lib/varnisher/domainpurger.rb CHANGED
@@ -1,27 +1,27 @@
 require 'net/http'
 
-# This requires a special bit of VCL:
-#
-# if ( req.request == "DOMAINPURGE" ) {
-#   if ( client.ip ~ auth ) {
-#     ban("obj.http.x-host == " + req.http.host);
-#     error 200 "Purged.";
-#   }
-# }
-
 module Varnisher
+  # Purges an entire domain from the Varnish cache.
+  #
+  # This requires a special bit of VCL in your Varnish configuration:
+  #
+  #     if ( req.request == "DOMAINPURGE" ) {
+  #       if ( client.ip ~ auth ) {
+  #         ban("obj.http.x-host == " + req.http.host);
+  #         error 200 "Purged.";
+  #       }
+  #     }
   class DomainPurger
+    # Executes the purge request.
+    #
+    # @param domain [String] The hostname to purge
     def initialize(domain)
-
-
-
-      if s.read =~ /HTTP\/1\.1 200 Purged\./
-        puts "Purged #{domain}"
+      purged = Varnisher.purge(domain, :domain)
+      if purged
+        Varnisher.log.info "Purged #{domain}"
       else
-
+        Varnisher.log.info "Failed to purge #{domain}"
       end
-
-      s.close
     end
   end
 end
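Note that DomainPurger now does its work in the constructor. A minimal usage sketch, assuming the DOMAINPURGE VCL above is loaded and Varnisher.options points at a reachable Varnish instance:

    require 'varnisher'

    Varnisher.options = Varnisher.options.merge('hostname' => 'localhost',
                                                'port'     => 80)

    # Logs "Purged example.com" on success, "Failed to purge
    # example.com" otherwise; the request itself is sent by
    # Varnisher.purge.
    Varnisher::DomainPurger.new 'example.com'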
data/lib/varnisher/pagepurger.rb CHANGED
@@ -1,180 +1,156 @@
 require 'rubygems'
-require '
+require 'nokogiri'
 require 'net/http'
 require 'parallel'
 
 module Varnisher
+  # Purges an individual URL from Varnish.
   class PagePurger
-
+    # A bash at an abstract representation of resources. All you need
+    # is an XPath, and what attribute to select from the matched
+    # elements.
+    Resource = Struct.new :name, :selector, :attribute
+    def self.resources
+      [
+        Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+        Resource.new('JavaScript file', 'script[src]', 'src'),
+        Resource.new('image file', 'img[src]', 'src')
+      ]
+    end
+
+    # Purges the given URL from the Varnish cache.
+    #
+    # Will also purge all of the resources it finds on that page (e.g.
+    # images, CSS files, JavaScript files, etc.)
+    #
+    # @param url [String, URI] The URL to purge
     def initialize(url)
       @url = url
       @uri = URI.parse(url)
-
-      @urls = []
-
-      # First, purge the URL itself; that means we'll get up-to-date references within that page.
-      puts "Purging #{@url}...\n\n"
-      purge(@url)
-
-      # Then, do a fresh GET of the page and queue any resources we find on it.
-      puts "Looking for external resources on #{@url}..."
-
-      if $options["verbose"]
-        puts "\n\n"
-      end
 
-
+      @urls = []
+    end
 
-
-
+    # Sends a PURGE request to the Varnish server, asking it to purge
+    # the given URL from its cache.
+    #
+    # This presupposes that you have the following VCL in your Varnish
+    # config file:
+    #
+    #     if (req.request == "PURGE") {
+    #       if ( client.ip ~ auth ) {
+    #         ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+    #         error 200 "Purged.";
+    #       }
+    #     }
+    #
+    # More about purging can be found
+    # [in the Varnish documentation][purging-and-banning].
+    #
+    # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+    #
+    # @api private
+    def purge
+      Varnisher.log.info "Purging #{@url}..."
+
+      purged = Varnisher.purge(@url)
+      if purged
+        Varnisher.log.info ''
+        Varnisher.log.debug "Purged #{@url}"
+      else
+        Varnisher.log.info "Failed to purge #{@url}\n"
       end
 
-
+      purge_resources
+    end
 
-
-
-
-    end
-
-      # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
-      puts "Tidying resources...\n"
-      tidy_resources
-      puts "#{@urls.length} purgeable resources found.\n\n"
-
-      # Now, purge all of the resources we just queued.
-      puts "Purging resources..."
+    # Purges all the resources on the given page.
+    def purge_resources
+      fetch_page
 
-      if
-        puts "\n\n"
-      end
+      return if @urls.empty?
 
+      tidy_resources
       purge_queue
-
-      if $options["verbose"]
-        puts "\n"
-      end
-
-      puts "Nothing more to do!\n\n"
     end
-
-    #
-
+
+    # Fetches a page and parses out any external resources (e.g.
+    # JavaScript files, images, CSS files) it finds on it.
+    #
+    # @api private
+    def fetch_page
+      Varnisher.log.info "Looking for external resources on #{@url}..."
+
       begin
-
+        @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
       rescue
-
+        Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
        return
      end
 
-
-      s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
-      if $options["verbose"]
-        if s.read =~ /HTTP\/1\.1 200 Purged\./
-          puts "Purged #{url}"
-        else
-          puts "Failed to purge #{url}"
-        end
-      end
+      @urls = find_resources
 
-
+      Varnisher.log.debug ''
+      Varnisher.log.info "#{@urls.length} total resources found.\n"
     end
-
-    # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
-    def fetch_page(url)
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        puts "Couldn't parse URL for resource-searching: #{url}"
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
-        "Accept-Charset" => "utf-8",
-        "Accept" => "text/html"
-      }
-
-      begin
-        doc = Hpricot(Net::HTTP.get_response(uri).body)
-      rescue
-        puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
-        return
-      end
 
-
-
-
-
-
+    # Returns an array of resources contained within the current page.
+    #
+    # Resources include things like CSS files, images, and JavaScript
+    # files.
+    #
+    # If a block is given, the block will be executed once for each
+    # resource.
+    #
+    # @return [Array] An array of strings, each representing a URL
+    #
+    # @api private
+    def find_resources
+      found = []
+
+      self.class.resources.each do |res|
+        @doc.css(res.selector).each do |e|
+          attribute = e[res.attribute]
+
+          Varnisher.log.debug("Found resource: #{attribute}")
+
+          yield attribute if block_given?
+          found << attribute
+        end
       end
+
+      found
     end
 
-
-
+    # Tidies up the resource queue, converting relative URLs to
+    # absolute.
+    #
+    # @return [Array] The new URLs
+    #
+    # @api private
+    def tidy_resources
+      Varnisher.log.info 'Tidying resources...'
 
-
-
-      resources = [
-        resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
-        resource.new('JavaScript file', 'script[@src]', 'src'),
-        resource.new('image file', 'img[@src]', 'src')
-      ]
+      @urls = @urls.map { |url| URI.join(@uri, url) }
+        .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }
 
-
-      doc.search(resource.xpath).each { |e|
-        att = e.get_attribute(resource.attribute)
-        yield att
-      }
-      }
-    end
-
-    # Adds a URL to the processing queue.
-    def queue_resource(url)
-      @urls << url.to_s
+      Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
    end
-
-    def tidy_resources
-      valid_urls = []
-
-      @urls.each { |url|
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if url.to_s =~ /^\//
-          url = @uri.scheme + "://" + @uri.host + url.to_s
-        end
 
-
-
-
-
-
-
-
-        begin
-          uri = URI.parse(url)
-        rescue
-          next
-        end
-
-        # Skip URLs that aren't HTTP, or that are on different domains.
-        next if uri.scheme != "http"
-        next if uri.host != @uri.host
+    # Processes the queue of URLs, sending a purge request for each of
+    # them.
+    #
+    # @api private
+    def purge_queue
+      Varnisher.log.info 'Purging resources...'
 
-
-
+      Parallel.map(@urls) do |url|
+        Varnisher.log.debug "Purging #{url}..."
 
-
-
-
-    # Processes the queue of URLs, sending a purge request for each of them.
-    def purge_queue()
-      Parallel.map(@urls) { |url|
-        if $options["verbose"]
-          puts "Purging #{url}..."
-        end
+        Varnisher.purge(url.to_s)
+      end
 
-
-      }
+      Varnisher.log.info 'Done.'
     end
 
   end
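A minimal usage sketch of the new two-step API (construct, then purge):

    require 'varnisher'

    # Construction only parses the URL; nothing is purged until #purge.
    purger = Varnisher::PagePurger.new 'http://example.com/'

    # Purges the page itself, then finds and purges its stylesheets,
    # scripts, and images in parallel via purge_resources/purge_queue.
    purger.purge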
data/lib/varnisher/purger.rb ADDED
@@ -0,0 +1,62 @@
+module Varnisher
+  # Sends a purge request to the Varnish server
+  #
+  # It does this by sending an HTTP request with a custom method; either
+  # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+  # specified target is a hostname.
+  #
+  # This naturally relies on you having your Varnish config prepared
+  # appropriately, so that the actual purge will take place when we send
+  # these requests.
+  #
+  # @param target [String, URI] The URL or hostname to purge
+  # @param type [:page, :domain] Whether to do a purge of an individual
+  #   URL or a whole hostname
+  # @return [true, false] True if we received an acceptable response
+  #   from the server; false otherwise
+  def self.purge(target, type = :page)
+    if type == :page
+      purger = Purger.from_url(target)
+    else
+      purger = Purger.new('DOMAINPURGE', '/', target)
+    end
+
+    purger.send if purger
+  end
+
+  # Responsible for sending purge requests to the Varnish server.
+  class Purger
+    # Prepares a new purge request.
+    #
+    # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+    #   the server
+    # @param path [String] The path to purge; for a domain purge,
+    #   use "/"
+    # @param host [String] The hostname of the URL being purged
+    def initialize(method, path, host)
+      @method = method
+      @path = path
+      @host = host
+    end
+
+    def self.from_url(url)
+      begin
+        uri = URI.parse(URI.encode(url.to_s.strip))
+      rescue
+        return
+      end
+
+      new('PURGE', uri.path, uri.host)
+    end
+
+    def send
+      hostname = Varnisher.options['hostname']
+      port = Varnisher.options['port']
+
+      TCPSocket.open(hostname, port) do |s|
+        s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+        !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+      end
+    end
+  end
+end
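A minimal sketch of the two entry points, with comments showing the raw request each one writes to the Varnish socket (hosts and paths are illustrative):

    require 'varnisher'

    # Page purge; writes "PURGE /foo HTTP/1.1\r\nHost: example.com" and
    # returns true only on an "HTTP/1.1 200 Purged." response.
    Varnisher.purge('http://example.com/foo')

    # Domain purge; writes "DOMAINPURGE / HTTP/1.1\r\nHost: example.com".
    Varnisher.purge('example.com', :domain)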
data/lib/varnisher/spider.rb CHANGED
@@ -1,73 +1,85 @@
 require 'rubygems'
-require '
+require 'nokogiri'
 require 'net/http'
 require 'parallel'
 
 module Varnisher
+  # Crawls a website, following links that it finds along the way, until
+  # it either runs out of pages to visit or reaches the limit of pages
+  # that you impose on it.
+  #
+  # The spider is multithreaded, which means that one slow request won't
+  # prevent the rest of your requests from happening; this is often the
+  # case when the cached resources are a combination of static or
+  # near-static resources (like CSS and images) and slow, dynamically
+  # generated pages.
+  #
+  # The spider's behaviour can be configured somewhat, so that for
+  # example it ignores query strings (treating /foo?foo=bar and
+  # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+  # and /foo#bar will be treated as different URLs).
+  #
+  #
   class Spider
 
+    # Starts a new spider instance.
+    #
+    # Once it's done a bit of housekeeping and verified that the URL is
+    # acceptable, it calls {#spider} to do the actual fetching of the
+    # pages.
+    #
+    # @param url [String, URI] The URL to begin the spidering from. This
+    #   also restricts the spider to fetching pages only on that
+    #   (sub)domain - so, for example, if you specify
+    #   http://example.com/foo as your starting page, only URLs that begin
+    #   http://example.com will be followed.
     def initialize(url)
-
-
-
+      # If we've been given only a hostname, assume that we want to
+      # start spidering from the homepage
+      url = 'http://' + url unless url =~ %r(^[a-z]+://)
 
       @uri = URI.parse(url)
 
-      @pages_hit = 0
-
       @visited = []
       @to_visit = []
-
-      puts "Beginning spider of #{url}"
-      crawl_page(url)
-      spider
-      puts "Done; #{@pages_hit} pages hit."
     end
 
+    # Adds a link to the queue of pages to be visited.
+    #
+    # Doesn't perform any duplication-checking; however, {#crawl_page}
+    # will refuse to crawl pages that have already been visited, so you
+    # can safely queue links blindly and trust that {#crawl_page} will do
+    # the de-duping for you.
+    #
+    # @api private
     def queue_link(url)
       @to_visit << url
     end
 
-
+    # Visits a page, and extracts the links that it finds there.
+    #
+    # Links can be in the href attributes of HTML anchor tags, or they
+    # can just be URLs that are mentioned in the content of the page;
+    # the spider is flexible about what it crawls.
+    #
+    # Each link that it finds will be added to the queue of further
+    # pages to visit.
+    #
+    # @param url [String, URI] The URL of the page to fetch
+    #
+    # @api private
+    def crawl_page(uri)
       # Don't crawl a page twice
-      return if @visited.include?
+      return if @visited.include? uri.to_s
 
       # Let's not hit this again
-      @visited <<
-
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
-        "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
-        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-      }
-
-      begin
-        req = Net::HTTP::Get.new(uri.path, headers)
-        response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
-        case response
-        when Net::HTTPRedirection
-          return crawl_page(response['location'], limit - 1)
-        when Net::HTTPSuccess
-          doc = Hpricot(response.body)
-        end
-      rescue
-        return
-      end
+      @visited << uri.to_s
 
-
+      doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)
 
-
-      puts "Fetched #{url}..."
-      end
+      Varnisher.log.debug "Fetched #{uri}..."
 
-      find_links(doc,
+      find_links(doc, uri).each do |link|
        next if @visited.include? link
        next if @to_visit.include? link
 
@@ -75,93 +87,149 @@ module Varnisher
       end
     end
 
-
-
+    # Given a Nokogiri document, will return all the links in that
+    # document.
+    #
+    # "Links" are defined, for now, as the contents of the `href`
+    # attributes on HTML `<a>` tags, and URLs that are mentioned in
+    # comments.
+    #
+    # @param doc A Nokogiri document
+    # @param url [String, URI] The URL that the document came from;
+    #   this is used to resolve relative URIs
+    #
+    # @return [Array] An array of URIs
+    #
+    # @api private
+    def find_links(doc, uri)
+      hrefs = []
 
-
-
-      rescue
-        return
-      end
+      hrefs = get_anchors(doc)
+      hrefs += get_commented_urls(doc)
 
-      hrefs =
+      hrefs = valid_urls(hrefs, uri)
+      hrefs = remove_hashes(hrefs)
+      hrefs = remove_query_strings(hrefs)
+
+      hrefs
+    end
+
+    # Given an HTML document, will return all the URLs that exist as
+    # href attributes of anchor tags.
+    #
+    # @return [Array] An array of strings
+    def get_anchors(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
+
+    # Given an HTML document, will return all the URLs that exist in
+    # HTML comments, e.g.:
+    #
+    #     <!-- http://example.com/foo/bar -->
+    def get_commented_urls(doc)
+      doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+    end
 
-
-
-
+    # Given a set of URLs, will return only the ones that are valid for
+    # spidering.
+    #
+    # That means URLs that have the same hostname as the hostname we
+    # started from, and that are on the HTTP scheme rather than HTTPS
+    # (since Varnish doesn't support HTTPS).
+    #
+    # Additionally, some normalisation will be performed, so that the
+    # URLs are absolute (using the page that they were fetched from as
+    # the base, just like a browser would).
+    #
+    # @return [Array] An array of URIs
+    def valid_urls(hrefs, uri)
+      hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+        .select { |u| u.scheme == 'http' && u.host == @uri.host }
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the hash; that is, normalise them so that:
+    #
+    #     foo#bar
+    #
+    # and:
+    #
+    #     foo#baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_hashes(hrefs)
+      return hrefs unless Varnisher.options['ignore-hashes']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
       end
 
-
-
-
+      hrefs.keys
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the query string; that is, normalise them so that:
+    #
+    #     foo?foo=bar
+    #
+    # and:
+    #
+    #     foo?foo=baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_query_strings(hrefs)
+      return hrefs unless Varnisher.options['ignore-query-strings']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s)
      end
 
-      hrefs.
-
-
-
-
-
-
-
-
-
-
-
-
-            /^(.*)\//.match(uri.path)
-            path = $1
-          # If we're on the homepage, then we don't need a path.
-          else
-            path = ""
-          end
-
-          href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
-        end
-
-        # At this point, we should have an absolute URL regardless of
-        # its original format.
-
-        # Strip hash links
-        if ( $options["ignore-hashes"] )
-          href.gsub!(/(#.*?)$/, '')
-        end
-
-        # Strip query strings
-        if ( $options["ignore-query-strings"] )
-          href.gsub!(/(\?.*?)$/, '')
-        end
-
-        begin
-          href_uri = URI.parse(href)
-        rescue
-          # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
-          next
-        end
-
-        next if href_uri.host != uri.host
-        next unless href_uri.scheme =~ /^https?$/
-
-        yield href
+      hrefs.keys
+    end
+
+    # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+    # it's not one that we've visited before.
+    #
+    # @return [URI] A URI object for an unvisited page
+    def pop_url
+      url = ''
+
+      loop do
+        url = @to_visit.pop
+        break unless @visited.include?(url)
      end
+
+      url
    end
 
-
-
-
+    # Kicks off the spidering process.
+    #
+    # Fires up Parallel in as many threads as have been configured, and
+    # begins to visit the pages in turn.
+    #
+    # This method is also responsible for checking whether the page
+    # limit has been reached and, if it has, ending the spidering.
+    #
+    # @api private
+    def run
+      Varnisher.log.info "Beginning spider of #{@uri}"
 
-
-      # We've crawled too many pages
-      next if @pages_hit > num_pages && num_pages >= 0
+      crawl_page(@uri)
 
-
-
-
-
+      threads = Varnisher.options['threads']
+      num_pages = Varnisher.options['num-pages']
+
+      Parallel.in_threads(threads) do |thread_number|
+        next if @visited.length > num_pages && num_pages >= 0
+
+        crawl_page(pop_url) while @to_visit.length > 0
+      end
 
-
-      end
-    }
+      Varnisher.log.info "Done; #{@visited.length} pages hit."
     end
   end
 end
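A minimal usage sketch, assuming default options apart from an explicit thread count and page limit:

    require 'varnisher'

    Varnisher.options = Varnisher.options.merge('threads'   => 8,
                                                'num-pages' => 100)

    spider = Varnisher::Spider.new 'example.com' # becomes http://example.com
    spider.run # crawls until the queue empties or the page limit is passed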
data/lib/varnisher/version.rb CHANGED
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: varnisher
 version: !ruby/object:Gem::Version
-  version: 1.0.beta.2
+  version: 1.0.beta.3
 platform: ruby
 authors:
 - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
     - !ruby/object:Gem::Version
       version: 5.2.0
 - !ruby/object:Gem::Dependency
-  name:
+  name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
      - !ruby/object:Gem::Version
-        version:
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
      - !ruby/object:Gem::Version
-        version:
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: parallel
   requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
 description: Some tools that make working with the Varnish HTTP cache easier, including
   things like doing mass purges of entire domains.
 email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
 - bin/varnisher
 - lib/varnisher/domainpurger.rb
 - lib/varnisher/pagepurger.rb
+- lib/varnisher/purger.rb
 - lib/varnisher/spider.rb
 - lib/varnisher/version.rb
 - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
 specification_version: 4
 summary: Helpful tools for working with Varnish caches
 test_files: []
+has_rdoc: