varnisher 1.0.beta.2 → 1.0.beta.3
This diff shows the changes between two publicly released versions of this package, as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +11 -6
- data/bin/varnisher +44 -28
- data/lib/varnisher.rb +61 -0
- data/lib/varnisher/domainpurger.rb +17 -17
- data/lib/varnisher/pagepurger.rb +117 -141
- data/lib/varnisher/purger.rb +62 -0
- data/lib/varnisher/spider.rb +187 -119
- data/lib/varnisher/version.rb +1 -1
- metadata +20 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+  data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+  data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md
CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
 * Purge an entire domain, including optionally re-spidering it
   afterwards to keep the cache warm
 
+Full documentation is available [on
+rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
 ## Installation
 
 Varnish requires Ruby >1.9.3 to run. If you've got a recent Ruby
-installed, then Varnisher can be installed
+installed, then Varnisher can be installed easily via RubyGems.
+
+Varnisher is still in beta; you can install it with:
 
-    gem install varnisher
+    gem install varnisher --pre
 
 ## Usage
 
@@ -52,9 +57,9 @@ you want to paste and override them:
     verbose: false
     hostname: localhost
    port: 80
-
-
-
+    num-pages: -1
+    ignore-hashes: true
+    ignore-query-strings: false
 
 ## Examples
 
@@ -89,7 +94,7 @@ which is fairly standard:
 
 (For an explanation of just what `obj.http.x-url` means, and why you
 should use it rather than `req.url`, see [this
-page](http://
+page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)
 
 ### Purging an entire domain
 
data/bin/varnisher
CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
 require 'varnisher'
 
 Main {
-  examples
+  examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'
 
-  description
+  description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'
 
   argument 'target'
 
@@ -19,89 +19,105 @@ Main {
     description "If given, Varnisher will be noisier about what it's up to."
   }
 
+  option('q', 'quiet') {
+    description 'If given, Varnisher will be silent apart from errors.'
+  }
+
   option('H', 'hostname') {
     argument :required
-    description
-    default "localhost"
+    description 'The hostname/IP address of your Varnish server.'
   }
 
   option('p', 'port') {
     argument :required
     cast :int
-    description
-
+    description 'The port Varnish is listening on.'
+  }
+
+  option('o', 'output-file') {
+    argument :required
+    description 'A file to output log information to. If not given, output will be printed to STDOUT'
   }
 
   def before_run
     load_config
   end
 
-  mode
-    argument('target') { description
+  mode 'purge' do
+    argument('target') { description 'The URL or hostname to purge' }
 
     option('reindex') {
-      description
+      description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
     }
 
     def run
       target = params['target'].value
 
       # If target is a valid URL, then assume we're purging a page and its contents.
-      if target =~
-        Varnisher::PagePurger.new target
-
-
+      if target =~ %r(^[a-z]+://)
+        purger = Varnisher::PagePurger.new target
+        purger.purge
       # If target is a hostname, assume we want to purge an entire domain.
-
+      elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
        Varnisher::DomainPurger.new target
 
        if params['reindex'].given?
-          Varnisher::Spider.new "http://#{target}/"
+          spider = Varnisher::Spider.new "http://#{target}/"
+          spider.run
        end
      end
    end
  end
 
-  mode
-    argument('target') { description
+  mode 'spider' do
+    argument('target') { description 'The URL to begin spidering from.' }
 
    option('n', 'num-pages') {
      argument :required
      cast :int
-      description
-      default -1
+      description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
    }
 
    option('t', 'threads') {
      argument :required
      cast :int
-      description
-      default 16
+      description 'Spidering is done in parallel; this variable controls how many threads will be used.'
    }
 
    option('#', 'ignore-hashes') {
-      description
+      description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
    }
 
    option('q', 'ignore-query-strings') {
-      description
+      description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
    }
 
    def run
      target = params['target'].value
 
-      Varnisher::Spider.new target
+      spider = Varnisher::Spider.new target
+      spider.run
    end
  end
 
  def load_config
-
+    # Start with our default options.
+    options = Varnisher.options
 
-
+    # Check the user's RC file -- if it exists -- to see if they've
+    # specified any defaults of their own.
+    rcfile = File.expand_path('~/.varnishrc')
    if FileTest.readable? rcfile
-      rc = YAML
-
+      rc = YAML.load(File.open(rcfile))
+      options.merge!(rc)
    end
+
+    # The highest priority is given to command line arguments, so that
+    # the user can override things that are in their RC file if they
+    # choose to.
+    options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+    Varnisher.options = options
  end
}
 
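For context: the new `load_config` gives command-line flags the highest priority, then the RC file, then the built-in defaults. A minimal standalone Ruby sketch of that precedence (the hash names and values here are illustrative, not from the package):

    # Defaults are overridden by the RC file, which is overridden by CLI flags.
    defaults = { 'port' => 80, 'threads' => 16 }
    rc       = { 'port' => 8080 }                  # as if loaded from ~/.varnishrc
    cli      = { 'threads' => 4, 'port' => nil }   # nil marks options the user didn't pass

    options = defaults.merge(rc).merge(cli.reject { |_, v| v.nil? })
    options # => { "port" => 8080, "threads" => 4 }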
data/lib/varnisher.rb
CHANGED
@@ -1,4 +1,65 @@
 require_relative 'varnisher/spider'
+require_relative 'varnisher/purger'
 require_relative 'varnisher/domainpurger'
 require_relative 'varnisher/pagepurger'
 
+require 'logger'
+
+# This module is a namespace for our main functionality:
+#
+# * {Varnisher::Spider}
+# * {Varnisher::DomainPurger}
+# * {Varnisher::PagePurger}
+module Varnisher
+  # Our default options are set here; they can be overriden either by
+  # command-line arguments or by settings in a user's ~/.varnishrc file.
+  @options = {
+    'verbose' => false,
+    'quiet' => false,
+    'hostname' => nil,
+    'port' => 80,
+    'num-pages' => -1,
+    'threads' => 16,
+    'ignore-hashes' => true,
+    'ignore-query-strings' => false,
+    'output-file' => nil
+  }
+
+  def self.options
+    @options
+  end
+
+  def self.options=(options)
+    @options = options
+
+    if options['hostname'].nil? && options['target']
+      uri = URI.parse(options['target'])
+      options['hostname'] = uri.host
+    end
+
+    start_logging
+  end
+
+  # Sets up our Logger object, which will write output either to STDOUT
+  # (the default) or to the specified file.
+  def self.start_logging
+    output = @options['output-file'] || STDOUT
+    @log = Logger.new(output)
+
+    # By default, only display the log message, nothing else.
+    @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+    @log.level = if @options['verbose']
+                   Logger::DEBUG
+                 elsif @options['quiet']
+                   Logger::FATAL
+                 else
+                   Logger::INFO
+                 end
+  end
+
+  def self.log
+    @log
+  end
+end
+
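A hedged usage sketch of the module-level options and logger this hunk introduces (the option values are assumptions, not from the diff):

    require 'varnisher'

    # Assigning options triggers start_logging; 'verbose' maps to Logger::DEBUG.
    Varnisher.options = Varnisher.options.merge('verbose' => true)

    Varnisher.log.info  'printed at INFO level and above'
    Varnisher.log.debug 'also printed, because verbose selects DEBUG'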
data/lib/varnisher/domainpurger.rb
CHANGED
@@ -1,27 +1,27 @@
 require 'net/http'
 
-# This requires a special bit of VCL:
-#
-# if ( req.request == "DOMAINPURGE" ) {
-#   if ( client.ip ~ auth ) {
-#     ban("obj.http.x-host == " + req.http.host);
-#     error 200 "Purged.";
-#   }
-# }
-
 module Varnisher
+  # Purges an entire domain from the Varnish cache.
+  #
+  # This requires a special bit of VCL in your Varnish configuration:
+  #
+  #     if ( req.request == "DOMAINPURGE" ) {
+  #       if ( client.ip ~ auth ) {
+  #         ban("obj.http.x-host == " + req.http.host);
+  #         error 200 "Purged.";
+  #       }
+  #     }
   class DomainPurger
+    # Executes the purge request.
+    #
+    # @param domain [String] The hostname to purge
     def initialize(domain)
-
-
-
-      if s.read =~ /HTTP\/1\.1 200 Purged\./
-        puts "Purged #{domain}"
+      purged = Varnisher.purge(domain, :domain)
+      if purged
+        Varnisher.log.info "Purged #{domain}"
       else
-
+        Varnisher.log.info "Failed to purge #{domain}"
       end
-
-      s.close
     end
   end
 end
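Usage, for context: the class still does its work in `initialize`, so constructing it performs the purge. A minimal sketch, assuming the DOMAINPURGE VCL above is in place and Varnish listens on localhost:80 (both assumptions):

    require 'varnisher'

    Varnisher.options = Varnisher.options.merge('hostname' => 'localhost',
                                                'port'     => 80)
    Varnisher::DomainPurger.new('example.com') # logs "Purged example.com" on success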
data/lib/varnisher/pagepurger.rb
CHANGED
@@ -1,180 +1,156 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'
 
 module Varnisher
+  # Purges an individual URL from Varnish.
   class PagePurger
-
+    # A bash at an abstract representation of resources. All you need
+    # is an XPath, and what attribute to select from the matched
+    # elements.
+    Resource = Struct.new :name, :selector, :attribute
+    def self.resources
+      [
+        Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+        Resource.new('JavaScript file', 'script[src]', 'src'),
+        Resource.new('image file', 'img[src]', 'src')
+      ]
+    end
+
+    # Purges the given URL from the Varnish cache.
+    #
+    # Will also purge all of the resources it finds on that page (e.g.
+    # images, CSS files, JavaScript files, etc.)
+    #
+    # @param url [String, URI] The URL to purge
     def initialize(url)
       @url = url
       @uri = URI.parse(url)
-
-      @urls = []
-
-      # First, purge the URL itself; that means we'll get up-to-date references within that page.
-      puts "Purging #{@url}...\n\n"
-      purge(@url)
-
-      # Then, do a fresh GET of the page and queue any resources we find on it.
-      puts "Looking for external resources on #{@url}..."
-
-      if $options["verbose"]
-        puts "\n\n"
-      end
 
-
+      @urls = []
+    end
 
-
-
+    # Sends a PURGE request to the Varnish server, asking it to purge
+    # the given URL from its cache.
+    #
+    # This presupposes that you have the following VCL in your Varnish
+    # config file:
+    #
+    #     if (req.request == "PURGE") {
+    #       if ( client.ip ~ auth ) {
+    #         ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+    #         error 200 "Purged.";
+    #       }
+    #     }
+    #
+    # More about purging can be found
+    # [in the Varnish documentation][purging-and-banning].
+    #
+    # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+    #
+    # @api private
+    def purge
+      Varnisher.log.info "Purging #{@url}..."
+
+      purged = Varnisher.purge(@url)
+      if purged
+        Varnisher.log.info ''
+        Varnisher.log.debug "Purged #{@url}"
+      else
+        Varnisher.log.info "Failed to purge #{@url}\n"
       end
 
-
+      purge_resources
+    end
 
-
-
-
-      end
-
-      # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
-      puts "Tidying resources...\n"
-      tidy_resources
-      puts "#{@urls.length} purgeable resources found.\n\n"
-
-      # Now, purge all of the resources we just queued.
-      puts "Purging resources..."
+    # Purges all the resources on the given page.
+    def purge_resources
+      fetch_page
 
-      if
-        puts "\n\n"
-      end
+      return if @urls.empty?
 
+      tidy_resources
       purge_queue
-
-      if $options["verbose"]
-        puts "\n"
-      end
-
-      puts "Nothing more to do!\n\n"
     end
-
-    #
-
+
+    # Fetches a page and parses out any external resources (e.g.
+    # JavaScript files, images, CSS files) it finds on it.
+    #
+    # @api private
+    def fetch_page
+      Varnisher.log.info "Looking for external resources on #{@url}..."
+
       begin
-
+        @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
       rescue
-
+        Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
         return
       end
 
-
-      s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
-      if $options["verbose"]
-        if s.read =~ /HTTP\/1\.1 200 Purged\./
-          puts "Purged #{url}"
-        else
-          puts "Failed to purge #{url}"
-        end
-      end
+      @urls = find_resources
 
-
+      Varnisher.log.debug ''
+      Varnisher.log.info "#{@urls.length} total resources found.\n"
     end
-
-    # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
-    def fetch_page(url)
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        puts "Couldn't parse URL for resource-searching: #{url}"
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
-        "Accept-Charset" => "utf-8",
-        "Accept" => "text/html"
-      }
-
-      begin
-        doc = Hpricot(Net::HTTP.get_response(uri).body)
-      rescue
-        puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
-        return
-      end
 
-
-
-
-
-
+    # Returns an array of resources contained within the current page.
+    #
+    # Resources include things like CSS files, images, and JavaScript
+    # files.
+    #
+    # If a block is given, the block will be executed once for each
+    # resource.
+    #
+    # @return [Array] An array of strings, each representing a URL
+    #
+    # @api private
+    def find_resources
+      found = []
+
+      self.class.resources.each do |res|
+        @doc.css(res.selector).each do |e|
+          attribute = e[res.attribute]
+
+          Varnisher.log.debug("Found resource: #{attribute}")
+
+          yield attribute if block_given?
+          found << attribute
+        end
       end
+
+      found
     end
 
-
-
+    # Tidies up the resource queue, converting relative URLs to
+    # absolute.
+    #
+    # @return [Array] The new URLs
+    #
+    # @api private
+    def tidy_resources
+      Varnisher.log.info 'Tidying resources...'
 
-
-
-      resources = [
-        resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
-        resource.new('JavaScript file', 'script[@src]', 'src'),
-        resource.new('image file', 'img[@src]', 'src')
-      ]
+      @urls = @urls.map { |url| URI.join(@uri, url) }
+                   .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }
 
-
-      doc.search(resource.xpath).each { |e|
-        att = e.get_attribute(resource.attribute)
-        yield att
-      }
-      }
-    end
-
-    # Adds a URL to the processing queue.
-    def queue_resource(url)
-      @urls << url.to_s
+      Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
     end
-
-    def tidy_resources
-      valid_urls = []
-
-      @urls.each { |url|
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if url.to_s =~ /^\//
-          url = @uri.scheme + "://" + @uri.host + url.to_s
-        end
 
-
-
-
-
-
-
-
-        begin
-          uri = URI.parse(url)
-        rescue
-          next
-        end
-
-        # Skip URLs that aren't HTTP, or that are on different domains.
-        next if uri.scheme != "http"
-        next if uri.host != @uri.host
+    # Processes the queue of URLs, sending a purge request for each of
+    # them.
+    #
+    # @api private
+    def purge_queue
+      Varnisher.log.info 'Purging resources...'
 
-
-
+      Parallel.map(@urls) do |url|
+        Varnisher.log.debug "Purging #{url}..."
 
-
-
-
-      # Processes the queue of URLs, sending a purge request for each of them.
-      def purge_queue()
-        Parallel.map(@urls) { |url|
-          if $options["verbose"]
-            puts "Purging #{url}..."
-          end
+        Varnisher.purge(url.to_s)
+      end
 
-
-        }
+      Varnisher.log.info 'Done.'
     end
 
   end
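One behavioural change worth noting: `PagePurger.new` no longer purges as a side effect; the work now lives in `#purge`, as the updated bin/varnisher shows. A minimal sketch (the URL is an assumption):

    require 'varnisher'

    purger = Varnisher::PagePurger.new('http://example.com/about/')
    purger.purge # purges the page, then its stylesheets, scripts and images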
data/lib/varnisher/purger.rb
ADDED
@@ -0,0 +1,62 @@
+module Varnisher
+  # Sends a purge request to the Varnish server
+  #
+  # It does this by sending an HTTP request with a custom method; either
+  # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+  # specified target is a hostname.
+  #
+  # This naturally relies on you having your Varnish config prepared
+  # appropriately, so that the actual purge will take place when we send
+  # these requests.
+  #
+  # @param target [String, URI] The URL or hostname to purge
+  # @param type [:page, :domain] Whether to do a purge of an individual
+  #   URL or a whole hostname
+  # @return [true, false] True if we received an acceptable response
+  #   from the server; false otherwise
+  def self.purge(target, type = :page)
+    if type == :page
+      purger = Purger.from_url(target)
+    else
+      purger = Purger.new('DOMAINPURGE', '/', target)
+    end
+
+    purger.send if purger
+  end
+
+  # Responsible for sending purge requests to the Varnish server.
+  class Purger
+    # Prepares a new purge request.
+    #
+    # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+    #   the server
+    # @param path [String] The path to purge; for a domain purge,
+    #   use "/"
+    # @param host [String] The hostname of the URL being purged
+    def initialize(method, path, host)
+      @method = method
+      @path = path
+      @host = host
+    end
+
+    def self.from_url(url)
+      begin
+        uri = URI.parse(URI.encode(url.to_s.strip))
+      rescue
+        return
+      end
+
+      new('PURGE', uri.path, uri.host)
+    end
+
+    def send
+      hostname = Varnisher.options['hostname']
+      port = Varnisher.options['port']
+
+      TCPSocket.open(hostname, port) do |s|
+        s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+        !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+      end
+    end
+  end
+end
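The new `Purger#send` speaks plain HTTP over a TCP socket rather than using Net::HTTP, because PURGE and DOMAINPURGE are custom methods. A sketch of the equivalent raw exchange (host, port and path are assumptions):

    require 'socket'

    TCPSocket.open('localhost', 80) do |s|
      s.print("PURGE /about/ HTTP/1.1\r\nHost: example.com\r\n\r\n")
      puts s.read # "HTTP/1.1 200 Purged." when the PURGE VCL matches
    end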
data/lib/varnisher/spider.rb
CHANGED
@@ -1,73 +1,85 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'
 
 module Varnisher
+  # Crawls a website, following links that it finds along the way, until
+  # it either runs out of pages to visit or reaches the limit of pages
+  # that you impose on it.
+  #
+  # The spider is multithreaded, which means that one slow request won't
+  # prevent the rest of your requests from happening; this is often the
+  # case when the cached resources are a combination of static or
+  # near-static resources (like CSS and images) and slow, dynamically
+  # generated pages.
+  #
+  # The spider's behaviour can be configured somewhat, so that for
+  # example it ignores query strings (treating /foo?foo=bar and
+  # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+  # and /foo#bar will be treated as different URLs).
+  #
+  #
   class Spider
 
+    # Starts a new spider instance.
+    #
+    # Once it's done a bit of housekeeping and verified that the URL is
+    # acceptable, it calls {#spider} to do the actual fetching of the
+    # pages.
+    #
+    # @param url [String, URI] The URL to begin the spidering from. This
+    #   also restricts the spider to fetching pages only on that
+    #   (sub)domain - so, for example, if you specify
+    #   http://example.com/foo as your starting page, only URLs that begin
+    #   http://example.com will be followed.
     def initialize(url)
-
-
-
+      # If we've been given only a hostname, assume that we want to
+      # start spidering from the homepage
+      url = 'http://' + url unless url =~ %r(^[a-z]+://)
 
       @uri = URI.parse(url)
 
-      @pages_hit = 0
-
       @visited = []
       @to_visit = []
-
-      puts "Beginning spider of #{url}"
-      crawl_page(url)
-      spider
-      puts "Done; #{@pages_hit} pages hit."
     end
 
+    # Adds a link to the queue of pages to be visited.
+    #
+    # Doesn't perform any duplication-checking; however, {#crawl_page}
+    # will refuse to crawl pages that have already been visited, so you
+    # can safely queue links blindly and trust that {#crawl_page} will do
+    # the de-duping for you.
+    #
+    # @api private
     def queue_link(url)
       @to_visit << url
     end
 
-
+    # Visits a page, and extracts the links that it finds there.
+    #
+    # Links can be in the href attributes of HTML anchor tags, or they
+    # can just be URLs that are mentioned in the content of the page;
+    # the spider is flexible about what it crawls.
+    #
+    # Each link that it finds will be added to the queue of further
+    # pages to visit.
+    #
+    # @param url [String, URI] The URL of the page to fetch
+    #
+    # @api private
+    def crawl_page(uri)
       # Don't crawl a page twice
-      return if @visited.include?
+      return if @visited.include? uri.to_s
 
       # Let's not hit this again
-      @visited <<
-
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
-        "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
-        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-      }
-
-      begin
-        req = Net::HTTP::Get.new(uri.path, headers)
-        response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
-        case response
-        when Net::HTTPRedirection
-          return crawl_page(response['location'], limit - 1)
-        when Net::HTTPSuccess
-          doc = Hpricot(response.body)
-        end
-      rescue
-        return
-      end
+      @visited << uri.to_s
 
-
+      doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)
 
-
-      puts "Fetched #{url}..."
-      end
+      Varnisher.log.debug "Fetched #{uri}..."
 
-      find_links(doc,
+      find_links(doc, uri).each do |link|
         next if @visited.include? link
         next if @to_visit.include? link
 
@@ -75,93 +87,149 @@ module Varnisher
       end
     end
 
-
-
+    # Given a Nokogiri document, will return all the links in that
+    # document.
+    #
+    # "Links" are defined, for now, as the contents of the `href`
+    # attributes on HTML `<a>` tags, and URLs that are mentioned in
+    # comments.
+    #
+    # @param doc A Nokogiri document
+    # @param url [String, URI] The URL that the document came from;
+    #   this is used to resolve relative URIs
+    #
+    # @return [Array] An array of URIs
+    #
+    # @api private
+    def find_links(doc, uri)
+      hrefs = []
 
-
-
-      rescue
-        return
-      end
+      hrefs = get_anchors(doc)
+      hrefs += get_commented_urls(doc)
 
-      hrefs =
+      hrefs = valid_urls(hrefs, uri)
+      hrefs = remove_hashes(hrefs)
+      hrefs = remove_query_strings(hrefs)
+
+      hrefs
+    end
+
+    # Given an HTML document, will return all the URLs that exist as
+    # href attributes of anchor tags.
+    #
+    # @return [Array] An array of strings
+    def get_anchors(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
+
+    # Given an HTML document, will return all the URLs that exist in
+    # HTML comments, e.g.:
+    #
+    #     <!-- http://example.com/foo/bar -->
+    def get_commented_urls(doc)
+      doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+    end
 
-
-
-
+    # Given a set of URLs, will return only the ones that are valid for
+    # spidering.
+    #
+    # That means URLs that have the same hostname as the hostname we
+    # started from, and that are on the HTTP scheme rather than HTTPS
+    # (since Varnish doesn't support HTTPS).
+    #
+    # Additionally, some normalisation will be performed, so that the
+    # URLs are absolute (using the page that they were fetched from as
+    # the base, just like a browser would).
+    #
+    # @return [Array] An array of URIs
+    def valid_urls(hrefs, uri)
+      hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+           .select { |u| u.scheme == 'http' && u.host == @uri.host }
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the hash; that is, normalise them so that:
+    #
+    #     foo#bar
+    #
+    # and:
+    #
+    #     foo#baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_hashes(hrefs)
+      return hrefs unless Varnisher.options['ignore-hashes']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
       end
 
-
-
-
+      hrefs.keys
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the query string; that is, normalise them so that:
+    #
+    #     foo?foo=bar
+    #
+    # and:
+    #
+    #     foo?foo=baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_query_strings(hrefs)
+      return hrefs unless Varnisher.options['ignore-query-strings']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s)
      end
 
-      hrefs.
-
-
-
-
-
-
-
-
-
-
-
-        /^(.*)\//.match(uri.path)
-        path = $1
-      # If we're on the homepage, then we don't need a path.
-      else
-        path = ""
-      end
-
-      href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
-      end
-
-      # At this point, we should have an absolute URL regardless of
-      # its original format.
-
-      # Strip hash links
-      if ( $options["ignore-hashes"] )
-        href.gsub!(/(#.*?)$/, '')
-      end
-
-      # Strip query strings
-      if ( $options["ignore-query-strings"] )
-        href.gsub!(/(\?.*?)$/, '')
-      end
-
-      begin
-        href_uri = URI.parse(href)
-      rescue
-        # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
-        next
-      end
-
-      next if href_uri.host != uri.host
-      next unless href_uri.scheme =~ /^https?$/
-
-      yield href
+      hrefs.keys
+    end
+
+    # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+    # it's not one that we've visited before.
+    #
+    # @return [URI] A URI object for an unvisited page
+    def pop_url
+      url = ''
+
+      loop do
+        url = @to_visit.pop
+        break unless @visited.include?(url)
      end
+
+      url
    end
 
-
-
-
+    # Kicks off the spidering process.
+    #
+    # Fires up Parallel in as many threads as have been configured, and
+    # begins to visit the pages in turn.
+    #
+    # This method is also responsible for checking whether the page
+    # limit has been reached and, if it has, ending the spidering.
+    #
+    # @api private
+    def run
+      Varnisher.log.info "Beginning spider of #{@uri}"
 
-
-      # We've crawled too many pages
-      next if @pages_hit > num_pages && num_pages >= 0
+      crawl_page(@uri)
 
-
-
-
-
+      threads = Varnisher.options['threads']
+      num_pages = Varnisher.options['num-pages']
+
+      Parallel.in_threads(threads) do |thread_number|
+        next if @visited.length > num_pages && num_pages >= 0
+
+        crawl_page(pop_url) while @to_visit.length > 0
+      end
 
-
-      end
-    }
+      Varnisher.log.info "Done; #{@visited.length} pages hit."
     end
   end
 end
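Like PagePurger, the Spider no longer crawls from `initialize`; you call `#run` explicitly. A hedged sketch (hostname and limits are assumptions):

    require 'varnisher'

    Varnisher.options = Varnisher.options.merge('threads' => 4, 'num-pages' => 100)

    spider = Varnisher::Spider.new('example.com') # "http://" is prepended automatically
    spider.run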
data/lib/varnisher/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: varnisher
 version: !ruby/object:Gem::Version
-  version: 1.0.beta.2
+  version: 1.0.beta.3
 platform: ruby
 authors:
 - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
   - !ruby/object:Gem::Version
     version: 5.2.0
 - !ruby/object:Gem::Dependency
-  name: hpricot
+  name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: parallel
   requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
 description: Some tools that make working with the Varnish HTTP cache easier, including
   things like doing mass purges of entire domains.
 email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
 - bin/varnisher
 - lib/varnisher/domainpurger.rb
 - lib/varnisher/pagepurger.rb
+- lib/varnisher/purger.rb
 - lib/varnisher/spider.rb
 - lib/varnisher/version.rb
 - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
 specification_version: 4
 summary: Helpful tools for working with Varnish caches
 test_files: []
+has_rdoc: