varnisher 1.0.beta.2 → 1.0.beta.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4a9d710584e6d43f0f925cd4894f536fa1630551
-  data.tar.gz: 9d22ce540f64d36683db840c395c03c1096a88cd
+  metadata.gz: c623fd6e6e310bbe921eac9813f729315e9adfdb
+  data.tar.gz: 7cc10a40560e5a08cbdff5d77896f8321a502b3b
 SHA512:
-  metadata.gz: 7afaedc98a7557689c4908da0ba13e54f674d70ec3fd48543f0bf4181b016d61d05f532196683f3dfd97ff8e44ddde417afb4e116bfdf941af73068664531327
-  data.tar.gz: 55cd0e503a1152418c84e3a1876ee75a38b2c6450ea5e9495b0cd2d2d55bf608d8ef07939e84f170cfc19d2c665f8863cafb5e6aeb21241a71e8fc71caa706f2
+  metadata.gz: fb8f37ead31d4e11ad082384c3cf1846d1cb889eca0272f17f62bd57def26f85b14a41cc2a04f8e59b637017eaacf4e3faa11cda6cd5e0203bca27026d3389a0
+  data.tar.gz: 0bc12b5fe3bee9b63a15a8780c7c836aad16773359d5e2452c7b273c36e6eb4ef95816044fbc0693056b62187ec74a35e820badea6af37c8d9051fccc9d677ba
data/README.md CHANGED
@@ -12,12 +12,17 @@ Varnisher lets you do things like:
 * Purge an entire domain, including optionally re-spidering it
   afterwards to keep the cache warm

+Full documentation is available [on
+rdoc.info](http://rdoc.info/github/robmiller/varnisher).
+
 ## Installation

 Varnisher requires Ruby >1.9.3 to run. If you've got a recent Ruby
-installed, then Varnisher can be installed by running:
+installed, then Varnisher can be installed easily via RubyGems.
+
+Varnisher is still in beta; you can install it with:

-    gem install varnisher
+    gem install varnisher --pre

 ## Usage

@@ -52,9 +57,9 @@ you want to paste and override them:
     verbose: false
     hostname: localhost
     port: 80
-    num_pages: 100
-    ignore_hash: true
-    ignore_query_string: false
+    num-pages: -1
+    ignore-hashes: true
+    ignore-query-strings: false

 ## Examples

@@ -89,7 +94,7 @@ which is fairly standard:

 (For an explanation of just what `obj.http.x-url` means, and why you
 should use it rather than `req.url`, see [this
-page](http://kristianlyng.wordpress.com/2010/07/28/smart-bans-with-varnish/).)
+page](http://kly.no/posts/2010_07_28__Smart_bans_with_Varnish__.html).)

 ### Purging an entire domain

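The hunk above also renames the RC-file settings to hyphenated forms (num-pages, ignore-hashes, ignore-query-strings) and makes the page limit unlimited by default. As a quick sanity check, here is a minimal sketch of how such a ~/.varnishrc parses with Ruby's stock YAML library; the file contents are inlined as a string purely for illustration:

    require 'yaml'

    # Hyphenated keys parse as plain string keys, which is what
    # Varnisher's option merging expects.
    rc = YAML.load("num-pages: -1\nignore-hashes: true\nignore-query-strings: false")

    rc['num-pages']     # => -1
    rc['ignore-hashes'] # => true
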
data/bin/varnisher CHANGED
@@ -9,9 +9,9 @@ require 'yaml'
 require 'varnisher'

 Main {
-  examples "varnisher purge http://example.com", "varnisher spider example.com", "varnisher purge --reindex example.com"
+  examples 'varnisher purge http://example.com', 'varnisher spider example.com', 'varnisher purge --reindex example.com'

-  description "Varnisher is a set of tools for working with the Varnish HTTP cache."
+  description 'Varnisher is a set of tools for working with the Varnish HTTP cache.'

   argument 'target'

@@ -19,89 +19,105 @@ Main {
     description "If given, Varnisher will be noisier about what it's up to."
   }

+  option('q', 'quiet') {
+    description 'If given, Varnisher will be silent apart from errors.'
+  }
+
   option('H', 'hostname') {
     argument :required
-    description "The hostname/IP address of your Varnish server."
-    default "localhost"
+    description 'The hostname/IP address of your Varnish server.'
   }

   option('p', 'port') {
     argument :required
     cast :int
-    description "The port Varnish is listening on."
-    default 80
+    description 'The port Varnish is listening on.'
+  }
+
+  option('o', 'output-file') {
+    argument :required
+    description 'A file to output log information to. If not given, output will be printed to STDOUT'
   }

   def before_run
     load_config
   end

-  mode "purge" do
-    argument('target') { description "The URL or hostname to purge" }
+  mode 'purge' do
+    argument('target') { description 'The URL or hostname to purge' }

     option('reindex') {
-      description "If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible."
+      description 'If you specify a hostname to purge, this option will respider that hostname after the purging is complete. This will keep your cache as warm as possible.'
     }

     def run
       target = params['target'].value

       # If target is a valid URL, then assume we're purging a page and its contents.
-      if target =~ /^[a-z]+:\/\//
-        Varnisher::PagePurger.new target
-      end
-
+      if target =~ %r(^[a-z]+://)
+        purger = Varnisher::PagePurger.new target
+        purger.purge
       # If target is a hostname, assume we want to purge an entire domain.
-      if target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
+      elsif target =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
         Varnisher::DomainPurger.new target

         if params['reindex'].given?
-          Varnisher::Spider.new "http://#{target}/"
+          spider = Varnisher::Spider.new "http://#{target}/"
+          spider.run
         end
       end
     end
   end

-  mode "spider" do
-    argument('target') { description "The URL to begin spidering from." }
+  mode 'spider' do
+    argument('target') { description 'The URL to begin spidering from.' }

     option('n', 'num-pages') {
       argument :required
       cast :int
-      description "Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit."
-      default -1
+      description 'Maximum number of pages to crawl. Setting this to -1 (the default) will impose no limit.'
     }

     option('t', 'threads') {
       argument :required
       cast :int
-      description "Spidering is done in parallel; this variable controls how many threads will be used."
-      default 16
+      description 'Spidering is done in parallel; this variable controls how many threads will be used.'
     }

     option('#', 'ignore-hashes') {
-      description "When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource."
+      description 'When given, /foo#foo and /foo#bar will be treated as separate URLs; the default is to treat them as the same resource.'
     }

     option('q', 'ignore-query-strings') {
-      description "When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource."
+      description 'When given, /foo?foo=bar and /foo?foo=baz will be treated as the same resource.'
     }

     def run
       target = params['target'].value

-      Varnisher::Spider.new target
+      spider = Varnisher::Spider.new target
+      spider.run
     end
   end

   def load_config
-    $options = params.to_options
+    # Start with our default options.
+    options = Varnisher.options

-    rcfile = File.expand_path("~/.varnishrc")
+    # Check the user's RC file -- if it exists -- to see if they've
+    # specified any defaults of their own.
+    rcfile = File.expand_path('~/.varnishrc')
     if FileTest.readable? rcfile
-      rc = YAML::load(File.open(rcfile))
-      $options.merge!(rc)
+      rc = YAML.load(File.open(rcfile))
+      options.merge!(rc)
     end
+
+    # The highest priority is given to command line arguments, so that
+    # the user can override things that are in their RC file if they
+    # choose to.
+    options.merge!(params.to_options.reject { |k, v| v.nil? })
+
+    Varnisher.options = options
   end
 }

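The rewritten load_config gives options three tiers of precedence: built-in defaults, then the RC file, then command-line flags, with nil-valued flags discarded so that unset options don't clobber the earlier tiers. A standalone sketch of that merge order (the hash contents here are illustrative, not Varnisher's full defaults):

    defaults = { 'port' => 80, 'num-pages' => -1, 'threads' => 16 }
    rc       = { 'port' => 6081 }                    # from ~/.varnishrc
    cli      = { 'port' => nil, 'num-pages' => 100 } # unset flags arrive as nil

    # Later merges win; rejecting nils keeps unset CLI flags from
    # erasing RC-file or default values.
    options = defaults.merge(rc).merge(cli.reject { |_, v| v.nil? })
    options # => {"port"=>6081, "num-pages"=>100, "threads"=>16}
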
data/lib/varnisher.rb CHANGED
@@ -1,4 +1,65 @@
 require_relative 'varnisher/spider'
+require_relative 'varnisher/purger'
 require_relative 'varnisher/domainpurger'
 require_relative 'varnisher/pagepurger'

+require 'logger'
+
+# This module is a namespace for our main functionality:
+#
+# * {Varnisher::Spider}
+# * {Varnisher::DomainPurger}
+# * {Varnisher::PagePurger}
+module Varnisher
+  # Our default options are set here; they can be overriden either by
+  # command-line arguments or by settings in a user's ~/.varnishrc file.
+  @options = {
+    'verbose' => false,
+    'quiet' => false,
+    'hostname' => nil,
+    'port' => 80,
+    'num-pages' => -1,
+    'threads' => 16,
+    'ignore-hashes' => true,
+    'ignore-query-strings' => false,
+    'output-file' => nil
+  }
+
+  def self.options
+    @options
+  end
+
+  def self.options=(options)
+    @options = options
+
+    if options['hostname'].nil? && options['target']
+      uri = URI.parse(options['target'])
+      options['hostname'] = uri.host
+    end
+
+    start_logging
+  end
+
+  # Sets up our Logger object, which will write output either to STDOUT
+  # (the default) or to the specified file.
+  def self.start_logging
+    output = @options['output-file'] || STDOUT
+    @log = Logger.new(output)
+
+    # By default, only display the log message, nothing else.
+    @log.formatter = proc { |_, _, _, msg| "#{msg}\n" }
+
+    @log.level = if @options['verbose']
+                   Logger::DEBUG
+                 elsif @options['quiet']
+                   Logger::FATAL
+                 else
+                   Logger::INFO
+                 end
+  end
+
+  def self.log
+    @log
+  end
+end
+
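Because options and logging now live on the Varnisher module rather than in a $options global, library users can configure both without going through the CLI. A minimal usage sketch; the verbose/quiet mapping follows start_logging above:

    require 'varnisher'

    # Assigning options triggers start_logging, so the log level tracks
    # the flags: verbose => DEBUG, quiet => FATAL, otherwise INFO.
    Varnisher.options = Varnisher.options.merge('verbose' => true)

    Varnisher.log.debug 'Visible now, because verbose selects Logger::DEBUG'
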
data/lib/varnisher/domainpurger.rb CHANGED
@@ -1,27 +1,27 @@
 require 'net/http'

-# This requires a special bit of VCL:
-#
-#     if ( req.request == "DOMAINPURGE" ) {
-#       if ( client.ip ~ auth ) {
-#         ban("obj.http.x-host == " + req.http.host);
-#         error 200 "Purged.";
-#       }
-#     }
-
 module Varnisher
+  # Purges an entire domain from the Varnish cache.
+  #
+  # This requires a special bit of VCL in your Varnish configuration:
+  #
+  #     if ( req.request == "DOMAINPURGE" ) {
+  #       if ( client.ip ~ auth ) {
+  #         ban("obj.http.x-host == " + req.http.host);
+  #         error 200 "Purged.";
+  #       }
+  #     }
   class DomainPurger
+    # Executes the purge request.
+    #
+    # @param domain [String] The hostname to purge
     def initialize(domain)
-      s = TCPSocket.open($options['hostname'], $options['port'])
-      s.print("DOMAINPURGE / HTTP/1.1\r\nHost: #{domain}\r\n\r\n")
-
-      if s.read =~ /HTTP\/1\.1 200 Purged\./
-        puts "Purged #{domain}"
+      purged = Varnisher.purge(domain, :domain)
+      if purged
+        Varnisher.log.info "Purged #{domain}"
       else
-        puts "Failed to purge #{domain}"
+        Varnisher.log.info "Failed to purge #{domain}"
       end
-
-      s.close
     end
   end
 end
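DomainPurger now delegates to Varnisher.purge(domain, :domain), but the wire format is unchanged from the removed socket code: a request with the custom DOMAINPURGE method whose Host header names the domain to ban. A raw-socket sketch of that exchange, assuming a Varnish instance on localhost:80 carrying the DOMAINPURGE VCL quoted above:

    require 'socket'

    # Send the custom-method request and look for Varnish's
    # "200 Purged." status line, as the old inline code did.
    socket = TCPSocket.open('localhost', 80)
    socket.print "DOMAINPURGE / HTTP/1.1\r\nHost: example.com\r\n\r\n"
    purged = !!(socket.read =~ /HTTP\/1\.1 200 Purged\./)
    socket.close

    puts purged ? 'Purged example.com' : 'Failed to purge example.com'
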
data/lib/varnisher/pagepurger.rb CHANGED
@@ -1,180 +1,156 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'

 module Varnisher
+  # Purges an individual URL from Varnish.
   class PagePurger
-
+    # A bash at an abstract representation of resources. All you need
+    # is an XPath, and what attribute to select from the matched
+    # elements.
+    Resource = Struct.new :name, :selector, :attribute
+    def self.resources
+      [
+        Resource.new('stylesheet', 'link[rel~=stylesheet]', 'href'),
+        Resource.new('JavaScript file', 'script[src]', 'src'),
+        Resource.new('image file', 'img[src]', 'src')
+      ]
+    end
+
+    # Purges the given URL from the Varnish cache.
+    #
+    # Will also purge all of the resources it finds on that page (e.g.
+    # images, CSS files, JavaScript files, etc.)
+    #
+    # @param url [String, URI] The URL to purge
     def initialize(url)
       @url = url
       @uri = URI.parse(url)
-
-      @urls = []
-
-      # First, purge the URL itself; that means we'll get up-to-date references within that page.
-      puts "Purging #{@url}...\n\n"
-      purge(@url)
-
-      # Then, do a fresh GET of the page and queue any resources we find on it.
-      puts "Looking for external resources on #{@url}..."
-
-      if $options["verbose"]
-        puts "\n\n"
-      end

-      fetch_page(@url)
+      @urls = []
+    end

-      if $options["verbose"]
-        puts "\n"
+    # Sends a PURGE request to the Varnish server, asking it to purge
+    # the given URL from its cache.
+    #
+    # This presupposes that you have the following VCL in your Varnish
+    # config file:
+    #
+    #     if (req.request == "PURGE") {
+    #       if ( client.ip ~ auth ) {
+    #         ban("obj.http.x-url == " + req.url + " && obj.http.x-host == " + req.http.host);
+    #         error 200 "Purged.";
+    #       }
+    #     }
+    #
+    # More about purging can be found
+    # [in the Varnish documentation][purging-and-banning].
+    #
+    # [purging-and-banning]: http://varnish-cache.org/docs/3.0/tutorial/purging.html
+    #
+    # @api private
+    def purge
+      Varnisher.log.info "Purging #{@url}..."
+
+      purged = Varnisher.purge(@url)
+      if purged
+        Varnisher.log.info ''
+        Varnisher.log.debug "Purged #{@url}"
+      else
+        Varnisher.log.info "Failed to purge #{@url}\n"
       end

-      puts "#{@urls.length} total resources found.\n\n"
+      purge_resources
+    end

-      if @urls.length == 0
-        puts "No resources found. Abort!"
-        return
-      end
-
-      # Let's figure out which of these resources we can actually purge — whether they're on our server, etc.
-      puts "Tidying resources...\n"
-      tidy_resources
-      puts "#{@urls.length} purgeable resources found.\n\n"
-
-      # Now, purge all of the resources we just queued.
-      puts "Purging resources..."
+    # Purges all the resources on the given page.
+    def purge_resources
+      fetch_page

-      if $options["verbose"]
-        puts "\n\n"
-      end
+      return if @urls.empty?

+      tidy_resources
       purge_queue
-
-      if $options["verbose"]
-        puts "\n"
-      end
-
-      puts "Nothing more to do!\n\n"
     end
-
-    # Sends a PURGE request to the Varnish server, asking it to purge the given URL from its cache.
-    def purge(url)
+
+    # Fetches a page and parses out any external resources (e.g.
+    # JavaScript files, images, CSS files) it finds on it.
+    #
+    # @api private
+    def fetch_page
+      Varnisher.log.info "Looking for external resources on #{@url}..."
+
       begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
+        @doc = Nokogiri::HTML(Net::HTTP.get_response(@uri).body)
       rescue
-        puts "Couldn't parse URL for purging: #{$!}"
+        Varnisher.log.info "Hmm, I couldn't fetch that URL. Sure it's right?\n"
         return
       end

-      s = TCPSocket.open($options['hostname'], $options['port'])
-      s.print("PURGE #{uri.path} HTTP/1.1\r\nHost: #{uri.host}\r\n\r\n")
-
-      if $options["verbose"]
-        if s.read =~ /HTTP\/1\.1 200 Purged\./
-          puts "Purged #{url}"
-        else
-          puts "Failed to purge #{url}"
-        end
-      end
+      @urls = find_resources

-      s.close
+      Varnisher.log.debug ''
+      Varnisher.log.info "#{@urls.length} total resources found.\n"
     end
-
-    # Fetches a page and parses out any external resources (e.g. JavaScript files, images, CSS files) it finds on it.
-    def fetch_page(url)
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        puts "Couldn't parse URL for resource-searching: #{url}"
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2",
-        "Accept-Charset" => "utf-8",
-        "Accept" => "text/html"
-      }
-
-      begin
-        doc = Hpricot(Net::HTTP.get_response(uri).body)
-      rescue
-        puts "Hmm, I couldn't seem to fetch that URL. Sure it's right?\n"
-        return
-      end

-      find_resources(doc) do |resource|
-        if $options["verbose"]
-          puts "Found #{resource}"
-        end
-        queue_resource(resource)
+    # Returns an array of resources contained within the current page.
+    #
+    # Resources include things like CSS files, images, and JavaScript
+    # files.
+    #
+    # If a block is given, the block will be executed once for each
+    # resource.
+    #
+    # @return [Array] An array of strings, each representing a URL
+    #
+    # @api private
+    def find_resources
+      found = []
+
+      self.class.resources.each do |res|
+        @doc.css(res.selector).each do |e|
+          attribute = e[res.attribute]
+
+          Varnisher.log.debug("Found resource: #{attribute}")
+
+          yield attribute if block_given?
+          found << attribute
+        end
       end
+
+      found
     end

-    def find_resources(doc)
-      return unless doc.respond_to? 'search'
+    # Tidies up the resource queue, converting relative URLs to
+    # absolute.
+    #
+    # @return [Array] The new URLs
+    #
+    # @api private
+    def tidy_resources
+      Varnisher.log.info 'Tidying resources...'

-      # A bash at an abstract representation of resources. All you need is an XPath, and what attribute to select from the matched elements.
-      resource = Struct.new :name, :xpath, :attribute
-      resources = [
-        resource.new('stylesheet', 'link[@rel*=stylesheet]', 'href'),
-        resource.new('JavaScript file', 'script[@src]', 'src'),
-        resource.new('image file', 'img[@src]', 'src')
-      ]
+      @urls = @urls.map { |url| URI.join(@uri, url) }
+              .select { |uri| uri.scheme == 'http' && uri.host == @uri.host }

-      resources.each { |resource|
-        doc.search(resource.xpath).each { |e|
-          att = e.get_attribute(resource.attribute)
-          yield att
-        }
-      }
-    end
-
-    # Adds a URL to the processing queue.
-    def queue_resource(url)
-      @urls << url.to_s
+      Varnisher.log.info "#{@urls.length} purgeable resources found.\n"
     end
-
-    def tidy_resources
-      valid_urls = []
-
-      @urls.each { |url|
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if url.to_s =~ /^\//
-          url = @uri.scheme + "://" + @uri.host + url.to_s
-        end

-        # If we're dealing with a path-relative URL, make it relative to the current directory.
-        unless url.to_s =~ /[a-z]+:\/\//
-          # Take everything up to the final / in the path to be the current directory.
-          /^(.*)\//.match(@uri.path)
-          url = @uri.scheme + "://" + @uri.host + $1 + "/" + url.to_s
-        end
-
-        begin
-          uri = URI.parse(url)
-        rescue
-          next
-        end
-
-        # Skip URLs that aren't HTTP, or that are on different domains.
-        next if uri.scheme != "http"
-        next if uri.host != @uri.host
+    # Processes the queue of URLs, sending a purge request for each of
+    # them.
+    #
+    # @api private
+    def purge_queue
+      Varnisher.log.info 'Purging resources...'

-        valid_urls << url
-      }
+      Parallel.map(@urls) do |url|
+        Varnisher.log.debug "Purging #{url}..."

-      @urls = valid_urls.dup
-    end
-
-    # Processes the queue of URLs, sending a purge request for each of them.
-    def purge_queue()
-      Parallel.map(@urls) { |url|
-        if $options["verbose"]
-          puts "Purging #{url}..."
-        end
+        Varnisher.purge(url.to_s)
+      end

-        purge(url)
-      }
+      Varnisher.log.info 'Done.'
     end

   end
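The move from Hpricot to Nokogiri also swaps the XPath-flavoured selectors for CSS ones (link[rel~=stylesheet], script[src], img[src]). A self-contained sketch of the same extraction against a throwaway document:

    require 'nokogiri'

    html = '<html><head><link rel="stylesheet" href="/css/main.css"></head>' \
           '<body><img src="/img/logo.png"><script src="/js/app.js"></script></body></html>'
    doc = Nokogiri::HTML(html)

    # The same selector/attribute pairs as PagePurger.resources.
    doc.css('link[rel~=stylesheet]').map { |e| e['href'] } # => ["/css/main.css"]
    doc.css('script[src]').map { |e| e['src'] }            # => ["/js/app.js"]
    doc.css('img[src]').map { |e| e['src'] }               # => ["/img/logo.png"]
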
data/lib/varnisher/purger.rb ADDED
@@ -0,0 +1,62 @@
+module Varnisher
+  # Sends a purge request to the Varnish server
+  #
+  # It does this by sending an HTTP request with a custom method; either
+  # PURGE, if the specified target is a URL, or DOMAINPURGE if the
+  # specified target is a hostname.
+  #
+  # This naturally relies on you having your Varnish config prepared
+  # appropriately, so that the actual purge will take place when we send
+  # these requests.
+  #
+  # @param target [String, URI] The URL or hostname to purge
+  # @param type [:page, :domain] Whether to do a purge of an individual
+  #   URL or a whole hostname
+  # @return [true, false] True if we received an acceptable response
+  #   from the server; false otherwise
+  def self.purge(target, type = :page)
+    if type == :page
+      purger = Purger.from_url(target)
+    else
+      purger = Purger.new('DOMAINPURGE', '/', target)
+    end
+
+    purger.send if purger
+  end
+
+  # Responsible for sending purge requests to the Varnish server.
+  class Purger
+    # Prepares a new purge request.
+    #
+    # @param method ["PURGE", "DOMAINPURGE"] The HTTP verb to send to
+    #   the server
+    # @param path [String] The path to purge; for a domain purge,
+    #   use "/"
+    # @param host [String] The hostname of the URL being purged
+    def initialize(method, path, host)
+      @method = method
+      @path = path
+      @host = host
+    end
+
+    def self.from_url(url)
+      begin
+        uri = URI.parse(URI.encode(url.to_s.strip))
+      rescue
+        return
+      end
+
+      new('PURGE', uri.path, uri.host)
+    end
+
+    def send
+      hostname = Varnisher.options['hostname']
+      port = Varnisher.options['port']
+
+      TCPSocket.open(hostname, port) do |s|
+        s.print("#{@method} #{@path} HTTP/1.1\r\nHost: #{@host}\r\n\r\n")
+        !!s.read.match(/HTTP\/1\.1 200 Purged\./)
+      end
+    end
+  end
+end
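With the transport factored out into Varnisher.purge and the Purger class, both purge styles share a single code path. A minimal usage sketch, assuming a reachable Varnish instance configured with the PURGE/DOMAINPURGE VCL shown earlier; the hostname and port are assumptions for the example:

    require 'varnisher'

    # Options must be set first so Purger#send knows where to connect.
    Varnisher.options = Varnisher.options.merge('hostname' => 'localhost',
                                                'port' => 80)

    Varnisher.purge('http://example.com/about/') # => true on "200 Purged."
    Varnisher.purge('example.com', :domain)      # whole-domain ban
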
data/lib/varnisher/spider.rb CHANGED
@@ -1,73 +1,85 @@
 require 'rubygems'
-require 'hpricot'
+require 'nokogiri'
 require 'net/http'
 require 'parallel'

 module Varnisher
+  # Crawls a website, following links that it finds along the way, until
+  # it either runs out of pages to visit or reaches the limit of pages
+  # that you impose on it.
+  #
+  # The spider is multithreaded, which means that one slow request won't
+  # prevent the rest of your requests from happening; this is often the
+  # case when the cached resources are a combination of static or
+  # near-static resources (like CSS and images) and slow, dynamically
+  # generated pages.
+  #
+  # The spider's behaviour can be configured somewhat, so that for
+  # example it ignores query strings (treating /foo?foo=bar and
+  # /foo?foo=baz as the same URL), or doesn't ignore hashes (so /foo#foo
+  # and /foo#bar will be treated as different URLs).
+  #
+  #
   class Spider

+    # Starts a new spider instance.
+    #
+    # Once it's done a bit of housekeeping and verified that the URL is
+    # acceptable, it calls {#spider} to do the actual fetching of the
+    # pages.
+    #
+    # @param url [String, URI] The URL to begin the spidering from. This
+    #   also restricts the spider to fetching pages only on that
+    #   (sub)domain - so, for example, if you specify
+    #   http://example.com/foo as your starting page, only URLs that begin
+    #   http://example.com will be followed.
     def initialize(url)
-      if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
-        url = 'http://' + url
-      end
+      # If we've been given only a hostname, assume that we want to
+      # start spidering from the homepage
+      url = 'http://' + url unless url =~ %r(^[a-z]+://)

       @uri = URI.parse(url)

-      @pages_hit = 0
-
       @visited = []
       @to_visit = []
-
-      puts "Beginning spider of #{url}"
-      crawl_page(url)
-      spider
-      puts "Done; #{@pages_hit} pages hit."
     end

+    # Adds a link to the queue of pages to be visited.
+    #
+    # Doesn't perform any duplication-checking; however, {#crawl_page}
+    # will refuse to crawl pages that have already been visited, so you
+    # can safely queue links blindly and trust that {#crawl_page} will do
+    # the de-duping for you.
+    #
+    # @api private
     def queue_link(url)
       @to_visit << url
     end

-    def crawl_page(url, limit = 10)
+    # Visits a page, and extracts the links that it finds there.
+    #
+    # Links can be in the href attributes of HTML anchor tags, or they
+    # can just be URLs that are mentioned in the content of the page;
+    # the spider is flexible about what it crawls.
+    #
+    # Each link that it finds will be added to the queue of further
+    # pages to visit.
+    #
+    # @param url [String, URI] The URL of the page to fetch
+    #
+    # @api private
+    def crawl_page(uri)
       # Don't crawl a page twice
-      return if @visited.include? url
+      return if @visited.include? uri.to_s

       # Let's not hit this again
-      @visited << url
-
-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
-
-      headers = {
-        "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
-        "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
-        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-      }
-
-      begin
-        req = Net::HTTP::Get.new(uri.path, headers)
-        response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }
-
-        case response
-        when Net::HTTPRedirection
-          return crawl_page(response['location'], limit - 1)
-        when Net::HTTPSuccess
-          doc = Hpricot(response.body)
-        end
-      rescue
-        return
-      end
+      @visited << uri.to_s

-      @pages_hit += 1
+      doc = Nokogiri::HTML(Net::HTTP.get_response(uri).body)

-      if $options["verbose"]
-        puts "Fetched #{url}..."
-      end
+      Varnisher.log.debug "Fetched #{uri}..."

-      find_links(doc, url) do |link|
+      find_links(doc, uri).each do |link|
         next if @visited.include? link
         next if @to_visit.include? link

@@ -75,93 +87,149 @@ module Varnisher
       end
     end

-    def find_links(doc, url)
-      return unless doc.respond_to? 'search'
+    # Given a Nokogiri document, will return all the links in that
+    # document.
+    #
+    # "Links" are defined, for now, as the contents of the `href`
+    # attributes on HTML `<a>` tags, and URLs that are mentioned in
+    # comments.
+    #
+    # @param doc A Nokogiri document
+    # @param url [String, URI] The URL that the document came from;
+    #   this is used to resolve relative URIs
+    #
+    # @return [Array] An array of URIs
+    #
+    # @api private
+    def find_links(doc, uri)
+      hrefs = []

-      begin
-        uri = URI.parse(URI.encode(url.to_s.strip))
-      rescue
-        return
-      end
+      hrefs = get_anchors(doc)
+      hrefs += get_commented_urls(doc)

-      hrefs = []
+      hrefs = valid_urls(hrefs, uri)
+      hrefs = remove_hashes(hrefs)
+      hrefs = remove_query_strings(hrefs)
+
+      hrefs
+    end
+
+    # Given an HTML document, will return all the URLs that exist as
+    # href attributes of anchor tags.
+    #
+    # @return [Array] An array of strings
+    def get_anchors(doc)
+      doc.xpath('//a[@href]').map { |e| e['href'] }
+    end
+
+    # Given an HTML document, will return all the URLs that exist in
+    # HTML comments, e.g.:
+    #
+    #     <!-- http://example.com/foo/bar -->
+    def get_commented_urls(doc)
+      doc.xpath('//comment()').flat_map { |e| URI.extract(e.to_html, 'http') }
+    end

-      # Looks like a valid document! Let's parse it for links
-      doc.search("//a[@href]").each do |e|
-        hrefs << e.get_attribute("href")
+    # Given a set of URLs, will return only the ones that are valid for
+    # spidering.
+    #
+    # That means URLs that have the same hostname as the hostname we
+    # started from, and that are on the HTTP scheme rather than HTTPS
+    # (since Varnish doesn't support HTTPS).
+    #
+    # Additionally, some normalisation will be performed, so that the
+    # URLs are absolute (using the page that they were fetched from as
+    # the base, just like a browser would).
+    #
+    # @return [Array] An array of URIs
+    def valid_urls(hrefs, uri)
+      hrefs.map { |u| URI.join(uri, URI.escape(u)) }
+           .select { |u| u.scheme == 'http' && u.host == @uri.host }
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the hash; that is, normalise them so that:
+    #
+    #     foo#bar
+    #
+    # and:
+    #
+    #     foo#baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_hashes(hrefs)
+      return hrefs unless Varnisher.options['ignore-hashes']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s + h.query.to_s)
       end

-      # Let's also look for commented-out URIs
-      doc.search("//comment()").each do |e|
-        e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
+      hrefs.keys
+    end
+
+    # Given a set of URLs, will normalise them according to their URL
+    # minus the query string; that is, normalise them so that:
+    #
+    #     foo?foo=bar
+    #
+    # and:
+    #
+    #     foo?foo=baz
+    #
+    # Are considered the same.
+    #
+    # @return [Array] An array of URIs
+    def remove_query_strings(hrefs)
+      return hrefs unless Varnisher.options['ignore-query-strings']
+
+      hrefs = hrefs.group_by do |h|
+        URI.parse(h.scheme + '://' + h.host + h.path.to_s)
       end

-      hrefs.each do |href|
-        # Skip mailto links
-        next if href =~ /^mailto:/
-
-        # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
-        if href.to_s =~ /^\//
-          href = uri.scheme + "://" + uri.host + href.to_s
-        end
-
-        # If we're dealing with a path-relative URL, make it relative to the current directory.
-        unless href.to_s =~ /[a-z]+:\/\//
-          # Take everything up to the final / in the path to be the current directory.
-          if uri.path =~ /\//
-            /^(.*)\//.match(uri.path)
-            path = $1
-          # If we're on the homepage, then we don't need a path.
-          else
-            path = ""
-          end
-
-          href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
-        end
-
-        # At this point, we should have an absolute URL regardless of
-        # its original format.
-
-        # Strip hash links
-        if ( $options["ignore-hashes"] )
-          href.gsub!(/(#.*?)$/, '')
-        end
-
-        # Strip query strings
-        if ( $options["ignore-query-strings"] )
-          href.gsub!(/(\?.*?)$/, '')
-        end
-
-        begin
-          href_uri = URI.parse(href)
-        rescue
-          # No harm in this — if we can't parse it as a URI, it probably isn't one (`javascript:` links, etc.) and we can safely ignore it.
-          next
-        end
-
-        next if href_uri.host != uri.host
-        next unless href_uri.scheme =~ /^https?$/
-
-        yield href
+      hrefs.keys
+    end
+
+    # Pops a URL from the queue of yet-to-be-visited URLs, ensuring that
+    # it's not one that we've visited before.
+    #
+    # @return [URI] A URI object for an unvisited page
+    def pop_url
+      url = ''
+
+      loop do
+        url = @to_visit.pop
+        break unless @visited.include?(url)
       end
+
+      url
     end

-    def spider
-      threads = $options["threads"] || 16
-      num_pages = $options["num-pages"] || -1
+    # Kicks off the spidering process.
+    #
+    # Fires up Parallel in as many threads as have been configured, and
+    # begins to visit the pages in turn.
+    #
+    # This method is also responsible for checking whether the page
+    # limit has been reached and, if it has, ending the spidering.
+    #
+    # @api private
+    def run
+      Varnisher.log.info "Beginning spider of #{@uri}"

-      Parallel.in_threads(threads) { |thread_number|
-        # We've crawled too many pages
-        next if @pages_hit > num_pages && num_pages >= 0
+      crawl_page(@uri)

-        while @to_visit.length > 0 do
-          begin
-            url = @to_visit.pop
-          end while ( @visited.include? url )
+      threads = Varnisher.options['threads']
+      num_pages = Varnisher.options['num-pages']
+
+      Parallel.in_threads(threads) do |thread_number|
+        next if @visited.length > num_pages && num_pages >= 0
+
+        crawl_page(pop_url) while @to_visit.length > 0
+      end

-        crawl_page(url)
-      end
-      }
+      Varnisher.log.info "Done; #{@visited.length} pages hit."
     end
   end
 end
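The new remove_hashes and remove_query_strings helpers de-duplicate by grouping URIs on everything except the fragment or query string and keeping one representative per group. A standalone sketch of the query-string case:

    require 'uri'

    urls = ['http://example.com/foo?a=1',
            'http://example.com/foo?a=2'].map { |u| URI.parse(u) }

    # Group on scheme://host/path, as Spider#remove_query_strings does;
    # the group keys are the de-duplicated URIs.
    urls.group_by { |u| URI.parse(u.scheme + '://' + u.host + u.path.to_s) }
        .keys
    # => [#<URI::HTTP http://example.com/foo>]
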
data/lib/varnisher/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Varnisher
-  VERSION = "1.0.beta.2"
+  VERSION = '1.0.beta.3'
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: varnisher
 version: !ruby/object:Gem::Version
-  version: 1.0.beta.2
+  version: 1.0.beta.3
 platform: ruby
 authors:
 - Rob Miller
@@ -25,19 +25,19 @@ dependencies:
   - !ruby/object:Gem::Version
     version: 5.2.0
 - !ruby/object:Gem::Dependency
-  name: hpricot
+  name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 0.8.6
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 0.8.6
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: parallel
   requirement: !ruby/object:Gem::Requirement
@@ -66,6 +66,20 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.8.7
 description: Some tools that make working with the Varnish HTTP cache easier, including
   things like doing mass purges of entire domains.
 email: rob@bigfish.co.uk
@@ -77,6 +91,7 @@ files:
 - bin/varnisher
 - lib/varnisher/domainpurger.rb
 - lib/varnisher/pagepurger.rb
+- lib/varnisher/purger.rb
 - lib/varnisher/spider.rb
 - lib/varnisher/version.rb
 - lib/varnisher.rb
@@ -107,3 +122,4 @@ signing_key:
 specification_version: 4
 summary: Helpful tools for working with Varnish caches
 test_files: []
+has_rdoc: