RubyGems - govuk_mirrorer - Versions diffs - 1.3.2 - Mend

govuk_mirrorer 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +7 -0
data/.gitignore +17 -0
data/.rspec +2 -0
data/.ruby-version +1 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +29 -0
data/Rakefile +13 -0
data/bin/govuk_mirrorer +9 -0
data/govuk_mirrorer.gemspec +32 -0
data/jenkins.sh +6 -0
data/lib/govuk_mirrorer.rb +14 -0
data/lib/govuk_mirrorer/configurer.rb +44 -0
data/lib/govuk_mirrorer/crawler.rb +170 -0
data/lib/govuk_mirrorer/indexer.rb +94 -0
data/lib/govuk_mirrorer/net_http_sni_monkey_patch.rb +71 -0
data/lib/govuk_mirrorer/statsd.rb +9 -0
data/lib/govuk_mirrorer/version.rb +3 -0
data/spec/govuk_mirrorer/configurer_spec.rb +64 -0
data/spec/govuk_mirrorer/crawler_spec.rb +286 -0
data/spec/govuk_mirrorer/indexer_spec.rb +191 -0
data/spec/govuk_mirrorer/mirrorer_spec.rb +15 -0
data/spec/spec_helper.rb +5 -0
metadata +212 -0

data/lib/govuk_mirrorer/net_http_sni_monkey_patch.rb ADDED

@@ -0,0 +1,71 @@
+require 'net/http'
+# Copied from ruby stdlib, with a single line addition to support SNI
+# This can be removed once we've upgraded to ruby 1.9.3
+# 1.9.2_p290 version: https://github.com/ruby/ruby/blob/v1_9_2_290/lib/net/http.rb#L642
+# 1.9.3 version:      https://github.com/ruby/ruby/blob/ruby_1_9_3/lib/net/http.rb#L760
+# The replacement below is only installed when RUBY_VERSION is exactly "1.9.2".
+if "1.9.2" == RUBY_VERSION
+  module Net
+    # Reopen Net::HTTP and redefine its connect method.
+    HTTP.class_eval do
+      # Opens the TCP connection to conn_address/conn_port and stores it,
+      # wrapped in a BufferedIO, in @socket. When use_ssl? is true the socket
+      # is additionally wrapped in an OpenSSL socket and the handshake is
+      # performed (after a proxy CONNECT when proxy? is true). Any error
+      # raised during that SSL phase is logged, the socket is closed and the
+      # error is re-raised. Finishes by calling on_connect.
+      def connect
+        D "opening connection to #{conn_address()}..."
+        s = timeout(@open_timeout) { TCPSocket.open(conn_address(), conn_port()) }
+        D "opened"
+        if use_ssl?
+          # Collect the SSL settings from the instance variables named in
+          # SSL_ATTRIBUTES; only variables that exist and hold a truthy value
+          # are copied (note the assignment inside the condition).
+          ssl_parameters = Hash.new
+          iv_list = instance_variables
+          SSL_ATTRIBUTES.each do |name|
+            ivname = "@#{name}".intern
+            if iv_list.include?(ivname) and
+               value = instance_variable_get(ivname)
+              ssl_parameters[name] = value
+            end
+          end
+          @ssl_context = OpenSSL::SSL::SSLContext.new
+          @ssl_context.set_params(ssl_parameters)
+          # From here on `s` is the SSL socket layered over the TCP socket.
+          s = OpenSSL::SSL::SSLSocket.new(s, @ssl_context)
+          s.sync_close = true
+        end
+        @socket = BufferedIO.new(s)
+        @socket.read_timeout = @read_timeout
+        @socket.debug_output = @debug_output
+        if use_ssl?
+          begin
+            if proxy?
+              # Ask the proxy to tunnel to the target before the handshake.
+              @socket.writeline sprintf('CONNECT %s:%s HTTP/%s',
+                                        @address, @port, HTTPVersion)
+              @socket.writeline "Host: #{@address}:#{@port}"
+              if proxy_user
+                # pack('m') Base64-encodes the credentials; the CR/LF
+                # characters it emits are stripped so the header stays on one line.
+                credential = ["#{proxy_user}:#{proxy_pass}"].pack('m')
+                credential.delete!("\r\n")
+                @socket.writeline "Proxy-Authorization: Basic #{credential}"
+              end
+              @socket.writeline ''
+              # Read the proxy's reply; in the stdlib, HTTPResponse#value
+              # raises unless the response is a 2xx.
+              HTTPResponse.read_new(@socket).value
+            end
+            # This is the only line that's different from the ruby method
+            # Server Name Indication (SNI) RFC 3546
+            s.hostname = @address if s.respond_to? :hostname=
+            timeout(@open_timeout) { s.connect }
+            # Skip the certificate/hostname check only when verification is disabled.
+            if @ssl_context.verify_mode != OpenSSL::SSL::VERIFY_NONE
+              s.post_connection_check(@address)
+            end
+          rescue => exception
+            D "Conn close because of connect error #{exception}"
+            @socket.close if @socket and not @socket.closed?
+            raise exception
+          end
+        end
+        on_connect
+      end
+    end
+  end
+end

data/lib/govuk_mirrorer/statsd.rb ADDED

@@ -0,0 +1,9 @@
+require "statsd"
+module GovukMirrorer
+  def self.statsd
+    host = "localhost" || ENV["STATSD_HOST"]
+    port = 8125 || ENV["STATSD_PORT"]
+    Statsd.new(host, port)
+  end
+end

data/lib/govuk_mirrorer/version.rb ADDED

@@ -0,0 +1,3 @@
+module GovukMirrorer
+  # Version string of the govuk_mirrorer gem.
+  VERSION = "1.3.2"
+end

data/spec/govuk_mirrorer/configurer_spec.rb ADDED

@@ -0,0 +1,64 @@
+require 'spec_helper'
+# Specs for GovukMirrorer::Configurer.run, which is called with an argv-style
+# array; its return value is checked for option key/value pairs.
+describe GovukMirrorer::Configurer do
+  describe "Setting site_root" do
+    it "should fail if site_root is not set" do
+      # NOTE(review): ENV is not stubbed for this first case, so it presumably
+      # relies on MIRRORER_SITE_ROOT being unset where the specs run - confirm.
+      lambda do
+        GovukMirrorer::Configurer.run([])
+      end.should raise_error(GovukMirrorer::Configurer::NoRootUrlSpecifiedError)
+      # An empty string in the environment must be rejected as well.
+      ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("")
+      lambda do
+        GovukMirrorer::Configurer.run([])
+      end.should raise_error(GovukMirrorer::Configurer::NoRootUrlSpecifiedError)
+    end
+    it "should take a site-root option on the commandline" do
+      GovukMirrorer::Configurer.run(%w[--site-root sausage]).should include(:site_root => "sausage" )
+    end
+    it "should read the site root from an ENV variable" do
+      ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
+      GovukMirrorer::Configurer.run([]).should include(:site_root => "sausage" )
+    end
+    it "should take the commandline option in preference to the ENV variable if both are specified" do
+      ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
+      GovukMirrorer::Configurer.run(%w[--site-root mash]).should include(:site_root => "mash" )
+    end
+  end
+  describe "setting the request interval" do
+    # A site root is supplied through ENV so run does not raise for these examples.
+    before :each do
+      ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
+    end
+    it "should allow setting the request interval" do
+      # The command-line string "0.6" is expected back as the Float 0.6.
+      GovukMirrorer::Configurer.run(%w[--request-interval 0.6]).should include(:request_interval => 0.6)
+    end
+    it "should default to 0.1" do
+      GovukMirrorer::Configurer.run([]).should include(:request_interval => 0.1)
+    end
+  end
+  describe "setting up logging" do
+    # A site root is supplied through ENV so run does not raise for these examples.
+    before :each do
+      ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
+    end
+    it "should allow specifying a logfile" do
+      GovukMirrorer::Configurer.run(%w[--logfile /foo/bar]).should include(:log_file => "/foo/bar")
+    end
+    it "should allow logging to syslog with default facility of local3" do
+      # --syslog takes an optional argument; without one the facility is "local3".
+      GovukMirrorer::Configurer.run(%w[--syslog]).should include(:syslog => "local3")
+    end
+    it "should allow logging to syslog overriding the default facility" do
+      GovukMirrorer::Configurer.run(%w[--syslog local5]).should include(:syslog => "local5")
+    end
+  end
+end

data/spec/govuk_mirrorer/crawler_spec.rb ADDED

@@ -0,0 +1,286 @@
+require 'spec_helper'
+# Specs for GovukMirrorer::Crawler: construction and logger setup, the crawl
+# loop and its retry/error handling, page processing, link extraction and the
+# rules applied to each link.
+describe GovukMirrorer::Crawler do
+  # For every example: stub Indexer#process_artefacts and make the crawler's
+  # logger write to /dev/null.
+  before :each do
+    GovukMirrorer::Indexer.any_instance.stub(:process_artefacts)
+    GovukMirrorer::Crawler.any_instance.stub(:logger).and_return(Logger.new("/dev/null"))
+  end
+  it 'should have a version number' do
+    GovukMirrorer::VERSION.should_not be_nil
+  end
+  describe "initializing" do
+    it "should handle all urls returned from the indexer" do
+      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
+        https://www.example.com/
+        https://www.example.com/designprinciples
+        https://www.example.com/designprinciples/styleguide
+        https://www.example.com/designprinciples/performanceframework
+      ))
+      m = GovukMirrorer::Crawler.new
+      m.urls.should == %w(
+        https://www.example.com/
+        https://www.example.com/designprinciples
+        https://www.example.com/designprinciples/styleguide
+        https://www.example.com/designprinciples/performanceframework
+      )
+    end
+    describe "setting up the logger" do
+      # Remove the logger stub installed by the outer before block so these
+      # examples inspect the logger the crawler actually builds.
+      before :each do
+        GovukMirrorer::Crawler.any_instance.unstub(:logger)
+      end
+      it "should log to stdout by default" do
+        m = GovukMirrorer::Crawler.new
+        # Reads the Logger's @logdev instance variable to find the device in use.
+        logdev = m.logger.instance_variable_get('@logdev')
+        logdev.dev.should == STDOUT
+      end
+      it "should log to a file if requested" do
+        m = GovukMirrorer::Crawler.new(:log_file => "/dev/null")
+        logdev = m.logger.instance_variable_get('@logdev')
+        logdev.filename.should == "/dev/null"
+      end
+      it "should log to syslog if requested" do
+        m = GovukMirrorer::Crawler.new(:syslog => "local4")
+        m.logger.should be_a(Syslogger)
+        m.logger.facility.should == Syslog::LOG_LOCAL4
+        m.logger.options.should == (Syslog::LOG_PID | Syslog::LOG_CONS)
+        m.logger.ident.should == 'govuk_mirrorer'
+      end
+      it "should default to log level INFO" do
+        m = GovukMirrorer::Crawler.new
+        m.logger.level.should == Logger::INFO
+      end
+      it "should allow overriding the log level" do
+        m = GovukMirrorer::Crawler.new(:log_level => 'warn')
+        m.logger.level.should == Logger::WARN
+      end
+    end
+  end
+  describe "crawl" do
+    # Two start URLs, a crawler with a 0.01 request interval, and stubs for
+    # process_govuk_page, the agent's get (returning "default") and sleep.
+    before :each do
+      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
+        https://www.example.com/1
+        https://www.example.com/2
+      ))
+      @m = GovukMirrorer::Crawler.new(:request_interval => 0.01)
+      @m.stub(:process_govuk_page)
+      @m.send(:agent).stub(:get).and_return("default")
+      @m.stub(:sleep)
+    end
+    it "should fetch each page and pass it to the handler" do
+      @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
+      @m.should_receive(:process_govuk_page).with("page_1", {}).ordered
+      @m.send(:agent).should_receive(:get).with("https://www.example.com/2").ordered.and_return("page_2")
+      @m.should_receive(:process_govuk_page).with("page_2", {}).ordered
+      @m.crawl
+    end
+    it "should sleep for the configured request_interval between requests" do
+      @m.should_receive(:process_govuk_page).ordered
+      @m.should_receive(:sleep).with(0.01).ordered # Actually on kernel, but setting the expectation here works
+      @m.should_receive(:process_govuk_page).ordered
+      @m.should_receive(:sleep).with(0.01).ordered
+      @m.crawl
+    end
+    describe "handling errors" do
+      it "should call handle_error with the relevant details" do
+        error = StandardError.new("Boom")
+        @m.send(:agent).should_receive(:get).with("https://www.example.com/1").and_raise(error)
+        @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {})
+        @m.crawl
+      end
+      it "should continue with the next URL" do
+        @m.send(:agent).stub(:get).with("https://www.example.com/1").and_raise("Boom")
+        @m.send(:agent).should_receive(:get).with("https://www.example.com/2").and_return("something")
+        @m.crawl
+      end
+      # The same two examples are generated for each listed response code:
+      # one sleep(1) followed by a single retry, then handle_error if the
+      # retry fails too.
+      context "error handling" do
+        [
+          [429, "Too Many Requests"],
+          [500, "Internal Server Error"],
+          [503, "Boom"],
+        ].each do |resp_code, resp_reason|
+          context "#{resp_code} #{resp_reason}" do
+            it "should sleep for a second, and then retry" do
+              error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
+              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_raise(error)
+              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
+              @m.should_not_receive(:handle_error)
+              @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
+              @m.should_receive(:process_govuk_page).with("page_1", {})
+              @m.crawl
+            end
+            it "should only retry once" do
+              error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
+              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").twice.and_raise(error)
+              @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
+              @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {}).once
+              @m.crawl
+            end
+          end
+        end
+      end
+    end
+  end
+  describe "process_govuk_page" do
+    # The crawler's site_root and the page double's uri share the same host;
+    # save_to_disk and extract_and_handle_links are stubbed out.
+    before :each do
+      @m = GovukMirrorer::Crawler.new({:site_root => "https://site-under-test"})
+      @m.stub(:save_to_disk)
+      @m.stub(:extract_and_handle_links)
+      @page = double("Page", uri: URI.parse("https://site-under-test/something"))
+    end
+    it "should save the page to disk" do
+      @m.should_receive(:save_to_disk).with(@page)
+      @m.process_govuk_page(@page)
+    end
+    it "should extract any links in the page" do
+      @m.should_receive(:extract_and_handle_links).with(@page)
+      @m.process_govuk_page(@page)
+    end
+    it "should do nothing if the page is a non gov.uk page" do
+      # Here the page's uri is on a different host from the configured site_root.
+      @page.stub(:uri).and_return(URI.parse("https://somewhere.else.com/foo"))
+      @m.should_not_receive(:save_to_disk)
+      @m.should_not_receive(:extract_and_handle_links)
+      @m.process_govuk_page(@page)
+    end
+  end
+  describe "extract_and_handle_links" do
+    before :each do
+      @m = GovukMirrorer::Crawler.new
+      @m.stub(:process_link)
+    end
+    it "should extract all <a>, <link> and <script> links from an html page" do
+      # The stubbed page also contains an <img src>, which is expected below
+      # alongside the <a>, <link> and <script> URLs; the inline <script>
+      # without a src yields no link.
+      WebMock.stub_request(:get, "http://www.example.com/foo").
+        to_return(
+          :headers => {"Content-Type" => "text/html; charset=utf-8"},
+          :body => <<-EOT
+<!DOCTYPE html>
+<html lang="en" class="">
+<head>
+<link href="https://example.com/static/application.css" media="screen" rel="stylesheet" type="text/css">
+<script defer src="https://example.com/static/application.js" type="text/javascript"></script>
+<link rel="shortcut icon" href="https://example.com/static/favicon.ico" type="image/x-icon">
+<script id="ga-params" type="text/javascript">
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-26179049-1']);
+  _gaq.push(['_setAllowLinker', true]);
+</script>
+</head>
+<body class="mainstream">
+  <a href="/" title="Go to the gov.uk homepage" id="logo">
+    <img src="https://example.com/static/gov.uk_logo.png" alt="GOV.UK Logo">
+  </a>
+<p>HM Revenue &amp; Customs lists the <a href="http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm">rates of VAT</a> on different goods and services.</p>
+  </body>
+</html>
+          EOT
+        )
+      page = Mechanize.new.get("http://www.example.com/foo")
+      @m.should_receive(:process_link).with(page, "https://example.com/static/application.css")
+      @m.should_receive(:process_link).with(page, "https://example.com/static/application.js")
+      @m.should_receive(:process_link).with(page, "https://example.com/static/favicon.ico")
+      @m.should_receive(:process_link).with(page, "/")
+      @m.should_receive(:process_link).with(page, "https://example.com/static/gov.uk_logo.png")
+      @m.should_receive(:process_link).with(page, "http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm")
+      # NOTE(review): this catch-all relies on the mocking library matching
+      # the more specific expectations above first - confirm for the rspec
+      # version in use.
+      @m.should_receive(:process_link).never # None except for the ones above
+      @m.extract_and_handle_links(page)
+    end
+    it "should not attempt to extract links from non-html pages" do
+      WebMock.stub_request(:get, "http://www.example.com/foo.xml").
+        to_return(
+          :headers => {"Content-Type" => "application/xml; charset=utf-8"},
+          :body => %(<?xml version="1.0" encoding="UTF-8"?>\n<foo></foo>))
+      page = Mechanize.new.get("http://www.example.com/foo.xml")
+      @m.should_not_receive(:process_link)
+      page.should_not_receive(:search)
+      @m.extract_and_handle_links(page)
+    end
+  end
+  describe "rules for deciding if a URL should be mirrored" do
+    # process_link is called with a referring page at
+    # https://www.gov.uk/foo/bar; handle is stubbed so only the arguments
+    # passed to it are checked.
+    before :each do
+      @m = GovukMirrorer::Crawler.new
+      @m.stub(:handle)
+      @page = double("Page", uri: URI.parse("https://www.gov.uk/foo/bar"))
+    end
+    it "should convert relative links to full links" do
+      @m.should_receive(:handle).with("https://www.gov.uk/baz", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "/baz")
+      @m.should_receive(:handle).with("https://www.gov.uk/foo/baz", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "baz")
+    end
+    it "should convert www.gov.uk http links to https" do
+      @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "http://www.gov.uk/something")
+    end
+    it "should pass through https www.gov.uk links" do
+      @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "https://www.gov.uk/something")
+    end
+    it "should reject any urls with query params" do
+      # NOTE(review): this only forbids handle being called with the exact
+      # query-string URL; a call with the query stripped would still pass.
+      # Confirm whether a bare should_not_receive(:handle) was intended.
+      @m.should_not_receive(:handle).with("https://www.gov.uk/something?foo=bar&baz=foo", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "https://www.gov.uk/something?foo=bar&baz=foo")
+    end
+    it "should remove any fragments (anchors) from the link" do
+      @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
+      @m.process_link(@page, "https://www.gov.uk/something#foo")
+    end
+    it "should ignore non www.gov.uk links" do
+      @m.should_not_receive(:handle)
+      @m.process_link(@page, "https://direct.gov.uk/something")
+      @m.process_link(@page, "http://transactionalservices.alphagov.co.uk/department/dfid?orderBy=nameOfService&direction=desc&format=csv")
+    end
+    it "should ignore mailto links" do
+      @m.should_not_receive(:handle)
+      @m.process_link(@page, "mailto:me@example.com")
+      @m.process_link(@page, "mailto:someone@www.gov.uk")
+    end
+  end
+end

data/spec/govuk_mirrorer/indexer_spec.rb ADDED

@@ -0,0 +1,191 @@
+require 'spec_helper'
+describe GovukMirrorer::Indexer do
+  let(:no_artefacts) { %({"_response_info":{"status":"ok"},"total":0,"results":[]}) }
+  let(:default_root) { "http://giraffe.example" }
+  let(:default_api_endpoint) { "http://giraffe.example/api/artefacts.json" }
+  before :each do
+  end
+  describe "construction and loading data" do
+    it "should add items to start_urls or blacklist according to format" do
+      WebMock.stub_request(:get, default_api_endpoint).
+        to_return(:body => {
+          "_response_info" => {"status" => "ok"},
+          "total" => 4,
+          "results" => [
+            {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
+            {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz"},
+            {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
+            {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
+          ]
+        }.to_json)
+      i = GovukMirrorer::Indexer.new(default_root)
+      i.all_start_urls.should include("http://www.test.gov.uk/foo")
+      i.all_start_urls.should include("http://www.test.gov.uk/vat")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
+      i.blacklist_paths.should include("/bar/baz")
+      i.blacklist_paths.should include("/somewhere")
+      i.blacklist_paths.should_not include("/foo")
+      i.blacklist_paths.should_not include("/vat")
+    end
+    it "should support pagination in the content api" do
+      WebMock.stub_request(:get, default_api_endpoint).
+        to_return(
+          :body => {
+            "_response_info" => {"status" => "ok"},
+            "total" => 4,
+            "results" => [
+              {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
+              {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz"},
+              {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
+              {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
+            ]
+          }.to_json,
+          :headers => {"Link" => "<#{default_api_endpoint}?page=2>; rel=\"next\""}
+        )
+      WebMock.stub_request(:get, "#{default_api_endpoint}?page=2").
+        to_return(
+          :body => {
+            "_response_info" => {"status" => "ok"},
+            "total" => 3,
+            "results" => [
+              {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo2"},
+              {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz2"},
+              {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat2"},
+            ]
+          }.to_json
+        )
+      i = GovukMirrorer::Indexer.new(default_root)
+      i.all_start_urls.should include("http://www.test.gov.uk/foo")
+      i.all_start_urls.should include("http://www.test.gov.uk/vat")
+      i.all_start_urls.should include("http://www.test.gov.uk/foo2")
+      i.all_start_urls.should include("http://www.test.gov.uk/vat2")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz2")
+      i.blacklist_paths.should include("/bar/baz")
+      i.blacklist_paths.should include("/somewhere")
+      i.blacklist_paths.should include("/bar/baz2")
+      i.blacklist_paths.should_not include("/foo")
+      i.blacklist_paths.should_not include("/vat")
+      i.blacklist_paths.should_not include("/foo2")
+      i.blacklist_paths.should_not include("/vat2")
+    end
+    it "should add hardcoded whitelist items to the start_urls, even if their format would be blacklisted" do
+      WebMock.stub_request(:get, default_api_endpoint).
+        to_return(:body => {
+          "_response_info" => {"status" => "ok"},
+          "total" => 2,
+          "results" => [
+            {"format" => "custom-application", "web_url" => "http://www.test.gov.uk/bank-holidays"},
+            {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
+          ]
+        }.to_json)
+      i = GovukMirrorer::Indexer.new(default_root)
+      i.all_start_urls.should include("http://www.test.gov.uk/bank-holidays")
+      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
+      i.blacklist_paths.should include("/somewhere")
+      i.blacklist_paths.should_not include("/bank-holidays")
+    end
+    it "should add the hardcoded items to the start_urls" do
+      WebMock.stub_request(:get, "https://www.gov.uk/api/artefacts.json").
+        to_return(:body => no_artefacts)
+      i = GovukMirrorer::Indexer.new("https://www.gov.uk")
+      i.all_start_urls.should include("https://www.gov.uk/")
+      i.all_start_urls.should include("https://www.gov.uk/designprinciples")
+      i.all_start_urls.should include("https://www.gov.uk/designprinciples/styleguide")
+      i.all_start_urls.should include("https://www.gov.uk/designprinciples/performanceframework")
+    end
+    it "should add the hardcoded items to the blacklist" do
+      WebMock.stub_request(:get, default_api_endpoint).
+        to_return(:body => no_artefacts)
+      i = GovukMirrorer::Indexer.new(default_root)
+      i.blacklist_paths.should include("/licence-finder")
+      i.blacklist_paths.should include("/trade-tariff")
+    end
+    describe "handling errors fetching artefacts" do
+      it "should sleep and retry fetching artefacts on HTTP error" do
+        WebMock.stub_request(:get, default_api_endpoint).
+          to_return(:status => [502, "Gateway Timeout"]).
+          to_return(:body => {
+            "_response_info" => {"status" => "ok"},
+            "total" => 2,
+            "results" => [
+              {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
+              {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
+            ]
+          }.to_json)
+        GovukMirrorer::Indexer.any_instance.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
+        i = GovukMirrorer::Indexer.new(default_root)
+        i.all_start_urls.should include("http://www.test.gov.uk/foo")
+        i.all_start_urls.should include("http://www.test.gov.uk/vat")
+      end
+      it "should only retry once" do
+        WebMock.stub_request(:get, default_api_endpoint).
+          to_return(:status => [502, "Gateway Timeout"]).
+          to_return(:status => [502, "Gateway Timeout"])
+        GovukMirrorer::Indexer.any_instance.stub(:sleep) # Make tests fast
+        lambda do
+          GovukMirrorer::Indexer.new(default_root)
+        end.should raise_error(GdsApi::HTTPErrorResponse)
+      end
+    end
+  end
+  describe "blacklisted_url?" do
+    before :each do
+      WebMock.stub_request(:get, "http://www.foo.com/api/artefacts.json").
+        to_return(:body => no_artefacts)
+      @indexer = GovukMirrorer::Indexer.new("http://www.foo.com")
+      @indexer.instance_variable_set('@blacklist_paths', %w(
+        /foo/bar
+        /something
+        /something-else
+      ))
+    end
+    it "should return true if the url has a matching path" do
+      @indexer.blacklisted_url?("http://www.foo.com/foo/bar").should == true
+    end
+    it "should return trus if the url has a matching prefix" do
+      @indexer.blacklisted_url?("http://www.foo.com/something/somewhere").should == true
+    end
+    it "should return false if none match" do
+      @indexer.blacklisted_url?("http://www.foo.com/bar").should == false
+    end
+    it "should return false if only a partial segment matches" do
+      @indexer.blacklisted_url?("http://www.foo.com/something-other").should == false
+      @indexer.blacklisted_url?("http://www.foo.com/foo/baz").should == false
+      @indexer.blacklisted_url?("http://www.foo.com/foo-foo/bar").should == false
+    end
+    it "should cope with edge-cases passed in" do
+      @indexer.blacklisted_url?("mailto:goo@example.com").should == false
+      @indexer.blacklisted_url?("http://www.example.com").should == false
+      @indexer.blacklisted_url?("ftp://foo:bar@ftp.example.com").should == false
+    end
+  end
+end