govuk_mirrorer 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ require 'net/http'
2
+
3
+ # Copied from ruby stdlib, with a single line addition to support SNI
4
+ # This can be removed once we've upgraded to ruby 1.9.3
5
+ # 1.9.2_p290 version: https://github.com/ruby/ruby/blob/v1_9_2_290/lib/net/http.rb#L642
6
+ # 1.9.3 version: https://github.com/ruby/ruby/blob/ruby_1_9_3/lib/net/http.rb#L760
7
+
8
# Backport applied only when running on Ruby 1.9.2: reopens Net::HTTP and
# redefines #connect so the TLS handshake carries the target host name (SNI).
# According to the comments in this file, everything except the
# `s.hostname =` assignment is copied from the stdlib implementation.
+ if "1.9.2" == RUBY_VERSION
9
+
10
+ module Net
11
+
12
+ HTTP.class_eval do
# Opens the TCP socket, wraps it in TLS when use_ssl? is true, tunnels through
# the proxy with CONNECT when proxy? is true, and finally calls on_connect.
# An error raised during the proxy/TLS phase closes the socket and is re-raised.
# NOTE(review): stdlib marks connect as private; a plain `def` inside
# class_eval presumably leaves this redefinition public - confirm whether a
# `private :connect` is wanted here.
13
+ def connect
14
+ D "opening connection to #{conn_address()}..."
# Opening the TCP connection is bounded by @open_timeout.
15
+ s = timeout(@open_timeout) { TCPSocket.open(conn_address(), conn_port()) }
16
+ D "opened"
17
+
18
+ if use_ssl?
# Gather every SSL attribute whose instance variable exists and holds a
# truthy value, and hand them to the SSL context as parameters.
19
+ ssl_parameters = Hash.new
20
+ iv_list = instance_variables
21
+ SSL_ATTRIBUTES.each do |name|
22
+ ivname = "@#{name}".intern
23
+ if iv_list.include?(ivname) and
24
+ value = instance_variable_get(ivname)
25
+ ssl_parameters[name] = value
26
+ end
27
+ end
28
+ @ssl_context = OpenSSL::SSL::SSLContext.new
29
+ @ssl_context.set_params(ssl_parameters)
# From here on `s` is the SSL socket wrapping the raw TCP socket.
30
+ s = OpenSSL::SSL::SSLSocket.new(s, @ssl_context)
31
+ s.sync_close = true
32
+ end
33
+
# Wrap the socket in BufferedIO and carry over the read timeout and debug output.
34
+ @socket = BufferedIO.new(s)
35
+ @socket.read_timeout = @read_timeout
36
+ @socket.debug_output = @debug_output
37
+ if use_ssl?
38
+ begin
# When going through a proxy, request a CONNECT tunnel to the target
# before starting the TLS handshake.
39
+ if proxy?
40
+ @socket.writeline sprintf('CONNECT %s:%s HTTP/%s',
41
+ @address, @port, HTTPVersion)
42
+ @socket.writeline "Host: #{@address}:#{@port}"
43
+ if proxy_user
# Base64-encode "user:pass" and strip any CR/LF from the encoded value.
44
+ credential = ["#{proxy_user}:#{proxy_pass}"].pack('m')
45
+ credential.delete!("\r\n")
46
+ @socket.writeline "Proxy-Authorization: Basic #{credential}"
47
+ end
48
+ @socket.writeline ''
# Read the proxy's reply; `.value` is expected to raise on a non-success
# response (stdlib behaviour - confirm).
49
+ HTTPResponse.read_new(@socket).value
50
+ end
51
+
52
+ # This is the only line that's different from the ruby method
53
+ # Server Name Indication (SNI) RFC 3546
54
+ s.hostname = @address if s.respond_to? :hostname=
55
+
# TLS handshake, also bounded by @open_timeout.
56
+ timeout(@open_timeout) { s.connect }
# Check the peer certificate against the host unless verification is disabled.
57
+ if @ssl_context.verify_mode != OpenSSL::SSL::VERIFY_NONE
58
+ s.post_connection_check(@address)
59
+ end
# On any StandardError: log it, close the socket if still open, re-raise.
60
+ rescue => exception
61
+ D "Conn close because of connect error #{exception}"
62
+ @socket.close if @socket and not @socket.closed?
63
+ raise exception
64
+ end
65
+ end
66
+ on_connect
67
+ end
68
+ end
69
+
70
+ end
71
+ end
@@ -0,0 +1,9 @@
1
+ require "statsd"
2
+
3
module GovukMirrorer
  # Builds a Statsd client from the environment.
  #
  # STATSD_HOST and STATSD_PORT override the defaults ("localhost" and 8125).
  # A variable that is unset or empty falls back to its default.
  #
  # Returns a new Statsd instance on every call.
  # Raises ArgumentError if STATSD_PORT is set but is not a decimal integer.
  def self.statsd
    # The environment value must be consulted first: a non-nil literal on the
    # left of `||` would always win and the override would never apply.
    host = ENV["STATSD_HOST"]
    host = "localhost" if host.nil? || host.empty?

    # ENV values are always strings, so convert the port explicitly.
    port = ENV["STATSD_PORT"]
    port = (port.nil? || port.empty?) ? 8125 : Integer(port, 10)

    Statsd.new(host, port)
  end
end
@@ -0,0 +1,3 @@
1
module GovukMirrorer
  # Version string of the gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "1.3.2".freeze
end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukMirrorer::Configurer do
4
+
5
+ describe "Setting site_root" do
6
+ it "should fail if site_root is not set" do
7
+ lambda do
8
+ GovukMirrorer::Configurer.run([])
9
+ end.should raise_error(GovukMirrorer::Configurer::NoRootUrlSpecifiedError)
10
+
11
+ ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("")
12
+ lambda do
13
+ GovukMirrorer::Configurer.run([])
14
+ end.should raise_error(GovukMirrorer::Configurer::NoRootUrlSpecifiedError)
15
+ end
16
+
17
+ it "should take a site-root option on the commandline" do
18
+ GovukMirrorer::Configurer.run(%w[--site-root sausage]).should include(:site_root => "sausage" )
19
+ end
20
+
21
+ it "should read the site root from an ENV variable" do
22
+ ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
23
+ GovukMirrorer::Configurer.run([]).should include(:site_root => "sausage" )
24
+ end
25
+
26
+ it "should take the commandline option in preference to the ENV variable if both are specified" do
27
+ ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
28
+ GovukMirrorer::Configurer.run(%w[--site-root mash]).should include(:site_root => "mash" )
29
+ end
30
+ end
31
+
32
+ describe "setting the request interval" do
33
+ before :each do
34
+ ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
35
+ end
36
+
37
+ it "should allow setting the request interval" do
38
+ GovukMirrorer::Configurer.run(%w[--request-interval 0.6]).should include(:request_interval => 0.6)
39
+ end
40
+
41
+ it "should default to 0.1" do
42
+ GovukMirrorer::Configurer.run([]).should include(:request_interval => 0.1)
43
+ end
44
+ end
45
+
46
+
47
+ describe "setting up logging" do
48
+ before :each do
49
+ ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return("sausage")
50
+ end
51
+
52
+ it "should allow specifying a logfile" do
53
+ GovukMirrorer::Configurer.run(%w[--logfile /foo/bar]).should include(:log_file => "/foo/bar")
54
+ end
55
+
56
+ it "should allow logging to syslog with default facility of local3" do
57
+ GovukMirrorer::Configurer.run(%w[--syslog]).should include(:syslog => "local3")
58
+ end
59
+
60
+ it "should allow logging to syslog overriding the default facility" do
61
+ GovukMirrorer::Configurer.run(%w[--syslog local5]).should include(:syslog => "local5")
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,286 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukMirrorer::Crawler do
4
+ before :each do
5
+ GovukMirrorer::Indexer.any_instance.stub(:process_artefacts)
6
+ GovukMirrorer::Crawler.any_instance.stub(:logger).and_return(Logger.new("/dev/null"))
7
+ end
8
+
9
+ it 'should have a version number' do
10
+ GovukMirrorer::VERSION.should_not be_nil
11
+ end
12
+
13
+ describe "initializing" do
14
+
15
+ it "should handle all urls returned from the indexer" do
16
+ GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
17
+ https://www.example.com/
18
+ https://www.example.com/designprinciples
19
+ https://www.example.com/designprinciples/styleguide
20
+ https://www.example.com/designprinciples/performanceframework
21
+ ))
22
+ m = GovukMirrorer::Crawler.new
23
+ m.urls.should == %w(
24
+ https://www.example.com/
25
+ https://www.example.com/designprinciples
26
+ https://www.example.com/designprinciples/styleguide
27
+ https://www.example.com/designprinciples/performanceframework
28
+ )
29
+ end
30
+
31
+ describe "setting up the logger" do
32
+ before :each do
33
+ GovukMirrorer::Crawler.any_instance.unstub(:logger)
34
+ end
35
+
36
+ it "should log to stdout by default" do
37
+ m = GovukMirrorer::Crawler.new
38
+ logdev = m.logger.instance_variable_get('@logdev')
39
+ logdev.dev.should == STDOUT
40
+ end
41
+
42
+ it "should log to a file if requested" do
43
+ m = GovukMirrorer::Crawler.new(:log_file => "/dev/null")
44
+ logdev = m.logger.instance_variable_get('@logdev')
45
+ logdev.filename.should == "/dev/null"
46
+ end
47
+
48
+ it "should log to syslog if requested" do
49
+ m = GovukMirrorer::Crawler.new(:syslog => "local4")
50
+ m.logger.should be_a(Syslogger)
51
+ m.logger.facility.should == Syslog::LOG_LOCAL4
52
+ m.logger.options.should == (Syslog::LOG_PID | Syslog::LOG_CONS)
53
+ m.logger.ident.should == 'govuk_mirrorer'
54
+ end
55
+
56
+ it "should default to log level INFO" do
57
+ m = GovukMirrorer::Crawler.new
58
+ m.logger.level.should == Logger::INFO
59
+ end
60
+
61
+ it "should allow overriding the log level" do
62
+ m = GovukMirrorer::Crawler.new(:log_level => 'warn')
63
+ m.logger.level.should == Logger::WARN
64
+ end
65
+ end
66
+ end
67
+
68
+ describe "crawl" do
69
+ before :each do
70
+ GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
71
+ https://www.example.com/1
72
+ https://www.example.com/2
73
+ ))
74
+
75
+ @m = GovukMirrorer::Crawler.new(:request_interval => 0.01)
76
+ @m.stub(:process_govuk_page)
77
+ @m.send(:agent).stub(:get).and_return("default")
78
+ @m.stub(:sleep)
79
+ end
80
+
81
+ it "should fetch each page and pass it to the handler" do
82
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
83
+ @m.should_receive(:process_govuk_page).with("page_1", {}).ordered
84
+
85
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/2").ordered.and_return("page_2")
86
+ @m.should_receive(:process_govuk_page).with("page_2", {}).ordered
87
+
88
+ @m.crawl
89
+ end
90
+
91
+ it "should sleep for the configured request_interval between requests" do
92
+ @m.should_receive(:process_govuk_page).ordered
93
+ @m.should_receive(:sleep).with(0.01).ordered # Actually on kernel, but setting the expectation here works
94
+ @m.should_receive(:process_govuk_page).ordered
95
+ @m.should_receive(:sleep).with(0.01).ordered
96
+
97
+ @m.crawl
98
+ end
99
+
100
+ describe "handling errors" do
101
+ it "should call handle_error with the relevant details" do
102
+ error = StandardError.new("Boom")
103
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/1").and_raise(error)
104
+ @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {})
105
+
106
+ @m.crawl
107
+ end
108
+
109
+ it "should continue with the next URL" do
110
+ @m.send(:agent).stub(:get).with("https://www.example.com/1").and_raise("Boom")
111
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/2").and_return("something")
112
+
113
+ @m.crawl
114
+ end
115
+
116
+ context "error handling" do
117
+ [
118
+ [429, "Too Many Requests"],
119
+ [500, "Internal Server Error"],
120
+ [503, "Boom"],
121
+ ].each do |resp_code, resp_reason|
122
+ context "#{resp_code} #{resp_reason}" do
123
+ it "should sleep for a second, and then retry" do
124
+ error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
125
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_raise(error)
126
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
127
+
128
+ @m.should_not_receive(:handle_error)
129
+ @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
130
+ @m.should_receive(:process_govuk_page).with("page_1", {})
131
+
132
+ @m.crawl
133
+ end
134
+
135
+ it "should only retry once" do
136
+ error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
137
+ @m.send(:agent).should_receive(:get).with("https://www.example.com/1").twice.and_raise(error)
138
+
139
+ @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
140
+ @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {}).once
141
+
142
+ @m.crawl
143
+ end
144
+ end
145
+ end
146
+ end
147
+ end
148
+ end
149
+
150
+ describe "process_govuk_page" do
151
+ before :each do
152
+ @m = GovukMirrorer::Crawler.new({:site_root => "https://site-under-test"})
153
+ @m.stub(:save_to_disk)
154
+ @m.stub(:extract_and_handle_links)
155
+ @page = double("Page", uri: URI.parse("https://site-under-test/something"))
156
+ end
157
+
158
+ it "should save the page to disk" do
159
+ @m.should_receive(:save_to_disk).with(@page)
160
+ @m.process_govuk_page(@page)
161
+ end
162
+
163
+ it "should extract any links in the page" do
164
+ @m.should_receive(:extract_and_handle_links).with(@page)
165
+ @m.process_govuk_page(@page)
166
+ end
167
+
168
+ it "should do nothing if the page is a non gov.uk page" do
169
+ @page.stub(:uri).and_return(URI.parse("https://somewhere.else.com/foo"))
170
+ @m.should_not_receive(:save_to_disk)
171
+ @m.should_not_receive(:extract_and_handle_links)
172
+
173
+ @m.process_govuk_page(@page)
174
+ end
175
+ end
176
+
177
+ describe "extract_and_handle_links" do
178
+ before :each do
179
+ @m = GovukMirrorer::Crawler.new
180
+ @m.stub(:process_link)
181
+ end
182
+
183
+ it "should extract all <a>, <link> and <script> links from an html page" do
184
+ WebMock.stub_request(:get, "http://www.example.com/foo").
185
+ to_return(
186
+ :headers => {"Content-Type" => "text/html; charset=utf-8"},
187
+ :body => <<-EOT
188
+ <!DOCTYPE html>
189
+ <html lang="en" class="">
190
+ <head>
191
+ <link href="https://example.com/static/application.css" media="screen" rel="stylesheet" type="text/css">
192
+ <script defer src="https://example.com/static/application.js" type="text/javascript"></script>
193
+ <link rel="shortcut icon" href="https://example.com/static/favicon.ico" type="image/x-icon">
194
+ <script id="ga-params" type="text/javascript">
195
+ var _gaq = _gaq || [];
196
+ _gaq.push(['_setAccount', 'UA-26179049-1']);
197
+ _gaq.push(['_setAllowLinker', true]);
198
+ </script>
199
+ </head>
200
+ <body class="mainstream">
201
+ <a href="/" title="Go to the gov.uk homepage" id="logo">
202
+ <img src="https://example.com/static/gov.uk_logo.png" alt="GOV.UK Logo">
203
+ </a>
204
+ <p>HM Revenue &amp; Customs lists the <a href="http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm">rates of VAT</a> on different goods and services.</p>
205
+ </body>
206
+ </html>
207
+ EOT
208
+ )
209
+ page = Mechanize.new.get("http://www.example.com/foo")
210
+
211
+ @m.should_receive(:process_link).with(page, "https://example.com/static/application.css")
212
+ @m.should_receive(:process_link).with(page, "https://example.com/static/application.js")
213
+ @m.should_receive(:process_link).with(page, "https://example.com/static/favicon.ico")
214
+ @m.should_receive(:process_link).with(page, "/")
215
+ @m.should_receive(:process_link).with(page, "https://example.com/static/gov.uk_logo.png")
216
+ @m.should_receive(:process_link).with(page, "http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm")
217
+ @m.should_receive(:process_link).never # None except for the ones above
218
+
219
+ @m.extract_and_handle_links(page)
220
+ end
221
+
222
+ it "should not attempt to extract links from non-html pages" do
223
+ WebMock.stub_request(:get, "http://www.example.com/foo.xml").
224
+ to_return(
225
+ :headers => {"Content-Type" => "application/xml; charset=utf-8"},
226
+ :body => %(<?xml version="1.0" encoding="UTF-8"?>\n<foo></foo>))
227
+ page = Mechanize.new.get("http://www.example.com/foo.xml")
228
+
229
+ @m.should_not_receive(:process_link)
230
+ page.should_not_receive(:search)
231
+
232
+ @m.extract_and_handle_links(page)
233
+ end
234
+ end
235
+
236
+ describe "rules for deciding if a URL should be mirrored" do
237
+ before :each do
238
+ @m = GovukMirrorer::Crawler.new
239
+ @m.stub(:handle)
240
+
241
+ @page = double("Page", uri: URI.parse("https://www.gov.uk/foo/bar"))
242
+ end
243
+
244
+ it "should convert relative links to full links" do
245
+ @m.should_receive(:handle).with("https://www.gov.uk/baz", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
246
+ @m.process_link(@page, "/baz")
247
+
248
+ @m.should_receive(:handle).with("https://www.gov.uk/foo/baz", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
249
+ @m.process_link(@page, "baz")
250
+ end
251
+
252
+ it "should convert www.gov.uk http links to https" do
253
+ @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
254
+ @m.process_link(@page, "http://www.gov.uk/something")
255
+ end
256
+
257
+ it "should pass through https www.gov.uk links" do
258
+ @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
259
+ @m.process_link(@page, "https://www.gov.uk/something")
260
+ end
261
+
262
+ it "should reject any urls with query params" do
263
+ @m.should_not_receive(:handle).with("https://www.gov.uk/something?foo=bar&baz=foo", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
264
+ @m.process_link(@page, "https://www.gov.uk/something?foo=bar&baz=foo")
265
+ end
266
+
267
+ it "should remove any fragments (anchors) from the link" do
268
+ @m.should_receive(:handle).with("https://www.gov.uk/something", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
269
+ @m.process_link(@page, "https://www.gov.uk/something#foo")
270
+ end
271
+
272
+ it "should ignore non www.gov.uk links" do
273
+ @m.should_not_receive(:handle)
274
+
275
+ @m.process_link(@page, "https://direct.gov.uk/something")
276
+ @m.process_link(@page, "http://transactionalservices.alphagov.co.uk/department/dfid?orderBy=nameOfService&direction=desc&format=csv")
277
+ end
278
+
279
+ it "should ignore mailto links" do
280
+ @m.should_not_receive(:handle)
281
+
282
+ @m.process_link(@page, "mailto:me@example.com")
283
+ @m.process_link(@page, "mailto:someone@www.gov.uk")
284
+ end
285
+ end
286
+ end
@@ -0,0 +1,191 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukMirrorer::Indexer do
4
+ let(:no_artefacts) { %({"_response_info":{"status":"ok"},"total":0,"results":[]}) }
5
+ let(:default_root) { "http://giraffe.example" }
6
+ let(:default_api_endpoint) { "http://giraffe.example/api/artefacts.json" }
7
+
8
+ before :each do
9
+ end
10
+
11
+ describe "construction and loading data" do
12
+ it "should add items to start_urls or blacklist according to format" do
13
+ WebMock.stub_request(:get, default_api_endpoint).
14
+ to_return(:body => {
15
+ "_response_info" => {"status" => "ok"},
16
+ "total" => 4,
17
+ "results" => [
18
+ {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
19
+ {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz"},
20
+ {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
21
+ {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
22
+ ]
23
+ }.to_json)
24
+ i = GovukMirrorer::Indexer.new(default_root)
25
+ i.all_start_urls.should include("http://www.test.gov.uk/foo")
26
+ i.all_start_urls.should include("http://www.test.gov.uk/vat")
27
+ i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
28
+ i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
29
+
30
+ i.blacklist_paths.should include("/bar/baz")
31
+ i.blacklist_paths.should include("/somewhere")
32
+ i.blacklist_paths.should_not include("/foo")
33
+ i.blacklist_paths.should_not include("/vat")
34
+ end
35
+
36
+ it "should support pagination in the content api" do
37
+ WebMock.stub_request(:get, default_api_endpoint).
38
+ to_return(
39
+ :body => {
40
+ "_response_info" => {"status" => "ok"},
41
+ "total" => 4,
42
+ "results" => [
43
+ {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
44
+ {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz"},
45
+ {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
46
+ {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
47
+ ]
48
+ }.to_json,
49
+ :headers => {"Link" => "<#{default_api_endpoint}?page=2>; rel=\"next\""}
50
+ )
51
+ WebMock.stub_request(:get, "#{default_api_endpoint}?page=2").
52
+ to_return(
53
+ :body => {
54
+ "_response_info" => {"status" => "ok"},
55
+ "total" => 3,
56
+ "results" => [
57
+ {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo2"},
58
+ {"format" => "local_transaction", "web_url" => "http://www.test.gov.uk/bar/baz2"},
59
+ {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat2"},
60
+ ]
61
+ }.to_json
62
+ )
63
+
64
+ i = GovukMirrorer::Indexer.new(default_root)
65
+ i.all_start_urls.should include("http://www.test.gov.uk/foo")
66
+ i.all_start_urls.should include("http://www.test.gov.uk/vat")
67
+ i.all_start_urls.should include("http://www.test.gov.uk/foo2")
68
+ i.all_start_urls.should include("http://www.test.gov.uk/vat2")
69
+ i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
70
+ i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
71
+ i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz2")
72
+
73
+ i.blacklist_paths.should include("/bar/baz")
74
+ i.blacklist_paths.should include("/somewhere")
75
+ i.blacklist_paths.should include("/bar/baz2")
76
+ i.blacklist_paths.should_not include("/foo")
77
+ i.blacklist_paths.should_not include("/vat")
78
+ i.blacklist_paths.should_not include("/foo2")
79
+ i.blacklist_paths.should_not include("/vat2")
80
+ end
81
+
82
+ it "should add hardcoded whitelist items to the start_urls, even if their format would be blacklisted" do
83
+ WebMock.stub_request(:get, default_api_endpoint).
84
+ to_return(:body => {
85
+ "_response_info" => {"status" => "ok"},
86
+ "total" => 2,
87
+ "results" => [
88
+ {"format" => "custom-application", "web_url" => "http://www.test.gov.uk/bank-holidays"},
89
+ {"format" => "place", "web_url" => "http://www.test.gov.uk/somewhere"},
90
+ ]
91
+ }.to_json)
92
+ i = GovukMirrorer::Indexer.new(default_root)
93
+ i.all_start_urls.should include("http://www.test.gov.uk/bank-holidays")
94
+ i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
95
+
96
+ i.blacklist_paths.should include("/somewhere")
97
+ i.blacklist_paths.should_not include("/bank-holidays")
98
+ end
99
+
100
+ it "should add the hardcoded items to the start_urls" do
101
+ WebMock.stub_request(:get, "https://www.gov.uk/api/artefacts.json").
102
+ to_return(:body => no_artefacts)
103
+ i = GovukMirrorer::Indexer.new("https://www.gov.uk")
104
+
105
+ i.all_start_urls.should include("https://www.gov.uk/")
106
+ i.all_start_urls.should include("https://www.gov.uk/designprinciples")
107
+ i.all_start_urls.should include("https://www.gov.uk/designprinciples/styleguide")
108
+ i.all_start_urls.should include("https://www.gov.uk/designprinciples/performanceframework")
109
+ end
110
+
111
+ it "should add the hardcoded items to the blacklist" do
112
+ WebMock.stub_request(:get, default_api_endpoint).
113
+ to_return(:body => no_artefacts)
114
+ i = GovukMirrorer::Indexer.new(default_root)
115
+
116
+ i.blacklist_paths.should include("/licence-finder")
117
+ i.blacklist_paths.should include("/trade-tariff")
118
+ end
119
+
120
+ describe "handling errors fetching artefacts" do
121
+ it "should sleep and retry fetching artefacts on HTTP error" do
122
+ WebMock.stub_request(:get, default_api_endpoint).
123
+ to_return(:status => [502, "Gateway Timeout"]).
124
+ to_return(:body => {
125
+ "_response_info" => {"status" => "ok"},
126
+ "total" => 2,
127
+ "results" => [
128
+ {"format" => "answer", "web_url" => "http://www.test.gov.uk/foo"},
129
+ {"format" => "guide", "web_url" => "http://www.test.gov.uk/vat"},
130
+ ]
131
+ }.to_json)
132
+ GovukMirrorer::Indexer.any_instance.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
133
+
134
+ i = GovukMirrorer::Indexer.new(default_root)
135
+
136
+ i.all_start_urls.should include("http://www.test.gov.uk/foo")
137
+ i.all_start_urls.should include("http://www.test.gov.uk/vat")
138
+ end
139
+
140
+ it "should only retry once" do
141
+ WebMock.stub_request(:get, default_api_endpoint).
142
+ to_return(:status => [502, "Gateway Timeout"]).
143
+ to_return(:status => [502, "Gateway Timeout"])
144
+
145
+ GovukMirrorer::Indexer.any_instance.stub(:sleep) # Make tests fast
146
+ lambda do
147
+ GovukMirrorer::Indexer.new(default_root)
148
+ end.should raise_error(GdsApi::HTTPErrorResponse)
149
+ end
150
+ end
151
+ end
152
+
153
+ describe "blacklisted_url?" do
154
+ before :each do
155
+ WebMock.stub_request(:get, "http://www.foo.com/api/artefacts.json").
156
+ to_return(:body => no_artefacts)
157
+ @indexer = GovukMirrorer::Indexer.new("http://www.foo.com")
158
+
159
+ @indexer.instance_variable_set('@blacklist_paths', %w(
160
+ /foo/bar
161
+ /something
162
+ /something-else
163
+ ))
164
+ end
165
+
166
+ it "should return true if the url has a matching path" do
167
+ @indexer.blacklisted_url?("http://www.foo.com/foo/bar").should == true
168
+ end
169
+
170
+ it "should return trus if the url has a matching prefix" do
171
+ @indexer.blacklisted_url?("http://www.foo.com/something/somewhere").should == true
172
+ end
173
+
174
+ it "should return false if none match" do
175
+ @indexer.blacklisted_url?("http://www.foo.com/bar").should == false
176
+ end
177
+
178
+ it "should return false if only a partial segment matches" do
179
+ @indexer.blacklisted_url?("http://www.foo.com/something-other").should == false
180
+ @indexer.blacklisted_url?("http://www.foo.com/foo/baz").should == false
181
+ @indexer.blacklisted_url?("http://www.foo.com/foo-foo/bar").should == false
182
+ end
183
+
184
+ it "should cope with edge-cases passed in" do
185
+ @indexer.blacklisted_url?("mailto:goo@example.com").should == false
186
+ @indexer.blacklisted_url?("http://www.example.com").should == false
187
+ @indexer.blacklisted_url?("ftp://foo:bar@ftp.example.com").should == false
188
+ end
189
+ end
190
+ end
191
+