govuk_mirrorer 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,71 @@
1
+ require 'net/http'
2
+
3
+ # Copied from ruby stdlib, with a single line addition to support SNI
4
+ # This can be removed once we've upgraded to ruby 1.9.3
5
+ # 1.9.2_p290 version: https://github.com/ruby/ruby/blob/v1_9_2_290/lib/net/http.rb#L642
6
+ # 1.9.3 version: https://github.com/ruby/ruby/blob/ruby_1_9_3/lib/net/http.rb#L760
7
+
8
# Back-port guard: the patch below is applied only when the interpreter
# reports exactly version 1.9.2; on any other version this file is a no-op.
+ if "1.9.2" == RUBY_VERSION
9
+
10
+ module Net
11
+
12
# class_eval reopens Net::HTTP, so the method defined here replaces the
# existing #connect.
+ HTTP.class_eval do
13
# Opens the connection for this HTTP session.
#
# Steps, in order: open a TCP socket to conn_address/conn_port; when SSL is
# enabled, wrap it in an SSLSocket configured from the SSL_ATTRIBUTES
# instance variables; wrap the result in a BufferedIO stored in @socket;
# for SSL, optionally tunnel through the proxy with CONNECT, set the SNI
# hostname, perform the handshake and verify the peer; finally call
# on_connect. Any StandardError raised during the SSL phase closes the
# socket and is re-raised.
+ def connect
14
+ D "opening connection to #{conn_address()}..."
15
# The TCP connect is bounded by @open_timeout.
+ s = timeout(@open_timeout) { TCPSocket.open(conn_address(), conn_port()) }
16
+ D "opened"
17
+
18
+ if use_ssl?
19
# Collect every SSL attribute whose instance variable is defined and has
# a truthy value; these become the SSLContext parameters.
+ ssl_parameters = Hash.new
20
+ iv_list = instance_variables
21
+ SSL_ATTRIBUTES.each do |name|
22
+ ivname = "@#{name}".intern
23
# Note: the condition spans two lines; the second half is an assignment
# whose value (nil/false vs. anything else) decides the branch.
+ if iv_list.include?(ivname) and
24
+ value = instance_variable_get(ivname)
25
+ ssl_parameters[name] = value
26
+ end
27
+ end
28
+ @ssl_context = OpenSSL::SSL::SSLContext.new
29
+ @ssl_context.set_params(ssl_parameters)
30
# From here on `s` is the SSL socket wrapping the TCP socket; the TLS
# handshake itself happens later, in s.connect.
+ s = OpenSSL::SSL::SSLSocket.new(s, @ssl_context)
31
# NOTE(review): sync_close presumably makes closing the SSL socket also
# close the underlying TCP socket - confirm against the OpenSSL docs.
+ s.sync_close = true
32
+ end
33
+
34
+ @socket = BufferedIO.new(s)
35
+ @socket.read_timeout = @read_timeout
36
+ @socket.debug_output = @debug_output
37
+ if use_ssl?
38
+ begin
39
+ if proxy?
40
# Ask the proxy to open a tunnel to the target before starting TLS.
+ @socket.writeline sprintf('CONNECT %s:%s HTTP/%s',
41
+ @address, @port, HTTPVersion)
42
+ @socket.writeline "Host: #{@address}:#{@port}"
43
+ if proxy_user
44
# pack('m') Base64-encodes "user:pass"; the CR/LF characters it appends
# are stripped so the value fits on one header line.
+ credential = ["#{proxy_user}:#{proxy_pass}"].pack('m')
45
+ credential.delete!("\r\n")
46
+ @socket.writeline "Proxy-Authorization: Basic #{credential}"
47
+ end
48
# Blank line terminates the CONNECT request headers.
+ @socket.writeline ''
49
# NOTE(review): #value presumably raises when the proxy's reply is not a
# success response - confirm against Net::HTTPResponse#value.
+ HTTPResponse.read_new(@socket).value
50
+ end
51
+
52
+ # This is the only line that's different from the ruby method
53
+ # Server Name Indication (SNI) RFC 3546
54
# Guarded with respond_to? so sockets without a hostname= writer are
# left untouched.
+ s.hostname = @address if s.respond_to? :hostname=
55
+
56
# TLS handshake, bounded by @open_timeout like the TCP connect above.
+ timeout(@open_timeout) { s.connect }
57
# Skip the certificate/hostname check only when verification is disabled.
+ if @ssl_context.verify_mode != OpenSSL::SSL::VERIFY_NONE
58
+ s.post_connection_check(@address)
59
+ end
60
+ rescue => exception
61
+ D "Conn close because of connect error #{exception}"
62
# Do not leave a half-open socket behind; then propagate the same error.
+ @socket.close if @socket and not @socket.closed?
63
+ raise exception
64
+ end
65
+ end
66
+ on_connect
67
+ end
68
+ end
69
+
70
+ end
71
+ end
@@ -0,0 +1,9 @@
1
+ require "statsd"
2
+
3
module GovukMirrorer
  # Builds a Statsd client for the mirrorer.
  #
  # The target is read from the STATSD_HOST and STATSD_PORT environment
  # variables. A variable that is unset or empty falls back to the default
  # ("localhost" and 8125 respectively).
  #
  # @return [Statsd] a new client on every call
  # @raise [ArgumentError] if STATSD_PORT is set but is not a decimal integer
  def self.statsd
    # The environment must be consulted first: with the literal on the left
    # of `||` the default always won and the overrides were never read.
    host = ENV["STATSD_HOST"].to_s
    host = "localhost" if host.empty?

    port = ENV["STATSD_PORT"].to_s
    # ENV values are Strings; convert strictly so a typo fails loudly
    # instead of silently becoming port 0.
    port = port.empty? ? 8125 : Integer(port, 10)

    Statsd.new(host, port)
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the govuk_mirrorer gem.
module GovukMirrorer
  # Version string of this release of the gem.
  VERSION = '1.3.2'
end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
describe GovukMirrorer::Configurer do
  # Runs the configurer with the given command-line arguments and returns
  # whatever Configurer.run returns.
  def run_configurer(argv = [])
    GovukMirrorer::Configurer.run(argv)
  end

  # Makes ENV['MIRRORER_SITE_ROOT'] answer with the given value.
  def stub_site_root_env(value)
    ENV.stub(:[]).with('MIRRORER_SITE_ROOT').and_return(value)
  end

  describe "Setting site_root" do
    it "should fail if site_root is not set" do
      missing_root_error = GovukMirrorer::Configurer::NoRootUrlSpecifiedError

      # First with the environment as it is, then with an explicitly empty value.
      lambda { run_configurer }.should raise_error(missing_root_error)

      stub_site_root_env("")
      lambda { run_configurer }.should raise_error(missing_root_error)
    end

    it "should take a site-root option on the commandline" do
      options = run_configurer(["--site-root", "sausage"])
      options.should include(:site_root => "sausage")
    end

    it "should read the site root from an ENV variable" do
      stub_site_root_env("sausage")
      options = run_configurer
      options.should include(:site_root => "sausage")
    end

    it "should take the commandline option in preference to the ENV variable if both are specified" do
      stub_site_root_env("sausage")
      options = run_configurer(["--site-root", "mash"])
      options.should include(:site_root => "mash")
    end
  end

  describe "setting the request interval" do
    # A site root is required for the configurer to run at all.
    before(:each) { stub_site_root_env("sausage") }

    it "should allow setting the request interval" do
      options = run_configurer(["--request-interval", "0.6"])
      options.should include(:request_interval => 0.6)
    end

    it "should default to 0.1" do
      options = run_configurer
      options.should include(:request_interval => 0.1)
    end
  end

  describe "setting up logging" do
    # A site root is required for the configurer to run at all.
    before(:each) { stub_site_root_env("sausage") }

    it "should allow specifying a logfile" do
      options = run_configurer(["--logfile", "/foo/bar"])
      options.should include(:log_file => "/foo/bar")
    end

    it "should allow logging to syslog with default facility of local3" do
      options = run_configurer(["--syslog"])
      options.should include(:syslog => "local3")
    end

    it "should allow logging to syslog overriding the default facility" do
      options = run_configurer(["--syslog", "local5"])
      options.should include(:syslog => "local5")
    end
  end
end
@@ -0,0 +1,286 @@
1
+ require 'spec_helper'
2
+
3
describe GovukMirrorer::Crawler do
  # Returns the HTTP agent of the given crawler. The specs reach it through
  # `send`, so this helper does the same.
  def agent_of(crawler)
    crawler.send(:agent)
  end

  # Every example runs with the indexer's artefact processing stubbed out and
  # with crawler logging sent to /dev/null.
  before :each do
    GovukMirrorer::Indexer.any_instance.stub(:process_artefacts)
    GovukMirrorer::Crawler.any_instance.stub(:logger).and_return(Logger.new("/dev/null"))
  end

  it 'should have a version number' do
    GovukMirrorer::VERSION.should_not be_nil
  end

  describe "initializing" do
    it "should handle all urls returned from the indexer" do
      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return([
        "https://www.example.com/",
        "https://www.example.com/designprinciples",
        "https://www.example.com/designprinciples/styleguide",
        "https://www.example.com/designprinciples/performanceframework",
      ])

      crawler = GovukMirrorer::Crawler.new

      crawler.urls.should == [
        "https://www.example.com/",
        "https://www.example.com/designprinciples",
        "https://www.example.com/designprinciples/styleguide",
        "https://www.example.com/designprinciples/performanceframework",
      ]
    end

    describe "setting up the logger" do
      # These examples inspect the real logger, so undo the global stub.
      before :each do
        GovukMirrorer::Crawler.any_instance.unstub(:logger)
      end

      it "should log to stdout by default" do
        crawler = GovukMirrorer::Crawler.new
        log_device = crawler.logger.instance_variable_get('@logdev')
        log_device.dev.should == STDOUT
      end

      it "should log to a file if requested" do
        crawler = GovukMirrorer::Crawler.new(:log_file => "/dev/null")
        log_device = crawler.logger.instance_variable_get('@logdev')
        log_device.filename.should == "/dev/null"
      end

      it "should log to syslog if requested" do
        crawler = GovukMirrorer::Crawler.new(:syslog => "local4")
        crawler.logger.should be_a(Syslogger)
        crawler.logger.facility.should == Syslog::LOG_LOCAL4
        crawler.logger.options.should == (Syslog::LOG_PID | Syslog::LOG_CONS)
        crawler.logger.ident.should == 'govuk_mirrorer'
      end

      it "should default to log level INFO" do
        crawler = GovukMirrorer::Crawler.new
        crawler.logger.level.should == Logger::INFO
      end

      it "should allow overriding the log level" do
        crawler = GovukMirrorer::Crawler.new(:log_level => 'warn')
        crawler.logger.level.should == Logger::WARN
      end
    end
  end

  describe "crawl" do
    # URL of the n-th stubbed start page (a fresh String on every call).
    def page_url(number)
      "https://www.example.com/#{number}"
    end

    # Argument hash handle_error is expected to receive for the first page.
    def first_page_failure(error)
      {
        :url => page_url(1),
        :handler => :process_govuk_page,
        :error => error,
        :data => {},
      }
    end

    before :each do
      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return([page_url(1), page_url(2)])

      @crawler = GovukMirrorer::Crawler.new(:request_interval => 0.01)
      @crawler.stub(:process_govuk_page)
      agent_of(@crawler).stub(:get).and_return("default")
      @crawler.stub(:sleep)
    end

    it "should fetch each page and pass it to the handler" do
      agent_of(@crawler).should_receive(:get).with(page_url(1)).ordered.and_return("page_1")
      @crawler.should_receive(:process_govuk_page).with("page_1", {}).ordered

      agent_of(@crawler).should_receive(:get).with(page_url(2)).ordered.and_return("page_2")
      @crawler.should_receive(:process_govuk_page).with("page_2", {}).ordered

      @crawler.crawl
    end

    it "should sleep for the configured request_interval between requests" do
      # sleep is actually on Kernel, but setting the expectation on the crawler works.
      2.times do
        @crawler.should_receive(:process_govuk_page).ordered
        @crawler.should_receive(:sleep).with(0.01).ordered
      end

      @crawler.crawl
    end

    describe "handling errors" do
      it "should call handle_error with the relevant details" do
        boom = StandardError.new("Boom")
        agent_of(@crawler).should_receive(:get).with(page_url(1)).and_raise(boom)
        @crawler.should_receive(:handle_error).with(first_page_failure(boom))

        @crawler.crawl
      end

      it "should continue with the next URL" do
        agent_of(@crawler).stub(:get).with(page_url(1)).and_raise("Boom")
        agent_of(@crawler).should_receive(:get).with(page_url(2)).and_return("something")

        @crawler.crawl
      end

      context "error handling" do
        # Response codes that should trigger a single retry, with their reasons.
        retried_responses = {
          429 => "Too Many Requests",
          500 => "Internal Server Error",
          503 => "Boom",
        }

        retried_responses.each do |resp_code, resp_reason|
          context "#{resp_code} #{resp_reason}" do
            it "should sleep for a second, and then retry" do
              failure = Mechanize::ResponseCodeError.new(double("Page", :code => resp_code), resp_reason)
              agent_of(@crawler).should_receive(:get).with(page_url(1)).ordered.and_raise(failure)
              agent_of(@crawler).should_receive(:get).with(page_url(1)).ordered.and_return("page_1")

              @crawler.should_not_receive(:handle_error)
              # sleep is actually on Kernel, but setting the expectation here works.
              @crawler.should_receive(:sleep).with(1)
              @crawler.should_receive(:process_govuk_page).with("page_1", {})

              @crawler.crawl
            end

            it "should only retry once" do
              failure = Mechanize::ResponseCodeError.new(double("Page", :code => resp_code), resp_reason)
              agent_of(@crawler).should_receive(:get).with(page_url(1)).twice.and_raise(failure)

              # sleep is actually on Kernel, but setting the expectation here works.
              @crawler.should_receive(:sleep).with(1)
              @crawler.should_receive(:handle_error).with(first_page_failure(failure)).once

              @crawler.crawl
            end
          end
        end
      end
    end
  end

  describe "process_govuk_page" do
    before :each do
      @crawler = GovukMirrorer::Crawler.new({:site_root => "https://site-under-test"})
      @crawler.stub(:save_to_disk)
      @crawler.stub(:extract_and_handle_links)
      @page = double("Page", :uri => URI.parse("https://site-under-test/something"))
    end

    it "should save the page to disk" do
      @crawler.should_receive(:save_to_disk).with(@page)
      @crawler.process_govuk_page(@page)
    end

    it "should extract any links in the page" do
      @crawler.should_receive(:extract_and_handle_links).with(@page)
      @crawler.process_govuk_page(@page)
    end

    it "should do nothing if the page is a non gov.uk page" do
      @page.stub(:uri).and_return(URI.parse("https://somewhere.else.com/foo"))
      @crawler.should_not_receive(:save_to_disk)
      @crawler.should_not_receive(:extract_and_handle_links)

      @crawler.process_govuk_page(@page)
    end
  end

  describe "extract_and_handle_links" do
    before :each do
      @crawler = GovukMirrorer::Crawler.new
      @crawler.stub(:process_link)
    end

    it "should extract all <a>, <link> and <script> links from an html page" do
      html = <<-EOT
<!DOCTYPE html>
<html lang="en" class="">
<head>
<link href="https://example.com/static/application.css" media="screen" rel="stylesheet" type="text/css">
<script defer src="https://example.com/static/application.js" type="text/javascript"></script>
<link rel="shortcut icon" href="https://example.com/static/favicon.ico" type="image/x-icon">
<script id="ga-params" type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-26179049-1']);
_gaq.push(['_setAllowLinker', true]);
</script>
</head>
<body class="mainstream">
<a href="/" title="Go to the gov.uk homepage" id="logo">
<img src="https://example.com/static/gov.uk_logo.png" alt="GOV.UK Logo">
</a>
<p>HM Revenue &amp; Customs lists the <a href="http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm">rates of VAT</a> on different goods and services.</p>
</body>
</html>
      EOT
      WebMock.stub_request(:get, "http://www.example.com/foo").to_return(
        :headers => {"Content-Type" => "text/html; charset=utf-8"},
        :body => html
      )
      page = Mechanize.new.get("http://www.example.com/foo")

      expected_links = [
        "https://example.com/static/application.css",
        "https://example.com/static/application.js",
        "https://example.com/static/favicon.ico",
        "/",
        "https://example.com/static/gov.uk_logo.png",
        "http://www.hmrc.gov.uk/vat/forms-rates/rates/goods-services.htm",
      ]
      expected_links.each do |href|
        @crawler.should_receive(:process_link).with(page, href)
      end
      @crawler.should_receive(:process_link).never # None except for the ones above

      @crawler.extract_and_handle_links(page)
    end

    it "should not attempt to extract links from non-html pages" do
      WebMock.stub_request(:get, "http://www.example.com/foo.xml").to_return(
        :headers => {"Content-Type" => "application/xml; charset=utf-8"},
        :body => %(<?xml version="1.0" encoding="UTF-8"?>\n<foo></foo>)
      )
      page = Mechanize.new.get("http://www.example.com/foo.xml")

      @crawler.should_not_receive(:process_link)
      page.should_not_receive(:search)

      @crawler.extract_and_handle_links(page)
    end
  end

  describe "rules for deciding if a URL should be mirrored" do
    # Expects the crawler to queue `url` for process_govuk_page, with the
    # page under test recorded as the referrer.
    def expect_handled(url)
      @crawler.should_receive(:handle).with(url, :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
    end

    before :each do
      @crawler = GovukMirrorer::Crawler.new
      @crawler.stub(:handle)

      @page = double("Page", :uri => URI.parse("https://www.gov.uk/foo/bar"))
    end

    it "should convert relative links to full links" do
      expect_handled("https://www.gov.uk/baz")
      @crawler.process_link(@page, "/baz")

      expect_handled("https://www.gov.uk/foo/baz")
      @crawler.process_link(@page, "baz")
    end

    it "should convert www.gov.uk http links to https" do
      expect_handled("https://www.gov.uk/something")
      @crawler.process_link(@page, "http://www.gov.uk/something")
    end

    it "should pass through https www.gov.uk links" do
      expect_handled("https://www.gov.uk/something")
      @crawler.process_link(@page, "https://www.gov.uk/something")
    end

    it "should reject any urls with query params" do
      @crawler.should_not_receive(:handle).with("https://www.gov.uk/something?foo=bar&baz=foo", :process_govuk_page, :referrer => "https://www.gov.uk/foo/bar")
      @crawler.process_link(@page, "https://www.gov.uk/something?foo=bar&baz=foo")
    end

    it "should remove any fragments (anchors) from the link" do
      expect_handled("https://www.gov.uk/something")
      @crawler.process_link(@page, "https://www.gov.uk/something#foo")
    end

    it "should ignore non www.gov.uk links" do
      @crawler.should_not_receive(:handle)

      [
        "https://direct.gov.uk/something",
        "http://transactionalservices.alphagov.co.uk/department/dfid?orderBy=nameOfService&direction=desc&format=csv",
      ].each { |href| @crawler.process_link(@page, href) }
    end

    it "should ignore mailto links" do
      @crawler.should_not_receive(:handle)

      [
        "mailto:me@example.com",
        "mailto:someone@www.gov.uk",
      ].each { |href| @crawler.process_link(@page, href) }
    end
  end
end
@@ -0,0 +1,191 @@
1
+ require 'spec_helper'
2
+
3
describe GovukMirrorer::Indexer do
  let(:no_artefacts) { %({"_response_info":{"status":"ok"},"total":0,"results":[]}) }
  let(:default_root) { "http://giraffe.example" }
  let(:default_api_endpoint) { "http://giraffe.example/api/artefacts.json" }

  # Builds one entry of the "results" array used in the stubbed API bodies.
  def artefact(kind, web_url)
    {"format" => kind, "web_url" => web_url}
  end

  # Serialises the given artefacts into the JSON envelope the stubbed API
  # returns; "total" is the number of artefacts on that page.
  def artefacts_json(results)
    {
      "_response_info" => {"status" => "ok"},
      "total" => results.size,
      "results" => results,
    }.to_json
  end

  describe "construction and loading data" do
    it "should add items to start_urls or blacklist according to format" do
      WebMock.stub_request(:get, default_api_endpoint).
        to_return(:body => artefacts_json([
          artefact("answer", "http://www.test.gov.uk/foo"),
          artefact("local_transaction", "http://www.test.gov.uk/bar/baz"),
          artefact("place", "http://www.test.gov.uk/somewhere"),
          artefact("guide", "http://www.test.gov.uk/vat"),
        ]))
      i = GovukMirrorer::Indexer.new(default_root)
      i.all_start_urls.should include("http://www.test.gov.uk/foo")
      i.all_start_urls.should include("http://www.test.gov.uk/vat")
      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")

      i.blacklist_paths.should include("/bar/baz")
      i.blacklist_paths.should include("/somewhere")
      i.blacklist_paths.should_not include("/foo")
      i.blacklist_paths.should_not include("/vat")
    end

    it "should support pagination in the content api" do
      # Page 1 advertises page 2 through a Link header with rel="next".
      WebMock.stub_request(:get, default_api_endpoint).
        to_return(
          :body => artefacts_json([
            artefact("answer", "http://www.test.gov.uk/foo"),
            artefact("local_transaction", "http://www.test.gov.uk/bar/baz"),
            artefact("place", "http://www.test.gov.uk/somewhere"),
            artefact("guide", "http://www.test.gov.uk/vat"),
          ]),
          :headers => {"Link" => "<#{default_api_endpoint}?page=2>; rel=\"next\""}
        )
      WebMock.stub_request(:get, "#{default_api_endpoint}?page=2").
        to_return(
          :body => artefacts_json([
            artefact("answer", "http://www.test.gov.uk/foo2"),
            artefact("local_transaction", "http://www.test.gov.uk/bar/baz2"),
            artefact("guide", "http://www.test.gov.uk/vat2"),
          ])
        )

      i = GovukMirrorer::Indexer.new(default_root)
      i.all_start_urls.should include("http://www.test.gov.uk/foo")
      i.all_start_urls.should include("http://www.test.gov.uk/vat")
      i.all_start_urls.should include("http://www.test.gov.uk/foo2")
      i.all_start_urls.should include("http://www.test.gov.uk/vat2")
      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz")
      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")
      i.all_start_urls.should_not include("http://www.test.gov.uk/bar/baz2")

      i.blacklist_paths.should include("/bar/baz")
      i.blacklist_paths.should include("/somewhere")
      i.blacklist_paths.should include("/bar/baz2")
      i.blacklist_paths.should_not include("/foo")
      i.blacklist_paths.should_not include("/vat")
      i.blacklist_paths.should_not include("/foo2")
      i.blacklist_paths.should_not include("/vat2")
    end

    it "should add hardcoded whitelist items to the start_urls, even if their format would be blacklisted" do
      WebMock.stub_request(:get, default_api_endpoint).
        to_return(:body => artefacts_json([
          artefact("custom-application", "http://www.test.gov.uk/bank-holidays"),
          artefact("place", "http://www.test.gov.uk/somewhere"),
        ]))
      i = GovukMirrorer::Indexer.new(default_root)
      i.all_start_urls.should include("http://www.test.gov.uk/bank-holidays")
      i.all_start_urls.should_not include("http://www.test.gov.uk/somewhere")

      i.blacklist_paths.should include("/somewhere")
      i.blacklist_paths.should_not include("/bank-holidays")
    end

    it "should add the hardcoded items to the start_urls" do
      WebMock.stub_request(:get, "https://www.gov.uk/api/artefacts.json").
        to_return(:body => no_artefacts)
      i = GovukMirrorer::Indexer.new("https://www.gov.uk")

      i.all_start_urls.should include("https://www.gov.uk/")
      i.all_start_urls.should include("https://www.gov.uk/designprinciples")
      i.all_start_urls.should include("https://www.gov.uk/designprinciples/styleguide")
      i.all_start_urls.should include("https://www.gov.uk/designprinciples/performanceframework")
    end

    it "should add the hardcoded items to the blacklist" do
      WebMock.stub_request(:get, default_api_endpoint).
        to_return(:body => no_artefacts)
      i = GovukMirrorer::Indexer.new(default_root)

      i.blacklist_paths.should include("/licence-finder")
      i.blacklist_paths.should include("/trade-tariff")
    end

    describe "handling errors fetching artefacts" do
      it "should sleep and retry fetching artefacts on HTTP error" do
        # First request fails with a 502, the retry succeeds.
        WebMock.stub_request(:get, default_api_endpoint).
          to_return(:status => [502, "Bad Gateway"]).
          to_return(:body => artefacts_json([
            artefact("answer", "http://www.test.gov.uk/foo"),
            artefact("guide", "http://www.test.gov.uk/vat"),
          ]))
        GovukMirrorer::Indexer.any_instance.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works

        i = GovukMirrorer::Indexer.new(default_root)

        i.all_start_urls.should include("http://www.test.gov.uk/foo")
        i.all_start_urls.should include("http://www.test.gov.uk/vat")
      end

      it "should only retry once" do
        WebMock.stub_request(:get, default_api_endpoint).
          to_return(:status => [502, "Bad Gateway"]).
          to_return(:status => [502, "Bad Gateway"])

        GovukMirrorer::Indexer.any_instance.stub(:sleep) # Make tests fast
        lambda do
          GovukMirrorer::Indexer.new(default_root)
        end.should raise_error(GdsApi::HTTPErrorResponse)
      end
    end
  end

  describe "blacklisted_url?" do
    before :each do
      WebMock.stub_request(:get, "http://www.foo.com/api/artefacts.json").
        to_return(:body => no_artefacts)
      @indexer = GovukMirrorer::Indexer.new("http://www.foo.com")

      # Replace whatever the indexer loaded with a known blacklist.
      @indexer.instance_variable_set('@blacklist_paths', %w(
        /foo/bar
        /something
        /something-else
      ))
    end

    it "should return true if the url has a matching path" do
      @indexer.blacklisted_url?("http://www.foo.com/foo/bar").should == true
    end

    it "should return true if the url has a matching prefix" do
      @indexer.blacklisted_url?("http://www.foo.com/something/somewhere").should == true
    end

    it "should return false if none match" do
      @indexer.blacklisted_url?("http://www.foo.com/bar").should == false
    end

    it "should return false if only a partial segment matches" do
      @indexer.blacklisted_url?("http://www.foo.com/something-other").should == false
      @indexer.blacklisted_url?("http://www.foo.com/foo/baz").should == false
      @indexer.blacklisted_url?("http://www.foo.com/foo-foo/bar").should == false
    end

    it "should cope with edge-cases passed in" do
      @indexer.blacklisted_url?("mailto:goo@example.com").should == false
      @indexer.blacklisted_url?("http://www.example.com").should == false
      @indexer.blacklisted_url?("ftp://foo:bar@ftp.example.com").should == false
    end
  end
end
191
+