govuk_mirrorer 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d05df644e22f9aeb6bc503bc6d5c4df69c0dabb1
4
+ data.tar.gz: bd7337adf9cca849b5c3973c4f8ed009329fcee4
5
+ SHA512:
6
+ metadata.gz: 5d42d4011e3b6767dfc71979b6883ad82fd73f098571dc2b3421e70068225dd618ed31d5defd05106d1d6370f272646acfb9e1db36d12d9c0031c943ad22a70c
7
+ data.tar.gz: 76850e16ad9b9956376f68500f1a2f1b98ad083873fbee9a45c71adc4bb213d3c0040fedf0a15b1869b40b71514460b88bc9ce8e674d1ff9de4447c467f2499c
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1 @@
1
+ 2.2.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in govuk_mirrorer.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Alex Tomlins
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # GovukMirrorer
2
+
3
+ Tool to generate a static version of GOV.UK.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'govuk_mirrorer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install govuk_mirrorer
18
+
19
+ ## Usage
20
+
21
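+ Run `govuk_mirrorer --site-root URL` (or set the `MIRRORER_SITE_ROOT` environment variable instead of passing `--site-root`). Run `govuk_mirrorer --help` to list the request-interval and logging options.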
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,13 @@
1
+
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
8
+ require "gem_publisher"
9
+ desc "Publish gem to RubyGems"
10
+ task :publish_gem do |t|
11
+ gem = GemPublisher.publish_if_updated("govuk_mirrorer.gemspec", :rubygems)
12
+ puts "Published #{gem}" if gem
13
+ end
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'govuk_mirrorer'
4
+
5
+ begin
6
+ GovukMirrorer.run
7
+ rescue GovukMirrorer::Configurer::NoRootUrlSpecifiedError
8
+ abort "No root url specified.\nEither specify the --site-root option or set the MIRRORER_SITE_ROOT env variable"
9
+ end
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'govuk_mirrorer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "govuk_mirrorer"
8
+ spec.version = GovukMirrorer::VERSION
9
+ spec.authors = ["Alex Tomlins"]
10
+ spec.email = ["alex.tomlins@digital.cabinet-office.gov.uk"]
11
+ spec.summary = %q{Tool to generate a static version of GOV.UK}
12
+ spec.description = spec.summary
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency "spidey", "0.0.4"
22
+ spec.add_dependency "mechanize", "2.5.1"
23
+ spec.add_dependency "syslogger", "1.4.2"
24
+ spec.add_dependency "gds-api-adapters", "8.4.0"
25
+ spec.add_dependency "statsd-ruby", "1.2.1"
26
+
27
+ spec.add_development_dependency "bundler", "~> 1.1"
28
+ spec.add_development_dependency "rake"
29
+ spec.add_development_dependency "rspec"
30
+ spec.add_development_dependency "webmock"
31
+ spec.add_development_dependency "gem_publisher", "1.2.0"
32
+ end
@@ -0,0 +1,6 @@
1
+ #!/bin/bash -x
2
+ set -e
3
+ rm -f Gemfile.lock
4
+ bundle install --path "${HOME}/bundles/${JOB_NAME}"
5
+ bundle exec rake
6
+ bundle exec rake publish_gem
@@ -0,0 +1,14 @@
1
+ require 'govuk_mirrorer/net_http_sni_monkey_patch'
2
+ require 'govuk_mirrorer/version'
3
+
4
+ require 'govuk_mirrorer/configurer'
5
+ require 'govuk_mirrorer/crawler'
6
+ require 'govuk_mirrorer/indexer'
7
+ require 'govuk_mirrorer/statsd'
8
+
9
module GovukMirrorer
  # Command-line entry point: turns ARGV into crawl options via Configurer,
  # builds a Crawler from them and starts the crawl.
  #
  # Returns whatever Crawler#crawl returns.
  def self.run
    options = Configurer.run(ARGV)
    Crawler.new(options).crawl
  end
end
@@ -0,0 +1,44 @@
1
module GovukMirrorer
  # Builds the options hash for a mirroring run from command-line arguments,
  # falling back to the MIRRORER_SITE_ROOT environment variable for the root URL.
  class Configurer
    # Raised when no site root is available from either --site-root or
    # MIRRORER_SITE_ROOT. Derives from StandardError (not Exception) so that a
    # plain `rescue` can handle it; rescuing the class by name works as before.
    class NoRootUrlSpecifiedError < StandardError; end

    # Parses +args+ and returns the resulting options hash.
    #
    # args - Array of command-line argument strings. It is parsed with
    #        OptionParser#parse!, so recognised options are removed from it.
    #
    # Returns a Hash that always contains :site_root and :request_interval
    # (default 0.1), plus :log_file, :syslog and :log_level when the matching
    # options were given.
    # Raises NoRootUrlSpecifiedError if the site root is nil or empty.
    # Prints the help text and exits when -h/--help is passed.
    def self.run(args)
      require 'optparse'
      options = {
        :site_root => ENV['MIRRORER_SITE_ROOT'],
        :request_interval => 0.1,
      }
      OptionParser.new do |o|
        o.banner = "Usage: govuk_mirrorer [options]"

        o.on('--site-root URL',
             "Base URL to mirror from",
             "  falls back to MIRRORER_SITE_ROOT env variable") { |root| options[:site_root] = root }
        o.on('--request-interval INTERVAL', Float,
             "Specify the delay between requests in seconds",
             "  defaults to 0.1") { |interval| options[:request_interval] = interval }

        o.separator "Logging:"
        o.on('--logfile FILE', "Enable logging to a file") { |file| options[:log_file] = file }
        o.on('--syslog [FACILITY]',
             "Enable logging to syslog",
             "  optionally override the default facility (local3)") do |facility|
          # The facility argument is optional; nil means the flag was given bare.
          options[:syslog] = facility || "local3"
        end
        o.on('--loglevel LEVEL', 'DEBUG/INFO/WARN/ERROR, it defaults to INFO') do |level|
          options[:log_level] = level
        end
        o.on('-v', '--verbose', 'sets loglevel to DEBUG') { options[:log_level] = 'DEBUG' }

        o.separator ""
        o.on('-h', '--help') { puts o; exit }
        o.parse!(args)
      end

      # Error if site_root nil or blank
      site_root = options[:site_root]
      raise NoRootUrlSpecifiedError if site_root.nil? || site_root.empty?

      options
    end
  end
end
@@ -0,0 +1,170 @@
1
+ require 'spidey'
2
+ require 'mechanize'
3
+ require 'logger'
4
+ require 'syslogger'
5
+
6
module GovukMirrorer
  # Spider that fetches pages of a single site, writes each one to disk and
  # queues the on-site links it finds. Start URLs and the blacklist come from
  # GovukMirrorer::Indexer.
  #
  # NOTE(review): `agent`, `handle`, `each_url`, `request_interval` and `@urls`
  # are not defined in this file; presumably they come from
  # Spidey::AbstractSpider - confirm against that gem.
  class Crawler < Spidey::AbstractSpider
    USER_AGENT = "GOV.UK Mirrorer/#{GovukMirrorer::VERSION}"
    DEFAULT_SITE_ROOT = 'https://www.gov.uk'
    # Response codes that get one retry: 429 and every 5xx.
    RETRY_RESP_CODES = [429, (500..599).to_a].flatten

    attr_accessor :logger

    # attrs - options Hash. This class reads :site_root, :syslog, :log_file and
    #         :log_level; the whole Hash is also passed on to the superclass.
    #
    # Builds an Indexer for the site root and queues every start URL it reports.
    def initialize(attrs = {})
      super
      setup_agent
      @http_errors = {}

      setup_logger(attrs)

      @site_root = attrs[:site_root] || DEFAULT_SITE_ROOT

      @indexer = GovukMirrorer::Indexer.new(@site_root)
      @indexer.all_start_urls.each do |url|
        @logger.debug "Adding start url #{url}"
        handle url, :process_govuk_page
      end
    end

    # Host part of the configured site root.
    def site_hostname
      URI.parse(@site_root).host
    end

    # Fetches every queued URL and dispatches the page to its handler, timing
    # the whole run through GovukMirrorer.statsd.
    #
    # A Mechanize::ResponseCodeError whose code is in RETRY_RESP_CODES is
    # retried once after a one-second pause; any other error (or a second
    # failure) is recorded via handle_error and the crawl moves on.
    #
    # options - accepted for interface compatibility; not used here.
    def crawl(options = {})
      GovukMirrorer.statsd.time("govuk.app.mirrorer.crawl_duration") do
        each_url do |url, handler, default_data|
          retried = false
          begin
            page = agent.get(url)
            logger.debug "Handling #{url.inspect}"
            send handler, page, default_data
          rescue => ex
            if !retried && ex.is_a?(Mechanize::ResponseCodeError) && RETRY_RESP_CODES.include?(ex.response_code.to_i)
              retried = true
              sleep 1
              retry
            end
            handle_error url: url, handler: handler, error: ex, data: default_data
          end
          sleep request_interval if request_interval > 0
        end
        logger.info "Completed crawling the site"
      end
    end

    # Handler for fetched pages: saves the page and queues its links, unless
    # the request ended up on a different host, in which case it only warns.
    def process_govuk_page(page, data = {})
      unless page.uri.host == site_hostname
        msg = "Ended up on non #{site_hostname} page #{page.uri.to_s}"
        msg << " from #{agent.history[-2].uri.to_s}" if agent.history[-2]
        logger.warn msg
        return
      end
      save_to_disk(page)
      extract_and_handle_links(page)
    end

    # Passes every a/link href and img/script src on an HTML page to
    # process_link. Non-HTML responses are ignored.
    def extract_and_handle_links(page)
      return unless page.is_a?(Mechanize::Page)

      [["//a[@href]", "href"],
       ["//img[@src]", "src"],
       ["//link[@href]", "href"],
       ["//script[@src]", "src"]].each do |xpath, attribute|
        page.search(xpath).each do |elem|
          process_link(page, elem[attribute])
        end
      end
    end

    # Normalises one link found on +page+ and offers it to maybe_handle.
    #
    # Links to other hosts are ignored, same-host absolute links are forced to
    # https, and fragments are stripped. URLs that fail to parse are logged and
    # skipped.
    def process_link(page, href)
      uri = URI.parse(href)
      if uri.scheme.nil? # relative link
        uri = URI.join(page.uri.to_s, href)
        # A protocol-relative href ("//other.host/path") has no scheme either,
        # but resolves to another host. Drop it here rather than fetching a
        # page that process_govuk_page would only reject.
        unless uri.host == site_hostname
          logger.debug "Ignoring non #{site_hostname} link #{href} on #{page.uri.to_s}"
          return
        end
      elsif uri.host == site_hostname
        logger.warn "Link to non https #{href} from #{page.uri.to_s}" unless uri.scheme == "https"
        uri.scheme = 'https'
      else
        logger.debug "Ignoring non #{site_hostname} link #{href} on #{page.uri.to_s}"
        return
      end
      uri.fragment = nil # prevent duplicate url's being missed
      maybe_handle uri.to_s, :process_govuk_page, :referrer => page.uri.to_s
    rescue URI::Error => ex
      logger.warn "#{ex.class} parsing url #{href} on page #{page.uri.to_s}"
    end

    # Queues +url+ for +handler+ unless it has already been seen, has already
    # errored, is blacklisted by the indexer, or carries a query string.
    def maybe_handle(url, handler, data = {})
      logger.debug "Evaluating link #{url}"
      if @urls.include?(url)
        logger.debug "Skipping seen url #{url}"
        return
      end
      if @http_errors.has_key?(url)
        logger.debug "Skipping previous erroring url #{url}"
        return
      end
      if @indexer.blacklisted_url?(url)
        logger.debug "Skipping blacklisted url #{url}"
        return
      end
      if url.include? '?'
        logger.debug "Skipping querystringed url #{url}"
        return
      end
      logger.debug "Adding url #{url} from #{data[:referrer]}"
      handle url, handler, data
    end

    private

    # Creates @logger: syslog when options[:syslog] names a facility, otherwise
    # a Logger writing to options[:log_file] or STDOUT. The level comes from
    # options[:log_level] and defaults to INFO.
    def setup_logger(options)
      if options[:syslog]
        # Syslog settings
        #   programname: govuk_mirrorer
        #   options: Syslog::LOG_PID | Syslog::LOG_CONS
        #   facility: from options
        # Syslog::LOG_PID - adds the process number to the message (just after the program name)
        # Syslog::LOG_CONS - writes the message on the console if an error occurs when sending the message
        facility = Syslog.const_get("LOG_#{options[:syslog].upcase}")
        @logger = Syslogger.new('govuk_mirrorer', Syslog::LOG_PID | Syslog::LOG_CONS, facility)
      else
        @logger = Logger.new(options[:log_file] || STDOUT)
      end
      if options[:log_level]
        @logger.level = Logger.const_get(options[:log_level].upcase)
      else
        @logger.level = Logger::INFO
      end
    end

    # Logs a failed fetch and remembers the URL so maybe_handle skips it later.
    # The backtrace is only included for errors that are not Mechanize::Error.
    def handle_error(attrs)
      msg = "Error #{attrs[:error].inspect} for #{attrs[:url]}, data: #{attrs[:data].inspect}"
      msg << "\n#{attrs[:error].backtrace.join("\n")}" unless attrs[:error].is_a?(Mechanize::Error)
      logger.warn msg
      @http_errors[attrs[:url]] = attrs[:error]
    end

    # Saves to a file in ./hostname/path
    # adds .html for html files
    def save_to_disk(page)
      path = page.extract_filename(true)
      logger.debug "Saving #{page.uri.to_s} to #{path}"
      FileUtils.mkdir_p(File.dirname(path))
      File.open(path, 'wb') do |f|
        f.write page.body
      end
    end

    # Sets the user agent string and the X-Govuk-Mirrorer request header, and
    # turns off SSL session reuse on the underlying HTTP client.
    # NOTE(review): the previous comment on the last line tied it to the
    # Net::HTTP SNI monkey patch required by govuk_mirrorer.rb - confirm.
    def setup_agent
      agent.user_agent = USER_AGENT
      agent.request_headers["X-Govuk-Mirrorer"] = "1"
      agent.agent.http.reuse_ssl_sessions = false
    end
  end
end
@@ -0,0 +1,94 @@
1
+ require 'gds_api/content_api'
2
+
3
module GovukMirrorer
  # Works out where a crawl should start and which paths it must not fetch,
  # by combining the fixed path lists below with the artefacts returned by the
  # site's content API.
  class Indexer
    # Artefact formats whose paths are blacklisted rather than crawled.
    FORMATS_TO_503 = %w(
      local_transaction
      smart-answer
      custom-application
      place
      licence
    ).freeze

    ADDITIONAL_START_PATHS = %w(
      /
      /designprinciples
      /designprinciples/styleguide
      /designprinciples/performanceframework
      /service-manual
    ).freeze

    # Calendars currently register as custom-application
    WHITELIST_PATHS = %w(
      /bank-holidays
      /gwyliau-banc
      /when-do-the-clocks-change
    ).freeze

    ADDITIONAL_BLACKLIST_PATHS = %w(
      /trade-tariff
      /licence-finder
      /business-finance-support-finder
      /government/uploads
      /apply-for-a-licence
      /search
    ).freeze

    BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS = %w(
      /government/announcements.atom
      /government/publications.atom
    ).freeze

    attr_reader :all_start_urls, :blacklist_paths

    # root - site root URL as a String, e.g. "https://www.gov.uk".
    #
    # Seeds the start URLs and blacklist from the constants above, then
    # fetches the artefact list and sorts each artefact into one of the two.
    def initialize(root)
      @root = root
      @all_start_urls = ADDITIONAL_START_PATHS.map { |path| @root + path }
      # Array#+ already returns a new, unfrozen array that is safe to append to.
      @blacklist_paths = ADDITIONAL_BLACKLIST_PATHS + BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
      process_artefacts
    end

    # Returns true when the path of +url+ equals a blacklisted path or lies
    # underneath one. Matching is done on whole path segments, so
    # "/trade-tariffs" is not caught by "/trade-tariff".
    #
    # Raises URI::InvalidURIError if +url+ cannot be parsed.
    def blacklisted_url?(url)
      path = URI.parse(url).path
      return false if path.nil? # e.g. mailto: links...
      url_segments = path.sub(%r{\A/}, '').split('/')
      @blacklist_paths.any? do |blacklist_path|
        bl_segments = blacklist_path.sub(%r{\A/}, '').split('/')
        url_segments[0..(bl_segments.length - 1)] == bl_segments
      end
    end

    private

    # Blacklists the path of every artefact whose format is in FORMATS_TO_503,
    # unless that path is whitelisted; every other artefact becomes a start URL.
    def process_artefacts
      artefacts.each do |artefact|
        path = URI.parse(artefact.web_url).path
        if FORMATS_TO_503.include?(artefact.format) && !WHITELIST_PATHS.include?(path)
          @blacklist_paths << path
        else
          @all_start_urls << artefact.web_url
        end
      end
    end

    # Fetches (and memoizes) every artefact from the content API, timing the
    # call through GovukMirrorer.statsd. An HTTP error or timeout is retried
    # once after a one-second pause; a second failure is re-raised.
    def artefacts
      retried = false
      @artefacts ||= begin
        content_api = GdsApi::ContentApi.new("#{@root}/api", :timeout => 10)
        GovukMirrorer.statsd.time("govuk.app.mirrorer.artefacts_duration") do
          content_api.artefacts.with_subsequent_pages.to_a
        end
      rescue GdsApi::HTTPErrorResponse, GdsApi::TimedOutException
        unless retried
          retried = true
          sleep 1
          retry
        end
        raise
      end
    end
  end
end