govuk_mirrorer 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d05df644e22f9aeb6bc503bc6d5c4df69c0dabb1
4
+ data.tar.gz: bd7337adf9cca849b5c3973c4f8ed009329fcee4
5
+ SHA512:
6
+ metadata.gz: 5d42d4011e3b6767dfc71979b6883ad82fd73f098571dc2b3421e70068225dd618ed31d5defd05106d1d6370f272646acfb9e1db36d12d9c0031c943ad22a70c
7
+ data.tar.gz: 76850e16ad9b9956376f68500f1a2f1b98ad083873fbee9a45c71adc4bb213d3c0040fedf0a15b1869b40b71514460b88bc9ce8e674d1ff9de4447c467f2499c
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1 @@
1
+ 2.2.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in govuk_mirrorer.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Alex Tomlins
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # GovukMirrorer
2
+
3
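+ A tool to generate a static version of GOV.UK: it crawls the site from a configurable root URL and saves each fetched page to disk.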
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'govuk_mirrorer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install govuk_mirrorer
18
+
19
+ ## Usage
20
+
21
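+ Run `govuk_mirrorer --site-root URL` (or set the `MIRRORER_SITE_ROOT` environment variable instead of passing `--site-root`). Run `govuk_mirrorer --help` to list the remaining options (request interval and logging).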
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,13 @@
1
+
2
# Rake tasks: run the spec suite (the default task) and publish the gem.
require "rspec/core/rake_task"

RSpec::Core::RakeTask.new(:spec)

task default: :spec

require "gem_publisher"

desc "Publish gem to RubyGems"
task :publish_gem do
  published = GemPublisher.publish_if_updated("govuk_mirrorer.gemspec", :rubygems)
  # Only report when publish_if_updated handed something back.
  next unless published

  puts "Published #{published}"
end
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby

# Command-line entry point: runs the mirrorer and converts a missing site
# root into a readable message on stderr with a failing exit status.
require 'govuk_mirrorer'

missing_root_message = "No root url specified.\nEither specify the --site-root option or set the MIRRORER_SITE_ROOT env variable"

begin
  GovukMirrorer.run
rescue GovukMirrorer::Configurer::NoRootUrlSpecifiedError
  abort missing_root_message
end
@@ -0,0 +1,32 @@
1
# coding: utf-8
# Gem specification for govuk_mirrorer.

# Put lib/ on the load path so the version file can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'govuk_mirrorer/version'

Gem::Specification.new do |spec|
  spec.name        = "govuk_mirrorer"
  spec.version     = GovukMirrorer::VERSION
  spec.license     = "MIT"
  spec.homepage    = ""
  spec.authors     = ["Alex Tomlins"]
  spec.email       = ["alex.tomlins@digital.cabinet-office.gov.uk"]
  spec.summary     = "Tool to generate a static version of GOV.UK"
  spec.description = spec.summary

  # Package whatever git tracks; executables and tests are picked out by path.
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})

  # Runtime dependencies, each pinned to an exact version.
  {
    "spidey"           => "0.0.4",
    "mechanize"        => "2.5.1",
    "syslogger"        => "1.4.2",
    "gds-api-adapters" => "8.4.0",
    "statsd-ruby"      => "1.2.1",
  }.each do |gem_name, version|
    spec.add_dependency gem_name, version
  end

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.1"
  %w(rake rspec webmock).each do |gem_name|
    spec.add_development_dependency gem_name
  end
  spec.add_development_dependency "gem_publisher", "1.2.0"
end
@@ -0,0 +1,6 @@
1
#!/bin/bash -x
# Build script: installs the bundle, runs the default rake task and then
# publishes the gem. Expects JOB_NAME to be set by the caller
# (presumably the CI server - confirm).

# Stop at the first failing command.
set -o errexit

# Resolve dependencies afresh on every run.
rm -f Gemfile.lock
bundle install --path "${HOME}/bundles/${JOB_NAME}"

bundle exec rake
bundle exec rake publish_gem
@@ -0,0 +1,14 @@
1
+ require 'govuk_mirrorer/net_http_sni_monkey_patch'
2
+ require 'govuk_mirrorer/version'
3
+
4
+ require 'govuk_mirrorer/configurer'
5
+ require 'govuk_mirrorer/crawler'
6
+ require 'govuk_mirrorer/indexer'
7
+ require 'govuk_mirrorer/statsd'
8
+
9
module GovukMirrorer
  # Builds a Crawler from the options parsed out of ARGV and runs the crawl.
  #
  # Returns whatever Crawler#crawl returns. Configurer.run may raise
  # Configurer::NoRootUrlSpecifiedError when no site root is configured.
  def self.run
    Crawler.new(Configurer.run(ARGV)).crawl
  end
end
@@ -0,0 +1,44 @@
1
require 'optparse'

module GovukMirrorer
  # Turns command-line arguments (plus the MIRRORER_SITE_ROOT environment
  # variable) into the options hash used to build the crawler.
  class Configurer
    # Raised by .run when no site root was supplied on the command line or
    # through the environment. It derives from StandardError (not Exception)
    # so that it behaves like an ordinary application error.
    class NoRootUrlSpecifiedError < StandardError; end

    # Parses +args+ and returns the resulting options.
    #
    # args - Array of command-line argument strings. Recognised options are
    #        removed from it in place (OptionParser#parse!).
    #
    # Returns a Hash that always holds :site_root and :request_interval, and
    # holds :log_file, :syslog and :log_level when the matching flags were
    # given.
    # Raises NoRootUrlSpecifiedError if the site root is nil or blank
    # (empty or whitespace only).
    # Prints the usage text and exits the process when -h/--help is given.
    def self.run(args)
      options = {
        :site_root => ENV['MIRRORER_SITE_ROOT'],
        :request_interval => 0.1,
      }

      OptionParser.new do |o|
        o.banner = "Usage: govuk_mirrorer [options]"

        o.on('--site-root URL',
             "Base URL to mirror from",
             " falls back to MIRRORER_SITE_ROOT env variable") { |root| options[:site_root] = root }
        o.on('--request-interval INTERVAL', Float,
             "Specify the delay between requests in seconds",
             " defaults to 0.1") { |interval| options[:request_interval] = interval }

        o.separator "Logging:"
        o.on('--logfile FILE', "Enable logging to a file") { |file| options[:log_file] = file }
        o.on('--syslog [FACILITY]',
             "Enable logging to syslog",
             " optionally override the default facility (local3)") do |facility|
          # The facility argument is optional; nil means "use the default".
          options[:syslog] = facility || "local3"
        end
        o.on('--loglevel LEVEL', 'DEBUG/INFO/WARN/ERROR, it defaults to INFO') do |level|
          options[:log_level] = level
        end
        o.on('-v', '--verbose', 'sets loglevel to DEBUG') { options[:log_level] = 'DEBUG' }

        o.separator ""
        o.on('-h', '--help') { puts o; exit }
        o.parse!(args)
      end

      # Error if site_root nil or blank
      site_root = options[:site_root]
      raise NoRootUrlSpecifiedError if site_root.nil? || site_root.strip.empty?

      options
    end
  end
end
@@ -0,0 +1,170 @@
1
+ require 'spidey'
2
+ require 'mechanize'
3
+ require 'logger'
4
+ require 'syslogger'
5
+
6
module GovukMirrorer
  # Spider that starts from the URLs supplied by GovukMirrorer::Indexer,
  # saves every fetched page to disk and queues the same-host links found in
  # each saved page.
  #
  # NOTE(review): agent, each_url, handle, request_interval and @urls are not
  # defined in this class; presumably they come from Spidey::AbstractSpider.
  class Crawler < Spidey::AbstractSpider
    USER_AGENT = "GOV.UK Mirrorer/#{GovukMirrorer::VERSION}"
    DEFAULT_SITE_ROOT = 'https://www.gov.uk'
    # Response codes that earn one retry: 429 plus every 5xx code.
    RETRY_RESP_CODES = [429, *(500..599)].freeze

    # XPath of each link-bearing element mapped to the attribute holding its
    # URL. Hash order is the order in which links are processed.
    LINK_SELECTORS = {
      "//a[@href]" => "href",
      "//img[@src]" => "src",
      "//link[@href]" => "href",
      "//script[@src]" => "src",
    }.freeze

    attr_accessor :logger

    # attrs - options Hash. Keys read here: :site_root, :syslog, :log_file and
    #         :log_level. The whole hash is also passed to the superclass.
    #
    # Building the Indexer and queueing its start URLs happens here, so
    # construction depends on whatever Indexer.new does.
    def initialize(attrs = {})
      super
      setup_agent
      @http_errors = {}

      setup_logger(attrs)

      @site_root = attrs[:site_root] || DEFAULT_SITE_ROOT

      @indexer = GovukMirrorer::Indexer.new(@site_root)
      @indexer.all_start_urls.each do |url|
        @logger.debug "Adding start url #{url}"
        handle url, :process_govuk_page
      end
    end

    # Returns the host part of the configured site root.
    def site_hostname
      URI.parse(@site_root).host
    end

    # Fetches every queued URL and dispatches the page to its handler, timing
    # the whole run through statsd.
    #
    # A URL is retried once, after a one second pause, when the failure is a
    # Mechanize::ResponseCodeError with a code in RETRY_RESP_CODES. Any other
    # error (or a second failure) is recorded via handle_error and the crawl
    # moves on to the next URL.
    #
    # options - accepted but not used by this method.
    def crawl(options = {})
      GovukMirrorer.statsd.time("govuk.app.mirrorer.crawl_duration") do
        each_url do |url, handler, default_data|
          retried = false
          begin
            page = agent.get(url)
            logger.debug "Handling #{url.inspect}"
            send handler, page, default_data
          rescue => ex
            if !retried && retryable_error?(ex)
              retried = true
              sleep 1
              retry
            end
            handle_error url: url, handler: handler, error: ex, data: default_data
          end
          sleep request_interval if request_interval > 0
        end
        logger.info "Completed crawling the site"
      end
    end

    # Handler for fetched pages: saves the page and queues its links, unless
    # the fetch ended on a different host, in which case it only logs a
    # warning.
    #
    # page - the fetched page object returned by the agent.
    # data - accepted but not used by this method.
    def process_govuk_page(page, data = {})
      unless page.uri.host == site_hostname
        msg = "Ended up on non #{site_hostname} page #{page.uri.to_s}"
        msg << " from #{agent.history[-2].uri.to_s}" if agent.history[-2]
        logger.warn msg
        return
      end
      save_to_disk(page)
      extract_and_handle_links(page)
    end

    # Passes the URL of every a/img/link/script element of an HTML page to
    # process_link. Pages that are not a Mechanize::Page are skipped.
    def extract_and_handle_links(page)
      return unless page.is_a?(Mechanize::Page)

      LINK_SELECTORS.each do |xpath, attribute|
        page.search(xpath).each do |elem|
          process_link(page, elem[attribute])
        end
      end
    end

    # Normalises one link found on +page+ and offers it to maybe_handle.
    #
    # Hrefs without a scheme are resolved against the page URL. The result is
    # only followed when its host is the site host; this also filters out
    # scheme-relative hrefs ("//other.host/path"), which have no scheme but do
    # name a host. Absolute links to the site host are forced to https.
    # Unparseable hrefs are logged and ignored.
    def process_link(page, href)
      uri = URI.parse(href)
      relative = uri.scheme.nil?
      uri = URI.join(page.uri.to_s, href) if relative

      unless uri.host == site_hostname
        logger.debug "Ignoring non #{site_hostname} link #{href} on #{page.uri.to_s}"
        return
      end

      unless relative
        logger.warn "Link to non https #{href} from #{page.uri.to_s}" unless uri.scheme == "https"
        uri.scheme = 'https'
      end

      uri.fragment = nil # drop the fragment so the same page is not queued twice
      maybe_handle uri.to_s, :process_govuk_page, :referrer => page.uri.to_s
    rescue URI::Error => ex
      logger.warn "#{ex.class} parsing url #{href} on page #{page.uri.to_s}"
    end

    # Queues +url+ for +handler+ unless it has already been seen, has
    # previously errored, is blacklisted by the indexer or carries a query
    # string.
    #
    # data - Hash passed on to the handler; :referrer is used for logging.
    def maybe_handle(url, handler, data = {})
      logger.debug "Evaluating link #{url}"
      if @urls.include?(url)
        logger.debug "Skipping seen url #{url}"
        return
      end
      if @http_errors.has_key?(url)
        logger.debug "Skipping previous erroring url #{url}"
        return
      end
      if @indexer.blacklisted_url?(url)
        logger.debug "Skipping blacklisted url #{url}"
        return
      end
      if url.include? '?'
        logger.debug "Skipping querystringed url #{url}"
        return
      end
      logger.debug "Adding url #{url} from #{data[:referrer]}"
      handle url, handler, data
    end

    private

    # True when +error+ is an HTTP response error whose code is retryable.
    def retryable_error?(error)
      error.is_a?(Mechanize::ResponseCodeError) &&
        RETRY_RESP_CODES.include?(error.response_code.to_i)
    end

    # Creates @logger: syslog when options[:syslog] names a facility,
    # otherwise a Logger writing to options[:log_file] or STDOUT. The level
    # comes from options[:log_level] and defaults to INFO.
    def setup_logger(options)
      if options[:syslog]
        # Syslog settings
        #   programname: govuk_mirrorer
        #   options: Syslog::LOG_PID | Syslog::LOG_CONS
        #   facility: from options
        # Syslog::LOG_PID - adds the process number to the message (just after the program name)
        # Syslog::LOG_CONS - writes the message on the console if an error occurs when sending the message
        facility = Syslog.const_get("LOG_#{options[:syslog].upcase}")
        @logger = Syslogger.new('govuk_mirrorer', Syslog::LOG_PID | Syslog::LOG_CONS, facility)
      else
        @logger = Logger.new(options[:log_file] || STDOUT)
      end

      @logger.level =
        if options[:log_level]
          Logger.const_get(options[:log_level].upcase)
        else
          Logger::INFO
        end
    end

    # Logs a failed fetch and remembers the URL so maybe_handle skips it.
    # The backtrace is included unless the error is a Mechanize::Error.
    #
    # attrs - Hash with :url, :error and :data (:handler is accepted but unused).
    def handle_error(attrs)
      msg = "Error #{attrs[:error].inspect} for #{attrs[:url]}, data: #{attrs[:data].inspect}"
      msg << "\n#{attrs[:error].backtrace.join("\n")}" unless attrs[:error].is_a?(Mechanize::Error)
      logger.warn msg
      @http_errors[attrs[:url]] = attrs[:error]
    end

    # Saves to a file in ./hostname/path
    # adds .html for html files
    def save_to_disk(page)
      path = page.extract_filename(true)
      logger.debug "Saving #{page.uri.to_s} to #{path}"
      FileUtils.mkdir_p(File.dirname(path))
      File.open(path, 'wb') do |f|
        f.write page.body
      end
    end

    # Sets the user agent string and the X-Govuk-Mirrorer request header on
    # the agent.
    def setup_agent
      agent.user_agent = USER_AGENT
      agent.request_headers["X-Govuk-Mirrorer"] = "1"
      # Force Mechanize to use the monkey-patched Net::HTTP
      agent.agent.http.reuse_ssl_sessions = false
    end
  end
end
@@ -0,0 +1,94 @@
1
+ require 'gds_api/content_api'
2
+
3
module GovukMirrorer
  # Works out, from the content API artefact list plus some fixed paths,
  # which URLs a crawl starts from and which paths it must not follow.
  class Indexer
    # Artefact formats whose paths are blacklisted instead of crawled.
    FORMATS_TO_503 = %w(
      local_transaction
      smart-answer
      custom-application
      place
      licence
    ).freeze

    # Paths always used as start URLs, in addition to the artefact URLs.
    ADDITIONAL_START_PATHS = %w(
      /
      /designprinciples
      /designprinciples/styleguide
      /designprinciples/performanceframework
      /service-manual
    ).freeze

    # Calendars currently register as custom-application
    WHITELIST_PATHS = %w(
      /bank-holidays
      /gwyliau-banc
      /when-do-the-clocks-change
    ).freeze

    # Paths always blacklisted, whatever the artefact list says.
    ADDITIONAL_BLACKLIST_PATHS = %w(
      /trade-tariff
      /licence-finder
      /business-finance-support-finder
      /government/uploads
      /apply-for-a-licence
      /search
    ).freeze

    BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS = %w(
      /government/announcements.atom
      /government/publications.atom
    ).freeze

    attr_reader :all_start_urls, :blacklist_paths

    # root - site root URL String, e.g. "https://www.gov.uk".
    #
    # Seeds both lists from the constants above, then fetches the artefact
    # list and sorts every artefact into one of them.
    def initialize(root)
      @root = root
      @api_endpoint = @root + '/api/artefacts.json'
      @all_start_urls = ADDITIONAL_START_PATHS.map { |start_path| @root + start_path }
      @blacklist_paths = ADDITIONAL_BLACKLIST_PATHS + BLACKLISTED_ATOM_FEEDS_USED_FOR_EMAIL_ALERTS
      process_artefacts
    end

    # Returns true when the leading path segments of +url+ equal the segments
    # of any blacklisted path, so "/search/x" matches "/search" but
    # "/searching" does not. URLs without a path are never blacklisted.
    def blacklisted_url?(url)
      url_path = URI.parse(url).path
      return false if url_path.nil? # e.g. mailto: links...

      candidate = path_segments(url_path)
      @blacklist_paths.any? do |blacklist_path|
        prefix = path_segments(blacklist_path)
        candidate[0..(prefix.length - 1)] == prefix
      end
    end

    private

    # Splits a path into its segments, ignoring one leading slash.
    def path_segments(path)
      path.sub(%r{\A/}, '').split('/')
    end

    # Sorts each artefact into the blacklist or the start URLs. A whitelisted
    # path is always crawled; otherwise the artefact format decides.
    def process_artefacts
      artefacts.each do |artefact|
        artefact_path = URI.parse(artefact.web_url).path
        if !WHITELIST_PATHS.include?(artefact_path) && FORMATS_TO_503.include?(artefact.format)
          @blacklist_paths << artefact_path
        else
          @all_start_urls << artefact.web_url
        end
      end
    end

    # Fetches all artefacts from the content API (memoised in @artefacts),
    # timing the call through statsd. An HTTP error response or timeout is
    # retried once after a one second pause; a second failure is re-raised.
    def artefacts
      retried = false
      @artefacts ||= begin
        content_api = GdsApi::ContentApi.new("#{@root}/api", :timeout => 10)
        GovukMirrorer.statsd.time("govuk.app.mirrorer.artefacts_duration") do
          content_api.artefacts.with_subsequent_pages.to_a
        end
      rescue GdsApi::HTTPErrorResponse, GdsApi::TimedOutException
        raise if retried

        retried = true
        sleep 1
        retry
      end
    end
  end
end