govuk_seed_crawler 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
+ data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
5
+ SHA512:
6
+ metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
+ data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in govuk_seed_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ (c) 2014 Crown copyright
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # GOV.UK: Seed the Crawler
2
+
3
+ Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'govuk_seed_crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install govuk_seed_crawler
18
+
19
+ ## Usage
20
+
21
+ To run with the RabbitMQ connection defaults:
22
+
23
+ ```bash
24
+ bundle exec seed-crawler https://www.gov.uk/
25
+ ```
26
+
27
+ Run with `--help` to see a list of options:
28
+
29
+ ```bash
30
+ bundle exec seed-crawler --help
31
+ ```
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( http://github.com/{my-github-username}/govuk_seed_crawler/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
+ require 'gem_publisher'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec) do |task|
5
+ task.pattern = FileList['spec/govuk_seed_crawler/**/*_spec.rb']
6
+ end
7
+
8
+ RSpec::Core::RakeTask.new(:integration) do |task|
9
+ task.pattern = FileList['spec/integration/**/*_spec.rb']
10
+ end
11
+
12
+ task :default => :spec
13
+
14
+ desc "Publish gem to RubyGems"
15
+ task :publish_gem do |t|
16
+ gem = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
17
+ puts "Published #{gem}" if gem
18
+ end
data/bin/seed-crawler ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'govuk_seed_crawler'
4
+
5
+ GovukSeedCrawler::CLIRunner.new(ARGV).run
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'govuk_seed_crawler/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "govuk_seed_crawler"
8
+ spec.version = GovukSeedCrawler::VERSION
9
+ spec.authors = ["Matt Bostock"]
10
+ spec.email = ["matt.bostock@digital.cabinet-office.gov.uk"]
11
+ spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
12
+ spec.homepage = "https://github.gds/gds/govuk_seed_crawler"
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_runtime_dependency "bunny", "~> 1.3"
21
+ spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
22
+ spec.add_runtime_dependency "slop", "~> 3.6.0"
23
+
24
+ spec.add_development_dependency "gem_publisher", "~> 1.3"
25
+ spec.add_development_dependency "pry"
26
+ spec.add_development_dependency "rake"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
28
+ spec.add_development_dependency "rspec-mocks", "~> 3.0"
29
+ spec.add_development_dependency "webmock", "~> 1.18.0"
30
+ end
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ [ -x .venv/bin/pip ] || virtualenv .venv
5
+ . .venv/bin/activate
6
+
7
+ pip install -q ghtools
8
+
9
+ REPO="gds:gds/govuk_seed_crawler"
10
+ gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
11
+
12
+ if ./jenkins-tests.sh; then
13
+ gh-status "$REPO" "$GIT_COMMIT" success -d "\"Build #${BUILD_NUMBER} succeeded on Jenkins\"" -u "$BUILD_URL" >/dev/null
14
+ exit 0
15
+ else
16
+ gh-status "$REPO" "$GIT_COMMIT" failure -d "\"Build #${BUILD_NUMBER} failed on Jenkins\"" -u "$BUILD_URL" >/dev/null
17
+ exit 1
18
+ fi
data/jenkins-tests.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+ rm -f Gemfile.lock
4
+ bundle install --path "${HOME}/bundles/${JOB_NAME}"
5
+ bundle exec rake
6
+ bundle exec rake integration
data/jenkins.sh ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ ./jenkins-tests.sh
5
+ bundle exec rake publish_gem
@@ -0,0 +1,17 @@
1
+ require 'govuk_seed_crawler/amqp_client'
2
+ require 'govuk_seed_crawler/cli_parser'
3
+ require 'govuk_seed_crawler/cli_runner'
4
+ require 'govuk_seed_crawler/indexer'
5
+ require 'govuk_seed_crawler/seeder'
6
+ require 'govuk_seed_crawler/version'
7
+
8
# Logger is used below; require it here instead of relying on another
# library having loaded it first.
require 'logger'

# Top-level namespace for the seed crawler gem.
module GovukSeedCrawler
  # Returns the shared logger, creating it on first use.
  #
  # The logger writes to STDOUT and starts at INFO level. The same object is
  # returned on every call, so a level set by one caller is seen by all.
  #
  # @return [Logger] the memoized logger instance
  def self.logger
    @logger ||= Logger.new(STDOUT).tap { |log| log.level = Logger::INFO }
  end
end
@@ -0,0 +1,28 @@
1
+ require 'bunny'
2
+
3
module GovukSeedCrawler
  # Small wrapper around a Bunny connection that publishes messages to
  # durable topic exchanges.
  class AmqpClient
    attr_reader :channel

    # Opens a connection and a channel.
    #
    # @param connection_options [Hash] options passed straight to Bunny.new
    def initialize(connection_options = {})
      @conn = Bunny.new(connection_options)
      @conn.start
      @channel = @conn.create_channel
      @exchanges = {}
    end

    # Closes the underlying connection.
    def close
      @conn.close
    end

    # Publishes a message to a durable topic exchange.
    #
    # @param exchange [String] name of the topic exchange
    # @param topic [String] routing key for the message
    # @param body [String] message payload
    # @return [Object] whatever the exchange's #publish returns
    # @raise [RuntimeError] if exchange, topic or body is nil
    def publish(exchange, topic, body)
      raise "Exchange cannot be nil" if exchange.nil?
      raise "Topic cannot be nil" if topic.nil?
      raise "Message body cannot be nil" if body.nil?

      GovukSeedCrawler.logger.debug("Publishing '#{body}' to topic '#{topic}'")

      topic_exchange(exchange).publish(body, :routing_key => topic)
    end

    private

    # Looks up (and remembers) the durable topic exchange with the given name,
    # so the channel is only asked for it once per name rather than once per
    # published message.
    # NOTE(review): depending on the Bunny version, each Channel#topic call may
    # re-declare the exchange on the broker - confirm against the pinned version.
    def topic_exchange(name)
      @exchanges ||= {}
      @exchanges[name] ||= @channel.topic(name, :durable => true)
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'slop'
2
+
3
module GovukSeedCrawler
  # Raised for invalid command-line usage. Carries the parser's help text so
  # the caller can print it next to the error message.
  class CLIException < StandardError
    attr_reader :help

    # @param message [String] what was wrong with the arguments
    # @param help [String] usage text to show the user
    def initialize(message, help)
      @help = help
      super(message)
    end
  end

  # Parses the seed-crawler command line into an options hash and a site root.
  class CLIParser
    # Environment variable that, when set, overrides the --password option.
    ENV_AMQP_PASS_KEY = "GOVUK_CRAWLER_AMQP_PASS".freeze

    # Option values used when nothing is supplied on the command line.
    DEFAULTS = {
      exchange: "govuk_crawler_exchange",
      help: nil,
      host: "localhost",
      password: "guest",
      port: "5672",
      quiet: false,
      topic: "#",
      username: "guest",
      verbose: false,
      version: nil,
      vhost: "/"
    }.freeze

    # @param argv_array [Array<String>] raw command-line arguments; recognised
    #   options are removed from this array when it is parsed
    def initialize(argv_array)
      @argv_array = argv_array
    end

    # Runs Slop over the stored arguments and returns the Slop object.
    # `--version` prints the version and exits with status 0.
    def options
      Slop.parse!(@argv_array, help: true) do
        banner <<-BANNER
Usage: #{$PROGRAM_NAME} site_root [options]

Seeds an AMQP topic exchange with messages, each containing a URL, for the GOV.UK Crawler Worker
to consume:

https://github.com/alphagov/govuk_crawler_worker

The AMQP password can also be set as an environment variable and will be read from
`#{ENV_AMQP_PASS_KEY}`. If both the environment variable and command-line option for password
are set, the environment variable will take higher precedent.
BANNER

        on :version, "Display version and exit" do
          puts "Version: #{GovukSeedCrawler::VERSION}"
          exit 0
        end

        # Options that take a value, declared in the order they appear in --help.
        {
          host: "AMQP host to publish to",
          port: "AMQP port",
          username: "AMQP username",
          password: "AMQP password",
          exchange: "AMQP exchange",
          topic: "AMQP topic",
          vhost: "AMQP vhost"
        }.each do |name, description|
          on :"#{name}=", description, default: DEFAULTS[name]
        end

        # Boolean flags.
        on :quiet, "Quiet output", default: DEFAULTS[:quiet]
        on :verbose, "Verbose output", default: DEFAULTS[:verbose]
      end
    end

    # Parses the arguments and validates that exactly one site root remains.
    #
    # @return [Array(Hash, String)] the options hash and the site root
    # @raise [CLIException] if no site root, or more than one argument, is left
    def parse
      parsed = options
      check_argument_count(parsed) if parsed[:version].nil?

      settings = parsed.to_hash
      # The environment variable wins over any --password given on the command line.
      env_password = ENV[ENV_AMQP_PASS_KEY]
      settings[:password] = env_password unless env_password.nil?

      [settings, @argv_array.first]
    end

    private

    # Raises unless exactly one positional argument (the site root) remains.
    def check_argument_count(parsed)
      remaining = @argv_array.size
      raise CLIException.new("too many arguments provided", parsed.help) if remaining > 1
      raise CLIException.new("site_root must be provided", parsed.help) if remaining != 1
    end
  end
end
@@ -0,0 +1,29 @@
1
module GovukSeedCrawler
  # Command-line entry point: parses arguments, configures logging and runs
  # the seeder.
  class CLIRunner
    # Parses the arguments and applies the requested log level. On a usage
    # error the message and help text are printed and the process exits with
    # status 2.
    #
    # @param argv_array [Array<String>] raw command-line arguments
    def initialize(argv_array)
      @options, @site_root = parse_or_exit(argv_array)
      set_logging_level(@options)
    end

    # Seeds the exchange using the parsed site root and options.
    def run
      Seeder.seed(@site_root, @options)
    end

    private

    # Returns [options, site_root], or prints the usage error and exits.
    def parse_or_exit(argv_array)
      CLIParser.new(argv_array).parse
    rescue CLIException => error
      puts error.message
      puts error.help
      exit 2
    end

    # --verbose takes priority over --quiet; with neither, the level is left alone.
    def set_logging_level(cli_options)
      level =
        if cli_options[:verbose]
          Logger::DEBUG
        elsif cli_options[:quiet]
          Logger::ERROR
        end

      GovukSeedCrawler.logger.level = level if level
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'govuk_mirrorer/indexer'
2
+ require 'govuk_mirrorer/statsd'
3
+
4
module GovukSeedCrawler
  # Collects the list of start URLs for a site via GovukMirrorer::Indexer.
  class Indexer
    attr_reader :urls

    # Fetches the URLs immediately; they are then available through #urls.
    #
    # @param site_root [String] root URL handed to GovukMirrorer::Indexer
    # @raise [RuntimeError] if site_root is nil or false
    def initialize(site_root)
      raise "No site_root defined" unless site_root

      log = GovukSeedCrawler.logger
      log.info("Retrieving list of URLs for #{site_root}")

      @urls = GovukMirrorer::Indexer.new(site_root).all_start_urls

      log.info("Found #{@urls.count} URLs")
    end
  end
end
@@ -0,0 +1,16 @@
1
module GovukSeedCrawler
  # Publishes every URL found for a site to an AMQP topic exchange.
  class Seeder
    # Connects to AMQP, indexes the site and publishes one message per URL.
    #
    # The connection is closed even when indexing or publishing raises, so a
    # failed run does not leave it open.
    #
    # @param site_root [String] root URL passed to Indexer
    # @param options [Hash] passed to AmqpClient.new; :exchange and :topic
    #   select where the URLs are published
    # @raise whatever Indexer or AmqpClient#publish raise (after closing)
    def self.seed(site_root, options = {})
      amqp_client = AmqpClient.new(options)

      begin
        urls = Indexer.new(site_root).urls

        urls.each do |url|
          amqp_client.publish(options[:exchange], options[:topic], url)
        end

        GovukSeedCrawler.logger.info("Published #{urls.count} URLs to topic '#{options[:topic]}'")
      ensure
        amqp_client.close
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module GovukSeedCrawler
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukSeedCrawler::AmqpClient do
4
+ let(:exchange) { "govuk_seed_crawler_spec_exchange" }
5
+ let(:options) {{
6
+ :host => ENV.fetch("AMQP_HOST", "localhost"),
7
+ :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
8
+ :pass => ENV.fetch("AMQP_PASS", "govuk_seed_crawler"),
9
+ }}
10
+ subject { GovukSeedCrawler::AmqpClient.new(options) }
11
+
12
+ it "responds to #channel" do
13
+ expect(subject).to respond_to(:channel)
14
+ end
15
+
16
+ it "responds to #close" do
17
+ expect(subject).to respond_to(:close)
18
+ end
19
+
20
+ it "closes the connection to the AMQP server" do
21
+ mock_bunny = double(:mock_bunny,
22
+ :start => true, :create_channel => true, :close => true)
23
+ allow(Bunny).to receive(:new).and_return(mock_bunny)
24
+ expect(mock_bunny).to receive(:close).once
25
+
26
+ subject.close
27
+ end
28
+
29
+ context "#publish" do
30
+ context "error handling" do
31
+ it "raises an exception if exchange is nil" do
32
+ expect {
33
+ subject.publish(nil, "#", "some body")
34
+ }.to raise_exception(RuntimeError, "Exchange cannot be nil")
35
+ end
36
+
37
+ it "raises an exception if topic is nil" do
38
+ expect {
39
+ subject.publish(exchange, nil, "some body")
40
+ }.to raise_exception(RuntimeError, "Topic cannot be nil")
41
+ end
42
+
43
+ it "raises an exception if body is nil" do
44
+ expect {
45
+ subject.publish(exchange, "#", nil)
46
+ }.to raise_exception(RuntimeError, "Message body cannot be nil")
47
+ end
48
+ end
49
+
50
+ it "allows publishing against an exchange" do
51
+ expect(subject.publish(exchange, "#", "some body"))
52
+ .to_not be_nil
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,122 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukSeedCrawler::CLIParser do
4
+ it "requires the site_root to be provided" do
5
+ expect {
6
+ GovukSeedCrawler::CLIParser.new([]).parse
7
+ }.to raise_exception(GovukSeedCrawler::CLIException, "site_root must be provided")
8
+ end
9
+
10
+ it "provides the defaults when just given the site_root" do
11
+ options, site_root = GovukSeedCrawler::CLIParser.new(["https://www.example.com"]).parse
12
+
13
+ expect(options).to eq(GovukSeedCrawler::CLIParser::DEFAULTS)
14
+ expect(site_root).to eq("https://www.example.com")
15
+ end
16
+
17
+ it "should tell us when we've given too many arguments" do
18
+ expect {
19
+ GovukSeedCrawler::CLIParser.new(["a", "b"]).parse
20
+ }.to raise_exception(GovukSeedCrawler::CLIException, "too many arguments provided")
21
+ end
22
+
23
+ it "should nest the help message in with any CLIExceptions we raise" do
24
+ expect {
25
+ GovukSeedCrawler::CLIParser.new(["a", "b"]).parse
26
+ }.to raise_exception(GovukSeedCrawler::CLIException) { |e|
27
+ expect(e.help).to include("Usage: ")
28
+ }
29
+ end
30
+
31
+ describe "catching STDOUT" do
32
+ it "shows the help banner when provided -h" do
33
+ # Get a valid options response as help closes early with SystemExit.
34
+ options = GovukSeedCrawler::CLIParser.new(["http://www.foo.com/"]).options
35
+
36
+ temp_stdout do |caught_stdout|
37
+ expect {
38
+ _, _ = GovukSeedCrawler::CLIParser.new(["-h"]).parse
39
+ }.to raise_exception(SystemExit) { |e|
40
+ expect(e.status).to eq(0)
41
+ }
42
+
43
+ expect(caught_stdout.strip).to eq(options.help)
44
+ end
45
+ end
46
+
47
+ it "should show the version number and exit" do
48
+ temp_stdout do |caught_stdout|
49
+ expect {
50
+ _, _ = GovukSeedCrawler::CLIParser.new(["--version"]).parse
51
+ }.to raise_exception(SystemExit) { |e|
52
+ expect(e.status).to eq(0)
53
+ }
54
+
55
+ expect(caught_stdout.strip).to eq("Version: #{GovukSeedCrawler::VERSION}")
56
+ end
57
+ end
58
+ end
59
+
60
+ describe "passing in valid arguments" do
61
+ let(:arguments) {
62
+ [
63
+ "https://www.override.com/",
64
+ "--host rabbitmq.some.custom.vhost",
65
+ "--port 4567",
66
+ "--username foo",
67
+ "--password bar",
68
+ "--exchange some_custom_exchange",
69
+ "--topic some_custom_topic",
70
+ "--vhost a_vhost",
71
+ "--verbose"
72
+ ].join(" ").split(" ")
73
+ }
74
+
75
+ it "should override all of the default arguments that we're providing" do
76
+ overriden = {
77
+ host: "rabbitmq.some.custom.vhost",
78
+ port: "4567",
79
+ username: "foo",
80
+ password: "bar",
81
+ exchange: "some_custom_exchange",
82
+ topic: "some_custom_topic",
83
+ help: nil,
84
+ quiet: false,
85
+ verbose: true,
86
+ version: nil,
87
+ vhost: "a_vhost"
88
+ }
89
+
90
+ expect(GovukSeedCrawler::CLIParser.new(arguments).parse.first).to eq(overriden)
91
+ end
92
+
93
+ it "should set the --quiet value" do
94
+ options, _ = GovukSeedCrawler::CLIParser.new(["foo.com", "--quiet"]).parse
95
+ expect(options).to eq(GovukSeedCrawler::CLIParser::DEFAULTS.merge(quiet: true))
96
+ end
97
+
98
+ describe "reading the AMQP password from an environment variable" do
99
+ def set_amqp_pass(password)
100
+ ENV[GovukSeedCrawler::CLIParser::ENV_AMQP_PASS_KEY] = password
101
+ end
102
+
103
+ after do
104
+ ENV[GovukSeedCrawler::CLIParser::ENV_AMQP_PASS_KEY] = nil
105
+ end
106
+
107
+ it "sets the password if set using an environment variable" do
108
+ set_amqp_pass("foobar")
109
+
110
+ expect(GovukSeedCrawler::CLIParser.new(["http://www.example.com"]).parse.first)
111
+ .to include(password: "foobar")
112
+ end
113
+
114
+ it "picks the environment variable over the parameter if both are set" do
115
+ set_amqp_pass("bar")
116
+
117
+ expect(GovukSeedCrawler::CLIParser.new(["http://www.example.com", "--password", "foo"]).parse.first)
118
+ .to include(password: "bar")
119
+ end
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,74 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the command-line runner: version output, usage errors (exit
# status 2), log-level selection and hand-off to the Seeder.
describe GovukSeedCrawler::CLIRunner do
  describe "printing the version" do
    it "should not try to connect to an AMQP server" do
      expect(Bunny).not_to receive(:new)

      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["--version"]).run
        }.to raise_exception(SystemExit) { |error|
          expect(error.status).to eq(0)
        }

        expect(caught_stdout.strip).to eq("Version: #{GovukSeedCrawler::VERSION}")
      end
    end
  end

  describe "catching any CLIException objects and exiting with a status 2" do
    it "prints to STDOUT for too many arguments" do
      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["a", "b"])
        }.to raise_exception(SystemExit) { |error|
          expect(error.status).to eq(2)
        }

        expect(caught_stdout.strip).to include("too many arguments provided")
      end
    end

    it "prints to STDOUT when site_root not set" do
      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["--verbose"])
        }.to raise_exception(SystemExit) { |error|
          expect(error.status).to eq(2)
        }

        expect(caught_stdout.strip).to include("site_root must be provided")
      end
    end
  end

  describe "setting the logging level" do
    # The logger is shared, so reset it before each example.
    before do
      GovukSeedCrawler.logger.level = Logger::INFO
    end

    it "defaults to INFO" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::INFO)
    end

    it "sets to ERROR for quiet" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com", "--quiet"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::ERROR)
    end

    it "sets to DEBUG for verbose" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com", "--verbose"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::DEBUG)
    end
  end

  describe "#run" do
    it "passes all options through to seed" do
      expect(GovukSeedCrawler::Seeder).to receive(:seed).
        with("http://www.example.com", GovukSeedCrawler::CLIParser::DEFAULTS).once
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com"]).run
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukSeedCrawler::Indexer do
4
+ subject { GovukSeedCrawler::Indexer.new('https://example.com/') }
5
+
6
+ context "under normal usage" do
7
+ let(:mock_indexer) do
8
+ double(:mock_indexer, :all_start_urls => [])
9
+ end
10
+
11
+ it "responds to Indexer#urls" do
12
+ allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)
13
+ expect(subject).to respond_to(:urls)
14
+ end
15
+
16
+ it "calls GovukMirrorer::Indexer with the site root" do
17
+ expect(GovukMirrorer::Indexer).to receive(:new).with('https://example.com/').and_return(mock_indexer)
18
+ subject
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
+ describe GovukSeedCrawler::Seeder do
4
+ let(:exchange) { "seeder_test_exchange" }
5
+ let(:topic) { "#" }
6
+ let(:root_url) { "https://www.example.com" }
7
+
8
+ let(:options) {{
9
+ :exchange => exchange,
10
+ :topic => topic,
11
+ }}
12
+
13
+ let(:mock_get_urls) { double(:mock_get_urls, :urls => true) }
14
+ let(:mock_amqp_client) { double(:mock_amqp_client, :close => true) }
15
+
16
+ let(:urls) do
17
+ [
18
+ "https://example.com/foo",
19
+ "https://example.com/bar",
20
+ "https://example.com/baz",
21
+ ]
22
+ end
23
+
24
+ subject { GovukSeedCrawler::Seeder::seed(root_url, options) }
25
+
26
+ before(:each) do
27
+ allow(GovukSeedCrawler::Indexer).to receive(:new)
28
+ .with(root_url)
29
+ .and_return(mock_get_urls)
30
+ allow(mock_get_urls).to receive(:urls).and_return(urls)
31
+ allow(GovukSeedCrawler::AmqpClient).to receive(:new)
32
+ .with(options).and_return(mock_amqp_client)
33
+ end
34
+
35
+ context "under normal usage" do
36
+ it "publishes urls to the queue" do
37
+ urls.each do |url|
38
+ expect(mock_amqp_client).to receive(:publish)
39
+ .with(exchange, topic, url)
40
+ end
41
+
42
+ subject
43
+ end
44
+
45
+ it "closes the connection when done" do
46
+ allow(mock_amqp_client).to receive(:publish)
47
+ expect(mock_amqp_client).to receive(:close)
48
+ subject
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,66 @@
1
+ require 'json'
2
+ require 'spec_helper'
3
+
4
+ describe GovukSeedCrawler do
5
+ def stub_api_artefacts(count)
6
+ item = {
7
+ "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
8
+ "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
9
+ "title" => "Faster review of support for Renewable electricity to provide investor certainty",
10
+ "format" => "announcement"
11
+ }
12
+ results = count.times.collect { item }
13
+ response = {
14
+ "_response_info" => {
15
+ "status" => "ok",
16
+ "links" => []
17
+ },
18
+ "total" => results.size,
19
+ "start_index" => 1,
20
+ "page_size" => 100,
21
+ "current_page" => 1,
22
+ "pages" => 1,
23
+ "results" => results
24
+ }
25
+
26
+ stub_request(:get, "https://www.gov.uk//api/artefacts.json").
27
+ to_return(:status => 200, :body => response.to_json, :headers => {})
28
+ end
29
+
30
+ let(:vhost) { "/" }
31
+ let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
32
+ let(:queue_name) { "govuk_seed_crawler_integration_queue" }
33
+ let(:topic) { "#" }
34
+ let(:site_root) { "https://www.gov.uk/" }
35
+ let(:options) {{
36
+ :host => ENV.fetch("AMQP_HOST", "localhost"),
37
+ :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
38
+ :pass => ENV.fetch("AMQP_PASS", "govuk_seed_crawler"),
39
+ :exchange => exchange_name,
40
+ :topic => topic
41
+ }}
42
+ let(:rabbitmq_client) { GovukSeedCrawler::AmqpClient.new(options) }
43
+
44
+ subject { GovukSeedCrawler::Seeder::seed(site_root, options) }
45
+
46
+ before(:each) do
47
+ @exchange = rabbitmq_client.channel.topic(exchange_name, :durable => true)
48
+ @queue = rabbitmq_client.channel.queue(queue_name)
49
+ @queue.bind(@exchange, :routing_key => topic)
50
+ end
51
+
52
+ after(:each) do
53
+ @queue.unbind(@exchange)
54
+ @queue.delete
55
+ @exchange.delete
56
+ rabbitmq_client.close
57
+ end
58
+
59
+ it "publishes URLs it finds to an AMQP topic exchange" do
60
+ stub_api_artefacts(10)
61
+ subject
62
+
63
+ # There's an extra 5 URLs from the Indexer class that are hard-coded.
64
+ expect(@queue.message_count).to be(15)
65
+ end
66
+ end
@@ -0,0 +1,39 @@
1
+ require 'govuk_seed_crawler'
2
+ require 'webmock/rspec'
3
+
4
+ RSpec.configure do |config|
5
+ config.order = :random
6
+
7
+ # Seed global randomization in this process using the `--seed` CLI option.
8
+ # Setting this allows you to use `--seed` to deterministically reproduce
9
+ # test failures related to randomization by passing the same `--seed` value
10
+ # as the one that triggered the failure.
11
+ Kernel.srand config.seed
12
+
13
+ config.expect_with :rspec do |expectations|
14
+ # Enable only the newer, non-monkey-patching expect syntax.
15
+ # For more details, see:
16
+ # - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
17
+ expectations.syntax = :expect
18
+ end
19
+
20
+ config.mock_with :rspec do |mocks|
21
+ # Enable only the newer, non-monkey-patching expect syntax.
22
+ # For more details, see:
23
+ # - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
24
+ mocks.syntax = :expect
25
+
26
+ # Prevents you from mocking or stubbing a method that does not exist on
27
+ # a real object. This is generally recommended.
28
+ mocks.verify_partial_doubles = true
29
+ end
30
+ end
31
+
32
+ WebMock.disable_net_connect!
33
+
34
# Redirects $stdout to an in-memory buffer for the duration of the block.
#
# The block receives the buffer's underlying String, which grows as output is
# written, so it can be inspected after the code under test has printed.
# Whatever $stdout was before the call is restored afterwards, even if the
# block raises, so nested or already-redirected output is not clobbered.
#
# @yieldparam captured [String] everything written to $stdout so far
# @return the block's return value
def temp_stdout
  previous_stdout = $stdout
  buffer = StringIO.new
  $stdout = buffer
  yield buffer.string
ensure
  $stdout = previous_stdout
end
metadata ADDED
@@ -0,0 +1,205 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: govuk_seed_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Matt Bostock
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bunny
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: govuk_mirrorer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.3.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: slop
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 3.6.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 3.6.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: gem_publisher
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec-mocks
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: webmock
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: 1.18.0
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: 1.18.0
139
+ description:
140
+ email:
141
+ - matt.bostock@digital.cabinet-office.gov.uk
142
+ executables:
143
+ - seed-crawler
144
+ extensions: []
145
+ extra_rdoc_files: []
146
+ files:
147
+ - ".gitignore"
148
+ - ".rspec"
149
+ - ".ruby-version"
150
+ - Gemfile
151
+ - LICENSE.txt
152
+ - README.md
153
+ - Rakefile
154
+ - bin/seed-crawler
155
+ - govuk_seed_crawler.gemspec
156
+ - jenkins-branches.sh
157
+ - jenkins-tests.sh
158
+ - jenkins.sh
159
+ - lib/govuk_seed_crawler.rb
160
+ - lib/govuk_seed_crawler/amqp_client.rb
161
+ - lib/govuk_seed_crawler/cli_parser.rb
162
+ - lib/govuk_seed_crawler/cli_runner.rb
163
+ - lib/govuk_seed_crawler/indexer.rb
164
+ - lib/govuk_seed_crawler/seeder.rb
165
+ - lib/govuk_seed_crawler/version.rb
166
+ - spec/govuk_seed_crawler/amqp_client_spec.rb
167
+ - spec/govuk_seed_crawler/cli_parser_spec.rb
168
+ - spec/govuk_seed_crawler/cli_runner_spec.rb
169
+ - spec/govuk_seed_crawler/indexer_spec.rb
170
+ - spec/govuk_seed_crawler/seeder_spec.rb
171
+ - spec/integration/govuk_seed_crawler_spec.rb
172
+ - spec/spec_helper.rb
173
+ homepage: https://github.gds/gds/govuk_seed_crawler
174
+ licenses:
175
+ - MIT
176
+ metadata: {}
177
+ post_install_message:
178
+ rdoc_options: []
179
+ require_paths:
180
+ - lib
181
+ required_ruby_version: !ruby/object:Gem::Requirement
182
+ requirements:
183
+ - - ">="
184
+ - !ruby/object:Gem::Version
185
+ version: '0'
186
+ required_rubygems_version: !ruby/object:Gem::Requirement
187
+ requirements:
188
+ - - ">="
189
+ - !ruby/object:Gem::Version
190
+ version: '0'
191
+ requirements: []
192
+ rubyforge_project:
193
+ rubygems_version: 2.2.2
194
+ signing_key:
195
+ specification_version: 4
196
+ summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
197
+ exchange.
198
+ test_files:
199
+ - spec/govuk_seed_crawler/amqp_client_spec.rb
200
+ - spec/govuk_seed_crawler/cli_parser_spec.rb
201
+ - spec/govuk_seed_crawler/cli_runner_spec.rb
202
+ - spec/govuk_seed_crawler/indexer_spec.rb
203
+ - spec/govuk_seed_crawler/seeder_spec.rb
204
+ - spec/integration/govuk_seed_crawler_spec.rb
205
+ - spec/spec_helper.rb