govuk_seed_crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
+ data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
5
+ SHA512:
6
+ metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
+ data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in govuk_seed_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ (c) 2014 Crown copyright
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # GOV.UK: Seed the Crawler
2
+
3
+ Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'govuk_seed_crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install govuk_seed_crawler
18
+
19
+ ## Usage
20
+
21
+ To run with the RabbitMQ connection defaults:
22
+
23
+ ```bash
24
+ bundle exec seed-crawler https://www.gov.uk/
25
+ ```
26
+
27
+ Run with `--help` to see a list of options:
28
+
29
+ ```bash
30
+ bundle exec seed-crawler --help
31
+ ```
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( http://github.com/{my-github-username}/govuk_seed_crawler/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
+ require 'gem_publisher'
2
+ require 'rspec/core/rake_task'
3
+
4
# Unit specs: every *_spec.rb file below spec/govuk_seed_crawler.
RSpec::Core::RakeTask.new(:spec) do |unit|
  unit.pattern = FileList['spec/govuk_seed_crawler/**/*_spec.rb']
end

# Integration specs: every *_spec.rb file below spec/integration.
RSpec::Core::RakeTask.new(:integration) do |integration|
  integration.pattern = FileList['spec/integration/**/*_spec.rb']
end

# A bare `rake` runs the unit specs only.
task default: :spec

desc "Publish gem to RubyGems"
task :publish_gem do
  # publish_if_updated's result is falsy when there was nothing to report.
  published = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
  puts "Published #{published}" if published
end
data/bin/seed-crawler ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'govuk_seed_crawler'
4
+
5
# Build the runner from the command-line arguments, then hand over to it.
runner = GovukSeedCrawler::CLIRunner.new(ARGV)
runner.run
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'govuk_seed_crawler/version'
5
+
6
# Gem metadata, packaged file list and dependencies for govuk_seed_crawler.
Gem::Specification.new do |spec|
  spec.name          = "govuk_seed_crawler"
  spec.version       = GovukSeedCrawler::VERSION
  spec.authors       = ["Matt Bostock"]
  spec.email         = ["matt.bostock@digital.cabinet-office.gov.uk"]
  spec.summary       = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
  # The previous value used the host `github.gds`, which is not a public
  # hostname, so the link was dead for anyone installing the released gem.
  # NOTE(review): confirm this is the canonical public repository.
  spec.homepage      = "https://github.com/alphagov/govuk_seed_crawler"
  spec.license       = "MIT"

  # The package contains exactly the files tracked by git; executables and
  # test files are derived from that list by path prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency "bunny", "~> 1.3"
  spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
  spec.add_runtime_dependency "slop", "~> 3.6.0"

  spec.add_development_dependency "gem_publisher", "~> 1.3"
  spec.add_development_dependency "pry"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "rspec-mocks", "~> 3.0"
  spec.add_development_dependency "webmock", "~> 1.18.0"
end
@@ -0,0 +1,18 @@
1
#!/usr/bin/env bash
# Branch build: runs jenkins-tests.sh and reports pending/success/failure
# for the commit through gh-status.
set -e

# Create the virtualenv only when it does not already provide pip.
if [ ! -x .venv/bin/pip ]; then
  virtualenv .venv
fi
source .venv/bin/activate

pip install -q ghtools

REPO="gds:gds/govuk_seed_crawler"

# report STATE DESCRIPTION
# Sends one status for $GIT_COMMIT with a link to $BUILD_URL; output is discarded.
report() {
  gh-status "$REPO" "$GIT_COMMIT" "$1" -d "$2" -u "$BUILD_URL" >/dev/null
}

report pending "\"Build #${BUILD_NUMBER} is running on Jenkins\""

if ./jenkins-tests.sh; then
  report success "\"Build #${BUILD_NUMBER} succeeded on Jenkins\""
  exit 0
fi

report failure "\"Build #${BUILD_NUMBER} failed on Jenkins\""
exit 1
data/jenkins-tests.sh ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env bash
# Installs the bundle into a per-job directory and runs the default rake
# task followed by the integration task. Stops at the first failing command.
set -o errexit

# Remove any existing lockfile before installing.
rm -f Gemfile.lock
bundle install --path "$HOME/bundles/$JOB_NAME"

bundle exec rake
bundle exec rake integration
data/jenkins.sh ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env bash
# Runs the test script and, only if it succeeds, the publish_gem rake task.
set -o errexit

./jenkins-tests.sh && bundle exec rake publish_gem
@@ -0,0 +1,17 @@
1
require 'logger'

require 'govuk_seed_crawler/amqp_client'
require 'govuk_seed_crawler/cli_parser'
require 'govuk_seed_crawler/cli_runner'
require 'govuk_seed_crawler/indexer'
require 'govuk_seed_crawler/seeder'
require 'govuk_seed_crawler/version'
7
+
8
module GovukSeedCrawler
  # Returns the logger shared by the whole gem.
  #
  # The logger is created on first use, writes to STDOUT and starts at
  # Logger::INFO. Later calls return the same instance, so a caller can
  # change its level in place and every other user sees the change.
  #
  # @return [Logger]
  def self.logger
    @logger ||= Logger.new(STDOUT).tap { |log| log.level = Logger::INFO }
  end
end
@@ -0,0 +1,28 @@
1
+ require 'bunny'
2
+
3
module GovukSeedCrawler
  # Small wrapper around a Bunny connection that publishes messages to
  # durable topic exchanges.
  class AmqpClient
    attr_reader :channel

    # Starts a connection and opens a channel on it.
    #
    # @param connection_options [Hash] passed unchanged to Bunny.new
    def initialize(connection_options = {})
      @conn = Bunny.new(connection_options)
      @conn.start
      @channel = @conn.create_channel
    end

    # Closes the underlying connection.
    def close
      @conn.close
    end

    # Publishes +body+ to the durable topic exchange named +exchange+,
    # using +topic+ as the routing key.
    #
    # @param exchange [String] exchange name
    # @param topic [String] routing key
    # @param body [String] message payload
    # @return the value returned by the exchange's #publish
    # @raise [RuntimeError] if any argument is nil
    def publish(exchange, topic, body)
      raise "Exchange cannot be nil" if exchange.nil?
      raise "Topic cannot be nil" if topic.nil?
      raise "Message body cannot be nil" if body.nil?

      GovukSeedCrawler.logger.debug("Publishing '#{body}' to topic '#{topic}'")

      topic_exchange(exchange).publish(body, :routing_key => topic)
    end

    private

    # Looks the exchange up once per name and reuses it afterwards, so the
    # channel is not asked for the same exchange again for every message.
    def topic_exchange(name)
      @exchanges ||= {}
      @exchanges[name] ||= @channel.topic(name, :durable => true)
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'slop'
2
+
3
module GovukSeedCrawler
  # Raised for invalid command-line usage. Carries the help text so the
  # caller can print it next to the error message.
  class CLIException < StandardError
    attr_reader :help

    # @param message [String] what was wrong with the arguments
    # @param help [String] usage text to show to the user
    def initialize(message, help)
      super(message)
      @help = help
    end
  end

  # Parses the seed-crawler command line into an options hash and a site root.
  class CLIParser
    # Values used for every option that is not given on the command line.
    DEFAULTS = {
      :exchange => "govuk_crawler_exchange",
      :help => nil,
      :host => "localhost",
      :password => "guest",
      :port => "5672",
      :quiet => false,
      :topic => "#",
      :username => "guest",
      :verbose => false,
      :version => nil,
      :vhost => "/"
    }.freeze

    # Environment variable that, when set, overrides the AMQP password.
    ENV_AMQP_PASS_KEY = "GOVUK_CRAWLER_AMQP_PASS".freeze

    # @param argv_array [Array<String>] raw arguments; option flags are
    #   removed from this array in place when it is parsed
    def initialize(argv_array)
      @argv_array = argv_array
    end

    # Runs Slop over the arguments and returns the resulting Slop object.
    # `--version` prints the version and exits with status 0.
    def options
      Slop.parse!(@argv_array, :help => true) do
        banner <<-EOS
Usage: #{$PROGRAM_NAME} site_root [options]

Seeds an AMQP topic exchange with messages, each containing a URL, for the GOV.UK Crawler Worker
to consume:

https://github.com/alphagov/govuk_crawler_worker

The AMQP password can also be set as an environment variable and will be read from
`#{ENV_AMQP_PASS_KEY}`. If both the environment variable and command-line option for password
are set, the environment variable takes precedence.
        EOS

        on :version, "Display version and exit" do
          puts "Version: #{GovukSeedCrawler::VERSION}"
          exit 0
        end

        on :host=, "AMQP host to publish to", default: DEFAULTS[:host]
        on :port=, "AMQP port", default: DEFAULTS[:port]
        on :username=, "AMQP username", default: DEFAULTS[:username]
        on :password=, "AMQP password", default: DEFAULTS[:password]
        on :exchange=, "AMQP exchange", default: DEFAULTS[:exchange]
        on :topic=, "AMQP topic", default: DEFAULTS[:topic]
        on :vhost=, "AMQP vhost", default: DEFAULTS[:vhost]

        on :quiet, "Quiet output", default: DEFAULTS[:quiet]
        on :verbose, "Verbose output", default: DEFAULTS[:verbose]
      end
    end

    # Parses the arguments and validates that exactly one positional
    # argument (the site root) is left over.
    #
    # @return [Array(Hash, String)] the options hash and the site root
    # @raise [CLIException] if no site root or more than one positional
    #   argument was given
    def parse
      opts = options

      if opts[:version].nil?
        raise CLIException.new("too many arguments provided", opts.help) if @argv_array.size > 1
        raise CLIException.new("site_root must be provided", opts.help) if @argv_array.size != 1
      end

      options_hash = opts.to_hash
      # The environment variable wins over --password when both are present.
      env_password = ENV[ENV_AMQP_PASS_KEY]
      options_hash[:password] = env_password unless env_password.nil?

      return options_hash, @argv_array.first
    end
  end
end
@@ -0,0 +1,29 @@
1
module GovukSeedCrawler
  # Command-line front end: parses the arguments, adjusts the log level and
  # hands the result to the Seeder.
  class CLIRunner
    # Parses +argv_array+. When parsing raises a CLIException the message
    # and the help text are printed and the process exits with status 2.
    def initialize(argv_array)
      @options, @site_root = parse_or_exit(argv_array)
      set_logging_level(@options)
    end

    # Seeds the crawler for the parsed site root with the parsed options.
    def run
      Seeder.seed(@site_root, @options)
    end

    private

    def parse_or_exit(argv_array)
      CLIParser.new(argv_array).parse
    rescue CLIException => error
      puts error.message
      puts error.help
      exit 2
    end

    # --verbose selects DEBUG and is checked before --quiet, which selects
    # ERROR; with neither flag the logger is left untouched.
    def set_logging_level(cli_options)
      level =
        if cli_options[:verbose]
          Logger::DEBUG
        elsif cli_options[:quiet]
          Logger::ERROR
        end

      GovukSeedCrawler.logger.level = level if level
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'govuk_mirrorer/indexer'
2
+ require 'govuk_mirrorer/statsd'
3
+
4
module GovukSeedCrawler
  # Collects the start URLs for a site through GovukMirrorer::Indexer and
  # exposes them as #urls.
  class Indexer
    attr_reader :urls

    # @param site_root [String] root URL handed to GovukMirrorer::Indexer
    # @raise [RuntimeError] if site_root is nil or false
    def initialize(site_root)
      raise "No site_root defined" unless site_root

      log = GovukSeedCrawler.logger
      log.info("Retrieving list of URLs for #{site_root}")

      @urls = GovukMirrorer::Indexer.new(site_root).all_start_urls

      log.info("Found #{@urls.count} URLs")
    end
  end
end
@@ -0,0 +1,16 @@
1
module GovukSeedCrawler
  # Publishes every URL found for a site to an AMQP topic exchange.
  class Seeder
    # Connects to AMQP, collects the URLs for +site_root+ and publishes each
    # one to options[:exchange] with options[:topic] as the routing key.
    #
    # The connection is closed in an `ensure`, so it is released even when
    # indexing or publishing raises; the error itself still propagates.
    #
    # @param site_root [String] root URL passed to Indexer
    # @param options [Hash] passed to AmqpClient.new; :exchange and :topic
    #   are also read here
    def self.seed(site_root, options = {})
      amqp_client = AmqpClient.new(options)

      begin
        urls = Indexer.new(site_root).urls

        urls.each do |url|
          amqp_client.publish(options[:exchange], options[:topic], url)
        end

        GovukSeedCrawler.logger.info("Published #{urls.count} URLs to topic '#{options[:topic]}'")
      ensure
        amqp_client.close
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module GovukSeedCrawler
  # Gem version; frozen so the constant cannot be mutated in place.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
describe GovukSeedCrawler::AmqpClient do
  subject { GovukSeedCrawler::AmqpClient.new(options) }

  let(:exchange) { "govuk_seed_crawler_spec_exchange" }

  # Connection settings; each can be overridden through the environment.
  let(:options) do
    {
      host: ENV.fetch("AMQP_HOST", "localhost"),
      user: ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
      pass: ENV.fetch("AMQP_PASS", "govuk_seed_crawler"),
    }
  end

  it "responds to #channel" do
    expect(subject).to respond_to(:channel)
  end

  it "responds to #close" do
    expect(subject).to respond_to(:close)
  end

  it "closes the connection to the AMQP server" do
    fake_connection = double(:mock_bunny, start: true, create_channel: true, close: true)
    allow(Bunny).to receive(:new).and_return(fake_connection)
    expect(fake_connection).to receive(:close).once

    subject.close
  end

  context "#publish" do
    context "error handling" do
      it "raises an exception if exchange is nil" do
        expect { subject.publish(nil, "#", "some body") }
          .to raise_exception(RuntimeError, "Exchange cannot be nil")
      end

      it "raises an exception if topic is nil" do
        expect { subject.publish(exchange, nil, "some body") }
          .to raise_exception(RuntimeError, "Topic cannot be nil")
      end

      it "raises an exception if body is nil" do
        expect { subject.publish(exchange, "#", nil) }
          .to raise_exception(RuntimeError, "Message body cannot be nil")
      end
    end

    it "allows publishing against an exchange" do
      published = subject.publish(exchange, "#", "some body")
      expect(published).to_not be_nil
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'spec_helper'
2
+
3
describe GovukSeedCrawler::CLIParser do
  # Builds a parser over a fresh array holding the given arguments.
  def parser_for(*argv)
    GovukSeedCrawler::CLIParser.new(argv)
  end

  let(:defaults) { GovukSeedCrawler::CLIParser::DEFAULTS }

  it "requires the site_root to be provided" do
    expect { parser_for.parse }
      .to raise_exception(GovukSeedCrawler::CLIException, "site_root must be provided")
  end

  it "provides the defaults when just given the site_root" do
    parsed_options, site_root = parser_for("https://www.example.com").parse

    expect(parsed_options).to eq(defaults)
    expect(site_root).to eq("https://www.example.com")
  end

  it "should tell us when we've given too many arguments" do
    expect { parser_for("a", "b").parse }
      .to raise_exception(GovukSeedCrawler::CLIException, "too many arguments provided")
  end

  it "should nest the help message in with any CLIExceptions we raise" do
    expect { parser_for("a", "b").parse }
      .to raise_exception(GovukSeedCrawler::CLIException) { |error|
        expect(error.help).to include("Usage: ")
      }
  end

  describe "catching STDOUT" do
    it "shows the help banner when provided -h" do
      # -h exits early with SystemExit, so take the expected help text from
      # a parser that was given valid arguments.
      reference = parser_for("http://www.foo.com/").options

      temp_stdout do |caught_stdout|
        expect { parser_for("-h").parse }
          .to raise_exception(SystemExit) { |error| expect(error.status).to eq(0) }

        expect(caught_stdout.strip).to eq(reference.help)
      end
    end

    it "should show the version number and exit" do
      temp_stdout do |caught_stdout|
        expect { parser_for("--version").parse }
          .to raise_exception(SystemExit) { |error| expect(error.status).to eq(0) }

        expect(caught_stdout.strip).to eq("Version: #{GovukSeedCrawler::VERSION}")
      end
    end
  end

  describe "passing in valid arguments" do
    let(:arguments) do
      %w[
        https://www.override.com/
        --host rabbitmq.some.custom.vhost
        --port 4567
        --username foo
        --password bar
        --exchange some_custom_exchange
        --topic some_custom_topic
        --vhost a_vhost
        --verbose
      ]
    end

    it "should override all of the default arguments that we're providing" do
      overridden = {
        host: "rabbitmq.some.custom.vhost",
        port: "4567",
        username: "foo",
        password: "bar",
        exchange: "some_custom_exchange",
        topic: "some_custom_topic",
        help: nil,
        quiet: false,
        verbose: true,
        version: nil,
        vhost: "a_vhost"
      }

      parsed_options, _ = GovukSeedCrawler::CLIParser.new(arguments).parse
      expect(parsed_options).to eq(overridden)
    end

    it "should set the --quiet value" do
      parsed_options, _ = parser_for("foo.com", "--quiet").parse
      expect(parsed_options).to eq(defaults.merge(quiet: true))
    end

    describe "reading the AMQP password from an environment variable" do
      let(:env_key) { GovukSeedCrawler::CLIParser::ENV_AMQP_PASS_KEY }

      # Clear the variable again so it cannot leak into other examples.
      after { ENV[env_key] = nil }

      it "sets the password if set using an environment variable" do
        ENV[env_key] = "foobar"

        parsed_options, _ = parser_for("http://www.example.com").parse
        expect(parsed_options).to include(password: "foobar")
      end

      it "picks the environment variable over the parameter if both are set" do
        ENV[env_key] = "bar"

        parsed_options, _ = parser_for("http://www.example.com", "--password", "foo").parse
        expect(parsed_options).to include(password: "bar")
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'spec_helper'
2
+
3
describe GovukSeedCrawler::CLIRunner do
  describe "printing the version" do
    it "should not try to connect to an AMQP server" do
      expect(Bunny).not_to receive(:new)

      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["--version"]).run
        }.to raise_exception(SystemExit) { |exit|
          expect(exit.status).to eq(0)
        }

        expect(caught_stdout.strip).to eq("Version: #{GovukSeedCrawler::VERSION}")
      end
    end
  end

  # The description used to say "status 1", but the examples assert status 2.
  describe "catching any CLIException objects and exiting with a status 2" do
    it "prints to STDOUT for too many arguments" do
      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["a", "b"])
        }.to raise_exception(SystemExit) { |exit|
          expect(exit.status).to eq(2)
        }

        expect(caught_stdout.strip).to include("too many arguments provided")
      end
    end

    it "prints to STDOUT when site_root not set" do
      temp_stdout do |caught_stdout|
        expect {
          GovukSeedCrawler::CLIRunner.new(["--verbose"])
        }.to raise_exception(SystemExit) { |exit|
          expect(exit.status).to eq(2)
        }

        expect(caught_stdout.strip).to include("site_root must be provided")
      end
    end
  end

  describe "setting the logging level" do
    # The logger is shared, so reset it before each example.
    before do
      GovukSeedCrawler.logger.level = Logger::INFO
    end

    it "defaults to INFO" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::INFO)
    end

    it "sets to ERROR for quiet" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com", "--quiet"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::ERROR)
    end

    it "sets to DEBUG for verbose" do
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com", "--verbose"])
      expect(GovukSeedCrawler.logger.level).to eq(Logger::DEBUG)
    end
  end

  describe "#run" do
    it "passes all options through to seed" do
      expect(GovukSeedCrawler::Seeder).to receive(:seed).
        with("http://www.example.com", GovukSeedCrawler::CLIParser::DEFAULTS).once
      GovukSeedCrawler::CLIRunner.new(["http://www.example.com"]).run
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
describe GovukSeedCrawler::Indexer do
  let(:site_root) { 'https://example.com/' }

  subject { GovukSeedCrawler::Indexer.new(site_root) }

  context "under normal usage" do
    # Stands in for GovukMirrorer::Indexer and reports no start URLs.
    let(:mock_indexer) { double(:mock_indexer, all_start_urls: []) }

    it "responds to Indexer#urls" do
      allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)

      expect(subject).to respond_to(:urls)
    end

    it "calls GovukMirrorer::Indexer with the site root" do
      expect(GovukMirrorer::Indexer).to receive(:new)
        .with(site_root)
        .and_return(mock_indexer)

      subject
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
describe GovukSeedCrawler::Seeder do
  subject { GovukSeedCrawler::Seeder.seed(root_url, options) }

  let(:exchange) { "seeder_test_exchange" }
  let(:topic)    { "#" }
  let(:root_url) { "https://www.example.com" }
  let(:options)  { { exchange: exchange, topic: topic } }

  let(:urls) do
    %w[
      https://example.com/foo
      https://example.com/bar
      https://example.com/baz
    ]
  end

  # Doubles replacing the indexer and the AMQP client for every example.
  let(:mock_get_urls)    { double(:mock_get_urls, urls: true) }
  let(:mock_amqp_client) { double(:mock_amqp_client, close: true) }

  before(:each) do
    allow(mock_get_urls).to receive(:urls).and_return(urls)
    allow(GovukSeedCrawler::Indexer).to receive(:new).with(root_url).and_return(mock_get_urls)
    allow(GovukSeedCrawler::AmqpClient).to receive(:new).with(options).and_return(mock_amqp_client)
  end

  context "under normal usage" do
    it "publishes urls to the queue" do
      urls.each do |url|
        expect(mock_amqp_client).to receive(:publish).with(exchange, topic, url)
      end

      subject
    end

    it "closes the connection when done" do
      allow(mock_amqp_client).to receive(:publish)
      expect(mock_amqp_client).to receive(:close)

      subject
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'json'
2
+ require 'spec_helper'
3
+
4
describe GovukSeedCrawler do
  # Stubs the artefacts API with a single page holding +count+ copies of
  # the same announcement.
  def stub_api_artefacts(count)
    artefact = {
      "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
      "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
      "title" => "Faster review of support for Renewable electricity to provide investor certainty",
      "format" => "announcement"
    }
    results = Array.new(count) { artefact }

    payload = {
      "_response_info" => { "status" => "ok", "links" => [] },
      "total" => results.size,
      "start_index" => 1,
      "page_size" => 100,
      "current_page" => 1,
      "pages" => 1,
      "results" => results
    }

    stub_request(:get, "https://www.gov.uk//api/artefacts.json")
      .to_return(:status => 200, :body => payload.to_json, :headers => {})
  end

  subject { GovukSeedCrawler::Seeder.seed(site_root, options) }

  let(:vhost)         { "/" }
  let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
  let(:queue_name)    { "govuk_seed_crawler_integration_queue" }
  let(:topic)         { "#" }
  let(:site_root)     { "https://www.gov.uk/" }

  let(:options) do
    {
      host: ENV.fetch("AMQP_HOST", "localhost"),
      user: ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
      pass: ENV.fetch("AMQP_PASS", "govuk_seed_crawler"),
      exchange: exchange_name,
      topic: topic
    }
  end

  let(:rabbitmq_client) { GovukSeedCrawler::AmqpClient.new(options) }

  # Bind a queue to the exchange so the published messages can be counted.
  before(:each) do
    channel = rabbitmq_client.channel
    @exchange = channel.topic(exchange_name, :durable => true)
    @queue = channel.queue(queue_name)
    @queue.bind(@exchange, :routing_key => topic)
  end

  after(:each) do
    @queue.unbind(@exchange)
    @queue.delete
    @exchange.delete
    rabbitmq_client.close
  end

  it "publishes URLs it finds to an AMQP topic exchange" do
    stub_api_artefacts(10)
    subject

    # There's an extra 5 URLs from the Indexer class that are hard-coded.
    expect(@queue.message_count).to be(15)
  end
end
@@ -0,0 +1,39 @@
1
+ require 'govuk_seed_crawler'
2
+ require 'webmock/rspec'
3
+
4
RSpec.configure do |rspec|
  # Examples run in random order. Kernel's generator is seeded with the
  # same value, so passing `--seed` with the seed of a failing run
  # reproduces that run deterministically.
  rspec.order = :random
  Kernel.srand rspec.seed

  # Only the non-monkey-patching `expect` syntax is enabled for expectations.
  # See http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  rspec.expect_with(:rspec) do |expectations|
    expectations.syntax = :expect
  end

  rspec.mock_with(:rspec) do |mocks|
    # Likewise for mocks.
    # See http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
    mocks.syntax = :expect

    # Refuse to mock or stub a method that the real object does not have.
    mocks.verify_partial_doubles = true
  end
end

WebMock.disable_net_connect!
33
+
34
+ def temp_stdout
35
+ $stdout = StringIO.new
36
+ yield $stdout.string
37
+ ensure
38
+ $stdout = STDOUT
39
+ end
metadata ADDED
@@ -0,0 +1,205 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: govuk_seed_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Matt Bostock
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bunny
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: govuk_mirrorer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.3.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.3.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: slop
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 3.6.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 3.6.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: gem_publisher
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec-mocks
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: webmock
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: 1.18.0
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: 1.18.0
139
+ description:
140
+ email:
141
+ - matt.bostock@digital.cabinet-office.gov.uk
142
+ executables:
143
+ - seed-crawler
144
+ extensions: []
145
+ extra_rdoc_files: []
146
+ files:
147
+ - ".gitignore"
148
+ - ".rspec"
149
+ - ".ruby-version"
150
+ - Gemfile
151
+ - LICENSE.txt
152
+ - README.md
153
+ - Rakefile
154
+ - bin/seed-crawler
155
+ - govuk_seed_crawler.gemspec
156
+ - jenkins-branches.sh
157
+ - jenkins-tests.sh
158
+ - jenkins.sh
159
+ - lib/govuk_seed_crawler.rb
160
+ - lib/govuk_seed_crawler/amqp_client.rb
161
+ - lib/govuk_seed_crawler/cli_parser.rb
162
+ - lib/govuk_seed_crawler/cli_runner.rb
163
+ - lib/govuk_seed_crawler/indexer.rb
164
+ - lib/govuk_seed_crawler/seeder.rb
165
+ - lib/govuk_seed_crawler/version.rb
166
+ - spec/govuk_seed_crawler/amqp_client_spec.rb
167
+ - spec/govuk_seed_crawler/cli_parser_spec.rb
168
+ - spec/govuk_seed_crawler/cli_runner_spec.rb
169
+ - spec/govuk_seed_crawler/indexer_spec.rb
170
+ - spec/govuk_seed_crawler/seeder_spec.rb
171
+ - spec/integration/govuk_seed_crawler_spec.rb
172
+ - spec/spec_helper.rb
173
+ homepage: https://github.gds/gds/govuk_seed_crawler
174
+ licenses:
175
+ - MIT
176
+ metadata: {}
177
+ post_install_message:
178
+ rdoc_options: []
179
+ require_paths:
180
+ - lib
181
+ required_ruby_version: !ruby/object:Gem::Requirement
182
+ requirements:
183
+ - - ">="
184
+ - !ruby/object:Gem::Version
185
+ version: '0'
186
+ required_rubygems_version: !ruby/object:Gem::Requirement
187
+ requirements:
188
+ - - ">="
189
+ - !ruby/object:Gem::Version
190
+ version: '0'
191
+ requirements: []
192
+ rubyforge_project:
193
+ rubygems_version: 2.2.2
194
+ signing_key:
195
+ specification_version: 4
196
+ summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
197
+ exchange.
198
+ test_files:
199
+ - spec/govuk_seed_crawler/amqp_client_spec.rb
200
+ - spec/govuk_seed_crawler/cli_parser_spec.rb
201
+ - spec/govuk_seed_crawler/cli_runner_spec.rb
202
+ - spec/govuk_seed_crawler/indexer_spec.rb
203
+ - spec/govuk_seed_crawler/seeder_spec.rb
204
+ - spec/integration/govuk_seed_crawler_spec.rb
205
+ - spec/spec_helper.rb