govuk_seed_crawler 1.0.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.ruby-version +1 -1
- data/Jenkinsfile +49 -0
- data/README.md +2 -1
- data/Rakefile +0 -7
- data/govuk_seed_crawler.gemspec +12 -6
- data/lib/govuk_seed_crawler/indexer.rb +4 -4
- data/lib/govuk_seed_crawler/version.rb +1 -1
- data/spec/govuk_seed_crawler/indexer_spec.rb +6 -6
- data/spec/integration/govuk_seed_crawler_spec.rb +18 -25
- metadata +51 -26
- data/jenkins-branches.sh +0 -18
- data/jenkins-tests.sh +0 -6
- data/jenkins.sh +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2ee27868f4df61a72b764044b8d04ad6f6b252a63cd1f365d1056a7d331a85d1
|
4
|
+
data.tar.gz: 514ee036b88544be5a7935dcc2103e07aa7fe82c2acc0841109d15e572a9ebf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d005f87f519187b619e1dbbeabcc441d0ccb0daf652db2d6b515f7abf2de49e9090f457e208afc5128d68db65bf618dcbf5bc74e54bb31c16bf6dbf405495c1
|
7
|
+
data.tar.gz: ec43d4205fd5714be7ab39669a9397371792802b3f50fa97d580c7327db018afd8409988b54c5876aba603456cd3019c454ce41736dad054e5f58d7cc79a80ed
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.6.3
|
data/Jenkinsfile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env groovy
|
2
|
+
|
3
|
+
library("govuk")
|
4
|
+
|
5
|
+
node {
|
6
|
+
try {
|
7
|
+
// This doesn't use the buildProject as this project doesn't conform to
|
8
|
+
// required norms (e.g. running in Ruby 1.9, non-standard tests).
|
9
|
+
|
10
|
+
repoName = JOB_NAME.split('/')[0]
|
11
|
+
|
12
|
+
stage("Checkout") {
|
13
|
+
govuk.checkoutFromGitHubWithSSH(repoName)
|
14
|
+
}
|
15
|
+
|
16
|
+
stage("Clean up workspace") {
|
17
|
+
govuk.cleanupGit()
|
18
|
+
}
|
19
|
+
|
20
|
+
stage('Configure environment') {
|
21
|
+
govuk.setEnvar('RBENV_VERSION', '2.6.3')
|
22
|
+
}
|
23
|
+
|
24
|
+
stage('Bundle install') {
|
25
|
+
govuk.bundleGem()
|
26
|
+
}
|
27
|
+
|
28
|
+
stage('Spec tests') {
|
29
|
+
govuk.runRakeTask('spec')
|
30
|
+
}
|
31
|
+
|
32
|
+
stage('Integration tests') {
|
33
|
+
govuk.runRakeTask('integration')
|
34
|
+
}
|
35
|
+
|
36
|
+
if (env.BRANCH_NAME == 'master') {
|
37
|
+
stage('Publish Gem to Rubygems') {
|
38
|
+
govuk.publishGem(repoName, repoName, 'master')
|
39
|
+
}
|
40
|
+
}
|
41
|
+
} catch (e) {
|
42
|
+
currentBuild.result = "FAILED"
|
43
|
+
step([$class: 'Mailer',
|
44
|
+
notifyEveryUnstableBuild: true,
|
45
|
+
recipients: 'govuk-ci-notifications@digital.cabinet-office.gov.uk',
|
46
|
+
sendToIndividuals: true])
|
47
|
+
throw e
|
48
|
+
}
|
49
|
+
}
|
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# GOV.UK: Seed the Crawler
|
2
2
|
|
3
|
-
|
3
|
+
This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
|
4
|
+
so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
data/Rakefile
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'gem_publisher'
|
2
1
|
require 'rspec/core/rake_task'
|
3
2
|
|
4
3
|
RSpec::Core::RakeTask.new(:spec) do |task|
|
@@ -10,9 +9,3 @@ RSpec::Core::RakeTask.new(:integration) do |task|
|
|
10
9
|
end
|
11
10
|
|
12
11
|
task :default => :spec
|
13
|
-
|
14
|
-
desc "Publish gem to RubyGems"
|
15
|
-
task :publish_gem do |t|
|
16
|
-
gem = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
|
17
|
-
puts "Published #{gem}" if gem
|
18
|
-
end
|
data/govuk_seed_crawler.gemspec
CHANGED
@@ -6,24 +6,30 @@ require 'govuk_seed_crawler/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "govuk_seed_crawler"
|
8
8
|
spec.version = GovukSeedCrawler::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = ["
|
9
|
+
spec.authors = ['GOV.UK developers']
|
10
|
+
spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
|
11
11
|
spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
|
12
|
-
spec.homepage = "https://github.
|
12
|
+
spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
|
+
spec.required_ruby_version = "~> 2.6"
|
16
|
+
|
15
17
|
spec.files = `git ls-files -z`.split("\x0")
|
16
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
20
|
spec.require_paths = ["lib"]
|
19
21
|
|
20
22
|
spec.add_runtime_dependency "bunny", "~> 1.3"
|
21
|
-
spec.add_runtime_dependency "
|
23
|
+
spec.add_runtime_dependency "crack", "0.4.4"
|
24
|
+
spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
|
25
|
+
# Something, somewhere, sometimes requires public_suffix.
|
26
|
+
# public_suffix > 1.5 requires ruby > 2.
|
27
|
+
spec.add_runtime_dependency "public_suffix", "~> 1.4.6"
|
28
|
+
spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
|
22
29
|
spec.add_runtime_dependency "slop", "~> 3.6.0"
|
23
30
|
|
24
|
-
spec.add_development_dependency "gem_publisher", "~> 1.3"
|
25
31
|
spec.add_development_dependency "pry"
|
26
|
-
spec.add_development_dependency "rake"
|
32
|
+
spec.add_development_dependency "rake", "~> 0.9"
|
27
33
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
34
|
spec.add_development_dependency "rspec-mocks", "~> 3.0"
|
29
35
|
spec.add_development_dependency "webmock", "~> 1.18.0"
|
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'govuk_mirrorer/statsd'
|
1
|
+
require 'sitemap-parser'
|
3
2
|
|
4
3
|
module GovukSeedCrawler
|
5
4
|
class Indexer
|
@@ -9,8 +8,9 @@ module GovukSeedCrawler
|
|
9
8
|
raise "No site_root defined" unless site_root
|
10
9
|
|
11
10
|
GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
|
12
|
-
|
13
|
-
|
11
|
+
|
12
|
+
sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
|
13
|
+
@urls = sitemap.to_a
|
14
14
|
|
15
15
|
GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
|
16
16
|
end
|
@@ -1,20 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe GovukSeedCrawler::Indexer do
|
4
|
-
subject { GovukSeedCrawler::Indexer.new('https://example.com
|
4
|
+
subject { GovukSeedCrawler::Indexer.new('https://example.com') }
|
5
5
|
|
6
6
|
context "under normal usage" do
|
7
|
-
let(:
|
8
|
-
double(:
|
7
|
+
let(:mock_parser) do
|
8
|
+
double(:mock_parser, :to_a => [])
|
9
9
|
end
|
10
10
|
|
11
11
|
it "responds to Indexer#urls" do
|
12
|
-
allow(
|
12
|
+
allow(SitemapParser).to receive(:new).and_return(mock_parser)
|
13
13
|
expect(subject).to respond_to(:urls)
|
14
14
|
end
|
15
15
|
|
16
|
-
it "calls
|
17
|
-
expect(
|
16
|
+
it "calls SitemapParser with the sitemap file" do
|
17
|
+
expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
|
18
18
|
subject
|
19
19
|
end
|
20
20
|
end
|
@@ -2,36 +2,30 @@ require 'json'
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe GovukSeedCrawler do
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
"total" => results.size,
|
19
|
-
"start_index" => 1,
|
20
|
-
"page_size" => 100,
|
21
|
-
"current_page" => 1,
|
22
|
-
"pages" => 1,
|
23
|
-
"results" => results
|
5
|
+
def stub_sitemap
|
6
|
+
sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
|
7
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
8
|
+
<url>
|
9
|
+
<loc>https://www.gov.uk/</loc>
|
10
|
+
</url>
|
11
|
+
<url>
|
12
|
+
<loc>https://www.gov.uk/register-to-vote</loc>
|
13
|
+
</url>
|
14
|
+
<url>
|
15
|
+
<loc>https://www.gov.uk/help</loc>
|
16
|
+
</url>
|
17
|
+
</urlset>
|
24
18
|
}
|
25
19
|
|
26
|
-
stub_request(:get, "https://www.gov.uk
|
27
|
-
to_return(:status => 200, :body =>
|
20
|
+
stub_request(:get, "https://www.gov.uk/sitemap.xml").
|
21
|
+
to_return(:status => 200, :body => sitemap, :headers => {})
|
28
22
|
end
|
29
23
|
|
30
24
|
let(:vhost) { "/" }
|
31
25
|
let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
|
32
26
|
let(:queue_name) { "govuk_seed_crawler_integration_queue" }
|
33
27
|
let(:topic) { "#" }
|
34
|
-
let(:site_root) { "https://www.gov.uk
|
28
|
+
let(:site_root) { "https://www.gov.uk" }
|
35
29
|
let(:options) {{
|
36
30
|
:host => ENV.fetch("AMQP_HOST", "localhost"),
|
37
31
|
:user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
|
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
|
|
57
51
|
end
|
58
52
|
|
59
53
|
it "publishes URLs it finds to an AMQP topic exchange" do
|
60
|
-
|
54
|
+
stub_sitemap
|
61
55
|
subject
|
62
56
|
|
63
|
-
|
64
|
-
expect(@queue.message_count).to be(15)
|
57
|
+
expect(@queue.message_count).to be(3)
|
65
58
|
end
|
66
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govuk_seed_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- GOV.UK developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bunny
|
@@ -25,47 +25,75 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: crack
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.4.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.4.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - "~>"
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
47
|
+
version: 1.6.0
|
34
48
|
type: :runtime
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
54
|
+
version: 1.6.0
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: public_suffix
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
61
|
+
version: 1.4.6
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
68
|
+
version: 1.4.6
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: sitemap-parser
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
type: :
|
75
|
+
version: 0.3.0
|
76
|
+
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 0.3.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: slop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 3.6.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 3.6.0
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: pry
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +112,16 @@ dependencies:
|
|
84
112
|
name: rake
|
85
113
|
requirement: !ruby/object:Gem::Requirement
|
86
114
|
requirements:
|
87
|
-
- - "
|
115
|
+
- - "~>"
|
88
116
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
117
|
+
version: '0.9'
|
90
118
|
type: :development
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
93
121
|
requirements:
|
94
|
-
- - "
|
122
|
+
- - "~>"
|
95
123
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
124
|
+
version: '0.9'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: rspec
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,7 +166,7 @@ dependencies:
|
|
138
166
|
version: 1.18.0
|
139
167
|
description:
|
140
168
|
email:
|
141
|
-
-
|
169
|
+
- govuk-dev@digital.cabinet-office.gov.uk
|
142
170
|
executables:
|
143
171
|
- seed-crawler
|
144
172
|
extensions: []
|
@@ -148,14 +176,12 @@ files:
|
|
148
176
|
- ".rspec"
|
149
177
|
- ".ruby-version"
|
150
178
|
- Gemfile
|
179
|
+
- Jenkinsfile
|
151
180
|
- LICENSE.txt
|
152
181
|
- README.md
|
153
182
|
- Rakefile
|
154
183
|
- bin/seed-crawler
|
155
184
|
- govuk_seed_crawler.gemspec
|
156
|
-
- jenkins-branches.sh
|
157
|
-
- jenkins-tests.sh
|
158
|
-
- jenkins.sh
|
159
185
|
- lib/govuk_seed_crawler.rb
|
160
186
|
- lib/govuk_seed_crawler/amqp_client.rb
|
161
187
|
- lib/govuk_seed_crawler/cli_parser.rb
|
@@ -170,7 +196,7 @@ files:
|
|
170
196
|
- spec/govuk_seed_crawler/seeder_spec.rb
|
171
197
|
- spec/integration/govuk_seed_crawler_spec.rb
|
172
198
|
- spec/spec_helper.rb
|
173
|
-
homepage: https://github.
|
199
|
+
homepage: https://github.com/alphagov/govuk_seed_crawler
|
174
200
|
licenses:
|
175
201
|
- MIT
|
176
202
|
metadata: {}
|
@@ -180,17 +206,16 @@ require_paths:
|
|
180
206
|
- lib
|
181
207
|
required_ruby_version: !ruby/object:Gem::Requirement
|
182
208
|
requirements:
|
183
|
-
- - "
|
209
|
+
- - "~>"
|
184
210
|
- !ruby/object:Gem::Version
|
185
|
-
version: '
|
211
|
+
version: '2.6'
|
186
212
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
213
|
requirements:
|
188
214
|
- - ">="
|
189
215
|
- !ruby/object:Gem::Version
|
190
216
|
version: '0'
|
191
217
|
requirements: []
|
192
|
-
|
193
|
-
rubygems_version: 2.2.2
|
218
|
+
rubygems_version: 3.0.3
|
194
219
|
signing_key:
|
195
220
|
specification_version: 4
|
196
221
|
summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
|
data/jenkins-branches.sh
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
set -e
|
3
|
-
|
4
|
-
[ -x .venv/bin/pip ] || virtualenv .venv
|
5
|
-
. .venv/bin/activate
|
6
|
-
|
7
|
-
pip install -q ghtools
|
8
|
-
|
9
|
-
REPO="gds:gds/govuk_seed_crawler"
|
10
|
-
gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
11
|
-
|
12
|
-
if ./jenkins-tests.sh; then
|
13
|
-
gh-status "$REPO" "$GIT_COMMIT" success -d "\"Build #${BUILD_NUMBER} succeeded on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
14
|
-
exit 0
|
15
|
-
else
|
16
|
-
gh-status "$REPO" "$GIT_COMMIT" failure -d "\"Build #${BUILD_NUMBER} failed on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
17
|
-
exit 1
|
18
|
-
fi
|
data/jenkins-tests.sh
DELETED