govuk_seed_crawler 1.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.ruby-version +1 -1
- data/Jenkinsfile +49 -0
- data/README.md +2 -1
- data/Rakefile +0 -7
- data/govuk_seed_crawler.gemspec +12 -6
- data/lib/govuk_seed_crawler/indexer.rb +4 -4
- data/lib/govuk_seed_crawler/version.rb +1 -1
- data/spec/govuk_seed_crawler/indexer_spec.rb +6 -6
- data/spec/integration/govuk_seed_crawler_spec.rb +18 -25
- metadata +51 -26
- data/jenkins-branches.sh +0 -18
- data/jenkins-tests.sh +0 -6
- data/jenkins.sh +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2ee27868f4df61a72b764044b8d04ad6f6b252a63cd1f365d1056a7d331a85d1
|
4
|
+
data.tar.gz: 514ee036b88544be5a7935dcc2103e07aa7fe82c2acc0841109d15e572a9ebf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d005f87f519187b619e1dbbeabcc441d0ccb0daf652db2d6b515f7abf2de49e9090f457e208afc5128d68db65bf618dcbf5bc74e54bb31c16bf6dbf405495c1
|
7
|
+
data.tar.gz: ec43d4205fd5714be7ab39669a9397371792802b3f50fa97d580c7327db018afd8409988b54c5876aba603456cd3019c454ce41736dad054e5f58d7cc79a80ed
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.6.3
|
data/Jenkinsfile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env groovy
|
2
|
+
|
3
|
+
library("govuk")
|
4
|
+
|
5
|
+
node {
|
6
|
+
try {
|
7
|
+
// This doesn't use the buildProject as this project doesn't conform to
|
8
|
+
// required norms (e.g. running in Ruby 1.9, non-standard tests).
|
9
|
+
|
10
|
+
repoName = JOB_NAME.split('/')[0]
|
11
|
+
|
12
|
+
stage("Checkout") {
|
13
|
+
govuk.checkoutFromGitHubWithSSH(repoName)
|
14
|
+
}
|
15
|
+
|
16
|
+
stage("Clean up workspace") {
|
17
|
+
govuk.cleanupGit()
|
18
|
+
}
|
19
|
+
|
20
|
+
stage('Configure environment') {
|
21
|
+
govuk.setEnvar('RBENV_VERSION', '2.6.3')
|
22
|
+
}
|
23
|
+
|
24
|
+
stage('Bundle install') {
|
25
|
+
govuk.bundleGem()
|
26
|
+
}
|
27
|
+
|
28
|
+
stage('Spec tests') {
|
29
|
+
govuk.runRakeTask('spec')
|
30
|
+
}
|
31
|
+
|
32
|
+
stage('Integration tests') {
|
33
|
+
govuk.runRakeTask('integration')
|
34
|
+
}
|
35
|
+
|
36
|
+
if (env.BRANCH_NAME == 'master') {
|
37
|
+
stage('Publish Gem to Rubygems') {
|
38
|
+
govuk.publishGem(repoName, repoName, 'master')
|
39
|
+
}
|
40
|
+
}
|
41
|
+
} catch (e) {
|
42
|
+
currentBuild.result = "FAILED"
|
43
|
+
step([$class: 'Mailer',
|
44
|
+
notifyEveryUnstableBuild: true,
|
45
|
+
recipients: 'govuk-ci-notifications@digital.cabinet-office.gov.uk',
|
46
|
+
sendToIndividuals: true])
|
47
|
+
throw e
|
48
|
+
}
|
49
|
+
}
|
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# GOV.UK: Seed the Crawler
|
2
2
|
|
3
|
-
|
3
|
+
This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
|
4
|
+
so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
data/Rakefile
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'gem_publisher'
|
2
1
|
require 'rspec/core/rake_task'
|
3
2
|
|
4
3
|
RSpec::Core::RakeTask.new(:spec) do |task|
|
@@ -10,9 +9,3 @@ RSpec::Core::RakeTask.new(:integration) do |task|
|
|
10
9
|
end
|
11
10
|
|
12
11
|
task :default => :spec
|
13
|
-
|
14
|
-
desc "Publish gem to RubyGems"
|
15
|
-
task :publish_gem do |t|
|
16
|
-
gem = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
|
17
|
-
puts "Published #{gem}" if gem
|
18
|
-
end
|
data/govuk_seed_crawler.gemspec
CHANGED
@@ -6,24 +6,30 @@ require 'govuk_seed_crawler/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "govuk_seed_crawler"
|
8
8
|
spec.version = GovukSeedCrawler::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = ["
|
9
|
+
spec.authors = ['GOV.UK developers']
|
10
|
+
spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
|
11
11
|
spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
|
12
|
-
spec.homepage = "https://github.
|
12
|
+
spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
|
+
spec.required_ruby_version = "~> 2.6"
|
16
|
+
|
15
17
|
spec.files = `git ls-files -z`.split("\x0")
|
16
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
20
|
spec.require_paths = ["lib"]
|
19
21
|
|
20
22
|
spec.add_runtime_dependency "bunny", "~> 1.3"
|
21
|
-
spec.add_runtime_dependency "
|
23
|
+
spec.add_runtime_dependency "crack", "0.4.4"
|
24
|
+
spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
|
25
|
+
# Something, somewhere, sometimes requires public_suffix.
|
26
|
+
# public_suffix > 1.5 requires ruby > 2.
|
27
|
+
spec.add_runtime_dependency "public_suffix", "~> 1.4.6"
|
28
|
+
spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
|
22
29
|
spec.add_runtime_dependency "slop", "~> 3.6.0"
|
23
30
|
|
24
|
-
spec.add_development_dependency "gem_publisher", "~> 1.3"
|
25
31
|
spec.add_development_dependency "pry"
|
26
|
-
spec.add_development_dependency "rake"
|
32
|
+
spec.add_development_dependency "rake", "~> 0.9"
|
27
33
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
34
|
spec.add_development_dependency "rspec-mocks", "~> 3.0"
|
29
35
|
spec.add_development_dependency "webmock", "~> 1.18.0"
|
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'govuk_mirrorer/statsd'
|
1
|
+
require 'sitemap-parser'
|
3
2
|
|
4
3
|
module GovukSeedCrawler
|
5
4
|
class Indexer
|
@@ -9,8 +8,9 @@ module GovukSeedCrawler
|
|
9
8
|
raise "No site_root defined" unless site_root
|
10
9
|
|
11
10
|
GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
|
12
|
-
|
13
|
-
|
11
|
+
|
12
|
+
sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
|
13
|
+
@urls = sitemap.to_a
|
14
14
|
|
15
15
|
GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
|
16
16
|
end
|
@@ -1,20 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe GovukSeedCrawler::Indexer do
|
4
|
-
subject { GovukSeedCrawler::Indexer.new('https://example.com
|
4
|
+
subject { GovukSeedCrawler::Indexer.new('https://example.com') }
|
5
5
|
|
6
6
|
context "under normal usage" do
|
7
|
-
let(:
|
8
|
-
double(:
|
7
|
+
let(:mock_parser) do
|
8
|
+
double(:mock_parser, :to_a => [])
|
9
9
|
end
|
10
10
|
|
11
11
|
it "responds to Indexer#urls" do
|
12
|
-
allow(
|
12
|
+
allow(SitemapParser).to receive(:new).and_return(mock_parser)
|
13
13
|
expect(subject).to respond_to(:urls)
|
14
14
|
end
|
15
15
|
|
16
|
-
it "calls
|
17
|
-
expect(
|
16
|
+
it "calls SitemapParser with the sitemap file" do
|
17
|
+
expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
|
18
18
|
subject
|
19
19
|
end
|
20
20
|
end
|
@@ -2,36 +2,30 @@ require 'json'
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe GovukSeedCrawler do
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
"total" => results.size,
|
19
|
-
"start_index" => 1,
|
20
|
-
"page_size" => 100,
|
21
|
-
"current_page" => 1,
|
22
|
-
"pages" => 1,
|
23
|
-
"results" => results
|
5
|
+
def stub_sitemap
|
6
|
+
sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
|
7
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
8
|
+
<url>
|
9
|
+
<loc>https://www.gov.uk/</loc>
|
10
|
+
</url>
|
11
|
+
<url>
|
12
|
+
<loc>https://www.gov.uk/register-to-vote</loc>
|
13
|
+
</url>
|
14
|
+
<url>
|
15
|
+
<loc>https://www.gov.uk/help</loc>
|
16
|
+
</url>
|
17
|
+
</urlset>
|
24
18
|
}
|
25
19
|
|
26
|
-
stub_request(:get, "https://www.gov.uk
|
27
|
-
to_return(:status => 200, :body =>
|
20
|
+
stub_request(:get, "https://www.gov.uk/sitemap.xml").
|
21
|
+
to_return(:status => 200, :body => sitemap, :headers => {})
|
28
22
|
end
|
29
23
|
|
30
24
|
let(:vhost) { "/" }
|
31
25
|
let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
|
32
26
|
let(:queue_name) { "govuk_seed_crawler_integration_queue" }
|
33
27
|
let(:topic) { "#" }
|
34
|
-
let(:site_root) { "https://www.gov.uk
|
28
|
+
let(:site_root) { "https://www.gov.uk" }
|
35
29
|
let(:options) {{
|
36
30
|
:host => ENV.fetch("AMQP_HOST", "localhost"),
|
37
31
|
:user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
|
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
|
|
57
51
|
end
|
58
52
|
|
59
53
|
it "publishes URLs it finds to an AMQP topic exchange" do
|
60
|
-
|
54
|
+
stub_sitemap
|
61
55
|
subject
|
62
56
|
|
63
|
-
|
64
|
-
expect(@queue.message_count).to be(15)
|
57
|
+
expect(@queue.message_count).to be(3)
|
65
58
|
end
|
66
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govuk_seed_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- GOV.UK developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bunny
|
@@ -25,47 +25,75 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: crack
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.4.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.4.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - "~>"
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
47
|
+
version: 1.6.0
|
34
48
|
type: :runtime
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
54
|
+
version: 1.6.0
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: public_suffix
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
61
|
+
version: 1.4.6
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
68
|
+
version: 1.4.6
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: sitemap-parser
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
type: :
|
75
|
+
version: 0.3.0
|
76
|
+
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 0.3.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: slop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 3.6.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 3.6.0
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: pry
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +112,16 @@ dependencies:
|
|
84
112
|
name: rake
|
85
113
|
requirement: !ruby/object:Gem::Requirement
|
86
114
|
requirements:
|
87
|
-
- - "
|
115
|
+
- - "~>"
|
88
116
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
117
|
+
version: '0.9'
|
90
118
|
type: :development
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
93
121
|
requirements:
|
94
|
-
- - "
|
122
|
+
- - "~>"
|
95
123
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
124
|
+
version: '0.9'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: rspec
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,7 +166,7 @@ dependencies:
|
|
138
166
|
version: 1.18.0
|
139
167
|
description:
|
140
168
|
email:
|
141
|
-
-
|
169
|
+
- govuk-dev@digital.cabinet-office.gov.uk
|
142
170
|
executables:
|
143
171
|
- seed-crawler
|
144
172
|
extensions: []
|
@@ -148,14 +176,12 @@ files:
|
|
148
176
|
- ".rspec"
|
149
177
|
- ".ruby-version"
|
150
178
|
- Gemfile
|
179
|
+
- Jenkinsfile
|
151
180
|
- LICENSE.txt
|
152
181
|
- README.md
|
153
182
|
- Rakefile
|
154
183
|
- bin/seed-crawler
|
155
184
|
- govuk_seed_crawler.gemspec
|
156
|
-
- jenkins-branches.sh
|
157
|
-
- jenkins-tests.sh
|
158
|
-
- jenkins.sh
|
159
185
|
- lib/govuk_seed_crawler.rb
|
160
186
|
- lib/govuk_seed_crawler/amqp_client.rb
|
161
187
|
- lib/govuk_seed_crawler/cli_parser.rb
|
@@ -170,7 +196,7 @@ files:
|
|
170
196
|
- spec/govuk_seed_crawler/seeder_spec.rb
|
171
197
|
- spec/integration/govuk_seed_crawler_spec.rb
|
172
198
|
- spec/spec_helper.rb
|
173
|
-
homepage: https://github.
|
199
|
+
homepage: https://github.com/alphagov/govuk_seed_crawler
|
174
200
|
licenses:
|
175
201
|
- MIT
|
176
202
|
metadata: {}
|
@@ -180,17 +206,16 @@ require_paths:
|
|
180
206
|
- lib
|
181
207
|
required_ruby_version: !ruby/object:Gem::Requirement
|
182
208
|
requirements:
|
183
|
-
- - "
|
209
|
+
- - "~>"
|
184
210
|
- !ruby/object:Gem::Version
|
185
|
-
version: '
|
211
|
+
version: '2.6'
|
186
212
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
213
|
requirements:
|
188
214
|
- - ">="
|
189
215
|
- !ruby/object:Gem::Version
|
190
216
|
version: '0'
|
191
217
|
requirements: []
|
192
|
-
|
193
|
-
rubygems_version: 2.2.2
|
218
|
+
rubygems_version: 3.0.3
|
194
219
|
signing_key:
|
195
220
|
specification_version: 4
|
196
221
|
summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
|
data/jenkins-branches.sh
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
set -e
|
3
|
-
|
4
|
-
[ -x .venv/bin/pip ] || virtualenv .venv
|
5
|
-
. .venv/bin/activate
|
6
|
-
|
7
|
-
pip install -q ghtools
|
8
|
-
|
9
|
-
REPO="gds:gds/govuk_seed_crawler"
|
10
|
-
gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
11
|
-
|
12
|
-
if ./jenkins-tests.sh; then
|
13
|
-
gh-status "$REPO" "$GIT_COMMIT" success -d "\"Build #${BUILD_NUMBER} succeeded on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
14
|
-
exit 0
|
15
|
-
else
|
16
|
-
gh-status "$REPO" "$GIT_COMMIT" failure -d "\"Build #${BUILD_NUMBER} failed on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
17
|
-
exit 1
|
18
|
-
fi
|
data/jenkins-tests.sh
DELETED