govuk_seed_crawler 1.0.0 → 3.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
- data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
2
+ SHA256:
3
+ metadata.gz: 2ee27868f4df61a72b764044b8d04ad6f6b252a63cd1f365d1056a7d331a85d1
4
+ data.tar.gz: 514ee036b88544be5a7935dcc2103e07aa7fe82c2acc0841109d15e572a9ebf9
5
5
  SHA512:
6
- metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
- data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
6
+ metadata.gz: 3d005f87f519187b619e1dbbeabcc441d0ccb0daf652db2d6b515f7abf2de49e9090f457e208afc5128d68db65bf618dcbf5bc74e54bb31c16bf6dbf405495c1
7
+ data.tar.gz: ec43d4205fd5714be7ab39669a9397371792802b3f50fa97d580c7327db018afd8409988b54c5876aba603456cd3019c454ce41736dad054e5f58d7cc79a80ed
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.1.2
1
+ 2.6.3
data/Jenkinsfile ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env groovy
2
+
3
+ library("govuk")
4
+
5
+ node {
6
+ try {
7
+ // This doesn't use the buildProject as this project doesn't conform to
8
+ // required norms (e.g. running in Ruby 1.9, non-standard tests).
9
+
10
+ repoName = JOB_NAME.split('/')[0]
11
+
12
+ stage("Checkout") {
13
+ govuk.checkoutFromGitHubWithSSH(repoName)
14
+ }
15
+
16
+ stage("Clean up workspace") {
17
+ govuk.cleanupGit()
18
+ }
19
+
20
+ stage('Configure environment') {
21
+ govuk.setEnvar('RBENV_VERSION', '2.6.3')
22
+ }
23
+
24
+ stage('Bundle install') {
25
+ govuk.bundleGem()
26
+ }
27
+
28
+ stage('Spec tests') {
29
+ govuk.runRakeTask('spec')
30
+ }
31
+
32
+ stage('Integration tests') {
33
+ govuk.runRakeTask('integration')
34
+ }
35
+
36
+ if (env.BRANCH_NAME == 'master') {
37
+ stage('Publish Gem to Rubygems') {
38
+ govuk.publishGem(repoName, repoName, 'master')
39
+ }
40
+ }
41
+ } catch (e) {
42
+ currentBuild.result = "FAILED"
43
+ step([$class: 'Mailer',
44
+ notifyEveryUnstableBuild: true,
45
+ recipients: 'govuk-ci-notifications@digital.cabinet-office.gov.uk',
46
+ sendToIndividuals: true])
47
+ throw e
48
+ }
49
+ }
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # GOV.UK: Seed the Crawler
2
2
 
3
- Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
3
+ This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
4
+ so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
4
5
 
5
6
  ## Installation
6
7
 
data/Rakefile CHANGED
@@ -1,4 +1,3 @@
1
- require 'gem_publisher'
2
1
  require 'rspec/core/rake_task'
3
2
 
4
3
  RSpec::Core::RakeTask.new(:spec) do |task|
@@ -10,9 +9,3 @@ RSpec::Core::RakeTask.new(:integration) do |task|
10
9
  end
11
10
 
12
11
  task :default => :spec
13
-
14
- desc "Publish gem to RubyGems"
15
- task :publish_gem do |t|
16
- gem = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
17
- puts "Published #{gem}" if gem
18
- end
@@ -6,24 +6,30 @@ require 'govuk_seed_crawler/version'
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "govuk_seed_crawler"
8
8
  spec.version = GovukSeedCrawler::VERSION
9
- spec.authors = ["Matt Bostock"]
10
- spec.email = ["matt.bostock@digital.cabinet-office.gov.uk"]
9
+ spec.authors = ['GOV.UK developers']
10
+ spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
11
11
  spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
12
- spec.homepage = "https://github.gds/gds/govuk_seed_crawler"
12
+ spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
13
13
  spec.license = "MIT"
14
14
 
15
+ spec.required_ruby_version = "~> 2.6"
16
+
15
17
  spec.files = `git ls-files -z`.split("\x0")
16
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
20
  spec.require_paths = ["lib"]
19
21
 
20
22
  spec.add_runtime_dependency "bunny", "~> 1.3"
21
- spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
23
+ spec.add_runtime_dependency "crack", "0.4.4"
24
+ spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
25
+ # Something, somewhere, sometimes requires public_suffix.
26
+ # public_suffix > 1.5 requires ruby > 2.
27
+ spec.add_runtime_dependency "public_suffix", "~> 1.4.6"
28
+ spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
22
29
  spec.add_runtime_dependency "slop", "~> 3.6.0"
23
30
 
24
- spec.add_development_dependency "gem_publisher", "~> 1.3"
25
31
  spec.add_development_dependency "pry"
26
- spec.add_development_dependency "rake"
32
+ spec.add_development_dependency "rake", "~> 0.9"
27
33
  spec.add_development_dependency "rspec", "~> 3.0"
28
34
  spec.add_development_dependency "rspec-mocks", "~> 3.0"
29
35
  spec.add_development_dependency "webmock", "~> 1.18.0"
@@ -1,5 +1,4 @@
1
- require 'govuk_mirrorer/indexer'
2
- require 'govuk_mirrorer/statsd'
1
+ require 'sitemap-parser'
3
2
 
4
3
  module GovukSeedCrawler
5
4
  class Indexer
@@ -9,8 +8,9 @@ module GovukSeedCrawler
9
8
  raise "No site_root defined" unless site_root
10
9
 
11
10
  GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
12
- indexer = GovukMirrorer::Indexer.new(site_root)
13
- @urls = indexer.all_start_urls
11
+
12
+ sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
13
+ @urls = sitemap.to_a
14
14
 
15
15
  GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
16
16
  end
@@ -1,3 +1,3 @@
1
1
  module GovukSeedCrawler
2
- VERSION = "1.0.0"
2
+ VERSION = "3.0.0"
3
3
  end
@@ -1,20 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe GovukSeedCrawler::Indexer do
4
- subject { GovukSeedCrawler::Indexer.new('https://example.com/') }
4
+ subject { GovukSeedCrawler::Indexer.new('https://example.com') }
5
5
 
6
6
  context "under normal usage" do
7
- let(:mock_indexer) do
8
- double(:mock_indexer, :all_start_urls => [])
7
+ let(:mock_parser) do
8
+ double(:mock_parser, :to_a => [])
9
9
  end
10
10
 
11
11
  it "responds to Indexer#urls" do
12
- allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)
12
+ allow(SitemapParser).to receive(:new).and_return(mock_parser)
13
13
  expect(subject).to respond_to(:urls)
14
14
  end
15
15
 
16
- it "calls GovukMirrorer::Indexer with the site root" do
17
- expect(GovukMirrorer::Indexer).to receive(:new).with('https://example.com/').and_return(mock_indexer)
16
+ it "calls SitemapParser with the sitemap file" do
17
+ expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
18
18
  subject
19
19
  end
20
20
  end
@@ -2,36 +2,30 @@ require 'json'
2
2
  require 'spec_helper'
3
3
 
4
4
  describe GovukSeedCrawler do
5
- def stub_api_artefacts(count)
6
- item = {
7
- "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
8
- "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
9
- "title" => "Faster review of support for Renewable electricity to provide investor certainty",
10
- "format" => "announcement"
11
- }
12
- results = count.times.collect { item }
13
- response = {
14
- "_response_info" => {
15
- "status" => "ok",
16
- "links" => []
17
- },
18
- "total" => results.size,
19
- "start_index" => 1,
20
- "page_size" => 100,
21
- "current_page" => 1,
22
- "pages" => 1,
23
- "results" => results
5
+ def stub_sitemap
6
+ sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
7
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
8
+ <url>
9
+ <loc>https://www.gov.uk/</loc>
10
+ </url>
11
+ <url>
12
+ <loc>https://www.gov.uk/register-to-vote</loc>
13
+ </url>
14
+ <url>
15
+ <loc>https://www.gov.uk/help</loc>
16
+ </url>
17
+ </urlset>
24
18
  }
25
19
 
26
- stub_request(:get, "https://www.gov.uk//api/artefacts.json").
27
- to_return(:status => 200, :body => response.to_json, :headers => {})
20
+ stub_request(:get, "https://www.gov.uk/sitemap.xml").
21
+ to_return(:status => 200, :body => sitemap, :headers => {})
28
22
  end
29
23
 
30
24
  let(:vhost) { "/" }
31
25
  let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
32
26
  let(:queue_name) { "govuk_seed_crawler_integration_queue" }
33
27
  let(:topic) { "#" }
34
- let(:site_root) { "https://www.gov.uk/" }
28
+ let(:site_root) { "https://www.gov.uk" }
35
29
  let(:options) {{
36
30
  :host => ENV.fetch("AMQP_HOST", "localhost"),
37
31
  :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
57
51
  end
58
52
 
59
53
  it "publishes URLs it finds to an AMQP topic exchange" do
60
- stub_api_artefacts(10)
54
+ stub_sitemap
61
55
  subject
62
56
 
63
- # There's an extra 5 URLs from the Indexer class that are hard-coded.
64
- expect(@queue.message_count).to be(15)
57
+ expect(@queue.message_count).to be(3)
65
58
  end
66
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: govuk_seed_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
- - Matt Bostock
7
+ - GOV.UK developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-28 00:00:00.000000000 Z
11
+ date: 2021-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bunny
@@ -25,47 +25,75 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
- name: govuk_mirrorer
28
+ name: crack
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - "~>"
32
46
  - !ruby/object:Gem::Version
33
- version: 1.3.1
47
+ version: 1.6.0
34
48
  type: :runtime
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
52
  - - "~>"
39
53
  - !ruby/object:Gem::Version
40
- version: 1.3.1
54
+ version: 1.6.0
41
55
  - !ruby/object:Gem::Dependency
42
- name: slop
56
+ name: public_suffix
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: 3.6.0
61
+ version: 1.4.6
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: 3.6.0
68
+ version: 1.4.6
55
69
  - !ruby/object:Gem::Dependency
56
- name: gem_publisher
70
+ name: sitemap-parser
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '1.3'
62
- type: :development
75
+ version: 0.3.0
76
+ type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: '1.3'
82
+ version: 0.3.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: slop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 3.6.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 3.6.0
69
97
  - !ruby/object:Gem::Dependency
70
98
  name: pry
71
99
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +112,16 @@ dependencies:
84
112
  name: rake
85
113
  requirement: !ruby/object:Gem::Requirement
86
114
  requirements:
87
- - - ">="
115
+ - - "~>"
88
116
  - !ruby/object:Gem::Version
89
- version: '0'
117
+ version: '0.9'
90
118
  type: :development
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
93
121
  requirements:
94
- - - ">="
122
+ - - "~>"
95
123
  - !ruby/object:Gem::Version
96
- version: '0'
124
+ version: '0.9'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: rspec
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +166,7 @@ dependencies:
138
166
  version: 1.18.0
139
167
  description:
140
168
  email:
141
- - matt.bostock@digital.cabinet-office.gov.uk
169
+ - govuk-dev@digital.cabinet-office.gov.uk
142
170
  executables:
143
171
  - seed-crawler
144
172
  extensions: []
@@ -148,14 +176,12 @@ files:
148
176
  - ".rspec"
149
177
  - ".ruby-version"
150
178
  - Gemfile
179
+ - Jenkinsfile
151
180
  - LICENSE.txt
152
181
  - README.md
153
182
  - Rakefile
154
183
  - bin/seed-crawler
155
184
  - govuk_seed_crawler.gemspec
156
- - jenkins-branches.sh
157
- - jenkins-tests.sh
158
- - jenkins.sh
159
185
  - lib/govuk_seed_crawler.rb
160
186
  - lib/govuk_seed_crawler/amqp_client.rb
161
187
  - lib/govuk_seed_crawler/cli_parser.rb
@@ -170,7 +196,7 @@ files:
170
196
  - spec/govuk_seed_crawler/seeder_spec.rb
171
197
  - spec/integration/govuk_seed_crawler_spec.rb
172
198
  - spec/spec_helper.rb
173
- homepage: https://github.gds/gds/govuk_seed_crawler
199
+ homepage: https://github.com/alphagov/govuk_seed_crawler
174
200
  licenses:
175
201
  - MIT
176
202
  metadata: {}
@@ -180,17 +206,16 @@ require_paths:
180
206
  - lib
181
207
  required_ruby_version: !ruby/object:Gem::Requirement
182
208
  requirements:
183
- - - ">="
209
+ - - "~>"
184
210
  - !ruby/object:Gem::Version
185
- version: '0'
211
+ version: '2.6'
186
212
  required_rubygems_version: !ruby/object:Gem::Requirement
187
213
  requirements:
188
214
  - - ">="
189
215
  - !ruby/object:Gem::Version
190
216
  version: '0'
191
217
  requirements: []
192
- rubyforge_project:
193
- rubygems_version: 2.2.2
218
+ rubygems_version: 3.0.3
194
219
  signing_key:
195
220
  specification_version: 4
196
221
  summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
data/jenkins-branches.sh DELETED
@@ -1,18 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
-
4
- [ -x .venv/bin/pip ] || virtualenv .venv
5
- . .venv/bin/activate
6
-
7
- pip install -q ghtools
8
-
9
- REPO="gds:gds/govuk_seed_crawler"
10
- gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
11
-
12
- if ./jenkins-tests.sh; then
13
- gh-status "$REPO" "$GIT_COMMIT" success -d "\"Build #${BUILD_NUMBER} succeeded on Jenkins\"" -u "$BUILD_URL" >/dev/null
14
- exit 0
15
- else
16
- gh-status "$REPO" "$GIT_COMMIT" failure -d "\"Build #${BUILD_NUMBER} failed on Jenkins\"" -u "$BUILD_URL" >/dev/null
17
- exit 1
18
- fi
data/jenkins-tests.sh DELETED
@@ -1,6 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
- rm -f Gemfile.lock
4
- bundle install --path "${HOME}/bundles/${JOB_NAME}"
5
- bundle exec rake
6
- bundle exec rake integration
data/jenkins.sh DELETED
@@ -1,5 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
-
4
- ./jenkins-tests.sh
5
- bundle exec rake publish_gem