govuk_seed_crawler 1.0.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
- data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
2
+ SHA256:
3
+ metadata.gz: 2ee27868f4df61a72b764044b8d04ad6f6b252a63cd1f365d1056a7d331a85d1
4
+ data.tar.gz: 514ee036b88544be5a7935dcc2103e07aa7fe82c2acc0841109d15e572a9ebf9
5
5
  SHA512:
6
- metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
- data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
6
+ metadata.gz: 3d005f87f519187b619e1dbbeabcc441d0ccb0daf652db2d6b515f7abf2de49e9090f457e208afc5128d68db65bf618dcbf5bc74e54bb31c16bf6dbf405495c1
7
+ data.tar.gz: ec43d4205fd5714be7ab39669a9397371792802b3f50fa97d580c7327db018afd8409988b54c5876aba603456cd3019c454ce41736dad054e5f58d7cc79a80ed
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.1.2
1
+ 2.6.3
data/Jenkinsfile ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env groovy
2
+
3
+ library("govuk")
4
+
5
+ node {
6
+ try {
7
+ // This doesn't use the buildProject as this project doesn't conform to
8
+ // required norms (e.g. non-standard tests: separate 'spec' and 'integration' rake tasks).
9
+
10
+ repoName = JOB_NAME.split('/')[0]
11
+
12
+ stage("Checkout") {
13
+ govuk.checkoutFromGitHubWithSSH(repoName)
14
+ }
15
+
16
+ stage("Clean up workspace") {
17
+ govuk.cleanupGit()
18
+ }
19
+
20
+ stage('Configure environment') {
21
+ govuk.setEnvar('RBENV_VERSION', '2.6.3')
22
+ }
23
+
24
+ stage('Bundle install') {
25
+ govuk.bundleGem()
26
+ }
27
+
28
+ stage('Spec tests') {
29
+ govuk.runRakeTask('spec')
30
+ }
31
+
32
+ stage('Integration tests') {
33
+ govuk.runRakeTask('integration')
34
+ }
35
+
36
+ if (env.BRANCH_NAME == 'master') {
37
+ stage('Publish Gem to Rubygems') {
38
+ govuk.publishGem(repoName, repoName, 'master')
39
+ }
40
+ }
41
+ } catch (e) {
42
+ currentBuild.result = "FAILED"
43
+ step([$class: 'Mailer',
44
+ notifyEveryUnstableBuild: true,
45
+ recipients: 'govuk-ci-notifications@digital.cabinet-office.gov.uk',
46
+ sendToIndividuals: true])
47
+ throw e
48
+ }
49
+ }
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # GOV.UK: Seed the Crawler
2
2
 
3
- Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
3
+ This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
4
+ so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
4
5
 
5
6
  ## Installation
6
7
 
data/Rakefile CHANGED
@@ -1,4 +1,3 @@
1
- require 'gem_publisher'
2
1
  require 'rspec/core/rake_task'
3
2
 
4
3
  RSpec::Core::RakeTask.new(:spec) do |task|
@@ -10,9 +9,3 @@ RSpec::Core::RakeTask.new(:integration) do |task|
10
9
  end
11
10
 
12
11
  task :default => :spec
13
-
14
- desc "Publish gem to RubyGems"
15
- task :publish_gem do |t|
16
- gem = GemPublisher.publish_if_updated("govuk_seed_crawler.gemspec")
17
- puts "Published #{gem}" if gem
18
- end
@@ -6,24 +6,30 @@ require 'govuk_seed_crawler/version'
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "govuk_seed_crawler"
8
8
  spec.version = GovukSeedCrawler::VERSION
9
- spec.authors = ["Matt Bostock"]
10
- spec.email = ["matt.bostock@digital.cabinet-office.gov.uk"]
9
+ spec.authors = ['GOV.UK developers']
10
+ spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
11
11
  spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
12
- spec.homepage = "https://github.gds/gds/govuk_seed_crawler"
12
+ spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
13
13
  spec.license = "MIT"
14
14
 
15
+ spec.required_ruby_version = "~> 2.6"
16
+
15
17
  spec.files = `git ls-files -z`.split("\x0")
16
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
20
  spec.require_paths = ["lib"]
19
21
 
20
22
  spec.add_runtime_dependency "bunny", "~> 1.3"
21
- spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
23
+ spec.add_runtime_dependency "crack", "0.4.4"
24
+ spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
25
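+ # An indirect dependency (exact source not yet identified) sometimes requires public_suffix.
26
+ # public_suffix > 1.5 requires Ruby 2+. NOTE(review): required_ruby_version is now "~> 2.6", so this pin may be obsolete - confirm.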
27
+ spec.add_runtime_dependency "public_suffix", "~> 1.4.6"
28
+ spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
22
29
  spec.add_runtime_dependency "slop", "~> 3.6.0"
23
30
 
24
- spec.add_development_dependency "gem_publisher", "~> 1.3"
25
31
  spec.add_development_dependency "pry"
26
- spec.add_development_dependency "rake"
32
+ spec.add_development_dependency "rake", "~> 0.9"
27
33
  spec.add_development_dependency "rspec", "~> 3.0"
28
34
  spec.add_development_dependency "rspec-mocks", "~> 3.0"
29
35
  spec.add_development_dependency "webmock", "~> 1.18.0"
@@ -1,5 +1,4 @@
1
- require 'govuk_mirrorer/indexer'
2
- require 'govuk_mirrorer/statsd'
1
+ require 'sitemap-parser'
3
2
 
4
3
  module GovukSeedCrawler
5
4
  class Indexer
@@ -9,8 +8,9 @@ module GovukSeedCrawler
9
8
  raise "No site_root defined" unless site_root
10
9
 
11
10
  GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
12
- indexer = GovukMirrorer::Indexer.new(site_root)
13
- @urls = indexer.all_start_urls
11
+
12
+ sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
13
+ @urls = sitemap.to_a
14
14
 
15
15
  GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
16
16
  end
@@ -1,3 +1,3 @@
1
1
  module GovukSeedCrawler
2
- VERSION = "1.0.0"
2
+ VERSION = "3.0.0"
3
3
  end
@@ -1,20 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe GovukSeedCrawler::Indexer do
4
- subject { GovukSeedCrawler::Indexer.new('https://example.com/') }
4
+ subject { GovukSeedCrawler::Indexer.new('https://example.com') }
5
5
 
6
6
  context "under normal usage" do
7
- let(:mock_indexer) do
8
- double(:mock_indexer, :all_start_urls => [])
7
+ let(:mock_parser) do
8
+ double(:mock_parser, :to_a => [])
9
9
  end
10
10
 
11
11
  it "responds to Indexer#urls" do
12
- allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)
12
+ allow(SitemapParser).to receive(:new).and_return(mock_parser)
13
13
  expect(subject).to respond_to(:urls)
14
14
  end
15
15
 
16
- it "calls GovukMirrorer::Indexer with the site root" do
17
- expect(GovukMirrorer::Indexer).to receive(:new).with('https://example.com/').and_return(mock_indexer)
16
+ it "calls SitemapParser with the sitemap file" do
17
+ expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
18
18
  subject
19
19
  end
20
20
  end
@@ -2,36 +2,30 @@ require 'json'
2
2
  require 'spec_helper'
3
3
 
4
4
  describe GovukSeedCrawler do
5
- def stub_api_artefacts(count)
6
- item = {
7
- "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
8
- "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
9
- "title" => "Faster review of support for Renewable electricity to provide investor certainty",
10
- "format" => "announcement"
11
- }
12
- results = count.times.collect { item }
13
- response = {
14
- "_response_info" => {
15
- "status" => "ok",
16
- "links" => []
17
- },
18
- "total" => results.size,
19
- "start_index" => 1,
20
- "page_size" => 100,
21
- "current_page" => 1,
22
- "pages" => 1,
23
- "results" => results
5
+ def stub_sitemap
6
+ sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
7
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
8
+ <url>
9
+ <loc>https://www.gov.uk/</loc>
10
+ </url>
11
+ <url>
12
+ <loc>https://www.gov.uk/register-to-vote</loc>
13
+ </url>
14
+ <url>
15
+ <loc>https://www.gov.uk/help</loc>
16
+ </url>
17
+ </urlset>
24
18
  }
25
19
 
26
- stub_request(:get, "https://www.gov.uk//api/artefacts.json").
27
- to_return(:status => 200, :body => response.to_json, :headers => {})
20
+ stub_request(:get, "https://www.gov.uk/sitemap.xml").
21
+ to_return(:status => 200, :body => sitemap, :headers => {})
28
22
  end
29
23
 
30
24
  let(:vhost) { "/" }
31
25
  let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
32
26
  let(:queue_name) { "govuk_seed_crawler_integration_queue" }
33
27
  let(:topic) { "#" }
34
- let(:site_root) { "https://www.gov.uk/" }
28
+ let(:site_root) { "https://www.gov.uk" }
35
29
  let(:options) {{
36
30
  :host => ENV.fetch("AMQP_HOST", "localhost"),
37
31
  :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
57
51
  end
58
52
 
59
53
  it "publishes URLs it finds to an AMQP topic exchange" do
60
- stub_api_artefacts(10)
54
+ stub_sitemap
61
55
  subject
62
56
 
63
- # There's an extra 5 URLs from the Indexer class that are hard-coded.
64
- expect(@queue.message_count).to be(15)
57
+ expect(@queue.message_count).to be(3)
65
58
  end
66
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: govuk_seed_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
- - Matt Bostock
7
+ - GOV.UK developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-28 00:00:00.000000000 Z
11
+ date: 2021-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bunny
@@ -25,47 +25,75 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
- name: govuk_mirrorer
28
+ name: crack
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - "~>"
32
46
  - !ruby/object:Gem::Version
33
- version: 1.3.1
47
+ version: 1.6.0
34
48
  type: :runtime
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
52
  - - "~>"
39
53
  - !ruby/object:Gem::Version
40
- version: 1.3.1
54
+ version: 1.6.0
41
55
  - !ruby/object:Gem::Dependency
42
- name: slop
56
+ name: public_suffix
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: 3.6.0
61
+ version: 1.4.6
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: 3.6.0
68
+ version: 1.4.6
55
69
  - !ruby/object:Gem::Dependency
56
- name: gem_publisher
70
+ name: sitemap-parser
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '1.3'
62
- type: :development
75
+ version: 0.3.0
76
+ type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
80
  - - "~>"
67
81
  - !ruby/object:Gem::Version
68
- version: '1.3'
82
+ version: 0.3.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: slop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 3.6.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 3.6.0
69
97
  - !ruby/object:Gem::Dependency
70
98
  name: pry
71
99
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +112,16 @@ dependencies:
84
112
  name: rake
85
113
  requirement: !ruby/object:Gem::Requirement
86
114
  requirements:
87
- - - ">="
115
+ - - "~>"
88
116
  - !ruby/object:Gem::Version
89
- version: '0'
117
+ version: '0.9'
90
118
  type: :development
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
93
121
  requirements:
94
- - - ">="
122
+ - - "~>"
95
123
  - !ruby/object:Gem::Version
96
- version: '0'
124
+ version: '0.9'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: rspec
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +166,7 @@ dependencies:
138
166
  version: 1.18.0
139
167
  description:
140
168
  email:
141
- - matt.bostock@digital.cabinet-office.gov.uk
169
+ - govuk-dev@digital.cabinet-office.gov.uk
142
170
  executables:
143
171
  - seed-crawler
144
172
  extensions: []
@@ -148,14 +176,12 @@ files:
148
176
  - ".rspec"
149
177
  - ".ruby-version"
150
178
  - Gemfile
179
+ - Jenkinsfile
151
180
  - LICENSE.txt
152
181
  - README.md
153
182
  - Rakefile
154
183
  - bin/seed-crawler
155
184
  - govuk_seed_crawler.gemspec
156
- - jenkins-branches.sh
157
- - jenkins-tests.sh
158
- - jenkins.sh
159
185
  - lib/govuk_seed_crawler.rb
160
186
  - lib/govuk_seed_crawler/amqp_client.rb
161
187
  - lib/govuk_seed_crawler/cli_parser.rb
@@ -170,7 +196,7 @@ files:
170
196
  - spec/govuk_seed_crawler/seeder_spec.rb
171
197
  - spec/integration/govuk_seed_crawler_spec.rb
172
198
  - spec/spec_helper.rb
173
- homepage: https://github.gds/gds/govuk_seed_crawler
199
+ homepage: https://github.com/alphagov/govuk_seed_crawler
174
200
  licenses:
175
201
  - MIT
176
202
  metadata: {}
@@ -180,17 +206,16 @@ require_paths:
180
206
  - lib
181
207
  required_ruby_version: !ruby/object:Gem::Requirement
182
208
  requirements:
183
- - - ">="
209
+ - - "~>"
184
210
  - !ruby/object:Gem::Version
185
- version: '0'
211
+ version: '2.6'
186
212
  required_rubygems_version: !ruby/object:Gem::Requirement
187
213
  requirements:
188
214
  - - ">="
189
215
  - !ruby/object:Gem::Version
190
216
  version: '0'
191
217
  requirements: []
192
- rubyforge_project:
193
- rubygems_version: 2.2.2
218
+ rubygems_version: 3.0.3
194
219
  signing_key:
195
220
  specification_version: 4
196
221
  summary: Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ
data/jenkins-branches.sh DELETED
@@ -1,18 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
-
4
- [ -x .venv/bin/pip ] || virtualenv .venv
5
- . .venv/bin/activate
6
-
7
- pip install -q ghtools
8
-
9
- REPO="gds:gds/govuk_seed_crawler"
10
- gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
11
-
12
- if ./jenkins-tests.sh; then
13
- gh-status "$REPO" "$GIT_COMMIT" success -d "\"Build #${BUILD_NUMBER} succeeded on Jenkins\"" -u "$BUILD_URL" >/dev/null
14
- exit 0
15
- else
16
- gh-status "$REPO" "$GIT_COMMIT" failure -d "\"Build #${BUILD_NUMBER} failed on Jenkins\"" -u "$BUILD_URL" >/dev/null
17
- exit 1
18
- fi
data/jenkins-tests.sh DELETED
@@ -1,6 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
- rm -f Gemfile.lock
4
- bundle install --path "${HOME}/bundles/${JOB_NAME}"
5
- bundle exec rake
6
- bundle exec rake integration
data/jenkins.sh DELETED
@@ -1,5 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -e
3
-
4
- ./jenkins-tests.sh
5
- bundle exec rake publish_gem