govuk_seed_crawler 1.0.0 → 2.0.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
- data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
3
+ metadata.gz: 0f12b494cb9d0b5cb48495917d02f70caaec3e19
4
+ data.tar.gz: 58db84b76c22e4a80dee1c7528a8b02e59e40bd5
5
5
  SHA512:
6
- metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
- data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
6
+ metadata.gz: 6911991e1987cae4f9510c60988cd096339121257cfa2ee6ee585494e97e2ae660b692485c18a9efccfb300fd30f7d9206f76d65623b0f80ab43b7b99add35e1
7
+ data.tar.gz: 39befcb79fe0c2123b0047adb79292fc8b4d971d85f077d1f0e587538578c59d035e379317fab6e269b9dc3322d9a69892ab5a37d5e62c0c84e74b53e75515b9
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # GOV.UK: Seed the Crawler
2
2
 
3
- Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
3
+ This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
4
+ so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
4
5
 
5
6
  ## Installation
6
7
 
@@ -6,10 +6,10 @@ require 'govuk_seed_crawler/version'
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "govuk_seed_crawler"
8
8
  spec.version = GovukSeedCrawler::VERSION
9
- spec.authors = ["Matt Bostock"]
10
- spec.email = ["matt.bostock@digital.cabinet-office.gov.uk"]
9
+ spec.authors = ['GOV.UK developers']
10
+ spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
11
11
  spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
12
- spec.homepage = "https://github.gds/gds/govuk_seed_crawler"
12
+ spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
13
13
  spec.license = "MIT"
14
14
 
15
15
  spec.files = `git ls-files -z`.split("\x0")
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.require_paths = ["lib"]
19
19
 
20
20
  spec.add_runtime_dependency "bunny", "~> 1.3"
21
- spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
21
+ spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
22
22
  spec.add_runtime_dependency "slop", "~> 3.6.0"
23
23
 
24
24
  spec.add_development_dependency "gem_publisher", "~> 1.3"
data/jenkins-branches.sh CHANGED
@@ -6,7 +6,7 @@ set -e
6
6
 
7
7
  pip install -q ghtools
8
8
 
9
- REPO="gds:gds/govuk_seed_crawler"
9
+ REPO="alphagov/govuk_seed_crawler"
10
10
  gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
11
11
 
12
12
  if ./jenkins-tests.sh; then
@@ -1,5 +1,4 @@
1
- require 'govuk_mirrorer/indexer'
2
- require 'govuk_mirrorer/statsd'
1
+ require 'sitemap-parser'
3
2
 
4
3
  module GovukSeedCrawler
5
4
  class Indexer
@@ -9,8 +8,9 @@ module GovukSeedCrawler
9
8
  raise "No site_root defined" unless site_root
10
9
 
11
10
  GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
12
- indexer = GovukMirrorer::Indexer.new(site_root)
13
- @urls = indexer.all_start_urls
11
+
12
+ sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
13
+ @urls = sitemap.to_a
14
14
 
15
15
  GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
16
16
  end
@@ -1,3 +1,3 @@
1
1
  module GovukSeedCrawler
2
- VERSION = "1.0.0"
2
+ VERSION = "2.0.0"
3
3
  end
@@ -1,20 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe GovukSeedCrawler::Indexer do
4
- subject { GovukSeedCrawler::Indexer.new('https://example.com/') }
4
+ subject { GovukSeedCrawler::Indexer.new('https://example.com') }
5
5
 
6
6
  context "under normal usage" do
7
- let(:mock_indexer) do
8
- double(:mock_indexer, :all_start_urls => [])
7
+ let(:mock_parser) do
8
+ double(:mock_parser, :to_a => [])
9
9
  end
10
10
 
11
11
  it "responds to Indexer#urls" do
12
- allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)
12
+ allow(SitemapParser).to receive(:new).and_return(mock_parser)
13
13
  expect(subject).to respond_to(:urls)
14
14
  end
15
15
 
16
- it "calls GovukMirrorer::Indexer with the site root" do
17
- expect(GovukMirrorer::Indexer).to receive(:new).with('https://example.com/').and_return(mock_indexer)
16
+ it "calls SitemapParser with the sitemap file" do
17
+ expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
18
18
  subject
19
19
  end
20
20
  end
@@ -2,36 +2,30 @@ require 'json'
2
2
  require 'spec_helper'
3
3
 
4
4
  describe GovukSeedCrawler do
5
- def stub_api_artefacts(count)
6
- item = {
7
- "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
8
- "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
9
- "title" => "Faster review of support for Renewable electricity to provide investor certainty",
10
- "format" => "announcement"
11
- }
12
- results = count.times.collect { item }
13
- response = {
14
- "_response_info" => {
15
- "status" => "ok",
16
- "links" => []
17
- },
18
- "total" => results.size,
19
- "start_index" => 1,
20
- "page_size" => 100,
21
- "current_page" => 1,
22
- "pages" => 1,
23
- "results" => results
5
+ def stub_sitemap
6
+ sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
7
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
8
+ <url>
9
+ <loc>https://www.gov.uk/</loc>
10
+ </url>
11
+ <url>
12
+ <loc>https://www.gov.uk/register-to-vote</loc>
13
+ </url>
14
+ <url>
15
+ <loc>https://www.gov.uk/help</loc>
16
+ </url>
17
+ </urlset>
24
18
  }
25
19
 
26
- stub_request(:get, "https://www.gov.uk//api/artefacts.json").
27
- to_return(:status => 200, :body => response.to_json, :headers => {})
20
+ stub_request(:get, "https://www.gov.uk/sitemap.xml").
21
+ to_return(:status => 200, :body => sitemap, :headers => {})
28
22
  end
29
23
 
30
24
  let(:vhost) { "/" }
31
25
  let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
32
26
  let(:queue_name) { "govuk_seed_crawler_integration_queue" }
33
27
  let(:topic) { "#" }
34
- let(:site_root) { "https://www.gov.uk/" }
28
+ let(:site_root) { "https://www.gov.uk" }
35
29
  let(:options) {{
36
30
  :host => ENV.fetch("AMQP_HOST", "localhost"),
37
31
  :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
57
51
  end
58
52
 
59
53
  it "publishes URLs it finds to an AMQP topic exchange" do
60
- stub_api_artefacts(10)
54
+ stub_sitemap
61
55
  subject
62
56
 
63
- # There's an extra 5 URLs from the Indexer class that are hard-coded.
64
- expect(@queue.message_count).to be(15)
57
+ expect(@queue.message_count).to be(3)
65
58
  end
66
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: govuk_seed_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
- - Matt Bostock
7
+ - GOV.UK developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-28 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bunny
@@ -25,19 +25,19 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
- name: govuk_mirrorer
28
+ name: sitemap-parser
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.3.1
33
+ version: 0.3.0
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.3.1
40
+ version: 0.3.0
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: slop
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ dependencies:
138
138
  version: 1.18.0
139
139
  description:
140
140
  email:
141
- - matt.bostock@digital.cabinet-office.gov.uk
141
+ - govuk-dev@digital.cabinet-office.gov.uk
142
142
  executables:
143
143
  - seed-crawler
144
144
  extensions: []
@@ -170,7 +170,7 @@ files:
170
170
  - spec/govuk_seed_crawler/seeder_spec.rb
171
171
  - spec/integration/govuk_seed_crawler_spec.rb
172
172
  - spec/spec_helper.rb
173
- homepage: https://github.gds/gds/govuk_seed_crawler
173
+ homepage: https://github.com/alphagov/govuk_seed_crawler
174
174
  licenses:
175
175
  - MIT
176
176
  metadata: {}