govuk_seed_crawler 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a37f428070681bc4ca2466497df0b69f45fed94a
4
- data.tar.gz: 55bdafe5ade9251f6f630eeb490e481a9796fc4f
3
+ metadata.gz: 0f12b494cb9d0b5cb48495917d02f70caaec3e19
4
+ data.tar.gz: 58db84b76c22e4a80dee1c7528a8b02e59e40bd5
5
5
  SHA512:
6
- metadata.gz: 631f38d96a7d1ea301b38e761d5c55debbb9ee0c99a8e2a88ef1bb965b12637eceaaba70dfbd13941e4b96c088781fc6da3f724e283bfe8afc6d3dd8f0732321
7
- data.tar.gz: 2f8b41afecdaba199b32925b13804c6dce588535b611a01383753dac2a22a655e728a9b77cf529845e0b6df1738bdca716c0eac5a5a4032e70e61d7d94c9cd82
6
+ metadata.gz: 6911991e1987cae4f9510c60988cd096339121257cfa2ee6ee585494e97e2ae660b692485c18a9efccfb300fd30f7d9206f76d65623b0f80ab43b7b99add35e1
7
+ data.tar.gz: 39befcb79fe0c2123b0047adb79292fc8b4d971d85f077d1f0e587538578c59d035e379317fab6e269b9dc3322d9a69892ab5a37d5e62c0c84e74b53e75515b9
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # GOV.UK: Seed the Crawler
2
2
 
3
- Retrieves a list of URLs to seed the [crawler](https://github.com/alphagov/govuk_crawler_worker) by publishing them to a RabbitMQ exchange.
3
+ This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
4
+ so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
4
5
 
5
6
  ## Installation
6
7
 
@@ -6,10 +6,10 @@ require 'govuk_seed_crawler/version'
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = "govuk_seed_crawler"
8
8
  spec.version = GovukSeedCrawler::VERSION
9
- spec.authors = ["Matt Bostock"]
10
- spec.email = ["matt.bostock@digital.cabinet-office.gov.uk"]
9
+ spec.authors = ['GOV.UK developers']
10
+ spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
11
11
  spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
12
- spec.homepage = "https://github.gds/gds/govuk_seed_crawler"
12
+ spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
13
13
  spec.license = "MIT"
14
14
 
15
15
  spec.files = `git ls-files -z`.split("\x0")
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.require_paths = ["lib"]
19
19
 
20
20
  spec.add_runtime_dependency "bunny", "~> 1.3"
21
- spec.add_runtime_dependency "govuk_mirrorer", "~> 1.3.1"
21
+ spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
22
22
  spec.add_runtime_dependency "slop", "~> 3.6.0"
23
23
 
24
24
  spec.add_development_dependency "gem_publisher", "~> 1.3"
data/jenkins-branches.sh CHANGED
@@ -6,7 +6,7 @@ set -e
6
6
 
7
7
  pip install -q ghtools
8
8
 
9
- REPO="gds:gds/govuk_seed_crawler"
9
+ REPO="alphagov/govuk_seed_crawler"
10
10
  gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
11
11
 
12
12
  if ./jenkins-tests.sh; then
@@ -1,5 +1,4 @@
1
- require 'govuk_mirrorer/indexer'
2
- require 'govuk_mirrorer/statsd'
1
+ require 'sitemap-parser'
3
2
 
4
3
  module GovukSeedCrawler
5
4
  class Indexer
@@ -9,8 +8,9 @@ module GovukSeedCrawler
9
8
  raise "No site_root defined" unless site_root
10
9
 
11
10
  GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
12
- indexer = GovukMirrorer::Indexer.new(site_root)
13
- @urls = indexer.all_start_urls
11
+
12
+ sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
13
+ @urls = sitemap.to_a
14
14
 
15
15
  GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
16
16
  end
@@ -1,3 +1,3 @@
1
1
  module GovukSeedCrawler
2
- VERSION = "1.0.0"
2
+ VERSION = "2.0.0"
3
3
  end
@@ -1,20 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe GovukSeedCrawler::Indexer do
4
- subject { GovukSeedCrawler::Indexer.new('https://example.com/') }
4
+ subject { GovukSeedCrawler::Indexer.new('https://example.com') }
5
5
 
6
6
  context "under normal usage" do
7
- let(:mock_indexer) do
8
- double(:mock_indexer, :all_start_urls => [])
7
+ let(:mock_parser) do
8
+ double(:mock_parser, :to_a => [])
9
9
  end
10
10
 
11
11
  it "responds to Indexer#urls" do
12
- allow(GovukMirrorer::Indexer).to receive(:new).and_return(mock_indexer)
12
+ allow(SitemapParser).to receive(:new).and_return(mock_parser)
13
13
  expect(subject).to respond_to(:urls)
14
14
  end
15
15
 
16
- it "calls GovukMirrorer::Indexer with the site root" do
17
- expect(GovukMirrorer::Indexer).to receive(:new).with('https://example.com/').and_return(mock_indexer)
16
+ it "calls SitemapParser with the sitemap file" do
17
+ expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
18
18
  subject
19
19
  end
20
20
  end
@@ -2,36 +2,30 @@ require 'json'
2
2
  require 'spec_helper'
3
3
 
4
4
  describe GovukSeedCrawler do
5
- def stub_api_artefacts(count)
6
- item = {
7
- "id" => "https://www.gov.uk/api/government%2Fnews%2Ffaster-review-of-support-for-renewable-electricity-to-provide-investor-certainty.json",
8
- "web_url" => "https://www.gov.uk/government/news/faster-review-of-support-for-renewable-electricity-to-provide-investor-certainty",
9
- "title" => "Faster review of support for Renewable electricity to provide investor certainty",
10
- "format" => "announcement"
11
- }
12
- results = count.times.collect { item }
13
- response = {
14
- "_response_info" => {
15
- "status" => "ok",
16
- "links" => []
17
- },
18
- "total" => results.size,
19
- "start_index" => 1,
20
- "page_size" => 100,
21
- "current_page" => 1,
22
- "pages" => 1,
23
- "results" => results
5
+ def stub_sitemap
6
+ sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
7
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
8
+ <url>
9
+ <loc>https://www.gov.uk/</loc>
10
+ </url>
11
+ <url>
12
+ <loc>https://www.gov.uk/register-to-vote</loc>
13
+ </url>
14
+ <url>
15
+ <loc>https://www.gov.uk/help</loc>
16
+ </url>
17
+ </urlset>
24
18
  }
25
19
 
26
- stub_request(:get, "https://www.gov.uk//api/artefacts.json").
27
- to_return(:status => 200, :body => response.to_json, :headers => {})
20
+ stub_request(:get, "https://www.gov.uk/sitemap.xml").
21
+ to_return(:status => 200, :body => sitemap, :headers => {})
28
22
  end
29
23
 
30
24
  let(:vhost) { "/" }
31
25
  let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
32
26
  let(:queue_name) { "govuk_seed_crawler_integration_queue" }
33
27
  let(:topic) { "#" }
34
- let(:site_root) { "https://www.gov.uk/" }
28
+ let(:site_root) { "https://www.gov.uk" }
35
29
  let(:options) {{
36
30
  :host => ENV.fetch("AMQP_HOST", "localhost"),
37
31
  :user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
57
51
  end
58
52
 
59
53
  it "publishes URLs it finds to an AMQP topic exchange" do
60
- stub_api_artefacts(10)
54
+ stub_sitemap
61
55
  subject
62
56
 
63
- # There's an extra 5 URLs from the Indexer class that are hard-coded.
64
- expect(@queue.message_count).to be(15)
57
+ expect(@queue.message_count).to be(3)
65
58
  end
66
59
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: govuk_seed_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
- - Matt Bostock
7
+ - GOV.UK developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-28 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bunny
@@ -25,19 +25,19 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
- name: govuk_mirrorer
28
+ name: sitemap-parser
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.3.1
33
+ version: 0.3.0
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.3.1
40
+ version: 0.3.0
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: slop
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ dependencies:
138
138
  version: 1.18.0
139
139
  description:
140
140
  email:
141
- - matt.bostock@digital.cabinet-office.gov.uk
141
+ - govuk-dev@digital.cabinet-office.gov.uk
142
142
  executables:
143
143
  - seed-crawler
144
144
  extensions: []
@@ -170,7 +170,7 @@ files:
170
170
  - spec/govuk_seed_crawler/seeder_spec.rb
171
171
  - spec/integration/govuk_seed_crawler_spec.rb
172
172
  - spec/spec_helper.rb
173
- homepage: https://github.gds/gds/govuk_seed_crawler
173
+ homepage: https://github.com/alphagov/govuk_seed_crawler
174
174
  licenses:
175
175
  - MIT
176
176
  metadata: {}