govuk_seed_crawler 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/govuk_seed_crawler.gemspec +4 -4
- data/jenkins-branches.sh +1 -1
- data/lib/govuk_seed_crawler/indexer.rb +4 -4
- data/lib/govuk_seed_crawler/version.rb +1 -1
- data/spec/govuk_seed_crawler/indexer_spec.rb +6 -6
- data/spec/integration/govuk_seed_crawler_spec.rb +18 -25
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f12b494cb9d0b5cb48495917d02f70caaec3e19
|
4
|
+
data.tar.gz: 58db84b76c22e4a80dee1c7528a8b02e59e40bd5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6911991e1987cae4f9510c60988cd096339121257cfa2ee6ee585494e97e2ae660b692485c18a9efccfb300fd30f7d9206f76d65623b0f80ab43b7b99add35e1
|
7
|
+
data.tar.gz: 39befcb79fe0c2123b0047adb79292fc8b4d971d85f077d1f0e587538578c59d035e379317fab6e269b9dc3322d9a69892ab5a37d5e62c0c84e74b53e75515b9
|
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# GOV.UK: Seed the Crawler
|
2
2
|
|
3
|
-
|
3
|
+
This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
|
4
|
+
so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
data/govuk_seed_crawler.gemspec
CHANGED
@@ -6,10 +6,10 @@ require 'govuk_seed_crawler/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "govuk_seed_crawler"
|
8
8
|
spec.version = GovukSeedCrawler::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = ["
|
9
|
+
spec.authors = ['GOV.UK developers']
|
10
|
+
spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
|
11
11
|
spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
|
12
|
-
spec.homepage = "https://github.
|
12
|
+
spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
20
|
spec.add_runtime_dependency "bunny", "~> 1.3"
|
21
|
-
spec.add_runtime_dependency "
|
21
|
+
spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
|
22
22
|
spec.add_runtime_dependency "slop", "~> 3.6.0"
|
23
23
|
|
24
24
|
spec.add_development_dependency "gem_publisher", "~> 1.3"
|
data/jenkins-branches.sh
CHANGED
@@ -6,7 +6,7 @@ set -e
|
|
6
6
|
|
7
7
|
pip install -q ghtools
|
8
8
|
|
9
|
-
REPO="
|
9
|
+
REPO="alphagov/govuk_seed_crawler"
|
10
10
|
gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
11
11
|
|
12
12
|
if ./jenkins-tests.sh; then
|
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'govuk_mirrorer/statsd'
|
1
|
+
require 'sitemap-parser'
|
3
2
|
|
4
3
|
module GovukSeedCrawler
|
5
4
|
class Indexer
|
@@ -9,8 +8,9 @@ module GovukSeedCrawler
|
|
9
8
|
raise "No site_root defined" unless site_root
|
10
9
|
|
11
10
|
GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
|
12
|
-
|
13
|
-
|
11
|
+
|
12
|
+
sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
|
13
|
+
@urls = sitemap.to_a
|
14
14
|
|
15
15
|
GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
|
16
16
|
end
|
@@ -1,20 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe GovukSeedCrawler::Indexer do
|
4
|
-
subject { GovukSeedCrawler::Indexer.new('https://example.com
|
4
|
+
subject { GovukSeedCrawler::Indexer.new('https://example.com') }
|
5
5
|
|
6
6
|
context "under normal usage" do
|
7
|
-
let(:
|
8
|
-
double(:
|
7
|
+
let(:mock_parser) do
|
8
|
+
double(:mock_parser, :to_a => [])
|
9
9
|
end
|
10
10
|
|
11
11
|
it "responds to Indexer#urls" do
|
12
|
-
allow(
|
12
|
+
allow(SitemapParser).to receive(:new).and_return(mock_parser)
|
13
13
|
expect(subject).to respond_to(:urls)
|
14
14
|
end
|
15
15
|
|
16
|
-
it "calls
|
17
|
-
expect(
|
16
|
+
it "calls SitemapParser with the sitemap file" do
|
17
|
+
expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
|
18
18
|
subject
|
19
19
|
end
|
20
20
|
end
|
@@ -2,36 +2,30 @@ require 'json'
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe GovukSeedCrawler do
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
"total" => results.size,
|
19
|
-
"start_index" => 1,
|
20
|
-
"page_size" => 100,
|
21
|
-
"current_page" => 1,
|
22
|
-
"pages" => 1,
|
23
|
-
"results" => results
|
5
|
+
def stub_sitemap
|
6
|
+
sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
|
7
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
8
|
+
<url>
|
9
|
+
<loc>https://www.gov.uk/</loc>
|
10
|
+
</url>
|
11
|
+
<url>
|
12
|
+
<loc>https://www.gov.uk/register-to-vote</loc>
|
13
|
+
</url>
|
14
|
+
<url>
|
15
|
+
<loc>https://www.gov.uk/help</loc>
|
16
|
+
</url>
|
17
|
+
</urlset>
|
24
18
|
}
|
25
19
|
|
26
|
-
stub_request(:get, "https://www.gov.uk
|
27
|
-
to_return(:status => 200, :body =>
|
20
|
+
stub_request(:get, "https://www.gov.uk/sitemap.xml").
|
21
|
+
to_return(:status => 200, :body => sitemap, :headers => {})
|
28
22
|
end
|
29
23
|
|
30
24
|
let(:vhost) { "/" }
|
31
25
|
let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
|
32
26
|
let(:queue_name) { "govuk_seed_crawler_integration_queue" }
|
33
27
|
let(:topic) { "#" }
|
34
|
-
let(:site_root) { "https://www.gov.uk
|
28
|
+
let(:site_root) { "https://www.gov.uk" }
|
35
29
|
let(:options) {{
|
36
30
|
:host => ENV.fetch("AMQP_HOST", "localhost"),
|
37
31
|
:user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
|
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
|
|
57
51
|
end
|
58
52
|
|
59
53
|
it "publishes URLs it finds to an AMQP topic exchange" do
|
60
|
-
|
54
|
+
stub_sitemap
|
61
55
|
subject
|
62
56
|
|
63
|
-
|
64
|
-
expect(@queue.message_count).to be(15)
|
57
|
+
expect(@queue.message_count).to be(3)
|
65
58
|
end
|
66
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govuk_seed_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- GOV.UK developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bunny
|
@@ -25,19 +25,19 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: sitemap-parser
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 0.3.0
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.3.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: slop
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,7 +138,7 @@ dependencies:
|
|
138
138
|
version: 1.18.0
|
139
139
|
description:
|
140
140
|
email:
|
141
|
-
-
|
141
|
+
- govuk-dev@digital.cabinet-office.gov.uk
|
142
142
|
executables:
|
143
143
|
- seed-crawler
|
144
144
|
extensions: []
|
@@ -170,7 +170,7 @@ files:
|
|
170
170
|
- spec/govuk_seed_crawler/seeder_spec.rb
|
171
171
|
- spec/integration/govuk_seed_crawler_spec.rb
|
172
172
|
- spec/spec_helper.rb
|
173
|
-
homepage: https://github.
|
173
|
+
homepage: https://github.com/alphagov/govuk_seed_crawler
|
174
174
|
licenses:
|
175
175
|
- MIT
|
176
176
|
metadata: {}
|