govuk_seed_crawler 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/govuk_seed_crawler.gemspec +4 -4
- data/jenkins-branches.sh +1 -1
- data/lib/govuk_seed_crawler/indexer.rb +4 -4
- data/lib/govuk_seed_crawler/version.rb +1 -1
- data/spec/govuk_seed_crawler/indexer_spec.rb +6 -6
- data/spec/integration/govuk_seed_crawler_spec.rb +18 -25
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f12b494cb9d0b5cb48495917d02f70caaec3e19
|
4
|
+
data.tar.gz: 58db84b76c22e4a80dee1c7528a8b02e59e40bd5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6911991e1987cae4f9510c60988cd096339121257cfa2ee6ee585494e97e2ae660b692485c18a9efccfb300fd30f7d9206f76d65623b0f80ab43b7b99add35e1
|
7
|
+
data.tar.gz: 39befcb79fe0c2123b0047adb79292fc8b4d971d85f077d1f0e587538578c59d035e379317fab6e269b9dc3322d9a69892ab5a37d5e62c0c84e74b53e75515b9
|
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# GOV.UK: Seed the Crawler
|
2
2
|
|
3
|
-
|
3
|
+
This gem retrieves a list of seed URLs from the GOV.UK sitemap and adds them to RabbitMQ
|
4
|
+
so that the [crawler](https://github.com/alphagov/govuk_crawler_worker) can consume them.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
data/govuk_seed_crawler.gemspec
CHANGED
@@ -6,10 +6,10 @@ require 'govuk_seed_crawler/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "govuk_seed_crawler"
|
8
8
|
spec.version = GovukSeedCrawler::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = ["
|
9
|
+
spec.authors = ['GOV.UK developers']
|
10
|
+
spec.email = ["govuk-dev@digital.cabinet-office.gov.uk"]
|
11
11
|
spec.summary = %q{Retrieves a list of URLs to seed the crawler by publishing them to a RabbitMQ exchange.}
|
12
|
-
spec.homepage = "https://github.
|
12
|
+
spec.homepage = "https://github.com/alphagov/govuk_seed_crawler"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
20
|
spec.add_runtime_dependency "bunny", "~> 1.3"
|
21
|
-
spec.add_runtime_dependency "
|
21
|
+
spec.add_runtime_dependency "sitemap-parser", "~> 0.3.0"
|
22
22
|
spec.add_runtime_dependency "slop", "~> 3.6.0"
|
23
23
|
|
24
24
|
spec.add_development_dependency "gem_publisher", "~> 1.3"
|
data/jenkins-branches.sh
CHANGED
@@ -6,7 +6,7 @@ set -e
|
|
6
6
|
|
7
7
|
pip install -q ghtools
|
8
8
|
|
9
|
-
REPO="
|
9
|
+
REPO="alphagov/govuk_seed_crawler"
|
10
10
|
gh-status "$REPO" "$GIT_COMMIT" pending -d "\"Build #${BUILD_NUMBER} is running on Jenkins\"" -u "$BUILD_URL" >/dev/null
|
11
11
|
|
12
12
|
if ./jenkins-tests.sh; then
|
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'govuk_mirrorer/statsd'
|
1
|
+
require 'sitemap-parser'
|
3
2
|
|
4
3
|
module GovukSeedCrawler
|
5
4
|
class Indexer
|
@@ -9,8 +8,9 @@ module GovukSeedCrawler
|
|
9
8
|
raise "No site_root defined" unless site_root
|
10
9
|
|
11
10
|
GovukSeedCrawler.logger.info("Retrieving list of URLs for #{site_root}")
|
12
|
-
|
13
|
-
|
11
|
+
|
12
|
+
sitemap = SitemapParser.new("#{site_root}/sitemap.xml", {recurse: true})
|
13
|
+
@urls = sitemap.to_a
|
14
14
|
|
15
15
|
GovukSeedCrawler.logger.info("Found #{@urls.count} URLs")
|
16
16
|
end
|
@@ -1,20 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe GovukSeedCrawler::Indexer do
|
4
|
-
subject { GovukSeedCrawler::Indexer.new('https://example.com
|
4
|
+
subject { GovukSeedCrawler::Indexer.new('https://example.com') }
|
5
5
|
|
6
6
|
context "under normal usage" do
|
7
|
-
let(:
|
8
|
-
double(:
|
7
|
+
let(:mock_parser) do
|
8
|
+
double(:mock_parser, :to_a => [])
|
9
9
|
end
|
10
10
|
|
11
11
|
it "responds to Indexer#urls" do
|
12
|
-
allow(
|
12
|
+
allow(SitemapParser).to receive(:new).and_return(mock_parser)
|
13
13
|
expect(subject).to respond_to(:urls)
|
14
14
|
end
|
15
15
|
|
16
|
-
it "calls
|
17
|
-
expect(
|
16
|
+
it "calls SitemapParser with the sitemap file" do
|
17
|
+
expect(SitemapParser).to receive(:new).with('https://example.com/sitemap.xml', {:recurse => true}).and_return(mock_parser)
|
18
18
|
subject
|
19
19
|
end
|
20
20
|
end
|
@@ -2,36 +2,30 @@ require 'json'
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe GovukSeedCrawler do
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
"total" => results.size,
|
19
|
-
"start_index" => 1,
|
20
|
-
"page_size" => 100,
|
21
|
-
"current_page" => 1,
|
22
|
-
"pages" => 1,
|
23
|
-
"results" => results
|
5
|
+
def stub_sitemap
|
6
|
+
sitemap = %{<?xml version="1.0" encoding="UTF-8"?>
|
7
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
8
|
+
<url>
|
9
|
+
<loc>https://www.gov.uk/</loc>
|
10
|
+
</url>
|
11
|
+
<url>
|
12
|
+
<loc>https://www.gov.uk/register-to-vote</loc>
|
13
|
+
</url>
|
14
|
+
<url>
|
15
|
+
<loc>https://www.gov.uk/help</loc>
|
16
|
+
</url>
|
17
|
+
</urlset>
|
24
18
|
}
|
25
19
|
|
26
|
-
stub_request(:get, "https://www.gov.uk
|
27
|
-
to_return(:status => 200, :body =>
|
20
|
+
stub_request(:get, "https://www.gov.uk/sitemap.xml").
|
21
|
+
to_return(:status => 200, :body => sitemap, :headers => {})
|
28
22
|
end
|
29
23
|
|
30
24
|
let(:vhost) { "/" }
|
31
25
|
let(:exchange_name) { "govuk_seed_crawler_integration_exchange" }
|
32
26
|
let(:queue_name) { "govuk_seed_crawler_integration_queue" }
|
33
27
|
let(:topic) { "#" }
|
34
|
-
let(:site_root) { "https://www.gov.uk
|
28
|
+
let(:site_root) { "https://www.gov.uk" }
|
35
29
|
let(:options) {{
|
36
30
|
:host => ENV.fetch("AMQP_HOST", "localhost"),
|
37
31
|
:user => ENV.fetch("AMQP_USER", "govuk_seed_crawler"),
|
@@ -57,10 +51,9 @@ describe GovukSeedCrawler do
|
|
57
51
|
end
|
58
52
|
|
59
53
|
it "publishes URLs it finds to an AMQP topic exchange" do
|
60
|
-
|
54
|
+
stub_sitemap
|
61
55
|
subject
|
62
56
|
|
63
|
-
|
64
|
-
expect(@queue.message_count).to be(15)
|
57
|
+
expect(@queue.message_count).to be(3)
|
65
58
|
end
|
66
59
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govuk_seed_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- GOV.UK developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bunny
|
@@ -25,19 +25,19 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: sitemap-parser
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 0.3.0
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.3.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: slop
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,7 +138,7 @@ dependencies:
|
|
138
138
|
version: 1.18.0
|
139
139
|
description:
|
140
140
|
email:
|
141
|
-
-
|
141
|
+
- govuk-dev@digital.cabinet-office.gov.uk
|
142
142
|
executables:
|
143
143
|
- seed-crawler
|
144
144
|
extensions: []
|
@@ -170,7 +170,7 @@ files:
|
|
170
170
|
- spec/govuk_seed_crawler/seeder_spec.rb
|
171
171
|
- spec/integration/govuk_seed_crawler_spec.rb
|
172
172
|
- spec/spec_helper.rb
|
173
|
-
homepage: https://github.
|
173
|
+
homepage: https://github.com/alphagov/govuk_seed_crawler
|
174
174
|
licenses:
|
175
175
|
- MIT
|
176
176
|
metadata: {}
|