wayfarer 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +29 -2
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +17 -0
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -31
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -42
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -26
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
@@ -3,85 +3,68 @@
|
|
3
3
|
require "spec_helpers"
|
4
4
|
|
5
5
|
describe Wayfarer::Middleware::Dedup, redis: true do
|
6
|
-
let(:task) { build(:task) }
|
6
|
+
let(:task) { build(:task, :redis_pool) }
|
7
|
+
let(:uri) { Addressable::URI.parse(task.url) }
|
8
|
+
let(:executions) { 1 }
|
9
|
+
let(:job) { double(executions: executions) }
|
7
10
|
subject { described_class.new }
|
8
11
|
|
9
|
-
before
|
12
|
+
before do
|
13
|
+
task[:job] = job
|
14
|
+
task[:controller] = job
|
15
|
+
task[:normalized_url] = task.url
|
16
|
+
end
|
10
17
|
|
11
18
|
describe "#call" do
|
12
|
-
|
13
|
-
|
19
|
+
it "assigns barrier" do
|
20
|
+
expect { subject.call(task) }.to change { task[:barrier] }.from(nil).to(instance_of(Wayfarer::Redis::Barrier))
|
21
|
+
end
|
14
22
|
|
15
|
-
|
16
|
-
|
17
|
-
|
23
|
+
it "assigns barrier for batch" do
|
24
|
+
subject.call(task)
|
25
|
+
|
26
|
+
expect(task[:barrier].task).to be(task)
|
27
|
+
end
|
18
28
|
|
19
|
-
|
29
|
+
context "with retry" do
|
30
|
+
let(:executions) { 2 }
|
31
|
+
|
32
|
+
specify do
|
20
33
|
expect { |spy| subject.call(task, &spy) }.to yield_control
|
21
34
|
end
|
22
|
-
end
|
23
35
|
|
24
|
-
|
25
|
-
|
26
|
-
|
36
|
+
specify do
|
37
|
+
expect(Wayfarer::Logging.logger).to receive(:add).with(Logger::DEBUG, "Not deduplicating retry")
|
38
|
+
|
39
|
+
subject.call(task)
|
27
40
|
end
|
28
41
|
end
|
29
42
|
|
30
|
-
context "
|
31
|
-
|
32
|
-
expect {
|
33
|
-
subject.call(task)
|
34
|
-
}.to change { task.barrier.seen?(task.url) }.to(true)
|
35
|
-
end
|
43
|
+
context "when rerouted" do
|
44
|
+
before { task[:controller] = Object.new }
|
36
45
|
|
37
|
-
|
46
|
+
specify do
|
38
47
|
expect { |spy| subject.call(task, &spy) }.to yield_control
|
39
48
|
end
|
40
|
-
end
|
41
49
|
|
42
|
-
|
43
|
-
|
50
|
+
specify do
|
51
|
+
expect(Wayfarer::Logging.logger).to receive(:add).with(Logger::DEBUG, "Not deduplicating rerouted task")
|
44
52
|
|
45
|
-
|
46
|
-
expect { |spy| subject.call(task, &spy) }.not_to yield_control
|
53
|
+
subject.call(task)
|
47
54
|
end
|
48
55
|
end
|
49
56
|
|
50
|
-
context "with
|
51
|
-
|
52
|
-
|
53
|
-
subject.call(task) { raise }
|
54
|
-
rescue StandardError
|
55
|
-
nil
|
56
|
-
end
|
57
|
-
expect(task.barrier.seen?(task.url)).to be(false)
|
58
|
-
end
|
59
|
-
|
60
|
-
it "re-raises the exception" do
|
61
|
-
expect {
|
62
|
-
subject.call(task) { raise }
|
63
|
-
}.to raise_error(RuntimeError)
|
57
|
+
context "with unchecked URL" do
|
58
|
+
specify do
|
59
|
+
expect { |spy| subject.call(task, &spy) }.to yield_control
|
64
60
|
end
|
65
61
|
end
|
66
62
|
|
67
|
-
|
68
|
-
|
69
|
-
let(:unseen_urls) { %w[https://w3c.org https://nasa.gov] }
|
63
|
+
context "with checked URL" do
|
64
|
+
before { Wayfarer::Redis::Barrier.new(task).check!(task[:normalized_url]) }
|
70
65
|
|
71
|
-
|
72
|
-
|
73
|
-
task.barrier.seen?(url)
|
74
|
-
end
|
75
|
-
|
76
|
-
[*seen_urls, *unseen_urls].each do |url|
|
77
|
-
task.metadata.staged_urls.add(url)
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
it "filters seen staged URLs" do
|
82
|
-
expect {
|
83
|
-
subject.call(task)
|
84
|
-
}.to change { task.metadata.staged_urls }.to(SortedSet.new(unseen_urls))
|
66
|
+
specify do
|
67
|
+
expect { |spy| subject.call(task, &spy) }.not_to yield_control
|
85
68
|
end
|
86
69
|
end
|
87
70
|
end
|
@@ -8,27 +8,27 @@ describe Wayfarer::Middleware::Dispatch do
|
|
8
8
|
subject(:chain) { described_class.new }
|
9
9
|
|
10
10
|
before do
|
11
|
-
task
|
12
|
-
task
|
11
|
+
task[:controller] = spy
|
12
|
+
task[:action] = action
|
13
13
|
|
14
|
-
allow(task
|
14
|
+
allow(task[:controller]).to receive(:run_callbacks).and_yield
|
15
15
|
end
|
16
16
|
|
17
17
|
describe "#call" do
|
18
18
|
it "runs callbacks" do
|
19
|
-
expect(task
|
19
|
+
expect(task[:controller]).to receive(:run_callbacks).with(action)
|
20
20
|
subject.call(task)
|
21
21
|
end
|
22
22
|
|
23
23
|
context "when action is a Symbol" do
|
24
24
|
it "calls the method" do
|
25
|
-
expect(task
|
25
|
+
expect(task[:controller]).to receive(action)
|
26
26
|
subject.call(task)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
context "with
|
31
|
-
let(:action) { Class.new }
|
30
|
+
context "with handler" do
|
31
|
+
let(:action) { Class.new.include(Wayfarer::Handler) }
|
32
32
|
|
33
33
|
it "instantiates and calls" do
|
34
34
|
expect_any_instance_of(action).to receive(:call).with(task)
|
@@ -36,6 +36,22 @@ describe Wayfarer::Middleware::Dispatch do
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
|
+
context "without action" do
|
40
|
+
let(:action) { nil }
|
41
|
+
|
42
|
+
it "instantiates and calls" do
|
43
|
+
expect { subject.call(task) }.to raise_error(ArgumentError)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
context "without other action" do
|
48
|
+
let(:action) { Class.new }
|
49
|
+
|
50
|
+
it "instantiates and calls" do
|
51
|
+
expect { subject.call(task) }.to raise_error(ArgumentError)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
39
55
|
it "yields" do
|
40
56
|
expect { |spy| subject.call(task, &spy) }.to yield_control
|
41
57
|
end
|
@@ -4,26 +4,57 @@ require "spec_helpers"
|
|
4
4
|
|
5
5
|
describe Wayfarer::Middleware::Normalize do
|
6
6
|
let(:task) { build(:task) }
|
7
|
+
let(:uri) { Addressable::URI.parse(url) }
|
7
8
|
subject { described_class.new }
|
8
9
|
|
9
|
-
describe "
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
describe "::normalize" do
|
11
|
+
subject { described_class.normalize(uri).to_s }
|
12
|
+
|
13
|
+
context "without HTTP(S) URL" do
|
14
|
+
let(:url) { "localhost:3000" }
|
15
|
+
|
16
|
+
it { is_expected.to eq(url) }
|
16
17
|
end
|
17
18
|
|
18
|
-
|
19
|
+
context "with HTTP URL" do
|
20
|
+
let(:url) { "http://example.com" }
|
19
21
|
|
20
|
-
|
21
|
-
expect { |spy| subject.call(task, &spy) }.to yield_control
|
22
|
+
it { is_expected.to eq("http://example.com/") }
|
22
23
|
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe "#call" do
|
27
|
+
let(:url) { task.url }
|
28
|
+
before { task[:uri] = uri }
|
29
|
+
|
30
|
+
context "when already assigned" do
|
31
|
+
before { task[:normalized_url] = Object.new }
|
32
|
+
|
33
|
+
specify do
|
34
|
+
expect { |spy| subject.call(task, &spy) }.to yield_control
|
35
|
+
end
|
36
|
+
|
37
|
+
it "doesn't normalize URL" do
|
38
|
+
expect { subject.call(task) }.not_to(change { task[:normalized_url] })
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
context "with invalid URL" do
|
43
|
+
before do
|
44
|
+
# I can't come up with a URL that normalize_url raises on but
|
45
|
+
# Addressable::URI doesn't, hence the stub
|
46
|
+
allow(described_class).to receive(:normalize).with(uri).and_return(nil)
|
47
|
+
end
|
48
|
+
|
49
|
+
specify do
|
50
|
+
expect { |spy| subject.call(task, &spy) }.not_to yield_control
|
51
|
+
end
|
52
|
+
|
53
|
+
specify do
|
54
|
+
expect(Wayfarer::Logging.logger).to receive(:add).with(Logger::INFO, "Failed to normalize HTTP(S) URL")
|
23
55
|
|
24
|
-
|
25
|
-
|
26
|
-
expect(task.metadata.staged_urls).to eq(SortedSet[urls.first])
|
56
|
+
subject.call(task)
|
57
|
+
end
|
27
58
|
end
|
28
59
|
end
|
29
60
|
end
|
@@ -12,10 +12,11 @@ describe Wayfarer::Middleware::Router do
|
|
12
12
|
end
|
13
13
|
|
14
14
|
before do
|
15
|
-
allow(controller.class.
|
16
|
-
.with(Addressable::URI.parse(task.url)
|
15
|
+
allow(controller.class.route).to receive(:invoke)
|
16
|
+
.with(Addressable::URI.parse(task.url))
|
17
17
|
.and_return(result)
|
18
|
-
task
|
18
|
+
task[:controller] = controller
|
19
|
+
task[:uri] = Addressable::URI.parse(task.url)
|
19
20
|
end
|
20
21
|
|
21
22
|
context "with matching route" do
|
@@ -28,20 +29,26 @@ describe Wayfarer::Middleware::Router do
|
|
28
29
|
it "assigns the action" do
|
29
30
|
expect {
|
30
31
|
subject.call(task)
|
31
|
-
}.to change { task
|
32
|
+
}.to change { task[:action] }.to(action)
|
32
33
|
end
|
33
34
|
|
34
35
|
it "merges params" do
|
35
|
-
task
|
36
|
+
task[:params] = ActiveSupport::HashWithIndifferentAccess.new("bar" => "qux")
|
36
37
|
|
37
38
|
expect {
|
38
39
|
subject.call(task)
|
39
|
-
}.to change { task
|
40
|
+
}.to change { task[:params] }.to("foo" => "bar", "bar" => "qux")
|
40
41
|
end
|
41
42
|
|
42
|
-
|
43
|
+
specify do
|
43
44
|
expect { |spy| subject.call(task, &spy) }.to yield_control
|
44
45
|
end
|
46
|
+
|
47
|
+
specify do
|
48
|
+
expect(Wayfarer::Logging.logger).to receive(:add).with(Logger::INFO, kind_of(String))
|
49
|
+
|
50
|
+
subject.call(task)
|
51
|
+
end
|
45
52
|
end
|
46
53
|
|
47
54
|
context "without matching route" do
|
@@ -52,20 +59,26 @@ describe Wayfarer::Middleware::Router do
|
|
52
59
|
it "does not assign the action" do
|
53
60
|
expect {
|
54
61
|
subject.call(task)
|
55
|
-
}.not_to(change { task
|
62
|
+
}.not_to(change { task[:action] })
|
56
63
|
end
|
57
64
|
|
58
65
|
it "does not alter params" do
|
59
|
-
task
|
66
|
+
task[:params] = ActiveSupport::HashWithIndifferentAccess.new("bar" => "qux")
|
60
67
|
|
61
68
|
expect {
|
62
69
|
subject.call(task)
|
63
|
-
}.not_to(change { task
|
70
|
+
}.not_to(change { task[:params] })
|
64
71
|
end
|
65
72
|
|
66
|
-
|
73
|
+
specify do
|
67
74
|
expect { |spy| subject.call(task, &spy) }.not_to yield_control
|
68
75
|
end
|
76
|
+
|
77
|
+
specify do
|
78
|
+
expect(Wayfarer::Logging.logger).to receive(:add).with(Logger::INFO, kind_of(String))
|
79
|
+
|
80
|
+
subject.call(task)
|
81
|
+
end
|
69
82
|
end
|
70
83
|
end
|
71
84
|
|
@@ -74,31 +87,17 @@ describe Wayfarer::Middleware::Router do
|
|
74
87
|
Struct.new(:task).include(described_class).new(task)
|
75
88
|
end
|
76
89
|
|
77
|
-
describe "::
|
78
|
-
it "returns a
|
79
|
-
expect(controller.class.
|
90
|
+
describe "::route" do
|
91
|
+
it "returns a root route" do
|
92
|
+
expect(controller.class.route).to be_a(Wayfarer::Routing::RootRoute)
|
80
93
|
end
|
81
94
|
end
|
82
95
|
|
83
96
|
describe "::route" do
|
84
97
|
it "adds a routing block" do
|
85
98
|
expect {
|
86
|
-
controller.class.route
|
87
|
-
}.to change { controller.class.
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
describe "::steer" do
|
92
|
-
it "overrides #steer" do
|
93
|
-
expect {
|
94
|
-
controller.class.steer { :foobar }
|
95
|
-
}.to change { controller.steer }.from([]).to(:foobar)
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
describe "#steer" do
|
100
|
-
it "returns [] by default" do
|
101
|
-
expect(controller.steer).to eq([])
|
99
|
+
controller.class.route.to(:index)
|
100
|
+
}.to change { controller.class.route.children.count }.by(1)
|
102
101
|
end
|
103
102
|
end
|
104
103
|
end
|
@@ -9,7 +9,7 @@ describe Wayfarer::Middleware::Stage do
|
|
9
9
|
describe "#call" do
|
10
10
|
it "assigns an empty set" do
|
11
11
|
subject.call(task)
|
12
|
-
expect(task
|
12
|
+
expect(task[:staged_urls]).to eq(SortedSet.new)
|
13
13
|
end
|
14
14
|
|
15
15
|
it "yields" do
|
@@ -22,20 +22,20 @@ describe Wayfarer::Middleware::Stage do
|
|
22
22
|
spy.tap do |job|
|
23
23
|
expect(job).to receive(:crawl).with(urls.first, batch: task.batch).ordered
|
24
24
|
expect(job).to receive(:crawl).with(urls.second, batch: task.batch).ordered
|
25
|
-
task
|
25
|
+
task[:job] = double(class: job)
|
26
26
|
end
|
27
27
|
|
28
28
|
subject.call(task) do
|
29
|
-
task
|
29
|
+
task[:staged_urls] = SortedSet.new(urls)
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
33
|
it "resets staged URLs" do
|
34
|
-
task
|
34
|
+
task[:staged_urls] = SortedSet.new([test_app_path("/foo")])
|
35
35
|
|
36
36
|
expect {
|
37
37
|
subject.call(task)
|
38
|
-
}.to change { task
|
38
|
+
}.to change { task[:staged_urls].count }.to(0)
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
@@ -45,17 +45,17 @@ describe Wayfarer::Middleware::Stage do
|
|
45
45
|
end
|
46
46
|
|
47
47
|
describe "#stage" do
|
48
|
-
before { task
|
48
|
+
before { task[:staged_urls] = SortedSet.new }
|
49
49
|
|
50
50
|
it "stages URLs" do
|
51
51
|
expect {
|
52
52
|
controller.stage(test_app_path("/foo"))
|
53
|
-
}.to change { task
|
53
|
+
}.to change { task[:staged_urls].count }.by(1)
|
54
54
|
end
|
55
55
|
|
56
56
|
it "converts to strings" do
|
57
57
|
controller.stage(Addressable::URI.parse(test_app_path("/foo")))
|
58
|
-
expect(task
|
58
|
+
expect(task[:staged_urls].to_a.first).to be_a(String)
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "spec_helpers"
|
4
|
+
|
5
|
+
describe Wayfarer::Middleware::UriParser, "#call" do
|
6
|
+
let(:task) { build(:task) }
|
7
|
+
subject { described_class.new }
|
8
|
+
|
9
|
+
it "parses URLs" do
|
10
|
+
expect { subject.call(task) }.to change { task[:uri] }.to(Addressable::URI.parse(task.url))
|
11
|
+
end
|
12
|
+
|
13
|
+
it "normalizes URLs" do
|
14
|
+
task = build(:task, url: "http://example.com")
|
15
|
+
|
16
|
+
expect { subject.call(task) }.to change { task[:uri].to_s }.to("http://example.com/")
|
17
|
+
end
|
18
|
+
|
19
|
+
specify do
|
20
|
+
expect { |spy| subject.call(task, &spy) }.to yield_control
|
21
|
+
end
|
22
|
+
|
23
|
+
context "with invalid URL" do
|
24
|
+
let(:task) { build(:task, url: "ht%0atp://localhost/") }
|
25
|
+
|
26
|
+
specify do
|
27
|
+
expect { |spy| subject.call(task, &spy) }.not_to yield_control
|
28
|
+
end
|
29
|
+
|
30
|
+
specify do
|
31
|
+
expect(Wayfarer::Logging.logger)
|
32
|
+
.to receive(:add).with(Logger::INFO, "Not processing invalid URL (Invalid scheme format: 'ht%0atp')")
|
33
|
+
|
34
|
+
subject.call(task)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe described_class::API do
|
39
|
+
subject(:controller) do
|
40
|
+
Struct.new(:task).include(described_class).new(task)
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "#uri" do
|
44
|
+
let(:uri) { Addressable::URI.parse(task.url) }
|
45
|
+
|
46
|
+
before { task[:uri] = uri }
|
47
|
+
|
48
|
+
it "returns the agent" do
|
49
|
+
expect(controller.uri).to be(uri)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
require "spec_helpers"
|
4
4
|
|
5
|
-
describe Wayfarer::Middleware::
|
5
|
+
describe Wayfarer::Middleware::UserAgent do
|
6
6
|
let(:task) { build(:task) }
|
7
7
|
let(:page) { Object.new }
|
8
8
|
let(:agent) { Object.new }
|
@@ -28,17 +28,17 @@ describe Wayfarer::Middleware::Fetch do
|
|
28
28
|
allow(subject).to receive(:pool).and_return(pool)
|
29
29
|
end
|
30
30
|
|
31
|
-
task
|
32
|
-
task
|
31
|
+
task[:staged_urls] = SortedSet.new
|
32
|
+
task[:controller] = controller
|
33
33
|
end
|
34
34
|
|
35
35
|
context "with page assigned" do
|
36
|
-
before { task
|
36
|
+
before { task[:page] = page }
|
37
37
|
|
38
38
|
it "does not alter the page" do
|
39
39
|
expect {
|
40
40
|
subject.call(task)
|
41
|
-
}.not_to(change { task
|
41
|
+
}.not_to(change { task[:page] })
|
42
42
|
end
|
43
43
|
|
44
44
|
it "yields" do
|
@@ -63,7 +63,7 @@ describe Wayfarer::Middleware::Fetch do
|
|
63
63
|
it "stages the redirect URL" do
|
64
64
|
expect {
|
65
65
|
subject.call(task)
|
66
|
-
}.to change { task
|
66
|
+
}.to change { task[:staged_urls].count }.by(1)
|
67
67
|
end
|
68
68
|
|
69
69
|
it "does not yield" do
|
@@ -75,13 +75,13 @@ describe Wayfarer::Middleware::Fetch do
|
|
75
75
|
it "assigns the context" do
|
76
76
|
expect {
|
77
77
|
subject.call(task)
|
78
|
-
}.to change { task
|
78
|
+
}.to change { task[:context] }.to(context)
|
79
79
|
end
|
80
80
|
|
81
81
|
it "assigns the page" do
|
82
82
|
expect {
|
83
83
|
subject.call(task)
|
84
|
-
}.to change { task
|
84
|
+
}.to change { task[:page] }.to(result.page)
|
85
85
|
end
|
86
86
|
|
87
87
|
it "yields" do
|
@@ -95,31 +95,23 @@ describe Wayfarer::Middleware::Fetch do
|
|
95
95
|
Struct.new(:task).include(described_class).new(task)
|
96
96
|
end
|
97
97
|
|
98
|
-
describe "#
|
99
|
-
before { task
|
98
|
+
describe "#user_agent" do
|
99
|
+
before { task[:context] = context }
|
100
100
|
|
101
101
|
it "returns the agent" do
|
102
|
-
expect(controller.
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
describe "#context" do
|
107
|
-
before { task.metadata.context = context }
|
108
|
-
|
109
|
-
it "returns the context" do
|
110
|
-
expect(controller.context).to be(task.metadata.context)
|
102
|
+
expect(controller.user_agent).to be(context.instance)
|
111
103
|
end
|
112
104
|
end
|
113
105
|
|
114
106
|
describe "#page" do
|
115
|
-
before { task
|
107
|
+
before { task[:page] = page }
|
116
108
|
|
117
109
|
it "returns the page" do
|
118
|
-
expect(controller.page).to be(task
|
110
|
+
expect(controller.page).to be(task[:page])
|
119
111
|
end
|
120
112
|
|
121
113
|
context "with live keyword" do
|
122
|
-
before { task
|
114
|
+
before { task[:context] = context }
|
123
115
|
|
124
116
|
context "with stateful agent" do
|
125
117
|
before do
|
@@ -130,7 +122,7 @@ describe Wayfarer::Middleware::Fetch do
|
|
130
122
|
it "replaces the page" do
|
131
123
|
expect {
|
132
124
|
controller.page(live: true)
|
133
|
-
}.to change { task
|
125
|
+
}.to change { task[:page] }.to(result.page)
|
134
126
|
end
|
135
127
|
end
|
136
128
|
|
@@ -140,15 +132,24 @@ describe Wayfarer::Middleware::Fetch do
|
|
140
132
|
it "does not alter the page" do
|
141
133
|
expect {
|
142
134
|
controller.page(live: true)
|
143
|
-
}.not_to(change { task
|
135
|
+
}.not_to(change { task[:page] })
|
144
136
|
end
|
145
137
|
end
|
146
138
|
end
|
147
139
|
end
|
148
140
|
|
149
|
-
describe "#
|
150
|
-
|
151
|
-
|
141
|
+
describe "#fetch" do
|
142
|
+
let(:url) { test_app_path("/redirect?times=3") }
|
143
|
+
subject { controller.fetch(url) }
|
144
|
+
|
145
|
+
it { is_expected.to be_a(Wayfarer::Page) }
|
146
|
+
|
147
|
+
context "with reries exhausted" do
|
148
|
+
let(:url) { test_app_path("/redirect?times=4") }
|
149
|
+
|
150
|
+
specify do
|
151
|
+
expect { subject }.to raise_error(Wayfarer::Networking::Follow::RedirectsExhaustedError)
|
152
|
+
end
|
152
153
|
end
|
153
154
|
end
|
154
155
|
end
|
@@ -45,6 +45,23 @@ describe Wayfarer::Networking::Context do
|
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
|
+
context "with configured renewing exception raised" do
|
49
|
+
let(:other_error) { Class.new(StandardError) }
|
50
|
+
|
51
|
+
before do
|
52
|
+
Wayfarer.config[:network][:renew_on] = [other_error]
|
53
|
+
allow(strategy).to receive(:fetch).and_raise(other_error)
|
54
|
+
end
|
55
|
+
|
56
|
+
it "renews and reraises" do
|
57
|
+
expect(context).to receive(:renew)
|
58
|
+
|
59
|
+
expect {
|
60
|
+
context.fetch(url)
|
61
|
+
}.to raise_error(other_error)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
48
65
|
context "with non-renewing exception raised" do
|
49
66
|
before do
|
50
67
|
allow(strategy).to receive(:fetch).and_raise(StandardError)
|
@@ -16,7 +16,7 @@ describe Wayfarer::Networking::Follow do
|
|
16
16
|
before { allow(inner).to receive(:fetch).and_return(success) }
|
17
17
|
|
18
18
|
it "returns the page" do
|
19
|
-
expect(outer.fetch(url)).to be(page)
|
19
|
+
expect(outer.fetch(url, follow: 3)).to be(page)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
@@ -34,7 +34,7 @@ describe Wayfarer::Networking::Follow do
|
|
34
34
|
|
35
35
|
expect {
|
36
36
|
outer.fetch(url, follow: 0)
|
37
|
-
}.to raise_error(
|
37
|
+
}.to raise_error(described_class::RedirectsExhaustedError)
|
38
38
|
end
|
39
39
|
end
|
40
40
|
end
|