wayfarer 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +14 -10
- data/docs/cookbook/batch_routing.md +22 -0
- data/docs/cookbook/consent_screen.md +36 -0
- data/docs/cookbook/executing_javascript.md +41 -0
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/browser_automation/capybara.md +6 -3
- data/docs/guides/browser_automation/ferrum.md +3 -1
- data/docs/guides/browser_automation/selenium.md +4 -2
- data/docs/guides/callbacks.md +5 -5
- data/docs/guides/debugging.md +17 -0
- data/docs/guides/error_handling.md +22 -26
- data/docs/guides/jobs.md +44 -18
- data/docs/guides/navigation.md +73 -0
- data/docs/guides/pages.md +4 -4
- data/docs/guides/performance.md +108 -0
- data/docs/guides/reliability.md +41 -0
- data/docs/guides/routing/steering.md +30 -0
- data/docs/guides/tasks.md +9 -33
- data/docs/reference/api/base.md +13 -127
- data/docs/reference/api/route.md +1 -1
- data/docs/reference/cli.md +0 -78
- data/docs/reference/configuration_keys.md +1 -1
- data/lib/wayfarer/cli/job.rb +1 -3
- data/lib/wayfarer/cli/route.rb +4 -2
- data/lib/wayfarer/cli/templates/job.rb.tt +3 -1
- data/lib/wayfarer/config/networking.rb +1 -1
- data/lib/wayfarer/config/struct.rb +1 -1
- data/lib/wayfarer/middleware/fetch.rb +15 -4
- data/lib/wayfarer/middleware/router.rb +34 -2
- data/lib/wayfarer/middleware/worker.rb +4 -24
- data/lib/wayfarer/networking/pool.rb +9 -8
- data/lib/wayfarer/page.rb +1 -1
- data/lib/wayfarer/routing/matchers/custom.rb +2 -0
- data/lib/wayfarer/routing/matchers/path.rb +1 -0
- data/lib/wayfarer/routing/route.rb +6 -0
- data/lib/wayfarer/routing/router.rb +27 -0
- data/lib/wayfarer/stringify.rb +13 -7
- data/lib/wayfarer.rb +3 -1
- data/spec/callbacks_spec.rb +2 -2
- data/spec/config/networking_spec.rb +2 -2
- data/spec/factories/{queue/middleware.rb → middleware.rb} +3 -3
- data/spec/factories/{queue/page.rb → page.rb} +3 -3
- data/spec/factories/{queue/task.rb → task.rb} +0 -0
- data/spec/fixtures/dummy_job.rb +1 -1
- data/spec/middleware/chain_spec.rb +17 -17
- data/spec/middleware/fetch_spec.rb +27 -11
- data/spec/middleware/router_spec.rb +34 -7
- data/spec/middleware/worker_spec.rb +3 -13
- data/spec/routing/router_spec.rb +24 -0
- data/wayfarer.gemspec +1 -1
- metadata +16 -8
- data/spec/factories/queue/chain.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04baaa6967fc9de4970e4d3a14cb8bb2d7458c70bb6529189ef3823d7792aa18
|
4
|
+
data.tar.gz: '058de8aa89a46c88fb460a0d39e542c43e4b0a9f23faa9b672367fb6a9b12820'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ba5feb1b4116f53a53166a999953b791aecc1356dbf4e3db5170f16f42703e708176a33a8a05553698a5cc6e011e4bc94521c163ff67e7d3d2dfd6c29e6a14f3
|
7
|
+
data.tar.gz: d0f0dddf9b091820b59476ecae9c048169fe867f5559c077ec306d74abc6540ea01d1723dd722cfeded64d206f67c9948eaef2e6a29b38b729243ee4aa046836
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
wayfarer (0.4.
|
4
|
+
wayfarer (0.4.2)
|
5
5
|
activejob (~> 6.0)
|
6
6
|
addressable (~> 2.8)
|
7
7
|
capybara (~> 3.0)
|
@@ -59,16 +59,17 @@ GEM
|
|
59
59
|
activesupport (>= 5.0.0)
|
60
60
|
faker (1.9.6)
|
61
61
|
i18n (>= 0.7)
|
62
|
-
faraday (1.
|
62
|
+
faraday (1.9.3)
|
63
63
|
faraday-em_http (~> 1.0)
|
64
64
|
faraday-em_synchrony (~> 1.0)
|
65
65
|
faraday-excon (~> 1.1)
|
66
|
-
faraday-httpclient (~> 1.0
|
66
|
+
faraday-httpclient (~> 1.0)
|
67
|
+
faraday-multipart (~> 1.0)
|
67
68
|
faraday-net_http (~> 1.0)
|
68
|
-
faraday-net_http_persistent (~> 1.
|
69
|
+
faraday-net_http_persistent (~> 1.0)
|
69
70
|
faraday-patron (~> 1.0)
|
70
71
|
faraday-rack (~> 1.0)
|
71
|
-
|
72
|
+
faraday-retry (~> 1.0)
|
72
73
|
ruby2_keywords (>= 0.0.4)
|
73
74
|
faraday-cookie_jar (0.0.7)
|
74
75
|
faraday (>= 0.8.0)
|
@@ -81,19 +82,22 @@ GEM
|
|
81
82
|
faraday-http-cache (2.2.0)
|
82
83
|
faraday (>= 0.8)
|
83
84
|
faraday-httpclient (1.0.1)
|
85
|
+
faraday-multipart (1.0.3)
|
86
|
+
multipart-post (>= 1.2, < 3)
|
84
87
|
faraday-net_http (1.0.1)
|
85
88
|
faraday-net_http_persistent (1.2.0)
|
86
89
|
faraday-patron (1.0.0)
|
87
90
|
faraday-rack (1.0.0)
|
91
|
+
faraday-retry (1.0.3)
|
88
92
|
faraday_middleware (1.2.0)
|
89
93
|
faraday (~> 1.0)
|
90
|
-
fastimage (2.2.
|
94
|
+
fastimage (2.2.6)
|
91
95
|
ferrum (0.11)
|
92
96
|
addressable (~> 2.5)
|
93
97
|
cliver (~> 0.3)
|
94
98
|
concurrent-ruby (~> 1.1)
|
95
99
|
websocket-driver (>= 0.6, < 0.8)
|
96
|
-
globalid (0.
|
100
|
+
globalid (1.0.0)
|
97
101
|
activesupport (>= 5.0)
|
98
102
|
http-cookie (1.0.4)
|
99
103
|
domain_name (~> 0.5)
|
@@ -111,9 +115,9 @@ GEM
|
|
111
115
|
nesty (~> 1.0)
|
112
116
|
nokogiri (~> 1.11)
|
113
117
|
method_source (1.0.0)
|
114
|
-
mime-types (3.
|
118
|
+
mime-types (3.4.1)
|
115
119
|
mime-types-data (~> 3.2015)
|
116
|
-
mime-types-data (3.
|
120
|
+
mime-types-data (3.2022.0105)
|
117
121
|
mini_mime (1.1.2)
|
118
122
|
mini_portile2 (2.6.1)
|
119
123
|
minitest (5.14.4)
|
@@ -182,7 +186,7 @@ GEM
|
|
182
186
|
rack (~> 1.5)
|
183
187
|
rack-protection (~> 1.4)
|
184
188
|
tilt (>= 1.3, < 3)
|
185
|
-
thor (1.1
|
189
|
+
thor (1.2.1)
|
186
190
|
tilt (2.0.10)
|
187
191
|
tzinfo (2.0.4)
|
188
192
|
concurrent-ruby (~> 1.0)
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Batch routing
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
# Create a record in an external database and store the hostname
|
5
|
+
record = Database::Row.create(hostname: "example.com")
|
6
|
+
|
7
|
+
class DummyJob < Wayfarer::Base
|
8
|
+
route do |hostname|
|
9
|
+
host hostname, to: :index
|
10
|
+
end
|
11
|
+
|
12
|
+
steer do |task|
|
13
|
+
# Pass the external record's hostname to the router
|
14
|
+
[Database::Row.find(task.batch).hostname]
|
15
|
+
end
|
16
|
+
|
17
|
+
# ...
|
18
|
+
end
|
19
|
+
|
20
|
+
# Enqueue the task and use the database record's key as batch
|
21
|
+
DummyJob.crawl_later("https://example.com", batch: record.id)
|
22
|
+
```
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Consent Screens
|
2
|
+
|
3
|
+
Some websites have nag-screens that make visitors wait for a button to show up.
|
4
|
+
Here is an example with Ferrum where the opt-in button is contained in an
|
5
|
+
iframe, clicked, and makes the live page behind the screen accessible to
|
6
|
+
`#index`:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
Wayfarer.config.network.agent = :ferrum
|
10
|
+
|
11
|
+
class DummyJob < Wayfarer::Base
|
12
|
+
route { to :index, host: "example.com" }
|
13
|
+
|
14
|
+
before_action if: :consent_required? do
|
15
|
+
sleep(5) # If the consent form has a loading animation
|
16
|
+
consent_button&.click
|
17
|
+
sleep(5) # Wait for browser to get redirected behind nag-screen
|
18
|
+
end
|
19
|
+
|
20
|
+
def index
|
21
|
+
# Nag-screen passed
|
22
|
+
stage page(live: true).meta.links.internal
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def consent_button
|
28
|
+
browser.frames.third.css("button#consent")&.first
|
29
|
+
end
|
30
|
+
|
31
|
+
def consent_required?
|
32
|
+
browser.css(".consent_screen").any?
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
```
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Executing JavaScript
|
2
|
+
|
3
|
+
Executing JavaScript requires automating a browser.
|
4
|
+
|
5
|
+
=== "Ferrum"
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
class DummyJob < Wayfarer::Base
|
9
|
+
route { to :index }
|
10
|
+
|
11
|
+
def index
|
12
|
+
browser.evaluate("[window.scrollX, window.scrollY]")
|
13
|
+
end
|
14
|
+
end
|
15
|
+
```
|
16
|
+
|
17
|
+
=== "Selenium"
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class DummyJob < Wayfarer::Base
|
21
|
+
route { to :index }
|
22
|
+
|
23
|
+
def index
|
24
|
+
# Mind the explicit return
|
25
|
+
browser.execute_script("return [window.scrollX, window.scrollY]")
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
29
|
+
|
30
|
+
=== "Capybara"
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
class DummyJob < Wayfarer::Base
|
34
|
+
route { to :index }
|
35
|
+
|
36
|
+
def index
|
37
|
+
# Capybara does not return the value of JavaScript execution
|
38
|
+
browser.execute_script("console.log('Foobar')") # => nil
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route { to :index }
|
10
10
|
|
11
11
|
def index
|
12
12
|
page.doc.css("html")
|
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
class DummyJob < Wayfarer::Base
|
22
|
-
route
|
22
|
+
route { to :index }
|
23
23
|
|
24
24
|
def index
|
25
25
|
browser.at_css("html")
|
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route { to :index }
|
36
36
|
|
37
37
|
def index
|
38
38
|
browser.find_elements(css: "html")
|
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route { to :index }
|
10
10
|
|
11
11
|
def index
|
12
12
|
browser.screenshot(path: "screenshot.png")
|
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route { to :index }
|
22
22
|
|
23
23
|
def index
|
24
24
|
browser.save_screenshot("screenshot.png")
|
@@ -7,8 +7,11 @@ When Capybara is in use, a remote browser process is available as a Capybara
|
|
7
7
|
session:
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
|
11
|
-
|
10
|
+
Wayfarer.config.network.agent = :capybara
|
11
|
+
# Wayfarer.config.capybara.driver = ...
|
12
|
+
|
13
|
+
class DummyJob < Wayfarer::Worker
|
14
|
+
route { to :index }
|
12
15
|
|
13
16
|
def index
|
14
17
|
browser # => #<Capybara::Session ...>
|
@@ -61,6 +64,6 @@ end
|
|
61
64
|
|
62
65
|
Capybara.register_driver(:cuprite) do |app|
|
63
66
|
# Wayfarer's Ferrum or Selenium options must be passed along manually
|
64
|
-
Capybara::Cuprite::Driver.new(app,
|
67
|
+
Capybara::Cuprite::Driver.new(app, Wayfarer.config.ferrum.options)
|
65
68
|
end
|
66
69
|
```
|
@@ -11,8 +11,10 @@ When Ferrum is in use, a Google Chrome process is accessible within jobs like
|
|
11
11
|
so:
|
12
12
|
|
13
13
|
```ruby
|
14
|
+
Wayfarer.config.network.agent = :ferrum
|
15
|
+
|
14
16
|
class DummyWorker < Wayfarer::Worker
|
15
|
-
route
|
17
|
+
route { to :index }
|
16
18
|
|
17
19
|
def index
|
18
20
|
browser # => #<Ferrum::Browser ...>
|
@@ -7,8 +7,10 @@ When Selenium is in use, a remote browser process is accessible within jobs like
|
|
7
7
|
so:
|
8
8
|
|
9
9
|
```ruby
|
10
|
+
Wayfarer.config.network.agent = :selenium
|
11
|
+
|
10
12
|
class DummyWorker < Wayfarer::Worker
|
11
|
-
route
|
13
|
+
route { to :index }
|
12
14
|
|
13
15
|
def index
|
14
16
|
browser # => #<Selenium::WebDriver ...>
|
@@ -28,7 +30,7 @@ process.
|
|
28
30
|
Wayfarer.config.network.agent = :selenium
|
29
31
|
|
30
32
|
class DummyJob < Wayfarer::Base
|
31
|
-
route
|
33
|
+
route { to :index }
|
32
34
|
|
33
35
|
def index
|
34
36
|
page.headers # => always {}
|
data/docs/guides/callbacks.md
CHANGED
@@ -52,16 +52,16 @@ end
|
|
52
52
|
Internally, a batch counter is in-/decremented on certain events. Once the
|
53
53
|
counter reaches zero, `after_batch` callbacks run in declaration order.
|
54
54
|
|
55
|
-
The counter is incremented when:
|
55
|
+
The counter is incremented when within the batch:
|
56
56
|
|
57
|
-
* A job is enqueued
|
57
|
+
* A job is enqueued.
|
58
58
|
|
59
59
|
The counter is decremented when:
|
60
60
|
|
61
61
|
* A job succeeds.
|
62
|
-
* A job
|
63
|
-
* A job
|
64
|
-
* A job
|
62
|
+
* A job errors due to an unhandled exception.
|
63
|
+
* A job is discarded due to an exception.
|
64
|
+
* A job errors and thereby exhausts its maximum attempts.
|
65
65
|
|
66
66
|
!!! attention "Batch callbacks can fail jobs"
|
67
67
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Debugging
|
2
|
+
|
3
|
+
[Wayfarer's CLI](/reference/cli/) has two sub-commands that come in handy when
|
4
|
+
diagnosing problems in the development workflow.
|
5
|
+
|
6
|
+
## Routing a URL from the shell
|
7
|
+
|
8
|
+
## `wayfarer route`
|
9
|
+
|
10
|
+
### `wayfarer route result JOB URL`
|
11
|
+
|
12
|
+
: Prints the result of invoking `JOB`'s router with `URL`.
|
13
|
+
|
14
|
+
### `wayfarer route tree JOB URL`
|
15
|
+
|
16
|
+
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
17
|
+
|
@@ -1,35 +1,31 @@
|
|
1
1
|
# Error handling
|
2
2
|
|
3
|
-
Wayfarer
|
4
|
-
`discard_on`:
|
3
|
+
## Wayfarer never swallows exceptions
|
5
4
|
|
6
|
-
*
|
7
|
-
*
|
5
|
+
* Wayfarer never swallows exceptions.
|
6
|
+
* Jobs with unhandled exceptions are not retried.
|
8
7
|
|
9
|
-
## Retrying
|
8
|
+
## Retrying and discarding
|
10
9
|
|
11
|
-
|
12
|
-
class DummyJob < Wayfarer::Base
|
13
|
-
retry_on MyError, attempts: 3 do |job, error|
|
14
|
-
# All 3 attempts have failed (1 initial attempt + 2 retries)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
```
|
10
|
+
Wayfarer relies on [Active Job's two error handling facilities](https://guides.rubyonrails.org/active_job_basics.html#exceptions).
|
18
11
|
|
19
|
-
|
12
|
+
* `retry_on` to retry jobs a number of times on certain errors:
|
20
13
|
|
21
|
-
```ruby
|
22
|
-
class DummyJob < Wayfarer::Base
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
27
|
-
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
retry_on MyError, attempts: 3 do |job, error|
|
17
|
+
# This block runs once all 3 attempts have failed
|
18
|
+
# (1 initial attempt + 2 retries)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
```
|
28
22
|
|
29
|
-
|
23
|
+
* `discard_on` to throw away jobs on certain errors:
|
30
24
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
```ruby
|
26
|
+
class DummyJob < Wayfarer::Base
|
27
|
+
discard_on MyError do |job, error|
|
28
|
+
# This block runs once and buries the job
|
29
|
+
end
|
30
|
+
end
|
31
|
+
```
|
data/docs/guides/jobs.md
CHANGED
@@ -1,16 +1,36 @@
|
|
1
1
|
# Jobs
|
2
2
|
|
3
|
-
Jobs are Ruby classes that look as follows:
|
3
|
+
Jobs are Ruby classes that process [tasks](/guides/tasks) and look as follows:
|
4
4
|
|
5
5
|
```ruby
|
6
6
|
class DummyJob < Wayfarer::Base
|
7
|
-
route
|
7
|
+
route { to :index }
|
8
8
|
|
9
9
|
def index
|
10
10
|
end
|
11
11
|
end
|
12
12
|
```
|
13
13
|
|
14
|
+
Here is how to enqueue a task for a URL:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
DummyJob.crawl_later("https://example.com")
|
18
|
+
```
|
19
|
+
|
20
|
+
This is the same as calling the Active Job API directly and passing a task
|
21
|
+
and a random batch:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
|
25
|
+
DummyJob.perform_later(task)
|
26
|
+
```
|
27
|
+
|
28
|
+
A batch can be specified with `::crawl_later`, too:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
DummyJob.crawl_later("https://example.com", batch: "my-batch")
|
32
|
+
```
|
33
|
+
|
14
34
|
## Current task
|
15
35
|
|
16
36
|
Jobs consume [tasks](../tasks) from a message queue. The currently processed
|
@@ -18,58 +38,64 @@ task is accessible like so:
|
|
18
38
|
|
19
39
|
```ruby
|
20
40
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
41
|
+
route { to :index }
|
22
42
|
|
23
43
|
def index
|
24
44
|
task.url # => "https://example.com"
|
25
|
-
task.batch # => "
|
45
|
+
task.batch # => "my-batch"
|
26
46
|
end
|
27
47
|
end
|
28
48
|
```
|
29
49
|
|
30
50
|
## Current page
|
31
51
|
|
32
|
-
|
33
|
-
|
52
|
+
A task's URL contents get fetched into a [page](../pages) object if the task URL
|
53
|
+
matched a route:
|
34
54
|
|
35
55
|
```ruby
|
36
56
|
class DummyJob < Wayfarer::Base
|
37
|
-
route
|
57
|
+
route { to :index, host: "example.com" }
|
38
58
|
|
39
59
|
def index
|
40
|
-
page.url
|
41
|
-
page.body
|
60
|
+
page.url # => "https://example.com"
|
61
|
+
page.body # => "<html>..."
|
62
|
+
page.status_code # => 200
|
63
|
+
page.headers # => { "Content-Type" => ... }
|
42
64
|
end
|
43
65
|
end
|
44
66
|
```
|
45
67
|
|
46
68
|
## URL parameters
|
47
69
|
|
48
|
-
|
70
|
+
Jobs can extract data from URLs with their router:
|
49
71
|
|
50
72
|
```ruby
|
51
73
|
class DummyJob < Wayfarer::Base
|
52
|
-
route
|
74
|
+
route do
|
75
|
+
path "/users/:id/profile"
|
76
|
+
end
|
53
77
|
|
54
78
|
def index
|
55
|
-
|
56
|
-
page.body # => "<html>..."
|
79
|
+
params[:id] # => "42"
|
57
80
|
end
|
58
81
|
end
|
82
|
+
|
83
|
+
DummyJob.crawl_later("https://example.com/users/42/profile")
|
59
84
|
```
|
60
85
|
|
61
86
|
|
62
|
-
##
|
87
|
+
## User agent
|
63
88
|
|
64
|
-
|
65
|
-
accessible like so:
|
89
|
+
The HTTP client or automated browser that fetched the URL is available:
|
66
90
|
|
67
91
|
```ruby
|
92
|
+
Wayfarer.config.network.agent = :ferrum # Chrome DevTools Protocol
|
93
|
+
|
68
94
|
class DummyJob < Wayfarer::Base
|
69
|
-
route
|
95
|
+
route { to :index }
|
70
96
|
|
71
97
|
def index
|
72
|
-
browser
|
98
|
+
browser.save_screenshot("capture.png")
|
73
99
|
end
|
74
100
|
end
|
75
101
|
```
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Navigation
|
2
|
+
|
3
|
+
Wayfarer has two mechanisms for navigating crawls:
|
4
|
+
|
5
|
+
* Jobs have a router that decides if a task's URL gets fetched and processed.
|
6
|
+
* Jobs can add URLs to a processing set with `#stage`.
|
7
|
+
|
8
|
+
## Staging URLs
|
9
|
+
|
10
|
+
Jobs can turn URLs into tasks within their own batch with `#stage`. Staging a
|
11
|
+
URL does not enqueue it immediately. Instead, the URL is added to a processing
|
12
|
+
set first.
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
route { to :index }
|
17
|
+
|
18
|
+
def index
|
19
|
+
stage page.meta.links.all
|
20
|
+
end
|
21
|
+
end
|
22
|
+
```
|
23
|
+
|
24
|
+
Once the `index` action method returns, all URLs in `page.meta.links.all`
|
25
|
+
are (1) normalized to a canonical form and (2) checked for inclusion in
|
26
|
+
the batch's processed URL Redis set. All unprocessed URLs are enqueued as
|
27
|
+
tasks within the same batch.
|
28
|
+
|
29
|
+
`#stage` can be called arbitrarily often, with invalid URLs too, as they are
|
30
|
+
filtered out behind the scenes:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
def index
|
34
|
+
stage "_bro:ken@url/" # => ["_bro:ken@url/"]
|
35
|
+
end
|
36
|
+
```
|
37
|
+
|
38
|
+
See also: [Performance: Stage less URLs](/guides/performance)
|
39
|
+
|
40
|
+
!!! attention "Failing action methods do not enqueue tasks"
|
41
|
+
|
42
|
+
If an action method fails as in:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
def index
|
46
|
+
stage page.meta.links.all
|
47
|
+
fail "Error occured"
|
48
|
+
end
|
49
|
+
```
|
50
|
+
|
51
|
+
None of the staged URLs are enqueued as tasks. Jobs that raise an exception
|
52
|
+
should get retried, or the exception should be handled.
|
53
|
+
|
54
|
+
|
55
|
+
## Routing URLs
|
56
|
+
|
57
|
+
In the following example, the task is written to the message queue, but the
|
58
|
+
job's routes do not match the URL. When the task gets consumed, the URL does not
|
59
|
+
get fetched and the action method is not called.
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
class DummyJob < Wayfarer::Base
|
63
|
+
route do
|
64
|
+
host "example.com", path: "/users/:user_id", to: :user
|
65
|
+
end
|
66
|
+
|
67
|
+
# ...
|
68
|
+
end
|
69
|
+
|
70
|
+
DummyJob.crawl_later("https://mismatching.host/users/42")
|
71
|
+
```
|
72
|
+
|
73
|
+
|
data/docs/guides/pages.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Pages
|
2
2
|
|
3
|
-
Retrieved pages
|
4
|
-
|
3
|
+
Retrieved pages take the shape of `Wayfarer::Page` objects and are available
|
4
|
+
to jobs:
|
5
5
|
|
6
6
|
```ruby
|
7
7
|
class DummyJob < Wayfarer::Worker
|
8
|
-
route
|
8
|
+
route { to :index }
|
9
9
|
|
10
10
|
def index
|
11
11
|
page # => #<Wayfarer::Page ...>
|
@@ -35,7 +35,7 @@ To access a page reflecting the current browser state, pass the `live` keyword:
|
|
35
35
|
|
36
36
|
```ruby
|
37
37
|
class DummyJob < Wayfarer::Worker
|
38
|
-
route
|
38
|
+
route { to :index }
|
39
39
|
|
40
40
|
def index
|
41
41
|
page # => #<Wayfarer::Page ...>
|