wayfarer 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +14 -10
- data/docs/cookbook/batch_routing.md +22 -0
- data/docs/cookbook/consent_screen.md +36 -0
- data/docs/cookbook/executing_javascript.md +41 -0
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/browser_automation/capybara.md +6 -3
- data/docs/guides/browser_automation/ferrum.md +3 -1
- data/docs/guides/browser_automation/selenium.md +4 -2
- data/docs/guides/callbacks.md +5 -5
- data/docs/guides/debugging.md +17 -0
- data/docs/guides/error_handling.md +22 -26
- data/docs/guides/jobs.md +44 -18
- data/docs/guides/navigation.md +73 -0
- data/docs/guides/pages.md +4 -4
- data/docs/guides/performance.md +108 -0
- data/docs/guides/reliability.md +41 -0
- data/docs/guides/routing/steering.md +30 -0
- data/docs/guides/tasks.md +9 -33
- data/docs/reference/api/base.md +13 -127
- data/docs/reference/api/route.md +1 -1
- data/docs/reference/cli.md +0 -78
- data/docs/reference/configuration_keys.md +1 -1
- data/lib/wayfarer/cli/job.rb +1 -3
- data/lib/wayfarer/cli/route.rb +4 -2
- data/lib/wayfarer/cli/templates/job.rb.tt +3 -1
- data/lib/wayfarer/config/networking.rb +1 -1
- data/lib/wayfarer/config/struct.rb +1 -1
- data/lib/wayfarer/middleware/fetch.rb +15 -4
- data/lib/wayfarer/middleware/router.rb +34 -2
- data/lib/wayfarer/middleware/worker.rb +4 -24
- data/lib/wayfarer/networking/pool.rb +9 -8
- data/lib/wayfarer/page.rb +1 -1
- data/lib/wayfarer/routing/matchers/custom.rb +2 -0
- data/lib/wayfarer/routing/matchers/path.rb +1 -0
- data/lib/wayfarer/routing/route.rb +6 -0
- data/lib/wayfarer/routing/router.rb +27 -0
- data/lib/wayfarer/stringify.rb +13 -7
- data/lib/wayfarer.rb +3 -1
- data/spec/callbacks_spec.rb +2 -2
- data/spec/config/networking_spec.rb +2 -2
- data/spec/factories/{queue/middleware.rb → middleware.rb} +3 -3
- data/spec/factories/{queue/page.rb → page.rb} +3 -3
- data/spec/factories/{queue/task.rb → task.rb} +0 -0
- data/spec/fixtures/dummy_job.rb +1 -1
- data/spec/middleware/chain_spec.rb +17 -17
- data/spec/middleware/fetch_spec.rb +27 -11
- data/spec/middleware/router_spec.rb +34 -7
- data/spec/middleware/worker_spec.rb +3 -13
- data/spec/routing/router_spec.rb +24 -0
- data/wayfarer.gemspec +1 -1
- metadata +16 -8
- data/spec/factories/queue/chain.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04baaa6967fc9de4970e4d3a14cb8bb2d7458c70bb6529189ef3823d7792aa18
|
4
|
+
data.tar.gz: '058de8aa89a46c88fb460a0d39e542c43e4b0a9f23faa9b672367fb6a9b12820'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ba5feb1b4116f53a53166a999953b791aecc1356dbf4e3db5170f16f42703e708176a33a8a05553698a5cc6e011e4bc94521c163ff67e7d3d2dfd6c29e6a14f3
|
7
|
+
data.tar.gz: d0f0dddf9b091820b59476ecae9c048169fe867f5559c077ec306d74abc6540ea01d1723dd722cfeded64d206f67c9948eaef2e6a29b38b729243ee4aa046836
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
wayfarer (0.4.1)
|
4
|
+
wayfarer (0.4.2)
|
5
5
|
activejob (~> 6.0)
|
6
6
|
addressable (~> 2.8)
|
7
7
|
capybara (~> 3.0)
|
@@ -59,16 +59,17 @@ GEM
|
|
59
59
|
activesupport (>= 5.0.0)
|
60
60
|
faker (1.9.6)
|
61
61
|
i18n (>= 0.7)
|
62
|
-
faraday (1.
|
62
|
+
faraday (1.9.3)
|
63
63
|
faraday-em_http (~> 1.0)
|
64
64
|
faraday-em_synchrony (~> 1.0)
|
65
65
|
faraday-excon (~> 1.1)
|
66
|
-
faraday-httpclient (~> 1.0
|
66
|
+
faraday-httpclient (~> 1.0)
|
67
|
+
faraday-multipart (~> 1.0)
|
67
68
|
faraday-net_http (~> 1.0)
|
68
|
-
faraday-net_http_persistent (~> 1.
|
69
|
+
faraday-net_http_persistent (~> 1.0)
|
69
70
|
faraday-patron (~> 1.0)
|
70
71
|
faraday-rack (~> 1.0)
|
71
|
-
|
72
|
+
faraday-retry (~> 1.0)
|
72
73
|
ruby2_keywords (>= 0.0.4)
|
73
74
|
faraday-cookie_jar (0.0.7)
|
74
75
|
faraday (>= 0.8.0)
|
@@ -81,19 +82,22 @@ GEM
|
|
81
82
|
faraday-http-cache (2.2.0)
|
82
83
|
faraday (>= 0.8)
|
83
84
|
faraday-httpclient (1.0.1)
|
85
|
+
faraday-multipart (1.0.3)
|
86
|
+
multipart-post (>= 1.2, < 3)
|
84
87
|
faraday-net_http (1.0.1)
|
85
88
|
faraday-net_http_persistent (1.2.0)
|
86
89
|
faraday-patron (1.0.0)
|
87
90
|
faraday-rack (1.0.0)
|
91
|
+
faraday-retry (1.0.3)
|
88
92
|
faraday_middleware (1.2.0)
|
89
93
|
faraday (~> 1.0)
|
90
|
-
fastimage (2.2.
|
94
|
+
fastimage (2.2.6)
|
91
95
|
ferrum (0.11)
|
92
96
|
addressable (~> 2.5)
|
93
97
|
cliver (~> 0.3)
|
94
98
|
concurrent-ruby (~> 1.1)
|
95
99
|
websocket-driver (>= 0.6, < 0.8)
|
96
|
-
globalid (0.
|
100
|
+
globalid (1.0.0)
|
97
101
|
activesupport (>= 5.0)
|
98
102
|
http-cookie (1.0.4)
|
99
103
|
domain_name (~> 0.5)
|
@@ -111,9 +115,9 @@ GEM
|
|
111
115
|
nesty (~> 1.0)
|
112
116
|
nokogiri (~> 1.11)
|
113
117
|
method_source (1.0.0)
|
114
|
-
mime-types (3.
|
118
|
+
mime-types (3.4.1)
|
115
119
|
mime-types-data (~> 3.2015)
|
116
|
-
mime-types-data (3.
|
120
|
+
mime-types-data (3.2022.0105)
|
117
121
|
mini_mime (1.1.2)
|
118
122
|
mini_portile2 (2.6.1)
|
119
123
|
minitest (5.14.4)
|
@@ -182,7 +186,7 @@ GEM
|
|
182
186
|
rack (~> 1.5)
|
183
187
|
rack-protection (~> 1.4)
|
184
188
|
tilt (>= 1.3, < 3)
|
185
|
-
thor (1.1
|
189
|
+
thor (1.2.1)
|
186
190
|
tilt (2.0.10)
|
187
191
|
tzinfo (2.0.4)
|
188
192
|
concurrent-ruby (~> 1.0)
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Batch routing
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
# Create a record in an external database and store the hostname
|
5
|
+
record = Database::Row.create(hostname: "example.com")
|
6
|
+
|
7
|
+
class DummyJob < Wayfarer::Base
|
8
|
+
route do |hostname|
|
9
|
+
host hostname, to: :index
|
10
|
+
end
|
11
|
+
|
12
|
+
steer do |task|
|
13
|
+
# Pass the external record's hostname to the router
|
14
|
+
[Database::Row.find(task.batch).hostname]
|
15
|
+
end
|
16
|
+
|
17
|
+
# ...
|
18
|
+
end
|
19
|
+
|
20
|
+
# Enqueue the task and use the database record's key as batch
|
21
|
+
DummyJob.crawl_later("https://example.com", batch: record.id)
|
22
|
+
```
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Consent Screens
|
2
|
+
|
3
|
+
Some websites have nag-screens that make visitors wait for a button to show up.
|
4
|
+
Here is an example with Ferrum where the opt-in button is contained in an
|
5
|
+
iframe, clicked, and makes the live page behind the screen accessible to
|
6
|
+
`#index`:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
Wayfarer.config.network.agent = :ferrum
|
10
|
+
|
11
|
+
class DummyJob < Wayfarer::Base
|
12
|
+
route { to :index, host: "example.com" }
|
13
|
+
|
14
|
+
before_action if: :consent_required? do
|
15
|
+
sleep(5) # If the consent form has a loading animation
|
16
|
+
consent_button&.click
|
17
|
+
sleep(5) # Wait for browser to get redirected behind nag-screen
|
18
|
+
end
|
19
|
+
|
20
|
+
def index
|
21
|
+
# Nag-screen passed
|
22
|
+
stage page(live: true).meta.links.internal
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def consent_button
|
28
|
+
browser.frames.third.css("button#consent")&.first
|
29
|
+
end
|
30
|
+
|
31
|
+
def consent_required?
|
32
|
+
browser.css(".consent_screen").any?
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
```
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Executing JavaScript
|
2
|
+
|
3
|
+
Executing JavaScript requires automating a browser.
|
4
|
+
|
5
|
+
=== "Ferrum"
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
class DummyJob < Wayfarer::Base
|
9
|
+
route { to :index }
|
10
|
+
|
11
|
+
def index
|
12
|
+
browser.evaluate("[window.scrollX, window.scrollY]")
|
13
|
+
end
|
14
|
+
end
|
15
|
+
```
|
16
|
+
|
17
|
+
=== "Selenium"
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class DummyJob < Wayfarer::Base
|
21
|
+
route { to :index }
|
22
|
+
|
23
|
+
def index
|
24
|
+
# Mind the explicit return
|
25
|
+
browser.execute_script("return [window.scrollX, window.scrollY]")
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
29
|
+
|
30
|
+
=== "Capybara"
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
class DummyJob < Wayfarer::Base
|
34
|
+
route { to :index }
|
35
|
+
|
36
|
+
def index
|
37
|
+
# Capybara does not return the value of JavaScript execution
|
38
|
+
browser.execute_script("console.log('Foobar')") # => nil
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route { to :index }
|
10
10
|
|
11
11
|
def index
|
12
12
|
page.doc.css("html")
|
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
class DummyJob < Wayfarer::Base
|
22
|
-
route
|
22
|
+
route { to :index }
|
23
23
|
|
24
24
|
def index
|
25
25
|
browser.at_css("html")
|
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route { to :index }
|
36
36
|
|
37
37
|
def index
|
38
38
|
browser.find_elements(css: "html")
|
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route { to :index }
|
10
10
|
|
11
11
|
def index
|
12
12
|
browser.screenshot(path: "screenshot.png")
|
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route { to :index }
|
22
22
|
|
23
23
|
def index
|
24
24
|
browser.save_screenshot("screenshot.png")
|
@@ -7,8 +7,11 @@ When Capybara is in use, a remote browser process is available as a Capybara
|
|
7
7
|
session:
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
|
11
|
-
|
10
|
+
Wayfarer.config.network.agent = :capybara
|
11
|
+
# Wayfarer.config.capybara.driver = ...
|
12
|
+
|
13
|
+
class DummyJob < Wayfarer::Worker
|
14
|
+
route { to :index }
|
12
15
|
|
13
16
|
def index
|
14
17
|
browser # => #<Capybara::Session ...>
|
@@ -61,6 +64,6 @@ end
|
|
61
64
|
|
62
65
|
Capybara.register_driver(:cuprite) do |app|
|
63
66
|
# Wayfarer's Ferrum or Selenium options must be passed along manually
|
64
|
-
Capybara::Cuprite::Driver.new(app,
|
67
|
+
Capybara::Cuprite::Driver.new(app, Wayfarer.config.ferrum.options)
|
65
68
|
end
|
66
69
|
```
|
@@ -11,8 +11,10 @@ When Ferrum is in use, a Google Chrome process is accessible within jobs like
|
|
11
11
|
so:
|
12
12
|
|
13
13
|
```ruby
|
14
|
+
Wayfarer.config.network.agent = :ferrum
|
15
|
+
|
14
16
|
class DummyWorker < Wayfarer::Worker
|
15
|
-
route
|
17
|
+
route { to :index }
|
16
18
|
|
17
19
|
def index
|
18
20
|
browser # => #<Ferrum::Browser ...>
|
@@ -7,8 +7,10 @@ When Selenium is in use, a remote browser process is accessible within jobs like
|
|
7
7
|
so:
|
8
8
|
|
9
9
|
```ruby
|
10
|
+
Wayfarer.config.network.agent = :selenium
|
11
|
+
|
10
12
|
class DummyWorker < Wayfarer::Worker
|
11
|
-
route
|
13
|
+
route { to :index }
|
12
14
|
|
13
15
|
def index
|
14
16
|
browser # => #<Selenium::WebDriver ...>
|
@@ -28,7 +30,7 @@ process.
|
|
28
30
|
Wayfarer.config.network.agent = :selenium
|
29
31
|
|
30
32
|
class DummyJob < Wayfarer::Base
|
31
|
-
route
|
33
|
+
route { to :index }
|
32
34
|
|
33
35
|
def index
|
34
36
|
page.headers # => always {}
|
data/docs/guides/callbacks.md
CHANGED
@@ -52,16 +52,16 @@ end
|
|
52
52
|
Internally, a batch counter is in-/decremented on certain events. Once the
|
53
53
|
counter reaches zero, `after_batch` callbacks run in declaration order.
|
54
54
|
|
55
|
-
The counter is incremented when:
|
55
|
+
Within the batch, the counter is incremented when:
|
56
56
|
|
57
|
-
* A job is enqueued
|
57
|
+
* A job is enqueued.
|
58
58
|
|
59
59
|
The counter is decremented when:
|
60
60
|
|
61
61
|
* A job succeeds.
|
62
|
-
* A job
|
63
|
-
* A job
|
64
|
-
* A job
|
62
|
+
* A job errors due to an unhandled exception.
|
63
|
+
* A job is discarded due to an exception.
|
64
|
+
* A job errors and thereby exhausts its maximum attempts.
|
65
65
|
|
66
66
|
!!! attention "Batch callbacks can fail jobs"
|
67
67
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Debugging
|
2
|
+
|
3
|
+
[Wayfarer's CLI](/reference/cli/) has two sub-commands that come in handy when
|
4
|
+
diagnosing problems in the development workflow.
|
5
|
+
|
6
|
+
## Routing a URL from the shell
|
7
|
+
|
8
|
+
## `wayfarer route`
|
9
|
+
|
10
|
+
### `wayfarer route result JOB URL`
|
11
|
+
|
12
|
+
: Prints the result of invoking `JOB`'s router with `URL`.
|
13
|
+
|
14
|
+
### `wayfarer route tree JOB URL`
|
15
|
+
|
16
|
+
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
17
|
+
|
@@ -1,35 +1,31 @@
|
|
1
1
|
# Error handling
|
2
2
|
|
3
|
-
Wayfarer
|
4
|
-
`discard_on`:
|
3
|
+
## Wayfarer never swallows exceptions
|
5
4
|
|
6
|
-
*
|
7
|
-
*
|
5
|
+
* Wayfarer never swallows exceptions.
|
6
|
+
* Jobs with unhandled exceptions are not retried.
|
8
7
|
|
9
|
-
## Retrying
|
8
|
+
## Retrying and discarding
|
10
9
|
|
11
|
-
|
12
|
-
class DummyJob < Wayfarer::Base
|
13
|
-
retry_on MyError, attempts: 3 do |job, error|
|
14
|
-
# All 3 attempts have failed (1 initial attempt + 2 retries)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
```
|
10
|
+
Wayfarer relies on [Active Job's two error handling facilities](https://guides.rubyonrails.org/active_job_basics.html#exceptions).
|
18
11
|
|
19
|
-
|
12
|
+
* `retry_on` to retry jobs a number of times on certain errors:
|
20
13
|
|
21
|
-
```ruby
|
22
|
-
class DummyJob < Wayfarer::Base
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
27
|
-
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
retry_on MyError, attempts: 3 do |job, error|
|
17
|
+
# This block runs once all 3 attempts have failed
|
18
|
+
# (1 initial attempt + 2 retries)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
```
|
28
22
|
|
29
|
-
|
23
|
+
* `discard_on` to throw away jobs on certain errors:
|
30
24
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
```ruby
|
26
|
+
class DummyJob < Wayfarer::Base
|
27
|
+
discard_on MyError do |job, error|
|
28
|
+
# This block runs once and buries the job
|
29
|
+
end
|
30
|
+
end
|
31
|
+
```
|
data/docs/guides/jobs.md
CHANGED
@@ -1,16 +1,36 @@
|
|
1
1
|
# Jobs
|
2
2
|
|
3
|
-
Jobs are Ruby classes that look as follows:
|
3
|
+
Jobs are Ruby classes that process [tasks](/guides/tasks) and look as follows:
|
4
4
|
|
5
5
|
```ruby
|
6
6
|
class DummyJob < Wayfarer::Base
|
7
|
-
route
|
7
|
+
route { to :index }
|
8
8
|
|
9
9
|
def index
|
10
10
|
end
|
11
11
|
end
|
12
12
|
```
|
13
13
|
|
14
|
+
Here is how to enqueue a task for a URL:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
DummyJob.crawl_later("https://example.com")
|
18
|
+
```
|
19
|
+
|
20
|
+
This is the same as calling the Active Job API directly and passing a task
|
21
|
+
and a random batch:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
|
25
|
+
DummyJob.perform_later(task)
|
26
|
+
```
|
27
|
+
|
28
|
+
A batch can be specified with `::crawl_later`, too:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
DummyJob.crawl_later("https://example.com", batch: "my-batch")
|
32
|
+
```
|
33
|
+
|
14
34
|
## Current task
|
15
35
|
|
16
36
|
Jobs consume [tasks](../tasks) from a message queue. The currently processed
|
@@ -18,58 +38,64 @@ task is accessible like so:
|
|
18
38
|
|
19
39
|
```ruby
|
20
40
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
41
|
+
route { to :index }
|
22
42
|
|
23
43
|
def index
|
24
44
|
task.url # => "https://example.com"
|
25
|
-
task.batch # => "
|
45
|
+
task.batch # => "my-batch"
|
26
46
|
end
|
27
47
|
end
|
28
48
|
```
|
29
49
|
|
30
50
|
## Current page
|
31
51
|
|
32
|
-
|
33
|
-
|
52
|
+
A task's URL contents get fetched into a [page](../pages) object if the task URL
|
53
|
+
matched a route:
|
34
54
|
|
35
55
|
```ruby
|
36
56
|
class DummyJob < Wayfarer::Base
|
37
|
-
route
|
57
|
+
route { to :index, host: "example.com" }
|
38
58
|
|
39
59
|
def index
|
40
|
-
page.url
|
41
|
-
page.body
|
60
|
+
page.url # => "https://example.com"
|
61
|
+
page.body # => "<html>..."
|
62
|
+
page.status_code # => 200
|
63
|
+
page.headers # => { "Content-Type" => ... }
|
42
64
|
end
|
43
65
|
end
|
44
66
|
```
|
45
67
|
|
46
68
|
## URL parameters
|
47
69
|
|
48
|
-
|
70
|
+
Jobs can extract data from URLs with their router:
|
49
71
|
|
50
72
|
```ruby
|
51
73
|
class DummyJob < Wayfarer::Base
|
52
|
-
route
|
74
|
+
route do
|
75
|
+
path "/users/:id/profile"
|
76
|
+
end
|
53
77
|
|
54
78
|
def index
|
55
|
-
|
56
|
-
page.body # => "<html>..."
|
79
|
+
params[:id] # => "42"
|
57
80
|
end
|
58
81
|
end
|
82
|
+
|
83
|
+
DummyJob.crawl_later("https://example.com/users/42/profile")
|
59
84
|
```
|
60
85
|
|
61
86
|
|
62
|
-
##
|
87
|
+
## User agent
|
63
88
|
|
64
|
-
|
65
|
-
accessible like so:
|
89
|
+
The HTTP client or automated browser that fetched the URL is available:
|
66
90
|
|
67
91
|
```ruby
|
92
|
+
Wayfarer.config.network.agent = :ferrum # Chrome DevTools Protocol
|
93
|
+
|
68
94
|
class DummyJob < Wayfarer::Base
|
69
|
-
route
|
95
|
+
route { to :index }
|
70
96
|
|
71
97
|
def index
|
72
|
-
browser
|
98
|
+
browser.save_screenshot("capture.png")
|
73
99
|
end
|
74
100
|
end
|
75
101
|
```
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Navigation
|
2
|
+
|
3
|
+
Wayfarer has two mechanisms for navigating crawls:
|
4
|
+
|
5
|
+
* Jobs have a router that decides if a task's URL gets fetched and processed.
|
6
|
+
* Jobs can add URLs to a processing set with `#stage`.
|
7
|
+
|
8
|
+
## Staging URLs
|
9
|
+
|
10
|
+
Jobs can turn URLs into tasks within their own batch with `#stage`. Staging a
|
11
|
+
URL does not enqueue it immediately. Instead, the URL is added to a processing
|
12
|
+
set first.
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
route { to :index }
|
17
|
+
|
18
|
+
def index
|
19
|
+
stage page.meta.links.all
|
20
|
+
end
|
21
|
+
end
|
22
|
+
```
|
23
|
+
|
24
|
+
Once the `index` action method returns, all URLs in `page.meta.links.all`
|
25
|
+
are (1) normalized to a canonical form and (2) checked for inclusion in
|
26
|
+
the batch's processed URL Redis set. All unprocessed URLs are enqueued as
|
27
|
+
tasks within the same batch.
|
28
|
+
|
29
|
+
`#stage` can be called arbitrarily often, with invalid URLs too, as they are
|
30
|
+
filtered out behind the scenes:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
def index
|
34
|
+
stage "_bro:ken@url/" # => ["_bro:ken@url/"]
|
35
|
+
end
|
36
|
+
```
|
37
|
+
|
38
|
+
See also: [Performance: Stage less URLs](/guides/performance)
|
39
|
+
|
40
|
+
!!! attention "Failing action methods do not enqueue tasks"
|
41
|
+
|
42
|
+
If an action method fails as in:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
def index
|
46
|
+
stage page.meta.links.all
|
47
|
+
fail "Error occured"
|
48
|
+
end
|
49
|
+
```
|
50
|
+
|
51
|
+
None of the staged URLs are enqueued as tasks. Jobs that raise an exception
|
52
|
+
should get retried, or the exception should be handled.
|
53
|
+
|
54
|
+
|
55
|
+
## Routing URLs
|
56
|
+
|
57
|
+
In the following example, the task is written to the message queue, but the
|
58
|
+
job's routes do not match the URL. When the task gets consumed, the URL does not
|
59
|
+
get fetched and the action method is not called.
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
class DummyJob < Wayfarer::Base
|
63
|
+
route do
|
64
|
+
host "example.com", path: "/users/:user_id", to: :user
|
65
|
+
end
|
66
|
+
|
67
|
+
# ...
|
68
|
+
end
|
69
|
+
|
70
|
+
DummyJob.crawl_later("https://mismatching.host/users/42")
|
71
|
+
```
|
72
|
+
|
73
|
+
|
data/docs/guides/pages.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
# Pages
|
2
2
|
|
3
|
-
Retrieved pages
|
4
|
-
|
3
|
+
Retrieved pages take the shape of `Wayfarer::Page` objects and are available
|
4
|
+
to jobs:
|
5
5
|
|
6
6
|
```ruby
|
7
7
|
class DummyJob < Wayfarer::Worker
|
8
|
-
route
|
8
|
+
route { to :index }
|
9
9
|
|
10
10
|
def index
|
11
11
|
page # => #<Wayfarer::Page ...>
|
@@ -35,7 +35,7 @@ To access a page reflecting the current browser state, pass the `live` keyword:
|
|
35
35
|
|
36
36
|
```ruby
|
37
37
|
class DummyJob < Wayfarer::Worker
|
38
|
-
route
|
38
|
+
route { to :index }
|
39
39
|
|
40
40
|
def index
|
41
41
|
page # => #<Wayfarer::Page ...>
|