wayfarer 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +28 -1
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +1 -1
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
data/docs/cookbook/navigation.md
CHANGED
@@ -4,12 +4,12 @@
|
|
4
4
|
|
5
5
|
```ruby
|
6
6
|
class DummyJob < Wayfarer::Base
|
7
|
-
route
|
7
|
+
route.to :index
|
8
8
|
|
9
9
|
def index
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
user_agent.goto("https://example.com")
|
11
|
+
user_agent.back
|
12
|
+
user_agent.forward
|
13
13
|
end
|
14
14
|
end
|
15
15
|
```
|
@@ -18,12 +18,12 @@
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
user_agent.navigate.to("https://example.com")
|
25
|
+
user_agent.navigate.back
|
26
|
+
user_agent.navigate.forward
|
27
27
|
end
|
28
28
|
end
|
29
29
|
```
|
@@ -32,12 +32,12 @@
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
user_agent.visit("https://example.com")
|
39
|
+
user_agent.go_back
|
40
|
+
user_agent.go_forward
|
41
41
|
end
|
42
42
|
end
|
43
43
|
```
|
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
page.doc.css("html")
|
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
class DummyJob < Wayfarer::Base
|
22
|
-
route
|
22
|
+
route.to :index
|
23
23
|
|
24
24
|
def index
|
25
25
|
browser.at_css("html")
|
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
38
|
browser.find_elements(css: "html")
|
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
browser.screenshot(path: "screenshot.png")
|
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
24
|
browser.save_screenshot("screenshot.png")
|
data/docs/cookbook/user_agent.md
CHANGED
data/docs/design.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Design decisions
|
2
|
+
|
3
|
+
## Navigate the web along URL patterns
|
4
|
+
|
5
|
+
URLs are less prone to change than served markup.
|
6
|
+
One reason for this is that changes to a URL's path can have a negative effect
|
7
|
+
on its page ranking in search engines. Many websites also implement common
|
8
|
+
architectural URL patterns, for example REST and its variations, that
|
9
|
+
lend themselves to pattern matching.
|
10
|
+
|
11
|
+
## Follow URLs verbatim
|
12
|
+
|
13
|
+
Normalized URLs are useful for deduplication, but URLs should be followed
|
14
|
+
as they appear in responses. Navigating to normalized versions of URLs makes
|
15
|
+
crawlers stick out from other user agents, for example.
|
16
|
+
|
17
|
+
## Tasks are version-less and don't persist metadata
|
18
|
+
|
19
|
+
Tasks serialize to their URL and batch. No other data gets written to
|
20
|
+
the message queue. Wayfarer aims to minimise job payloads.
|
21
|
+
There is also no need for versioning persisted tasks, since there is only one
|
22
|
+
version of a task: URL and batch.
|
23
|
+
|
24
|
+
## Why depend on Redis
|
25
|
+
|
26
|
+
There are two core features that depend on Redis. First, per-batch acyclicity is
|
27
|
+
achieved by maintaining the set of processed URLs per batch in Redis.
|
28
|
+
There's no option to follow links in a cyclic manner. Second, batch completion
|
29
|
+
requires updating an integer value in Redis, and batch completion is a very
|
30
|
+
useful feature, since most crawls should end eventually, and often you want to
|
31
|
+
know when.
|
32
|
+
|
33
|
+
## Persistence and document mapping not included
|
34
|
+
|
35
|
+
Like Active Job, Wayfarer is not concerned with persistence.
|
36
|
+
Model <-> DOM mapping abstractions are also out of scope.
|
data/docs/guides/callbacks.md
CHANGED
@@ -1,145 +1,43 @@
|
|
1
1
|
# Callbacks
|
2
2
|
|
3
|
-
|
3
|
+
Wayfarer supports a number of callbacks in addition to
|
4
|
+
[ActiveJob's](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
|
4
5
|
|
5
|
-
|
6
|
+
## Available callbacks
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
before_fetch :do_something
|
15
|
-
|
16
|
-
private
|
17
|
-
|
18
|
-
def do_something
|
19
|
-
# before the task.url is fetched
|
20
|
-
end
|
21
|
-
end
|
22
|
-
```
|
23
|
-
|
24
|
-
## `before_action`
|
25
|
-
|
26
|
-
Runs after a page was fetched, before an action method is called.
|
27
|
-
|
28
|
-
```ruby
|
29
|
-
class DummyJob < Wayfarer::Base
|
30
|
-
before_action :do_something
|
31
|
-
|
32
|
-
private
|
33
|
-
|
34
|
-
def do_something
|
35
|
-
# page is available at this point
|
36
|
-
end
|
37
|
-
end
|
38
|
-
```
|
8
|
+
* `before_fetch`
|
9
|
+
* `around_fetch`
|
10
|
+
* `after_fetch`
|
11
|
+
* `before_action`
|
12
|
+
* `around_action`
|
13
|
+
* `after_action`
|
14
|
+
* `after_batch`
|
39
15
|
|
40
16
|
## `after_batch`
|
41
17
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
after_batch do
|
47
|
-
# All jobs in batch done
|
48
|
-
end
|
49
|
-
end
|
50
|
-
```
|
51
|
-
|
52
|
-
Internally, a batch counter is in-/decremented on certain events. Once the
|
53
|
-
counter reaches zero, `after_batch` callbacks runs in declaration order.
|
54
|
-
|
55
|
-
The counter is incremented when within the batch:
|
18
|
+
You can register `after_batch` callbacks that run when there are no more tasks
|
19
|
+
to process in a batch. Wayfarer instruments job execution and in- or decrements
|
20
|
+
an integer counter in Redis on certain events. When the counter reaches zero,
|
21
|
+
the current job's `after_batch` callbacks run.
|
56
22
|
|
57
|
-
|
23
|
+
## Conditional callbacks
|
58
24
|
|
59
|
-
|
60
|
-
|
61
|
-
* A job succeeds.
|
62
|
-
* A job errors due to an unhandled exception.
|
63
|
-
* A job is discarded due to an exception.
|
64
|
-
* A job errors and thereyby exhausts its maximum attempts.
|
65
|
-
|
66
|
-
!!! attention "Batch callbacks can fail jobs"
|
67
|
-
|
68
|
-
If the last job's `after_batch` callbacks raises an exception, this can lead
|
69
|
-
to the job getting retried. If the exception raised by the callback is
|
70
|
-
unhandled or discarded, the callback never fully runs.
|
71
|
-
|
72
|
-
## Callback options
|
73
|
-
|
74
|
-
### Definition styles
|
75
|
-
|
76
|
-
Callbacks can be registered either by supplying a block or a symbol identifying
|
77
|
-
a callback instance method:
|
25
|
+
You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
|
26
|
+
keywords, for example to run a callback for some route `action` only:
|
78
27
|
|
79
28
|
```ruby
|
80
|
-
class DummyJob <
|
81
|
-
|
82
|
-
# ...
|
83
|
-
end
|
84
|
-
|
85
|
-
before_action :my_callback
|
29
|
+
class DummyJob < ActiveJob::Base
|
30
|
+
include Wayfarer::Base
|
86
31
|
|
87
|
-
|
32
|
+
route.host "example.com", to: :example
|
33
|
+
route.to :fallback
|
88
34
|
|
89
|
-
|
35
|
+
before_action unless: -> { action == :fallback } do
|
90
36
|
# ...
|
91
37
|
end
|
92
|
-
end
|
93
|
-
```
|
94
|
-
|
95
|
-
### Conditionals
|
96
|
-
|
97
|
-
Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
class DummyJob < Wayfarer::Base
|
101
|
-
before_fetch :my_callback, if: :my_condition
|
102
|
-
|
103
|
-
private
|
104
|
-
|
105
|
-
def my_callback
|
106
|
-
end
|
107
38
|
|
108
|
-
|
109
|
-
end
|
39
|
+
# ...
|
110
40
|
end
|
111
41
|
```
|
112
42
|
|
113
|
-
|
114
|
-
`:except` keywords:
|
115
|
-
|
116
|
-
```ruby
|
117
|
-
class DummyJob < Wayfarer::Base
|
118
|
-
before_fetch :do_something, only: :foo
|
119
|
-
|
120
|
-
before_fetch except: [:foo, :qux] do
|
121
|
-
# runs only before bar
|
122
|
-
end
|
123
|
-
|
124
|
-
def foo
|
125
|
-
end
|
126
|
-
|
127
|
-
def bar
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
```
|
132
|
-
|
133
|
-
### Early termination
|
134
|
-
|
135
|
-
Callbacks that return `false` halt the callback chain:
|
136
|
-
|
137
|
-
```ruby
|
138
|
-
class DummyJob < Wayfarer::Base
|
139
|
-
before_action { false }
|
140
|
-
|
141
|
-
before_action do
|
142
|
-
# never runs
|
143
|
-
end
|
144
|
-
end
|
145
|
-
```
|
43
|
+
You can also pass a symbol instead of a block to call an instance method.
|
@@ -13,27 +13,27 @@ Wayfarer parses environment variables into a runtime configuration
|
|
13
13
|
|
14
14
|
```ruby
|
15
15
|
# Which user agent to use to process tasks
|
16
|
-
Wayfarer.config
|
16
|
+
Wayfarer.config[:network][:agent] = :http # or :ferrum, :selenium
|
17
17
|
|
18
18
|
# How many user agents to instantiate
|
19
|
-
Wayfarer.config
|
19
|
+
Wayfarer.config[:network][:pool_size] = 3
|
20
20
|
|
21
21
|
# How long an agent may be used while processing a task
|
22
|
-
Wayfarer.config
|
22
|
+
Wayfarer.config[:network][:pool_timeout] = 5000
|
23
23
|
|
24
24
|
# Ferrum options
|
25
|
-
Wayfarer.config
|
25
|
+
Wayfarer.config[:ferrum][:options] = {}
|
26
26
|
|
27
27
|
# Selenium driver to use
|
28
|
-
Wayfarer.config
|
28
|
+
Wayfarer.config[:selenium][:driver] = :chrome
|
29
29
|
|
30
30
|
# Selenium HTTP client read timeout
|
31
|
-
Wayfarer.config
|
31
|
+
Wayfarer.config[:selenium][:client_timeout] = 10 # seconds
|
32
32
|
|
33
33
|
# Selenium options
|
34
|
-
Wayfarer.config
|
34
|
+
Wayfarer.config[:selenium][:options] = { url: "http://chrome" }
|
35
35
|
|
36
36
|
# HTTP request headers (Selenium is unsupported)
|
37
|
-
Wayfarer.config
|
37
|
+
Wayfarer.config[:network][:http_headers] = { "Field" => "Value" }
|
38
38
|
```
|
39
39
|
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Handlers
|
2
|
+
|
3
|
+
[Jobs](/jobs) can route tasks to handlers to delegate processing without
|
4
|
+
writes to the message queue. Unlike jobs, handlers don't inherit from
|
5
|
+
`ActiveJob::Base` and therefore cannot be enqueued. Handlers have routes, too,
|
6
|
+
but they don't retrieve pages and a handler's router can be bypassed.
|
7
|
+
|
8
|
+
## Supported features
|
9
|
+
|
10
|
+
Handlers support a subset of features compared to `Wayfarer::Base`:
|
11
|
+
|
12
|
+
* URL routing
|
13
|
+
* enqueueing tasks with `#!ruby stage(*urls)`
|
14
|
+
* access to the `user_agent` that retrieved the `page`
|
15
|
+
* ad-hoc HTTP requests with `#!ruby fetch(url)`
|
16
|
+
* callbacks, but only a subset of job callbacks
|
17
|
+
* Content-Type filtering
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class ExampleHandler
|
21
|
+
include Wayfarer::Handler
|
22
|
+
|
23
|
+
route.to :index
|
24
|
+
|
25
|
+
def index
|
26
|
+
task # => #<Wayfarer::Task>
|
27
|
+
page # => #<Wayfarer::Page>
|
28
|
+
user_agent # => Browser or HTTP client
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class DummyJob < ActiveJob::Base
|
33
|
+
include Wayfarer::Base
|
34
|
+
|
35
|
+
route.host "example.com", to: ExampleHandler
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also bypass a handler's router and route directly to an instance
|
40
|
+
method:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
class DummyJob < ActiveJob::Base
|
44
|
+
include Wayfarer::Base
|
45
|
+
|
46
|
+
route.host "example.com", to: [ExampleHandler, :index]
|
47
|
+
end
|
48
|
+
|
49
|
+
class ExampleHandler
|
50
|
+
include Wayfarer::Handler
|
51
|
+
|
52
|
+
def index
|
53
|
+
task # => #<Wayfarer::Task>
|
54
|
+
page # => #<Wayfarer::Page>
|
55
|
+
user_agent # => Browser or HTTP client
|
56
|
+
end
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
!!! `before_action` callbacks
|
@@ -0,0 +1 @@
|
|
1
|
+
hello
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Error handling
|
2
|
+
|
3
|
+
!!! danger "Only ActiveJob error handling is supported"
|
4
|
+
|
5
|
+
Wayfarer exclusively supports ActiveJob's error handling. You cannot use
|
6
|
+
message queue-specific error handling; for example, error handling with
|
7
|
+
`sidekiq_options` is unsupported. Otherwise batches get garbage-collected
|
8
|
+
too early as Wayfarer instruments ActiveJob.
|
9
|
+
|
10
|
+
Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
|
11
|
+
|
12
|
+
* `retry_on` to retry jobs a number of times on certain errors:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
retry_on MyError, attempts: 3 do |job, error|
|
17
|
+
# This block runs once all 3 attempts have failed
|
18
|
+
# (1 initial attempt + 2 retries)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
```
|
22
|
+
|
23
|
+
* `discard_on` to throw away jobs on certain errors:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
class DummyJob < Wayfarer::Base
|
27
|
+
discard_on MyError do |job, error|
|
28
|
+
# This block runs once and buries the job
|
29
|
+
end
|
30
|
+
end
|
31
|
+
```
|
32
|
+
|
33
|
+
## Recreating user agents on certain errors
|
34
|
+
|
35
|
+
You can configure a list of exception classes upon which user agents
|
36
|
+
get recreated (see [User agent API]()):
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
|
40
|
+
```
|
data/docs/guides/jobs.md
CHANGED
@@ -1,78 +1,124 @@
|
|
1
1
|
# Jobs
|
2
2
|
|
3
|
-
Jobs are
|
3
|
+
Jobs are [Active Job](https://edgeguides.rubyonrails.org/active_job_basics.html)s
|
4
|
+
that use a DSL included from the `Wayfarer::Base` module to process [tasks](/guides/tasks)
|
5
|
+
that they read from a message queue.
|
6
|
+
Instead of implementing Active Job's `#perform` method yourself, you declare routes
|
7
|
+
to instance methods, similar to how web applications route incoming requests.
|
8
|
+
Only URLs that match a [route](../routing) are requested or navigated to.
|
9
|
+
The action method has access to the retrieved [page](../pages),
|
10
|
+
the [user agent](../user-agents) that retrieved the page and the current task:
|
4
11
|
|
5
12
|
```ruby
|
6
|
-
class DummyJob <
|
7
|
-
|
13
|
+
class DummyJob < ActiveJob::Base
|
14
|
+
include Wayfarer::Base
|
15
|
+
|
16
|
+
route.to :index
|
8
17
|
|
9
18
|
def index
|
19
|
+
task # => #<Wayfarer::Task>
|
20
|
+
page # => #<Wayfarer::Page>
|
21
|
+
user_agent # => Browser or HTTP client
|
10
22
|
end
|
11
23
|
end
|
12
24
|
```
|
13
25
|
|
14
|
-
|
26
|
+
You can start a crawl by appending a task to the message queue for the URL with
|
27
|
+
`::crawl`. By default, a UUID is generated as the batch:
|
15
28
|
|
16
29
|
```ruby
|
17
|
-
DummyJob.crawl("https://example.com")
|
30
|
+
task = DummyJob.crawl("https://example.com")
|
31
|
+
# => #<Wayfarer::Task url="https://example.com", batch="498a13e0-...">
|
18
32
|
```
|
19
33
|
|
20
|
-
This is the same as calling
|
21
|
-
|
34
|
+
This is exactly the same as calling Active Job's `#perform_later` and passing a
|
35
|
+
task directly:
|
22
36
|
|
23
37
|
```ruby
|
24
38
|
task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
|
25
39
|
DummyJob.perform_later(task)
|
26
40
|
```
|
27
41
|
|
28
|
-
|
42
|
+
Instead of a generated UUID, you can also set your own batch:
|
29
43
|
|
30
44
|
```ruby
|
31
45
|
DummyJob.crawl("https://example.com", batch: "my-batch")
|
32
46
|
```
|
33
47
|
|
34
|
-
|
48
|
+
You can also use Wayfarer's [CLI](../cli) to enqueue a task:
|
49
|
+
|
50
|
+
```sh
|
51
|
+
wayfarer enqueue --batch my-batch DummyJob "https://example.com"
|
52
|
+
```
|
53
|
+
|
54
|
+
## Navigating crawls
|
55
|
+
|
56
|
+
Jobs navigate crawls by staging URLs with `#!ruby stage(urls)`. When you stage a URL, a normalized
|
57
|
+
version of it is appended to an internal set. Once the action returns, all URLs
|
58
|
+
in the set are appended as tasks to the message queue.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
class DummyJob < ActiveJob::Base
|
62
|
+
include Wayfarer::Base
|
63
|
+
|
64
|
+
route.to :index
|
65
|
+
|
66
|
+
def index
|
67
|
+
# Follow all out-going links of the page
|
68
|
+
stage page.meta.links.external
|
69
|
+
end
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
## Accessing the current task
|
35
74
|
|
36
|
-
|
37
|
-
task is
|
75
|
+
If the task's URL matched a [route](../routing), the URL is retrieved over the network,
|
76
|
+
and the method that was routed to is called. The task is available as `#task`:
|
38
77
|
|
39
78
|
```ruby
|
40
|
-
class DummyJob <
|
41
|
-
|
79
|
+
class DummyJob < ActiveJob::Base
|
80
|
+
include Wayfarer::Base
|
81
|
+
|
82
|
+
route.to :index
|
42
83
|
|
43
84
|
def index
|
44
|
-
task.url
|
85
|
+
task.url # => "https://example.com"
|
45
86
|
task.batch # => "my-batch"
|
46
87
|
end
|
47
88
|
end
|
48
89
|
```
|
49
90
|
|
50
|
-
##
|
91
|
+
## Accessing the current page
|
51
92
|
|
52
|
-
|
53
|
-
matched a route:
|
93
|
+
You have access to the retrieved [page](../pages):
|
54
94
|
|
55
95
|
```ruby
|
56
|
-
class DummyJob <
|
57
|
-
|
96
|
+
class DummyJob < ActiveJob::Base
|
97
|
+
include Wayfarer::Base
|
98
|
+
|
99
|
+
route.to :index
|
58
100
|
|
59
101
|
def index
|
60
102
|
page.url # => "https://example.com"
|
61
103
|
page.body # => "<html>..."
|
62
104
|
page.status_code # => 200
|
63
105
|
page.headers # => { "Content-Type" => ... }
|
106
|
+
page.doc # Only present for certain Content-Types
|
64
107
|
end
|
65
108
|
end
|
66
109
|
```
|
67
110
|
|
68
|
-
##
|
111
|
+
## Routing URLs to methods and extracting `params`
|
69
112
|
|
70
|
-
Jobs
|
113
|
+
Jobs have a routing DSL that allows you to map URLs to methods and extract
|
114
|
+
URL data:
|
71
115
|
|
72
116
|
```ruby
|
73
|
-
class DummyJob <
|
117
|
+
class DummyJob < ActiveJob::Base
|
118
|
+
include Wayfarer::Base
|
119
|
+
|
74
120
|
route do
|
75
|
-
path "/users/:id/profile"
|
121
|
+
path "/users/:id/profile", to: :index
|
76
122
|
end
|
77
123
|
|
78
124
|
def index
|
@@ -80,22 +126,44 @@ class DummyJob < Wayfarer::Base
|
|
80
126
|
end
|
81
127
|
end
|
82
128
|
|
83
|
-
DummyJob.crawl("https://example.com/users/42/profile")
|
129
|
+
DummyJob.crawl("https://example.com/users/42/profile?foo=bar")
|
84
130
|
```
|
85
131
|
|
132
|
+
## Controlling the user agent
|
86
133
|
|
87
|
-
|
88
|
-
|
89
|
-
The HTTP client or automated browser that fetched the URL is available:
|
134
|
+
You can control the browser or HTTP client that retrieved the page:
|
90
135
|
|
91
136
|
```ruby
|
92
|
-
Wayfarer.config
|
137
|
+
Wayfarer.config[:network][:agent] = :ferrum # Chrome DevTools Protocol
|
93
138
|
|
94
|
-
class DummyJob <
|
95
|
-
|
139
|
+
class DummyJob < ActiveJob::Base
|
140
|
+
include Wayfarer::Base
|
141
|
+
|
142
|
+
route.to :index
|
96
143
|
|
97
144
|
def index
|
98
|
-
|
145
|
+
user_agent.save_screenshot("capture.png")
|
99
146
|
end
|
100
147
|
end
|
101
148
|
```
|
149
|
+
|
150
|
+
## Restricting the processed Content-Types
|
151
|
+
|
152
|
+
By default, jobs process pages regardless of their Content-Type response
|
153
|
+
header. You can allow a list of Content-Types as strings and Regexps and
|
154
|
+
opt out of the default behaviour. Once at least one Content-Type is allowed,
|
155
|
+
other Content-Types don't get processed:
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
class DummyJob < ActiveJob::Base
|
159
|
+
include Wayfarer::Base
|
160
|
+
|
161
|
+
content_type "text/html", "application/json"
|
162
|
+
content_type /xml/
|
163
|
+
end
|
164
|
+
```
|
165
|
+
|
166
|
+
!!! info "HTTP parameters in Content-Types are ignored for comparison"
|
167
|
+
|
168
|
+
Content-Types are compared regardless of their parameters. For example,
|
169
|
+
`text/html; charset=UTF-8` is considered the same as `text/html`.
|