wayfarer 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +29 -2
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +17 -0
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -31
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -42
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -26
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
data/docs/cookbook/navigation.md
CHANGED
@@ -4,12 +4,12 @@
|
|
4
4
|
|
5
5
|
```ruby
|
6
6
|
class DummyJob < Wayfarer::Base
|
7
|
-
route
|
7
|
+
route.to :index
|
8
8
|
|
9
9
|
def index
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
user_agent.goto("https://example.com")
|
11
|
+
user_agent.back
|
12
|
+
user_agent.forward
|
13
13
|
end
|
14
14
|
end
|
15
15
|
```
|
@@ -18,12 +18,12 @@
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
user_agent.navigate.to("https://example.com")
|
25
|
+
user_agent.navigate.back
|
26
|
+
user_agent.navigate.forward
|
27
27
|
end
|
28
28
|
end
|
29
29
|
```
|
@@ -32,12 +32,12 @@
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
|
-
|
39
|
-
|
40
|
-
|
38
|
+
user_agent.visit("https://example.com")
|
39
|
+
user_agent.go_back
|
40
|
+
user_agent.go_forward
|
41
41
|
end
|
42
42
|
end
|
43
43
|
```
|
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
page.doc.css("html")
|
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
19
19
|
|
20
20
|
```ruby
|
21
21
|
class DummyJob < Wayfarer::Base
|
22
|
-
route
|
22
|
+
route.to :index
|
23
23
|
|
24
24
|
def index
|
25
25
|
browser.at_css("html")
|
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
|
|
32
32
|
|
33
33
|
```ruby
|
34
34
|
class DummyJob < Wayfarer::Base
|
35
|
-
route
|
35
|
+
route.to :index
|
36
36
|
|
37
37
|
def index
|
38
38
|
browser.find_elements(css: "html")
|
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
|
|
6
6
|
|
7
7
|
```ruby
|
8
8
|
class DummyJob < Wayfarer::Base
|
9
|
-
route
|
9
|
+
route.to :index
|
10
10
|
|
11
11
|
def index
|
12
12
|
browser.screenshot(path: "screenshot.png")
|
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
|
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
class DummyJob < Wayfarer::Base
|
21
|
-
route
|
21
|
+
route.to :index
|
22
22
|
|
23
23
|
def index
|
24
24
|
browser.save_screenshot("screenshot.png")
|
data/docs/cookbook/user_agent.md
CHANGED
data/docs/design.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Design decisions
|
2
|
+
|
3
|
+
## Navigate the web along URL patterns
|
4
|
+
|
5
|
+
URLs are less prone to change than served markup.
|
6
|
+
One reason for this is that changes to a URL's path can have a negative effect
|
7
|
+
on its page ranking in search engines. Many websites also implement common
|
8
|
+
architectural URL patterns, for example REST and its variations, that
|
9
|
+
lend themselves to pattern matching.
|
10
|
+
|
11
|
+
## Follow URLs verbatim
|
12
|
+
|
13
|
+
Normalized URLs are useful for deduplication, but URLs should be followed
|
14
|
+
as they appear in responses. Navigating to normalized versions of URLs makes
|
15
|
+
crawlers stick out from other user agents, for example.
|
16
|
+
|
17
|
+
## Tasks are version-less and don't persist metadata
|
18
|
+
|
19
|
+
Tasks serialize to their URL and batch. No other data gets written to
|
20
|
+
the message queue. Wayfarer aims to minimise job payloads.
|
21
|
+
There is also no need for versioning persisted tasks, since there is only one
|
22
|
+
version of a task: URL and batch.
|
23
|
+
|
24
|
+
## Why depend on Redis
|
25
|
+
|
26
|
+
There are two core features that depend on Redis. First, per-batch acyclicity is
|
27
|
+
achieved by maintaining the set of processed URLs per batch in Redis.
|
28
|
+
There's no option to follow links in a cyclic manner. Second, batch completion
|
29
|
+
requires updating an integer value in Redis, and batch completion is a very
|
30
|
+
useful feature, since most crawls should end eventually, and often you want to
|
31
|
+
know when.
|
32
|
+
|
33
|
+
## Persistence and document mapping not included
|
34
|
+
|
35
|
+
Like Active Job, Wayfarer is not concerned with persistence.
|
36
|
+
Model <-> DOM mapping abstractions are also out of scope.
|
data/docs/guides/callbacks.md
CHANGED
@@ -1,145 +1,43 @@
|
|
1
1
|
# Callbacks
|
2
2
|
|
3
|
-
|
3
|
+
Wayfarer supports a number of callbacks in addition to
|
4
|
+
[ActiveJob's](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
|
4
5
|
|
5
|
-
|
6
|
+
## Available callbacks
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
before_fetch :do_something
|
15
|
-
|
16
|
-
private
|
17
|
-
|
18
|
-
def do_something
|
19
|
-
# before the task.url is fetched
|
20
|
-
end
|
21
|
-
end
|
22
|
-
```
|
23
|
-
|
24
|
-
## `before_action`
|
25
|
-
|
26
|
-
Runs after a page was fetched, before an action method is called.
|
27
|
-
|
28
|
-
```ruby
|
29
|
-
class DummyJob < Wayfarer::Base
|
30
|
-
before_action :do_something
|
31
|
-
|
32
|
-
private
|
33
|
-
|
34
|
-
def do_something
|
35
|
-
# page is available at this point
|
36
|
-
end
|
37
|
-
end
|
38
|
-
```
|
8
|
+
* `before_fetch`
|
9
|
+
* `around_fetch`
|
10
|
+
* `after_fetch`
|
11
|
+
* `before_action`
|
12
|
+
* `around_action`
|
13
|
+
* `after_action`
|
14
|
+
* `after_batch`
|
39
15
|
|
40
16
|
## `after_batch`
|
41
17
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
after_batch do
|
47
|
-
# All jobs in batch done
|
48
|
-
end
|
49
|
-
end
|
50
|
-
```
|
51
|
-
|
52
|
-
Internally, a batch counter is in-/decremented on certain events. Once the
|
53
|
-
counter reaches zero, `after_batch` callbacks runs in declaration order.
|
54
|
-
|
55
|
-
The counter is incremented when within the batch:
|
18
|
+
You can register `after_batch` callbacks that run when there are no more tasks
|
19
|
+
to process in a batch. Wayfarer instruments job execution and in- or decrements
|
20
|
+
an integer counter in Redis on certain events. When the counter reaches zero,
|
21
|
+
the current job's `after_batch` callbacks run.
|
56
22
|
|
57
|
-
|
23
|
+
## Conditional callbacks
|
58
24
|
|
59
|
-
|
60
|
-
|
61
|
-
* A job succeeds.
|
62
|
-
* A job errors due to an unhandled exception.
|
63
|
-
* A job is discarded due to an exception.
|
64
|
-
* A job errors and thereyby exhausts its maximum attempts.
|
65
|
-
|
66
|
-
!!! attention "Batch callbacks can fail jobs"
|
67
|
-
|
68
|
-
If the last job's `after_batch` callbacks raises an exception, this can lead
|
69
|
-
to the job getting retried. If the exception raised by the callback is
|
70
|
-
unhandled or discarded, the callback never fully runs.
|
71
|
-
|
72
|
-
## Callback options
|
73
|
-
|
74
|
-
### Definition styles
|
75
|
-
|
76
|
-
Callbacks can be registered either by supplying a block or a symbol identifying
|
77
|
-
a callback instance method:
|
25
|
+
You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
|
26
|
+
keywords, for example to run a callback for some route `action` only:
|
78
27
|
|
79
28
|
```ruby
|
80
|
-
class DummyJob <
|
81
|
-
|
82
|
-
# ...
|
83
|
-
end
|
84
|
-
|
85
|
-
before_action :my_callback
|
29
|
+
class DummyJob < ActiveJob::Base
|
30
|
+
include Wayfarer::Base
|
86
31
|
|
87
|
-
|
32
|
+
route.host "example.com", to: :example
|
33
|
+
route.to :fallback
|
88
34
|
|
89
|
-
|
35
|
+
before_action unless: -> { action == :fallback } do
|
90
36
|
# ...
|
91
37
|
end
|
92
|
-
end
|
93
|
-
```
|
94
|
-
|
95
|
-
### Conditionals
|
96
|
-
|
97
|
-
Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
class DummyJob < Wayfarer::Base
|
101
|
-
before_fetch :my_callback, if: :my_condition
|
102
|
-
|
103
|
-
private
|
104
|
-
|
105
|
-
def my_callback
|
106
|
-
end
|
107
38
|
|
108
|
-
|
109
|
-
end
|
39
|
+
# ...
|
110
40
|
end
|
111
41
|
```
|
112
42
|
|
113
|
-
|
114
|
-
`:except` keywords:
|
115
|
-
|
116
|
-
```ruby
|
117
|
-
class DummyJob < Wayfarer::Base
|
118
|
-
before_fetch :do_something, only: :foo
|
119
|
-
|
120
|
-
before_fetch except: [:foo, :qux] do
|
121
|
-
# runs only before bar
|
122
|
-
end
|
123
|
-
|
124
|
-
def foo
|
125
|
-
end
|
126
|
-
|
127
|
-
def bar
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
```
|
132
|
-
|
133
|
-
### Early termination
|
134
|
-
|
135
|
-
Callbacks that return `false` halt the callback chain:
|
136
|
-
|
137
|
-
```ruby
|
138
|
-
class DummyJob < Wayfarer::Base
|
139
|
-
before_action { false }
|
140
|
-
|
141
|
-
before_action do
|
142
|
-
# never runs
|
143
|
-
end
|
144
|
-
end
|
145
|
-
```
|
43
|
+
You can also pass a symbol instead of a block to call an instance method.
|
@@ -13,27 +13,27 @@ Wayfarer parses environment variables into a runtime configuration
|
|
13
13
|
|
14
14
|
```ruby
|
15
15
|
# Which user agent to use to process tasks
|
16
|
-
Wayfarer.config
|
16
|
+
Wayfarer.config[:network][:agent] = :http # or :ferrum, :selenium
|
17
17
|
|
18
18
|
# How many user agents to instantiate
|
19
|
-
Wayfarer.config
|
19
|
+
Wayfarer.config[:network][:pool_size] = 3
|
20
20
|
|
21
21
|
# How long an agent may be used while processing a task
|
22
|
-
Wayfarer.config
|
22
|
+
Wayfarer.config[:network][:pool_timeout] = 5000
|
23
23
|
|
24
24
|
# Ferrum options
|
25
|
-
Wayfarer.config
|
25
|
+
Wayfarer.config[:ferrum][:options] = {}
|
26
26
|
|
27
27
|
# Selenium driver to use
|
28
|
-
Wayfarer.config
|
28
|
+
Wayfarer.config[:selenium][:driver] = :chrome
|
29
29
|
|
30
30
|
# Selenium HTTP client read timeout
|
31
|
-
Wayfarer.config
|
31
|
+
Wayfarer.config[:selenium][:client_timeout] = 10 # seconds
|
32
32
|
|
33
33
|
# Selenium options
|
34
|
-
Wayfarer.config
|
34
|
+
Wayfarer.config[:selenium][:options] = { url: "http://chrome" }
|
35
35
|
|
36
36
|
# HTTP request headers (Selenium is unsupported)
|
37
|
-
Wayfarer.config
|
37
|
+
Wayfarer.config[:network][:http_headers] = { "Field" => "Value" }
|
38
38
|
```
|
39
39
|
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Handlers
|
2
|
+
|
3
|
+
[Jobs](/jobs) can route tasks to handlers to delegate processing without
|
4
|
+
writes to the message queue. Unlike jobs, handlers don't inherit from
|
5
|
+
`ActiveJob::Base` and therefore cannot be enqueued. Handlers have routes, too,
|
6
|
+
but they don't retrieve pages and a handler's router can be bypassed.
|
7
|
+
|
8
|
+
## Supported features
|
9
|
+
|
10
|
+
Handlers support a subset of features compared to `Wayfarer::Base`:
|
11
|
+
|
12
|
+
* URL routing
|
13
|
+
* enqueueing tasks with `#!ruby stage(*urls)`
|
14
|
+
* access to the `user_agent` that retrieved the `page`
|
15
|
+
* ad-hoc HTTP requests with `#!ruby fetch(url)`
|
16
|
+
* callbacks, but only a subset of job callbacks
|
17
|
+
* Content-Type filtering
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class ExampleHandler
|
21
|
+
include Wayfarer::Handler
|
22
|
+
|
23
|
+
route.to :index
|
24
|
+
|
25
|
+
def index
|
26
|
+
task # => #<Wayfarer::Task>
|
27
|
+
page # => #<Wayfarer::Page>
|
28
|
+
user_agent # => Browser or HTTP client
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class DummyJob < ActiveJob::Base
|
33
|
+
include Wayfarer::Base
|
34
|
+
|
35
|
+
route.host "example.com", to: ExampleHandler
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
You can also bypass a handler's router and route directly to an instance
|
40
|
+
method:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
class DummyJob < ActiveJob::Base
|
44
|
+
include Wayfarer::Base
|
45
|
+
|
46
|
+
route.host "example.com", to: [ExampleHandler, :index]
|
47
|
+
end
|
48
|
+
|
49
|
+
class ExampleHandler
|
50
|
+
include Wayfarer::Handler
|
51
|
+
|
52
|
+
def index
|
53
|
+
task # => #<Wayfarer::Task>
|
54
|
+
page # => #<Wayfarer::Page>
|
55
|
+
user_agent # => Browser or HTTP client
|
56
|
+
end
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
!!! `before_action` callbacks
|
@@ -0,0 +1 @@
|
|
1
|
+
hello
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Error handling
|
2
|
+
|
3
|
+
!!! danger "Only ActiveJob error handling is supported"
|
4
|
+
|
5
|
+
Wayfarer exclusively supports ActiveJob's error handling. You cannot use
|
6
|
+
message queue-specific error handling; for example, error handling with
|
7
|
+
`sidekiq_options` is unsupported. Otherwise batches get garbage-collected
|
8
|
+
too early as Wayfarer instruments ActiveJob.
|
9
|
+
|
10
|
+
Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
|
11
|
+
|
12
|
+
* `retry_on` to retry jobs a number of times on certain errors:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class DummyJob < Wayfarer::Base
|
16
|
+
retry_on MyError, attempts: 3 do |job, error|
|
17
|
+
# This block runs once all 3 attempts have failed
|
18
|
+
# (1 initial attempt + 2 retries)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
```
|
22
|
+
|
23
|
+
* `discard_on` to throw away jobs on certain errors:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
class DummyJob < Wayfarer::Base
|
27
|
+
discard_on MyError do |job, error|
|
28
|
+
# This block runs once and buries the job
|
29
|
+
end
|
30
|
+
end
|
31
|
+
```
|
32
|
+
|
33
|
+
## Recreating user agents on certain errors
|
34
|
+
|
35
|
+
You can configure a list of exception classes upon which user agents
|
36
|
+
get recreated (see [User agent API]()):
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
|
40
|
+
```
|
data/docs/guides/jobs.md
CHANGED
@@ -1,78 +1,124 @@
|
|
1
1
|
# Jobs
|
2
2
|
|
3
|
-
Jobs are
|
3
|
+
Jobs are [Active Job](https://edgeguides.rubyonrails.org/active_job_basics.html)s
|
4
|
+
that use a DSL included from the `Wayfarer::Base` module to process [tasks](/guides/tasks)
|
5
|
+
that they read from a message queue.
|
6
|
+
Instead of implementing Active Job's `#perform` method yourself, you declare routes
|
7
|
+
to instance methods, similar to how web applications route incoming requests.
|
8
|
+
Only URLs that match a [route](../routing) are requested or navigated to.
|
9
|
+
The action method has access to the retrieved [page](../pages),
|
10
|
+
the [user agent](../user-agents) that retrieved the page and the current task:
|
4
11
|
|
5
12
|
```ruby
|
6
|
-
class DummyJob <
|
7
|
-
|
13
|
+
class DummyJob < ActiveJob::Base
|
14
|
+
include Wayfarer::Base
|
15
|
+
|
16
|
+
route.to :index
|
8
17
|
|
9
18
|
def index
|
19
|
+
task # => #<Wayfarer::Task>
|
20
|
+
page # => #<Wayfarer::Page>
|
21
|
+
user_agent # => Browser or HTTP client
|
10
22
|
end
|
11
23
|
end
|
12
24
|
```
|
13
25
|
|
14
|
-
|
26
|
+
You can start a crawl by appending a task to the message queue for the URL with
|
27
|
+
`::crawl`. By default, a UUID is generated as the batch:
|
15
28
|
|
16
29
|
```ruby
|
17
|
-
DummyJob.crawl("https://example.com")
|
30
|
+
task = DummyJob.crawl("https://example.com")
|
31
|
+
# => #<Wayfarer::Task url="https://example.com", batch="498a13e0-...">
|
18
32
|
```
|
19
33
|
|
20
|
-
This is the same as calling
|
21
|
-
|
34
|
+
This is exactly the same as calling Active Job's `#perform_later` and passing a
|
35
|
+
task directly:
|
22
36
|
|
23
37
|
```ruby
|
24
38
|
task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
|
25
39
|
DummyJob.perform_later(task)
|
26
40
|
```
|
27
41
|
|
28
|
-
|
42
|
+
Instead of a generated UUID, you can also set your own batch:
|
29
43
|
|
30
44
|
```ruby
|
31
45
|
DummyJob.crawl("https://example.com", batch: "my-batch")
|
32
46
|
```
|
33
47
|
|
34
|
-
|
48
|
+
You can also use Wayfarer's [CLI](../cli) to enqueue a task:
|
49
|
+
|
50
|
+
```sh
|
51
|
+
wayfarer enqueue --batch my-batch DummyJob "https://example.com"
|
52
|
+
```
|
53
|
+
|
54
|
+
## Navigating crawls
|
55
|
+
|
56
|
+
Jobs navigate crawls by staging URLs with `#!ruby stage(urls)`. When you stage a URL, a normalized
|
57
|
+
version of it is appended to an internal set. Once the action returns, all URLs
|
58
|
+
in the set are appended as tasks to the message queue.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
class DummyJob < ActiveJob::Base
|
62
|
+
include Wayfarer::Base
|
63
|
+
|
64
|
+
route.to :index
|
65
|
+
|
66
|
+
def index
|
67
|
+
# Follow all out-going links of the page
|
68
|
+
stage page.meta.links.external
|
69
|
+
end
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
## Accessing the current task
|
35
74
|
|
36
|
-
|
37
|
-
task is
|
75
|
+
If the task's URL matched a [route](../routing), the URL is retrieved over the network,
|
76
|
+
and the method that was routed to is called. The task is available as `#task`:
|
38
77
|
|
39
78
|
```ruby
|
40
|
-
class DummyJob <
|
41
|
-
|
79
|
+
class DummyJob < ActiveJob::Base
|
80
|
+
include Wayfarer::Base
|
81
|
+
|
82
|
+
route.to :index
|
42
83
|
|
43
84
|
def index
|
44
|
-
task.url
|
85
|
+
task.url # => "https://example.com"
|
45
86
|
task.batch # => "my-batch"
|
46
87
|
end
|
47
88
|
end
|
48
89
|
```
|
49
90
|
|
50
|
-
##
|
91
|
+
## Accessing the current page
|
51
92
|
|
52
|
-
|
53
|
-
matched a route:
|
93
|
+
You have access to the retrieved [page](../pages):
|
54
94
|
|
55
95
|
```ruby
|
56
|
-
class DummyJob <
|
57
|
-
|
96
|
+
class DummyJob < ActiveJob::Base
|
97
|
+
include Wayfarer::Base
|
98
|
+
|
99
|
+
route.to :index
|
58
100
|
|
59
101
|
def index
|
60
102
|
page.url # => "https://example.com"
|
61
103
|
page.body # => "<html>..."
|
62
104
|
page.status_code # => 200
|
63
105
|
page.headers # { "Content-Type" => ... }
|
106
|
+
page.doc # Only present for certain Content-Types
|
64
107
|
end
|
65
108
|
end
|
66
109
|
```
|
67
110
|
|
68
|
-
##
|
111
|
+
## Routing URLs to methods and extracting `params`
|
69
112
|
|
70
|
-
Jobs
|
113
|
+
Jobs have a routing DSL that allows you to map URLs to methods and extract
|
114
|
+
URL data:
|
71
115
|
|
72
116
|
```ruby
|
73
|
-
class DummyJob <
|
117
|
+
class DummyJob < ActiveJob::Base
|
118
|
+
include Wayfarer::Base
|
119
|
+
|
74
120
|
route do
|
75
|
-
path "/users/:id/profile"
|
121
|
+
path "/users/:id/profile", to: :index
|
76
122
|
end
|
77
123
|
|
78
124
|
def index
|
@@ -80,22 +126,44 @@ class DummyJob < Wayfarer::Base
|
|
80
126
|
end
|
81
127
|
end
|
82
128
|
|
83
|
-
DummyJob.crawl("https://example.com/users/42/profile")
|
129
|
+
DummyJob.crawl("https://example.com/users/42/profile?foo=bar")
|
84
130
|
```
|
85
131
|
|
132
|
+
## Controlling the user agent
|
86
133
|
|
87
|
-
|
88
|
-
|
89
|
-
The HTTP client or automated browser that fetched the URL is available:
|
134
|
+
You can control the browser or HTTP client that retrieved the page:
|
90
135
|
|
91
136
|
```ruby
|
92
|
-
Wayfarer.config
|
137
|
+
Wayfarer.config[:network][:agent] = :ferrum # Chrome DevTools Protocol
|
93
138
|
|
94
|
-
class DummyJob <
|
95
|
-
|
139
|
+
class DummyJob < ActiveJob::Base
|
140
|
+
include Wayfarer::Base
|
141
|
+
|
142
|
+
route.to :index
|
96
143
|
|
97
144
|
def index
|
98
|
-
|
145
|
+
user_agent.save_screenshot("capture.png")
|
99
146
|
end
|
100
147
|
end
|
101
148
|
```
|
149
|
+
|
150
|
+
## Restricting the processed Content-Types
|
151
|
+
|
152
|
+
By default, jobs process pages regardless of their Content-Type response
|
153
|
+
header. You can allow a list of Content-Types as strings and Regexps and
|
154
|
+
opt out of the default behaviour. Once at least one Content-Type is allowed,
|
155
|
+
other Content-Types don't get processed:
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
class DummyJob < ActiveJob::Base
|
159
|
+
include Wayfarer::Base
|
160
|
+
|
161
|
+
content_type "text/html", "application/json"
|
162
|
+
content_type /xml/
|
163
|
+
end
|
164
|
+
```
|
165
|
+
|
166
|
+
!!! info "HTTP parameters in Content-Types are ignored for comparison"
|
167
|
+
|
168
|
+
Content-Types are compared regardless of their parameters. For example,
|
169
|
+
`text/html; charset=UTF-8` is considered the same as `text/html`.
|