wayfarer 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yaml +1 -1
- data/Gemfile.lock +20 -15
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/guides/browser_automation/capybara.md +64 -1
- data/docs/guides/browser_automation/custom_adapters.md +100 -0
- data/docs/guides/browser_automation/ferrum.md +3 -3
- data/docs/guides/browser_automation/selenium.md +7 -5
- data/docs/guides/callbacks.md +117 -10
- data/docs/guides/configuration.md +16 -10
- data/docs/guides/error_handling.md +9 -5
- data/docs/guides/networking.md +77 -3
- data/docs/index.md +9 -1
- data/docs/reference/api/base.md +4 -4
- data/docs/reference/configuration_keys.md +42 -0
- data/docs/reference/environment_variables.md +25 -27
- data/lib/wayfarer/base.rb +7 -17
- data/lib/wayfarer/callbacks.rb +71 -0
- data/lib/wayfarer/cli/base.rb +5 -1
- data/lib/wayfarer/cli/job.rb +7 -3
- data/lib/wayfarer/cli/route.rb +2 -2
- data/lib/wayfarer/cli/route_printer.rb +7 -7
- data/lib/wayfarer/config/capybara.rb +10 -0
- data/lib/wayfarer/config/ferrum.rb +11 -0
- data/lib/wayfarer/config/networking.rb +26 -0
- data/lib/wayfarer/config/redis.rb +14 -0
- data/lib/wayfarer/config/root.rb +11 -0
- data/lib/wayfarer/config/selenium.rb +21 -0
- data/lib/wayfarer/config/strconv.rb +45 -0
- data/lib/wayfarer/config/struct.rb +72 -0
- data/lib/wayfarer/gc.rb +3 -7
- data/lib/wayfarer/middleware/fetch.rb +7 -3
- data/lib/wayfarer/middleware/router.rb +2 -2
- data/lib/wayfarer/middleware/worker.rb +12 -9
- data/lib/wayfarer/networking/capybara.rb +28 -0
- data/lib/wayfarer/networking/context.rb +36 -0
- data/lib/wayfarer/networking/ferrum.rb +17 -52
- data/lib/wayfarer/networking/http.rb +34 -0
- data/lib/wayfarer/networking/pool.rb +15 -10
- data/lib/wayfarer/networking/result.rb +1 -1
- data/lib/wayfarer/networking/selenium.rb +20 -47
- data/lib/wayfarer/networking/strategy.rb +38 -0
- data/lib/wayfarer/page.rb +2 -3
- data/lib/wayfarer/redis/pool.rb +3 -1
- data/lib/wayfarer/routing/dsl.rb +8 -8
- data/lib/wayfarer/routing/matchers/custom.rb +23 -0
- data/lib/wayfarer/routing/matchers/host.rb +19 -0
- data/lib/wayfarer/routing/matchers/path.rb +48 -0
- data/lib/wayfarer/routing/matchers/query.rb +63 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
- data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
- data/lib/wayfarer/routing/matchers/url.rb +17 -0
- data/lib/wayfarer/routing/route.rb +1 -1
- data/lib/wayfarer.rb +9 -9
- data/spec/base_spec.rb +14 -0
- data/spec/callbacks_spec.rb +102 -0
- data/spec/cli/job_spec.rb +6 -6
- data/spec/config/capybara_spec.rb +18 -0
- data/spec/config/ferrum_spec.rb +24 -0
- data/spec/config/networking_spec.rb +73 -0
- data/spec/config/redis_spec.rb +32 -0
- data/spec/config/root_spec.rb +31 -0
- data/spec/config/selenium_spec.rb +56 -0
- data/spec/config/strconv_spec.rb +58 -0
- data/spec/config/struct_spec.rb +66 -0
- data/spec/gc_spec.rb +8 -6
- data/spec/middleware/fetch_spec.rb +20 -8
- data/spec/middleware/router_spec.rb +7 -0
- data/spec/middleware/worker_spec.rb +64 -27
- data/spec/networking/capybara_spec.rb +12 -0
- data/spec/networking/context_spec.rb +127 -0
- data/spec/networking/ferrum_spec.rb +6 -22
- data/spec/networking/http_spec.rb +12 -0
- data/spec/networking/pool_spec.rb +37 -12
- data/spec/networking/selenium_spec.rb +6 -22
- data/spec/networking/strategy.rb +170 -0
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/routing/dsl_spec.rb +10 -10
- data/spec/routing/integration_spec.rb +22 -22
- data/spec/routing/{custom_matcher_spec.rb → matchers/custom_spec.rb} +4 -4
- data/spec/routing/{host_matcher_spec.rb → matchers/host_spec.rb} +6 -6
- data/spec/routing/{path_matcher_spec.rb → matchers/path_spec.rb} +6 -6
- data/spec/routing/{query_matcher_spec.rb → matchers/query_spec.rb} +15 -15
- data/spec/routing/{scheme_matcher_spec.rb → matchers/scheme_spec.rb} +4 -4
- data/spec/routing/{suffix_matcher_spec.rb → matchers/suffix_spec.rb} +4 -4
- data/spec/routing/{uri_matcher_spec.rb → matchers/uri_spec.rb} +4 -4
- data/spec/routing/path_finder_spec.rb +1 -1
- data/spec/routing/root_route_spec.rb +2 -2
- data/spec/routing/route_spec.rb +2 -2
- data/spec/spec_helpers.rb +13 -5
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +8 -7
- metadata +74 -33
- data/lib/wayfarer/config.rb +0 -67
- data/lib/wayfarer/networking/healer.rb +0 -21
- data/lib/wayfarer/networking/net_http.rb +0 -52
- data/lib/wayfarer/routing/custom_matcher.rb +0 -21
- data/lib/wayfarer/routing/host_matcher.rb +0 -23
- data/lib/wayfarer/routing/path_matcher.rb +0 -46
- data/lib/wayfarer/routing/query_matcher.rb +0 -67
- data/lib/wayfarer/routing/scheme_matcher.rb +0 -21
- data/lib/wayfarer/routing/suffix_matcher.rb +0 -21
- data/lib/wayfarer/routing/url_matcher.rb +0 -21
- data/spec/config_spec.rb +0 -144
- data/spec/networking/adapter.rb +0 -135
- data/spec/networking/healer_spec.rb +0 -46
- data/spec/networking/net_http_spec.rb +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2bbcc550d6799e1e3588b832905866116105fdcceb0eeef5ec244622f15bb10
|
4
|
+
data.tar.gz: b8b324d89d162e578cde829f15a44bb08b9e93ad2a3bb5b6619e96275a7fa6cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 998c06776f7a7922aa2d36770dc7e4389c5814ac36a20062b20e7fe6986fb52e4fde9f538287510fccfca1689fb9f7f4e019ec23dcbeff777add1a99e24fba26
|
7
|
+
data.tar.gz: 239e3db3d5fffb8f81e74c655a648ce03febd346c25fb86b695bca8a8d328e6ff341d63d4becf63a2811defae3af5a1c4bf1c772e327b114df5909a06151a95b
|
data/.github/workflows/ci.yaml
CHANGED
@@ -22,7 +22,7 @@ jobs:
|
|
22
22
|
- name: Run Ferrum tests
|
23
23
|
run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:ferrum
|
24
24
|
|
25
|
-
- name: Run
|
25
|
+
- name: Run Selenium tests
|
26
26
|
run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:selenium
|
27
27
|
|
28
28
|
- name: Run CLI tests
|
data/Gemfile.lock
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
wayfarer (0.4.
|
4
|
+
wayfarer (0.4.1)
|
5
5
|
activejob (~> 6.0)
|
6
|
-
|
6
|
+
addressable (~> 2.8)
|
7
|
+
capybara (~> 3.0)
|
7
8
|
connection_pool (~> 2.2)
|
8
|
-
cuprite (~> 0.11)
|
9
9
|
docile (~> 1.1)
|
10
10
|
ferrum (~> 0.9)
|
11
11
|
metainspector (~> 5.0)
|
12
12
|
mime-types (~> 3.0)
|
13
13
|
mock_redis (~> 0.29)
|
14
14
|
mustermann (~> 1.1)
|
15
|
-
net-http-persistent (~>
|
15
|
+
net-http-persistent (~> 3.0)
|
16
16
|
nokogiri (~> 1.11)
|
17
17
|
normalize_url (~> 0.0.6)
|
18
|
-
redis (~> 4.
|
18
|
+
redis (~> 4.4, < 4.5)
|
19
19
|
selenium-webdriver (~> 3.4)
|
20
20
|
thor (~> 1.0)
|
21
21
|
|
@@ -34,13 +34,15 @@ GEM
|
|
34
34
|
addressable (2.8.0)
|
35
35
|
public_suffix (>= 2.0.2, < 5.0)
|
36
36
|
ast (2.4.2)
|
37
|
-
capybara (
|
37
|
+
capybara (3.36.0)
|
38
38
|
addressable
|
39
|
+
matrix
|
39
40
|
mini_mime (>= 0.1.3)
|
40
|
-
nokogiri (
|
41
|
-
rack (>= 1.
|
42
|
-
rack-test (>= 0.
|
43
|
-
|
41
|
+
nokogiri (~> 1.8)
|
42
|
+
rack (>= 1.6.0)
|
43
|
+
rack-test (>= 0.6.3)
|
44
|
+
regexp_parser (>= 1.5, < 3.0)
|
45
|
+
xpath (~> 3.2)
|
44
46
|
childprocess (3.0.0)
|
45
47
|
cliver (0.3.2)
|
46
48
|
coderay (1.1.3)
|
@@ -83,7 +85,7 @@ GEM
|
|
83
85
|
faraday-net_http_persistent (1.2.0)
|
84
86
|
faraday-patron (1.0.0)
|
85
87
|
faraday-rack (1.0.0)
|
86
|
-
faraday_middleware (1.
|
88
|
+
faraday_middleware (1.2.0)
|
87
89
|
faraday (~> 1.0)
|
88
90
|
fastimage (2.2.5)
|
89
91
|
ferrum (0.11)
|
@@ -97,6 +99,7 @@ GEM
|
|
97
99
|
domain_name (~> 0.5)
|
98
100
|
i18n (1.8.10)
|
99
101
|
concurrent-ruby (~> 1.0)
|
102
|
+
matrix (0.4.2)
|
100
103
|
metainspector (5.11.2)
|
101
104
|
addressable (~> 2.7)
|
102
105
|
faraday (~> 1.4)
|
@@ -111,7 +114,7 @@ GEM
|
|
111
114
|
mime-types (3.3.1)
|
112
115
|
mime-types-data (~> 3.2015)
|
113
116
|
mime-types-data (3.2021.0901)
|
114
|
-
mini_mime (1.1.
|
117
|
+
mini_mime (1.1.2)
|
115
118
|
mini_portile2 (2.6.1)
|
116
119
|
minitest (5.14.4)
|
117
120
|
mock_redis (0.29.0)
|
@@ -120,8 +123,9 @@ GEM
|
|
120
123
|
mustermann (1.1.1)
|
121
124
|
ruby2_keywords (~> 0.0.1)
|
122
125
|
nesty (1.0.2)
|
123
|
-
net-http-persistent (
|
124
|
-
|
126
|
+
net-http-persistent (3.1.0)
|
127
|
+
connection_pool (~> 2.2)
|
128
|
+
nokogiri (1.12.5)
|
125
129
|
mini_portile2 (~> 2.6.1)
|
126
130
|
racc (~> 1.4)
|
127
131
|
normalize_url (0.0.6)
|
@@ -133,7 +137,7 @@ GEM
|
|
133
137
|
coderay (~> 1.1)
|
134
138
|
method_source (~> 1.0)
|
135
139
|
public_suffix (4.0.6)
|
136
|
-
racc (1.
|
140
|
+
racc (1.6.0)
|
137
141
|
rack (1.6.13)
|
138
142
|
rack-protection (1.5.5)
|
139
143
|
rack
|
@@ -198,6 +202,7 @@ PLATFORMS
|
|
198
202
|
ruby
|
199
203
|
|
200
204
|
DEPENDENCIES
|
205
|
+
cuprite (~> 0.13)
|
201
206
|
factory_bot (~> 6.0)
|
202
207
|
faker (~> 1.7)
|
203
208
|
pry (~> 0.10)
|
data/docs/cookbook/user_agent.md
CHANGED
@@ -1,3 +1,66 @@
|
|
1
1
|
# Capybara
|
2
2
|
|
3
|
-
|
3
|
+
[Capybara](https://github.com/teamcapybara/capybara) is originally a test
|
4
|
+
framework for web applications.
|
5
|
+
|
6
|
+
When Capybara is in use, a remote browser process is available as a Capybara
|
7
|
+
session:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
class DummyWorker < Wayfarer::Worker
|
11
|
+
route.to :index
|
12
|
+
|
13
|
+
def index
|
14
|
+
browser # => #<Capybara::Session ...>
|
15
|
+
end
|
16
|
+
end
|
17
|
+
```
|
18
|
+
|
19
|
+
|
20
|
+
## Configuring a driver
|
21
|
+
|
22
|
+
1. Install the Capybara driver for the desired user agent.
|
23
|
+
|
24
|
+
For example, to automate Google Chrome with
|
25
|
+
[Ferrum](https://github.com/rubycdp/ferrum), install the
|
26
|
+
[Cuprite](https://github.com/rubycdp/cuprite) driver:
|
27
|
+
|
28
|
+
=== "RubyGems"
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
gem install cuprite
|
32
|
+
```
|
33
|
+
|
34
|
+
=== "Bundler"
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
gem "cuprite" # Gemfile
|
38
|
+
```
|
39
|
+
|
40
|
+
2. Configure Wayfarer to use the `:capybara` user agent and set the desired
|
41
|
+
driver:
|
42
|
+
|
43
|
+
=== "Runtime"
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
Wayfarer.config.network.agent = :capybara
|
47
|
+
Wayfarer.config.capybara.driver = :cuprite
|
48
|
+
```
|
49
|
+
|
50
|
+
=== "Environment variables"
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
WAYFARER_NETWORK_AGENT=capybara
|
54
|
+
WAYFARER_CAPYBARA_DRIVER=cuprite
|
55
|
+
```
|
56
|
+
|
57
|
+
3. Register the driver:
|
58
|
+
|
59
|
+
```ruby
|
60
|
+
Capybara.javascript_driver = :cuprite
|
61
|
+
|
62
|
+
Capybara.register_driver(:cuprite) do |app|
|
63
|
+
# Wayfarer's Ferrum or Selenium options must be passed along manually
|
64
|
+
Capybara::Cuprite::Driver.new(app, Wayfarer.config.ferrum.options)
|
65
|
+
end
|
66
|
+
```
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# Custom agents
|
2
|
+
|
3
|
+
Wayfarer offers an interface for integrating third-party browsers and HTTP
|
4
|
+
clients as user agents.
|
5
|
+
|
6
|
+
There are two types of agents:
|
7
|
+
|
8
|
+
1. Stateful agents, i.e. browsers, which carry state and support navigation.
|
9
|
+
These follow HTTP redirects implicitly.
|
10
|
+
2. Stateless agents, which deal with HTTP requests/responses only.
|
11
|
+
These handle HTTP redirects explicitly.
|
12
|
+
|
13
|
+
## Implementation
|
14
|
+
|
15
|
+
Both types can be implemented with callback methods:
|
16
|
+
|
17
|
+
=== "Stateful"
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
class StatefulAgent
|
21
|
+
include Wayfarer::Networking::Strategy
|
22
|
+
|
23
|
+
def renew_on # optional
|
24
|
+
[MyBrowser::IrrecoverableError]
|
25
|
+
end
|
26
|
+
|
27
|
+
def create
|
28
|
+
MyBrowser.new
|
29
|
+
end
|
30
|
+
|
31
|
+
def destroy(browser) # optional
|
32
|
+
browser.quit
|
33
|
+
end
|
34
|
+
|
35
|
+
def navigate(browser, url)
|
36
|
+
browser.goto(url)
|
37
|
+
end
|
38
|
+
|
39
|
+
def live(browser)
|
40
|
+
success(url: browser.url,
|
41
|
+
body: browser.body,
|
42
|
+
status_code: browser.status_code,
|
43
|
+
headers: browser.headers)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
```
|
47
|
+
|
48
|
+
=== "Stateless"
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
class StatelessAgent
|
52
|
+
include Wayfarer::Networking::Strategy
|
53
|
+
|
54
|
+
def renew_on # optional
|
55
|
+
[MyClient::IrrecoverableError]
|
56
|
+
end
|
57
|
+
|
58
|
+
def create
|
59
|
+
MyClient.new
|
60
|
+
end
|
61
|
+
|
62
|
+
def destroy(client) # optional
|
63
|
+
client.close
|
64
|
+
end
|
65
|
+
|
66
|
+
def fetch(client, url)
|
67
|
+
response = client.get(url)
|
68
|
+
|
69
|
+
return redirect(response.redirect_url) if response.redirect?
|
70
|
+
|
71
|
+
success(url: url,
|
72
|
+
body: response.body,
|
73
|
+
status_code: response.status_code,
|
74
|
+
headers: response.headers)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
```
|
78
|
+
|
79
|
+
|
80
|
+
Register the strategy:
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
Wayfarer::Networking::Pool.registry[:my_agent] = MyAgent.new
|
84
|
+
```
|
85
|
+
|
86
|
+
Use the strategy:
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
Wayfarer.config.network.agent = :my_agent
|
90
|
+
```
|
91
|
+
|
92
|
+
### Remarks
|
93
|
+
|
94
|
+
#### Self-healing
|
95
|
+
|
96
|
+
* A strategy's `#renew_on` method may return a list of exception classes upon
|
97
|
+
which the existing instance gets destroyed and replaced with a newly created
|
98
|
+
one.
|
99
|
+
* Stateless clients must not raise exceptions when encountering certain HTTP
|
100
|
+
response codes (for example, 5xx).
|
@@ -25,13 +25,13 @@ end
|
|
25
25
|
=== "Runtime"
|
26
26
|
|
27
27
|
```ruby
|
28
|
-
Wayfarer.config.
|
29
|
-
Wayfarer.config.
|
28
|
+
Wayfarer.config.network.agent = :ferrum
|
29
|
+
Wayfarer.config.ferrum.options = { headless: false, url: "http://chrome:3000" }
|
30
30
|
```
|
31
31
|
|
32
32
|
=== "Environment variables"
|
33
33
|
|
34
34
|
```
|
35
|
-
|
35
|
+
WAYFARER_AGENT=ferrum
|
36
36
|
WAYFARER_FERRUM_OPTIONS=headless:false,url:http://chrome:3000
|
37
37
|
```
|
@@ -25,7 +25,7 @@ process.
|
|
25
25
|
Pages retrieved with a Selenium WebDriver return fake values:
|
26
26
|
|
27
27
|
```ruby
|
28
|
-
Wayfarer.config.
|
28
|
+
Wayfarer.config.network.agent = :selenium
|
29
29
|
|
30
30
|
class DummyJob < Wayfarer::Base
|
31
31
|
route.to :index
|
@@ -47,13 +47,15 @@ process.
|
|
47
47
|
=== "Runtime"
|
48
48
|
|
49
49
|
```ruby
|
50
|
-
Wayfarer.config.
|
51
|
-
Wayfarer.config.
|
50
|
+
Wayfarer.config.network.agent = :selenium
|
51
|
+
Wayfarer.config.selenium.driver = :firefox
|
52
|
+
Wayfarer.config.selenium.options = { url: "http://firefox" }
|
52
53
|
```
|
53
54
|
|
54
55
|
=== "Environment variables"
|
55
56
|
|
56
57
|
```
|
57
|
-
|
58
|
-
|
58
|
+
WAYFARER_AGENT=selenium
|
59
|
+
WAYFARER_SELENIUM_DRIVER=firefox
|
60
|
+
WAYFARER_SELENIUM_OPTIONS=url:http://firefox
|
59
61
|
```
|
data/docs/guides/callbacks.md
CHANGED
@@ -1,30 +1,56 @@
|
|
1
1
|
# Callbacks
|
2
2
|
|
3
|
-
##
|
3
|
+
## Active Job callbacks
|
4
4
|
|
5
|
-
Wayfarer supports all of Active Job's life cycle callbacks
|
5
|
+
Wayfarer naturally supports all of [Active Job's life cycle callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
|
6
6
|
|
7
|
-
|
8
|
-
* [ActiveJob::Callbacks](https://api.rubyonrails.org/classes/ActiveJob/Callbacks/ClassMethods.html)
|
7
|
+
## `before_fetch`
|
9
8
|
|
10
|
-
|
9
|
+
Runs before a job fetches a page, either by making an HTTP request, or by
|
10
|
+
navigating a browser to its task URL.
|
11
11
|
|
12
|
-
|
12
|
+
```ruby
|
13
|
+
class DummyJob < Wayfarer::Base
|
14
|
+
before_fetch :do_something
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def do_something
|
19
|
+
# before the task.url is fetched
|
20
|
+
end
|
21
|
+
end
|
22
|
+
```
|
23
|
+
|
24
|
+
## `before_action`
|
25
|
+
|
26
|
+
Runs after a page was fetched, before an action method is called.
|
13
27
|
|
14
28
|
```ruby
|
15
29
|
class DummyJob < Wayfarer::Base
|
16
|
-
|
17
|
-
|
30
|
+
before_action :do_something
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def do_something
|
35
|
+
# page is available at this point
|
18
36
|
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
## `after_batch`
|
19
41
|
|
42
|
+
Runs once the last job in a batch has been performed:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
class DummyJob < Wayfarer::Base
|
20
46
|
after_batch do
|
21
|
-
#
|
47
|
+
# All jobs in batch done
|
22
48
|
end
|
23
49
|
end
|
24
50
|
```
|
25
51
|
|
26
52
|
Internally, a batch counter is in-/decremented on certain events. Once the
|
27
|
-
counter reaches zero, `after_batch` callbacks
|
53
|
+
counter reaches zero, `after_batch` callbacks run in declaration order.
|
28
54
|
|
29
55
|
The counter is incremented when:
|
30
56
|
|
@@ -36,3 +62,84 @@ The counter is decremented when:
|
|
36
62
|
* A job fails due to an unhandled exception.
|
37
63
|
* A job fails due to a discarded exception.
|
38
64
|
* A job fails and thereby exhausts its maximum attempts.
|
65
|
+
|
66
|
+
!!! attention "Batch callbacks can fail jobs"
|
67
|
+
|
68
|
+
If the last job's `after_batch` callbacks raise an exception, this can lead
|
69
|
+
to the job getting retried. If the exception raised by the callback is
|
70
|
+
unhandled or discarded, the callback never fully runs.
|
71
|
+
|
72
|
+
## Callback options
|
73
|
+
|
74
|
+
### Definition styles
|
75
|
+
|
76
|
+
Callbacks can be registered either by supplying a block or a symbol identifying
|
77
|
+
a callback instance method:
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
class DummyJob < Wayfarer::Base
|
81
|
+
before_action do
|
82
|
+
# ...
|
83
|
+
end
|
84
|
+
|
85
|
+
before_action :my_callback
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
def my_callback
|
90
|
+
# ...
|
91
|
+
end
|
92
|
+
end
|
93
|
+
```
|
94
|
+
|
95
|
+
### Conditionals
|
96
|
+
|
97
|
+
Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
class DummyJob < Wayfarer::Base
|
101
|
+
before_fetch :my_callback, if: :my_condition
|
102
|
+
|
103
|
+
private
|
104
|
+
|
105
|
+
def my_callback
|
106
|
+
end
|
107
|
+
|
108
|
+
def my_condition
|
109
|
+
end
|
110
|
+
end
|
111
|
+
```
|
112
|
+
|
113
|
+
Callbacks can be registered for certain action methods only with the `:only` and
|
114
|
+
`:except` keywords:
|
115
|
+
|
116
|
+
```ruby
|
117
|
+
class DummyJob < Wayfarer::Base
|
118
|
+
before_fetch :do_something, only: :foo
|
119
|
+
|
120
|
+
before_fetch except: [:foo, :qux] do
|
121
|
+
# runs only before bar
|
122
|
+
end
|
123
|
+
|
124
|
+
def foo
|
125
|
+
end
|
126
|
+
|
127
|
+
def bar
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
```
|
132
|
+
|
133
|
+
### Early termination
|
134
|
+
|
135
|
+
Callbacks that return `false` halt the callback chain:
|
136
|
+
|
137
|
+
```ruby
|
138
|
+
class DummyJob < Wayfarer::Base
|
139
|
+
before_action { false }
|
140
|
+
|
141
|
+
before_action do
|
142
|
+
# never runs
|
143
|
+
end
|
144
|
+
end
|
145
|
+
```
|
@@ -12,22 +12,28 @@ Wayfarer parses environment variables into a runtime configuration
|
|
12
12
|
`Wayfarer.config`:
|
13
13
|
|
14
14
|
```ruby
|
15
|
-
# Which
|
16
|
-
Wayfarer.config.
|
15
|
+
# Which user agent to use to process tasks
|
16
|
+
Wayfarer.config.network.agent = :http # or :ferrum, :selenium
|
17
17
|
|
18
|
-
# How many
|
19
|
-
Wayfarer.config.
|
18
|
+
# How many user agents to instantiate
|
19
|
+
Wayfarer.config.network.pool_size = 3
|
20
20
|
|
21
|
-
# How long an
|
22
|
-
Wayfarer.config.
|
21
|
+
# How long an agent may be used while processing a task
|
22
|
+
Wayfarer.config.network.pool_timeout = 5000
|
23
23
|
|
24
24
|
# Ferrum options
|
25
|
-
Wayfarer.config.
|
25
|
+
Wayfarer.config.ferrum.options = {}
|
26
26
|
|
27
|
-
# Selenium
|
28
|
-
Wayfarer.config.
|
27
|
+
# Selenium driver to use
|
28
|
+
Wayfarer.config.selenium.driver = :chrome
|
29
|
+
|
30
|
+
# Selenium HTTP client read timeout
|
31
|
+
Wayfarer.config.selenium.client_timeout = 10 # seconds
|
32
|
+
|
33
|
+
# Selenium options
|
34
|
+
Wayfarer.config.selenium.options = { url: "http://chrome" }
|
29
35
|
|
30
36
|
# HTTP request headers (Selenium is unsupported)
|
31
|
-
Wayfarer.config.http_headers = { "Field" => "Value" }
|
37
|
+
Wayfarer.config.network.http_headers = { "Field" => "Value" }
|
32
38
|
```
|
33
39
|
|
@@ -6,16 +6,12 @@ Wayfarer relies on Active Job's error handling facilities, `retry_on` and
|
|
6
6
|
* [Active Job Basics: Exceptions](https://guides.rubyonrails.org/active_job_basics.html#exceptions)
|
7
7
|
* [ActiveJob::Exceptions](https://edgeapi.rubyonrails.org/classes/ActiveJob/Exceptions/ClassMethods.html)
|
8
8
|
|
9
|
-
## Unhandled exceptions
|
10
|
-
|
11
|
-
Jobs with unhandled exceptions fail and are not retried.
|
12
|
-
|
13
9
|
## Retrying
|
14
10
|
|
15
11
|
```ruby
|
16
12
|
class DummyJob < Wayfarer::Base
|
17
13
|
retry_on MyError, attempts: 3 do |job, error|
|
18
|
-
# All 3
|
14
|
+
# All 3 attempts have failed (1 initial attempt + 2 retries)
|
19
15
|
end
|
20
16
|
end
|
21
17
|
```
|
@@ -29,3 +25,11 @@ class DummyJob < Wayfarer::Base
|
|
29
25
|
end
|
30
26
|
end
|
31
27
|
```
|
28
|
+
|
29
|
+
## Job failures
|
30
|
+
|
31
|
+
Jobs are not retried and their URLs are locked within their batch if:
|
32
|
+
|
33
|
+
* A discarded exception is raised.
|
34
|
+
* An unhandled exception is raised.
|
35
|
+
* A handled exception is raised, but retry attempts are exhausted.
|
data/docs/guides/networking.md
CHANGED
@@ -1,18 +1,92 @@
|
|
1
1
|
# Networking
|
2
2
|
|
3
|
-
Wayfarer
|
3
|
+
Wayfarer navigates the web in two ways:
|
4
4
|
|
5
5
|
1. Via plain HTTP requests
|
6
6
|
2. By automating browsers
|
7
7
|
|
8
|
-
|
8
|
+
Both options are mutually exclusive per Ruby process.
|
9
|
+
|
10
|
+
## User agents
|
11
|
+
|
12
|
+
A user agent is an entity that knows how to retrieve the contents behind a URL.
|
13
|
+
|
14
|
+
The user agent can be configured via the global configuration:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
Wayfarer.config.network.agent = :http # or :ferrum, :selenium
|
18
|
+
```
|
19
|
+
|
20
|
+
## Connection pooling
|
21
|
+
|
22
|
+
Wayfarer keeps user agents within a connection pool. When a job executes
|
23
|
+
and needs to retrieve the contents behind a URL, an agent is checked out from
|
24
|
+
the pool.
|
25
|
+
|
26
|
+
The pool has a constant size and it should equal the number of threads the
|
27
|
+
underlying message queue operates with. The size can be configured via the
|
28
|
+
global configuration:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
Wayfarer.config.network.pool_size = 8
|
32
|
+
```
|
33
|
+
|
34
|
+
### Timeouts
|
35
|
+
|
36
|
+
User agents may stay checked out from the pool by jobs for a limited time
|
37
|
+
only. Once this time limit is exceeded, a `ConnectionPool::TimeoutError`
|
38
|
+
exception is raised. This places a hard time limit on every job.
|
39
|
+
|
40
|
+
The timeout can be configured via the global configuration:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
Wayfarer.config.network.pool_timeout = 20 # seconds
|
44
|
+
```
|
45
|
+
|
46
|
+
Because jobs with unhandled exceptions fail, explicit error handling is required
|
47
|
+
if retries are desired:
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
class DummyJob < Wayfarer::Base
|
51
|
+
retry_on ConnectionPool::TimeoutError, attempts: 3
|
52
|
+
end
|
53
|
+
```
|
54
|
+
|
55
|
+
## Agent-specific client timeouts
|
56
|
+
|
57
|
+
The time in seconds it may take to communicate with remote browser processes can
|
58
|
+
be configured globally per agent:
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
Wayfarer.config.ferrum.options = { timeout: 5 }
|
62
|
+
Wayfarer.config.selenium.client_timeout = 60
|
63
|
+
```
|
64
|
+
|
65
|
+
### Shared state
|
66
|
+
|
67
|
+
As user agents get checked in and out continuously between jobs, their state
|
68
|
+
carries over from job to job, too.
|
69
|
+
|
70
|
+
For browser automation, this means:
|
71
|
+
|
72
|
+
* A job finds the browser at the last URL the previous job has left off.
|
73
|
+
* The browser's cookies might have been set, or other client-side state might
|
74
|
+
exist that significantly affects a page's behaviour.
|
75
|
+
|
76
|
+
## HTTP redirect handling
|
77
|
+
|
78
|
+
Browsers follow redirects transparently when they are navigated to a URL.
|
79
|
+
|
80
|
+
When using plain HTTP, redirect URLs are enqueued transparently within the same
|
81
|
+
batch. URLs that result in 3xx responses will not be retrieved again within
|
82
|
+
their batch.
|
9
83
|
|
10
84
|
## HTTP request headers
|
11
85
|
|
12
86
|
Request headers can be configured via the global configuration:
|
13
87
|
|
14
88
|
```ruby
|
15
|
-
Wayfarer.config.http_headers = { "Field" => "Value" }
|
89
|
+
Wayfarer.config.network.http_headers = { "Field" => "Value" }
|
16
90
|
```
|
17
91
|
|
18
92
|
!!! attention "Partial support"
|
data/docs/index.md
CHANGED
@@ -14,10 +14,18 @@ hide:
|
|
14
14
|
* Data extraction
|
15
15
|
* Browser automation
|
16
16
|
|
17
|
-
!!! attention "
|
17
|
+
!!! attention "Unstable software"
|
18
18
|
|
19
19
|
Wayfarer is under development and releases should be considered unstable.
|
20
20
|
|
21
|
+
Wayfarer complies with
|
22
|
+
[Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) in
|
23
|
+
which v0.x means that there could be backward-incompatible changes for every
|
24
|
+
release:
|
25
|
+
|
26
|
+
>Major version zero (0.y.z) is for initial development. Anything MAY change
|
27
|
+
at any time. The public API SHOULD NOT be considered stable.
|
28
|
+
|
21
29
|
### Installation
|
22
30
|
|
23
31
|
Install the RubyGem:
|