wayfarer 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci.yaml +1 -1
  3. data/Gemfile.lock +20 -15
  4. data/docs/cookbook/user_agent.md +1 -1
  5. data/docs/guides/browser_automation/capybara.md +64 -1
  6. data/docs/guides/browser_automation/custom_adapters.md +100 -0
  7. data/docs/guides/browser_automation/ferrum.md +3 -3
  8. data/docs/guides/browser_automation/selenium.md +7 -5
  9. data/docs/guides/callbacks.md +117 -10
  10. data/docs/guides/configuration.md +16 -10
  11. data/docs/guides/error_handling.md +9 -5
  12. data/docs/guides/networking.md +77 -3
  13. data/docs/index.md +9 -1
  14. data/docs/reference/api/base.md +4 -4
  15. data/docs/reference/configuration_keys.md +42 -0
  16. data/docs/reference/environment_variables.md +25 -27
  17. data/lib/wayfarer/base.rb +7 -17
  18. data/lib/wayfarer/callbacks.rb +71 -0
  19. data/lib/wayfarer/cli/base.rb +5 -1
  20. data/lib/wayfarer/cli/job.rb +7 -3
  21. data/lib/wayfarer/cli/route.rb +2 -2
  22. data/lib/wayfarer/cli/route_printer.rb +7 -7
  23. data/lib/wayfarer/config/capybara.rb +10 -0
  24. data/lib/wayfarer/config/ferrum.rb +11 -0
  25. data/lib/wayfarer/config/networking.rb +26 -0
  26. data/lib/wayfarer/config/redis.rb +14 -0
  27. data/lib/wayfarer/config/root.rb +11 -0
  28. data/lib/wayfarer/config/selenium.rb +21 -0
  29. data/lib/wayfarer/config/strconv.rb +45 -0
  30. data/lib/wayfarer/config/struct.rb +72 -0
  31. data/lib/wayfarer/gc.rb +3 -7
  32. data/lib/wayfarer/middleware/fetch.rb +7 -3
  33. data/lib/wayfarer/middleware/router.rb +2 -2
  34. data/lib/wayfarer/middleware/worker.rb +12 -9
  35. data/lib/wayfarer/networking/capybara.rb +28 -0
  36. data/lib/wayfarer/networking/context.rb +36 -0
  37. data/lib/wayfarer/networking/ferrum.rb +17 -52
  38. data/lib/wayfarer/networking/http.rb +34 -0
  39. data/lib/wayfarer/networking/pool.rb +15 -10
  40. data/lib/wayfarer/networking/result.rb +1 -1
  41. data/lib/wayfarer/networking/selenium.rb +20 -47
  42. data/lib/wayfarer/networking/strategy.rb +38 -0
  43. data/lib/wayfarer/page.rb +2 -3
  44. data/lib/wayfarer/redis/pool.rb +3 -1
  45. data/lib/wayfarer/routing/dsl.rb +8 -8
  46. data/lib/wayfarer/routing/matchers/custom.rb +23 -0
  47. data/lib/wayfarer/routing/matchers/host.rb +19 -0
  48. data/lib/wayfarer/routing/matchers/path.rb +48 -0
  49. data/lib/wayfarer/routing/matchers/query.rb +63 -0
  50. data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
  51. data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
  52. data/lib/wayfarer/routing/matchers/url.rb +17 -0
  53. data/lib/wayfarer/routing/route.rb +1 -1
  54. data/lib/wayfarer.rb +9 -9
  55. data/spec/base_spec.rb +14 -0
  56. data/spec/callbacks_spec.rb +102 -0
  57. data/spec/cli/job_spec.rb +6 -6
  58. data/spec/config/capybara_spec.rb +18 -0
  59. data/spec/config/ferrum_spec.rb +24 -0
  60. data/spec/config/networking_spec.rb +73 -0
  61. data/spec/config/redis_spec.rb +32 -0
  62. data/spec/config/root_spec.rb +31 -0
  63. data/spec/config/selenium_spec.rb +56 -0
  64. data/spec/config/strconv_spec.rb +58 -0
  65. data/spec/config/struct_spec.rb +66 -0
  66. data/spec/gc_spec.rb +8 -6
  67. data/spec/middleware/fetch_spec.rb +20 -8
  68. data/spec/middleware/router_spec.rb +7 -0
  69. data/spec/middleware/worker_spec.rb +64 -27
  70. data/spec/networking/capybara_spec.rb +12 -0
  71. data/spec/networking/context_spec.rb +127 -0
  72. data/spec/networking/ferrum_spec.rb +6 -22
  73. data/spec/networking/http_spec.rb +12 -0
  74. data/spec/networking/pool_spec.rb +37 -12
  75. data/spec/networking/selenium_spec.rb +6 -22
  76. data/spec/networking/strategy.rb +170 -0
  77. data/spec/redis/pool_spec.rb +1 -1
  78. data/spec/routing/dsl_spec.rb +10 -10
  79. data/spec/routing/integration_spec.rb +22 -22
  80. data/spec/routing/{custom_matcher_spec.rb → matchers/custom_spec.rb} +4 -4
  81. data/spec/routing/{host_matcher_spec.rb → matchers/host_spec.rb} +6 -6
  82. data/spec/routing/{path_matcher_spec.rb → matchers/path_spec.rb} +6 -6
  83. data/spec/routing/{query_matcher_spec.rb → matchers/query_spec.rb} +15 -15
  84. data/spec/routing/{scheme_matcher_spec.rb → matchers/scheme_spec.rb} +4 -4
  85. data/spec/routing/{suffix_matcher_spec.rb → matchers/suffix_spec.rb} +4 -4
  86. data/spec/routing/{uri_matcher_spec.rb → matchers/uri_spec.rb} +4 -4
  87. data/spec/routing/path_finder_spec.rb +1 -1
  88. data/spec/routing/root_route_spec.rb +2 -2
  89. data/spec/routing/route_spec.rb +2 -2
  90. data/spec/spec_helpers.rb +13 -5
  91. data/spec/wayfarer_spec.rb +1 -1
  92. data/wayfarer.gemspec +8 -7
  93. metadata +74 -33
  94. data/lib/wayfarer/config.rb +0 -67
  95. data/lib/wayfarer/networking/healer.rb +0 -21
  96. data/lib/wayfarer/networking/net_http.rb +0 -52
  97. data/lib/wayfarer/routing/custom_matcher.rb +0 -21
  98. data/lib/wayfarer/routing/host_matcher.rb +0 -23
  99. data/lib/wayfarer/routing/path_matcher.rb +0 -46
  100. data/lib/wayfarer/routing/query_matcher.rb +0 -67
  101. data/lib/wayfarer/routing/scheme_matcher.rb +0 -21
  102. data/lib/wayfarer/routing/suffix_matcher.rb +0 -21
  103. data/lib/wayfarer/routing/url_matcher.rb +0 -21
  104. data/spec/config_spec.rb +0 -144
  105. data/spec/networking/adapter.rb +0 -135
  106. data/spec/networking/healer_spec.rb +0 -46
  107. data/spec/networking/net_http_spec.rb +0 -37
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1dad2cf380ed02bc82337e9e333f8203de7c9043bca7c30b317438e66e26828
4
- data.tar.gz: 7c178b52bd9fccf8b46026aba4dd8e7e9f8f1fef9e9919e7b3841adc686d047c
3
+ metadata.gz: a2bbcc550d6799e1e3588b832905866116105fdcceb0eeef5ec244622f15bb10
4
+ data.tar.gz: b8b324d89d162e578cde829f15a44bb08b9e93ad2a3bb5b6619e96275a7fa6cd
5
5
  SHA512:
6
- metadata.gz: 2b64c747f71682c052392a83c5993365641464a1df5fc3d6a05c239f3f2e9265c248779e8ace9ebd1cd0f37023345c59c9bd6298f8bec5b42a8a1e236233ae6e
7
- data.tar.gz: ce8ddb1c4d7397f93699743587050c21bd645d2e3c37efcfbc8ed29bdcd5bd8146cef350e72eb08c3eab6e6c570379197a0d8fdb90d4fa68f8a032bd4dbba9d1
6
+ metadata.gz: 998c06776f7a7922aa2d36770dc7e4389c5814ac36a20062b20e7fe6986fb52e4fde9f538287510fccfca1689fb9f7f4e019ec23dcbeff777add1a99e24fba26
7
+ data.tar.gz: 239e3db3d5fffb8f81e74c655a648ce03febd346c25fb86b695bca8a8d328e6ff341d63d4becf63a2811defae3af5a1c4bf1c772e327b114df5909a06151a95b
@@ -22,7 +22,7 @@ jobs:
22
22
  - name: Run Ferrum tests
23
23
  run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:ferrum
24
24
 
25
- - name: Run Ferrum tests
25
+ - name: Run Selenium tests
26
26
  run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:selenium
27
27
 
28
28
  - name: Run CLI tests
data/Gemfile.lock CHANGED
@@ -1,21 +1,21 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- wayfarer (0.4.0)
4
+ wayfarer (0.4.1)
5
5
  activejob (~> 6.0)
6
- capybara (~> 2.5)
6
+ addressable (~> 2.8)
7
+ capybara (~> 3.0)
7
8
  connection_pool (~> 2.2)
8
- cuprite (~> 0.11)
9
9
  docile (~> 1.1)
10
10
  ferrum (~> 0.9)
11
11
  metainspector (~> 5.0)
12
12
  mime-types (~> 3.0)
13
13
  mock_redis (~> 0.29)
14
14
  mustermann (~> 1.1)
15
- net-http-persistent (~> 2.9)
15
+ net-http-persistent (~> 3.0)
16
16
  nokogiri (~> 1.11)
17
17
  normalize_url (~> 0.0.6)
18
- redis (~> 4.0)
18
+ redis (~> 4.4, < 4.5)
19
19
  selenium-webdriver (~> 3.4)
20
20
  thor (~> 1.0)
21
21
 
@@ -34,13 +34,15 @@ GEM
34
34
  addressable (2.8.0)
35
35
  public_suffix (>= 2.0.2, < 5.0)
36
36
  ast (2.4.2)
37
- capybara (2.18.0)
37
+ capybara (3.36.0)
38
38
  addressable
39
+ matrix
39
40
  mini_mime (>= 0.1.3)
40
- nokogiri (>= 1.3.3)
41
- rack (>= 1.0.0)
42
- rack-test (>= 0.5.4)
43
- xpath (>= 2.0, < 4.0)
41
+ nokogiri (~> 1.8)
42
+ rack (>= 1.6.0)
43
+ rack-test (>= 0.6.3)
44
+ regexp_parser (>= 1.5, < 3.0)
45
+ xpath (~> 3.2)
44
46
  childprocess (3.0.0)
45
47
  cliver (0.3.2)
46
48
  coderay (1.1.3)
@@ -83,7 +85,7 @@ GEM
83
85
  faraday-net_http_persistent (1.2.0)
84
86
  faraday-patron (1.0.0)
85
87
  faraday-rack (1.0.0)
86
- faraday_middleware (1.1.0)
88
+ faraday_middleware (1.2.0)
87
89
  faraday (~> 1.0)
88
90
  fastimage (2.2.5)
89
91
  ferrum (0.11)
@@ -97,6 +99,7 @@ GEM
97
99
  domain_name (~> 0.5)
98
100
  i18n (1.8.10)
99
101
  concurrent-ruby (~> 1.0)
102
+ matrix (0.4.2)
100
103
  metainspector (5.11.2)
101
104
  addressable (~> 2.7)
102
105
  faraday (~> 1.4)
@@ -111,7 +114,7 @@ GEM
111
114
  mime-types (3.3.1)
112
115
  mime-types-data (~> 3.2015)
113
116
  mime-types-data (3.2021.0901)
114
- mini_mime (1.1.1)
117
+ mini_mime (1.1.2)
115
118
  mini_portile2 (2.6.1)
116
119
  minitest (5.14.4)
117
120
  mock_redis (0.29.0)
@@ -120,8 +123,9 @@ GEM
120
123
  mustermann (1.1.1)
121
124
  ruby2_keywords (~> 0.0.1)
122
125
  nesty (1.0.2)
123
- net-http-persistent (2.9.4)
124
- nokogiri (1.12.4)
126
+ net-http-persistent (3.1.0)
127
+ connection_pool (~> 2.2)
128
+ nokogiri (1.12.5)
125
129
  mini_portile2 (~> 2.6.1)
126
130
  racc (~> 1.4)
127
131
  normalize_url (0.0.6)
@@ -133,7 +137,7 @@ GEM
133
137
  coderay (~> 1.1)
134
138
  method_source (~> 1.0)
135
139
  public_suffix (4.0.6)
136
- racc (1.5.2)
140
+ racc (1.6.0)
137
141
  rack (1.6.13)
138
142
  rack-protection (1.5.5)
139
143
  rack
@@ -198,6 +202,7 @@ PLATFORMS
198
202
  ruby
199
203
 
200
204
  DEPENDENCIES
205
+ cuprite (~> 0.13)
201
206
  factory_bot (~> 6.0)
202
207
  faker (~> 1.7)
203
208
  pry (~> 0.10)
@@ -3,5 +3,5 @@
3
3
  See: [Guides: Networking: HTTP request headers](/guides/networking#http-request-headers)
4
4
 
5
5
  ```ruby
6
- Wayfarer.config.http_headers = { "User-Agent" => "MyCrawler ..." }
6
+ Wayfarer.config.network.http_headers = { "User-Agent" => "MyCrawler ..." }
7
7
  ```
@@ -1,3 +1,66 @@
1
1
  # Capybara
2
2
 
3
- TODO
3
+ [Capybara](https://github.com/teamcapybara/capybara) is originally a test
4
+ framework for web applications.
5
+
6
+ When Capybara is in use, a remote browser process is available as a Capybara
7
+ session:
8
+
9
+ ```ruby
10
+ class DummyWorker < Wayfarer::Worker
11
+ route.to :index
12
+
13
+ def index
14
+ browser # => #<Capybara::Session ...>
15
+ end
16
+ end
17
+ ```
18
+
19
+
20
+ ## Configuring a driver
21
+
22
+ 1. Install the Capybara driver for the desired user agent.
23
+
24
+ For example, to automate Google Chrome with
25
+ [Ferrum](https://github.com/rubycdp/ferrum), install the
26
+ [Cuprite](https://github.com/rubycdp/cuprite) driver:
27
+
28
+ === "RubyGems"
29
+
30
+ ```ruby
31
+ gem install cuprite
32
+ ```
33
+
34
+ === "Bundler"
35
+
36
+ ```ruby
37
+ gem "cuprite" # Gemfile
38
+ ```
39
+
40
+ 2. Configure Wayfarer to use the `:capybara` user agent and set the desired
41
+ driver:
42
+
43
+ === "Runtime"
44
+
45
+ ```ruby
46
+ Wayfarer.config.network.agent = :capybara
47
+ Wayfarer.config.capybara.driver = :cuprite
48
+ ```
49
+
50
+ === "Environment variables"
51
+
52
+ ```ruby
53
+ WAYFARER_NETWORK_AGENT=capybara
54
+ WAYFARER_CAPYBARA_DRIVER=cuprite
55
+ ```
56
+
57
+ 3. Register the driver:
58
+
59
+ ```ruby
60
+ Capybara.javascript_driver = :cuprite
61
+
62
+ Capybara.register_driver(:cuprite) do |app|
63
+ # Wayfarer's Ferrum or Selenium options must be passed along manually
64
+ Capybara::Cuprite::Driver.new(app, Wayfarer.config.ferrum.options)
65
+ end
66
+ ```
@@ -0,0 +1,100 @@
1
+ # Custom agents
2
+
3
+ Wayfarer offers an interface for integrating third-party browsers and HTTP
4
+ clients as user agents.
5
+
6
+ There are two types of agents:
7
+
8
+ 1. Stateful agents, i.e. browsers, which carry state and support navigation.
9
+ These follow HTTP redirects implicitly.
10
+ 2. Stateless agents, which deal with HTTP requests/responses only.
11
+ These handle HTTP redirects explicitly.
12
+
13
+ ## Implementation
14
+
15
+ Both types can be implemented with callback methods:
16
+
17
+ === "Stateful"
18
+
19
+ ```ruby
20
+ class StatefulAgent
21
+ include Wayfarer::Networking::Strategy
22
+
23
+ def renew_on # optional
24
+ [MyBrowser::IrrecoverableError]
25
+ end
26
+
27
+ def create
28
+ MyBrowser.new
29
+ end
30
+
31
+ def destroy(browser) # optional
32
+ browser.quit
33
+ end
34
+
35
+ def navigate(browser, url)
36
+ browser.goto(url)
37
+ end
38
+
39
+ def live(browser)
40
+ success(url: browser.url,
41
+ body: browser.body,
42
+ status_code: browser.status_code,
43
+ headers: browser.headers)
44
+ end
45
+ end
46
+ ```
47
+
48
+ === "Stateless"
49
+
50
+ ```ruby
51
+ class StatelessAgent
52
+ include Wayfarer::Networking::Strategy
53
+
54
+ def renew_on # optional
55
+ [MyClient::IrrecoverableError]
56
+ end
57
+
58
+ def create
59
+ MyClient.new
60
+ end
61
+
62
+ def destroy(client) # optional
63
+ client.close
64
+ end
65
+
66
+ def fetch(client, url)
67
+ response = client.get(url)
68
+
69
+ return redirect(response.redirect_url) if response.redirect?
70
+
71
+ success(url: url,
72
+ body: response.body,
73
+ status_code: response.status_code,
74
+ headers: response.headers)
75
+ end
76
+ end
77
+ ```
78
+
79
+
80
+ Register the strategy:
81
+
82
+ ```ruby
83
+ Wayfarer::Networking::Pool.registry[:my_agent] = MyAgent.new
84
+ ```
85
+
86
+ Use the strategy:
87
+
88
+ ```ruby
89
+ Wayfarer.config.network.agent = :my_agent
90
+ ```
91
+
92
+ ### Remarks
93
+
94
+ #### Self-healing
95
+
96
+ * A strategy's `#renew_on` method may return a list of exception classes upon
97
+ which the existing instance gets destroyed and replaced with a newly created
98
+ one.
99
+ * Stateless clients must not raise exceptions when encountering certain HTTP
100
+ response codes (for example, 5xx).
@@ -25,13 +25,13 @@ end
25
25
  === "Runtime"
26
26
 
27
27
  ```ruby
28
- Wayfarer.config.adapter = :ferrum
29
- Wayfarer.config.ferrum_options = { headless: false, url: "http://chrome:3000" }
28
+ Wayfarer.config.network.agent = :ferrum
29
+ Wayfarer.config.ferrum.options = { headless: false, url: "http://chrome:3000" }
30
30
  ```
31
31
 
32
32
  === "Environment variables"
33
33
 
34
34
  ```
35
- WAYFARER_ADAPTER=ferrum
35
+ WAYFARER_NETWORK_AGENT=ferrum
36
36
  WAYFARER_FERRUM_OPTIONS=headless:false,url:http://chrome:3000
37
37
  ```
@@ -25,7 +25,7 @@ process.
25
25
  Pages retrieved with a Selenium WebDriver return fake values:
26
26
 
27
27
  ```ruby
28
- Wayfarer.config.adapter = :selenium
28
+ Wayfarer.config.network.agent = :selenium
29
29
 
30
30
  class DummyJob < Wayfarer::Base
31
31
  route.to :index
@@ -47,13 +47,15 @@ process.
47
47
  === "Runtime"
48
48
 
49
49
  ```ruby
50
- Wayfarer.config.adapter = :selenium
51
- Wayfarer.config.selenium_argv = [:firefox]
50
+ Wayfarer.config.network.agent = :selenium
51
+ Wayfarer.config.selenium.driver = :firefox
52
+ Wayfarer.config.selenium.options = { url: "http://firefox" }
52
53
  ```
53
54
 
54
55
  === "Environment variables"
55
56
 
56
57
  ```
57
- WAYFARER_ADAPTER=selenium
58
- WAYFARER_SELENIUM_ARGV=firefox
58
+ WAYFARER_NETWORK_AGENT=selenium
59
+ WAYFARER_SELENIUM_DRIVER=firefox
60
+ WAYFARER_SELENIUM_OPTIONS=url:http://firefox
59
61
  ```
@@ -1,30 +1,56 @@
1
1
  # Callbacks
2
2
 
3
- ## Life cycle callbacks
3
+ ## Active Job callbacks
4
4
 
5
- Wayfarer supports all of Active Job's life cycle callbacks:
5
+ Wayfarer naturally supports all of [Active Job's life cycle callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
6
6
 
7
- * [Active Job Basics: Callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks)
8
- * [ActiveJob::Callbacks](https://api.rubyonrails.org/classes/ActiveJob/Callbacks/ClassMethods.html)
7
+ ## `before_fetch`
9
8
 
10
- ## `after_batch` callbacks
9
+ Runs before a job fetches a page, either by making an HTTP request, or by
10
+ navigating a browser to its task URL.
11
11
 
12
- Jobs can register callbacks to run once all jobs in their batch have concluded:
12
+ ```ruby
13
+ class DummyJob < Wayfarer::Base
14
+ before_fetch :do_something
15
+
16
+ private
17
+
18
+ def do_something
19
+ # before the task.url is fetched
20
+ end
21
+ end
22
+ ```
23
+
24
+ ## `before_action`
25
+
26
+ Runs after a page was fetched, before an action method is called.
13
27
 
14
28
  ```ruby
15
29
  class DummyJob < Wayfarer::Base
16
- after_batch do
17
- # All jobs in batch done
30
+ before_action :do_something
31
+
32
+ private
33
+
34
+ def do_something
35
+ # page is available at this point
18
36
  end
37
+ end
38
+ ```
39
+
40
+ ## `after_batch`
19
41
 
42
+ Runs once the last job in a batch has been performed:
43
+
44
+ ```ruby
45
+ class DummyJob < Wayfarer::Base
20
46
  after_batch do
21
- # Multiple callbacks can be registered
47
+ # All jobs in batch done
22
48
  end
23
49
  end
24
50
  ```
25
51
 
26
52
  Internally, a batch counter is in-/decremented on certain events. Once the
27
- counter reaches zero, `after_batch` callbacks execute in declaration order.
53
+ counter reaches zero, `after_batch` callbacks run in declaration order.
28
54
 
29
55
  The counter is incremented when:
30
56
 
@@ -36,3 +62,84 @@ The counter is decremented when:
36
62
  * A job fails due to an unhandled exception.
37
63
  * A job fails due to a discarded exception.
38
64
  * A job fails and thereby exhausts its maximum attempts.
65
+
66
+ !!! attention "Batch callbacks can fail jobs"
67
+
68
+ If the last job's `after_batch` callback raises an exception, this can lead
69
+ to the job getting retried. If the exception raised by the callback is
70
+ unhandled or discarded, the callback never fully runs.
71
+
72
+ ## Callback options
73
+
74
+ ### Definition styles
75
+
76
+ Callbacks can be registered either by supplying a block or a symbol identifying
77
+ a callback instance method:
78
+
79
+ ```ruby
80
+ class DummyJob < Wayfarer::Base
81
+ before_action do
82
+ # ...
83
+ end
84
+
85
+ before_action :my_callback
86
+
87
+ private
88
+
89
+ def my_callback
90
+ # ...
91
+ end
92
+ end
93
+ ```
94
+
95
+ ### Conditionals
96
+
97
+ Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
98
+
99
+ ```ruby
100
+ class DummyJob < Wayfarer::Base
101
+ before_fetch :my_callback, if: :my_condition
102
+
103
+ private
104
+
105
+ def my_callback
106
+ end
107
+
108
+ def my_condition
109
+ end
110
+ end
111
+ ```
112
+
113
+ Callbacks can be registered for certain action methods only with the `:only` and
114
+ `:except` keywords:
115
+
116
+ ```ruby
117
+ class DummyJob < Wayfarer::Base
118
+ before_fetch :do_something, only: :foo
119
+
120
+ before_fetch except: [:foo, :qux] do
121
+ # runs only before bar
122
+ end
123
+
124
+ def foo
125
+ end
126
+
127
+ def bar
128
+ end
129
+ end
130
+
131
+ ```
132
+
133
+ ### Early termination
134
+
135
+ Callbacks that return `false` halt the callback chain:
136
+
137
+ ```ruby
138
+ class DummyJob < Wayfarer::Base
139
+ before_action { false }
140
+
141
+ before_action do
142
+ # never runs
143
+ end
144
+ end
145
+ ```
@@ -12,22 +12,28 @@ Wayfarer parses environment variables into a runtime configuration
12
12
  `Wayfarer.config`:
13
13
 
14
14
  ```ruby
15
- # Which network adapter to use to process tasks
16
- Wayfarer.config.adapter = :net_http # or :ferrum, :selenium
15
+ # Which user agent to use to process tasks
16
+ Wayfarer.config.network.agent = :http # or :ferrum, :selenium
17
17
 
18
- # How many network adapters to instantiate
19
- Wayfarer.config.adapter_pool_size = 3
18
+ # How many user agents to instantiate
19
+ Wayfarer.config.network.pool_size = 3
20
20
 
21
- # How long an adapter may be used while processing a task
22
- Wayfarer.config.adapter_pool_timeout = 5000
21
+ # How long an agent may be used while processing a task
22
+ Wayfarer.config.network.pool_timeout = 5000
23
23
 
24
24
  # Ferrum options
25
- Wayfarer.config.ferrum_options = {}
25
+ Wayfarer.config.ferrum.options = {}
26
26
 
27
- # Selenium arguments
28
- Wayfarer.config.selenium_argv = [:chrome]
27
+ # Selenium driver to use
28
+ Wayfarer.config.selenium.driver = :chrome
29
+
30
+ # Selenium HTTP client read timeout
31
+ Wayfarer.config.selenium.client_timeout = 10 # seconds
32
+
33
+ # Selenium options
34
+ Wayfarer.config.selenium.options = { url: "http://chrome" }
29
35
 
30
36
  # HTTP request headers (Selenium is unsupported)
31
- Wayfarer.config.http_headers = { "Field" => "Value" }
37
+ Wayfarer.config.network.http_headers = { "Field" => "Value" }
32
38
  ```
33
39
 
@@ -6,16 +6,12 @@ Wayfarer relies on Active Job's error handling facilities, `retry_on` and
6
6
  * [Active Job Basics: Exceptions](https://guides.rubyonrails.org/active_job_basics.html#exceptions)
7
7
  * [ActiveJob::Exceptions](https://edgeapi.rubyonrails.org/classes/ActiveJob/Exceptions/ClassMethods.html)
8
8
 
9
- ## Unhandled exceptions
10
-
11
- Jobs with unhandled exceptions fail and are not retried.
12
-
13
9
  ## Retrying
14
10
 
15
11
  ```ruby
16
12
  class DummyJob < Wayfarer::Base
17
13
  retry_on MyError, attempts: 3 do |job, error|
18
- # All 3 retry attempts have failed
14
+ # All 3 attempts have failed (1 initial attempt + 2 retries)
19
15
  end
20
16
  end
21
17
  ```
@@ -29,3 +25,11 @@ class DummyJob < Wayfarer::Base
29
25
  end
30
26
  end
31
27
  ```
28
+
29
+ ## Job failures
30
+
31
+ Jobs are not retried and their URLs locked within their batch if:
32
+
33
+ * A discarded exception is raised.
34
+ * An unhandled exception is raised.
35
+ * A handled exception is raised, but retry attempts are exhausted.
@@ -1,18 +1,92 @@
1
1
  # Networking
2
2
 
3
- Wayfarer retrieves pages in two ways:
3
+ Wayfarer navigates the web in two ways:
4
4
 
5
5
  1. Via plain HTTP requests
6
6
  2. By automating browsers
7
7
 
8
- TODO
8
+ Both options are mutually exclusive per Ruby process.
9
+
10
+ ## User agents
11
+
12
+ A user agent is an entity that knows how to retrieve the contents behind a URL.
13
+
14
+ The user agent can be configured via the global configuration:
15
+
16
+ ```ruby
17
+ Wayfarer.config.network.agent = :http # or :ferrum, :selenium
18
+ ```
19
+
20
+ ## Connection pooling
21
+
22
+ Wayfarer keeps user agents within a connection pool. When a job executes
23
+ and needs to retrieve the contents behind a URL, an agent is checked out from
24
+ the pool.
25
+
26
+ The pool has a constant size and it should equal the number of threads the
27
+ underlying message queue operates with. The size can be configured via the
28
+ global configuration:
29
+
30
+ ```ruby
31
+ Wayfarer.config.network.pool_size = 8
32
+ ```
33
+
34
+ ### Timeouts
35
+
36
+ User agents may stay checked out from the pool by jobs for a limited time
37
+ only. Once this time limit is exceeded, a `ConnectionPool::TimeoutError`
38
+ exception is raised. This places a hard time limit on every job.
39
+
40
+ The timeout can be configured via the global configuration:
41
+
42
+ ```ruby
43
+ Wayfarer.config.network.pool_timeout = 20 # seconds
44
+ ```
45
+
46
+ Because jobs with unhandled exceptions fail, explicit error handling is required
47
+ if retries are desired:
48
+
49
+ ```ruby
50
+ class DummyJob < Wayfarer::Base
51
+ retry_on ConnectionPool::TimeoutError, attempts: 3
52
+ end
53
+ ```
54
+
55
+ ## Agent-specific client timeouts
56
+
57
+ The time in seconds it may take to communicate with remote browser processes can
58
+ be configured globally per agent:
59
+
60
+ ```ruby
61
+ Wayfarer.config.ferrum.options = { timeout: 5 }
62
+ Wayfarer.config.selenium.client_timeout = 60
63
+ ```
64
+
65
+ ### Shared state
66
+
67
+ As user agents get checked in and out continuously between jobs, their state
68
+ carries over from job to job, too.
69
+
70
+ For browser automation, this means:
71
+
72
+ * A job finds the browser at the last URL the previous job has left off.
73
+ * The browser's cookies might have been set, or other client-side state might
74
+ exist that significantly affects a page's behaviour.
75
+
76
+ ## HTTP redirect handling
77
+
78
+ Browsers follow redirects transparently when they are navigated to a URL.
79
+
80
+ When using plain HTTP, redirect URLs are enqueued transparently within the same
81
+ batch. URLs that result in 3xx responses will not be retrieved again within
82
+ their batch.
9
83
 
10
84
  ## HTTP request headers
11
85
 
12
86
  Request headers can be configured via the global configuration:
13
87
 
14
88
  ```ruby
15
- Wayfarer.config.http_headers = { "Field" => "Value" }
89
+ Wayfarer.config.network.http_headers = { "Field" => "Value" }
16
90
  ```
17
91
 
18
92
  !!! attention "Partial support"
data/docs/index.md CHANGED
@@ -14,10 +14,18 @@ hide:
14
14
  * Data extraction
15
15
  * Browser automation
16
16
 
17
- !!! attention "Experimental software"
17
+ !!! attention "Unstable software"
18
18
 
19
19
  Wayfarer is under development and releases should be considered unstable.
20
20
 
21
+ Wayfarer complies with
22
+ [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) in
23
+ which v0.x means that there could be backward-incompatible changes for every
24
+ release:
25
+
26
+ >Major version zero (0.y.z) is for initial development. Anything MAY change
27
+ at any time. The public API SHOULD NOT be considered stable.
28
+
21
29
  ### Installation
22
30
 
23
31
  Install the RubyGem: