wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
data/docs/guides/jobs.md CHANGED
@@ -1,78 +1,126 @@
1
1
  # Jobs
2
2
 
3
- Jobs are Ruby classes that process [tasks](/guides/tasks) and look as follows:
3
+ Jobs are [Active Job](https://edgeguides.rubyonrails.org/active_job_basics.html) classes
4
+ that use a DSL to process [tasks](/guides/tasks) that they read from a message
5
+ queue.
6
+
7
+ Instead of implementing Active Job's `#perform` method yourself, you declare
8
+ [routes](../routing) to instance methods, like web applications route incoming
9
+ requests. Only URLs that match a route are retrieved and processed. All other
10
+ URLs are considered successfully processed. The action has access to the
11
+ retrieved [page](../pages), the [user agent](../user-agents) that retrieved the
12
+ page and the current task:
4
13
 
5
14
  ```ruby
6
- class DummyJob < Wayfarer::Base
7
- route { to :index }
15
+ class DummyJob < ActiveJob::Base
16
+ include Wayfarer::Base
17
+
18
+ route.to :index
8
19
 
9
20
  def index
21
+ task # => #<Wayfarer::Task>
22
+ page # => #<Wayfarer::Page>
23
+ user_agent # => Browser or HTTP client
10
24
  end
11
25
  end
12
26
  ```
13
27
 
14
- Here is how to enqueue a task for a URL:
28
+ You can start a crawl by appending a task to the message queue for the URL with
29
+ `::crawl`. If you don't provide a batch, Wayfarer generates a UUID:
15
30
 
16
31
  ```ruby
17
- DummyJob.crawl("https://example.com")
32
+ task = DummyJob.crawl("https://example.com")
33
+ # => #<Wayfarer::Task url="https://example.com", batch="498a13e0-...">
18
34
  ```
19
35
 
20
- This is the same as calling the Active Job API directly and passing a task
21
- and a random batch:
36
+ This is exactly the same as calling Active Job's `#perform_later` and passing a
37
+ task directly:
22
38
 
23
39
  ```ruby
24
40
  task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
25
41
  DummyJob.perform_later(task)
26
42
  ```
27
43
 
28
- A batch can be specified with `::crawl`, too:
44
+ Instead of a generated UUID, you can also set your own batch:
29
45
 
30
46
  ```ruby
31
47
  DummyJob.crawl("https://example.com", batch: "my-batch")
32
48
  ```
33
49
 
34
- ## Current task
50
+ You can also use Wayfarer's [CLI](../cli) to enqueue a task:
51
+
52
+ ```sh
53
+ wayfarer enqueue --batch my-batch DummyJob "https://example.com"
54
+ ```
55
+
56
+ ## Following URLs
35
57
 
36
- Jobs consume [tasks](../tasks) from a message queue. The currently processed
37
- task is accessible like so:
58
+ Jobs navigate crawls by staging URLs with `stage(urls)`. When you stage a URL,
59
+ it is appended verbatim to an internal set. Once the action returns, all URLs
60
+ in the set are appended as tasks to the message queue.
38
61
 
39
62
  ```ruby
40
- class DummyJob < Wayfarer::Base
41
- route { to :index }
63
+ class DummyJob < ActiveJob::Base
64
+ include Wayfarer::Base
65
+
66
+ route.to :index
42
67
 
43
68
  def index
44
- task.url # => "https://example.com"
69
+ # Follow all out-going links of the page
70
+ stage page.meta.links.external
71
+ end
72
+ end
73
+ ```
74
+
75
+ ## Accessing the current task
76
+
77
+ If the task's URL matched a [route](../routing), the URL is retrieved over the network,
78
+ and the method that was routed to is called. The task is available as `#task`:
79
+
80
+ ```ruby
81
+ class DummyJob < ActiveJob::Base
82
+ include Wayfarer::Base
83
+
84
+ route.to :index
85
+
86
+ def index
87
+ task.url # => "https://example.com"
45
88
  task.batch # => "my-batch"
46
89
  end
47
90
  end
48
91
  ```
49
92
 
50
- ## Current page
93
+ ## Accessing the current page
51
94
 
52
- A task's URL contents get fetched into a [page](../pages) object if the task URL
53
- matched a route:
95
+ You have access to the retrieved [page](../pages):
54
96
 
55
97
  ```ruby
56
- class DummyJob < Wayfarer::Base
57
- route { to :index, host: "example.com" }
98
+ class DummyJob < ActiveJob::Base
99
+ include Wayfarer::Base
100
+
101
+ route.to :index
58
102
 
59
103
  def index
60
104
  page.url # => "https://example.com"
61
105
  page.body # => "<html>..."
62
106
  page.status_code # => 200
63
107
  page.headers # { "Content-Type" => ... }
108
+ page.doc # Only present for certain Content-Types
64
109
  end
65
110
  end
66
111
  ```
67
112
 
68
- ## URL parameters
113
+ ## Routing URLs to methods and extracting `params`
69
114
 
70
- Jobs can extract data from URLs with their router:
115
+ Jobs have a routing DSL that allows you to map URLs to methods and extract
116
+ URL data:
71
117
 
72
118
  ```ruby
73
- class DummyJob < Wayfarer::Base
119
+ class DummyJob < ActiveJob::Base
120
+ include Wayfarer::Base
121
+
74
122
  route do
75
- path "/users/:id/profile"
123
+ path "/users/:id/profile", to: :index
76
124
  end
77
125
 
78
126
  def index
@@ -80,22 +128,85 @@ class DummyJob < Wayfarer::Base
80
128
  end
81
129
  end
82
130
 
83
- DummyJob.crawl("https://example.com/users/42/profile")
131
+ DummyJob.crawl("https://example.com/users/42/profile?foo=bar")
84
132
  ```
85
133
 
134
+ ## Controlling the user agent
86
135
 
87
- ## User agent
88
-
89
- The HTTP client or automated browser that fetched the URL is available:
136
+ You can control the browser or HTTP client that retrieved the page:
90
137
 
91
138
  ```ruby
92
- Wayfarer.config.network.agent = :ferrum # Chrome DevTools Protocol
139
+ Wayfarer.config[:network][:agent] = :ferrum # Chrome DevTools Protocol
93
140
 
94
- class DummyJob < Wayfarer::Base
95
- route { to :index }
141
+ class DummyJob < ActiveJob::Base
142
+ include Wayfarer::Base
143
+
144
+ route.to :index
96
145
 
97
146
  def index
98
- browser.save_screenshot("capture.png")
147
+ user_agent.save_screenshot("capture.png")
99
148
  end
100
149
  end
101
150
  ```
151
+
152
+ ## Restricting the processed Content-Types
153
+
154
+ By default, jobs process pages regardless of their Content-Type response
155
+ header. You can allow a list of Content-Types as strings and Regexps and
156
+ opt out of the default behaviour. Once at least one Content-Type is allowed,
157
+ other Content-Types don't get processed:
158
+
159
+ ```ruby
160
+ class DummyJob < ActiveJob::Base
161
+ include Wayfarer::Base
162
+
163
+ content_type "text/html", "application/json"
164
+ content_type /xml/
165
+ end
166
+ ```
167
+
168
+ !!! info "HTTP parameters in Content-Types are ignored for comparison"
169
+
170
+ Content-Types are compared regardless of their parameters. For example,
171
+ `text/html; charset=UTF-8` is considered the same as `text/html`.
172
+
173
+ ## Handling errors
174
+
175
+ !!! danger "Only ActiveJob error handling is supported"
176
+
177
+ Wayfarer exclusively supports ActiveJob's error handling. You cannot use
178
+ message queue-specific error handling; for example, error handling with
179
+ `sidekiq_options` is unsupported. Otherwise batches get garbage-collected
180
+ too early as Wayfarer instruments ActiveJob.
181
+
182
+ Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
183
+
184
+ * `retry_on` to retry jobs a number of times on certain errors:
185
+
186
+ ```ruby
187
+ class DummyJob < Wayfarer::Base
188
+ retry_on MyError, attempts: 3 do |job, error|
189
+ # This block runs once all 3 attempts have failed
190
+ # (1 initial attempt + 2 retries)
191
+ end
192
+ end
193
+ ```
194
+
195
+ * `discard_on` to throw away jobs on certain errors:
196
+
197
+ ```ruby
198
+ class DummyJob < Wayfarer::Base
199
+ discard_on MyError do |job, error|
200
+ # This block runs once and buries the job
201
+ end
202
+ end
203
+ ```
204
+
205
+ ## Recreating user agents on certain errors
206
+
207
+ You can configure a list of exception classes upon which user agents
208
+ get recreated (see [User agent API](../networking/custom_adapters)):
209
+
210
+ ```ruby
211
+ Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
212
+ ```
@@ -13,7 +13,7 @@ set first.
13
13
 
14
14
  ```ruby
15
15
  class DummyJob < Wayfarer::Base
16
- route { to :index }
16
+ route.to :index
17
17
 
18
18
  def index
19
19
  stage page.meta.links.all
@@ -1,17 +1,14 @@
1
1
  # Capybara
2
2
 
3
- [Capybara](https://github.com/teamcapybara/capybara) is originally a test
4
- framework for web applications.
5
-
6
- When Capybara is in use, a remote browser process is available as a Capybara
7
- session:
3
+ [Capybara](https://github.com/teamcapybara/capybara) is a test framework for web
4
+ applications which adds a nice API that also works well for web scraping.
8
5
 
9
6
  ```ruby
10
- Wayfarer.config.network.agent = :capybara
11
- # Wayfarer.config.capybara.driver = ...
7
+ Wayfarer.config[:network][:agent] = :capybara
8
+ # Wayfarer.config[:capybara][:driver] = ...
12
9
 
13
10
  class DummyJob < Wayfarer::Worker
14
- route { to :index }
11
+ route.to :index
15
12
 
16
13
  def index
17
14
  browser # => #<Capybara::Session ...>
@@ -19,14 +16,9 @@ class DummyJob < Wayfarer::Worker
19
16
  end
20
17
  ```
21
18
 
19
+ ## Example: Automating Chrome with Cuprite and Ferrum
22
20
 
23
- ## Configuring a driver
24
-
25
- 1. Install the Capybara driver for the desired user agent.
26
-
27
- For example, to automate Google Chrome with
28
- [Ferrum](https://github.com/rubycdp/ferrum), install the
29
- [Cuprite](https://github.com/rubycdp/cuprite) driver:
21
+ 1. Install the [Cuprite](https://github.com/rubycdp/cuprite) Capybara driver:
30
22
 
31
23
  === "RubyGems"
32
24
 
@@ -34,20 +26,19 @@ end
34
26
  gem install cuprite
35
27
  ```
36
28
 
37
- === "Bundler"
29
+ === "Gemfile"
38
30
 
39
31
  ```ruby
40
32
  gem "cuprite" # Gemfile
41
33
  ```
42
34
 
43
- 2. Configure Wayfarer to use the `:capybara` user agent and set the desired
44
- driver:
35
+ 2. Configure Wayfarer to use the `:capybara` user agent and set the driver:
45
36
 
46
37
  === "Runtime"
47
38
 
48
39
  ```ruby
49
- Wayfarer.config.network.agent = :capybara
50
- Wayfarer.config.capybara.driver = :cuprite
40
+ Wayfarer.config[:network][:agent] = :capybara
41
+ Wayfarer.config[:capybara][:driver] = :cuprite
51
42
  ```
52
43
 
53
44
  === "Environment variables"
@@ -57,7 +48,7 @@ end
57
48
  WAYFARER_CAPYBARA_DRIVER=cuprite
58
49
  ```
59
50
 
60
- 3. Register the driver:
51
+ 3. Register the driver with Capybara:
61
52
 
62
53
  ```ruby
63
54
  require "capybara/cuprite"
@@ -66,6 +57,6 @@ end
66
57
 
67
58
  Capybara.register_driver(:cuprite) do |app|
68
59
  # Wayfarer's Ferrum or Selenium options can be passed along
69
- Capybara::Cuprite::Driver.new(app, Wayfarer.config.ferrum.options)
60
+ Capybara::Cuprite::Driver.new(app, Wayfarer.config[:ferrum][:options])
70
61
  end
71
62
  ```
@@ -1,18 +1,87 @@
1
- # Custom agents
1
+ # User agent API
2
+
3
+ Wayfarer retrieves web pages with user agents. There are two types of user
4
+ agents: __stateful__ browsers which carry state and follow redirects implicitly
5
+ as they navigate to a URL, and __stateless__ HTTP clients, which handle
6
+ redirects explicitly.
7
+
8
+ | | Stateless adapters | Stateful adapters |
9
+ |-------------------|--------------------|-------------------|
10
+ | interactive | no | yes |
11
+ | redirect handling | explicit | implicit |
12
+
13
+ Because spawning browser processes or instantiating HTTP clients is expensive,
14
+ Wayfarer keeps user agents in a pool and reuses them across jobs. This means
15
+ that browser state carries over between jobs, as a job checks out a previous
16
+ job's user agent. Only on certain irrecoverable errors are individual user agents
17
+ destroyed and recreated. For example when a browser process crashes, it is
18
+ replaced with a fresh browser process.
19
+
20
+ ## Base interface for custom user agents
21
+
22
+ You implement both stateful and stateless agents by including the
23
+ `Wayfarer::Networking::Strategy` module and defining callback methods. The
24
+ interfaces for stateful and stateless user agents share the following base methods:
25
+
26
+ ```mermaid
27
+ classDiagram
28
+ class Square~Shape~{
29
+ int id
30
+ List~int~ position
31
+ setPoints(List~int~ points)
32
+ getPoints() List~int~
33
+ }
34
+
35
+ Square : -List~string~ messages
36
+ Square : +setMessages(List~string~ messages)
37
+ Square : +getMessages() List~string~
38
+ Square : +getDistanceMatrix() List~List~int~~
39
+ ```
40
+
41
+ * `#create` (__required__): Called when a new instance (browser process or HTTP client) is
42
+ needed.
43
+ * `#destroy(instance)` (optional): Called when an instance should be destroyed. Browser
44
+ processes should be quit, and HTTP clients should be freed.
45
+ * `#renew_on` (optional): Returns a list of exception classes upon which the existing
46
+ instance gets destroyed and replaced with a newly created one.
47
+
48
+ ## Stateless interface
49
+
50
+ The stateless interface indicates HTTP 3xx redirect responses explicitly. This is how
51
+ Wayfarer provides redirect handling out of the box, as there is a configurable limit
52
+ on the number of redirects to follow.
53
+
54
+ In addition to the base interface, stateless user agents implement `#fetch`
55
+ which fetches [pages](../pages) or indicates redirects:
56
+
57
+ * `#fetch(instance, url)` (__required__): Called to retrieve a URL. Responses with a
58
+ 3xx status code must indicate the redirect URL by returning `redirect(url)`, since Wayfarer
59
+ deals with redirects on your behalf to avoid redirect loops. All other status
60
+ codes, including 4xx and 5xx, are considered successful and are indicated by calling
61
+ `success(url:, body:, status_code:, headers:)`.
2
62
 
3
- Wayfarer offers an interface for integrating third-party browsers and HTTP
4
- clients as user agents.
63
+ ## Stateful interface
5
64
 
6
- There are two types of agents:
65
+ In addition to the base interface, stateful user agents implement two additional
66
+ methods:
7
67
 
8
- 1. Stateful agents, i.e. browsers, which carry state and support navigation.
9
- These follow HTTP redirects implicitly.
10
- 2. Stateless agents, which deal with HTTP requests/responses only.
11
- These handle HTTP redirects explicitly.
68
+ * `#navigate(instance, url)` (__required__): Navigates the user agent to the given URL.
69
+ Stateful user agents follow redirects implicitly.
70
+ * `#live(instance) -> Wayfarer::Page` (__required__): Turns the current user agent state
71
+ into a [page](../pages).
12
72
 
13
- ## Implementation
73
+ ## Recreating user agents on error with `#renew_on`
14
74
 
15
- Both types can be implemented with callback methods:
75
+ Agents can optionally implement `#renew_on` to get themselves recreated on
76
+ certain errors.
77
+
78
+ If `#fetch` or `#navigate` raise an exception and the exception class is listed
79
+ in `#renew_on`, the instance is destroyed and recreated.
80
+
81
+ * `#renew_on` (optional): A list of exception classes upon which the existing instance gets
82
+ destroyed and replaced with a newly created one.
83
+
84
+ ## Example implementations
16
85
 
17
86
  === "Stateful"
18
87
 
@@ -20,18 +89,12 @@ Both types can be implemented with callback methods:
20
89
  class StatefulAgent
21
90
  include Wayfarer::Networking::Strategy
22
91
 
23
- def renew_on # optional
24
- [MyBrowser::IrrecoverableError]
25
- end
92
+ # Required methods
26
93
 
27
94
  def create
28
95
  MyBrowser.new
29
96
  end
30
97
 
31
- def destroy(browser) # optional
32
- browser.quit
33
- end
34
-
35
98
  def navigate(browser, url)
36
99
  browser.goto(url)
37
100
  end
@@ -42,6 +105,16 @@ Both types can be implemented with callback methods:
42
105
  status_code: browser.status_code,
43
106
  headers: browser.headers)
44
107
  end
108
+
109
+ # Optional methods
110
+
111
+ def destroy(browser)
112
+ browser.quit
113
+ end
114
+
115
+ def renew_on
116
+ [MyBrowser::IrrecoverableError]
117
+ end
45
118
  end
46
119
  ```
47
120
 
@@ -51,18 +124,12 @@ Both types can be implemented with callback methods:
51
124
  class StatelessAgent
52
125
  include Wayfarer::Networking::Strategy
53
126
 
54
- def renew_on # optional
55
- [MyClient::IrrecoverableError]
56
- end
127
+ # Required methods
57
128
 
58
129
  def create
59
130
  MyClient.new
60
131
  end
61
132
 
62
- def destroy(client) # optional
63
- client.close
64
- end
65
-
66
133
  def fetch(client, url)
67
134
  response = client.get(url)
68
135
 
@@ -73,28 +140,23 @@ Both types can be implemented with callback methods:
73
140
  status_code: response.status_code,
74
141
  headers: response.headers)
75
142
  end
143
+
144
+ # Optional methods
145
+
146
+ def destroy(client)
147
+ client.close
148
+ end
149
+
150
+ def renew_on # optional
151
+ [MyClient::IrrecoverableError]
152
+ end
76
153
  end
77
154
  ```
78
155
 
79
156
 
80
- Register the strategy:
157
+ Register and use the strategy:
81
158
 
82
159
  ```ruby
83
160
  Wayfarer::Networking::Pool.registry[:my_agent] = MyAgent.new
161
+ Wayfarer.config[:network][:agent] = :my_agent
84
162
  ```
85
-
86
- Use the strategy:
87
-
88
- ```ruby
89
- Wayfarer.config.network.agent = :my_agent
90
- ```
91
-
92
- ### Remarks
93
-
94
- #### Self-healing
95
-
96
- * A strategy's `#renew_on` method may return a list of exception classes upon
97
- which the existing instance gets destroyed and replaced with a newly created
98
- one.
99
- * Stateless clients must not raise exceptions when encountering certain HTTP
100
- response codes (for example, 5xx).
@@ -11,10 +11,10 @@ When Ferrum is in use, a Google Chrome process is accessible within jobs like
11
11
  so:
12
12
 
13
13
  ```ruby
14
- Wayfarer.config.network.agent = :ferrum
14
+ Wayfarer.config[:network][:agent] = :ferrum
15
15
 
16
16
  class DummyWorker < Wayfarer::Worker
17
- route { to :index }
17
+ route.to :index
18
18
 
19
19
  def index
20
20
  browser # => #<Ferrum::Browser ...>
@@ -27,8 +27,8 @@ end
27
27
  === "Runtime"
28
28
 
29
29
  ```ruby
30
- Wayfarer.config.network.agent = :ferrum
31
- Wayfarer.config.ferrum.options = { headless: false, url: "http://chrome:3000" }
30
+ Wayfarer.config[:network][:agent] = :ferrum
31
+ Wayfarer.config[:ferrum][:options] = { headless: false, url: "http://chrome:3000" }
32
32
  ```
33
33
 
34
34
  === "Environment variables"
@@ -1,33 +1,29 @@
1
1
  # Plain HTTP
2
2
 
3
- Wayfarer can retrieve pages via plain HTTP requests, also alongside automated
4
- browsers.
3
+ Wayfarer can retrieve pages via plain HTTP requests with the `:http` adapter,
4
+ also alongside automated browsers.
5
5
 
6
- ## Agent
6
+ ## Ad-hoc GET requests
7
7
 
8
- The HTTP agent is the default.
9
-
10
- ## Ad-hoc requests
11
-
12
- When automating browsers, it can be useful to additionally retrieve the page
8
+ When automating browsers, it can be useful to additionally retrieve another page
13
9
  over plain HTTP. Jobs can fetch URLs to [pages](/pages) with `#http`:
14
10
 
15
11
  ```ruby
16
12
  class DummyJob < Wayfarer::Base
17
- route { to :index }
13
+ route.to :index
18
14
 
19
15
  def index
20
- http.fetch(task.url) # => #<Wayfarer::Page ...>
16
+ http.fetch("https://example.com") # => #<Wayfarer::Page ...>
21
17
  end
22
18
  end
23
19
  ```
24
20
 
25
- By default, 3 redirects are followed, and this can be configured by passing the
26
- `follow` keyword:
21
+ By default, 3 redirects are followed, and this number can be configured by
22
+ passing the `follow` keyword:
27
23
 
28
24
  ```ruby
29
25
  http.fetch(url, follow: 5)
30
26
  ```
31
27
 
32
- If redirected too often, `Wayfarer::Networking::RedirectsExhaustedError` is
28
+ When redirected too often, `Wayfarer::Networking::RedirectsExhaustedError` is
33
29
  raised.
@@ -7,10 +7,10 @@ When Selenium is in use, a remote browser process is accessible within jobs like
7
7
  so:
8
8
 
9
9
  ```ruby
10
- Wayfarer.config.network.agent = :selenium
10
+ Wayfarer.config[:network][:agent] = :selenium
11
11
 
12
12
  class DummyWorker < Wayfarer::Worker
13
- route { to :index }
13
+ route.to :index
14
14
 
15
15
  def index
16
16
  browser # => #<Selenium::WebDriver ...>
@@ -27,10 +27,10 @@ process.
27
27
  Pages retrieved with a Selenium WebDriver return fake values:
28
28
 
29
29
  ```ruby
30
- Wayfarer.config.network.agent = :selenium
30
+ Wayfarer.config[:network][:agent] = :selenium
31
31
 
32
32
  class DummyJob < Wayfarer::Base
33
- route { to :index }
33
+ route.to :index
34
34
 
35
35
  def index
36
36
  page.headers # => always {}
@@ -39,19 +39,18 @@ process.
39
39
  end
40
40
  ```
41
41
 
42
- !!! note "Consider using [Ferrum](../ferrum) instead"
43
- Ferrum provides superior stability and a richer feature set compared to
44
- Selenium drivers. However Ferrum automates only Google Chrome. Unless a
45
- different browser is required, consider using Ferrum instead of Selenium.
42
+ !!! note "Consider using [Ferrum](../ferrum) instead if Google Chrome suits your needs."
43
+ Use Ferrum if you want to automate Google Chrome. It provides superior
44
+ stability and a richer feature set compared to Selenium drivers.
46
45
 
47
46
  ## Configuring Selenium
48
47
 
49
48
  === "Runtime"
50
49
 
51
50
  ```ruby
52
- Wayfarer.config.network.agent = :selenium
53
- Wayfarer.config.selenium.driver = :firefox
54
- Wayfarer.config.selenium.options = { url: "http://firefox" }
51
+ Wayfarer.config[:network][:agent] = :selenium
52
+ Wayfarer.config[:selenium][:driver] = :firefox
53
+ Wayfarer.config[:selenium][:options] = { url: "http://firefox" }
55
54
  ```
56
55
 
57
56
  === "Environment variables"