wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -4,12 +4,12 @@
4
4
 
5
5
  ```ruby
6
6
  class DummyJob < Wayfarer::Base
7
- route { to :index }
7
+ route.to :index
8
8
 
9
9
  def index
10
- agent.goto("https://example.com")
11
- agent.back
12
- agent.forward
10
+ user_agent.goto("https://example.com")
11
+ user_agent.back
12
+ user_agent.forward
13
13
  end
14
14
  end
15
15
  ```
@@ -18,12 +18,12 @@
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
- agent.navigate.to("https://example.com")
25
- agent.navigate.back
26
- agent.navigate.forward
24
+ user_agent.navigate.to("https://example.com")
25
+ user_agent.navigate.back
26
+ user_agent.navigate.forward
27
27
  end
28
28
  end
29
29
  ```
@@ -32,12 +32,12 @@
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
- agent.visit("https://example.com")
39
- agent.go_back
40
- agent.go_forward
38
+ user_agent.visit("https://example.com")
39
+ user_agent.go_back
40
+ user_agent.go_forward
41
41
  end
42
42
  end
43
43
  ```
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  page.doc.css("html")
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
19
19
 
20
20
  ```ruby
21
21
  class DummyJob < Wayfarer::Base
22
- route { to :index }
22
+ route.to :index
23
23
 
24
24
  def index
25
25
  browser.at_css("html")
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
38
  browser.find_elements(css: "html")
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  browser.screenshot(path: "screenshot.png")
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
24
  browser.save_screenshot("screenshot.png")
@@ -1,145 +1,45 @@
1
1
  # Callbacks
2
2
 
3
- ## Active Job callbacks
3
+ Wayfarer supports a number of callbacks in addition to
4
+ [ActiveJob callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
4
5
 
5
- Wayfarer naturally supports all of [Active Job's life cycle callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
6
+ ## Available callbacks
6
7
 
7
- ## `before_fetch`
8
-
9
- Runs before a job fetches a page, either by making an HTTP request, or by
10
- navigating a browser to its task URL.
11
-
12
- ```ruby
13
- class DummyJob < Wayfarer::Base
14
- before_fetch :do_something
15
-
16
- private
17
-
18
- def do_something
19
- # before the task.url is fetched
20
- end
21
- end
22
- ```
23
-
24
- ## `before_action`
25
-
26
- Runs after a page was fetched, before an action method is called.
27
-
28
- ```ruby
29
- class DummyJob < Wayfarer::Base
30
- before_action :do_something
31
-
32
- private
33
-
34
- def do_something
35
- # page is available at this point
36
- end
37
- end
38
- ```
8
+ * `before_fetch`
9
+ * `around_fetch`
10
+ * `after_fetch`
11
+ * `before_action`
12
+ * `around_action`
13
+ * `after_action`
14
+ * `after_batch`
39
15
 
40
16
  ## `after_batch`
41
17
 
42
- Runs once the last job in a batch performed:
43
-
44
- ```ruby
45
- class DummyJob < Wayfarer::Base
46
- after_batch do
47
- # All jobs in batch done
48
- end
49
- end
50
- ```
51
-
52
- Internally, a batch counter is in-/decremented on certain events. Once the
53
- counter reaches zero, `after_batch` callbacks runs in declaration order.
54
-
55
- The counter is incremented when within the batch:
18
+ You can register `after_batch` callbacks that run when there are no more tasks
19
+ to process in a batch. Wayfarer instruments job execution and in- or decrements
20
+ an integer counter in Redis on certain events. When the counter reaches zero,
21
+ the current job's `after_batch` callbacks run.
56
22
 
57
- * A job is enqueued.
23
+ !!! info "`after_batch` callbacks fire at most once per batch."
58
24
 
59
- The counter is decremented when:
25
+ ## Conditional callbacks
60
26
 
61
- * A job succeeds.
62
- * A job errors due to an unhandled exception.
63
- * A job is discarded due to an exception.
64
- * A job errors and thereyby exhausts its maximum attempts.
65
-
66
- !!! attention "Batch callbacks can fail jobs"
67
-
68
- If the last job's `after_batch` callbacks raises an exception, this can lead
69
- to the job getting retried. If the exception raised by the callback is
70
- unhandled or discarded, the callback never fully runs.
71
-
72
- ## Callback options
73
-
74
- ### Definition styles
75
-
76
- Callbacks can be registered either by supplying a block or a symbol identifying
77
- a callback instance method:
27
+ You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
28
+ keywords, for example to run a callback for some route `action` only:
78
29
 
79
30
  ```ruby
80
- class DummyJob < Wayfarer::Base
81
- before_action do
82
- # ...
83
- end
31
+ class DummyJob < ActiveJob::Base
32
+ include Wayfarer::Base
84
33
 
85
- before_action :my_callback
34
+ route.host "example.com", to: :example
35
+ route.to :fallback
86
36
 
87
- private
88
-
89
- def my_callback
37
+ before_action unless: -> { action == :fallback } do
90
38
  # ...
91
39
  end
92
- end
93
- ```
94
-
95
- ### Conditionals
96
-
97
- Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
98
-
99
- ```ruby
100
- class DummyJob < Wayfarer::Base
101
- before_fetch :my_callback, if: :my_condition
102
-
103
- private
104
-
105
- def my_callback
106
- end
107
-
108
- def my_condition
109
- end
110
- end
111
- ```
112
-
113
- Callbacks can be registered for certain action methods only with the `:only` and
114
- `:except` keywords:
115
-
116
- ```ruby
117
- class DummyJob < Wayfarer::Base
118
- before_fetch :do_something, only: :foo
119
-
120
- before_fetch except: [:foo, :qux] do
121
- # runs only before bar
122
- end
123
40
 
124
- def foo
125
- end
126
-
127
- def bar
128
- end
41
+ # ...
129
42
  end
130
-
131
43
  ```
132
44
 
133
- ### Early termination
134
-
135
- Callbacks that return `false` halt the callback chain:
136
-
137
- ```ruby
138
- class DummyJob < Wayfarer::Base
139
- before_action { false }
140
-
141
- before_action do
142
- # never runs
143
- end
144
- end
145
- ```
45
+ You can also pass a symbol instead of a block to call an instance method.
@@ -0,0 +1,71 @@
1
+ # wayfarer
2
+
3
+ The command-line interface to Wayfarer.
4
+
5
+ ## Usage
6
+
7
+ ```
8
+ wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
9
+ ```
10
+
11
+ See [Configuration](../reference/cli) for the respected environment variables.
12
+
13
+ ---
14
+
15
+ ## `wayfarer perform JOB URL`
16
+
17
+ : Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
18
+ Staged jobs are ignored.
19
+
20
+ ##### Options
21
+
22
+ * `--mock-redis`: Use an in-memory implementation of Redis instead of
23
+ talking to an actual server.
24
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
25
+
26
+ ---
27
+
28
+ ## `wayfarer enqueue JOB URL`
29
+
30
+ : Enqueues a task for `JOB` with `URL` to the message queue.
31
+
32
+ ##### Options
33
+
34
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
35
+
36
+ ---
37
+
38
+ ## `wayfarer execute JOB URL`
39
+
40
+ : Executes `JOB` with `URL` with the in-memory
41
+ [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
42
+ instead of writing the task to an actual message queue. Blocks until the
43
+ batch has completed.
44
+
45
+ ##### Options
46
+
47
+ * `--mock-redis`: Use an in-memory implementation of Redis instead of
48
+ talking to an actual server.
49
+ * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
50
+ * `--min-threads`: Minimum number of threads to use. Default: 1
51
+ * `--max-threads`: Maximum number of threads to use. Default: 1
52
+
53
+ !!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
54
+
55
+ You need to set the `wait: 0` option on `retry_on` in order for
56
+ `wayfarer execute` to execute retries:
57
+
58
+ ```ruby
59
+ retry_on StandardError, attempts: 3, wait: 0
60
+ ```
61
+ ---
62
+
63
+ ## `wayfarer route JOB URL`
64
+
65
+ : Prints the result of invoking `JOB`'s router with `URL`.
66
+
67
+ ---
68
+
69
+ ## `wayfarer tree JOB URL`
70
+
71
+ : Visualises the routing tree result of invoking `JOB`'s router with `URL`.
@@ -1,39 +1,14 @@
1
- # Configuration
2
-
3
- Wayfarer can be configured in two ways:
4
-
5
- 1. Using [environment variables](/reference/environment_variables)
6
- 2. Using runtime configuration
7
-
8
- ## Runtime configuration
9
-
10
- Wayfarer parses environment variables into a runtime configuration
11
- `Wayfarer::Config`. The configuration can then be altered or replaced via
12
- `Wayfarer.config`:
13
-
14
- ```ruby
15
- # Which user agent to use to process tasks
16
- Wayfarer.config.network.agent = :http # or :ferrum, :selenium
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
17
5
 
18
- # How many user agents to instantiate
19
- Wayfarer.config.network.pool_size = 3
20
-
21
- # How long an agent may be used while processing a task
22
- Wayfarer.config.network.pool_timeout = 5000
23
-
24
- # Ferrum options
25
- Wayfarer.config.ferrum.options = {}
26
-
27
- # Selenium driver to use
28
- Wayfarer.config.selenium.driver = :chrome
29
-
30
- # Selenium HTTP client read timeout
31
- Wayfarer.config.selenium.client_timeout = 10 # seconds
6
+ # Configuration
32
7
 
33
- # Selenium options
34
- Wayfarer.config.selenium.options = { url: "http://chrome" }
8
+ You can configure Wayfarer by assigning to `Wayfarer.config` which defaults to:
35
9
 
36
- # HTTP request headers (Selenium is unsupported)
37
- Wayfarer.config.network.http_headers = { "Field" => "Value" }
10
+ ```rb
11
+ module Wayfarer
12
+ --8<-- "lib/wayfarer.rb:48:96"
13
+ end
38
14
  ```
39
-
@@ -0,0 +1,67 @@
1
+ # Development
2
+
3
+ ## Release Procedure
4
+
5
+ 1. Ensure `Wayfarer::VERSION` was bumped appropriately.
6
+ 2. Ensure the version in wayfarer.gemspec matches.
7
+ 3. Open a release Pull Request develop -> master branch
8
+ 4. Merge the Pull Request
9
+ 5. Publish RubyGem and git tag as follows:
10
+
11
+ ```
12
+ git checkout master
13
+ git pull origin master --rebase
14
+ bundle exec rake build
15
+ gem push build/wayfarer-*.gem
16
+ bundle exec rake clean
17
+ git tag <VERSION>
18
+ git push origin <VERSION>
19
+ ```
20
+
21
+ ## Conventions and guidelines
22
+
23
+ * In source code, `url` refers to strings and `uri` refers to `Addressable::URI`
24
+ * Avoid writing bash at all costs. Use Ruby instead
25
+
26
+ ## Design decisions and architecture
27
+
28
+ ### Navigate the web along URL patterns
29
+
30
+ URLs are less prone to change than served markup.
31
+ One reason for this is that changes to a URL's path can have negative
32
+ consequences for its page ranking in search engines. Websites naturally implement
33
+ architectural URL patterns like REST or expose surrogate keys.
34
+
35
+ ### Follow URLs verbatim as they appear in responses
36
+
37
+ Normalized URLs are useful for deduplication, but URLs should be followed
38
+ as they appear in responses. Navigating to normalized versions of URLs makes
39
+ crawlers stick out from other user agents.
40
+
41
+ ### Tasks are version-less and don't persist metadata
42
+
43
+ Tasks serialize to their URL and batch. No other data gets written to
44
+ the message queue. There is also no need for versioning persisted tasks, since
45
+ there will never be more to a task than URL and batch. All task metadata
46
+ is ephemeral.
47
+
48
+ ### Why depend on Redis
49
+
50
+ There are two core features that depend on Redis. First, per-batch acyclicity is
51
+ achieved by maintaining the set of processed URLs per batch in Redis.
52
+ There's no option to follow links in a cyclic manner. Second, batch completion
53
+ requires updating an integer value in Redis, and batch completion is a very
54
+ useful feature, since most crawls should end eventually, and often you want to
55
+ know when.
56
+
57
+ ### No configuration files
58
+
59
+ Wayfarer can be configured through `Wayfarer.config` only, because `Wayfarer.config`
60
+ may contain Ruby objects that don't de/serialize well, such as `Proc`s or `Set`s.
61
+
62
+ ### Features out of scope
63
+
64
+ Wayfarer won't provide:
65
+
66
+ * persistence or any sort of DOM data mapping abstractions
67
+ * URL generation helpers
@@ -0,0 +1,60 @@
1
+ # Handlers
2
+
3
+ Handlers are like [jobs](/jobs) but they don't inherit from `ActiveJob::Base`
4
+ which is why they can't affect the message queue directly themselves.
5
+ Instead, jobs and handlers can route tasks to other handlers. Handlers
6
+ themselves have routes, but they can be bypassed.
7
+
8
+ ## Handler capabilities
9
+
10
+ Like jobs, handlers support:
11
+
12
+ * URL routing
13
+ * staging tasks with `#!ruby stage(*urls)`
14
+ * access to the `user_agent` that retrieved the `page`
15
+ * ad-hoc HTTP requests with `#!ruby fetch(url)`
16
+ * callbacks, but only a subset of job callbacks
17
+ * Content-Type filtering
18
+
19
+ ```ruby
20
+ class ExampleHandler
21
+ include Wayfarer::Handler
22
+
23
+ route.to :index
24
+
25
+ def index
26
+ task # => #<Wayfarer::Task>
27
+ page # => #<Wayfarer::Page>
28
+ user_agent # => Browser or HTTP client
29
+ end
30
+ end
31
+
32
+ class DummyJob < ActiveJob::Base
33
+ include Wayfarer::Base
34
+
35
+ route.host "example.com", to: ExampleHandler
36
+ end
37
+ ```
38
+
39
+ You can also bypass a handler's router and route directly to an instance
40
+ method:
41
+
42
+ ```ruby
43
+ class DummyJob < ActiveJob::Base
44
+ include Wayfarer::Base
45
+
46
+ route.host "example.com", to: [ExampleHandler, :index]
47
+ end
48
+
49
+ class ExampleHandler
50
+ include Wayfarer::Handler
51
+
52
+ def index
53
+ task # => #<Wayfarer::Task>
54
+ page # => #<Wayfarer::Page>
55
+ user_agent # => Browser or HTTP client
56
+ end
57
+ end
58
+ ```
59
+
60
+ !!! `before_action` callbacks
@@ -0,0 +1 @@
1
+ hello