wayfarer 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +27 -0
  4. data/.github/workflows/release.yaml +30 -0
  5. data/.github/workflows/tests.yaml +21 -0
  6. data/.gitignore +5 -1
  7. data/.rubocop.yml +36 -0
  8. data/.vale.ini +8 -0
  9. data/.yardopts +1 -3
  10. data/Dockerfile +6 -4
  11. data/Gemfile +24 -0
  12. data/Gemfile.lock +274 -164
  13. data/Rakefile +7 -51
  14. data/bin/wayfarer +1 -1
  15. data/docker-compose.yml +23 -13
  16. data/docs/cookbook/consent_screen.md +2 -2
  17. data/docs/cookbook/executing_javascript.md +3 -3
  18. data/docs/cookbook/navigation.md +12 -12
  19. data/docs/cookbook/querying_html.md +3 -3
  20. data/docs/cookbook/screenshots.md +2 -2
  21. data/docs/guides/callbacks.md +25 -125
  22. data/docs/guides/cli.md +71 -0
  23. data/docs/guides/configuration.md +10 -35
  24. data/docs/guides/development.md +67 -0
  25. data/docs/guides/handlers.md +60 -0
  26. data/docs/guides/index.md +1 -0
  27. data/docs/guides/jobs.md +142 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +103 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +78 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +156 -0
  37. data/docs/guides/tasks.md +53 -9
  38. data/docs/guides/tutorial.md +66 -0
  39. data/docs/guides/user_agents.md +115 -0
  40. data/docs/index.md +17 -40
  41. data/lib/wayfarer/base.rb +125 -46
  42. data/lib/wayfarer/batch_completion.rb +60 -0
  43. data/lib/wayfarer/callbacks.rb +22 -48
  44. data/lib/wayfarer/cli/route_printer.rb +85 -89
  45. data/lib/wayfarer/cli.rb +103 -0
  46. data/lib/wayfarer/gc.rb +18 -6
  47. data/lib/wayfarer/handler.rb +15 -7
  48. data/lib/wayfarer/kv.rb +28 -0
  49. data/lib/wayfarer/logging.rb +38 -0
  50. data/lib/wayfarer/middleware/base.rb +2 -0
  51. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  52. data/lib/wayfarer/middleware/chain.rb +7 -1
  53. data/lib/wayfarer/middleware/content_type.rb +59 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +22 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +17 -4
  57. data/lib/wayfarer/middleware/normalize.rb +7 -14
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +31 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +14 -3
  65. data/lib/wayfarer/networking/ferrum.rb +1 -4
  66. data/lib/wayfarer/networking/follow.rb +14 -7
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +23 -13
  69. data/lib/wayfarer/networking/selenium.rb +15 -7
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +34 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +21 -0
  74. data/lib/wayfarer/redis/barrier.rb +26 -21
  75. data/lib/wayfarer/redis/counter.rb +18 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +166 -30
  79. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  80. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  81. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  82. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  83. data/lib/wayfarer/routing/matchers/path.rb +11 -31
  84. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  85. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  86. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  87. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  88. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  89. data/lib/wayfarer/routing/path_finder.rb +151 -23
  90. data/lib/wayfarer/routing/result.rb +1 -1
  91. data/lib/wayfarer/routing/root_route.rb +17 -1
  92. data/lib/wayfarer/routing/route.rb +66 -19
  93. data/lib/wayfarer/routing/serializable.rb +28 -0
  94. data/lib/wayfarer/routing/sub_route.rb +53 -0
  95. data/lib/wayfarer/routing/target_route.rb +17 -1
  96. data/lib/wayfarer/stringify.rb +21 -30
  97. data/lib/wayfarer/task.rb +9 -17
  98. data/lib/wayfarer/uri/normalization.rb +120 -0
  99. data/lib/wayfarer.rb +72 -5
  100. data/mise.toml +2 -0
  101. data/mkdocs.yml +44 -8
  102. data/rake/docs.rake +26 -0
  103. data/rake/lint.rake +9 -0
  104. data/rake/release.rake +23 -0
  105. data/rake/tests.rake +32 -0
  106. data/requirements.txt +1 -1
  107. data/spec/factories/job.rb +8 -0
  108. data/spec/factories/middleware.rb +2 -2
  109. data/spec/factories/path_finder.rb +11 -0
  110. data/spec/factories/redis.rb +19 -0
  111. data/spec/factories/task.rb +46 -2
  112. data/spec/spec_helpers.rb +55 -51
  113. data/spec/support/active_job_helpers.rb +8 -0
  114. data/spec/support/integration_helpers.rb +21 -0
  115. data/spec/support/redis_helpers.rb +9 -0
  116. data/spec/support/test_app.rb +66 -37
  117. data/spec/wayfarer/base_spec.rb +200 -0
  118. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  119. data/spec/wayfarer/cli/job_spec.rb +88 -0
  120. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  121. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  122. data/spec/wayfarer/gc_spec.rb +29 -0
  123. data/spec/wayfarer/handler_spec.rb +9 -0
  124. data/spec/wayfarer/integration/callbacks_spec.rb +200 -0
  125. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  126. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  127. data/spec/wayfarer/integration/gc_spec.rb +40 -0
  128. data/spec/wayfarer/integration/handler_spec.rb +65 -0
  129. data/spec/wayfarer/integration/page_spec.rb +79 -0
  130. data/spec/wayfarer/integration/params_spec.rb +64 -0
  131. data/spec/wayfarer/integration/parsing_spec.rb +99 -0
  132. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  133. data/spec/wayfarer/integration/stage_spec.rb +58 -0
  134. data/spec/wayfarer/middleware/batch_completion_spec.rb +33 -0
  135. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +24 -19
  136. data/spec/wayfarer/middleware/content_type_spec.rb +83 -0
  137. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +24 -22
  138. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  139. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  140. data/spec/wayfarer/middleware/router_spec.rb +102 -0
  141. data/spec/wayfarer/middleware/stage_spec.rb +63 -0
  142. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  143. data/spec/wayfarer/middleware/user_agent_spec.rb +158 -0
  144. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  145. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  146. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  147. data/spec/{networking → wayfarer/networking}/follow_spec.rb +11 -6
  148. data/spec/wayfarer/networking/http_spec.rb +12 -0
  149. data/spec/{networking → wayfarer/networking}/pool_spec.rb +16 -14
  150. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  151. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  152. data/spec/wayfarer/page_spec.rb +69 -0
  153. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  154. data/spec/wayfarer/parsing/xml_parse_spec.rb +25 -0
  155. data/spec/wayfarer/redis/barrier_spec.rb +39 -0
  156. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  157. data/spec/{redis → wayfarer/redis}/pool_spec.rb +4 -3
  158. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  159. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  160. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  161. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  162. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  163. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  164. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  165. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  166. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  167. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  168. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  169. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  170. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  171. data/spec/wayfarer/routing/route_spec.rb +74 -0
  172. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  173. data/spec/wayfarer/task_spec.rb +13 -0
  174. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  175. data/spec/wayfarer_spec.rb +2 -2
  176. data/wayfarer.gemspec +18 -28
  177. metadata +797 -265
  178. data/.github/workflows/ci.yaml +0 -32
  179. data/.rbenv-gemsets +0 -1
  180. data/.ruby-version +0 -1
  181. data/RELEASING.md +0 -17
  182. data/docs/cookbook/user_agent.md +0 -7
  183. data/docs/guides/error_handling.md +0 -53
  184. data/docs/guides/networking.md +0 -94
  185. data/docs/guides/performance.md +0 -130
  186. data/docs/guides/reliability.md +0 -41
  187. data/docs/guides/routing/steering.md +0 -30
  188. data/docs/reference/api/base.md +0 -48
  189. data/docs/reference/cli.md +0 -61
  190. data/docs/reference/configuration_keys.md +0 -43
  191. data/docs/reference/environment_variables.md +0 -83
  192. data/lib/wayfarer/cli/base.rb +0 -45
  193. data/lib/wayfarer/cli/generate.rb +0 -17
  194. data/lib/wayfarer/cli/job.rb +0 -56
  195. data/lib/wayfarer/cli/route.rb +0 -29
  196. data/lib/wayfarer/cli/runner.rb +0 -34
  197. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  198. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  199. data/lib/wayfarer/config/capybara.rb +0 -10
  200. data/lib/wayfarer/config/ferrum.rb +0 -11
  201. data/lib/wayfarer/config/networking.rb +0 -29
  202. data/lib/wayfarer/config/redis.rb +0 -14
  203. data/lib/wayfarer/config/root.rb +0 -11
  204. data/lib/wayfarer/config/selenium.rb +0 -21
  205. data/lib/wayfarer/config/strconv.rb +0 -45
  206. data/lib/wayfarer/config/struct.rb +0 -72
  207. data/lib/wayfarer/middleware/fetch.rb +0 -56
  208. data/lib/wayfarer/redis/connection.rb +0 -13
  209. data/lib/wayfarer/redis/version.rb +0 -19
  210. data/lib/wayfarer/routing/router.rb +0 -28
  211. data/spec/base_spec.rb +0 -224
  212. data/spec/callbacks_spec.rb +0 -102
  213. data/spec/cli/generate_spec.rb +0 -39
  214. data/spec/cli/job_spec.rb +0 -78
  215. data/spec/config/capybara_spec.rb +0 -18
  216. data/spec/config/ferrum_spec.rb +0 -24
  217. data/spec/config/networking_spec.rb +0 -73
  218. data/spec/config/redis_spec.rb +0 -32
  219. data/spec/config/root_spec.rb +0 -31
  220. data/spec/config/selenium_spec.rb +0 -56
  221. data/spec/config/strconv_spec.rb +0 -58
  222. data/spec/config/struct_spec.rb +0 -66
  223. data/spec/fixtures/dummy_job.rb +0 -7
  224. data/spec/gc_spec.rb +0 -59
  225. data/spec/handler_spec.rb +0 -11
  226. data/spec/integration/callbacks_spec.rb +0 -85
  227. data/spec/integration/page_spec.rb +0 -62
  228. data/spec/integration/params_spec.rb +0 -56
  229. data/spec/integration/stage_spec.rb +0 -51
  230. data/spec/integration/steering_spec.rb +0 -57
  231. data/spec/middleware/dedup_spec.rb +0 -88
  232. data/spec/middleware/dispatch_spec.rb +0 -43
  233. data/spec/middleware/fetch_spec.rb +0 -155
  234. data/spec/middleware/normalize_spec.rb +0 -29
  235. data/spec/middleware/router_spec.rb +0 -105
  236. data/spec/middleware/stage_spec.rb +0 -62
  237. data/spec/networking/capybara_spec.rb +0 -12
  238. data/spec/networking/ferrum_spec.rb +0 -12
  239. data/spec/networking/http_spec.rb +0 -12
  240. data/spec/networking/selenium_spec.rb +0 -12
  241. data/spec/page_spec.rb +0 -47
  242. data/spec/parsing/xml_spec.rb +0 -25
  243. data/spec/redis/barrier_spec.rb +0 -78
  244. data/spec/redis/counter_spec.rb +0 -32
  245. data/spec/redis/version_spec.rb +0 -13
  246. data/spec/routing/integration_spec.rb +0 -110
  247. data/spec/routing/matchers/custom_spec.rb +0 -31
  248. data/spec/routing/matchers/host_spec.rb +0 -49
  249. data/spec/routing/matchers/path_spec.rb +0 -43
  250. data/spec/routing/matchers/query_spec.rb +0 -137
  251. data/spec/routing/matchers/scheme_spec.rb +0 -25
  252. data/spec/routing/matchers/suffix_spec.rb +0 -41
  253. data/spec/routing/matchers/uri_spec.rb +0 -27
  254. data/spec/routing/path_finder_spec.rb +0 -33
  255. data/spec/routing/root_route_spec.rb +0 -29
  256. data/spec/routing/route_spec.rb +0 -43
  257. data/spec/routing/router_spec.rb +0 -24
  258. data/spec/task_spec.rb +0 -34
  259. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
@@ -1,32 +0,0 @@
1
- name: ci
2
-
3
- on:
4
- push:
5
- branches:
6
- - '*'
7
- env:
8
- CI: true
9
-
10
- jobs:
11
- ci:
12
- runs-on: ubuntu-latest
13
- steps:
14
- - uses: actions/checkout@v2
15
-
16
- - name: Start services
17
- run: docker-compose up -d
18
-
19
- - name: Run isolated tests
20
- run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:isolated
21
-
22
- - name: Run Ferrum tests
23
- run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:ferrum
24
-
25
- - name: Run Selenium tests
26
- run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:selenium
27
-
28
- - name: Run CLI tests
29
- run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake test:cli
30
-
31
- - name: Run RuboCop
32
- run: docker-compose run --rm --name test --service-ports wayfarer bundle exec rake rubocop
data/.rbenv-gemsets DELETED
@@ -1 +0,0 @@
1
- wayfarer
data/.ruby-version DELETED
@@ -1 +0,0 @@
1
- 2.7.4
data/RELEASING.md DELETED
@@ -1,17 +0,0 @@
1
- # Release Procedure
2
-
3
- 1. Ensure `Wayfarer::VERSION` was bumped appropriately.
4
- 2. Ensure the version in wayfarer.gemspec matches.
5
- 3. Open a release Pull Request develop -> master branch
6
- 4. Merge the Pull Request
7
- 5. Publish RubyGem and git tag as follows:
8
-
9
- ```
10
- git checkout master
11
- git pull origin master --rebase
12
- bundle exec rake build
13
- gem push build/wayfarer-*.gem
14
- bundle exec rake clean
15
- git tag <VERSION>
16
- git push origin <VERSION>
17
- ```
@@ -1,7 +0,0 @@
1
- # User Agent
2
-
3
- See: [Guides: Networking: HTTP request headers](/guides/networking#http-request-headers)
4
-
5
- ```ruby
6
- Wayfarer.config.network.http_headers = { "User-Agent" => "MyCrawler ..." }
7
- ```
@@ -1,53 +0,0 @@
1
- # Error handling
2
-
3
- ## Wayfarer never swallows exceptions
4
-
5
- * Wayfarer never swallows exceptions.
6
- * Jobs with unhandled exceptions are not retried.
7
-
8
- ## Retrying or discarding failing jobs
9
-
10
- Wayfarer relies on [Active Job's two error handling facilities](https://guides.rubyonrails.org/active_job_basics.html#exceptions).
11
-
12
- * `retry_on` to retry jobs a number of times on certain errors:
13
-
14
- ```ruby
15
- class DummyJob < Wayfarer::Base
16
- retry_on MyError, attempts: 3 do |job, error|
17
- # This block runs once all 3 attempts have failed
18
- # (1 initial attempt + 2 retries)
19
-
20
- raise error
21
- end
22
- end
23
- ```
24
-
25
- * `discard_on` to throw away jobs on certain errors:
26
-
27
- ```ruby
28
- class DummyJob < Wayfarer::Base
29
- discard_on MyError do |job, error|
30
- # This block runs once and buries the job
31
-
32
- raise error
33
- end
34
- end
35
- ```
36
-
37
- !!! attention "Always re-raise errors"
38
-
39
- You should always re-raise errors from `retry_on` and `discard_on` blocks,
40
- otherwise jobs will not get retried!
41
-
42
- ## Renewing agents on certain errors
43
-
44
- ```ruby
45
- Wayfarer.config.network.renew_on = [MyError]
46
- ```
47
-
48
- For example, if you use the Capybara
49
- [Cuprite](https://github.com/rubycdp/cuprite) driver:
50
-
51
- ```ruby
52
- Wayfarer.config.network.renew_on = [Ferrum::DeadBrowserError]
53
- ```
@@ -1,94 +0,0 @@
1
- # Networking
2
-
3
- Wayfarer navigates the web in two ways:
4
-
5
- 1. Via plain HTTP requests
6
- 2. By automating browsers
7
-
8
- Both options are mutually exclusive per Ruby process.
9
-
10
- ## User agents
11
-
12
- A user agent is an entity that knows how to retrieve the contents behind a URL.
13
-
14
- The user agent can be configured via the global configuration:
15
-
16
- ```ruby
17
- Wayfarer.config.network.agent = :http # or :ferrum, :selenium
18
- ```
19
-
20
- ## Connection pooling
21
-
22
- Wayfarer keeps user agents within a connection pool. When a job executes
23
- and needs to retrieve the contents behind a URL, an agent is checked out from
24
- the pool.
25
-
26
- The pool has a constant size and it should equal the number of threads the
27
- underlying message queue operates with. The size can be configured via the
28
- global configuration:
29
-
30
- ```ruby
31
- Wayfarer.config.network.pool_size = 8
32
- ```
33
-
34
- ### Timeouts
35
-
36
- User agents may stay checked out from the pool by jobs for a limited time
37
- only. Once this time limit is exceeded, a `ConnectionPool::TimeoutError`
38
- exception is raised. This places a hard time limit on every job.
39
-
40
- The timeout can be configured via the global configuration:
41
-
42
- ```ruby
43
- Wayfarer.config.network.pool_timeout = 20 # seconds
44
- ```
45
-
46
- Because jobs with unhandled exceptions fail, explicit error handling is required
47
- if retries are desired:
48
-
49
- ```ruby
50
- class DummyJob < Wayfarer::Base
51
- retry_on ConnectionPool::TimeoutError, attempts: 3
52
- end
53
- ```
54
-
55
- ## Agent-specific client timeouts
56
-
57
- The time in seconds it may take to communicate with remote browser processes can
58
- be configured globally per agent:
59
-
60
- ```ruby
61
- Wayfarer.config.ferrum.options = { timeout: 5 }
62
- Wayfarer.config.selenium.client_timeout = 60
63
- ```
64
-
65
- ### Shared state
66
-
67
- As user agents get checked in and out continuously between jobs, their state
68
- carries over from job to job, too.
69
-
70
- For browser automation, this means:
71
-
72
- * A job finds the browser at the last URL the previous job has left off.
73
- * The browser's cookies might have been set, or other client-side state might
74
- exist that significantly affects a page's behaviour.
75
-
76
- ## HTTP redirect handling
77
-
78
- Browsers follow redirects transparently when they are navigated to a URL.
79
-
80
- When using plain HTTP, redirect URLs are enqueued transparently within the same
81
- batch. URLs that result in 3xx responses will not be retrieved again within
82
- their batch.
83
-
84
- ## HTTP request headers
85
-
86
- Request headers can be configured via the global configuration:
87
-
88
- ```ruby
89
- Wayfarer.config.network.http_headers = { "Field" => "Value" }
90
- ```
91
-
92
- !!! attention "Partial support"
93
-
94
- Selenium does not support configuring HTTP request headers.
@@ -1,130 +0,0 @@
1
- # Performance
2
-
3
- How to write performant crawlers with Wayfarer.
4
-
5
- ## Use a sufficiently sized user agent pool
6
-
7
- Automated browser processes or HTTP clients are kept in a [connection pool]() of
8
- static size. This avoids having to re-establish browser processes and enables
9
- their reuse.
10
-
11
- If the size of the pool is too small, the pool is a
12
- bottleneck. For example, if your message queue adapter uses 8 threads, but the
13
- pool only contains 1 user agent, the remaining 7 threads block until the agent
14
- is checked back in to the pool for use by one of the blocked threads.
15
-
16
- There is no reliable way to detect the number of threads of the underlying
17
- message queue adapter. The pool size should equal the number of threads:
18
-
19
- ```ruby
20
- Wayfarer.config.network.pool_size = 8 # defaults to 1
21
- ```
22
-
23
- ### Job shedding
24
-
25
- There is a maximum number of seconds that jobs wait when checking out a user
26
- agent from the pool. Once this time is exceeded,
27
- a `Wayfarer::UserAgentTimeoutError` is raised. By default, the timeout is 10
28
- seconds.
29
-
30
- This hints there are more threads in use than user agents in the pool.
31
-
32
- ## Stage fewer URLs
33
-
34
- Staging fewer URLs saves space and time:
35
-
36
- * Fewer tasks written to the message queue
37
- * Less time spent consuming tasks
38
- * Less time spent filtering URLs with Redis
39
-
40
- Wayfarer maintains a set of processed URLs for a batch in Redis. Every staged
41
- URL is checked for inclusion in this set before it gets appended as a task to
42
- the message queue.
43
-
44
- A common pattern is to stage all links of a page, and rely on routing to fetch
45
- only the relevant ones:
46
-
47
- ```ruby
48
- class DummyJob < Wayfarer::Base
49
- route { to: index, host: "example.com" }
50
-
51
- def index
52
- stage page.meta.links.all
53
- end
54
- end
55
- ```
56
-
57
- Pages commonly contain a large number of URLs.
58
-
59
- Every staged URL is:
60
-
61
- 1. Normalized to a canonical form, for example by sorting query parameters
62
- alphabetically.
63
- 2. Checked for inclusion in the batch Redis set or discarded.
64
- 3. Written to the message queue.
65
- 4. Consumed from the queue and matched against the router.
66
- 5. Fetched, if a route matches.
67
-
68
- Narrowing down the links in the document to follow speeds up the process.
69
- For example using Nokogiri, interesting links can be identified with a CSS
70
- selector:
71
-
72
- ```ruby
73
- class DummyJob < Wayfarer::Base
74
- route { to: index, host: "example.com" }
75
-
76
- def index
77
- stage interesting_links
78
- end
79
-
80
- private
81
-
82
- def interesting_links
83
- page.doc.css("a.interesting").map { |elem| elem["href"] }
84
- end
85
- end
86
- ```
87
-
88
- Because the router only accepts the single hostname `example.com`, the job can
89
- also ensure it stages only internal URLs by intersecting them with the
90
- interesting ones:
91
-
92
- ```ruby
93
- class DummyJob < Wayfarer::Base
94
- route { to: index, host: "example.com" }
95
-
96
- def index
97
- stage interesting_internal_links
98
- end
99
-
100
- private
101
-
102
- def interesting_internal_links
103
- page.meta.links.internal & interesting_links
104
- end
105
-
106
- def interesting_links
107
- page.doc.css("a.interesting").map { |elem| elem["href"] }
108
- end
109
- end
110
- ```
111
-
112
-
113
- ## Use Redis >= 6.2.0
114
-
115
- Redis 6.2.0 introduced the
116
- [`SMISMEMBER`](https://redis.io/commands/smismember) command which enables
117
- Wayfarer to check whether multiple URLs have been processed in a batch with a
118
- single command. With earlier versions, one command per URL is required.
119
-
120
- Wayfarer detects the Redis server version and uses `SMISMEMBER` without user
121
- configuration when supported.
122
-
123
- ## Use Oj for JSON parsing
124
-
125
- Wayfarer uses [Oj](https://github.com/ohler55/oj) for JSON parsing if the gem
126
- has been required at runtime:
127
-
128
- ```ruby
129
- require "oj"
130
- ```
@@ -1,41 +0,0 @@
1
- # Reliability
2
-
3
- ## Durability
4
-
5
- Wayfarer executes atop reliable message queues such as Sidekiq, Resque,
6
- RabbitMQ, etc. Its configuration is independent of the underlying queue
7
- infrastructure it reads from and writes to.
8
-
9
- ## Self-healing user agents
10
-
11
- Wayfarer handles the scenario where a remote browser process has crashed and
12
- must be replaced by a fresh browser process.
13
-
14
- This can be tested locally by automating a browser with headless mode turned
15
- off, and then closing the opened browser window: The current job fails, but the
16
- next job has access to a newly established browser session again.
17
-
18
- For example Ferrum might raise `Ferrum::DeadBrowserError`. Wayfarer's
19
- user agents are self-healing and react to these kinds of errors internally. When
20
- a browser window is closed, the Ferrum user agent attempts to establish a new
21
- browser process as a replacement, for the next job to use.
22
-
23
- [Wayfarer never swallows exceptions](/guides/error_handling). This means
24
- that even though the user agent might heal itself, jobs still need to explicitly
25
- retry browser errors:
26
-
27
- ```ruby
28
- class Foobar < Wayfarer::Base
29
- route { to: :index }
30
-
31
- retry_on Ferrum::DeadBrowserError, attempts: 3, wait: :exponentially_longer
32
-
33
- # ...
34
- end
35
- ```
36
-
37
- This leads to log entries like:
38
-
39
- ```
40
- Retrying DummyJob in 3 seconds, due to a Ferrum::DeadBrowserError.
41
- ```
@@ -1,30 +0,0 @@
1
- # Steering
2
-
3
- A job's router can receive arguments computed dynamically by `::steer`.
4
- Steering enables [batch routing](/cookbook/batch_routing).
5
-
6
- For example, the following router has hostname and path hard-coded:
7
-
8
- ```ruby
9
- class DummyJob < Wayfarer::Base
10
- route do
11
- host "example.com", path: "/contact", to: :index
12
- end
13
- end
14
- ```
15
-
16
- Instead, hostname and path could be provided by `::steer`, too:
17
-
18
- ```ruby
19
- class DummyJob < Wayfarer::Base
20
- route do |hostname, path|
21
- host hostname, path: path, to: :index
22
- end
23
-
24
- steer do |_task|
25
- ["example.com", "/contact"]
26
- end
27
- end
28
- ```
29
-
30
- Note that `steer` yields the current [task](/guides/tasks).
@@ -1,48 +0,0 @@
1
- ---
2
- title: Wayfarer::Base
3
- ---
4
-
5
- # `Wayfarer::Base`
6
-
7
- Wayfarer's complete job API.
8
-
9
- ---
10
-
11
- ### `::route`
12
- : Draw routes to instance methods.
13
-
14
- ---
15
-
16
- ### `::steer { (Wayfarer::Task) -> [any] }`
17
- : Provide router arguments.
18
-
19
- ---
20
-
21
- ### `#task -> Wayfarer::Task`
22
- : The currently processing task.
23
-
24
- ---
25
-
26
- ### `#params -> Hash`
27
- : URL parameters collected from the matching route.
28
-
29
- ---
30
-
31
- ### `#stage(String | [String]) -> void`
32
- : Add URLs to a processing set. URLs already processed within the
33
- current batch get discarded and are not enqueued. Every staged URL gets
34
- normalized.
35
-
36
- ---
37
-
38
- ### `#browser -> Object`
39
- : The user agent that retrieved the current page.
40
-
41
- ---
42
-
43
- ### `#page(live: true | false) -> Page`
44
- : The page representing the response retrieved from the currently
45
- processing URL.
46
-
47
- When called with `live: true`, a fresh `Page` is returned that reflects the
48
- current browser DOM. Calls to `#page` return the most recent page.
@@ -1,61 +0,0 @@
1
- # Command Line Interface
2
-
3
- ## Usage
4
-
5
- ```
6
- wayfarer [OPTIONS] [generate|job|route|version]
7
- ```
8
-
9
- All [environment variables](../environment_variables) are respected.
10
-
11
- ## `wayfarer generate`
12
-
13
- ### `wayfarer generate project NAME`
14
-
15
- : Generates a new project directory `NAME`.
16
-
17
- ## `wayfarer job`
18
-
19
- ### `wayfarer job perform JOB URL`
20
-
21
- : Performs `JOB` with `URL`. The job does not reach any Active Job backend.
22
- Staged jobs will not be processed.
23
-
24
- ##### Options
25
-
26
- * `--mock-redis`: Use an in-memory implementation of Redis instead of
27
- talking to an actual server.
28
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
29
-
30
- ### `wayfarer job enqueue JOB URL`
31
-
32
- : Enqueues `JOB` with `URL` to the configured Active Job backend.
33
-
34
- ##### Options
35
-
36
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
37
-
38
- ### `wayfarer job execute JOB URL`
39
-
40
- : Execute `JOB` with `URL` by using the
41
- [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html).
42
- The job does not reach any other Active Job backend. Blocks until the batch
43
- completes.
44
-
45
- ##### Options
46
-
47
- * `--mock-redis`: Use an in-memory implementation of Redis instead of
48
- talking to an actual server.
49
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
50
- * `--min-threads`: Minimum number of threads to use. Default: 1
51
- * `--max-threads`: Maximum number of threads to use. Default: 1
52
-
53
- ## `wayfarer route`
54
-
55
- ### `wayfarer route result JOB URL`
56
-
57
- : Prints the result of invoking `JOB`'s router with `URL`.
58
-
59
- ### `wayfarer route tree JOB URL`
60
-
61
- : Visualises the routing tree result of invoking `JOB`'s router with `URL`.
@@ -1,43 +0,0 @@
1
- ---
2
- hide:
3
- - toc
4
- ---
5
-
6
- # Configuration Keys
7
-
8
- ## `Wayfarer.config.network`
9
-
10
- | Runtime config key | Environment variable | Description | Default | Supported values |
11
- | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
12
- | `network.agent` | `WAYFARER_NETWORK_AGENT` | The user agent to use. | `:http` | `:http`, `:ferrum`, `:selenium` |
13
- | `network.pool_size` | `WAYFARER_NETWORK_POOL_SIZE` | How many user agents to spawn. | 1 | Integers |
14
- | `network.pool_timeout` | `WAYFARER_NETWORK_POOL_TIMEOUT` | How long jobs may use an agent in seconds. | 10 | Integers |
15
- | `network.http_headers` | `WAYFARER_NETWORK_HTTP_HEADERS` | HTTP headers to append to requests. | `{}` | Hashes |
16
- | `network.renew_on` | | Exception classes to renew agents on. | `[]` | Classes |
17
-
18
- ## `Wayfarer.config.ferrum`
19
-
20
- | Runtime config key | Environment variable | Description | Default | Supported values |
21
- | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
22
- | `ferrum.options` | `WAYFARER_FERRUM_OPTIONS` | Ferrum options. | `{}` | Hashes |
23
-
24
- ## `Wayfarer.config.selenium`
25
-
26
- | Runtime config key | Environment variable | Description | Default | Supported values |
27
- | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
28
- | `selenium.driver` | `WAYFARER_SELENIUM_DRIVER` | Selenium driver to use. | `:chrome` | Symbols |
29
- | `selenium.options` | `WAYFARER_SELENIUM_OPTIONS` | Selenium options. | `{}` | Hashes |
30
- | `selenium.client_timeout` | `WAYFARER_SELENIUM_CLIENT_TIMEOUT` | Selenium client timeout in seconds. | 60 | Integers |
31
-
32
- ## `Wayfarer.config.redis`
33
-
34
- | Runtime config key | Environment variable | Description | Default | Supported values |
35
- | ---------------------- | ------------------------------------ | ------------------------------------------- | ------------------------------------------ | ----------------------------------- |
36
- | `redis.url` | `WAYFARER_REDIS_URL` | Redis URL to connect to. | http://localhost:6379 | Strings |
37
- | `redis.factory` | n/a | Redis factory lambda. | `->(redis) { ::Redis.new(url: redis.url) }` | Lambdas |
38
-
39
- ## `Wayfarer.config.capybara`
40
-
41
- | Runtime config key | Environment variable | Description | Default | Supported values |
42
- | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
43
- | `capybara.driver` | `WAYFARER_CAPYBARA_DRIVER` | The Capybara driver to use. | n/a | Symbols |
@@ -1,83 +0,0 @@
1
- # Environment Variables
2
-
3
- ## String formats
4
-
5
- Environment variable values can be parsed to Hash or Array at runtime
6
- with the following syntaxes:
7
-
8
- * Hash: Variable string `a:1,b:2,c:3` parses to `{a:1, b:2, c:3}` at runtime
9
- * Array: Variable string `a,b,c` parses to `[:a, :b, :c]` at runtime
10
-
11
- ## Variables
12
-
13
- ### `WAYFARER_AGENT`
14
- : Either `ferrum`, `selenium` or `http`.
15
-
16
- * Type: String
17
- * Key: `config.agent`
18
- * Default value: `:http`
19
-
20
- ### `WAYFARER_POOL_SIZE`
21
- : Number of user agents to maintain.
22
-
23
- * Type: Integer
24
- * Key: `config.pool_size`
25
- * Default value: `1`
26
-
27
- ### `WAYFARER_POOL_TIMEOUT`
28
- : How long a user agent may remain checked out until the owning job
29
- fails.
30
-
31
- * Type: Integer
32
- * Key: `config.agent_pool_timeout`
33
- * Default value: `1`
34
-
35
- ---
36
-
37
- ### `WAYFARER_FERRUM_OPTIONS`
38
- : Key/value options passed to `Ferrum::Browser.new`.
39
-
40
- * Type: Hash
41
- * Key: `config.ferrum_options`
42
- * Default value: `{}`
43
-
44
- ---
45
-
46
- ### `WAYFARER_SELENIUM_DRIVER`
47
- : Driver passed to `Selenium::WebDriver.for`.
48
-
49
- * Type: Symbol
50
- * Key: `config.selenium_driver`
51
- * Default value: `:chrome`
52
-
53
- ---
54
-
55
- ### `WAYFARER_SELENIUM_OPTIONS`
56
- : Options passed to `Selenium::WebDriver.for`.
57
-
58
- * Type: Hash
59
- * Key: `config.selenium_options`
60
- * Default value: `{}`
61
-
62
- ---
63
-
64
- ### `WAYFARER_SELENIUM_CLIENT_TIMEOUT`
65
- : Selenium HTTP client timeout (seconds).
66
-
67
- * Type: Integer
68
- * Key: `config.selenium_client_timeout`
69
- * Default value: `60`
70
-
71
- ---
72
-
73
- ### `WAYFARER_HTTP_HEADERS`
74
- : HTTP request headers used when retrieving pages.
75
-
76
- * Type: Hash
77
- * Key: `config.http_headers`
78
- * Default value: `{}`
79
-
80
- !!! attention "Partial support"
81
-
82
- Selenium does not support configuring HTTP request headers.
83
-