wayfarer 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. checksums.yaml +4 -4
  2. data/.env +17 -0
  3. data/.github/workflows/lint.yaml +8 -6
  4. data/.github/workflows/release.yaml +4 -3
  5. data/.github/workflows/tests.yaml +5 -14
  6. data/.gitignore +2 -2
  7. data/.rubocop.yml +31 -0
  8. data/.vale.ini +6 -3
  9. data/Dockerfile +3 -2
  10. data/Gemfile +21 -0
  11. data/Gemfile.lock +233 -128
  12. data/Rakefile +7 -0
  13. data/docker-compose.yml +13 -14
  14. data/docs/guides/callbacks.md +3 -1
  15. data/docs/guides/configuration.md +10 -35
  16. data/docs/guides/development.md +67 -0
  17. data/docs/guides/handlers.md +7 -7
  18. data/docs/guides/jobs.md +54 -11
  19. data/docs/guides/networking/custom_adapters.md +31 -10
  20. data/docs/guides/pages.md +24 -22
  21. data/docs/guides/routing.md +116 -34
  22. data/docs/guides/tasks.md +30 -10
  23. data/docs/guides/tutorial.md +23 -17
  24. data/docs/guides/user_agents.md +11 -9
  25. data/lib/wayfarer/base.rb +9 -8
  26. data/lib/wayfarer/batch_completion.rb +18 -14
  27. data/lib/wayfarer/callbacks.rb +14 -14
  28. data/lib/wayfarer/cli/route_printer.rb +78 -96
  29. data/lib/wayfarer/cli.rb +12 -30
  30. data/lib/wayfarer/gc.rb +6 -1
  31. data/lib/wayfarer/kv.rb +28 -0
  32. data/lib/wayfarer/middleware/chain.rb +7 -1
  33. data/lib/wayfarer/middleware/content_type.rb +20 -15
  34. data/lib/wayfarer/middleware/dedup.rb +9 -3
  35. data/lib/wayfarer/middleware/dispatch.rb +7 -2
  36. data/lib/wayfarer/middleware/normalize.rb +4 -12
  37. data/lib/wayfarer/middleware/router.rb +1 -1
  38. data/lib/wayfarer/middleware/uri_parser.rb +4 -3
  39. data/lib/wayfarer/networking/context.rb +12 -1
  40. data/lib/wayfarer/networking/ferrum.rb +1 -4
  41. data/lib/wayfarer/networking/follow.rb +2 -1
  42. data/lib/wayfarer/networking/pool.rb +12 -7
  43. data/lib/wayfarer/networking/selenium.rb +15 -7
  44. data/lib/wayfarer/page.rb +0 -2
  45. data/lib/wayfarer/parsing/xml.rb +1 -1
  46. data/lib/wayfarer/parsing.rb +2 -5
  47. data/lib/wayfarer/redis/barrier.rb +15 -2
  48. data/lib/wayfarer/redis/counter.rb +1 -2
  49. data/lib/wayfarer/routing/dsl.rb +166 -31
  50. data/lib/wayfarer/routing/hash_stack.rb +33 -0
  51. data/lib/wayfarer/routing/matchers/custom.rb +8 -5
  52. data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
  53. data/lib/wayfarer/routing/matchers/host.rb +15 -9
  54. data/lib/wayfarer/routing/matchers/path.rb +11 -33
  55. data/lib/wayfarer/routing/matchers/query.rb +41 -17
  56. data/lib/wayfarer/routing/matchers/result.rb +12 -0
  57. data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
  58. data/lib/wayfarer/routing/matchers/url.rb +13 -5
  59. data/lib/wayfarer/routing/path_consumer.rb +130 -0
  60. data/lib/wayfarer/routing/path_finder.rb +151 -23
  61. data/lib/wayfarer/routing/result.rb +1 -1
  62. data/lib/wayfarer/routing/root_route.rb +14 -2
  63. data/lib/wayfarer/routing/route.rb +71 -14
  64. data/lib/wayfarer/routing/serializable.rb +28 -0
  65. data/lib/wayfarer/routing/sub_route.rb +53 -0
  66. data/lib/wayfarer/routing/target_route.rb +17 -1
  67. data/lib/wayfarer/stringify.rb +1 -2
  68. data/lib/wayfarer/task.rb +3 -5
  69. data/lib/wayfarer/uri/normalization.rb +120 -0
  70. data/lib/wayfarer.rb +50 -10
  71. data/mise.toml +2 -0
  72. data/mkdocs.yml +8 -17
  73. data/rake/lint.rake +0 -96
  74. data/rake/release.rake +5 -11
  75. data/rake/tests.rake +8 -4
  76. data/requirements.txt +1 -1
  77. data/spec/factories/job.rb +8 -0
  78. data/spec/factories/middleware.rb +2 -2
  79. data/spec/factories/path_finder.rb +11 -0
  80. data/spec/factories/redis.rb +19 -0
  81. data/spec/factories/task.rb +39 -1
  82. data/spec/spec_helpers.rb +50 -57
  83. data/spec/support/active_job_helpers.rb +8 -0
  84. data/spec/support/integration_helpers.rb +21 -0
  85. data/spec/support/redis_helpers.rb +9 -0
  86. data/spec/support/test_app.rb +64 -43
  87. data/spec/{base_spec.rb → wayfarer/base_spec.rb} +32 -36
  88. data/spec/wayfarer/batch_completion_spec.rb +142 -0
  89. data/spec/wayfarer/cli/job_spec.rb +88 -0
  90. data/spec/wayfarer/cli/routing_spec.rb +322 -0
  91. data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
  92. data/spec/wayfarer/gc_spec.rb +29 -0
  93. data/spec/{handler_spec.rb → wayfarer/handler_spec.rb} +1 -3
  94. data/spec/{integration → wayfarer/integration}/callbacks_spec.rb +9 -6
  95. data/spec/wayfarer/integration/content_type_spec.rb +37 -0
  96. data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
  97. data/spec/{integration → wayfarer/integration}/gc_spec.rb +9 -13
  98. data/spec/{integration → wayfarer/integration}/handler_spec.rb +9 -10
  99. data/spec/{integration → wayfarer/integration}/page_spec.rb +8 -6
  100. data/spec/{integration → wayfarer/integration}/params_spec.rb +4 -4
  101. data/spec/{integration → wayfarer/integration}/parsing_spec.rb +7 -33
  102. data/spec/wayfarer/integration/retry_spec.rb +112 -0
  103. data/spec/{integration → wayfarer/integration}/stage_spec.rb +5 -5
  104. data/spec/{middleware → wayfarer/middleware}/batch_completion_spec.rb +4 -5
  105. data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +20 -15
  106. data/spec/{middleware → wayfarer/middleware}/content_type_spec.rb +18 -21
  107. data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +22 -20
  108. data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
  109. data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
  110. data/spec/{middleware → wayfarer/middleware}/router_spec.rb +18 -20
  111. data/spec/{middleware → wayfarer/middleware}/stage_spec.rb +11 -10
  112. data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
  113. data/spec/{middleware → wayfarer/middleware}/user_agent_spec.rb +34 -32
  114. data/spec/wayfarer/networking/capybara_spec.rb +13 -0
  115. data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
  116. data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
  117. data/spec/{networking → wayfarer/networking}/follow_spec.rb +9 -4
  118. data/spec/wayfarer/networking/http_spec.rb +12 -0
  119. data/spec/{networking → wayfarer/networking}/pool_spec.rb +11 -9
  120. data/spec/wayfarer/networking/selenium_spec.rb +12 -0
  121. data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
  122. data/spec/{page_spec.rb → wayfarer/page_spec.rb} +3 -3
  123. data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
  124. data/spec/{parsing/xml_spec.rb → wayfarer/parsing/xml_parse_spec.rb} +4 -3
  125. data/spec/{redis → wayfarer/redis}/barrier_spec.rb +5 -4
  126. data/spec/wayfarer/redis/counter_spec.rb +34 -0
  127. data/spec/{redis → wayfarer/redis}/pool_spec.rb +3 -2
  128. data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
  129. data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
  130. data/spec/wayfarer/routing/integration_spec.rb +101 -0
  131. data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
  132. data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
  133. data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
  134. data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
  135. data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
  136. data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
  137. data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
  138. data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
  139. data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
  140. data/spec/wayfarer/routing/root_route_spec.rb +51 -0
  141. data/spec/wayfarer/routing/route_spec.rb +74 -0
  142. data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
  143. data/spec/wayfarer/uri/normalization_spec.rb +98 -0
  144. data/spec/wayfarer_spec.rb +2 -2
  145. data/wayfarer.gemspec +17 -28
  146. metadata +768 -246
  147. data/.rbenv-gemsets +0 -1
  148. data/.ruby-version +0 -1
  149. data/RELEASING.md +0 -17
  150. data/docs/cookbook/user_agent.md +0 -7
  151. data/docs/design.md +0 -36
  152. data/docs/guides/jobs/error_handling.md +0 -40
  153. data/docs/reference/configuration.md +0 -36
  154. data/spec/batch_completion_spec.rb +0 -104
  155. data/spec/cli/job_spec.rb +0 -74
  156. data/spec/cli/routing_spec.rb +0 -101
  157. data/spec/fixtures/dummy_job.rb +0 -9
  158. data/spec/gc_spec.rb +0 -17
  159. data/spec/integration/content_type_spec.rb +0 -145
  160. data/spec/integration/routing_spec.rb +0 -18
  161. data/spec/middleware/dedup_spec.rb +0 -71
  162. data/spec/middleware/dispatch_spec.rb +0 -59
  163. data/spec/middleware/normalize_spec.rb +0 -60
  164. data/spec/middleware/uri_parser_spec.rb +0 -53
  165. data/spec/networking/capybara_spec.rb +0 -12
  166. data/spec/networking/ferrum_spec.rb +0 -12
  167. data/spec/networking/http_spec.rb +0 -12
  168. data/spec/networking/selenium_spec.rb +0 -12
  169. data/spec/redis/counter_spec.rb +0 -44
  170. data/spec/routing/integration_spec.rb +0 -110
  171. data/spec/routing/matchers/custom_spec.rb +0 -31
  172. data/spec/routing/matchers/host_spec.rb +0 -49
  173. data/spec/routing/matchers/path_spec.rb +0 -43
  174. data/spec/routing/matchers/query_spec.rb +0 -137
  175. data/spec/routing/matchers/scheme_spec.rb +0 -25
  176. data/spec/routing/matchers/suffix_spec.rb +0 -41
  177. data/spec/routing/matchers/uri_spec.rb +0 -27
  178. data/spec/routing/path_finder_spec.rb +0 -33
  179. data/spec/routing/root_route_spec.rb +0 -29
  180. data/spec/routing/route_spec.rb +0 -43
  181. data/docs/{reference → guides}/cli.md +0 -0
  182. data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
  183. /data/spec/{task_spec.rb → wayfarer/task_spec.rb} +0 -0
data/Rakefile CHANGED
@@ -5,7 +5,14 @@ require "open-uri"
5
5
  require "zip"
6
6
 
7
7
  require "bundler/gem_tasks"
8
+ require "pry"
8
9
 
9
10
  Dir.glob("rake/*.rake").each { |file| load(file) }
10
11
 
11
12
  task default: :build
13
+
14
+ task :console do
15
+ require_relative "lib/wayfarer"
16
+
17
+ Pry.start
18
+ end
data/docker-compose.yml CHANGED
@@ -1,15 +1,14 @@
1
- version: "3"
2
1
  services:
3
2
  wayfarer:
4
3
  build: .
5
4
  tty: true
6
5
  volumes:
7
- - "./:/opt/app"
6
+ - ./:/opt/app
8
7
  ports:
9
- - "9876:9876"
10
- environment:
11
- - CI=true
8
+ - "${WAYFARER_PORT}:${WAYFARER_PORT}"
12
9
  hostname: test
10
+ environment:
11
+ CI: "${CI}"
13
12
  depends_on:
14
13
  - redis
15
14
  - chrome
@@ -17,26 +16,26 @@ services:
17
16
  - docs
18
17
 
19
18
  redis:
20
- image: redis
19
+ image: ${REDIS_IMAGE}:${REDIS_VERSION}
21
20
 
22
21
  chrome:
23
- image: browserless/chrome
22
+ image: ${CHROME_IMAGE}:${CHROME_VERSION}
24
23
  ports:
25
- - "3000:3000"
24
+ - "${CHROME_PORT}:${CHROME_PORT}"
26
25
 
27
26
  firefox:
28
- image: selenium/standalone-firefox:4.0.0-rc-2-prerelease-20210923
27
+ image: ${FIREFOX_IMAGE}:${FIREFOX_VERSION}
29
28
  ports:
30
- - "4444:4444"
29
+ - "${FIREFOX_PORT}:${FIREFOX_PORT}"
31
30
  volumes:
32
- - "/dev/shm:/dev/shm"
31
+ - /dev/shm:/dev/shm
33
32
 
34
33
  docs:
35
- image: squidfunk/mkdocs-material:9.5.9
34
+ image: ${DOCS_IMAGE}:${DOCS_VERSION}
36
35
  volumes:
37
- - "./:/docs"
36
+ - ./:/docs
38
37
  ports:
39
- - "8000:8000"
38
+ - "${DOCS_PORT}:${DOCS_PORT}"
40
39
 
41
40
  networks:
42
41
  default:
@@ -1,7 +1,7 @@
1
1
  # Callbacks
2
2
 
3
3
  Wayfarer supports a number of callbacks in addition to
4
- [ActiveJob's](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
4
+ [ActiveJob callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
5
5
 
6
6
  ## Available callbacks
7
7
 
@@ -20,6 +20,8 @@ to process in a batch. Wayfarer instruments job execution and in- or decrements
20
20
  an integer counter in Redis on certain events. When the counter reaches zero,
21
21
  the current job's `after_batch` callbacks run.
22
22
 
23
+ !!! info "`after_batch` callbacks fire at most once per batch."
24
+
23
25
  ## Conditional callbacks
24
26
 
25
27
  You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
@@ -1,39 +1,14 @@
1
- # Configuration
2
-
3
- Wayfarer can be configured in two ways:
4
-
5
- 1. Using [environment variables](/reference/environment_variables)
6
- 2. Using runtime configuration
7
-
8
- ## Runtime configuration
9
-
10
- Wayfarer parses environment variables into a runtime configuration
11
- `Wayfarer::Config`. The configuration can then be altered or replaced via
12
- `Wayfarer.config`:
13
-
14
- ```ruby
15
- # Which user agent to use to process tasks
16
- Wayfarer.config[:network][:agent] = :http # or :ferrum, :selenium
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
17
5
 
18
- # How many user agents to instantiate
19
- Wayfarer.config[:network][:pool_size] = 3
20
-
21
- # How long an agent may be used while processing a task
22
- Wayfarer.config[:network][:pool_timeout] = 5000
23
-
24
- # Ferrum options
25
- Wayfarer.config[:ferrum][:options] = {}
26
-
27
- # Selenium driver to use
28
- Wayfarer.config[:selenium][:driver] = :chrome
29
-
30
- # Selenium HTTP client read timeout
31
- Wayfarer.config[:selenium][:client_timeout] = 10 # seconds
6
+ # Configuration
32
7
 
33
- # Selenium options
34
- Wayfarer.config[:selenium][:options] = { url: "http://chrome" }
8
+ You can configure Wayfarer by assigning to `Wayfarer.config` which defaults to:
35
9
 
36
- # HTTP request headers (Selenium is unsupported)
37
- Wayfarer.config[:network][:http_headers] = { "Field" => "Value" }
10
+ ```rb
11
+ module Wayfarer
12
+ --8<-- "lib/wayfarer.rb:48:96"
13
+ end
38
14
  ```
39
-
@@ -0,0 +1,67 @@
1
+ # Development
2
+
3
+ ## Release Procedure
4
+
5
+ 1. Ensure `Wayfarer::VERSION` was bumped appropriately.
6
+ 2. Ensure the version in wayfarer.gemspec matches.
7
+ 3. Open a release Pull Request from the `develop` branch into `master`.
8
+ 4. Merge the Pull Request
9
+ 5. Publish RubyGem and git tag as follows:
10
+
11
+ ```
12
+ git checkout master
13
+ git pull origin master --rebase
14
+ bundle exec rake build
15
+ gem push build/wayfarer-*.gem
16
+ bundle exec rake clean
17
+ git tag <VERSION>
18
+ git push origin <VERSION>
19
+ ```
20
+
21
+ ## Conventions and guidelines
22
+
23
+ * In source code, `url` refers to strings and `uri` refers to `Addressable::URI`
24
+ * Avoid writing bash at all costs. Use Ruby instead
25
+
26
+ ## Design decisions and architecture
27
+
28
+ ### Navigate the web along URL patterns
29
+
30
+ URLs are less prone to change than served markup.
31
+ One reason for this is that changes to a URL's path can have negative
32
+ consequences for its page ranking in search engines. Websites naturally implement
33
+ architectural URL patterns like REST or expose surrogate keys.
34
+
35
+ ### Follow URLs verbatim as they appear in responses
36
+
37
+ Normalized URLs are useful for deduplication, but URLs should be followed
38
+ as they appear in responses. Navigating to normalized versions of URLs makes
39
+ crawlers stick out from other user agents.
40
+
41
+ ### Tasks are version-less and don't persist metadata
42
+
43
+ Tasks serialize to their URL and batch. No other data gets written to
44
+ the message queue. There is also no need for versioning persisted tasks, since
45
+ there will never be more to a task than URL and batch. All task metadata
46
+ is ephemeral.
47
+
48
+ ### Why depend on Redis
49
+
50
+ There are two core features that depend on Redis. First, per-batch acyclicity is
51
+ achieved by maintaining the set of processed URLs per batch in Redis.
52
+ There's no option to follow links in a cyclic manner. Second, batch completion
53
+ requires updating an integer value in Redis, and batch completion is a very
54
+ useful feature, since most crawls should end eventually, and often you want to
55
+ know when.
56
+
57
+ ### No configuration files
58
+
59
+ Wayfarer can be configured through `Wayfarer.config` only, because `Wayfarer.config`
60
+ may contain Ruby objects that don't de/serialize well, such as `Proc`s or `Set`s.
61
+
62
+ ### Features out of scope
63
+
64
+ Wayfarer won't provide:
65
+
66
+ * persistence or any sort of DOM data mapping abstractions
67
+ * URL generation helpers
@@ -1,16 +1,16 @@
1
1
  # Handlers
2
2
 
3
- [Jobs](/jobs) can route tasks to handlers to delegate processing without
4
- writes to the message queue. Unlike jobs, handlers don't inherit from
5
- `ActiveJob::Base` and therefore cannot be enqueued. Handlers have routes, too,
6
- but they don't retrieve pages and a handler's router can be bypassed.
3
+ Handlers are like [jobs](/jobs) but they don't inherit from `ActiveJob::Base`
4
+ which is why they can't affect the message queue directly themselves.
5
+ Instead, jobs and handlers can route tasks to other handlers. Handlers
6
+ themselves have routes, but they can be bypassed.
7
7
 
8
- ## Supported features
8
+ ## Handler capabilities
9
9
 
10
- Handlers support a subset of features compared to `Wayfarer::Base`:
10
+ Like jobs, handlers support:
11
11
 
12
12
  * URL routing
13
- * enqueueing tasks with `#!ruby stage(*urls)`
13
+ * staging tasks with `#!ruby stage(*urls)`
14
14
  * jobs can access the `user_agent` that retrieved the `page`
15
15
  * ad-hoc HTTP requests with `#!ruby fetch(url)`
16
16
  * callbacks, but only a subset of job callbacks
data/docs/guides/jobs.md CHANGED
@@ -1,13 +1,15 @@
1
1
  # Jobs
2
2
 
3
3
  Jobs are [Active Job](https://edgeguides.rubyonrails.org/active_job_basics.html)s
4
- that use a DSL included from the `Wayfarer::Base` module to process [tasks](/guides/tasks)
5
- that they read from a message queue.
6
- Instead of implementing Active Job's `#perform` method yourself, you declare routes
7
- to instance methods, similiar to how web applications route incoming requests.
8
- Only URLs that match a [route](../routing) are requested or navigated to.
9
- The action method has access to the retrieved [page](../pages),
10
- the [user agent](../user-agents) that retrieved the page and the current task:
4
+ that use a DSL to process [tasks](/guides/tasks) that they read from a message
5
+ queue.
6
+
7
+ Instead of implementing Active Job's `#perform` method yourself, you declare
8
+ [routes](../routing) to instance methods, like web applications route incoming
9
+ requests. Only URLs that match a route are retrieved and processed. All other
10
+ URLs are considered successfully processed. The action has access to the
11
+ retrieved [page](../pages), the [user agent](../user-agents) that retrieved the
12
+ page and the current task:
11
13
 
12
14
  ```ruby
13
15
  class DummyJob < ActiveJob::Base
@@ -24,7 +26,7 @@ end
24
26
  ```
25
27
 
26
28
  You can start a crawl by appending a task to the message queue for the URL with
27
- `::crawl`. By default, a UUID is generated as the batch:
29
+ `::crawl`. If you don't provide a batch, Wayfarer generates a UUID:
28
30
 
29
31
  ```ruby
30
32
  task = DummyJob.crawl("https://example.com")
@@ -51,10 +53,10 @@ You can also use Wayfarer's [CLI](../cli) to enqueue a task:
51
53
  wayfarer enqueue --batch my-batch DummyJob "https://example.com"
52
54
  ```
53
55
 
54
- ## Navigating crawls
56
+ ## Following URLs
55
57
 
56
- Jobs navigate crawls by staging URLs with `#!ruby stage(urls)`. When you stage a URL, a normalized
57
- version of it is appended to an internal set. Once the action returns, all URLs
58
+ Jobs navigate crawls by staging URLs with `stage(urls)`. When you stage a URL,
59
+ it is appended verbatim to an internal set. Once the action returns, all URLs
58
60
  in the set are appended as tasks to the message queue.
59
61
 
60
62
  ```ruby
@@ -167,3 +169,44 @@ end
167
169
 
168
170
  Content-Types are compared regardless of their parameters. For example,
169
171
  `text/html; charset=UTF-8` is considered the same as `text/html`.
172
+
173
+ ## Handling errors
174
+
175
+ !!! danger "Only ActiveJob error handling is supported"
176
+
177
+ Wayfarer exclusively supports ActiveJob's error handling. You cannot use
178
+ message queue-specific error handling, for example error handling with
179
+ `sidekiq_options` is unsupported. Otherwise batches get garbage-collected
180
+ too early as Wayfarer instruments ActiveJob.
181
+
182
+ Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
183
+
184
+ * `retry_on` to retry jobs a number of times on certain errors:
185
+
186
+ ```ruby
187
+ class DummyJob < Wayfarer::Base
188
+ retry_on MyError, attempts: 3 do |job, error|
189
+ # This block runs once all 3 attempts have failed
190
+ # (1 initial attempt + 2 retries)
191
+ end
192
+ end
193
+ ```
194
+
195
+ * `discard_on` to throw away jobs on certain errors:
196
+
197
+ ```ruby
198
+ class DummyJob < Wayfarer::Base
199
+ discard_on MyError do |job, error|
200
+ # This block runs once and buries the job
201
+ end
202
+ end
203
+ ```
204
+
205
+ ## Recreating user agents on certain errors
206
+
207
+ You can configure a list of exception classes upon which user agents
208
+ get recreated (see [User agent API](../networking/custom_adapters)):
209
+
210
+ ```ruby
211
+ Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
212
+ ```
@@ -1,21 +1,42 @@
1
1
  # User agent API
2
2
 
3
3
  Wayfarer retrieves web pages with user agents. There are two types of user
4
- agents: __stateful__ browsers which carry state and follow redirects implicitly,
5
- and __stateless__ HTTP clients, which handle redirects explicitly.
4
+ agents: __stateful__ browsers which carry state and follow redirects implicitly
5
+ as they navigate to a URL, and __stateless__ HTTP clients, which handle
6
+ redirects explicitly.
7
+
8
+ | | Stateless adapters | Stateful adapters |
9
+ |-------------------|--------------------|-------------------|
10
+ | interactive | no | yes |
11
+ | redirect handling | explicit | implicit |
6
12
 
7
13
  Because spawning browser processes or instantiating HTTP clients is expensive,
8
- Wayfarer keeps user agents in a pool and reuses them across jobs. Only on certain
9
- irrecoverable errors are individual user agents destroyed and recreated. For example,
10
- when a browser process crashes, it is replaced with a new one and checked back
11
- into the pool. The next job that checks out the user agent gets a fresh
12
- browser process.
14
+ Wayfarer keeps user agents in a pool and reuses them across jobs. This means
15
+ that browser state carries over between jobs, as a job checks out a previous
16
+ job's user agent. Only on certain irrecoverable errors are individual user agents
17
+ destroyed and recreated. For example, when a browser process crashes, it is
18
+ replaced with a fresh browser process.
13
19
 
14
20
  ## Base interface for custom user agents
15
21
 
16
- You can implement both stateful and stateless agents by including the `Wayfarer::Networking::Strategy`
17
- module and defining callback methods. The interfaces for stateful and stateless
18
- share the following instance methods:
22
+ You implement both stateful and stateless agents by including the
23
+ `Wayfarer::Networking::Strategy` module and defining callback methods. The
24
+ interfaces for stateful and stateless share the following base methods:
25
+
26
+ ```mermaid
27
+ classDiagram
28
+ class Square~Shape~{
29
+ int id
30
+ List~int~ position
31
+ setPoints(List~int~ points)
32
+ getPoints() List~int~
33
+ }
34
+
35
+ Square : -List~string~ messages
36
+ Square : +setMessages(List~string~ messages)
37
+ Square : +getMessages() List~string~
38
+ Square : +getDistanceMatrix() List~List~int~~
39
+ ```
19
40
 
20
41
  * `#create` (__required__): Called when a new instance (browser process or HTTP client) is
21
42
  needed.
data/docs/guides/pages.md CHANGED
@@ -47,6 +47,30 @@ MIME types:
47
47
  * `text/xml` or `application/xml` to [`#!ruby Nokogiri::XML::Document`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Document)
48
48
  * `application/json` to `Hash`
49
49
 
50
+ ### Implementing a custom response body parser
51
+
52
+ You can register an object that implements a `#parse` method for any MIME type:
53
+
54
+ ```ruby
55
+ class MyJPEGParser
56
+ def parse(body)
57
+ # Read EXIF metadata here.
58
+ # Return value is accessible as `page.doc`
59
+ end
60
+ end
61
+
62
+ Wayfarer::Parsing.registry["image/jpeg"] = MyJPEGParser.new
63
+ ```
64
+
65
+ !!! warning "`#parse` must be thread-safe!"
66
+
67
+ !!! info "Handling responses without a Content-Type"
68
+
69
+ If a response has no `Content-Type` header, Wayfarer falls back to
70
+ `application/octet-stream`. A parser registered for
71
+ `application/octet-stream` will hence also handle all responses without
72
+ a Content-Type.
73
+
50
74
  ## Live pages
51
75
 
52
76
  `#!ruby page` initially returns a snapshot of the browser state
@@ -83,28 +107,6 @@ end
83
107
  default `#!ruby :http` user agent. Instead, stateless user agents always
84
108
  return the same page object.
85
109
 
86
- ### Implementing a custom response body parser
87
-
88
- You can register an object that implements a `#parse` method for any MIME type:
89
-
90
- ```ruby
91
- class MyJPEGParser
92
- def parse(body)
93
- # Read EXIF metadata here.
94
- # Return value is accessible as `page.doc`
95
- end
96
- end
97
-
98
- Wayfarer::Parsing.registry["image/jpeg"] = MyJPEGParser.new
99
- ```
100
-
101
- !!! info "Handling responses without a Content-Type"
102
-
103
- If a response has no `Content-Type` header, Wayfarer falls back to
104
- `application/octet-stream`. A parser registered for
105
- `application/octet-stream` will hence also handle all responses without
106
- a Content-Type.
107
-
108
110
  ## Accessing page metadata with MetaInspector
109
111
 
110
112
  You have access to a [MetaInspector](https://github.com/jaimeiniesta/metainspector)
@@ -1,18 +1,32 @@
1
1
  # Routing
2
2
 
3
- Wayfarer equips jobs with a routing DSL that routes URLs to actions. Actions are
4
- either instance methods denoted by symbols, or [handlers](/guides/handlers).
3
+ Wayfarer equips jobs with a declarative routing DSL that maps URLs to actions.
4
+ Actions are instance methods denoted by symbols, or [handlers](/guides/handlers).
5
+ [Pages](/guides/pages) are only retrieved from URLs which map to an action.
6
+
7
+ !!! info "Routed URLs are normalized"
8
+
9
+ By default, Wayfarer [applies some transformations to each URL](../tasks/#url-normalization) to bring it
10
+ into a canonical form. Routing happens based on this canonical form.
11
+
12
+ You can always access a task's raw string as it was enqueued with `task.url`.
13
+
5
14
  A job's route declarations equate to a predicate tree.
6
15
  When a URL is routed, the predicate tree is searched depth-first. If a
7
- matching leaf predicate is found, the found path's action is dispatched,
8
- along with `params` collected from path parameters.
16
+ matching leaf predicate is found, the found path's action is dispatched.
17
+ You can extract data from URL path segments and query parameters and
18
+ access it through `params` in jobs or handlers.
9
19
 
10
20
  The following routes:
11
21
 
12
22
  ```ruby
13
23
  route.host "example.com", scheme: :https do
14
- path "/contact", to: :contact
15
- path "/users/:id", to: [UserHandler, :show]
24
+ path "contact", to: :contact
25
+ path "users/:id" do
26
+ to [UserHandler, :show]
27
+
28
+ path "gallery", to: [UserHandler, :photos]
29
+ end
16
30
  end
17
31
  ```
18
32
 
@@ -20,43 +34,111 @@ Equate to the following predicate tree:
20
34
 
21
35
  ```mermaid
22
36
  flowchart LR
23
- RootRoute-->Host["Host <code>example.com</code>"]
37
+ Root-->Host["Host <code>example.com</code>"]
24
38
  Host-->Scheme["Scheme <code>:https</code>"]
25
- Scheme-->Path1["Path <code>/contact</code>"]
26
- Scheme-->Path2["Path <code>/users/:id<code>"]
27
- Path1-->TargetRoute1["Target <code>:contact</code>"]
28
- Path2-->TargetRoute2["Target <code>[UserHandler, :show]</code>"]
39
+
40
+ %% first-level paths
41
+ Scheme-->PathContact["Path <code>contact</code>"]
42
+ Scheme-->PathUsersId["Path <code>users/:id</code>"]
43
+
44
+ %% their targets
45
+ PathContact-->TargetRouteContact["Target <code>:contact</code>"]
46
+ PathUsersId-->TargetRouteUserHandler["Target <code>[UserHandler, :show]</code>"]
47
+
48
+ %% nested path under /users/:id
49
+ PathUsersId-->PathGallery["Path <code>gallery</code>"]
50
+ PathGallery-->TargetRouteUserHandlerPhotos["Target <code>[UserHandler, :photos]</code>"]
29
51
  ```
30
52
 
31
- An invocation for the URL `https://example.com/users/42` leads to `[UserHandler, :show]`:
53
+ Traversing the tree depth-first for `https://example.com/users/42` stops at the
54
+ route with the action `[UserHandler, :show]`:
32
55
 
33
56
  ```mermaid
34
57
  flowchart LR
35
- RootRoute:::active-->Host["Host <code>example.com</code>"]:::active
36
- Host:::active-->Scheme["Scheme <code>:https</code>"]:::active
37
- Scheme:::active-->Path1["Path <code>/contact</code>"]:::inactive
38
- Scheme:::active-->Path2["Path <code>/users/:id<code>"]:::active
39
- Path1:::inactive-->TargetRoute1["Target <code>:contact</code>"]:::active
40
- Path2:::active-->TargetRoute2["Target <code>[UserHandler, :show]</code>"]:::activePlus
41
- classDef active fill:#7CB342,stroke:#7CB342,color:#fff
42
- classDef activePlus fill:#F1F8E9,stroke:#8BC34A,color:#33691E,stroke-width:4px
43
- classDef inactive fill:#FFCDD2,stroke:#F44336,color:#B71C1C
44
- ```
58
+ Root:::matching-->Host["Host <code>example.com</code>"]:::matching
59
+ Host:::matching-->Scheme["Scheme <code>:https</code>"]:::matching
45
60
 
46
- You can also visualise an invocation of the predicate tree on the command line
47
- with `wayfarer tree`
61
+ %% sibling paths from the scheme node
62
+ Scheme:::matching-->PathContact["Path <code>/contact</code>"]:::mismatching
63
+ Scheme:::matching-->PathUsersId["Path <code>/users/:id</code>"]:::matching
48
64
 
65
+ %% successful match for /users/:id
66
+ PathUsersId:::matching-->TargetRouteUserHandler["Target <code>[UserHandler, :show]</code>"]:::matching
67
+
68
+ %% gallery branch is never visited for /users/42
69
+ PathContact-->TargetRouteContact["Target <code>:contact</code>"]:::unvisited
70
+ PathUsersId:::matching-->PathGallery["Path <code>/gallery</code>"]:::unvisited
71
+ PathGallery:::unvisited-->TargetRouteUserHandlerPhotos["Target <code>[UserHandler, :photos]</code>"]:::unvisited
72
+
73
+ classDef matching fill:#7CB342,stroke:#7CB342,color:#fff
74
+ classDef mismatching fill:#FFCDD2,stroke:#F44336,color:#B71C1C
75
+ classDef unvisited fill:#BDBDBD,stroke:#BDBDBD,color:#616161
49
76
  ```
50
- wayfarer tree -r dummy_job.rb DummyJob https://example.com/users/42/foobar
51
- Match([UserHandler, :show], params: {:id=>"42", :foo=>"foobar"})
52
- └──Host("example.com", match: true)
53
- └──Scheme(:https, match: true)
54
- ├──Path("/contact", match: false)
55
- │ └──Target(match: true)
56
- └──Path("/users/:id", match: true)
57
- └──Target(match: true)
58
- └──Path("/users/:id/:foo", match: true, params: {:id=>"42", :foo=>"foobar"})
59
- ```
77
+
78
+ ??? note "You can also visualise a job's routing tree with the [`route` CLI subcommand](/guides/cli)"
79
+
80
+ ```sh
81
+ wayfarer route DummyJob -r dummy_job.rb https://example.com/users/42/gallery
82
+ ```
83
+
84
+ ```yaml
85
+ ---
86
+ routed: true
87
+ params:
88
+ id: '42'
89
+ action:
90
+ handler: Class
91
+ action: :photos
92
+ root_route:
93
+ match: true
94
+ params: {}
95
+ children:
96
+ - route:
97
+ host:
98
+ name: example.com
99
+ match: true
100
+ params: {}
101
+ children:
102
+ - route:
103
+ scheme:
104
+ scheme: :https
105
+ match: true
106
+ params: {}
107
+ children:
108
+ - route:
109
+ path:
110
+ pattern: "/contact"
111
+ match: false
112
+ params: {}
113
+ children:
114
+ - target_route:
115
+ action:
116
+ children: []
117
+ - route:
118
+ path:
119
+ pattern: "/users/:id"
120
+ match: true
121
+ params:
122
+ id: '42'
123
+ children:
124
+ - target_route:
125
+ action:
126
+ handler: Class
127
+ action: :show
128
+ children: []
129
+ - route:
130
+ path:
131
+ pattern: "/gallery"
132
+ match: true
133
+ params:
134
+ id: '42'
135
+ children:
136
+ - target_route:
137
+ action:
138
+ handler: Class
139
+ action: :photos
140
+ children: []
141
+ ```
60
142
 
61
143
  As you can see, `Target` nodes always match. This means that we could have also defined
62
144
  our routes as:
data/docs/guides/tasks.md CHANGED
@@ -19,20 +19,40 @@ Wayfarer ensures that no URL gets processed twice within a batch. It achieves
19
19
  this by maintaining a [Redis hash](https://redis.io/docs/data-types/hashes)
20
20
  keyed by normalized URLs.
21
21
 
22
+ Wayfarer computes a canonical URL representation that it uses for cache lookups.
23
+
22
24
  ### URL normalization
23
25
 
24
26
  Wayfarer parses URLs with [Addressable](https://github.com/sporkmonger/addressable)
25
- and normalizes HTTP(S) URLs with [`normalize_url`](https://github.com/rwz/normalize_url/).
27
+ and applies further normalizations. By default, all normalizations are applied
28
+ and can be individually disabled.
29
+
30
+ URL normalization is used only for deduplication, and does not affect the immutable
31
+ `task.url`, which always returns the verbatim URL as enqueued.
32
+ This allows you to follow the URLs exactly as parsed from response bodies.
33
+
34
+ You can configure the global normalization behaviour by setting the following
35
+ values on `Wayfarer.config.normalization` do which all default to `true`:
36
+
37
+ * `remove_www`: Remove `www.` prefix from hostnames?
38
+ * `remove_trailing_slash`: Remove a trailing path slash?
39
+ * `remove_fragment`: Remove the URL fragment?
40
+ * `order_query_parameters`: Order query parameters alphabetically?
41
+ * `remove_tracking_parameters`: Remove tracking parameters from the URL?
42
+
43
+ When a job gets deduplicated, it succeeds and causes no retries.
44
+
45
+ ### Setting a custom key function
46
+
47
+ You can customize how deduplication keys are computed. As a contrived example,
48
+ to process only one job per hostname:
26
49
 
27
- URL normalization is used only for deduplication, and does not affect the URL
28
- returned by `task.url`. Instead, `task.url` returns the verbatim URL as it was
29
- enqueud. This allows you to follow the exact URLs you may have parsed from a
30
- response body.
50
+ ```ruby
51
+ Wayfarer.config[:deduplication][:key] = ->(task) { task[:uri].hostname }
52
+ ```
31
53
 
32
54
  ## Invalid URLs
33
55
 
34
- Tasks with invalid URLs (for example`ht%0atp://localhost/`, a newline in the
35
- protocol) are discarded, since they can't get retrieved. No exception is raised,
36
- and the job is considered successfully processed, since there are no corrective
37
- actions an error handler could take as tasks are immutable, and retries would
38
- not change the outcome.
56
+ Tasks with invalid URLs are discarded (for example `ht%0atp://localhost/`, which has a
57
+ newline in its protocol), since there is no corrective action possible.
58
+ No exception is raised, and the job is considered successfully processed without retries.