wayfarer 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +29 -2
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +17 -0
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -31
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -42
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -26
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -4,12 +4,12 @@
4
4
 
5
5
  ```ruby
6
6
  class DummyJob < Wayfarer::Base
7
- route { to :index }
7
+ route.to :index
8
8
 
9
9
  def index
10
- agent.goto("https://example.com")
11
- agent.back
12
- agent.forward
10
+ user_agent.goto("https://example.com")
11
+ user_agent.back
12
+ user_agent.forward
13
13
  end
14
14
  end
15
15
  ```
@@ -18,12 +18,12 @@
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
- agent.navigate.to("https://example.com")
25
- agent.navigate.back
26
- agent.navigate.forward
24
+ user_agent.navigate.to("https://example.com")
25
+ user_agent.navigate.back
26
+ user_agent.navigate.forward
27
27
  end
28
28
  end
29
29
  ```
@@ -32,12 +32,12 @@
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
- agent.visit("https://example.com")
39
- agent.go_back
40
- agent.go_forward
38
+ user_agent.visit("https://example.com")
39
+ user_agent.go_back
40
+ user_agent.go_forward
41
41
  end
42
42
  end
43
43
  ```
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  page.doc.css("html")
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
19
19
 
20
20
  ```ruby
21
21
  class DummyJob < Wayfarer::Base
22
- route { to :index }
22
+ route.to :index
23
23
 
24
24
  def index
25
25
  browser.at_css("html")
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
38
  browser.find_elements(css: "html")
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  browser.screenshot(path: "screenshot.png")
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
24
  browser.save_screenshot("screenshot.png")
@@ -3,5 +3,5 @@
3
3
  See: [Guides: Networking: HTTP request headers](/guides/networking#http-request-headers)
4
4
 
5
5
  ```ruby
6
- Wayfarer.config.network.http_headers = { "User-Agent" => "MyCrawler ..." }
6
+ Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler ..." }
7
7
  ```
data/docs/design.md ADDED
@@ -0,0 +1,36 @@
1
+ # Design decisions
2
+
3
+ ## Navigate the web along URL patterns
4
+
5
+ URLs are less prone to change than served markup.
6
+ One reason for this is that changes to a URL's path can have a negative effect
7
+ on its page ranking in search engines. Many websites also implement common
8
+ architectural URL patterns, for example REST and its variations, that
9
+ lend themselves to pattern matching.
10
+
11
+ ## Follow URLs verbatim
12
+
13
+ Normalized URLs are useful for deduplication, but URLs should be followed
14
+ as they appear in responses. Navigating to normalized versions of URLs makes
15
+ crawlers stick out from other user agents, for example.
16
+
17
+ ## Tasks are version-less and don't persist metadata
18
+
19
+ Tasks serialize to their URL and batch. No other data gets written to
20
+ the message queue. Wayfarer aims to minimise job payloads.
21
+ There is also no need for versioning persisted tasks, since there is only one
22
+ version of a task: URL and batch.
23
+
24
+ ## Why depend on Redis
25
+
26
+ There are two core features that depend on Redis. First, per-batch acyclicity is
27
+ achieved by maintaining the set of processed URLs per batch in Redis.
28
+ There's no option to follow links in a cyclic manner. Second, batch completion
29
+ requires updating an integer value in Redis, and batch completion is a very
30
+ useful feature, since most crawls should end eventually, and often you want to
31
+ know when.
32
+
33
+ ## Persistence and document mapping not included
34
+
35
+ Like Active Job, Wayfarer is not concerned with persistence.
36
+ Model <-> DOM mapping abstractions are also out of scope.
@@ -1,145 +1,43 @@
1
1
  # Callbacks
2
2
 
3
- ## Active Job callbacks
3
+ Wayfarer supports a number of callbacks in addition to
4
+ [ActiveJob's](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
4
5
 
5
- Wayfarer naturally supports all of [Active Job's life cycle callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
6
+ ## Available callbacks
6
7
 
7
- ## `before_fetch`
8
-
9
- Runs before a job fetches a page, either by making an HTTP request, or by
10
- navigating a browser to its task URL.
11
-
12
- ```ruby
13
- class DummyJob < Wayfarer::Base
14
- before_fetch :do_something
15
-
16
- private
17
-
18
- def do_something
19
- # before the task.url is fetched
20
- end
21
- end
22
- ```
23
-
24
- ## `before_action`
25
-
26
- Runs after a page was fetched, before an action method is called.
27
-
28
- ```ruby
29
- class DummyJob < Wayfarer::Base
30
- before_action :do_something
31
-
32
- private
33
-
34
- def do_something
35
- # page is available at this point
36
- end
37
- end
38
- ```
8
+ * `before_fetch`
9
+ * `around_fetch`
10
+ * `after_fetch`
11
+ * `before_action`
12
+ * `around_action`
13
+ * `after_action`
14
+ * `after_batch`
39
15
 
40
16
  ## `after_batch`
41
17
 
42
- Runs once the last job in a batch performed:
43
-
44
- ```ruby
45
- class DummyJob < Wayfarer::Base
46
- after_batch do
47
- # All jobs in batch done
48
- end
49
- end
50
- ```
51
-
52
- Internally, a batch counter is in-/decremented on certain events. Once the
53
- counter reaches zero, `after_batch` callbacks runs in declaration order.
54
-
55
- The counter is incremented when within the batch:
18
+ You can register `after_batch` callbacks that run when there are no more tasks
19
+ to process in a batch. Wayfarer instruments job execution and in- or decrements
20
+ an integer counter in Redis on certain events. When the counter reaches zero,
21
+ the current job's `after_batch` callbacks run.
56
22
 
57
- * A job is enqueued.
23
+ ## Conditional callbacks
58
24
 
59
- The counter is decremented when:
60
-
61
- * A job succeeds.
62
- * A job errors due to an unhandled exception.
63
- * A job is discarded due to an exception.
64
- * A job errors and thereyby exhausts its maximum attempts.
65
-
66
- !!! attention "Batch callbacks can fail jobs"
67
-
68
- If the last job's `after_batch` callbacks raises an exception, this can lead
69
- to the job getting retried. If the exception raised by the callback is
70
- unhandled or discarded, the callback never fully runs.
71
-
72
- ## Callback options
73
-
74
- ### Definition styles
75
-
76
- Callbacks can be registered either by supplying a block or a symbol identifying
77
- a callback instance method:
25
+ You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
26
+ keywords, for example to run a callback for some route `action` only:
78
27
 
79
28
  ```ruby
80
- class DummyJob < Wayfarer::Base
81
- before_action do
82
- # ...
83
- end
84
-
85
- before_action :my_callback
29
+ class DummyJob < ActiveJob::Base
30
+ include Wayfarer::Base
86
31
 
87
- private
32
+ route.host "example.com", to: :example
33
+ route.to :fallback
88
34
 
89
- def my_callback
35
+ before_action unless: -> { action == :fallback } do
90
36
  # ...
91
37
  end
92
- end
93
- ```
94
-
95
- ### Conditionals
96
-
97
- Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
98
-
99
- ```ruby
100
- class DummyJob < Wayfarer::Base
101
- before_fetch :my_callback, if: :my_condition
102
-
103
- private
104
-
105
- def my_callback
106
- end
107
38
 
108
- def my_condition
109
- end
39
+ # ...
110
40
  end
111
41
  ```
112
42
 
113
- Callbacks can be registered for certain action methods only with the `:only` and
114
- `:except` keywords:
115
-
116
- ```ruby
117
- class DummyJob < Wayfarer::Base
118
- before_fetch :do_something, only: :foo
119
-
120
- before_fetch except: [:foo, :qux] do
121
- # runs only before bar
122
- end
123
-
124
- def foo
125
- end
126
-
127
- def bar
128
- end
129
- end
130
-
131
- ```
132
-
133
- ### Early termination
134
-
135
- Callbacks that return `false` halt the callback chain:
136
-
137
- ```ruby
138
- class DummyJob < Wayfarer::Base
139
- before_action { false }
140
-
141
- before_action do
142
- # never runs
143
- end
144
- end
145
- ```
43
+ You can also pass a symbol instead of a block to call an instance method.
@@ -13,27 +13,27 @@ Wayfarer parses environment variables into a runtime configuration
13
13
 
14
14
  ```ruby
15
15
  # Which user agent to use to process tasks
16
- Wayfarer.config.network.agent = :http # or :ferrum, :selenium
16
+ Wayfarer.config[:network][:agent] = :http # or :ferrum, :selenium
17
17
 
18
18
  # How many user agents to instantiate
19
- Wayfarer.config.network.pool_size = 3
19
+ Wayfarer.config[:network][:pool_size] = 3
20
20
 
21
21
  # How long an agent may be used while processing a task
22
- Wayfarer.config.network.pool_timeout = 5000
22
+ Wayfarer.config[:network][:pool_timeout] = 5000
23
23
 
24
24
  # Ferrum options
25
- Wayfarer.config.ferrum.options = {}
25
+ Wayfarer.config[:ferrum][:options] = {}
26
26
 
27
27
  # Selenium driver to use
28
- Wayfarer.config.selenium.driver = :chrome
28
+ Wayfarer.config[:selenium][:driver] = :chrome
29
29
 
30
30
  # Selenium HTTP client read timeout
31
- Wayfarer.config.selenium.client_timeout = 10 # seconds
31
+ Wayfarer.config[:selenium][:client_timeout] = 10 # seconds
32
32
 
33
33
  # Selenium options
34
- Wayfarer.config.selenium.options = { url: "http://chrome" }
34
+ Wayfarer.config[:selenium][:options] = { url: "http://chrome" }
35
35
 
36
36
  # HTTP request headers (Selenium is unsupported)
37
- Wayfarer.config.network.http_headers = { "Field" => "Value" }
37
+ Wayfarer.config[:network][:http_headers] = { "Field" => "Value" }
38
38
  ```
39
39
 
@@ -0,0 +1,60 @@
1
+ # Handlers
2
+
3
+ [Jobs](/jobs) can route tasks to handlers to delegate processing without
4
+ writes to the message queue. Unlike jobs, handlers don't inherit from
5
+ `ActiveJob::Base` and therefore cannot be enqueued. Handlers have routes, too,
6
+ but they don't retrieve pages and a handler's router can be bypassed.
7
+
8
+ ## Supported features
9
+
10
+ Handlers support a subset of features compared to `Wayfarer::Base`:
11
+
12
+ * URL routing
13
+ * enqueueing tasks with `#!ruby stage(*urls)`
14
+ * access to the `user_agent` that retrieved the `page`
15
+ * ad-hoc HTTP requests with `#!ruby fetch(url)`
16
+ * callbacks, but only a subset of job callbacks
17
+ * Content-Type filtering
18
+
19
+ ```ruby
20
+ class ExampleHandler
21
+ include Wayfarer::Handler
22
+
23
+ route.to :index
24
+
25
+ def index
26
+ task # => #<Wayfarer::Task>
27
+ page # => #<Wayfarer::Page>
28
+ user_agent # => Browser or HTTP client
29
+ end
30
+ end
31
+
32
+ class DummyJob < ActiveJob::Base
33
+ include Wayfarer::Base
34
+
35
+ route.host "example.com", to: ExampleHandler
36
+ end
37
+ ```
38
+
39
+ You can also bypass a handler's router and route directly to an instance
40
+ method:
41
+
42
+ ```ruby
43
+ class DummyJob < ActiveJob::Base
44
+ include Wayfarer::Base
45
+
46
+ route.host "example.com", to: [ExampleHandler, :index]
47
+ end
48
+
49
+ class ExampleHandler
50
+ include Wayfarer::Handler
51
+
52
+ def index
53
+ task # => #<Wayfarer::Task>
54
+ page # => #<Wayfarer::Page>
55
+ user_agent # => Browser or HTTP client
56
+ end
57
+ end
58
+ ```
59
+
60
+ !!! `before_action` callbacks
@@ -0,0 +1 @@
1
+ hello
@@ -0,0 +1,40 @@
1
+ # Error handling
2
+
3
+ !!! danger "Only ActiveJob error handling is supported"
4
+
5
+ Wayfarer exclusively supports ActiveJob's error handling. You cannot use
6
+ message queue-specific error handling, for example error handling with
7
+ `sidekiq_options` is unsupported. Otherwise batches get garbage-collected
8
+ too early as Wayfarer instruments ActiveJob.
9
+
10
+ Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
11
+
12
+ * `retry_on` to retry jobs a number of times on certain errors:
13
+
14
+ ```ruby
15
+ class DummyJob < Wayfarer::Base
16
+ retry_on MyError, attempts: 3 do |job, error|
17
+ # This block runs once all 3 attempts have failed
18
+ # (1 initial attempt + 2 retries)
19
+ end
20
+ end
21
+ ```
22
+
23
+ * `discard_on` to throw away jobs on certain errors:
24
+
25
+ ```ruby
26
+ class DummyJob < Wayfarer::Base
27
+ discard_on MyError do |job, error|
28
+ # This block runs once and buries the job
29
+ end
30
+ end
31
+ ```
32
+
33
+ ## Recreating user agents on certain errors
34
+
35
+ You can configure a list of exception classes upon which user agents
36
+ get recreated (see [User agent API]()):
37
+
38
+ ```ruby
39
+ Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
40
+ ```
data/docs/guides/jobs.md CHANGED
@@ -1,78 +1,124 @@
1
1
  # Jobs
2
2
 
3
- Jobs are Ruby classes that process [tasks](/guides/tasks) and look as follows:
3
+ Jobs are [Active Job](https://edgeguides.rubyonrails.org/active_job_basics.html)s
4
+ that use a DSL included from the `Wayfarer::Base` module to process [tasks](/guides/tasks)
5
+ that they read from a message queue.
6
+ Instead of implementing Active Job's `#perform` method yourself, you declare routes
7
+ to instance methods, similar to how web applications route incoming requests.
8
+ Only URLs that match a [route](../routing) are requested or navigated to.
9
+ The action method has access to the retrieved [page](../pages),
10
+ the [user agent](../user-agents) that retrieved the page and the current task:
4
11
 
5
12
  ```ruby
6
- class DummyJob < Wayfarer::Base
7
- route { to :index }
13
+ class DummyJob < ActiveJob::Base
14
+ include Wayfarer::Base
15
+
16
+ route.to :index
8
17
 
9
18
  def index
19
+ task # => #<Wayfarer::Task>
20
+ page # => #<Wayfarer::Page>
21
+ user_agent # => Browser or HTTP client
10
22
  end
11
23
  end
12
24
  ```
13
25
 
14
- Here is how to enqueue a task for a URL:
26
+ You can start a crawl by appending a task to the message queue for the URL with
27
+ `::crawl`. By default, a UUID is generated as the batch:
15
28
 
16
29
  ```ruby
17
- DummyJob.crawl("https://example.com")
30
+ task = DummyJob.crawl("https://example.com")
31
+ # => #<Wayfarer::Task url="https://example.com", batch="498a13e0-...">
18
32
  ```
19
33
 
20
- This is the same as calling the Active Job API directly and passing a task
21
- and a random batch:
34
+ This is exactly the same as calling Active Job's `#perform_later` and passing a
35
+ task directly:
22
36
 
23
37
  ```ruby
24
38
  task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
25
39
  DummyJob.perform_later(task)
26
40
  ```
27
41
 
28
- A batch can be specified with `::crawl`, too:
42
+ Instead of a generated UUID, you can also set your own batch:
29
43
 
30
44
  ```ruby
31
45
  DummyJob.crawl("https://example.com", batch: "my-batch")
32
46
  ```
33
47
 
34
- ## Current task
48
+ You can also use Wayfarer's [CLI](../cli) to enqueue a task:
49
+
50
+ ```sh
51
+ wayfarer enqueue --batch my-batch DummyJob "https://example.com"
52
+ ```
53
+
54
+ ## Navigating crawls
55
+
56
+ Jobs navigate crawls by staging URLs with `#!ruby stage(urls)`. When you stage a URL, a normalized
57
+ version of it is appended to an internal set. Once the action returns, all URLs
58
+ in the set are appended as tasks to the message queue.
59
+
60
+ ```ruby
61
+ class DummyJob < ActiveJob::Base
62
+ include Wayfarer::Base
63
+
64
+ route.to :index
65
+
66
+ def index
67
+ # Follow all out-going links of the page
68
+ stage page.meta.links.external
69
+ end
70
+ end
71
+ ```
72
+
73
+ ## Accessing the current task
35
74
 
36
- Jobs consume [tasks](../tasks) from a message queue. The currently processed
37
- task is accessible like so:
75
+ If the task's URL matched a [route](../routing), the URL is retrieved over the network,
76
+ and the method that was routed to is called. The task is available as `#task`:
38
77
 
39
78
  ```ruby
40
- class DummyJob < Wayfarer::Base
41
- route { to :index }
79
+ class DummyJob < ActiveJob::Base
80
+ include Wayfarer::Base
81
+
82
+ route.to :index
42
83
 
43
84
  def index
44
- task.url # => "https://example.com"
85
+ task.url # => "https://example.com"
45
86
  task.batch # => "my-batch"
46
87
  end
47
88
  end
48
89
  ```
49
90
 
50
- ## Current page
91
+ ## Accessing the current page
51
92
 
52
- A task's URL contents get fetched into a [page](../pages) object if the task URL
53
- matched a route:
93
+ You have access to the retrieved [page](../pages):
54
94
 
55
95
  ```ruby
56
- class DummyJob < Wayfarer::Base
57
- route { to :index, host: "example.com" }
96
+ class DummyJob < ActiveJob::Base
97
+ include Wayfarer::Base
98
+
99
+ route.to :index
58
100
 
59
101
  def index
60
102
  page.url # => "https://example.com"
61
103
  page.body # => "<html>..."
62
104
  page.status_code # => 200
63
105
  page.headers # { "Content-Type" => ... }
106
+ page.doc # Only present for certain Content-Types
64
107
  end
65
108
  end
66
109
  ```
67
110
 
68
- ## URL parameters
111
+ ## Routing URLs to methods and extracting `params`
69
112
 
70
- Jobs can extract data from URLs with their router:
113
+ Jobs have a routing DSL that allows you to map URLs to methods and extract
114
+ URL data:
71
115
 
72
116
  ```ruby
73
- class DummyJob < Wayfarer::Base
117
+ class DummyJob < ActiveJob::Base
118
+ include Wayfarer::Base
119
+
74
120
  route do
75
- path "/users/:id/profile"
121
+ path "/users/:id/profile", to: :index
76
122
  end
77
123
 
78
124
  def index
@@ -80,22 +126,44 @@ class DummyJob < Wayfarer::Base
80
126
  end
81
127
  end
82
128
 
83
- DummyJob.crawl("https://example.com/users/42/profile")
129
+ DummyJob.crawl("https://example.com/users/42/profile?foo=bar")
84
130
  ```
85
131
 
132
+ ## Controlling the user agent
86
133
 
87
- ## User agent
88
-
89
- The HTTP client or automated browser that fetched the URL is available:
134
+ You can control the browser or HTTP client that retrieved the page:
90
135
 
91
136
  ```ruby
92
- Wayfarer.config.network.agent = :ferrum # Chrome DevTools Protocol
137
+ Wayfarer.config[:network][:agent] = :ferrum # Chrome DevTools Protocol
93
138
 
94
- class DummyJob < Wayfarer::Base
95
- route { to :index }
139
+ class DummyJob < ActiveJob::Base
140
+ include Wayfarer::Base
141
+
142
+ route.to :index
96
143
 
97
144
  def index
98
- browser.save_screenshot("capture.png")
145
+ user_agent.save_screenshot("capture.png")
99
146
  end
100
147
  end
101
148
  ```
149
+
150
+ ## Restricting the processed Content-Types
151
+
152
+ By default, jobs process pages regardless of their Content-Type response
153
+ header. You can allow a list of Content-Types as strings and Regexps and
154
+ opt out of the default behaviour. Once at least one Content-Type is allowed,
155
+ other Content-Types don't get processed:
156
+
157
+ ```ruby
158
+ class DummyJob < ActiveJob::Base
159
+ include Wayfarer::Base
160
+
161
+ content_type "text/html", "application/json"
162
+ content_type /xml/
163
+ end
164
+ ```
165
+
166
+ !!! info "HTTP parameters in Content-Types are ignored for comparison"
167
+
168
+ Content-Types are compared regardless of their parameters. For example,
169
+ `text/html; charset=UTF-8` is considered the same as `text/html`.
@@ -13,7 +13,7 @@ set first.
13
13
 
14
14
  ```ruby
15
15
  class DummyJob < Wayfarer::Base
16
- route { to :index }
16
+ route.to :index
17
17
 
18
18
  def index
19
19
  stage page.meta.links.all