wayfarer 0.4.6 → 0.4.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +28 -1
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +1 -1
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -53
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -43
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -29
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -4,12 +4,12 @@
4
4
 
5
5
  ```ruby
6
6
  class DummyJob < Wayfarer::Base
7
- route { to :index }
7
+ route.to :index
8
8
 
9
9
  def index
10
- agent.goto("https://example.com")
11
- agent.back
12
- agent.forward
10
+ user_agent.goto("https://example.com")
11
+ user_agent.back
12
+ user_agent.forward
13
13
  end
14
14
  end
15
15
  ```
@@ -18,12 +18,12 @@
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
- agent.navigate.to("https://example.com")
25
- agent.navigate.back
26
- agent.navigate.forward
24
+ user_agent.navigate.to("https://example.com")
25
+ user_agent.navigate.back
26
+ user_agent.navigate.forward
27
27
  end
28
28
  end
29
29
  ```
@@ -32,12 +32,12 @@
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
- agent.visit("https://example.com")
39
- agent.go_back
40
- agent.go_forward
38
+ user_agent.visit("https://example.com")
39
+ user_agent.go_back
40
+ user_agent.go_forward
41
41
  end
42
42
  end
43
43
  ```
@@ -6,7 +6,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  page.doc.css("html")
@@ -19,7 +19,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
19
19
 
20
20
  ```ruby
21
21
  class DummyJob < Wayfarer::Base
22
- route { to :index }
22
+ route.to :index
23
23
 
24
24
  def index
25
25
  browser.at_css("html")
@@ -32,7 +32,7 @@ See: [Nokogiri: Searching an HTML / XML Document](https://nokogiri.org/tutorials
32
32
 
33
33
  ```ruby
34
34
  class DummyJob < Wayfarer::Base
35
- route { to :index }
35
+ route.to :index
36
36
 
37
37
  def index
38
38
  browser.find_elements(css: "html")
@@ -6,7 +6,7 @@ Taking screenshots requires automating a browser.
6
6
 
7
7
  ```ruby
8
8
  class DummyJob < Wayfarer::Base
9
- route { to :index }
9
+ route.to :index
10
10
 
11
11
  def index
12
12
  browser.screenshot(path: "screenshot.png")
@@ -18,7 +18,7 @@ Taking screenshots requires automating a browser.
18
18
 
19
19
  ```ruby
20
20
  class DummyJob < Wayfarer::Base
21
- route { to :index }
21
+ route.to :index
22
22
 
23
23
  def index
24
24
  browser.save_screenshot("screenshot.png")
@@ -3,5 +3,5 @@
3
3
  See: [Guides: Networking: HTTP request headers](/guides/networking#http-request-headers)
4
4
 
5
5
  ```ruby
6
- Wayfarer.config.network.http_headers = { "User-Agent" => "MyCrawler ..." }
6
+ Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler ..." }
7
7
  ```
data/docs/design.md ADDED
@@ -0,0 +1,36 @@
1
+ # Design decisions
2
+
3
+ ## Navigate the web along URL patterns
4
+
5
+ URLs are less prone to change than served markup.
6
+ One reason for this is that changes to a URL's path can have a negative effect
7
+ on its page ranking in search engines. Many websites also implement common
8
+ architectural URL patterns, for example REST and its variations, that
9
+ lend themselves to pattern matching.
10
+
11
+ ## Follow URLs verbatim
12
+
13
+ Normalized URLs are useful for deduplication, but URLs should be followed
14
+ as they appear in responses. Navigating to normalized versions of URLs makes
15
+ crawlers stick out from other user agents, for example.
16
+
17
+ ## Tasks are version-less and don't persist metadata
18
+
19
+ Tasks serialize to their URL and batch. No other data gets written to
20
+ the message queue. Wayfarer aims to minimise job payloads.
21
+ There is also no need for versioning persisted tasks, since there is only one
22
+ version of a task: URL and batch.
23
+
24
+ ## Why depend on Redis
25
+
26
+ There are two core features that depend on Redis. First, per-batch acyclicity is
27
+ achieved by maintaining the set of processed URLs per batch in Redis.
28
+ There's no option to follow links in a cyclic manner. Second, batch completion
29
+ requires updating an integer value in Redis, and batch completion is a very
30
+ useful feature, since most crawls should end eventually, and often you want to
31
+ know when.
32
+
33
+ ## Persistence and document mapping not included
34
+
35
+ Like Active Job, Wayfarer is not concerned with persistence.
36
+ Model <-> DOM mapping abstractions are also out of scope.
@@ -1,145 +1,43 @@
1
1
  # Callbacks
2
2
 
3
- ## Active Job callbacks
3
+ Wayfarer supports a number of callbacks in addition to
4
+ [ActiveJob's](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
4
5
 
5
- Wayfarer naturally supports all of [Active Job's life cycle callbacks](https://edgeguides.rubyonrails.org/active_job_basics.html#callbacks).
6
+ ## Available callbacks
6
7
 
7
- ## `before_fetch`
8
-
9
- Runs before a job fetches a page, either by making an HTTP request, or by
10
- navigating a browser to its task URL.
11
-
12
- ```ruby
13
- class DummyJob < Wayfarer::Base
14
- before_fetch :do_something
15
-
16
- private
17
-
18
- def do_something
19
- # before the task.url is fetched
20
- end
21
- end
22
- ```
23
-
24
- ## `before_action`
25
-
26
- Runs after a page was fetched, before an action method is called.
27
-
28
- ```ruby
29
- class DummyJob < Wayfarer::Base
30
- before_action :do_something
31
-
32
- private
33
-
34
- def do_something
35
- # page is available at this point
36
- end
37
- end
38
- ```
8
+ * `before_fetch`
9
+ * `around_fetch`
10
+ * `after_fetch`
11
+ * `before_action`
12
+ * `around_action`
13
+ * `after_action`
14
+ * `after_batch`
39
15
 
40
16
  ## `after_batch`
41
17
 
42
- Runs once the last job in a batch performed:
43
-
44
- ```ruby
45
- class DummyJob < Wayfarer::Base
46
- after_batch do
47
- # All jobs in batch done
48
- end
49
- end
50
- ```
51
-
52
- Internally, a batch counter is in-/decremented on certain events. Once the
53
- counter reaches zero, `after_batch` callbacks run in declaration order.
54
-
55
- The counter is incremented when within the batch:
18
+ You can register `after_batch` callbacks that run when there are no more tasks
19
+ to process in a batch. Wayfarer instruments job execution and in- or decrements
20
+ an integer counter in Redis on certain events. When the counter reaches zero,
21
+ the current job's `after_batch` callbacks run.
56
22
 
57
- * A job is enqueued.
23
+ ## Conditional callbacks
58
24
 
59
- The counter is decremented when:
60
-
61
- * A job succeeds.
62
- * A job errors due to an unhandled exception.
63
- * A job is discarded due to an exception.
64
- * A job errors and thereby exhausts its maximum attempts.
65
-
66
- !!! attention "Batch callbacks can fail jobs"
67
-
68
- If the last job's `after_batch` callbacks raises an exception, this can lead
69
- to the job getting retried. If the exception raised by the callback is
70
- unhandled or discarded, the callback never fully runs.
71
-
72
- ## Callback options
73
-
74
- ### Definition styles
75
-
76
- Callbacks can be registered either by supplying a block or a symbol identifying
77
- a callback instance method:
25
+ You can make callbacks conditional with the `#!ruby :if` and `#!ruby :unless`
26
+ keywords, for example to run a callback for some route `action` only:
78
27
 
79
28
  ```ruby
80
- class DummyJob < Wayfarer::Base
81
- before_action do
82
- # ...
83
- end
84
-
85
- before_action :my_callback
29
+ class DummyJob < ActiveJob::Base
30
+ include Wayfarer::Base
86
31
 
87
- private
32
+ route.host "example.com", to: :example
33
+ route.to :fallback
88
34
 
89
- def my_callback
35
+ before_action unless: -> { action == :fallback } do
90
36
  # ...
91
37
  end
92
- end
93
- ```
94
-
95
- ### Conditionals
96
-
97
- Callbacks can be registered conditionally with the `:if` and `:unless` keywords:
98
-
99
- ```ruby
100
- class DummyJob < Wayfarer::Base
101
- before_fetch :my_callback, if: :my_condition
102
-
103
- private
104
-
105
- def my_callback
106
- end
107
38
 
108
- def my_condition
109
- end
39
+ # ...
110
40
  end
111
41
  ```
112
42
 
113
- Callbacks can be registered for certain action methods only with the `:only` and
114
- `:except` keywords:
115
-
116
- ```ruby
117
- class DummyJob < Wayfarer::Base
118
- before_fetch :do_something, only: :foo
119
-
120
- before_fetch except: [:foo, :qux] do
121
- # runs only before bar
122
- end
123
-
124
- def foo
125
- end
126
-
127
- def bar
128
- end
129
- end
130
-
131
- ```
132
-
133
- ### Early termination
134
-
135
- Callbacks that return `false` halt the callback chain:
136
-
137
- ```ruby
138
- class DummyJob < Wayfarer::Base
139
- before_action { false }
140
-
141
- before_action do
142
- # never runs
143
- end
144
- end
145
- ```
43
+ You can also pass a symbol instead of a block to call an instance method.
@@ -13,27 +13,27 @@ Wayfarer parses environment variables into a runtime configuration
13
13
 
14
14
  ```ruby
15
15
  # Which user agent to use to process tasks
16
- Wayfarer.config.network.agent = :http # or :ferrum, :selenium
16
+ Wayfarer.config[:network][:agent] = :http # or :ferrum, :selenium
17
17
 
18
18
  # How many user agents to instantiate
19
- Wayfarer.config.network.pool_size = 3
19
+ Wayfarer.config[:network][:pool_size] = 3
20
20
 
21
21
  # How long an agent may be used while processing a task
22
- Wayfarer.config.network.pool_timeout = 5000
22
+ Wayfarer.config[:network][:pool_timeout] = 5000
23
23
 
24
24
  # Ferrum options
25
- Wayfarer.config.ferrum.options = {}
25
+ Wayfarer.config[:ferrum][:options] = {}
26
26
 
27
27
  # Selenium driver to use
28
- Wayfarer.config.selenium.driver = :chrome
28
+ Wayfarer.config[:selenium][:driver] = :chrome
29
29
 
30
30
  # Selenium HTTP client read timeout
31
- Wayfarer.config.selenium.client_timeout = 10 # seconds
31
+ Wayfarer.config[:selenium][:client_timeout] = 10 # seconds
32
32
 
33
33
  # Selenium options
34
- Wayfarer.config.selenium.options = { url: "http://chrome" }
34
+ Wayfarer.config[:selenium][:options] = { url: "http://chrome" }
35
35
 
36
36
  # HTTP request headers (Selenium is unsupported)
37
- Wayfarer.config.network.http_headers = { "Field" => "Value" }
37
+ Wayfarer.config[:network][:http_headers] = { "Field" => "Value" }
38
38
  ```
39
39
 
@@ -0,0 +1,60 @@
1
+ # Handlers
2
+
3
+ [Jobs](/jobs) can route tasks to handlers to delegate processing without
4
+ writes to the message queue. Unlike jobs, handlers don't inherit from
5
+ `ActiveJob::Base` and therefore cannot be enqueued. Handlers have routes, too,
6
+ but they don't retrieve pages and a handler's router can be bypassed.
7
+
8
+ ## Supported features
9
+
10
+ Handlers support a subset of features compared to `Wayfarer::Base`:
11
+
12
+ * URL routing
13
+ * enqueueing tasks with `#!ruby stage(*urls)`
14
+ * jobs can access the `user_agent` that retrieved the `page`
15
+ * ad-hoc HTTP requests with `#!ruby fetch(url)`
16
+ * callbacks, but only a subset of job callbacks
17
+ * Content-Type filtering
18
+
19
+ ```ruby
20
+ class ExampleHandler
21
+ include Wayfarer::Handler
22
+
23
+ route.to :index
24
+
25
+ def index
26
+ task # => #<Wayfarer::Task>
27
+ page # => #<Wayfarer::Page>
28
+ user_agent # => Browser or HTTP client
29
+ end
30
+ end
31
+
32
+ class DummyJob < ActiveJob::Base
33
+ include Wayfarer::Base
34
+
35
+ route.host "example.com", to: ExampleHandler
36
+ end
37
+ ```
38
+
39
+ You can also bypass a handler's router and route directly to an instance
40
+ method:
41
+
42
+ ```ruby
43
+ class DummyJob < ActiveJob::Base
44
+ include Wayfarer::Base
45
+
46
+ route.host "example.com", to: [ExampleHandler, :index]
47
+ end
48
+
49
+ class ExampleHandler
50
+ include Wayfarer::Handler
51
+
52
+ def index
53
+ task # => #<Wayfarer::Task>
54
+ page # => #<Wayfarer::Page>
55
+ user_agent # => Browser or HTTP client
56
+ end
57
+ end
58
+ ```
59
+
60
+ !!! `before_action` callbacks
@@ -0,0 +1 @@
1
+ hello
@@ -0,0 +1,40 @@
1
+ # Error handling
2
+
3
+ !!! danger "Only ActiveJob error handling is supported"
4
+
5
+ Wayfarer exclusively supports ActiveJob's error handling. You cannot use
6
+ message queue-specific error handling, for example error handling with
7
+ `sidekiq_options` is unsupported. Otherwise batches get garbage-collected
8
+ too early as Wayfarer instruments ActiveJob.
9
+
10
+ Wayfarer relies on ActiveJob's [error handling methods](https://guides.rubyonrails.org/active_job_basics.html#exceptions):
11
+
12
+ * `retry_on` to retry jobs a number of times on certain errors:
13
+
14
+ ```ruby
15
+ class DummyJob < Wayfarer::Base
16
+ retry_on MyError, attempts: 3 do |job, error|
17
+ # This block runs once all 3 attempts have failed
18
+ # (1 initial attempt + 2 retries)
19
+ end
20
+ end
21
+ ```
22
+
23
+ * `discard_on` to throw away jobs on certain errors:
24
+
25
+ ```ruby
26
+ class DummyJob < Wayfarer::Base
27
+ discard_on MyError do |job, error|
28
+ # This block runs once and buries the job
29
+ end
30
+ end
31
+ ```
32
+
33
+ ## Recreating user agents on certain errors
34
+
35
+ You can configure a list of exception classes upon which user agents
36
+ get recreated (see [User agent API]()):
37
+
38
+ ```ruby
39
+ Wayfarer.config[:network][:renew_on] = [MyIrrecoverableError]
40
+ ```
data/docs/guides/jobs.md CHANGED
@@ -1,78 +1,124 @@
1
1
  # Jobs
2
2
 
3
- Jobs are Ruby classes that process [tasks](/guides/tasks) and look as follows:
3
+ Jobs are [Active Jobs](https://edgeguides.rubyonrails.org/active_job_basics.html)
4
+ that use a DSL included from the `Wayfarer::Base` module to process [tasks](/guides/tasks)
5
+ that they read from a message queue.
6
+ Instead of implementing Active Job's `#perform` method yourself, you declare routes
7
+ to instance methods, similar to how web applications route incoming requests.
8
+ Only URLs that match a [route](../routing) are requested or navigated to.
9
+ The action method has access to the retrieved [page](../pages),
10
+ the [user agent](../user-agents) that retrieved the page and the current task:
4
11
 
5
12
  ```ruby
6
- class DummyJob < Wayfarer::Base
7
- route { to :index }
13
+ class DummyJob < ActiveJob::Base
14
+ include Wayfarer::Base
15
+
16
+ route.to :index
8
17
 
9
18
  def index
19
+ task # => #<Wayfarer::Task>
20
+ page # => #<Wayfarer::Page>
21
+ user_agent # => Browser or HTTP client
10
22
  end
11
23
  end
12
24
  ```
13
25
 
14
- Here is how to enqueue a task for a URL:
26
+ You can start a crawl by appending a task to the message queue for the URL with
27
+ `::crawl`. By default, a UUID is generated as the batch:
15
28
 
16
29
  ```ruby
17
- DummyJob.crawl("https://example.com")
30
+ task = DummyJob.crawl("https://example.com")
31
+ # => #<Wayfarer::Task url="https://example.com", batch="498a13e0-...">
18
32
  ```
19
33
 
20
- This is the same as calling the Active Job API directly and passing a task
21
- and a random batch:
34
+ This is exactly the same as calling Active Job's `#perform_later` and passing a
35
+ task directly:
22
36
 
23
37
  ```ruby
24
38
  task = Wayfarer::Task.new("https://example.com", SecureRandom.uuid)
25
39
  DummyJob.perform_later(task)
26
40
  ```
27
41
 
28
- A batch can be specified with `::crawl`, too:
42
+ Instead of a generated UUID, you can also set your own batch:
29
43
 
30
44
  ```ruby
31
45
  DummyJob.crawl("https://example.com", batch: "my-batch")
32
46
  ```
33
47
 
34
- ## Current task
48
+ You can also use Wayfarer's [CLI](../cli) to enqueue a task:
49
+
50
+ ```sh
51
+ wayfarer enqueue --batch my-batch DummyJob "https://example.com"
52
+ ```
53
+
54
+ ## Navigating crawls
55
+
56
+ Jobs navigate crawls by staging URLs with `#!ruby stage(urls)`. When you stage a URL, a normalized
57
+ version of it is appended to an internal set. Once the action returns, all URLs
58
+ in the set are appended as tasks to the message queue.
59
+
60
+ ```ruby
61
+ class DummyJob < ActiveJob::Base
62
+ include Wayfarer::Base
63
+
64
+ route.to :index
65
+
66
+ def index
67
+ # Follow all out-going links of the page
68
+ stage page.meta.links.external
69
+ end
70
+ end
71
+ ```
72
+
73
+ ## Accessing the current task
35
74
 
36
- Jobs consume [tasks](../tasks) from a message queue. The currently processed
37
- task is accessible like so:
75
+ If the task's URL matched a [route](../routing), the URL is retrieved over the network,
76
+ and the method that was routed to is called. The task is available as `#task`:
38
77
 
39
78
  ```ruby
40
- class DummyJob < Wayfarer::Base
41
- route { to :index }
79
+ class DummyJob < ActiveJob::Base
80
+ include Wayfarer::Base
81
+
82
+ route.to :index
42
83
 
43
84
  def index
44
- task.url # => "https://example.com"
85
+ task.url # => "https://example.com"
45
86
  task.batch # => "my-batch"
46
87
  end
47
88
  end
48
89
  ```
49
90
 
50
- ## Current page
91
+ ## Accessing the current page
51
92
 
52
- A task's URL contents get fetched into a [page](../pages) object if the task URL
53
- matched a route:
93
+ You have access to the retrieved [page](../pages):
54
94
 
55
95
  ```ruby
56
- class DummyJob < Wayfarer::Base
57
- route { to :index, host: "example.com" }
96
+ class DummyJob < ActiveJob::Base
97
+ include Wayfarer::Base
98
+
99
+ route.to :index
58
100
 
59
101
  def index
60
102
  page.url # => "https://example.com"
61
103
  page.body # => "<html>..."
62
104
  page.status_code # => 200
63
105
  page.headers # { "Content-Type" => ... }
106
+ page.doc # Only present for certain Content-Types
64
107
  end
65
108
  end
66
109
  ```
67
110
 
68
- ## URL parameters
111
+ ## Routing URLs to methods and extracting `params`
69
112
 
70
- Jobs can extract data from URLs with their router:
113
+ Jobs have a routing DSL that allows you to map URLs to methods and extract
114
+ URL data:
71
115
 
72
116
  ```ruby
73
- class DummyJob < Wayfarer::Base
117
+ class DummyJob < ActiveJob::Base
118
+ include Wayfarer::Base
119
+
74
120
  route do
75
- path "/users/:id/profile"
121
+ path "/users/:id/profile", to: :index
76
122
  end
77
123
 
78
124
  def index
@@ -80,22 +126,44 @@ class DummyJob < Wayfarer::Base
80
126
  end
81
127
  end
82
128
 
83
- DummyJob.crawl("https://example.com/users/42/profile")
129
+ DummyJob.crawl("https://example.com/users/42/profile?foo=bar")
84
130
  ```
85
131
 
132
+ ## Controlling the user agent
86
133
 
87
- ## User agent
88
-
89
- The HTTP client or automated browser that fetched the URL is available:
134
+ You can control the browser or HTTP client that retrieved the page:
90
135
 
91
136
  ```ruby
92
- Wayfarer.config.network.agent = :ferrum # Chrome DevTools Protocol
137
+ Wayfarer.config[:network][:agent] = :ferrum # Chrome DevTools Protocol
93
138
 
94
- class DummyJob < Wayfarer::Base
95
- route { to :index }
139
+ class DummyJob < ActiveJob::Base
140
+ include Wayfarer::Base
141
+
142
+ route.to :index
96
143
 
97
144
  def index
98
- browser.save_screenshot("capture.png")
145
+ user_agent.save_screenshot("capture.png")
99
146
  end
100
147
  end
101
148
  ```
149
+
150
+ ## Restricting the processed Content-Types
151
+
152
+ By default, jobs process pages regardless of their Content-Type response
153
+ header. You can allow a list of Content-Types as strings and Regexps and
154
+ opt out of the default behaviour. Once at least one Content-Type is allowed,
155
+ other Content-Types don't get processed:
156
+
157
+ ```ruby
158
+ class DummyJob < ActiveJob::Base
159
+ include Wayfarer::Base
160
+
161
+ content_type "text/html", "application/json"
162
+ content_type /xml/
163
+ end
164
+ ```
165
+
166
+ !!! info "HTTP parameters in Content-Types are ignored for comparison"
167
+
168
+ Content-Types are compared regardless of their parameters. For example,
169
+ `text/html; charset=UTF-8` is considered the same as `text/html`.
@@ -13,7 +13,7 @@ set first.
13
13
 
14
14
  ```ruby
15
15
  class DummyJob < Wayfarer::Base
16
- route { to :index }
16
+ route.to :index
17
17
 
18
18
  def index
19
19
  stage page.meta.links.all