wayfarer 0.4.6 → 0.4.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +28 -1
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +1 -1
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -53
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -43
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -29
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -0,0 +1,60 @@
1
+ # Tutorial
2
+
3
+ Wayfarer is a web crawling framework written in Ruby.
4
+ It works with plain HTTP or by automating web browsers and is deployed with
5
+ Redis and a message queue (which can be Redis-based itself).
6
+ In development, it can execute fully in memory, without Redis.
7
+
8
+ You need a compatible version of Ruby installed.
9
+
10
+ To get started, in an empty directory, generate a new `Gemfile` and install
11
+ ActiveJob and Wayfarer:
12
+
13
+ ```sh
14
+ bundle init
15
+ bundle add activejob wayfarer
16
+ bundle install
17
+ ```
18
+
19
+ ## Jobs, tasks and batches
20
+
21
+ Wayfarer builds on Active Job, the message queue abstraction of Rails.
22
+ You can use Wayfarer without Rails of course, as we do here.
23
+
24
+ A message queue supports two operations: appending messages to the end and consuming
25
+ messages from the front. In the case of Wayfarer, messages are tasks, a string pair
26
+ consisting of a URL and a batch. When a task is consumed, it is processed by a job,
27
+ a Ruby class.
28
+
29
+ Let's give ourselves a `dummy_job.rb` that routes arbitrary URLs to its
30
+ `index` instance method, where we print the current `task`:
31
+
32
+ ```ruby
33
+ require "active_job"
34
+ require "wayfarer"
35
+
36
+ class DummyJob < ActiveJob::Base
37
+ include Wayfarer::Base
38
+
39
+ route.to :index
40
+
41
+ def index
42
+ puts task
43
+ end
44
+ end
45
+ ```
46
+
47
+ We can perform our job from the command line with Wayfarer's CLI and find
48
+ that in between ActiveJob's log output, our task was printed with a generated
49
+ UUID for its batch:
50
+
51
+ ```hl_lines="1 3"
52
+ bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
53
+ [ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
54
+ #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
55
+ [ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-...) from Async(default) in 507.65ms
56
+ ```
57
+
58
+ Many commands accept a `--batch` flag for setting the batch. If you don't
59
+ provide one, a UUID is generated.
60
+
@@ -0,0 +1,113 @@
1
+ # User agents
2
+
3
+ User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
4
+ [page](../pages). They are kept in a connection pool and all user agents in the pool
5
+ share the same type and configuration. You can add custom user agents by implementing
6
+ the [user agent API](custom_user_agents.md).
7
+
8
+ Wayfarer comes with the following built-in user agents:
9
+
10
+ * [`#!ruby :http`](http.md) (default)
11
+ * [`#!ruby :ferrum`](ferrum.md) to automate Google Chrome
12
+ * [`#!ruby :selenium`](selenium.md) to automate a variety of browsers
13
+ * [`#!ruby :capybara`](capybara.md) to use Capybara sessions
14
+
15
+ Configure the user agent with the global configuration option:
16
+
17
+ ```ruby
18
+ Wayfarer.config[:network][:agent] = :ferrum # or :selenium, :capybara, ...
19
+ ```
20
+
21
+ You can access the user agent that was checked out from the pool with
22
+ `#user_agent` in action methods:
23
+
24
+ ```ruby
25
+ class DummyJob < ActiveJob::Base
26
+ include Wayfarer::Base
27
+
28
+ route.to :index
29
+
30
+ def index
31
+ user_agent # => #<Ferrum::Browser ...>
32
+ end
33
+ end
34
+ ```
35
+
36
+ You can also implement [custom user agents](custom_user_agents.md) to support
37
+ your own HTTP client or browser automation service/protocol.
38
+
39
+ ### Ad-hoc HTTP requests
40
+
41
+ Regardless of the configured user agent, you can always make ad-hoc HTTP GET requests
42
+ that return pages with `#fetch(url)`:
43
+
44
+ ```ruby
45
+ class DummyJob < ActiveJob::Base
46
+ include Wayfarer::Base
47
+
48
+ route.to :index
49
+
50
+ def index
51
+ page = fetch("https://example.com") # => #<Wayfarer::Page ...>
52
+ end
53
+ end
54
+ ```
55
+
56
+ !!! info "`#fetch` uses the configured `Wayfarer.config[:network][:http_headers]`."
57
+
58
+ ## HTTP request headers
59
+
60
+ You can set HTTP request headers for all built-in user agents:
61
+
62
+ ```ruby
63
+ Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler" }
64
+ ```
65
+
66
+ !!! attention "Selenium does not support configuring HTTP request headers."
67
+
68
+ ## Connection pooling
69
+
70
+ Since user agents are expensive to create, especially in the case of browser
71
+ processes, Wayfarer keeps user agents within a connection pool. When a job
72
+ performs and needs to retrieve the [page](../pages) for its task URL, an agent
73
+ is checked out from the pool, and checked back in when the routed action method
74
+ returns.
75
+
76
+ The pool size is constant and it should equal the number of threads the
77
+ underlying message queue operates with. For example, if you use Sidekiq,
78
+ you should set the pool size to the number of Sidekiq threads:
79
+
80
+ ```ruby
81
+ Wayfarer.config[:network][:pool_size] = Sidekiq.options[:concurrency]
82
+ ```
83
+
84
+ !!! attention "The connection pool size is 1 by default"
85
+
86
+ Since there is no reliable way to detect the number of threads that
87
+ the underlying message queue operates with, Wayfarer defaults to a pool
88
+ size of 1, which creates a bottleneck in a concurrent environment.
89
+
90
+ !!! attention "Browser sessions are shared across jobs"
91
+
92
+ The same browser session is used across jobs. This means that the browser
93
+ is not closed between jobs, and that the browser's state carries over from
94
+ job to job. You may account for this by resetting the browser's state
95
+ according to your needs, for which you can use [callbacks](../callbacks).
96
+
97
+ ### `UserAgentTimeoutError`: avoiding pool contention
98
+
99
+ If you encounter `UserAgentTimeoutError` exceptions, a job has waited for a
100
+ user agent to become available for too long. By default, this timeout is 10
101
+ seconds. This is a sign that the pool size is too small for the message queue's
102
+ concurrency.
103
+
104
+ ```
105
+ #<Wayfarer::UserAgentTimeoutError: Waited 10 sec, 0/1 available>
106
+ ```
107
+
108
+ You can configure the timeout, although you will likely want to increase the
109
+ pool size instead:
110
+
111
+ ```ruby
112
+ Wayfarer.config[:network][:pool_timeout] = 10 # seconds
113
+ ```
data/docs/index.md CHANGED
@@ -1,56 +1,33 @@
1
1
  ---
2
2
  hide:
3
3
  - navigation
4
+ - toc
4
5
  ---
5
6
 
6
7
  # Wayfarer
7
8
 
8
- ![CI status](https://github.com/actions/starter-workflows/workflows/CI/badge.svg)
9
- [![RubyGem](https://badge.fury.io/rb/wayfarer.svg)](https://rubygems.org/gems/wayfarer)
9
+ ## Ruby web crawling framework built on [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) and [Redis](https://redis.io)
10
10
 
11
- ## Versatile web crawling with Ruby
11
+ <small>
12
+ [Read the tutorial](/guides/tutorial){ .md-button .md-button--primary }
13
+ </small>
12
14
 
13
- * Web scraping
14
- * Data extraction
15
- * Browser automation
15
+ === "Command line"
16
16
 
17
- !!! attention "Unstable software"
17
+ ```sh
18
+ gem install wayfarer
19
+ ```
18
20
 
19
- Wayfarer is under development and releases should be considered unstable.
21
+ === "Gemfile"
20
22
 
21
- Wayfarer complies to
22
- [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) in
23
- which v0.x means that there could be backward-incompatible changes for every
24
- release:
23
+ ```ruby
24
+ gem "wayfarer"
25
+ ```
25
26
 
26
- >Major version zero (0.y.z) is for initial development. Anything MAY change
27
- at any time. The public API SHOULD NOT be considered stable.
28
-
29
- ### Installation
30
-
31
- Install the RubyGem:
32
-
33
- ```
34
- gem install wayfarer
35
- ```
36
-
37
- Or add it to Bundler's Gemfile:
38
-
39
- ```ruby
40
- gem "wayfarer"
41
- ```
42
-
43
- ### Features
44
-
45
- * Breadth-first, acyclic, multi-threaded graph traversal
46
- * Executes atop a variety of message queues thanks to [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html)
47
- * Browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
27
+ * Breadth-first, acyclic page traversal
28
+ * Plain HTTP and browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
48
29
  (<abbr title="Chrome DevTools Protocol">CDP</abbr>),
49
- [Selenium](https://www.selenium.dev) or plain HTTP via `net/http`
30
+ [Selenium](https://www.selenium.dev) and custom user agents
50
31
  * Declarative routing DSL
51
32
  * URI normalization and deduplication
52
- * XML, HTML, JSON parsing
53
- * HTTP redirect handling
54
- * Storage-agnostic
55
- * Small footprint: <500 LoC
56
- * Open Source (MIT)
33
+ * HTML, XML, JSON and custom Content-Type body parsing
@@ -1,46 +1,46 @@
1
- # Command Line Interface
1
+ # wayfarer
2
+
3
+ The command-line interface to Wayfarer.
2
4
 
3
5
  ## Usage
4
6
 
5
7
  ```
6
- wayfarer [OPTIONS] [generate|job|route|version]
8
+ wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
7
9
  ```
8
10
 
9
- All [environment variables](../environment_variables) are respected.
10
-
11
- ## `wayfarer generate`
12
-
13
- ### `wayfarer generate project NAME`
11
+ See [Configuration](../configuration) for the respected environment variables.
14
12
 
15
- : Generates a new project directory `NAME`.
13
+ ---
16
14
 
17
- ## `wayfarer job`
15
+ ## `wayfarer perform JOB URL`
18
16
 
19
- ### `wayfarer job perform JOB URL`
20
-
21
- : Performs `JOB` with `URL`. The job does not reach any Active Job backend.
22
- Staged jobs will not be processed.
17
+ : Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
18
+ Staged jobs are ignored.
23
19
 
24
20
  ##### Options
25
21
 
26
22
  * `--mock-redis`: Use an in-memory implementation of Redis instead of
27
23
  talking to an actual server.
28
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
24
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
25
+
26
+ ---
29
27
 
30
- ### `wayfarer job enqueue JOB URL`
28
+ ## `wayfarer enqueue JOB URL`
31
29
 
32
- : Enqueues `JOB` with `URL` to the configured Active Job backend.
30
+ : Enqueues a task for `JOB` with `URL` to the message queue.
33
31
 
34
32
  ##### Options
35
33
 
36
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
34
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
35
+
36
+ ---
37
37
 
38
- ### `wayfarer job execute JOB URL`
38
+ ## `wayfarer execute JOB URL`
39
39
 
40
- : Execute `JOB` with `URL` by using the
41
- [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html).
42
- The job does not reach any other Active Job backend. Blocks until the batch
43
- completes.
40
+ : Execute `JOB` with `URL` with the in-memory
41
+ [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
42
+ instead of writing the task to an actual message queue. Blocks until the
43
+ batch has completed.
44
44
 
45
45
  ##### Options
46
46
 
@@ -50,12 +50,22 @@ All [environment variables](../environment_variables) are respected.
50
50
  * `--min-threads`: Minimum number of threads to use. Default: 1
51
51
  * `--max-threads`: Maximum number of threads to use. Default: 1
52
52
 
53
- ## `wayfarer route`
53
+ !!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
54
+
55
+ You need to set the `wait: 0` option on `retry_on` in order for
56
+ `wayfarer execute` to execute retries:
54
57
 
55
- ### `wayfarer route result JOB URL`
58
+ ```ruby
59
+ retry_on StandardError, attempts: 3, wait: 0
60
+ ```
61
+ ---
62
+
63
+ ## `wayfarer route JOB URL`
56
64
 
57
65
  : Prints the result of invoking `JOB`'s router with `URL`.
58
66
 
59
- ### `wayfarer route tree JOB URL`
67
+ ---
68
+
69
+ ## `wayfarer tree JOB URL`
60
70
 
61
71
  : Visualises the routing tree result of invoking `JOB`'s router with `URL`.
@@ -0,0 +1,36 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+
6
+ # Configuration
7
+
8
+ You can configure Wayfarer by assigning to the `Wayfarer.config` Hash
9
+ which has the following defaults:
10
+
11
+ ```ruby
12
+ {
13
+ redis: {
14
+ url: "redis://localhost:6379/0",
15
+ factory: ->(redis) { ::Redis.new(url: redis[:url]) }
16
+ },
17
+ network: {
18
+ agent: :http,
19
+ pool_size: 1,
20
+ pool_timeout: 10,
21
+ http_headers: {},
22
+ renew_on: []
23
+ },
24
+ capybara: {
25
+ driver: nil
26
+ },
27
+ ferrum: {
28
+ options: {}
29
+ },
30
+ selenium: {
31
+ driver: :chrome,
32
+ options: {},
33
+ client_timeout: 60
34
+ }
35
+ }
36
+ ```
data/lib/wayfarer/base.rb CHANGED
@@ -1,60 +1,138 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Base < ActiveJob::Base
5
- include Wayfarer::Middleware::Controller
6
-
7
- use Wayfarer::Middleware::Stage
8
- use Wayfarer::Middleware::Dedup
9
- use Wayfarer::Middleware::Normalize
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Fetch
12
- use Wayfarer::Middleware::Dispatch
13
-
14
- ErrorHandler = lambda do |&block|
15
- lambda do |job, error|
16
- task = job.arguments.first
17
- task.barrier.seen?(task.url)
18
- task.gc.run
19
- block.call(job, error)
20
- end
21
- end
4
+ # @!attribute [r] task
5
+ # @return [Wayfarer::Task] the current task
6
+ # @!attribute [r] uri
7
+ # @return [Addressable::URI] Parsed task URL
8
+ # @!attribute [r] user_agent
9
+ # @return [Object] the user agent that retrieved the page
10
+ # @!attribute [r] action
11
+ # @return [Symbol, Object] action that the task URL was routed to.
12
+ # @!attribute [r] params
13
+ # @return [HashWithIndifferentAccess] path parameters collected from routes
14
+ module Base
15
+ extend ActiveSupport::Concern
16
+ # @!method stage(urls)
17
+ # Adds URLs to an internal staging set so that they get enqueued
18
+ # eventually, once the job executed successfully.
19
+ # @overload stage(urls)
20
+ # @param urls [Array<String>] URLs to add to the staging set.
21
+ # @overload stage(url)
22
+ # @param url [String] URL to add to the staging set.
22
23
 
23
- after_enqueue do |job|
24
- task = job.arguments.first
25
- task.counter.increment
26
- end
24
+ # @!method fetch(url, follow: 3)
25
+ # @param url [String] URL to fetch using plain HTTP(S).
26
+ # @param follow [Integer] Number of redirects to follow.
27
+ # Retrieves the given URL to a {Page}.
27
28
 
28
- after_perform do |job|
29
- task = job.arguments.first
30
- task.gc.run
31
- end
29
+ # @!method page(live: false)
30
+ # @param live [Boolean] whether to retrieve a new {Page}.
31
+ # @return [Wayfarer::Page]
32
+ # Returns the most recently retrieved page or a new page
33
+ # for the current task URL if the `live` keyword is passed.
32
34
 
33
- rescue_from(StandardError) do
34
- task = arguments.first
35
- task.gc.run
36
- end
35
+ # @!scope class
37
36
 
38
- def self.retry_on(*argv, &block)
39
- super(*argv, &ErrorHandler.call(&block))
40
- end
37
+ # @!attribute [r] route
38
+ # @return [Wayfarer::Routing::DSL]
39
+ # The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
40
+ # or to a {Handler}.
41
+ # @example Append a host route
42
+ # route.host "example.com", to: :index
41
43
 
42
- def self.discard_on(*argv, &block)
43
- super(*argv, &ErrorHandler.call(&block))
44
- end
44
+ # @!method content_types(*content_types)
45
+ # @param content_types [*Array<String, Regexp>] Content-Types to whitelist
46
+ # Whitelists Content-Types. Once at least one Content-Type is set, only
47
+ # those Content-Types will be processed.
45
48
 
46
- def self.crawl(url, batch: SecureRandom.uuid)
47
- Task.new(url, batch).tap do |task|
48
- perform_later(task)
49
- end
50
- end
49
+ # @!group Callbacks
50
+
51
+ # @!method before_fetch
52
+ # @overload before_fetch(callback)
53
+ # @param callback [Symbol] Instance method to call
54
+ # @overload before_fetch(&block)
55
+ # @yield [Wayfarer::Task]
56
+ # Registers a callback that is called before the page is fetched.
57
+ # If a symbol is passed, an instance method with the same name will be
58
+ # called.
59
+ # @example Accessing the user agent in {#before_fetch}
60
+ # before_fetch do |task|
61
+ # user_agent # => the user agent that will fetch the page
62
+ # end
51
63
 
52
- def retry_job(...)
53
- super(...) # increments the counter by re-enqueuing the job
54
- task = arguments.first
55
- task.counter.decrement
64
+ # @!method around_fetch
65
+ # @overload around_fetch(callback)
66
+ # @param callback [Symbol] Instance method to call
67
+ # @overload around_fetch(&block)
68
+ # @yield [Wayfarer::Task]
69
+ # Registers a callback that is called around the page getting fetched.
70
+ # If a symbol is passed, an instance method with the same name will be
71
+ # called.
72
+
73
+ # @!method after_fetch
74
+ # @overload after_fetch(callback)
75
+ # @param callback [Symbol] Instance method to call
76
+ # @overload after_fetch(&block)
77
+ # @yield [Wayfarer::Task]
78
+ # Registers a callback that is called after the page was fetched.
79
+ # If a symbol is passed, an instance method with the same name will be
80
+ # called.
81
+
82
+ # @!method before_perform
83
+ # @overload before_perform(callback)
84
+ # @param callback [Symbol] Instance method to call
85
+ # @overload before_perform(&block)
86
+ # @yield [Wayfarer::Task]
87
+ # Registers a callback that is called before the task is performed.
88
+ # If a symbol is passed, an instance method with the same name will be
89
+ # called.
90
+
91
+ # @!method around_perform
92
+ # @overload around_perform(callback)
93
+ # @param callback [Symbol] Instance method to call
94
+ # @overload around_perform(&block)
95
+ # @yield [Wayfarer::Task]
96
+ # Registers a callback that is called around the task getting performed.
97
+ # If a symbol is passed, an instance method with the same name will be
98
+ # called.
99
+
100
+ # @!method after_perform
101
+ # @overload after_perform(callback)
102
+ # @param callback [Symbol] Instance method to call
103
+ # @overload after_perform(&block)
104
+ # @yield [Wayfarer::Task]
105
+ # Registers a callback that is called after the task was performed.
106
+ # If a symbol is passed, an instance method with the same name will be
107
+ # called.
108
+
109
+ # @!endgroup
110
+
111
+ included do
112
+ include Wayfarer::Middleware::Controller
113
+
114
+ # Implement ActiveJob's #perform by calling into our own middleware chain
115
+ alias_method :perform, :call
116
+
117
+ # Middleware stack
118
+ use Wayfarer::Middleware::Redis
119
+ use Wayfarer::Middleware::BatchCompletion
120
+ use Wayfarer::Middleware::UriParser
121
+ use Wayfarer::Middleware::Normalize
122
+ use Wayfarer::Middleware::Dedup
123
+ use Wayfarer::Middleware::Stage
124
+ use Wayfarer::Middleware::Router
125
+ use Wayfarer::Middleware::UserAgent
126
+ use Wayfarer::Middleware::ContentType
127
+ use Wayfarer::Middleware::Dispatch
56
128
  end
57
129
 
58
- alias perform call
130
+ class_methods do
131
+ def crawl(url, batch: SecureRandom.uuid)
132
+ Task.new(url, batch).tap do |task|
133
+ perform_later(task)
134
+ end
135
+ end
136
+ end
59
137
  end
60
138
  end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # BatchCompletion tracks the completion of a batch of jobs.
5
+ # It does so by incrementing and decrementing a counter in Redis.
6
+ #
7
+ # The counter is incremented when a job is first enqueued and decremented when
8
+ # a job is performed. If a job is retried, the counter is not incremented.
9
+ # When a job succeeds or fails and thereby exceeds its retry count, the counter
10
+ # is decremented.
11
+ #
12
+ # When the counter reaches zero, garbage collection deletes the Redis keys
13
+ # associated with the batch.
14
+ module BatchCompletion
15
+ module_function
16
+
17
+ def subscribe!
18
+ ActiveSupport::Notifications.subscribe("enqueue.active_job", self)
19
+ ActiveSupport::Notifications.subscribe("perform.active_job", self)
20
+ ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
21
+ end
22
+
23
+ def call(name, _, _, _, data)
24
+ return unless (job = data[:job]).is_a?(Wayfarer::Base)
25
+
26
+ task = job.arguments.first
27
+
28
+ # In the case of `enqueue.active_job` middleware hasn't executed yet
29
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance # TODO: Test
30
+
31
+ counter = Redis::Counter.new(task) do
32
+ job.run_callbacks(:batch)
33
+ ensure
34
+ Wayfarer::GC.run(task)
35
+ end
36
+
37
+ handle(name, job, task, counter)
38
+ end
39
+
40
+ def handle(name, job, task, counter)
41
+ case name
42
+ when "enqueue.active_job" then counter.increment unless retry?(job)
43
+ when "perform.active_job" then counter.decrement if succeeded?(job, task)
44
+ when "retry_stopped.active_job" then counter.decrement
45
+ end
46
+ end
47
+
48
+ def retry?(job)
49
+ job.executions > 0
50
+ end
51
+
52
+ def succeeded?(job, task)
53
+ job.exception_executions == task[:initial_exception_executions]
54
+ end
55
+ end
56
+ end