wayfarer 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/lint.yaml +25 -0
  3. data/.github/workflows/release.yaml +29 -0
  4. data/.github/workflows/tests.yaml +30 -0
  5. data/.gitignore +4 -0
  6. data/.rubocop.yml +5 -0
  7. data/.vale.ini +5 -0
  8. data/.yardopts +1 -3
  9. data/Dockerfile +5 -4
  10. data/Gemfile +3 -0
  11. data/Gemfile.lock +107 -102
  12. data/Rakefile +5 -56
  13. data/bin/wayfarer +1 -1
  14. data/docker-compose.yml +20 -9
  15. data/docs/cookbook/consent_screen.md +2 -2
  16. data/docs/cookbook/executing_javascript.md +3 -3
  17. data/docs/cookbook/navigation.md +12 -12
  18. data/docs/cookbook/querying_html.md +3 -3
  19. data/docs/cookbook/screenshots.md +2 -2
  20. data/docs/cookbook/user_agent.md +1 -1
  21. data/docs/design.md +36 -0
  22. data/docs/guides/callbacks.md +24 -126
  23. data/docs/guides/configuration.md +8 -8
  24. data/docs/guides/handlers.md +60 -0
  25. data/docs/guides/index.md +1 -0
  26. data/docs/guides/jobs/error_handling.md +40 -0
  27. data/docs/guides/jobs.md +99 -31
  28. data/docs/guides/navigation.md +1 -1
  29. data/docs/guides/networking/capybara.md +13 -22
  30. data/docs/guides/networking/custom_adapters.md +82 -41
  31. data/docs/guides/networking/ferrum.md +4 -4
  32. data/docs/guides/networking/http.md +9 -13
  33. data/docs/guides/networking/selenium.md +10 -11
  34. data/docs/guides/pages.md +76 -10
  35. data/docs/guides/redis.md +10 -0
  36. data/docs/guides/routing.md +74 -0
  37. data/docs/guides/tasks.md +33 -9
  38. data/docs/guides/tutorial.md +60 -0
  39. data/docs/guides/user_agents.md +113 -0
  40. data/docs/index.md +17 -40
  41. data/docs/reference/cli.md +35 -25
  42. data/docs/reference/configuration.md +36 -0
  43. data/lib/wayfarer/base.rb +124 -46
  44. data/lib/wayfarer/batch_completion.rb +56 -0
  45. data/lib/wayfarer/callbacks.rb +22 -48
  46. data/lib/wayfarer/cli/route_printer.rb +71 -57
  47. data/lib/wayfarer/cli.rb +121 -0
  48. data/lib/wayfarer/gc.rb +13 -6
  49. data/lib/wayfarer/handler.rb +15 -7
  50. data/lib/wayfarer/logging.rb +38 -0
  51. data/lib/wayfarer/middleware/base.rb +2 -0
  52. data/lib/wayfarer/middleware/batch_completion.rb +19 -0
  53. data/lib/wayfarer/middleware/content_type.rb +54 -0
  54. data/lib/wayfarer/middleware/controller.rb +19 -15
  55. data/lib/wayfarer/middleware/dedup.rb +16 -13
  56. data/lib/wayfarer/middleware/dispatch.rb +12 -4
  57. data/lib/wayfarer/middleware/normalize.rb +12 -11
  58. data/lib/wayfarer/middleware/redis.rb +15 -0
  59. data/lib/wayfarer/middleware/router.rb +33 -35
  60. data/lib/wayfarer/middleware/stage.rb +5 -5
  61. data/lib/wayfarer/middleware/uri_parser.rb +30 -0
  62. data/lib/wayfarer/middleware/user_agent.rb +49 -0
  63. data/lib/wayfarer/networking/capybara.rb +1 -1
  64. data/lib/wayfarer/networking/context.rb +2 -2
  65. data/lib/wayfarer/networking/ferrum.rb +2 -2
  66. data/lib/wayfarer/networking/follow.rb +12 -6
  67. data/lib/wayfarer/networking/http.rb +1 -1
  68. data/lib/wayfarer/networking/pool.rb +17 -12
  69. data/lib/wayfarer/networking/selenium.rb +3 -3
  70. data/lib/wayfarer/networking/strategy.rb +2 -2
  71. data/lib/wayfarer/page.rb +36 -14
  72. data/lib/wayfarer/parsing/xml.rb +6 -6
  73. data/lib/wayfarer/parsing.rb +24 -0
  74. data/lib/wayfarer/redis/barrier.rb +13 -21
  75. data/lib/wayfarer/redis/counter.rb +19 -9
  76. data/lib/wayfarer/redis/pool.rb +1 -1
  77. data/lib/wayfarer/redis/resettable.rb +19 -0
  78. data/lib/wayfarer/routing/dsl.rb +1 -0
  79. data/lib/wayfarer/routing/matchers/path.rb +4 -2
  80. data/lib/wayfarer/routing/root_route.rb +5 -1
  81. data/lib/wayfarer/routing/route.rb +4 -14
  82. data/lib/wayfarer/stringify.rb +22 -30
  83. data/lib/wayfarer/task.rb +12 -18
  84. data/lib/wayfarer.rb +29 -2
  85. data/mkdocs.yml +52 -7
  86. data/rake/docs.rake +26 -0
  87. data/rake/lint.rake +105 -0
  88. data/rake/release.rake +29 -0
  89. data/rake/tests.rake +28 -0
  90. data/requirements.txt +1 -1
  91. data/spec/base_spec.rb +140 -160
  92. data/spec/batch_completion_spec.rb +104 -0
  93. data/spec/cli/job_spec.rb +19 -23
  94. data/spec/cli/routing_spec.rb +101 -0
  95. data/spec/cli/version_spec.rb +1 -1
  96. data/spec/factories/task.rb +7 -1
  97. data/spec/fixtures/dummy_job.rb +5 -3
  98. data/spec/gc_spec.rb +8 -50
  99. data/spec/handler_spec.rb +1 -1
  100. data/spec/integration/callbacks_spec.rb +157 -45
  101. data/spec/integration/content_type_spec.rb +145 -0
  102. data/spec/integration/gc_spec.rb +44 -0
  103. data/spec/integration/handler_spec.rb +66 -0
  104. data/spec/integration/page_spec.rb +44 -29
  105. data/spec/integration/params_spec.rb +33 -25
  106. data/spec/integration/parsing_spec.rb +125 -0
  107. data/spec/integration/routing_spec.rb +18 -0
  108. data/spec/integration/stage_spec.rb +27 -20
  109. data/spec/middleware/batch_completion_spec.rb +34 -0
  110. data/spec/middleware/chain_spec.rb +8 -8
  111. data/spec/middleware/content_type_spec.rb +86 -0
  112. data/spec/middleware/controller_spec.rb +5 -5
  113. data/spec/middleware/dedup_spec.rb +38 -55
  114. data/spec/middleware/dispatch_spec.rb +23 -7
  115. data/spec/middleware/normalize_spec.rb +44 -13
  116. data/spec/middleware/router_spec.rb +29 -30
  117. data/spec/middleware/stage_spec.rb +8 -8
  118. data/spec/middleware/uri_parser_spec.rb +53 -0
  119. data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
  120. data/spec/networking/context_spec.rb +17 -0
  121. data/spec/networking/follow_spec.rb +2 -2
  122. data/spec/networking/pool_spec.rb +5 -5
  123. data/spec/networking/strategy.rb +2 -2
  124. data/spec/page_spec.rb +42 -20
  125. data/spec/parsing/xml_spec.rb +11 -12
  126. data/spec/redis/barrier_spec.rb +8 -48
  127. data/spec/redis/counter_spec.rb +13 -1
  128. data/spec/redis/pool_spec.rb +1 -1
  129. data/spec/spec_helpers.rb +27 -16
  130. data/spec/support/test_app.rb +8 -0
  131. data/spec/task_spec.rb +3 -24
  132. data/spec/wayfarer_spec.rb +1 -1
  133. data/wayfarer.gemspec +4 -3
  134. metadata +61 -51
  135. data/.github/workflows/ci.yaml +0 -32
  136. data/docs/guides/error_handling.md +0 -31
  137. data/docs/guides/networking.md +0 -94
  138. data/docs/guides/performance.md +0 -130
  139. data/docs/guides/reliability.md +0 -41
  140. data/docs/guides/routing/steering.md +0 -30
  141. data/docs/reference/api/base.md +0 -48
  142. data/docs/reference/configuration_keys.md +0 -42
  143. data/docs/reference/environment_variables.md +0 -83
  144. data/lib/wayfarer/cli/base.rb +0 -45
  145. data/lib/wayfarer/cli/generate.rb +0 -17
  146. data/lib/wayfarer/cli/job.rb +0 -56
  147. data/lib/wayfarer/cli/route.rb +0 -29
  148. data/lib/wayfarer/cli/runner.rb +0 -34
  149. data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
  150. data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
  151. data/lib/wayfarer/config/capybara.rb +0 -10
  152. data/lib/wayfarer/config/ferrum.rb +0 -11
  153. data/lib/wayfarer/config/networking.rb +0 -26
  154. data/lib/wayfarer/config/redis.rb +0 -14
  155. data/lib/wayfarer/config/root.rb +0 -11
  156. data/lib/wayfarer/config/selenium.rb +0 -21
  157. data/lib/wayfarer/config/strconv.rb +0 -45
  158. data/lib/wayfarer/config/struct.rb +0 -72
  159. data/lib/wayfarer/middleware/fetch.rb +0 -56
  160. data/lib/wayfarer/redis/connection.rb +0 -13
  161. data/lib/wayfarer/redis/version.rb +0 -19
  162. data/lib/wayfarer/routing/router.rb +0 -28
  163. data/spec/callbacks_spec.rb +0 -102
  164. data/spec/cli/generate_spec.rb +0 -39
  165. data/spec/config/capybara_spec.rb +0 -18
  166. data/spec/config/ferrum_spec.rb +0 -24
  167. data/spec/config/networking_spec.rb +0 -73
  168. data/spec/config/redis_spec.rb +0 -32
  169. data/spec/config/root_spec.rb +0 -31
  170. data/spec/config/selenium_spec.rb +0 -56
  171. data/spec/config/strconv_spec.rb +0 -58
  172. data/spec/config/struct_spec.rb +0 -66
  173. data/spec/integration/steering_spec.rb +0 -57
  174. data/spec/redis/version_spec.rb +0 -13
  175. data/spec/routing/router_spec.rb +0 -24
@@ -0,0 +1,60 @@
1
+ # Tutorial
2
+
3
+ Wayfarer is a web crawling framework written in Ruby.
4
+ It works with plain HTTP or by automating web browsers and is deployed with
5
+ Redis and a message queue (which can be Redis-based itself).
6
+ In development, it can execute fully in memory, without Redis.
7
+
8
+ You need a compatible version of Ruby installed.
9
+
10
+ To get started, in an empty directory, generate a new `Gemfile` and install
11
+ ActiveJob and Wayfarer:
12
+
13
+ ```sh
14
+ bundle init
15
+ bundle add activejob wayfarer
16
+ bundle install
17
+ ```
18
+
19
+ ## Jobs, tasks and batches
20
+
21
+ Wayfarer builds on Active Job, the message queue abstraction of Rails.
22
+ You can use Wayfarer without Rails of course, as we do here.
23
+
24
+ A message queue supports two operations: appending messages to the end and consuming
25
+ messages from the front. In the case of Wayfarer, messages are tasks, a string pair
26
+ consisting of a URL and a batch. When a task is consumed, it is processed by a job,
27
+ a Ruby class.
28
+
29
+ Let's give ourselves a `dummy_job.rb` that routes arbitrary URLs to its
30
+ `index` instance method, where we print the current `task`:
31
+
32
+ ```ruby
33
+ require "activejob"
34
+ require "wayfarer"
35
+
36
+ class DummyJob < ActiveJob::Base
37
+ include Wayfarer::Base
38
+
39
+ route.to :index
40
+
41
+ def index
42
+ puts task
43
+ end
44
+ end
45
+ ```
46
+
47
+ We can perform our job from the command line with Wayfarer's CLI and find
48
+ that in between ActiveJob's log output, our task was printed with a generated
49
+ UUID for its batch:
50
+
51
+ ```hl_lines="1 3"
52
+ bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
53
+ [ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
54
+ #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
55
+ [ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-...) from Async(default) in 507.65ms
56
+ ```
57
+
58
+ Many commands accept a `--batch` flag for setting the batch. If you don't
59
+ provide one, a UUID is generated.
60
+
@@ -0,0 +1,113 @@
1
+ # User agents
2
+
3
+ User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
4
+ [page](../pages). They are kept in a connection pool and all user agents in the pool
5
+ share the same type and configuration. You can add custom user agents by implementing
6
+ the [user agent API](custom_user_agents.md).
7
+
8
+ Wayfarer comes with the following built-in user agents:
9
+
10
+ * [`#!ruby :http`](http.md) (default)
11
+ * [`#!ruby :ferrum`](ferrum.md) to automate Google Chrome
12
+ * [`#!ruby :selenium`](selenium.md) to automate a variety of browsers
13
+ * [`#!ruby :capybara`](capybara.md) to use Capybara sessions
14
+
15
+ Configure the user agent with the global configuration option:
16
+
17
+ ```ruby
18
+ Wayfarer.config[:network][:agent] = :ferrum # or :selenium, :capybara, ...
19
+ ```
20
+
21
+ You can access the user agent that was checked out from the pool with
22
+ `#user_agent` in action methods:
23
+
24
+ ```ruby
25
+ class DummyJob < ActiveJob::Base
26
+ include Wayfarer::Base
27
+
28
+ route.to :index
29
+
30
+ def index
31
+ user_agent # => #<Ferrum::Browser ...>
32
+ end
33
+ end
34
+ ```
35
+
36
+ You can also implement [custom user agents](custom_user_agents.md) to support
37
+ your own HTTP client or browser automation service/protocol.
38
+
39
+ ### Ad-hoc HTTP requests
40
+
41
+ Regardless of the configured user agent, you can always make ad-hoc HTTP GET requests
42
+ that return pages with `#fetch(url)`:
43
+
44
+ ```ruby
45
+ class DummyJob < ActiveJob::Base
46
+ include Wayfarer::Base
47
+
48
+ route.to :index
49
+
50
+ def index
51
+ page = fetch("https://example.com") # => #<Wayfarer::Page ...>
52
+ end
53
+ end
54
+ ```
55
+
56
+ !!! info "`#fetch` uses the configured `Wayfarer.config.network.http_headers`."
57
+
58
+ ## HTTP request headers
59
+
60
+ You can set HTTP request headers for all built-in user agents:
61
+
62
+ ```ruby
63
+ Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler" }
64
+ ```
65
+
66
+ !!! attention "Selenium does not support configuring HTTP request headers."
67
+
68
+ ## Connection pooling
69
+
70
+ Since user agents are expensive to create, especially in the case of browser
71
+ processes, Wayfarer keeps user agents within a connection pool. When a job
72
+ performs and needs to retrieve the [page](../pages) for its task URL, an agent
73
+ is checked out from the pool, and checked back in when the routed action method
74
+ returns.
75
+
76
+ The pool size is constant and it should equal the number of threads the
77
+ underlying message queue operates with. For example, if you use Sidekiq,
78
+ you should set the pool size to the number of Sidekiq threads:
79
+
80
+ ```ruby
81
+ Wayfarer.config[:network][:pool_size] = Sidekiq.options[:concurrency]
82
+ ```
83
+
84
+ !!! attention "The connection pool size is 1 by default"
85
+
86
+ Since there is no reliable way to detect the number of threads that
87
+ the underlying message queue operates with, Wayfarer defaults to a pool
88
+ size of 1, which creates a bottleneck in a concurrent environment.
89
+
90
+ !!! attention "Browser sessions are shared across jobs"
91
+
92
+ The same browser session is used across jobs. This means that the browser
93
+ is not closed between jobs, and that the browser's state carries over from
94
+ job to job. You may account for this by resetting the browser's state
95
+ according to your needs, for which you can use [callbacks](../callbacks).
96
+
97
+ ### `UserAgentTimeoutError`: avoiding pool contention
98
+
99
+ If you encounter `UserAgentTimeoutError` exceptions, a job has waited for a
100
+ user agent to become available for too long. By default, this timeout is 10
101
+ seconds. This is a sign that the pool size is too small for the message queue's
102
+ concurrency.
103
+
104
+ ```
105
+ #<Wayfarer::UserAgentTimeoutError: Waited 10 sec, 0/1 available>
106
+ ```
107
+
108
+ You can configure the timeout, although you will likely want to increase the
109
+ pool size instead:
110
+
111
+ ```ruby
112
+ Wayfarer.config[:network][:pool_timeout] = 10 # seconds
113
+ ```
data/docs/index.md CHANGED
@@ -1,56 +1,33 @@
1
1
  ---
2
2
  hide:
3
3
  - navigation
4
+ - toc
4
5
  ---
5
6
 
6
7
  # Wayfarer
7
8
 
8
- ![CI status](https://github.com/actions/starter-workflows/workflows/CI/badge.svg)
9
- [![RubyGem](https://badge.fury.io/rb/wayfarer.svg)](https://rubygems.org/gems/wayfarer)
9
+ ## Ruby web crawling framework built on [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) and [Redis](https://redis.io)
10
10
 
11
- ## Versatile web crawling with Ruby
11
+ <small>
12
+ [Read the tutorial](/guides/tutorial){ .md-button .md-button--primary }
13
+ </small>
12
14
 
13
- * Web scraping
14
- * Data extraction
15
- * Browser automation
15
+ === "Command line"
16
16
 
17
- !!! attention "Unstable software"
17
+ ```sh
18
+ gem install wayfarer
19
+ ```
18
20
 
19
- Wayfarer is under development and releases should be considered unstable.
21
+ === "Gemfile"
20
22
 
21
- Wayfarer complies to
22
- [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) in
23
- which v0.x means that there could be backward-incompatible changes for every
24
- release:
23
+ ```ruby
24
+ gem "wayfarer"
25
+ ```
25
26
 
26
- >Major version zero (0.y.z) is for initial development. Anything MAY change
27
- at any time. The public API SHOULD NOT be considered stable.
28
-
29
- ### Installation
30
-
31
- Install the RubyGem:
32
-
33
- ```
34
- gem install wayfarer
35
- ```
36
-
37
- Or add it to Bundler's Gemfile:
38
-
39
- ```ruby
40
- gem "wayfarer"
41
- ```
42
-
43
- ### Features
44
-
45
- * Breadth-first, acyclic, multi-threaded graph traversal
46
- * Executes atop a variety of message queues thanks to [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html)
47
- * Browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
27
+ * Breadth-first, acyclic page traversal
28
+ * Plain HTTP and browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
48
29
  (<abbr title="Chrome DevTools Protocol">CDP</abbr>),
49
- [Selenium](https://www.selenium.dev) or plain HTTP via `net/http`
30
+ [Selenium](https://www.selenium.dev) and custom user agents
50
31
  * Declarative routing DSL
51
32
  * URI normalization and deduplication
52
- * XML, HTML, JSON parsing
53
- * HTTP redirect handling
54
- * Storage-agnostic
55
- * Small footprint: <500 LoC
56
- * Open Source (MIT)
33
+ * HTML, XML, JSON and custom Content-Type body parsing
@@ -1,46 +1,46 @@
1
- # Command Line Interface
1
+ # wayfarer
2
+
3
+ The command-line interface to Wayfarer.
2
4
 
3
5
  ## Usage
4
6
 
5
7
  ```
6
- wayfarer [OPTIONS] [generate|job|route|version]
8
+ wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
7
9
  ```
8
10
 
9
- All [environment variables](../environment_variables) are respected.
10
-
11
- ## `wayfarer generate`
12
-
13
- ### `wayfarer generate project NAME`
11
+ See [Configuration](../configuration) for the respected environment variables.
14
12
 
15
- : Generates a new project directory `NAME`.
13
+ ---
16
14
 
17
- ## `wayfarer job`
15
+ ## `wayfarer perform JOB URL`
18
16
 
19
- ### `wayfarer job perform JOB URL`
20
-
21
- : Performs `JOB` with `URL`. The job does not reach any Active Job backend.
22
- Staged jobs will not be processed.
17
+ : Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
18
+ Staged jobs are ignored.
23
19
 
24
20
  ##### Options
25
21
 
26
22
  * `--mock-redis`: Use an in-memory implementation of Redis instead of
27
23
  talking to an actual server.
28
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
24
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
25
+
26
+ ---
29
27
 
30
- ### `wayfarer job enqueue JOB URL`
28
+ ## `wayfarer enqueue JOB URL`
31
29
 
32
- : Enqueues `JOB` with `URL` to the configured Active Job backend.
30
+ : Enqueues a task for `JOB` with `URL` to the message queue.
33
31
 
34
32
  ##### Options
35
33
 
36
- * `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
34
+ * `--batch=BATCH`: The job's batch. By default, a UUID is generated.
35
+
36
+ ---
37
37
 
38
- ### `wayfarer job execute JOB URL`
38
+ ## `wayfarer execute JOB URL`
39
39
 
40
- : Execute `JOB` with `URL` by using the
41
- [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html).
42
- The job does not reach any other Active Job backend. Blocks until the batch
43
- completes.
40
+ : Executes `JOB` with `URL` with the in-memory
41
+ [Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
42
+ instead of writing the task to an actual message queue. Blocks until the
43
+ batch has completed.
44
44
 
45
45
  ##### Options
46
46
 
@@ -50,12 +50,22 @@ All [environment variables](../environment_variables) are respected.
50
50
  * `--min-threads`: Minimum number of threads to use. Default: 1
51
51
  * `--max-threads`: Maximum number of threads to use. Default: 1
52
52
 
53
- ## `wayfarer route`
53
+ !!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
54
+
55
+ You need to set the `wait: 0` option on `retry_on` in order for
56
+ `wayfarer execute` to execute retries:
54
57
 
55
- ### `wayfarer route result JOB URL`
58
+ ```ruby
59
+ retry_on StandardError, attempts: 3, wait: 0
60
+ ```
61
+ ---
62
+
63
+ ## `wayfarer route JOB URL`
56
64
 
57
65
  : Prints the result of invoking `JOB`'s router with `URL`.
58
66
 
59
- ### `wayfarer route tree JOB URL`
67
+ ---
68
+
69
+ ## `wayfarer tree JOB URL`
60
70
 
61
71
  : Visualises the routing tree result of invoking `JOB`'s router with `URL`.
@@ -0,0 +1,36 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+
6
+ # Configuration
7
+
8
+ You can configure Wayfarer by assigning to the `Wayfarer.config` Hash
9
+ which has the following defaults:
10
+
11
+ ```ruby
12
+ {
13
+ redis: {
14
+ url: "redis://localhost:6379/0",
15
+ factory: ->(redis) { ::Redis.new(url: redis[:url]) }
16
+ },
17
+ network: {
18
+ agent: :http,
19
+ pool_size: 1,
20
+ pool_timeout: 10,
21
+ http_headers: {},
22
+ renew_on: []
23
+ },
24
+ capybara: {
25
+ driver: nil
26
+ },
27
+ ferrum: {
28
+ options: {}
29
+ },
30
+ selenium: {
31
+ driver: :chrome,
32
+ options: {},
33
+ client_timeout: 60
34
+ }
35
+ }
36
+ ```
data/lib/wayfarer/base.rb CHANGED
@@ -1,60 +1,138 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wayfarer
4
- class Base < ActiveJob::Base
5
- include Wayfarer::Middleware::Controller
6
-
7
- use Wayfarer::Middleware::Stage
8
- use Wayfarer::Middleware::Dedup
9
- use Wayfarer::Middleware::Normalize
10
- use Wayfarer::Middleware::Router
11
- use Wayfarer::Middleware::Fetch
12
- use Wayfarer::Middleware::Dispatch
13
-
14
- ErrorHandler = lambda do |&block|
15
- lambda do |job, error|
16
- task = job.arguments.first
17
- task.barrier.seen?(task.url)
18
- task.gc.run
19
- block.call(job, error)
20
- end
21
- end
4
+ # @!attribute [r] task
5
+ # @return [Wayfarer::Task] the current task
6
+ # @!attribute [r] uri
7
+ # @return [Addressable::URI] Parsed task URL
8
+ # @!attribute [r] user_agent
9
+ # @return [Object] the user agent that retrieved the page
10
+ # @!attribute [r] action
11
+ # @return [Symbol, Object] action that the task URL was routed to.
12
+ # @!attribute [r] params
13
+ # @return [HashWithIndifferentAccess] path parameters collected from routes
14
+ module Base
15
+ extend ActiveSupport::Concern
16
+ # @!method stage(urls)
17
+ # Adds URLs to an internal staging set so that they get enqueued
18
+ # eventually, once the job executed successfully.
19
+ # @overload stage(urls)
20
+ # @param urls [Array<String>] URLs to add to the staging set.
21
+ # @overload stage(url)
22
+ # @param url [String] URL to add to the staging set.
22
23
 
23
- after_enqueue do |job|
24
- task = job.arguments.first
25
- task.counter.increment
26
- end
24
+ # @!method fetch(url, follow: 3)
25
+ # @param url [String] URL to fetch using plain HTTP(S).
26
+ # @param follow [Integer] Number of redirects to follow.
27
+ # Retrieves the given URL to a {Page}.
27
28
 
28
- after_perform do |job|
29
- task = job.arguments.first
30
- task.gc.run
31
- end
29
+ # @!method page(live: false)
30
+ # @param live [Boolean] whether to retrieve a new {Page}.
31
+ # @return [Wayfarer::Page]
32
+ # Returns the most recently retrieved page or a new page
33
+ # for the current task URL if the `live` keyword is passed.
32
34
 
33
- rescue_from(StandardError) do
34
- task = arguments.first
35
- task.gc.run
36
- end
35
+ # @!scope class
37
36
 
38
- def self.retry_on(*argv, &block)
39
- super(*argv, &ErrorHandler.call(&block))
40
- end
37
+ # @!attribute [r] route
38
+ # @return [Wayfarer::Routing::DSL]
39
+ # The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
40
+ # or to a {Handler}.
41
+ # @example Append a host route
42
+ # route.host "example.com", to: :index
41
43
 
42
- def self.discard_on(*argv, &block)
43
- super(*argv, &ErrorHandler.call(&block))
44
- end
44
+ # @!method content_types(*content_types)
45
+ # @param content_types [*Array<String, Regexp>] Content-Types to whitelist
46
+ # Whitelists Content-Types. Once at least one Content-Type is set, only
47
+ # those Content-Types will be processed.
45
48
 
46
- def self.crawl(url, batch: SecureRandom.uuid)
47
- Task.new(url, batch).tap do |task|
48
- perform_later(task)
49
- end
50
- end
49
+ # @!group Callbacks
50
+
51
+ # @!method before_fetch
52
+ # @overload before_fetch(callback)
53
+ # @param callback [Symbol] Instance method to call
54
+ # @overload before_fetch(&block)
55
+ # @yield [Wayfarer::Task]
56
+ # Registers a callback that is called before the page is fetched.
57
+ # If a symbol is passed, an instance method with the same name will be
58
+ # called.
59
+ # @example Accessing the user agent in {#before_fetch}
60
+ # before_fetch do |task|
61
+ # user_agent # => the user agent that will fetch the page
62
+ # end
51
63
 
52
- def retry_job(...)
53
- super(...) # increments the counter by re-enqueuing the job
54
- task = arguments.first
55
- task.counter.decrement
64
+ # @!method around_fetch
65
+ # @overload around_fetch(callback)
66
+ # @param callback [Symbol] Instance method to call
67
+ # @overload around_fetch(&block)
68
+ # @yield [Wayfarer::Task]
69
+ # Registers a callback that is called around the page getting fetched.
70
+ # If a symbol is passed, an instance method with the same name will be
71
+ # called.
72
+
73
+ # @!method after_fetch
74
+ # @overload after_fetch(callback)
75
+ # @param callback [Symbol] Instance method to call
76
+ # @overload after_fetch(&block)
77
+ # @yield [Wayfarer::Task]
78
+ # Registers a callback that is called after the page was fetched.
79
+ # If a symbol is passed, an instance method with the same name will be
80
+ # called.
81
+
82
+ # @!method before_perform
83
+ # @overload before_perform(callback)
84
+ # @param callback [Symbol] Instance method to call
85
+ # @overload before_perform(&block)
86
+ # @yield [Wayfarer::Task]
87
+ # Registers a callback that is called before the task is performed.
88
+ # If a symbol is passed, an instance method with the same name will be
89
+ # called.
90
+
91
+ # @!method around_perform
92
+ # @overload around_perform(callback)
93
+ # @param callback [Symbol] Instance method to call
94
+ # @overload around_perform(&block)
95
+ # @yield [Wayfarer::Task]
96
+ # Registers a callback that is called around the task getting performed.
97
+ # If a symbol is passed, an instance method with the same name will be
98
+ # called.
99
+
100
+ # @!method after_perform
101
+ # @overload after_perform(callback)
102
+ # @param callback [Symbol] Instance method to call
103
+ # @overload after_perform(&block)
104
+ # @yield [Wayfarer::Task]
105
+ # Registers a callback that is called after the task was performed.
106
+ # If a symbol is passed, an instance method with the same name will be
107
+ # called.
108
+
109
+ # @!endgroup
110
+
111
+ included do
112
+ include Wayfarer::Middleware::Controller
113
+
114
+ # Implement ActiveJob's #perform by calling into our own middleware chain
115
+ alias_method :perform, :call
116
+
117
+ # Middleware stack
118
+ use Wayfarer::Middleware::Redis
119
+ use Wayfarer::Middleware::BatchCompletion
120
+ use Wayfarer::Middleware::UriParser
121
+ use Wayfarer::Middleware::Normalize
122
+ use Wayfarer::Middleware::Dedup
123
+ use Wayfarer::Middleware::Stage
124
+ use Wayfarer::Middleware::Router
125
+ use Wayfarer::Middleware::UserAgent
126
+ use Wayfarer::Middleware::ContentType
127
+ use Wayfarer::Middleware::Dispatch
56
128
  end
57
129
 
58
- alias perform call
130
+ class_methods do
131
+ def crawl(url, batch: SecureRandom.uuid)
132
+ Task.new(url, batch).tap do |task|
133
+ perform_later(task)
134
+ end
135
+ end
136
+ end
59
137
  end
60
138
  end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ # BatchCompletion tracks the completion of a batch of jobs.
5
+ # It does so by incrementing and decrementing a counter in Redis.
6
+ #
7
+ # The counter is incremented when a job is first enqueued and decremented when
8
+ # a job is performed. If a job is retried, the counter is not incremented.
9
+ # When a job succeeds or fails and thereby exceeds its retry count, the counter
10
+ # is decremented.
11
+ #
12
+ # When the counter reaches zero, garbage collection deletes the Redis keys
13
+ # associated with the batch.
14
+ module BatchCompletion
15
+ module_function
16
+
17
+ def subscribe!
18
+ ActiveSupport::Notifications.subscribe("enqueue.active_job", self)
19
+ ActiveSupport::Notifications.subscribe("perform.active_job", self)
20
+ ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
21
+ end
22
+
23
+ def call(name, _, _, _, data)
24
+ return unless (job = data[:job]).is_a?(Wayfarer::Base)
25
+
26
+ task = job.arguments.first
27
+
28
+ # In the case of `enqueue.active_job` middleware hasn't executed yet
29
+ task[:redis_pool] ||= Wayfarer::Redis::Pool.instance # TODO: Test
30
+
31
+ counter = Redis::Counter.new(task) do
32
+ job.run_callbacks(:batch)
33
+ ensure
34
+ Wayfarer::GC.run(task)
35
+ end
36
+
37
+ handle(name, job, task, counter)
38
+ end
39
+
40
+ def handle(name, job, task, counter)
41
+ case name
42
+ when "enqueue.active_job" then counter.increment unless retry?(job)
43
+ when "perform.active_job" then counter.decrement if succeeded?(job, task)
44
+ when "retry_stopped.active_job" then counter.decrement
45
+ end
46
+ end
47
+
48
+ def retry?(job)
49
+ job.executions > 0
50
+ end
51
+
52
+ def succeeded?(job, task)
53
+ job.exception_executions == task[:initial_exception_executions]
54
+ end
55
+ end
56
+ end