wayfarer 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +29 -2
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +17 -0
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -31
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -42
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -26
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
@@ -0,0 +1,60 @@
|
|
1
|
+
# Tutorial
|
2
|
+
|
3
|
+
Wayfarer is a web crawling framework written in Ruby.
|
4
|
+
It works with plain HTTP or by automating web browsers and is deployed with
|
5
|
+
Redis and a message queue (which can be Redis-based itself).
|
6
|
+
In development, it can execute fully in memory, without Redis.
|
7
|
+
|
8
|
+
You need a compatible version of Ruby installed.
|
9
|
+
|
10
|
+
To get started, in an empty directory, generate a new `Gemfile` and install
|
11
|
+
ActiveJob and Wayfarer:
|
12
|
+
|
13
|
+
```sh
|
14
|
+
bundle init
|
15
|
+
bundle add activejob wayfarer
|
16
|
+
bundle install
|
17
|
+
```
|
18
|
+
|
19
|
+
## Jobs, tasks and batches
|
20
|
+
|
21
|
+
Wayfarer builds on Active Job, the message queue abstraction of Rails.
|
22
|
+
You can use Wayfarer without Rails of course, as we do here.
|
23
|
+
|
24
|
+
A message queue supports two operations: appending messages to the end and consuming
|
25
|
+
messages from the front. In the case of Wayfarer, messages are tasks, a string pair
|
26
|
+
consisting of a URL and a batch. When a task is consumed, it is processed by a job,
|
27
|
+
a Ruby class.
|
28
|
+
|
29
|
+
Let's give ourselves a `dummy_job.rb` that routes arbitrary URLs to its
|
30
|
+
`index` instance method, where we print the current `task`:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
require "active_job"
|
34
|
+
require "wayfarer"
|
35
|
+
|
36
|
+
class DummyJob < ActiveJob::Base
|
37
|
+
include Wayfarer::Base
|
38
|
+
|
39
|
+
route.to :index
|
40
|
+
|
41
|
+
def index
|
42
|
+
puts task
|
43
|
+
end
|
44
|
+
end
|
45
|
+
```
|
46
|
+
|
47
|
+
We can perform our job from the command line with Wayfarer's CLI and find
|
48
|
+
that in between ActiveJob's log output, our task was printed with a generated
|
49
|
+
UUID for its batch:
|
50
|
+
|
51
|
+
```hl_lines="1 3"
|
52
|
+
bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
|
53
|
+
[ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
54
|
+
#<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
55
|
+
[ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-...) from Async(default) in 507.65ms
|
56
|
+
```
|
57
|
+
|
58
|
+
Many commands accept a `--batch` flag for setting the batch. If you don't
|
59
|
+
provide one, a UUID is generated.
|
60
|
+
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# User agents
|
2
|
+
|
3
|
+
User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
|
4
|
+
[page](../pages). They are kept in a connection pool and all user agents in the pool
|
5
|
+
share the same type and configuration. You can add custom user agents by implementing
|
6
|
+
the [user agent API](custom_user_agents.md).
|
7
|
+
|
8
|
+
Wayfarer comes with the following built-in user agents:
|
9
|
+
|
10
|
+
* [`#!ruby :http`](http.md) (default)
|
11
|
+
* [`#!ruby :ferrum`](ferrum.md) to automate Google Chrome
|
12
|
+
* [`#!ruby :selenium`](selenium.md) to automate a variety of browsers
|
13
|
+
* [`#!ruby :capybara`](capybara.md) to use Capybara sessions
|
14
|
+
|
15
|
+
Configure the user agent with the global configuration option:
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
Wayfarer.config[:network][:agent] = :ferrum # or :selenium, :capybara, ...
|
19
|
+
```
|
20
|
+
|
21
|
+
You can access the user agent that was checked out from the pool with
|
22
|
+
`#user_agent` in action methods:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
class DummyJob < ActiveJob::Base
|
26
|
+
include Wayfarer::Base
|
27
|
+
|
28
|
+
route.to :index
|
29
|
+
|
30
|
+
def index
|
31
|
+
user_agent # => #<Ferrum::Browser ...>
|
32
|
+
end
|
33
|
+
end
|
34
|
+
```
|
35
|
+
|
36
|
+
You can also implement [custom user agents](custom_user_agents.md) to support
|
37
|
+
your own HTTP client or browser automation service/protocol.
|
38
|
+
|
39
|
+
### Ad-hoc HTTP requests
|
40
|
+
|
41
|
+
Regardless of the configured user agent, you can always make ad-hoc HTTP GET requests
|
42
|
+
that return pages with `#fetch(url)`:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
class DummyJob < ActiveJob::Base
|
46
|
+
include Wayfarer::Base
|
47
|
+
|
48
|
+
route.to :index
|
49
|
+
|
50
|
+
def index
|
51
|
+
page = fetch("https://example.com") # => #<Wayfarer::Page ...>
|
52
|
+
end
|
53
|
+
end
|
54
|
+
```
|
55
|
+
|
56
|
+
!!! info "`#fetch` uses the configured `Wayfarer.config[:network][:http_headers]`."
|
57
|
+
|
58
|
+
## HTTP request headers
|
59
|
+
|
60
|
+
You can set HTTP request headers for all built-in user agents:
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler" }
|
64
|
+
```
|
65
|
+
|
66
|
+
!!! attention "Selenium does not support configuring HTTP request headers."
|
67
|
+
|
68
|
+
## Connection pooling
|
69
|
+
|
70
|
+
Since user agents are expensive to create, especially in the case of browser
|
71
|
+
processes, Wayfarer keeps user agents within a connection pool. When a job
|
72
|
+
performs and needs to retrieve the [page](../pages) for its task URL, an agent
|
73
|
+
is checked out from the pool, and checked back in when the routed action method
|
74
|
+
returns.
|
75
|
+
|
76
|
+
The pool size is constant and it should equal the number of threads the
|
77
|
+
underlying message queue operates with. For example, if you use Sidekiq,
|
78
|
+
you should set the pool size to the number of Sidekiq threads:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
Wayfarer.config[:network][:pool_size] = Sidekiq.options[:concurrency]
|
82
|
+
```
|
83
|
+
|
84
|
+
!!! attention "The connection pool size is 1 by default"
|
85
|
+
|
86
|
+
Since there is no reliable way to detect the number of threads that
|
87
|
+
the underlying message queue operates with, Wayfarer defaults to a pool
|
88
|
+
size of 1, which creates a bottleneck in a concurrent environment.
|
89
|
+
|
90
|
+
!!! attention "Browser sessions are shared across jobs"
|
91
|
+
|
92
|
+
The same browser session is used across jobs. This means that the browser
|
93
|
+
is not closed between jobs, and that the browser's state carries over from
|
94
|
+
job to job. You may account for this by resetting the browser's state
|
95
|
+
according to your needs, for which you can use [callbacks](../callbacks).
|
96
|
+
|
97
|
+
### `UserAgentTimeoutError`: avoiding pool contention
|
98
|
+
|
99
|
+
If you encounter `UserAgentTimeoutError` exceptions, a job has waited for a
|
100
|
+
user agent to become available for too long. By default, this timeout is 10
|
101
|
+
seconds. This is a sign that the pool size is too small for the message queue's
|
102
|
+
concurrency.
|
103
|
+
|
104
|
+
```
|
105
|
+
#<Wayfarer::UserAgentTimeoutError: Waited 10 sec, 0/1 available>
|
106
|
+
```
|
107
|
+
|
108
|
+
You can configure the timeout, although you will likely want to increase the
|
109
|
+
pool size instead:
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
Wayfarer.config[:network][:pool_timeout] = 10 # seconds
|
113
|
+
```
|
data/docs/index.md
CHANGED
@@ -1,56 +1,33 @@
|
|
1
1
|
---
|
2
2
|
hide:
|
3
3
|
- navigation
|
4
|
+
- toc
|
4
5
|
---
|
5
6
|
|
6
7
|
# Wayfarer
|
7
8
|
|
8
|
-
|
9
|
-
[](https://rubygems.org/gems/wayfarer)
|
9
|
+
## Ruby web crawling framework built on [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) and [Redis](https://redis.io)
|
10
10
|
|
11
|
-
|
11
|
+
<small>
|
12
|
+
[Read the tutorial](/guides/tutorial){ .md-button .md-button--primary }
|
13
|
+
</small>
|
12
14
|
|
13
|
-
|
14
|
-
* Data extraction
|
15
|
-
* Browser automation
|
15
|
+
=== "Command line"
|
16
16
|
|
17
|
-
|
17
|
+
```sh
|
18
|
+
gem install wayfarer
|
19
|
+
```
|
18
20
|
|
19
|
-
|
21
|
+
=== "Gemfile"
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
release:
|
23
|
+
```ruby
|
24
|
+
gem "wayfarer"
|
25
|
+
```
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
### Installation
|
30
|
-
|
31
|
-
Install the RubyGem:
|
32
|
-
|
33
|
-
```
|
34
|
-
gem install wayfarer
|
35
|
-
```
|
36
|
-
|
37
|
-
Or add it to Bundler's Gemfile:
|
38
|
-
|
39
|
-
```ruby
|
40
|
-
gem "wayfarer"
|
41
|
-
```
|
42
|
-
|
43
|
-
### Features
|
44
|
-
|
45
|
-
* Breadth-first, acyclic, multi-threaded graph traversal
|
46
|
-
* Executes atop a variety of message queues thanks to [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html)
|
47
|
-
* Browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
|
27
|
+
* Breadth-first, acyclic page traversal
|
28
|
+
* Plain HTTP and browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
|
48
29
|
(<abbr title="Chrome DevTools Protocol">CDP</abbr>),
|
49
|
-
[Selenium](https://www.selenium.dev)
|
30
|
+
[Selenium](https://www.selenium.dev) and custom user agents
|
50
31
|
* Declarative routing DSL
|
51
32
|
* URI normalization and deduplication
|
52
|
-
*
|
53
|
-
* HTTP redirect handling
|
54
|
-
* Storage-agnostic
|
55
|
-
* Small footprint: <500 LoC
|
56
|
-
* Open Source (MIT)
|
33
|
+
* HTML, XML, JSON and custom Content-Type body parsing
|
data/docs/reference/cli.md
CHANGED
@@ -1,46 +1,46 @@
|
|
1
|
-
#
|
1
|
+
# wayfarer
|
2
|
+
|
3
|
+
The command-line interface to Wayfarer.
|
2
4
|
|
3
5
|
## Usage
|
4
6
|
|
5
7
|
```
|
6
|
-
wayfarer [OPTIONS] [
|
8
|
+
wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
|
7
9
|
```
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
## `wayfarer generate`
|
12
|
-
|
13
|
-
### `wayfarer generate project NAME`
|
11
|
+
See [Configuration](../configuration) for the respected environment variables.
|
14
12
|
|
15
|
-
|
13
|
+
---
|
16
14
|
|
17
|
-
## `wayfarer
|
15
|
+
## `wayfarer perform JOB URL`
|
18
16
|
|
19
|
-
|
20
|
-
|
21
|
-
: Performs `JOB` with `URL`. The job does not reach any Active Job backend.
|
22
|
-
Staged jobs will not be processed.
|
17
|
+
: Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
|
18
|
+
Staged jobs are ignored.
|
23
19
|
|
24
20
|
##### Options
|
25
21
|
|
26
22
|
* `--mock-redis`: Use an in-memory implementation of Redis instead of
|
27
23
|
talking to an actual server.
|
28
|
-
* `--batch=BATCH`:
|
24
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
25
|
+
|
26
|
+
---
|
29
27
|
|
30
|
-
|
28
|
+
## `wayfarer enqueue JOB URL`
|
31
29
|
|
32
|
-
: Enqueues `JOB` with `URL` to the
|
30
|
+
: Enqueues a task for `JOB` with `URL` to the message queue.
|
33
31
|
|
34
32
|
##### Options
|
35
33
|
|
36
|
-
* `--batch=BATCH`:
|
34
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
35
|
+
|
36
|
+
---
|
37
37
|
|
38
|
-
|
38
|
+
## `wayfarer execute JOB URL`
|
39
39
|
|
40
|
-
: Execute `JOB` with `URL`
|
41
|
-
[Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
|
42
|
-
|
43
|
-
|
40
|
+
: Execute `JOB` with `URL` with the in-memory
|
41
|
+
[Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
|
42
|
+
instead of writing the task to an actual message queue. Blocks until the
|
43
|
+
batch has completed.
|
44
44
|
|
45
45
|
##### Options
|
46
46
|
|
@@ -50,12 +50,22 @@ All [environment variables](../environment_variables) are respected.
|
|
50
50
|
* `--min-threads`: Minimum number of threads to use. Default: 1
|
51
51
|
* `--max-threads`: Maximum number of threads to use. Default: 1
|
52
52
|
|
53
|
-
|
53
|
+
!!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
|
54
|
+
|
55
|
+
You need to set the `wait: 0` option on `retry_on` in order for
|
56
|
+
`wayfarer execute` to execute retries:
|
54
57
|
|
55
|
-
|
58
|
+
```ruby
|
59
|
+
retry_on StandardError, attempts: 3, wait: 0
|
60
|
+
```
|
61
|
+
---
|
62
|
+
|
63
|
+
## `wayfarer route JOB URL`
|
56
64
|
|
57
65
|
: Prints the result of invoking `JOB`'s router with `URL`.
|
58
66
|
|
59
|
-
|
67
|
+
---
|
68
|
+
|
69
|
+
## `wayfarer tree JOB URL`
|
60
70
|
|
61
71
|
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
@@ -0,0 +1,36 @@
|
|
1
|
+
---
|
2
|
+
hide:
|
3
|
+
- toc
|
4
|
+
---
|
5
|
+
|
6
|
+
# Configuration
|
7
|
+
|
8
|
+
You can configure Wayfarer by assigning to the `Wayfarer.config` Hash
|
9
|
+
which has the following defaults:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
{
|
13
|
+
redis: {
|
14
|
+
url: "redis://localhost:6379/0",
|
15
|
+
factory: ->(redis) { ::Redis.new(url: redis[:url]) }
|
16
|
+
},
|
17
|
+
network: {
|
18
|
+
agent: :http,
|
19
|
+
pool_size: 1,
|
20
|
+
pool_timeout: 10,
|
21
|
+
http_headers: {},
|
22
|
+
renew_on: []
|
23
|
+
},
|
24
|
+
capybara: {
|
25
|
+
driver: nil
|
26
|
+
},
|
27
|
+
ferrum: {
|
28
|
+
options: {}
|
29
|
+
},
|
30
|
+
selenium: {
|
31
|
+
driver: :chrome,
|
32
|
+
options: {},
|
33
|
+
client_timeout: 60
|
34
|
+
}
|
35
|
+
}
|
36
|
+
```
|
data/lib/wayfarer/base.rb
CHANGED
@@ -1,60 +1,138 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
4
|
+
# @!attribute [r] task
|
5
|
+
# @return [Wayfarer::Task] the current task
|
6
|
+
# @!attribute [r] uri
|
7
|
+
# @return [Addressable::URI] Parsed task URL
|
8
|
+
# @!attribute [r] user_agent
|
9
|
+
# @return [Object] the user agent that retrieved the page
|
10
|
+
# @!attribute [r] action
|
11
|
+
# @return [Symbol, Object] action that the task URL was routed to.
|
12
|
+
# @!attribute [r] params
|
13
|
+
# @return [HashWithIndifferentAccess] path parameters collected from routes
|
14
|
+
module Base
|
15
|
+
extend ActiveSupport::Concern
|
16
|
+
# @!method stage(urls)
|
17
|
+
# Adds URLs to an internal staging set so that they get enqueued
|
18
|
+
# eventually, once the job executed successfully.
|
19
|
+
# @overload stage(urls)
|
20
|
+
# @param urls [Array<String>] URLs to add to the staging set.
|
21
|
+
# @overload stage(url)
|
22
|
+
# @param url [String] URL to add to the staging set.
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
# @!method fetch(url, follow: 3)
|
25
|
+
# @param url [String] URL to fetch using plain HTTP(S).
|
26
|
+
# @param follow [Integer] Number of redirects to follow.
|
27
|
+
# Retrieves the given URL to a {Page}.
|
27
28
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
# @!method page(live: false)
|
30
|
+
# @param live [Boolean] whether to retrieve a new {Page}.
|
31
|
+
# @return [Wayfarer::Page]
|
32
|
+
# Returns the most recently retrieved page or a new page
|
33
|
+
# for the current task URL if the `live` keyword is passed.
|
32
34
|
|
33
|
-
|
34
|
-
task = arguments.first
|
35
|
-
task.gc.run
|
36
|
-
end
|
35
|
+
# @!scope class
|
37
36
|
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
# @!attribute [r] route
|
38
|
+
# @return [Wayfarer::Routing::DSL]
|
39
|
+
# The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
|
40
|
+
# or to a {Handler}.
|
41
|
+
# @example Append a host route
|
42
|
+
# route.host "example.com", to: :index
|
41
43
|
|
42
|
-
|
43
|
-
|
44
|
-
|
44
|
+
# @!method content_types(*content_types)
|
45
|
+
# @param content_types [*Array<String, Regexp>] Content-Types to whitelist
|
46
|
+
# Whitelists Content-Types. Once at least one Content-Type is set, only
|
47
|
+
# those Content-Types will be processed.
|
45
48
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
# @!group Callbacks
|
50
|
+
|
51
|
+
# @!method before_fetch
|
52
|
+
# @overload before_fetch(callback)
|
53
|
+
# @param callback [Symbol] Instance method to call
|
54
|
+
# @overload before_fetch(&block)
|
55
|
+
# @yield [Wayfarer::Task]
|
56
|
+
# Registers a callback that is called before the page is fetched.
|
57
|
+
# If a symbol is passed, an instance method with the same name will be
|
58
|
+
# called.
|
59
|
+
# @example Accessing the user agent in {#before_fetch}
|
60
|
+
# before_fetch do |task|
|
61
|
+
# user_agent # => the user agent that will fetch the page
|
62
|
+
# end
|
51
63
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
64
|
+
# @!method around_fetch
|
65
|
+
# @overload around_fetch(callback)
|
66
|
+
# @param callback [Symbol] Instance method to call
|
67
|
+
# @overload around_fetch(&block)
|
68
|
+
# @yield [Wayfarer::Task]
|
69
|
+
# Registers a callback that is called around the page getting fetched.
|
70
|
+
# If a symbol is passed, an instance method with the same name will be
|
71
|
+
# called.
|
72
|
+
|
73
|
+
# @!method after_fetch
|
74
|
+
# @overload after_fetch(callback)
|
75
|
+
# @param callback [Symbol] Instance method to call
|
76
|
+
# @overload after_fetch(&block)
|
77
|
+
# @yield [Wayfarer::Task]
|
78
|
+
# Registers a callback that is called after the page was fetched.
|
79
|
+
# If a symbol is passed, an instance method with the same name will be
|
80
|
+
# called.
|
81
|
+
|
82
|
+
# @!method before_perform
|
83
|
+
# @overload before_perform(callback)
|
84
|
+
# @param callback [Symbol] Instance method to call
|
85
|
+
# @overload before_perform(&block)
|
86
|
+
# @yield [Wayfarer::Task]
|
87
|
+
# Registers a callback that is called before the task is performed.
|
88
|
+
# If a symbol is passed, an instance method with the same name will be
|
89
|
+
# called.
|
90
|
+
|
91
|
+
# @!method around_perform
|
92
|
+
# @overload around_perform(callback)
|
93
|
+
# @param callback [Symbol] Instance method to call
|
94
|
+
# @overload around_perform(&block)
|
95
|
+
# @yield [Wayfarer::Task]
|
96
|
+
# Registers a callback that is called around the task getting performed.
|
97
|
+
# If a symbol is passed, an instance method with the same name will be
|
98
|
+
# called.
|
99
|
+
|
100
|
+
# @!method after_perform
|
101
|
+
# @overload after_perform(callback)
|
102
|
+
# @param callback [Symbol] Instance method to call
|
103
|
+
# @overload after_perform(&block)
|
104
|
+
# @yield [Wayfarer::Task]
|
105
|
+
# Registers a callback that is called after the task was performed.
|
106
|
+
# If a symbol is passed, an instance method with the same name will be
|
107
|
+
# called.
|
108
|
+
|
109
|
+
# @!endgroup
|
110
|
+
|
111
|
+
included do
|
112
|
+
include Wayfarer::Middleware::Controller
|
113
|
+
|
114
|
+
# Implement ActiveJob's #perform by calling into our own middleware chain
|
115
|
+
alias_method :perform, :call
|
116
|
+
|
117
|
+
# Middleware stack
|
118
|
+
use Wayfarer::Middleware::Redis
|
119
|
+
use Wayfarer::Middleware::BatchCompletion
|
120
|
+
use Wayfarer::Middleware::UriParser
|
121
|
+
use Wayfarer::Middleware::Normalize
|
122
|
+
use Wayfarer::Middleware::Dedup
|
123
|
+
use Wayfarer::Middleware::Stage
|
124
|
+
use Wayfarer::Middleware::Router
|
125
|
+
use Wayfarer::Middleware::UserAgent
|
126
|
+
use Wayfarer::Middleware::ContentType
|
127
|
+
use Wayfarer::Middleware::Dispatch
|
56
128
|
end
|
57
129
|
|
58
|
-
|
130
|
+
class_methods do
|
131
|
+
def crawl(url, batch: SecureRandom.uuid)
|
132
|
+
Task.new(url, batch).tap do |task|
|
133
|
+
perform_later(task)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
59
137
|
end
|
60
138
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# BatchCompletion tracks the completion of a batch of jobs.
|
5
|
+
# It does so by incrementing and decrementing a counter in Redis.
|
6
|
+
#
|
7
|
+
# The counter is incremented when a job is first enqueued and decremented when
|
8
|
+
# a job is performed. If a job is retried, the counter is not incremented.
|
9
|
+
# When a job succeeds or fails and thereby exceeds its retry count, the counter
|
10
|
+
# is decremented.
|
11
|
+
#
|
12
|
+
# When the counter reaches zero, garbage collection deletes the Redis keys
|
13
|
+
# associated with the batch.
|
14
|
+
module BatchCompletion
|
15
|
+
module_function
|
16
|
+
|
17
|
+
def subscribe!
|
18
|
+
ActiveSupport::Notifications.subscribe("enqueue.active_job", self)
|
19
|
+
ActiveSupport::Notifications.subscribe("perform.active_job", self)
|
20
|
+
ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
|
21
|
+
end
|
22
|
+
|
23
|
+
def call(name, _, _, _, data)
|
24
|
+
return unless (job = data[:job]).is_a?(Wayfarer::Base)
|
25
|
+
|
26
|
+
task = job.arguments.first
|
27
|
+
|
28
|
+
# In the case of `enqueue.active_job` middleware hasn't executed yet
|
29
|
+
task[:redis_pool] ||= Wayfarer::Redis::Pool.instance # TODO: Test
|
30
|
+
|
31
|
+
counter = Redis::Counter.new(task) do
|
32
|
+
job.run_callbacks(:batch)
|
33
|
+
ensure
|
34
|
+
Wayfarer::GC.run(task)
|
35
|
+
end
|
36
|
+
|
37
|
+
handle(name, job, task, counter)
|
38
|
+
end
|
39
|
+
|
40
|
+
def handle(name, job, task, counter)
|
41
|
+
case name
|
42
|
+
when "enqueue.active_job" then counter.increment unless retry?(job)
|
43
|
+
when "perform.active_job" then counter.decrement if succeeded?(job, task)
|
44
|
+
when "retry_stopped.active_job" then counter.decrement
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def retry?(job)
|
49
|
+
job.executions > 0
|
50
|
+
end
|
51
|
+
|
52
|
+
def succeeded?(job, task)
|
53
|
+
job.exception_executions == task[:initial_exception_executions]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|