wayfarer 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/lint.yaml +25 -0
- data/.github/workflows/release.yaml +29 -0
- data/.github/workflows/tests.yaml +30 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +5 -0
- data/.vale.ini +5 -0
- data/.yardopts +1 -3
- data/Dockerfile +5 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +107 -102
- data/Rakefile +5 -56
- data/bin/wayfarer +1 -1
- data/docker-compose.yml +20 -9
- data/docs/cookbook/consent_screen.md +2 -2
- data/docs/cookbook/executing_javascript.md +3 -3
- data/docs/cookbook/navigation.md +12 -12
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/design.md +36 -0
- data/docs/guides/callbacks.md +24 -126
- data/docs/guides/configuration.md +8 -8
- data/docs/guides/handlers.md +60 -0
- data/docs/guides/index.md +1 -0
- data/docs/guides/jobs/error_handling.md +40 -0
- data/docs/guides/jobs.md +99 -31
- data/docs/guides/navigation.md +1 -1
- data/docs/guides/networking/capybara.md +13 -22
- data/docs/guides/networking/custom_adapters.md +82 -41
- data/docs/guides/networking/ferrum.md +4 -4
- data/docs/guides/networking/http.md +9 -13
- data/docs/guides/networking/selenium.md +10 -11
- data/docs/guides/pages.md +76 -10
- data/docs/guides/redis.md +10 -0
- data/docs/guides/routing.md +74 -0
- data/docs/guides/tasks.md +33 -9
- data/docs/guides/tutorial.md +60 -0
- data/docs/guides/user_agents.md +113 -0
- data/docs/index.md +17 -40
- data/docs/reference/cli.md +35 -25
- data/docs/reference/configuration.md +36 -0
- data/lib/wayfarer/base.rb +124 -46
- data/lib/wayfarer/batch_completion.rb +56 -0
- data/lib/wayfarer/callbacks.rb +22 -48
- data/lib/wayfarer/cli/route_printer.rb +71 -57
- data/lib/wayfarer/cli.rb +121 -0
- data/lib/wayfarer/gc.rb +13 -6
- data/lib/wayfarer/handler.rb +15 -7
- data/lib/wayfarer/logging.rb +38 -0
- data/lib/wayfarer/middleware/base.rb +2 -0
- data/lib/wayfarer/middleware/batch_completion.rb +19 -0
- data/lib/wayfarer/middleware/content_type.rb +54 -0
- data/lib/wayfarer/middleware/controller.rb +19 -15
- data/lib/wayfarer/middleware/dedup.rb +16 -13
- data/lib/wayfarer/middleware/dispatch.rb +12 -4
- data/lib/wayfarer/middleware/normalize.rb +12 -11
- data/lib/wayfarer/middleware/redis.rb +15 -0
- data/lib/wayfarer/middleware/router.rb +33 -35
- data/lib/wayfarer/middleware/stage.rb +5 -5
- data/lib/wayfarer/middleware/uri_parser.rb +30 -0
- data/lib/wayfarer/middleware/user_agent.rb +49 -0
- data/lib/wayfarer/networking/capybara.rb +1 -1
- data/lib/wayfarer/networking/context.rb +2 -2
- data/lib/wayfarer/networking/ferrum.rb +2 -2
- data/lib/wayfarer/networking/follow.rb +12 -6
- data/lib/wayfarer/networking/http.rb +1 -1
- data/lib/wayfarer/networking/pool.rb +17 -12
- data/lib/wayfarer/networking/selenium.rb +3 -3
- data/lib/wayfarer/networking/strategy.rb +2 -2
- data/lib/wayfarer/page.rb +36 -14
- data/lib/wayfarer/parsing/xml.rb +6 -6
- data/lib/wayfarer/parsing.rb +24 -0
- data/lib/wayfarer/redis/barrier.rb +13 -21
- data/lib/wayfarer/redis/counter.rb +19 -9
- data/lib/wayfarer/redis/pool.rb +1 -1
- data/lib/wayfarer/redis/resettable.rb +19 -0
- data/lib/wayfarer/routing/dsl.rb +1 -0
- data/lib/wayfarer/routing/matchers/path.rb +4 -2
- data/lib/wayfarer/routing/root_route.rb +5 -1
- data/lib/wayfarer/routing/route.rb +4 -14
- data/lib/wayfarer/stringify.rb +22 -30
- data/lib/wayfarer/task.rb +12 -18
- data/lib/wayfarer.rb +28 -1
- data/mkdocs.yml +52 -7
- data/rake/docs.rake +26 -0
- data/rake/lint.rake +105 -0
- data/rake/release.rake +29 -0
- data/rake/tests.rake +28 -0
- data/requirements.txt +1 -1
- data/spec/base_spec.rb +140 -160
- data/spec/batch_completion_spec.rb +104 -0
- data/spec/cli/job_spec.rb +19 -23
- data/spec/cli/routing_spec.rb +101 -0
- data/spec/cli/version_spec.rb +1 -1
- data/spec/factories/task.rb +7 -1
- data/spec/fixtures/dummy_job.rb +5 -3
- data/spec/gc_spec.rb +8 -50
- data/spec/handler_spec.rb +1 -1
- data/spec/integration/callbacks_spec.rb +157 -45
- data/spec/integration/content_type_spec.rb +145 -0
- data/spec/integration/gc_spec.rb +44 -0
- data/spec/integration/handler_spec.rb +66 -0
- data/spec/integration/page_spec.rb +44 -29
- data/spec/integration/params_spec.rb +33 -25
- data/spec/integration/parsing_spec.rb +125 -0
- data/spec/integration/routing_spec.rb +18 -0
- data/spec/integration/stage_spec.rb +27 -20
- data/spec/middleware/batch_completion_spec.rb +34 -0
- data/spec/middleware/chain_spec.rb +8 -8
- data/spec/middleware/content_type_spec.rb +86 -0
- data/spec/middleware/controller_spec.rb +5 -5
- data/spec/middleware/dedup_spec.rb +38 -55
- data/spec/middleware/dispatch_spec.rb +23 -7
- data/spec/middleware/normalize_spec.rb +44 -13
- data/spec/middleware/router_spec.rb +29 -30
- data/spec/middleware/stage_spec.rb +8 -8
- data/spec/middleware/uri_parser_spec.rb +53 -0
- data/spec/middleware/{fetch_spec.rb → user_agent_spec.rb} +28 -27
- data/spec/networking/context_spec.rb +1 -1
- data/spec/networking/follow_spec.rb +2 -2
- data/spec/networking/pool_spec.rb +5 -5
- data/spec/networking/strategy.rb +2 -2
- data/spec/page_spec.rb +42 -20
- data/spec/parsing/xml_spec.rb +11 -12
- data/spec/redis/barrier_spec.rb +8 -48
- data/spec/redis/counter_spec.rb +13 -1
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/spec_helpers.rb +27 -16
- data/spec/support/test_app.rb +8 -0
- data/spec/task_spec.rb +3 -24
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +4 -3
- metadata +61 -51
- data/.github/workflows/ci.yaml +0 -32
- data/docs/guides/error_handling.md +0 -53
- data/docs/guides/networking.md +0 -94
- data/docs/guides/performance.md +0 -130
- data/docs/guides/reliability.md +0 -41
- data/docs/guides/routing/steering.md +0 -30
- data/docs/reference/api/base.md +0 -48
- data/docs/reference/configuration_keys.md +0 -43
- data/docs/reference/environment_variables.md +0 -83
- data/lib/wayfarer/cli/base.rb +0 -45
- data/lib/wayfarer/cli/generate.rb +0 -17
- data/lib/wayfarer/cli/job.rb +0 -56
- data/lib/wayfarer/cli/route.rb +0 -29
- data/lib/wayfarer/cli/runner.rb +0 -34
- data/lib/wayfarer/cli/templates/Gemfile.tt +0 -5
- data/lib/wayfarer/cli/templates/job.rb.tt +0 -10
- data/lib/wayfarer/config/capybara.rb +0 -10
- data/lib/wayfarer/config/ferrum.rb +0 -11
- data/lib/wayfarer/config/networking.rb +0 -29
- data/lib/wayfarer/config/redis.rb +0 -14
- data/lib/wayfarer/config/root.rb +0 -11
- data/lib/wayfarer/config/selenium.rb +0 -21
- data/lib/wayfarer/config/strconv.rb +0 -45
- data/lib/wayfarer/config/struct.rb +0 -72
- data/lib/wayfarer/middleware/fetch.rb +0 -56
- data/lib/wayfarer/redis/connection.rb +0 -13
- data/lib/wayfarer/redis/version.rb +0 -19
- data/lib/wayfarer/routing/router.rb +0 -28
- data/spec/callbacks_spec.rb +0 -102
- data/spec/cli/generate_spec.rb +0 -39
- data/spec/config/capybara_spec.rb +0 -18
- data/spec/config/ferrum_spec.rb +0 -24
- data/spec/config/networking_spec.rb +0 -73
- data/spec/config/redis_spec.rb +0 -32
- data/spec/config/root_spec.rb +0 -31
- data/spec/config/selenium_spec.rb +0 -56
- data/spec/config/strconv_spec.rb +0 -58
- data/spec/config/struct_spec.rb +0 -66
- data/spec/integration/steering_spec.rb +0 -57
- data/spec/redis/version_spec.rb +0 -13
- data/spec/routing/router_spec.rb +0 -24
@@ -0,0 +1,60 @@
|
|
1
|
+
# Tutorial
|
2
|
+
|
3
|
+
Wayfarer is a web crawling framework written in Ruby.
|
4
|
+
It works with plain HTTP or by automating web browsers and is deployed with
|
5
|
+
Redis and a message queue (which can be Redis-based itself).
|
6
|
+
In development, it can execute fully in memory, without Redis.
|
7
|
+
|
8
|
+
You need a compatible version of Ruby installed.
|
9
|
+
|
10
|
+
To get started, in an empty directory, generate a new `Gemfile` and install
|
11
|
+
ActiveJob and Wayfarer:
|
12
|
+
|
13
|
+
```sh
|
14
|
+
bundle init
|
15
|
+
bundle add activejob wayfarer
|
16
|
+
bundle install
|
17
|
+
```
|
18
|
+
|
19
|
+
## Jobs, tasks and batches
|
20
|
+
|
21
|
+
Wayfarer builds on Active Job, the message queue abstraction of Rails.
|
22
|
+
You can use Wayfarer without Rails of course, as we do here.
|
23
|
+
|
24
|
+
A message queue supports two operations: appending messages to the end and consuming
|
25
|
+
messages from the front. In the case of Wayfarer, messages are tasks, a string pair
|
26
|
+
consisting of a URL and a batch. When a task is consumed, it is processed by a job,
|
27
|
+
a Ruby class.
|
28
|
+
|
29
|
+
Let's give ourselves a `dummy_job.rb` that routes arbitrary URLs to its
|
30
|
+
`index` instance method, where we print the current `task`:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
require "active_job"
|
34
|
+
require "wayfarer"
|
35
|
+
|
36
|
+
class DummyJob < ActiveJob::Base
|
37
|
+
include Wayfarer::Base
|
38
|
+
|
39
|
+
route.to :index
|
40
|
+
|
41
|
+
def index
|
42
|
+
puts task
|
43
|
+
end
|
44
|
+
end
|
45
|
+
```
|
46
|
+
|
47
|
+
We can perform our job from the command line with Wayfarer's CLI and find
|
48
|
+
that in between ActiveJob's log output, our task was printed with a generated
|
49
|
+
UUID for its batch:
|
50
|
+
|
51
|
+
```hl_lines="1 3"
|
52
|
+
bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
|
53
|
+
[ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
54
|
+
#<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
55
|
+
[ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-...) from Async(default) in 507.65ms
|
56
|
+
```
|
57
|
+
|
58
|
+
Many commands accept a `--batch` flag for setting the batch. If you don't
|
59
|
+
provide one, a UUID is generated.
|
60
|
+
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# User agents
|
2
|
+
|
3
|
+
User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
|
4
|
+
[page](../pages). They are kept in a connection pool and all user agents in the pool
|
5
|
+
share the same type and configuration. You can add custom user agents by implementing
|
6
|
+
the [user agent API](custom_user_agents.md).
|
7
|
+
|
8
|
+
Wayfarer comes with the following built-in user agents:
|
9
|
+
|
10
|
+
* [`#!ruby :http`](http.md) (default)
|
11
|
+
* [`#!ruby :ferrum`](ferrum.md) to automate Google Chrome
|
12
|
+
* [`#!ruby :selenium`](selenium.md) to automate a variety of browsers
|
13
|
+
* [`#!ruby :capybara`](capybara.md) to use Capybara sessions
|
14
|
+
|
15
|
+
Configure the user agent with the global configuration option:
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
Wayfarer.config[:network][:agent] = :ferrum # or :selenium, :capybara, ...
|
19
|
+
```
|
20
|
+
|
21
|
+
You can access the user agent that was checked out from the pool with
|
22
|
+
`#user_agent` in action methods:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
class DummyJob < ActiveJob::Base
|
26
|
+
include Wayfarer::Base
|
27
|
+
|
28
|
+
route.to :index
|
29
|
+
|
30
|
+
def index
|
31
|
+
user_agent # => #<Ferrum::Browser ...>
|
32
|
+
end
|
33
|
+
end
|
34
|
+
```
|
35
|
+
|
36
|
+
You can also implement [custom user agents](custom_user_agents.md) to support
|
37
|
+
your own HTTP client or browser automation service/protocol.
|
38
|
+
|
39
|
+
### Ad-hoc HTTP requests
|
40
|
+
|
41
|
+
Regardless of the configured user agent, you can always make ad-hoc HTTP GET requests
|
42
|
+
that return pages with `#fetch(url)`:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
class DummyJob < ActiveJob::Base
|
46
|
+
include Wayfarer::Base
|
47
|
+
|
48
|
+
route.to :index
|
49
|
+
|
50
|
+
def index
|
51
|
+
page = fetch("https://example.com") # => #<Wayfarer::Page ...>
|
52
|
+
end
|
53
|
+
end
|
54
|
+
```
|
55
|
+
|
56
|
+
!!! info "`#fetch` uses the configured `Wayfarer.config.network.http_headers`."
|
57
|
+
|
58
|
+
## HTTP request headers
|
59
|
+
|
60
|
+
You can set HTTP request headers for all built-in user agents:
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
Wayfarer.config[:network][:http_headers] = { "User-Agent" => "MyCrawler" }
|
64
|
+
```
|
65
|
+
|
66
|
+
!!! attention "Selenium does not support configuring HTTP request headers."
|
67
|
+
|
68
|
+
## Connection pooling
|
69
|
+
|
70
|
+
Since user agents are expensive to create, especially in the case of browser
|
71
|
+
processes, Wayfarer keeps user agents within a connection pool. When a job
|
72
|
+
performs and needs to retrieve the [page](../pages) for its task URL, an agent
|
73
|
+
is checked out from the pool, and checked back in when the routed action method
|
74
|
+
returns.
|
75
|
+
|
76
|
+
The pool size is constant and it should equal the number of threads the
|
77
|
+
underlying message queue operates with. For example, if you use Sidekiq,
|
78
|
+
you should set the pool size to the number of Sidekiq threads:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
Wayfarer.config[:network][:pool_size] = Sidekiq.options[:concurrency]
|
82
|
+
```
|
83
|
+
|
84
|
+
!!! attention "The connection pool size is 1 by default"
|
85
|
+
|
86
|
+
Since there is no reliable way to detect the number of threads that
|
87
|
+
the underlying message queue operates with, Wayfarer defaults to a pool
|
88
|
+
size of 1, which creates a bottleneck in a concurrent environment.
|
89
|
+
|
90
|
+
!!! attention "Browser sessions are shared across jobs"
|
91
|
+
|
92
|
+
The same browser session is used across jobs. This means that the browser
|
93
|
+
is not closed between jobs, and that the browser's state carries over from
|
94
|
+
job to job. You may account for this by resetting the browser's state
|
95
|
+
according to your needs, for which you can use [callbacks](../callbacks).
|
96
|
+
|
97
|
+
### `UserAgentTimeoutError`: avoiding pool contention
|
98
|
+
|
99
|
+
If you encounter `UserAgentTimeoutError` exceptions, a job has waited for a
|
100
|
+
user agent to become available for too long. By default, this timeout is 10
|
101
|
+
seconds. This is a sign that the pool size is too small for the message queue's
|
102
|
+
concurrency.
|
103
|
+
|
104
|
+
```
|
105
|
+
#<Wayfarer::UserAgentTimeoutError: Waited 10 sec, 0/1 available>
|
106
|
+
```
|
107
|
+
|
108
|
+
You can configure the timeout, although you will likely want to increase the
|
109
|
+
pool size instead:
|
110
|
+
|
111
|
+
```ruby
|
112
|
+
Wayfarer.config[:network][:pool_timeout] = 10 # seconds
|
113
|
+
```
|
data/docs/index.md
CHANGED
@@ -1,56 +1,33 @@
|
|
1
1
|
---
|
2
2
|
hide:
|
3
3
|
- navigation
|
4
|
+
- toc
|
4
5
|
---
|
5
6
|
|
6
7
|
# Wayfarer
|
7
8
|
|
8
|
-
|
9
|
-
[![RubyGem](https://badge.fury.io/rb/wayfarer.svg)](https://rubygems.org/gems/wayfarer)
|
9
|
+
## Ruby web crawling framework built on [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) and [Redis](https://redis.io)
|
10
10
|
|
11
|
-
|
11
|
+
<small>
|
12
|
+
[Read the tutorial](/guides/tutorial){ .md-button .md-button--primary }
|
13
|
+
</small>
|
12
14
|
|
13
|
-
|
14
|
-
* Data extraction
|
15
|
-
* Browser automation
|
15
|
+
=== "Command line"
|
16
16
|
|
17
|
-
|
17
|
+
```sh
|
18
|
+
gem install wayfarer
|
19
|
+
```
|
18
20
|
|
19
|
-
|
21
|
+
=== "Gemfile"
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
release:
|
23
|
+
```ruby
|
24
|
+
gem "wayfarer"
|
25
|
+
```
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
### Installation
|
30
|
-
|
31
|
-
Install the RubyGem:
|
32
|
-
|
33
|
-
```
|
34
|
-
gem install wayfarer
|
35
|
-
```
|
36
|
-
|
37
|
-
Or add it to Bundler's Gemfile:
|
38
|
-
|
39
|
-
```ruby
|
40
|
-
gem "wayfarer"
|
41
|
-
```
|
42
|
-
|
43
|
-
### Features
|
44
|
-
|
45
|
-
* Breadth-first, acyclic, multi-threaded graph traversal
|
46
|
-
* Executes atop a variety of message queues thanks to [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html)
|
47
|
-
* Browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
|
27
|
+
* Breadth-first, acyclic page traversal
|
28
|
+
* Plain HTTP and browser automation via [Ferrum](https://github.com/rubycdp/ferrum)
|
48
29
|
(<abbr title="Chrome DevTools Protocol">CDP</abbr>),
|
49
|
-
[Selenium](https://www.selenium.dev)
|
30
|
+
[Selenium](https://www.selenium.dev) and custom user agents
|
50
31
|
* Declarative routing DSL
|
51
32
|
* URI normalization and deduplication
|
52
|
-
*
|
53
|
-
* HTTP redirect handling
|
54
|
-
* Storage-agnostic
|
55
|
-
* Small footprint: <500 LoC
|
56
|
-
* Open Source (MIT)
|
33
|
+
* HTML, XML, JSON and custom Content-Type body parsing
|
data/docs/reference/cli.md
CHANGED
@@ -1,46 +1,46 @@
|
|
1
|
-
#
|
1
|
+
# wayfarer
|
2
|
+
|
3
|
+
The command-line interface to Wayfarer.
|
2
4
|
|
3
5
|
## Usage
|
4
6
|
|
5
7
|
```
|
6
|
-
wayfarer [OPTIONS] [
|
8
|
+
wayfarer [OPTIONS] [perform|enqueue|execute|route|tree]
|
7
9
|
```
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
## `wayfarer generate`
|
12
|
-
|
13
|
-
### `wayfarer generate project NAME`
|
11
|
+
See [Configuration](../configuration) for the respected environment variables.
|
14
12
|
|
15
|
-
|
13
|
+
---
|
16
14
|
|
17
|
-
## `wayfarer
|
15
|
+
## `wayfarer perform JOB URL`
|
18
16
|
|
19
|
-
|
20
|
-
|
21
|
-
: Performs `JOB` with `URL`. The job does not reach any Active Job backend.
|
22
|
-
Staged jobs will not be processed.
|
17
|
+
: Performs `JOB` with `URL` in memory. The task is not sent to the message queue.
|
18
|
+
Staged jobs are ignored.
|
23
19
|
|
24
20
|
##### Options
|
25
21
|
|
26
22
|
* `--mock-redis`: Use an in-memory implementation of Redis instead of
|
27
23
|
talking to an actual server.
|
28
|
-
* `--batch=BATCH`:
|
24
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
25
|
+
|
26
|
+
---
|
29
27
|
|
30
|
-
|
28
|
+
## `wayfarer enqueue JOB URL`
|
31
29
|
|
32
|
-
: Enqueues `JOB` with `URL` to the
|
30
|
+
: Enqueues a task for `JOB` with `URL` to the message queue.
|
33
31
|
|
34
32
|
##### Options
|
35
33
|
|
36
|
-
* `--batch=BATCH`:
|
34
|
+
* `--batch=BATCH`: The job's batch. By default, a UUID is generated.
|
35
|
+
|
36
|
+
---
|
37
37
|
|
38
|
-
|
38
|
+
## `wayfarer execute JOB URL`
|
39
39
|
|
40
|
-
: Execute `JOB` with `URL`
|
41
|
-
[Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
|
42
|
-
|
43
|
-
|
40
|
+
: Executes `JOB` with `URL` with the in-memory
|
41
|
+
[Active Job Async adapter](https://api.rubyonrails.org/classes/ActiveJob/QueueAdapters/AsyncAdapter.html)
|
42
|
+
instead of writing the task to an actual message queue. Blocks until the
|
43
|
+
batch has completed.
|
44
44
|
|
45
45
|
##### Options
|
46
46
|
|
@@ -50,12 +50,22 @@ All [environment variables](../environment_variables) are respected.
|
|
50
50
|
* `--min-threads`: Minimum number of threads to use. Default: 1
|
51
51
|
* `--max-threads`: Maximum number of threads to use. Default: 1
|
52
52
|
|
53
|
-
|
53
|
+
!!! attention "Why are my jobs not getting retried with `wayfarer execute`?"
|
54
|
+
|
55
|
+
You need to set the `wait: 0` option on `retry_on` in order for
|
56
|
+
`wayfarer execute` to execute retries:
|
54
57
|
|
55
|
-
|
58
|
+
```ruby
|
59
|
+
retry_on StandardError, attempts: 3, wait: 0
|
60
|
+
```
|
61
|
+
---
|
62
|
+
|
63
|
+
## `wayfarer route JOB URL`
|
56
64
|
|
57
65
|
: Prints the result of invoking `JOB`'s router with `URL`.
|
58
66
|
|
59
|
-
|
67
|
+
---
|
68
|
+
|
69
|
+
## `wayfarer tree JOB URL`
|
60
70
|
|
61
71
|
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
@@ -0,0 +1,36 @@
|
|
1
|
+
---
|
2
|
+
hide:
|
3
|
+
- toc
|
4
|
+
---
|
5
|
+
|
6
|
+
# Configuration
|
7
|
+
|
8
|
+
You can configure Wayfarer by assigning to the `Wayfarer.config` Hash
|
9
|
+
which has the following defaults:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
{
|
13
|
+
redis: {
|
14
|
+
url: "redis://localhost:6379/0",
|
15
|
+
factory: ->(redis) { ::Redis.new(url: redis[:url]) }
|
16
|
+
},
|
17
|
+
network: {
|
18
|
+
agent: :http,
|
19
|
+
pool_size: 1,
|
20
|
+
pool_timeout: 10,
|
21
|
+
http_headers: {},
|
22
|
+
renew_on: []
|
23
|
+
},
|
24
|
+
capybara: {
|
25
|
+
driver: nil
|
26
|
+
},
|
27
|
+
ferrum: {
|
28
|
+
options: {}
|
29
|
+
},
|
30
|
+
selenium: {
|
31
|
+
driver: :chrome,
|
32
|
+
options: {},
|
33
|
+
client_timeout: 60
|
34
|
+
}
|
35
|
+
}
|
36
|
+
```
|
data/lib/wayfarer/base.rb
CHANGED
@@ -1,60 +1,138 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wayfarer
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
4
|
+
# @!attribute [r] task
|
5
|
+
# @return [Wayfarer::Task] the current task
|
6
|
+
# @!attribute [r] uri
|
7
|
+
# @return [Addressable::URI] Parsed task URL
|
8
|
+
# @!attribute [r] user_agent
|
9
|
+
# @return [Object] the user agent that retrieved the page
|
10
|
+
# @!attribute [r] action
|
11
|
+
# @return [Symbol, Object] action that the task URL was routed to.
|
12
|
+
# @!attribute [r] params
|
13
|
+
# @return [HashWithIndifferentAccess] path parameters collected from routes
|
14
|
+
module Base
|
15
|
+
extend ActiveSupport::Concern
|
16
|
+
# @!method stage(urls)
|
17
|
+
# Adds URLs to an internal staging set so that they get enqueued
|
18
|
+
# eventually, once the job executed successfully.
|
19
|
+
# @overload stage(urls)
|
20
|
+
# @param urls [Array<String>] URLs to add to the staging set.
|
21
|
+
# @overload stage(url)
|
22
|
+
# @param url [String] URL to add to the staging set.
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
24
|
+
# @!method fetch(url, follow: 3)
|
25
|
+
# @param url [String] URL to fetch using plain HTTP(S).
|
26
|
+
# @param follow [Integer] Number of redirects to follow.
|
27
|
+
# Retrieves the given URL to a {Page}.
|
27
28
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
# @!method page(live: false)
|
30
|
+
# @param live [Boolean] whether to retrieve a new {Page}.
|
31
|
+
# @return [Wayfarer::Page]
|
32
|
+
# Returns the most recently retrieved page or a new page
|
33
|
+
# for the current task URL if the `live` keyword is passed.
|
32
34
|
|
33
|
-
|
34
|
-
task = arguments.first
|
35
|
-
task.gc.run
|
36
|
-
end
|
35
|
+
# @!scope class
|
37
36
|
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
# @!attribute [r] route
|
38
|
+
# @return [Wayfarer::Routing::DSL]
|
39
|
+
# The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
|
40
|
+
# or to a {Handler}.
|
41
|
+
# @example Append a host route
|
42
|
+
# route.host "example.com", to: :index
|
41
43
|
|
42
|
-
|
43
|
-
|
44
|
-
|
44
|
+
# @!method content_types(*content_types)
|
45
|
+
# @param content_types [*Array<String, Regexp>] Content-Types to whitelist
|
46
|
+
# Whitelists Content-Types. Once at least one Content-Type is set, only
|
47
|
+
# those Content-Types will be processed.
|
45
48
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
49
|
+
# @!group Callbacks
|
50
|
+
|
51
|
+
# @!method before_fetch
|
52
|
+
# @overload before_fetch(callback)
|
53
|
+
# @param callback [Symbol] Instance method to call
|
54
|
+
# @overload before_fetch(&block)
|
55
|
+
# @yield [Wayfarer::Task]
|
56
|
+
# Registers a callback that is called before the page is fetched.
|
57
|
+
# If a symbol is passed, an instance method with the same name will be
|
58
|
+
# called.
|
59
|
+
# @example Accessing the user agent in {#before_fetch}
|
60
|
+
# before_fetch do |task|
|
61
|
+
# user_agent # => the user agent that will fetch the page
|
62
|
+
# end
|
51
63
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
64
|
+
# @!method around_fetch
|
65
|
+
# @overload around_fetch(callback)
|
66
|
+
# @param callback [Symbol] Instance method to call
|
67
|
+
# @overload around_fetch(&block)
|
68
|
+
# @yield [Wayfarer::Task]
|
69
|
+
# Registers a callback that is called around the page getting fetched.
|
70
|
+
# If a symbol is passed, an instance method with the same name will be
|
71
|
+
# called.
|
72
|
+
|
73
|
+
# @!method after_fetch
|
74
|
+
# @overload after_fetch(callback)
|
75
|
+
# @param callback [Symbol] Instance method to call
|
76
|
+
# @overload after_fetch(&block)
|
77
|
+
# @yield [Wayfarer::Task]
|
78
|
+
# Registers a callback that is called after the page was fetched.
|
79
|
+
# If a symbol is passed, an instance method with the same name will be
|
80
|
+
# called.
|
81
|
+
|
82
|
+
# @!method before_perform
|
83
|
+
# @overload before_perform(callback)
|
84
|
+
# @param callback [Symbol] Instance method to call
|
85
|
+
# @overload before_perform(&block)
|
86
|
+
# @yield [Wayfarer::Task]
|
87
|
+
# Registers a callback that is called before the task is performed.
|
88
|
+
# If a symbol is passed, an instance method with the same name will be
|
89
|
+
# called.
|
90
|
+
|
91
|
+
# @!method around_perform
|
92
|
+
# @overload around_perform(callback)
|
93
|
+
# @param callback [Symbol] Instance method to call
|
94
|
+
# @overload around_perform(&block)
|
95
|
+
# @yield [Wayfarer::Task]
|
96
|
+
# Registers a callback that is called around the task getting performed.
|
97
|
+
# If a symbol is passed, an instance method with the same name will be
|
98
|
+
# called.
|
99
|
+
|
100
|
+
# @!method after_perform
|
101
|
+
# @overload after_perform(callback)
|
102
|
+
# @param callback [Symbol] Instance method to call
|
103
|
+
# @overload after_perform(&block)
|
104
|
+
# @yield [Wayfarer::Task]
|
105
|
+
# Registers a callback that is called after the task was performed.
|
106
|
+
# If a symbol is passed, an instance method with the same name will be
|
107
|
+
# called.
|
108
|
+
|
109
|
+
# @!endgroup
|
110
|
+
|
111
|
+
included do
|
112
|
+
include Wayfarer::Middleware::Controller
|
113
|
+
|
114
|
+
# Implement ActiveJob's #perform by calling into our own middleware chain
|
115
|
+
alias_method :perform, :call
|
116
|
+
|
117
|
+
# Middleware stack
|
118
|
+
use Wayfarer::Middleware::Redis
|
119
|
+
use Wayfarer::Middleware::BatchCompletion
|
120
|
+
use Wayfarer::Middleware::UriParser
|
121
|
+
use Wayfarer::Middleware::Normalize
|
122
|
+
use Wayfarer::Middleware::Dedup
|
123
|
+
use Wayfarer::Middleware::Stage
|
124
|
+
use Wayfarer::Middleware::Router
|
125
|
+
use Wayfarer::Middleware::UserAgent
|
126
|
+
use Wayfarer::Middleware::ContentType
|
127
|
+
use Wayfarer::Middleware::Dispatch
|
56
128
|
end
|
57
129
|
|
58
|
-
|
130
|
+
class_methods do
|
131
|
+
def crawl(url, batch: SecureRandom.uuid)
|
132
|
+
Task.new(url, batch).tap do |task|
|
133
|
+
perform_later(task)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
59
137
|
end
|
60
138
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# BatchCompletion tracks the completion of a batch of jobs.
|
5
|
+
# It does so by incrementing and decrementing a counter in Redis.
|
6
|
+
#
|
7
|
+
# The counter is incremented when a job is first enqueued and decremented when
|
8
|
+
# a job is performed. If a job is retried, the counter is not incremented.
|
9
|
+
# When a job succeeds or fails and thereby exceeds its retry count, the counter
|
10
|
+
# is decremented.
|
11
|
+
#
|
12
|
+
# When the counter reaches zero, garbage collection deletes the Redis keys
|
13
|
+
# associated with the batch.
|
14
|
+
module BatchCompletion
|
15
|
+
module_function
|
16
|
+
|
17
|
+
def subscribe!
|
18
|
+
ActiveSupport::Notifications.subscribe("enqueue.active_job", self)
|
19
|
+
ActiveSupport::Notifications.subscribe("perform.active_job", self)
|
20
|
+
ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
|
21
|
+
end
|
22
|
+
|
23
|
+
def call(name, _, _, _, data)
|
24
|
+
return unless (job = data[:job]).is_a?(Wayfarer::Base)
|
25
|
+
|
26
|
+
task = job.arguments.first
|
27
|
+
|
28
|
+
# In the case of `enqueue.active_job` middleware hasn't executed yet
|
29
|
+
task[:redis_pool] ||= Wayfarer::Redis::Pool.instance # TODO: Test
|
30
|
+
|
31
|
+
counter = Redis::Counter.new(task) do
|
32
|
+
job.run_callbacks(:batch)
|
33
|
+
ensure
|
34
|
+
Wayfarer::GC.run(task)
|
35
|
+
end
|
36
|
+
|
37
|
+
handle(name, job, task, counter)
|
38
|
+
end
|
39
|
+
|
40
|
+
def handle(name, job, task, counter)
|
41
|
+
case name
|
42
|
+
when "enqueue.active_job" then counter.increment unless retry?(job)
|
43
|
+
when "perform.active_job" then counter.decrement if succeeded?(job, task)
|
44
|
+
when "retry_stopped.active_job" then counter.decrement
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def retry?(job)
|
49
|
+
job.executions > 0
|
50
|
+
end
|
51
|
+
|
52
|
+
def succeeded?(job, task)
|
53
|
+
job.exception_executions == task[:initial_exception_executions]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|