wayfarer 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +14 -10
- data/docs/cookbook/batch_routing.md +22 -0
- data/docs/cookbook/consent_screen.md +36 -0
- data/docs/cookbook/executing_javascript.md +41 -0
- data/docs/cookbook/querying_html.md +3 -3
- data/docs/cookbook/screenshots.md +2 -2
- data/docs/guides/browser_automation/capybara.md +6 -3
- data/docs/guides/browser_automation/ferrum.md +3 -1
- data/docs/guides/browser_automation/selenium.md +4 -2
- data/docs/guides/callbacks.md +5 -5
- data/docs/guides/debugging.md +17 -0
- data/docs/guides/error_handling.md +22 -26
- data/docs/guides/jobs.md +44 -18
- data/docs/guides/navigation.md +73 -0
- data/docs/guides/pages.md +4 -4
- data/docs/guides/performance.md +108 -0
- data/docs/guides/reliability.md +41 -0
- data/docs/guides/routing/steering.md +30 -0
- data/docs/guides/tasks.md +9 -33
- data/docs/reference/api/base.md +13 -127
- data/docs/reference/api/route.md +1 -1
- data/docs/reference/cli.md +0 -78
- data/docs/reference/configuration_keys.md +1 -1
- data/lib/wayfarer/cli/job.rb +1 -3
- data/lib/wayfarer/cli/route.rb +4 -2
- data/lib/wayfarer/cli/templates/job.rb.tt +3 -1
- data/lib/wayfarer/config/networking.rb +1 -1
- data/lib/wayfarer/config/struct.rb +1 -1
- data/lib/wayfarer/middleware/fetch.rb +15 -4
- data/lib/wayfarer/middleware/router.rb +34 -2
- data/lib/wayfarer/middleware/worker.rb +4 -24
- data/lib/wayfarer/networking/pool.rb +9 -8
- data/lib/wayfarer/page.rb +1 -1
- data/lib/wayfarer/routing/matchers/custom.rb +2 -0
- data/lib/wayfarer/routing/matchers/path.rb +1 -0
- data/lib/wayfarer/routing/route.rb +6 -0
- data/lib/wayfarer/routing/router.rb +27 -0
- data/lib/wayfarer/stringify.rb +13 -7
- data/lib/wayfarer.rb +3 -1
- data/spec/callbacks_spec.rb +2 -2
- data/spec/config/networking_spec.rb +2 -2
- data/spec/factories/{queue/middleware.rb → middleware.rb} +3 -3
- data/spec/factories/{queue/page.rb → page.rb} +3 -3
- data/spec/factories/{queue/task.rb → task.rb} +0 -0
- data/spec/fixtures/dummy_job.rb +1 -1
- data/spec/middleware/chain_spec.rb +17 -17
- data/spec/middleware/fetch_spec.rb +27 -11
- data/spec/middleware/router_spec.rb +34 -7
- data/spec/middleware/worker_spec.rb +3 -13
- data/spec/routing/router_spec.rb +24 -0
- data/wayfarer.gemspec +1 -1
- metadata +16 -8
- data/spec/factories/queue/chain.rb +0 -11
data/docs/guides/performance.md
CHANGED
@@ -2,6 +2,114 @@
|
|
2
2
|
|
3
3
|
How to write performant crawlers with Wayfarer.
|
4
4
|
|
5
|
+
## Use a sufficiently sized user agent pool
|
6
|
+
|
7
|
+
Automated browser processes or HTTP clients are kept in a [connection pool]() of
|
8
|
+
static size. This avoids having to re-establish browser processes and enables
|
9
|
+
their reuse.
|
10
|
+
|
11
|
+
If the size of the pool is too small, the pool is a
|
12
|
+
bottleneck. For example, if your message queue adapter uses 8 threads, but the
|
13
|
+
pool only contains 1 user agent, the remaining 7 threads block until the agent
|
14
|
+
is checked back in to the pool for use by one of the blocked threads.
|
15
|
+
|
16
|
+
There is no reliable way to detect the number of threads of the underlying
|
17
|
+
message queue adapter. The pool size should equal the number of threads:
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
Wayfarer.config.network.pool_size = 8 # defaults to 1
|
21
|
+
```
|
22
|
+
|
23
|
+
### Job shedding
|
24
|
+
|
25
|
+
There is a maximum number of seconds that jobs wait when checking out a user
|
26
|
+
agent from the pool. Once this time is exceeded,
|
27
|
+
a `Wayfarer::UserAgentTimeoutError` is raised. By default, the timeout is 10
|
28
|
+
seconds.
|
29
|
+
|
30
|
+
This indicates that there are more threads in use than user agents in the pool.
|
31
|
+
|
32
|
+
## Stage fewer URLs
|
33
|
+
|
34
|
+
Staging fewer URLs saves space and time:
|
35
|
+
|
36
|
+
* Fewer tasks written to the message queue
|
37
|
+
* Less time spent consuming tasks
|
38
|
+
* Less time spent filtering URLs with Redis
|
39
|
+
|
40
|
+
Wayfarer maintains a set of processed URLs for a batch in Redis. Every staged
|
41
|
+
URL is checked for inclusion in this set before it gets appended as a task to
|
42
|
+
the message queue.
|
43
|
+
|
44
|
+
A common pattern is to stage all links of a page, and rely on routing to fetch
|
45
|
+
only the relevant ones:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
class DummyJob < Wayfarer::Base
|
49
|
+
route { to: index, host: "example.com" }
|
50
|
+
|
51
|
+
def index
|
52
|
+
stage page.meta.links.all
|
53
|
+
end
|
54
|
+
end
|
55
|
+
```
|
56
|
+
|
57
|
+
Pages commonly contain a large number of URLs.
|
58
|
+
|
59
|
+
Every staged URL is:
|
60
|
+
|
61
|
+
1. Normalized to a canonical form, for example by sorting query parameters
|
62
|
+
alphabetically.
|
63
|
+
2. Checked for inclusion in the batch Redis set or discarded.
|
64
|
+
3. Written to the message queue.
|
65
|
+
4. Consumed from the queue and matched against the router.
|
66
|
+
5. Fetched, if a route matches.
|
67
|
+
|
68
|
+
Narrowing down the links in the document to follow speeds up the process.
|
69
|
+
For example, using Nokogiri, interesting links can be identified with a CSS
|
70
|
+
selector:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
class DummyJob < Wayfarer::Base
|
74
|
+
route { to: index, host: "example.com" }
|
75
|
+
|
76
|
+
def index
|
77
|
+
stage interesting_links
|
78
|
+
end
|
79
|
+
|
80
|
+
private
|
81
|
+
|
82
|
+
def interesting_links
|
83
|
+
page.doc.css("a.interesting").map { |elem| elem["href"] }
|
84
|
+
end
|
85
|
+
end
|
86
|
+
```
|
87
|
+
|
88
|
+
Because the router only accepts the single hostname `example.com`, the job can
|
89
|
+
also ensure it stages only internal URLs by intersecting them with the
|
90
|
+
interesting ones:
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
class DummyJob < Wayfarer::Base
|
94
|
+
route { to: index, host: "example.com" }
|
95
|
+
|
96
|
+
def index
|
97
|
+
stage interesting_internal_links
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
def interesting_internal_links
|
103
|
+
page.meta.links.internal & interesting_links
|
104
|
+
end
|
105
|
+
|
106
|
+
def interesting_links
|
107
|
+
page.doc.css("a.interesting").map { |elem| elem["href"] }
|
108
|
+
end
|
109
|
+
end
|
110
|
+
```
|
111
|
+
|
112
|
+
|
5
113
|
## Use Redis >= 6.2.0
|
6
114
|
|
7
115
|
Redis 6.2.0 introduced the
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Reliability
|
2
|
+
|
3
|
+
## Durability
|
4
|
+
|
5
|
+
Wayfarer executes atop reliable message queues such as Sidekiq, Resque,
|
6
|
+
RabbitMQ, etc. Its configuration is independent of the underlying queue
|
7
|
+
infrastructure it reads from and writes to.
|
8
|
+
|
9
|
+
## Self-healing user agents
|
10
|
+
|
11
|
+
Wayfarer handles the scenario where a remote browser process has crashed and
|
12
|
+
must be replaced by a fresh browser process.
|
13
|
+
|
14
|
+
This can be tested locally by automating a browser with headless mode turned
|
15
|
+
off, and then closing the opened browser window: The current job fails, but the
|
16
|
+
next job has access to a newly established browser session again.
|
17
|
+
|
18
|
+
For example, Ferrum might raise `Ferrum::DeadBrowserError`. Wayfarer's
|
19
|
+
user agents are self-healing and react to these kinds of errors internally. When
|
20
|
+
a browser window is closed, the Ferrum user agent attempts to establish a new
|
21
|
+
browser process as a replacement, for the next job to use.
|
22
|
+
|
23
|
+
[Wayfarer never swallows exceptions](/guides/error_handling). This means
|
24
|
+
that even though the user agent might heal itself, jobs still need to explicitly
|
25
|
+
retry browser errors:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
class Foobar < Wayfarer::Base
|
29
|
+
route { to: :index }
|
30
|
+
|
31
|
+
retry_on Ferrum::DeadBrowserError, attempts: 3, wait: :exponentially_longer
|
32
|
+
|
33
|
+
# ...
|
34
|
+
end
|
35
|
+
```
|
36
|
+
|
37
|
+
This leads to log entries like:
|
38
|
+
|
39
|
+
```
|
40
|
+
Retrying DummyJob in 3 seconds, due to a Ferrum::DeadBrowserError.
|
41
|
+
```
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Steering
|
2
|
+
|
3
|
+
A job's router can receive arguments computed dynamically by `::steer`.
|
4
|
+
Steering enables [batch routing](/cookbook/batch_routing).
|
5
|
+
|
6
|
+
For example, the following router has hostname and path hard-coded:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
class DummyJob < Wayfarer::Base
|
10
|
+
route do
|
11
|
+
host "example.com", path: "/contact", to: :index
|
12
|
+
end
|
13
|
+
end
|
14
|
+
```
|
15
|
+
|
16
|
+
Instead, hostname and path could be provided by `::steer`, too:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
class DummyJob < Wayfarer::Base
|
20
|
+
route do |hostname, path|
|
21
|
+
host hostname, path: path, to: :index
|
22
|
+
end
|
23
|
+
|
24
|
+
steer do |_task|
|
25
|
+
["example.com", "/contact"]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
29
|
+
|
30
|
+
Note that `steer` yields the current [task](/guides/tasks).
|
data/docs/guides/tasks.md
CHANGED
@@ -1,38 +1,14 @@
|
|
1
1
|
# Tasks
|
2
2
|
|
3
|
-
Tasks are the units of work processed by jobs. A task
|
3
|
+
Tasks are the immutable units of work processed by [jobs](/guides/jobs). A task
|
4
|
+
consists of:
|
4
5
|
|
5
|
-
1. The
|
6
|
-
|
7
|
-
|
8
|
-
Like URLs, batches are strings. Within a batch, every URL gets processed at most
|
9
|
-
once.
|
10
|
-
|
11
|
-
Tasks get appended to the end of a message queue, and consumed gfrom their
|
12
|
-
beginning by jobs.
|
13
|
-
|
14
|
-
When jobs process tasks, they search their routing tree for a matching route.
|
15
|
-
URLs that match no route are not retrieved, and their task considered
|
16
|
-
successfully processed without further action.
|
17
|
-
|
18
|
-
## Task Metadata
|
19
|
-
|
20
|
-
At runtime, tasks take the shape of a `Wayfarer::Task` object. While only URL
|
21
|
-
and batch are persisted to message queues, tasks carry an arbitrarily assignable
|
22
|
-
`metadata` object:
|
23
|
-
|
24
|
-
```ruby
|
25
|
-
task # => #<Task url="https://example.com" batch="547b761-d0ad-...">
|
26
|
-
task.metadata # => #<OpenStruct>
|
27
|
-
task.metadata.my_piece_of_information = "hello"
|
28
|
-
```
|
29
|
-
|
30
|
-
`task.metadata` is ephemeral and only accessible at runtime.
|
31
|
-
|
32
|
-
Once a job consumes a task, the job instance becomes accessible on it:
|
33
|
-
|
34
|
-
```
|
35
|
-
task.job # => #<DummyJob ...>
|
36
|
-
```
|
6
|
+
1. The __URL__ to process
|
7
|
+
* Within a batch, every URL gets processed at most once.
|
37
8
|
|
9
|
+
2. The __batch__ the task belongs to
|
10
|
+
* Like URLs, batches are strings.
|
38
11
|
|
12
|
+
Tasks get appended to the end of a message queue, and consumed from the
|
13
|
+
beginning. Because jobs can enqueue other tasks, jobs are both consumers
|
14
|
+
and producers of tasks.
|
data/docs/reference/api/base.md
CHANGED
@@ -1,162 +1,48 @@
|
|
1
1
|
---
|
2
|
-
title: Base
|
2
|
+
title: Wayfarer::Base
|
3
3
|
---
|
4
4
|
|
5
5
|
# `Wayfarer::Base`
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
* Router for connecting URLs with instance methods and collecting
|
10
|
-
data.
|
11
|
-
* Access to a parsed document representation.
|
12
|
-
* Access to the browser or HTTP connection that retrieved the document.
|
13
|
-
* Ability to stage URLs for future processing.
|
14
|
-
* Ability to inject middleware before or after the worker.
|
7
|
+
Wayfarer's complete job API.
|
15
8
|
|
16
9
|
---
|
17
10
|
|
18
|
-
### `::route
|
19
|
-
:
|
20
|
-
|
21
|
-
##### Example
|
22
|
-
|
23
|
-
!!! example "Defining routes"
|
11
|
+
### `::route`
|
12
|
+
: Draw routes to instance methods.
|
24
13
|
|
25
|
-
|
26
|
-
class DummyJob < Wayfarer::Base
|
27
|
-
route.to :index
|
14
|
+
---
|
28
15
|
|
29
|
-
|
30
|
-
|
31
|
-
end
|
32
|
-
```
|
16
|
+
### `::steer { (Wayfarer::Task) -> [any] }`
|
17
|
+
: Provide router arguments.
|
33
18
|
|
34
19
|
---
|
35
20
|
|
36
21
|
### `#task -> Wayfarer::Task`
|
37
22
|
: The currently processing task.
|
38
23
|
|
39
|
-
!!! example "Inspecting the current task"
|
40
|
-
|
41
|
-
```ruby
|
42
|
-
class DummyJob < Wayfarer::Base
|
43
|
-
route.to :index
|
44
|
-
|
45
|
-
def index
|
46
|
-
task # => #<Wayfarer::Task ...>
|
47
|
-
task.url # => "https://example.com"
|
48
|
-
task.batch # => "2287ae65-359e-4dc0-..."
|
49
|
-
end
|
50
|
-
end
|
51
|
-
```
|
52
|
-
|
53
24
|
---
|
54
25
|
|
55
26
|
### `#params -> Hash`
|
56
27
|
: URL parameters collected from the matching route.
|
57
28
|
|
58
|
-
!!! example "Accessing URL parameters"
|
59
|
-
|
60
|
-
```ruby
|
61
|
-
class DummyJob < Wayfarer::Base
|
62
|
-
route.path "/users/:user_id/images/:id", to: :index
|
63
|
-
|
64
|
-
def index
|
65
|
-
params # => { "user_id" => ..., "id" => ... }
|
66
|
-
end
|
67
|
-
end
|
68
|
-
```
|
69
|
-
|
70
29
|
---
|
71
30
|
|
72
|
-
### `#stage(
|
31
|
+
### `#stage(String | [String]) -> void`
|
73
32
|
: Add URLs to a processing set. URLs already processed within the
|
74
33
|
current batch get discarded and are not enqueued. Every staged URL gets
|
75
34
|
normalized.
|
76
35
|
|
77
|
-
!!! example "Staging a URL"
|
78
|
-
|
79
|
-
```ruby
|
80
|
-
class DummyJob < Wayfarer::Base
|
81
|
-
route.to :index
|
82
|
-
|
83
|
-
def index
|
84
|
-
stage "https://example.com"
|
85
|
-
end
|
86
|
-
end
|
87
|
-
```
|
88
|
-
|
89
|
-
!!! example "Staging all URLs contained in the current page"
|
90
|
-
|
91
|
-
```ruby
|
92
|
-
class DummyJob < Wayfarer::Base
|
93
|
-
route.to :index
|
94
|
-
|
95
|
-
def index
|
96
|
-
stage page.meta.links.all
|
97
|
-
end
|
98
|
-
end
|
99
|
-
```
|
100
|
-
|
101
36
|
---
|
102
37
|
|
103
|
-
### `#browser ->
|
104
|
-
: The
|
105
|
-
If the configured agent is the default `:http`, `nil` is returned.
|
106
|
-
|
107
|
-
Guides:
|
108
|
-
|
109
|
-
* [Ferrum (Chrome DevTools Protocol)]()
|
110
|
-
* [Selenium]()
|
111
|
-
|
112
|
-
!!! example "Accessing a Google Chrome process"
|
113
|
-
|
114
|
-
```ruby
|
115
|
-
Wayfarer.config.network.agent = :ferrum
|
116
|
-
|
117
|
-
class DummyJob < Wayfarer::Base
|
118
|
-
route.to :index
|
119
|
-
|
120
|
-
def index
|
121
|
-
browser # => #<Ferrum::Browser ...>
|
122
|
-
end
|
123
|
-
end
|
124
|
-
```
|
125
|
-
|
126
|
-
!!! example "Accessing a Selenium WebDriver"
|
127
|
-
|
128
|
-
```ruby
|
129
|
-
Wayfarer.config.network.agent = :selenium
|
130
|
-
|
131
|
-
class DummyJob < Wayfarer::Base
|
132
|
-
route.to :index
|
133
|
-
|
134
|
-
def index
|
135
|
-
browser # => #<Selenium::WebDriver ...>
|
136
|
-
end
|
137
|
-
end
|
138
|
-
```
|
38
|
+
### `#browser -> Object`
|
39
|
+
: The user agent that retrieved the current page.
|
139
40
|
|
140
41
|
---
|
141
42
|
|
142
|
-
### `#page(live: false) -> Page`
|
43
|
+
### `#page(live: true | false) -> Page`
|
143
44
|
: The page representing the response retrieved from the currently
|
144
45
|
processing URL.
|
145
46
|
|
146
|
-
With `
|
147
|
-
browser DOM.
|
148
|
-
`page()` without the keyword return the most recent page.
|
149
|
-
|
150
|
-
---
|
151
|
-
|
152
|
-
### `#doc -> Nokogiri::HTML | Nokogiri::XML | Hash`
|
153
|
-
: The parsed HTTP response body depending on the Content-Type:
|
154
|
-
* When XML or HTML then a parsed Nokogiri document
|
155
|
-
* When JSON, a parsed Hash
|
156
|
-
|
157
|
-
---
|
158
|
-
|
159
|
-
### `#middleware -> [Middleware]`
|
160
|
-
: Template method that allows workers to inject middleware before or
|
161
|
-
after themselves.
|
162
|
-
|
47
|
+
When called with `live: true`, a fresh `Page` is returned that reflects the
|
48
|
+
current browser DOM. Calls to `#page` return the most recent page.
|
data/docs/reference/api/route.md
CHANGED
data/docs/reference/cli.md
CHANGED
@@ -14,14 +14,6 @@ All [environment variables](../environment_variables) are respected.
|
|
14
14
|
|
15
15
|
: Generates a new project directory `NAME`.
|
16
16
|
|
17
|
-
##### Example
|
18
|
-
|
19
|
-
!!! example "Create a new project directory"
|
20
|
-
|
21
|
-
```
|
22
|
-
wayfarer generate project foobar
|
23
|
-
```
|
24
|
-
|
25
17
|
## `wayfarer job`
|
26
18
|
|
27
19
|
### `wayfarer job perform JOB URL`
|
@@ -35,20 +27,6 @@ All [environment variables](../environment_variables) are respected.
|
|
35
27
|
talking to an actual server.
|
36
28
|
* `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
|
37
29
|
|
38
|
-
##### Examples
|
39
|
-
|
40
|
-
!!! example "Perform a job"
|
41
|
-
|
42
|
-
```
|
43
|
-
wayfarer job perform DummyJob https://example.com
|
44
|
-
```
|
45
|
-
|
46
|
-
!!! example "Specify a batch"
|
47
|
-
|
48
|
-
```
|
49
|
-
wayfarer job perform --batch=my-batch DummyJob https://example.com
|
50
|
-
```
|
51
|
-
|
52
30
|
### `wayfarer job enqueue JOB URL`
|
53
31
|
|
54
32
|
: Enqueues `JOB` with `URL` to the configured Active Job backend.
|
@@ -57,20 +35,6 @@ All [environment variables](../environment_variables) are respected.
|
|
57
35
|
|
58
36
|
* `--batch=BATCH`: Set the job's batch. By default, a UUID is generated.
|
59
37
|
|
60
|
-
##### Examples
|
61
|
-
|
62
|
-
!!! example "Enqueue a job"
|
63
|
-
|
64
|
-
```
|
65
|
-
wayfarer job enqueue DummyJob https://example.com
|
66
|
-
```
|
67
|
-
|
68
|
-
!!! example "Specify a batch"
|
69
|
-
|
70
|
-
```
|
71
|
-
wayfarer job enqueue --batch=my-batch DummyJob https://example.com
|
72
|
-
```
|
73
|
-
|
74
38
|
### `wayfarer job execute JOB URL`
|
75
39
|
|
76
40
|
: Execute `JOB` with `URL` by using the
|
@@ -86,54 +50,12 @@ All [environment variables](../environment_variables) are respected.
|
|
86
50
|
* `--min-threads`: Minimum number of threads to use. Default: 1
|
87
51
|
* `--max-threads`: Maximum number of threads to use. Default: 1
|
88
52
|
|
89
|
-
##### Examples
|
90
|
-
|
91
|
-
!!! example "Enqueue a job"
|
92
|
-
|
93
|
-
```
|
94
|
-
wayfarer job execute DummyJob https://example.com
|
95
|
-
```
|
96
|
-
|
97
|
-
!!! example "Mock Redis"
|
98
|
-
|
99
|
-
```
|
100
|
-
wayfarer job execute --mock-redis DummyJob https://example.com
|
101
|
-
```
|
102
|
-
|
103
|
-
!!! example "Specify a batch"
|
104
|
-
|
105
|
-
```
|
106
|
-
wayfarer job execute --batch=my-batch DummyJob https://example.com
|
107
|
-
```
|
108
|
-
|
109
|
-
!!! example "Use up to 4 threads"
|
110
|
-
|
111
|
-
```
|
112
|
-
wayfarer job execute --min-threads=1 --max-threads=4 DummyJob https://example.com
|
113
|
-
```
|
114
|
-
|
115
53
|
## `wayfarer route`
|
116
54
|
|
117
55
|
### `wayfarer route result JOB URL`
|
118
56
|
|
119
57
|
: Prints the result of invoking `JOB`'s router with `URL`.
|
120
58
|
|
121
|
-
##### Example
|
122
|
-
|
123
|
-
!!! example "Route a URL"
|
124
|
-
|
125
|
-
```
|
126
|
-
wayfarer route result DummyJob https://example.com
|
127
|
-
```
|
128
|
-
|
129
59
|
### `wayfarer route tree JOB URL`
|
130
60
|
|
131
61
|
: Visualises the routing tree result of invoking `JOB`'s router with `URL`.
|
132
|
-
|
133
|
-
##### Example
|
134
|
-
|
135
|
-
!!! example "Visualise the routing tree"
|
136
|
-
|
137
|
-
```
|
138
|
-
wayfarer route tree DummyJob https://example.com
|
139
|
-
```
|
@@ -10,7 +10,7 @@ hide:
|
|
10
10
|
| Runtime config key | Environment variable | Description | Default | Supported values |
|
11
11
|
| ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
|
12
12
|
| `network.agent` | `WAYFARER_NETWORK_AGENT` | The user agent to use. | `:http` | `:http`, `:ferrum`, `:selenium` |
|
13
|
-
| `network.pool_size` | `WAYFARER_NETWORK_POOL_SIZE` | How many user agents to spawn. |
|
13
|
+
| `network.pool_size` | `WAYFARER_NETWORK_POOL_SIZE` | How many user agents to spawn. | 1 | Integers |
|
14
14
|
| `network.pool_timeout` | `WAYFARER_NETWORK_POOL_TIMEOUT` | How long jobs may use an agent in seconds. | 10 | Integers |
|
15
15
|
| `network.http_headers` | `WAYFARER_NETWORK_HTTP_HEADERS` | HTTP headers to append to requests. | `{}` | Hashes |
|
16
16
|
|
data/lib/wayfarer/cli/job.rb
CHANGED
@@ -13,12 +13,10 @@ module Wayfarer
|
|
13
13
|
|
14
14
|
url = Addressable::URI.parse(url)
|
15
15
|
job = job.classify.constantize.new
|
16
|
-
task = Wayfarer::Task.new(url,
|
16
|
+
task = Wayfarer::Task.new(url, options[:batch])
|
17
17
|
job.arguments.push(task)
|
18
18
|
job.perform(task)
|
19
19
|
GC.new(job).run
|
20
|
-
|
21
|
-
free_agent_pool
|
22
20
|
end
|
23
21
|
|
24
22
|
desc "enqueue JOB URL",
|
data/lib/wayfarer/cli/route.rb
CHANGED
@@ -11,7 +11,8 @@ module Wayfarer
|
|
11
11
|
load_environment
|
12
12
|
url = Addressable::URI.parse(url)
|
13
13
|
job = job.classify.constantize
|
14
|
-
|
14
|
+
job.router.invoke(url, job.new.steer)
|
15
|
+
say Wayfarer::Routing::PathFinder.result(job.router.root, url)
|
15
16
|
end
|
16
17
|
|
17
18
|
desc "tree JOB URL",
|
@@ -20,7 +21,8 @@ module Wayfarer
|
|
20
21
|
load_environment
|
21
22
|
url = Addressable::URI.parse(url)
|
22
23
|
job = job.classify.constantize
|
23
|
-
|
24
|
+
job.router.invoke(url, job.new.steer)
|
25
|
+
Wayfarer::CLI::RoutePrinter.print(job.router.root, url)
|
24
26
|
end
|
25
27
|
end
|
26
28
|
end
|
@@ -39,7 +39,7 @@ module Wayfarer
|
|
39
39
|
|
40
40
|
def define_reader(key, env_key: nil, type: nil, default: nil)
|
41
41
|
define_singleton_method(key.to_sym) do
|
42
|
-
get(key) || set(key,
|
42
|
+
get(key) || set(key, env_val(env_key, type) || default)
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -3,6 +3,18 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
5
|
class Fetch
|
6
|
+
module API
|
7
|
+
def agent
|
8
|
+
task.metadata.agent
|
9
|
+
end
|
10
|
+
|
11
|
+
def page(live: false)
|
12
|
+
return task.metadata.page unless live
|
13
|
+
|
14
|
+
task.metadata.page = agent.live&.page || task.metadata.page
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
6
18
|
include Wayfarer::Middleware::Stage::API
|
7
19
|
|
8
20
|
attr_reader :pool
|
@@ -15,17 +27,16 @@ module Wayfarer
|
|
15
27
|
def call(task)
|
16
28
|
self.task = task
|
17
29
|
|
18
|
-
pool.with do |
|
19
|
-
task.metadata.agent = agent
|
20
|
-
|
30
|
+
pool.with do |context|
|
21
31
|
result = task.job.run_callbacks :fetch do
|
22
|
-
|
32
|
+
context.fetch(task.url)
|
23
33
|
end
|
24
34
|
|
25
35
|
case result
|
26
36
|
when Networking::Result::Redirect
|
27
37
|
stage(result.redirect_url)
|
28
38
|
when Networking::Result::Success
|
39
|
+
task.metadata.agent = context.instance
|
29
40
|
task.metadata.page = result.page
|
30
41
|
yield if block_given?
|
31
42
|
end
|
@@ -3,10 +3,42 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
5
|
class Router
|
6
|
+
module API
|
7
|
+
def self.included(base)
|
8
|
+
base.include(InstanceMethods)
|
9
|
+
base.extend(ClassMethods)
|
10
|
+
end
|
11
|
+
|
12
|
+
module InstanceMethods
|
13
|
+
def steer
|
14
|
+
[]
|
15
|
+
end
|
16
|
+
|
17
|
+
def params
|
18
|
+
task.metadata.params
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
module ClassMethods
|
23
|
+
def router
|
24
|
+
@router ||= Wayfarer::Routing::Router.new
|
25
|
+
end
|
26
|
+
|
27
|
+
def route(&block)
|
28
|
+
router.draw(&block) if block_given?
|
29
|
+
end
|
30
|
+
|
31
|
+
def steer(&block)
|
32
|
+
define_method(:steer) { block.call(task) }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
6
37
|
def call(task)
|
7
|
-
|
38
|
+
router = task.job.class.router
|
39
|
+
url = Addressable::URI.parse(task.url)
|
8
40
|
|
9
|
-
case result =
|
41
|
+
case result = router.invoke(url, task.job.steer)
|
10
42
|
when Routing::Result::Mismatch
|
11
43
|
return
|
12
44
|
when Routing::Result::Match
|