wayfarer 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env +17 -0
- data/.github/workflows/lint.yaml +8 -6
- data/.github/workflows/release.yaml +4 -3
- data/.github/workflows/tests.yaml +5 -14
- data/.gitignore +2 -2
- data/.rubocop.yml +31 -0
- data/.vale.ini +6 -3
- data/Dockerfile +3 -2
- data/Gemfile +21 -0
- data/Gemfile.lock +233 -128
- data/Rakefile +7 -0
- data/docker-compose.yml +13 -14
- data/docs/guides/callbacks.md +3 -1
- data/docs/guides/configuration.md +10 -35
- data/docs/guides/development.md +67 -0
- data/docs/guides/handlers.md +7 -7
- data/docs/guides/jobs.md +54 -11
- data/docs/guides/networking/custom_adapters.md +31 -10
- data/docs/guides/pages.md +24 -22
- data/docs/guides/routing.md +116 -34
- data/docs/guides/tasks.md +30 -10
- data/docs/guides/tutorial.md +23 -17
- data/docs/guides/user_agents.md +11 -9
- data/lib/wayfarer/base.rb +9 -8
- data/lib/wayfarer/batch_completion.rb +18 -14
- data/lib/wayfarer/callbacks.rb +14 -14
- data/lib/wayfarer/cli/route_printer.rb +78 -96
- data/lib/wayfarer/cli.rb +12 -30
- data/lib/wayfarer/gc.rb +6 -1
- data/lib/wayfarer/kv.rb +28 -0
- data/lib/wayfarer/middleware/chain.rb +7 -1
- data/lib/wayfarer/middleware/content_type.rb +20 -15
- data/lib/wayfarer/middleware/dedup.rb +9 -3
- data/lib/wayfarer/middleware/dispatch.rb +7 -2
- data/lib/wayfarer/middleware/normalize.rb +4 -12
- data/lib/wayfarer/middleware/router.rb +1 -1
- data/lib/wayfarer/middleware/uri_parser.rb +4 -3
- data/lib/wayfarer/networking/context.rb +12 -1
- data/lib/wayfarer/networking/ferrum.rb +1 -4
- data/lib/wayfarer/networking/follow.rb +2 -1
- data/lib/wayfarer/networking/pool.rb +12 -7
- data/lib/wayfarer/networking/selenium.rb +15 -7
- data/lib/wayfarer/page.rb +0 -2
- data/lib/wayfarer/parsing/xml.rb +1 -1
- data/lib/wayfarer/parsing.rb +2 -5
- data/lib/wayfarer/redis/barrier.rb +15 -2
- data/lib/wayfarer/redis/counter.rb +1 -2
- data/lib/wayfarer/routing/dsl.rb +166 -31
- data/lib/wayfarer/routing/hash_stack.rb +33 -0
- data/lib/wayfarer/routing/matchers/custom.rb +8 -5
- data/lib/wayfarer/routing/matchers/{suffix.rb → empty_params.rb} +2 -6
- data/lib/wayfarer/routing/matchers/host.rb +15 -9
- data/lib/wayfarer/routing/matchers/path.rb +11 -33
- data/lib/wayfarer/routing/matchers/query.rb +41 -17
- data/lib/wayfarer/routing/matchers/result.rb +12 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +13 -5
- data/lib/wayfarer/routing/matchers/url.rb +13 -5
- data/lib/wayfarer/routing/path_consumer.rb +130 -0
- data/lib/wayfarer/routing/path_finder.rb +151 -23
- data/lib/wayfarer/routing/result.rb +1 -1
- data/lib/wayfarer/routing/root_route.rb +14 -2
- data/lib/wayfarer/routing/route.rb +71 -14
- data/lib/wayfarer/routing/serializable.rb +28 -0
- data/lib/wayfarer/routing/sub_route.rb +53 -0
- data/lib/wayfarer/routing/target_route.rb +17 -1
- data/lib/wayfarer/stringify.rb +1 -2
- data/lib/wayfarer/task.rb +3 -5
- data/lib/wayfarer/uri/normalization.rb +120 -0
- data/lib/wayfarer.rb +50 -10
- data/mise.toml +2 -0
- data/mkdocs.yml +8 -17
- data/rake/lint.rake +0 -96
- data/rake/release.rake +5 -11
- data/rake/tests.rake +8 -4
- data/requirements.txt +1 -1
- data/spec/factories/job.rb +8 -0
- data/spec/factories/middleware.rb +2 -2
- data/spec/factories/path_finder.rb +11 -0
- data/spec/factories/redis.rb +19 -0
- data/spec/factories/task.rb +39 -1
- data/spec/spec_helpers.rb +50 -57
- data/spec/support/active_job_helpers.rb +8 -0
- data/spec/support/integration_helpers.rb +21 -0
- data/spec/support/redis_helpers.rb +9 -0
- data/spec/support/test_app.rb +64 -43
- data/spec/{base_spec.rb → wayfarer/base_spec.rb} +32 -36
- data/spec/wayfarer/batch_completion_spec.rb +142 -0
- data/spec/wayfarer/cli/job_spec.rb +88 -0
- data/spec/wayfarer/cli/routing_spec.rb +322 -0
- data/spec/{cli → wayfarer/cli}/version_spec.rb +1 -1
- data/spec/wayfarer/gc_spec.rb +29 -0
- data/spec/{handler_spec.rb → wayfarer/handler_spec.rb} +1 -3
- data/spec/{integration → wayfarer/integration}/callbacks_spec.rb +9 -6
- data/spec/wayfarer/integration/content_type_spec.rb +37 -0
- data/spec/wayfarer/integration/custom_routing_spec.rb +51 -0
- data/spec/{integration → wayfarer/integration}/gc_spec.rb +9 -13
- data/spec/{integration → wayfarer/integration}/handler_spec.rb +9 -10
- data/spec/{integration → wayfarer/integration}/page_spec.rb +8 -6
- data/spec/{integration → wayfarer/integration}/params_spec.rb +4 -4
- data/spec/{integration → wayfarer/integration}/parsing_spec.rb +7 -33
- data/spec/wayfarer/integration/retry_spec.rb +112 -0
- data/spec/{integration → wayfarer/integration}/stage_spec.rb +5 -5
- data/spec/{middleware → wayfarer/middleware}/batch_completion_spec.rb +4 -5
- data/spec/{middleware → wayfarer/middleware}/chain_spec.rb +20 -15
- data/spec/{middleware → wayfarer/middleware}/content_type_spec.rb +18 -21
- data/spec/{middleware → wayfarer/middleware}/controller_spec.rb +22 -20
- data/spec/wayfarer/middleware/dedup_spec.rb +66 -0
- data/spec/wayfarer/middleware/normalize_spec.rb +32 -0
- data/spec/{middleware → wayfarer/middleware}/router_spec.rb +18 -20
- data/spec/{middleware → wayfarer/middleware}/stage_spec.rb +11 -10
- data/spec/wayfarer/middleware/uri_parser_spec.rb +63 -0
- data/spec/{middleware → wayfarer/middleware}/user_agent_spec.rb +34 -32
- data/spec/wayfarer/networking/capybara_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/context_spec.rb +46 -38
- data/spec/wayfarer/networking/ferrum_spec.rb +13 -0
- data/spec/{networking → wayfarer/networking}/follow_spec.rb +9 -4
- data/spec/wayfarer/networking/http_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/pool_spec.rb +11 -9
- data/spec/wayfarer/networking/selenium_spec.rb +12 -0
- data/spec/{networking → wayfarer/networking}/strategy.rb +33 -54
- data/spec/{page_spec.rb → wayfarer/page_spec.rb} +3 -3
- data/spec/{parsing → wayfarer/parsing}/json_spec.rb +1 -1
- data/spec/{parsing/xml_spec.rb → wayfarer/parsing/xml_parse_spec.rb} +4 -3
- data/spec/{redis → wayfarer/redis}/barrier_spec.rb +5 -4
- data/spec/wayfarer/redis/counter_spec.rb +34 -0
- data/spec/{redis → wayfarer/redis}/pool_spec.rb +3 -2
- data/spec/{routing → wayfarer/routing}/dsl_spec.rb +12 -22
- data/spec/wayfarer/routing/hash_stack_spec.rb +63 -0
- data/spec/wayfarer/routing/integration_spec.rb +101 -0
- data/spec/wayfarer/routing/matchers/custom_spec.rb +39 -0
- data/spec/wayfarer/routing/matchers/host_spec.rb +56 -0
- data/spec/wayfarer/routing/matchers/matcher.rb +17 -0
- data/spec/wayfarer/routing/matchers/path_spec.rb +43 -0
- data/spec/wayfarer/routing/matchers/query_spec.rb +123 -0
- data/spec/wayfarer/routing/matchers/scheme_spec.rb +45 -0
- data/spec/wayfarer/routing/matchers/url_spec.rb +33 -0
- data/spec/wayfarer/routing/path_consumer_spec.rb +123 -0
- data/spec/wayfarer/routing/path_finder_spec.rb +409 -0
- data/spec/wayfarer/routing/root_route_spec.rb +51 -0
- data/spec/wayfarer/routing/route_spec.rb +74 -0
- data/spec/wayfarer/routing/sub_route_spec.rb +103 -0
- data/spec/wayfarer/uri/normalization_spec.rb +98 -0
- data/spec/wayfarer_spec.rb +2 -2
- data/wayfarer.gemspec +17 -28
- metadata +768 -246
- data/.rbenv-gemsets +0 -1
- data/.ruby-version +0 -1
- data/RELEASING.md +0 -17
- data/docs/cookbook/user_agent.md +0 -7
- data/docs/design.md +0 -36
- data/docs/guides/jobs/error_handling.md +0 -40
- data/docs/reference/configuration.md +0 -36
- data/spec/batch_completion_spec.rb +0 -104
- data/spec/cli/job_spec.rb +0 -74
- data/spec/cli/routing_spec.rb +0 -101
- data/spec/fixtures/dummy_job.rb +0 -9
- data/spec/gc_spec.rb +0 -17
- data/spec/integration/content_type_spec.rb +0 -145
- data/spec/integration/routing_spec.rb +0 -18
- data/spec/middleware/dedup_spec.rb +0 -71
- data/spec/middleware/dispatch_spec.rb +0 -59
- data/spec/middleware/normalize_spec.rb +0 -60
- data/spec/middleware/uri_parser_spec.rb +0 -53
- data/spec/networking/capybara_spec.rb +0 -12
- data/spec/networking/ferrum_spec.rb +0 -12
- data/spec/networking/http_spec.rb +0 -12
- data/spec/networking/selenium_spec.rb +0 -12
- data/spec/redis/counter_spec.rb +0 -44
- data/spec/routing/integration_spec.rb +0 -110
- data/spec/routing/matchers/custom_spec.rb +0 -31
- data/spec/routing/matchers/host_spec.rb +0 -49
- data/spec/routing/matchers/path_spec.rb +0 -43
- data/spec/routing/matchers/query_spec.rb +0 -137
- data/spec/routing/matchers/scheme_spec.rb +0 -25
- data/spec/routing/matchers/suffix_spec.rb +0 -41
- data/spec/routing/matchers/uri_spec.rb +0 -27
- data/spec/routing/path_finder_spec.rb +0 -33
- data/spec/routing/root_route_spec.rb +0 -29
- data/spec/routing/route_spec.rb +0 -43
- data/docs/{reference → guides}/cli.md +0 -0
- data/spec/{stringify_spec.rb → wayfarer/stringify_spec.rb} +2 -2
- /data/spec/{task_spec.rb → wayfarer/task_spec.rb} +0 -0
data/docs/guides/tutorial.md
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
# Tutorial
|
2
2
|
|
3
3
|
Wayfarer is a web crawling framework written in Ruby.
|
4
|
-
It works with plain HTTP
|
5
|
-
Redis and a message queue
|
6
|
-
|
4
|
+
It works with plain HTTP and by automating web browsers interchangeably
|
5
|
+
and is deployed with Redis and a message queue.
|
6
|
+
During development it can execute fully in memory, without Redis.
|
7
7
|
|
8
|
-
|
8
|
+
## Getting started
|
9
9
|
|
10
|
-
|
11
|
-
ActiveJob and Wayfarer:
|
10
|
+
In an empty directory, generate a new `Gemfile` and install Wayfarer:
|
12
11
|
|
13
12
|
```sh
|
14
13
|
bundle init
|
@@ -22,14 +21,16 @@ Wayfarer builds on Active Job, the message queue abstraction of Rails.
|
|
22
21
|
You can use Wayfarer without Rails of course, as we do here.
|
23
22
|
|
24
23
|
A message queue supports two operations: appending messages to the end and consuming
|
25
|
-
messages from the front.
|
26
|
-
|
27
|
-
|
24
|
+
messages from the front. This is how Wayfarer processes tasks, a string pair
|
25
|
+
of URL and batch. Wayfarer enforces that URLs are not processed more than
|
26
|
+
once within their batch (excluding retries).
|
28
27
|
|
29
|
-
|
28
|
+
When a task is consumed, it is processed by a job, a Ruby class.
|
29
|
+
|
30
|
+
Let's give ourselves a `dummy_job.rb` that routes all URLs to its
|
30
31
|
`index` instance method, where we print the current `task`:
|
31
32
|
|
32
|
-
```ruby
|
33
|
+
```ruby title="dummy_job.rb"
|
33
34
|
require "activejob"
|
34
35
|
require "wayfarer"
|
35
36
|
|
@@ -44,17 +45,22 @@ class DummyJob < ActiveJob::Base
|
|
44
45
|
end
|
45
46
|
```
|
46
47
|
|
47
|
-
We can perform our job from the command line with
|
48
|
-
|
49
|
-
UUID for
|
48
|
+
We can perform our job from the command line with the `wayfarer perform`
|
49
|
+
subcommand. In between ActiveJob's log output, we see that Wayfarer
|
50
|
+
has generated a UUID for the batch since we did not pass it:
|
50
51
|
|
51
|
-
```
|
52
|
+
```sh
|
52
53
|
bundle exec wayfarer perform -r dummy_job.rb DummyJob https://example.com
|
54
|
+
```
|
55
|
+
|
56
|
+
```hl_lines="2"
|
53
57
|
[ActiveJob] [DummyJob] [68853491-...] Performing DummyJob (Job ID: 68853491-...) from Async(default) with arguments: #<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
54
58
|
#<Wayfarer::Task url="https://example.com", batch="63d14035-...">
|
55
59
|
[ActiveJob] [DummyJob] [68853491-...] Performed DummyJob (Job ID: 68853491-) from Async(default) in 507.65ms
|
56
60
|
```
|
57
61
|
|
58
|
-
|
59
|
-
|
62
|
+
If you don't provide a batch, Wayfarer uses a generated UUID instead.
|
63
|
+
We could have also used `DummyJob.crawl
|
64
|
+
|
65
|
+
|
60
66
|
|
data/docs/guides/user_agents.md
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
# User agents
|
2
2
|
|
3
3
|
User agents are used by [jobs](../jobs) to retrieve the contents behind a URL into a
|
4
|
-
[page](../pages)
|
5
|
-
|
4
|
+
[page](../pages), for example a remotely controlled Firefox process or a Ruby HTTP client.
|
5
|
+
|
6
|
+
User agents are kept in a connection pool and all user agents in the pool
|
7
|
+
share the same type and configuration. You can add your own custom user agents by implementing
|
6
8
|
the [user agent API](custom_user_agents.md).
|
7
9
|
|
8
10
|
Wayfarer comes with the following built-in user agents:
|
9
11
|
|
10
|
-
* [
|
11
|
-
* [
|
12
|
-
* [
|
13
|
-
* [
|
12
|
+
* [`:http`](http.md) (default)
|
13
|
+
* [`:ferrum`](ferrum.md) to automate Google Chrome
|
14
|
+
* [`:selenium`](selenium.md) to automate a variety of browsers
|
15
|
+
* [`:capybara`](capybara.md) to use Capybara sessions
|
14
16
|
|
15
17
|
Configure the user agent with the global configuration option:
|
16
18
|
|
@@ -53,7 +55,7 @@ class DummyJob < ActiveJob::Base
|
|
53
55
|
end
|
54
56
|
```
|
55
57
|
|
56
|
-
!!! info "`#fetch`
|
58
|
+
!!! info "`#fetch` respects `Wayfarer.config.network.http_headers` for all provided user agents."
|
57
59
|
|
58
60
|
## HTTP request headers
|
59
61
|
|
@@ -78,7 +80,7 @@ underlying message queue operates with. For example, if you use Sidekiq,
|
|
78
80
|
you should set the pool size to the number of Sidekiq threads:
|
79
81
|
|
80
82
|
```ruby
|
81
|
-
Wayfarer.config[:network][:
|
83
|
+
Wayfarer.config[:network][:pool][:size] = Sidekiq.options[:concurrency]
|
82
84
|
```
|
83
85
|
|
84
86
|
!!! attention "The connection pool size is 1 by default"
|
@@ -109,5 +111,5 @@ You can configure the timeout, although you will likely want to increase the
|
|
109
111
|
pool size instead:
|
110
112
|
|
111
113
|
```ruby
|
112
|
-
Wayfarer.config[:network][:
|
114
|
+
Wayfarer.config[:network][:pool][:timeout] = 10 # seconds
|
113
115
|
```
|
data/lib/wayfarer/base.rb
CHANGED
@@ -4,15 +4,16 @@ module Wayfarer
|
|
4
4
|
# @!attribute [r] task
|
5
5
|
# @return [Wayfarer::Task] the current task
|
6
6
|
# @!attribute [r] uri
|
7
|
-
# @return [Addressable::URI]
|
7
|
+
# @return [Addressable::URI] parsed task URL
|
8
8
|
# @!attribute [r] user_agent
|
9
|
-
# @return [Object] the user agent
|
9
|
+
# @return [Object] the user agent used to retrieve the page
|
10
10
|
# @!attribute [r] action
|
11
|
-
# @return [Symbol,
|
11
|
+
# @return [Symbol, Handler] action that the task URL was routed to
|
12
12
|
# @!attribute [r] params
|
13
13
|
# @return [HashWithIndifferentAccess] path parameters collected from routes
|
14
14
|
module Base
|
15
15
|
extend ActiveSupport::Concern
|
16
|
+
|
16
17
|
# @!method stage(urls)
|
17
18
|
# Adds URLs to an internal staging set so that they get enqueued
|
18
19
|
# eventually, once the job executed successfully.
|
@@ -29,8 +30,8 @@ module Wayfarer
|
|
29
30
|
# @!method page(live: false)
|
30
31
|
# @param url [live] whether to retrieve a new {Page}.
|
31
32
|
# @return [Wayfarer::Page]
|
32
|
-
#
|
33
|
-
#
|
33
|
+
# The most recently retrieved page or a new page for the current task URL if
|
34
|
+
# the `live` keyword is passed.
|
34
35
|
|
35
36
|
# @!scope class
|
36
37
|
|
@@ -111,15 +112,15 @@ module Wayfarer
|
|
111
112
|
included do
|
112
113
|
include Wayfarer::Middleware::Controller
|
113
114
|
|
114
|
-
# Implement ActiveJob's #perform by calling into our own middleware
|
115
|
+
# Implement ActiveJob's #perform by calling into our own middleware
|
116
|
+
# chain included from {Controller}
|
115
117
|
alias_method :perform, :call
|
116
118
|
|
117
|
-
# Middleware stack
|
118
119
|
use Wayfarer::Middleware::Redis
|
119
|
-
use Wayfarer::Middleware::BatchCompletion
|
120
120
|
use Wayfarer::Middleware::UriParser
|
121
121
|
use Wayfarer::Middleware::Normalize
|
122
122
|
use Wayfarer::Middleware::Dedup
|
123
|
+
use Wayfarer::Middleware::BatchCompletion
|
123
124
|
use Wayfarer::Middleware::Stage
|
124
125
|
use Wayfarer::Middleware::Router
|
125
126
|
use Wayfarer::Middleware::UserAgent
|
@@ -14,10 +14,10 @@ module Wayfarer
|
|
14
14
|
module BatchCompletion
|
15
15
|
module_function
|
16
16
|
|
17
|
+
EVENTS = %w[enqueue.active_job perform.active_job retry_stopped.active_job].freeze
|
18
|
+
|
17
19
|
def subscribe!
|
18
|
-
ActiveSupport::Notifications.subscribe(
|
19
|
-
ActiveSupport::Notifications.subscribe("perform.active_job", self)
|
20
|
-
ActiveSupport::Notifications.subscribe("retry_stopped.active_job", self)
|
20
|
+
EVENTS.each { |event| ActiveSupport::Notifications.subscribe(event, self) }
|
21
21
|
end
|
22
22
|
|
23
23
|
def call(name, _, _, _, data)
|
@@ -26,25 +26,29 @@ module Wayfarer
|
|
26
26
|
task = job.arguments.first
|
27
27
|
|
28
28
|
# In the case of `enqueue.active_job` middleware hasn't executed yet
|
29
|
-
task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
|
30
|
-
|
31
|
-
counter = Redis::Counter.new(task) do
|
32
|
-
job.run_callbacks(:batch)
|
33
|
-
ensure
|
34
|
-
Wayfarer::GC.run(task)
|
35
|
-
end
|
29
|
+
task[:redis_pool] ||= Wayfarer::Redis::Pool.instance
|
36
30
|
|
37
|
-
handle(name, job, task
|
31
|
+
handle(name, job, task)
|
38
32
|
end
|
39
33
|
|
40
|
-
def handle(name, job, task
|
34
|
+
def handle(name, job, task)
|
35
|
+
counter = Wayfarer::Redis::Counter.new(task)
|
36
|
+
|
41
37
|
case name
|
42
38
|
when "enqueue.active_job" then counter.increment unless retry?(job)
|
43
|
-
when "perform.active_job" then counter
|
44
|
-
when "retry_stopped.active_job" then counter
|
39
|
+
when "perform.active_job" then succeed!(task, counter) if succeeded?(job, task)
|
40
|
+
when "retry_stopped.active_job" then fail!(counter)
|
45
41
|
end
|
46
42
|
end
|
47
43
|
|
44
|
+
def succeed!(task, counter)
|
45
|
+
Wayfarer::GC.run(task) if counter.decrement == 0
|
46
|
+
end
|
47
|
+
|
48
|
+
def fail!(counter)
|
49
|
+
counter.decrement
|
50
|
+
end
|
51
|
+
|
48
52
|
def retry?(job)
|
49
53
|
job.executions > 0
|
50
54
|
end
|
data/lib/wayfarer/callbacks.rb
CHANGED
@@ -13,32 +13,32 @@ module Wayfarer
|
|
13
13
|
end
|
14
14
|
|
15
15
|
class_methods do
|
16
|
-
def before_fetch(
|
17
|
-
set_callback(:fetch, :before,
|
16
|
+
def before_fetch(...)
|
17
|
+
set_callback(:fetch, :before, ...)
|
18
18
|
end
|
19
19
|
|
20
|
-
def around_fetch(
|
21
|
-
set_callback(:fetch, :around,
|
20
|
+
def around_fetch(...)
|
21
|
+
set_callback(:fetch, :around, ...)
|
22
22
|
end
|
23
23
|
|
24
|
-
def after_fetch(
|
25
|
-
set_callback(:fetch, :after,
|
24
|
+
def after_fetch(...)
|
25
|
+
set_callback(:fetch, :after, ...)
|
26
26
|
end
|
27
27
|
|
28
|
-
def before_action(
|
29
|
-
set_callback(:action, :before,
|
28
|
+
def before_action(...)
|
29
|
+
set_callback(:action, :before, ...)
|
30
30
|
end
|
31
31
|
|
32
|
-
def around_action(
|
33
|
-
set_callback(:action, :around,
|
32
|
+
def around_action(...)
|
33
|
+
set_callback(:action, :around, ...)
|
34
34
|
end
|
35
35
|
|
36
|
-
def after_action(
|
37
|
-
set_callback(:action, :after,
|
36
|
+
def after_action(...)
|
37
|
+
set_callback(:action, :after, ...)
|
38
38
|
end
|
39
39
|
|
40
|
-
def after_batch(
|
41
|
-
set_callback(:batch, :after,
|
40
|
+
def after_batch(...)
|
41
|
+
set_callback(:batch, :after, ...)
|
42
42
|
end
|
43
43
|
end
|
44
44
|
end
|
@@ -2,128 +2,110 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
class CLI
|
5
|
-
|
6
|
-
|
5
|
+
# Turns a routing tree into a Hash and prints it.
|
6
|
+
# Used by the `route` CLI subcommand.
|
7
|
+
#
|
8
|
+
# @api private
|
9
|
+
class RoutePrinter
|
10
|
+
# @return [Hash<Symbol, Proc>]
|
11
|
+
class_attribute :serializers,
|
12
|
+
default: { yaml: ->(hash) { YAML.dump(hash.deep_stringify_keys) },
|
13
|
+
json: ->(hash) { JSON.pretty_generate(hash) },
|
14
|
+
ruby: ->(hash) { pp(hash) } },
|
15
|
+
instance_accessor: false,
|
16
|
+
instance_predicate: false
|
7
17
|
|
8
|
-
|
9
|
-
REGULAR_SEGMENT = "│ "
|
10
|
-
JUNCTION_SEGMENT = "├──"
|
11
|
-
CORNER_SEGMENT = "└──"
|
18
|
+
BATCH = "tmp"
|
12
19
|
|
13
|
-
|
14
|
-
|
20
|
+
# Prints a routing tree.
|
21
|
+
#
|
22
|
+
# @param route [Wayfarer::Routing::Route] route to print
|
23
|
+
# @param url [String] URL to match
|
24
|
+
# @param format [String, Symbol] `:json`, `:yaml` or `:ruby`
|
25
|
+
def self.print(route, url, format:)
|
26
|
+
new(route, url, serializers.fetch(format.to_sym)).print
|
15
27
|
end
|
16
28
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
def visit(route)
|
24
|
-
route.accept(path_finder) unless route.parent
|
25
|
-
puts format_route_output(route)
|
26
|
-
true
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
29
|
+
# @param route [Wayfarer::Routing::Route] route to print
|
30
|
+
# @param url [String] URL to match
|
31
|
+
# @param serializer [Proc<Hash=>String>] output serializer
|
32
|
+
def initialize(route, url, serializer)
|
33
|
+
@route = route
|
34
|
+
@serializer = serializer
|
30
35
|
|
31
|
-
|
32
|
-
|
33
|
-
end
|
36
|
+
@nodes = {}
|
37
|
+
@root_hash = nil
|
34
38
|
|
35
|
-
|
36
|
-
[
|
39
|
+
task = Wayfarer::Task.new(url, BATCH)
|
40
|
+
task[:uri] = Addressable::URI.parse(url)
|
41
|
+
@path_finder = Wayfarer::Routing::PathFinder.new(
|
42
|
+
task,
|
43
|
+
stop_when_found: false,
|
44
|
+
&method(:call)
|
45
|
+
)
|
37
46
|
end
|
38
47
|
|
39
|
-
|
40
|
-
|
41
|
-
|
48
|
+
# Processes the routing trees and prints the serialized output.
|
49
|
+
def print
|
50
|
+
route.accept(path_finder)
|
42
51
|
|
43
|
-
|
44
|
-
trailer?(route) ? CORNER_SEGMENT : JUNCTION_SEGMENT
|
45
|
-
end
|
52
|
+
hash = routing_result(path_finder).merge(root_hash)
|
46
53
|
|
47
|
-
|
48
|
-
attrs = [route_arg(route), routing_result(route), route_action(route), route_params(route)].compact
|
49
|
-
text = attrs.any? ? "#{matcher_name(route)}(#{attrs.join(', ')})" : matcher_name(route)
|
50
|
-
set_color(text, *route_colors(route))
|
54
|
+
puts serializer.call(hash)
|
51
55
|
end
|
52
56
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
57
|
+
# Callback method called by `path_finder` with the result of matching
|
58
|
+
# the route.
|
59
|
+
#
|
60
|
+
# @param route [Wayfarer::Routing::Route] the current route
|
61
|
+
# @param result [true, false] routing result
|
62
|
+
# @param path_finder [Wayfarer::Routing::PathFinder] the path finder
|
63
|
+
def call(route, result, path_finder)
|
64
|
+
node = (nodes[route] ||= attributes(route, result, path_finder))
|
65
|
+
parent = route.parent
|
63
66
|
|
64
|
-
|
65
|
-
return if route.is_a?(Wayfarer::Routing::RootRoute)
|
67
|
+
return @root_hash ||= node unless parent
|
66
68
|
|
67
|
-
|
69
|
+
nodes.dig(parent, route_type(parent), :children).append(node)
|
68
70
|
end
|
69
71
|
|
70
|
-
|
71
|
-
return unless route.is_a?(Wayfarer::Routing::RootRoute)
|
72
|
-
|
73
|
-
result = Wayfarer::Routing::PathFinder.result(route, url)
|
74
|
-
result.action.inspect if result.is_a?(Wayfarer::Routing::Result::Match)
|
75
|
-
end
|
72
|
+
private
|
76
73
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
when Wayfarer::Routing::Matchers::Host then matcher.host
|
83
|
-
when Wayfarer::Routing::Matchers::Path then matcher.path
|
84
|
-
when Wayfarer::Routing::Matchers::Query then matcher.fields
|
85
|
-
when Wayfarer::Routing::Matchers::Custom then route.action.to_s
|
86
|
-
when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
|
87
|
-
when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
|
88
|
-
end
|
89
|
-
matcher_opts.inspect
|
90
|
-
end
|
74
|
+
attr_reader :route,
|
75
|
+
:path_finder,
|
76
|
+
:serializer,
|
77
|
+
:nodes,
|
78
|
+
:root_hash
|
91
79
|
|
92
|
-
def
|
93
|
-
|
94
|
-
result = Wayfarer::Routing::PathFinder.result(route, url)
|
95
|
-
result.params if result.is_a?(Wayfarer::Routing::Result::Match)
|
96
|
-
else
|
97
|
-
route.matcher.params(url)
|
98
|
-
end
|
80
|
+
def routing_result(path_finder)
|
81
|
+
return { routed: false } unless path_finder.found?
|
99
82
|
|
100
|
-
|
83
|
+
action = path_finder.action
|
84
|
+
{ routed: true,
|
85
|
+
params: path_finder.params,
|
86
|
+
action: case action
|
87
|
+
when Array
|
88
|
+
{ handler: action.first.class.name, action: action.second }
|
89
|
+
else action
|
90
|
+
end }
|
101
91
|
end
|
102
92
|
|
103
|
-
def
|
104
|
-
|
105
|
-
|
106
|
-
|
93
|
+
def attributes(route, result, path_finder)
|
94
|
+
{ route_type(route) => route.to_h.merge!(
|
95
|
+
route_result(route, result, path_finder),
|
96
|
+
children: []
|
97
|
+
) }
|
107
98
|
end
|
108
99
|
|
109
|
-
def
|
110
|
-
|
111
|
-
end
|
100
|
+
def route_result(route, result, path_finder)
|
101
|
+
return {} if route.target?
|
112
102
|
|
113
|
-
|
114
|
-
|
115
|
-
%i[green bold]
|
116
|
-
elsif route.matcher.match(url)
|
117
|
-
%i[green]
|
118
|
-
else
|
119
|
-
%i[red]
|
120
|
-
end
|
103
|
+
{ match: result,
|
104
|
+
params: path_finder.params_stack.to_h }
|
121
105
|
end
|
122
106
|
|
123
|
-
def
|
124
|
-
|
125
|
-
|
126
|
-
super(string, *colors)
|
107
|
+
def route_type(route)
|
108
|
+
route.class.name.demodulize.underscore
|
127
109
|
end
|
128
110
|
end
|
129
111
|
end
|
data/lib/wayfarer/cli.rb
CHANGED
@@ -13,36 +13,16 @@ module Wayfarer
|
|
13
13
|
|
14
14
|
class_option :require, aliases: :r, type: :string, default: nil
|
15
15
|
|
16
|
-
desc "route JOB URL", "Routing
|
16
|
+
desc "route JOB URL", "Routing tree for URL for JOB"
|
17
|
+
option :format, type: :string, enum: %w[yaml json ruby], default: "yaml"
|
17
18
|
def route(job, url)
|
18
19
|
load_environment
|
19
20
|
|
20
21
|
url = parsed_url(url)
|
21
|
-
job = job.
|
22
|
+
job = job.camelize.constantize
|
22
23
|
route = job.route
|
23
|
-
route.invoke(url)
|
24
24
|
|
25
|
-
|
26
|
-
result_type = result.class.name.demodulize
|
27
|
-
|
28
|
-
say case result
|
29
|
-
when Wayfarer::Routing::Result::Match
|
30
|
-
"#{result_type} => #{result.action.inspect}"
|
31
|
-
else
|
32
|
-
result_type
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
desc "tree JOB URL", "Visualize JOB's routing tree for URL"
|
37
|
-
def tree(job, url)
|
38
|
-
load_environment
|
39
|
-
|
40
|
-
url = parsed_url(url)
|
41
|
-
job = job.classify.constantize
|
42
|
-
route = job.route
|
43
|
-
route.invoke(url)
|
44
|
-
|
45
|
-
Wayfarer::CLI::RoutePrinter.print(route, url)
|
25
|
+
Wayfarer::CLI::RoutePrinter.print(route, url, format: options.fetch("format"))
|
46
26
|
end
|
47
27
|
|
48
28
|
desc "perform JOB URL", "Perform JOB with URL"
|
@@ -52,8 +32,8 @@ module Wayfarer
|
|
52
32
|
load_environment
|
53
33
|
mock_redis
|
54
34
|
|
55
|
-
job = job.
|
56
|
-
task = Wayfarer::Task.new(url, options
|
35
|
+
job = job.camelize.constantize
|
36
|
+
task = Wayfarer::Task.new(url, options.fetch(:batch))
|
57
37
|
job.new(task).perform_now
|
58
38
|
end
|
59
39
|
|
@@ -62,7 +42,7 @@ module Wayfarer
|
|
62
42
|
def enqueue(job, url)
|
63
43
|
load_environment
|
64
44
|
|
65
|
-
job = job.
|
45
|
+
job = job.camelize.constantize
|
66
46
|
job.crawl(url, batch: options[:batch])
|
67
47
|
end
|
68
48
|
|
@@ -71,21 +51,23 @@ module Wayfarer
|
|
71
51
|
option :batch, type: :string, default: SecureRandom.uuid
|
72
52
|
option :min_threads, type: :numeric, default: 1
|
73
53
|
option :max_threads, type: :numeric, default: 1
|
54
|
+
option :retain_pool, type: :boolean, default: false
|
74
55
|
def execute(job, url)
|
75
56
|
load_environment
|
76
57
|
mock_redis
|
77
58
|
|
78
|
-
job = job.
|
59
|
+
job = job.camelize.constantize
|
79
60
|
job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
|
80
61
|
max_threads: options[:max_threads])
|
81
62
|
scheduler = job.queue_adapter.instance_variable_get(:@scheduler)
|
82
63
|
executor = scheduler.instance_variable_get(:@async_executor)
|
83
64
|
|
84
|
-
job.crawl(url, batch: options
|
65
|
+
job.crawl(url, batch: options.fetch(:batch))
|
85
66
|
|
86
67
|
sleep(0.1) while executor.scheduled_task_count > executor.completed_task_count
|
87
68
|
|
88
|
-
|
69
|
+
# Used in test suite to avoid pool recreation
|
70
|
+
Wayfarer::Networking::Pool.instance.free unless options.fetch(:retain_pool)
|
89
71
|
end
|
90
72
|
|
91
73
|
private
|
data/lib/wayfarer/gc.rb
CHANGED
@@ -6,12 +6,17 @@ module Wayfarer
|
|
6
6
|
RESETTABLES = [Wayfarer::Redis::Barrier, Wayfarer::Redis::Counter].freeze
|
7
7
|
|
8
8
|
class << self
|
9
|
-
include Wayfarer::Logging.emit(
|
9
|
+
include Wayfarer::Logging.emit(
|
10
|
+
after_batch: [:debug, "Running `after_batch` callback"],
|
11
|
+
gc: [:debug, "Garbage collecting %<resettable>s"]
|
12
|
+
)
|
10
13
|
end
|
11
14
|
|
12
15
|
module_function
|
13
16
|
|
14
17
|
def run(task)
|
18
|
+
task[:job].run_callbacks(:batch)
|
19
|
+
|
15
20
|
RESETTABLES.each do |resettable|
|
16
21
|
log(:gc, task, resettable: resettable)
|
17
22
|
resettable.new(task).reset!
|
data/lib/wayfarer/kv.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
# Provides a key-value store via `[]` and `[]=`.
|
5
|
+
#
|
6
|
+
# @api private
|
7
|
+
module KV
|
8
|
+
# @param key [Object] key to fetch
|
9
|
+
# @return [Object, nil] value associated with the key or `nil`
|
10
|
+
def [](key)
|
11
|
+
kv[key]
|
12
|
+
end
|
13
|
+
|
14
|
+
# @param key [Object] key to set
|
15
|
+
# @param value [Object] value to set
|
16
|
+
# @return [Object] value that was set
|
17
|
+
def []=(key, value)
|
18
|
+
kv[key] = value
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# @return [Hash<Object, Object>]
|
24
|
+
def kv
|
25
|
+
@kv ||= {}
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -2,9 +2,15 @@
|
|
2
2
|
|
3
3
|
module Wayfarer
|
4
4
|
module Middleware
|
5
|
-
Chain
|
5
|
+
class Chain
|
6
6
|
extend Forwardable
|
7
7
|
|
8
|
+
attr_reader :middlewares
|
9
|
+
|
10
|
+
def initialize(middlewares)
|
11
|
+
@middlewares = middlewares
|
12
|
+
end
|
13
|
+
|
8
14
|
def self.empty
|
9
15
|
new([])
|
10
16
|
end
|