wayfarer 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yaml +1 -1
- data/Gemfile.lock +20 -15
- data/docs/cookbook/user_agent.md +1 -1
- data/docs/guides/browser_automation/capybara.md +64 -1
- data/docs/guides/browser_automation/custom_adapters.md +100 -0
- data/docs/guides/browser_automation/ferrum.md +3 -3
- data/docs/guides/browser_automation/selenium.md +7 -5
- data/docs/guides/callbacks.md +117 -10
- data/docs/guides/configuration.md +16 -10
- data/docs/guides/error_handling.md +9 -5
- data/docs/guides/networking.md +77 -3
- data/docs/index.md +9 -1
- data/docs/reference/api/base.md +4 -4
- data/docs/reference/configuration_keys.md +42 -0
- data/docs/reference/environment_variables.md +25 -27
- data/lib/wayfarer/base.rb +7 -17
- data/lib/wayfarer/callbacks.rb +71 -0
- data/lib/wayfarer/cli/base.rb +5 -1
- data/lib/wayfarer/cli/job.rb +7 -3
- data/lib/wayfarer/cli/route.rb +2 -2
- data/lib/wayfarer/cli/route_printer.rb +7 -7
- data/lib/wayfarer/config/capybara.rb +10 -0
- data/lib/wayfarer/config/ferrum.rb +11 -0
- data/lib/wayfarer/config/networking.rb +26 -0
- data/lib/wayfarer/config/redis.rb +14 -0
- data/lib/wayfarer/config/root.rb +11 -0
- data/lib/wayfarer/config/selenium.rb +21 -0
- data/lib/wayfarer/config/strconv.rb +45 -0
- data/lib/wayfarer/config/struct.rb +72 -0
- data/lib/wayfarer/gc.rb +3 -7
- data/lib/wayfarer/middleware/fetch.rb +7 -3
- data/lib/wayfarer/middleware/router.rb +2 -2
- data/lib/wayfarer/middleware/worker.rb +12 -9
- data/lib/wayfarer/networking/capybara.rb +28 -0
- data/lib/wayfarer/networking/context.rb +36 -0
- data/lib/wayfarer/networking/ferrum.rb +17 -52
- data/lib/wayfarer/networking/http.rb +34 -0
- data/lib/wayfarer/networking/pool.rb +15 -10
- data/lib/wayfarer/networking/result.rb +1 -1
- data/lib/wayfarer/networking/selenium.rb +20 -47
- data/lib/wayfarer/networking/strategy.rb +38 -0
- data/lib/wayfarer/page.rb +2 -3
- data/lib/wayfarer/redis/pool.rb +3 -1
- data/lib/wayfarer/routing/dsl.rb +8 -8
- data/lib/wayfarer/routing/matchers/custom.rb +23 -0
- data/lib/wayfarer/routing/matchers/host.rb +19 -0
- data/lib/wayfarer/routing/matchers/path.rb +48 -0
- data/lib/wayfarer/routing/matchers/query.rb +63 -0
- data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
- data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
- data/lib/wayfarer/routing/matchers/url.rb +17 -0
- data/lib/wayfarer/routing/route.rb +1 -1
- data/lib/wayfarer.rb +9 -9
- data/spec/base_spec.rb +14 -0
- data/spec/callbacks_spec.rb +102 -0
- data/spec/cli/job_spec.rb +6 -6
- data/spec/config/capybara_spec.rb +18 -0
- data/spec/config/ferrum_spec.rb +24 -0
- data/spec/config/networking_spec.rb +73 -0
- data/spec/config/redis_spec.rb +32 -0
- data/spec/config/root_spec.rb +31 -0
- data/spec/config/selenium_spec.rb +56 -0
- data/spec/config/strconv_spec.rb +58 -0
- data/spec/config/struct_spec.rb +66 -0
- data/spec/gc_spec.rb +8 -6
- data/spec/middleware/fetch_spec.rb +20 -8
- data/spec/middleware/router_spec.rb +7 -0
- data/spec/middleware/worker_spec.rb +64 -27
- data/spec/networking/capybara_spec.rb +12 -0
- data/spec/networking/context_spec.rb +127 -0
- data/spec/networking/ferrum_spec.rb +6 -22
- data/spec/networking/http_spec.rb +12 -0
- data/spec/networking/pool_spec.rb +37 -12
- data/spec/networking/selenium_spec.rb +6 -22
- data/spec/networking/strategy.rb +170 -0
- data/spec/redis/pool_spec.rb +1 -1
- data/spec/routing/dsl_spec.rb +10 -10
- data/spec/routing/integration_spec.rb +22 -22
- data/spec/routing/{custom_matcher_spec.rb → matchers/custom_spec.rb} +4 -4
- data/spec/routing/{host_matcher_spec.rb → matchers/host_spec.rb} +6 -6
- data/spec/routing/{path_matcher_spec.rb → matchers/path_spec.rb} +6 -6
- data/spec/routing/{query_matcher_spec.rb → matchers/query_spec.rb} +15 -15
- data/spec/routing/{scheme_matcher_spec.rb → matchers/scheme_spec.rb} +4 -4
- data/spec/routing/{suffix_matcher_spec.rb → matchers/suffix_spec.rb} +4 -4
- data/spec/routing/{uri_matcher_spec.rb → matchers/uri_spec.rb} +4 -4
- data/spec/routing/path_finder_spec.rb +1 -1
- data/spec/routing/root_route_spec.rb +2 -2
- data/spec/routing/route_spec.rb +2 -2
- data/spec/spec_helpers.rb +13 -5
- data/spec/wayfarer_spec.rb +1 -1
- data/wayfarer.gemspec +8 -7
- metadata +74 -33
- data/lib/wayfarer/config.rb +0 -67
- data/lib/wayfarer/networking/healer.rb +0 -21
- data/lib/wayfarer/networking/net_http.rb +0 -52
- data/lib/wayfarer/routing/custom_matcher.rb +0 -21
- data/lib/wayfarer/routing/host_matcher.rb +0 -23
- data/lib/wayfarer/routing/path_matcher.rb +0 -46
- data/lib/wayfarer/routing/query_matcher.rb +0 -67
- data/lib/wayfarer/routing/scheme_matcher.rb +0 -21
- data/lib/wayfarer/routing/suffix_matcher.rb +0 -21
- data/lib/wayfarer/routing/url_matcher.rb +0 -21
- data/spec/config_spec.rb +0 -144
- data/spec/networking/adapter.rb +0 -135
- data/spec/networking/healer_spec.rb +0 -46
- data/spec/networking/net_http_spec.rb +0 -37
data/docs/reference/api/base.md
CHANGED
@@ -102,7 +102,7 @@ Base functionality every job is equipped with:
|
|
102
102
|
|
103
103
|
### `#browser -> Ferrum::Browser | Selenium::WebDriver | nil`
|
104
104
|
: The browser process used to retrieve the current response.
|
105
|
-
If the configured
|
105
|
+
If the configured agent is the default `:http`, `nil` is returned.
|
106
106
|
|
107
107
|
Guides:
|
108
108
|
|
@@ -112,7 +112,7 @@ Base functionality every job is equipped with:
|
|
112
112
|
!!! example "Accessing a Google Chrome process"
|
113
113
|
|
114
114
|
```ruby
|
115
|
-
Wayfarer.config.
|
115
|
+
Wayfarer.config.network.agent = :ferrum
|
116
116
|
|
117
117
|
class DummyJob < Wayfarer::Base
|
118
118
|
route.to :index
|
@@ -126,7 +126,7 @@ Base functionality every job is equipped with:
|
|
126
126
|
!!! example "Accessing a Selenium WebDriver"
|
127
127
|
|
128
128
|
```ruby
|
129
|
-
Wayfarer.config.
|
129
|
+
Wayfarer.config.network.agent = :selenium
|
130
130
|
|
131
131
|
class DummyJob < Wayfarer::Base
|
132
132
|
route.to :index
|
@@ -144,7 +144,7 @@ Base functionality every job is equipped with:
|
|
144
144
|
processing URL.
|
145
145
|
|
146
146
|
With `page(live: true)` passed, the returned `Page` reflects the current
|
147
|
-
browser DOM. No-op when the `net/http`
|
147
|
+
browser DOM. No-op when the `net/http` agent is in use. Calls to
|
148
148
|
`page()` without the keyword return the most recent page.
|
149
149
|
|
150
150
|
---
|
@@ -0,0 +1,42 @@
|
|
1
|
+
---
|
2
|
+
hide:
|
3
|
+
- toc
|
4
|
+
---
|
5
|
+
|
6
|
+
# Configuration Keys
|
7
|
+
|
8
|
+
## `Wayfarer.config.network`
|
9
|
+
|
10
|
+
| Runtime config key | Environment variable | Description | Default | Supported values |
|
11
|
+
| ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
|
12
|
+
| `network.agent` | `WAYFARER_NETWORK_AGENT` | The user agent to use. | `:http` | `:http`, `:ferrum`, `:selenium` |
|
13
|
+
| `network.pool_size` | `WAYFARER_NETWORK_POOL_SIZE` | How many user agents to spawn. | 3 | Integers |
|
14
|
+
| `network.pool_timeout` | `WAYFARER_NETWORK_POOL_TIMEOUT` | How long jobs may use an agent in seconds. | 10 | Integers |
|
15
|
+
| `network.http_headers` | `WAYFARER_NETWORK_HTTP_HEADERS` | HTTP headers to append to requests. | `{}` | Hashes |
|
16
|
+
|
17
|
+
## `Wayfarer.config.ferrum`
|
18
|
+
|
19
|
+
| Runtime config key | Environment variable | Description | Default | Supported values |
|
20
|
+
| ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
|
21
|
+
| `ferrum.options` | `WAYFARER_FERRUM_OPTIONS` | Ferrum options. | `{}` | Hashes |
|
22
|
+
|
23
|
+
## `Wayfarer.config.selenium`
|
24
|
+
|
25
|
+
| Runtime config key | Environment variable | Description | Default | Supported values |
|
26
|
+
| ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
|
27
|
+
| `selenium.driver` | `WAYFARER_SELENIUM_DRIVER` | Selenium driver to use. | `:chrome` | Symbols |
|
28
|
+
| `selenium.options` | `WAYFARER_SELENIUM_OPTIONS` | Selenium options. | `{}` | Hashes |
|
29
|
+
| `selenium.client_timeout` | `WAYFARER_SELENIUM_CLIENT_TIMEOUT` | Selenium client timeout in seconds. | 60 | Integers |
|
30
|
+
|
31
|
+
## `Wayfarer.config.redis`
|
32
|
+
|
33
|
+
| Runtime config key | Environment variable | Description | Default | Supported values |
|
34
|
+
| ---------------------- | ------------------------------------ | ------------------------------------------- | ------------------------------------------ | ----------------------------------- |
|
35
|
+
| `redis.url` | `WAYFARER_REDIS_URL` | Redis URL to connect to. | `redis://localhost:6379` | Strings |
|
36
|
+
| `redis.factory` | n/a | Redis factory lambda. | `->(redis) { ::Redis.new(url: redis.url) }` | Lambdas |
|
37
|
+
|
38
|
+
## `Wayfarer.config.capybara`
|
39
|
+
|
40
|
+
| Runtime config key | Environment variable | Description | Default | Supported values |
|
41
|
+
| ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
|
42
|
+
| `capybara.driver` | `WAYFARER_CAPYBARA_DRIVER` | The Capybara driver to use. | n/a | Symbols |
|
@@ -10,26 +10,26 @@ with the following syntaxes:
|
|
10
10
|
|
11
11
|
## Variables
|
12
12
|
|
13
|
-
### `
|
14
|
-
: Either `ferrum`, `selenium` or `
|
13
|
+
### `WAYFARER_NETWORK_AGENT`
|
14
|
+
: Either `ferrum`, `selenium` or `http`.
|
15
15
|
|
16
16
|
* Type: String
|
17
|
-
* Key: `config.
|
18
|
-
* Default value: `:
|
17
|
+
* Key: `config.network.agent`
|
18
|
+
* Default value: `:http`
|
19
19
|
|
20
20
|
### `WAYFARER_POOL_SIZE`
|
21
|
-
: Number of
|
21
|
+
: Number of user agents to maintain.
|
22
22
|
|
23
23
|
* Type: Integer
|
24
24
|
* Key: `config.pool_size`
|
25
25
|
* Default value: `1`
|
26
26
|
|
27
27
|
### `WAYFARER_POOL_TIMEOUT`
|
28
|
-
: How long a
|
28
|
+
: How long a user agent may remain checked out until the owning job
|
29
29
|
fails.
|
30
30
|
|
31
31
|
* Type: Integer
|
32
|
-
* Key: `config.
|
32
|
+
* Key: `config.agent_pool_timeout`
|
33
33
|
* Default value: `1`
|
34
34
|
|
35
35
|
---
|
@@ -43,32 +43,30 @@ with the following syntaxes:
|
|
43
43
|
|
44
44
|
---
|
45
45
|
|
46
|
-
### `
|
47
|
-
:
|
46
|
+
### `WAYFARER_SELENIUM_DRIVER`
|
47
|
+
: Driver passed to `Selenium::WebDriver.for`.
|
48
48
|
|
49
|
-
* Type:
|
50
|
-
* Key: `config.
|
51
|
-
* Default value: `
|
49
|
+
* Type: Symbol
|
50
|
+
* Key: `config.selenium.driver`
|
51
|
+
* Default value: `:chrome`
|
52
52
|
|
53
|
-
|
54
|
-
|
55
|
-
!!! example "Foobar"
|
56
|
-
|
57
|
-
For example, to run Google Chrome in foreground with Ferrum:
|
58
|
-
|
59
|
-
```
|
60
|
-
Wayfarer.config.adapter = :ferrum
|
61
|
-
Wayfarer.ferrum_options = { headless: false, url: "http://chrome:3000" }
|
62
|
-
```
|
53
|
+
---
|
63
54
|
|
55
|
+
### `WAYFARER_SELENIUM_OPTIONS`
|
56
|
+
: Options passed to `Selenium::WebDriver.for`.
|
64
57
|
|
65
|
-
|
58
|
+
* Type: Hash
|
59
|
+
* Key: `config.selenium.options`
|
60
|
+
* Default value: `{}`
|
61
|
+
|
62
|
+
---
|
66
63
|
|
67
|
-
|
64
|
+
### `WAYFARER_SELENIUM_CLIENT_TIMEOUT`
|
65
|
+
: Selenium HTTP client timeout (seconds).
|
68
66
|
|
69
|
-
|
70
|
-
|
71
|
-
|
67
|
+
* Type: Integer
|
68
|
+
* Key: `config.selenium.client_timeout`
|
69
|
+
* Default value: `60`
|
72
70
|
|
73
71
|
---
|
74
72
|
|
data/lib/wayfarer/base.rb
CHANGED
@@ -5,22 +5,7 @@ module Wayfarer
|
|
5
5
|
include Wayfarer::Middleware::Worker
|
6
6
|
extend Forwardable
|
7
7
|
|
8
|
-
|
9
|
-
def after_batch_callbacks
|
10
|
-
@after_batch_callbacks ||= []
|
11
|
-
end
|
12
|
-
|
13
|
-
def after_batch(&block)
|
14
|
-
after_batch_callbacks.push(block)
|
15
|
-
end
|
16
|
-
|
17
|
-
def run_after_batch_callbacks
|
18
|
-
after_batch_callbacks.each(&:call)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
after_enqueue do |job|
|
23
|
-
task = job.arguments.first
|
8
|
+
after_enqueue do |_job|
|
24
9
|
task.counter.increment
|
25
10
|
end
|
26
11
|
|
@@ -30,6 +15,7 @@ module Wayfarer
|
|
30
15
|
|
31
16
|
def self.retry_on(*argv)
|
32
17
|
super(*argv) do |job, error|
|
18
|
+
job.task.barrier.seen?(job.task.url)
|
33
19
|
GC.new(job).run
|
34
20
|
yield job, error if block_given?
|
35
21
|
end
|
@@ -37,6 +23,7 @@ module Wayfarer
|
|
37
23
|
|
38
24
|
def self.discard_on(*argv)
|
39
25
|
super(*argv) do |job, error|
|
26
|
+
job.task.barrier.seen?(job.task.url)
|
40
27
|
GC.new(job).run
|
41
28
|
yield job, error if block_given?
|
42
29
|
end
|
@@ -48,7 +35,6 @@ module Wayfarer
|
|
48
35
|
|
49
36
|
def retry_job(...)
|
50
37
|
super(...) # increments the counter by re-enqueuing the job
|
51
|
-
task = arguments.first
|
52
38
|
task.counter.decrement
|
53
39
|
end
|
54
40
|
|
@@ -56,5 +42,9 @@ module Wayfarer
|
|
56
42
|
task.job = self
|
57
43
|
chain.call(task)
|
58
44
|
end
|
45
|
+
|
46
|
+
def task
|
47
|
+
arguments.first
|
48
|
+
end
|
59
49
|
end
|
60
50
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Callbacks
|
5
|
+
TERMINATOR = ->(_target, result) { result.call == false }
|
6
|
+
OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
|
7
|
+
|
8
|
+
ConditionalCallback = Struct.new(:job, :filters) do
|
9
|
+
def run(method, &block)
|
10
|
+
return if only && !applies?(only)
|
11
|
+
return if except && applies?(except)
|
12
|
+
|
13
|
+
return job.send(method) if method
|
14
|
+
|
15
|
+
job.instance_eval(&block)
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def applies?(condition)
|
21
|
+
case condition
|
22
|
+
when Symbol then condition == action
|
23
|
+
when Enumerable then condition&.include?(action)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def only
|
28
|
+
filters[:only]
|
29
|
+
end
|
30
|
+
|
31
|
+
def except
|
32
|
+
filters[:except]
|
33
|
+
end
|
34
|
+
|
35
|
+
def action
|
36
|
+
task.metadata.action
|
37
|
+
end
|
38
|
+
|
39
|
+
def task
|
40
|
+
job.task
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.included(base)
|
45
|
+
base.include(ActiveSupport::Callbacks)
|
46
|
+
base.extend(ClassMethods)
|
47
|
+
|
48
|
+
base.class_eval do
|
49
|
+
define_callbacks(:fetch, OPTIONS)
|
50
|
+
define_callbacks(:action, OPTIONS)
|
51
|
+
define_callbacks(:batch, OPTIONS)
|
52
|
+
|
53
|
+
define(:fetch, :before)
|
54
|
+
define(:action, :before)
|
55
|
+
define(:batch, :after)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
module ClassMethods
|
60
|
+
private
|
61
|
+
|
62
|
+
def define(name, stage)
|
63
|
+
define_singleton_method([stage, name].join("_")) do |method = nil, **filters, &block|
|
64
|
+
set_callback(name, stage, **filters) do |job|
|
65
|
+
ConditionalCallback.new(job, filters).run(method, &block)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
data/lib/wayfarer/cli/base.rb
CHANGED
@@ -12,12 +12,16 @@ module Wayfarer
|
|
12
12
|
private
|
13
13
|
|
14
14
|
def mock_redis
|
15
|
-
Wayfarer.config.
|
15
|
+
Wayfarer.config.redis.factory = ->(_) { MockRedis.new }
|
16
16
|
end
|
17
17
|
|
18
18
|
def load_environment
|
19
19
|
Wayfarer::CLI::Runner.loader.setup
|
20
20
|
end
|
21
|
+
|
22
|
+
def free_agent_pool
|
23
|
+
Wayfarer::Networking::Pool.instance.free
|
24
|
+
end
|
21
25
|
end
|
22
26
|
end
|
23
27
|
end
|
data/lib/wayfarer/cli/job.rb
CHANGED
@@ -11,12 +11,14 @@ module Wayfarer
|
|
11
11
|
load_environment
|
12
12
|
mock_redis if options[:mock_redis]
|
13
13
|
|
14
|
-
url = URI(url)
|
14
|
+
url = Addressable::URI.parse(url)
|
15
15
|
job = job.classify.constantize.new
|
16
16
|
task = Wayfarer::Task.new(url, "tmp")
|
17
17
|
job.arguments.push(task)
|
18
18
|
job.perform(task)
|
19
19
|
GC.new(job).run
|
20
|
+
|
21
|
+
free_agent_pool
|
20
22
|
end
|
21
23
|
|
22
24
|
desc "enqueue JOB URL",
|
@@ -26,7 +28,7 @@ module Wayfarer
|
|
26
28
|
load_environment
|
27
29
|
mock_redis if options[:mock_redis]
|
28
30
|
|
29
|
-
url = URI(url)
|
31
|
+
url = Addressable::URI.parse(url)
|
30
32
|
job = job.classify.constantize
|
31
33
|
job.crawl_later(url, batch: options[:batch])
|
32
34
|
end
|
@@ -41,7 +43,7 @@ module Wayfarer
|
|
41
43
|
load_environment
|
42
44
|
mock_redis if options[:mock_redis]
|
43
45
|
|
44
|
-
url = URI(url)
|
46
|
+
url = Addressable::URI.parse(url)
|
45
47
|
job = job.classify.constantize
|
46
48
|
|
47
49
|
job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
|
@@ -52,6 +54,8 @@ module Wayfarer
|
|
52
54
|
job.crawl_later(url, batch: options[:batch])
|
53
55
|
|
54
56
|
sleep(1) while executor.scheduled_task_count > executor.completed_task_count
|
57
|
+
|
58
|
+
free_agent_pool
|
55
59
|
end
|
56
60
|
end
|
57
61
|
end
|
data/lib/wayfarer/cli/route.rb
CHANGED
@@ -9,7 +9,7 @@ module Wayfarer
|
|
9
9
|
"Invoke JOB's router with URL"
|
10
10
|
def result(job, url)
|
11
11
|
load_environment
|
12
|
-
url = URI(url)
|
12
|
+
url = Addressable::URI.parse(url)
|
13
13
|
job = job.classify.constantize
|
14
14
|
puts Wayfarer::Routing::PathFinder.result(job.route, url)
|
15
15
|
end
|
@@ -18,7 +18,7 @@ module Wayfarer
|
|
18
18
|
"Visualize JOB's routing tree for URL"
|
19
19
|
def tree(job, url)
|
20
20
|
load_environment
|
21
|
-
url = URI(url)
|
21
|
+
url = Addressable::URI.parse(url)
|
22
22
|
job = job.classify.constantize
|
23
23
|
Wayfarer::CLI::RoutePrinter.print(job.route, url)
|
24
24
|
end
|
@@ -77,19 +77,19 @@ module Wayfarer
|
|
77
77
|
def matcher_label(route)
|
78
78
|
return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
|
79
79
|
|
80
|
-
route.matcher.class.name.demodulize
|
80
|
+
route.matcher.class.name.demodulize
|
81
81
|
end
|
82
82
|
|
83
83
|
def options(route)
|
84
84
|
return "" if route.is_a?(Wayfarer::Routing::RootRoute)
|
85
85
|
|
86
86
|
case (matcher = route.matcher)
|
87
|
-
when Wayfarer::Routing::
|
88
|
-
when Wayfarer::Routing::
|
89
|
-
when Wayfarer::Routing::
|
90
|
-
when Wayfarer::Routing::
|
91
|
-
when Wayfarer::Routing::
|
92
|
-
when Wayfarer::Routing::
|
87
|
+
when Wayfarer::Routing::Matchers::Host then matcher.host
|
88
|
+
when Wayfarer::Routing::Matchers::Path then matcher.path
|
89
|
+
when Wayfarer::Routing::Matchers::Query then matcher.fields
|
90
|
+
when Wayfarer::Routing::Matchers::Custom then "##{route.action}"
|
91
|
+
when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
|
92
|
+
when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
Networking = Struct.new(agent: {
|
6
|
+
env_key: "WAYFARER_NETWORK_AGENT",
|
7
|
+
type: Symbol,
|
8
|
+
default: :http
|
9
|
+
},
|
10
|
+
pool_size: {
|
11
|
+
env_key: "WAYFARER_NETWORK_POOL_SIZE",
|
12
|
+
type: Integer,
|
13
|
+
default: 3
|
14
|
+
},
|
15
|
+
pool_timeout: {
|
16
|
+
env_key: "WAYFARER_NETWORK_POOL_TIMEOUT",
|
17
|
+
type: Integer,
|
18
|
+
default: 10
|
19
|
+
},
|
20
|
+
http_headers: {
|
21
|
+
env_key: "WAYFARER_NETWORK_HTTP_HEADERS",
|
22
|
+
type: Hash,
|
23
|
+
default: {}
|
24
|
+
})
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
Redis = Struct.new(url: {
|
6
|
+
env_key: "WAYFARER_REDIS_URL",
|
7
|
+
type: String,
|
8
|
+
default: "redis://localhost:6379"
|
9
|
+
},
|
10
|
+
factory: {
|
11
|
+
default: ->(redis) { ::Redis.new(url: redis.url) }
|
12
|
+
})
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
Root = Struct.new(ferrum: { default: Wayfarer::Config::Ferrum.new },
|
6
|
+
network: { default: Wayfarer::Config::Networking.new },
|
7
|
+
redis: { default: Wayfarer::Config::Redis.new },
|
8
|
+
selenium: { default: Wayfarer::Config::Selenium.new },
|
9
|
+
capybara: { default: Wayfarer::Config::Capybara.new })
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
Selenium = Struct.new(driver: {
|
6
|
+
env_key: "WAYFARER_SELENIUM_DRIVER",
|
7
|
+
type: Symbol,
|
8
|
+
default: :chrome
|
9
|
+
},
|
10
|
+
options: {
|
11
|
+
env_key: "WAYFARER_SELENIUM_OPTIONS",
|
12
|
+
type: Hash,
|
13
|
+
default: {}
|
14
|
+
},
|
15
|
+
client_timeout: {
|
16
|
+
env_key: "WAYFARER_SELENIUM_CLIENT_TIMEOUT",
|
17
|
+
type: Integer,
|
18
|
+
default: 60 # seconds
|
19
|
+
})
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
module Strconv
|
6
|
+
module_function
|
7
|
+
|
8
|
+
def parse(str, type = nil)
|
9
|
+
return primitive(str) unless type
|
10
|
+
|
11
|
+
case type.name
|
12
|
+
when "Hash" then hash(str)
|
13
|
+
when "Array" then array(str)
|
14
|
+
when "Symbol" then str.to_sym
|
15
|
+
when "Integer" then Integer(str)
|
16
|
+
else str
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def hash(str)
|
21
|
+
array(str).reduce({}) do |acc, pair|
|
22
|
+
k, v = pair.split(":", 2)
|
23
|
+
next acc unless k && v
|
24
|
+
|
25
|
+
acc.merge({ parse(k, Symbol) => primitive(v) })
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def array(str)
|
30
|
+
str.split(",").map(&:strip)
|
31
|
+
end
|
32
|
+
|
33
|
+
def primitive(str)
|
34
|
+
return true if str == "true"
|
35
|
+
return false if str == "false"
|
36
|
+
|
37
|
+
begin
|
38
|
+
parse(str, Integer)
|
39
|
+
rescue StandardError
|
40
|
+
str
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wayfarer
|
4
|
+
module Config
|
5
|
+
class Struct
|
6
|
+
module ClassMethods
|
7
|
+
attr_accessor :members
|
8
|
+
end
|
9
|
+
|
10
|
+
module InstanceMethods
|
11
|
+
extend Forwardable
|
12
|
+
|
13
|
+
delegate members: "self.class"
|
14
|
+
|
15
|
+
attr_reader :env
|
16
|
+
|
17
|
+
def initialize(env = ENV)
|
18
|
+
@env = env
|
19
|
+
|
20
|
+
define_writers
|
21
|
+
define_readers
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
def define_writers
|
27
|
+
members.each { |key, _| define_writer(key) }
|
28
|
+
end
|
29
|
+
|
30
|
+
def define_writer(key)
|
31
|
+
define_singleton_method(:"#{key}=") do |val|
|
32
|
+
set(key, val)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def define_readers
|
37
|
+
members.each { |key, options| define_reader(key, **options) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def define_reader(key, env_key: nil, type: nil, default: nil)
|
41
|
+
define_singleton_method(key.to_sym) do
|
42
|
+
get(key) || set(key, get(key) || env_val(env_key, type) || default)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def env_val(env_key, type)
|
47
|
+
return nil unless env_key
|
48
|
+
return nil unless env.key?(env_key)
|
49
|
+
|
50
|
+
Strconv.parse(env[env_key], type)
|
51
|
+
end
|
52
|
+
|
53
|
+
def get(key)
|
54
|
+
instance_variable_get(:"@#{key}")
|
55
|
+
end
|
56
|
+
|
57
|
+
def set(key, val)
|
58
|
+
instance_variable_set(:"@#{key}", val)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.new(members)
|
63
|
+
Class.new do
|
64
|
+
include InstanceMethods
|
65
|
+
extend ClassMethods
|
66
|
+
|
67
|
+
self.members = members
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
data/lib/wayfarer/gc.rb
CHANGED
@@ -3,17 +3,13 @@
|
|
3
3
|
module Wayfarer
|
4
4
|
GC = Struct.new(:job) do
|
5
5
|
def run
|
6
|
+
task = job.task
|
7
|
+
|
6
8
|
return unless task.counter.decrement <= 0
|
7
9
|
|
8
10
|
task.barrier.reset!
|
9
11
|
task.counter.reset!
|
10
|
-
job.
|
11
|
-
end
|
12
|
-
|
13
|
-
private
|
14
|
-
|
15
|
-
def task
|
16
|
-
job.arguments.first
|
12
|
+
job.run_callbacks(:batch)
|
17
13
|
end
|
18
14
|
end
|
19
15
|
end
|
@@ -15,10 +15,14 @@ module Wayfarer
|
|
15
15
|
def call(task)
|
16
16
|
self.task = task
|
17
17
|
|
18
|
-
pool.with do |
|
19
|
-
task.metadata.
|
18
|
+
pool.with do |agent|
|
19
|
+
task.metadata.agent = agent
|
20
20
|
|
21
|
-
|
21
|
+
result = task.job.run_callbacks :fetch do
|
22
|
+
agent.fetch(task.url)
|
23
|
+
end
|
24
|
+
|
25
|
+
case result
|
22
26
|
when Networking::Result::Redirect
|
23
27
|
stage(result.redirect_url)
|
24
28
|
when Networking::Result::Success
|
@@ -6,12 +6,12 @@ module Wayfarer
|
|
6
6
|
def call(task)
|
7
7
|
route = task.job.class.route
|
8
8
|
|
9
|
-
case result = route.invoke(URI(task.url))
|
9
|
+
case result = route.invoke(Addressable::URI.parse(task.url))
|
10
10
|
when Routing::Result::Mismatch
|
11
11
|
return
|
12
12
|
when Routing::Result::Match
|
13
13
|
task.metadata.action = result.action
|
14
|
-
task.metadata.params = result.params
|
14
|
+
task.metadata.params = ActiveSupport::HashWithIndifferentAccess.new(result.params)
|
15
15
|
end
|
16
16
|
|
17
17
|
yield if block_given?
|