wayfarer 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci.yaml +1 -1
  3. data/Gemfile.lock +20 -15
  4. data/docs/cookbook/user_agent.md +1 -1
  5. data/docs/guides/browser_automation/capybara.md +64 -1
  6. data/docs/guides/browser_automation/custom_adapters.md +100 -0
  7. data/docs/guides/browser_automation/ferrum.md +3 -3
  8. data/docs/guides/browser_automation/selenium.md +7 -5
  9. data/docs/guides/callbacks.md +117 -10
  10. data/docs/guides/configuration.md +16 -10
  11. data/docs/guides/error_handling.md +9 -5
  12. data/docs/guides/networking.md +77 -3
  13. data/docs/index.md +9 -1
  14. data/docs/reference/api/base.md +4 -4
  15. data/docs/reference/configuration_keys.md +42 -0
  16. data/docs/reference/environment_variables.md +25 -27
  17. data/lib/wayfarer/base.rb +7 -17
  18. data/lib/wayfarer/callbacks.rb +71 -0
  19. data/lib/wayfarer/cli/base.rb +5 -1
  20. data/lib/wayfarer/cli/job.rb +7 -3
  21. data/lib/wayfarer/cli/route.rb +2 -2
  22. data/lib/wayfarer/cli/route_printer.rb +7 -7
  23. data/lib/wayfarer/config/capybara.rb +10 -0
  24. data/lib/wayfarer/config/ferrum.rb +11 -0
  25. data/lib/wayfarer/config/networking.rb +26 -0
  26. data/lib/wayfarer/config/redis.rb +14 -0
  27. data/lib/wayfarer/config/root.rb +11 -0
  28. data/lib/wayfarer/config/selenium.rb +21 -0
  29. data/lib/wayfarer/config/strconv.rb +45 -0
  30. data/lib/wayfarer/config/struct.rb +72 -0
  31. data/lib/wayfarer/gc.rb +3 -7
  32. data/lib/wayfarer/middleware/fetch.rb +7 -3
  33. data/lib/wayfarer/middleware/router.rb +2 -2
  34. data/lib/wayfarer/middleware/worker.rb +12 -9
  35. data/lib/wayfarer/networking/capybara.rb +28 -0
  36. data/lib/wayfarer/networking/context.rb +36 -0
  37. data/lib/wayfarer/networking/ferrum.rb +17 -52
  38. data/lib/wayfarer/networking/http.rb +34 -0
  39. data/lib/wayfarer/networking/pool.rb +15 -10
  40. data/lib/wayfarer/networking/result.rb +1 -1
  41. data/lib/wayfarer/networking/selenium.rb +20 -47
  42. data/lib/wayfarer/networking/strategy.rb +38 -0
  43. data/lib/wayfarer/page.rb +2 -3
  44. data/lib/wayfarer/redis/pool.rb +3 -1
  45. data/lib/wayfarer/routing/dsl.rb +8 -8
  46. data/lib/wayfarer/routing/matchers/custom.rb +23 -0
  47. data/lib/wayfarer/routing/matchers/host.rb +19 -0
  48. data/lib/wayfarer/routing/matchers/path.rb +48 -0
  49. data/lib/wayfarer/routing/matchers/query.rb +63 -0
  50. data/lib/wayfarer/routing/matchers/scheme.rb +17 -0
  51. data/lib/wayfarer/routing/matchers/suffix.rb +17 -0
  52. data/lib/wayfarer/routing/matchers/url.rb +17 -0
  53. data/lib/wayfarer/routing/route.rb +1 -1
  54. data/lib/wayfarer.rb +9 -9
  55. data/spec/base_spec.rb +14 -0
  56. data/spec/callbacks_spec.rb +102 -0
  57. data/spec/cli/job_spec.rb +6 -6
  58. data/spec/config/capybara_spec.rb +18 -0
  59. data/spec/config/ferrum_spec.rb +24 -0
  60. data/spec/config/networking_spec.rb +73 -0
  61. data/spec/config/redis_spec.rb +32 -0
  62. data/spec/config/root_spec.rb +31 -0
  63. data/spec/config/selenium_spec.rb +56 -0
  64. data/spec/config/strconv_spec.rb +58 -0
  65. data/spec/config/struct_spec.rb +66 -0
  66. data/spec/gc_spec.rb +8 -6
  67. data/spec/middleware/fetch_spec.rb +20 -8
  68. data/spec/middleware/router_spec.rb +7 -0
  69. data/spec/middleware/worker_spec.rb +64 -27
  70. data/spec/networking/capybara_spec.rb +12 -0
  71. data/spec/networking/context_spec.rb +127 -0
  72. data/spec/networking/ferrum_spec.rb +6 -22
  73. data/spec/networking/http_spec.rb +12 -0
  74. data/spec/networking/pool_spec.rb +37 -12
  75. data/spec/networking/selenium_spec.rb +6 -22
  76. data/spec/networking/strategy.rb +170 -0
  77. data/spec/redis/pool_spec.rb +1 -1
  78. data/spec/routing/dsl_spec.rb +10 -10
  79. data/spec/routing/integration_spec.rb +22 -22
  80. data/spec/routing/{custom_matcher_spec.rb → matchers/custom_spec.rb} +4 -4
  81. data/spec/routing/{host_matcher_spec.rb → matchers/host_spec.rb} +6 -6
  82. data/spec/routing/{path_matcher_spec.rb → matchers/path_spec.rb} +6 -6
  83. data/spec/routing/{query_matcher_spec.rb → matchers/query_spec.rb} +15 -15
  84. data/spec/routing/{scheme_matcher_spec.rb → matchers/scheme_spec.rb} +4 -4
  85. data/spec/routing/{suffix_matcher_spec.rb → matchers/suffix_spec.rb} +4 -4
  86. data/spec/routing/{uri_matcher_spec.rb → matchers/uri_spec.rb} +4 -4
  87. data/spec/routing/path_finder_spec.rb +1 -1
  88. data/spec/routing/root_route_spec.rb +2 -2
  89. data/spec/routing/route_spec.rb +2 -2
  90. data/spec/spec_helpers.rb +13 -5
  91. data/spec/wayfarer_spec.rb +1 -1
  92. data/wayfarer.gemspec +8 -7
  93. metadata +74 -33
  94. data/lib/wayfarer/config.rb +0 -67
  95. data/lib/wayfarer/networking/healer.rb +0 -21
  96. data/lib/wayfarer/networking/net_http.rb +0 -52
  97. data/lib/wayfarer/routing/custom_matcher.rb +0 -21
  98. data/lib/wayfarer/routing/host_matcher.rb +0 -23
  99. data/lib/wayfarer/routing/path_matcher.rb +0 -46
  100. data/lib/wayfarer/routing/query_matcher.rb +0 -67
  101. data/lib/wayfarer/routing/scheme_matcher.rb +0 -21
  102. data/lib/wayfarer/routing/suffix_matcher.rb +0 -21
  103. data/lib/wayfarer/routing/url_matcher.rb +0 -21
  104. data/spec/config_spec.rb +0 -144
  105. data/spec/networking/adapter.rb +0 -135
  106. data/spec/networking/healer_spec.rb +0 -46
  107. data/spec/networking/net_http_spec.rb +0 -37
@@ -102,7 +102,7 @@ Base functionality every job is equipped with:
102
102
 
103
103
  ### `#browser -> Ferrum::Browser | Selenium::WebDriver | nil`
104
104
  : The browser process used to retrieve the current response.
105
- If the configured adapter is the default `:net_http`, `nil` is returned.
105
+ If the configured agent is the default `:http`, `nil` is returned.
106
106
 
107
107
  Guides:
108
108
 
@@ -112,7 +112,7 @@ Base functionality every job is equipped with:
112
112
  !!! example "Accessing a Google Chrome process"
113
113
 
114
114
  ```ruby
115
- Wayfarer.config.adapter = :ferrum
115
+ Wayfarer.config.network.agent = :ferrum
116
116
 
117
117
  class DummyJob < Wayfarer::Base
118
118
  route.to :index
@@ -126,7 +126,7 @@ Base functionality every job is equipped with:
126
126
  !!! example "Accessing a Selenium WebDriver"
127
127
 
128
128
  ```ruby
129
- Wayfarer.config.adapter = :selenium
129
+ Wayfarer.config.network.agent = :selenium
130
130
 
131
131
  class DummyJob < Wayfarer::Base
132
132
  route.to :index
@@ -144,7 +144,7 @@ Base functionality every job is equipped with:
144
144
  processing URL.
145
145
 
146
146
  With `page(live: true)` passed, the returned `Page` reflects the current
147
- browser DOM. No-op when the `net/http` adapter is in use. Calls to
147
+ browser DOM. No-op when the `net/http` agent is in use. Calls to
148
148
  `page()` without the keyword return the most recent page.
149
149
 
150
150
  ---
@@ -0,0 +1,42 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+
6
+ # Configuration Keys
7
+
8
+ ## `Wayfarer.config.network`
9
+
10
+ | Runtime config key | Environment variable | Description | Default | Supported values |
11
+ | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
12
+ | `network.agent` | `WAYFARER_NETWORK_AGENT` | The user agent to use. | `:http` | `:http`, `:ferrum`, `:selenium` |
13
+ | `network.pool_size` | `WAYFARER_NETWORK_POOL_SIZE` | How many user agents to spawn. | 3 | Integers |
14
+ | `network.pool_timeout` | `WAYFARER_NETWORK_POOL_TIMEOUT` | How long jobs may use an agent in seconds. | 10 | Integers |
15
+ | `network.http_headers` | `WAYFARER_NETWORK_HTTP_HEADERS` | HTTP headers to append to requests. | `{}` | Hashes |
16
+
17
+ ## `Wayfarer.config.ferrum`
18
+
19
+ | Runtime config key | Environment variable | Description | Default | Supported values |
20
+ | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
21
+ | `ferrum.options` | `WAYFARER_FERRUM_OPTIONS` | Ferrum options. | `{}` | Hashes |
22
+
23
+ ## `Wayfarer.config.selenium`
24
+
25
+ | Runtime config key | Environment variable | Description | Default | Supported values |
26
+ | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
27
+ | `selenium.driver` | `WAYFARER_SELENIUM_DRIVER` | Selenium driver to use. | `:chrome` | Symbols |
28
+ | `selenium.options` | `WAYFARER_SELENIUM_OPTIONS` | Selenium options. | `{}` | Hashes |
29
+ | `selenium.client_timeout` | `WAYFARER_SELENIUM_CLIENT_TIMEOUT` | Selenium client timeout in seconds. | 60 | Integers |
30
+
31
+ ## `Wayfarer.config.redis`
32
+
33
+ | Runtime config key | Environment variable | Description | Default | Supported values |
34
+ | ---------------------- | ------------------------------------ | ------------------------------------------- | ------------------------------------------ | ----------------------------------- |
35
+ | `redis.url` | `WAYFARER_REDIS_URL` | Redis URL to connect to. | redis://localhost:6379 | Strings |
36
+ | `redis.factory` | n/a | Redis factory lambda. | `->(redis) { ::Redis.new(url: redis.url) }` | Lambdas |
37
+
38
+ ## `Wayfarer.config.capybara`
39
+
40
+ | Runtime config key | Environment variable | Description | Default | Supported values |
41
+ | ---------------------- | ------------------------------------ | ------------------------------------------- | -------------------------------- | ----------------------------------- |
42
+ | `capybara.driver` | `WAYFARER_CAPYBARA_DRIVER` | The Capybara driver to use. | n/a | Symbols |
@@ -10,26 +10,26 @@ with the following syntaxes:
10
10
 
11
11
  ## Variables
12
12
 
13
- ### `WAYFARER_ADAPTER`
14
- : Either `ferrum`, `selenium` or `net_http`.
13
+ ### `WAYFARER_NETWORK_AGENT`
14
+ : Either `ferrum`, `selenium` or `http`.
15
15
 
16
16
  * Type: String
17
- * Key: `config.adapter`
18
- * Default value: `:net_http`
17
+ * Key: `config.network.agent`
18
+ * Default value: `:http`
19
19
 
20
20
  ### `WAYFARER_NETWORK_POOL_SIZE`
21
- : Number of network adapters to maintain.
21
+ : Number of user agents to maintain.
22
22
 
23
23
  * Type: Integer
24
24
  * Key: `config.network.pool_size`
25
25
  * Default value: `3`
26
26
 
27
27
  ### `WAYFARER_NETWORK_POOL_TIMEOUT`
28
- : How long a network adapter may remain checked out until the owning job
28
+ : How long a user agent may remain checked out until the owning job
29
29
  fails.
30
30
 
31
31
  * Type: Integer
32
- * Key: `config.adapter_pool_timeout`
32
+ * Key: `config.network.pool_timeout`
33
33
  * Default value: `10`
34
34
 
35
35
  ---
@@ -43,32 +43,30 @@ with the following syntaxes:
43
43
 
44
44
  ---
45
45
 
46
- ### `WAYFARER_SELENIUM_ARGV`
47
- : Argument list passed to `Selenium::WebDriver.for`.
46
+ ### `WAYFARER_SELENIUM_DRIVER`
47
+ : Driver passed to `Selenium::WebDriver.for`.
48
48
 
49
- * Type: Array
50
- * Key: `config.selenium_argv`
51
- * Default value: `[]`
49
+ * Type: Symbol
50
+ * Key: `config.selenium.driver`
51
+ * Default value: `:chrome`
52
52
 
53
- ## Examples
54
-
55
- !!! example "Foobar"
56
-
57
- For example, to run Google Chrome in foreground with Ferrum:
58
-
59
- ```
60
- Wayfarer.config.adapter = :ferrum
61
- Wayfarer.ferrum_options = { headless: false, url: "http://chrome:3000" }
62
- ```
53
+ ---
63
54
 
55
+ ### `WAYFARER_SELENIUM_OPTIONS`
56
+ : Options passed to `Selenium::WebDriver.for`.
64
57
 
65
- !!! example "Run Google Chrome in foreground with Ferrum"
58
+ * Type: Hash
59
+ * Key: `config.selenium.options`
60
+ * Default value: `{}`
61
+
62
+ ---
66
63
 
67
- To construct the
64
+ ### `WAYFARER_SELENIUM_CLIENT_TIMEOUT`
65
+ : Selenium HTTP client timeout (seconds).
68
66
 
69
- ```
70
- WAYFARER_FERRUM_OPTIONS=headless:false,url:http://chrome:3000
71
- ```
67
+ * Type: Integer
68
+ * Key: `config.selenium.client_timeout`
69
+ * Default value: `60`
72
70
 
73
71
  ---
74
72
 
data/lib/wayfarer/base.rb CHANGED
@@ -5,22 +5,7 @@ module Wayfarer
5
5
  include Wayfarer::Middleware::Worker
6
6
  extend Forwardable
7
7
 
8
- class << self
9
- def after_batch_callbacks
10
- @after_batch_callbacks ||= []
11
- end
12
-
13
- def after_batch(&block)
14
- after_batch_callbacks.push(block)
15
- end
16
-
17
- def run_after_batch_callbacks
18
- after_batch_callbacks.each(&:call)
19
- end
20
- end
21
-
22
- after_enqueue do |job|
23
- task = job.arguments.first
8
+ after_enqueue do |_job|
24
9
  task.counter.increment
25
10
  end
26
11
 
@@ -30,6 +15,7 @@ module Wayfarer
30
15
 
31
16
  def self.retry_on(*argv)
32
17
  super(*argv) do |job, error|
18
+ job.task.barrier.seen?(job.task.url)
33
19
  GC.new(job).run
34
20
  yield job, error if block_given?
35
21
  end
@@ -37,6 +23,7 @@ module Wayfarer
37
23
 
38
24
  def self.discard_on(*argv)
39
25
  super(*argv) do |job, error|
26
+ job.task.barrier.seen?(job.task.url)
40
27
  GC.new(job).run
41
28
  yield job, error if block_given?
42
29
  end
@@ -48,7 +35,6 @@ module Wayfarer
48
35
 
49
36
  def retry_job(...)
50
37
  super(...) # increments the counter by re-enqueuing the job
51
- task = arguments.first
52
38
  task.counter.decrement
53
39
  end
54
40
 
@@ -56,5 +42,9 @@ module Wayfarer
56
42
  task.job = self
57
43
  chain.call(task)
58
44
  end
45
+
46
+ def task
47
+ arguments.first
48
+ end
59
49
  end
60
50
  end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Callbacks
5
+ TERMINATOR = ->(_target, result) { result.call == false }
6
+ OPTIONS = { terminator: TERMINATOR, skip_after_callbacks_if_terminated: true }.freeze
7
+
8
+ ConditionalCallback = Struct.new(:job, :filters) do
9
+ def run(method, &block)
10
+ return if only && !applies?(only)
11
+ return if except && applies?(except)
12
+
13
+ return job.send(method) if method
14
+
15
+ job.instance_eval(&block)
16
+ end
17
+
18
+ private
19
+
20
+ def applies?(condition)
21
+ case condition
22
+ when Symbol then condition == action
23
+ when Enumerable then condition&.include?(action)
24
+ end
25
+ end
26
+
27
+ def only
28
+ filters[:only]
29
+ end
30
+
31
+ def except
32
+ filters[:except]
33
+ end
34
+
35
+ def action
36
+ task.metadata.action
37
+ end
38
+
39
+ def task
40
+ job.task
41
+ end
42
+ end
43
+
44
+ def self.included(base)
45
+ base.include(ActiveSupport::Callbacks)
46
+ base.extend(ClassMethods)
47
+
48
+ base.class_eval do
49
+ define_callbacks(:fetch, OPTIONS)
50
+ define_callbacks(:action, OPTIONS)
51
+ define_callbacks(:batch, OPTIONS)
52
+
53
+ define(:fetch, :before)
54
+ define(:action, :before)
55
+ define(:batch, :after)
56
+ end
57
+ end
58
+
59
+ module ClassMethods
60
+ private
61
+
62
+ def define(name, stage)
63
+ define_singleton_method([stage, name].join("_")) do |method = nil, **filters, &block|
64
+ set_callback(name, stage, **filters) do |job|
65
+ ConditionalCallback.new(job, filters).run(method, &block)
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -12,12 +12,16 @@ module Wayfarer
12
12
  private
13
13
 
14
14
  def mock_redis
15
- Wayfarer.config.redis_factory = -> { MockRedis.new }
15
+ Wayfarer.config.redis.factory = ->(_) { MockRedis.new }
16
16
  end
17
17
 
18
18
  def load_environment
19
19
  Wayfarer::CLI::Runner.loader.setup
20
20
  end
21
+
22
+ def free_agent_pool
23
+ Wayfarer::Networking::Pool.instance.free
24
+ end
21
25
  end
22
26
  end
23
27
  end
@@ -11,12 +11,14 @@ module Wayfarer
11
11
  load_environment
12
12
  mock_redis if options[:mock_redis]
13
13
 
14
- url = URI(url)
14
+ url = Addressable::URI.parse(url)
15
15
  job = job.classify.constantize.new
16
16
  task = Wayfarer::Task.new(url, "tmp")
17
17
  job.arguments.push(task)
18
18
  job.perform(task)
19
19
  GC.new(job).run
20
+
21
+ free_agent_pool
20
22
  end
21
23
 
22
24
  desc "enqueue JOB URL",
@@ -26,7 +28,7 @@ module Wayfarer
26
28
  load_environment
27
29
  mock_redis if options[:mock_redis]
28
30
 
29
- url = URI(url)
31
+ url = Addressable::URI.parse(url)
30
32
  job = job.classify.constantize
31
33
  job.crawl_later(url, batch: options[:batch])
32
34
  end
@@ -41,7 +43,7 @@ module Wayfarer
41
43
  load_environment
42
44
  mock_redis if options[:mock_redis]
43
45
 
44
- url = URI(url)
46
+ url = Addressable::URI.parse(url)
45
47
  job = job.classify.constantize
46
48
 
47
49
  job.queue_adapter = ActiveJob::QueueAdapters::AsyncAdapter.new(min_threads: options[:min_threads],
@@ -52,6 +54,8 @@ module Wayfarer
52
54
  job.crawl_later(url, batch: options[:batch])
53
55
 
54
56
  sleep(1) while executor.scheduled_task_count > executor.completed_task_count
57
+
58
+ free_agent_pool
55
59
  end
56
60
  end
57
61
  end
@@ -9,7 +9,7 @@ module Wayfarer
9
9
  "Invoke JOB's router with URL"
10
10
  def result(job, url)
11
11
  load_environment
12
- url = URI(url)
12
+ url = Addressable::URI.parse(url)
13
13
  job = job.classify.constantize
14
14
  puts Wayfarer::Routing::PathFinder.result(job.route, url)
15
15
  end
@@ -18,7 +18,7 @@ module Wayfarer
18
18
  "Visualize JOB's routing tree for URL"
19
19
  def tree(job, url)
20
20
  load_environment
21
- url = URI(url)
21
+ url = Addressable::URI.parse(url)
22
22
  job = job.classify.constantize
23
23
  Wayfarer::CLI::RoutePrinter.print(job.route, url)
24
24
  end
@@ -77,19 +77,19 @@ module Wayfarer
77
77
  def matcher_label(route)
78
78
  return "Target" if route.is_a?(Wayfarer::Routing::TargetRoute)
79
79
 
80
- route.matcher.class.name.demodulize.delete_suffix("Matcher")
80
+ route.matcher.class.name.demodulize
81
81
  end
82
82
 
83
83
  def options(route)
84
84
  return "" if route.is_a?(Wayfarer::Routing::RootRoute)
85
85
 
86
86
  case (matcher = route.matcher)
87
- when Wayfarer::Routing::HostMatcher then matcher.host
88
- when Wayfarer::Routing::PathMatcher then matcher.path
89
- when Wayfarer::Routing::QueryMatcher then matcher.fields
90
- when Wayfarer::Routing::CustomMatcher then "##{route.action}"
91
- when Wayfarer::Routing::SchemeMatcher then matcher.scheme
92
- when Wayfarer::Routing::SuffixMatcher then matcher.suffix
87
+ when Wayfarer::Routing::Matchers::Host then matcher.host
88
+ when Wayfarer::Routing::Matchers::Path then matcher.path
89
+ when Wayfarer::Routing::Matchers::Query then matcher.fields
90
+ when Wayfarer::Routing::Matchers::Custom then "##{route.action}"
91
+ when Wayfarer::Routing::Matchers::Scheme then matcher.scheme
92
+ when Wayfarer::Routing::Matchers::Suffix then matcher.suffix
93
93
  end
94
94
  end
95
95
 
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Capybara = Struct.new(driver: {
6
+ env_key: "WAYFARER_CAPYBARA_DRIVER",
7
+ type: Symbol
8
+ })
9
+ end
10
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Ferrum = Struct.new(options: {
6
+ env_key: "WAYFARER_FERRUM_OPTIONS",
7
+ type: Hash,
8
+ default: {}
9
+ })
10
+ end
11
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Networking = Struct.new(agent: {
6
+ env_key: "WAYFARER_NETWORK_AGENT",
7
+ type: Symbol,
8
+ default: :http
9
+ },
10
+ pool_size: {
11
+ env_key: "WAYFARER_NETWORK_POOL_SIZE",
12
+ type: Integer,
13
+ default: 3
14
+ },
15
+ pool_timeout: {
16
+ env_key: "WAYFARER_NETWORK_POOL_TIMEOUT",
17
+ type: Integer,
18
+ default: 10
19
+ },
20
+ http_headers: {
21
+ env_key: "WAYFARER_NETWORK_HTTP_HEADERS",
22
+ type: Hash,
23
+ default: {}
24
+ })
25
+ end
26
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Redis = Struct.new(url: {
6
+ env_key: "WAYFARER_REDIS_URL",
7
+ type: String,
8
+ default: "redis://localhost:6379"
9
+ },
10
+ factory: {
11
+ default: ->(redis) { ::Redis.new(url: redis.url) }
12
+ })
13
+ end
14
+ end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Root = Struct.new(ferrum: { default: Wayfarer::Config::Ferrum.new },
6
+ network: { default: Wayfarer::Config::Networking.new },
7
+ redis: { default: Wayfarer::Config::Redis.new },
8
+ selenium: { default: Wayfarer::Config::Selenium.new },
9
+ capybara: { default: Wayfarer::Config::Capybara.new })
10
+ end
11
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ Selenium = Struct.new(driver: {
6
+ env_key: "WAYFARER_SELENIUM_DRIVER",
7
+ type: Symbol,
8
+ default: :chrome
9
+ },
10
+ options: {
11
+ env_key: "WAYFARER_SELENIUM_OPTIONS",
12
+ type: Hash,
13
+ default: {}
14
+ },
15
+ client_timeout: {
16
+ env_key: "WAYFARER_SELENIUM_CLIENT_TIMEOUT",
17
+ type: Integer,
18
+ default: 60 # seconds
19
+ })
20
+ end
21
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ module Strconv
6
+ module_function
7
+
8
+ def parse(str, type = nil)
9
+ return primitive(str) unless type
10
+
11
+ case type.name
12
+ when "Hash" then hash(str)
13
+ when "Array" then array(str)
14
+ when "Symbol" then str.to_sym
15
+ when "Integer" then Integer(str)
16
+ else str
17
+ end
18
+ end
19
+
20
+ def hash(str)
21
+ array(str).reduce({}) do |acc, pair|
22
+ k, v = pair.split(":", 2)
23
+ next acc unless k && v
24
+
25
+ acc.merge({ parse(k, Symbol) => primitive(v) })
26
+ end
27
+ end
28
+
29
+ def array(str)
30
+ str.split(",").map(&:strip)
31
+ end
32
+
33
+ def primitive(str)
34
+ return true if str == "true"
35
+ return false if str == "false"
36
+
37
+ begin
38
+ parse(str, Integer)
39
+ rescue StandardError
40
+ str
41
+ end
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Wayfarer
4
+ module Config
5
+ class Struct
6
+ module ClassMethods
7
+ attr_accessor :members
8
+ end
9
+
10
+ module InstanceMethods
11
+ extend Forwardable
12
+
13
+ delegate members: "self.class"
14
+
15
+ attr_reader :env
16
+
17
+ def initialize(env = ENV)
18
+ @env = env
19
+
20
+ define_writers
21
+ define_readers
22
+ end
23
+
24
+ private
25
+
26
+ def define_writers
27
+ members.each { |key, _| define_writer(key) }
28
+ end
29
+
30
+ def define_writer(key)
31
+ define_singleton_method(:"#{key}=") do |val|
32
+ set(key, val)
33
+ end
34
+ end
35
+
36
+ def define_readers
37
+ members.each { |key, options| define_reader(key, **options) }
38
+ end
39
+
40
+ def define_reader(key, env_key: nil, type: nil, default: nil)
41
+ define_singleton_method(key.to_sym) do
42
+ get(key) || set(key, get(key) || env_val(env_key, type) || default)
43
+ end
44
+ end
45
+
46
+ def env_val(env_key, type)
47
+ return nil unless env_key
48
+ return nil unless env.key?(env_key)
49
+
50
+ Strconv.parse(env[env_key], type)
51
+ end
52
+
53
+ def get(key)
54
+ instance_variable_get(:"@#{key}")
55
+ end
56
+
57
+ def set(key, val)
58
+ instance_variable_set(:"@#{key}", val)
59
+ end
60
+ end
61
+
62
+ def self.new(members)
63
+ Class.new do
64
+ include InstanceMethods
65
+ extend ClassMethods
66
+
67
+ self.members = members
68
+ end
69
+ end
70
+ end
71
+ end
72
+ end
data/lib/wayfarer/gc.rb CHANGED
@@ -3,17 +3,13 @@
3
3
  module Wayfarer
4
4
  GC = Struct.new(:job) do
5
5
  def run
6
+ task = job.task
7
+
6
8
  return unless task.counter.decrement <= 0
7
9
 
8
10
  task.barrier.reset!
9
11
  task.counter.reset!
10
- job.class.run_after_batch_callbacks
11
- end
12
-
13
- private
14
-
15
- def task
16
- job.arguments.first
12
+ job.run_callbacks(:batch)
17
13
  end
18
14
  end
19
15
  end
@@ -15,10 +15,14 @@ module Wayfarer
15
15
  def call(task)
16
16
  self.task = task
17
17
 
18
- pool.with do |adapter|
19
- task.metadata.adapter = adapter
18
+ pool.with do |agent|
19
+ task.metadata.agent = agent
20
20
 
21
- case result = adapter.fetch(task.url)
21
+ result = task.job.run_callbacks :fetch do
22
+ agent.fetch(task.url)
23
+ end
24
+
25
+ case result
22
26
  when Networking::Result::Redirect
23
27
  stage(result.redirect_url)
24
28
  when Networking::Result::Success
@@ -6,12 +6,12 @@ module Wayfarer
6
6
  def call(task)
7
7
  route = task.job.class.route
8
8
 
9
- case result = route.invoke(URI(task.url))
9
+ case result = route.invoke(Addressable::URI.parse(task.url))
10
10
  when Routing::Result::Mismatch
11
11
  return
12
12
  when Routing::Result::Match
13
13
  task.metadata.action = result.action
14
- task.metadata.params = result.params
14
+ task.metadata.params = ActiveSupport::HashWithIndifferentAccess.new(result.params)
15
15
  end
16
16
 
17
17
  yield if block_given?